mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-05-07 04:58:05 +02:00
brw: Avoid rounding every convergent block load up to a full register
To simplify things, our backend rounds convergent block loads up to a full register. This causes page faults with the scratch page disabled since the address is not always aligned to a register size. Loading smaller blocks is slightly more difficult because the SEND instruction can only write back a multiple of full registers, even if the actual data is smaller. Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/40149>
This commit is contained in:
parent
8ce98fedc4
commit
3ac6233655
2 changed files with 43 additions and 11 deletions
|
|
@ -4573,10 +4573,10 @@ get_nir_buffer_intrinsic_index(nir_to_brw_state &ntb, const brw_builder &bld,
|
|||
static unsigned
|
||||
choose_block_size_dwords(const intel_device_info *devinfo, unsigned dwords)
|
||||
{
|
||||
const unsigned min_block = 8;
|
||||
const unsigned min_block = devinfo->has_lsc ? 1 : 4;
|
||||
const unsigned max_block = devinfo->has_lsc ? 64 : 32;
|
||||
|
||||
const unsigned block = 1 << util_logbase2(dwords);
|
||||
const unsigned block = dwords > 4 ? 1 << util_logbase2(dwords) : dwords;
|
||||
|
||||
return CLAMP(block, min_block, max_block);
|
||||
}
|
||||
|
|
@ -6188,23 +6188,32 @@ brw_from_nir_emit_memory_access(nir_to_brw_state &ntb,
|
|||
unsigned first_read_component = 0;
|
||||
|
||||
if (convergent_block_load) {
|
||||
/* If the address is a constant and alignment permits, skip unread
|
||||
* leading and trailing components. (It's probably not worth the
|
||||
/* If the address is a constant and alignment permits, skip as many
|
||||
* unread leading and trailing components as we can without splitting
|
||||
* the load into more smaller blocks. (It's probably not worth the
|
||||
* extra address math for non-constant addresses.)
|
||||
*
|
||||
* Note that SLM block loads on HDC platforms need to be 16B aligned.
|
||||
*/
|
||||
if (srcs[MEMORY_LOGICAL_ADDRESS].file == IMM &&
|
||||
alignment >= data_bit_size / 8 &&
|
||||
(devinfo->has_lsc || mode != MEMORY_MODE_SHARED_LOCAL)) {
|
||||
alignment >= nir_bit_size / 8) {
|
||||
first_read_component = nir_def_first_component_read(&instr->def);
|
||||
unsigned last_component = nir_def_last_component_read(&instr->def);
|
||||
unsigned last_component = nir_def_last_component_read(&instr->def) + 1;
|
||||
if (!devinfo->has_lsc && mode == MEMORY_MODE_SHARED_LOCAL) {
|
||||
first_read_component = ROUND_DOWN_TO(first_read_component, 4);
|
||||
last_component = align(last_component, 4);
|
||||
}
|
||||
total = last_component - first_read_component;
|
||||
total = brw_uniform_block_size(devinfo, total);
|
||||
first_read_component =
|
||||
total >= last_component ? 0 : last_component - total;
|
||||
components = MIN2(components, last_component) - first_read_component;
|
||||
srcs[MEMORY_LOGICAL_ADDRESS].u64 +=
|
||||
first_read_component * (data_bit_size / 8);
|
||||
components = last_component - first_read_component + 1;
|
||||
first_read_component * (nir_bit_size / 8);
|
||||
} else {
|
||||
total = brw_uniform_block_size(devinfo, components);
|
||||
}
|
||||
|
||||
total = align(components, REG_SIZE * reg_unit(devinfo) / 4);
|
||||
dest = ubld.vgrf(BRW_TYPE_UD, total);
|
||||
} else {
|
||||
total = components * bld.dispatch_width();
|
||||
|
|
@ -6218,6 +6227,11 @@ brw_from_nir_emit_memory_access(nir_to_brw_state &ntb,
|
|||
unsigned block_comps = choose_block_size_dwords(devinfo, total - done);
|
||||
const unsigned block_bytes = block_comps * (nir_bit_size / 8);
|
||||
|
||||
/* Our current choice of block sizes and 32-bit data type will
|
||||
* always give us a GRF-aligned offset into dest
|
||||
*/
|
||||
assert(done % (REG_SIZE / 4 * reg_unit(devinfo)) == 0);
|
||||
|
||||
brw_reg dst_offset = is_store ? brw_reg() :
|
||||
retype(byte_offset(dest, done * 4), BRW_TYPE_UD);
|
||||
if (is_store) {
|
||||
|
|
@ -6228,7 +6242,7 @@ brw_from_nir_emit_memory_access(nir_to_brw_state &ntb,
|
|||
mem = ubld.emit(opcode, dst_offset, srcs, MEMORY_LOGICAL_NUM_SRCS)->as_mem();
|
||||
mem->has_no_mask_send_params = no_mask_handle;
|
||||
if (is_load)
|
||||
mem->size_written = block_bytes;
|
||||
mem->size_written = align(block_bytes, REG_SIZE * reg_unit(devinfo));
|
||||
mem->lsc_op = op;
|
||||
mem->mode = *mode;
|
||||
mem->binding_type = *binding_type;
|
||||
|
|
|
|||
|
|
@ -326,6 +326,24 @@ bool brw_nir_should_vectorize_mem(unsigned align_mul, unsigned align_offset,
|
|||
nir_intrinsic_instr *high,
|
||||
void *data);
|
||||
|
||||
/**
|
||||
* Gets the size of a nir_load_*_uniform_block_intel after it's lowered
|
||||
* by the backend to a block load message. Note that page faults can
|
||||
* happen if this is not accounted for when using these intrinsics.
|
||||
*/
|
||||
static inline unsigned
|
||||
brw_uniform_block_size(const struct intel_device_info *devinfo,
|
||||
unsigned num_components)
|
||||
{
|
||||
/* Round up to a supported block size, or to the nearest multiple of
|
||||
* 16 components if it's any larger.
|
||||
*/
|
||||
return num_components > 8 ? align(num_components, 16)
|
||||
: num_components > 4 ? 8
|
||||
: !devinfo->has_lsc ? 4
|
||||
: num_components;
|
||||
}
|
||||
|
||||
void brw_nir_optimize(struct brw_pass_tracker *pt);
|
||||
|
||||
#define BRW_NIR_FRAG_OUTPUT_INDEX_SHIFT 0
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue