brw: Avoid rounding every convergent block load up to a full register

To simplify things, our backend rounds convergent block loads up to a full
register. This causes page faults with the scratch page disabled since the
address is not always aligned to a register size. Loading smaller blocks is
slightly more difficult because the SEND instruction can only write back a
multiple of full registers, even if the actual data is smaller.

Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/40149>
Author:    Calder Young
Date:      2026-03-29 22:49:06 -07:00
Committer: Marge Bot
Parent:    8ce98fedc4
Commit:    3ac6233655

2 changed files with 43 additions and 11 deletions
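
Editor's note: the following is a minimal standalone sketch (not Mesa code)
contrasting the old and new sizing, assuming LSC hardware, 32-bit components,
and 32-byte GRFs (REG_SIZE * reg_unit(devinfo) == 32); old_total() and
new_total() are hypothetical names used for illustration only.

   #include <stdio.h>

   #define REG_DWORDS 8 /* one 32-byte GRF holds 8 dwords */

   /* Old scheme: round the read itself up to whole registers. */
   static unsigned old_total(unsigned components)
   {
      return (components + REG_DWORDS - 1) / REG_DWORDS * REG_DWORDS;
   }

   /* New scheme on LSC: read only a supported block size (mirroring
    * brw_uniform_block_size() below); only the destination register
    * allocation stays padded. */
   static unsigned new_total(unsigned components)
   {
      if (components > 8)
         return (components + 15) / 16 * 16;
      if (components > 4)
         return 8;
      return components;
   }

   int main(void)
   {
      for (unsigned c = 1; c <= 12; c++)
         printf("components=%2u old=%2u new=%2u dwords read\n",
                c, old_total(c), new_total(c));
      return 0;
   }

For a 3-component load at the end of a buffer, the old path rounded the read
up to 8 dwords (32 bytes) and could fault past the allocation with the
scratch page disabled; the new path reads just the 3 dwords it needs.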

@@ -4573,10 +4573,10 @@ get_nir_buffer_intrinsic_index(nir_to_brw_state &ntb, const brw_builder &bld,
 static unsigned
 choose_block_size_dwords(const intel_device_info *devinfo, unsigned dwords)
 {
-   const unsigned min_block = 8;
+   const unsigned min_block = devinfo->has_lsc ? 1 : 4;
    const unsigned max_block = devinfo->has_lsc ? 64 : 32;
-   const unsigned block = 1 << util_logbase2(dwords);
+   const unsigned block = dwords > 4 ? 1 << util_logbase2(dwords) : dwords;
    return CLAMP(block, min_block, max_block);
 }
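
Editor's note, working the new choose_block_size_dwords() through a few
values: the old code turned dwords = 3 into block = 1 << util_logbase2(3) = 2,
which CLAMP() then raised to min_block = 8, an eight-dword read. The new code
takes the dwords <= 4 arm, so 3 stays 3 on LSC hardware (min_block = 1) and
clamps to 4 on pre-LSC HDC hardware (min_block = 4). Larger sizes behave as
before, e.g. dwords = 13 still gives 1 << util_logbase2(13) = 8.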
@@ -6188,23 +6188,32 @@ brw_from_nir_emit_memory_access(nir_to_brw_state &ntb,
    unsigned first_read_component = 0;
    if (convergent_block_load) {
-      /* If the address is a constant and alignment permits, skip unread
-       * leading and trailing components. (It's probably not worth the
+      /* If the address is a constant and alignment permits, skip as many
+       * unread leading and trailing components as we can without splitting
+       * the load into more smaller blocks. (It's probably not worth the
        * extra address math for non-constant addresses.)
+       *
+       * Note that SLM block loads on HDC platforms need to be 16B aligned.
        */
       if (srcs[MEMORY_LOGICAL_ADDRESS].file == IMM &&
-          alignment >= data_bit_size / 8 &&
-          (devinfo->has_lsc || mode != MEMORY_MODE_SHARED_LOCAL)) {
+          alignment >= nir_bit_size / 8) {
          first_read_component = nir_def_first_component_read(&instr->def);
-         unsigned last_component = nir_def_last_component_read(&instr->def);
+         unsigned last_component = nir_def_last_component_read(&instr->def) + 1;
+         if (!devinfo->has_lsc && mode == MEMORY_MODE_SHARED_LOCAL) {
+            first_read_component = ROUND_DOWN_TO(first_read_component, 4);
+            last_component = align(last_component, 4);
+         }
+         total = last_component - first_read_component;
+         total = brw_uniform_block_size(devinfo, total);
+         first_read_component =
+            total >= last_component ? 0 : last_component - total;
+         components = MIN2(components, last_component) - first_read_component;
          srcs[MEMORY_LOGICAL_ADDRESS].u64 +=
-            first_read_component * (data_bit_size / 8);
-         components = last_component - first_read_component + 1;
+            first_read_component * (nir_bit_size / 8);
+      } else {
+         total = brw_uniform_block_size(devinfo, components);
       }
-      total = align(components, REG_SIZE * reg_unit(devinfo) / 4);
       dest = ubld.vgrf(BRW_TYPE_UD, total);
    } else {
       total = components * bld.dispatch_width();
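
Editor's note, a worked example of the trimming above (assuming LSC, 32-bit
components, and a suitably aligned constant address): for a vec8 block load
where only components 2..5 are read, nir_def_first_component_read() returns 2
and nir_def_last_component_read() returns 5, so last_component = 6 and
total = 6 - 2 = 4, which brw_uniform_block_size() leaves at 4. Since
4 < last_component, first_read_component stays 6 - 4 = 2: the address advances
by 2 * 4 = 8 bytes and only four dwords are loaded. If components 2..6 were
read instead, total = 5 would round up to a block of 8; since 8 >=
last_component = 7, first_read_component drops back to 0 and the block starts
at component 0 again rather than splitting into smaller loads.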
@@ -6218,6 +6227,11 @@ brw_from_nir_emit_memory_access(nir_to_brw_state &ntb,
       unsigned block_comps = choose_block_size_dwords(devinfo, total - done);
       const unsigned block_bytes = block_comps * (nir_bit_size / 8);
 
+      /* Our current choice of block sizes and 32-bit data type will
+       * always give us a GRF-aligned offset into dest
+       */
+      assert(done % (REG_SIZE / 4 * reg_unit(devinfo)) == 0);
+
       brw_reg dst_offset = is_store ? brw_reg() :
          retype(byte_offset(dest, done * 4), BRW_TYPE_UD);
       if (is_store) {
@@ -6228,7 +6242,7 @@ brw_from_nir_emit_memory_access(nir_to_brw_state &ntb,
       mem = ubld.emit(opcode, dst_offset, srcs, MEMORY_LOGICAL_NUM_SRCS)->as_mem();
       mem->has_no_mask_send_params = no_mask_handle;
       if (is_load)
-         mem->size_written = block_bytes;
+         mem->size_written = align(block_bytes, REG_SIZE * reg_unit(devinfo));
       mem->lsc_op = op;
       mem->mode = *mode;
       mem->binding_type = *binding_type;
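
Editor's note: size_written still has to be rounded up to whole GRFs here
because, as the commit message says, the SEND can only write back a multiple
of full registers even when the block is smaller. With 32-byte GRFs, a
3-dword (12-byte) block load therefore still has size_written = 32; it is
only the memory read itself that shrinks, which is what avoids the fault.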

@@ -326,6 +326,24 @@ bool brw_nir_should_vectorize_mem(unsigned align_mul, unsigned align_offset,
                                   nir_intrinsic_instr *high,
                                   void *data);
 
+/**
+ * Gets the size of a nir_load_*_uniform_block_intel after it's lowered
+ * by the backend to a block load message. Note that page faults can
+ * happen if this is not accounted for when using these intrinsics.
+ */
+static inline unsigned
+brw_uniform_block_size(const struct intel_device_info *devinfo,
+                       unsigned num_components)
+{
+   /* Round up to a supported block size, or to the nearest multiple of
+    * 16 components if it's any larger.
+    */
+   return num_components > 8 ? align(num_components, 16)
+                             : num_components > 4 ? 8
+                             : !devinfo->has_lsc ? 4
+                             : num_components;
+}
+
 void brw_nir_optimize(struct brw_pass_tracker *pt);
 
 #define BRW_NIR_FRAG_OUTPUT_INDEX_SHIFT 0
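
Editor's note, tabulating brw_uniform_block_size() for a few sizes: on LSC
hardware num_components = 3 -> 3, 5 -> 8, 9 -> 16, and 20 -> 32; without LSC,
3 rounds up to the HDC minimum of 4. Per the comment above, code sizing
buffers used with nir_load_*_uniform_block_intel must account for these
rounded sizes to avoid page faults.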