brw: Avoid vectorizing loads in NIR if they could extend into a different page

Took inspiration from RADV to make nir_opt_load_store_vectorize robust against
page faults, by checking the align_offset and align_mul to see whether any
extra components could extend into a different page.

Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/40149>
This commit is contained in:
Calder Young 2026-03-29 22:03:29 -07:00 committed by Marge Bot
parent 3ac6233655
commit 4120ae4963
5 changed files with 51 additions and 13 deletions

View file

@ -403,9 +403,13 @@ iris_ensure_indirect_generation_shader(struct iris_batch *batch)
/* Do vectorizing here. For some reason when trying to do it in the back
* this just isn't working.
*/
struct brw_nir_vectorize_mem_cb_data cb_data = {
.devinfo = screen->devinfo,
};
nir_load_store_vectorize_options options = {
.modes = nir_var_mem_ubo | nir_var_mem_ssbo | nir_var_mem_global,
.callback = brw_nir_should_vectorize_mem,
.cb_data = &cb_data,
.robust_modes = (nir_variable_mode)0,
};
NIR_PASS(_, nir, nir_opt_load_store_vectorize, &options);

View file

@ -2396,8 +2396,10 @@ brw_nir_should_vectorize_mem(unsigned align_mul, unsigned align_offset,
int64_t hole_size,
nir_intrinsic_instr *low,
nir_intrinsic_instr *high,
void *data)
void *_data)
{
struct brw_nir_vectorize_mem_cb_data *data = _data;
/* Don't combine things to generate 64-bit loads/stores. We have to split
* those back into 32-bit ones anyway and UBO loads aren't split in NIR so
* we don't want to make a mess for the back-end.
@ -2405,12 +2407,21 @@ brw_nir_should_vectorize_mem(unsigned align_mul, unsigned align_offset,
if (bit_size > 32)
return false;
if (low->intrinsic == nir_intrinsic_load_ubo_uniform_block_intel ||
low->intrinsic == nir_intrinsic_load_ssbo_uniform_block_intel ||
low->intrinsic == nir_intrinsic_load_shared_uniform_block_intel ||
low->intrinsic == nir_intrinsic_load_global_constant_uniform_block_intel ||
(low->intrinsic == nir_intrinsic_load_shader_indirect_data_intel &&
low->src[0].ssa == high->src[0].ssa)) {
bool convergent_block_load =
low->intrinsic == nir_intrinsic_load_ubo_uniform_block_intel ||
low->intrinsic == nir_intrinsic_load_ssbo_uniform_block_intel ||
low->intrinsic == nir_intrinsic_load_shared_uniform_block_intel ||
low->intrinsic == nir_intrinsic_load_global_constant_uniform_block_intel ||
(low->intrinsic == nir_intrinsic_load_shader_indirect_data_intel &&
low->src[0].ssa == high->src[0].ssa);
unsigned unaligned_size = num_components * bit_size;
unsigned aligned_size = convergent_block_load ?
brw_uniform_block_size(data->devinfo, num_components) * bit_size :
nir_round_up_components(num_components) * bit_size;
hole_size += (aligned_size - unaligned_size) / 8;
if (convergent_block_load) {
if (num_components > 4) {
if (bit_size != 32)
return false;
@ -2432,12 +2443,19 @@ brw_nir_should_vectorize_mem(unsigned align_mul, unsigned align_offset,
return false;
}
const uint32_t align = nir_combined_align(align_mul, align_offset);
if (align < bit_size / 8)
if (nir_combined_align(align_mul, align_offset) < bit_size / 8)
return false;
if (low->intrinsic == nir_intrinsic_load_global ||
low->intrinsic == nir_intrinsic_load_global_constant ||
low->intrinsic == nir_intrinsic_load_global_constant_uniform_block_intel) {
/* Only increase the size of loads if doing so doesn't extend into a new page. */
uint32_t mul = MIN2(align_mul, data->devinfo->mem_alignment);
unsigned end = align_offset + unaligned_size / 8;
if ((aligned_size - unaligned_size) / 8 > (align(end, mul) - end))
return false;
}
return true;
}
@ -2532,7 +2550,7 @@ get_mem_access_size_align(nir_intrinsic_op intrin, uint8_t bytes,
/* Choose a byte, word, or dword */
bytes = MIN2(bytes, 4);
if (bytes == 3)
bytes = is_load ? 4 : 2;
bytes = (is_load && align >= 4) ? 4 : 2;
/* Ensure we split into aligned pieces. We cannot blindly turn an i8vec4
* into i32 due to the alignment requirements. It might be possible to
@ -2642,12 +2660,16 @@ brw_vectorize_lower_mem_access(brw_pass_tracker *pt)
{
const struct intel_device_info *devinfo = pt->compiler->devinfo;
struct brw_nir_vectorize_mem_cb_data vectorize_cb_data = {
.devinfo = devinfo,
};
nir_load_store_vectorize_options options = {
.modes = nir_var_mem_ubo | nir_var_mem_ssbo |
nir_var_mem_global | nir_var_mem_shared |
nir_var_mem_task_payload,
.round_up_components = lsc_urb_round_up_components,
.callback = brw_nir_should_vectorize_mem,
.cb_data = &vectorize_cb_data,
.robust_modes = (nir_variable_mode)0,
};
@ -2681,6 +2703,7 @@ brw_vectorize_lower_mem_access(brw_pass_tracker *pt)
nir_load_store_vectorize_options ubo_options = {
.modes = nir_var_mem_ubo,
.callback = brw_nir_should_vectorize_mem,
.cb_data = &vectorize_cb_data,
.robust_modes = options.robust_modes & nir_var_mem_ubo,
};

View file

@ -318,6 +318,10 @@ enum brw_reg_type brw_type_for_base_type(enum glsl_base_type base_type);
enum brw_reg_type brw_type_for_nir_type(const struct intel_device_info *devinfo,
nir_alu_type type);
struct brw_nir_vectorize_mem_cb_data {
const struct intel_device_info *devinfo;
};
bool brw_nir_should_vectorize_mem(unsigned align_mul, unsigned align_offset,
unsigned bit_size,
unsigned num_components,

View file

@ -145,9 +145,13 @@ compile_shader(struct anv_device *device,
/* Do vectorizing here. For some reason when trying to do it in the back
* this just isn't working.
*/
struct brw_nir_vectorize_mem_cb_data vectorize_cb_data = {
.devinfo = device->info,
};
nir_load_store_vectorize_options options = {
.modes = nir_var_mem_ubo | nir_var_mem_ssbo | nir_var_mem_global,
.callback = brw_nir_should_vectorize_mem,
.cb_data = &vectorize_cb_data,
.robust_modes = (nir_variable_mode)0,
};
NIR_PASS(_, nir, nir_opt_load_store_vectorize, &options);

View file

@ -1181,12 +1181,15 @@ anv_shader_compile_bs(struct anv_device *device,
nir_shader **resume_shaders = NULL;
uint32_t num_resume_shaders = 0;
if (nir->info.stage != MESA_SHADER_COMPUTE) {
struct brw_nir_vectorize_mem_cb_data vectorize_cb_data = {
.devinfo = devinfo,
};
const nir_lower_shader_calls_options opts = {
.address_format = nir_address_format_64bit_global,
.stack_alignment = BRW_BTD_STACK_ALIGN,
.localized_loads = true,
.vectorizer_callback = brw_nir_should_vectorize_mem,
.vectorizer_data = NULL,
.vectorizer_data = &vectorize_cb_data,
.should_remat_callback = should_remat_cb,
};