From 4120ae496339d6104e057769b304cef364e074e8 Mon Sep 17 00:00:00 2001
From: Calder Young
Date: Sun, 29 Mar 2026 22:03:29 -0700
Subject: [PATCH] brw: Avoid vectorizing loads in NIR if it could extend into a different page

Took inspiration from RADV to make nir_opt_load_store_vectorize robust
against page faults, by checking the align_offset and align_mul to see
whether any extra components could extend into a different page.

Reviewed-by: Lionel Landwerlin
Part-of:
---
 src/gallium/drivers/iris/iris_program_cache.c |  4 ++
 src/intel/compiler/brw/brw_nir.c              | 47 ++++++++++++++-----
 src/intel/compiler/brw/brw_nir.h              |  4 ++
 src/intel/vulkan/anv_internal_kernels.c       |  4 ++
 src/intel/vulkan/anv_shader_compile.c         |  5 +-
 5 files changed, 51 insertions(+), 13 deletions(-)

diff --git a/src/gallium/drivers/iris/iris_program_cache.c b/src/gallium/drivers/iris/iris_program_cache.c
index 94a10bb1179..92c56517a95 100644
--- a/src/gallium/drivers/iris/iris_program_cache.c
+++ b/src/gallium/drivers/iris/iris_program_cache.c
@@ -403,9 +403,13 @@ iris_ensure_indirect_generation_shader(struct iris_batch *batch)
    /* Do vectorizing here. For some reason when trying to do it in the back
     * this just isn't working.
     */
+   struct brw_nir_vectorize_mem_cb_data cb_data = {
+      .devinfo = screen->devinfo,
+   };
    nir_load_store_vectorize_options options = {
       .modes = nir_var_mem_ubo | nir_var_mem_ssbo | nir_var_mem_global,
       .callback = brw_nir_should_vectorize_mem,
+      .cb_data = &cb_data,
       .robust_modes = (nir_variable_mode)0,
    };
    NIR_PASS(_, nir, nir_opt_load_store_vectorize, &options);
diff --git a/src/intel/compiler/brw/brw_nir.c b/src/intel/compiler/brw/brw_nir.c
index 1168f9b66c0..47a2b019ea8 100644
--- a/src/intel/compiler/brw/brw_nir.c
+++ b/src/intel/compiler/brw/brw_nir.c
@@ -2396,8 +2396,10 @@ brw_nir_should_vectorize_mem(unsigned align_mul, unsigned align_offset,
                              int64_t hole_size,
                              nir_intrinsic_instr *low,
                              nir_intrinsic_instr *high,
-                             void *data)
+                             void *_data)
 {
+   struct brw_nir_vectorize_mem_cb_data *data = _data;
+
    /* Don't combine things to generate 64-bit loads/stores. We have to split
    * those back into 32-bit ones anyway and UBO loads aren't split in NIR so
    * we don't want to make a mess for the back-end.
@@ -2405,12 +2407,21 @@ brw_nir_should_vectorize_mem(unsigned align_mul, unsigned align_offset,
    if (bit_size > 32)
       return false;
 
-   if (low->intrinsic == nir_intrinsic_load_ubo_uniform_block_intel ||
-       low->intrinsic == nir_intrinsic_load_ssbo_uniform_block_intel ||
-       low->intrinsic == nir_intrinsic_load_shared_uniform_block_intel ||
-       low->intrinsic == nir_intrinsic_load_global_constant_uniform_block_intel ||
-       (low->intrinsic == nir_intrinsic_load_shader_indirect_data_intel &&
-        low->src[0].ssa == high->src[0].ssa)) {
+   bool convergent_block_load =
+      low->intrinsic == nir_intrinsic_load_ubo_uniform_block_intel ||
+      low->intrinsic == nir_intrinsic_load_ssbo_uniform_block_intel ||
+      low->intrinsic == nir_intrinsic_load_shared_uniform_block_intel ||
+      low->intrinsic == nir_intrinsic_load_global_constant_uniform_block_intel ||
+      (low->intrinsic == nir_intrinsic_load_shader_indirect_data_intel &&
+       low->src[0].ssa == high->src[0].ssa);
+
+   unsigned unaligned_size = num_components * bit_size;
+   unsigned aligned_size = convergent_block_load ?
+      brw_uniform_block_size(data->devinfo, num_components) * bit_size :
+      nir_round_up_components(num_components) * bit_size;
+   hole_size += (aligned_size - unaligned_size) / 8;
+
+   if (convergent_block_load) {
       if (num_components > 4) {
          if (bit_size != 32)
             return false;
@@ -2432,12 +2443,19 @@ brw_nir_should_vectorize_mem(unsigned align_mul, unsigned align_offset,
       return false;
    }
 
-
-   const uint32_t align = nir_combined_align(align_mul, align_offset);
-
-   if (align < bit_size / 8)
+   if (nir_combined_align(align_mul, align_offset) < bit_size / 8)
       return false;
 
+   if (low->intrinsic == nir_intrinsic_load_global ||
+       low->intrinsic == nir_intrinsic_load_global_constant ||
+       low->intrinsic == nir_intrinsic_load_global_constant_uniform_block_intel) {
+      /* Only increase the size of loads if doing so doesn't extend into a new page. */
+      uint32_t mul = MIN2(align_mul, data->devinfo->mem_alignment);
+      unsigned end = align_offset + unaligned_size / 8;
+      if ((aligned_size - unaligned_size) / 8 > (align(end, mul) - end))
+         return false;
+   }
+
    return true;
 }
 
@@ -2532,7 +2550,7 @@ get_mem_access_size_align(nir_intrinsic_op intrin, uint8_t bytes,
       /* Choose a byte, word, or dword */
       bytes = MIN2(bytes, 4);
       if (bytes == 3)
-         bytes = is_load ? 4 : 2;
+         bytes = (is_load && align >= 4) ? 4 : 2;
 
       /* Ensure we split into aligned pieces. We cannot blindly turn an i8vec4
       * into i32 due to the alignment requirements. It might be possible to
@@ -2642,12 +2660,16 @@ brw_vectorize_lower_mem_access(brw_pass_tracker *pt)
 {
    const struct intel_device_info *devinfo = pt->compiler->devinfo;
 
+   struct brw_nir_vectorize_mem_cb_data vectorize_cb_data = {
+      .devinfo = devinfo,
+   };
    nir_load_store_vectorize_options options = {
       .modes = nir_var_mem_ubo | nir_var_mem_ssbo | nir_var_mem_global |
                nir_var_mem_shared | nir_var_mem_task_payload,
       .round_up_components = lsc_urb_round_up_components,
       .callback = brw_nir_should_vectorize_mem,
+      .cb_data = &vectorize_cb_data,
       .robust_modes = (nir_variable_mode)0,
    };
 
@@ -2681,6 +2703,7 @@ brw_vectorize_lower_mem_access(brw_pass_tracker *pt)
    nir_load_store_vectorize_options ubo_options = {
       .modes = nir_var_mem_ubo,
       .callback = brw_nir_should_vectorize_mem,
+      .cb_data = &vectorize_cb_data,
       .robust_modes = options.robust_modes & nir_var_mem_ubo,
    };
 
diff --git a/src/intel/compiler/brw/brw_nir.h b/src/intel/compiler/brw/brw_nir.h
index 62eb3a5db9f..653038c8dff 100644
--- a/src/intel/compiler/brw/brw_nir.h
+++ b/src/intel/compiler/brw/brw_nir.h
@@ -318,6 +318,10 @@ enum brw_reg_type brw_type_for_base_type(enum glsl_base_type base_type);
 enum brw_reg_type brw_type_for_nir_type(const struct intel_device_info *devinfo,
                                         nir_alu_type type);
 
+struct brw_nir_vectorize_mem_cb_data {
+   const struct intel_device_info *devinfo;
+};
+
 bool brw_nir_should_vectorize_mem(unsigned align_mul, unsigned align_offset,
                                   unsigned bit_size,
                                   unsigned num_components,
diff --git a/src/intel/vulkan/anv_internal_kernels.c b/src/intel/vulkan/anv_internal_kernels.c
index ff419bb6db1..36c4a9494d9 100644
--- a/src/intel/vulkan/anv_internal_kernels.c
+++ b/src/intel/vulkan/anv_internal_kernels.c
@@ -145,9 +145,13 @@ compile_shader(struct anv_device *device,
    /* Do vectorizing here. For some reason when trying to do it in the back
    * this just isn't working.
    */
+   struct brw_nir_vectorize_mem_cb_data vectorize_cb_data = {
+      .devinfo = device->info,
+   };
    nir_load_store_vectorize_options options = {
       .modes = nir_var_mem_ubo | nir_var_mem_ssbo | nir_var_mem_global,
       .callback = brw_nir_should_vectorize_mem,
+      .cb_data = &vectorize_cb_data,
       .robust_modes = (nir_variable_mode)0,
    };
    NIR_PASS(_, nir, nir_opt_load_store_vectorize, &options);
diff --git a/src/intel/vulkan/anv_shader_compile.c b/src/intel/vulkan/anv_shader_compile.c
index a4347a9c7d3..8af63fbec30 100644
--- a/src/intel/vulkan/anv_shader_compile.c
+++ b/src/intel/vulkan/anv_shader_compile.c
@@ -1181,12 +1181,15 @@ anv_shader_compile_bs(struct anv_device *device,
    nir_shader **resume_shaders = NULL;
    uint32_t num_resume_shaders = 0;
    if (nir->info.stage != MESA_SHADER_COMPUTE) {
+      struct brw_nir_vectorize_mem_cb_data vectorize_cb_data = {
+         .devinfo = devinfo,
+      };
       const nir_lower_shader_calls_options opts = {
          .address_format = nir_address_format_64bit_global,
          .stack_alignment = BRW_BTD_STACK_ALIGN,
          .localized_loads = true,
          .vectorizer_callback = brw_nir_should_vectorize_mem,
-         .vectorizer_data = NULL,
+         .vectorizer_data = &vectorize_cb_data,
          .should_remat_callback = should_remat_cb,
       };
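
As a quick illustration of the arithmetic behind the new check in
brw_nir_should_vectorize_mem (a standalone sketch, not Mesa code): the
address of the low load is only known modulo align_mul, i.e.
addr % align_mul == align_offset, and allocations are aligned to at least
the device's minimum memory alignment. Rounding a load up to more
components is therefore only safe when the padding bytes fit in the slack
before the next such boundary, since any boundary past the end of the real
data may coincide with a page boundary. In the sketch below, PAGE_SIZE
stands in for devinfo->mem_alignment, and the helper names (align_u32,
may_fault_past_end) are illustrative, not Mesa API.

   #include <stdbool.h>
   #include <stdint.h>

   /* Stand-in for devinfo->mem_alignment; assumed to be a power of two. */
   #define PAGE_SIZE 4096u

   /* Round v up to the next multiple of a (a must be a power of two,
    * which NIR guarantees for align_mul). */
   static inline uint32_t
   align_u32(uint32_t v, uint32_t a)
   {
      return (v + a - 1) & ~(a - 1);
   }

   /* True if widening a load from unaligned_bytes to aligned_bytes may
    * read past a boundary the allocation is only guaranteed to reach. */
   static bool
   may_fault_past_end(uint32_t align_mul, uint32_t align_offset,
                      uint32_t unaligned_bytes, uint32_t aligned_bytes)
   {
      uint32_t mul = align_mul < PAGE_SIZE ? align_mul : PAGE_SIZE;
      uint32_t end = align_offset + unaligned_bytes;   /* end of real data */
      uint32_t slack = align_u32(end, mul) - end;      /* room to boundary */
      return aligned_bytes - unaligned_bytes > slack;
   }

For example, rounding a 12-byte vec3 of 32-bit loads up to a 16-byte vec4
with align_mul == 16 and align_offset == 0 leaves 4 bytes of slack, so the
widened load is allowed; with align_offset == 4 the real data already ends
on a boundary (end == 16, slack == 0), so the extra 4 bytes could land on
an unmapped page and the vectorization is rejected.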