brw: Avoid vectorizing loads in NIR if it could extend into a different page
Took inspiration from RADV to make nir_opt_load_store_vectorize robust against page faults: check align_offset and align_mul to see whether any extra components could overlap into a different page.

Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/40149>
parent 3ac6233655
commit 4120ae4963
5 changed files with 51 additions and 13 deletions
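The core idea of the change: when the vectorizer widens a load, the rounded-up size can include padding bytes the shader never asked for, and reading those is only safe if they cannot land on a different (possibly unmapped) page. The guaranteed alignment expressed by align_mul/align_offset bounds how much slack exists before the next aligned boundary, so the callback rejects the widening when the padding exceeds that slack. Below is a minimal standalone sketch of that test, not the Mesa implementation; PAGE_SIZE, the helper names, and the byte-based parameters are assumptions for illustration only.

/* Sketch of the page-boundary test added by this commit (assumptions:
 * PAGE_SIZE, helper names, byte-based interface). Not the Mesa code.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE 4096u /* assumed page size */

static uint32_t align_up(uint32_t value, uint32_t alignment)
{
   return (value + alignment - 1) & ~(alignment - 1);
}

/* Returns true if growing a load from unaligned_bytes to aligned_bytes,
 * starting at an offset known only modulo align_mul (offset % align_mul ==
 * align_offset), can never cross into the next page.
 */
static bool widening_stays_in_page(uint32_t align_mul, uint32_t align_offset,
                                   uint32_t unaligned_bytes,
                                   uint32_t aligned_bytes)
{
   /* The guaranteed alignment can't be stronger than the page itself. */
   uint32_t mul = align_mul < PAGE_SIZE ? align_mul : PAGE_SIZE;
   /* Worst-case end of the bytes the shader actually asked for. */
   uint32_t end = align_offset + unaligned_bytes;
   /* Bytes of slack left before the next mul-aligned boundary. */
   uint32_t slack = align_up(end, mul) - end;
   /* The extra bytes added by widening must fit inside that slack. */
   return aligned_bytes - unaligned_bytes <= slack;
}

int main(void)
{
   /* Example: a 12-byte load at offset 4084 in a 4096-byte-aligned buffer.
    * Widening it to 16 bytes would read bytes 4096..4099, which land on the
    * next page, so the check rejects the widening.
    */
   printf("%s\n", widening_stays_in_page(4096, 4084, 12, 16) ? "ok" : "rejected");
   return 0;
}

Compiled as-is, the example prints "rejected": a 12-byte load ending exactly at a 4096-byte boundary leaves no slack, so widening it to 16 bytes would touch the next page.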
@@ -403,9 +403,13 @@ iris_ensure_indirect_generation_shader(struct iris_batch *batch)
    /* Do vectorizing here. For some reason when trying to do it in the back
     * this just isn't working.
     */
+   struct brw_nir_vectorize_mem_cb_data cb_data = {
+      .devinfo = screen->devinfo,
+   };
    nir_load_store_vectorize_options options = {
       .modes = nir_var_mem_ubo | nir_var_mem_ssbo | nir_var_mem_global,
       .callback = brw_nir_should_vectorize_mem,
+      .cb_data = &cb_data,
       .robust_modes = (nir_variable_mode)0,
    };
    NIR_PASS(_, nir, nir_opt_load_store_vectorize, &options);

@@ -2396,8 +2396,10 @@ brw_nir_should_vectorize_mem(unsigned align_mul, unsigned align_offset,
                              int64_t hole_size,
                              nir_intrinsic_instr *low,
                              nir_intrinsic_instr *high,
-                             void *data)
+                             void *_data)
 {
+   struct brw_nir_vectorize_mem_cb_data *data = _data;
+
    /* Don't combine things to generate 64-bit loads/stores. We have to split
     * those back into 32-bit ones anyway and UBO loads aren't split in NIR so
     * we don't want to make a mess for the back-end.

@@ -2405,12 +2407,21 @@ brw_nir_should_vectorize_mem(unsigned align_mul, unsigned align_offset,
    if (bit_size > 32)
       return false;
 
-   if (low->intrinsic == nir_intrinsic_load_ubo_uniform_block_intel ||
-       low->intrinsic == nir_intrinsic_load_ssbo_uniform_block_intel ||
-       low->intrinsic == nir_intrinsic_load_shared_uniform_block_intel ||
-       low->intrinsic == nir_intrinsic_load_global_constant_uniform_block_intel ||
-       (low->intrinsic == nir_intrinsic_load_shader_indirect_data_intel &&
-        low->src[0].ssa == high->src[0].ssa)) {
+   bool convergent_block_load =
+      low->intrinsic == nir_intrinsic_load_ubo_uniform_block_intel ||
+      low->intrinsic == nir_intrinsic_load_ssbo_uniform_block_intel ||
+      low->intrinsic == nir_intrinsic_load_shared_uniform_block_intel ||
+      low->intrinsic == nir_intrinsic_load_global_constant_uniform_block_intel ||
+      (low->intrinsic == nir_intrinsic_load_shader_indirect_data_intel &&
+       low->src[0].ssa == high->src[0].ssa);
+
+   unsigned unaligned_size = num_components * bit_size;
+   unsigned aligned_size = convergent_block_load ?
+      brw_uniform_block_size(data->devinfo, num_components) * bit_size :
+      nir_round_up_components(num_components) * bit_size;
+   hole_size += (aligned_size - unaligned_size) / 8;
+
+   if (convergent_block_load) {
       if (num_components > 4) {
          if (bit_size != 32)
            return false;

@@ -2432,12 +2443,19 @@ brw_nir_should_vectorize_mem(unsigned align_mul, unsigned align_offset,
          return false;
       }
 
-   const uint32_t align = nir_combined_align(align_mul, align_offset);
-
-   if (align < bit_size / 8)
+   if (nir_combined_align(align_mul, align_offset) < bit_size / 8)
       return false;
 
+   if (low->intrinsic == nir_intrinsic_load_global ||
+       low->intrinsic == nir_intrinsic_load_global_constant ||
+       low->intrinsic == nir_intrinsic_load_global_constant_uniform_block_intel) {
+      /* Only increase the size of loads if doing so doesn't extend into a new page. */
+      uint32_t mul = MIN2(align_mul, data->devinfo->mem_alignment);
+      unsigned end = align_offset + unaligned_size / 8;
+      if ((aligned_size - unaligned_size) / 8 > (align(end, mul) - end))
+         return false;
+   }
 
    return true;
 }
 

@@ -2532,7 +2550,7 @@ get_mem_access_size_align(nir_intrinsic_op intrin, uint8_t bytes,
       /* Choose a byte, word, or dword */
       bytes = MIN2(bytes, 4);
       if (bytes == 3)
-         bytes = is_load ? 4 : 2;
+         bytes = (is_load && align >= 4) ? 4 : 2;
 
       /* Ensure we split into aligned pieces. We cannot blindly turn an i8vec4
       * into i32 due to the alignment requirements. It might be possible to

@@ -2642,12 +2660,16 @@ brw_vectorize_lower_mem_access(brw_pass_tracker *pt)
 {
    const struct intel_device_info *devinfo = pt->compiler->devinfo;
 
+   struct brw_nir_vectorize_mem_cb_data vectorize_cb_data = {
+      .devinfo = devinfo,
+   };
    nir_load_store_vectorize_options options = {
       .modes = nir_var_mem_ubo | nir_var_mem_ssbo |
                nir_var_mem_global | nir_var_mem_shared |
                nir_var_mem_task_payload,
       .round_up_components = lsc_urb_round_up_components,
       .callback = brw_nir_should_vectorize_mem,
+      .cb_data = &vectorize_cb_data,
       .robust_modes = (nir_variable_mode)0,
    };
 

@@ -2681,6 +2703,7 @@ brw_vectorize_lower_mem_access(brw_pass_tracker *pt)
    nir_load_store_vectorize_options ubo_options = {
       .modes = nir_var_mem_ubo,
       .callback = brw_nir_should_vectorize_mem,
+      .cb_data = &vectorize_cb_data,
       .robust_modes = options.robust_modes & nir_var_mem_ubo,
    };
 

@@ -318,6 +318,10 @@ enum brw_reg_type brw_type_for_base_type(enum glsl_base_type base_type);
 enum brw_reg_type brw_type_for_nir_type(const struct intel_device_info *devinfo,
                                         nir_alu_type type);
 
+struct brw_nir_vectorize_mem_cb_data {
+   const struct intel_device_info *devinfo;
+};
+
 bool brw_nir_should_vectorize_mem(unsigned align_mul, unsigned align_offset,
                                   unsigned bit_size,
                                   unsigned num_components,

@@ -145,9 +145,13 @@ compile_shader(struct anv_device *device,
    /* Do vectorizing here. For some reason when trying to do it in the back
     * this just isn't working.
     */
+   struct brw_nir_vectorize_mem_cb_data vectorize_cb_data = {
+      .devinfo = device->info,
+   };
    nir_load_store_vectorize_options options = {
       .modes = nir_var_mem_ubo | nir_var_mem_ssbo | nir_var_mem_global,
       .callback = brw_nir_should_vectorize_mem,
+      .cb_data = &vectorize_cb_data,
       .robust_modes = (nir_variable_mode)0,
    };
    NIR_PASS(_, nir, nir_opt_load_store_vectorize, &options);

@@ -1181,12 +1181,15 @@ anv_shader_compile_bs(struct anv_device *device,
    nir_shader **resume_shaders = NULL;
    uint32_t num_resume_shaders = 0;
    if (nir->info.stage != MESA_SHADER_COMPUTE) {
+      struct brw_nir_vectorize_mem_cb_data vectorize_cb_data = {
+         .devinfo = devinfo,
+      };
       const nir_lower_shader_calls_options opts = {
          .address_format = nir_address_format_64bit_global,
          .stack_alignment = BRW_BTD_STACK_ALIGN,
          .localized_loads = true,
          .vectorizer_callback = brw_nir_should_vectorize_mem,
-         .vectorizer_data = NULL,
+         .vectorizer_data = &vectorize_cb_data,
          .should_remat_callback = should_remat_cb,
       };