amd: optimize atomics before lowering intrinsics

ac_nir_lower_intrinsics_to_args will lower most system values.

I have to keep the divergence analysis in ACO, otherwise it goes haywire.

Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/32782>
Marek Olšák 2024-12-29 21:27:21 -05:00 committed by Marge Bot
parent d30c55abf1
commit 7fbca998b1
5 changed files with 28 additions and 7 deletions
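
For orientation, the ordering the commit establishes in both drivers looks roughly like this. This is a condensed sketch, not code from the commit: run_postlink_passes is a made-up name and the surrounding passes are reduced to the ones that matter here.

static void
run_postlink_passes(nir_shader *nir, bool use_aco)
{
   bool progress = false;

   /* ...resources have already been lowered to descriptor loads... */

   /* New: optimize uniform/shared atomics while 64-bit atomics and
    * system-value intrinsics are still in their original form.
    * LLVM-based paths skip the call. */
   if (use_aco)
      progress |= ac_nir_optimize_uniform_atomics(nir);

   /* int64 lowering runs afterwards, since the optimization can leave
    * new 64-bit ops behind (the removed ACO code below re-ran it for
    * the same reason). */
   NIR_PASS(progress, nir, nir_lower_int64);

   /* ...then ac_nir_lower_intrinsics_to_args and the rest of the pipeline. */
}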

@@ -2208,3 +2208,15 @@ ac_nir_lower_mem_access_bit_sizes(nir_shader *shader, enum amd_gfx_level gfx_lev
    };
    return nir_lower_mem_access_bit_sizes(shader, &lower_mem_access_options);
 }
+
+bool
+ac_nir_optimize_uniform_atomics(nir_shader *nir)
+{
+   bool progress = false;
+
+   NIR_PASS(progress, nir, ac_nir_opt_shared_append);
+   nir_divergence_analysis(nir);
+   NIR_PASS(progress, nir, nir_opt_uniform_atomics, false);
+
+   return progress;
+}
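
What nir_opt_uniform_atomics does, and why nir_divergence_analysis has to run right before it, sketched in GLSL-style shader code. This is an illustration of the idea only, not the pass's literal output; counter and value are placeholders.

/* Before: every invocation performs its own atomic on the same (uniform)
 * address, so the hardware serializes them. */
uint old = atomicAdd(counter, value);

/* After, conceptually (replaces the line above): one atomic per subgroup.
 * Knowing that counter is uniform while value may diverge is exactly what
 * the divergence analysis provides. */
uint scan = subgroupExclusiveAdd(value);   /* this invocation's offset    */
uint sum  = subgroupAdd(value);            /* combined value of the wave  */
uint base;
if (subgroupElect())
   base = atomicAdd(counter, sum);         /* single atomic from one lane */
base = subgroupBroadcastFirst(base);
uint old = base + scan;                    /* original per-lane result    */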

@@ -344,6 +344,9 @@ ac_nir_flag_smem_for_loads(nir_shader *shader, enum amd_gfx_level gfx_level, boo
 bool
 ac_nir_lower_mem_access_bit_sizes(nir_shader *shader, enum amd_gfx_level gfx_level, bool use_llvm);
 
+bool
+ac_nir_optimize_uniform_atomics(nir_shader *nir);
+
 #ifdef __cplusplus
 }
 #endif

@@ -364,16 +364,10 @@ init_context(isel_context* ctx, nir_shader* shader)
    ctx->ub_config.max_workgroup_size[1] = 1024;
    ctx->ub_config.max_workgroup_size[2] = 1024;
 
-   ac_nir_opt_shared_append(shader);
    uint32_t options =
       shader->options->divergence_analysis_options | nir_divergence_ignore_undef_if_phi_srcs;
    nir_divergence_analysis_impl(impl, (nir_divergence_options)options);
    shader->info.divergence_analysis_run = true;
 
-   if (nir_opt_uniform_atomics(shader, false)) {
-      nir_lower_int64(shader);
-      nir_divergence_analysis_impl(impl, (nir_divergence_options)options);
-   }
-
    apply_nuw_to_offsets(ctx, impl);
    ac_nir_flag_smem_for_loads(shader, ctx->program->gfx_level, false, true);

@@ -505,6 +505,12 @@ radv_postprocess_nir(struct radv_device *device, const struct radv_graphics_stat
       NIR_PASS(_, stage->nir, nir_clear_shared_memory, shared_size, chunk_size);
    }
 
+   /* This must be after lowering resources to descriptor loads and before lowering intrinsics
+    * to args and lowering int64.
+    */
+   if (!radv_use_llvm_for_stage(pdev, stage->stage))
+      ac_nir_optimize_uniform_atomics(stage->nir);
+
    NIR_PASS(_, stage->nir, nir_lower_int64);
 
    NIR_PASS(_, stage->nir, nir_opt_idiv_const, 8);

@@ -2582,7 +2582,6 @@ static struct nir_shader *si_get_nir_shader(struct si_shader *shader, struct si_
             });
 
    NIR_PASS(progress, nir, nir_lower_pack);
-   NIR_PASS(progress, nir, nir_lower_int64);
    NIR_PASS(progress, nir, nir_opt_idiv_const, 8);
    NIR_PASS(progress, nir, nir_lower_idiv,
             &(nir_lower_idiv_options){
@@ -2635,6 +2634,13 @@ static struct nir_shader *si_get_nir_shader(struct si_shader *shader, struct si_
    /* This must be after vectorization because it causes bindings_different_restrict() to fail. */
    NIR_PASS(progress, nir, si_nir_lower_resource, shader, args);
 
+   /* This must be after lowering resources to descriptor loads and before lowering intrinsics
+    * to args and lowering int64.
+    */
+   if (nir->info.use_aco_amd)
+      progress |= ac_nir_optimize_uniform_atomics(nir);
+
+   NIR_PASS(progress, nir, nir_lower_int64);
    NIR_PASS(progress, nir, si_nir_lower_abi, shader, args);
    NIR_PASS(progress, nir, ac_nir_lower_intrinsics_to_args, sel->screen->info.gfx_level,
             sel->screen->info.has_ls_vgpr_init_bug,
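
One more note on why the new call has to come before nir_lower_int64, here and in RADV: the ACO code removed above re-ran nir_lower_int64 whenever nir_opt_uniform_atomics made progress, because rewriting a uniform 64-bit atomic leaves fresh 64-bit ALU work behind. In the same illustrative GLSL-style terms as the sketch after the first hunk (counter64 and value64 are placeholders):

/* A uniform 64-bit atomic ... */
uint64_t old = atomicAdd(counter64, value64);

/* ...is rewritten around a 64-bit subgroup reduction: */
uint64_t sum = subgroupAdd(value64);   /* new 64-bit adds appear here */

/* nir_lower_int64, running afterwards, splits that new 64-bit math into
 * 32-bit operations the hardware can execute. */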