From 8ceef4e3d4efec0ebc5f145cde53aeffc9b70b84 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Fri, 21 Nov 2025 15:29:37 -0500 Subject: [PATCH] radeonsi: lower compute system values later Acked-by: Pierre-Eric Pelloux-Prayer Part-of: --- src/gallium/drivers/radeonsi/si_shader.c | 45 ++++++++++++++++++++ src/gallium/drivers/radeonsi/si_shader_nir.c | 45 -------------------- 2 files changed, 45 insertions(+), 45 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c index 5818be70ab1..3dd87a08881 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ b/src/gallium/drivers/radeonsi/si_shader.c @@ -638,6 +638,51 @@ static void run_pre_link_optimization_passes(struct si_nir_shader_ctx *ctx) nir_shader *nir = ctx->nir; bool progress = false; + if (mesa_shader_stage_is_compute(nir->info.stage)) { + /* gl_LocalInvocationIndex must be derived from gl_LocalInvocationID.xyz to make it correct + * with quad derivatives. Using gl_SubgroupID for that (which is what we do by default) is + * incorrect with a non-linear thread order. + * + * On Gfx12, we always use a non-linear thread order if the workgroup X and Y size is + * divisible by 2. + */ + NIR_PASS(progress, nir, nir_lower_compute_system_values, + &(nir_lower_compute_system_values_options){ + .lower_local_invocation_index = + nir->info.derivative_group == DERIVATIVE_GROUP_QUADS || + (sel->screen->info.gfx_level >= GFX12 && + nir->info.derivative_group == DERIVATIVE_GROUP_NONE && + (nir->info.workgroup_size_variable || + (nir->info.workgroup_size[0] % 2 == 0 && nir->info.workgroup_size[1] % 2 == 0))) + }); + + /* Gfx12 supports this in hw. */ + if (sel->screen->info.gfx_level < GFX12 && + nir->info.derivative_group == DERIVATIVE_GROUP_QUADS) { + NIR_PASS(progress, nir, nir_opt_cse); /* CSE load_local_invocation_id */ + NIR_PASS(progress, nir, nir_lower_compute_system_values, + &(nir_lower_compute_system_values_options){ + .shuffle_local_ids_for_quad_derivatives = true, + }); + } + } + + if (nir->info.stage == MESA_SHADER_MESH && !sel->screen->info.mesh_fast_launch_2) { + NIR_PASS(progress, nir, nir_lower_compute_system_values, + &(nir_lower_compute_system_values_options){ + /* Mesh shaders run as NGG which can implement local_invocation_index from + * the wave ID in merged_wave_info, but they don't have local_invocation_ids + * in FAST_LAUNCH=1 mode (the default on GFX10.3, deprecated on GFX11). + */ + .lower_cs_local_id_to_index = true, + /* Mesh shaders only have a 1D "vertex index" which we use + * as "workgroup index" to emulate the 3D workgroup ID. + */ + .lower_workgroup_id_to_index = true, + .shortcut_1d_workgroup_id = true, + }); + } + /* nir_opt_clip_cull_const, si_nir_kill_outputs, and ac_nir_optimize_outputs require outputs * to be scalar. */ diff --git a/src/gallium/drivers/radeonsi/si_shader_nir.c b/src/gallium/drivers/radeonsi/si_shader_nir.c index ee638394385..06d040b6277 100644 --- a/src/gallium/drivers/radeonsi/si_shader_nir.c +++ b/src/gallium/drivers/radeonsi/si_shader_nir.c @@ -328,51 +328,6 @@ static void si_lower_nir(struct si_screen *sscreen, struct nir_shader *nir) NIR_PASS(_, nir, ac_nir_lower_mesh_inputs_to_mem); } - if (mesa_shader_stage_is_compute(nir->info.stage)) { - /* gl_LocalInvocationIndex must be derived from gl_LocalInvocationID.xyz to make it correct - * with quad derivatives. Using gl_SubgroupID for that (which is what we do by default) is - * incorrect with a non-linear thread order. - * - * On Gfx12, we always use a non-linear thread order if the workgroup X and Y size is - * divisible by 2. - */ - NIR_PASS(_, nir, nir_lower_compute_system_values, - &(nir_lower_compute_system_values_options){ - .lower_local_invocation_index = - nir->info.derivative_group == DERIVATIVE_GROUP_QUADS || - (sscreen->info.gfx_level >= GFX12 && - nir->info.derivative_group == DERIVATIVE_GROUP_NONE && - (nir->info.workgroup_size_variable || - (nir->info.workgroup_size[0] % 2 == 0 && nir->info.workgroup_size[1] % 2 == 0))) - }); - - /* Gfx12 supports this in hw. */ - if (sscreen->info.gfx_level < GFX12 && - nir->info.derivative_group == DERIVATIVE_GROUP_QUADS) { - NIR_PASS(_, nir, nir_opt_cse); /* CSE load_local_invocation_id */ - NIR_PASS(_, nir, nir_lower_compute_system_values, - &(nir_lower_compute_system_values_options){ - .shuffle_local_ids_for_quad_derivatives = true, - }); - } - } - - if (nir->info.stage == MESA_SHADER_MESH && !sscreen->info.mesh_fast_launch_2) { - NIR_PASS(_, nir, nir_lower_compute_system_values, - &(nir_lower_compute_system_values_options){ - /* Mesh shaders run as NGG which can implement local_invocation_index from - * the wave ID in merged_wave_info, but they don't have local_invocation_ids - * in FAST_LAUNCH=1 mode (the default on GFX10.3, deprecated on GFX11). - */ - .lower_cs_local_id_to_index = true, - /* Mesh shaders only have a 1D "vertex index" which we use - * as "workgroup index" to emulate the 3D workgroup ID. - */ - .lower_workgroup_id_to_index = true, - .shortcut_1d_workgroup_id = true, - }); - } - si_nir_opts(sscreen, nir, true); /* Run late optimizations to fuse ffma and eliminate 16-bit conversions. */ si_nir_late_opts(nir);