From 8ceef4e3d4efec0ebc5f145cde53aeffc9b70b84 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Fri, 21 Nov 2025 15:29:37 -0500
Subject: [PATCH] radeonsi: lower compute system values later

Acked-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/38802>
---
 src/gallium/drivers/radeonsi/si_shader.c     | 45 ++++++++++++++++++++
 src/gallium/drivers/radeonsi/si_shader_nir.c | 45 --------------------
 2 files changed, 45 insertions(+), 45 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index 5818be70ab1..3dd87a08881 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -638,6 +638,51 @@ static void run_pre_link_optimization_passes(struct si_nir_shader_ctx *ctx)
    nir_shader *nir = ctx->nir;
    bool progress = false;
 
+   if (mesa_shader_stage_is_compute(nir->info.stage)) {
+      /* gl_LocalInvocationIndex must be derived from gl_LocalInvocationID.xyz to make it correct
+       * with quad derivatives. Using gl_SubgroupID for that (which is what we do by default) is
+       * incorrect with a non-linear thread order.
+       *
+       * On Gfx12, we always use a non-linear thread order if the workgroup X and Y size is
+       * divisible by 2.
+       */
+      NIR_PASS(progress, nir, nir_lower_compute_system_values,
+               &(nir_lower_compute_system_values_options){
+                  .lower_local_invocation_index =
+                     nir->info.derivative_group == DERIVATIVE_GROUP_QUADS ||
+                     (sel->screen->info.gfx_level >= GFX12 &&
+                      nir->info.derivative_group == DERIVATIVE_GROUP_NONE &&
+                      (nir->info.workgroup_size_variable ||
+                       (nir->info.workgroup_size[0] % 2 == 0 && nir->info.workgroup_size[1] % 2 == 0)))
+               });
+
+      /* Gfx12 supports this in hw. */
+      if (sel->screen->info.gfx_level < GFX12 &&
+          nir->info.derivative_group == DERIVATIVE_GROUP_QUADS) {
+         NIR_PASS(progress, nir, nir_opt_cse); /* CSE load_local_invocation_id */
+         NIR_PASS(progress, nir, nir_lower_compute_system_values,
+                  &(nir_lower_compute_system_values_options){
+                     .shuffle_local_ids_for_quad_derivatives = true,
+                  });
+      }
+   }
+
+   if (nir->info.stage == MESA_SHADER_MESH && !sel->screen->info.mesh_fast_launch_2) {
+      NIR_PASS(progress, nir, nir_lower_compute_system_values,
+               &(nir_lower_compute_system_values_options){
+                  /* Mesh shaders run as NGG which can implement local_invocation_index from
+                   * the wave ID in merged_wave_info, but they don't have local_invocation_ids
+                   * in FAST_LAUNCH=1 mode (the default on GFX10.3, deprecated on GFX11).
+                   */
+                  .lower_cs_local_id_to_index = true,
+                  /* Mesh shaders only have a 1D "vertex index" which we use
+                   * as "workgroup index" to emulate the 3D workgroup ID.
+                   */
+                  .lower_workgroup_id_to_index = true,
+                  .shortcut_1d_workgroup_id = true,
+               });
+   }
+
    /* nir_opt_clip_cull_const, si_nir_kill_outputs, and ac_nir_optimize_outputs require outputs
     * to be scalar.
     */
diff --git a/src/gallium/drivers/radeonsi/si_shader_nir.c b/src/gallium/drivers/radeonsi/si_shader_nir.c
index ee638394385..06d040b6277 100644
--- a/src/gallium/drivers/radeonsi/si_shader_nir.c
+++ b/src/gallium/drivers/radeonsi/si_shader_nir.c
@@ -328,51 +328,6 @@ static void si_lower_nir(struct si_screen *sscreen, struct nir_shader *nir)
       NIR_PASS(_, nir, ac_nir_lower_mesh_inputs_to_mem);
    }
 
-   if (mesa_shader_stage_is_compute(nir->info.stage)) {
-      /* gl_LocalInvocationIndex must be derived from gl_LocalInvocationID.xyz to make it correct
-       * with quad derivatives. Using gl_SubgroupID for that (which is what we do by default) is
-       * incorrect with a non-linear thread order.
-       *
-       * On Gfx12, we always use a non-linear thread order if the workgroup X and Y size is
-       * divisible by 2.
-       */
-      NIR_PASS(_, nir, nir_lower_compute_system_values,
-               &(nir_lower_compute_system_values_options){
-                  .lower_local_invocation_index =
-                     nir->info.derivative_group == DERIVATIVE_GROUP_QUADS ||
-                     (sscreen->info.gfx_level >= GFX12 &&
-                      nir->info.derivative_group == DERIVATIVE_GROUP_NONE &&
-                      (nir->info.workgroup_size_variable ||
-                       (nir->info.workgroup_size[0] % 2 == 0 && nir->info.workgroup_size[1] % 2 == 0)))
-               });
-
-      /* Gfx12 supports this in hw. */
-      if (sscreen->info.gfx_level < GFX12 &&
-          nir->info.derivative_group == DERIVATIVE_GROUP_QUADS) {
-         NIR_PASS(_, nir, nir_opt_cse); /* CSE load_local_invocation_id */
-         NIR_PASS(_, nir, nir_lower_compute_system_values,
-                  &(nir_lower_compute_system_values_options){
-                     .shuffle_local_ids_for_quad_derivatives = true,
-                  });
-      }
-   }
-
-   if (nir->info.stage == MESA_SHADER_MESH && !sscreen->info.mesh_fast_launch_2) {
-      NIR_PASS(_, nir, nir_lower_compute_system_values,
-               &(nir_lower_compute_system_values_options){
-                  /* Mesh shaders run as NGG which can implement local_invocation_index from
-                   * the wave ID in merged_wave_info, but they don't have local_invocation_ids
-                   * in FAST_LAUNCH=1 mode (the default on GFX10.3, deprecated on GFX11).
-                   */
-                  .lower_cs_local_id_to_index = true,
-                  /* Mesh shaders only have a 1D "vertex index" which we use
-                   * as "workgroup index" to emulate the 3D workgroup ID.
-                   */
-                  .lower_workgroup_id_to_index = true,
-                  .shortcut_1d_workgroup_id = true,
-               });
-   }
-
    si_nir_opts(sscreen, nir, true);
    /* Run late optimizations to fuse ffma and eliminate 16-bit conversions. */
    si_nir_late_opts(nir);