From e5598166b01a727753286d8948d1d42f0303b50f Mon Sep 17 00:00:00 2001
From: Kenneth Graunke <kenneth@whitecape.org>
Date: Wed, 8 Apr 2026 01:47:32 -0700
Subject: [PATCH] brw: Have brw_nir_apply_key call brw_nir_lower_simd for all
 stages

brw_nir_apply_key typically knows the dispatch width (it's fixed for
geometry stages, and we clone the NIR for compute and mesh shaders).
For compute/mesh, brw_nir_lower_simd was the very next thing called.
For the others, if we know the width, there's no reason not to lower it.

Scratch lowering will start using load_simd_width_intel soon, so we
need it to work in all stages.

Reviewed-by: Alyssa Rosenzweig <alyssa.rosenzweig@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/40843>
---
 src/intel/compiler/brw/brw_compile_cs.cpp   |  2 --
 src/intel/compiler/brw/brw_compile_mesh.cpp |  4 ----
 src/intel/compiler/brw/brw_nir.c            | 15 +++++++++------
 src/intel/compiler/brw/brw_nir.h            |  2 +-
 4 files changed, 10 insertions(+), 13 deletions(-)

diff --git a/src/intel/compiler/brw/brw_compile_cs.cpp b/src/intel/compiler/brw/brw_compile_cs.cpp
index 7e295d5372c..612262bda97 100644
--- a/src/intel/compiler/brw/brw_compile_cs.cpp
+++ b/src/intel/compiler/brw/brw_compile_cs.cpp
@@ -190,8 +190,6 @@ brw_compile_cs(const struct brw_compiler *compiler,
       BRW_NIR_SNAPSHOT("first");
       brw_nir_apply_key(pt, &key->base, dispatch_width);
 
-      BRW_NIR_PASS(brw_nir_lower_simd, dispatch_width);
-
       brw_nir_optimize(pt);
       /* brw_nir_optimize undoes late lowerings. */
       BRW_NIR_PASS(nir_opt_algebraic_late);
diff --git a/src/intel/compiler/brw/brw_compile_mesh.cpp b/src/intel/compiler/brw/brw_compile_mesh.cpp
index 4e502d848e7..715a8ef6849 100644
--- a/src/intel/compiler/brw/brw_compile_mesh.cpp
+++ b/src/intel/compiler/brw/brw_compile_mesh.cpp
@@ -362,8 +362,6 @@ brw_compile_task(const struct brw_compiler *compiler,
       BRW_NIR_SNAPSHOT("first");
       brw_nir_apply_key(pt, &key->base, dispatch_width);
 
-      BRW_NIR_PASS(brw_nir_lower_simd, dispatch_width);
-
       brw_nir_optimize(pt);
       /* brw_nir_optimize undoes late lowerings. */
       BRW_NIR_PASS(nir_opt_algebraic_late);
@@ -1100,8 +1098,6 @@ brw_compile_mesh(const struct brw_compiler *compiler,
       /* Load uniforms can do a better job for constants, so fold before it. */
       BRW_NIR_PASS(nir_opt_constant_folding);
 
-      BRW_NIR_PASS(brw_nir_lower_simd, dispatch_width);
-
       brw_nir_optimize(pt);
       /* brw_nir_optimize undoes late lowerings. */
       BRW_NIR_PASS(nir_opt_algebraic_late);
diff --git a/src/intel/compiler/brw/brw_nir.c b/src/intel/compiler/brw/brw_nir.c
index 544235f1ba4..df7d31c98c0 100644
--- a/src/intel/compiler/brw/brw_nir.c
+++ b/src/intel/compiler/brw/brw_nir.c
@@ -3038,8 +3038,6 @@ brw_nir_apply_key(brw_pass_tracker *pt,
 
    pt->progress = false;
 
-   unsigned subgroup_size = get_subgroup_size(&nir->info, max_subgroup_size);
-
    /* VS/TCS/TES/GS always run at a fixed SIMD width, which is what our
     * max_subgroup_size parameter represents.  Compute/Mesh can run at
     * different sizes, but we clone the NIR for each SIMD width, and pass
@@ -3052,6 +3050,8 @@ brw_nir_apply_key(brw_pass_tracker *pt,
    if (nir->info.stage != MESA_SHADER_FRAGMENT) {
       nir->info.min_subgroup_size = max_subgroup_size;
       nir->info.max_subgroup_size = max_subgroup_size;
+
+      OPT(brw_nir_lower_simd);
    }
 
    const nir_lower_subgroups_options subgroups_options = {
@@ -3442,13 +3442,16 @@ filter_simd(const nir_instr *instr, UNUSED const void *options)
 static nir_def *
 lower_simd(nir_builder *b, nir_instr *instr, void *options)
 {
-   uintptr_t simd_width = (uintptr_t)options;
+   unsigned simd_width = b->shader->info.max_subgroup_size;
+   assert(b->shader->info.min_subgroup_size == simd_width);
 
    switch (nir_instr_as_intrinsic(instr)->intrinsic) {
    case nir_intrinsic_load_simd_width_intel:
       return nir_imm_int(b, simd_width);
 
    case nir_intrinsic_load_subgroup_id:
+      assert(mesa_shader_stage_uses_workgroup(b->shader->info.stage));
+
       /* If the whole workgroup fits in one thread, we can lower subgroup_id
        * to a constant zero.
        */
@@ -3464,10 +3467,10 @@ lower_simd(nir_builder *b, nir_instr *instr, void *options)
 }
 
 bool
-brw_nir_lower_simd(nir_shader *nir, unsigned dispatch_width)
+brw_nir_lower_simd(nir_shader *nir)
 {
-   return nir_shader_lower_instructions(nir, filter_simd, lower_simd,
-                                 (void *)(uintptr_t)dispatch_width);
+   return nir->info.min_subgroup_size == nir->info.max_subgroup_size &&
+          nir_shader_lower_instructions(nir, filter_simd, lower_simd, NULL);
 }
 
 nir_variable *
diff --git a/src/intel/compiler/brw/brw_nir.h b/src/intel/compiler/brw/brw_nir.h
index d5fb38cab7d..456a8a0a857 100644
--- a/src/intel/compiler/brw/brw_nir.h
+++ b/src/intel/compiler/brw/brw_nir.h
@@ -275,7 +275,7 @@ bool brw_nir_lower_mem_access_bit_sizes(nir_shader *shader,
                                         const struct
                                         intel_device_info *devinfo);
 
-bool brw_nir_lower_simd(nir_shader *nir, unsigned dispatch_width);
+bool brw_nir_lower_simd(nir_shader *nir);
 
 void brw_postprocess_nir_opts(struct brw_pass_tracker *pt);