From d5cb38e457bfc73f1367e6d6a49ebece7b878bdc Mon Sep 17 00:00:00 2001
From: Emma Anholt <emma@anholt.net>
Date: Mon, 8 Sep 2025 14:37:11 -0700
Subject: [PATCH] ir3: Move the compute shader threadsize forcing earlier.

With this, we can look at real_wavesize while running NIR passes and know
if we have to be doubled because of the shader info coming in.

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/37245>
---
 src/freedreno/ir3/ir3.c     | 32 +-----------------------
 src/freedreno/ir3/ir3_nir.c | 49 +++++++++++++++++++++++++++++++++++++
 2 files changed, 50 insertions(+), 31 deletions(-)

diff --git a/src/freedreno/ir3/ir3.c b/src/freedreno/ir3/ir3.c
index 8dc7170e9fe..751f4dccf7a 100644
--- a/src/freedreno/ir3/ir3.c
+++ b/src/freedreno/ir3/ir3.c
@@ -220,37 +220,7 @@ ir3_should_double_threadsize(struct ir3_shader_variant *v, unsigned regs_count)
 
    switch (v->type) {
    case MESA_SHADER_KERNEL:
-   case MESA_SHADER_COMPUTE: {
-      unsigned threads_per_wg =
-         v->local_size[0] * v->local_size[1] * v->local_size[2];
-
-      /* If the workgroups fit in the base threadsize, then doubling would just
-       * leave us with an unused second half of each wave for no gain (The HW
-       * can't pack multiple workgroups into a wave, because the workgroups
-       * might make different barrier choices).
-       */
-      if (!v->local_size_variable) {
-         if (threads_per_wg <= compiler->threadsize_base)
-            return false;
-      }
-
-      /* For a5xx, if the workgroup size is greater than the maximum number
-       * of threads per core with 32 threads per wave (512) then we have to
-       * use the doubled threadsize because otherwise the workgroup wouldn't
-       * fit. For smaller workgroup sizes, we follow the blob and use the
-       * smaller threadsize.
-       *
-       * For a6xx, because threadsize_base is bumped to 64, we don't have to
-       * worry about the workgroup fitting.
-       */
-      if (compiler->gen < 6) {
-         return v->local_size_variable ||
-                threads_per_wg >
-                   compiler->threadsize_base * compiler->max_waves;
-      }
-
-   }
-      FALLTHROUGH;
+   case MESA_SHADER_COMPUTE:
    case MESA_SHADER_FRAGMENT: {
       /* One of the limits on maximum waves of the shader running in parallel is
        * the register count used in the shader compared to the hardware's
diff --git a/src/freedreno/ir3/ir3_nir.c b/src/freedreno/ir3/ir3_nir.c
index 814a3c42aac..fbcfd2bde6b 100644
--- a/src/freedreno/ir3/ir3_nir.c
+++ b/src/freedreno/ir3/ir3_nir.c
@@ -1113,6 +1113,53 @@ atomic_supported(const nir_instr * instr, const void * data)
    return nir_instr_as_intrinsic(instr)->def.bit_size != 64;
 }
 
+/**
+ * Filters the real_wavesize that was set based on API requirements, to an
+ * appopriate value given hardware limits and the NIR shader we get.
+ *
+ * The final wavesize in the SINGLE_OR_DOUBLE case will be determined later
+ * based on register allocation.
+ */
+static void
+ir3_nir_set_threadsize(struct ir3_shader_variant *v, const nir_shader *s)
+{
+   if (v->shader_options.real_wavesize != IR3_SINGLE_OR_DOUBLE)
+      return;
+
+   if (mesa_shader_stage_is_compute(v->type)) {
+      struct ir3_compiler *compiler = v->compiler;
+      const shader_info *info = &s->info;
+      unsigned threads_per_wg = info->workgroup_size[0] *
+                                info->workgroup_size[1] *
+                                info->workgroup_size[2];
+
+      /* If the workgroups fit in the base threadsize, then doubling would just
+       * leave us with an unused second half of each wave for no gain (the HW
+       * can't pack multiple workgroups into a wave, because the workgroups
+       * might make different barrier choices).
+       */
+      if (!info->workgroup_size_variable) {
+         if (threads_per_wg <= compiler->threadsize_base)
+            v->shader_options.real_wavesize = IR3_SINGLE_ONLY;
+      }
+
+      /* For a5xx, if the workgroup size is greater than the maximum number
+       * of threads per core with 32 threads per wave (512) then we have to
+       * use the doubled threadsize because otherwise the workgroup wouldn't
+       * fit. For smaller workgroup sizes, we follow the blob and use the
+       * smaller threadsize.
+       *
+       * For a6xx, because threadsize_base is bumped to 64, we don't have to
+       * worry about the workgroup fitting.
+       */
+      if (compiler->gen < 6 &&
+          (info->workgroup_size_variable ||
+           threads_per_wg > compiler->threadsize_base * compiler->max_waves)) {
+         v->shader_options.real_wavesize = IR3_DOUBLE_ONLY;
+      };
+   }
+}
+
 void
 ir3_nir_lower_variant(struct ir3_shader_variant *so,
                       const struct ir3_shader_nir_options *options,
@@ -1126,6 +1173,8 @@ ir3_nir_lower_variant(struct ir3_shader_variant *so,
       mesa_logi("----------------------");
    }
 
+   ir3_nir_set_threadsize(so, s);
+
    bool progress = false;
 
    progress |= OPT(s, nir_lower_io_to_scalar, nir_var_mem_ssbo,