ir3: Implement round-robin workaround

On later a6xx and a7xx, round-robin does not work properly when there are more than 8 active waves from the same dispatch in the same uSP. We have to clamp the register usage to a minimum to guarantee there aren't more waves. There is a problem for very large workgroups, which will have to be solved the same way as the problem with deep control flow, through implementing ReuseGPRMode. Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/41562>
2026-06-10 01:18:18 +02:00 · 2026-05-13 12:32:04 -04:00 · 2026-05-13 12:32:04 -04:00 · 31db17f653
commit 31db17f653
parent 25f7765d21
2 changed files with 45 additions and 0 deletions
--- a/src/freedreno/ir3/ir3.c
+++ b/src/freedreno/ir3/ir3.c
@ -298,6 +298,20 @@ ir3_get_reg_independent_max_waves(struct ir3_shader_variant *v,
            v->name);
         exit(1);
      }
+
+      /* Due to round_robin_errata we may be unable to support forward progress
+       * guarantees between waves if there are more than 8 waves active.
+       */
+      if (v->cs.round_robin_mode && compiler->info->props.round_robin_errata) {
+         if (waves_per_wg > 8 && v->has_barrier) {
+            mesa_loge(
+               "Compute shader (%s) requires forward progress but uses more "
+               "than 8 waves.",
+               v->name);
+            exit(1);
+         }
+         max_waves = MIN2(max_waves, 8);
+      }
   }

   return max_waves;
@ -315,6 +329,29 @@ ir3_get_reg_dependent_max_waves(const struct ir3_compiler *compiler,
                    : compiler->info->max_waves;
 }

+/* Get the minimum number of registers a shader must declare, even if it doesn't
+ * actually use as many.
+ */
+unsigned
+ir3_get_min_reg_count(const struct ir3_shader_variant *v, bool double_threadsize)
+{
+   if (!ir3_shader_compute(v) || !v->cs.round_robin_mode ||
+       !v->compiler->info->props.round_robin_errata)
+      return 0;
+
+   /* Limit occupancy to work around the round-robin errata. */
+   unsigned max_waves = 8;
+
+   /* We want to find the smallest register size where no more than
+    * (max_waves / wave_granularity) waves fit in reg_size_vec4. Calculate the
+    * maximum register size where (max_waves / wave_granularity + 1) waves fit,
+    * then add 1.
+    */
+   return (v->compiler->info->props.reg_size_vec4 /
+      ((max_waves / v->compiler->info->wave_granularity) *
+       (double_threadsize ? 2 : 1) + 1)) + 1;
+}
+
 void
 ir3_collect_info(struct ir3_shader_variant *v)
 {
@ -555,6 +592,11 @@ ir3_collect_info(struct ir3_shader_variant *v)

   info->double_threadsize = ir3_should_double_threadsize(v, regs_count);

+   /* Limit occupancy if necessary by increasing max_reg. */
+   unsigned min_reg_count = ir3_get_min_reg_count(v, info->double_threadsize);
+   if (min_reg_count > 0)
+      info->max_reg = MAX2(info->max_reg, min_reg_count - 1);
+
   /* TODO this is different for earlier gens, but earlier gens don't use this */
   info->subgroup_size = v->info.double_threadsize ? 128 : 64;

--- a/src/freedreno/ir3/ir3.h
+++ b/src/freedreno/ir3/ir3.h
@ -885,6 +885,9 @@ unsigned ir3_get_reg_dependent_max_waves(const struct ir3_compiler *compiler,
 unsigned ir3_get_reg_independent_max_waves(struct ir3_shader_variant *v,
                                           bool double_threadsize);

+unsigned ir3_get_min_reg_count(const struct ir3_shader_variant *v,
+                               bool double_threadsize);
+
 bool ir3_should_double_threadsize(struct ir3_shader_variant *v,
                                  unsigned regs_count);