freedreno, tu, ir3: Enable tiled workgroup item dispatch on a7xx

There is a 1.6% improvement in the Sacha Willems computeshader demo.

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30758>
Authored by Connor Abbott on 2024-08-20 13:01:09 -04:00; committed by Marge Bot.
parent 58ed1854c4
commit 70934f3015
6 changed files with 45 additions and 13 deletions

View file

@ -225,9 +225,17 @@ cs_program_emit(struct fd_ringbuffer *ring, struct kernel *kernel)
A6XX_SP_CS_CNTL_0_WGSIZECONSTID(regid(63, 0)) |
A6XX_SP_CS_CNTL_0_WGOFFSETCONSTID(regid(63, 0)) |
A6XX_SP_CS_CNTL_0_LOCALIDREGID(local_invocation_id));
OUT_REG(ring,
SP_CS_CNTL_1(CHIP, .linearlocalidregid = regid(63, 0),
.threadsize = thrsz));
if (CHIP == A7XX) {
/* TODO allow the shader to control the tiling */
OUT_REG(ring,
SP_CS_CNTL_1(A7XX, .linearlocalidregid = regid(63, 0),
.threadsize = thrsz,
.workitemrastorder = WORKITEMRASTORDER_LINEAR));
} else {
OUT_REG(ring,
SP_CS_CNTL_1(CHIP, .linearlocalidregid = regid(63, 0),
.threadsize = thrsz));
}
}
OUT_PKT4(ring, REG_A6XX_SP_CS_OBJ_START, 2);

View file

@ -607,9 +607,23 @@ lower_subgroup_id_filter(const nir_instr *instr, const void *unused)
}
static nir_def *
lower_subgroup_id(nir_builder *b, nir_instr *instr, void *unused)
lower_subgroup_id(nir_builder *b, nir_instr *instr, void *_shader)
{
(void)unused;
struct ir3_shader *shader = _shader;
/* Vulkan allows implementations to tile workgroup invocations even when
* subgroup operations are involved, which is implied by this Note:
*
* "There is no direct relationship between SubgroupLocalInvocationId and
* LocalInvocationId or LocalInvocationIndex."
*
* However there is no way to get SubgroupId directly, so we have to use
* LocalInvocationIndex here. This means that whenever we do this lowering we
* have to force linear dispatch to make sure that the relation between
* SubgroupId/SubgroupLocalInvocationId and LocalInvocationIndex is what we
* expect.
*/
shader->cs.force_linear_dispatch = true;
nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
if (intr->intrinsic == nir_intrinsic_load_subgroup_invocation) {
@ -638,10 +652,10 @@ lower_subgroup_id(nir_builder *b, nir_instr *instr, void *unused)
}
static bool
ir3_nir_lower_subgroup_id_cs(nir_shader *shader)
ir3_nir_lower_subgroup_id_cs(nir_shader *nir, struct ir3_shader *shader)
{
return nir_shader_lower_instructions(shader, lower_subgroup_id_filter,
lower_subgroup_id, NULL);
return nir_shader_lower_instructions(nir, lower_subgroup_id_filter,
lower_subgroup_id, shader);
}
/**
@ -764,7 +778,7 @@ ir3_nir_post_finalize(struct ir3_shader *shader)
if ((s->info.stage == MESA_SHADER_COMPUTE) ||
(s->info.stage == MESA_SHADER_KERNEL)) {
bool progress = false;
NIR_PASS(progress, s, ir3_nir_lower_subgroup_id_cs);
NIR_PASS(progress, s, ir3_nir_lower_subgroup_id_cs, shader);
/* ir3_nir_lower_subgroup_id_cs creates extra compute intrinsics which
* we need to lower again.

View file

@ -410,6 +410,11 @@ create_variant(struct ir3_shader *shader, const struct ir3_shader_key *key,
shader->nir_finalized = true;
}
if (v->type == MESA_SHADER_COMPUTE ||
v->type == MESA_SHADER_KERNEL) {
v->cs.force_linear_dispatch = shader->cs.force_linear_dispatch;
}
if (!compile_variant(shader, v))
goto fail;

View file

@ -837,6 +837,7 @@ struct ir3_shader_variant {
struct {
unsigned req_input_mem;
unsigned req_local_mem;
bool force_linear_dispatch;
} cs;
};
@ -909,6 +910,7 @@ struct ir3_shader {
struct {
unsigned req_input_mem; /* in dwords */
unsigned req_local_mem;
bool force_linear_dispatch;
} cs;
/* For vertex shaders: */
struct {

View file

@ -1452,8 +1452,10 @@ tu6_emit_cs_config(struct tu_cs *cs,
SP_CS_CNTL_1(CHIP,
.linearlocalidregid = regid(63, 0),
.threadsize = thrsz_cs,
/* A7XX TODO: enable WORKITEMRASTORDER_TILED when we don't use subgroup ops. */
.workitemrastorder = WORKITEMRASTORDER_LINEAR, ));
.workitemrastorder =
v->cs.force_linear_dispatch ?
WORKITEMRASTORDER_LINEAR :
WORKITEMRASTORDER_TILED, ));
tu_cs_emit_regs(
cs, A7XX_HLSQ_CS_LOCAL_SIZE(.localsizex = v->local_size[0] - 1,

View file

@ -136,8 +136,9 @@ cs_program_emit(struct fd_context *ctx, struct fd_ringbuffer *ring,
CHIP,
.linearlocalidregid = INVALID_REG,
.threadsize = thrsz_cs,
/* A7XX TODO: enable WORKITEMRASTORDER_TILED when we don't use subgroup ops. */
.workitemrastorder = WORKITEMRASTORDER_LINEAR,
.workitemrastorder =
v->cs.force_linear_dispatch ? WORKITEMRASTORDER_LINEAR
: WORKITEMRASTORDER_TILED,
)
);
OUT_REG(ring,