From dd6ac7055f6d580e8858e30386361b3faf261df8 Mon Sep 17 00:00:00 2001 From: Job Noorman Date: Wed, 16 Oct 2024 20:25:32 +0200 Subject: [PATCH] ir3: optimize subgroup shuffles using shfl One quirk of the shfl instruction is that it only works with dynamically uniform indices. This commit adds a pass to lower shuffles to the ir3-specific ones using a loop that iterates all distinct indices one by one. This is based on the blob's sequence. Signed-off-by: Job Noorman Part-of: --- src/freedreno/ir3/ir3_lower_subgroups.c | 131 ++++++++++++++++++++++++ src/freedreno/ir3/ir3_nir.c | 5 +- src/freedreno/ir3/ir3_nir.h | 1 + 3 files changed, 135 insertions(+), 2 deletions(-) diff --git a/src/freedreno/ir3/ir3_lower_subgroups.c b/src/freedreno/ir3/ir3_lower_subgroups.c index 2c0b57803bc..ffed827d5f5 100644 --- a/src/freedreno/ir3/ir3_lower_subgroups.c +++ b/src/freedreno/ir3/ir3_lower_subgroups.c @@ -694,3 +694,134 @@ ir3_nir_lower_64b_subgroups(nir_shader *nir) return nir_shader_lower_instructions(nir, filter_64b_scan_reduce, lower_64b_scan_reduce, NULL); } + +static bool +filter_shuffle(const nir_instr *instr, const void *data) +{ + if (instr->type != nir_instr_type_intrinsic) { + return false; + } + + nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); + + switch (intrin->intrinsic) { + case nir_intrinsic_shuffle: + case nir_intrinsic_shuffle_up: + case nir_intrinsic_shuffle_down: + case nir_intrinsic_shuffle_xor: + return true; + default: + return false; + } +} + +static nir_def * +shuffle_to_uniform(nir_builder *b, nir_intrinsic_op op, struct nir_def *val, + struct nir_def *id) +{ + switch (op) { + case nir_intrinsic_shuffle: + return nir_rotate(b, val, id); + case nir_intrinsic_shuffle_up: + return nir_shuffle_up_uniform_ir3(b, val, id); + case nir_intrinsic_shuffle_down: + return nir_shuffle_down_uniform_ir3(b, val, id); + case nir_intrinsic_shuffle_xor: + return nir_shuffle_xor_uniform_ir3(b, val, id); + default: + unreachable("filtered intrinsic"); + } +} + 
+/* Transforms a shuffle operation into a loop that only uses shuffles with + * (dynamically) uniform indices. This is based on the blob's sequence and + * carefully makes sure that the fewest iterations are performed (i.e., + * one iteration per distinct index) while keeping all invocations active during + * each shfl operation. This is necessary since shfl does not update its dst + * when its src is inactive. + * + * done = false; + * while (true) { + * next_index = read_invocation_cond_ir3(index, !done); + * shuffled = op_uniform(val, next_index); + * + * if (index == next_index) { + * result = shuffled; + * done = true; + * } + * + * if (subgroupAll(done)) { + * break; + * } + * } + */ +static nir_def * +make_shuffle_uniform(nir_builder *b, nir_def *val, nir_def *index, + nir_intrinsic_op op) +{ + nir_variable *done = + nir_local_variable_create(b->impl, glsl_bool_type(), "done"); + nir_store_var(b, done, nir_imm_false(b), 1); + nir_variable *result = + nir_local_variable_create(b->impl, glsl_type_for_def(val), "result"); + + nir_loop *loop = nir_push_loop(b); + { + nir_def *next_index = nir_read_invocation_cond_ir3( + b, index->bit_size, index, nir_inot(b, nir_load_var(b, done))); + next_index->divergent = false; + nir_def *shuffled = shuffle_to_uniform(b, op, val, next_index); + + nir_if *nif = nir_push_if(b, nir_ieq(b, index, next_index)); + { + nir_store_var(b, result, shuffled, 1); + nir_store_var(b, done, nir_imm_true(b), 1); + } + nir_pop_if(b, nif); + + nir_break_if(b, nir_vote_all(b, 1, nir_load_var(b, done))); + } + nir_pop_loop(b, loop); + + return nir_load_var(b, result); +} + +static nir_def * +lower_shuffle(nir_builder *b, nir_instr *instr, void *data) +{ + nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); + nir_def *val = intrin->src[0].ssa; + nir_def *index = intrin->src[1].ssa; + + if (intrin->intrinsic == nir_intrinsic_shuffle) { + /* The hw only does relative shuffles/rotates so transform shuffle(val, x) + * into 
rotate(val, x - gl_SubgroupInvocationID) which is valid since we + * make sure to only use it with uniform indices. + */ + index = nir_isub(b, index, nir_load_subgroup_invocation(b)); + } + + if (!index->divergent) { + return shuffle_to_uniform(b, intrin->intrinsic, val, index); + } + + return make_shuffle_uniform(b, val, index, intrin->intrinsic); +} + +/* Lower (relative) shuffles to be able to use the shfl instruction. One quirk + * of shfl is that its index has to be dynamically uniform, so we transform the + * standard NIR intrinsics into ir3-specific ones which require their index to + * be uniform. + */ +bool +ir3_nir_lower_shuffle(nir_shader *nir, struct ir3_shader *shader) +{ + if (!shader->compiler->has_shfl) { + return false; + } + + nir_convert_to_lcssa(nir, true, true); + nir_divergence_analysis(nir); + return nir_shader_lower_instructions(nir, filter_shuffle, lower_shuffle, + NULL); +} diff --git a/src/freedreno/ir3/ir3_nir.c b/src/freedreno/ir3/ir3_nir.c index ff34b3aa8df..6fa9d884b17 100644 --- a/src/freedreno/ir3/ir3_nir.c +++ b/src/freedreno/ir3/ir3_nir.c @@ -786,8 +786,8 @@ ir3_nir_post_finalize(struct ir3_shader *shader) .lower_vote_bool_eq = true, .lower_subgroup_masks = true, .lower_read_invocation_to_cond = true, - .lower_shuffle = true, - .lower_relative_shuffle = true, + .lower_shuffle = !compiler->has_shfl, + .lower_relative_shuffle = !compiler->has_shfl, .lower_rotate_to_shuffle = !compiler->has_shfl, .lower_inverse_ballot = true, }; @@ -800,6 +800,7 @@ ir3_nir_post_finalize(struct ir3_shader *shader) } OPT(s, nir_lower_subgroups, &options); + OPT(s, ir3_nir_lower_shuffle, shader); /* We want to run the 64b lowering after nir_lower_subgroups so that the * operations have been scalarized. 
However, the 64b lowering will insert diff --git a/src/freedreno/ir3/ir3_nir.h b/src/freedreno/ir3/ir3_nir.h index 983a8cf7d3d..cf69d0e4c7f 100644 --- a/src/freedreno/ir3/ir3_nir.h +++ b/src/freedreno/ir3/ir3_nir.h @@ -82,6 +82,7 @@ nir_def *ir3_nir_try_propagate_bit_shift(nir_builder *b, int32_t shift); bool ir3_nir_lower_64b_subgroups(nir_shader *nir); +bool ir3_nir_lower_shuffle(nir_shader *nir, struct ir3_shader *shader); bool ir3_nir_opt_subgroups(nir_shader *nir, struct ir3_shader_variant *v); nir_def *ir3_get_shared_driver_ubo(nir_builder *b,