diff --git a/src/compiler/nir/nir_intrinsics.py b/src/compiler/nir/nir_intrinsics.py
index 9fdb313fad3..dcdb96ed35b 100644
--- a/src/compiler/nir/nir_intrinsics.py
+++ b/src/compiler/nir/nir_intrinsics.py
@@ -1357,6 +1357,15 @@ intrinsic("copy_ubo_to_uniform_ir3", [1, 1], indices=[BASE, RANGE])
 # Should be used in the shader preamble.
 intrinsic("copy_push_const_to_uniform_ir3", [1], indices=[BASE, RANGE])
 
+intrinsic("brcst_active_ir3", dest_comp=1, src_comp=[1, 1], bit_sizes=src0,
+          indices=[CLUSTER_SIZE])
+intrinsic("reduce_clusters_ir3", dest_comp=1, src_comp=[1], bit_sizes=src0,
+          indices=[REDUCTION_OP])
+intrinsic("inclusive_scan_clusters_ir3", dest_comp=1, src_comp=[1],
+          bit_sizes=src0, indices=[REDUCTION_OP])
+intrinsic("exclusive_scan_clusters_ir3", dest_comp=1, src_comp=[1, 1],
+          bit_sizes=src0, indices=[REDUCTION_OP])
+
 # Intrinsics used by the Midgard/Bifrost blend pipeline. These are defined
 # within a blend shader to read/write the raw value from the tile buffer,
 # without applying any format conversion in the process. If the shader needs
diff --git a/src/freedreno/ir3/disasm-a3xx.c b/src/freedreno/ir3/disasm-a3xx.c
index f842e7fdb1b..2ba4df7a127 100644
--- a/src/freedreno/ir3/disasm-a3xx.c
+++ b/src/freedreno/ir3/disasm-a3xx.c
@@ -193,6 +193,7 @@ static const struct opc_info {
    OPC(1, OPC_READ_FIRST_MACRO, read_first.macro),
    OPC(1, OPC_SWZ_SHARED_MACRO, swz_shared.macro),
    OPC(1, OPC_SCAN_MACRO, scan.macro),
+   OPC(1, OPC_SCAN_CLUSTERS_MACRO, scan_clusters.macro),
    OPC(1, OPC_SHPS_MACRO, shps.macro),
    OPC(1, OPC_PUSH_CONSTS_LOAD_MACRO, push_consts_load.macro),
 
diff --git a/src/freedreno/ir3/instr-a3xx.h b/src/freedreno/ir3/instr-a3xx.h
index 777cfeb7113..0279abf1e63 100644
--- a/src/freedreno/ir3/instr-a3xx.h
+++ b/src/freedreno/ir3/instr-a3xx.h
@@ -130,6 +130,7 @@ typedef enum {
 
    /* Macros that expand to a loop */
    OPC_SCAN_MACRO = _OPC(1, 58),
+   OPC_SCAN_CLUSTERS_MACRO = _OPC(1, 60),
 
    /* Macros that expand to an stsc at the start of the preamble.
     * It loads into const file and should not be optimized in any way.
diff --git a/src/freedreno/ir3/ir3.h b/src/freedreno/ir3/ir3.h
index 79ae245cd26..d326de7c080 100644
--- a/src/freedreno/ir3/ir3.h
+++ b/src/freedreno/ir3/ir3.h
@@ -619,11 +619,12 @@ struct ir3_array {
 struct ir3_array *ir3_lookup_array(struct ir3 *ir, unsigned id);
 
 enum ir3_branch_type {
-   IR3_BRANCH_COND,   /* condition */
-   IR3_BRANCH_ANY,    /* subgroupAny(condition) */
-   IR3_BRANCH_ALL,    /* subgroupAll(condition) */
-   IR3_BRANCH_GETONE, /* subgroupElect() */
-   IR3_BRANCH_SHPS,   /* preamble start */
+   IR3_BRANCH_COND,    /* condition */
+   IR3_BRANCH_ANY,     /* subgroupAny(condition) */
+   IR3_BRANCH_ALL,     /* subgroupAll(condition) */
+   IR3_BRANCH_GETONE,  /* subgroupElect() */
+   IR3_BRANCH_GETLAST, /* getlast.w8 */
+   IR3_BRANCH_SHPS,    /* preamble start */
 };
 
 struct ir3_block {
@@ -2328,6 +2329,7 @@ INSTR1NODST(PREDT)
 INSTR0(PREDF)
 INSTR0(PREDE)
 INSTR0(GETONE)
+INSTR0(GETLAST)
 INSTR0(SHPS)
 INSTR0(SHPE)
 
@@ -2481,6 +2483,26 @@ ir3_SAM(struct ir3_block *block, opc_t opc, type_t type, unsigned wrmask,
    return sam;
 }
 
+/* brcst.active rx, ry behaves like a conditional move: rx either keeps its
+ * value or is set to ry. In order to model this in SSA form, we add an extra
+ * argument (the initial value of rx) and tie it to the destination.
+ */
+static inline struct ir3_instruction *
+ir3_BRCST_ACTIVE(struct ir3_block *block, unsigned cluster_size,
+                 struct ir3_instruction *src,
+                 struct ir3_instruction *dst_default)
+{
+   struct ir3_instruction *brcst =
+      ir3_instr_create(block, OPC_BRCST_ACTIVE, 1, 2);
+   brcst->cat5.cluster_size = cluster_size;
+   brcst->cat5.type = TYPE_U32;
+   struct ir3_register *brcst_dst = __ssa_dst(brcst);
+   __ssa_src(brcst, src, 0);
+   struct ir3_register *default_src = __ssa_src(brcst, dst_default, 0);
+   ir3_reg_tie(brcst_dst, default_src);
+   return brcst;
+}
+
 /* cat6 instructions: */
 INSTR0(GETFIBERID)
 INSTR2(LDLV)
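Not part of the patch: a minimal scalar sketch of the conditional-move behaviour the
comment above describes, spelling out the role of the tied source. This is an
illustrative assumption about the per-fiber outcome, not the hardware definition of
brcst.active.

   /* Illustrative only: per-fiber view of "rx either keeps its value or is
    * set to ry". The tied default stands in for rx's old value, which plain
    * SSA could not otherwise express.
    */
   static inline uint32_t
   brcst_active_model(bool got_broadcast, uint32_t broadcast_val,
                      uint32_t dst_default)
   {
      return got_broadcast ? broadcast_val : dst_default;
   }

In the NIR lowering added later in this patch, the default passed to brcst_active_ir3
is always the operation's identity, so a fiber that receives no broadcast value
contributes nothing extra when the result is folded into its running scan value.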
diff --git a/src/freedreno/ir3/ir3_compiler_nir.c b/src/freedreno/ir3/ir3_compiler_nir.c
index bcceaa678d4..8216e95c9f1 100644
--- a/src/freedreno/ir3/ir3_compiler_nir.c
+++ b/src/freedreno/ir3/ir3_compiler_nir.c
@@ -1984,6 +1984,111 @@ emit_intrinsic_reduce(struct ir3_context *ctx, nir_intrinsic_instr *intr)
    return create_multidst_mov(ctx->block, dst);
 }
 
+static struct ir3_instruction *
+emit_intrinsic_reduce_clusters(struct ir3_context *ctx,
+                               nir_intrinsic_instr *intr)
+{
+   nir_op nir_reduce_op = (nir_op)nir_intrinsic_reduction_op(intr);
+   reduce_op_t reduce_op = get_reduce_op(nir_reduce_op);
+   unsigned dst_size = intr->def.bit_size;
+
+   bool need_exclusive =
+      intr->intrinsic == nir_intrinsic_exclusive_scan_clusters_ir3;
+   bool need_scratch = reduce_op == REDUCE_OP_MUL_U && dst_size == 32;
+
+   /* Note: the shared reg is initialized to the identity, so we need it to
+    * always be 32-bit even when the source isn't because half shared regs are
+    * not supported.
+    */
+   struct ir3_instruction *identity =
+      create_immed(ctx->block, get_reduce_identity(nir_reduce_op, dst_size));
+   identity->dsts[0]->flags |= IR3_REG_SHARED;
+
+   /* OPC_SCAN_CLUSTERS_MACRO has the following destinations:
+    * - Shared reg reduction result, must be initialized to the identity
+    * - Inclusive scan result
+    * - (iff exclusive) Exclusive scan result. Conditionally added because
+    *   calculating the exclusive value is optional (i.e., not a side-effect of
+    *   calculating the inclusive value) and won't be DCE'd anymore at this
+    *   point.
+    * - (iff 32b mul_u) Scratch register. We try to emit "op rx, ry, rx" for
+    *   most ops but this isn't possible for the 32b mul_u macro since its
+    *   destination is clobbered. So conditionally allocate an extra
+    *   register in that case.
+    *
+    * Note that the getlast loop this macro expands to iterates over all
+    * clusters. However, for each iteration, not only the fibers in the current
+    * cluster are active but all later ones as well. Since they still need their
+    * sources when their cluster is handled, all destinations interfere with
+    * the sources.
+    */
+   unsigned ndst = 2 + need_exclusive + need_scratch;
+   unsigned nsrc = 2 + need_exclusive;
+   struct ir3_instruction *scan =
+      ir3_instr_create(ctx->block, OPC_SCAN_CLUSTERS_MACRO, ndst, nsrc);
+   scan->cat1.reduce_op = reduce_op;
+
+   unsigned dst_flags = IR3_REG_EARLY_CLOBBER;
+   if (ir3_bitsize(ctx, dst_size) == 16)
+      dst_flags |= IR3_REG_HALF;
+
+   struct ir3_register *reduce = __ssa_dst(scan);
+   reduce->flags |= IR3_REG_SHARED;
+   struct ir3_register *inclusive = __ssa_dst(scan);
+   inclusive->flags |= dst_flags;
+
+   struct ir3_register *exclusive = NULL;
+   if (need_exclusive) {
+      exclusive = __ssa_dst(scan);
+      exclusive->flags |= dst_flags;
+   }
+
+   if (need_scratch) {
+      struct ir3_register *scratch = __ssa_dst(scan);
+      scratch->flags |= dst_flags;
+   }
+
+   struct ir3_register *reduce_init = __ssa_src(scan, identity, IR3_REG_SHARED);
+   ir3_reg_tie(reduce, reduce_init);
+
+   struct ir3_instruction *inclusive_src = ir3_get_src(ctx, &intr->src[0])[0];
+   __ssa_src(scan, inclusive_src, 0);
+
+   if (need_exclusive) {
+      struct ir3_instruction *exclusive_src =
+         ir3_get_src(ctx, &intr->src[1])[0];
+      __ssa_src(scan, exclusive_src, 0);
+   }
+
+   struct ir3_register *dst;
+   switch (intr->intrinsic) {
+   case nir_intrinsic_reduce_clusters_ir3:
+      dst = reduce;
+      break;
+   case nir_intrinsic_inclusive_scan_clusters_ir3:
+      dst = inclusive;
+      break;
+   case nir_intrinsic_exclusive_scan_clusters_ir3: {
+      assert(exclusive != NULL);
+      dst = exclusive;
+      break;
+   }
+   default:
+      unreachable("unknown reduce intrinsic");
+   }
+
+   return create_multidst_mov(ctx->block, dst);
+}
+
+static struct ir3_instruction *
+emit_intrinsic_brcst_active(struct ir3_context *ctx, nir_intrinsic_instr *intr)
+{
+   struct ir3_instruction *default_src = ir3_get_src(ctx, &intr->src[0])[0];
+   struct ir3_instruction *brcst_val = ir3_get_src(ctx, &intr->src[1])[0];
+   return ir3_BRCST_ACTIVE(ctx->block, nir_intrinsic_cluster_size(intr),
+                           brcst_val, default_src);
+}
+
 static void setup_input(struct ir3_context *ctx, nir_intrinsic_instr *intr);
 static void setup_output(struct ir3_context *ctx, nir_intrinsic_instr *intr);
 
@@ -2637,6 +2742,16 @@ emit_intrinsic(struct ir3_context *ctx, nir_intrinsic_instr *intr)
       dst[0] = emit_intrinsic_reduce(ctx, intr);
       break;
 
+   case nir_intrinsic_reduce_clusters_ir3:
+   case nir_intrinsic_inclusive_scan_clusters_ir3:
+   case nir_intrinsic_exclusive_scan_clusters_ir3:
+      dst[0] = emit_intrinsic_reduce_clusters(ctx, intr);
+      break;
+
+   case nir_intrinsic_brcst_active_ir3:
+      dst[0] = emit_intrinsic_brcst_active(ctx, intr);
+      break;
+
    case nir_intrinsic_preamble_end_ir3: {
       struct ir3_instruction *instr = ir3_SHPE(ctx->block);
       instr->barrier_class = instr->barrier_conflict = IR3_BARRIER_CONST_W;
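Not part of the patch: a self-contained sketch of the destination layout set up above.
The helper names are invented for illustration; the invariant they encode (the scratch
slot, explicit only for 32-bit mul_u, is always the last destination) is the one the
lowering and validator later in this patch rely on.

   #include <assert.h>
   #include <stdbool.h>

   /* dsts: reduce (shared), inclusive, [exclusive], [scratch] */
   static unsigned
   scan_clusters_ndst(bool need_exclusive, bool need_scratch)
   {
      return 2 + need_exclusive + need_scratch;
   }

   /* The scratch is the explicit extra dst for 32-bit mul_u, otherwise
    * exclusive (if present), otherwise inclusive: always the last dst. */
   static unsigned
   scan_clusters_scratch_idx(bool need_exclusive, bool need_scratch)
   {
      return scan_clusters_ndst(need_exclusive, need_scratch) - 1;
   }

   int
   main(void)
   {
      /* 32-bit exclusive mul_u scan: reduce, inclusive, exclusive, scratch */
      assert(scan_clusters_ndst(true, true) == 4 &&
             scan_clusters_scratch_idx(true, true) == 3);
      /* exclusive iadd scan: the exclusive dst doubles as the scratch */
      assert(scan_clusters_scratch_idx(true, false) == 2);
      /* plain reduce or inclusive scan: inclusive doubles as the scratch */
      assert(scan_clusters_scratch_idx(false, false) == 1);
      return 0;
   }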
diff --git a/src/freedreno/ir3/ir3_legalize.c b/src/freedreno/ir3/ir3_legalize.c
index 795d3c7d7bb..990a6efacbc 100644
--- a/src/freedreno/ir3/ir3_legalize.c
+++ b/src/freedreno/ir3/ir3_legalize.c
@@ -695,6 +695,7 @@ block_sched(struct ir3 *ir)
       struct ir3_instruction *br1, *br2;
 
       if (block->brtype == IR3_BRANCH_GETONE ||
+          block->brtype == IR3_BRANCH_GETLAST ||
          block->brtype == IR3_BRANCH_SHPS) {
          /* getone/shps can't be inverted, and it wouldn't even make sense
           * to follow it with an inverted branch, so follow it by an
@@ -703,6 +704,8 @@ block_sched(struct ir3 *ir)
          assert(!block->condition);
          if (block->brtype == IR3_BRANCH_GETONE)
             br1 = ir3_GETONE(block);
+         else if (block->brtype == IR3_BRANCH_GETLAST)
+            br1 = ir3_GETLAST(block);
          else
             br1 = ir3_SHPS(block);
          br1->cat0.target = block->successors[1];
@@ -740,6 +743,7 @@ block_sched(struct ir3 *ir)
          br2->cat0.brtype = BRANCH_ANY;
          break;
       case IR3_BRANCH_GETONE:
+      case IR3_BRANCH_GETLAST:
       case IR3_BRANCH_SHPS:
          unreachable("can't get here");
       }
diff --git a/src/freedreno/ir3/ir3_lower_subgroups.c b/src/freedreno/ir3/ir3_lower_subgroups.c
index da416497d12..91b99b7df65 100644
--- a/src/freedreno/ir3/ir3_lower_subgroups.c
+++ b/src/freedreno/ir3/ir3_lower_subgroups.c
@@ -22,6 +22,7 @@
  */
 
 #include "ir3.h"
+#include "ir3_nir.h"
 #include "util/ralloc.h"
 
 /* Lower several macro-instructions needed for shader subgroup support that
@@ -241,6 +242,7 @@ lower_instr(struct ir3 *ir, struct ir3_block **block, struct ir3_instruction *in
    case OPC_READ_COND_MACRO:
    case OPC_SWZ_SHARED_MACRO:
    case OPC_SCAN_MACRO:
+   case OPC_SCAN_CLUSTERS_MACRO:
       break;
    case OPC_READ_FIRST_MACRO:
      /* Moves to shared registers read the first active fiber, so we can just
@@ -313,6 +315,79 @@ lower_instr(struct ir3 *ir, struct ir3_block **block, struct ir3_instruction *in
       mov_reg(exit, exclusive, reduce);
       do_reduce(exit, instr->cat1.reduce_op, inclusive, src, exclusive);
       mov_reg(exit, reduce, inclusive);
+   } else if (instr->opc == OPC_SCAN_CLUSTERS_MACRO) {
+      /* The pseudo-code for the scan macro is:
+       *
+       * while (true) {
+       * body:
+       *    scratch = reduce;
+       *
+       *    inclusive = inclusive_src OP scratch;
+       *
+       *    static if (is exclusive scan)
+       *       exclusive = exclusive_src OP scratch
+       *
+       *    if (getlast()) {
+       * store:
+       *       reduce = inclusive;
+       *       if (elect())
+       *          break;
+       *    } else {
+       *       break;
+       *    }
+       * }
+       * after_block:
+       */
+      struct ir3_block *body = ir3_block_create(ir);
+      list_add(&body->node, &before_block->node);
+
+      struct ir3_block *store = ir3_block_create(ir);
+      list_add(&store->node, &body->node);
+
+      link_blocks(before_block, body, 0);
+
+      link_blocks(body, store, 0);
+      link_blocks(body, after_block, 1);
+      body->brtype = IR3_BRANCH_GETLAST;
+
+      link_blocks(store, after_block, 0);
+      link_blocks(store, body, 1);
+      store->brtype = IR3_BRANCH_GETONE;
+
+      struct ir3_register *reduce = instr->dsts[0];
+      struct ir3_register *inclusive = instr->dsts[1];
+      struct ir3_register *inclusive_src = instr->srcs[1];
+
+      /* We need to perform the following operations:
+       * - inclusive = inclusive_src OP reduce
+       * - exclusive = exclusive_src OP reduce (iff exclusive scan)
+       * Since reduce is initially in a shared register, we need to copy it to a
+       * scratch register before performing the operations.
+       *
+       * The scratch register used is:
+       * - an explicitly allocated one if op is 32b mul_u.
+       *   - necessary because we cannot do 'foo = foo mul_u bar' since mul_u
+       *     clobbers its destination.
+       * - exclusive if this is an exclusive scan (and not 32b mul_u).
+       *   - since we calculate inclusive first.
+       * - inclusive otherwise.
+       *
+       * In all cases, this is the last destination.
+       */
+      struct ir3_register *scratch = instr->dsts[instr->dsts_count - 1];
+
+      mov_reg(body, scratch, reduce);
+      do_reduce(body, instr->cat1.reduce_op, inclusive, inclusive_src, scratch);
+
+      /* exclusive scan */
+      if (instr->srcs_count == 3) {
+         struct ir3_register *exclusive_src = instr->srcs[2];
+         struct ir3_register *exclusive = instr->dsts[2];
+         do_reduce(body, instr->cat1.reduce_op, exclusive, exclusive_src,
+                   scratch);
+      }
+
+      mov_reg(store, reduce, inclusive);
    } else {
       struct ir3_block *then_block = create_if(ir, before_block, after_block);
 
@@ -447,3 +522,65 @@ ir3_lower_subgroups(struct ir3 *ir)
 
    return progress;
 }
+
+static bool
+filter_scan_reduce(const nir_instr *instr, const void *data)
+{
+   if (instr->type != nir_instr_type_intrinsic)
+      return false;
+
+   nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
+
+   switch (intrin->intrinsic) {
+   case nir_intrinsic_reduce:
+   case nir_intrinsic_inclusive_scan:
+   case nir_intrinsic_exclusive_scan:
+      return true;
+   default:
+      return false;
+   }
+}
+
+static nir_def *
+lower_scan_reduce(struct nir_builder *b, nir_instr *instr, void *data)
+{
+   nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
+   unsigned bit_size = intrin->def.bit_size;
+
+   nir_op op = nir_intrinsic_reduction_op(intrin);
+   nir_const_value ident_val = nir_alu_binop_identity(op, bit_size);
+   nir_def *ident = nir_build_imm(b, 1, bit_size, &ident_val);
+   nir_def *inclusive = intrin->src[0].ssa;
+   nir_def *exclusive = ident;
+
+   for (unsigned cluster_size = 2; cluster_size <= 8; cluster_size *= 2) {
+      nir_def *brcst = nir_brcst_active_ir3(b, ident, inclusive,
+                                            .cluster_size = cluster_size);
+      inclusive = nir_build_alu2(b, op, inclusive, brcst);
+
+      if (intrin->intrinsic == nir_intrinsic_exclusive_scan)
+         exclusive = nir_build_alu2(b, op, exclusive, brcst);
+   }
+
+   switch (intrin->intrinsic) {
+   case nir_intrinsic_reduce:
+      return nir_reduce_clusters_ir3(b, inclusive, .reduction_op = op);
+   case nir_intrinsic_inclusive_scan:
+      return nir_inclusive_scan_clusters_ir3(b, inclusive, .reduction_op = op);
+   case nir_intrinsic_exclusive_scan:
+      return nir_exclusive_scan_clusters_ir3(b, inclusive, exclusive,
+                                             .reduction_op = op);
+   default:
+      unreachable("filtered intrinsic");
+   }
+}
+
+bool
+ir3_nir_opt_subgroups(nir_shader *nir, struct ir3_shader_variant *v)
+{
+   if (!v->compiler->has_getfiberid)
+      return false;
+
+   return nir_shader_lower_instructions(nir, filter_scan_reduce,
+                                        lower_scan_reduce, NULL);
+}
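Not part of the patch: a standalone C model of what the two halves compute together,
reading the pseudo-code and comments above. ir3_nir_opt_subgroups leaves each fiber
holding an inclusive (and, for exclusive scans, exclusive) scan over the active fibers
of its cluster of 8, and the SCAN_CLUSTERS macro's getlast loop then visits the
clusters in order while the shared register carries the running total. iadd stands in
for the reduction op, and ordinary loops stand in for fiber-parallel execution; this
only illustrates the resulting values, not the hardware behaviour of brcst.active or
getlast.

   #include <stdbool.h>
   #include <stdint.h>

   #define CLUSTER_SIZE 8

   /* Model of an iadd inclusive scan plus subgroup reduction over one
    * subgroup. Step 1 stands in for the brcst_active ladder built by
    * lower_scan_reduce(); step 2 follows the getlast loop: each fiber
    * combines its within-cluster value with the running total, and the last
    * active fiber of the retiring cluster provides the new total.
    */
   static uint32_t
   model_inclusive_scan(const uint32_t *src, const bool *active,
                        uint32_t *inclusive, unsigned nfibers)
   {
      uint32_t reduce = 0; /* identity for iadd, cf. get_reduce_identity() */

      for (unsigned base = 0; base < nfibers; base += CLUSTER_SIZE) {
         /* Step 1: within-cluster inclusive scan (the "inclusive_src"). */
         uint32_t incl_src[CLUSTER_SIZE];
         uint32_t acc = 0;
         for (unsigned i = 0; i < CLUSTER_SIZE && base + i < nfibers; i++) {
            if (active[base + i])
               acc += src[base + i];
            incl_src[i] = acc;
         }

         /* Step 2: one trip of the cluster loop. */
         unsigned last_active = ~0u;
         for (unsigned i = 0; i < CLUSTER_SIZE && base + i < nfibers; i++) {
            if (!active[base + i])
               continue;
            inclusive[base + i] = incl_src[i] + reduce;
            last_active = base + i;
         }
         if (last_active != ~0u)
            reduce = inclusive[last_active];
      }

      return reduce; /* what reduce_clusters_ir3 would return */
   }

An exclusive scan would add the analogous computation on exclusive_src, seeded with
the identity instead of the fiber's own contribution, which is why the macro carries
the extra source/destination pair.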
diff --git a/src/freedreno/ir3/ir3_nir.c b/src/freedreno/ir3/ir3_nir.c
index e3596c54b9b..793d3084d07 100644
--- a/src/freedreno/ir3/ir3_nir.c
+++ b/src/freedreno/ir3/ir3_nir.c
@@ -740,6 +740,8 @@ ir3_nir_lower_variant(struct ir3_shader_variant *so, nir_shader *s)
       progress |= OPT(s, nir_opt_constant_folding);
    }
 
+   OPT(s, ir3_nir_opt_subgroups, so);
+
    /* Do the preamble before analysing UBO ranges, because it's usually
     * higher-value and because it can result in eliminating some indirect UBO
     * accesses where otherwise we'd have to push the whole range. However we
diff --git a/src/freedreno/ir3/ir3_nir.h b/src/freedreno/ir3/ir3_nir.h
index a4adde07225..d311096c2d3 100644
--- a/src/freedreno/ir3/ir3_nir.h
+++ b/src/freedreno/ir3/ir3_nir.h
@@ -86,6 +86,8 @@
 nir_def *ir3_nir_try_propagate_bit_shift(nir_builder *b, nir_def *offset,
                                          int32_t shift);
 
+bool ir3_nir_opt_subgroups(nir_shader *nir, struct ir3_shader_variant *v);
+
 static inline nir_intrinsic_instr *
 ir3_bindless_resource(nir_src src)
 {
diff --git a/src/freedreno/ir3/ir3_print.c b/src/freedreno/ir3/ir3_print.c
index 3e3c54ff72b..d3a86bc64a5 100644
--- a/src/freedreno/ir3/ir3_print.c
+++ b/src/freedreno/ir3/ir3_print.c
@@ -137,7 +137,8 @@ print_instr_name(struct log_stream *stream, struct ir3_instruction *instr,
                               disasm_a3xx_instr_name(instr->opc));
    }
 
-   if (instr->opc == OPC_SCAN_MACRO) {
+   if (instr->opc == OPC_SCAN_MACRO ||
+       instr->opc == OPC_SCAN_CLUSTERS_MACRO) {
       switch (instr->cat1.reduce_op) {
       case REDUCE_OP_ADD_U:
          mesa_log_stream_printf(stream, ".add.u");
@@ -548,6 +549,9 @@ print_block(struct ir3_block *block, int lvl)
       case IR3_BRANCH_GETONE:
         mesa_log_stream_printf(stream, "getone ");
         break;
+      case IR3_BRANCH_GETLAST:
+         mesa_log_stream_printf(stream, "getlast ");
+         break;
       case IR3_BRANCH_SHPS:
         mesa_log_stream_printf(stream, "shps ");
         break;
diff --git a/src/freedreno/ir3/ir3_validate.c b/src/freedreno/ir3/ir3_validate.c
index 3bd89566a4c..30575c3012e 100644
--- a/src/freedreno/ir3/ir3_validate.c
+++ b/src/freedreno/ir3/ir3_validate.c
@@ -249,6 +249,27 @@ validate_instr(struct ir3_validate_ctx *ctx, struct ir3_instruction *instr)
       validate_assert(ctx, reg_class_flags(instr->dsts[1]) ==
                               reg_class_flags(instr->srcs[0]));
       validate_assert(ctx, reg_class_flags(instr->dsts[2]) == IR3_REG_SHARED);
+   } else if (instr->opc == OPC_SCAN_CLUSTERS_MACRO) {
+      validate_assert(ctx, instr->dsts_count >= 2 && instr->dsts_count < 5);
+      validate_assert(ctx, instr->srcs_count >= 2 && instr->srcs_count < 4);
+      validate_assert(ctx,
+                      reg_class_flags(instr->dsts[0]) == IR3_REG_SHARED);
+      validate_assert(ctx, reg_class_flags(instr->dsts[1]) ==
+                              reg_class_flags(instr->srcs[1]));
+
+      /* exclusive scan */
+      if (instr->srcs_count == 3) {
+         validate_assert(ctx, instr->dsts_count >= 3);
+         validate_assert(ctx, reg_class_flags(instr->srcs[2]) ==
+                                 reg_class_flags(instr->srcs[1]));
+         validate_assert(ctx, reg_class_flags(instr->dsts[2]) ==
+                                 reg_class_flags(instr->srcs[1]));
+      }
+
+      /* scratch register */
+      validate_assert(ctx,
+                      reg_class_flags(instr->dsts[instr->dsts_count - 1]) ==
+                         reg_class_flags(instr->srcs[1]));
    } else {
       foreach_dst (dst, instr)
          validate_reg_size(ctx, dst, instr->cat1.dst_type);