ir3: add support for predication

Use predication instead of branching whenever possible and profitable: all divergent leaf branches are replaced with predication. Non-divergent branches are kept since for those a branch might be more performant when it jumps over all instructions. Although it might be possible to support a limited form of nested predication, this is more difficult to implement so we only support leaf branches for now. When translating from NIR to ir3, predication is emitted just like normal branches except that the branch is replaced with pred[tf] and the opposite (pred[ft]) is inserted at the end of the then-block. This pattern is then recognized during legalization at which point the closing prede is inserted. We don't insert this right away to allow opt_jump to optimize jumps out of the else-block. Since the branches we support for predication always have exactly one block in each arm, the then-block is emitted first, and blocks are never reordered, this way of emitting predicated branches ensures they have the correct memory layout. Signed-off-by: Job Noorman <jnoorman@igalia.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/27982>
2026-03-16 08:20:36 +01:00 · 2024-03-05 15:00:35 +01:00 · 2024-03-05 15:00:35 +01:00 · 39088571f0
commit 39088571f0
parent bbc78e92ff
7 changed files with 272 additions and 10 deletions
--- a/src/freedreno/ir3/instr-a3xx.h
+++ b/src/freedreno/ir3/instr-a3xx.h
@ -523,6 +523,7 @@ regid(int num, int comp)
 /* special registers: */
 #define REG_A0 61 /* address register */
 #define REG_P0 62 /* predicate register */
+#define REG_P0_X regid(REG_P0, 0) /* p0.x */

 /* With is_bindless_s2en = 1, this determines whether bindless is enabled and
 * if so, how to get the (base, index) pair for both sampler and texture.
--- a/src/freedreno/ir3/ir3.h
+++ b/src/freedreno/ir3/ir3.h
@ -891,6 +891,8 @@ is_terminator(struct ir3_instruction *instr)
   case OPC_SHPS:
   case OPC_GETONE:
   case OPC_GETLAST:
+   case OPC_PREDT:
+   case OPC_PREDF:
      return true;
   default:
      return false;
@ -2388,7 +2390,7 @@ INSTR0(END)
 INSTR0(CHSH)
 INSTR0(CHMASK)
 INSTR1NODST(PREDT)
-INSTR0(PREDF)
+INSTR1NODST(PREDF)
 INSTR0(PREDE)
 INSTR0(GETONE)
 INSTR0(GETLAST)
--- a/src/freedreno/ir3/ir3_compiler.c
+++ b/src/freedreno/ir3/ir3_compiler.c
@ -222,6 +222,7 @@ ir3_compiler_create(struct fd_device *dev, const struct fd_dev_id *dev_id,
      compiler->num_predicates = 4;
      compiler->bitops_can_write_predicates = true;
      compiler->has_branch_and_or = true;
+      compiler->has_predication = true;
   } else {
      compiler->max_const_pipeline = 512;
      compiler->max_const_geom = 512;
--- a/src/freedreno/ir3/ir3_compiler.h
+++ b/src/freedreno/ir3/ir3_compiler.h
@ -226,6 +226,9 @@ struct ir3_compiler {
   /* True if braa/brao are available. */
   bool has_branch_and_or;

+   /* True if predt/predf/prede are supported. */
+   bool has_predication;
+
   /* MAX_COMPUTE_VARIABLE_GROUP_INVOCATIONS_ARB */
   uint32_t max_variable_workgroup_size;

--- a/src/freedreno/ir3/ir3_compiler_nir.c
+++ b/src/freedreno/ir3/ir3_compiler_nir.c
@ -3765,8 +3765,10 @@ emit_block(struct ir3_context *ctx, nir_block *nblock)
   /* Emit unconditional branch if we only have one successor. Conditional
    * branches are emitted in emit_if.
    */
-   if (ctx->block->successors[0] && !ctx->block->successors[1])
-      ir3_JUMP(ctx->block);
+   if (ctx->block->successors[0] && !ctx->block->successors[1]) {
+      if (!ir3_block_get_terminator(ctx->block))
+         ir3_JUMP(ctx->block);
+   }

   _mesa_hash_table_clear(ctx->sel_cond_conversions, NULL);
 }
@ -3843,13 +3845,153 @@ fold_conditional_branch(struct ir3_context *ctx, struct nir_src *nir_cond)
   return branch;
 }

-static struct ir3_instruction *
-emit_conditional_branch(struct ir3_context *ctx, struct nir_src *nir_cond)
+static bool
+instr_can_be_predicated(nir_instr *instr)
 {
+   /* Anything that doesn't expand to control-flow can be predicated. */
+   switch (instr->type) {
+   case nir_instr_type_alu:
+   case nir_instr_type_deref:
+   case nir_instr_type_tex:
+   case nir_instr_type_load_const:
+   case nir_instr_type_undef:
+   case nir_instr_type_phi:
+   case nir_instr_type_parallel_copy:
+      return true;
+   case nir_instr_type_call:
+   case nir_instr_type_jump:
+      return false;
+   case nir_instr_type_intrinsic: {
+      nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
+      switch (intrin->intrinsic) {
+      case nir_intrinsic_reduce:
+      case nir_intrinsic_inclusive_scan:
+      case nir_intrinsic_exclusive_scan:
+      case nir_intrinsic_reduce_clusters_ir3:
+      case nir_intrinsic_inclusive_scan_clusters_ir3:
+      case nir_intrinsic_exclusive_scan_clusters_ir3:
+      case nir_intrinsic_brcst_active_ir3:
+      case nir_intrinsic_ballot:
+      case nir_intrinsic_elect:
+      case nir_intrinsic_read_invocation_cond_ir3:
+      case nir_intrinsic_discard_if:
+      case nir_intrinsic_discard:
+      case nir_intrinsic_demote:
+      case nir_intrinsic_demote_if:
+      case nir_intrinsic_terminate:
+      case nir_intrinsic_terminate_if:
+         return false;
+      default:
+         return true;
+      }
+   }
+   }
+
+   unreachable("Checked all cases");
+}
+
+static bool
+nif_can_be_predicated(nir_if *nif)
+{
+   /* For non-divergent branches, predication is more expensive than a branch
+    * because the latter can potentially skip all instructions.
+    */
+   if (!nir_src_is_divergent(nif->condition))
+      return false;
+
+   /* Although it could potentially be possible to allow a limited form of
+    * nested predication (e.g., by resetting the predication mask after a nested
+    * branch), let's avoid this for now and only use predication for leaf
+    * branches. That is, for ifs that contain exactly one block in both branches
+    * (note that they always contain at least one block).
+    */
+   if (!exec_list_is_singular(&nif->then_list) ||
+       !exec_list_is_singular(&nif->else_list)) {
+      return false;
+   }
+
+   nir_foreach_instr (instr, nir_if_first_then_block(nif)) {
+      if (!instr_can_be_predicated(instr))
+         return false;
+   }
+
+   nir_foreach_instr (instr, nir_if_first_else_block(nif)) {
+      if (!instr_can_be_predicated(instr))
+         return false;
+   }
+
+   return true;
+}
+
+/* A typical if-else block like this:
+ * if (cond) {
+ *     tblock;
+ * } else {
+ *     fblock;
+ * }
+ * Will be emitted as:
+ *        |-- i --|
+ *        | ...   |
+ *        | predt |
+ *        |-------|
+ *    succ0 /   \ succ1
+ * |-- i+1 --| |-- i+2 --|
+ * | tblock  | | fblock  |
+ * | predf   | | jump    |
+ * |---------| |---------|
+ *    succ0 \   / succ0
+ *        |-- j --|
+ *        |  ...  |
+ *        |-------|
+ * Where the numbers at the top of blocks are their indices. That is, the true
+ * block and false block are laid-out contiguously after the current block. This
+ * layout is verified during legalization in prede_sched which also inserts the
+ * final prede instruction. Note that we don't insert prede right away to allow
+ * opt_jump to optimize the jump in the false block.
+ */
+static struct ir3_instruction *
+emit_predicated_branch(struct ir3_context *ctx, nir_if *nif)
+{
+   if (!ctx->compiler->has_predication)
+      return NULL;
+   if (!nif_can_be_predicated(nif))
+      return NULL;
+
+   struct ir3_block *then_block = get_block(ctx, nir_if_first_then_block(nif));
+   struct ir3_block *else_block = get_block(ctx, nir_if_first_else_block(nif));
+   assert(list_is_empty(&then_block->instr_list) &&
+          list_is_empty(&else_block->instr_list));
+
+   bool inv;
+   struct ir3_instruction *condition =
+      get_branch_condition(ctx, &nif->condition, &inv);
+   struct ir3_instruction *pred, *pred_inv;
+
+   if (!inv) {
+      pred = ir3_PREDT(ctx->block, condition, IR3_REG_PREDICATE);
+      pred_inv = ir3_PREDF(then_block, condition, IR3_REG_PREDICATE);
+   } else {
+      pred = ir3_PREDF(ctx->block, condition, IR3_REG_PREDICATE);
+      pred_inv = ir3_PREDT(then_block, condition, IR3_REG_PREDICATE);
+   }
+
+   pred->srcs[0]->num = REG_P0_X;
+   pred_inv->srcs[0]->num = REG_P0_X;
+   return pred;
+}
+
+static struct ir3_instruction *
+emit_conditional_branch(struct ir3_context *ctx, nir_if *nif)
+{
+   nir_src *nir_cond = &nif->condition;
   struct ir3_instruction *folded = fold_conditional_branch(ctx, nir_cond);
   if (folded)
      return folded;

+   struct ir3_instruction *predicated = emit_predicated_branch(ctx, nif);
+   if (predicated)
+      return predicated;
+
   bool inv1;
   struct ir3_instruction *cond1 = get_branch_condition(ctx, nir_cond, &inv1);
   struct ir3_instruction *branch =
@ -3881,7 +4023,7 @@ emit_if(struct ir3_context *ctx, nir_if *nif)
       */
      ir3_SHPS(ctx->block);
   } else {
-      emit_conditional_branch(ctx, &nif->condition);
+      emit_conditional_branch(ctx, nif);
   }

   ctx->block->divergent_condition = nif->condition.ssa->divergent;
--- a/src/freedreno/ir3/ir3_legalize.c
+++ b/src/freedreno/ir3/ir3_legalize.c
@ -1038,7 +1038,8 @@ block_sched(struct ir3 *ir)

            br2 = ir3_JUMP(block);
            br2->cat0.target = block->successors[0];
-         } else {
+         } else if (opc == OPC_BR || opc == OPC_BRAA || opc == OPC_BRAO ||
+                    opc == OPC_BALL || opc == OPC_BANY) {
            /* create "else" branch first (since "then" block should
             * frequently/always end up being a fall-thru):
             */
@ -1046,6 +1047,12 @@ block_sched(struct ir3 *ir)
            br2 = ir3_instr_clone(br1);
            invert_branch(br1);
            br2->cat0.target = block->successors[0];
+         } else {
+            assert(opc == OPC_PREDT || opc == OPC_PREDF);
+
+            /* Handled by prede_sched. */
+            terminator->cat0.target = block->successors[0];
+            continue;
         }

         /* Creating br2 caused it to be moved before the terminator b1, move it
@ -1053,16 +1060,113 @@ block_sched(struct ir3 *ir)
          */
         ir3_instr_move_after(br2, br1);
      } else if (block->successors[0]) {
-         /* otherwise unconditional jump to next block which should already have
-          * been inserted.
+         /* otherwise unconditional jump or predt/predf to next block which
+          * should already have been inserted.
          */
         assert(terminator);
-         assert(terminator->opc == OPC_JUMP);
+         assert(terminator->opc == OPC_JUMP || terminator->opc == OPC_PREDT ||
+                terminator->opc == OPC_PREDF);
         terminator->cat0.target = block->successors[0];
      }
   }
 }

+static void
+prede_sched(struct ir3 *ir)
+{
+   unsigned index = 0;
+   foreach_block (block, &ir->block_list)
+      block->index = index++;
+
+   foreach_block (block, &ir->block_list) {
+      /* Look for the following pattern generated by NIR lowering. The numbers
+       * at the top of blocks are their index.
+       *        |--- i ----|
+       *        |   ...    |
+       *        | pred[tf] |
+       *        |----------|
+       *      succ0 /   \ succ1
+       * |-- i+1 ---| |-- i+2 ---|
+       * |    ...   | |   ...    |
+       * | pred[ft] | |   ...    |
+       * |----------| |----------|
+       *     succ0 \   / succ0
+       *        |--- j ----|
+       *        |   ...    |
+       *        |----------|
+       */
+      struct ir3_block *succ0 = block->successors[0];
+      struct ir3_block *succ1 = block->successors[1];
+
+      if (!succ1)
+         continue;
+
+      struct ir3_instruction *terminator = ir3_block_get_terminator(block);
+      if (!terminator)
+         continue;
+      if (terminator->opc != OPC_PREDT && terminator->opc != OPC_PREDF)
+         continue;
+
+      assert(!succ0->successors[1] && !succ1->successors[1]);
+      assert(succ0->successors[0] == succ1->successors[0]);
+      assert(succ0->predecessors_count == 1 && succ1->predecessors_count == 1);
+      assert(succ0->index == (block->index + 1));
+      assert(succ1->index == (block->index + 2));
+
+      struct ir3_instruction *succ0_terminator =
+         ir3_block_get_terminator(succ0);
+      assert(succ0_terminator);
+      assert(succ0_terminator->opc ==
+             (terminator->opc == OPC_PREDT ? OPC_PREDF : OPC_PREDT));
+
+      ASSERTED struct ir3_instruction *succ1_terminator =
+         ir3_block_get_terminator(succ1);
+      assert(!succ1_terminator || (succ1_terminator->opc == OPC_JUMP));
+
+      /* Simple case: both successors contain instructions. Keep both blocks and
+       * insert prede before the second successor's terminator:
+       *        |--- i ----|
+       *        |   ...    |
+       *        | pred[tf] |
+       *        |----------|
+       *      succ0 /   \ succ1
+       * |-- i+1 ---| |-- i+2 ---|
+       * |    ...   | |   ...    |
+       * | pred[ft] | | prede    |
+       * |----------| |----------|
+       *     succ0 \   / succ0
+       *        |--- j ----|
+       *        |   ...    |
+       *        |----------|
+       */
+      if (!list_is_empty(&succ1->instr_list)) {
+         ir3_PREDE(succ1);
+         continue;
+      }
+
+      /* Second successor is empty so we can remove it:
+       *        |--- i ----|
+       *        |   ...    |
+       *        | pred[tf] |
+       *        |----------|
+       *      succ0 /   \ succ1
+       * |-- i+1 ---|   |
+       * |    ...   |   |
+       * |   prede  |   |
+       * |----------|   |
+       *     succ0 \    /
+       *        |--- j ----|
+       *        |   ...    |
+       *        |----------|
+       */
+      list_delinit(&succ0_terminator->node);
+      ir3_PREDE(succ0);
+      remove_unused_block(succ1);
+      block->successors[1] = succ0->successors[0];
+      ir3_block_add_predecessor(succ0->successors[0], block);
+   }
+}
+
 /* Here we workaround the fact that kill doesn't actually kill the thread as
 * GL expects. The last instruction always needs to be an end instruction,
 * which means that if we're stuck in a loop where kill is the only way out,
@ -1352,6 +1456,8 @@ helper_sched(struct ir3_legalize_ctx *ctx, struct ir3 *ir,
             */
            if (is_alu(instr) || is_sfu(instr))
               continue;
+            if (instr->opc == OPC_PREDE)
+               continue;

            expensive_instruction_in_block = true;
            break;
@ -1457,6 +1563,8 @@ ir3_legalize(struct ir3 *ir, struct ir3_shader_variant *so, int *max_bary)
   while (opt_jump(ir))
      ;

+   prede_sched(ir);
+
   /* TODO: does (eq) exist before a6xx? */
   if (so->type == MESA_SHADER_FRAGMENT && so->need_pixlod &&
       so->compiler->gen >= 6)
--- a/src/freedreno/ir3/ir3_reconvergence.c
+++ b/src/freedreno/ir3/ir3_reconvergence.c
@ -181,6 +181,11 @@ ir3_calc_reconvergence(struct ir3_shader_variant *so)
    * reconvergence point.
    */
   foreach_block (block, &so->ir->block_list) {
+      struct ir3_instruction *terminator = ir3_block_get_terminator(block);
+      if (!terminator)
+         continue;
+      if (terminator->opc == OPC_PREDT || terminator->opc == OPC_PREDF)
+         continue;
      if (block->successors[0] && block->successors[1] &&
          block->divergent_condition) {
         unsigned idx = block->successors[0]->index >