mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-01-03 15:50:17 +01:00
ir3: implement RA for predicate registers
Up to now, ir3 only supported one predicate register (p0.x). However, since a6xx, four predicate registers are available. This patch adds a register allocator for predicate registers that allows all of them to be used. The RA also works for older generations with only one register. The use of p0.x was hard-coded in many places in ir3. This has been replaced by a new flag, IR3_REG_PREDICATE, to indicate that an SSA value should be allocated to a predicate register. The RA uses the standard liveness analysis available in ir3. Using this, registers are allocated in a single pass over all blocks. For each block we keep track of currently live defs in the registers. Predicate destinations allocate a new register and sources take the register from their def. The live defs of a block are initialized with the intersection of the live-out defs of their predecessors: if all predecessors have the same live-out def in the same register, it is used as live-in. However, we only do this for defs that are actually live-in according to the liveness analysis. This doesn't work for loops: since predecessors from back edges are processed after their successors, we don't know their live-out state yet. We solve this by ignoring such predecessors while calculating the live-in state. When this predecessor is later processed, we fix up its live-out state to match what its successor expects by reloading defs if necessary. Spilling is implemented by reloading, or rematerializing, the instruction that produced the def. Whenever we need a new register while none are available, we simply free one. If the freed def is later needed again, we clone the original instruction in front of the new use. We keep track of the original def the reload is cloned from so that subsequent uses can reuse the reload. Signed-off-by: Job Noorman <jnoorman@igalia.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/27411>
This commit is contained in:
parent
49f5a73312
commit
21cd9b9557
16 changed files with 542 additions and 129 deletions
|
|
@ -168,6 +168,9 @@ typedef enum ir3_register_flags {
|
|||
* Note: This effectively has the same semantics as IR3_REG_KILL.
|
||||
*/
|
||||
IR3_REG_LAST_USE = BIT(18),
|
||||
|
||||
/* Predicate register (p0.c). Cannot be combined with half or shared. */
|
||||
IR3_REG_PREDICATE = BIT(19),
|
||||
} ir3_register_flags;
|
||||
|
||||
struct ir3_register {
|
||||
|
|
@ -563,9 +566,6 @@ struct ir3 {
|
|||
/* same for a1.x: */
|
||||
DECLARE_ARRAY(struct ir3_instruction *, a1_users);
|
||||
|
||||
/* and same for instructions that consume predicate register: */
|
||||
DECLARE_ARRAY(struct ir3_instruction *, predicates);
|
||||
|
||||
/* Track texture sample instructions which need texture state
|
||||
* patched in (for astc-srgb workaround):
|
||||
*/
|
||||
|
|
@ -952,7 +952,7 @@ is_same_type_mov(struct ir3_instruction *instr)
|
|||
dst = instr->dsts[0];
|
||||
|
||||
/* mov's that write to a0 or p0.x are special: */
|
||||
if (dst->num == regid(REG_P0, 0))
|
||||
if (dst->flags & IR3_REG_PREDICATE)
|
||||
return false;
|
||||
if (reg_num(dst) == REG_A0)
|
||||
return false;
|
||||
|
|
@ -1295,7 +1295,9 @@ is_dest_gpr(struct ir3_register *dst)
|
|||
{
|
||||
if (dst->wrmask == 0)
|
||||
return false;
|
||||
if ((reg_num(dst) == REG_A0) || (dst->num == regid(REG_P0, 0)))
|
||||
if (reg_num(dst) == REG_A0)
|
||||
return false;
|
||||
if (dst->flags & IR3_REG_PREDICATE)
|
||||
return false;
|
||||
return true;
|
||||
}
|
||||
|
|
@ -1333,10 +1335,10 @@ writes_addr1(struct ir3_instruction *instr)
|
|||
static inline bool
|
||||
writes_pred(struct ir3_instruction *instr)
|
||||
{
|
||||
/* Note: only the first dest can write to p0.x */
|
||||
/* Note: only the first dest can write to p0 */
|
||||
if (instr->dsts_count > 0) {
|
||||
struct ir3_register *dst = instr->dsts[0];
|
||||
return reg_num(dst) == REG_P0;
|
||||
return !!(dst->flags & IR3_REG_PREDICATE);
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
|
@ -1349,8 +1351,8 @@ writes_pred(struct ir3_instruction *instr)
|
|||
static inline bool
|
||||
is_reg_special(const struct ir3_register *reg)
|
||||
{
|
||||
return (reg->flags & IR3_REG_SHARED) || (reg_num(reg) == REG_A0) ||
|
||||
(reg_num(reg) == REG_P0);
|
||||
return (reg->flags & (IR3_REG_SHARED | IR3_REG_PREDICATE) ||
|
||||
(reg_num(reg) == REG_A0));
|
||||
}
|
||||
|
||||
/* Same as above but in cases where we don't have a register. r48.x and above
|
||||
|
|
@ -1381,9 +1383,9 @@ conflicts(struct ir3_register *a, struct ir3_register *b)
|
|||
static inline bool
|
||||
reg_gpr(struct ir3_register *r)
|
||||
{
|
||||
if (r->flags & (IR3_REG_CONST | IR3_REG_IMMED))
|
||||
if (r->flags & (IR3_REG_CONST | IR3_REG_IMMED | IR3_REG_PREDICATE))
|
||||
return false;
|
||||
if ((reg_num(r) == REG_A0) || (reg_num(r) == REG_P0))
|
||||
if (reg_num(r) == REG_A0)
|
||||
return false;
|
||||
return true;
|
||||
}
|
||||
|
|
@ -1999,6 +2001,7 @@ bool ir3_postsched(struct ir3 *ir, struct ir3_shader_variant *v);
|
|||
|
||||
/* register assignment: */
|
||||
int ir3_ra(struct ir3_shader_variant *v);
|
||||
void ir3_ra_predicates(struct ir3_shader_variant *v);
|
||||
|
||||
/* lower subgroup ops: */
|
||||
bool ir3_lower_subgroups(struct ir3 *ir);
|
||||
|
|
|
|||
|
|
@ -156,6 +156,8 @@ ir3_compiler_create(struct fd_device *dev, const struct fd_dev_id *dev_id,
|
|||
|
||||
compiler->local_mem_size = dev_info->cs_shared_mem_size;
|
||||
|
||||
compiler->num_predicates = 1;
|
||||
|
||||
if (compiler->gen >= 6) {
|
||||
compiler->samgq_workaround = true;
|
||||
/* a6xx split the pipeline state into geometry and fragment state, in
|
||||
|
|
@ -212,6 +214,7 @@ ir3_compiler_create(struct fd_device *dev, const struct fd_dev_id *dev_id,
|
|||
compiler->stsc_duplication_quirk = dev_info->a7xx.stsc_duplication_quirk;
|
||||
compiler->load_shader_consts_via_preamble = dev_info->a7xx.load_shader_consts_via_preamble;
|
||||
compiler->load_inline_uniforms_via_preamble_ldgk = dev_info->a7xx.load_inline_uniforms_via_preamble_ldgk;
|
||||
compiler->num_predicates = 4;
|
||||
} else {
|
||||
compiler->max_const_pipeline = 512;
|
||||
compiler->max_const_geom = 512;
|
||||
|
|
|
|||
|
|
@ -214,6 +214,9 @@ struct ir3_compiler {
|
|||
*/
|
||||
bool has_getfiberid;
|
||||
|
||||
/* Number of available predicate registers (p0.c) */
|
||||
uint32_t num_predicates;
|
||||
|
||||
/* MAX_COMPUTE_VARIABLE_GROUP_INVOCATIONS_ARB */
|
||||
uint32_t max_variable_workgroup_size;
|
||||
|
||||
|
|
|
|||
|
|
@ -2614,8 +2614,7 @@ emit_intrinsic(struct ir3_context *ctx, nir_intrinsic_instr *intr)
|
|||
cond->cat2.condition = IR3_COND_NE;
|
||||
|
||||
/* condition always goes in predicate register: */
|
||||
cond->dsts[0]->num = regid(REG_P0, 0);
|
||||
cond->dsts[0]->flags &= ~IR3_REG_SSA;
|
||||
cond->dsts[0]->flags |= IR3_REG_PREDICATE;
|
||||
|
||||
if (intr->intrinsic == nir_intrinsic_demote ||
|
||||
intr->intrinsic == nir_intrinsic_demote_if) {
|
||||
|
|
@ -2631,8 +2630,7 @@ emit_intrinsic(struct ir3_context *ctx, nir_intrinsic_instr *intr)
|
|||
IR3_BARRIER_ACTIVE_FIBERS_W;
|
||||
kill->barrier_conflict = IR3_BARRIER_IMAGE_W | IR3_BARRIER_BUFFER_W |
|
||||
IR3_BARRIER_ACTIVE_FIBERS_R;
|
||||
kill->srcs[0]->num = regid(REG_P0, 0);
|
||||
array_insert(ctx->ir, ctx->ir->predicates, kill);
|
||||
kill->srcs[0]->flags |= IR3_REG_PREDICATE;
|
||||
|
||||
array_insert(b, b->keeps, kill);
|
||||
ctx->so->has_kill = true;
|
||||
|
|
@ -2653,14 +2651,13 @@ emit_intrinsic(struct ir3_context *ctx, nir_intrinsic_instr *intr)
|
|||
cond->cat2.condition = IR3_COND_NE;
|
||||
|
||||
/* condition always goes in predicate register: */
|
||||
cond->dsts[0]->num = regid(REG_P0, 0);
|
||||
cond->dsts[0]->flags |= IR3_REG_PREDICATE;
|
||||
|
||||
kill = ir3_PREDT(b, cond, 0);
|
||||
kill = ir3_PREDT(b, cond, IR3_REG_PREDICATE);
|
||||
|
||||
kill->barrier_class = IR3_BARRIER_EVERYTHING;
|
||||
kill->barrier_conflict = IR3_BARRIER_EVERYTHING;
|
||||
|
||||
array_insert(ctx->ir, ctx->ir->predicates, kill);
|
||||
array_insert(b, b->keeps, kill);
|
||||
break;
|
||||
}
|
||||
|
|
@ -2673,8 +2670,7 @@ emit_intrinsic(struct ir3_context *ctx, nir_intrinsic_instr *intr)
|
|||
dst[0] = ir3_ANY_MACRO(ctx->block, pred, 0);
|
||||
else
|
||||
dst[0] = ir3_ALL_MACRO(ctx->block, pred, 0);
|
||||
dst[0]->srcs[0]->num = regid(REG_P0, 0);
|
||||
array_insert(ctx->ir, ctx->ir->predicates, dst[0]);
|
||||
dst[0]->srcs[0]->flags |= IR3_REG_PREDICATE;
|
||||
break;
|
||||
}
|
||||
case nir_intrinsic_elect:
|
||||
|
|
@ -2690,8 +2686,7 @@ emit_intrinsic(struct ir3_context *ctx, nir_intrinsic_instr *intr)
|
|||
dst[0] = ir3_READ_COND_MACRO(ctx->block, ir3_get_predicate(ctx, cond), 0,
|
||||
src, 0);
|
||||
dst[0]->dsts[0]->flags |= IR3_REG_SHARED;
|
||||
dst[0]->srcs[0]->num = regid(REG_P0, 0);
|
||||
array_insert(ctx->ir, ctx->ir->predicates, dst[0]);
|
||||
dst[0]->srcs[0]->flags |= IR3_REG_PREDICATE;
|
||||
break;
|
||||
}
|
||||
|
||||
|
|
@ -2712,8 +2707,7 @@ emit_intrinsic(struct ir3_context *ctx, nir_intrinsic_instr *intr)
|
|||
struct ir3_instruction *src = ir3_get_src(ctx, &intr->src[0])[0];
|
||||
struct ir3_instruction *pred = ir3_get_predicate(ctx, src);
|
||||
ballot = ir3_BALLOT_MACRO(ctx->block, pred, components);
|
||||
ballot->srcs[0]->num = regid(REG_P0, 0);
|
||||
array_insert(ctx->ir, ctx->ir->predicates, ballot);
|
||||
ballot->srcs[0]->flags |= IR3_REG_PREDICATE;
|
||||
}
|
||||
|
||||
ballot->barrier_class = IR3_BARRIER_ACTIVE_FIBERS_R;
|
||||
|
|
@ -3820,11 +3814,11 @@ emit_if(struct ir3_context *ctx, nir_if *nif)
|
|||
|
||||
if (condition->opc == OPC_ANY_MACRO && condition->block == ctx->block) {
|
||||
struct ir3_instruction *pred = ssa(condition->srcs[0]);
|
||||
ir3_BANY(ctx->block, pred, 0);
|
||||
ir3_BANY(ctx->block, pred, IR3_REG_PREDICATE);
|
||||
} else if (condition->opc == OPC_ALL_MACRO &&
|
||||
condition->block == ctx->block) {
|
||||
struct ir3_instruction *pred = ssa(condition->srcs[0]);
|
||||
ir3_BALL(ctx->block, pred, 0);
|
||||
ir3_BALL(ctx->block, pred, IR3_REG_PREDICATE);
|
||||
} else if (condition->opc == OPC_ELECT_MACRO &&
|
||||
condition->block == ctx->block) {
|
||||
ir3_GETONE(ctx->block);
|
||||
|
|
@ -3837,7 +3831,7 @@ emit_if(struct ir3_context *ctx, nir_if *nif)
|
|||
ir3_SHPS(ctx->block);
|
||||
} else {
|
||||
struct ir3_instruction *pred = ir3_get_predicate(ctx, condition);
|
||||
ir3_BR(ctx->block, pred, 0);
|
||||
ir3_BR(ctx->block, pred, IR3_REG_PREDICATE);
|
||||
}
|
||||
|
||||
emit_cf_list(ctx, &nif->then_list);
|
||||
|
|
@ -3959,15 +3953,14 @@ emit_stream_out(struct ir3_context *ctx)
|
|||
|
||||
/* setup 'if (vtxcnt < maxvtxcnt)' condition: */
|
||||
cond = ir3_CMPS_S(ctx->block, vtxcnt, 0, maxvtxcnt, 0);
|
||||
cond->dsts[0]->num = regid(REG_P0, 0);
|
||||
cond->dsts[0]->flags &= ~IR3_REG_SSA;
|
||||
cond->dsts[0]->flags |= IR3_REG_PREDICATE;
|
||||
cond->cat2.condition = IR3_COND_LT;
|
||||
|
||||
/* condition goes on previous block to the conditional,
|
||||
* since it is used to pick which of the two successor
|
||||
* paths to take:
|
||||
*/
|
||||
ir3_BR(orig_end_block, cond, 0);
|
||||
ir3_BR(orig_end_block, cond, IR3_REG_PREDICATE);
|
||||
|
||||
/* switch to stream_out_block to generate the stream-out
|
||||
* instructions:
|
||||
|
|
|
|||
|
|
@ -463,15 +463,14 @@ ir3_get_predicate(struct ir3_context *ctx, struct ir3_instruction *src)
|
|||
struct ir3_block *b = ctx->block;
|
||||
struct ir3_instruction *cond;
|
||||
|
||||
/* NOTE: only cmps.*.* can write p0.x: */
|
||||
/* NOTE: we use cmps.s.ne x, 0 to move x into a predicate register */
|
||||
struct ir3_instruction *zero =
|
||||
create_immed_typed(b, 0, is_half(src) ? TYPE_U16 : TYPE_U32);
|
||||
cond = ir3_CMPS_S(b, src, 0, zero, 0);
|
||||
cond->cat2.condition = IR3_COND_NE;
|
||||
|
||||
/* condition always goes in predicate register: */
|
||||
cond->dsts[0]->num = regid(REG_P0, 0);
|
||||
cond->dsts[0]->flags &= ~IR3_REG_SSA;
|
||||
cond->dsts[0]->flags |= IR3_REG_PREDICATE;
|
||||
|
||||
return cond;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -100,7 +100,7 @@ static bool
|
|||
is_foldable_double_cmp(struct ir3_instruction *cmp)
|
||||
{
|
||||
struct ir3_instruction *cond = ssa(cmp->srcs[0]);
|
||||
return (cmp->dsts[0]->num == regid(REG_P0, 0)) && cond &&
|
||||
return (cmp->dsts[0]->flags & IR3_REG_PREDICATE) && cond &&
|
||||
(cmp->srcs[1]->flags & IR3_REG_IMMED) &&
|
||||
(cmp->srcs[1]->iim_val == 0) &&
|
||||
(cmp->cat2.condition == IR3_COND_NE) &&
|
||||
|
|
|
|||
|
|
@ -133,12 +133,6 @@ find_and_remove_unused(struct ir3 *ir, struct ir3_shader_variant *so)
|
|||
struct ir3_instruction *terminator = ir3_block_get_terminator(block);
|
||||
if (terminator) {
|
||||
instr_dce(terminator, false);
|
||||
|
||||
/* Temporary workaround for predicates not being SSA. Won't be
|
||||
* necessary anymore once we have RA for predicates.
|
||||
*/
|
||||
foreach_src (src, terminator)
|
||||
instr_dce(src->def->instr, false);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -181,12 +175,6 @@ find_and_remove_unused(struct ir3 *ir, struct ir3_shader_variant *so)
|
|||
ir->a1_users[i] = NULL;
|
||||
}
|
||||
|
||||
for (i = 0; i < ir->predicates_count; i++) {
|
||||
struct ir3_instruction *instr = ir->predicates[i];
|
||||
if (instr && (instr->flags & IR3_INSTR_UNUSED))
|
||||
ir->predicates[i] = NULL;
|
||||
}
|
||||
|
||||
/* cleanup unused inputs: */
|
||||
foreach_input_n (in, n, ir)
|
||||
if (in->flags & IR3_INSTR_UNUSED)
|
||||
|
|
|
|||
|
|
@ -727,9 +727,6 @@ invert_branch(struct ir3_instruction *branch)
|
|||
* Initially this is done naively, without considering if the successor
|
||||
* block immediately follows the current block (ie. so no jump required),
|
||||
* but that is cleaned up in opt_jump().
|
||||
*
|
||||
* TODO what ensures that the last write to p0.x in a block is the
|
||||
* branch condition? Have we been getting lucky all this time?
|
||||
*/
|
||||
static void
|
||||
block_sched(struct ir3 *ir)
|
||||
|
|
|
|||
|
|
@ -845,11 +845,11 @@ instr: iflags cat0_instr
|
|||
|
||||
label: T_IDENTIFIER ':' { new_label($1); }
|
||||
|
||||
cat0_src1: '!' T_P0 { instr->cat0.inv1 = true; $$ = new_src((62 << 3) + $2, 0); }
|
||||
| T_P0 { $$ = new_src((62 << 3) + $1, 0); }
|
||||
cat0_src1: '!' T_P0 { instr->cat0.inv1 = true; $$ = new_src((62 << 3) + $2, IR3_REG_PREDICATE); }
|
||||
| T_P0 { $$ = new_src((62 << 3) + $1, IR3_REG_PREDICATE); }
|
||||
|
||||
cat0_src2: '!' T_P0 { instr->cat0.inv2 = true; $$ = new_src((62 << 3) + $2, 0); }
|
||||
| T_P0 { $$ = new_src((62 << 3) + $1, 0); }
|
||||
cat0_src2: '!' T_P0 { instr->cat0.inv2 = true; $$ = new_src((62 << 3) + $2, IR3_REG_PREDICATE); }
|
||||
| T_P0 { $$ = new_src((62 << 3) + $1, IR3_REG_PREDICATE); }
|
||||
|
||||
cat0_immed: '#' integer { instr->cat0.immed = $2; }
|
||||
| '#' T_IDENTIFIER { ralloc_steal(instr, (void *)$2); instr->cat0.target_label = $2; }
|
||||
|
|
@ -1434,12 +1434,12 @@ meta_print: meta_print_start meta_print_regs {
|
|||
src: T_REGISTER { $$ = new_src($1, 0); }
|
||||
| T_A0 { $$ = new_src((61 << 3), IR3_REG_HALF); }
|
||||
| T_A1 { $$ = new_src((61 << 3) + 1, IR3_REG_HALF); }
|
||||
| T_P0 { $$ = new_src((62 << 3) + $1, 0); }
|
||||
| T_P0 { $$ = new_src((62 << 3) + $1, IR3_REG_PREDICATE); }
|
||||
|
||||
dst: T_REGISTER { $$ = new_dst($1, 0); }
|
||||
| T_A0 { $$ = new_dst((61 << 3), IR3_REG_HALF); }
|
||||
| T_A1 { $$ = new_dst((61 << 3) + 1, IR3_REG_HALF); }
|
||||
| T_P0 { $$ = new_dst((62 << 3) + $1, 0); }
|
||||
| T_P0 { $$ = new_dst((62 << 3) + $1, IR3_REG_PREDICATE); }
|
||||
|
||||
const: T_CONSTANT { $$ = new_src($1, IR3_REG_CONST); }
|
||||
|
||||
|
|
|
|||
|
|
@ -257,9 +257,18 @@ print_ssa_name(struct log_stream *stream, struct ir3_register *reg, bool dst)
|
|||
print_ssa_def_name(stream, reg);
|
||||
}
|
||||
|
||||
if (reg->num != INVALID_REG && !(reg->flags & IR3_REG_ARRAY))
|
||||
mesa_log_stream_printf(stream, "(" SYN_REG("r%u.%c") ")", reg_num(reg),
|
||||
if (reg->num != INVALID_REG && !(reg->flags & IR3_REG_ARRAY)) {
|
||||
const char *prefix = "r";
|
||||
unsigned num = reg_num(reg);
|
||||
|
||||
if (reg->flags & IR3_REG_PREDICATE) {
|
||||
prefix = "p";
|
||||
num = 0;
|
||||
}
|
||||
|
||||
mesa_log_stream_printf(stream, "(" SYN_REG("%s%u.%c") ")", prefix, num,
|
||||
"xyzw"[reg_comp(reg)]);
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
|
|
@ -296,6 +305,8 @@ print_reg_name(struct log_stream *stream, struct ir3_instruction *instr,
|
|||
mesa_log_stream_printf(stream, "s");
|
||||
if (reg->flags & IR3_REG_HALF)
|
||||
mesa_log_stream_printf(stream, "h");
|
||||
if (reg->flags & IR3_REG_PREDICATE)
|
||||
mesa_log_stream_printf(stream, "p");
|
||||
|
||||
if (reg->flags & IR3_REG_IMMED) {
|
||||
mesa_log_stream_printf(stream, SYN_IMMED("imm[%f,%d,0x%x]"), reg->fim_val,
|
||||
|
|
@ -325,6 +336,9 @@ print_reg_name(struct log_stream *stream, struct ir3_instruction *instr,
|
|||
if (reg->flags & IR3_REG_CONST)
|
||||
mesa_log_stream_printf(stream, SYN_CONST("c%u.%c"), reg_num(reg),
|
||||
"xyzw"[reg_comp(reg)]);
|
||||
else if (reg->flags & IR3_REG_PREDICATE)
|
||||
mesa_log_stream_printf(stream, SYN_REG("p0.%c"),
|
||||
"xyzw"[reg_comp(reg)]);
|
||||
else
|
||||
mesa_log_stream_printf(stream, SYN_REG("r%u.%c"), reg_num(reg),
|
||||
"xyzw"[reg_comp(reg)]);
|
||||
|
|
|
|||
|
|
@ -2544,6 +2544,9 @@ ir3_ra(struct ir3_shader_variant *v)
|
|||
{
|
||||
ir3_calc_dominance(v->ir);
|
||||
|
||||
/* Predicate RA needs dominance. */
|
||||
ir3_ra_predicates(v);
|
||||
|
||||
ir3_create_parallel_copies(v->ir);
|
||||
|
||||
struct ra_ctx *ctx = rzalloc(NULL, struct ra_ctx);
|
||||
|
|
|
|||
|
|
@ -87,7 +87,7 @@ ra_reg_get_physreg(const struct ir3_register *reg)
|
|||
static inline bool
|
||||
def_is_gpr(const struct ir3_register *reg)
|
||||
{
|
||||
return reg_num(reg) != REG_A0 && reg_num(reg) != REG_P0;
|
||||
return reg_num(reg) != REG_A0 && !(reg->flags & IR3_REG_PREDICATE);
|
||||
}
|
||||
|
||||
/* Note: don't count undef as a source.
|
||||
|
|
@ -105,6 +105,12 @@ ra_reg_is_dst(const struct ir3_register *reg)
|
|||
((reg->flags & IR3_REG_ARRAY) || reg->wrmask);
|
||||
}
|
||||
|
||||
static inline bool
|
||||
ra_reg_is_predicate(const struct ir3_register *reg)
|
||||
{
|
||||
return (reg->flags & IR3_REG_SSA) && (reg->flags & IR3_REG_PREDICATE);
|
||||
}
|
||||
|
||||
/* Iterators for sources and destinations which:
|
||||
* - Don't include fake sources (irrelevant for RA)
|
||||
* - Don't include non-SSA sources (immediates and constants, also irrelevant)
|
||||
|
|
|
|||
467
src/freedreno/ir3/ir3_ra_predicates.c
Normal file
467
src/freedreno/ir3/ir3_ra_predicates.c
Normal file
|
|
@ -0,0 +1,467 @@
|
|||
/*
|
||||
* Copyright 2024 Igalia S.L.
|
||||
* SPDX-License-Identifier: MIT
|
||||
*/
|
||||
|
||||
#include "ir3.h"
|
||||
#include "ir3_ra.h"
|
||||
#include "ir3_shader.h"
|
||||
|
||||
/* Represents a def that is currently live. We keep track of both the pre-RA def
|
||||
* a register refers to and, in case of spilling and reloading, the def of the
|
||||
* reloaded instruction. This allows us to assign reloaded defs to sources and
|
||||
* prevents additional reloads.
|
||||
*/
|
||||
struct live_def {
|
||||
/* The pre-RA def. */
|
||||
struct ir3_register *def;
|
||||
|
||||
/* The reloaded def. NULL if def was not reloaded. */
|
||||
struct ir3_register *reloaded_def;
|
||||
|
||||
/* Set when used for a src marked first-kill. We cannot immediately free the
|
||||
* register because then it might get reused for another src in the same
|
||||
* instruction. Instead, we free it after an instruction's sources have been
|
||||
* processed.
|
||||
*/
|
||||
bool killed;
|
||||
};
|
||||
|
||||
/* Per-block liveness information. Stores live defs per supported register,
|
||||
* indexed by register component.
|
||||
*/
|
||||
struct block_liveness {
|
||||
/* Live-in defs taken from the intersection of the block's predecessors'
|
||||
* live-out defs.
|
||||
*/
|
||||
struct live_def *live_in_defs;
|
||||
|
||||
/* Currently live defs. Starts from live-in and is updated while processing
|
||||
* the instructions in a block. Contains the live-out defs after the whole
|
||||
* block has been processed.
|
||||
*/
|
||||
struct live_def *live_defs;
|
||||
};
|
||||
|
||||
struct ra_predicates_ctx {
|
||||
struct ir3 *ir;
|
||||
unsigned num_regs;
|
||||
struct ir3_liveness *liveness;
|
||||
struct block_liveness *blocks_liveness;
|
||||
|
||||
/* True once we spilled a register. This allows us to postpone the
|
||||
* calculation of SSA uses and instruction counting until the first time we
|
||||
* need to spill. This is useful since spilling is rare in general.
|
||||
*/
|
||||
bool spilled;
|
||||
};
|
||||
|
||||
static bool
|
||||
has_free_regs(struct ra_predicates_ctx *ctx, struct block_liveness *live)
|
||||
{
|
||||
for (unsigned i = 0; i < ctx->num_regs; ++i) {
|
||||
if (live->live_defs[i].def == NULL)
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
static unsigned
|
||||
alloc_reg_comp(struct ra_predicates_ctx *ctx, struct block_liveness *live)
|
||||
{
|
||||
for (unsigned i = 0; i < ctx->num_regs; ++i) {
|
||||
if (live->live_defs[i].def == NULL)
|
||||
return i;
|
||||
}
|
||||
|
||||
unreachable("Reg availability should have been checked before");
|
||||
}
|
||||
|
||||
static struct live_def *
|
||||
assign_reg(struct ra_predicates_ctx *ctx, struct block_liveness *live,
|
||||
struct ir3_register *def, struct ir3_register *reloaded_def,
|
||||
unsigned comp)
|
||||
{
|
||||
assert(comp < ctx->num_regs);
|
||||
|
||||
struct ir3_register *current_def =
|
||||
(reloaded_def == NULL) ? def : reloaded_def;
|
||||
|
||||
current_def->num = regid(REG_P0, comp);
|
||||
|
||||
struct live_def *live_def = &live->live_defs[comp];
|
||||
assert((live_def->def == NULL) && (live_def->reloaded_def == NULL));
|
||||
|
||||
live_def->def = def;
|
||||
live_def->reloaded_def = reloaded_def;
|
||||
return live_def;
|
||||
}
|
||||
|
||||
static struct live_def *
|
||||
alloc_reg(struct ra_predicates_ctx *ctx, struct block_liveness *live,
|
||||
struct ir3_register *def, struct ir3_register *reloaded_def)
|
||||
{
|
||||
unsigned comp = alloc_reg_comp(ctx, live);
|
||||
return assign_reg(ctx, live, def, reloaded_def, comp);
|
||||
}
|
||||
|
||||
static void
|
||||
free_reg(struct ra_predicates_ctx *ctx, struct block_liveness *live,
|
||||
struct ir3_register *reg)
|
||||
{
|
||||
assert((reg->flags & IR3_REG_PREDICATE) && (reg->num != INVALID_REG));
|
||||
|
||||
unsigned comp = reg_comp(reg);
|
||||
assert(comp < ctx->num_regs);
|
||||
|
||||
struct live_def *reg_live_def = &live->live_defs[comp];
|
||||
assert((reg_live_def->def == reg) || (reg_live_def->reloaded_def == reg));
|
||||
|
||||
reg_live_def->def = NULL;
|
||||
reg_live_def->reloaded_def = NULL;
|
||||
reg_live_def->killed = false;
|
||||
}
|
||||
|
||||
static struct ir3_instruction *
|
||||
first_non_allocated_use_after(struct ir3_register *def,
|
||||
struct ir3_instruction *after)
|
||||
{
|
||||
uint32_t first_ip = UINT32_MAX;
|
||||
struct ir3_instruction *first = NULL;
|
||||
|
||||
foreach_ssa_use (use, def->instr) {
|
||||
if (!ir3_block_dominates(after->block, use->block))
|
||||
continue;
|
||||
|
||||
/* Do not filter-out after itself. This ensures that if after is a use of
|
||||
* def, def will not get selected to get spilled because there must be
|
||||
* another register with a further first use. We have to ensure that def
|
||||
* doesn't get spilled in this case because otherwise, we might spill a
|
||||
* register used by an earlier source of after.
|
||||
*/
|
||||
if (use->ip < after->ip)
|
||||
continue;
|
||||
|
||||
foreach_ssa_src_n (src, n, use) {
|
||||
if (__is_false_dep(use, n))
|
||||
continue;
|
||||
|
||||
struct ir3_register *src_reg = use->srcs[n];
|
||||
if (!ra_reg_is_predicate(src_reg) || src_reg->def != def)
|
||||
continue;
|
||||
if (use->ip >= first_ip)
|
||||
continue;
|
||||
|
||||
first_ip = use->ip;
|
||||
first = use;
|
||||
}
|
||||
}
|
||||
|
||||
return first;
|
||||
}
|
||||
|
||||
static bool
|
||||
is_predicate_use(struct ir3_instruction *instr, unsigned src_n)
|
||||
{
|
||||
if (__is_false_dep(instr, src_n))
|
||||
return false;
|
||||
return ra_reg_is_predicate(instr->srcs[src_n]);
|
||||
}
|
||||
|
||||
/* Spill a register by simply removing one from the live defs. We don't need to
|
||||
* store its value anywhere since it can be rematerialized (see reload). We
|
||||
* chose the register whose def's first use is the furthest.
|
||||
*/
|
||||
static void
|
||||
spill(struct ra_predicates_ctx *ctx, struct block_liveness *live,
|
||||
struct ir3_instruction *spill_location)
|
||||
{
|
||||
if (!ctx->spilled) {
|
||||
ir3_count_instructions_ra(ctx->ir);
|
||||
ir3_find_ssa_uses_for(ctx->ir, ctx, is_predicate_use);
|
||||
ctx->spilled = true;
|
||||
}
|
||||
|
||||
unsigned furthest_first_use = 0;
|
||||
unsigned spill_reg = ~0;
|
||||
|
||||
for (unsigned i = 0; i < ctx->num_regs; ++i) {
|
||||
struct ir3_register *candidate = live->live_defs[i].def;
|
||||
assert(candidate != NULL);
|
||||
|
||||
struct ir3_instruction *first_use =
|
||||
first_non_allocated_use_after(candidate, spill_location);
|
||||
|
||||
if (first_use == NULL) {
|
||||
spill_reg = i;
|
||||
break;
|
||||
}
|
||||
|
||||
if (first_use->ip > furthest_first_use) {
|
||||
furthest_first_use = first_use->ip;
|
||||
spill_reg = i;
|
||||
}
|
||||
}
|
||||
|
||||
assert(spill_reg != ~0);
|
||||
|
||||
live->live_defs[spill_reg].def = NULL;
|
||||
live->live_defs[spill_reg].reloaded_def = NULL;
|
||||
}
|
||||
|
||||
static struct live_def *
|
||||
find_live_def(struct ra_predicates_ctx *ctx, struct block_liveness *live,
|
||||
struct ir3_register *def)
|
||||
{
|
||||
for (unsigned i = 0; i < ctx->num_regs; ++i) {
|
||||
struct live_def *live_def = &live->live_defs[i];
|
||||
if (live_def->def == def)
|
||||
return live_def;
|
||||
}
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static struct ir3_register *
|
||||
get_def(struct live_def *live_def)
|
||||
{
|
||||
return live_def->reloaded_def == NULL ? live_def->def
|
||||
: live_def->reloaded_def;
|
||||
}
|
||||
|
||||
/* Reload a def into a specific register, which must be free. Reloading is
|
||||
* implemented by cloning the instruction that produced the def and moving it in
|
||||
* front of the use.
|
||||
*/
|
||||
static struct live_def *
|
||||
reload_into(struct ra_predicates_ctx *ctx, struct block_liveness *live,
|
||||
struct ir3_register *def, struct ir3_instruction *use,
|
||||
unsigned comp)
|
||||
{
|
||||
struct ir3_instruction *reloaded_instr = NULL;
|
||||
bool def_is_allocated = !(def->flags & IR3_REG_UNUSED);
|
||||
|
||||
if (!def_is_allocated && use->block == def->instr->block) {
|
||||
/* If def has not been allocated a register yet, no source is currently
|
||||
* using it. If it's in the same block as the current use, just move it in
|
||||
* front of it.
|
||||
*/
|
||||
reloaded_instr = def->instr;
|
||||
} else {
|
||||
/* If the def is either 1) already allocated or 2) in a different block
|
||||
* than the current use, we have to clone it. For 1) because its allocated
|
||||
* register isn't currently live (we wouldn't be reloading it otherwise).
|
||||
* For 2) because it might have other uses in blocks that aren't
|
||||
* successors of the use.
|
||||
*/
|
||||
reloaded_instr = ir3_instr_clone(def->instr);
|
||||
}
|
||||
|
||||
reloaded_instr->block = use->block;
|
||||
ir3_instr_move_before(reloaded_instr, use);
|
||||
struct ir3_register *reloaded_def = reloaded_instr->dsts[0];
|
||||
return assign_reg(ctx, live, def, reloaded_def, comp);
|
||||
}
|
||||
|
||||
/* Reload a def into a register, spilling one if necessary. */
|
||||
static struct live_def *
|
||||
reload(struct ra_predicates_ctx *ctx, struct block_liveness *live,
|
||||
struct ir3_register *def, struct ir3_instruction *use)
|
||||
{
|
||||
if (!has_free_regs(ctx, live))
|
||||
spill(ctx, live, use);
|
||||
|
||||
unsigned comp = alloc_reg_comp(ctx, live);
|
||||
return reload_into(ctx, live, def, use, comp);
|
||||
}
|
||||
|
||||
static int
|
||||
ra_block(struct ra_predicates_ctx *ctx, struct ir3_block *block)
|
||||
{
|
||||
struct block_liveness *live = &ctx->blocks_liveness[block->index];
|
||||
|
||||
foreach_instr (instr, &block->instr_list) {
|
||||
/* Assign registers to sources based on their defs. */
|
||||
foreach_src (src, instr) {
|
||||
if (!ra_reg_is_predicate(src))
|
||||
continue;
|
||||
|
||||
struct live_def *live_def = find_live_def(ctx, live, src->def);
|
||||
if (live_def == NULL)
|
||||
live_def = reload(ctx, live, src->def, instr);
|
||||
|
||||
assert(live_def != NULL);
|
||||
|
||||
struct ir3_register *def = get_def(live_def);
|
||||
src->num = def->num;
|
||||
src->def = def;
|
||||
|
||||
/* Mark the def as used to make sure we won't move it anymore. */
|
||||
def->flags &= ~IR3_REG_UNUSED;
|
||||
|
||||
/* If this source kills the def, don't free the register right away to
|
||||
* prevent it being reused for another source of this instruction. We
|
||||
* can free it after all sources of this instruction have been
|
||||
* processed.
|
||||
*/
|
||||
if (src->flags & IR3_REG_FIRST_KILL)
|
||||
live_def->killed = true;
|
||||
}
|
||||
|
||||
/* After all sources of an instruction have been processed, we can free
|
||||
* the registers that were killed by a source.
|
||||
*/
|
||||
for (unsigned reg = 0; reg < ctx->num_regs; ++reg) {
|
||||
struct live_def *live_def = &live->live_defs[reg];
|
||||
if (live_def->def == NULL)
|
||||
continue;
|
||||
|
||||
if (live_def->killed)
|
||||
free_reg(ctx, live, get_def(live_def));
|
||||
}
|
||||
|
||||
/* Allocate registers for new defs. */
|
||||
foreach_dst (dst, instr) {
|
||||
if (!ra_reg_is_predicate(dst))
|
||||
continue;
|
||||
|
||||
/* Mark it as unused until we encounter the first use. This allows us
|
||||
* to know when it is legal to move the instruction.
|
||||
*/
|
||||
dst->flags |= IR3_REG_UNUSED;
|
||||
|
||||
/* If we don't have any free registers, ignore the def for now. If we
|
||||
* start spilling right away, we might end-up with a cascade of spills
|
||||
* when there are a lot of defs before their first uses.
|
||||
*/
|
||||
if (!has_free_regs(ctx, live))
|
||||
continue;
|
||||
|
||||
alloc_reg(ctx, live, dst, NULL);
|
||||
}
|
||||
}
|
||||
|
||||
/* Process loop back edges. Since we ignore them while calculating a block's
|
||||
* live-in defs in init_block_liveness, we now make sure that we satisfy our
|
||||
* successor's live-in requirements by producing the correct defs in the
|
||||
* required registers.
|
||||
*/
|
||||
for (unsigned i = 0; i < 2; ++i) {
|
||||
struct ir3_block *succ = block->successors[i];
|
||||
if (!succ)
|
||||
continue;
|
||||
|
||||
struct live_def *succ_live_in =
|
||||
ctx->blocks_liveness[succ->index].live_in_defs;
|
||||
|
||||
/* If live_in_defs has not been set yet, it's not a back edge. */
|
||||
if (!succ_live_in)
|
||||
continue;
|
||||
|
||||
for (unsigned reg = 0; reg < ctx->num_regs; ++reg) {
|
||||
struct live_def *succ_def = &succ_live_in[reg];
|
||||
if (!succ_def->def)
|
||||
continue;
|
||||
|
||||
struct live_def *cur_def = &live->live_defs[reg];
|
||||
|
||||
/* Same def in the same register, nothing to be done. */
|
||||
if (cur_def->def == succ_def->def)
|
||||
continue;
|
||||
|
||||
/* Different def in the same register, free it first. */
|
||||
if (cur_def->def)
|
||||
free_reg(ctx, live, cur_def->def);
|
||||
|
||||
/* Reload the def in the required register right before the block's
|
||||
* terminator.
|
||||
*/
|
||||
struct ir3_instruction *use = ir3_block_get_terminator(block);
|
||||
reload_into(ctx, live, succ_def->def, use, reg);
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Propagate live-out defs of a block's predecessors to the block's live-in
|
||||
* defs. This takes the intersection of all predecessors live-out defs. That is,
|
||||
* a def will be live-in if it's live-out in the same register in all
|
||||
* predecessors.
|
||||
*/
|
||||
/* Propagate live-out defs of a block's predecessors to the block's live-in
 * defs. This takes the intersection of all predecessors live-out defs. That is,
 * a def will be live-in if it's live-out in the same register in all
 * predecessors.
 *
 * Side effects: allocates and fills live->live_defs (the working register
 * state for this block) and live->live_in_defs (a snapshot of the state at
 * block entry, used later to fix up loop back edges).
 */
static void
init_block_liveness(struct ra_predicates_ctx *ctx, struct ir3_block *block)
{
   struct block_liveness *live = &ctx->blocks_liveness[block->index];
   /* rzalloc: all registers start out empty (def == NULL). */
   live->live_defs = rzalloc_array(ctx, struct live_def, ctx->num_regs);
   /* Bitset of defs that are live-in according to the liveness analysis;
    * indexed by def->name.
    */
   BITSET_WORD *live_in = ctx->liveness->live_in[block->index];

   for (unsigned i = 0; i < block->predecessors_count; ++i) {
      struct ir3_block *pred = block->predecessors[i];
      assert(pred != NULL);

      struct block_liveness *pred_live = &ctx->blocks_liveness[pred->index];

      /* If the predecessor has not been processed yet it means it's the back
       * edge of a loop. We ignore it now, take the live-out defs of the block's
       * other predecessors, and make sure the live-out defs of the back edge
       * match this block's live-in defs after processing the back edge.
       */
      if (pred_live->live_defs == NULL)
         continue;

      for (unsigned reg = 0; reg < ctx->num_regs; ++reg) {
         struct live_def *cur_def = &live->live_defs[reg];
         struct live_def *pred_def = &pred_live->live_defs[reg];

         /* NOTE(review): only predecessor 0 seeds cur_def below. If
          * predecessor 0 is an unprocessed back edge (skipped above), no def
          * is ever seeded for this register and the live-in stays empty.
          * That is conservative (a reload may be inserted later), not wrong.
          */
         if (i == 0 && pred_def->def != NULL) {
            /* If the first predecessor has a def in reg, use it if it's live-in
             * in this block.
             */
            if (BITSET_TEST(live_in, pred_def->def->name))
               *cur_def = *pred_def;
         } else if (cur_def->def != pred_def->def) {
            /* Different predecessors have different live-out defs in reg so we
             * cannot use it as live-in.
             */
            cur_def->def = NULL;
            cur_def->reloaded_def = NULL;
         }
      }
   }

   /* Snapshot the block-entry state: live_defs will be mutated while
    * allocating this block, but back-edge fix-up needs the entry state.
    */
   live->live_in_defs = rzalloc_array(ctx, struct live_def, ctx->num_regs);
   memcpy(live->live_in_defs, live->live_defs,
          sizeof(struct live_def) * ctx->num_regs);
}
|
||||
|
||||
void
|
||||
ir3_ra_predicates(struct ir3_shader_variant *v)
|
||||
{
|
||||
struct ra_predicates_ctx *ctx = rzalloc(NULL, struct ra_predicates_ctx);
|
||||
ctx->ir = v->ir;
|
||||
ctx->num_regs = v->compiler->num_predicates;
|
||||
ctx->liveness = ir3_calc_liveness_for(ctx, v->ir, ra_reg_is_predicate,
|
||||
ra_reg_is_predicate);
|
||||
ctx->blocks_liveness =
|
||||
rzalloc_array(ctx, struct block_liveness, ctx->liveness->block_count);
|
||||
|
||||
foreach_block (block, &v->ir->block_list) {
|
||||
init_block_liveness(ctx, block);
|
||||
ra_block(ctx, block);
|
||||
}
|
||||
|
||||
/* Remove instructions that became unused. This happens when a def was never
|
||||
* used directly but only through its reloaded clones.
|
||||
* Note that index 0 in the liveness definitions is always NULL.
|
||||
*/
|
||||
for (unsigned i = 1; i < ctx->liveness->definitions_count; ++i) {
|
||||
struct ir3_register *def = ctx->liveness->definitions[i];
|
||||
|
||||
if (def->flags & IR3_REG_UNUSED)
|
||||
list_delinit(&def->instr->node);
|
||||
}
|
||||
|
||||
ralloc_free(ctx);
|
||||
}
|
||||
|
|
@ -97,9 +97,8 @@ struct ir3_sched_ctx {
|
|||
struct ir3_instruction *scheduled; /* last scheduled instr */
|
||||
struct ir3_instruction *addr0; /* current a0.x user, if any */
|
||||
struct ir3_instruction *addr1; /* current a1.x user, if any */
|
||||
struct ir3_instruction *pred; /* current p0.x user, if any */
|
||||
|
||||
struct ir3_instruction *split; /* most-recently-split a0/a1/p0 producer */
|
||||
struct ir3_instruction *split; /* most-recently-split a0/a1 producer */
|
||||
|
||||
int remaining_kills;
|
||||
int remaining_tex;
|
||||
|
|
@ -277,11 +276,6 @@ schedule(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
|
|||
ctx->addr1 = instr;
|
||||
}
|
||||
|
||||
if (writes_pred(instr)) {
|
||||
assert(ctx->pred == NULL);
|
||||
ctx->pred = instr;
|
||||
}
|
||||
|
||||
instr->flags |= IR3_INSTR_MARK;
|
||||
|
||||
di(instr, "schedule");
|
||||
|
|
@ -366,9 +360,9 @@ struct ir3_sched_notes {
|
|||
*/
|
||||
bool blocked_kill;
|
||||
/* there is at least one instruction that could be scheduled,
|
||||
* except for conflicting address/predicate register usage:
|
||||
* except for conflicting address register usage:
|
||||
*/
|
||||
bool addr0_conflict, addr1_conflict, pred_conflict;
|
||||
bool addr0_conflict, addr1_conflict;
|
||||
};
|
||||
|
||||
static bool
|
||||
|
|
@ -485,12 +479,6 @@ check_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
|
|||
return false;
|
||||
}
|
||||
|
||||
if (writes_pred(instr) && ctx->pred) {
|
||||
assert(ctx->pred != instr);
|
||||
notes->pred_conflict = true;
|
||||
return false;
|
||||
}
|
||||
|
||||
/* if the instruction is a kill, we need to ensure *every*
|
||||
* bary.f is scheduled. The hw seems unhappy if the thread
|
||||
* gets killed before the end-input (ei) flag is hit.
|
||||
|
|
@ -955,58 +943,6 @@ split_addr(struct ir3_sched_ctx *ctx, struct ir3_instruction **addr,
|
|||
return new_addr;
|
||||
}
|
||||
|
||||
/* "spill" the predicate register by remapping any unscheduled
 * instructions which depend on the current predicate register
 * to a clone of the instruction which wrote the predicate register.
 *
 * Returns the clone (unscheduled), or NULL if no unscheduled user of
 * ctx->pred was found. Always clears ctx->pred so a new producer can be
 * scheduled.
 */
static struct ir3_instruction *
split_pred(struct ir3_sched_ctx *ctx)
{
   struct ir3 *ir;
   struct ir3_instruction *new_pred = NULL;
   unsigned i;

   assert(ctx->pred);

   ir = ctx->pred->block->shader;

   /* Scan all predicated instructions in the shader for unscheduled users
    * of the current predicate producer.
    */
   for (i = 0; i < ir->predicates_count; i++) {
      struct ir3_instruction *predicated = ir->predicates[i];

      if (!predicated)
         continue;

      /* skip instructions already scheduled: */
      if (is_scheduled(predicated))
         continue;

      /* remap remaining instructions using current pred
       * to new pred:
       *
       * TODO is there ever a case when pred isn't first
       * (and only) src?
       */
      if (ssa(predicated->srcs[0]) == ctx->pred) {
         if (!new_pred) {
            /* Lazily clone only on the first remapped user. */
            new_pred = split_instr(ctx, ctx->pred);
            /* original pred is scheduled, but new one isn't: */
            new_pred->flags &= ~IR3_INSTR_MARK;
         }
         predicated->srcs[0]->def->instr = new_pred;
         /* don't need to remove old dag edge since old pred is
          * already scheduled:
          */
         sched_node_add_dep(predicated, new_pred, 0);
         di(predicated, "new predicate");
      }
   }

   /* all remaining predicated remapped to new pred: */
   ctx->pred = NULL;

   return new_pred;
}
|
||||
|
||||
static void
|
||||
sched_node_init(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
|
||||
{
|
||||
|
|
@ -1182,7 +1118,6 @@ sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
|
|||
/* addr/pred writes are per-block: */
|
||||
ctx->addr0 = NULL;
|
||||
ctx->addr1 = NULL;
|
||||
ctx->pred = NULL;
|
||||
ctx->sy_delay = 0;
|
||||
ctx->ss_delay = 0;
|
||||
ctx->sy_index = ctx->first_outstanding_sy_index = 0;
|
||||
|
|
@ -1267,8 +1202,6 @@ sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
|
|||
} else if (notes.addr1_conflict) {
|
||||
new_instr =
|
||||
split_addr(ctx, &ctx->addr1, ir->a1_users, ir->a1_users_count);
|
||||
} else if (notes.pred_conflict) {
|
||||
new_instr = split_pred(ctx);
|
||||
} else {
|
||||
d("unscheduled_list:");
|
||||
foreach_instr (instr, &ctx->unscheduled_list)
|
||||
|
|
|
|||
|
|
@ -65,7 +65,7 @@ validate_error(struct ir3_validate_ctx *ctx, const char *condstr)
|
|||
static unsigned
|
||||
reg_class_flags(struct ir3_register *reg)
|
||||
{
|
||||
return reg->flags & (IR3_REG_HALF | IR3_REG_SHARED);
|
||||
return reg->flags & (IR3_REG_HALF | IR3_REG_SHARED | IR3_REG_PREDICATE);
|
||||
}
|
||||
|
||||
static void
|
||||
|
|
@ -78,6 +78,9 @@ validate_src(struct ir3_validate_ctx *ctx, struct ir3_instruction *instr,
|
|||
if (!(reg->flags & IR3_REG_SSA) || !reg->def)
|
||||
return;
|
||||
|
||||
if (reg->flags & IR3_REG_PREDICATE)
|
||||
validate_assert(ctx, !(reg->flags & (IR3_REG_SHARED | IR3_REG_HALF)));
|
||||
|
||||
struct ir3_register *src = reg->def;
|
||||
|
||||
validate_assert(ctx, _mesa_set_search(ctx->defs, src->instr));
|
||||
|
|
|
|||
|
|
@ -108,6 +108,7 @@ libfreedreno_ir3_files = files(
|
|||
'ir3_print.c',
|
||||
'ir3_ra.c',
|
||||
'ir3_ra.h',
|
||||
'ir3_ra_predicates.c',
|
||||
'ir3_ra_validate.c',
|
||||
'ir3_reconvergence.c',
|
||||
'ir3_remove_unreachable.c',
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue