diff --git a/src/freedreno/ir3/instr-a3xx.h b/src/freedreno/ir3/instr-a3xx.h
index e17ace592d9..86fd5574d2b 100644
--- a/src/freedreno/ir3/instr-a3xx.h
+++ b/src/freedreno/ir3/instr-a3xx.h
@@ -523,6 +523,7 @@ regid(int num, int comp)
 /* special registers: */
 #define REG_A0 61 /* address register */
 #define REG_P0 62 /* predicate register */
+#define REG_P0_X regid(REG_P0, 0) /* p0.x */
 
 /* With is_bindless_s2en = 1, this determines whether bindless is enabled and
  * if so, how to get the (base, index) pair for both sampler and texture.
diff --git a/src/freedreno/ir3/ir3.h b/src/freedreno/ir3/ir3.h
index dd1263d3a60..3bdf6cc2ef4 100644
--- a/src/freedreno/ir3/ir3.h
+++ b/src/freedreno/ir3/ir3.h
@@ -891,6 +891,8 @@ is_terminator(struct ir3_instruction *instr)
    case OPC_SHPS:
    case OPC_GETONE:
    case OPC_GETLAST:
+   case OPC_PREDT:
+   case OPC_PREDF:
       return true;
    default:
       return false;
@@ -2388,7 +2390,7 @@ INSTR0(END)
 INSTR0(CHSH)
 INSTR0(CHMASK)
 INSTR1NODST(PREDT)
-INSTR0(PREDF)
+INSTR1NODST(PREDF)
 INSTR0(PREDE)
 INSTR0(GETONE)
 INSTR0(GETLAST)
diff --git a/src/freedreno/ir3/ir3_compiler.c b/src/freedreno/ir3/ir3_compiler.c
index 4bd1ffd6372..769e6c562d9 100644
--- a/src/freedreno/ir3/ir3_compiler.c
+++ b/src/freedreno/ir3/ir3_compiler.c
@@ -222,6 +222,7 @@ ir3_compiler_create(struct fd_device *dev, const struct fd_dev_id *dev_id,
       compiler->num_predicates = 4;
       compiler->bitops_can_write_predicates = true;
       compiler->has_branch_and_or = true;
+      compiler->has_predication = true;
    } else {
       compiler->max_const_pipeline = 512;
       compiler->max_const_geom = 512;
diff --git a/src/freedreno/ir3/ir3_compiler.h b/src/freedreno/ir3/ir3_compiler.h
index 23f3aedc2c0..cd86462e291 100644
--- a/src/freedreno/ir3/ir3_compiler.h
+++ b/src/freedreno/ir3/ir3_compiler.h
@@ -226,6 +226,9 @@ struct ir3_compiler {
    /* True if braa/brao are available. */
    bool has_branch_and_or;
 
+   /* True if predt/predf/prede are supported. */
+   bool has_predication;
+
    /* MAX_COMPUTE_VARIABLE_GROUP_INVOCATIONS_ARB */
    uint32_t max_variable_workgroup_size;
 
diff --git a/src/freedreno/ir3/ir3_compiler_nir.c b/src/freedreno/ir3/ir3_compiler_nir.c
index fe9fea27e16..f5e89b4d0ec 100644
--- a/src/freedreno/ir3/ir3_compiler_nir.c
+++ b/src/freedreno/ir3/ir3_compiler_nir.c
@@ -3765,8 +3765,10 @@ emit_block(struct ir3_context *ctx, nir_block *nblock)
    /* Emit unconditional branch if we only have one successor. Conditional
     * branches are emitted in emit_if.
     */
-   if (ctx->block->successors[0] && !ctx->block->successors[1])
-      ir3_JUMP(ctx->block);
+   if (ctx->block->successors[0] && !ctx->block->successors[1]) {
+      if (!ir3_block_get_terminator(ctx->block))
+         ir3_JUMP(ctx->block);
+   }
 
    _mesa_hash_table_clear(ctx->sel_cond_conversions, NULL);
 }
@@ -3843,13 +3845,153 @@ fold_conditional_branch(struct ir3_context *ctx, struct nir_src *nir_cond)
    return branch;
 }
 
-static struct ir3_instruction *
-emit_conditional_branch(struct ir3_context *ctx, struct nir_src *nir_cond)
+static bool
+instr_can_be_predicated(nir_instr *instr)
 {
+   /* Anything that doesn't expand to control-flow can be predicated. */
+   switch (instr->type) {
+   case nir_instr_type_alu:
+   case nir_instr_type_deref:
+   case nir_instr_type_tex:
+   case nir_instr_type_load_const:
+   case nir_instr_type_undef:
+   case nir_instr_type_phi:
+   case nir_instr_type_parallel_copy:
+      return true;
+   case nir_instr_type_call:
+   case nir_instr_type_jump:
+      return false;
+   case nir_instr_type_intrinsic: {
+      nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
+      switch (intrin->intrinsic) {
+      case nir_intrinsic_reduce:
+      case nir_intrinsic_inclusive_scan:
+      case nir_intrinsic_exclusive_scan:
+      case nir_intrinsic_reduce_clusters_ir3:
+      case nir_intrinsic_inclusive_scan_clusters_ir3:
+      case nir_intrinsic_exclusive_scan_clusters_ir3:
+      case nir_intrinsic_brcst_active_ir3:
+      case nir_intrinsic_ballot:
+      case nir_intrinsic_elect:
+      case nir_intrinsic_read_invocation_cond_ir3:
+      case nir_intrinsic_discard_if:
+      case nir_intrinsic_discard:
+      case nir_intrinsic_demote:
+      case nir_intrinsic_demote_if:
+      case nir_intrinsic_terminate:
+      case nir_intrinsic_terminate_if:
+         return false;
+      default:
+         return true;
+      }
+   }
+   }
+
+   unreachable("Checked all cases");
+}
+
+static bool
+nif_can_be_predicated(nir_if *nif)
+{
+   /* For non-divergent branches, predication is more expensive than a branch
+    * because the latter can potentially skip all instructions.
+    */
+   if (!nir_src_is_divergent(nif->condition))
+      return false;
+
+   /* Although it could potentially be possible to allow a limited form of
+    * nested predication (e.g., by resetting the predication mask after a nested
+    * branch), let's avoid this for now and only use predication for leaf
+    * branches. That is, for ifs that contain exactly one block in both branches
+    * (note that they always contain at least one block).
+    */
+   if (!exec_list_is_singular(&nif->then_list) ||
+       !exec_list_is_singular(&nif->else_list)) {
+      return false;
+   }
+
+   nir_foreach_instr (instr, nir_if_first_then_block(nif)) {
+      if (!instr_can_be_predicated(instr))
+         return false;
+   }
+
+   nir_foreach_instr (instr, nir_if_first_else_block(nif)) {
+      if (!instr_can_be_predicated(instr))
+         return false;
+   }
+
+   return true;
+}
+
+/* A typical if-else block like this:
+ * if (cond) {
+ *     tblock;
+ * } else {
+ *     fblock;
+ * }
+ * Will be emitted as:
+ *        |-- i --|
+ *        | ...   |
+ *        | predt |
+ *        |-------|
+ *    succ0 /   \ succ1
+ * |-- i+1 --| |-- i+2 --|
+ * | tblock  | | fblock  |
+ * | predf   | | jump    |
+ * |---------| |---------|
+ *    succ0 \   / succ0
+ *        |-- j --|
+ *        |  ...  |
+ *        |-------|
+ * Where the numbers at the top of blocks are their indices. That is, the true
+ * block and false block are laid-out contiguously after the current block. This
+ * layout is verified during legalization in prede_sched which also inserts the
+ * final prede instruction. Note that we don't insert prede right away to allow
+ * opt_jump to optimize the jump in the false block.
+ */
+static struct ir3_instruction *
+emit_predicated_branch(struct ir3_context *ctx, nir_if *nif)
+{
+   if (!ctx->compiler->has_predication)
+      return NULL;
+   if (!nif_can_be_predicated(nif))
+      return NULL;
+
+   struct ir3_block *then_block = get_block(ctx, nir_if_first_then_block(nif));
+   struct ir3_block *else_block = get_block(ctx, nir_if_first_else_block(nif));
+   assert(list_is_empty(&then_block->instr_list) &&
+          list_is_empty(&else_block->instr_list));
+
+   bool inv;
+   struct ir3_instruction *condition =
+      get_branch_condition(ctx, &nif->condition, &inv);
+   struct ir3_instruction *pred, *pred_inv;
+
+   if (!inv) {
+      pred = ir3_PREDT(ctx->block, condition, IR3_REG_PREDICATE);
+      pred_inv = ir3_PREDF(then_block, condition, IR3_REG_PREDICATE);
+   } else {
+      pred = ir3_PREDF(ctx->block, condition, IR3_REG_PREDICATE);
+      pred_inv = ir3_PREDT(then_block, condition, IR3_REG_PREDICATE);
+   }
+
+   pred->srcs[0]->num = REG_P0_X;
+   pred_inv->srcs[0]->num = REG_P0_X;
+   return pred;
+}
+
+static struct ir3_instruction *
+emit_conditional_branch(struct ir3_context *ctx, nir_if *nif)
+{
+   nir_src *nir_cond = &nif->condition;
    struct ir3_instruction *folded = fold_conditional_branch(ctx, nir_cond);
    if (folded)
       return folded;
 
+   struct ir3_instruction *predicated = emit_predicated_branch(ctx, nif);
+   if (predicated)
+      return predicated;
+
    bool inv1;
    struct ir3_instruction *cond1 = get_branch_condition(ctx, nir_cond, &inv1);
    struct ir3_instruction *branch =
@@ -3881,7 +4023,7 @@ emit_if(struct ir3_context *ctx, nir_if *nif)
        */
       ir3_SHPS(ctx->block);
    } else {
-      emit_conditional_branch(ctx, &nif->condition);
+      emit_conditional_branch(ctx, nif);
    }
 
    ctx->block->divergent_condition = nif->condition.ssa->divergent;
diff --git a/src/freedreno/ir3/ir3_legalize.c b/src/freedreno/ir3/ir3_legalize.c
index b8148c236d9..b3c34ba5f3b 100644
--- a/src/freedreno/ir3/ir3_legalize.c
+++ b/src/freedreno/ir3/ir3_legalize.c
@@ -1038,7 +1038,8 @@ block_sched(struct ir3 *ir)
 
             br2 = ir3_JUMP(block);
             br2->cat0.target = block->successors[0];
-         } else {
+         } else if (opc == OPC_BR || opc == OPC_BRAA || opc == OPC_BRAO ||
+                    opc == OPC_BALL || opc == OPC_BANY) {
             /* create "else" branch first (since "then" block should
              * frequently/always end up being a fall-thru):
              */
@@ -1046,6 +1047,12 @@ block_sched(struct ir3 *ir)
             br2 = ir3_instr_clone(br1);
             invert_branch(br1);
             br2->cat0.target = block->successors[0];
+         } else {
+            assert(opc == OPC_PREDT || opc == OPC_PREDF);
+
+            /* Handled by prede_sched. */
+            terminator->cat0.target = block->successors[0];
+            continue;
          }
 
          /* Creating br2 caused it to be moved before the terminator b1, move it
@@ -1053,16 +1060,113 @@ block_sched(struct ir3 *ir)
           */
          ir3_instr_move_after(br2, br1);
       } else if (block->successors[0]) {
-         /* otherwise unconditional jump to next block which should already have
-          * been inserted.
+         /* otherwise unconditional jump or predt/predf to next block which
+          * should already have been inserted.
           */
          assert(terminator);
-         assert(terminator->opc == OPC_JUMP);
+         assert(terminator->opc == OPC_JUMP || terminator->opc == OPC_PREDT ||
+                terminator->opc == OPC_PREDF);
          terminator->cat0.target = block->successors[0];
       }
    }
 }
 
+static void
+prede_sched(struct ir3 *ir)
+{
+   unsigned index = 0;
+   foreach_block (block, &ir->block_list)
+      block->index = index++;
+
+   foreach_block (block, &ir->block_list) {
+      /* Look for the following pattern generated by NIR lowering. The numbers
+       * at the top of blocks are their index.
+       *        |--- i ----|
+       *        |   ...    |
+       *        | pred[tf] |
+       *        |----------|
+       *      succ0 /   \ succ1
+       * |-- i+1 ---| |-- i+2 ---|
+       * |    ...   | |   ...    |
+       * | pred[ft] | |   ...    |
+       * |----------| |----------|
+       *     succ0 \   / succ0
+       *        |--- j ----|
+       *        |   ...    |
+       *        |----------|
+       */
+      struct ir3_block *succ0 = block->successors[0];
+      struct ir3_block *succ1 = block->successors[1];
+
+      if (!succ1)
+         continue;
+
+      struct ir3_instruction *terminator = ir3_block_get_terminator(block);
+      if (!terminator)
+         continue;
+      if (terminator->opc != OPC_PREDT && terminator->opc != OPC_PREDF)
+         continue;
+
+      assert(!succ0->successors[1] && !succ1->successors[1]);
+      assert(succ0->successors[0] == succ1->successors[0]);
+      assert(succ0->predecessors_count == 1 && succ1->predecessors_count == 1);
+      assert(succ0->index == (block->index + 1));
+      assert(succ1->index == (block->index + 2));
+
+      struct ir3_instruction *succ0_terminator =
+         ir3_block_get_terminator(succ0);
+      assert(succ0_terminator);
+      assert(succ0_terminator->opc ==
+             (terminator->opc == OPC_PREDT ? OPC_PREDF : OPC_PREDT));
+
+      ASSERTED struct ir3_instruction *succ1_terminator =
+         ir3_block_get_terminator(succ1);
+      assert(!succ1_terminator || (succ1_terminator->opc == OPC_JUMP));
+
+      /* Simple case: both successors contain instructions. Keep both blocks and
+       * insert prede before the second successor's terminator:
+       *        |--- i ----|
+       *        |   ...    |
+       *        | pred[tf] |
+       *        |----------|
+       *      succ0 /   \ succ1
+       * |-- i+1 ---| |-- i+2 ---|
+       * |    ...   | |   ...    |
+       * | pred[ft] | | prede    |
+       * |----------| |----------|
+       *     succ0 \   / succ0
+       *        |--- j ----|
+       *        |   ...    |
+       *        |----------|
+       */
+      if (!list_is_empty(&succ1->instr_list)) {
+         ir3_PREDE(succ1);
+         continue;
+      }
+
+      /* Second successor is empty so we can remove it:
+       *        |--- i ----|
+       *        |   ...    |
+       *        | pred[tf] |
+       *        |----------|
+       *      succ0 /   \ succ1
+       * |-- i+1 ---|   |
+       * |    ...   |   |
+       * |   prede  |   |
+       * |----------|   |
+       *     succ0 \    /
+       *        |--- j ----|
+       *        |   ...    |
+       *        |----------|
+       */
+      list_delinit(&succ0_terminator->node);
+      ir3_PREDE(succ0);
+      remove_unused_block(succ1);
+      block->successors[1] = succ0->successors[0];
+      ir3_block_add_predecessor(succ0->successors[0], block);
+   }
+}
+
 /* Here we workaround the fact that kill doesn't actually kill the thread as
  * GL expects. The last instruction always needs to be an end instruction,
  * which means that if we're stuck in a loop where kill is the only way out,
@@ -1352,6 +1456,8 @@ helper_sched(struct ir3_legalize_ctx *ctx, struct ir3 *ir,
              */
             if (is_alu(instr) || is_sfu(instr))
                continue;
+            if (instr->opc == OPC_PREDE)
+               continue;
 
             expensive_instruction_in_block = true;
             break;
@@ -1457,6 +1563,8 @@ ir3_legalize(struct ir3 *ir, struct ir3_shader_variant *so, int *max_bary)
    while (opt_jump(ir))
       ;
 
+   prede_sched(ir);
+
    /* TODO: does (eq) exist before a6xx? */
    if (so->type == MESA_SHADER_FRAGMENT && so->need_pixlod &&
        so->compiler->gen >= 6)
diff --git a/src/freedreno/ir3/ir3_reconvergence.c b/src/freedreno/ir3/ir3_reconvergence.c
index 5870193b4b8..bddd733efca 100644
--- a/src/freedreno/ir3/ir3_reconvergence.c
+++ b/src/freedreno/ir3/ir3_reconvergence.c
@@ -181,6 +181,11 @@ ir3_calc_reconvergence(struct ir3_shader_variant *so)
     * reconvergence point.
     */
    foreach_block (block, &so->ir->block_list) {
+      struct ir3_instruction *terminator = ir3_block_get_terminator(block);
+      if (!terminator)
+         continue;
+      if (terminator->opc == OPC_PREDT || terminator->opc == OPC_PREDF)
+         continue;
       if (block->successors[0] && block->successors[1] &&
           block->divergent_condition) {
          unsigned idx = block->successors[0]->index >