diff --git a/src/freedreno/ir3/ir3.h b/src/freedreno/ir3/ir3.h
index 53ee9bfec7d..dd1263d3a60 100644
--- a/src/freedreno/ir3/ir3.h
+++ b/src/freedreno/ir3/ir3.h
@@ -1288,6 +1288,26 @@ reg_size(const struct ir3_register *reg)
    return reg_elems(reg) * reg_elem_size(reg);
 }
 
+/* Post-RA, we don't have arrays any more, so we have to be a bit careful here
+ * and handle relative accesses specially.
+ */
+
+static inline unsigned
+post_ra_reg_elems(struct ir3_register *reg)
+{
+   if (reg->flags & IR3_REG_RELATIV)
+      return reg->size;
+   return reg_elems(reg);
+}
+
+static inline unsigned
+post_ra_reg_num(struct ir3_register *reg)
+{
+   if (reg->flags & IR3_REG_RELATIV)
+      return reg->array.base;
+   return reg->num;
+}
+
 static inline unsigned
 dest_regs(struct ir3_instruction *instr)
 {
@@ -1871,8 +1891,6 @@ int ir3_delayslots(struct ir3_instruction *assigner,
 unsigned ir3_delayslots_with_repeat(struct ir3_instruction *assigner,
                                     struct ir3_instruction *consumer,
                                     unsigned assigner_n, unsigned consumer_n);
-unsigned ir3_delay_calc(struct ir3_block *block,
-                        struct ir3_instruction *instr, bool mergedregs);
 
 /* estimated (ss)/(sy) delay calculation */
 
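The two helpers added above give the post-RA view of a register's footprint: a relative access r[a0.x + N] can touch any element of its array, so it is conservatively treated as covering the whole array starting at array.base. Below is a minimal standalone sketch of that behavior; the fake_* types and values are hypothetical stand-ins for ir3_register (the real reg_elems() derives the element count from the register's wrmask).

#include <stdio.h>

#define FAKE_REG_RELATIV (1u << 0) /* stand-in for IR3_REG_RELATIV */

struct fake_reg {
   unsigned flags;
   unsigned num;        /* assigned register, for direct accesses */
   unsigned size;       /* array length, for relative accesses */
   unsigned array_base; /* first register of the array */
};

/* Mirrors post_ra_reg_elems(): a relative access may hit any array element. */
static unsigned
fake_post_ra_reg_elems(const struct fake_reg *reg)
{
   if (reg->flags & FAKE_REG_RELATIV)
      return reg->size;
   return 1; /* stand-in for reg_elems() on a scalar register */
}

/* Mirrors post_ra_reg_num(): a relative access starts at the array's base. */
static unsigned
fake_post_ra_reg_num(const struct fake_reg *reg)
{
   if (reg->flags & FAKE_REG_RELATIV)
      return reg->array_base;
   return reg->num;
}

int
main(void)
{
   struct fake_reg direct = { .flags = 0, .num = 7 };
   struct fake_reg rel = { .flags = FAKE_REG_RELATIV, .size = 4, .array_base = 8 };

   /* direct: r7, 1 element; relative: conservatively r8..r11, 4 elements */
   printf("direct:   num=%u elems=%u\n",
          fake_post_ra_reg_num(&direct), fake_post_ra_reg_elems(&direct));
   printf("relative: num=%u elems=%u\n",
          fake_post_ra_reg_num(&rel), fake_post_ra_reg_elems(&rel));
   return 0;
}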
diff --git a/src/freedreno/ir3/ir3_delay.c b/src/freedreno/ir3/ir3_delay.c
index db5b5871c48..735e5ce1738 100644
--- a/src/freedreno/ir3/ir3_delay.c
+++ b/src/freedreno/ir3/ir3_delay.c
@@ -95,38 +95,6 @@ ir3_delayslots(struct ir3_instruction *assigner,
    }
 }
 
-static bool
-count_instruction(struct ir3_instruction *n)
-{
-   /* NOTE: don't count branch/jump since we don't know yet if they will
-    * be eliminated later in resolve_jumps().. really should do that
-    * earlier so we don't have this constraint.
-    */
-   return is_alu(n) ||
-      (is_flow(n) && (n->opc != OPC_JUMP) && (n->opc != OPC_BR) &&
-       (n->opc != OPC_BRAA) && (n->opc != OPC_BRAO));
-}
-
-/* Post-RA, we don't have arrays any more, so we have to be a bit careful here
- * and have to handle relative accesses specially.
- */
-
-static unsigned
-post_ra_reg_elems(struct ir3_register *reg)
-{
-   if (reg->flags & IR3_REG_RELATIV)
-      return reg->size;
-   return reg_elems(reg);
-}
-
-static unsigned
-post_ra_reg_num(struct ir3_register *reg)
-{
-   if (reg->flags & IR3_REG_RELATIV)
-      return reg->array.base;
-   return reg->num;
-}
-
 unsigned
 ir3_delayslots_with_repeat(struct ir3_instruction *assigner,
                            struct ir3_instruction *consumer,
@@ -211,128 +179,3 @@ ir3_delayslots_with_repeat(struct ir3_instruction *assigner,
 
    return offset > delay ? 0 : delay - offset;
 }
-
-static unsigned
-delay_calc_srcn(struct ir3_instruction *assigner,
-                struct ir3_instruction *consumer, unsigned assigner_n,
-                unsigned consumer_n, bool mergedregs)
-{
-   struct ir3_register *src = consumer->srcs[consumer_n];
-   struct ir3_register *dst = assigner->dsts[assigner_n];
-   bool mismatched_half =
-      (src->flags & IR3_REG_HALF) != (dst->flags & IR3_REG_HALF);
-
-   /* In the mergedregs case or when the register is a special register,
-    * half-registers do not alias with full registers.
-    */
-   if ((!mergedregs || is_reg_special(src) || is_reg_special(dst)) &&
-       mismatched_half)
-      return 0;
-
-   unsigned src_start = post_ra_reg_num(src) * reg_elem_size(src);
-   unsigned src_end = src_start + post_ra_reg_elems(src) * reg_elem_size(src);
-   unsigned dst_start = post_ra_reg_num(dst) * reg_elem_size(dst);
-   unsigned dst_end = dst_start + post_ra_reg_elems(dst) * reg_elem_size(dst);
-
-   if (dst_start >= src_end || src_start >= dst_end)
-      return 0;
-
-   return ir3_delayslots_with_repeat(assigner, consumer, assigner_n, consumer_n);
-}
-
-static unsigned
-delay_calc(struct ir3_block *block, struct ir3_instruction *start,
-           struct ir3_instruction *consumer, unsigned distance,
-           regmask_t *in_mask, bool mergedregs)
-{
-   regmask_t mask;
-   memcpy(&mask, in_mask, sizeof(mask));
-
-   unsigned delay = 0;
-   /* Search backwards starting at the instruction before start, unless it's
-    * NULL then search backwards from the block end.
-    */
-   struct list_head *start_list =
-      start ? start->node.prev : block->instr_list.prev;
-   list_for_each_entry_from_rev (struct ir3_instruction, assigner, start_list,
-                                 &block->instr_list, node) {
-      if (count_instruction(assigner))
-         distance += assigner->nop;
-
-      if (distance + delay >= MAX_NOPS)
-         return delay;
-
-      if (is_meta(assigner))
-         continue;
-
-      unsigned new_delay = 0;
-
-      foreach_dst_n (dst, dst_n, assigner) {
-         if (dst->wrmask == 0)
-            continue;
-         if (!regmask_get(&mask, dst))
-            continue;
-         foreach_src_n (src, src_n, consumer) {
-            if (src->flags & (IR3_REG_IMMED | IR3_REG_CONST))
-               continue;
-
-            unsigned src_delay = delay_calc_srcn(
-               assigner, consumer, dst_n, src_n, mergedregs);
-            new_delay = MAX2(new_delay, src_delay);
-         }
-         regmask_clear(&mask, dst);
-      }
-
-      new_delay = new_delay > distance ? new_delay - distance : 0;
-      delay = MAX2(delay, new_delay);
-
-      if (count_instruction(assigner))
-         distance += 1 + assigner->repeat;
-   }
-
-   /* Note: this allows recursion into "block" if it has already been
-    * visited, but *not* recursion into its predecessors. We may have to
-    * visit the original block twice, for the loop case where we have to
-    * consider definititons in an earlier iterations of the same loop:
-    *
-    * while (...) {
-    *    mov.u32u32 ..., r0.x
-    *    ...
-    *    mov.u32u32 r0.x, ...
-    * }
-    *
-    * However any other recursion would be unnecessary.
-    */
-
-   if (block->data != block) {
-      block->data = block;
-
-      for (unsigned i = 0; i < block->predecessors_count; i++) {
-         struct ir3_block *pred = block->predecessors[i];
-         unsigned pred_delay = delay_calc(pred, NULL, consumer, distance,
-                                          &mask, mergedregs);
-         delay = MAX2(delay, pred_delay);
-      }
-
-      block->data = NULL;
-   }
-
-   return delay;
-}
-
-/**
- * Calculate delay for nop insertion. This must exactly match hardware
- * requirements, including recursing into predecessor blocks.
- */
-unsigned
-ir3_delay_calc(struct ir3_block *block, struct ir3_instruction *instr,
-               bool mergedregs)
-{
-   regmask_t mask;
-   regmask_init(&mask, mergedregs);
-   foreach_src (src, instr) {
-      if (!(src->flags & (IR3_REG_IMMED | IR3_REG_CONST)))
-         regmask_set(&mask, src);
-   }
-
-   return delay_calc(block, NULL, instr, 0, &mask, mergedregs);
-}
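For context on what the removed backwards walker computed: an assigner's result is usable only a fixed number of cycles after it issues, and every instruction already sitting between assigner and consumer eats into that budget. A toy sketch of just that arithmetic follows; the 6-cycle figure is illustrative, the real per-pair numbers come from ir3_delayslots().

#include <stdio.h>

/* Mirrors "new_delay > distance ? new_delay - distance : 0" from the removed
 * delay_calc(): the nops still owed once `distance` instructions already sit
 * between assigner and consumer.
 */
static unsigned
nops_needed(unsigned delayslots, unsigned distance)
{
   return delayslots > distance ? delayslots - distance : 0;
}

int
main(void)
{
   /* Illustrative: a consumer needing 6 delay slots with 2 instructions
    * already in between still needs 4 nops; a fully covered dependency
    * needs none.
    */
   printf("%u\n", nops_needed(6, 2)); /* -> 4 */
   printf("%u\n", nops_needed(3, 5)); /* -> 0 */
   return 0;
}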
diff --git a/src/freedreno/ir3/ir3_legalize.c b/src/freedreno/ir3/ir3_legalize.c
index 53e9c51fd49..b8148c236d9 100644
--- a/src/freedreno/ir3/ir3_legalize.c
+++ b/src/freedreno/ir3/ir3_legalize.c
@@ -54,11 +54,37 @@ struct ir3_legalize_ctx {
    bool has_inputs;
 };
 
+struct ir3_nop_state {
+   unsigned full_ready[4 * 48];
+   unsigned half_ready[4 * 48];
+};
+
 struct ir3_legalize_state {
    regmask_t needs_ss;
    regmask_t needs_ss_war; /* write after read */
    regmask_t needs_sy;
    bool needs_ss_for_const;
+
+   /* Each of these arrays contains the cycle when the corresponding register
+    * becomes "ready", i.e. does not require any more nops. There is a special
+    * mechanism to let ALU instructions read compatible (i.e. same halfness)
+    * destinations of another ALU instruction with less delay, so this can
+    * depend on what type the consuming instruction is, which is why there are
+    * multiple arrays. The cycle is counted relative to the start of the block.
+    */
+
+   /* When ALU instructions reading the given full/half register will be ready.
+    */
+   struct ir3_nop_state alu_nop;
+
+   /* When non-ALU (e.g. cat5) instructions reading the given full/half
+    * register will be ready.
+    */
+   struct ir3_nop_state non_alu_nop;
+
+   /* When p0.x-w, a0.x, and a1.x are ready. */
+   unsigned pred_ready[4];
+   unsigned addr_ready[2];
 };
 
 struct ir3_legalize_block_data {
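The new scheme replaces the backwards walk with a forward scoreboard: writers record the cycle at which each destination becomes readable, and a consumer's nop count is simply how far that ready cycle lies past the current cycle. A toy sketch of the idea, using an assumed 3-cycle ALU-to-ALU same-size latency (the figure delay_update() uses further down; everything here is simplified stand-in code, not the Mesa data structures):

#include <stdio.h>

#define MAX2(a, b) ((a) > (b) ? (a) : (b))

int
main(void)
{
   unsigned ready[8] = { 0 }; /* per-register "ready at cycle" scoreboard */
   unsigned cycle = 0;

   /* Writer: an ALU instruction writes r1 at cycle 0; an ALU consumer of the
    * same-size destination may read it 3 cycles later (assumed latency).
    */
   ready[1] = MAX2(ready[1], cycle + 3);
   cycle++;

   /* Consumer at cycle 1: it owes ready - cycle = 2 nops before issuing. */
   unsigned nops = MAX2(ready[1], cycle) - cycle;
   printf("nops before consumer: %u\n", nops); /* -> 2 */
   return 0;
}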
@@ -87,6 +113,177 @@ apply_sy(struct ir3_instruction *instr,
    regmask_init(&state->needs_sy, mergedregs);
 }
 
+static bool
+count_instruction(struct ir3_instruction *n)
+{
+   /* NOTE: don't count branch/jump since we don't know yet if they will
+    * be eliminated later in resolve_jumps().. really should do that
+    * earlier so we don't have this constraint.
+    */
+   return is_alu(n) ||
+      (is_flow(n) && (n->opc != OPC_JUMP) && (n->opc != OPC_BR) &&
+       (n->opc != OPC_BRAA) && (n->opc != OPC_BRAO));
+}
+
+static unsigned *
+get_ready_slot(struct ir3_legalize_state *state,
+               struct ir3_register *reg, unsigned num,
+               bool consumer_alu, bool matching_size)
+{
+   if (reg->flags & IR3_REG_PREDICATE) {
+      assert(num == reg->num);
+      assert(reg_num(reg) == REG_P0);
+      return &state->pred_ready[reg_comp(reg)];
+   }
+   if (reg->num == regid(REG_A0, 0))
+      return &state->addr_ready[0];
+   if (reg->num == regid(REG_A0, 1))
+      return &state->addr_ready[1];
+   struct ir3_nop_state *nop =
+      consumer_alu ? &state->alu_nop : &state->non_alu_nop;
+   assert(!(reg->flags & IR3_REG_SHARED));
+   if (reg->flags & IR3_REG_HALF) {
+      if (matching_size)
+         return &nop->half_ready[num];
+      else
+         return &nop->full_ready[num / 2];
+   } else {
+      if (matching_size)
+         return &nop->full_ready[num];
+      /* If "num" is large enough, then it can't alias a half-reg because only
+       * the first half of the full reg space aliases half regs. Return NULL in
+       * this case.
+       */
+      else if (num * 2 < ARRAY_SIZE(nop->half_ready))
+         return &nop->half_ready[num * 2];
+      else
+         return NULL;
+   }
+}
+
+static unsigned
+delay_calc(struct ir3_legalize_state *state,
+           struct ir3_instruction *instr,
+           unsigned cycle)
+{
+   /* As far as we know, shader outputs don't need any delay.
+    */
+   if (instr->opc == OPC_END || instr->opc == OPC_CHMASK)
+      return 0;
+
+   unsigned delay = 0;
+   foreach_src_n (src, n, instr) {
+      if (src->flags & (IR3_REG_CONST | IR3_REG_IMMED | IR3_REG_SHARED))
+         continue;
+
+      unsigned elems = post_ra_reg_elems(src);
+      unsigned num = post_ra_reg_num(src);
+      unsigned src_cycle = cycle;
+
+      /* gat and swz have scalar sources and each source is read in a
+       * subsequent cycle.
+       */
+      if (instr->opc == OPC_GAT || instr->opc == OPC_SWZ)
+         src_cycle += n;
+
+      /* cat3 instructions consume their last source two cycles later, so they
+       * only need a delay of 1.
+       */
+      if ((is_mad(instr->opc) || is_madsh(instr->opc)) && n == 2)
+         src_cycle += 2;
+
+      for (unsigned elem = 0; elem < elems; elem++, num++) {
+         unsigned ready_cycle =
+            *get_ready_slot(state, src, num, is_alu(instr), true);
+         delay = MAX2(delay, MAX2(ready_cycle, src_cycle) - src_cycle);
+
+         /* Increment cycle for ALU instructions with (rptN) where sources are
+          * read each subsequent cycle.
+          */
+         if (instr->repeat && !(src->flags & IR3_REG_RELATIV))
+            src_cycle++;
+      }
+   }
+
+   return delay;
+}
+
+static void
+delay_update(struct ir3_legalize_state *state,
+             struct ir3_instruction *instr,
+             unsigned cycle,
+             bool mergedregs)
+{
+   foreach_dst_n (dst, n, instr) {
+      unsigned elems = post_ra_reg_elems(dst);
+      unsigned num = post_ra_reg_num(dst);
+      unsigned dst_cycle = cycle;
+
+      /* sct and swz have scalar destinations and each destination is written
+       * in a subsequent cycle.
+       */
+      if (instr->opc == OPC_SCT || instr->opc == OPC_SWZ)
+         dst_cycle += n;
+
+      /* For relative accesses with (rptN), we have no way of knowing which
+       * component is accessed when, so we have to assume the worst and mark
+       * every array member as being written at the end.
+       */
+      if (dst->flags & IR3_REG_RELATIV)
+         dst_cycle += instr->repeat;
+
+      if (dst->flags & IR3_REG_SHARED)
+         continue;
+
+      for (unsigned elem = 0; elem < elems; elem++, num++) {
+         for (unsigned consumer_alu = 0; consumer_alu < 2; consumer_alu++) {
+            for (unsigned matching_size = 0; matching_size < 2; matching_size++) {
+               unsigned *ready_slot =
+                  get_ready_slot(state, dst, num, consumer_alu, matching_size);
+
+               if (!ready_slot)
+                  continue;
+
+               bool reset_ready_slot = false;
+               unsigned delay = 0;
+               if (!is_alu(instr)) {
+                  /* Apparently writes that require (ss) or (sy) are
+                   * synchronized against previous writes, so consumers don't
+                   * have to wait for any previous overlapping ALU instructions
+                   * to complete.
+                   */
+                  reset_ready_slot = true;
+               } else if ((dst->flags & IR3_REG_PREDICATE) ||
+                          reg_num(dst) == REG_A0) {
+                  delay = 6;
+                  if (!matching_size)
+                     continue;
+               } else {
+                  delay = (consumer_alu && matching_size) ? 3 : 6;
+               }
+
+               if (!matching_size) {
+                  for (unsigned i = 0; i < reg_elem_size(dst); i++) {
+                     ready_slot[i] =
+                        reset_ready_slot ? 0 :
+                        MAX2(ready_slot[i], dst_cycle + delay);
+                  }
+               } else {
+                  *ready_slot =
+                     reset_ready_slot ? 0 :
+                     MAX2(*ready_slot, dst_cycle + delay);
+               }
+            }
+         }
+
+         /* Increment cycle for ALU instructions with (rptN) where destinations
+          * are written each subsequent cycle.
+          */
+         if (instr->repeat && !(dst->flags & IR3_REG_RELATIV))
+            dst_cycle++;
+      }
+   }
+}
+
 /* We want to evaluate each block from the position of any other
  * predecessor block, in order that the flags set are the union of
  * all possible program paths.
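One subtlety in get_ready_slot() above is the mismatched-size case: with merged registers, full register rN occupies the same storage as half registers h(2N) and h(2N+1), so a write to one size must also bump the ready slots of the other. A small sketch of just that index mapping, assuming the merged-register layout (the real function additionally special-cases predicate and address registers and returns NULL for high full registers with no half alias):

#include <stdio.h>

int
main(void)
{
   /* A full register aliases two consecutive half registers, which is why
    * delay_update() writes reg_elem_size() consecutive half slots for a
    * full-register destination.
    */
   unsigned full = 3; /* r3 */
   printf("r%u aliases h%u and h%u\n", full, full * 2, full * 2 + 1);

   /* A half register lives in half of one full register, so a half
    * destination maps to full slot num / 2.
    */
   unsigned half = 7; /* h7 */
   printf("h%u lives in r%u\n", half, half / 2);
   return 0;
}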
@@ -140,6 +337,21 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block)
                     &pstate->needs_ss_war);
          regmask_or(&state->needs_sy, &state->needs_sy, &pstate->needs_sy);
          state->needs_ss_for_const |= pstate->needs_ss_for_const;
+
+         /* Our nop state is the max of the predecessor blocks */
+         for (unsigned i = 0; i < ARRAY_SIZE(state->pred_ready); i++)
+            state->pred_ready[i] = MAX2(state->pred_ready[i],
+                                        pstate->pred_ready[i]);
+         for (unsigned i = 0; i < ARRAY_SIZE(state->alu_nop.full_ready); i++) {
+            state->alu_nop.full_ready[i] = MAX2(state->alu_nop.full_ready[i],
+                                                pstate->alu_nop.full_ready[i]);
+            state->alu_nop.half_ready[i] = MAX2(state->alu_nop.half_ready[i],
+                                                pstate->alu_nop.half_ready[i]);
+            state->non_alu_nop.full_ready[i] = MAX2(state->non_alu_nop.full_ready[i],
+                                                    pstate->non_alu_nop.full_ready[i]);
+            state->non_alu_nop.half_ready[i] = MAX2(state->non_alu_nop.half_ready[i],
+                                                    pstate->non_alu_nop.half_ready[i]);
+         }
       }
 
       /* We need to take phsyical-only edges into account when tracking shared
@@ -178,6 +390,8 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block)
    list_replace(&block->instr_list, &instr_list);
    list_inithead(&block->instr_list);
 
+   unsigned cycle = 0;
+
    foreach_instr_safe (n, &instr_list) {
       unsigned i;
 
@@ -257,11 +471,40 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block)
          nop = ir3_NOP(block);
          nop->flags |= IR3_INSTR_SS;
         n->flags &= ~IR3_INSTR_SS;
+         last_n = nop;
+         cycle++;
       }
 
-      /* need to be able to set (ss) on first instruction: */
-      if (list_is_empty(&block->instr_list) && (opc_cat(n->opc) >= 5) && !is_meta(n))
-         ir3_NOP(block);
+      unsigned delay = delay_calc(state, n, cycle);
+
+      /* NOTE: I think the nopN encoding works for a5xx and
+       * probably a4xx, but not a3xx. So far only tested on
+       * a6xx.
+       */
+
+      if ((delay > 0) && (ctx->compiler->gen >= 6) && last_n &&
+          ((opc_cat(last_n->opc) == 2) || (opc_cat(last_n->opc) == 3)) &&
+          (last_n->repeat == 0)) {
+         /* the previous cat2/cat3 instruction can encode at most 3 nop's: */
+         unsigned transfer = MIN2(delay, 3 - last_n->nop);
+         last_n->nop += transfer;
+         delay -= transfer;
+         cycle += transfer;
+      }
+
+      if ((delay > 0) && last_n && (last_n->opc == OPC_NOP)) {
+         /* the previous nop can encode at most 5 repeats: */
+         unsigned transfer = MIN2(delay, 5 - last_n->repeat);
+         last_n->repeat += transfer;
+         delay -= transfer;
+         cycle += transfer;
+      }
+
+      if (delay > 0) {
+         assert(delay <= 6);
+         ir3_NOP(block)->repeat = delay - 1;
+         cycle += delay;
+      }
 
       if (ctx->compiler->samgq_workaround &&
          ctx->type != MESA_SHADER_FRAGMENT &&
@@ -328,6 +571,14 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block)
          }
       }
 
+      if (count_instruction(n))
+         cycle += 1;
+
+      delay_update(state, n, cycle, mergedregs);
+
+      if (count_instruction(n))
+         cycle += n->repeat;
+
       if (ctx->early_input_release && is_input(n)) {
          last_input_needs_ss |= (n->opc == OPC_LDLV);
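The emission path above tries not to spend whole instructions on delay: on a6xx a cat2/cat3 instruction can absorb up to 3 trailing nops in its (nopN) field, and an explicit nop can carry up to 5 extra repeats. A standalone sketch of that folding, for an assumed 5-cycle delay following a fresh cat2 instruction:

#include <stdio.h>

#define MIN2(a, b) ((a) < (b) ? (a) : (b))

int
main(void)
{
   unsigned delay = 5;    /* nops owed before the next instruction */
   unsigned prev_nop = 0; /* (nopN) already carried by a prior cat2/cat3 op */

   /* Fold up to 3 nops into the previous instruction's (nopN) field: */
   unsigned transfer = MIN2(delay, 3 - prev_nop);
   prev_nop += transfer;
   delay -= transfer;

   printf("previous instruction becomes (nop%u)\n", prev_nop); /* (nop3) */

   /* The remainder becomes one explicit nop with a repeat count: */
   if (delay > 0)
      printf("emit (rpt%u) nop\n", delay - 1); /* (rpt1) nop covers 2 cycles */
   return 0;
}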
+ */ + for (unsigned i = 0; i < ARRAY_SIZE(state->pred_ready); i++) + state->pred_ready[i] = MAX2(state->pred_ready[i], cycle) - cycle; + for (unsigned i = 0; i < ARRAY_SIZE(state->alu_nop.full_ready); i++) { + state->alu_nop.full_ready[i] = + MAX2(state->alu_nop.full_ready[i], cycle) - cycle; + state->alu_nop.half_ready[i] = + MAX2(state->alu_nop.half_ready[i], cycle) - cycle; + state->non_alu_nop.full_ready[i] = + MAX2(state->non_alu_nop.full_ready[i], cycle) - cycle; + state->non_alu_nop.half_ready[i] = + MAX2(state->non_alu_nop.half_ready[i], cycle) - cycle; + } + bd->valid = true; if (memcmp(&prev_state, state, sizeof(*state))) { @@ -407,8 +676,7 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block) * dsxpp.1.p dst, src * * We apply this after flags syncing, as we don't want to sync in between the - * two (which might happen if dst == src). We do it before nop scheduling - * because that needs to count actual instructions. + * two (which might happen if dst == src). */ static bool apply_fine_deriv_macro(struct ir3_legalize_ctx *ctx, struct ir3_block *block) @@ -865,55 +1133,6 @@ kill_sched(struct ir3 *ir, struct ir3_shader_variant *so) } } -/* Insert nop's required to make this a legal/valid shader program: */ -static void -nop_sched(struct ir3 *ir, struct ir3_shader_variant *so) -{ - foreach_block (block, &ir->block_list) { - struct ir3_instruction *last = NULL; - struct list_head instr_list; - - /* remove all the instructions from the list, we'll be adding - * them back in as we go - */ - list_replace(&block->instr_list, &instr_list); - list_inithead(&block->instr_list); - - foreach_instr_safe (instr, &instr_list) { - unsigned delay = ir3_delay_calc(block, instr, so->mergedregs); - - /* NOTE: I think the nopN encoding works for a5xx and - * probably a4xx, but not a3xx. So far only tested on - * a6xx. - */ - - if ((delay > 0) && (ir->compiler->gen >= 6) && last && - ((opc_cat(last->opc) == 2) || (opc_cat(last->opc) == 3)) && - (last->repeat == 0)) { - /* the previous cat2/cat3 instruction can encode at most 3 nop's: */ - unsigned transfer = MIN2(delay, 3 - last->nop); - last->nop += transfer; - delay -= transfer; - } - - if ((delay > 0) && last && (last->opc == OPC_NOP)) { - /* the previous nop can encode at most 5 repeats: */ - unsigned transfer = MIN2(delay, 5 - last->repeat); - last->repeat += transfer; - delay -= transfer; - } - - if (delay > 0) { - assert(delay <= 6); - ir3_NOP(block)->repeat = delay - 1; - } - - list_addtail(&instr->node, &block->instr_list); - last = instr; - } - } -} - static void dbg_sync_sched(struct ir3 *ir, struct ir3_shader_variant *so) { @@ -1227,8 +1446,6 @@ ir3_legalize(struct ir3 *ir, struct ir3_shader_variant *so, int *max_bary) progress |= apply_fine_deriv_macro(ctx, block); } - nop_sched(ir, so); - if (ir3_shader_debug & IR3_DBG_FULLSYNC) { dbg_sync_sched(ir, so); } diff --git a/src/freedreno/ir3/tests/delay.c b/src/freedreno/ir3/tests/delay.c index 66e14c092a8..eab6a229231 100644 --- a/src/freedreno/ir3/tests/delay.c +++ b/src/freedreno/ir3/tests/delay.c @@ -145,6 +145,30 @@ fixup_wrmask(struct ir3 *ir) } } +/* Calculate the number of nops added before the last instruction by + * ir3_legalize. 
+ */ +static unsigned +calc_nops(struct ir3_block *block, struct ir3_instruction *last) +{ + unsigned nops = 0; + + foreach_instr_rev (instr, &block->instr_list) { + if (instr == last) + continue; + + if (instr->opc == OPC_NOP) { + nops += 1 + instr->repeat; + } else { + if (is_alu(instr)) + nops += instr->nop; + break; + } + } + + return nops; +} + int main(int argc, char **argv) { @@ -177,13 +201,10 @@ main(int argc, char **argv) break; } - /* The delay calc is expecting the instr to not yet be added to the - * block, so remove it from the block so that it doesn't get counted - * in the distance from assigner: - */ - list_delinit(&last->node); + int max_bary; + ir3_legalize(ir, shader->variants, &max_bary); - unsigned n = ir3_delay_calc(block, last, true); + unsigned n = calc_nops(block, last); if (n != test->expected_delay) { printf("%d: FAIL: Expected delay %u, but got %u, for:\n%s\n", i,