ir3/legalize: add cycle to ir3_legalize_state

Having the cycle as part of the state will become convenient for two reasons:

- It will allow us to merge the state of predecessors without having to normalize states at the end of blocks (i.e., we currently have to subtract the block's final cycle value from its ready slots at the end of the block; having its final cycle value available in its state will allow us to do this when merging predecessor states at the start of the block).
- We can update the cycle value as part of the delay/sync state update routines. This way, the user doesn't have to worry about which instructions should actually update the cycle, as this logic is nicely encapsulated.

This is part of the preparation for making the delay/sync legalization logic available outside of ir3_legalize.

Signed-off-by: Job Noorman <job@noorman.info>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/34108>
2026-01-31 13:30:42 +01:00 · 2025-05-09 11:26:05 +02:00 · 2025-05-09 11:26:05 +02:00 · 03ee7c7c0f
commit 03ee7c7c0f
parent 12fadd27d3
1 changed files with 20 additions and 19 deletions
--- a/src/freedreno/ir3/ir3_legalize.c
+++ b/src/freedreno/ir3/ir3_legalize.c
@@ -75,6 +75,8 @@ struct ir3_legalize_state {
   /* When p0.x-w, a0.x, and a1.x are ready. */
   unsigned pred_ready[4];
   unsigned addr_ready[2];
+
+   unsigned cycle;
 };

 struct ir3_legalize_block_data {
@@ -188,8 +190,7 @@ get_ready_slot(struct ir3_legalize_state *state,
 static unsigned
 delay_calc(struct ir3_legalize_ctx *ctx,
           struct ir3_legalize_state *state,
-           struct ir3_instruction *instr,
-           unsigned cycle)
+           struct ir3_instruction *instr)
 {
   /* As far as we know, shader outputs don't need any delay. */
   if (instr->opc == OPC_END || instr->opc == OPC_CHMASK)
@@ -202,7 +203,8 @@ delay_calc(struct ir3_legalize_ctx *ctx,

      unsigned elems = post_ra_reg_elems(src);
      unsigned num = post_ra_reg_num(src);
-      unsigned src_cycle = cycle + ir3_src_read_delay(ctx->compiler, instr, n);
+      unsigned src_cycle =
+         state->cycle + ir3_src_read_delay(ctx->compiler, instr, n);

      for (unsigned elem = 0; elem < elems; elem++, num++) {
         unsigned ready_cycle =
@@ -224,7 +226,6 @@ static void
 delay_update(struct ir3_legalize_ctx *ctx,
             struct ir3_legalize_state *state,
             struct ir3_instruction *instr,
-             unsigned cycle,
             bool mergedregs)
 {
   if (writes_addr1(instr) && instr->block->in_early_preamble)
@@ -236,7 +237,7 @@ delay_update(struct ir3_legalize_ctx *ctx,

      unsigned elems = post_ra_reg_elems(dst);
      unsigned num = post_ra_reg_num(dst);
-      unsigned dst_cycle = cycle;
+      unsigned dst_cycle = state->cycle;

      /* sct and swz have scalar destinations and each destination is written in
       * a subsequent cycle.
@@ -432,7 +433,7 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block)
   list_replace(&block->instr_list, &instr_list);
   list_inithead(&block->instr_list);

-   unsigned cycle = 0;
+   state->cycle = 0;

   foreach_instr_safe (n, &instr_list) {
      unsigned i;
@@ -566,10 +567,10 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block)
         nop->flags |= IR3_INSTR_SS;
         n->flags &= ~IR3_INSTR_SS;
         last_n = nop;
-         cycle++;
+         state->cycle++;
      }

-      unsigned delay = delay_calc(ctx, state, n, cycle);
+      unsigned delay = delay_calc(ctx, state, n);

      /* NOTE: I think the nopN encoding works for a5xx and
       * probably a4xx, but not a3xx.  So far only tested on
@@ -584,7 +585,7 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block)
         unsigned transfer = MIN2(delay, 3 - last_n->nop);
         last_n->nop += transfer;
         delay -= transfer;
-         cycle += transfer;
+         state->cycle += transfer;
      }

      if ((delay > 0) && last_n && (last_n->opc == OPC_NOP)) {
@@ -592,13 +593,13 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block)
         unsigned transfer = MIN2(delay, 5 - last_n->repeat);
         last_n->repeat += transfer;
         delay -= transfer;
-         cycle += transfer;
+         state->cycle += transfer;
      }

      if (delay > 0) {
         assert(delay <= 6);
         ir3_NOP(&build)->repeat = delay - 1;
-         cycle += delay;
+         state->cycle += delay;
      }

      if (ctx->compiler->samgq_workaround &&
@@ -721,12 +722,12 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block)

      bool count = count_instruction(n, ctx->compiler);
      if (count)
-         cycle += 1;
+         state->cycle += 1;

-      delay_update(ctx, state, n, cycle, mergedregs);
+      delay_update(ctx, state, n, mergedregs);

      if (count)
-         cycle += n->repeat + n->nop;
+         state->cycle += n->repeat + n->nop;

      if (ctx->early_input_release && is_input(n)) {
         last_input_needs_ss |= (n->opc == OPC_LDLV);
@@ -791,16 +792,16 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block)
    * cycle offset.
    */
   for (unsigned i = 0; i < ARRAY_SIZE(state->pred_ready); i++)
-      state->pred_ready[i] = MAX2(state->pred_ready[i], cycle) - cycle;
+      state->pred_ready[i] = MAX2(state->pred_ready[i], state->cycle) - state->cycle;
   for (unsigned i = 0; i < ARRAY_SIZE(state->alu_nop.full_ready); i++) {
      state->alu_nop.full_ready[i] =
-         MAX2(state->alu_nop.full_ready[i], cycle) - cycle;
+         MAX2(state->alu_nop.full_ready[i], state->cycle) - state->cycle;
      state->alu_nop.half_ready[i] =
-         MAX2(state->alu_nop.half_ready[i], cycle) - cycle;
+         MAX2(state->alu_nop.half_ready[i], state->cycle) - state->cycle;
      state->non_alu_nop.full_ready[i] =
-         MAX2(state->non_alu_nop.full_ready[i], cycle) - cycle;
+         MAX2(state->non_alu_nop.full_ready[i], state->cycle) - state->cycle;
      state->non_alu_nop.half_ready[i] =
-         MAX2(state->non_alu_nop.half_ready[i], cycle) - cycle;
+         MAX2(state->non_alu_nop.half_ready[i], state->cycle) - state->cycle;
   }

   bd->valid = true;