ir3/legalize: extract ir3_merge_pred_legalize_states helper

We will want to use this functionality in ir3_postsched.

Signed-off-by: Job Noorman <job@noorman.info>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/34108>
2026-05-04 03:18:08 +02:00 · 2025-05-09 11:26:05 +02:00 · 2025-05-09 11:26:05 +02:00 · 0c05839fcd
commit 0c05839fcd
parent d4503b01b7
1 changed files with 102 additions and 70 deletions
--- a/src/freedreno/ir3/ir3_legalize.c
+++ b/src/freedreno/ir3/ir3_legalize.c
@ -346,6 +346,107 @@ ir3_init_legalize_state(struct ir3_legalize_state *state,
   regmask_init(&state->needs_sy, compiler->mergedregs);
 }

+static struct ir3_legalize_state *
+get_block_legalize_state(struct ir3_block *block)
+{  /* Block -> state lookup that legalize_block() passes to
+    * ir3_merge_pred_legalize_states() as its get_state callback: treats
+    * block->data as a struct ir3_legalize_block_data and returns the address
+    * of the state embedded in it.
+    */
+   struct ir3_legalize_block_data *bd = block->data;
+   return &bd->state;
+}
+
+typedef struct ir3_legalize_state *(*ir3_get_block_legalize_state_cb)(
+   struct ir3_block *); /* Callback mapping a block to its legalize state.
+                         * ir3_merge_pred_legalize_states() skips any
+                         * predecessor for which this returns NULL.
+                         */
+
+static void
+ir3_merge_pred_legalize_states(struct ir3_legalize_state *state,
+                               struct ir3_block *block,
+                               ir3_get_block_legalize_state_cb get_state)
+{
+   /* Merge the legalize state of every predecessor of `block` into `state`.
+    *
+    *   state:     accumulated in place; it is never reset here (see below).
+    *   block:     the block whose predecessors and physical predecessors are
+    *              visited.
+    *   get_state: maps a predecessor block to its legalize state; a
+    *              predecessor for which it returns NULL is skipped.
+    *
+    * Our input state is the OR of all predecessor blocks' state.
+    *
+    * Why don't we just zero the state at the beginning before merging in the
+    * predecessors? Because otherwise updates may not be a "lattice refinement",
+    * i.e. needs_ss may go from true to false for some register due to a (ss) we
+    * inserted the second time around (and the same for (sy)). This means that
+    * there's no solid guarantee the algorithm will converge, and in theory
+    * there may be infinite loops where we fight over the placement of an (ss).
+    */
+   for (unsigned i = 0; i < block->predecessors_count; i++) {
+      struct ir3_block *predecessor = block->predecessors[i];
+      struct ir3_legalize_state *pstate = get_state(predecessor);
+
+      if (!pstate) {
+         continue;
+      }
+
+      /* Our input (ss)/(sy) state is based on OR'ing the output
+       * state of all our predecessor blocks
+       */
+      regmask_or(&state->needs_ss, &state->needs_ss, &pstate->needs_ss);
+      regmask_or(&state->needs_ss_war, &state->needs_ss_war,
+                 &pstate->needs_ss_war);
+      regmask_or(&state->needs_sy_war, &state->needs_sy_war,
+                 &pstate->needs_sy_war);
+      regmask_or(&state->needs_ss_or_sy_war, &state->needs_ss_or_sy_war,
+                 &pstate->needs_ss_or_sy_war);
+      regmask_or(&state->needs_sy, &state->needs_sy, &pstate->needs_sy);
+      state->needs_ss_for_const |= pstate->needs_ss_for_const;
+      state->needs_sy_for_const |= pstate->needs_sy_for_const;
+
+      /* Our nop state is the max of the predecessor blocks. The predecessor nop
+       * state contains the cycle offset from the start of its block when each
+       * register becomes ready. But successor blocks need the cycle offset from
+       * their start, which is the predecessor block's end. Translate the
+       * cycle offset.
+       *
+       * MAX2(ready, pstate->cycle) - pstate->cycle clamps the translated
+       * offset at zero for entries that are ready before the predecessor's
+       * end, so the subtraction never goes below zero.
+       *
+       * Note: the inner loops reuse the name `i`, shadowing the predecessor
+       * index; that index is not needed again in this iteration.
+       */
+      for (unsigned i = 0; i < ARRAY_SIZE(state->pred_ready); i++)
+         state->pred_ready[i] =
+            MAX2(state->pred_ready[i],
+                 MAX2(pstate->pred_ready[i], pstate->cycle) - pstate->cycle);
+      for (unsigned i = 0; i < ARRAY_SIZE(state->alu_nop.full_ready); i++) {
+         state->alu_nop.full_ready[i] = MAX2(
+            state->alu_nop.full_ready[i],
+            MAX2(pstate->alu_nop.full_ready[i], pstate->cycle) - pstate->cycle);
+         state->alu_nop.half_ready[i] = MAX2(
+            state->alu_nop.half_ready[i],
+            MAX2(pstate->alu_nop.half_ready[i], pstate->cycle) - pstate->cycle);
+         state->non_alu_nop.full_ready[i] =
+            MAX2(state->non_alu_nop.full_ready[i],
+                 MAX2(pstate->non_alu_nop.full_ready[i], pstate->cycle) -
+                    pstate->cycle);
+         state->non_alu_nop.half_ready[i] =
+            MAX2(state->non_alu_nop.half_ready[i],
+                 MAX2(pstate->non_alu_nop.half_ready[i], pstate->cycle) -
+                    pstate->cycle);
+      }
+   }
+
+   /* We need to take physical-only edges into account when tracking shared
+    * registers.
+    */
+   for (unsigned i = 0; i < block->physical_predecessors_count; i++) {
+      struct ir3_block *predecessor = block->physical_predecessors[i];
+      struct ir3_legalize_state *pstate = get_state(predecessor);
+
+      if (!pstate) {
+         continue;
+      }
+
+      regmask_or_shared(&state->needs_ss, &state->needs_ss, &pstate->needs_ss);
+      regmask_or_shared(&state->needs_ss_scalar_full,
+                        &state->needs_ss_scalar_full,
+                        &pstate->needs_ss_scalar_full);
+      regmask_or_shared(&state->needs_ss_scalar_half,
+                        &state->needs_ss_scalar_half,
+                        &pstate->needs_ss_scalar_half);
+      regmask_or_shared(&state->needs_ss_scalar_war,
+                        &state->needs_ss_scalar_war,
+                        &pstate->needs_ss_scalar_war);
+      regmask_or_shared(&state->needs_ss_or_sy_scalar_war,
+                        &state->needs_ss_or_sy_scalar_war,
+                        &pstate->needs_ss_or_sy_scalar_war);
+   }
+}
+
 static bool
 count_instruction(struct ir3_instruction *n, struct ir3_compiler *compiler)
 {
@ -549,76 +650,7 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block)
   bool mergedregs = ctx->so->mergedregs;
   struct ir3_builder build = ir3_builder_at(ir3_after_block(block));

-   /* Our input state is the OR of all predecessor blocks' state.
-    *
-    * Why don't we just zero the state at the beginning before merging in the
-    * predecessors? Because otherwise updates may not be a "lattice refinement",
-    * i.e. needs_ss may go from true to false for some register due to a (ss) we
-    * inserted the second time around (and the same for (sy)). This means that
-    * there's no solid guarantee the algorithm will converge, and in theory
-    * there may be infinite loops where we fight over the placment of an (ss).
-    */
-   for (unsigned i = 0; i < block->predecessors_count; i++) {
-      struct ir3_block *predecessor = block->predecessors[i];
-      struct ir3_legalize_block_data *pbd = predecessor->data;
-      struct ir3_legalize_state *pstate = &pbd->state;
-
-      /* Our input (ss)/(sy) state is based on OR'ing the output
-       * state of all our predecessor blocks
-       */
-      regmask_or(&state->needs_ss, &state->needs_ss, &pstate->needs_ss);
-      regmask_or(&state->needs_ss_war, &state->needs_ss_war,
-                 &pstate->needs_ss_war);
-      regmask_or(&state->needs_sy_war, &state->needs_sy_war,
-                 &pstate->needs_sy_war);
-      regmask_or(&state->needs_ss_or_sy_war, &state->needs_ss_or_sy_war,
-                 &pstate->needs_ss_or_sy_war);
-      regmask_or(&state->needs_sy, &state->needs_sy, &pstate->needs_sy);
-      state->needs_ss_for_const |= pstate->needs_ss_for_const;
-      state->needs_sy_for_const |= pstate->needs_sy_for_const;
-
-      /* Our nop state is the max of the predecessor blocks. The predecessor nop
-       * state contains the cycle offset from the start of its block when each
-       * register becomes ready. But successor blocks need the cycle offset from
-       * their start, which is the predecessor's block's end. Translate the
-       * cycle offset.
-       */
-      for (unsigned i = 0; i < ARRAY_SIZE(state->pred_ready); i++)
-         state->pred_ready[i] = MAX2(state->pred_ready[i],
-                                     MAX2(pstate->pred_ready[i], pstate->cycle) - pstate->cycle);
-      for (unsigned i = 0; i < ARRAY_SIZE(state->alu_nop.full_ready); i++) {
-         state->alu_nop.full_ready[i] = MAX2(state->alu_nop.full_ready[i],
-                                             MAX2(pstate->alu_nop.full_ready[i], pstate->cycle) - pstate->cycle);
-         state->alu_nop.half_ready[i] = MAX2(state->alu_nop.half_ready[i],
-                                             MAX2(pstate->alu_nop.half_ready[i], pstate->cycle) - pstate->cycle);
-         state->non_alu_nop.full_ready[i] = MAX2(state->non_alu_nop.full_ready[i],
-                                                 MAX2(pstate->non_alu_nop.full_ready[i], pstate->cycle) - pstate->cycle);
-         state->non_alu_nop.half_ready[i] = MAX2(state->non_alu_nop.half_ready[i],
-                                                 MAX2(pstate->non_alu_nop.half_ready[i], pstate->cycle) - pstate->cycle);
-      }
-   }
-
-   /* We need to take phsyical-only edges into account when tracking shared
-    * registers.
-    */
-   for (unsigned i = 0; i < block->physical_predecessors_count; i++) {
-      struct ir3_block *predecessor = block->physical_predecessors[i];
-      struct ir3_legalize_block_data *pbd = predecessor->data;
-      struct ir3_legalize_state *pstate = &pbd->state;
-
-      regmask_or_shared(&state->needs_ss, &state->needs_ss, &pstate->needs_ss);
-      regmask_or_shared(&state->needs_ss_scalar_full,
-                        &state->needs_ss_scalar_full,
-                        &pstate->needs_ss_scalar_full);
-      regmask_or_shared(&state->needs_ss_scalar_half,
-                        &state->needs_ss_scalar_half,
-                        &pstate->needs_ss_scalar_half);
-      regmask_or_shared(&state->needs_ss_scalar_war, &state->needs_ss_scalar_war,
-                        &pstate->needs_ss_scalar_war);
-      regmask_or_shared(&state->needs_ss_or_sy_scalar_war,
-                        &state->needs_ss_or_sy_scalar_war,
-                        &pstate->needs_ss_or_sy_scalar_war);
-   }
+   ir3_merge_pred_legalize_states(state, block, get_block_legalize_state);

   memcpy(&bd->state, state, sizeof(*state));
   state = &bd->state;