From 039b76d07ca8c4972b344fa491c11872bb2b2bc7 Mon Sep 17 00:00:00 2001
From: Alyssa Rosenzweig <alyssa.rosenzweig@intel.com>
Date: Thu, 7 May 2026 10:31:05 -0400
Subject: [PATCH] jay/lower_scoreboard: factor regdist logic out

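No functional change intended. The per-instruction loop moves from
lower_regdist_local into jay_lower_scoreboard; its body becomes
lower_regdist (parameter renamed from state to ctx) and is reindented.
last_sync moves into struct swsb_state so the caller can track it.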

Signed-off-by: Alyssa Rosenzweig <alyssa.rosenzweig@intel.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/41510>
---
 src/intel/compiler/jay/jay_lower_scoreboard.c | 251 +++++++++---------
 1 file changed, 124 insertions(+), 127 deletions(-)

diff --git a/src/intel/compiler/jay/jay_lower_scoreboard.c b/src/intel/compiler/jay/jay_lower_scoreboard.c
index 439da3ec567..3dfd0531e48 100644
--- a/src/intel/compiler/jay/jay_lower_scoreboard.c
+++ b/src/intel/compiler/jay/jay_lower_scoreboard.c
@@ -159,6 +159,8 @@ struct swsb_state {
     */
    unsigned finished_ip[TGL_NUM_PIPES][TGL_NUM_PIPES];
    u32_per_pipe *access;
+
+   jay_inst *last_sync;
 };
 
 static enum tgl_pipe
@@ -250,144 +252,132 @@ depend_on_writer(struct swsb_state *state,
    for (unsigned pipe = 1; pipe < TGL_NUM_PIPES; ++pipe)
 
 static void
-lower_regdist_local(jay_function *func,
-                    jay_block *block,
-                    struct swsb_state *state)
+lower_regdist(jay_function *func, jay_inst *I, struct swsb_state *ctx)
 {
-   jay_inst *last_sync = NULL;
+   enum tgl_pipe exec_pipe = inst_exec_pipe(func->shader->devinfo, I);
+   unsigned dep[TGL_NUM_PIPES] = { 0 };
 
-   jay_foreach_inst_in_block_safe(block, I) {
-      enum tgl_pipe exec_pipe = inst_exec_pipe(func->shader->devinfo, I);
-      unsigned dep[TGL_NUM_PIPES] = { 0 };
-      if (I->op == JAY_OPCODE_SYNC) {
-         last_sync = I;
-         continue;
-      }
+   jay_foreach_dst(I, def) {
+      struct gpr_range r = def_to_gpr(func, I, def);
+      depend_on_writer(ctx, r, dep, exec_pipe, true /* except_pipe */);
 
-      jay_foreach_dst(I, def) {
-         struct gpr_range r = def_to_gpr(func, I, def);
-         depend_on_writer(state, r, dep, exec_pipe, true /* except_pipe */);
-
-         for (unsigned i = 0; i < r.width; ++i) {
-            jay_foreach_pipe(p) {
-               if (p != exec_pipe) {
-                  dep[p] = MAX2(dep[p], state->access[r.base + i][p]);
-               }
+      for (unsigned i = 0; i < r.width; ++i) {
+         jay_foreach_pipe(p) {
+            if (p != exec_pipe) {
+               dep[p] = MAX2(dep[p], ctx->access[r.base + i][p]);
             }
          }
       }
+   }
+
+   /* Read-after-write */
+   jay_foreach_src(I, s) {
+      depend_on_writer(ctx, def_to_gpr(func, I, I->src[s]), dep, exec_pipe,
+                       false);
+   }
+
+   /* If dependency P implies dependency Q, drop dependency Q to avoid
+    * unnecessary annotations.
+    */
+   jay_foreach_pipe(p) {
+      if (dep[p]) {
+         jay_foreach_pipe(q) {
+            if (p != q && dep[q] && ctx->finished_ip[p][q] >= dep[q]) {
+               dep[q] = 0;
+            }
+         }
+      }
+   }
+
+   uint32_t wait_pipes = 0;
+   unsigned min_delta = 7;
+
+   jay_foreach_pipe(p) {
+      if (dep[p] && (exec_pipe == TGL_PIPE_NONE /* TODO: Sends */ ||
+                     dep[p] > ctx->finished_ip[exec_pipe][p])) {
+
+         min_delta = MIN2(min_delta, ctx->ip[p] - dep[p] + 1);
+         wait_pipes |= BITFIELD_BIT(p);
+      }
+   }
+
+   /* We'll wait on the unioned dependency. Update the tracking for that. */
+   u_foreach_bit(p, wait_pipes) {
+      ctx->finished_ip[exec_pipe][p] = ctx->ip[p] + 1 - min_delta;
+   }
+
+   uint32_t last_pipe = util_logbase2(wait_pipes);
+   bool single_wait = wait_pipes == BITFIELD_BIT(last_pipe);
+
+   /* If we're SIMD split the same way as our dependency, we can relax the
+    * dependency to have each half wait in parallel. We could do even better
+    * with more tracking but this should be good enough for now.
+    */
+   unsigned simd_split = jay_simd_split(func->shader, I);
+   unsigned shape = ((simd_split << 2) | jay_macro_length(I)) + 1;
+   bool same_shape = ctx->last_shape[last_pipe] == shape;
+
+   if (simd_split && same_shape && single_wait && min_delta == 1) {
+      min_delta += ((1 << simd_split) - 1) * jay_macro_length(I);
+      I->replicate_dep = true;
+      I->decrement_dep = last_pipe != exec_pipe;
+   }
+
+   bool has_sbid = I->op == JAY_OPCODE_SEND && !jay_send_eot(I);
+   I->dep = (struct tgl_swsb) {
+      .sbid = has_sbid ? jay_send_sbid(I) : 0,
+      .mode = has_sbid ? TGL_SBID_SET : TGL_SBID_NULL,
+      .regdist = wait_pipes ? min_delta : 0,
+      .pipe = single_wait && (!has_sbid ||
+                              last_pipe == TGL_PIPE_FLOAT ||
+                              last_pipe == TGL_PIPE_INT) ?
+                 last_pipe :
+                 TGL_PIPE_ALL,
+   };
+
+   /* Fold the immediate preceding SYNC.nop into this instruction, allowing
+    * us to wait on both ALU and a SEND in the same annotation. We cannot do
+    * this safely in the presence of predication or SIMD splitting that could
+    * cause any part of the instruction to get shot down, skipping the sync
+    * for future instructions (at least not without more tricky logic).
+    */
+   if (ctx->last_sync &&
+       jay_sync_op(ctx->last_sync) == TGL_SYNC_NOP &&
+       I->dep.mode == TGL_SBID_NULL &&
+       !I->predication &&
+       !jay_simd_split(func->shader, I) &&
+       (I->dep.regdist == 0 ||
+        inferred_sync_pipe(func->shader->devinfo, I) == I->dep.pipe)) {
+
+      assert(ctx->last_sync->dep.regdist == 0);
+      assert(ctx->last_sync->dep.pipe == TGL_PIPE_NONE);
+
+      I->dep.mode = ctx->last_sync->dep.mode;
+      I->dep.sbid = ctx->last_sync->dep.sbid;
+
+      jay_remove_instruction(ctx->last_sync);
+   }
+
+   if (exec_pipe != TGL_PIPE_NONE) {
+      /* Advance the IP by the number of physical instructions emitted */
+      ctx->ip[exec_pipe] +=
+         jay_macro_length(I) << jay_simd_split(func->shader, I);
+
+      struct gpr_range r = def_to_gpr(func, I, I->dst);
+      uint32_t now = make_writer(exec_pipe, ctx->ip[exec_pipe]);
+
+      for (unsigned i = 0; i < r.width; ++i) {
+         ctx->access[r.base + i][0] = now;
+      }
 
-      /* Read-after-write */
       jay_foreach_src(I, s) {
-         depend_on_writer(state, def_to_gpr(func, I, I->src[s]), dep, exec_pipe,
-                          false);
-      }
-
-      /* If dependency P implies dependency Q, drop dependency Q to avoid
-       * unnecessary annotations.
-       */
-      jay_foreach_pipe(p) {
-         if (dep[p]) {
-            jay_foreach_pipe(q) {
-               if (p != q && dep[q] && state->finished_ip[p][q] >= dep[q]) {
-                  dep[q] = 0;
-               }
-            }
-         }
-      }
-
-      uint32_t wait_pipes = 0;
-      unsigned min_delta = 7;
-
-      jay_foreach_pipe(p) {
-         if (dep[p] && (exec_pipe == TGL_PIPE_NONE /* TODO: Sends */ ||
-                        dep[p] > state->finished_ip[exec_pipe][p])) {
-
-            min_delta = MIN2(min_delta, state->ip[p] - dep[p] + 1);
-            wait_pipes |= BITFIELD_BIT(p);
-         }
-      }
-
-      /* We'll wait on the unioned dependency. Update the tracking for that. */
-      u_foreach_bit(p, wait_pipes) {
-         state->finished_ip[exec_pipe][p] = state->ip[p] + 1 - min_delta;
-      }
-
-      uint32_t last_pipe = util_logbase2(wait_pipes);
-      bool single_wait = wait_pipes == BITFIELD_BIT(last_pipe);
-
-      /* If we're SIMD split the same way as our dependency, we can relax the
-       * dependency to have each half wait in parallel. We could do even better
-       * with more tracking but this should be good enough for now.
-       */
-      unsigned simd_split = jay_simd_split(func->shader, I);
-      unsigned shape = ((simd_split << 2) | jay_macro_length(I)) + 1;
-      bool same_shape = state->last_shape[last_pipe] == shape;
-
-      if (simd_split && same_shape && single_wait && min_delta == 1) {
-         min_delta += ((1 << simd_split) - 1) * jay_macro_length(I);
-         I->replicate_dep = true;
-         I->decrement_dep = last_pipe != exec_pipe;
-      }
-
-      bool has_sbid = I->op == JAY_OPCODE_SEND && !jay_send_eot(I);
-      I->dep = (struct tgl_swsb) {
-         .sbid = has_sbid ? jay_send_sbid(I) : 0,
-         .mode = has_sbid ? TGL_SBID_SET : TGL_SBID_NULL,
-         .regdist = wait_pipes ? min_delta : 0,
-         .pipe = single_wait && (!has_sbid ||
-                                 last_pipe == TGL_PIPE_FLOAT ||
-                                 last_pipe == TGL_PIPE_INT) ?
-                    last_pipe :
-                    TGL_PIPE_ALL,
-      };
-
-      /* Fold the immediate preceding SYNC.nop into this instruction, allowing
-       * us to wait on both ALU and a SEND in the same annotation. We cannot do
-       * this safely in the presence of predication or SIMD splitting that could
-       * cause any part of the instruction to get shot down, skipping the sync
-       * for future instructions (at least not without more tricky logic).
-       */
-      if (last_sync &&
-          jay_sync_op(last_sync) == TGL_SYNC_NOP &&
-          I->dep.mode == TGL_SBID_NULL &&
-          !I->predication &&
-          !jay_simd_split(func->shader, I) &&
-          (I->dep.regdist == 0 ||
-           inferred_sync_pipe(func->shader->devinfo, I) == I->dep.pipe)) {
-
-         assert(last_sync->dep.regdist == 0);
-         assert(last_sync->dep.pipe == TGL_PIPE_NONE);
-
-         I->dep.mode = last_sync->dep.mode;
-         I->dep.sbid = last_sync->dep.sbid;
-
-         jay_remove_instruction(last_sync);
-      }
-
-      if (exec_pipe != TGL_PIPE_NONE) {
-         /* Advance the IP by the number of physical instructions emitted */
-         state->ip[exec_pipe] +=
-            jay_macro_length(I) << jay_simd_split(func->shader, I);
-
-         struct gpr_range r = def_to_gpr(func, I, I->dst);
-         uint32_t now = make_writer(exec_pipe, state->ip[exec_pipe]);
-
+         struct gpr_range r = def_to_gpr(func, I, I->src[s]);
          for (unsigned i = 0; i < r.width; ++i) {
-            state->access[r.base + i][0] = now;
+            ctx->access[r.base + i][exec_pipe] = ctx->ip[exec_pipe];
          }
-
-         jay_foreach_src(I, s) {
-            struct gpr_range r = def_to_gpr(func, I, I->src[s]);
-            for (unsigned i = 0; i < r.width; ++i) {
-               state->access[r.base + i][exec_pipe] = state->ip[exec_pipe];
-            }
-         }
-
-         state->last_shape[exec_pipe] = shape;
       }
 
-      last_sync = NULL;
+      ctx->last_shape[exec_pipe] = shape;
    }
 }
 
@@ -428,7 +418,14 @@ jay_lower_scoreboard(jay_shader *shader)
       }
 
       jay_foreach_block(f, block) {
-         lower_regdist_local(f, block, &state);
+         state.last_sync = NULL; /* a SYNC never folds across blocks */
+         jay_foreach_inst_in_block_safe(block, I) {
+            bool is_sync = I->op == JAY_OPCODE_SYNC; /* kept for folding */
+            if (!is_sync) {
+               lower_regdist(f, I, &state);
+            }
+            state.last_sync = is_sync ? I : NULL;
+         }
       }
    }