From c9eb9ea52a3ed2d6c8a1b734f451cf6481d3dd81 Mon Sep 17 00:00:00 2001
From: Alyssa Rosenzweig <alyssa.rosenzweig@intel.com>
Date: Thu, 28 May 2026 11:05:30 -0400
Subject: [PATCH] jay/partition: pick better partitions

Look at what the program actually does instead of hardcoding a worst case.

SIMD16:
   Totals from 1965 (74.23% of 2647) affected shaders:
   Instrs: 2603230 -> 2539932 (-2.43%); split: -3.44%, +1.01%
   CodeSize: 38826160 -> 37811904 (-2.61%); split: -3.59%, +0.97%
   Number of spill instructions: 1206 -> 555 (-53.98%)
   Number of fill instructions: 1194 -> 551 (-53.85%)

SIMD32:
   Totals from 1974 (74.57% of 2647) affected shaders:
   Instrs: 3998126 -> 3033333 (-24.13%); split: -24.18%, +0.05%
   CodeSize: 59563952 -> 45580448 (-23.48%); split: -23.52%, +0.05%
   Number of spill instructions: 43534 -> 37471 (-13.93%); split: -13.97%, +0.04%
   Number of fill instructions: 43118 -> 36412 (-15.55%)

Signed-off-by: Alyssa Rosenzweig <alyssa.rosenzweig@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/41872>
---
 src/intel/compiler/jay/jay_partition.c | 242 +++++++++++++++++--------
 1 file changed, 168 insertions(+), 74 deletions(-)

diff --git a/src/intel/compiler/jay/jay_partition.c b/src/intel/compiler/jay/jay_partition.c
index 63d7ccab4f9..4c7c428624f 100644
--- a/src/intel/compiler/jay/jay_partition.c
+++ b/src/intel/compiler/jay/jay_partition.c
@@ -3,19 +3,75 @@
  * SPDX-License-Identifier: MIT
  */
 
+#include "util/u_math.h"
 #include "jay_ir.h"
 #include "jay_private.h"
 
 /*
- * jay_partition_grf partitions the register file for the entire shader,
- * satisfying functional and performance rules. The partition is specified in a
- * convenient form within this file, as a flat array of jay_partition_builder
- * structs, which build_partition translates to the more complicated
- * jay_partition structs.
+ * In addition to having enough total registers globally, the partition needs to
+ * have enough contiguous registers in each file/stride to allocate any single
+ * instruction in isolation. analyze_per_inst calculates bounds on the numbers
+ * of required contiguous registers.
  *
- * All functions must share the same partition for correctness with non-uniform
- * function calls. For unlinked library functions, we must use the ABI
- * partition (TODO).
+ * This analysis is overly conservative, implicitly assuming that all operands
+ * of a given file/stride must be contiguous. This could be improved with a lot
+ * more bookkeeping, but it's unclear if it matters much in practice.
+ *
+ * Note the util_next_power_of_two below is critical for correctness. Although
+ * RA handles non-power-of-two vectors, it aligns vectors to the next
+ * power-of-two size. That doesn't affect global register demand, but it needs
+ * to be reflected in our per-instruction minimum.
+ *
+ * As an example, an instruction with sources (vec3, vec2) requires at least 6
+ * contiguous registers to satisfy RA, since the vec3 gets vec4 alignment and
+ * the vec2 gets vec2 alignment. In principle, future RA improvements might
+ * relax this, but we're not there yet.
+ */
+struct instruction_req {
+   unsigned gpr[JAY_NUM_STRIDES]; /* contiguous GPRs, indexed by stride */
+   unsigned ugpr;                 /* contiguous UGPRs */
+};
+
+static struct instruction_req
+analyze_per_inst(jay_shader *shader)
+{
+   struct instruction_req global = { 0 }; /* running max over the shader */
+
+   jay_foreach_inst_in_shader(shader, func, I) {
+      struct instruction_req local = { 0 }; /* totals for I alone */
+      /* i == -1 visits the destination; i >= 0 visits source i. */
+      for (int i = -1; i < I->num_srcs; ++i) {
+         jay_def x = i >= 0 ? I->src[i] : I->dst;
+         if (!jay_is_null(x)) {
+            unsigned size = util_next_power_of_two(jay_num_values(x));
+
+            if (x.file == UGPR) {
+               local.ugpr += size;
+            } else if (x.file == GPR && i >= 0) {
+               enum jay_stride min_stride = jay_src_stride_minmax(I, i, false);
+               local.gpr[min_stride] += size;
+            } else if (x.file == GPR) {
+               enum jay_stride min_stride = jay_dst_stride_minmax(I, false);
+               local.gpr[min_stride] += size;
+            }
+         }
+      }
+
+      /* Keep the largest per-instruction total seen for each file/stride. */
+      global.ugpr = MAX2(global.ugpr, local.ugpr);
+
+      for (unsigned i = 0; i < ARRAY_SIZE(global.gpr); ++i) {
+         global.gpr[i] = MAX2(global.gpr[i], local.gpr[i]);
+      }
+   }
+
+   return global;
+}
+
+/*
+ * The partition is specified in a convenient form within jay_partition_grf(),
+ * as a flat array of jay_partition_builder structs, which build_partition
+ * translates to the more complicated jay_partition structs.
  */
 struct jay_partition_builder {
    enum jay_file file;
@@ -100,72 +156,20 @@ build_partition(jay_shader *shader, struct jay_partition_builder *b, unsigned n)
    assert(BITSET_COUNT(regs) == JAY_NUM_PHYS_GRF && "all GRFs mapped");
 }
 
+/*
+ * Partition the register file for the entire shader.
+ *
+ * All functions must share the same partition for correctness with non-uniform
+ * function calls. For unlinked library functions, we must use the ABI
+ * partition (TODO).
+ */
 void
 jay_partition_grf(jay_shader *shader)
 {
-   /* Calculate the maximum register demand across all functions in the shader.
-    * We will use this to choose a good partition.
-    */
-   unsigned demand[JAY_NUM_GRF_FILES] = { 0 };
-
-   jay_foreach_function(shader, f) {
-      jay_compute_liveness(f);
-      jay_calculate_register_demands(f);
-
-      demand[GPR] = MAX2(demand[GPR], f->demand[GPR]);
-      demand[UGPR] = MAX2(demand[UGPR], f->demand[UGPR]);
-   }
-
-   /* We must have enough register file space for the register payload, plus the
-    * reserved UGPRs in the case we spill. That UGPR interferes with everything
-    * we preload so it needs to be reserved specially here for the worst case.
-    */
-   jay_foreach_preload(jay_shader_get_entrypoint(shader), I) {
-      unsigned end = jay_preload_reg(I) + jay_num_values(I->dst);
-      unsigned extra = I->dst.file == UGPR ? shader->dispatch_width : 0;
-      assert(I->dst.file < JAY_NUM_GRF_FILES);
-      demand[I->dst.file] = MAX2(demand[I->dst.file], end + extra);
-   }
-
-   /* Determine a good GPR/UGPR split informed by the demand calculation */
    unsigned grf_per_gpr = jay_grf_per_gpr(shader);
    unsigned ugpr_per_grf = jay_ugpr_per_grf(shader);
-   unsigned uniform_grfs = DIV_ROUND_UP(demand[UGPR], ugpr_per_grf);
-
-   /* We must have enough for SIMD1 images (TODO: Check if this actually
-    * applies. Or if we could eliminate this with smarter partitioning even.)
-    */
-   unsigned min_ugprs = 16;
-   min_ugprs = MAX2(min_ugprs, 256);
-
-   /* TODO: We could partition more cleverly */
-   uniform_grfs = align(uniform_grfs, grf_per_gpr);
-   uniform_grfs = CLAMP(uniform_grfs, DIV_ROUND_UP(min_ugprs, ugpr_per_grf),
-                        128 - (32 * grf_per_gpr));
-   unsigned nonuniform_grfs = JAY_NUM_PHYS_GRF - uniform_grfs;
-
-   /* Check the split */
-   assert((uniform_grfs * ugpr_per_grf) >= min_ugprs);
-   assert(nonuniform_grfs >= 32 * grf_per_gpr);
-   assert((uniform_grfs + nonuniform_grfs) == JAY_NUM_PHYS_GRF);
-
-   /* Set the targets for the virtual register file accordingly */
-   shader->num_regs[GPR] = nonuniform_grfs / grf_per_gpr;
-   shader->num_regs[UGPR] = uniform_grfs * ugpr_per_grf;
-
-   unsigned spill_reservation = 0, mem_slots = 0;
-
-   /* Spilling requires reserving UGPRs for the lowered SENDs */
-   if (demand[GPR] > jay_gpr_limit(shader)) {
-      spill_reservation = shader->dispatch_width / ugpr_per_grf;
-
-      /* This should be an acceptable upper limit since we assign memory tightly
-       * thanks to the usual SSA allocator guarantees.
-       */
-      mem_slots = demand[GPR] * grf_per_gpr;
-      shader->num_regs[MEM] = demand[GPR];
-   }
 
+   /* Determine the shape of the payload/EOT sections upfront. */
    unsigned payload_4[2] = { 0, 0 }, payload_u[2] = { grf_per_gpr, 0 };
    unsigned eot_u = 0, eot_4 = 0;
 
@@ -200,12 +204,102 @@ jay_partition_grf(jay_shader *shader)
       eot_4 = eot_u = 0;
    }
 
-   unsigned special_u = payload_u[0] + payload_u[1] + spill_reservation + eot_u;
-   unsigned special_4 = payload_4[0] + payload_4[1] + eot_4;
+   /* Calculate the maximum register demand across all functions in the shader.
+    * We will use this to choose a good partition.
+    */
+   unsigned demand[JAY_NUM_GRF_FILES] = { 0 };
+   struct instruction_req instr_req = analyze_per_inst(shader);
 
-   /* TODO: Make the stride partition smarter */
-   unsigned grf_8 = 8 * grf_per_gpr;
-   unsigned grf_2 = 8;
+   jay_foreach_function(shader, f) {
+      jay_compute_liveness(f);
+      jay_calculate_register_demands(f);
+
+      demand[GPR] = MAX2(demand[GPR], f->demand[GPR]);
+      demand[UGPR] = MAX2(demand[UGPR], f->demand[UGPR]);
+   }
+
+   /* We must have enough register file space for the register payload, plus the
+    * reserved UGPRs in the case we spill. That UGPR interferes with everything
+    * we preload so it needs to be reserved specially here for the worst case.
+    */
+   jay_foreach_preload(jay_shader_get_entrypoint(shader), I) {
+      unsigned end = jay_preload_reg(I) + jay_num_values(I->dst);
+      unsigned extra = I->dst.file == UGPR ? shader->dispatch_width : 0;
+      assert(I->dst.file < JAY_NUM_GRF_FILES);
+      demand[I->dst.file] = MAX2(demand[I->dst.file], end + extra);
+   }
+
+   unsigned uniform_grfs, nonuniform_grfs;
+   unsigned spilling_grfs = 0, mem_slots = 0;
+   unsigned special_4 = payload_4[0] + payload_4[1] + eot_4, special_u;
+
+   /* Our current stride partition heuristic is rather dumb: allocate as few
+    * non-32-bit GRFs as possible, and assign the rest to 32-bit. This performs
+    * well for 32-bit code but poorly for 16-bit/64-bit heavy routines.
+    * We'll need stride-aware demand calculations to fix that (TODO).
+    */
+   unsigned grf_8 = align(instr_req.gpr[JAY_STRIDE_8], 2) * grf_per_gpr;
+   unsigned grf_2 = instr_req.gpr[JAY_STRIDE_2] * grf_per_gpr;
+
+   for (unsigned spilling = 0; spilling <= 1; spilling++) {
+      /* There is an interdependence between partition choice and spilling,
+       * because spilling requires reserved UGPRs for the lowered SENDs. The
+       * solution is to first try to build a partition that forbids spilling,
+       * and if that fails, build one with it.
+       */
+      spilling_grfs = spilling ? shader->dispatch_width / ugpr_per_grf : 0;
+      special_u = payload_u[0] + payload_u[1] + spilling_grfs + eot_u;
+
+      /* We want to determine a good GPR/UGPR split by the demand calculation.
+       * At minimum we need to not spill UGPRs, but if GPR pressure is low we
+       * want to take extra UGPRs to reduce shuffling.
+       */
+      uniform_grfs = DIV_ROUND_UP(demand[UGPR], ugpr_per_grf) + spilling_grfs;
+      unsigned bonus_grfs = 4 * grf_per_gpr;
+      unsigned estimate_nonunif_grf =
+         (demand[GPR] * grf_per_gpr) + grf_8 + grf_2 + special_4;
+
+      if ((uniform_grfs + estimate_nonunif_grf + bonus_grfs) <=
+          JAY_NUM_PHYS_GRF) {
+         uniform_grfs += bonus_grfs;
+      }
+
+      /* If the minimum vector length can't fit in any single existing block, we
+       * will need a new block for it. This is quite conservative.
+       */
+      unsigned min_ugprs = special_u * ugpr_per_grf;
+      if (instr_req.ugpr > payload_u[0] * ugpr_per_grf &&
+          instr_req.ugpr > payload_u[1] * ugpr_per_grf &&
+          instr_req.ugpr > eot_u * ugpr_per_grf) {
+
+         min_ugprs += instr_req.ugpr;
+      }
+
+      /* Finally, we need to snap to GPR bounds */
+      uniform_grfs = CLAMP(uniform_grfs, DIV_ROUND_UP(min_ugprs, ugpr_per_grf),
+                           128 - (32 * grf_per_gpr));
+      uniform_grfs = align(uniform_grfs, grf_per_gpr);
+      nonuniform_grfs = JAY_NUM_PHYS_GRF - uniform_grfs;
+
+      /* Set the targets for the virtual register file accordingly */
+      shader->num_regs[GPR] = nonuniform_grfs / grf_per_gpr;
+      shader->num_regs[UGPR] = uniform_grfs * ugpr_per_grf;
+
+      /* jay_gpr_limit depends on shader->num_regs[GPR]. If we're under the
+       * limit without spilling, we're good to go.
+       */
+      if (demand[GPR] <= jay_gpr_limit(shader) && !spilling) {
+         break;
+      }
+   }
+
+   /* This should be an acceptable upper limit since we assign memory
+    * tightly thanks to the usual SSA allocator guarantees.
+    */
+   if (spilling_grfs) {
+      mem_slots = demand[GPR] * grf_per_gpr;
+      shader->num_regs[MEM] = demand[GPR];
+   }
 
    struct jay_partition_builder blocks[] = {
       /* Stage-specific payload */
@@ -221,7 +315,7 @@ jay_partition_grf(jay_shader *shader)
       { GPR, JAY_STRIDE_2, grf_2 },
 
       /* Spilling registers */
-      { UGPR, 0, spill_reservation, JAY_BLOCK_SPILL },
+      { UGPR, 0, spilling_grfs, JAY_BLOCK_SPILL },
       { MEM, JAY_STRIDE_4, mem_slots },
 
       /* EOT */