From 2fcf6a94a03880dda156201fa98ad28dcd7aa74e Mon Sep 17 00:00:00 2001
From: Alyssa Rosenzweig <alyssa.rosenzweig@intel.com>
Date: Thu, 18 Jun 2026 12:43:05 -0400
Subject: [PATCH] jay: improve the stride partition heuristic

This helps MHW a bunch; I just added that fossil to my collection.

SIMD16:
   Totals from 607 (8.61% of 7050) affected shaders:
   Instrs: 551809 -> 504620 (-8.55%); split: -8.67%, +0.12%
   CodeSize: 7935424 -> 7217680 (-9.04%); split: -9.17%, +0.12%

SIMD32:
   Totals from 468 (6.64% of 7050) affected shaders:
   Instrs: 432455 -> 396733 (-8.26%); split: -8.86%, +0.60%
   CodeSize: 6276144 -> 5734736 (-8.63%); split: -9.20%, +0.58%

Signed-off-by: Alyssa Rosenzweig <alyssa.rosenzweig@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/42287>
---
 src/intel/compiler/jay/jay_partition.c | 74 +++++++++++++++++++++-----
 1 file changed, 62 insertions(+), 12 deletions(-)

diff --git a/src/intel/compiler/jay/jay_partition.c b/src/intel/compiler/jay/jay_partition.c
index 92ac3d28b88..aac05be4261 100644
--- a/src/intel/compiler/jay/jay_partition.c
+++ b/src/intel/compiler/jay/jay_partition.c
@@ -3,6 +3,7 @@
  * SPDX-License-Identifier: MIT
  */
 
+#include <math.h>
 #include "util/u_math.h"
 #include "jay_ir.h"
 #include "jay_private.h"
@@ -274,13 +275,13 @@ jay_partition_grf(jay_shader *shader)
    unsigned spilling_grfs = 0, mem_slots = 0;
    unsigned special_4 = payload_4[0] + payload_4[1] + eot_4, special_u;
 
-   /* Our current stride partition heuristic is rather dumb: allocate as few
-    * non-32-bit GRFs as possible, and assign the rest to 32-bit. This performs
-    * well for 32-bit code but poorly for 16-bit/64-bit heavy routines.
-    * We'll need stride-aware demand calculations to fix that (TODO).
-    */
-   unsigned grf_8 = align(instr_req.gpr[JAY_STRIDE_8], 2) * grf_per_gpr;
-   unsigned grf_2 = instr_req.gpr[JAY_STRIDE_2] * grf_per_gpr;
+   unsigned increment[JAY_NUM_STRIDES] = { grf_per_gpr, grf_per_gpr,
+                                           2 * grf_per_gpr };
+   unsigned min_grf[JAY_NUM_STRIDES] = {};
+   for (unsigned i = 0; i < JAY_NUM_STRIDES; ++i) {
+      min_grf[i] = align(instr_req.gpr[i] * grf_per_gpr, increment[i]);
+   }
+
    unsigned mapped_accums = grf_per_gpr == 1 ? 2 : 0;
 
    for (unsigned spilling = 0; spilling <= 1; spilling++) {
@@ -298,8 +299,10 @@ jay_partition_grf(jay_shader *shader)
        */
       uniform_grfs = DIV_ROUND_UP(demand[UGPR], ugpr_per_grf) + spilling_grfs;
       unsigned bonus_grfs = 4 * grf_per_gpr;
-      unsigned estimate_nonunif_grf =
-         (demand[GPR] * grf_per_gpr) + grf_8 + grf_2 + special_4;
+      unsigned estimate_nonunif_grf = (demand[GPR] * grf_per_gpr) +
+                                      min_grf[JAY_STRIDE_8] +
+                                      min_grf[JAY_STRIDE_2] +
+                                      special_4;
 
       if ((uniform_grfs + estimate_nonunif_grf + bonus_grfs) <=
           JAY_NUM_PHYS_GRF) {
@@ -343,6 +346,53 @@ jay_partition_grf(jay_shader *shader)
       shader->num_regs[MEM] = demand[GPR];
    }
 
+   /* Now that we've decided how many GRFs to use for GPRs, we need to partition
+    * those GRFs by stride. This does not affect spilling but it has a
+    * significant effect on moves inserted by RA. We use a simple heuristic to
+    * pick a balanced partition: give each stride GRFs proportionate to the
+    * number of SSA defs with that associated stride, plus a slight bias towards
+    * 32-bit to avoid division by zero. This reflects our intuition that shaders
+    * heavy on 16-bit (or 64-bit) arithmetic should have more 16-bit (or 64-bit)
+    * registers overall.
+    */
+   unsigned counts[3] = { [JAY_STRIDE_4] = 1 };
+   jay_foreach_inst_in_shader(shader, block, I) {
+      if (I->dst.file == GPR) {
+         counts[jay_dst_stride_minmax(I, false)] += jay_num_values(I->dst);
+      }
+   }
+
+   min_grf[JAY_STRIDE_4] = MAX2(min_grf[JAY_STRIDE_4], special_4);
+   unsigned denom_i = counts[0] + counts[1] + counts[2];
+   float factor = nonuniform_grfs / ((float) denom_i);
+
+   unsigned picked_grf[JAY_NUM_STRIDES] = {}, total = 0;
+   for (unsigned i = 0; i < JAY_NUM_STRIDES; ++i) {
+      float ideal = ((float) counts[i]) * factor;
+
+      picked_grf[i] = align(MAX2(roundf(ideal), min_grf[i]), increment[i]);
+      total += picked_grf[i];
+   }
+
+   if (total < nonuniform_grfs) {
+      /* If we have GRFs to spare due to rounding, put them on 32-bit */
+      picked_grf[JAY_STRIDE_4] += nonuniform_grfs - total;
+   } else {
+      /* If we used too many GRFs, remove where we can */
+      unsigned excess = total - nonuniform_grfs;
+      assert(util_is_aligned(excess, grf_per_gpr));
+
+      for (unsigned i = 0; i < JAY_NUM_STRIDES; ++i) {
+         while (excess && picked_grf[i] > min_grf[i]) {
+            assert(excess >= increment[i]);
+            picked_grf[i] -= increment[i];
+            excess -= increment[i];
+         }
+      }
+   }
+
+   assert(picked_grf[0] + picked_grf[1] + picked_grf[2] == nonuniform_grfs);
+
    struct jay_partition_builder blocks[] = {
       /* Stage-specific payload */
       { UGPR, 0, payload_u[0] },
@@ -352,9 +402,9 @@ jay_partition_grf(jay_shader *shader)
 
       /* General registers */
       { UGPR, 0, uniform_grfs - special_u },
-      { GPR, JAY_STRIDE_4, nonuniform_grfs - (special_4 + grf_8 + grf_2) },
-      { GPR, JAY_STRIDE_8, grf_8 },
-      { GPR, JAY_STRIDE_2, grf_2 },
+      { GPR, JAY_STRIDE_4, picked_grf[JAY_STRIDE_4] - special_4 },
+      { GPR, JAY_STRIDE_8, picked_grf[JAY_STRIDE_8] },
+      { GPR, JAY_STRIDE_2, picked_grf[JAY_STRIDE_2] },
 
       /* Spilling registers */
       { UGPR, 0, spilling_grfs, JAY_BLOCK_SPILL },