jay: rewrite partition handling

Jay's novel SSA-based register allocator relies on a fixed partition of Intel GRFs mapping to logical GPRs. Previously, Jay used a simple partitioning scheme, which was good enough for simple compute and fragment shaders, but has both limitations preventing new feature bring-up and performance issues. Here we rewrite the Jay partitioning code at the heart of the Jay RA in order to lift these restrictions and allow fully flexible partitions. This should be easier to reason about, fix a bunch of issues around simd32 payloads, enable better performance, etc. The # of stride 16 GRFs reserved is halved in simd32 mode here to match how multisampling stuff works, which explains the large simd32-only instruction count reduction. While churning all this code, I took the opportunity to break off jay_partition.c... I think that is better organized and the diff was garbage otherwise. SIMD16: Totals from 2189 (82.70% of 2647) affected shaders: Instrs: 2702159 -> 2670951 (-1.15%); split: -1.41%, +0.26% CodeSize: 40296128 -> 39850304 (-1.11%); split: -1.40%, +0.30% SIMD32: Totals from 2373 (89.65% of 2647) affected shaders: Instrs: 4559418 -> 4072897 (-10.67%); split: -10.77%, +0.10% CodeSize: 68185488 -> 60635616 (-11.07%); split: -11.17%, +0.09% Number of spill instructions: 44069 -> 44055 (-0.03%) Number of fill instructions: 43292 -> 43278 (-0.03%) Signed-off-by: Alyssa Rosenzweig <alyssa.rosenzweig@intel.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/41808>
2026-06-04 10:58:15 +02:00 · 2026-05-26 11:15:35 -04:00 · 2026-05-26 11:15:35 -04:00 · 598931f653
commit 598931f653
parent 1860e7af30
9 changed files with 605 additions and 472 deletions
--- a/src/intel/compiler/jay/jay_from_nir.c
+++ b/src/intel/compiler/jay/jay_from_nir.c
@ -2522,11 +2522,6 @@ setup_vertex_payload(struct nir_to_jay_state *nj, struct payload_builder *p)
 {
   nj->payload.urb_handle = read_payload(p, GPR);

-   /* XXX: This is a hack to line up with the partition chosen in RA. This whole
-    * thing needs an overhaul. Need to think harder about partitioning.
-    */
-   p->offsets[GPR] += 7;
-
   setup_payload_dispatch_start(nj, p);
   setup_payload_push(nj, p);

@ -2605,22 +2600,52 @@ setup_fragment_payload(struct nir_to_jay_state *nj, struct payload_builder *p)
      fs->bary[i] = read_vector_payload(p, GPR, 2);
   }

-   if (nj->s->prog_data->fs.uses_src_depth) {
-      fs->coord.z = read_payload(p, GPR);
+   struct {
+      bool cond;
+      jay_def *def;
+   } split_gprs[] = {
+      { nj->s->prog_data->fs.uses_src_depth,   &fs->coord.z       },
+      { nj->s->prog_data->fs.uses_src_w,       &fs->coord.w       },
+      { nj->s->prog_data->fs.uses_sample_mask, &fs->coverage_mask },
+   };
+
+   unsigned extra_gpr =
+      split_gprs[0].cond + split_gprs[1].cond + split_gprs[2].cond;
+   bool odd = extra_gpr & 1;
+
+   for (unsigned i = 0; i < ARRAY_SIZE(split_gprs); ++i) {
+      if (split_gprs[i].cond) {
+         extra_gpr -= 1;
+
+         /* Pad out to GPR alignment by reading the last split GPR as two UGPR
+          * halves and zipping them together below. This lets us construct a
+          * valid partition with minimal copying.
+          */
+         if (extra_gpr == 0 && jay_grf_per_gpr(nj->s) == 2 && odd) {
+            *split_gprs[i].def =
+               read_vector_payload(p, UGPR, jay_ugpr_per_grf(nj->s));
+         } else {
+            *split_gprs[i].def = read_payload(p, GPR);
+         }
+      }
   }

-   if (nj->s->prog_data->fs.uses_src_w) {
-      fs->coord.w = read_payload(p, GPR);
-   }
-
-   if (nj->s->prog_data->fs.uses_sample_mask) {
-      fs->coverage_mask = read_payload(p, GPR);
-   }
+   assert(extra_gpr == 0);

   if (nj->s->prog_data->fs.uses_pos_offset) {
      fs->sample_pos = read_vector_payload(p, UGPR, jay_ugpr_per_grf(nj->s));
   }

+   nj->s->payload_ugprs = p->offsets[UGPR];
+
+   jay_def split[3] = { jay_null() };
+   for (unsigned i = 0; i < ARRAY_SIZE(split_gprs); ++i) {
+      if (!jay_is_null(*split_gprs[i].def) &&
+          (*split_gprs[i].def).file == UGPR) {
+         split[i] = read_vector_payload(p, UGPR, jay_ugpr_per_grf(nj->s));
+      }
+   }
+
   setup_payload_dispatch_start(nj, p);
   setup_payload_push(nj, p);

@ -2639,6 +2664,13 @@ setup_fragment_payload(struct nir_to_jay_state *nj, struct payload_builder *p)
      }
   }

+   for (unsigned i = 0; i < ARRAY_SIZE(split_gprs); ++i) {
+      if (!jay_is_null(split[i]) && split_gprs[i].def->file == UGPR) {
+         *(split_gprs[i].def) =
+            jay_ZIP_UGPR16_u32(&nj->bld, *split_gprs[i].def, split[i]);
+      }
+   }
+
   if (nj->s->prog_data->fs.uses_src_xy) {
      jay_def t = jay_alloc_def(&nj->bld, GPR, 1);
      jay_def lo = jay_extract_range(nj->payload.u0, 10, 4);
@ -2675,7 +2707,6 @@ jay_insert_payload_swizzle(jay_shader *s)
   jay_builder b = jay_init_builder(func, jay_before_function(func));

   unsigned size = s->payload_gprs;
-   assert(s->partition.blocks[GPR][0].start == 1);

   /* Odd: copy both halves to contiguous pair after payload */
   for (unsigned i = 0; i < (size / 2); ++i) {
@ -2936,6 +2967,8 @@ jay_compile(const struct intel_device_info *devinfo,
   if (debug) {
      fprintf(stdout, "Jay shader (post-RA):\n\n");
      jay_print(stdout, s);
+
+      jay_print_partition(&s->partition);
   }

   struct jay_shader_bin *bin =
--- a/src/intel/compiler/jay/jay_ir.h
+++ b/src/intel/compiler/jay/jay_ir.h
@ -745,35 +745,66 @@ jay_stride_to_bits(enum jay_stride s)
   return 16 << s;
 }

-#define JAY_PARTITION_BLOCKS (3)
+#define jay_foreach_ra_file(file)                                              \
+   for (enum jay_file file = 0; file < JAY_NUM_RA_FILES; ++file)
+
+#define JAY_PARTITION_BLOCKS (6)
+
+/**
+ * Special role of a partition block. Stored in a 2-bit field of
+ * struct jay_register_block.
+ */
+enum jay_block_type {
+   /** A block with no special feature */
+   JAY_BLOCK_NORMAL,
+
+   /** A block suitable for EOT sends */
+   JAY_BLOCK_EOT,
+
+   /** A block reserved for post-RA spill lowering */
+   JAY_BLOCK_SPILL,
+
+   /** Number of block types (not a real type) */
+   JAY_BLOCK_TYPES,
+};

 struct jay_register_block {
-   uint16_t start, len;
+   /** First GRF mapped by this block */
+   uint16_t start_grf;
+
+   /** First GPR/UGPR mapped by this block */
+   uint16_t start_gpr;
+
+   /** Length of this block in GPRs/UGPRs */
+   uint16_t len_gpr;
+
+   /** For GPR blocks, stride of GPRs in this block. */
+   enum jay_stride stride;
+
+   /** Special feature of the block */
+   enum jay_block_type type:2;
 };
+static_assert(sizeof(struct jay_register_block) == 8, "packed");

 struct jay_partition {
-   /** Consecutive ranges of GRFs in GPR/UGPRs. */
-   struct jay_register_block blocks[JAY_NUM_GRF_FILES][JAY_PARTITION_BLOCKS];
+   struct jay_register_block blocks[JAY_NUM_RA_FILES][JAY_PARTITION_BLOCKS];
+   unsigned nr_blocks[JAY_NUM_RA_FILES];

   /** Number of GPR/UGPRs per GRF, times 16. For example, 16 encodes SIMD16
-    * 32-bit GPRs on Xe2 (1 GRF = 1 GPR). 256 encodes UGPRs (1 GRF = 16 UGPRs).
-    * 8 encodes SIMD32 32-bit GPRs on Xe2 (2 GRF = 1 GPR).
+    * 32-bit GPRs on Xe2 (1 GRF = 1 GPR). 256 encodes UGPRs (1 GRF = 16
+    * UGPRs). 8 encodes SIMD32 32-bit GPRs on Xe2 (2 GRF = 1 GPR).
    */
-   unsigned units_x16[JAY_NUM_GRF_FILES];
-
-   /** Base GPR for each stride. The file is partitioned (4, 8, 2, 4=EOT). */
-   unsigned base8, base2, base_eot;
-
-   /** Region of the UGPR partition suitable for large UGPR vectors */
-   struct jay_register_block large_ugpr_block;
+   unsigned units_x16[JAY_NUM_RA_FILES];
 };

-static inline enum jay_stride
-jay_gpr_to_stride(const struct jay_partition *p, unsigned reg)
+static inline struct jay_register_block
+jay_lookup_block(const struct jay_partition *p,
+                 unsigned reg,
+                 enum jay_file file)
 {
-   return (reg < p->base8 || reg >= p->base_eot) ? JAY_STRIDE_4 :
-          reg >= p->base2                        ? JAY_STRIDE_2 :
-                                                   JAY_STRIDE_8;
+   for (unsigned i = 0; i < p->nr_blocks[file]; ++i) {
+      struct jay_register_block B = p->blocks[file][i];
+
+      if (reg >= B.start_gpr && reg < B.start_gpr + B.len_gpr) {
+         return B;
+      }
+   }
+
+   UNREACHABLE("invalid reg");
 }

 /**
@ -786,7 +817,7 @@ typedef struct jay_shader {
   union brw_any_prog_data *prog_data;
   unsigned spills, fills;
   unsigned scratch_size;
-   unsigned payload_gprs, push_grfs;
+   unsigned payload_gprs, payload_ugprs, push_grfs;

   /**
    * Ralloc linear context. Since we don't typically free as we go,
@ -1051,7 +1082,7 @@ static inline enum jay_stride
 jay_def_stride(const jay_shader *shader, jay_def x)
 {
   assert(x.file == GPR);
-   return jay_gpr_to_stride(&shader->partition, x.reg);
+   return jay_lookup_block(&shader->partition, x.reg, GPR).stride;
 }

 /* Represents an allocated register number with file in the top 3 bits. */
--- a/src/intel/compiler/jay/jay_lower_spill.c
+++ b/src/intel/compiler/jay/jay_lower_spill.c
@ -51,9 +51,26 @@ jay_lower_spill(jay_function *func)
 {
   jay_builder b = jay_init_builder(func, jay_before_function(func));

-   /* We reserve the top UGPRs for spilling by ABI */
-   unsigned ugpr_reservation = func->shader->num_regs[UGPR];
-   assert(util_is_aligned(ugpr_reservation, func->shader->dispatch_width));
+   /* We reserved a block of UGPRs for our use */
+   signed ugpr_reservation = -1, gpr2 = -1;
+   for (unsigned i = 0; i < func->shader->partition.nr_blocks[GPR]; ++i) {
+      struct jay_register_block B = func->shader->partition.blocks[GPR][i];
+
+      if (B.stride == JAY_STRIDE_2) {
+         gpr2 = B.start_gpr;
+      }
+   }
+
+   for (unsigned i = 0; i < func->shader->partition.nr_blocks[UGPR]; ++i) {
+      struct jay_register_block B = func->shader->partition.blocks[UGPR][i];
+
+      if (B.type == JAY_BLOCK_SPILL) {
+         ugpr_reservation = B.start_gpr;
+      }
+   }
+
+   assert(ugpr_reservation >= 0 && "must have reserved something");
+   assert(gpr2 >= 0 && "must have a stride-2 gpr");

   jay_def sp = jay_bare_reg(UGPR, ugpr_reservation);
   sp.num_values_m1 = func->shader->dispatch_width - 1;
@ -80,7 +97,7 @@ jay_lower_spill(jay_function *func)
   jay_SHR(&b, JAY_TYPE_U32, ADDRESS_REG, tmpu, 4);

   /* We use a 32-bit strided stack: SP = scratch + (lane ID * 4) */
-   jay_def tmp2 = jay_bare_reg(GPR, func->shader->partition.base2);
+   jay_def tmp2 = jay_bare_reg(GPR, gpr2);
   jay_LANE_ID_8(&b, tmp2);
   for (unsigned i = 8; i < b.shader->dispatch_width; i *= 2) {
      jay_LANE_ID_EXPAND(&b, tmp2, tmp2, i);
--- a/src/intel/compiler/jay/jay_partition.c
+++ b/src/intel/compiler/jay/jay_partition.c
@ -0,0 +1,239 @@
+/*
+ * Copyright 2026 Intel Corporation
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "jay_ir.h"
+#include "jay_private.h"
+
+/*
+ * jay_partition_grf partitions the register file for the entire shader,
+ * satisfying functional and performance rules. The partition is specified in a
+ * convenient form within this file, as a flat array of jay_partition_builder
+ * structs, which build_partition translates to the more complicated
+ * jay_partition structs.
+ *
+ * All functions must share the same partition for correctness with non-uniform
+ * function calls. For unlinked library functions, we must use the ABI
+ * partition (TODO).
+ */
+struct jay_partition_builder {
+   /** Register file the block belongs to */
+   enum jay_file file;
+
+   /** Stride copied into the resulting block */
+   enum jay_stride stride;
+
+   /** Length of the block in GRFs. Zero-length entries are skipped. */
+   signed len_grf;
+
+   /** Special role of the block; JAY_BLOCK_NORMAL when left out */
+   enum jay_block_type type;
+};
+
+/**
+ * Translate a flat list of partition builder entries into shader->partition.
+ *
+ * Entries are laid out in the order given. Each nonzero-length entry for a
+ * GRF-backed file (file < JAY_NUM_GRF_FILES) takes the next len_grf physical
+ * GRFs and the next GPRs/UGPRs of its file. Entries for other files record
+ * start_grf = 0 and do not advance any base. Zero-length entries are skipped,
+ * so callers may list optional blocks unconditionally.
+ *
+ * When assertions are enabled the result is validated: GRF-backed blocks
+ * must be non-empty, in bounds, mutually disjoint, and together cover every
+ * physical GRF.
+ *
+ * @param shader  Shader whose partition is overwritten.
+ * @param b       Array of builder entries.
+ * @param n       Number of entries in b.
+ */
+static void
+build_partition(jay_shader *shader, struct jay_partition_builder *b, unsigned n)
+{
+   unsigned base_grf = 0, base_gpr[JAY_NUM_RA_FILES] = { 0 };
+   struct jay_partition *p = &shader->partition;
+
+   *p = (struct jay_partition) {
+      .units_x16[UGPR] = jay_ugpr_per_grf(shader) * 16,
+      .units_x16[GPR] = 16 / jay_grf_per_gpr(shader),
+      .units_x16[MEM] = 16 / jay_grf_per_gpr(shader),
+   };
+
+   for (unsigned i = 0; i < n; ++i) {
+      /* Callers compute lengths with unsigned subtraction. A negative value
+       * means that arithmetic underflowed; catch it here rather than letting
+       * it be converted to a huge unsigned length and truncated below.
+       */
+      assert(b[i].len_grf >= 0 && "partition block length underflowed");
+
+      if (b[i].len_grf == 0) {
+         continue;
+      }
+
+      enum jay_file file = b[i].file;
+      bool grf = file < JAY_NUM_GRF_FILES;
+      unsigned len_gpr = (b[i].len_grf * p->units_x16[file]) / 16;
+
+      assert(p->nr_blocks[file] < JAY_PARTITION_BLOCKS);
+      assert(len_gpr <= UINT16_MAX && "block length must fit in 16 bits");
+
+      p->blocks[file][p->nr_blocks[file]++] = (struct jay_register_block) {
+         .start_grf = grf ? base_grf : 0,
+         .start_gpr = base_gpr[file],
+         .len_gpr = len_gpr,
+         .stride = b[i].stride,
+         .type = b[i].type,
+      };
+
+      /* Only GRF-backed files consume physical registers */
+      if (grf) {
+         base_grf += b[i].len_grf;
+         base_gpr[file] += len_gpr;
+      }
+   }
+
+   /* Validate the well formedness of the partition we built above */
+   BITSET_DECLARE(regs, JAY_NUM_PHYS_GRF) = { 0 };
+
+   for (enum jay_file file = 0; file < JAY_NUM_GRF_FILES; ++file) {
+      for (unsigned j = 0; j < p->nr_blocks[file]; ++j) {
+         struct jay_register_block B = p->blocks[file][j];
+         unsigned len_grf = (B.len_gpr * 16) / p->units_x16[file];
+
+         assert(len_grf > 0 && "no empty partitions");
+         assert(B.start_grf + len_grf <= JAY_NUM_PHYS_GRF && "GRF file size");
+         assert(!BITSET_TEST_COUNT(regs, B.start_grf, len_grf) && "uniqueness");
+
+         BITSET_SET_COUNT(regs, B.start_grf, len_grf);
+      }
+   }
+
+   assert(BITSET_COUNT(regs) == JAY_NUM_PHYS_GRF && "all GRFs mapped");
+}
+
+/**
+ * Choose the GRF partition for the whole shader and store it in
+ * shader->partition.
+ *
+ * Steps, in order:
+ *  1. Take the maximum GPR/UGPR demand over all functions, then raise it to
+ *     cover the preloaded payload registers of the entrypoint.
+ *  2. Split the physical GRF file into uniform and non-uniform parts and set
+ *     shader->num_regs[GPR] / shader->num_regs[UGPR] accordingly.
+ *  3. If GPR demand exceeds jay_gpr_limit(), reserve UGPRs for spill lowering
+ *     and size the MEM file.
+ *  4. Pick stage-specific payload and EOT block sizes and hand the flat block
+ *     list to build_partition().
+ *
+ * Also sets prog_data->base.grf_used to JAY_NUM_PHYS_GRF.
+ */
+void
+jay_partition_grf(jay_shader *shader)
+{
+   /* Calculate the maximum register demand across all functions in the shader.
+    * We will use this to choose a good partition.
+    */
+   unsigned demand[JAY_NUM_GRF_FILES] = { 0 };
+
+   jay_foreach_function(shader, f) {
+      jay_compute_liveness(f);
+      jay_calculate_register_demands(f);
+
+      demand[GPR] = MAX2(demand[GPR], f->demand[GPR]);
+      demand[UGPR] = MAX2(demand[UGPR], f->demand[UGPR]);
+   }
+
+   /* We must have enough register file space for the register payload, plus the
+    * reserved UGPRs in the case we spill. That UGPR interferes with everything
+    * we preload so it needs to be reserved specially here for the worst case.
+    */
+   jay_foreach_preload(jay_shader_get_entrypoint(shader), I) {
+      unsigned end = jay_preload_reg(I) + jay_num_values(I->dst);
+      unsigned extra = I->dst.file == UGPR ? shader->dispatch_width : 0;
+      assert(I->dst.file < JAY_NUM_GRF_FILES);
+      demand[I->dst.file] = MAX2(demand[I->dst.file], end + extra);
+   }
+
+   /* Determine a good GPR/UGPR split informed by the demand calculation */
+   unsigned grf_per_gpr = jay_grf_per_gpr(shader);
+   unsigned ugpr_per_grf = jay_ugpr_per_grf(shader);
+   unsigned uniform_grfs = DIV_ROUND_UP(demand[UGPR], ugpr_per_grf);
+
+   /* We must have enough for SIMD1 images (TODO: Check if this actually
+    * applies. Or if we could eliminate this with smarter partitioning even.)
+    *
+    * NOTE(review): the MAX2 below unconditionally raises min_ugprs to 256, so
+    * the initial value of 16 is never the effective minimum. Confirm whether
+    * the raise was meant to be conditional.
+    */
+   unsigned min_ugprs = 16;
+   min_ugprs = MAX2(min_ugprs, 256);
+
+   /* TODO: We could partition more cleverly */
+   uniform_grfs = align(uniform_grfs, grf_per_gpr);
+   uniform_grfs = CLAMP(uniform_grfs, DIV_ROUND_UP(min_ugprs, ugpr_per_grf),
+                        128 - (32 * grf_per_gpr));
+   unsigned nonuniform_grfs = JAY_NUM_PHYS_GRF - uniform_grfs;
+
+   /* Check the split */
+   assert((uniform_grfs * ugpr_per_grf) >= min_ugprs);
+   assert(nonuniform_grfs >= 32 * grf_per_gpr);
+   assert((uniform_grfs + nonuniform_grfs) == JAY_NUM_PHYS_GRF);
+
+   /* Set the targets for the virtual register file accordingly */
+   shader->num_regs[GPR] = nonuniform_grfs / grf_per_gpr;
+   shader->num_regs[UGPR] = uniform_grfs * ugpr_per_grf;
+
+   unsigned spill_reservation = 0, mem_slots = 0;
+
+   /* Spilling requires reserving UGPRs for the lowered SENDs */
+   if (demand[GPR] > jay_gpr_limit(shader)) {
+      spill_reservation = shader->dispatch_width / ugpr_per_grf;
+
+      /* This should be an acceptable upper limit since we assign memory tightly
+       * thanks to the usual SSA allocator guarantees.
+       */
+      mem_slots = demand[GPR] * grf_per_gpr;
+      shader->num_regs[MEM] = demand[GPR];
+   }
+
+   /* Payload block sizes in GRFs: index 0 is the first block of each kind in
+    * the layout below, index 1 the second.
+    */
+   unsigned payload_4[2] = { 0, 0 }, payload_u[2] = { grf_per_gpr, 0 };
+   unsigned eot_u = 0, eot_4 = 0;
+
+   if (shader->stage == MESA_SHADER_VERTEX) {
+      payload_4[0] = 1;
+      payload_4[1] = shader->prog_data->vue.urb_read_length * 8;
+      payload_u[1] = shader->push_grfs;
+      eot_4 = 16;
+   } else if (shader->stage == MESA_SHADER_FRAGMENT) {
+      /* The SIMD32 fragment payload splits GPRs into low and high GRFs, with
+       * UGPRs mixed in between. jay_insert_payload_swizzle deals with this and
+       * swizzles things appropriately, we just need the partition to have two
+       * separate GPR blocks with a UGPR block in between. That requires the
+       * number of GPRs in the payload to be even.
+       */
+      assert(util_is_aligned(shader->payload_gprs, grf_per_gpr) &&
+             "payload constraint");
+
+      payload_4[0] = shader->payload_gprs;
+      payload_u[1] = (shader->payload_ugprs / ugpr_per_grf) - payload_u[0];
+      payload_4[1] = grf_per_gpr == 2 ? shader->payload_gprs : 0;
+      eot_4 = 14;
+      eot_u = 1;
+   } else {
+      eot_u = 1;
+   }
+
+   /* GRFs already claimed by payload, spill and EOT blocks; the general
+    * blocks below get whatever remains of each side of the split.
+    */
+   unsigned special_u = payload_u[0] + payload_u[1] + spill_reservation + eot_u;
+   unsigned special_4 = payload_4[0] + payload_4[1] + eot_4;
+
+   /* TODO: Make the stride partition smarter */
+   unsigned grf_8 = 8 * grf_per_gpr;
+   unsigned grf_2 = 8;
+
+   struct jay_partition_builder blocks[] = {
+      /* Stage-specific payload */
+      { UGPR, 0, payload_u[0] },
+      { GPR, JAY_STRIDE_4, payload_4[0] },
+      { UGPR, 0, payload_u[1] },
+      { GPR, JAY_STRIDE_4, payload_4[1] },
+
+      /* General registers */
+      { UGPR, 0, uniform_grfs - special_u },
+      { GPR, JAY_STRIDE_4, nonuniform_grfs - (special_4 + grf_8 + grf_2) },
+      { GPR, JAY_STRIDE_8, grf_8 },
+      { GPR, JAY_STRIDE_2, grf_2 },
+
+      /* Spilling registers */
+      { UGPR, 0, spill_reservation, JAY_BLOCK_SPILL },
+      { MEM, JAY_STRIDE_4, mem_slots },
+
+      /* EOT */
+      { UGPR, 0, eot_u, JAY_BLOCK_EOT },
+      { GPR, JAY_STRIDE_4, eot_4, JAY_BLOCK_EOT },
+   };
+
+   build_partition(shader, blocks, ARRAY_SIZE(blocks));
+
+   /* By construction of our partition, the entire GRF is used. */
+   shader->prog_data->base.grf_used = JAY_NUM_PHYS_GRF;
+}
+
+#define ANSI_END    "\033[0m"
+#define ANSI_BOLD   "\033[1m"
+#define ANSI_ITALIC "\033[3m"
+
+/**
+ * Dump a partition to stdout as one table per register file that has blocks.
+ * Each row shows the GRF range, the GPR/UGPR range, the stride (GPR file
+ * only) and the block type.
+ */
+void
+jay_print_partition(struct jay_partition *p)
+{
+   static const char *const file_names[JAY_NUM_RA_FILES] = {
+      "GPR", "UGPR", "MEM"
+   };
+   static const char *const type_names[JAY_BLOCK_TYPES] = {
+      "", " EOT", " Spill"
+   };
+
+   jay_foreach_ra_file(file) {
+      unsigned count = p->nr_blocks[file];
+
+      /* Table header, separated from the previous table by a blank line */
+      if (count != 0) {
+         const char *separator = (file != 0) ? "\n" : "";
+         const char *stride_column = (file == GPR) ? "    Stride" : "";
+
+         printf("%s" ANSI_BOLD "    GRF      %s%s" ANSI_END "\n",
+                separator, file_names[file], stride_column);
+      }
+
+      for (unsigned i = 0; i < count; ++i) {
+         const struct jay_register_block *blk = &p->blocks[file][i];
+         unsigned len_grf = (blk->len_gpr * 16) / p->units_x16[file];
+         unsigned last_grf = blk->start_grf + len_grf - 1;
+         unsigned last_gpr = blk->start_gpr + blk->len_gpr - 1;
+
+         printf("  %3u…%-3u  %3u…%-3u", blk->start_grf, last_grf,
+                blk->start_gpr, last_gpr);
+
+         if (file == GPR) {
+            printf("  %u-bit", jay_stride_to_bits(blk->stride));
+         }
+
+         printf(ANSI_ITALIC "%s" ANSI_END "\n", type_names[blk->type]);
+      }
+   }
+
+   printf("\n");
+}
--- a/src/intel/compiler/jay/jay_private.h
+++ b/src/intel/compiler/jay/jay_private.h
@ -39,6 +39,7 @@ void jay_calculate_register_demands(jay_function *f);

 void jay_spill(jay_function *func, unsigned limit);
 void jay_partition_grf(jay_shader *shader);
+void jay_print_partition(struct jay_partition *p);
 void jay_register_allocate(jay_shader *s);
 void jay_assign_flags(jay_shader *s);
 void jay_assign_accumulators(jay_shader *s);
@ -86,6 +87,16 @@ struct jay_shader_bin *jay_to_binary(jay_shader *s,
                                     size_t const_data_size,
                                     bool debug);

+/**
+ * Number of GPRs the shader may use before spilling. Normally the size of
+ * the GPR file; with the JAY_DBG_SPILL debug flag set, a tight fixed limit is
+ * returned instead for every stage except vertex.
+ */
+static inline unsigned
+jay_gpr_limit(jay_shader *shader)
+{
+   /* If testing spilling, set limit tightly. */
+   if ((jay_debug & JAY_DBG_SPILL) && shader->stage != MESA_SHADER_VERTEX) {
+      return 13;
+   }
+
+   return shader->num_regs[GPR];
+}
+
 #ifdef __cplusplus
 } /* extern C */
 #endif
--- a/src/intel/compiler/jay/jay_register_allocate.c
+++ b/src/intel/compiler/jay/jay_register_allocate.c
@ -15,7 +15,6 @@
 #include "jay_ir.h"
 #include "jay_opcodes.h"
 #include "jay_private.h"
-#include "shader_enums.h"

 /**
 * Register allocation for Jay shaders.
@ -39,9 +38,6 @@
 * Finally, we deconstruct SSA.
 */

-#define jay_foreach_ra_file(file)                                              \
-   for (enum jay_file file = 0; file < JAY_NUM_RA_FILES; ++file)
-
 #define jay_foreach_ra_src(I, s)                                               \
   jay_foreach_src(I, s)                                                       \
      if (I->src[s].file < JAY_NUM_RA_FILES && !jay_is_null(I->src[s]))
@ -186,7 +182,11 @@ struct affinity {
    */
   bool eot:1;

-   /** If true, this UGPR needs full GRF alignment */
+   /**
+    * If align is nonzero, this SSA def should be assigned to a register of the
+    * form (k * align) + align_offs for some integer k. In other words, align is
+    * the alignment of the whole vector and align_offs is this def's channel.
+    */
   unsigned align     :5;
   unsigned align_offs:4;
   unsigned nr        :4;
@ -283,15 +283,16 @@ def_from_reg(jay_reg r)
   return jay_bare_reg(r_file(r), r_reg(r));
 }

+/**
+ * Roundrobin cursor used when picking registers.
+ *
+ * block is an index into the partition's block array for the file.
+ * NOTE(review): gpr is presumably the position to resume the search from
+ * within that block -- confirm against pick_regs.
+ */
+struct jay_roundrobin {
+   unsigned block, gpr;
+};
+
 typedef struct jay_ra_state {
   /** Size of each register file */
   unsigned num_regs[JAY_NUM_RA_FILES];

-   /** Counter for roundrobin register allocation */
-   unsigned roundrobin[JAY_NUM_RA_FILES];
-
-   /** First GPR that may be used for EOT sends */
-   unsigned eot_offs;
+   /** Partition-aware counters for roundrobin register allocation */
+   struct jay_roundrobin roundrobin[JAY_NUM_RA_FILES][JAY_NUM_STRIDES];

   /** Phi coalescing data structure */
   struct phi_web_node *phi_web;
@ -691,11 +692,12 @@ try_find_free_reg(jay_ra_state *ra,
                  unsigned except,
                  bool stride4)
 {
+   struct jay_partition *p = &ra->b.shader->partition;
+
   unsigned i;
   BITSET_FOREACH_SET(i, ra->available_regs[file], ra->num_regs[file]) {
      if (i != except &&
-          (!stride4 ||
-           jay_gpr_to_stride(&ra->b.shader->partition, i) == JAY_STRIDE_4)) {
+          (!stride4 || jay_lookup_block(p, i, GPR).stride == JAY_STRIDE_4)) {
         return make_reg(file, i);
      }
   }
@ -732,6 +734,124 @@ find_temp_regs(jay_ra_state *ra)
   };
 }

+/**
+ * Search one partition block for the cheapest place to put `size` contiguous
+ * registers, updating the running best candidate.
+ *
+ * Candidates are block-relative offsets starting at `first` (rounded up to
+ * `alignment`) and stepping by `alignment`, restricted so the whole range
+ * fits inside the block. Each candidate starts at `block_cost` and collects
+ * penalties that model the copies or shuffles the choice would cause. A
+ * candidate strictly cheaper than *best_cost replaces *best_cost and
+ * *best_reg; the search stops as soon as a zero-cost candidate is found.
+ *
+ * *best_cost and *best_reg are left untouched if nothing cheaper is found.
+ */
+static void
+pick_regs_from_block(jay_ra_state *ra,
+                     enum jay_file file,
+                     unsigned size,
+                     unsigned alignment,
+                     jay_inst *I,
+                     jay_def var,
+                     bool is_src,
+                     struct jay_register_block block,
+                     unsigned block_cost,
+                     struct affinity affinity,
+                     unsigned *best_cost,
+                     unsigned *best_reg,
+                     unsigned first)
+{
+   bool must_tie = I->op == JAY_OPCODE_LANE_ID_EXPAND;
+   must_tie &= !is_src;
+
+   /* Cross-lane access cannot be SIMD split if the source/destination registers
+    * overlap, but as long as we don't tie those destinations, we're ok.
+    */
+   bool may_tie = !jay_is_shuffle_like(I);
+
+   first = align(first, alignment);
+   for (unsigned i = first; i + size <= block.len_gpr; i += alignment) {
+      unsigned r = block.start_gpr + i;
+
+      unsigned cost = block_cost;
+      bool tied = !is_src && BITSET_TEST(ra->killed[file], r);
+
+      if (tied ? !may_tie :
+                 (must_tie || BITSET_TEST_COUNT(ra->pinned[file], r, size)))
+         continue;
+
+      /* Try to tie predicated default values, otherwise post-RA lowering needs
+       * to insert a predicated-MOV or SEL.
+       */
+      if (I->predication == JAY_PREDICATED_DEFAULT && !is_src)
+         cost += jay_inst_get_default(I)->reg != r;
+
+      /* If there are stricter alignment requirements later, model the cost of
+       * inserting copies for that.
+       */
+      if (affinity.align &&
+          (i < affinity.align_offs ||
+           !util_is_aligned(i - affinity.align_offs, affinity.align)))
+         cost += size;
+
+      if (affinity.repr == jay_channel(var, 0)) {
+         /* If we are the collect representative but the final collect won't
+          * actually be usable, the whole vector will need to be copied.
+          */
+         if (i < affinity.offset || !util_is_aligned(i - affinity.offset, 4)) {
+            cost += affinity.nr;
+         }
+      } else if (affinity.repr) {
+         /* If we are used for a collect but not in the right place, we will
+          * similarly insert copies.
+          */
+         if (ra->reg_for_index[affinity.repr] != NO_REG &&
+             r_reg(ra->reg_for_index[affinity.repr]) != r - affinity.offset) {
+
+            cost++;
+         }
+      }
+
+      for (unsigned c = 0; c < size; ++c) {
+         unsigned j = r + c;
+
+         /* If the register is unavailable, account for the cost of shuffling */
+         if (!BITSET_TEST(ra->available_regs[file], j) && !tied) {
+            bool live_out = u_sparse_bitset_test(&ra->block->live_out,
+                                                 ra->index_for_reg[file][j]);
+            cost += 1 + live_out;
+         }
+
+         /* Model the cost of shuffling for phis */
+         if (c < jay_num_values(var)) {
+            struct phi_web_node *phi_web =
+               &ra->phi_web[phi_web_find(ra->phi_web, jay_channel(var, c))];
+            if (phi_web->reg != NO_REG && r_reg(phi_web->reg) != j) {
+               cost += 2;
+            }
+         }
+
+         /* Choosing this register will pin it, leaving it unavailable to later
+          * smaller sources which will need a move.
+          */
+         cost += BITSET_TEST(ra->sources[file], j);
+      }
+
+      if (cost < *best_cost) {
+         *best_cost = cost;
+         *best_reg = r;
+
+         /* If we find something with 0 cost, we are guaranteed to pick this
+          * register, so terminate early. This speeds up the search.
+          */
+         if (cost == 0) {
+            return;
+         }
+      }
+   }
+}
+
+/**
+ * Whether registers may be picked from the given block.
+ *
+ * Spill blocks are never eligible. For the GPR file the block's stride must
+ * lie within [min_stride, max_stride]. When eot is set, only EOT blocks
+ * qualify.
+ */
+static bool
+is_block_compatible(struct jay_register_block block,
+                    enum jay_file file,
+                    enum jay_stride min_stride,
+                    enum jay_stride max_stride,
+                    bool eot)
+{
+   if (block.type == JAY_BLOCK_SPILL) {
+      return false;
+   }
+
+   if (file == GPR &&
+       (block.stride < min_stride || block.stride > max_stride)) {
+      return false;
+   }
+
+   if (eot && block.type != JAY_BLOCK_EOT) {
+      return false;
+   }
+
+   return true;
+}
+
 static unsigned
 pick_regs(jay_ra_state *ra,
          enum jay_file file,
@ -744,38 +864,17 @@ pick_regs(jay_ra_state *ra,
          bool is_src)
 {
   struct jay_partition *partition = &ra->b.shader->partition;
-   unsigned first = 0, end = ra->num_regs[file];
-   bool must_tie = I->op == JAY_OPCODE_LANE_ID_EXPAND;
-   must_tie &= !is_src;
-
-   /* Cross-lane access cannot be SIMD split if the source/destination registers
-    * overlap, but as long as we don't tie those destinations, we're ok.
-    */
-   bool may_tie = !jay_is_shuffle_like(I);
-
-   /* Ensure we do not cross partitions */
-   if (file == UGPR && size > 16) {
-      first = partition->large_ugpr_block.start;
-      end = partition->large_ugpr_block.start + partition->large_ugpr_block.len;
-   } else if (file == GPR && size > 1 && ra->b.shader->payload_gprs < 8) {
-      first = align(ra->b.shader->payload_gprs, MAX2(size, alignment));
-   }
-
-   /* Sources used by end-of-thread sends must be at the end of the file */
-   if (I->op == JAY_OPCODE_SEND && jay_send_eot(I)) {
-      first = ra->eot_offs;
-   }
+   bool eot = I->op == JAY_OPCODE_SEND && jay_send_eot(I);

   /* If possible, keep sources in place to avoid shuffles. */
   if (is_src && jay_channel(var, 0) != 0) {
      unsigned cur = r_reg(ra->reg_for_index[jay_channel(var, 0)]);
-      enum jay_stride stride = jay_gpr_to_stride(partition, cur);
+      struct jay_register_block block = jay_lookup_block(partition, cur, file);

      if (!BITSET_TEST_COUNT(ra->pinned[file], cur, size) &&
-          util_is_aligned(cur, alignment) &&
-          cur >= first &&
-          cur + size <= end &&
-          (file != GPR || (min_stride <= stride && stride <= max_stride))) {
+          util_is_aligned(cur - block.start_gpr, alignment) &&
+          is_block_compatible(block, file, min_stride, max_stride, eot) &&
+          cur + size <= (block.start_gpr + block.len_gpr)) {
         return cur;
      }
   }
@ -786,120 +885,74 @@ pick_regs(jay_ra_state *ra,
      ra->phi_web[phi_web_find(ra->phi_web, jay_channel(var, 0))].affinity;

   assert(alignment >= size && "alignment must be a multiple of size");
-   unsigned nr = DIV_ROUND_UP((end + 1 - size - first), alignment);
-   unsigned roundrobin = (ra->roundrobin[file]) % nr;
-   unsigned rr_al = roundrobin * alignment, nr_al = nr * alignment;

-   /* Heuristic: Advance the roundrobin by a whole vector if we are the
-    * representative. This leaves us registers for the rest of the vector.
+   /* We select registers roundrobin. This has several benefits:
+    *
+    * 1. Easier coalescing since we are statistically less likely to allocate
+    *    a register that a future instruction has an affinity for.
+    *
+    * 2. More freedom for post-RA scheduling thanks to fewer dependencies.
+    *
+    * 3. Less stalling due to SWSB annotations from register reuse.
    */
-   ra->roundrobin[file] +=
-      affinity.repr == jay_channel(var, 0) ? MAX2(size, affinity.nr) : size;
+   enum jay_stride stride = file == GPR ? min_stride : 0;
+   struct jay_roundrobin *rr = &ra->roundrobin[file][stride];
+   unsigned nr_blocks = partition->nr_blocks[file];

-   for (unsigned i = rr_al; i < rr_al + nr_al; i += alignment) {
-      /* We select registers roundrobin. This has several benefits:
-       *
-       * 1. Easier coalescing since we are less likely statistically to allocate
-       *    a register that a future instruction has an affinity.
-       *
-       * 2. More freedom for post-RA scheduling thanks to fewer dependencies.
-       *
-       * 3. Less stalling due to SWSB annotations from register reuse.
-       */
-      unsigned r = first + (i >= nr_al ? (i - nr_al) : i);
-      assert(r >= first && r + size <= end);
+   /* Make sure we use the optimal stride for roundrobin RA */
+   if (file == GPR) {
+      while (partition->blocks[GPR][rr->block].stride != stride) {
+         rr->block = (rr->block + 1 == nr_blocks) ? 0 : rr->block + 1;
+      }
+   }

-      unsigned cost = 0;
-      bool tied = !is_src && BITSET_TEST(ra->killed[file], r);
-      enum jay_stride stride =
-         file == GPR ? jay_gpr_to_stride(partition, r) : min_stride;
+   unsigned last_b_ = rr->block + nr_blocks;
+   for (unsigned b_ = rr->block; b_ <= last_b_ && best_cost > 0; ++b_) {
+      unsigned b = b_ >= nr_blocks ? (b_ - nr_blocks) : b_;
+      assert(b < nr_blocks);

-      if ((tied ? !may_tie :
-                  (must_tie || BITSET_TEST_COUNT(ra->pinned[file], r, size))) ||
-          !(min_stride <= stride && stride <= max_stride))
-         continue;
+      struct jay_register_block block = partition->blocks[file][b];

-      /* Try to tie predicated default values, otherwise post-RA lowering needs
-       * to insert a predicated-MOV or SEL.
-       */
-      if (I->predication == JAY_PREDICATED_DEFAULT && !is_src)
-         cost += jay_inst_get_default(I)->reg != r;
+      if (is_block_compatible(block, file, min_stride, max_stride, eot)) {
+         unsigned r = b_ == rr->block ? rr->gpr : 0;

-      /* Assigning a stride that is too big may result in SIMDness splitting.
-       * Model that cost so we prefer packed registers.
-       */
-      cost += stride - min_stride;
-
-      /* If we are used for end-of-thread and it is not in the appropriate
-       * register, we will need to insert 1 copy per channel at the end.
-       */
-      if (affinity.eot && r < ra->eot_offs)
-         cost += size;
-
-      /* If there are stricter alignment requirements later, model the cost of
-       * inserting copies for that.
-       */
-      if (affinity.align &&
-          !util_is_aligned(r - affinity.align_offs, affinity.align))
-         cost += size;
-
-      if (affinity.repr == jay_channel(var, 0)) {
-         /* If we are the collect representative but the final collect won't
-          * actually be usable, the whole vector will need to be copied.
-          */
-         if (!util_is_aligned(r - affinity.offset, 8) ||
-             (affinity.eot && r - affinity.offset < ra->eot_offs)) {
-            cost += 8;
+         if (affinity.repr == jay_channel(var, 0) && b_ == rr->block) {
+            r += affinity.offset;
         }
-      } else if (affinity.repr) {
-         /* If we are used for a collect but not in the right place, we will
-          * similarly insert copies.
-          */
-         if (ra->reg_for_index[affinity.repr] != NO_REG &&
-             r_reg(ra->reg_for_index[affinity.repr]) != r - affinity.offset) {

-            cost += size;
+         /* Assigning a stride that is too big may result in SIMDness splitting.
+          * Model that cost so we prefer packed registers.
+          */
+         unsigned block_cost = file == GPR ? block.stride - min_stride : 0;
+
+         /* If we are used for end-of-thread and it is not in the appropriate
+          * register, we will need to insert 1 copy per channel at the end.
+          */
+         if (affinity.eot && block.type != JAY_BLOCK_EOT) {
+            block_cost += size;
+         }
+
+         /* Consider only blocks that could be picked */
+         if (best_cost > block_cost) {
+            pick_regs_from_block(ra, file, size, alignment, I, var, is_src,
+                                 block, block_cost, affinity, &best_cost,
+                                 &best_reg, r);
         }
      }
+   }

-      for (unsigned c = 0; c < size; ++c) {
-         unsigned i = r + c;
+   /* If we chose a register roundrobin (the constant 16 here is determined
+    * experimentally), advance the roundrobin. As a heuristic, advance by a
+    * whole vector if we are the representative. This leaves us registers for
+    * the rest of the vector.
+    */
+   if (rr->gpr <= best_reg && best_reg <= rr->gpr + 16) {
+      bool is_repr = affinity.repr == jay_channel(var, 0);
+      rr->gpr = best_reg + MAX2(size, is_repr ? affinity.nr : 0);

-         /* If the register is unavailable, account for the cost of shuffling */
-         if (!BITSET_TEST(ra->available_regs[file], i) && !tied) {
-            cost++;
-
-            /* ..plus the cost of shuffling back. */
-            if (u_sparse_bitset_test(&ra->block->live_out,
-                                     ra->index_for_reg[file][i]))
-               cost++;
-         }
-
-         /* Model the cost of shuffling for phis */
-         if (c < jay_num_values(var)) {
-            struct phi_web_node *phi_web =
-               &ra->phi_web[phi_web_find(ra->phi_web, jay_channel(var, c))];
-            if (phi_web->reg != NO_REG && r_reg(phi_web->reg) != i) {
-               cost += 2;
-            }
-         }
-
-         /* Choosing this register will pin it, leaving it unavailable to later
-          * smaller sources which will need a move.
-          */
-         cost += BITSET_TEST(ra->sources[file], i);
-      }
-
-      if (cost < best_cost) {
-         best_cost = cost;
-         best_reg = r;
-
-         /* If we find something with 0 cost, we are guaranteed to pick this
-          * register, so terminate early. This speeds up the search.
-          */
-         if (cost == 0) {
-            break;
-         }
+      if (rr->gpr >= partition->blocks[file][rr->block].len_gpr) {
+         rr->block = ((rr->block + 1) == nr_blocks) ? 0 : (rr->block + 1);
+         rr->gpr = 0;
      }
   }

@ -1322,233 +1375,6 @@ insert_parallel_copies_for_phis(jay_function *f)
   free(phi_dsts);
 }

-static struct jay_register_block
-block_gpr_to_grf(struct jay_partition *p, enum jay_file file, unsigned block)
-{
-   assert(file == GPR || file == UGPR);
-   assert(((p->blocks[file][block].start * 16) % p->units_x16[file]) == 0);
-   assert(((p->blocks[file][block].len * 16) % p->units_x16[file]) == 0);
-
-   return (struct jay_register_block) {
-      .start = (p->blocks[file][block].start * 16) / p->units_x16[file],
-      .len = (p->blocks[file][block].len * 16) / p->units_x16[file],
-   };
-}
-
-static void
-print_partition(struct jay_partition *p)
-{
-   for (unsigned f = 0; f < JAY_NUM_GRF_FILES; ++f) {
-      for (unsigned b = 0; b < JAY_PARTITION_BLOCKS; ++b) {
-         struct jay_register_block B = block_gpr_to_grf(p, f, b);
-         const char *file = f ? "UGPR" : "GPR";
-
-         if (B.len > 1) {
-            fprintf(stderr, "%s: %u-%u\n", file, B.start, B.start + B.len - 1);
-         } else if (B.len == 1) {
-            fprintf(stderr, "%s: %u\n", file, B.start);
-         }
-      }
-   }
-
-   fprintf(stderr, "\n");
-}
-
-/*
- * Verify that a register partition is a bijective mapping of the GRF file.
- */
-static void
-validate_partition(struct jay_partition *p,
-                   unsigned stride4_header_size,
-                   unsigned nonuniform_gprs)
-{
-   BITSET_DECLARE(regs, JAY_NUM_PHYS_GRF) = { 0 };
-
-   for (unsigned f = 0; f < JAY_NUM_GRF_FILES; ++f) {
-      for (unsigned b = 0; b < JAY_PARTITION_BLOCKS; ++b) {
-         struct jay_register_block B = block_gpr_to_grf(p, f, b);
-         if (B.len) {
-            assert(B.start + B.len <= JAY_NUM_PHYS_GRF && "GRF file size");
-            assert(!BITSET_TEST_COUNT(regs, B.start, B.len) && "uniqueness");
-
-            BITSET_SET_COUNT(regs, B.start, B.len);
-         }
-      }
-   }
-
-   for (unsigned i = 0; i < JAY_NUM_PHYS_GRF; ++i) {
-      assert(BITSET_TEST(regs, i) && "all GRFs mapped");
-   }
-
-   assert(p->large_ugpr_block.len && "partition must have a large UGPR block");
-   assert(p->base2 >= p->base8 && p->base_eot >= p->base2 && "monotonic");
-   assert(p->base8 >= stride4_header_size && "header is big enough");
-   assert(p->base_eot + p->units_x16[GPR] <= nonuniform_gprs && "EOT fits");
-   assert(util_is_aligned(p->base8, 8) && "so vectors don't cross");
-   assert(util_is_aligned(p->base2, 8) && "so vectors don't cross");
-   assert(util_is_aligned(p->base_eot, 8) && "so vectors don't cross");
-}
-
-static void
-build_partition(jay_shader *shader, unsigned *blocks, unsigned n)
-{
-   unsigned base = 0;
-   unsigned ugpr_base = 0;
-   struct jay_partition *p = &shader->partition;
-
-   *p = (struct jay_partition) {
-      .units_x16[UGPR] = jay_ugpr_per_grf(shader) * 16,
-      .units_x16[GPR] = 16 / jay_grf_per_gpr(shader),
-   };
-
-   for (unsigned i = 0; i < n; ++i) {
-      enum jay_file file = (i & 1) ? GPR : UGPR;
-      unsigned file_i = i >> 1;
-
-      p->blocks[file][file_i].start = (base * p->units_x16[file]) / 16;
-      p->blocks[file][file_i].len = (blocks[i] * p->units_x16[file]) / 16;
-
-      if (file == UGPR && blocks[i] >= 8) {
-         p->large_ugpr_block = (struct jay_register_block) {
-            .start = (ugpr_base * p->units_x16[file]) / 16,
-            .len = p->blocks[file][file_i].len,
-         };
-      }
-
-      base += blocks[i];
-      if (file == UGPR) {
-         ugpr_base += blocks[i];
-      }
-   }
-}
-
-static unsigned
-gpr_limit(jay_shader *shader)
-{
-   /* If testing spilling, set limit tightly. */
-   bool test = (jay_debug & JAY_DBG_SPILL);
-   test &= shader->stage != MESA_SHADER_VERTEX;
-
-   return test ? 13 : shader->num_regs[GPR];
-}
-
-/*
- * Partition the register file for the entire shader. All functions must
- * share the same partition for correctness with non-uniform function calls.
- * For unlinked library functions, we must use the ABI partition (TODO).
- */
-void
-jay_partition_grf(jay_shader *shader)
-{
-   /* Calculate the maximum register demand across all functions in the shader.
-    * We will use this to choose a good partition.
-    */
-   struct jay_partition *p = &shader->partition;
-   unsigned demand[JAY_NUM_GRF_FILES] = { 0 };
-
-   jay_foreach_function(shader, f) {
-      jay_compute_liveness(f);
-      jay_calculate_register_demands(f);
-
-      demand[GPR] = MAX2(demand[GPR], f->demand[GPR]);
-      demand[UGPR] = MAX2(demand[UGPR], f->demand[UGPR]);
-   }
-
-   /* We must have enough register file space for the register payload, plus the
-    * reserved UGPRs in the case we spill. That UGPR interferes with everything
-    * we preload so it needs to be reserved specially here for the worst case.
-    */
-   jay_foreach_preload(jay_shader_get_entrypoint(shader), I) {
-      unsigned end = jay_preload_reg(I) + jay_num_values(I->dst);
-      unsigned extra = I->dst.file == UGPR ? shader->dispatch_width : 0;
-      assert(I->dst.file < JAY_NUM_GRF_FILES);
-      demand[I->dst.file] = MAX2(demand[I->dst.file], end + extra);
-   }
-
-   /* Determine a good GPR/UGPR split informed by the demand calculation */
-   unsigned ugpr_per_grf = jay_ugpr_per_grf(shader);
-   unsigned uniform_grfs = DIV_ROUND_UP(demand[UGPR], ugpr_per_grf);
-
-   /* We must have enough for SIMD1 images (TODO: Check if this actually
-    * applies. Or if we could eliminate this with smarter partitioning even.)
-    */
-   unsigned min_ugprs = 16;
-   min_ugprs = MAX2(min_ugprs, 256);
-
-   unsigned grf_block_alignment = 8 * jay_grf_per_gpr(shader); /* max_vec */
-
-   /* TODO: We could partition more cleverly */
-   uniform_grfs = CLAMP(align(uniform_grfs, grf_block_alignment),
-                        DIV_ROUND_UP(min_ugprs, ugpr_per_grf),
-                        128 - (32 * jay_grf_per_gpr(shader)));
-   unsigned nonuniform_grfs = JAY_NUM_PHYS_GRF - uniform_grfs;
-
-   /* Check the split */
-   assert((uniform_grfs * ugpr_per_grf) >= min_ugprs);
-   assert(nonuniform_grfs >= 32 * jay_grf_per_gpr(shader));
-   assert((uniform_grfs + nonuniform_grfs) == JAY_NUM_PHYS_GRF);
-
-   /* Partition GRFs between GPR & UGPR */
-   unsigned stride4_header_size = 0;
-
-   if (shader->stage == MESA_SHADER_VERTEX) {
-      unsigned attrib_grfs = shader->prog_data->vue.urb_read_length * 8;
-      unsigned blocks[] = {
-         1,                                         /* UGPR: g0 */
-         8,                                         /* GPR: URB output handle */
-         shader->push_grfs,                         /* UGPR: Push constants */
-         attrib_grfs,                               /* GPR: Vertex inputs */
-         uniform_grfs - (blocks[0] + blocks[2]),    /* UGPR: * */
-         nonuniform_grfs - (blocks[1] + blocks[3]), /* GPR: * and EOT */
-      };
-
-      build_partition(shader, blocks, ARRAY_SIZE(blocks));
-      stride4_header_size = blocks[1] + blocks[3];
-   } else if (shader->stage == MESA_SHADER_FRAGMENT) {
-      unsigned len0 = jay_grf_per_gpr(shader);
-      unsigned payload_grfs = shader->payload_gprs * len0;
-
-      unsigned blocks[] = {
-         len0,                /* UGPR: g0 (and maybe g1) */
-         payload_grfs,        /* GPR: Barycentrics */
-         uniform_grfs - len0, /* UGPR: Dispatch (eg push constants) & general */
-         nonuniform_grfs - payload_grfs, /* GPR: General & EOT */
-      };
-      build_partition(shader, blocks, ARRAY_SIZE(blocks));
-      stride4_header_size = blocks[1];
-   } else {
-      unsigned blocks[] = { uniform_grfs - 4, nonuniform_grfs, 4 };
-      build_partition(shader, blocks, ARRAY_SIZE(blocks));
-   }
-
-   /* TODO: Make the stride partition smarter */
-   unsigned nonuniform_gprs = nonuniform_grfs / jay_grf_per_gpr(shader);
-   unsigned eot_gprs = 16 / jay_grf_per_gpr(shader);
-   p->base8 = ROUND_DOWN_TO(nonuniform_gprs - (16 + eot_gprs), 8) + 0;
-   p->base2 = 8 + p->base8;
-   p->base_eot = 8 + p->base2;
-
-   // print_partition(p);
-   validate_partition(p, stride4_header_size, nonuniform_gprs);
-
-   /* By construction of our partition, the entire GRF is used. */
-   shader->prog_data->base.grf_used = JAY_NUM_PHYS_GRF;
-
-   /* Set the targets for the virtual register file accordingly */
-   for (unsigned f = 0; f < JAY_NUM_GRF_FILES; ++f) {
-      for (unsigned b = 0; b < JAY_PARTITION_BLOCKS; ++b) {
-         shader->num_regs[f] += p->blocks[f][b].len;
-      }
-   }
-
-   /* This should be an acceptable upper limit since we assign memory tightly
-    * thanks to the usual SSA allocator guarantees.
-    */
-   if (demand[GPR] > gpr_limit(shader)) {
-      shader->num_regs[MEM] = demand[GPR];
-   }
-}
-
 static void
 jay_register_allocate_function(jay_function *f)
 {
@ -1556,15 +1382,10 @@ jay_register_allocate_function(jay_function *f)
   jay_ra_state ra = { .b.shader = shader, .b.func = f };

   /* Spill as needed to fit within the limits. */
-   unsigned limit = gpr_limit(f->shader);
+   unsigned limit = jay_gpr_limit(f->shader);
   bool spilled = f->demand[GPR] > limit;

   if (spilled) {
-      /* Spilling requires reserving UGPRs for spilling */
-      unsigned reservation = f->shader->dispatch_width;
-      f->shader->num_regs[UGPR] -= reservation;
-      f->shader->partition.large_ugpr_block.len -= reservation;
-
      jay_spill(f, limit);
      jay_validate(f->shader, "spilling");
      jay_compute_liveness(f);
@ -1588,17 +1409,6 @@ jay_register_allocate_function(jay_function *f)

   typed_memcpy(ra.num_regs, shader->num_regs, JAY_NUM_RA_FILES);

-   /* The end of the register file is allowed for end-of-thread messages.
-    * Calculate the offset in GPRs. Compute shaders have this as UGPRs while
-    * fragment shaders have this as GPRs.
-    */
-   if (mesa_shader_stage_is_compute(shader->stage)) {
-      ra.eot_offs = ROUND_DOWN_TO(ra.num_regs[UGPR], jay_ugpr_per_grf(shader)) -
-                    jay_ugpr_per_grf(shader);
-   } else {
-      ra.eot_offs = ra.num_regs[GPR] - (16 / jay_grf_per_gpr(shader));
-   }
-
   linear_ctx *lin_ctx = linear_context(shader);

   ra.reg_for_index = linear_alloc_array(lin_ctx, jay_reg, f->ssa_alloc);
--- a/src/intel/compiler/jay/jay_to_binary.c
+++ b/src/intel/compiler/jay/jay_to_binary.c
@ -10,7 +10,6 @@
 #include "compiler/brw/brw_eu_inst.h"
 #include "compiler/brw/brw_reg.h"
 #include "compiler/brw/brw_reg_type.h"
-#include "dev/intel_debug.h"
 #include "util/macros.h"
 #include "util/u_dynarray.h"
 #include "util/u_math.h"
@ -42,45 +41,29 @@ to_brw_reg_type(enum jay_type type)
   /* clang-format on */
 }

-static inline unsigned
-to_def_grf_16(struct jay_partition *p, jay_def d)
-{
-   unsigned count = jay_num_values(d);
-   if (count == 0 || !(d.file == GPR || d.file == UGPR)) {
-      return d.reg;
-   }
-
-   unsigned base = 0;
-   for (unsigned i = 0; i < JAY_PARTITION_BLOCKS; ++i) {
-      unsigned offset = d.reg - base;
-
-      if (offset < p->blocks[d.file][i].len) {
-         assert(offset + count <= p->blocks[d.file][i].len &&
-                "vectors must not cross partition boundaries");
-
-         return (p->blocks[d.file][i].start + offset) * 2 + d.hi;
-      }
-
-      base += p->blocks[d.file][i].len;
-   }
-
-   UNREACHABLE("virtual register must be in a block");
-}
-
 static inline brw_reg
-to_brw_reg(jay_function *f,
-           const jay_inst *I,
-           signed idx,
-           unsigned simd_offs,
-           bool force_hi)
+to_brw_reg(
+   jay_function *f, const jay_inst *I, signed idx, unsigned simd_offs, bool hi)
 {
   bool is_dest = idx < 0;
   enum jay_type type = is_dest ? I->type : jay_src_type(I, idx);
   jay_def d = is_dest ? I->dst : I->src[idx];
-   d.hi |= force_hi;
+   hi |= d.hi;

   struct brw_reg R;
-   unsigned reg = to_def_grf_16(&f->shader->partition, d), offset_B = 0;
+   unsigned reg = d.reg, count = jay_num_values(d);
+   unsigned offset_B = 0, grf = 0;
+   assert(!hi || d.file == GPR);
+
+   if (count && (d.file == GPR || d.file == UGPR)) {
+      struct jay_register_block block =
+         jay_lookup_block(&f->shader->partition, d.reg, d.file);
+
+      grf = block.start_grf;
+      reg -= block.start_gpr;
+
+      assert(reg + count <= block.len_gpr && "must not cross partitions");
+   }

   if (jay_is_imm(d)) {
      /* Immediates have size restrictions but can zero extend */
@ -95,13 +78,13 @@ to_brw_reg(jay_function *f,
   } else if (jay_is_null(d)) {
      R = brw_null_reg();
   } else if (d.file == UGPR || d.file == UACCUM) {
-      unsigned phys_reg = (reg >> 1) / 8;
-      offset_B = ((reg >> 1) % 8) * 4;
+      grf += (reg / jay_ugpr_per_grf(f->shader));
+      offset_B = (reg % jay_ugpr_per_grf(f->shader)) * 4;

      if (d.file == UGPR) {
-         R = brw_ud1_grf(phys_reg, 0);
+         R = brw_ud1_grf(grf * 2, 0);
      } else {
-         R = brw_ud1_reg(ARF, BRW_ARF_ACCUMULATOR + (phys_reg * 2), 0);
+         R = brw_ud1_reg(ARF, BRW_ARF_ACCUMULATOR + (grf * 2), 0);
      }

      /* Handle 3-src restrictions and vectorized uniform code. */
@ -140,22 +123,22 @@ to_brw_reg(jay_function *f,
      unsigned stride_bits = jay_stride_to_bits(def_stride);
      unsigned simd_width = jay_simd_width_physical(f->shader, I);

-      unsigned phys_reg;
      if (def_stride == JAY_STRIDE_2) {
-         /* Bit 0 selects between lo/hi halves of the GPR */
-         phys_reg = (reg / 2) * jay_grf_per_gpr(f->shader);
-         offset_B = (reg & 1) * 2 * f->shader->dispatch_width;
+         /* Select between lo/hi halves of the GPR */
+         grf += reg * jay_grf_per_gpr(f->shader);
+         offset_B = hi ? 2 * f->shader->dispatch_width : 0;
      } else {
-         /* Low bits are an offset in 2-byte words into the GRF */
+         /* Treat low bits as an offset in 2-byte words into the GRF */
+         unsigned r = (reg * 2) + hi;
         unsigned mask = BITFIELD_MASK(stride_bits / 32);
-         phys_reg = ((reg & ~mask) / 2) * jay_grf_per_gpr(f->shader);
-         offset_B = (reg & mask) * 2;
+         grf += ((r & ~mask) / 2) * jay_grf_per_gpr(f->shader);
+         offset_B = (r & mask) * 2;
      }

      if (d.file == GPR) {
-         R = xe2_vec8_grf(phys_reg, 0);
+         R = xe2_vec8_grf(grf, 0);
      } else {
-         R = brw_vecn_reg(8, ARF, BRW_ARF_ACCUMULATOR + (phys_reg * 2), 0);
+         R = brw_vecn_reg(8, ARF, BRW_ARF_ACCUMULATOR + grf, 0);
      }

      R = byte_offset(R, simd_offs * simd_width * stride_bits / 8);
@ -524,8 +507,13 @@ emit(struct brw_codegen *p,

   case JAY_OPCODE_SHUFFLE: {
      struct brw_reg a0 = brw_address_reg(0);
-      unsigned grf_16 = to_def_grf_16(&f->shader->partition, I->src[0]);
-      unsigned offset_B = grf_16 * 2 * f->shader->dispatch_width;
+      assert(I->src[0].file == GPR && jay_num_values(I->src[0]) == 1);
+      struct jay_register_block block =
+         jay_lookup_block(&f->shader->partition, I->src[0].reg, GPR);
+
+      unsigned offset_B =
+         (block.start_grf * 64) +
+         ((I->src[0].reg - block.start_gpr) * 4 * f->shader->dispatch_width);

      brw_ADD(p, a0, subscript(SRC(1), BRW_TYPE_UW, 0), brw_imm_uw(offset_B));
      brw_MOV(p, dst, retype(brw_VxH_indirect(0, 0), BRW_TYPE_UD));
--- a/src/intel/compiler/jay/meson.build
+++ b/src/intel/compiler/jay/meson.build
@ -62,6 +62,7 @@ libintel_compiler_jay_files = files(
  'jay_opt_dead_code.c',
  'jay_opt_predicate.c',
  'jay_opt_propagate.c',
+  'jay_partition.c',
  'jay_print.c',
  'jay_private.h',
  'jay_prog_data.c',
--- a/src/intel/compiler/jay/test/jay_test.h
+++ b/src/intel/compiler/jay/test/jay_test.h
@ -8,7 +8,6 @@
 #include <inttypes.h>
 #include "jay_builder.h"
 #include "jay_ir.h"
-#include "jay_private.h"
 #include "shader_enums.h"

 static inline jay_block *
@ -25,13 +24,17 @@ jay_test_builder(void *memctx)
 {
   jay_shader *s = jay_new_shader(memctx, MESA_SHADER_COMPUTE);
   jay_function *f = jay_new_function(s);
-   s->partition.base8 = 8;

   struct intel_device_info *devinfo =
      rzalloc(memctx, struct intel_device_info);
   s->devinfo = devinfo;
   s->dispatch_width = 32;

+   s->partition.blocks[GPR][s->partition.nr_blocks[GPR]++] = {
+      .len_gpr = 32,
+      .stride = JAY_STRIDE_4,
+   };
+
   unsigned verx10 = 200;
   devinfo->verx10 = verx10;
   devinfo->ver = verx10 / 10;