diff --git a/src/intel/compiler/jay/jay_from_nir.c b/src/intel/compiler/jay/jay_from_nir.c
index e325918be7d..b908e41a518 100644
--- a/src/intel/compiler/jay/jay_from_nir.c
+++ b/src/intel/compiler/jay/jay_from_nir.c
@@ -2522,11 +2522,6 @@ setup_vertex_payload(struct nir_to_jay_state *nj, struct payload_builder *p)
 {
    nj->payload.urb_handle = read_payload(p, GPR);
 
-   /* XXX: This is a hack to line up with the partition chosen in RA. This whole
-    * thing needs an overhaul. Need to think harder about partitioning.
-    */
-   p->offsets[GPR] += 7;
-
    setup_payload_dispatch_start(nj, p);
    setup_payload_push(nj, p);
 
@@ -2605,22 +2600,52 @@ setup_fragment_payload(struct nir_to_jay_state *nj, struct payload_builder *p)
       fs->bary[i] = read_vector_payload(p, GPR, 2);
    }
 
-   if (nj->s->prog_data->fs.uses_src_depth) {
-      fs->coord.z = read_payload(p, GPR);
+   struct {
+      bool cond;
+      jay_def *def;
+   } split_gprs[] = {
+      { nj->s->prog_data->fs.uses_src_depth,   &fs->coord.z       },
+      { nj->s->prog_data->fs.uses_src_w,       &fs->coord.w       },
+      { nj->s->prog_data->fs.uses_sample_mask, &fs->coverage_mask },
+   };
+
+   unsigned extra_gpr =
+      split_gprs[0].cond + split_gprs[1].cond + split_gprs[2].cond;
+   bool odd = extra_gpr & 1;
+
+   for (unsigned i = 0; i < ARRAY_SIZE(split_gprs); ++i) {
+      if (split_gprs[i].cond) {
+         extra_gpr -= 1;
+
+         /* Pad out to GPR alignment by reading the last split GPR as two UGPR
+          * halves and zipping them together below. This lets us construct a
+          * valid partition with minimal copying.
+          */
+         if (extra_gpr == 0 && jay_grf_per_gpr(nj->s) == 2 && odd) {
+            *split_gprs[i].def =
+               read_vector_payload(p, UGPR, jay_ugpr_per_grf(nj->s));
+         } else {
+            *split_gprs[i].def = read_payload(p, GPR);
+         }
+      }
    }
 
-   if (nj->s->prog_data->fs.uses_src_w) {
-      fs->coord.w = read_payload(p, GPR);
-   }
-
-   if (nj->s->prog_data->fs.uses_sample_mask) {
-      fs->coverage_mask = read_payload(p, GPR);
-   }
+   assert(extra_gpr == 0);
 
    if (nj->s->prog_data->fs.uses_pos_offset) {
       fs->sample_pos = read_vector_payload(p, UGPR, jay_ugpr_per_grf(nj->s));
    }
 
+   nj->s->payload_ugprs = p->offsets[UGPR];
+
+   jay_def split[3] = { jay_null() };
+   for (unsigned i = 0; i < ARRAY_SIZE(split_gprs); ++i) {
+      if (!jay_is_null(*split_gprs[i].def) &&
+          (*split_gprs[i].def).file == UGPR) {
+         split[i] = read_vector_payload(p, UGPR, jay_ugpr_per_grf(nj->s));
+      }
+   }
+
    setup_payload_dispatch_start(nj, p);
    setup_payload_push(nj, p);
 
@@ -2639,6 +2664,13 @@ setup_fragment_payload(struct nir_to_jay_state *nj, struct payload_builder *p)
       }
    }
 
+   for (unsigned i = 0; i < ARRAY_SIZE(split_gprs); ++i) {
+      if (!jay_is_null(split[i]) && split_gprs[i].def->file == UGPR) {
+         *(split_gprs[i].def) =
+            jay_ZIP_UGPR16_u32(&nj->bld, *split_gprs[i].def, split[i]);
+      }
+   }
+
    if (nj->s->prog_data->fs.uses_src_xy) {
       jay_def t = jay_alloc_def(&nj->bld, GPR, 1);
       jay_def lo = jay_extract_range(nj->payload.u0, 10, 4);
@@ -2675,7 +2707,6 @@ jay_insert_payload_swizzle(jay_shader *s)
    jay_builder b = jay_init_builder(func, jay_before_function(func));
 
    unsigned size = s->payload_gprs;
-   assert(s->partition.blocks[GPR][0].start == 1);
 
    /* Odd: copy both halves to contiguous pair after payload */
    for (unsigned i = 0; i < (size / 2); ++i) {
@@ -2936,6 +2967,8 @@ jay_compile(const struct intel_device_info *devinfo,
    if (debug) {
       fprintf(stdout, "Jay shader (post-RA):\n\n");
       jay_print(stdout, s);
+
+      jay_print_partition(&s->partition);
    }
 
    struct jay_shader_bin *bin =
diff --git a/src/intel/compiler/jay/jay_ir.h b/src/intel/compiler/jay/jay_ir.h
index 18f3e411780..1e92e2ec6bc 100644
--- a/src/intel/compiler/jay/jay_ir.h
+++ b/src/intel/compiler/jay/jay_ir.h
@@ -745,35 +745,66 @@ jay_stride_to_bits(enum jay_stride s)
    return 16 << s;
 }
 
-#define JAY_PARTITION_BLOCKS (3)
+#define jay_foreach_ra_file(file)                                              \
+   for (enum jay_file file = 0; file < JAY_NUM_RA_FILES; ++file)
+
+#define JAY_PARTITION_BLOCKS (6)
+
+enum jay_block_type {
+   JAY_BLOCK_NORMAL,
+
+   /** A block suitable for EOT sends */
+   JAY_BLOCK_EOT,
+
+   /** A block reserved for post-RA spill lowering */
+   JAY_BLOCK_SPILL,
+
+   JAY_BLOCK_TYPES,
+};
 
 struct jay_register_block {
-   uint16_t start, len;
+   /** First GRF mapped by this block */
+   uint16_t start_grf;
+
+   /** First GPR/UGPR mapped by this block */
+   uint16_t start_gpr;
+
+   /** Length of this block in GPRs/UGPRs */
+   uint16_t len_gpr;
+
+   /** For GPR blocks, stride of GPRs in this block. */
+   enum jay_stride stride;
+
+   /** Special feature of the block */
+   enum jay_block_type type:2;
 };
+static_assert(sizeof(struct jay_register_block) == 8, "packed");
 
 struct jay_partition {
-   /** Consecutive ranges of GRFs in GPR/UGPRs. */
-   struct jay_register_block blocks[JAY_NUM_GRF_FILES][JAY_PARTITION_BLOCKS];
+   struct jay_register_block blocks[JAY_NUM_RA_FILES][JAY_PARTITION_BLOCKS];
+   unsigned nr_blocks[JAY_NUM_RA_FILES];
 
    /** Number of GPR/UGPRs per GRF, times 16. For example, 16 encodes SIMD16
-    * 32-bit GPRs on Xe2 (1 GRF = 1 GPR). 256 encodes UGPRs (1 GRF = 16 UGPRs).
-    * 8 encodes SIMD32 32-bit GPRs on Xe2 (2 GRF = 1 GPR).
+    * 32-bit GPRs on Xe2 (1 GRF = 1 GPR). 256 encodes UGPRs (1 GRF = 16
+    * UGPRs). 8 encodes SIMD32 32-bit GPRs on Xe2 (2 GRF = 1 GPR).
     */
-   unsigned units_x16[JAY_NUM_GRF_FILES];
-
-   /** Base GPR for each stride. The file is partitioned (4, 8, 2, 4=EOT). */
-   unsigned base8, base2, base_eot;
-
-   /** Region of the UGPR partition suitable for large UGPR vectors */
-   struct jay_register_block large_ugpr_block;
+   unsigned units_x16[JAY_NUM_RA_FILES];
 };
 
-static inline enum jay_stride
-jay_gpr_to_stride(const struct jay_partition *p, unsigned reg)
+static inline struct jay_register_block
+jay_lookup_block(const struct jay_partition *p,
+                 unsigned reg,
+                 enum jay_file file)
 {
-   return (reg < p->base8 || reg >= p->base_eot) ? JAY_STRIDE_4 :
-          reg >= p->base2                        ? JAY_STRIDE_2 :
-                                                   JAY_STRIDE_8;
+   for (unsigned i = 0; i < p->nr_blocks[file]; ++i) {
+      struct jay_register_block B = p->blocks[file][i];
+
+      if (reg >= B.start_gpr && reg < B.start_gpr + B.len_gpr) {
+         return B;
+      }
+   }
+
+   UNREACHABLE("invalid reg");
 }
 
 /**
@@ -786,7 +817,7 @@ typedef struct jay_shader {
    union brw_any_prog_data *prog_data;
    unsigned spills, fills;
    unsigned scratch_size;
-   unsigned payload_gprs, push_grfs;
+   unsigned payload_gprs, payload_ugprs, push_grfs;
 
    /**
     * Ralloc linear context. Since we don't typically free as we go,
@@ -1051,7 +1082,7 @@ static inline enum jay_stride
 jay_def_stride(const jay_shader *shader, jay_def x)
 {
    assert(x.file == GPR);
-   return jay_gpr_to_stride(&shader->partition, x.reg);
+   return jay_lookup_block(&shader->partition, x.reg, GPR).stride;
 }
 
 /* Represents an allocated register number with file in the top 3 bits. */
diff --git a/src/intel/compiler/jay/jay_lower_spill.c b/src/intel/compiler/jay/jay_lower_spill.c
index 9bd265dcaf0..56762bf1a72 100644
--- a/src/intel/compiler/jay/jay_lower_spill.c
+++ b/src/intel/compiler/jay/jay_lower_spill.c
@@ -51,9 +51,26 @@ jay_lower_spill(jay_function *func)
 {
    jay_builder b = jay_init_builder(func, jay_before_function(func));
 
-   /* We reserve the top UGPRs for spilling by ABI */
-   unsigned ugpr_reservation = func->shader->num_regs[UGPR];
-   assert(util_is_aligned(ugpr_reservation, func->shader->dispatch_width));
+   /* We reserved a block of UGPRs for our use */
+   signed ugpr_reservation = -1, gpr2 = -1;
+   for (unsigned i = 0; i < func->shader->partition.nr_blocks[GPR]; ++i) {
+      struct jay_register_block B = func->shader->partition.blocks[GPR][i];
+
+      if (B.stride == JAY_STRIDE_2) {
+         gpr2 = B.start_gpr;
+      }
+   }
+
+   for (unsigned i = 0; i < func->shader->partition.nr_blocks[UGPR]; ++i) {
+      struct jay_register_block B = func->shader->partition.blocks[UGPR][i];
+
+      if (B.type == JAY_BLOCK_SPILL) {
+         ugpr_reservation = B.start_gpr;
+      }
+   }
+
+   assert(ugpr_reservation >= 0 && "must have reserved something");
+   assert(gpr2 >= 0 && "must have a stride-2 gpr");
 
    jay_def sp = jay_bare_reg(UGPR, ugpr_reservation);
    sp.num_values_m1 = func->shader->dispatch_width - 1;
@@ -80,7 +97,7 @@ jay_lower_spill(jay_function *func)
    jay_SHR(&b, JAY_TYPE_U32, ADDRESS_REG, tmpu, 4);
 
    /* We use a 32-bit strided stack: SP = scratch + (lane ID * 4) */
-   jay_def tmp2 = jay_bare_reg(GPR, func->shader->partition.base2);
+   jay_def tmp2 = jay_bare_reg(GPR, gpr2);
    jay_LANE_ID_8(&b, tmp2);
    for (unsigned i = 8; i < b.shader->dispatch_width; i *= 2) {
       jay_LANE_ID_EXPAND(&b, tmp2, tmp2, i);
diff --git a/src/intel/compiler/jay/jay_partition.c b/src/intel/compiler/jay/jay_partition.c
new file mode 100644
index 00000000000..6d489f42170
--- /dev/null
+++ b/src/intel/compiler/jay/jay_partition.c
@@ -0,0 +1,239 @@
+/*
+ * Copyright 2026 Intel Corporation
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "jay_ir.h"
+#include "jay_private.h"
+
+/*
+ * jay_partition_grf partitions the register file for the entire shader,
+ * satisfying functional and performance rules. The partition is specified in a
+ * convenient form within this file, as a flat array of jay_partition_builder
+ * structs, which build_partition translates to the more complicated
+ * jay_partition structs.
+ *
+ * All functions must share the same partition for correctness with non-uniform
+ * function calls. For unlinked library functions, we must use the ABI
+ * partition (TODO).
+ */
+struct jay_partition_builder {
+   enum jay_file file;       /* register file this entry adds a block to */
+   enum jay_stride stride;   /* stride copied into the resulting block */
+   signed len_grf;           /* length in GRFs; zero-length entries are skipped */
+   enum jay_block_type type; /* block type; zero (JAY_BLOCK_NORMAL) if omitted */
+};
+
+static void
+build_partition(jay_shader *shader, struct jay_partition_builder *b, unsigned n)
+{
+   /* Translate the n builder entries in b, in order, into shader->partition */
+   unsigned base_grf = 0, base_gpr[JAY_NUM_RA_FILES] = { 0 };
+   struct jay_partition *p = &shader->partition;
+
+   *p = (struct jay_partition) {
+      .units_x16[UGPR] = jay_ugpr_per_grf(shader) * 16,
+      .units_x16[GPR] = 16 / jay_grf_per_gpr(shader),
+      .units_x16[MEM] = 16 / jay_grf_per_gpr(shader),
+   };
+
+   for (unsigned i = 0; i < n; ++i) {
+      /* Zero-length entries are left out of the partition entirely */
+      if (b[i].len_grf == 0)
+         continue;
+      enum jay_file file = b[i].file;
+      unsigned len_gpr = (b[i].len_grf * p->units_x16[file]) / 16;
+      bool grf = file < JAY_NUM_GRF_FILES;
+      assert(p->nr_blocks[file] < JAY_PARTITION_BLOCKS);
+
+      p->blocks[file][p->nr_blocks[file]++] = (struct jay_register_block) {
+         .start_grf = grf ? base_grf : 0,
+         .start_gpr = base_gpr[file],
+         .len_gpr = len_gpr,
+         .stride = b[i].stride,
+         .type = b[i].type,
+      };
+
+      /* Each file advances its own index; only GRF files consume GRFs */
+      base_gpr[file] += len_gpr;
+      base_grf += grf ? b[i].len_grf : 0;
+   }
+
+   /* Validate the well formedness of the partition we built above */
+   BITSET_DECLARE(regs, JAY_NUM_PHYS_GRF) = { 0 };
+
+   for (enum jay_file file = 0; file < JAY_NUM_GRF_FILES; ++file) {
+      for (unsigned j = 0; j < p->nr_blocks[file]; ++j) {
+         struct jay_register_block B = p->blocks[file][j];
+         unsigned len_grf = (B.len_gpr * 16) / p->units_x16[file];
+
+         assert(len_grf > 0 && "no empty partitions");
+         assert(B.start_grf + len_grf <= JAY_NUM_PHYS_GRF && "GRF file size");
+         assert(!BITSET_TEST_COUNT(regs, B.start_grf, len_grf) && "uniqueness");
+         BITSET_SET_COUNT(regs, B.start_grf, len_grf);
+      }
+   }
+
+   assert(BITSET_COUNT(regs) == JAY_NUM_PHYS_GRF && "all GRFs mapped");
+}
+
+void
+jay_partition_grf(jay_shader *shader)
+{
+   /* Choose the shader-wide register partition. Start by calculating the
+    * maximum register demand across all functions to inform the split.
+    */
+   unsigned demand[JAY_NUM_GRF_FILES] = { 0 };
+
+   jay_foreach_function(shader, f) {
+      jay_compute_liveness(f);
+      jay_calculate_register_demands(f);
+
+      demand[GPR] = MAX2(demand[GPR], f->demand[GPR]);
+      demand[UGPR] = MAX2(demand[UGPR], f->demand[UGPR]);
+   }
+
+   /* We must have enough register file space for the register payload, plus the
+    * reserved UGPRs in the case we spill. That UGPR interferes with everything
+    * we preload so it needs to be reserved specially here for the worst case.
+    */
+   jay_foreach_preload(jay_shader_get_entrypoint(shader), I) {
+      unsigned end = jay_preload_reg(I) + jay_num_values(I->dst);
+      unsigned extra = I->dst.file == UGPR ? shader->dispatch_width : 0;
+      assert(I->dst.file < JAY_NUM_GRF_FILES);
+      demand[I->dst.file] = MAX2(demand[I->dst.file], end + extra);
+   }
+
+   /* Determine a good GPR/UGPR split informed by the demand calculation */
+   unsigned grf_per_gpr = jay_grf_per_gpr(shader);
+   unsigned ugpr_per_grf = jay_ugpr_per_grf(shader);
+   unsigned uniform_grfs = DIV_ROUND_UP(demand[UGPR], ugpr_per_grf);
+
+   /* We must have enough for SIMD1 images (TODO: Check if this actually
+    * applies. Or if we could eliminate this with smarter partitioning even.)
+    */
+   unsigned min_ugprs = 16;
+   min_ugprs = MAX2(min_ugprs, 256); /* NOTE(review): always 256 as written */
+
+   /* TODO: We could partition more cleverly */
+   uniform_grfs = align(uniform_grfs, grf_per_gpr);
+   uniform_grfs = CLAMP(uniform_grfs, DIV_ROUND_UP(min_ugprs, ugpr_per_grf),
+                        128 - (32 * grf_per_gpr));
+   unsigned nonuniform_grfs = JAY_NUM_PHYS_GRF - uniform_grfs;
+
+   /* Check the split */
+   assert((uniform_grfs * ugpr_per_grf) >= min_ugprs);
+   assert(nonuniform_grfs >= 32 * grf_per_gpr);
+   assert((uniform_grfs + nonuniform_grfs) == JAY_NUM_PHYS_GRF);
+
+   /* Set the targets for the virtual register file accordingly */
+   shader->num_regs[GPR] = nonuniform_grfs / grf_per_gpr;
+   shader->num_regs[UGPR] = uniform_grfs * ugpr_per_grf;
+
+   unsigned spill_reservation = 0, mem_slots = 0;
+
+   /* Spilling requires reserving UGPRs for the lowered SENDs */
+   if (demand[GPR] > jay_gpr_limit(shader)) {
+      spill_reservation = shader->dispatch_width / ugpr_per_grf;
+
+      /* This should be an acceptable upper limit since we assign memory tightly
+       * thanks to the usual SSA allocator guarantees.
+       */
+      mem_slots = demand[GPR] * grf_per_gpr;
+      shader->num_regs[MEM] = demand[GPR];
+   }
+
+   unsigned payload_4[2] = { 0, 0 }, payload_u[2] = { grf_per_gpr, 0 };
+   unsigned eot_u = 0, eot_4 = 0;
+
+   if (shader->stage == MESA_SHADER_VERTEX) {
+      payload_4[0] = 1;
+      payload_4[1] = shader->prog_data->vue.urb_read_length * 8;
+      payload_u[1] = shader->push_grfs;
+      eot_4 = 16;
+   } else if (shader->stage == MESA_SHADER_FRAGMENT) {
+      /* The SIMD32 fragment payload splits GPRs into low and high GRFs, with
+       * UGPRs mixed in between. jay_insert_payload_swizzle deals with this and
+       * swizzles things appropriately, we just need the partition to have two
+       * separate GPR blocks with a UGPR block in between. That requires the
+       * number of GPRs in the payload to be even.
+       */
+      assert(util_is_aligned(shader->payload_gprs, grf_per_gpr) &&
+             "payload constraint");
+
+      payload_4[0] = shader->payload_gprs;
+      payload_u[1] = (shader->payload_ugprs / ugpr_per_grf) - payload_u[0];
+      payload_4[1] = grf_per_gpr == 2 ? shader->payload_gprs : 0;
+      eot_4 = 14;
+      eot_u = 1;
+   } else {
+      eot_u = 1;
+   }
+
+   unsigned special_u = payload_u[0] + payload_u[1] + spill_reservation + eot_u;
+   unsigned special_4 = payload_4[0] + payload_4[1] + eot_4;
+
+   /* TODO: Make the stride partition smarter */
+   unsigned grf_8 = 8 * grf_per_gpr;
+   unsigned grf_2 = 8;
+
+   struct jay_partition_builder blocks[] = {
+      /* Stage-specific payload */
+      { UGPR, 0, payload_u[0] },
+      { GPR, JAY_STRIDE_4, payload_4[0] },
+      { UGPR, 0, payload_u[1] },
+      { GPR, JAY_STRIDE_4, payload_4[1] },
+
+      /* General registers */
+      { UGPR, 0, uniform_grfs - special_u },
+      { GPR, JAY_STRIDE_4, nonuniform_grfs - (special_4 + grf_8 + grf_2) },
+      { GPR, JAY_STRIDE_8, grf_8 },
+      { GPR, JAY_STRIDE_2, grf_2 },
+
+      /* Spilling registers */
+      { UGPR, 0, spill_reservation, JAY_BLOCK_SPILL },
+      { MEM, JAY_STRIDE_4, mem_slots },
+
+      /* EOT */
+      { UGPR, 0, eot_u, JAY_BLOCK_EOT },
+      { GPR, JAY_STRIDE_4, eot_4, JAY_BLOCK_EOT },
+   };
+
+   build_partition(shader, blocks, ARRAY_SIZE(blocks));
+
+   /* By construction of our partition, the entire GRF is used. */
+   shader->prog_data->base.grf_used = JAY_NUM_PHYS_GRF;
+}
+
+#define ANSI_END    "\033[0m"
+#define ANSI_BOLD   "\033[1m"
+#define ANSI_ITALIC "\033[3m"
+
+void
+jay_print_partition(struct jay_partition *p)
+{
+   const char *files[JAY_NUM_RA_FILES] = { "GPR", "UGPR", "MEM" };
+   const char *types[JAY_BLOCK_TYPES] = { "", " EOT", " Spill" };
+
+   jay_foreach_ra_file(file) {
+      /* Bold column header, only for files that have at least one block */
+      if (p->nr_blocks[file] > 0) {
+         printf("%s" ANSI_BOLD "    GRF      %s%s" ANSI_END "\n",
+                file ? "\n" : "", files[file], file == GPR ? "    Stride" : "");
+      }
+      /* Row per block: GRF range, GPR/UGPR range, stride (GPR only), type */
+      for (unsigned i = 0; i < p->nr_blocks[file]; ++i) {
+         const struct jay_register_block *blk = &p->blocks[file][i];
+         unsigned last_grf =
+            blk->start_grf + (blk->len_gpr * 16) / p->units_x16[file] - 1;
+         printf("  %3u…%-3u  %3u…%-3u", blk->start_grf, last_grf,
+                blk->start_gpr, blk->start_gpr + blk->len_gpr - 1);
+         if (file == GPR) {
+            printf("  %u-bit", jay_stride_to_bits(blk->stride));
+         }
+         printf(ANSI_ITALIC "%s" ANSI_END "\n", types[blk->type]);
+      }
+   }
+
+   printf("\n");
+}
diff --git a/src/intel/compiler/jay/jay_private.h b/src/intel/compiler/jay/jay_private.h
index 78276fe78d8..1ce8638346e 100644
--- a/src/intel/compiler/jay/jay_private.h
+++ b/src/intel/compiler/jay/jay_private.h
@@ -39,6 +39,7 @@ void jay_calculate_register_demands(jay_function *f);
 
 void jay_spill(jay_function *func, unsigned limit);
 void jay_partition_grf(jay_shader *shader);
+void jay_print_partition(struct jay_partition *p);
 void jay_register_allocate(jay_shader *s);
 void jay_assign_flags(jay_shader *s);
 void jay_assign_accumulators(jay_shader *s);
@@ -86,6 +87,16 @@ struct jay_shader_bin *jay_to_binary(jay_shader *s,
                                      size_t const_data_size,
                                      bool debug);
 
+static inline unsigned
+jay_gpr_limit(jay_shader *shader)
+{
+   /* When testing spilling (outside vertex shaders), set the limit tightly. */
+   if ((jay_debug & JAY_DBG_SPILL) && shader->stage != MESA_SHADER_VERTEX)
+      return 13;
+
+   return shader->num_regs[GPR];
+}
+
 #ifdef __cplusplus
 } /* extern C */
 #endif
diff --git a/src/intel/compiler/jay/jay_register_allocate.c b/src/intel/compiler/jay/jay_register_allocate.c
index b3b85dfbca4..dd04344c77c 100644
--- a/src/intel/compiler/jay/jay_register_allocate.c
+++ b/src/intel/compiler/jay/jay_register_allocate.c
@@ -15,7 +15,6 @@
 #include "jay_ir.h"
 #include "jay_opcodes.h"
 #include "jay_private.h"
-#include "shader_enums.h"
 
 /**
  * Register allocation for Jay shaders.
@@ -39,9 +38,6 @@
  * Finally, we deconstruct SSA.
  */
 
-#define jay_foreach_ra_file(file)                                              \
-   for (enum jay_file file = 0; file < JAY_NUM_RA_FILES; ++file)
-
 #define jay_foreach_ra_src(I, s)                                               \
    jay_foreach_src(I, s)                                                       \
       if (I->src[s].file < JAY_NUM_RA_FILES && !jay_is_null(I->src[s]))
@@ -186,7 +182,11 @@ struct affinity {
     */
    bool eot:1;
 
-   /** If true, this UGPR needs full GRF alignment */
+   /**
+    * If align is nonzero, this SSA def should be assigned to a register of the
+    * form (k * align) + align_offs for some integer k. In other words, align is
+    * the alignment of the whole vector and align_offs is this def's channel.
+    */
    unsigned align     :5;
    unsigned align_offs:4;
    unsigned nr        :4;
@@ -283,15 +283,16 @@ def_from_reg(jay_reg r)
    return jay_bare_reg(r_file(r), r_reg(r));
 }
 
+struct jay_roundrobin {
+   unsigned block, gpr;
+};
+
 typedef struct jay_ra_state {
    /** Size of each register file */
    unsigned num_regs[JAY_NUM_RA_FILES];
 
-   /** Counter for roundrobin register allocation */
-   unsigned roundrobin[JAY_NUM_RA_FILES];
-
-   /** First GPR that may be used for EOT sends */
-   unsigned eot_offs;
+   /** Partition-aware counters for roundrobin register allocation */
+   struct jay_roundrobin roundrobin[JAY_NUM_RA_FILES][JAY_NUM_STRIDES];
 
    /** Phi coalescing data structure */
    struct phi_web_node *phi_web;
@@ -691,11 +692,12 @@ try_find_free_reg(jay_ra_state *ra,
                   unsigned except,
                   bool stride4)
 {
+   struct jay_partition *p = &ra->b.shader->partition;
+
    unsigned i;
    BITSET_FOREACH_SET(i, ra->available_regs[file], ra->num_regs[file]) {
       if (i != except &&
-          (!stride4 ||
-           jay_gpr_to_stride(&ra->b.shader->partition, i) == JAY_STRIDE_4)) {
+          (!stride4 || jay_lookup_block(p, i, GPR).stride == JAY_STRIDE_4)) {
          return make_reg(file, i);
       }
    }
@@ -732,6 +734,124 @@ find_temp_regs(jay_ra_state *ra)
    };
 }
 
+static void
+pick_regs_from_block(jay_ra_state *ra,
+                     enum jay_file file,
+                     unsigned size,
+                     unsigned alignment,
+                     jay_inst *I,
+                     jay_def var,
+                     bool is_src,
+                     struct jay_register_block block,
+                     unsigned block_cost,
+                     struct affinity affinity,
+                     unsigned *best_cost,
+                     unsigned *best_reg,
+                     unsigned first)
+{
+   /* Cost each aligned candidate in block; keep the cheapest in *best_reg. */
+   bool must_tie = I->op == JAY_OPCODE_LANE_ID_EXPAND;
+   must_tie &= !is_src;
+
+   /* Cross-lane access cannot be SIMD split if the source/destination registers
+    * overlap, but as long as we don't tie those destinations, we're ok.
+    */
+   bool may_tie = !jay_is_shuffle_like(I);
+
+   first = align(first, alignment);
+   for (unsigned i = first; i + size <= block.len_gpr; i += alignment) {
+      unsigned r = block.start_gpr + i;
+
+      unsigned cost = block_cost;
+      bool tied = !is_src && BITSET_TEST(ra->killed[file], r);
+
+      if (tied ? !may_tie :
+                 (must_tie || BITSET_TEST_COUNT(ra->pinned[file], r, size)))
+         continue;
+
+      /* Try to tie predicated default values, otherwise post-RA lowering needs
+       * to insert a predicated-MOV or SEL.
+       */
+      if (I->predication == JAY_PREDICATED_DEFAULT && !is_src)
+         cost += jay_inst_get_default(I)->reg != r;
+
+      /* If there are stricter alignment requirements later, model the cost of
+       * inserting copies for that.
+       */
+      if (affinity.align &&
+          (i < affinity.align_offs ||
+           !util_is_aligned(i - affinity.align_offs, affinity.align)))
+         cost += size;
+
+      if (affinity.repr == jay_channel(var, 0)) {
+         /* If we are the collect representative but the final collect won't
+          * actually be usable, the whole vector will need to be copied.
+          */
+         if (i < affinity.offset || !util_is_aligned(i - affinity.offset, 4)) {
+            cost += affinity.nr;
+         }
+      } else if (affinity.repr) {
+         /* If we are used for a collect but not in the right place, we will
+          * similarly insert copies.
+          */
+         if (ra->reg_for_index[affinity.repr] != NO_REG &&
+             r_reg(ra->reg_for_index[affinity.repr]) != r - affinity.offset) {
+            cost++;
+         }
+      }
+
+      for (unsigned c = 0; c < size; ++c) {
+         unsigned j = r + c;
+
+         /* If the register is unavailable, account for the cost of shuffling */
+         if (!BITSET_TEST(ra->available_regs[file], j) && !tied) {
+            bool live_out = u_sparse_bitset_test(&ra->block->live_out,
+                                                 ra->index_for_reg[file][j]);
+            cost += 1 + live_out;
+         }
+
+         /* Model the cost of shuffling for phis */
+         if (c < jay_num_values(var)) {
+            struct phi_web_node *phi_web =
+               &ra->phi_web[phi_web_find(ra->phi_web, jay_channel(var, c))];
+            if (phi_web->reg != NO_REG && r_reg(phi_web->reg) != j) {
+               cost += 2;
+            }
+         }
+
+         /* Choosing this register will pin it, leaving it unavailable to later
+          * smaller sources which will need a move.
+          */
+         cost += BITSET_TEST(ra->sources[file], j);
+      }
+
+      if (cost < *best_cost) {
+         *best_cost = cost;
+         *best_reg = r;
+
+         /* If we find something with 0 cost, we are guaranteed to pick this
+          * register, so terminate early. This speeds up the search.
+          */
+         if (cost == 0) {
+            return;
+         }
+      }
+   }
+}
+
+static bool
+is_block_compatible(struct jay_register_block block,
+                    enum jay_file file,
+                    enum jay_stride min_stride,
+                    enum jay_stride max_stride,
+                    bool eot)
+{
+   /* Reject spill blocks, non-EOT blocks for EOT, out-of-range GPR strides */
+   bool eot_ok = !eot || block.type == JAY_BLOCK_EOT;
+   bool in_range = min_stride <= block.stride && block.stride <= max_stride;
+   return block.type != JAY_BLOCK_SPILL && eot_ok && (file != GPR || in_range);
+}
+
 static unsigned
 pick_regs(jay_ra_state *ra,
           enum jay_file file,
@@ -744,38 +864,17 @@ pick_regs(jay_ra_state *ra,
           bool is_src)
 {
    struct jay_partition *partition = &ra->b.shader->partition;
-   unsigned first = 0, end = ra->num_regs[file];
-   bool must_tie = I->op == JAY_OPCODE_LANE_ID_EXPAND;
-   must_tie &= !is_src;
-
-   /* Cross-lane access cannot be SIMD split if the source/destination registers
-    * overlap, but as long as we don't tie those destinations, we're ok.
-    */
-   bool may_tie = !jay_is_shuffle_like(I);
-
-   /* Ensure we do not cross partitions */
-   if (file == UGPR && size > 16) {
-      first = partition->large_ugpr_block.start;
-      end = partition->large_ugpr_block.start + partition->large_ugpr_block.len;
-   } else if (file == GPR && size > 1 && ra->b.shader->payload_gprs < 8) {
-      first = align(ra->b.shader->payload_gprs, MAX2(size, alignment));
-   }
-
-   /* Sources used by end-of-thread sends must be at the end of the file */
-   if (I->op == JAY_OPCODE_SEND && jay_send_eot(I)) {
-      first = ra->eot_offs;
-   }
+   bool eot = I->op == JAY_OPCODE_SEND && jay_send_eot(I);
 
    /* If possible, keep sources in place to avoid shuffles. */
    if (is_src && jay_channel(var, 0) != 0) {
       unsigned cur = r_reg(ra->reg_for_index[jay_channel(var, 0)]);
-      enum jay_stride stride = jay_gpr_to_stride(partition, cur);
+      struct jay_register_block block = jay_lookup_block(partition, cur, file);
 
       if (!BITSET_TEST_COUNT(ra->pinned[file], cur, size) &&
-          util_is_aligned(cur, alignment) &&
-          cur >= first &&
-          cur + size <= end &&
-          (file != GPR || (min_stride <= stride && stride <= max_stride))) {
+          util_is_aligned(cur - block.start_gpr, alignment) &&
+          is_block_compatible(block, file, min_stride, max_stride, eot) &&
+          cur + size <= (block.start_gpr + block.len_gpr)) {
          return cur;
       }
    }
@@ -786,120 +885,74 @@ pick_regs(jay_ra_state *ra,
       ra->phi_web[phi_web_find(ra->phi_web, jay_channel(var, 0))].affinity;
 
    assert(alignment >= size && "alignment must be a multiple of size");
-   unsigned nr = DIV_ROUND_UP((end + 1 - size - first), alignment);
-   unsigned roundrobin = (ra->roundrobin[file]) % nr;
-   unsigned rr_al = roundrobin * alignment, nr_al = nr * alignment;
 
-   /* Heuristic: Advance the roundrobin by a whole vector if we are the
-    * representative. This leaves us registers for the rest of the vector.
+   /* We select registers roundrobin. This has several benefits:
+    *
+    * 1. Easier coalescing since we are statistically less likely to allocate
+    *    a register that a future instruction has an affinity for.
+    *
+    * 2. More freedom for post-RA scheduling thanks to fewer dependencies.
+    *
+    * 3. Less stalling due to SWSB annotations from register reuse.
     */
-   ra->roundrobin[file] +=
-      affinity.repr == jay_channel(var, 0) ? MAX2(size, affinity.nr) : size;
+   enum jay_stride stride = file == GPR ? min_stride : 0;
+   struct jay_roundrobin *rr = &ra->roundrobin[file][stride];
+   unsigned nr_blocks = partition->nr_blocks[file];
 
-   for (unsigned i = rr_al; i < rr_al + nr_al; i += alignment) {
-      /* We select registers roundrobin. This has several benefits:
-       *
-       * 1. Easier coalescing since we are less likely statistically to allocate
-       *    a register that a future instruction has an affinity.
-       *
-       * 2. More freedom for post-RA scheduling thanks to fewer dependencies.
-       *
-       * 3. Less stalling due to SWSB annotations from register reuse.
-       */
-      unsigned r = first + (i >= nr_al ? (i - nr_al) : i);
-      assert(r >= first && r + size <= end);
+   /* Make sure we use the optimal stride for roundrobin RA */
+   if (file == GPR) {
+      while (partition->blocks[GPR][rr->block].stride != stride) {
+         rr->block = (rr->block + 1 == nr_blocks) ? 0 : rr->block + 1;
+      }
+   }
 
-      unsigned cost = 0;
-      bool tied = !is_src && BITSET_TEST(ra->killed[file], r);
-      enum jay_stride stride =
-         file == GPR ? jay_gpr_to_stride(partition, r) : min_stride;
+   unsigned last_b_ = rr->block + nr_blocks;
+   for (unsigned b_ = rr->block; b_ <= last_b_ && best_cost > 0; ++b_) {
+      unsigned b = b_ >= nr_blocks ? (b_ - nr_blocks) : b_;
+      assert(b < nr_blocks);
 
-      if ((tied ? !may_tie :
-                  (must_tie || BITSET_TEST_COUNT(ra->pinned[file], r, size))) ||
-          !(min_stride <= stride && stride <= max_stride))
-         continue;
+      struct jay_register_block block = partition->blocks[file][b];
 
-      /* Try to tie predicated default values, otherwise post-RA lowering needs
-       * to insert a predicated-MOV or SEL.
-       */
-      if (I->predication == JAY_PREDICATED_DEFAULT && !is_src)
-         cost += jay_inst_get_default(I)->reg != r;
+      if (is_block_compatible(block, file, min_stride, max_stride, eot)) {
+         unsigned r = b_ == rr->block ? rr->gpr : 0;
 
-      /* Assigning a stride that is too big may result in SIMDness splitting.
-       * Model that cost so we prefer packed registers.
-       */
-      cost += stride - min_stride;
-
-      /* If we are used for end-of-thread and it is not in the appropriate
-       * register, we will need to insert 1 copy per channel at the end.
-       */
-      if (affinity.eot && r < ra->eot_offs)
-         cost += size;
-
-      /* If there are stricter alignment requirements later, model the cost of
-       * inserting copies for that.
-       */
-      if (affinity.align &&
-          !util_is_aligned(r - affinity.align_offs, affinity.align))
-         cost += size;
-
-      if (affinity.repr == jay_channel(var, 0)) {
-         /* If we are the collect representative but the final collect won't
-          * actually be usable, the whole vector will need to be copied.
-          */
-         if (!util_is_aligned(r - affinity.offset, 8) ||
-             (affinity.eot && r - affinity.offset < ra->eot_offs)) {
-            cost += 8;
+         if (affinity.repr == jay_channel(var, 0) && b_ == rr->block) {
+            r += affinity.offset;
          }
-      } else if (affinity.repr) {
-         /* If we are used for a collect but not in the right place, we will
-          * similarly insert copies.
-          */
-         if (ra->reg_for_index[affinity.repr] != NO_REG &&
-             r_reg(ra->reg_for_index[affinity.repr]) != r - affinity.offset) {
 
-            cost += size;
+         /* Assigning a stride that is too big may result in SIMDness splitting.
+          * Model that cost so we prefer packed registers.
+          */
+         unsigned block_cost = file == GPR ? block.stride - min_stride : 0;
+
+         /* If we are used for end-of-thread and it is not in the appropriate
+          * register, we will need to insert 1 copy per channel at the end.
+          */
+         if (affinity.eot && block.type != JAY_BLOCK_EOT) {
+            block_cost += size;
+         }
+
+         /* Only search blocks whose base cost is below the best so far */
+         if (best_cost > block_cost) {
+            pick_regs_from_block(ra, file, size, alignment, I, var, is_src,
+                                 block, block_cost, affinity, &best_cost,
+                                 &best_reg, r);
          }
       }
+   }
 
-      for (unsigned c = 0; c < size; ++c) {
-         unsigned i = r + c;
+   /* If we chose a register roundrobin (the constant 16 here is determined
+    * experimentally), advance the roundrobin. As a heuristic, advance by a
+    * whole vector if we are the representative. This leaves us registers for
+    * the rest of the vector.
+    */
+   if (rr->gpr <= best_reg && best_reg <= rr->gpr + 16) {
+      bool is_repr = affinity.repr == jay_channel(var, 0);
+      rr->gpr = best_reg + MAX2(size, is_repr ? affinity.nr : 0);
 
-         /* If the register is unavailable, account for the cost of shuffling */
-         if (!BITSET_TEST(ra->available_regs[file], i) && !tied) {
-            cost++;
-
-            /* ..plus the cost of shuffling back. */
-            if (u_sparse_bitset_test(&ra->block->live_out,
-                                     ra->index_for_reg[file][i]))
-               cost++;
-         }
-
-         /* Model the cost of shuffling for phis */
-         if (c < jay_num_values(var)) {
-            struct phi_web_node *phi_web =
-               &ra->phi_web[phi_web_find(ra->phi_web, jay_channel(var, c))];
-            if (phi_web->reg != NO_REG && r_reg(phi_web->reg) != i) {
-               cost += 2;
-            }
-         }
-
-         /* Choosing this register will pin it, leaving it unavailable to later
-          * smaller sources which will need a move.
-          */
-         cost += BITSET_TEST(ra->sources[file], i);
-      }
-
-      if (cost < best_cost) {
-         best_cost = cost;
-         best_reg = r;
-
-         /* If we find something with 0 cost, we are guaranteed to pick this
-          * register, so terminate early. This speeds up the search.
-          */
-         if (cost == 0) {
-            break;
-         }
+      if (rr->gpr >= partition->blocks[file][rr->block].len_gpr) {
+         rr->block = ((rr->block + 1) == nr_blocks) ? 0 : (rr->block + 1);
+         rr->gpr = 0;
       }
    }
 
@@ -1322,233 +1375,6 @@ insert_parallel_copies_for_phis(jay_function *f)
    free(phi_dsts);
 }
 
-static struct jay_register_block
-block_gpr_to_grf(struct jay_partition *p, enum jay_file file, unsigned block)
-{
-   assert(file == GPR || file == UGPR);
-   assert(((p->blocks[file][block].start * 16) % p->units_x16[file]) == 0);
-   assert(((p->blocks[file][block].len * 16) % p->units_x16[file]) == 0);
-
-   return (struct jay_register_block) {
-      .start = (p->blocks[file][block].start * 16) / p->units_x16[file],
-      .len = (p->blocks[file][block].len * 16) / p->units_x16[file],
-   };
-}
-
-static void
-print_partition(struct jay_partition *p)
-{
-   for (unsigned f = 0; f < JAY_NUM_GRF_FILES; ++f) {
-      for (unsigned b = 0; b < JAY_PARTITION_BLOCKS; ++b) {
-         struct jay_register_block B = block_gpr_to_grf(p, f, b);
-         const char *file = f ? "UGPR" : "GPR";
-
-         if (B.len > 1) {
-            fprintf(stderr, "%s: %u-%u\n", file, B.start, B.start + B.len - 1);
-         } else if (B.len == 1) {
-            fprintf(stderr, "%s: %u\n", file, B.start);
-         }
-      }
-   }
-
-   fprintf(stderr, "\n");
-}
-
-/*
- * Verify that a register partition is a bijective mapping of the GRF file.
- */
-static void
-validate_partition(struct jay_partition *p,
-                   unsigned stride4_header_size,
-                   unsigned nonuniform_gprs)
-{
-   BITSET_DECLARE(regs, JAY_NUM_PHYS_GRF) = { 0 };
-
-   for (unsigned f = 0; f < JAY_NUM_GRF_FILES; ++f) {
-      for (unsigned b = 0; b < JAY_PARTITION_BLOCKS; ++b) {
-         struct jay_register_block B = block_gpr_to_grf(p, f, b);
-         if (B.len) {
-            assert(B.start + B.len <= JAY_NUM_PHYS_GRF && "GRF file size");
-            assert(!BITSET_TEST_COUNT(regs, B.start, B.len) && "uniqueness");
-
-            BITSET_SET_COUNT(regs, B.start, B.len);
-         }
-      }
-   }
-
-   for (unsigned i = 0; i < JAY_NUM_PHYS_GRF; ++i) {
-      assert(BITSET_TEST(regs, i) && "all GRFs mapped");
-   }
-
-   assert(p->large_ugpr_block.len && "partition must have a large UGPR block");
-   assert(p->base2 >= p->base8 && p->base_eot >= p->base2 && "monotonic");
-   assert(p->base8 >= stride4_header_size && "header is big enough");
-   assert(p->base_eot + p->units_x16[GPR] <= nonuniform_gprs && "EOT fits");
-   assert(util_is_aligned(p->base8, 8) && "so vectors don't cross");
-   assert(util_is_aligned(p->base2, 8) && "so vectors don't cross");
-   assert(util_is_aligned(p->base_eot, 8) && "so vectors don't cross");
-}
-
-static void
-build_partition(jay_shader *shader, unsigned *blocks, unsigned n)
-{
-   unsigned base = 0;
-   unsigned ugpr_base = 0;
-   struct jay_partition *p = &shader->partition;
-
-   *p = (struct jay_partition) {
-      .units_x16[UGPR] = jay_ugpr_per_grf(shader) * 16,
-      .units_x16[GPR] = 16 / jay_grf_per_gpr(shader),
-   };
-
-   for (unsigned i = 0; i < n; ++i) {
-      enum jay_file file = (i & 1) ? GPR : UGPR;
-      unsigned file_i = i >> 1;
-
-      p->blocks[file][file_i].start = (base * p->units_x16[file]) / 16;
-      p->blocks[file][file_i].len = (blocks[i] * p->units_x16[file]) / 16;
-
-      if (file == UGPR && blocks[i] >= 8) {
-         p->large_ugpr_block = (struct jay_register_block) {
-            .start = (ugpr_base * p->units_x16[file]) / 16,
-            .len = p->blocks[file][file_i].len,
-         };
-      }
-
-      base += blocks[i];
-      if (file == UGPR) {
-         ugpr_base += blocks[i];
-      }
-   }
-}
-
-static unsigned
-gpr_limit(jay_shader *shader)
-{
-   /* If testing spilling, set limit tightly. */
-   bool test = (jay_debug & JAY_DBG_SPILL);
-   test &= shader->stage != MESA_SHADER_VERTEX;
-
-   return test ? 13 : shader->num_regs[GPR];
-}
-
-/*
- * Partition the register file for the entire shader. All functions must
- * share the same partition for correctness with non-uniform function calls.
- * For unlinked library functions, we must use the ABI partition (TODO).
- */
-void
-jay_partition_grf(jay_shader *shader)
-{
-   /* Calculate the maximum register demand across all functions in the shader.
-    * We will use this to choose a good partition.
-    */
-   struct jay_partition *p = &shader->partition;
-   unsigned demand[JAY_NUM_GRF_FILES] = { 0 };
-
-   jay_foreach_function(shader, f) {
-      jay_compute_liveness(f);
-      jay_calculate_register_demands(f);
-
-      demand[GPR] = MAX2(demand[GPR], f->demand[GPR]);
-      demand[UGPR] = MAX2(demand[UGPR], f->demand[UGPR]);
-   }
-
-   /* We must have enough register file space for the register payload, plus the
-    * reserved UGPRs in the case we spill. That UGPR interferes with everything
-    * we preload so it needs to be reserved specially here for the worst case.
-    */
-   jay_foreach_preload(jay_shader_get_entrypoint(shader), I) {
-      unsigned end = jay_preload_reg(I) + jay_num_values(I->dst);
-      unsigned extra = I->dst.file == UGPR ? shader->dispatch_width : 0;
-      assert(I->dst.file < JAY_NUM_GRF_FILES);
-      demand[I->dst.file] = MAX2(demand[I->dst.file], end + extra);
-   }
-
-   /* Determine a good GPR/UGPR split informed by the demand calculation */
-   unsigned ugpr_per_grf = jay_ugpr_per_grf(shader);
-   unsigned uniform_grfs = DIV_ROUND_UP(demand[UGPR], ugpr_per_grf);
-
-   /* We must have enough for SIMD1 images (TODO: Check if this actually
-    * applies. Or if we could eliminate this with smarter partitioning even.)
-    */
-   unsigned min_ugprs = 16;
-   min_ugprs = MAX2(min_ugprs, 256);
-
-   unsigned grf_block_alignment = 8 * jay_grf_per_gpr(shader); /* max_vec */
-
-   /* TODO: We could partition more cleverly */
-   uniform_grfs = CLAMP(align(uniform_grfs, grf_block_alignment),
-                        DIV_ROUND_UP(min_ugprs, ugpr_per_grf),
-                        128 - (32 * jay_grf_per_gpr(shader)));
-   unsigned nonuniform_grfs = JAY_NUM_PHYS_GRF - uniform_grfs;
-
-   /* Check the split */
-   assert((uniform_grfs * ugpr_per_grf) >= min_ugprs);
-   assert(nonuniform_grfs >= 32 * jay_grf_per_gpr(shader));
-   assert((uniform_grfs + nonuniform_grfs) == JAY_NUM_PHYS_GRF);
-
-   /* Partition GRFs between GPR & UGPR */
-   unsigned stride4_header_size = 0;
-
-   if (shader->stage == MESA_SHADER_VERTEX) {
-      unsigned attrib_grfs = shader->prog_data->vue.urb_read_length * 8;
-      unsigned blocks[] = {
-         1,                                         /* UGPR: g0 */
-         8,                                         /* GPR: URB output handle */
-         shader->push_grfs,                         /* UGPR: Push constants */
-         attrib_grfs,                               /* GPR: Vertex inputs */
-         uniform_grfs - (blocks[0] + blocks[2]),    /* UGPR: * */
-         nonuniform_grfs - (blocks[1] + blocks[3]), /* GPR: * and EOT */
-      };
-
-      build_partition(shader, blocks, ARRAY_SIZE(blocks));
-      stride4_header_size = blocks[1] + blocks[3];
-   } else if (shader->stage == MESA_SHADER_FRAGMENT) {
-      unsigned len0 = jay_grf_per_gpr(shader);
-      unsigned payload_grfs = shader->payload_gprs * len0;
-
-      unsigned blocks[] = {
-         len0,                /* UGPR: g0 (and maybe g1) */
-         payload_grfs,        /* GPR: Barycentrics */
-         uniform_grfs - len0, /* UGPR: Dispatch (eg push constants) & general */
-         nonuniform_grfs - payload_grfs, /* GPR: General & EOT */
-      };
-      build_partition(shader, blocks, ARRAY_SIZE(blocks));
-      stride4_header_size = blocks[1];
-   } else {
-      unsigned blocks[] = { uniform_grfs - 4, nonuniform_grfs, 4 };
-      build_partition(shader, blocks, ARRAY_SIZE(blocks));
-   }
-
-   /* TODO: Make the stride partition smarter */
-   unsigned nonuniform_gprs = nonuniform_grfs / jay_grf_per_gpr(shader);
-   unsigned eot_gprs = 16 / jay_grf_per_gpr(shader);
-   p->base8 = ROUND_DOWN_TO(nonuniform_gprs - (16 + eot_gprs), 8) + 0;
-   p->base2 = 8 + p->base8;
-   p->base_eot = 8 + p->base2;
-
-   // print_partition(p);
-   validate_partition(p, stride4_header_size, nonuniform_gprs);
-
-   /* By construction of our partition, the entire GRF is used. */
-   shader->prog_data->base.grf_used = JAY_NUM_PHYS_GRF;
-
-   /* Set the targets for the virtual register file accordingly */
-   for (unsigned f = 0; f < JAY_NUM_GRF_FILES; ++f) {
-      for (unsigned b = 0; b < JAY_PARTITION_BLOCKS; ++b) {
-         shader->num_regs[f] += p->blocks[f][b].len;
-      }
-   }
-
-   /* This should be an acceptable upper limit since we assign memory tightly
-    * thanks to the usual SSA allocator guarantees.
-    */
-   if (demand[GPR] > gpr_limit(shader)) {
-      shader->num_regs[MEM] = demand[GPR];
-   }
-}
-
 static void
 jay_register_allocate_function(jay_function *f)
 {
@@ -1556,15 +1382,10 @@ jay_register_allocate_function(jay_function *f)
    jay_ra_state ra = { .b.shader = shader, .b.func = f };
 
    /* Spill as needed to fit within the limits. */
-   unsigned limit = gpr_limit(f->shader);
+   unsigned limit = jay_gpr_limit(f->shader);
    bool spilled = f->demand[GPR] > limit;
 
    if (spilled) {
-      /* Spilling requires reserving UGPRs for spilling */
-      unsigned reservation = f->shader->dispatch_width;
-      f->shader->num_regs[UGPR] -= reservation;
-      f->shader->partition.large_ugpr_block.len -= reservation;
-
       jay_spill(f, limit);
       jay_validate(f->shader, "spilling");
       jay_compute_liveness(f);
@@ -1588,17 +1409,6 @@ jay_register_allocate_function(jay_function *f)
 
    typed_memcpy(ra.num_regs, shader->num_regs, JAY_NUM_RA_FILES);
 
-   /* The end of the register file is allowed for end-of-thread messages.
-    * Calculate the offset in GPRs. Compute shaders have this as UGPRs while
-    * fragment shaders have this as GPRs.
-    */
-   if (mesa_shader_stage_is_compute(shader->stage)) {
-      ra.eot_offs = ROUND_DOWN_TO(ra.num_regs[UGPR], jay_ugpr_per_grf(shader)) -
-                    jay_ugpr_per_grf(shader);
-   } else {
-      ra.eot_offs = ra.num_regs[GPR] - (16 / jay_grf_per_gpr(shader));
-   }
-
    linear_ctx *lin_ctx = linear_context(shader);
 
    ra.reg_for_index = linear_alloc_array(lin_ctx, jay_reg, f->ssa_alloc);
diff --git a/src/intel/compiler/jay/jay_to_binary.c b/src/intel/compiler/jay/jay_to_binary.c
index 3af3fbd71b5..8615e252fb6 100644
--- a/src/intel/compiler/jay/jay_to_binary.c
+++ b/src/intel/compiler/jay/jay_to_binary.c
@@ -10,7 +10,6 @@
 #include "compiler/brw/brw_eu_inst.h"
 #include "compiler/brw/brw_reg.h"
 #include "compiler/brw/brw_reg_type.h"
-#include "dev/intel_debug.h"
 #include "util/macros.h"
 #include "util/u_dynarray.h"
 #include "util/u_math.h"
@@ -42,45 +41,29 @@ to_brw_reg_type(enum jay_type type)
    /* clang-format on */
 }
 
-static inline unsigned
-to_def_grf_16(struct jay_partition *p, jay_def d)
-{
-   unsigned count = jay_num_values(d);
-   if (count == 0 || !(d.file == GPR || d.file == UGPR)) {
-      return d.reg;
-   }
-
-   unsigned base = 0;
-   for (unsigned i = 0; i < JAY_PARTITION_BLOCKS; ++i) {
-      unsigned offset = d.reg - base;
-
-      if (offset < p->blocks[d.file][i].len) {
-         assert(offset + count <= p->blocks[d.file][i].len &&
-                "vectors must not cross partition boundaries");
-
-         return (p->blocks[d.file][i].start + offset) * 2 + d.hi;
-      }
-
-      base += p->blocks[d.file][i].len;
-   }
-
-   UNREACHABLE("virtual register must be in a block");
-}
-
 static inline brw_reg
-to_brw_reg(jay_function *f,
-           const jay_inst *I,
-           signed idx,
-           unsigned simd_offs,
-           bool force_hi)
+to_brw_reg(
+   jay_function *f, const jay_inst *I, signed idx, unsigned simd_offs, bool hi)
 {
    bool is_dest = idx < 0;
    enum jay_type type = is_dest ? I->type : jay_src_type(I, idx);
    jay_def d = is_dest ? I->dst : I->src[idx];
-   d.hi |= force_hi;
+   hi |= d.hi;
 
    struct brw_reg R;
-   unsigned reg = to_def_grf_16(&f->shader->partition, d), offset_B = 0;
+   unsigned reg = d.reg, count = jay_num_values(d);
+   unsigned offset_B = 0, grf = 0;
+   assert(!hi || d.file == GPR);
+
+   if (count && (d.file == GPR || d.file == UGPR)) {
+      struct jay_register_block block =
+         jay_lookup_block(&f->shader->partition, d.reg, d.file);
+
+      grf = block.start_grf;
+      reg -= block.start_gpr;
+
+      assert(reg + count <= block.len_gpr && "must not cross partitions");
+   }
 
    if (jay_is_imm(d)) {
       /* Immediates have size restrictions but can zero extend */
@@ -95,13 +78,13 @@ to_brw_reg(jay_function *f,
    } else if (jay_is_null(d)) {
       R = brw_null_reg();
    } else if (d.file == UGPR || d.file == UACCUM) {
-      unsigned phys_reg = (reg >> 1) / 8;
-      offset_B = ((reg >> 1) % 8) * 4;
+      grf += (reg / jay_ugpr_per_grf(f->shader));
+      offset_B = (reg % jay_ugpr_per_grf(f->shader)) * 4;
 
       if (d.file == UGPR) {
-         R = brw_ud1_grf(phys_reg, 0);
+         R = brw_ud1_grf(grf * 2, 0);
       } else {
-         R = brw_ud1_reg(ARF, BRW_ARF_ACCUMULATOR + (phys_reg * 2), 0);
+         R = brw_ud1_reg(ARF, BRW_ARF_ACCUMULATOR + (grf * 2), 0);
       }
 
       /* Handle 3-src restrictions and vectorized uniform code. */
@@ -140,22 +123,22 @@ to_brw_reg(jay_function *f,
       unsigned stride_bits = jay_stride_to_bits(def_stride);
       unsigned simd_width = jay_simd_width_physical(f->shader, I);
 
-      unsigned phys_reg;
       if (def_stride == JAY_STRIDE_2) {
-         /* Bit 0 selects between lo/hi halves of the GPR */
-         phys_reg = (reg / 2) * jay_grf_per_gpr(f->shader);
-         offset_B = (reg & 1) * 2 * f->shader->dispatch_width;
+         /* Select between lo/hi halves of the GPR */
+         grf += reg * jay_grf_per_gpr(f->shader);
+         offset_B = hi ? 2 * f->shader->dispatch_width : 0;
       } else {
-         /* Low bits are an offset in 2-byte words into the GRF */
+         /* Treat low bits as an offset in 2-byte words into the GRF */
+         unsigned r = (reg * 2) + hi;
          unsigned mask = BITFIELD_MASK(stride_bits / 32);
-         phys_reg = ((reg & ~mask) / 2) * jay_grf_per_gpr(f->shader);
-         offset_B = (reg & mask) * 2;
+         grf += ((r & ~mask) / 2) * jay_grf_per_gpr(f->shader);
+         offset_B = (r & mask) * 2;
       }
 
       if (d.file == GPR) {
-         R = xe2_vec8_grf(phys_reg, 0);
+         R = xe2_vec8_grf(grf, 0);
       } else {
-         R = brw_vecn_reg(8, ARF, BRW_ARF_ACCUMULATOR + (phys_reg * 2), 0);
+         R = brw_vecn_reg(8, ARF, BRW_ARF_ACCUMULATOR + grf, 0);
       }
 
       R = byte_offset(R, simd_offs * simd_width * stride_bits / 8);
@@ -524,8 +507,13 @@ emit(struct brw_codegen *p,
 
    case JAY_OPCODE_SHUFFLE: {
       struct brw_reg a0 = brw_address_reg(0);
-      unsigned grf_16 = to_def_grf_16(&f->shader->partition, I->src[0]);
-      unsigned offset_B = grf_16 * 2 * f->shader->dispatch_width;
+      assert(I->src[0].file == GPR && jay_num_values(I->src[0]) == 1);
+      struct jay_register_block block =
+         jay_lookup_block(&f->shader->partition, I->src[0].reg, GPR);
+
+      unsigned offset_B =
+         (block.start_grf * 64) +
+         ((I->src[0].reg - block.start_gpr) * 4 * f->shader->dispatch_width);
 
       brw_ADD(p, a0, subscript(SRC(1), BRW_TYPE_UW, 0), brw_imm_uw(offset_B));
       brw_MOV(p, dst, retype(brw_VxH_indirect(0, 0), BRW_TYPE_UD));
diff --git a/src/intel/compiler/jay/meson.build b/src/intel/compiler/jay/meson.build
index a367de71380..d4f5f1a0353 100644
--- a/src/intel/compiler/jay/meson.build
+++ b/src/intel/compiler/jay/meson.build
@@ -62,6 +62,7 @@ libintel_compiler_jay_files = files(
   'jay_opt_dead_code.c',
   'jay_opt_predicate.c',
   'jay_opt_propagate.c',
+  'jay_partition.c',
   'jay_print.c',
   'jay_private.h',
   'jay_prog_data.c',
diff --git a/src/intel/compiler/jay/test/jay_test.h b/src/intel/compiler/jay/test/jay_test.h
index 43cc48b87ef..086129396fe 100644
--- a/src/intel/compiler/jay/test/jay_test.h
+++ b/src/intel/compiler/jay/test/jay_test.h
@@ -8,7 +8,6 @@
 #include <inttypes.h>
 #include "jay_builder.h"
 #include "jay_ir.h"
-#include "jay_private.h"
 #include "shader_enums.h"
 
 static inline jay_block *
@@ -25,13 +24,17 @@ jay_test_builder(void *memctx)
 {
    jay_shader *s = jay_new_shader(memctx, MESA_SHADER_COMPUTE);
    jay_function *f = jay_new_function(s);
-   s->partition.base8 = 8;
 
    struct intel_device_info *devinfo =
       rzalloc(memctx, struct intel_device_info);
    s->devinfo = devinfo;
    s->dispatch_width = 32;
 
+   s->partition.blocks[GPR][s->partition.nr_blocks[GPR]++] = {
+      .len_gpr = 32,
+      .stride = JAY_STRIDE_4,
+   };
+
    unsigned verx10 = 200;
    devinfo->verx10 = verx10;
    devinfo->ver = verx10 / 10;