jay: introduce accumulators into the partition

In SIMD16, map acc2/acc3 as extra GPRs. This gets us a pressure reduction. We leave acc0/acc1 reserved for mul_32 lowering and for parallel copy lowering; changing this would be very challenging due to the possibility of SIMD1 multiplies leading to uniform access on the accumulator => stuff blows up. But this is an easy win on select platforms. Note we still use acc2/acc3 for post-RA accumulator substitution; this just lets us also use them as panic registers.

SIMD16:
Totals from 784 (29.62% of 2647) affected shaders:
Instrs: 1686724 -> 1686700 (-0.00%); split: -0.15%, +0.15%
CodeSize: 23406952 -> 23409432 (+0.01%); split: -0.16%, +0.17%
Number of spill instructions: 224 -> 174 (-22.32%)
Number of fill instructions: 546 -> 382 (-30.04%)

Signed-off-by: Alyssa Rosenzweig <alyssa.rosenzweig@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/42097>
2026-06-21 16:48:22 +02:00 · 2026-06-09 11:31:04 -04:00 · 2026-06-09 11:31:04 -04:00 · 5e64954fe0
commit 5e64954fe0
parent 091e6976d9
3 changed files with 67 additions and 17 deletions
--- a/src/intel/compiler/jay/jay_ir.h
+++ b/src/intel/compiler/jay/jay_ir.h
@ -723,6 +723,9 @@ enum jay_block_type {
   /** A block reserved for post-RA spill lowering */
   JAY_BLOCK_SPILL,

+   /** A block containing accumulators mapped as GPRs */
+   JAY_BLOCK_ACCUM,
+
   JAY_BLOCK_TYPES,
 };

@ -740,7 +743,7 @@ struct jay_register_block {
   enum jay_stride stride;

   /** Special feature of the block */
-   enum jay_block_type type:2;
+   enum jay_block_type type:3;
 };
 static_assert(sizeof(struct jay_register_block) == 8, "packed");

--- a/src/intel/compiler/jay/jay_partition.c
+++ b/src/intel/compiler/jay/jay_partition.c
@ -148,6 +148,9 @@ build_partition(jay_shader *shader, struct jay_partition_builder *b, unsigned n)
      for (unsigned b = 0; b < p->nr_blocks[file]; ++b) {
         struct jay_register_block B = p->blocks[file][b];
         unsigned len_grf = (B.len_gpr * 16) / p->units_x16[file];
+         if (B.type == JAY_BLOCK_ACCUM) {
+            continue;
+         }

         assert(len_grf > 0 && "no empty partitions");
         assert(B.start_grf + len_grf <= JAY_NUM_PHYS_GRF && "GRF file size");
@ -254,6 +257,7 @@ jay_partition_grf(jay_shader *shader)
    */
   unsigned grf_8 = align(instr_req.gpr[JAY_STRIDE_8], 2) * grf_per_gpr;
   unsigned grf_2 = instr_req.gpr[JAY_STRIDE_2] * grf_per_gpr;
+   unsigned mapped_accums = grf_per_gpr == 1 ? 2 : 0;

   for (unsigned spilling = 0; spilling <= 1; spilling++) {
      /* There is an interdependence between partition choice and spilling,
@ -296,7 +300,7 @@ jay_partition_grf(jay_shader *shader)
      nonuniform_grfs = JAY_NUM_PHYS_GRF - uniform_grfs;

      /* Set the targets for the virtual register file accordingly */
-      shader->num_regs[GPR] = nonuniform_grfs / grf_per_gpr;
+      shader->num_regs[GPR] = (nonuniform_grfs / grf_per_gpr) + mapped_accums;
      shader->num_regs[UGPR] = uniform_grfs * ugpr_per_grf;

      /* jay_gpr_limit depends on shader->num_regs[GPR]. If we're under the
@ -335,6 +339,9 @@ jay_partition_grf(jay_shader *shader)
      /* EOT */
      { UGPR, 0, eot_u, JAY_BLOCK_EOT },
      { GPR, JAY_STRIDE_4, eot_4, JAY_BLOCK_EOT },
+
+      /* Accumulator block */
+      { GPR, JAY_STRIDE_4, mapped_accums * grf_per_gpr, JAY_BLOCK_ACCUM },
   };

   build_partition(shader, blocks, ARRAY_SIZE(blocks));
@ -368,7 +375,8 @@ jay_print_partition(struct jay_partition *p)
            printf("  %u-bit", jay_stride_to_bits(B.stride));
         }

-         const char *types[JAY_BLOCK_TYPES] = { "", " EOT", " Spill" };
+         const char *types[JAY_BLOCK_TYPES] = { "", " EOT", " Spill",
+                                                " Accumulator" };
         printf(ANSI_ITALIC "%s" ANSI_END "\n", types[B.type]);
      }
   }
--- a/src/intel/compiler/jay/jay_register_allocate.c
+++ b/src/intel/compiler/jay/jay_register_allocate.c
@ -277,7 +277,7 @@ push_temp(jay_builder *b,
   unsigned r = avoid_regs[0] ? (avoid_regs[1] ? 2 : 1) : 0;

   file = file == UGPR ? UACCUM : ACCUM;
-   *backing = jay_bare_reg(file, outer ? 2 : 0);
+   *backing = jay_bare_reg(file, outer * 2);

   /* Put accumulators down the float pipe - it's still a raw move. */
   jay_def new = def_from_reg(r);
@ -301,21 +301,34 @@ pop_temp(jay_builder *b, jay_def temp, jay_def backing)
 static void
 mov(jay_builder *b, jay_def dst, jay_def src, struct jay_temp_regs temps)
 {
-   jay_shader *s = b->shader;
   bool split_copy = dst.file == MEM && src.file == MEM;
-   split_copy |= (dst.file == GPR && src.file == GPR) &&
-                 jay_def_stride(s, dst) != jay_def_stride(s, src) &&
-                 jay_def_stride(s, dst) != JAY_STRIDE_4 &&
-                 jay_def_stride(s, src) != JAY_STRIDE_4;
+   bool acc_src = false, acc_dst = false;
+
+   if (dst.file == GPR && src.file == GPR) {
+      struct jay_partition *p = &b->shader->partition;
+      struct jay_register_block D = jay_lookup_block(p, dst.reg, GPR);
+      struct jay_register_block S = jay_lookup_block(p, src.reg, GPR);
+
+      acc_dst = D.type == JAY_BLOCK_ACCUM;
+      acc_src = S.type == JAY_BLOCK_ACCUM;
+
+      split_copy |= D.stride != S.stride &&
+                    D.stride != JAY_STRIDE_4 &&
+                    S.stride != JAY_STRIDE_4;
+
+      split_copy |= (acc_dst && S.stride != JAY_STRIDE_4) ||
+                    (acc_src && D.stride != JAY_STRIDE_4);
+   }

   if (split_copy) {
      jay_def temp = jay_null(), backing = jay_null();
      temp = push_temp(b, temps, GPR, false, &backing, jay_null(), jay_null());
-      jay_MOV(b, temp, src);
-      jay_MOV(b, dst, temp);
+      jay_MOV(b, temp, src)->type = acc_src ? JAY_TYPE_F32 : JAY_TYPE_U32;
+      jay_MOV(b, dst, temp)->type = acc_dst ? JAY_TYPE_F32 : JAY_TYPE_U32;
      pop_temp(b, temp, backing);
   } else {
-      jay_MOV(b, dst, src);
+      jay_MOV(b, dst, src)->type =
+         (acc_src || acc_dst) ? JAY_TYPE_F32 : JAY_TYPE_U32;
   }
 }

@ -563,12 +576,14 @@ is_block_compatible(struct jay_register_block block,
                    enum jay_file file,
                    enum jay_stride min_stride,
                    enum jay_stride max_stride,
-                    bool eot)
+                    bool eot,
+                    bool allow_accum)
 {
   return block.type != JAY_BLOCK_SPILL &&
          (file != GPR ||
           (min_stride <= block.stride && block.stride <= max_stride)) &&
-          (!eot || block.type == JAY_BLOCK_EOT);
+          (!eot || block.type == JAY_BLOCK_EOT) &&
+          (allow_accum || block.type != JAY_BLOCK_ACCUM);
 }

 static jay_reg
@ -581,7 +596,7 @@ try_find_free_reg(jay_ra_state *ra,
      struct jay_register_block B = ra->b.shader->partition.blocks[file][b];

      if (is_block_compatible(B, file, stride4 ? JAY_STRIDE_4 : 0,
-                              stride4 ? JAY_STRIDE_4 : ~0, false)) {
+                              stride4 ? JAY_STRIDE_4 : ~0, false, !stride4)) {

         for (unsigned i = B.start_gpr; i < B.start_gpr + B.len_gpr; ++i) {
            if (BITSET_TEST(ra->available_regs[file], i) && i != except) {
@ -740,7 +755,8 @@ pick_regs(jay_ra_state *ra,

      if (!BITSET_TEST_COUNT(ra->pinned[file], cur, size) &&
          util_is_aligned(cur - block.start_gpr, alignment) &&
-          is_block_compatible(block, file, min_stride, max_stride, eot) &&
+          is_block_compatible(block, file, min_stride, max_stride, eot,
+                              false) &&
          cur + size <= (block.start_gpr + block.len_gpr)) {
         return cur;
      }
@ -784,7 +800,8 @@ pick_regs(jay_ra_state *ra,

      struct jay_register_block block = partition->blocks[file][b];

-      if (is_block_compatible(block, file, min_stride, max_stride, eot)) {
+      if (is_block_compatible(block, file, min_stride, max_stride, eot,
+                              false)) {
         unsigned r = b_ == rr->block ? rr->gpr : 0;

         if (affinity.repr == jay_channel(var, 0) && b_ == rr->block) {
@ -1248,6 +1265,20 @@ insert_parallel_copies_for_phis(jay_function *f)
   free(phi_dsts);
 }

+static void
+map_gpr_to_acc(jay_shader *shader, jay_def *x)
+{ /* Rewrite *x in place if it is a GPR from the accumulator block. */
+   if (x->file == GPR) { /* defs in any other file are left untouched */
+      struct jay_register_block B =
+         jay_lookup_block(&shader->partition, x->reg, GPR);
+
+      if (B.type == JAY_BLOCK_ACCUM) {
+         x->file = ACCUM;
+         /* Block offsets 0, 1 become ACCUM regs 4, 6 - presumably acc2/acc3
+          * with ACCUM regs numbered in steps of 2, TODO confirm.
+          */
+         x->reg = (2 + (x->reg - B.start_gpr)) * 2;
+      }
+   }
+}
+
 static void
 jay_register_allocate_function(jay_function *f)
 {
@ -1369,6 +1400,14 @@ jay_register_allocate_function(jay_function *f)
   if (spilled) {
      jay_lower_spill(f);
   }
+
+   jay_foreach_inst_in_func(f, block, I) {
+      map_gpr_to_acc(shader, &I->dst);
+
+      jay_foreach_src(I, s) {
+         map_gpr_to_acc(shader, &I->src[s]);
+      }
+   }
 }

 void