jay: introduce accumulators into the partition

In SIMD16, map acc2/acc3 as extra GPRs. This reduces register pressure. We
leave acc0/acc1 reserved for mul_32 lowering and for parallel copy lowering;
changing this would be very challenging because SIMD1 multiplies could lead to
uniform access on the accumulator, at which point stuff blows up. But this is
an easy win on select platforms.

Note that we still use acc2/acc3 for post-RA accumulator substitution; this
just lets us also use them as panic registers.

SIMD16:
   Totals from 784 (29.62% of 2647) affected shaders:
   Instrs: 1686724 -> 1686700 (-0.00%); split: -0.15%, +0.15%
   CodeSize: 23406952 -> 23409432 (+0.01%); split: -0.16%, +0.17%
   Number of spill instructions: 224 -> 174 (-22.32%)
   Number of fill instructions: 546 -> 382 (-30.04%)

Signed-off-by: Alyssa Rosenzweig <alyssa.rosenzweig@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/42097>
This commit is contained in:
Alyssa Rosenzweig 2026-06-09 11:31:04 -04:00 committed by Marge Bot
parent 091e6976d9
commit 5e64954fe0
3 changed files with 67 additions and 17 deletions

View file

@ -723,6 +723,9 @@ enum jay_block_type {
/** A block reserved for post-RA spill lowering */
JAY_BLOCK_SPILL,
/** A block containing accumulators mapped as GPRs */
JAY_BLOCK_ACCUM,
JAY_BLOCK_TYPES,
};
@ -740,7 +743,7 @@ struct jay_register_block {
enum jay_stride stride;
/** Special feature of the block */
enum jay_block_type type:2;
enum jay_block_type type:3;
};
static_assert(sizeof(struct jay_register_block) == 8, "packed");

View file

@ -148,6 +148,9 @@ build_partition(jay_shader *shader, struct jay_partition_builder *b, unsigned n)
for (unsigned b = 0; b < p->nr_blocks[file]; ++b) {
struct jay_register_block B = p->blocks[file][b];
unsigned len_grf = (B.len_gpr * 16) / p->units_x16[file];
if (B.type == JAY_BLOCK_ACCUM) {
continue;
}
assert(len_grf > 0 && "no empty partitions");
assert(B.start_grf + len_grf <= JAY_NUM_PHYS_GRF && "GRF file size");
@ -254,6 +257,7 @@ jay_partition_grf(jay_shader *shader)
*/
unsigned grf_8 = align(instr_req.gpr[JAY_STRIDE_8], 2) * grf_per_gpr;
unsigned grf_2 = instr_req.gpr[JAY_STRIDE_2] * grf_per_gpr;
unsigned mapped_accums = grf_per_gpr == 1 ? 2 : 0;
for (unsigned spilling = 0; spilling <= 1; spilling++) {
/* There is an interdependence between partition choice and spilling,
@ -296,7 +300,7 @@ jay_partition_grf(jay_shader *shader)
nonuniform_grfs = JAY_NUM_PHYS_GRF - uniform_grfs;
/* Set the targets for the virtual register file accordingly */
shader->num_regs[GPR] = nonuniform_grfs / grf_per_gpr;
shader->num_regs[GPR] = (nonuniform_grfs / grf_per_gpr) + mapped_accums;
shader->num_regs[UGPR] = uniform_grfs * ugpr_per_grf;
/* jay_gpr_limit depends on shader->num_regs[GPR]. If we're under the
@ -335,6 +339,9 @@ jay_partition_grf(jay_shader *shader)
/* EOT */
{ UGPR, 0, eot_u, JAY_BLOCK_EOT },
{ GPR, JAY_STRIDE_4, eot_4, JAY_BLOCK_EOT },
/* Accumulator block */
{ GPR, JAY_STRIDE_4, mapped_accums * grf_per_gpr, JAY_BLOCK_ACCUM },
};
build_partition(shader, blocks, ARRAY_SIZE(blocks));
@ -368,7 +375,8 @@ jay_print_partition(struct jay_partition *p)
printf(" %u-bit", jay_stride_to_bits(B.stride));
}
const char *types[JAY_BLOCK_TYPES] = { "", " EOT", " Spill" };
const char *types[JAY_BLOCK_TYPES] = { "", " EOT", " Spill",
" Accumulator" };
printf(ANSI_ITALIC "%s" ANSI_END "\n", types[B.type]);
}
}

View file

@ -277,7 +277,7 @@ push_temp(jay_builder *b,
unsigned r = avoid_regs[0] ? (avoid_regs[1] ? 2 : 1) : 0;
file = file == UGPR ? UACCUM : ACCUM;
*backing = jay_bare_reg(file, outer ? 2 : 0);
*backing = jay_bare_reg(file, outer * 2);
/* Put accumulators down the float pipe - it's still a raw move. */
jay_def new = def_from_reg(r);
@ -301,21 +301,34 @@ pop_temp(jay_builder *b, jay_def temp, jay_def backing)
static void
mov(jay_builder *b, jay_def dst, jay_def src, struct jay_temp_regs temps)
{
jay_shader *s = b->shader;
bool split_copy = dst.file == MEM && src.file == MEM;
split_copy |= (dst.file == GPR && src.file == GPR) &&
jay_def_stride(s, dst) != jay_def_stride(s, src) &&
jay_def_stride(s, dst) != JAY_STRIDE_4 &&
jay_def_stride(s, src) != JAY_STRIDE_4;
bool acc_src = false, acc_dst = false;
if (dst.file == GPR && src.file == GPR) {
struct jay_partition *p = &b->shader->partition;
struct jay_register_block D = jay_lookup_block(p, dst.reg, GPR);
struct jay_register_block S = jay_lookup_block(p, src.reg, GPR);
acc_dst = D.type == JAY_BLOCK_ACCUM;
acc_src = S.type == JAY_BLOCK_ACCUM;
split_copy |= D.stride != S.stride &&
D.stride != JAY_STRIDE_4 &&
S.stride != JAY_STRIDE_4;
split_copy |= (acc_dst && S.stride != JAY_STRIDE_4) ||
(acc_src && D.stride != JAY_STRIDE_4);
}
if (split_copy) {
jay_def temp = jay_null(), backing = jay_null();
temp = push_temp(b, temps, GPR, false, &backing, jay_null(), jay_null());
jay_MOV(b, temp, src);
jay_MOV(b, dst, temp);
jay_MOV(b, temp, src)->type = acc_src ? JAY_TYPE_F32 : JAY_TYPE_U32;
jay_MOV(b, dst, temp)->type = acc_dst ? JAY_TYPE_F32 : JAY_TYPE_U32;
pop_temp(b, temp, backing);
} else {
jay_MOV(b, dst, src);
jay_MOV(b, dst, src)->type =
(acc_src || acc_dst) ? JAY_TYPE_F32 : JAY_TYPE_U32;
}
}
@ -563,12 +576,14 @@ is_block_compatible(struct jay_register_block block,
enum jay_file file,
enum jay_stride min_stride,
enum jay_stride max_stride,
bool eot)
bool eot,
bool allow_accum)
{
return block.type != JAY_BLOCK_SPILL &&
(file != GPR ||
(min_stride <= block.stride && block.stride <= max_stride)) &&
(!eot || block.type == JAY_BLOCK_EOT);
(!eot || block.type == JAY_BLOCK_EOT) &&
(allow_accum || block.type != JAY_BLOCK_ACCUM);
}
static jay_reg
@ -581,7 +596,7 @@ try_find_free_reg(jay_ra_state *ra,
struct jay_register_block B = ra->b.shader->partition.blocks[file][b];
if (is_block_compatible(B, file, stride4 ? JAY_STRIDE_4 : 0,
stride4 ? JAY_STRIDE_4 : ~0, false)) {
stride4 ? JAY_STRIDE_4 : ~0, false, !stride4)) {
for (unsigned i = B.start_gpr; i < B.start_gpr + B.len_gpr; ++i) {
if (BITSET_TEST(ra->available_regs[file], i) && i != except) {
@ -740,7 +755,8 @@ pick_regs(jay_ra_state *ra,
if (!BITSET_TEST_COUNT(ra->pinned[file], cur, size) &&
util_is_aligned(cur - block.start_gpr, alignment) &&
is_block_compatible(block, file, min_stride, max_stride, eot) &&
is_block_compatible(block, file, min_stride, max_stride, eot,
false) &&
cur + size <= (block.start_gpr + block.len_gpr)) {
return cur;
}
@ -784,7 +800,8 @@ pick_regs(jay_ra_state *ra,
struct jay_register_block block = partition->blocks[file][b];
if (is_block_compatible(block, file, min_stride, max_stride, eot)) {
if (is_block_compatible(block, file, min_stride, max_stride, eot,
false)) {
unsigned r = b_ == rr->block ? rr->gpr : 0;
if (affinity.repr == jay_channel(var, 0) && b_ == rr->block) {
@ -1248,6 +1265,20 @@ insert_parallel_copies_for_phis(jay_function *f)
free(phi_dsts);
}
/**
 * Retarget a GPR operand that falls inside the partition's accumulator block
 * so that it names the ACCUM file instead.
 *
 * Operands in any other file, or GPRs whose block is not JAY_BLOCK_ACCUM, are
 * left untouched. For a matching operand, the new register index is the
 * operand's offset from the start of its block, plus 2, then doubled.
 *
 * NOTE(review): the "+ 2" presumably skips acc0/acc1 and the doubling
 * presumably matches the ACCUM register numbering used elsewhere -- confirm.
 */
static void
map_gpr_to_acc(jay_shader *shader, jay_def *x)
{
   if (x->file != GPR) {
      return;
   }

   struct jay_register_block blk =
      jay_lookup_block(&shader->partition, x->reg, GPR);

   if (blk.type != JAY_BLOCK_ACCUM) {
      return;
   }

   /* Compute the index from the still-GPR-relative register before switching
    * the operand over to the accumulator file.
    */
   x->reg = ((x->reg - blk.start_gpr) + 2) * 2;
   x->file = ACCUM;
}
static void
jay_register_allocate_function(jay_function *f)
{
@ -1369,6 +1400,14 @@ jay_register_allocate_function(jay_function *f)
if (spilled) {
jay_lower_spill(f);
}
jay_foreach_inst_in_func(f, block, I) {
map_gpr_to_acc(shader, &I->dst);
jay_foreach_src(I, s) {
map_gpr_to_acc(shader, &I->src[s]);
}
}
}
void