From 5e64954fe0e6579478c7fb8ec33edfd0fec06d97 Mon Sep 17 00:00:00 2001 From: Alyssa Rosenzweig Date: Tue, 9 Jun 2026 11:31:04 -0400 Subject: [PATCH] jay: introduce accumulators into the partition In SIMD16, map acc2/acc3 as extra GPRs. This gets us a pressure reduction. We leave acc0/acc1 reserved for mul_32 lowering and for parallel copy lowering; changing this would be very challenging due to the possibility of SIMD1 multiplies leading to uniform access on the accumulator => stuff blows up. But this is an easy win on select platforms. Note we still use acc2/acc3 for post-RA accumulator substitution; this just lets us also use them as panic registers. SIMD16: Totals from 784 (29.62% of 2647) affected shaders: Instrs: 1686724 -> 1686700 (-0.00%); split: -0.15%, +0.15% CodeSize: 23406952 -> 23409432 (+0.01%); split: -0.16%, +0.17% Number of spill instructions: 224 -> 174 (-22.32%) Number of fill instructions: 546 -> 382 (-30.04%) Signed-off-by: Alyssa Rosenzweig Part-of: --- src/intel/compiler/jay/jay_ir.h | 5 +- src/intel/compiler/jay/jay_partition.c | 12 +++- .../compiler/jay/jay_register_allocate.c | 67 +++++++++++++++---- 3 files changed, 67 insertions(+), 17 deletions(-) diff --git a/src/intel/compiler/jay/jay_ir.h b/src/intel/compiler/jay/jay_ir.h index da38275ea1c..307cde7c905 100644 --- a/src/intel/compiler/jay/jay_ir.h +++ b/src/intel/compiler/jay/jay_ir.h @@ -723,6 +723,9 @@ enum jay_block_type { /** A block reserved for post-RA spill lowering */ JAY_BLOCK_SPILL, + /** A block containing accumulators mapped as GPRs */ + JAY_BLOCK_ACCUM, + JAY_BLOCK_TYPES, }; @@ -740,7 +743,7 @@ struct jay_register_block { enum jay_stride stride; /** Special feature of the block */ - enum jay_block_type type:2; + enum jay_block_type type:3; }; static_assert(sizeof(struct jay_register_block) == 8, "packed"); diff --git a/src/intel/compiler/jay/jay_partition.c b/src/intel/compiler/jay/jay_partition.c index 3a4b2b85878..fea54e72fe2 100644 --- 
a/src/intel/compiler/jay/jay_partition.c +++ b/src/intel/compiler/jay/jay_partition.c @@ -148,6 +148,9 @@ build_partition(jay_shader *shader, struct jay_partition_builder *b, unsigned n) for (unsigned b = 0; b < p->nr_blocks[file]; ++b) { struct jay_register_block B = p->blocks[file][b]; unsigned len_grf = (B.len_gpr * 16) / p->units_x16[file]; + if (B.type == JAY_BLOCK_ACCUM) { + continue; + } assert(len_grf > 0 && "no empty partitions"); assert(B.start_grf + len_grf <= JAY_NUM_PHYS_GRF && "GRF file size"); @@ -254,6 +257,7 @@ jay_partition_grf(jay_shader *shader) */ unsigned grf_8 = align(instr_req.gpr[JAY_STRIDE_8], 2) * grf_per_gpr; unsigned grf_2 = instr_req.gpr[JAY_STRIDE_2] * grf_per_gpr; + unsigned mapped_accums = grf_per_gpr == 1 ? 2 : 0; for (unsigned spilling = 0; spilling <= 1; spilling++) { /* There is an interdependence between partition choice and spilling, @@ -296,7 +300,7 @@ jay_partition_grf(jay_shader *shader) nonuniform_grfs = JAY_NUM_PHYS_GRF - uniform_grfs; /* Set the targets for the virtual register file accordingly */ - shader->num_regs[GPR] = nonuniform_grfs / grf_per_gpr; + shader->num_regs[GPR] = (nonuniform_grfs / grf_per_gpr) + mapped_accums; shader->num_regs[UGPR] = uniform_grfs * ugpr_per_grf; /* jay_gpr_limit depends on shader->num_regs[GPR]. 
If we're under the @@ -335,6 +339,9 @@ jay_partition_grf(jay_shader *shader) /* EOT */ { UGPR, 0, eot_u, JAY_BLOCK_EOT }, { GPR, JAY_STRIDE_4, eot_4, JAY_BLOCK_EOT }, + + /* Accumulator block */ + { GPR, JAY_STRIDE_4, mapped_accums * grf_per_gpr, JAY_BLOCK_ACCUM }, }; build_partition(shader, blocks, ARRAY_SIZE(blocks)); @@ -368,7 +375,8 @@ jay_print_partition(struct jay_partition *p) printf(" %u-bit", jay_stride_to_bits(B.stride)); } - const char *types[JAY_BLOCK_TYPES] = { "", " EOT", " Spill" }; + const char *types[JAY_BLOCK_TYPES] = { "", " EOT", " Spill", + " Accumulator" }; printf(ANSI_ITALIC "%s" ANSI_END "\n", types[B.type]); } } diff --git a/src/intel/compiler/jay/jay_register_allocate.c b/src/intel/compiler/jay/jay_register_allocate.c index a6135582d18..50a63c4e148 100644 --- a/src/intel/compiler/jay/jay_register_allocate.c +++ b/src/intel/compiler/jay/jay_register_allocate.c @@ -277,7 +277,7 @@ push_temp(jay_builder *b, unsigned r = avoid_regs[0] ? (avoid_regs[1] ? 2 : 1) : 0; file = file == UGPR ? UACCUM : ACCUM; - *backing = jay_bare_reg(file, outer ? 2 : 0); + *backing = jay_bare_reg(file, outer * 2); /* Put accumulators down the float pipe - it's still a raw move. 
*/ jay_def new = def_from_reg(r); @@ -301,21 +301,34 @@ pop_temp(jay_builder *b, jay_def temp, jay_def backing) static void mov(jay_builder *b, jay_def dst, jay_def src, struct jay_temp_regs temps) { - jay_shader *s = b->shader; bool split_copy = dst.file == MEM && src.file == MEM; - split_copy |= (dst.file == GPR && src.file == GPR) && - jay_def_stride(s, dst) != jay_def_stride(s, src) && - jay_def_stride(s, dst) != JAY_STRIDE_4 && - jay_def_stride(s, src) != JAY_STRIDE_4; + bool acc_src = false, acc_dst = false; + + if (dst.file == GPR && src.file == GPR) { + struct jay_partition *p = &b->shader->partition; + struct jay_register_block D = jay_lookup_block(p, dst.reg, GPR); + struct jay_register_block S = jay_lookup_block(p, src.reg, GPR); + + acc_dst = D.type == JAY_BLOCK_ACCUM; + acc_src = S.type == JAY_BLOCK_ACCUM; + + split_copy |= D.stride != S.stride && + D.stride != JAY_STRIDE_4 && + S.stride != JAY_STRIDE_4; + + split_copy |= (acc_dst && S.stride != JAY_STRIDE_4) || + (acc_src && D.stride != JAY_STRIDE_4); + } if (split_copy) { jay_def temp = jay_null(), backing = jay_null(); temp = push_temp(b, temps, GPR, false, &backing, jay_null(), jay_null()); - jay_MOV(b, temp, src); - jay_MOV(b, dst, temp); + jay_MOV(b, temp, src)->type = acc_src ? JAY_TYPE_F32 : JAY_TYPE_U32; + jay_MOV(b, dst, temp)->type = acc_dst ? JAY_TYPE_F32 : JAY_TYPE_U32; pop_temp(b, temp, backing); } else { - jay_MOV(b, dst, src); + jay_MOV(b, dst, src)->type = + (acc_src || acc_dst) ? 
JAY_TYPE_F32 : JAY_TYPE_U32; } } @@ -563,12 +576,14 @@ is_block_compatible(struct jay_register_block block, enum jay_file file, enum jay_stride min_stride, enum jay_stride max_stride, - bool eot) + bool eot, + bool allow_accum) { return block.type != JAY_BLOCK_SPILL && (file != GPR || (min_stride <= block.stride && block.stride <= max_stride)) && - (!eot || block.type == JAY_BLOCK_EOT); + (!eot || block.type == JAY_BLOCK_EOT) && + (allow_accum || block.type != JAY_BLOCK_ACCUM); } static jay_reg @@ -581,7 +596,7 @@ try_find_free_reg(jay_ra_state *ra, struct jay_register_block B = ra->b.shader->partition.blocks[file][b]; if (is_block_compatible(B, file, stride4 ? JAY_STRIDE_4 : 0, - stride4 ? JAY_STRIDE_4 : ~0, false)) { + stride4 ? JAY_STRIDE_4 : ~0, false, !stride4)) { for (unsigned i = B.start_gpr; i < B.start_gpr + B.len_gpr; ++i) { if (BITSET_TEST(ra->available_regs[file], i) && i != except) { @@ -740,7 +755,8 @@ pick_regs(jay_ra_state *ra, if (!BITSET_TEST_COUNT(ra->pinned[file], cur, size) && util_is_aligned(cur - block.start_gpr, alignment) && - is_block_compatible(block, file, min_stride, max_stride, eot) && + is_block_compatible(block, file, min_stride, max_stride, eot, + false) && cur + size <= (block.start_gpr + block.len_gpr)) { return cur; } @@ -784,7 +800,8 @@ pick_regs(jay_ra_state *ra, struct jay_register_block block = partition->blocks[file][b]; - if (is_block_compatible(block, file, min_stride, max_stride, eot)) { + if (is_block_compatible(block, file, min_stride, max_stride, eot, + false)) { unsigned r = b_ == rr->block ? 
rr->gpr : 0; if (affinity.repr == jay_channel(var, 0) && b_ == rr->block) { @@ -1248,6 +1265,20 @@ insert_parallel_copies_for_phis(jay_function *f) free(phi_dsts); } +static void +map_gpr_to_acc(jay_shader *shader, jay_def *x) +{ + if (x->file == GPR) { + struct jay_register_block B = + jay_lookup_block(&shader->partition, x->reg, GPR); + + if (B.type == JAY_BLOCK_ACCUM) { + x->file = ACCUM; + x->reg = (2 + (x->reg - B.start_gpr)) * 2; + } + } +} + static void jay_register_allocate_function(jay_function *f) { @@ -1369,6 +1400,14 @@ jay_register_allocate_function(jay_function *f) if (spilled) { jay_lower_spill(f); } + + jay_foreach_inst_in_func(f, block, I) { + map_gpr_to_acc(shader, &I->dst); + + jay_foreach_src(I, s) { + map_gpr_to_acc(shader, &I->src[s]); + } + } } void