jay: assign accumulators post-RA

Greedy post-RA substitution pass, similar to IGC's AccSubstitution pass.
Stats are taken together with the previous commits:

SIMD16:

   Totals from 2209 (83.45% of 2647) affected shaders:
   Instrs: 2701029 -> 2696350 (-0.17%)
   CodeSize: 39166720 -> 40372272 (+3.08%); split: -0.36%, +3.44%

SIMD32:

   Totals from 2211 (83.53% of 2647) affected shaders:
   Instrs: 4691165 -> 4641188 (-1.07%)
   CodeSize: 69365792 -> 69341616 (-0.03%); split: -0.50%, +0.47%

The instruction count reduction is from RA shuffle code getting coalesced via
accumulators. The code size changes are from:

* Fewer moves from the instr count reduction (helped)
* Smaller MADs encoded as MACs (helped)
* Fewer SYNC.nop due to fewer scoreboarding annotations (helped)
* Less compaction due to explicit accumulator operands (hurt)

I expect significant cycle count changes from this, but we don't have a cycle
model wired up yet, so reading the assembly will have to do.
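
As a rough illustration of the MAD->MAC rewrite (hypothetical syntax, not
exact disassembly): a MAD whose last source ended up in acc0, e.g.

   mad r10, r11, r12, acc0

can be re-encoded as

   mac r10, r11, r12

with the acc0 source left implicit, which is what allows the smaller
encoding.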

Signed-off-by: Alyssa Rosenzweig <alyssa.rosenzweig@intel.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/41398>
Alyssa Rosenzweig 2026-04-30 09:39:30 -04:00 committed by Marge Bot
parent 8b324591d1
commit e4dc161277
5 changed files with 365 additions and 0 deletions


@@ -0,0 +1,359 @@
/*
* Copyright 2026 Intel Corporation
* Copyright 2025 Valve Corporation
* Copyright 2019-2022 Collabora, Ltd.
* SPDX-License-Identifier: MIT
*/
#include "util/bitset.h"
#include "util/ralloc.h"
#include "util/u_dynarray.h"
#include "util/u_worklist.h"
#include "jay_ir.h"
#include "jay_opcodes.h"
#include "jay_private.h"
#define JAY_MAX_ACCUMS 4
static void
postra_liveness_ins(BITSET_WORD *live, jay_inst *I)
{
if (I->dst.file == GPR && !I->predication) {
BITSET_CLEAR_COUNT(live, I->dst.reg, jay_num_values(I->dst));
}
jay_foreach_src(I, s) {
if (I->src[s].file == GPR) {
BITSET_SET_COUNT(live, I->src[s].reg, jay_num_values(I->src[s]));
}
}
}
/*
* Globally, liveness analysis uses a fixed-point algorithm based on a
* worklist. We initialize a work list with the exit block. We iterate the work
* list to compute live_in from live_out for each block on the work list,
* adding the predecessors of the block to the work list if we made progress.
*/
static void
postra_liveness(jay_function *func)
{
u_worklist worklist;
u_worklist_init(&worklist, func->num_blocks, NULL);
jay_foreach_block(func, block) {
BITSET_ZERO(block->postra_gpr_live_in);
BITSET_ZERO(block->postra_gpr_live_out);
jay_worklist_push_tail(&worklist, block);
}
while (!u_worklist_is_empty(&worklist)) {
/* Pop off in reverse order since liveness is backwards */
jay_block *blk = jay_worklist_pop_tail(&worklist);
/* Calculate liveness locally */
jay_foreach_successor(blk, succ, GPR) {
BITSET_OR(blk->postra_gpr_live_out, blk->postra_gpr_live_out,
succ->postra_gpr_live_in);
}
BITSET_DECLARE(live, JAY_NUM_PHYS_GRF);
memcpy(live, blk->postra_gpr_live_out, sizeof(live));
jay_foreach_inst_in_block_rev(blk, ins) {
postra_liveness_ins(live, ins);
}
/* If we made progress, we need to reprocess the predecessors */
if (!BITSET_EQUAL(blk->postra_gpr_live_in, live)) {
memcpy(blk->postra_gpr_live_in, live, sizeof(live));
jay_foreach_predecessor(blk, pred, GPR) {
jay_worklist_push_head(&worklist, *pred);
}
}
}
u_worklist_fini(&worklist);
}
/*
* Check whether a source is killed by the instruction: if the register is dead
* after this instruction, this is its last use. That also covers the case where
* the register is overwritten by this very instruction, which does not show up
* in the liveness set.
*/
static bool
source_killed(BITSET_WORD *live, const jay_inst *I, unsigned s)
{
return !BITSET_TEST(live, I->src[s].reg) ||
(I->dst.file == GPR &&
I->src[s].reg >= I->dst.reg &&
(I->src[s].reg - I->dst.reg) < jay_num_values(I->dst));
}
/* We assign accumulators with a simple heuristic: promote registers with the
* shortest live range. This is pretty naive but it is well-motivated:
*
* 1. Short live ranges reduce interference with other potentially promotable
*    registers, allowing for more overall accumulator usage. This is a built-in
*    defense against being too greedy.
*
* 2. Short live ranges necessarily have the first read of the register shortly
* after the write. That situation benefits greatly from promoting to an
* accumulator as such sequences are GRF latency bound.
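*
* Concretely, a value written at ip 10 and last read at ip 12 (range 2) is
* tried for promotion before one written at ip 3 and last read at ip 40
* (range 37).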
*
* There are lots of ways to do better in the future, but this is good for now.
*/
struct candidate {
uint32_t def_ip, last_use_ip;
};
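/* A candidate's score is the length of its live range in instructions; smaller
* is better, so the shortest ranges sort first.
*/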
static int
score(struct candidate c)
{
assert(c.def_ip < c.last_use_ip);
return (int) (c.last_use_ip - c.def_ip);
}
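/* qsort comparator: ascending by score. The (l > r) - (l < r) idiom yields
* -1/0/1 without the overflow risk of returning l - r directly.
*/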
static int
cmp_candidates(const void *left_, const void *right_)
{
const struct candidate *left = left_;
const struct candidate *right = right_;
int l = score(*left), r = score(*right);
return (l > r) - (l < r);
}
/*
* Query whether an instruction can access accumulators. Comments are quoted
* from bspec 56619 as the rules are complex.
*/
static inline bool
can_access_accum(jay_shader *shader, jay_inst *I, signed src)
{
/* "No Accumulator usage for Control Flow, Math, Send, DPAS instructions." */
if (jay_op_is_control_flow(I->op) ||
I->op == JAY_OPCODE_MATH ||
I->op == JAY_OPCODE_SEND) {
return false;
}
/* TODO: Many, many more restrictions on non-f32 */
if (I->type != JAY_TYPE_F32) {
return false;
}
/* "When destination is accumulator with offset 0, destination horizontal
* stride must be 1."
*/
if (I->dst.file == GPR && jay_def_stride(shader, I->dst) != JAY_STRIDE_4) {
return false;
}
/* "Register Regioning patterns where register data bit locations are changed
* between source and destination are not supported when an accumulator is
* used as an implicit source or an explicit source in an instruction."
*/
jay_foreach_src(I, s) {
if (I->src[s].file == GPR &&
jay_def_stride(shader, I->src[s]) != JAY_STRIDE_4) {
return false;
}
}
/* Jay's predication requires tying the destination to the source, which is
* too complicated to model here. It's also only dubiously useful.
*/
if (src < 0 && I->predication) {
return false;
}
/* This copies only part of a GRF, so it can't be promoted to an accumulator */
if (I->op == JAY_OPCODE_DESWIZZLE_EVEN) {
return false;
}
return true;
}
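/* Rewrite a GPR operand to the chosen virtual accumulator. acc_p1 is a 1-based
* accumulator number (0 means "not promoted"), matching the ra[] map built in
* the pass below.
*/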
static inline void
substitute_acc(jay_def *x, unsigned acc_p1)
{
if (acc_p1) {
assert(x->file == GPR && (acc_p1 - 1) < JAY_MAX_ACCUMS);
x->file = ACCUM;
x->reg = (acc_p1 - 1) * 2;
}
}
static void
pass(jay_function *func)
{
void *memctx = ralloc_context(NULL);
void *linctx = linear_context(memctx);
/* Analyze the shader globally */
postra_liveness(func);
struct util_dynarray candidates = UTIL_DYNARRAY_INIT;
/* Find the longest block so we can size our allocations & count IPs */
uint32_t ip_bound = 0;
jay_foreach_block(func, block) {
ip_bound = MAX2(ip_bound, list_length(&block->instructions) + 1);
}
/* in_use[acc][IP] set if acc is in-use /before/ executing instruction IP */
BITSET_WORD *in_use[JAY_MAX_ACCUMS];
unsigned nr_accums = func->shader->dispatch_width == 32 ? 2 : 4;
for (unsigned i = 0; i < nr_accums; ++i) {
in_use[i] = BITSET_LINEAR_ZALLOC(linctx, ip_bound);
}
/* acc+1 if the instruction writes acc, 0 if no accumulator written */
uint8_t *ra = linear_zalloc_array(linctx, uint8_t, ip_bound);
jay_foreach_block(func, block) {
util_dynarray_clear(&candidates);
/* Live-set at each point in the program */
BITSET_DECLARE(live, JAY_NUM_PHYS_GRF);
memcpy(live, block->postra_gpr_live_out, sizeof(live));
uint32_t ip = ip_bound;
uint32_t last_use_ip[JAY_NUM_PHYS_GRF] = { 0 };
uint32_t pre_live = 0;
jay_foreach_inst_in_block_rev(block, I) {
--ip;
assert(ip > 0 && "invariant");
/* Collect candidates */
if (I->dst.file == GPR && last_use_ip[I->dst.reg]) {
if (can_access_accum(func->shader, I, -1)) {
struct candidate c = { ip, last_use_ip[I->dst.reg] };
util_dynarray_append(&candidates, c);
}
last_use_ip[I->dst.reg] = 0;
}
if (I->dst.file == ACCUM || I->dst.file == UACCUM) {
pre_live &= ~BITFIELD_BIT(I->dst.reg / 2);
}
jay_foreach_src(I, s) {
if (I->src[s].file == GPR && source_killed(live, I, s)) {
last_use_ip[I->src[s].reg] = ip;
}
}
/* Prune candidates (in a second loop in case of duplicated sources) */
jay_foreach_src(I, s) {
if (I->src[s].file == GPR &&
!can_access_accum(func->shader, I, s)) {
jay_foreach_comp(I->src[s], c) {
last_use_ip[I->src[s].reg + c] = 0;
}
}
if (I->src[s].file == ACCUM || I->src[s].file == UACCUM) {
pre_live |= BITFIELD_BIT(I->src[s].reg / 2);
}
}
u_foreach_bit(i, pre_live) {
BITSET_SET(in_use[i], ip);
}
/* Implicit use of the integer accumulator acc0 corrupts acc0/acc1,
* which corresponds to virtual acc0 in SIMD32 mode (a pair) or virtual
* acc0/acc1 in SIMD16 (two registers). Model interference.
*/
if (I->op == JAY_OPCODE_MUL_32) {
unsigned n = func->shader->dispatch_width < 32 ? 2 : 1;
for (unsigned i = 0; i < n; ++i) {
BITSET_SET(in_use[i], ip);
}
}
postra_liveness_ins(live, I);
}
qsort(candidates.data,
util_dynarray_num_elements(&candidates, struct candidate),
sizeof(struct candidate), cmp_candidates);
/* Greedily assign candidates */
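/* An accumulator is free for a candidate if it is unused over
* (def_ip, last_use_ip]; the +1 is because in_use[][ip] is sampled before
* executing instruction ip, so the defining instruction itself is excluded.
*/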
util_dynarray_foreach(&candidates, struct candidate, c) {
for (unsigned i = 0; i < nr_accums; ++i) {
if (!BITSET_TEST_RANGE(in_use[i], c->def_ip + 1, c->last_use_ip)) {
BITSET_SET_RANGE(in_use[i], c->def_ip + 1, c->last_use_ip);
ra[c->def_ip] = i + 1;
break;
}
}
}
uint32_t min_ip = ip;
uint8_t gpr_to_acc_p1[JAY_NUM_PHYS_GRF] = { 0 };
jay_foreach_inst_in_block_safe(block, I) {
/* Rewrite operands using accumulators */
jay_foreach_src(I, s) {
if (I->src[s].file == GPR) {
substitute_acc(&I->src[s], gpr_to_acc_p1[I->src[s].reg]);
}
}
if (I->dst.file == GPR) {
jay_foreach_comp(I->dst, c) {
gpr_to_acc_p1[I->dst.reg + c] = ra[ip];
}
substitute_acc(&I->dst, ra[ip]);
}
/* Rewrite MAD->MAC where possible to improve code density.
*
* The bspec says "Instructions that specify an implicit accumulator
* source cannot specify an explicit accumulator source operand.". But
* it works fine on Lunar Lake so ¯\_(ツ)_/¯
*/
if ((I->op == JAY_OPCODE_MAD && I->type == JAY_TYPE_F32) &&
(I->src[2].file == ACCUM && I->src[2].reg == 0) &&
!(I->src[2].negate || I->src[2].abs)) {
I->op = JAY_OPCODE_MAC;
}
/* Sometimes this algorithm turns nontrivial GPR->GPR copies into
* trivial accumulator->accumulator copies, which can be coalesced now.
*/
if (I->op == JAY_OPCODE_MOV && jay_regs_equal(I->dst, I->src[0])) {
jay_remove_instruction(I);
}
++ip;
}
assert(ip == ip_bound);
/* Zero per-block allocation */
for (unsigned i = 0; i < nr_accums; ++i) {
BITSET_CLEAR_RANGE(in_use[i], min_ip, ip);
}
memset(ra + min_ip, 0, (ip - min_ip) * sizeof(*ra));
}
util_dynarray_fini(&candidates);
ralloc_free(memctx);
}
JAY_DEFINE_FUNCTION_PASS(jay_assign_accumulators, pass)


@@ -2686,6 +2686,7 @@ jay_compile(const struct intel_device_info *devinfo,
if (!(jay_debug & JAY_DBG_NOOPT)) {
JAY_PASS(s, jay_opt_predicate);
JAY_PASS(s, jay_assign_accumulators);
}
JAY_PASS(s, jay_lower_scoreboard);


@@ -1072,6 +1072,9 @@ typedef struct jay_block {
struct u_sparse_bitset live_in;
struct u_sparse_bitset live_out;
BITSET_DECLARE(postra_gpr_live_in, JAY_NUM_PHYS_GRF);
BITSET_DECLARE(postra_gpr_live_out, JAY_NUM_PHYS_GRF);
/**
* After register allocation but before going out-of-SSA, registers that
* are free at the logical end of the block (before phi_src). These will


@@ -39,6 +39,7 @@ void jay_spill(jay_function *func, unsigned limit);
void jay_partition_grf(jay_shader *shader);
void jay_register_allocate(jay_shader *s);
void jay_assign_flags(jay_shader *s);
void jay_assign_accumulators(jay_shader *s);
void jay_repair_ssa(jay_function *func);
const char *jay_file_prefix(enum jay_file file);


@@ -47,6 +47,7 @@ jay_nir_algebraic = custom_target(
libintel_compiler_jay_files = files(
'jay.h',
'jay_assign_accumulators.c',
'jay_assign_flags.c',
'jay_from_nir.c',
'jay_ir.h',