jay/partition: pick better partitions

look at what the program actually does instead of hardcoding a worst-case.

SIMD16:
   Totals from 1965 (74.23% of 2647) affected shaders:
   Instrs: 2603230 -> 2539932 (-2.43%); split: -3.44%, +1.01%
   CodeSize: 38826160 -> 37811904 (-2.61%); split: -3.59%, +0.97%
   Number of spill instructions: 1206 -> 555 (-53.98%)
   Number of fill instructions: 1194 -> 551 (-53.85%)

SIMD32:
   Totals from 1974 (74.57% of 2647) affected shaders:
   Instrs: 3998126 -> 3033333 (-24.13%); split: -24.18%, +0.05%
   CodeSize: 59563952 -> 45580448 (-23.48%); split: -23.52%, +0.05%
   Number of spill instructions: 43534 -> 37471 (-13.93%); split: -13.97%, +0.04%
   Number of fill instructions: 43118 -> 36412 (-15.55%)

Signed-off-by: Alyssa Rosenzweig <alyssa.rosenzweig@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/41872>
This commit is contained in:
Alyssa Rosenzweig 2026-05-28 11:05:30 -04:00 committed by Marge Bot
parent 7221ef0230
commit c9eb9ea52a

View file

@ -3,19 +3,75 @@
* SPDX-License-Identifier: MIT
*/
#include "util/u_math.h"
#include "jay_ir.h"
#include "jay_private.h"
/*
* jay_partition_grf partitions the register file for the entire shader,
* satisfying functional and performance rules. The partition is specified in a
* convenient form within this file, as a flat array of jay_partition_builder
* structs, which build_partition translates to the more complicated
* jay_partition structs.
* In addition to having enough total registers globally, the partition needs to
* have enough contiguous registers in each file/stride to allocate any single
* instruction in isolation. analyze_per_inst calculates bounds on the numbers
* of required contiguous registers.
*
* All functions must share the same partition for correctness with non-uniform
* function calls. For unlinked library functions, we must use the ABI
* partition (TODO).
* This analysis is overly conservative, implicitly assuming that all operands
* of a given file/stride must be contiguous. This could be improved with a lot
* more bookkeeping, but it's unclear if it matters much in practice.
*
* Note the util_next_power_of_two below is critical for correctness. Although
* RA handles non-power-of-two vectors, it aligns vectors to the next
* power-of-two size. That doesn't affect global register demand, but it needs
* to be reflected in our per-instruction minimum.
*
* As an example, an instruction with sources (vec3, vec2) requires at least 6
* contiguous registers to satisfy RA, since the vec3 gets vec4 alignment and
* the vec2 gets vec2 alignment. In principal, future RA improvements might
* relax this, but we're not there yet.
*/
struct instruction_req {
unsigned gpr[JAY_NUM_STRIDES];
unsigned ugpr;
};
static struct instruction_req
analyze_per_inst(jay_shader *shader)
{
struct instruction_req global = { 0 };
jay_foreach_inst_in_shader(shader, func, I) {
struct instruction_req local = { 0 };
for (int i = -1; i < I->num_srcs; ++i) {
jay_def x = i >= 0 ? I->src[i] : I->dst;
if (!jay_is_null(x)) {
unsigned size = util_next_power_of_two(jay_num_values(x));
if (x.file == UGPR) {
local.ugpr += size;
} else if (x.file == GPR && i >= 0) {
enum jay_stride min_stride = jay_src_stride_minmax(I, i, false);
local.gpr[min_stride] += size;
} else if (x.file == GPR) {
enum jay_stride min_stride = jay_dst_stride_minmax(I, false);
local.gpr[min_stride] += size;
}
}
}
/* Take the worst-case for each block */
global.ugpr = MAX2(global.ugpr, local.ugpr);
for (unsigned i = 0; i < ARRAY_SIZE(global.gpr); ++i) {
global.gpr[i] = MAX2(global.gpr[i], local.gpr[i]);
}
}
return global;
}
/*
* The partition is specified in a convenient form within jay_partition_grf(),
* as a flat array of jay_partition_builder structs, which build_partition
* translates to the more complicated jay_partition structs.
*/
struct jay_partition_builder {
enum jay_file file;
@ -100,72 +156,20 @@ build_partition(jay_shader *shader, struct jay_partition_builder *b, unsigned n)
assert(BITSET_COUNT(regs) == JAY_NUM_PHYS_GRF && "all GRFs mapped");
}
/*
* Partition the register file for the entire shader.
*
* All functions must share the same partition for correctness with non-uniform
* function calls. For unlinked library functions, we must use the ABI
* partition (TODO).
*/
void
jay_partition_grf(jay_shader *shader)
{
/* Calculate the maximum register demand across all functions in the shader.
* We will use this to choose a good partition.
*/
unsigned demand[JAY_NUM_GRF_FILES] = { 0 };
jay_foreach_function(shader, f) {
jay_compute_liveness(f);
jay_calculate_register_demands(f);
demand[GPR] = MAX2(demand[GPR], f->demand[GPR]);
demand[UGPR] = MAX2(demand[UGPR], f->demand[UGPR]);
}
/* We must have enough register file space for the register payload, plus the
* reserved UGPRs in the case we spill. That UGPR interferes with everything
* we preload so it needs to be reserved specially here for the worst case.
*/
jay_foreach_preload(jay_shader_get_entrypoint(shader), I) {
unsigned end = jay_preload_reg(I) + jay_num_values(I->dst);
unsigned extra = I->dst.file == UGPR ? shader->dispatch_width : 0;
assert(I->dst.file < JAY_NUM_GRF_FILES);
demand[I->dst.file] = MAX2(demand[I->dst.file], end + extra);
}
/* Determine a good GPR/UGPR split informed by the demand calculation */
unsigned grf_per_gpr = jay_grf_per_gpr(shader);
unsigned ugpr_per_grf = jay_ugpr_per_grf(shader);
unsigned uniform_grfs = DIV_ROUND_UP(demand[UGPR], ugpr_per_grf);
/* We must have enough for SIMD1 images (TODO: Check if this actually
* applies. Or if we could eliminate this with smarter partitioning even.)
*/
unsigned min_ugprs = 16;
min_ugprs = MAX2(min_ugprs, 256);
/* TODO: We could partition more cleverly */
uniform_grfs = align(uniform_grfs, grf_per_gpr);
uniform_grfs = CLAMP(uniform_grfs, DIV_ROUND_UP(min_ugprs, ugpr_per_grf),
128 - (32 * grf_per_gpr));
unsigned nonuniform_grfs = JAY_NUM_PHYS_GRF - uniform_grfs;
/* Check the split */
assert((uniform_grfs * ugpr_per_grf) >= min_ugprs);
assert(nonuniform_grfs >= 32 * grf_per_gpr);
assert((uniform_grfs + nonuniform_grfs) == JAY_NUM_PHYS_GRF);
/* Set the targets for the virtual register file accordingly */
shader->num_regs[GPR] = nonuniform_grfs / grf_per_gpr;
shader->num_regs[UGPR] = uniform_grfs * ugpr_per_grf;
unsigned spill_reservation = 0, mem_slots = 0;
/* Spilling requires reserving UGPRs for the lowered SENDs */
if (demand[GPR] > jay_gpr_limit(shader)) {
spill_reservation = shader->dispatch_width / ugpr_per_grf;
/* This should be an acceptable upper limit since we assign memory tightly
* thanks to the usual SSA allocator guarantees.
*/
mem_slots = demand[GPR] * grf_per_gpr;
shader->num_regs[MEM] = demand[GPR];
}
/* Determine the shape of the payload/EOT sections upfront. */
unsigned payload_4[2] = { 0, 0 }, payload_u[2] = { grf_per_gpr, 0 };
unsigned eot_u = 0, eot_4 = 0;
@ -200,12 +204,102 @@ jay_partition_grf(jay_shader *shader)
eot_4 = eot_u = 0;
}
unsigned special_u = payload_u[0] + payload_u[1] + spill_reservation + eot_u;
unsigned special_4 = payload_4[0] + payload_4[1] + eot_4;
/* Calculate the maximum register demand across all functions in the shader.
* We will use this to choose a good partition.
*/
unsigned demand[JAY_NUM_GRF_FILES] = { 0 };
struct instruction_req instr_req = analyze_per_inst(shader);
/* TODO: Make the stride partition smarter */
unsigned grf_8 = 8 * grf_per_gpr;
unsigned grf_2 = 8;
jay_foreach_function(shader, f) {
jay_compute_liveness(f);
jay_calculate_register_demands(f);
demand[GPR] = MAX2(demand[GPR], f->demand[GPR]);
demand[UGPR] = MAX2(demand[UGPR], f->demand[UGPR]);
}
/* We must have enough register file space for the register payload, plus the
* reserved UGPRs in the case we spill. That UGPR interferes with everything
* we preload so it needs to be reserved specially here for the worst case.
*/
jay_foreach_preload(jay_shader_get_entrypoint(shader), I) {
unsigned end = jay_preload_reg(I) + jay_num_values(I->dst);
unsigned extra = I->dst.file == UGPR ? shader->dispatch_width : 0;
assert(I->dst.file < JAY_NUM_GRF_FILES);
demand[I->dst.file] = MAX2(demand[I->dst.file], end + extra);
}
unsigned uniform_grfs, nonuniform_grfs;
unsigned spilling_grfs = 0, mem_slots = 0;
unsigned special_4 = payload_4[0] + payload_4[1] + eot_4, special_u;
/* Our current stride partition heuristic is rather dumb: allocate as few
* non-32-bit GRFs as possible, and assign the rest to 32-bit. This performs
* well for 32-bit code but poorly for 16-bit/64-bit heavy routines.
* We'll need stride-aware demand calculations to fix that (TODO).
*/
unsigned grf_8 = align(instr_req.gpr[JAY_STRIDE_8], 2) * grf_per_gpr;
unsigned grf_2 = instr_req.gpr[JAY_STRIDE_2] * grf_per_gpr;
for (unsigned spilling = 0; spilling <= 1; spilling++) {
/* There is an interdependence between partition choice and spilling,
* because spilling requires reserved UGPRs for the lowered SENDs. The
* solution is to first try to build a partition that forbids spilling,
* and if that fails, build one with it.
*/
spilling_grfs = spilling ? shader->dispatch_width / ugpr_per_grf : 0;
special_u = payload_u[0] + payload_u[1] + spilling_grfs + eot_u;
/* We want to determine a good GPR/UGPR split by the demand calculation.
* At minimum we need to not spill UGPRs, but if GPR pressure is low we
* want to take extra UGPRs to reduce shuffling.
*/
uniform_grfs = DIV_ROUND_UP(demand[UGPR], ugpr_per_grf) + spilling_grfs;
unsigned bonus_grfs = 4 * grf_per_gpr;
unsigned estimate_nonunif_grf =
(demand[GPR] * grf_per_gpr) + grf_8 + grf_2 + special_4;
if ((uniform_grfs + estimate_nonunif_grf + bonus_grfs) <=
JAY_NUM_PHYS_GRF) {
uniform_grfs += bonus_grfs;
}
/* If the minimum vector length can't fit in any single existing block, we
* will need a new block for it. This is quite conservative.
*/
unsigned min_ugprs = special_u * ugpr_per_grf;
if (instr_req.ugpr > payload_u[0] * ugpr_per_grf &&
instr_req.ugpr > payload_u[1] * ugpr_per_grf &&
instr_req.ugpr > eot_u * ugpr_per_grf) {
min_ugprs += instr_req.ugpr;
}
/* Finally, we need to snap to GPR bounds */
uniform_grfs = CLAMP(uniform_grfs, DIV_ROUND_UP(min_ugprs, ugpr_per_grf),
128 - (32 * grf_per_gpr));
uniform_grfs = align(uniform_grfs, grf_per_gpr);
nonuniform_grfs = JAY_NUM_PHYS_GRF - uniform_grfs;
/* Set the targets for the virtual register file accordingly */
shader->num_regs[GPR] = nonuniform_grfs / grf_per_gpr;
shader->num_regs[UGPR] = uniform_grfs * ugpr_per_grf;
/* jay_gpr_limit depends on shader->num_regs[GPR]. If we're under the
* limit without spilling, we're good to go.
*/
if (demand[GPR] <= jay_gpr_limit(shader) && !spilling) {
break;
}
}
/* This should be an acceptable upper limit since we assign memory
* tightly thanks to the usual SSA allocator guarantees.
*/
if (spilling_grfs) {
mem_slots = demand[GPR] * grf_per_gpr;
shader->num_regs[MEM] = demand[GPR];
}
struct jay_partition_builder blocks[] = {
/* Stage-specific payload */
@ -221,7 +315,7 @@ jay_partition_grf(jay_shader *shader)
{ GPR, JAY_STRIDE_2, grf_2 },
/* Spilling registers */
{ UGPR, 0, spill_reservation, JAY_BLOCK_SPILL },
{ UGPR, 0, spilling_grfs, JAY_BLOCK_SPILL },
{ MEM, JAY_STRIDE_4, mem_slots },
/* EOT */