jay: improve the stride partition heuristic

This helps MHW a bunch, I just added that fossil to my collection.

SIMD16:
   Totals from 607 (8.61% of 7050) affected shaders:
   Instrs: 551809 -> 504620 (-8.55%); split: -8.67%, +0.12%
   CodeSize: 7935424 -> 7217680 (-9.04%); split: -9.17%, +0.12%

SIMD32:
   Totals from 468 (6.64% of 7050) affected shaders:
   Instrs: 432455 -> 396733 (-8.26%); split: -8.86%, +0.60%
   CodeSize: 6276144 -> 5734736 (-8.63%); split: -9.20%, +0.58%

Signed-off-by: Alyssa Rosenzweig <alyssa.rosenzweig@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/42287>
This commit is contained in:
Alyssa Rosenzweig 2026-06-18 12:43:05 -04:00 committed by Marge Bot
parent d05ae14651
commit 2fcf6a94a0

View file

@ -3,6 +3,7 @@
* SPDX-License-Identifier: MIT
*/
#include <math.h>
#include "util/u_math.h"
#include "jay_ir.h"
#include "jay_private.h"
@ -274,13 +275,13 @@ jay_partition_grf(jay_shader *shader)
unsigned spilling_grfs = 0, mem_slots = 0;
unsigned special_4 = payload_4[0] + payload_4[1] + eot_4, special_u;
/* Our current stride partition heuristic is rather dumb: allocate as few
* non-32-bit GRFs as possible, and assign the rest to 32-bit. This performs
* well for 32-bit code but poorly for 16-bit/64-bit heavy routines.
* We'll need stride-aware demand calculations to fix that (TODO).
*/
unsigned grf_8 = align(instr_req.gpr[JAY_STRIDE_8], 2) * grf_per_gpr;
unsigned grf_2 = instr_req.gpr[JAY_STRIDE_2] * grf_per_gpr;
unsigned increment[JAY_NUM_STRIDES] = { grf_per_gpr, grf_per_gpr,
2 * grf_per_gpr };
unsigned min_grf[JAY_NUM_STRIDES] = {};
for (unsigned i = 0; i < JAY_NUM_STRIDES; ++i) {
min_grf[i] = align(instr_req.gpr[i] * grf_per_gpr, increment[i]);
}
unsigned mapped_accums = grf_per_gpr == 1 ? 2 : 0;
for (unsigned spilling = 0; spilling <= 1; spilling++) {
@ -298,8 +299,10 @@ jay_partition_grf(jay_shader *shader)
*/
uniform_grfs = DIV_ROUND_UP(demand[UGPR], ugpr_per_grf) + spilling_grfs;
unsigned bonus_grfs = 4 * grf_per_gpr;
unsigned estimate_nonunif_grf =
(demand[GPR] * grf_per_gpr) + grf_8 + grf_2 + special_4;
unsigned estimate_nonunif_grf = (demand[GPR] * grf_per_gpr) +
min_grf[JAY_STRIDE_8] +
min_grf[JAY_STRIDE_2] +
special_4;
if ((uniform_grfs + estimate_nonunif_grf + bonus_grfs) <=
JAY_NUM_PHYS_GRF) {
@ -343,6 +346,53 @@ jay_partition_grf(jay_shader *shader)
shader->num_regs[MEM] = demand[GPR];
}
/* Now that we've decided how many GRFs to use for GPRs, we need to partition
* those GRFs by stride. This does not affect spilling but it has a
* significant effect on moves inserted by RA. We use a simple heuristic to
* pick a balanced partition: give each stride GRFs proportionate to the
* number of SSA defs with that associated stride, plus a slight bias towards
* 32-bit to avoid divsion by zero. This reflects our intuition that shaders
* heavy on 16-bit (or 64-bit) arithmetic should have more 16-bit (or 64-bit)
* registers overall.
*/
unsigned counts[3] = { [JAY_STRIDE_4] = 1 };
jay_foreach_inst_in_shader(shader, block, I) {
if (I->dst.file == GPR) {
counts[jay_dst_stride_minmax(I, false)] += jay_num_values(I->dst);
}
}
min_grf[JAY_STRIDE_4] = MAX2(min_grf[JAY_STRIDE_4], special_4);
unsigned denom_i = counts[0] + counts[1] + counts[2];
float factor = nonuniform_grfs / ((float) denom_i);
unsigned picked_grf[JAY_NUM_STRIDES] = {}, total = 0;
for (unsigned i = 0; i < JAY_NUM_STRIDES; ++i) {
float ideal = ((float) counts[i]) * factor;
picked_grf[i] = align(MAX2(roundf(ideal), min_grf[i]), increment[i]);
total += picked_grf[i];
}
if (total < nonuniform_grfs) {
/* If we have GRFs to spare due to rounding, put them on 32-bit */
picked_grf[JAY_STRIDE_4] += nonuniform_grfs - total;
} else {
/* If we used too many GRFs, remove where we can */
unsigned excess = total - nonuniform_grfs;
assert(util_is_aligned(excess, grf_per_gpr));
for (unsigned i = 0; i < JAY_NUM_STRIDES; ++i) {
while (excess && picked_grf[i] > min_grf[i]) {
assert(excess >= increment[i]);
picked_grf[i] -= increment[i];
excess -= increment[i];
}
}
}
assert(picked_grf[0] + picked_grf[1] + picked_grf[2] == nonuniform_grfs);
struct jay_partition_builder blocks[] = {
/* Stage-specific payload */
{ UGPR, 0, payload_u[0] },
@ -352,9 +402,9 @@ jay_partition_grf(jay_shader *shader)
/* General registers */
{ UGPR, 0, uniform_grfs - special_u },
{ GPR, JAY_STRIDE_4, nonuniform_grfs - (special_4 + grf_8 + grf_2) },
{ GPR, JAY_STRIDE_8, grf_8 },
{ GPR, JAY_STRIDE_2, grf_2 },
{ GPR, JAY_STRIDE_4, picked_grf[JAY_STRIDE_4] - special_4 },
{ GPR, JAY_STRIDE_8, picked_grf[JAY_STRIDE_8] },
{ GPR, JAY_STRIDE_2, picked_grf[JAY_STRIDE_2] },
/* Spilling registers */
{ UGPR, 0, spilling_grfs, JAY_BLOCK_SPILL },