From 2fcf6a94a03880dda156201fa98ad28dcd7aa74e Mon Sep 17 00:00:00 2001 From: Alyssa Rosenzweig Date: Thu, 18 Jun 2026 12:43:05 -0400 Subject: [PATCH] jay: improve the stride partition heuristic This helps MHW a bunch, I just added that fossil to my collection. SIMD16: Totals from 607 (8.61% of 7050) affected shaders: Instrs: 551809 -> 504620 (-8.55%); split: -8.67%, +0.12% CodeSize: 7935424 -> 7217680 (-9.04%); split: -9.17%, +0.12% SIMD32: Totals from 468 (6.64% of 7050) affected shaders: Instrs: 432455 -> 396733 (-8.26%); split: -8.86%, +0.60% CodeSize: 6276144 -> 5734736 (-8.63%); split: -9.20%, +0.58% Signed-off-by: Alyssa Rosenzweig Part-of: --- src/intel/compiler/jay/jay_partition.c | 74 +++++++++++++++++++++----- 1 file changed, 62 insertions(+), 12 deletions(-) diff --git a/src/intel/compiler/jay/jay_partition.c b/src/intel/compiler/jay/jay_partition.c index 92ac3d28b88..aac05be4261 100644 --- a/src/intel/compiler/jay/jay_partition.c +++ b/src/intel/compiler/jay/jay_partition.c @@ -3,6 +3,7 @@ * SPDX-License-Identifier: MIT */ +#include #include "util/u_math.h" #include "jay_ir.h" #include "jay_private.h" @@ -274,13 +275,13 @@ jay_partition_grf(jay_shader *shader) unsigned spilling_grfs = 0, mem_slots = 0; unsigned special_4 = payload_4[0] + payload_4[1] + eot_4, special_u; - /* Our current stride partition heuristic is rather dumb: allocate as few - * non-32-bit GRFs as possible, and assign the rest to 32-bit. This performs - * well for 32-bit code but poorly for 16-bit/64-bit heavy routines. - * We'll need stride-aware demand calculations to fix that (TODO). - */ - unsigned grf_8 = align(instr_req.gpr[JAY_STRIDE_8], 2) * grf_per_gpr; - unsigned grf_2 = instr_req.gpr[JAY_STRIDE_2] * grf_per_gpr; + unsigned increment[JAY_NUM_STRIDES] = { grf_per_gpr, grf_per_gpr, + 2 * grf_per_gpr }; + unsigned min_grf[JAY_NUM_STRIDES] = {}; + for (unsigned i = 0; i < JAY_NUM_STRIDES; ++i) { + min_grf[i] = align(instr_req.gpr[i] * grf_per_gpr, increment[i]); + } + unsigned mapped_accums = grf_per_gpr == 1 ? 2 : 0; for (unsigned spilling = 0; spilling <= 1; spilling++) { @@ -298,8 +299,10 @@ jay_partition_grf(jay_shader *shader) */ uniform_grfs = DIV_ROUND_UP(demand[UGPR], ugpr_per_grf) + spilling_grfs; unsigned bonus_grfs = 4 * grf_per_gpr; - unsigned estimate_nonunif_grf = - (demand[GPR] * grf_per_gpr) + grf_8 + grf_2 + special_4; + unsigned estimate_nonunif_grf = (demand[GPR] * grf_per_gpr) + + min_grf[JAY_STRIDE_8] + + min_grf[JAY_STRIDE_2] + + special_4; if ((uniform_grfs + estimate_nonunif_grf + bonus_grfs) <= JAY_NUM_PHYS_GRF) { @@ -343,6 +346,53 @@ jay_partition_grf(jay_shader *shader) shader->num_regs[MEM] = demand[GPR]; } + /* Now that we've decided how many GRFs to use for GPRs, we need to partition + * those GRFs by stride. This does not affect spilling but it has a + * significant effect on moves inserted by RA. We use a simple heuristic to + * pick a balanced partition: give each stride GRFs proportionate to the + * number of SSA defs with that associated stride, plus a slight bias towards + * 32-bit to avoid divsion by zero. This reflects our intuition that shaders + * heavy on 16-bit (or 64-bit) arithmetic should have more 16-bit (or 64-bit) + * registers overall. + */ + unsigned counts[3] = { [JAY_STRIDE_4] = 1 }; + jay_foreach_inst_in_shader(shader, block, I) { + if (I->dst.file == GPR) { + counts[jay_dst_stride_minmax(I, false)] += jay_num_values(I->dst); + } + } + + min_grf[JAY_STRIDE_4] = MAX2(min_grf[JAY_STRIDE_4], special_4); + unsigned denom_i = counts[0] + counts[1] + counts[2]; + float factor = nonuniform_grfs / ((float) denom_i); + + unsigned picked_grf[JAY_NUM_STRIDES] = {}, total = 0; + for (unsigned i = 0; i < JAY_NUM_STRIDES; ++i) { + float ideal = ((float) counts[i]) * factor; + + picked_grf[i] = align(MAX2(roundf(ideal), min_grf[i]), increment[i]); + total += picked_grf[i]; + } + + if (total < nonuniform_grfs) { + /* If we have GRFs to spare due to rounding, put them on 32-bit */ + picked_grf[JAY_STRIDE_4] += nonuniform_grfs - total; + } else { + /* If we used too many GRFs, remove where we can */ + unsigned excess = total - nonuniform_grfs; + assert(util_is_aligned(excess, grf_per_gpr)); + + for (unsigned i = 0; i < JAY_NUM_STRIDES; ++i) { + while (excess && picked_grf[i] > min_grf[i]) { + assert(excess >= increment[i]); + picked_grf[i] -= increment[i]; + excess -= increment[i]; + } + } + } + + assert(picked_grf[0] + picked_grf[1] + picked_grf[2] == nonuniform_grfs); + struct jay_partition_builder blocks[] = { /* Stage-specific payload */ { UGPR, 0, payload_u[0] }, @@ -352,9 +402,9 @@ jay_partition_grf(jay_shader *shader) /* General registers */ { UGPR, 0, uniform_grfs - special_u }, - { GPR, JAY_STRIDE_4, nonuniform_grfs - (special_4 + grf_8 + grf_2) }, - { GPR, JAY_STRIDE_8, grf_8 }, - { GPR, JAY_STRIDE_2, grf_2 }, + { GPR, JAY_STRIDE_4, picked_grf[JAY_STRIDE_4] - special_4 }, + { GPR, JAY_STRIDE_8, picked_grf[JAY_STRIDE_8] }, + { GPR, JAY_STRIDE_2, picked_grf[JAY_STRIDE_2] }, /* Spilling registers */ { UGPR, 0, spilling_grfs, JAY_BLOCK_SPILL },