From c9eb9ea52a3ed2d6c8a1b734f451cf6481d3dd81 Mon Sep 17 00:00:00 2001 From: Alyssa Rosenzweig Date: Thu, 28 May 2026 11:05:30 -0400 Subject: [PATCH] jay/partition: pick better partitions look at what the program actually does instead of hardcoding a worst-case. SIMD16: Totals from 1965 (74.23% of 2647) affected shaders: Instrs: 2603230 -> 2539932 (-2.43%); split: -3.44%, +1.01% CodeSize: 38826160 -> 37811904 (-2.61%); split: -3.59%, +0.97% Number of spill instructions: 1206 -> 555 (-53.98%) Number of fill instructions: 1194 -> 551 (-53.85%) SIMD32: Totals from 1974 (74.57% of 2647) affected shaders: Instrs: 3998126 -> 3033333 (-24.13%); split: -24.18%, +0.05% CodeSize: 59563952 -> 45580448 (-23.48%); split: -23.52%, +0.05% Number of spill instructions: 43534 -> 37471 (-13.93%); split: -13.97%, +0.04% Number of fill instructions: 43118 -> 36412 (-15.55%) Signed-off-by: Alyssa Rosenzweig Part-of: --- src/intel/compiler/jay/jay_partition.c | 242 +++++++++++++++++-------- 1 file changed, 168 insertions(+), 74 deletions(-) diff --git a/src/intel/compiler/jay/jay_partition.c b/src/intel/compiler/jay/jay_partition.c index 63d7ccab4f9..4c7c428624f 100644 --- a/src/intel/compiler/jay/jay_partition.c +++ b/src/intel/compiler/jay/jay_partition.c @@ -3,19 +3,75 @@ * SPDX-License-Identifier: MIT */ +#include "util/u_math.h" #include "jay_ir.h" #include "jay_private.h" /* - * jay_partition_grf partitions the register file for the entire shader, - * satisfying functional and performance rules. The partition is specified in a - * convenient form within this file, as a flat array of jay_partition_builder - * structs, which build_partition translates to the more complicated - * jay_partition structs. + * In addition to having enough total registers globally, the partition needs to + * have enough contiguous registers in each file/stride to allocate any single + * instruction in isolation. analyze_per_inst calculates bounds on the numbers + * of required contiguous registers. * - * All functions must share the same partition for correctness with non-uniform - * function calls. For unlinked library functions, we must use the ABI - * partition (TODO). + * This analysis is overly conservative, implicitly assuming that all operands + * of a given file/stride must be contiguous. This could be improved with a lot + * more bookkeeping, but it's unclear if it matters much in practice. + * + * Note the util_next_power_of_two below is critical for correctness. Although + * RA handles non-power-of-two vectors, it aligns vectors to the next + * power-of-two size. That doesn't affect global register demand, but it needs + * to be reflected in our per-instruction minimum. + * + * As an example, an instruction with sources (vec3, vec2) requires at least 6 + * contiguous registers to satisfy RA, since the vec3 gets vec4 alignment and + * the vec2 gets vec2 alignment. In principal, future RA improvements might + * relax this, but we're not there yet. + */ +struct instruction_req { + unsigned gpr[JAY_NUM_STRIDES]; + unsigned ugpr; +}; + +static struct instruction_req +analyze_per_inst(jay_shader *shader) +{ + struct instruction_req global = { 0 }; + + jay_foreach_inst_in_shader(shader, func, I) { + struct instruction_req local = { 0 }; + + for (int i = -1; i < I->num_srcs; ++i) { + jay_def x = i >= 0 ? I->src[i] : I->dst; + if (!jay_is_null(x)) { + unsigned size = util_next_power_of_two(jay_num_values(x)); + + if (x.file == UGPR) { + local.ugpr += size; + } else if (x.file == GPR && i >= 0) { + enum jay_stride min_stride = jay_src_stride_minmax(I, i, false); + local.gpr[min_stride] += size; + } else if (x.file == GPR) { + enum jay_stride min_stride = jay_dst_stride_minmax(I, false); + local.gpr[min_stride] += size; + } + } + } + + /* Take the worst-case for each block */ + global.ugpr = MAX2(global.ugpr, local.ugpr); + + for (unsigned i = 0; i < ARRAY_SIZE(global.gpr); ++i) { + global.gpr[i] = MAX2(global.gpr[i], local.gpr[i]); + } + } + + return global; +} + +/* + * The partition is specified in a convenient form within jay_partition_grf(), + * as a flat array of jay_partition_builder structs, which build_partition + * translates to the more complicated jay_partition structs. */ struct jay_partition_builder { enum jay_file file; @@ -100,72 +156,20 @@ build_partition(jay_shader *shader, struct jay_partition_builder *b, unsigned n) assert(BITSET_COUNT(regs) == JAY_NUM_PHYS_GRF && "all GRFs mapped"); } +/* + * Partition the register file for the entire shader. + * + * All functions must share the same partition for correctness with non-uniform + * function calls. For unlinked library functions, we must use the ABI + * partition (TODO). + */ void jay_partition_grf(jay_shader *shader) { - /* Calculate the maximum register demand across all functions in the shader. - * We will use this to choose a good partition. - */ - unsigned demand[JAY_NUM_GRF_FILES] = { 0 }; - - jay_foreach_function(shader, f) { - jay_compute_liveness(f); - jay_calculate_register_demands(f); - - demand[GPR] = MAX2(demand[GPR], f->demand[GPR]); - demand[UGPR] = MAX2(demand[UGPR], f->demand[UGPR]); - } - - /* We must have enough register file space for the register payload, plus the - * reserved UGPRs in the case we spill. That UGPR interferes with everything - * we preload so it needs to be reserved specially here for the worst case. - */ - jay_foreach_preload(jay_shader_get_entrypoint(shader), I) { - unsigned end = jay_preload_reg(I) + jay_num_values(I->dst); - unsigned extra = I->dst.file == UGPR ? shader->dispatch_width : 0; - assert(I->dst.file < JAY_NUM_GRF_FILES); - demand[I->dst.file] = MAX2(demand[I->dst.file], end + extra); - } - - /* Determine a good GPR/UGPR split informed by the demand calculation */ unsigned grf_per_gpr = jay_grf_per_gpr(shader); unsigned ugpr_per_grf = jay_ugpr_per_grf(shader); - unsigned uniform_grfs = DIV_ROUND_UP(demand[UGPR], ugpr_per_grf); - - /* We must have enough for SIMD1 images (TODO: Check if this actually - * applies. Or if we could eliminate this with smarter partitioning even.) - */ - unsigned min_ugprs = 16; - min_ugprs = MAX2(min_ugprs, 256); - - /* TODO: We could partition more cleverly */ - uniform_grfs = align(uniform_grfs, grf_per_gpr); - uniform_grfs = CLAMP(uniform_grfs, DIV_ROUND_UP(min_ugprs, ugpr_per_grf), - 128 - (32 * grf_per_gpr)); - unsigned nonuniform_grfs = JAY_NUM_PHYS_GRF - uniform_grfs; - - /* Check the split */ - assert((uniform_grfs * ugpr_per_grf) >= min_ugprs); - assert(nonuniform_grfs >= 32 * grf_per_gpr); - assert((uniform_grfs + nonuniform_grfs) == JAY_NUM_PHYS_GRF); - - /* Set the targets for the virtual register file accordingly */ - shader->num_regs[GPR] = nonuniform_grfs / grf_per_gpr; - shader->num_regs[UGPR] = uniform_grfs * ugpr_per_grf; - - unsigned spill_reservation = 0, mem_slots = 0; - - /* Spilling requires reserving UGPRs for the lowered SENDs */ - if (demand[GPR] > jay_gpr_limit(shader)) { - spill_reservation = shader->dispatch_width / ugpr_per_grf; - - /* This should be an acceptable upper limit since we assign memory tightly - * thanks to the usual SSA allocator guarantees. - */ - mem_slots = demand[GPR] * grf_per_gpr; - shader->num_regs[MEM] = demand[GPR]; - } + /* Determine the shape of the payload/EOT sections upfront. */ unsigned payload_4[2] = { 0, 0 }, payload_u[2] = { grf_per_gpr, 0 }; unsigned eot_u = 0, eot_4 = 0; @@ -200,12 +204,102 @@ jay_partition_grf(jay_shader *shader) eot_4 = eot_u = 0; } - unsigned special_u = payload_u[0] + payload_u[1] + spill_reservation + eot_u; - unsigned special_4 = payload_4[0] + payload_4[1] + eot_4; + /* Calculate the maximum register demand across all functions in the shader. + * We will use this to choose a good partition. + */ + unsigned demand[JAY_NUM_GRF_FILES] = { 0 }; + struct instruction_req instr_req = analyze_per_inst(shader); - /* TODO: Make the stride partition smarter */ - unsigned grf_8 = 8 * grf_per_gpr; - unsigned grf_2 = 8; + jay_foreach_function(shader, f) { + jay_compute_liveness(f); + jay_calculate_register_demands(f); + + demand[GPR] = MAX2(demand[GPR], f->demand[GPR]); + demand[UGPR] = MAX2(demand[UGPR], f->demand[UGPR]); + } + + /* We must have enough register file space for the register payload, plus the + * reserved UGPRs in the case we spill. That UGPR interferes with everything + * we preload so it needs to be reserved specially here for the worst case. + */ + jay_foreach_preload(jay_shader_get_entrypoint(shader), I) { + unsigned end = jay_preload_reg(I) + jay_num_values(I->dst); + unsigned extra = I->dst.file == UGPR ? shader->dispatch_width : 0; + assert(I->dst.file < JAY_NUM_GRF_FILES); + demand[I->dst.file] = MAX2(demand[I->dst.file], end + extra); + } + + unsigned uniform_grfs, nonuniform_grfs; + unsigned spilling_grfs = 0, mem_slots = 0; + unsigned special_4 = payload_4[0] + payload_4[1] + eot_4, special_u; + + /* Our current stride partition heuristic is rather dumb: allocate as few + * non-32-bit GRFs as possible, and assign the rest to 32-bit. This performs + * well for 32-bit code but poorly for 16-bit/64-bit heavy routines. + * We'll need stride-aware demand calculations to fix that (TODO). + */ + unsigned grf_8 = align(instr_req.gpr[JAY_STRIDE_8], 2) * grf_per_gpr; + unsigned grf_2 = instr_req.gpr[JAY_STRIDE_2] * grf_per_gpr; + + for (unsigned spilling = 0; spilling <= 1; spilling++) { + /* There is an interdependence between partition choice and spilling, + * because spilling requires reserved UGPRs for the lowered SENDs. The + * solution is to first try to build a partition that forbids spilling, + * and if that fails, build one with it. + */ + spilling_grfs = spilling ? shader->dispatch_width / ugpr_per_grf : 0; + special_u = payload_u[0] + payload_u[1] + spilling_grfs + eot_u; + + /* We want to determine a good GPR/UGPR split by the demand calculation. + * At minimum we need to not spill UGPRs, but if GPR pressure is low we + * want to take extra UGPRs to reduce shuffling. + */ + uniform_grfs = DIV_ROUND_UP(demand[UGPR], ugpr_per_grf) + spilling_grfs; + unsigned bonus_grfs = 4 * grf_per_gpr; + unsigned estimate_nonunif_grf = + (demand[GPR] * grf_per_gpr) + grf_8 + grf_2 + special_4; + + if ((uniform_grfs + estimate_nonunif_grf + bonus_grfs) <= + JAY_NUM_PHYS_GRF) { + uniform_grfs += bonus_grfs; + } + + /* If the minimum vector length can't fit in any single existing block, we + * will need a new block for it. This is quite conservative. + */ + unsigned min_ugprs = special_u * ugpr_per_grf; + if (instr_req.ugpr > payload_u[0] * ugpr_per_grf && + instr_req.ugpr > payload_u[1] * ugpr_per_grf && + instr_req.ugpr > eot_u * ugpr_per_grf) { + + min_ugprs += instr_req.ugpr; + } + + /* Finally, we need to snap to GPR bounds */ + uniform_grfs = CLAMP(uniform_grfs, DIV_ROUND_UP(min_ugprs, ugpr_per_grf), + 128 - (32 * grf_per_gpr)); + uniform_grfs = align(uniform_grfs, grf_per_gpr); + nonuniform_grfs = JAY_NUM_PHYS_GRF - uniform_grfs; + + /* Set the targets for the virtual register file accordingly */ + shader->num_regs[GPR] = nonuniform_grfs / grf_per_gpr; + shader->num_regs[UGPR] = uniform_grfs * ugpr_per_grf; + + /* jay_gpr_limit depends on shader->num_regs[GPR]. If we're under the + * limit without spilling, we're good to go. + */ + if (demand[GPR] <= jay_gpr_limit(shader) && !spilling) { + break; + } + } + + /* This should be an acceptable upper limit since we assign memory + * tightly thanks to the usual SSA allocator guarantees. + */ + if (spilling_grfs) { + mem_slots = demand[GPR] * grf_per_gpr; + shader->num_regs[MEM] = demand[GPR]; + } struct jay_partition_builder blocks[] = { /* Stage-specific payload */ @@ -221,7 +315,7 @@ jay_partition_grf(jay_shader *shader) { GPR, JAY_STRIDE_2, grf_2 }, /* Spilling registers */ - { UGPR, 0, spill_reservation, JAY_BLOCK_SPILL }, + { UGPR, 0, spilling_grfs, JAY_BLOCK_SPILL }, { MEM, JAY_STRIDE_4, mem_slots }, /* EOT */