mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-06-09 23:08:18 +02:00
jay/partition: pick better partitions
look at what the program actually does instead of hardcoding a worst-case. SIMD16: Totals from 1965 (74.23% of 2647) affected shaders: Instrs: 2603230 -> 2539932 (-2.43%); split: -3.44%, +1.01% CodeSize: 38826160 -> 37811904 (-2.61%); split: -3.59%, +0.97% Number of spill instructions: 1206 -> 555 (-53.98%) Number of fill instructions: 1194 -> 551 (-53.85%) SIMD32: Totals from 1974 (74.57% of 2647) affected shaders: Instrs: 3998126 -> 3033333 (-24.13%); split: -24.18%, +0.05% CodeSize: 59563952 -> 45580448 (-23.48%); split: -23.52%, +0.05% Number of spill instructions: 43534 -> 37471 (-13.93%); split: -13.97%, +0.04% Number of fill instructions: 43118 -> 36412 (-15.55%) Signed-off-by: Alyssa Rosenzweig <alyssa.rosenzweig@intel.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/41872>
This commit is contained in:
parent
7221ef0230
commit
c9eb9ea52a
1 changed files with 168 additions and 74 deletions
|
|
@ -3,19 +3,75 @@
|
|||
* SPDX-License-Identifier: MIT
|
||||
*/
|
||||
|
||||
#include "util/u_math.h"
|
||||
#include "jay_ir.h"
|
||||
#include "jay_private.h"
|
||||
|
||||
/*
|
||||
* jay_partition_grf partitions the register file for the entire shader,
|
||||
* satisfying functional and performance rules. The partition is specified in a
|
||||
* convenient form within this file, as a flat array of jay_partition_builder
|
||||
* structs, which build_partition translates to the more complicated
|
||||
* jay_partition structs.
|
||||
* In addition to having enough total registers globally, the partition needs to
|
||||
* have enough contiguous registers in each file/stride to allocate any single
|
||||
* instruction in isolation. analyze_per_inst calculates bounds on the numbers
|
||||
* of required contiguous registers.
|
||||
*
|
||||
* All functions must share the same partition for correctness with non-uniform
|
||||
* function calls. For unlinked library functions, we must use the ABI
|
||||
* partition (TODO).
|
||||
* This analysis is overly conservative, implicitly assuming that all operands
|
||||
* of a given file/stride must be contiguous. This could be improved with a lot
|
||||
* more bookkeeping, but it's unclear if it matters much in practice.
|
||||
*
|
||||
* Note the util_next_power_of_two below is critical for correctness. Although
|
||||
* RA handles non-power-of-two vectors, it aligns vectors to the next
|
||||
* power-of-two size. That doesn't affect global register demand, but it needs
|
||||
* to be reflected in our per-instruction minimum.
|
||||
*
|
||||
* As an example, an instruction with sources (vec3, vec2) requires at least 6
|
||||
* contiguous registers to satisfy RA, since the vec3 gets vec4 alignment and
|
||||
* the vec2 gets vec2 alignment. In principal, future RA improvements might
|
||||
* relax this, but we're not there yet.
|
||||
*/
|
||||
struct instruction_req {
|
||||
unsigned gpr[JAY_NUM_STRIDES];
|
||||
unsigned ugpr;
|
||||
};
|
||||
|
||||
static struct instruction_req
|
||||
analyze_per_inst(jay_shader *shader)
|
||||
{
|
||||
struct instruction_req global = { 0 };
|
||||
|
||||
jay_foreach_inst_in_shader(shader, func, I) {
|
||||
struct instruction_req local = { 0 };
|
||||
|
||||
for (int i = -1; i < I->num_srcs; ++i) {
|
||||
jay_def x = i >= 0 ? I->src[i] : I->dst;
|
||||
if (!jay_is_null(x)) {
|
||||
unsigned size = util_next_power_of_two(jay_num_values(x));
|
||||
|
||||
if (x.file == UGPR) {
|
||||
local.ugpr += size;
|
||||
} else if (x.file == GPR && i >= 0) {
|
||||
enum jay_stride min_stride = jay_src_stride_minmax(I, i, false);
|
||||
local.gpr[min_stride] += size;
|
||||
} else if (x.file == GPR) {
|
||||
enum jay_stride min_stride = jay_dst_stride_minmax(I, false);
|
||||
local.gpr[min_stride] += size;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Take the worst-case for each block */
|
||||
global.ugpr = MAX2(global.ugpr, local.ugpr);
|
||||
|
||||
for (unsigned i = 0; i < ARRAY_SIZE(global.gpr); ++i) {
|
||||
global.gpr[i] = MAX2(global.gpr[i], local.gpr[i]);
|
||||
}
|
||||
}
|
||||
|
||||
return global;
|
||||
}
|
||||
|
||||
/*
|
||||
* The partition is specified in a convenient form within jay_partition_grf(),
|
||||
* as a flat array of jay_partition_builder structs, which build_partition
|
||||
* translates to the more complicated jay_partition structs.
|
||||
*/
|
||||
struct jay_partition_builder {
|
||||
enum jay_file file;
|
||||
|
|
@ -100,72 +156,20 @@ build_partition(jay_shader *shader, struct jay_partition_builder *b, unsigned n)
|
|||
assert(BITSET_COUNT(regs) == JAY_NUM_PHYS_GRF && "all GRFs mapped");
|
||||
}
|
||||
|
||||
/*
|
||||
* Partition the register file for the entire shader.
|
||||
*
|
||||
* All functions must share the same partition for correctness with non-uniform
|
||||
* function calls. For unlinked library functions, we must use the ABI
|
||||
* partition (TODO).
|
||||
*/
|
||||
void
|
||||
jay_partition_grf(jay_shader *shader)
|
||||
{
|
||||
/* Calculate the maximum register demand across all functions in the shader.
|
||||
* We will use this to choose a good partition.
|
||||
*/
|
||||
unsigned demand[JAY_NUM_GRF_FILES] = { 0 };
|
||||
|
||||
jay_foreach_function(shader, f) {
|
||||
jay_compute_liveness(f);
|
||||
jay_calculate_register_demands(f);
|
||||
|
||||
demand[GPR] = MAX2(demand[GPR], f->demand[GPR]);
|
||||
demand[UGPR] = MAX2(demand[UGPR], f->demand[UGPR]);
|
||||
}
|
||||
|
||||
/* We must have enough register file space for the register payload, plus the
|
||||
* reserved UGPRs in the case we spill. That UGPR interferes with everything
|
||||
* we preload so it needs to be reserved specially here for the worst case.
|
||||
*/
|
||||
jay_foreach_preload(jay_shader_get_entrypoint(shader), I) {
|
||||
unsigned end = jay_preload_reg(I) + jay_num_values(I->dst);
|
||||
unsigned extra = I->dst.file == UGPR ? shader->dispatch_width : 0;
|
||||
assert(I->dst.file < JAY_NUM_GRF_FILES);
|
||||
demand[I->dst.file] = MAX2(demand[I->dst.file], end + extra);
|
||||
}
|
||||
|
||||
/* Determine a good GPR/UGPR split informed by the demand calculation */
|
||||
unsigned grf_per_gpr = jay_grf_per_gpr(shader);
|
||||
unsigned ugpr_per_grf = jay_ugpr_per_grf(shader);
|
||||
unsigned uniform_grfs = DIV_ROUND_UP(demand[UGPR], ugpr_per_grf);
|
||||
|
||||
/* We must have enough for SIMD1 images (TODO: Check if this actually
|
||||
* applies. Or if we could eliminate this with smarter partitioning even.)
|
||||
*/
|
||||
unsigned min_ugprs = 16;
|
||||
min_ugprs = MAX2(min_ugprs, 256);
|
||||
|
||||
/* TODO: We could partition more cleverly */
|
||||
uniform_grfs = align(uniform_grfs, grf_per_gpr);
|
||||
uniform_grfs = CLAMP(uniform_grfs, DIV_ROUND_UP(min_ugprs, ugpr_per_grf),
|
||||
128 - (32 * grf_per_gpr));
|
||||
unsigned nonuniform_grfs = JAY_NUM_PHYS_GRF - uniform_grfs;
|
||||
|
||||
/* Check the split */
|
||||
assert((uniform_grfs * ugpr_per_grf) >= min_ugprs);
|
||||
assert(nonuniform_grfs >= 32 * grf_per_gpr);
|
||||
assert((uniform_grfs + nonuniform_grfs) == JAY_NUM_PHYS_GRF);
|
||||
|
||||
/* Set the targets for the virtual register file accordingly */
|
||||
shader->num_regs[GPR] = nonuniform_grfs / grf_per_gpr;
|
||||
shader->num_regs[UGPR] = uniform_grfs * ugpr_per_grf;
|
||||
|
||||
unsigned spill_reservation = 0, mem_slots = 0;
|
||||
|
||||
/* Spilling requires reserving UGPRs for the lowered SENDs */
|
||||
if (demand[GPR] > jay_gpr_limit(shader)) {
|
||||
spill_reservation = shader->dispatch_width / ugpr_per_grf;
|
||||
|
||||
/* This should be an acceptable upper limit since we assign memory tightly
|
||||
* thanks to the usual SSA allocator guarantees.
|
||||
*/
|
||||
mem_slots = demand[GPR] * grf_per_gpr;
|
||||
shader->num_regs[MEM] = demand[GPR];
|
||||
}
|
||||
|
||||
/* Determine the shape of the payload/EOT sections upfront. */
|
||||
unsigned payload_4[2] = { 0, 0 }, payload_u[2] = { grf_per_gpr, 0 };
|
||||
unsigned eot_u = 0, eot_4 = 0;
|
||||
|
||||
|
|
@ -200,12 +204,102 @@ jay_partition_grf(jay_shader *shader)
|
|||
eot_4 = eot_u = 0;
|
||||
}
|
||||
|
||||
unsigned special_u = payload_u[0] + payload_u[1] + spill_reservation + eot_u;
|
||||
unsigned special_4 = payload_4[0] + payload_4[1] + eot_4;
|
||||
/* Calculate the maximum register demand across all functions in the shader.
|
||||
* We will use this to choose a good partition.
|
||||
*/
|
||||
unsigned demand[JAY_NUM_GRF_FILES] = { 0 };
|
||||
struct instruction_req instr_req = analyze_per_inst(shader);
|
||||
|
||||
/* TODO: Make the stride partition smarter */
|
||||
unsigned grf_8 = 8 * grf_per_gpr;
|
||||
unsigned grf_2 = 8;
|
||||
jay_foreach_function(shader, f) {
|
||||
jay_compute_liveness(f);
|
||||
jay_calculate_register_demands(f);
|
||||
|
||||
demand[GPR] = MAX2(demand[GPR], f->demand[GPR]);
|
||||
demand[UGPR] = MAX2(demand[UGPR], f->demand[UGPR]);
|
||||
}
|
||||
|
||||
/* We must have enough register file space for the register payload, plus the
|
||||
* reserved UGPRs in the case we spill. That UGPR interferes with everything
|
||||
* we preload so it needs to be reserved specially here for the worst case.
|
||||
*/
|
||||
jay_foreach_preload(jay_shader_get_entrypoint(shader), I) {
|
||||
unsigned end = jay_preload_reg(I) + jay_num_values(I->dst);
|
||||
unsigned extra = I->dst.file == UGPR ? shader->dispatch_width : 0;
|
||||
assert(I->dst.file < JAY_NUM_GRF_FILES);
|
||||
demand[I->dst.file] = MAX2(demand[I->dst.file], end + extra);
|
||||
}
|
||||
|
||||
unsigned uniform_grfs, nonuniform_grfs;
|
||||
unsigned spilling_grfs = 0, mem_slots = 0;
|
||||
unsigned special_4 = payload_4[0] + payload_4[1] + eot_4, special_u;
|
||||
|
||||
/* Our current stride partition heuristic is rather dumb: allocate as few
|
||||
* non-32-bit GRFs as possible, and assign the rest to 32-bit. This performs
|
||||
* well for 32-bit code but poorly for 16-bit/64-bit heavy routines.
|
||||
* We'll need stride-aware demand calculations to fix that (TODO).
|
||||
*/
|
||||
unsigned grf_8 = align(instr_req.gpr[JAY_STRIDE_8], 2) * grf_per_gpr;
|
||||
unsigned grf_2 = instr_req.gpr[JAY_STRIDE_2] * grf_per_gpr;
|
||||
|
||||
for (unsigned spilling = 0; spilling <= 1; spilling++) {
|
||||
/* There is an interdependence between partition choice and spilling,
|
||||
* because spilling requires reserved UGPRs for the lowered SENDs. The
|
||||
* solution is to first try to build a partition that forbids spilling,
|
||||
* and if that fails, build one with it.
|
||||
*/
|
||||
spilling_grfs = spilling ? shader->dispatch_width / ugpr_per_grf : 0;
|
||||
special_u = payload_u[0] + payload_u[1] + spilling_grfs + eot_u;
|
||||
|
||||
/* We want to determine a good GPR/UGPR split by the demand calculation.
|
||||
* At minimum we need to not spill UGPRs, but if GPR pressure is low we
|
||||
* want to take extra UGPRs to reduce shuffling.
|
||||
*/
|
||||
uniform_grfs = DIV_ROUND_UP(demand[UGPR], ugpr_per_grf) + spilling_grfs;
|
||||
unsigned bonus_grfs = 4 * grf_per_gpr;
|
||||
unsigned estimate_nonunif_grf =
|
||||
(demand[GPR] * grf_per_gpr) + grf_8 + grf_2 + special_4;
|
||||
|
||||
if ((uniform_grfs + estimate_nonunif_grf + bonus_grfs) <=
|
||||
JAY_NUM_PHYS_GRF) {
|
||||
uniform_grfs += bonus_grfs;
|
||||
}
|
||||
|
||||
/* If the minimum vector length can't fit in any single existing block, we
|
||||
* will need a new block for it. This is quite conservative.
|
||||
*/
|
||||
unsigned min_ugprs = special_u * ugpr_per_grf;
|
||||
if (instr_req.ugpr > payload_u[0] * ugpr_per_grf &&
|
||||
instr_req.ugpr > payload_u[1] * ugpr_per_grf &&
|
||||
instr_req.ugpr > eot_u * ugpr_per_grf) {
|
||||
|
||||
min_ugprs += instr_req.ugpr;
|
||||
}
|
||||
|
||||
/* Finally, we need to snap to GPR bounds */
|
||||
uniform_grfs = CLAMP(uniform_grfs, DIV_ROUND_UP(min_ugprs, ugpr_per_grf),
|
||||
128 - (32 * grf_per_gpr));
|
||||
uniform_grfs = align(uniform_grfs, grf_per_gpr);
|
||||
nonuniform_grfs = JAY_NUM_PHYS_GRF - uniform_grfs;
|
||||
|
||||
/* Set the targets for the virtual register file accordingly */
|
||||
shader->num_regs[GPR] = nonuniform_grfs / grf_per_gpr;
|
||||
shader->num_regs[UGPR] = uniform_grfs * ugpr_per_grf;
|
||||
|
||||
/* jay_gpr_limit depends on shader->num_regs[GPR]. If we're under the
|
||||
* limit without spilling, we're good to go.
|
||||
*/
|
||||
if (demand[GPR] <= jay_gpr_limit(shader) && !spilling) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
/* This should be an acceptable upper limit since we assign memory
|
||||
* tightly thanks to the usual SSA allocator guarantees.
|
||||
*/
|
||||
if (spilling_grfs) {
|
||||
mem_slots = demand[GPR] * grf_per_gpr;
|
||||
shader->num_regs[MEM] = demand[GPR];
|
||||
}
|
||||
|
||||
struct jay_partition_builder blocks[] = {
|
||||
/* Stage-specific payload */
|
||||
|
|
@ -221,7 +315,7 @@ jay_partition_grf(jay_shader *shader)
|
|||
{ GPR, JAY_STRIDE_2, grf_2 },
|
||||
|
||||
/* Spilling registers */
|
||||
{ UGPR, 0, spill_reservation, JAY_BLOCK_SPILL },
|
||||
{ UGPR, 0, spilling_grfs, JAY_BLOCK_SPILL },
|
||||
{ MEM, JAY_STRIDE_4, mem_slots },
|
||||
|
||||
/* EOT */
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue