mesa/src/intel/compiler/brw_opt_virtual_grfs.cpp
Kenneth Graunke eb1ec9cf8e brw: Don't assert about MAX_VGRF_SIZE in brw_opt_split_virtual_grfs()
This allows us to create temporary VGRFs that are larger than
MAX_VGRF_SIZE(devinfo), which will be split eventually.  They may not
be split on the initial pass, because we may need LOAD_PAYLOAD lowering,
copy propagation, and so on to occur first.  So we allow registers to
exceed that size initially.

The "Register allocation relies on split_virtual_grfs()" assertion in
brw_reg_allocate.cpp still asserts that all VGRFs which reach the
register allocator have been properly split.

One case where this is useful is for vectorizing convergent block loads.
We create temporaries to splat the SIMD1 values out to SIMD(N), which
can lead to some very large temporaries.  However, copy propagation and
so on ultimately eliminate these and they'll get split down to proper
sizes or elided entirely in the end.

(Note: both this and the prior commits from this merge request are
 needed to close the linked issue.)

Cc: mesa-stable
Reviewed-by: Matt Turner <mattst88@gmail.com>
Closes: https://gitlab.freedesktop.org/mesa/mesa/-/issues/12324
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/34461>
2025-04-11 20:34:51 +00:00


/*
 * Copyright © 2010 Intel Corporation
 * SPDX-License-Identifier: MIT
 */

#include "brw_shader.h"
#include "brw_builder.h"

/**
 * Split large virtual GRFs into separate components if we can.
 *
 * This pass aggressively splits VGRFs into chunks as small as possible,
 * down to single registers if it can. If no VGRFs can be split, we return
 * false so this pass can safely be used inside an optimization loop. We
 * want to split because virtual GRFs are what we register allocate and
 * spill (due to contiguousness requirements for some instructions), and
 * they're what we naturally generate in the codegen process, but most
 * virtual GRFs don't actually need to be contiguous sets of GRFs. If we
 * split, we'll end up with reduced live intervals and better dead code
 * elimination and coalescing.
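 *
 * As an illustration (a hypothetical case, not taken from a particular
 * shader): a temporary that a SEND-style message writes as one contiguous
 * block has to stay whole, while a temporary whose registers are only ever
 * read and written one at a time can be split into single-register VGRFs.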
 */
bool
brw_opt_split_virtual_grfs(brw_shader &s)
{
   /* Compact the register file so we eliminate dead vgrfs. This
    * only defines split points for live registers, so if we have
    * too large dead registers they will hit assertions later.
    */
   brw_opt_compact_virtual_grfs(s);

   unsigned num_vars = s.alloc.count;

   /* Count the total number of registers */
   unsigned reg_count = 0;
   unsigned *vgrf_to_reg = new unsigned[num_vars];
   for (unsigned i = 0; i < num_vars; i++) {
      vgrf_to_reg[i] = reg_count;
      reg_count += s.alloc.sizes[i];
   }

   /* An array of "split points". For each register slot, this indicates
    * if this slot can be separated from the previous slot. Every time an
    * instruction uses multiple elements of a register (as a source or
    * destination), we mark the used slots as inseparable. Then we go
    * through and split the registers into the smallest pieces we can.
    */
   bool *split_points = new bool[reg_count];
   memset(split_points, 0, reg_count * sizeof(*split_points));
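
   /* An illustrative example, assuming reg_unit() == 1: for a four-slot
    * VGRF whose last two slots are always accessed together by a single
    * two-register read, but never together with the first two slots, the
    * split points end up as { false, true, true, false }, so the pass
    * splits the VGRF into pieces of sizes 1, 1, and 2.
    */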
   /* Mark all used registers as fully splittable following the physical
    * register size.
    */
   const unsigned reg_inc = reg_unit(s.devinfo);
   foreach_block_and_inst(block, brw_inst, inst, s.cfg) {
      if (inst->dst.file == VGRF) {
         unsigned reg = vgrf_to_reg[inst->dst.nr];
         for (unsigned j = reg_inc; j < s.alloc.sizes[inst->dst.nr]; j += reg_inc)
            split_points[reg + j] = true;
      }

      for (unsigned i = 0; i < inst->sources; i++) {
         if (inst->src[i].file == VGRF) {
            unsigned reg = vgrf_to_reg[inst->src[i].nr];
            for (unsigned j = reg_inc; j < s.alloc.sizes[inst->src[i].nr]; j += reg_inc)
               split_points[reg + j] = true;
         }
      }
   }
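
   /* Now walk the instructions again and clear the split points inside any
    * multi-register read or write, since those slots have to stay
    * contiguous with the slot that precedes them.
    */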
   foreach_block_and_inst(block, brw_inst, inst, s.cfg) {
      /* We fix up undef instructions later */
      if (inst->opcode == SHADER_OPCODE_UNDEF) {
         assert(inst->dst.file == VGRF);
         continue;
      }

      if (inst->dst.file == VGRF) {
         unsigned reg = vgrf_to_reg[inst->dst.nr] + inst->dst.offset / REG_SIZE;
         for (unsigned j = 1; j < regs_written(inst); j++)
            split_points[reg + j] = false;
      }

      for (unsigned i = 0; i < inst->sources; i++) {
         if (inst->src[i].file == VGRF) {
            unsigned reg = vgrf_to_reg[inst->src[i].nr] + inst->src[i].offset / REG_SIZE;
            for (unsigned j = 1; j < regs_read(s.devinfo, inst, i); j++)
               split_points[reg + j] = false;
         }
      }
   }

   /* Bitset of which registers have been split */
   bool *vgrf_has_split = new bool[num_vars];
   memset(vgrf_has_split, 0, num_vars * sizeof(*vgrf_has_split));

   unsigned *new_virtual_grf = new unsigned[reg_count];
   unsigned *new_reg_offset = new unsigned[reg_count];
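
   /* The loop below fills new_virtual_grf[] and new_reg_offset[]: for each
    * old register slot, they record which (possibly freshly allocated) VGRF
    * the slot lands in and at which register offset within it.
    */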
   unsigned reg = 0;
   bool has_splits = false;
   for (unsigned i = 0; i < num_vars; i++) {
      /* The first one should always be false as a quick sanity check. */
      assert(split_points[reg] == false);

      /* j = 0 case */
      new_reg_offset[reg] = 0;
      reg++;
      unsigned offset = 1;

      /* j > 0 case */
      for (unsigned j = 1; j < s.alloc.sizes[i]; j++) {
         /* If this is a split point, reset the offset to 0 and allocate a
          * new virtual GRF for the previous `offset` registers.
          */
         if (split_points[reg]) {
            has_splits = true;
            vgrf_has_split[i] = true;

            unsigned grf = brw_allocate_vgrf_units(s, offset).nr;
            for (unsigned k = reg - offset; k < reg; k++)
               new_virtual_grf[k] = grf;
            offset = 0;
         }
         new_reg_offset[reg] = offset;
         offset++;
         reg++;
      }

      /* The last one gets the original register number */
      s.alloc.sizes[i] = offset;
      for (unsigned k = reg - offset; k < reg; k++)
         new_virtual_grf[k] = i;
   }
   assert(reg == reg_count);

   bool progress;
   if (!has_splits) {
      progress = false;
      goto cleanup;
   }
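
   /* Rewrite the instruction stream. A SHADER_OPCODE_UNDEF whose VGRF was
    * split is re-emitted as one UNDEF per new piece; every other VGRF use
    * is simply remapped to its new register number and offset.
    */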
   foreach_block_and_inst_safe(block, brw_inst, inst, s.cfg) {
      if (inst->opcode == SHADER_OPCODE_UNDEF) {
         assert(inst->dst.file == VGRF);
         if (vgrf_has_split[inst->dst.nr]) {
            const brw_builder ibld(inst);
            assert(inst->size_written % REG_SIZE == 0);
            unsigned reg_offset = inst->dst.offset / REG_SIZE;

            unsigned size_written = 0;
            while (size_written < inst->size_written) {
               reg = vgrf_to_reg[inst->dst.nr] + reg_offset + size_written / REG_SIZE;

               brw_inst *undef =
                  ibld.UNDEF(
                     byte_offset(brw_vgrf(new_virtual_grf[reg], inst->dst.type),
                                 new_reg_offset[reg] * REG_SIZE));
               undef->size_written =
                  MIN2(inst->size_written - size_written, undef->size_written);
               assert(undef->size_written % REG_SIZE == 0);
               size_written += undef->size_written;
            }
            inst->remove();
         } else {
            reg = vgrf_to_reg[inst->dst.nr];
            assert(new_reg_offset[reg] == 0);
            assert(new_virtual_grf[reg] == inst->dst.nr);
         }
         continue;
      }

      if (inst->dst.file == VGRF) {
         reg = vgrf_to_reg[inst->dst.nr] + inst->dst.offset / REG_SIZE;
         if (vgrf_has_split[inst->dst.nr]) {
            inst->dst.nr = new_virtual_grf[reg];
            inst->dst.offset = new_reg_offset[reg] * REG_SIZE +
                               inst->dst.offset % REG_SIZE;
            assert(new_reg_offset[reg] < s.alloc.sizes[new_virtual_grf[reg]]);
         } else {
            assert(new_reg_offset[reg] == inst->dst.offset / REG_SIZE);
            assert(new_virtual_grf[reg] == inst->dst.nr);
         }
      }

      for (unsigned i = 0; i < inst->sources; i++) {
         if (inst->src[i].file != VGRF)
            continue;

         reg = vgrf_to_reg[inst->src[i].nr] + inst->src[i].offset / REG_SIZE;
         if (vgrf_has_split[inst->src[i].nr]) {
            inst->src[i].nr = new_virtual_grf[reg];
            inst->src[i].offset = new_reg_offset[reg] * REG_SIZE +
                                  inst->src[i].offset % REG_SIZE;
            assert(new_reg_offset[reg] < s.alloc.sizes[new_virtual_grf[reg]]);
         } else {
            assert(new_reg_offset[reg] == inst->src[i].offset / REG_SIZE);
            assert(new_virtual_grf[reg] == inst->src[i].nr);
         }
      }
   }

   s.invalidate_analysis(BRW_DEPENDENCY_INSTRUCTION_DETAIL |
                         BRW_DEPENDENCY_VARIABLES);

   progress = true;

cleanup:
   delete[] split_points;
   delete[] vgrf_has_split;
   delete[] new_virtual_grf;
   delete[] new_reg_offset;
   delete[] vgrf_to_reg;

   return progress;
}

/**
 * Remove unused virtual GRFs and compact the vgrf_* arrays.
 *
 * During code generation, we create tons of temporary variables, many of
 * which get immediately killed and are never used again. Yet, in later
 * optimization and analysis passes, such as compute_live_intervals, we need
 * to loop over all the virtual GRFs. Compacting them can save a lot of
 * overhead.
 */
bool
brw_opt_compact_virtual_grfs(brw_shader &s)
{
   bool progress = false;

   int *remap_table = new int[s.alloc.count];
   memset(remap_table, -1, s.alloc.count * sizeof(int));
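
   /* remap_table[i] will hold the new index of VGRF i once compaction is
    * done, or stay -1 if the VGRF is never referenced and can be dropped.
    */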
   /* Mark which virtual GRFs are used. */
   foreach_block_and_inst(block, const brw_inst, inst, s.cfg) {
      if (inst->dst.file == VGRF)
         remap_table[inst->dst.nr] = 0;

      for (int i = 0; i < inst->sources; i++) {
         if (inst->src[i].file == VGRF)
            remap_table[inst->src[i].nr] = 0;
      }
   }

   /* Compact the GRF arrays. */
   int new_index = 0;
   for (unsigned i = 0; i < s.alloc.count; i++) {
      if (remap_table[i] == -1) {
         /* We just found an unused register. This means that we are
          * actually going to compact something.
          */
         progress = true;
      } else {
         remap_table[i] = new_index;
         s.alloc.sizes[new_index] = s.alloc.sizes[i];
         s.invalidate_analysis(BRW_DEPENDENCY_INSTRUCTION_DETAIL |
                               BRW_DEPENDENCY_VARIABLES);
         ++new_index;
      }
   }

   s.alloc.count = new_index;

   /* Patch all the instructions to use the newly renumbered registers */
   foreach_block_and_inst(block, brw_inst, inst, s.cfg) {
      if (inst->dst.file == VGRF)
         inst->dst.nr = remap_table[inst->dst.nr];

      for (int i = 0; i < inst->sources; i++) {
         if (inst->src[i].file == VGRF)
            inst->src[i].nr = remap_table[inst->src[i].nr];
      }
   }

   /* Patch all the references to delta_xy, since they're used in register
    * allocation. If they're unused, switch them to BAD_FILE so we don't
    * think some random VGRF is delta_xy.
    */
   for (unsigned i = 0; i < ARRAY_SIZE(s.delta_xy); i++) {
      if (s.delta_xy[i].file == VGRF) {
         if (remap_table[s.delta_xy[i].nr] != -1) {
            s.delta_xy[i].nr = remap_table[s.delta_xy[i].nr];
         } else {
            s.delta_xy[i].file = BAD_FILE;
         }
      }
   }

   delete[] remap_table;

   return progress;
}