intel/brw: Lower VGRFs to FIXED_GRFs earlier

Moves the lowering of VGRFs into FIXED_GRFs from code generation to
(almost) right after register allocation.

This will (1) let later passes not worry about VGRFs (and what they
mean in a post-reg-alloc phase) and (2) make it easier to add certain
types of post-reg-alloc validation using the backend IR.

Note that a couple of passes still take advantage of seeing "allocated
VGRFs", so the lowering is performed after they run.

Reviewed-by: Ian Romanick <ian.d.romanick@intel.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/28604>
Authored by Caio Oliveira on 2024-04-04 16:03:34 -07:00; committed by Marge Bot
parent 5b3d4c757d
commit ff89e83178
4 changed files with 109 additions and 88 deletions
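
For orientation, here is a minimal, self-contained sketch (not part of the patch) of the pass ordering this change produces around allocate_registers. The pass names mirror the hunks below; fs_visitor_model and the stub bodies are made up for illustration.

/* Hypothetical stand-in for fs_visitor; it only models what the sketch needs. */
#include <cstdio>

struct fs_visitor_model { unsigned grf_used = 0; };

static void assign_regs(fs_visitor_model &s)              { s.grf_used = 128; std::puts("assign_regs: VGRFs get GRF numbers"); }
static void opt_bank_conflicts(fs_visitor_model &)        { std::puts("opt_bank_conflicts: still tells allocated VGRFs apart"); }
static void schedule_post_ra(fs_visitor_model &)          { std::puts("post-RA scheduling: still tells allocated VGRFs apart"); }
static void lower_vgrfs_to_fixed_grfs(fs_visitor_model &) { std::puts("lower VGRFs -> FIXED_GRFs (new location)"); }
static void generate_code(const fs_visitor_model &s)      { std::printf("generate_code: %u GRFs, only FIXED_GRF/ARF/IMM seen\n", s.grf_used); }

int main()
{
   fs_visitor_model s;
   assign_regs(s);                 /* register allocation assigns physical GRF numbers */
   opt_bank_conflicts(s);          /* wants the VGRF vs. already-fixed distinction */
   schedule_post_ra(s);            /* same */
   lower_vgrfs_to_fixed_grfs(s);   /* previously this happened during code generation */
   generate_code(s);               /* VGRF is now unreachable here */
}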


@@ -2940,6 +2940,18 @@ fs_visitor::allocate_registers(bool allow_spilling)
debug_optimizer(nir, "post_ra_alloc_scheduling", 96, 2);
/* Lowering VGRF to FIXED_GRF is currently done as a separate pass instead
* of part of assign_regs since both bank conflicts optimization and post
* RA scheduling take advantage of distinguishing references to registers
* that were allocated from references that were already fixed.
*
* TODO: Change the passes above, then move this lowering to be part of
* assign_regs.
*/
brw_fs_lower_vgrfs_to_fixed_grfs(*this);
debug_optimizer(nir, "lowered_vgrfs_to_fixed_grfs", 96, 3);
if (last_scratch > 0) {
ASSERTED unsigned max_scratch_size = 2 * 1024 * 1024;


@@ -608,6 +608,7 @@ bool brw_fs_lower_sends_overlapping_payload(fs_visitor &s);
bool brw_fs_lower_simd_width(fs_visitor &s);
bool brw_fs_lower_sub_sat(fs_visitor &s);
bool brw_fs_lower_uniform_pull_constant_loads(fs_visitor &s);
void brw_fs_lower_vgrfs_to_fixed_grfs(fs_visitor &s);
bool brw_fs_opt_algebraic(fs_visitor &s);
bool brw_fs_opt_bank_conflicts(fs_visitor &s);


@@ -64,80 +64,13 @@ brw_math_function(enum opcode op)
}
}
static enum brw_reg_file
brw_file_from_reg(fs_reg *reg)
{
switch (reg->file) {
case ARF:
return BRW_ARCHITECTURE_REGISTER_FILE;
case FIXED_GRF:
case VGRF:
return BRW_GENERAL_REGISTER_FILE;
case IMM:
return BRW_IMMEDIATE_VALUE;
case BAD_FILE:
case ATTR:
case UNIFORM:
unreachable("not reached");
}
return BRW_ARCHITECTURE_REGISTER_FILE;
}
static struct brw_reg
brw_reg_from_fs_reg(const struct intel_device_info *devinfo, fs_inst *inst,
fs_reg *reg, bool compressed)
fs_reg *reg)
{
struct brw_reg brw_reg;
switch (reg->file) {
case VGRF:
if (reg->stride == 0) {
brw_reg = brw_vec1_reg(brw_file_from_reg(reg), reg->nr, 0);
} else {
/* From the Haswell PRM:
*
* "VertStride must be used to cross GRF register boundaries. This
* rule implies that elements within a 'Width' cannot cross GRF
* boundaries."
*
* The maximum width value that could satisfy this restriction is:
*/
const unsigned reg_width = REG_SIZE / (reg->stride * type_sz(reg->type));
/* Because the hardware can only split source regions at a whole
* multiple of width during decompression (i.e. vertically), clamp
* the value obtained above to the physical execution size of a
* single decompressed chunk of the instruction:
*/
const unsigned phys_width = compressed ? inst->exec_size / 2 :
inst->exec_size;
const unsigned max_hw_width = 16;
/* XXX - The equation above is strictly speaking not correct on
* hardware that supports unbalanced GRF writes -- On Gfx9+
* each decompressed chunk of the instruction may have a
* different execution size when the number of components
* written to each destination GRF is not the same.
*/
if (reg->stride > 4) {
assert(reg != &inst->dst);
assert(reg->stride * type_sz(reg->type) <= REG_SIZE);
brw_reg = brw_vecn_reg(1, brw_file_from_reg(reg), reg->nr, 0);
brw_reg = stride(brw_reg, reg->stride, 1, 0);
} else {
const unsigned width = MIN3(reg_width, phys_width, max_hw_width);
brw_reg = brw_vecn_reg(width, brw_file_from_reg(reg), reg->nr, 0);
brw_reg = stride(brw_reg, width * reg->stride, width, reg->stride);
}
}
brw_reg = retype(brw_reg, reg->type);
brw_reg = byte_offset(brw_reg, reg->offset);
brw_reg.abs = reg->abs;
brw_reg.negate = reg->negate;
break;
case ARF:
case FIXED_GRF:
case IMM:
@@ -148,6 +81,7 @@ brw_reg_from_fs_reg(const struct intel_device_info *devinfo, fs_inst *inst,
/* Probably unused. */
brw_reg = brw_null_reg();
break;
case VGRF:
case ATTR:
case UNIFORM:
unreachable("not reached");
@@ -913,22 +847,6 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width,
if (unlikely(debug_flag))
disasm_annotate(disasm_info, inst, p->next_insn_offset);
/* If the instruction writes to more than one register, it needs to be
* explicitly marked as compressed on Gen <= 5. On Gen >= 6 the
* hardware figures out by itself what the right compression mode is,
* but we still need to know whether the instruction is compressed to
* set up the source register regions appropriately.
*
* XXX - This is wrong for instructions that write a single register but
* read more than one which should strictly speaking be treated as
* compressed. For instructions that don't write any registers it
* relies on the destination being a null register of the correct
* type and regioning so the instruction is considered compressed
* or not accordingly.
*/
const bool compressed =
inst->dst.component_size(inst->exec_size) > REG_SIZE;
if (devinfo->ver >= 20 && inst->group % 8 != 0) {
assert(inst->force_writemask_all);
assert(!inst->predicate && !inst->conditional_mod);
@@ -941,8 +859,7 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width,
}
for (unsigned int i = 0; i < inst->sources; i++) {
src[i] = brw_reg_from_fs_reg(devinfo, inst,
&inst->src[i], compressed);
src[i] = brw_reg_from_fs_reg(devinfo, inst, &inst->src[i]);
/* The accumulator result appears to get used for the
* conditional modifier generation. When negating a UD
* value, there is a 33rd bit generated for the sign in the
@@ -953,8 +870,7 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width,
inst->src[i].type != BRW_REGISTER_TYPE_UD ||
!inst->src[i].negate);
}
dst = brw_reg_from_fs_reg(devinfo, inst,
&inst->dst, compressed);
dst = brw_reg_from_fs_reg(devinfo, inst, &inst->dst);
brw_set_default_access_mode(p, BRW_ALIGN_1);
brw_set_default_predicate_control(p, inst->predicate);


@@ -612,3 +612,95 @@ brw_fs_lower_alu_restrictions(fs_visitor &s)
return progress;
}
static void
brw_fs_lower_vgrf_to_fixed_grf(const struct intel_device_info *devinfo, fs_inst *inst,
fs_reg *reg, bool compressed)
{
if (reg->file != VGRF)
return;
struct brw_reg new_reg;
if (reg->stride == 0) {
new_reg = brw_vec1_grf(reg->nr, 0);
} else if (reg->stride > 4) {
assert(reg != &inst->dst);
assert(reg->stride * type_sz(reg->type) <= REG_SIZE);
new_reg = brw_vecn_grf(1, reg->nr, 0);
new_reg = stride(new_reg, reg->stride, 1, 0);
} else {
/* From the Haswell PRM:
*
* "VertStride must be used to cross GRF register boundaries. This
* rule implies that elements within a 'Width' cannot cross GRF
* boundaries."
*
* The maximum width value that could satisfy this restriction is:
*/
const unsigned reg_width = REG_SIZE / (reg->stride * type_sz(reg->type));
/* Because the hardware can only split source regions at a whole
* multiple of width during decompression (i.e. vertically), clamp
* the value obtained above to the physical execution size of a
* single decompressed chunk of the instruction:
*/
const bool compressed = inst->dst.component_size(inst->exec_size) > REG_SIZE;
const unsigned phys_width = compressed ? inst->exec_size / 2 :
inst->exec_size;
/* XXX - The equation above is strictly speaking not correct on
* hardware that supports unbalanced GRF writes -- On Gfx9+
* each decompressed chunk of the instruction may have a
* different execution size when the number of components
* written to each destination GRF is not the same.
*/
const unsigned max_hw_width = 16;
const unsigned width = MIN3(reg_width, phys_width, max_hw_width);
new_reg = brw_vecn_grf(width, reg->nr, 0);
new_reg = stride(new_reg, width * reg->stride, width, reg->stride);
}
new_reg = retype(new_reg, reg->type);
new_reg = byte_offset(new_reg, reg->offset);
new_reg.abs = reg->abs;
new_reg.negate = reg->negate;
*reg = new_reg;
}
void
brw_fs_lower_vgrfs_to_fixed_grfs(fs_visitor &s)
{
assert(s.grf_used || !"Must be called after register allocation");
foreach_block_and_inst(block, fs_inst, inst, s.cfg) {
/* If the instruction writes to more than one register, it needs to be
* explicitly marked as compressed on Gen <= 5. On Gen >= 6 the
* hardware figures out by itself what the right compression mode is,
* but we still need to know whether the instruction is compressed to
* set up the source register regions appropriately.
*
* XXX - This is wrong for instructions that write a single register but
* read more than one which should strictly speaking be treated as
* compressed. For instructions that don't write any registers it
* relies on the destination being a null register of the correct
* type and regioning so the instruction is considered compressed
* or not accordingly.
*/
const bool compressed =
inst->dst.component_size(inst->exec_size) > REG_SIZE;
brw_fs_lower_vgrf_to_fixed_grf(s.devinfo, inst, &inst->dst, compressed);
for (int i = 0; i < inst->sources; i++) {
brw_fs_lower_vgrf_to_fixed_grf(s.devinfo, inst, &inst->src[i], compressed);
}
}
s.invalidate_analysis(DEPENDENCY_INSTRUCTION_DATA_FLOW |
DEPENDENCY_VARIABLES);
}
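
As a worked example of the regioning math in brw_fs_lower_vgrf_to_fixed_grf above, here is a standalone, hypothetical helper (pick_region and struct region are illustrative names, not Mesa API) that re-derives the <vstride;width,hstride> region for the common stride <= 4 case, assuming the usual 32-byte GRF.

#include <algorithm>
#include <cassert>
#include <cstdio>

constexpr unsigned REG_SIZE = 32;  /* bytes per GRF assumed here */

struct region { unsigned vstride, width, hstride; };

/* stride and type_size describe the VGRF access; exec_size and `compressed`
 * come from the instruction, exactly as in the pass above.
 */
static region pick_region(unsigned stride, unsigned type_size,
                          unsigned exec_size, bool compressed)
{
   assert(stride >= 1 && stride <= 4);
   /* Width may not cross a GRF boundary (Haswell PRM), so at most: */
   const unsigned reg_width = REG_SIZE / (stride * type_size);
   /* A compressed instruction is split vertically into two halves: */
   const unsigned phys_width = compressed ? exec_size / 2 : exec_size;
   const unsigned max_hw_width = 16;
   const unsigned width = std::min({reg_width, phys_width, max_hw_width});
   return { width * stride, width, stride };
}

int main()
{
   /* SIMD16 float with stride 1: the dst spans 64 bytes > REG_SIZE, so the
    * instruction counts as compressed and each half covers 8 channels.
    */
   region r = pick_region(1, 4, 16, true);
   std::printf("<%u;%u,%u>\n", r.vstride, r.width, r.hstride);

   /* SIMD8 16-bit source with stride 2: only 8 elements fit in one GRF. */
   r = pick_region(2, 2, 8, false);
   std::printf("<%u;%u,%u>\n", r.vstride, r.width, r.hstride);
}

This should print <8;8,1> and <16;8,2>, matching the regions the lowering pass would emit for those operands.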