aco: rematerialize constants in every basic block during optimizer

Totals from 16837 (21.25% of 79242) affected shaders: (GFX11)

MaxWaves: 441634 -> 444546 (+0.66%); split: +0.66%, -0.00%
Instrs: 25908303 -> 25838469 (-0.27%); split: -0.36%, +0.09%
CodeSize: 133943168 -> 135446948 (+1.12%); split: -0.04%, +1.16%
VGPRs: 985332 -> 977440 (-0.80%); split: -0.83%, +0.03%
SpillSGPRs: 9133 -> 7535 (-17.50%); split: -17.74%, +0.24%
SpillVGPRs: 1418 -> 1359 (-4.16%); split: -4.58%, +0.42%
Scratch: 5047552 -> 5040640 (-0.14%)
Latency: 204330340 -> 204179212 (-0.07%); split: -0.32%, +0.25%
InvThroughput: 36584220 -> 36508856 (-0.21%); split: -0.40%, +0.19%
VClause: 437847 -> 437344 (-0.11%); split: -0.34%, +0.22%
SClause: 771311 -> 771013 (-0.04%); split: -0.42%, +0.38%
Copies: 1774950 -> 1712070 (-3.54%); split: -4.46%, +0.91%
Branches: 580595 -> 580478 (-0.02%); split: -0.03%, +0.01%
PreSGPRs: 877017 -> 817549 (-6.78%)
PreVGPRs: 852747 -> 846966 (-0.68%); split: -0.68%, +0.00%
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26875>
This commit is contained in:
Daniel Schürmann 2024-01-04 15:50:10 +01:00 committed by Marge Bot
parent 9baa57158d
commit 61854009f3

View file

@ -521,7 +521,7 @@ struct opt_ctx {
Program* program;
float_mode fp_mode;
std::vector<aco_ptr<Instruction>> instructions;
ssa_info* info;
std::vector<ssa_info> info;
std::pair<uint32_t, Temp> last_literal;
std::vector<mad_info> mad_infos;
std::vector<uint16_t> uses;
@ -3116,6 +3116,7 @@ combine_add_sub_b2i(opt_ctx& ctx, aco_ptr<Instruction>& instr, aco_opcode new_op
* uses properly initialized to 0.
*/
ctx.uses.push_back(0);
ctx.info.push_back(ssa_info{});
}
new_instr->operands[0] = Operand::zero();
new_instr->operands[1] = instr->operands[!i];
@ -4616,6 +4617,91 @@ combine_instruction(opt_ctx& ctx, aco_ptr<Instruction>& instr)
}
}
/**
 * Bookkeeping for one tracked constant during rematerialization.
 * Member order matters: entries are created via aggregate initialization.
 */
struct remat_entry {
   /* Most recent copy instruction that materializes the constant. */
   Instruction* instr;
   /* Index of the block which contains that instruction. */
   uint32_t block;
};
/**
 * Returns true if instr is a p_parallelcopy with exactly one operand,
 * where that operand is a constant and the first definition is a temporary.
 */
inline bool
is_constant(Instruction* instr)
{
   const bool is_single_copy =
      instr->opcode == aco_opcode::p_parallelcopy && instr->operands.size() == 1;

   /* Short-circuiting keeps the operand/definition accesses behind the shape check. */
   return is_single_copy && instr->operands[0].isConstant() && instr->definitions[0].isTemp();
}
/**
 * Redirects every temporary operand of instr that is tracked in constants
 * to a copy of the constant which was emitted in block block_idx.
 *
 * If the tracked copy belongs to a different block, a new copy is appended
 * to ctx.instructions and replaces the tracked entry. ctx.uses and ctx.info
 * each grow by one element alongside it.
 */
void
remat_constants_instr(opt_ctx& ctx, aco::map<Temp, remat_entry>& constants, Instruction* instr,
                      uint32_t block_idx)
{
   for (Operand& op : instr->operands) {
      if (!op.isTemp())
         continue;

      auto found = constants.find(op.getTemp());
      if (found == constants.end())
         continue;

      remat_entry& entry = found->second;

      /* The tracked copy lives in another block: emit a fresh one here. */
      if (entry.block != block_idx) {
         Builder bld(ctx.program, &ctx.instructions);
         Operand value = entry.instr->operands[0];
         entry.instr = bld.copy(bld.def(op.regClass()), value);
         entry.block = block_idx;
         /* Presumably keeps the per-temporary tables in step with the definition
          * created above: zero uses so far, info copied from the old temporary.
          */
         ctx.uses.push_back(0);
         ctx.info.push_back(ctx.info[op.tempId()]);
      }

      /* Switch the operand to the block-local copy and move one use over to it. */
      Temp local_tmp = entry.instr->definitions[0].getTemp();
      if (op.getTemp() != local_tmp) {
         ctx.uses[op.tempId()]--;
         op.setTemp(local_tmp);
         ctx.uses[op.tempId()]++;
      }
   }
}
/**
 * Simple constant rematerialization.
 * Common subexpression elimination (CSE) may stretch the live-ranges of
 * loaded constants across large distances. This pass shortens them again
 * by re-emitting constants in every basic block.
 */
void
rematerialize_constants(opt_ctx& ctx)
{
   aco::monotonic_buffer_resource memory(1024);
   aco::map<Temp, remat_entry> constants(memory);

   for (Block& block : ctx.program->blocks) {
      /* Blocks without a logical immediate dominator are left untouched. */
      if (block.logical_idom == -1)
         continue;

      /* A block that is its own logical immediate dominator resets the tracked set. */
      if (block.logical_idom == static_cast<int>(block.index))
         constants.clear();

      ctx.instructions.reserve(block.instructions.size());

      for (aco_ptr<Instruction>& instr : block.instructions) {
         Instruction* cur = instr.get();

         /* Dead instructions are not carried over into the rebuilt block. */
         if (is_dead(ctx.uses, cur))
            continue;

         if (is_constant(cur)) {
            /* Remember (or refresh) where this constant was materialized. */
            constants[cur->definitions[0].getTemp()] = {cur, block.index};
         } else if (!is_phi(instr)) {
            /* Phi operands are never rewritten. */
            remat_constants_instr(ctx, constants, cur, block.index);
         }

         /* Appended after any copies that were emitted for this instruction. */
         ctx.instructions.emplace_back(instr.release());
      }

      block.instructions = std::move(ctx.instructions);
   }
}
bool
to_uniform_bool_instr(opt_ctx& ctx, aco_ptr<Instruction>& instr)
{
@ -5301,8 +5387,7 @@ optimize(Program* program)
{
opt_ctx ctx;
ctx.program = program;
std::vector<ssa_info> info(program->peekAllocationId());
ctx.info = info.data();
ctx.info = std::vector<ssa_info>(program->peekAllocationId());
/* 1. Bottom-Up DAG pass (forward) to label all ssa-defs */
for (Block& block : program->blocks) {
@ -5313,14 +5398,17 @@ optimize(Program* program)
ctx.uses = dead_code_analysis(program);
/* 2. Combine v_mad, omod, clamp and propagate sgpr on VALU instructions */
/* 2. Rematerialize constants in every block. */
rematerialize_constants(ctx);
/* 3. Combine v_mad, omod, clamp and propagate sgpr on VALU instructions */
for (Block& block : program->blocks) {
ctx.fp_mode = block.fp_mode;
for (aco_ptr<Instruction>& instr : block.instructions)
combine_instruction(ctx, instr);
}
/* 3. Top-Down DAG pass (backward) to select instructions (includes DCE) */
/* 4. Top-Down DAG pass (backward) to select instructions (includes DCE) */
for (auto block_rit = program->blocks.rbegin(); block_rit != program->blocks.rend();
++block_rit) {
Block* block = &(*block_rit);
@ -5330,7 +5418,7 @@ optimize(Program* program)
select_instruction(ctx, *instr_rit);
}
/* 4. Add literals to instructions */
/* 5. Add literals to instructions */
for (Block& block : program->blocks) {
ctx.instructions.reserve(block.instructions.size());
ctx.fp_mode = block.fp_mode;