aco: refactor and speed up dead code analysis

Assuming that no loop header phis are dead code,
we can perform the dead code analysis in a single iteration.

Totals from 25 (0.03% of 79330) affected shaders: (GFX11)

MaxWaves: 664 -> 662 (-0.30%)
Instrs: 487618 -> 488822 (+0.25%)
CodeSize: 2451548 -> 2459756 (+0.33%)
VGPRs: 1296 -> 1332 (+2.78%)
Latency: 2337256 -> 2338098 (+0.04%); split: -0.00%, +0.04%
InvThroughput: 560682 -> 576158 (+2.76%)
VClause: 15782 -> 15790 (+0.05%)
Copies: 37905 -> 38731 (+2.18%)
PreVGPRs: 1124 -> 1156 (+2.85%)
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26901>
This commit is contained in:
Daniel Schürmann 2024-01-05 08:22:36 +01:00 committed by Marge Bot
parent a37f43e422
commit dce695b24f
2 changed files with 33 additions and 41 deletions

View file

@ -30,51 +30,40 @@
/*
* Implements an analysis pass to determine the number of uses
* for each SSA-definition.
*
* This pass assumes that no loop header phis are dead code.
*/
namespace aco {
namespace {
/* Bookkeeping for the dead code analysis: a use counter per id, a liveness
 * flag per instruction, and the index of the block to visit next. */
struct dce_ctx {
   int current_block;                   /* initialised to the index of the last block */
   std::vector<uint16_t> uses;          /* sized by program->peekAllocationId(), zero-initialised */
   std::vector<std::vector<bool>> live; /* live[b] holds one flag per instruction of block b */

   dce_ctx(Program* program)
       : current_block(program->blocks.size() - 1), uses(program->peekAllocationId())
   {
      const size_t num_blocks = program->blocks.size();
      live.reserve(num_blocks);
      /* One flag vector per block, in block order; every flag starts out false. */
      for (size_t i = 0; i < num_blocks; ++i)
         live.emplace_back(program->blocks[i].instructions.size());
   }
};
void
process_block(dce_ctx& ctx, Block& block)
process_loop_header_phis(std::vector<uint16_t>& uses, Block& block)
{
std::vector<bool>& live = ctx.live[block.index];
assert(live.size() == block.instructions.size());
bool process_predecessors = false;
for (int idx = block.instructions.size() - 1; idx >= 0; idx--) {
if (live[idx])
continue;
aco_ptr<Instruction>& instr = block.instructions[idx];
if (!is_dead(ctx.uses, instr.get())) {
for (const Operand& op : instr->operands) {
if (op.isTemp()) {
if (ctx.uses[op.tempId()] == 0)
process_predecessors = true;
ctx.uses[op.tempId()]++;
}
}
live[idx] = true;
for (aco_ptr<Instruction>& instr : block.instructions) {
if (!is_phi(instr))
return;
for (const Operand& op : instr->operands) {
if (op.isTemp())
uses[op.tempId()]++;
}
}
}
if (process_predecessors) {
for (unsigned pred_idx : block.linear_preds)
ctx.current_block = std::max(ctx.current_block, (int)pred_idx);
void
process_block(std::vector<uint16_t>& uses, Block& block)
{
for (auto it = block.instructions.rbegin(); it != block.instructions.rend(); it++) {
aco_ptr<Instruction>& instr = *it;
if ((block.kind & block_kind_loop_header) && is_phi(instr))
break;
if (!is_dead(uses, instr.get())) {
for (const Operand& op : instr->operands) {
if (op.isTemp())
uses[op.tempId()]++;
}
}
}
}
@ -83,15 +72,17 @@ process_block(dce_ctx& ctx, Block& block)
std::vector<uint16_t>
dead_code_analysis(Program* program)
{
std::vector<uint16_t> uses(program->peekAllocationId());
dce_ctx ctx(program);
while (ctx.current_block >= 0) {
unsigned next_block = ctx.current_block--;
process_block(ctx, program->blocks[next_block]);
for (Block& block : program->blocks) {
if (block.kind & block_kind_loop_header)
process_loop_header_phis(uses, block);
}
return ctx.uses;
for (auto it = program->blocks.rbegin(); it != program->blocks.rend(); it++)
process_block(uses, *it);
return uses;
}
} // namespace aco

View file

@ -251,6 +251,7 @@ add_coupling_code(exec_ctx& ctx, Block* block, std::vector<aco_ptr<Instruction>>
/* create ssa name for restore mask */
if (info.has_divergent_break) {
// TODO: this phi is unnecessary if we end WQM immediately after the loop
/* this phi might be trivial but ensures a parallelcopy on the loop header */
aco_ptr<Pseudo_instruction> phi{create_instruction<Pseudo_instruction>(
aco_opcode::p_linear_phi, Format::PSEUDO, preds.size(), 1)};