aco: make optimize_postRA() work across blocks

fossil-db (Sienna Cichlid):
Totals from 46 (0.03% of 150170) affected shaders:
CodeSize: 103672 -> 103488 (-0.18%)
Instrs: 21968 -> 21922 (-0.21%)

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/11924>
This commit is contained in:
Rhys Perry 2021-07-08 17:43:37 +01:00 committed by Marge Bot
parent 1d894a8c85
commit 2e56e23420

View file

@ -34,26 +34,65 @@ namespace {
constexpr const size_t max_reg_cnt = 512;
enum {
not_written_in_block = -1,
clobbered = -2,
const_or_undef = -3,
written_by_multiple_instrs = -4,
struct Idx {
bool operator==(const Idx& other) const { return block == other.block && instr == other.instr; }
bool operator!=(const Idx& other) const { return !operator==(other); }
bool found() const { return block != UINT32_MAX; }
uint32_t block;
uint32_t instr;
};
Idx not_written_in_block{UINT32_MAX, 0};
Idx clobbered{UINT32_MAX, 1};
Idx const_or_undef{UINT32_MAX, 2};
Idx written_by_multiple_instrs{UINT32_MAX, 3};
bool
is_instr_after(Idx second, Idx first)
{
if (first == not_written_in_block && second != not_written_in_block)
return true;
if (!first.found() || !second.found())
return false;
return second.block > first.block || (second.block == first.block && second.instr > first.instr);
}
struct pr_opt_ctx {
Program* program;
Block* current_block;
int current_instr_idx;
uint32_t current_instr_idx;
std::vector<uint16_t> uses;
std::array<int, max_reg_cnt * 4u> instr_idx_by_regs;
std::vector<std::array<Idx, max_reg_cnt>> instr_idx_by_regs;
void reset_block(Block* block)
{
current_block = block;
current_instr_idx = -1;
std::fill(instr_idx_by_regs.begin(), instr_idx_by_regs.end(), not_written_in_block);
current_instr_idx = 0;
if ((block->kind & block_kind_loop_header) || block->linear_preds.empty()) {
std::fill(instr_idx_by_regs[block->index].begin(), instr_idx_by_regs[block->index].end(),
not_written_in_block);
} else {
unsigned first_pred = block->linear_preds[0];
for (unsigned i = 0; i < max_reg_cnt; i++) {
bool all_same = std::all_of(
std::next(block->linear_preds.begin()), block->linear_preds.end(),
[&](unsigned pred)
{ return instr_idx_by_regs[pred][i] == instr_idx_by_regs[first_pred][i]; });
if (all_same)
instr_idx_by_regs[block->index][i] = instr_idx_by_regs[first_pred][i];
else
instr_idx_by_regs[block->index][i] = not_written_in_block;
}
}
}
Instruction* get(Idx idx) { return program->blocks[idx.block].instructions[idx.instr].get(); }
};
void
@ -65,36 +104,38 @@ save_reg_writes(pr_opt_ctx& ctx, aco_ptr<Instruction>& instr)
unsigned dw_size = DIV_ROUND_UP(def.bytes(), 4u);
unsigned r = def.physReg().reg();
int idx = ctx.current_instr_idx;
Idx idx{ctx.current_block->index, ctx.current_instr_idx};
if (def.regClass().is_subdword())
idx = clobbered;
assert(def.size() == dw_size || def.regClass().is_subdword());
std::fill(&ctx.instr_idx_by_regs[r], &ctx.instr_idx_by_regs[r + dw_size], idx);
std::fill(&ctx.instr_idx_by_regs[ctx.current_block->index][r],
&ctx.instr_idx_by_regs[ctx.current_block->index][r + dw_size], idx);
}
}
int
Idx
last_writer_idx(pr_opt_ctx& ctx, PhysReg physReg, RegClass rc)
{
/* Verify that all of the operand's registers are written by the same instruction. */
int instr_idx = ctx.instr_idx_by_regs[physReg.reg()];
Idx instr_idx = ctx.instr_idx_by_regs[ctx.current_block->index][physReg.reg()];
unsigned dw_size = DIV_ROUND_UP(rc.bytes(), 4u);
unsigned r = physReg.reg();
bool all_same = std::all_of(&ctx.instr_idx_by_regs[r], &ctx.instr_idx_by_regs[r + dw_size],
[instr_idx](int i) { return i == instr_idx; });
bool all_same = std::all_of(&ctx.instr_idx_by_regs[ctx.current_block->index][r],
&ctx.instr_idx_by_regs[ctx.current_block->index][r + dw_size],
[instr_idx](Idx i) { return i == instr_idx; });
return all_same ? instr_idx : written_by_multiple_instrs;
}
int
Idx
last_writer_idx(pr_opt_ctx& ctx, const Operand& op)
{
if (op.isConstant() || op.isUndefined())
return const_or_undef;
int instr_idx = ctx.instr_idx_by_regs[op.physReg().reg()];
Idx instr_idx = ctx.instr_idx_by_regs[ctx.current_block->index][op.physReg().reg()];
#ifndef NDEBUG
/* Debug mode: */
@ -129,21 +170,22 @@ try_apply_branch_vcc(pr_opt_ctx& ctx, aco_ptr<Instruction>& instr)
instr->operands[0].physReg() != scc)
return;
int op0_instr_idx = last_writer_idx(ctx, instr->operands[0]);
int last_vcc_wr_idx = last_writer_idx(ctx, vcc, ctx.program->lane_mask);
int last_exec_wr_idx = last_writer_idx(ctx, exec, ctx.program->lane_mask);
Idx op0_instr_idx = last_writer_idx(ctx, instr->operands[0]);
Idx last_vcc_wr_idx = last_writer_idx(ctx, vcc, ctx.program->lane_mask);
Idx last_exec_wr_idx = last_writer_idx(ctx, exec, ctx.program->lane_mask);
/* We need to make sure:
* - the operand register used by the branch, and VCC were both written in the current block
* - VCC was NOT written after the operand register
* - EXEC is sane and was NOT written after the operand register
*/
if (op0_instr_idx < 0 || last_vcc_wr_idx < 0 || last_vcc_wr_idx > op0_instr_idx ||
last_exec_wr_idx > last_vcc_wr_idx || last_exec_wr_idx < not_written_in_block)
if (!op0_instr_idx.found() || !last_vcc_wr_idx.found() ||
!is_instr_after(last_vcc_wr_idx, last_exec_wr_idx) ||
!is_instr_after(op0_instr_idx, last_vcc_wr_idx))
return;
aco_ptr<Instruction>& op0_instr = ctx.current_block->instructions[op0_instr_idx];
aco_ptr<Instruction>& last_vcc_wr = ctx.current_block->instructions[last_vcc_wr_idx];
Instruction* op0_instr = ctx.get(op0_instr_idx);
Instruction* last_vcc_wr = ctx.get(last_vcc_wr_idx);
if ((op0_instr->opcode != aco_opcode::s_and_b64 /* wave64 */ &&
op0_instr->opcode != aco_opcode::s_and_b32 /* wave32 */) ||
@ -192,12 +234,12 @@ try_optimize_scc_nocompare(pr_opt_ctx& ctx, aco_ptr<Instruction>& instr)
return;
/* Make sure both SCC and Operand 0 are written by the same instruction. */
int wr_idx = last_writer_idx(ctx, instr->operands[0]);
int sccwr_idx = last_writer_idx(ctx, scc, s1);
if (wr_idx < 0 || wr_idx != sccwr_idx)
Idx wr_idx = last_writer_idx(ctx, instr->operands[0]);
Idx sccwr_idx = last_writer_idx(ctx, scc, s1);
if (!wr_idx.found() || wr_idx != sccwr_idx)
return;
aco_ptr<Instruction>& wr_instr = ctx.current_block->instructions[wr_idx];
Instruction* wr_instr = ctx.get(wr_idx);
if (!wr_instr->isSALU() || wr_instr->definitions.size() < 2 ||
wr_instr->definitions[1].physReg() != scc)
return;
@ -259,11 +301,11 @@ try_optimize_scc_nocompare(pr_opt_ctx& ctx, aco_ptr<Instruction>& instr)
scc_op_idx = 2;
}
int wr_idx = last_writer_idx(ctx, instr->operands[scc_op_idx]);
if (wr_idx < 0)
Idx wr_idx = last_writer_idx(ctx, instr->operands[scc_op_idx]);
if (!wr_idx.found())
return;
aco_ptr<Instruction>& wr_instr = ctx.current_block->instructions[wr_idx];
Instruction* wr_instr = ctx.get(wr_idx);
/* Check if we found the pattern above. */
if (wr_instr->opcode != aco_opcode::s_cmp_eq_u32 &&
@ -299,14 +341,14 @@ try_optimize_scc_nocompare(pr_opt_ctx& ctx, aco_ptr<Instruction>& instr)
void
process_instruction(pr_opt_ctx& ctx, aco_ptr<Instruction>& instr)
{
ctx.current_instr_idx++;
try_apply_branch_vcc(ctx, instr);
try_optimize_scc_nocompare(ctx, instr);
if (instr)
save_reg_writes(ctx, instr);
ctx.current_instr_idx++;
}
} // namespace
@ -317,6 +359,7 @@ optimize_postRA(Program* program)
pr_opt_ctx ctx;
ctx.program = program;
ctx.uses = dead_code_analysis(program);
ctx.instr_idx_by_regs.resize(program->blocks.size());
/* Forward pass
* Goes through each instruction exactly once, and can transform