From 2e56e2342094e8ec90afa5265b1c43503f662939 Mon Sep 17 00:00:00 2001 From: Rhys Perry Date: Thu, 8 Jul 2021 17:43:37 +0100 Subject: [PATCH] aco: make optimize_postRA() work across blocks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit fossil-db (Sienna Cichlid): Totals from 46 (0.03% of 150170) affected shaders: CodeSize: 103672 -> 103488 (-0.18%) Instrs: 21968 -> 21922 (-0.21%) Signed-off-by: Rhys Perry Reviewed-by: Timur Kristóf Part-of: --- src/amd/compiler/aco_optimizer_postRA.cpp | 109 +++++++++++++++------- 1 file changed, 76 insertions(+), 33 deletions(-) diff --git a/src/amd/compiler/aco_optimizer_postRA.cpp b/src/amd/compiler/aco_optimizer_postRA.cpp index d086eff7cef..e612292e822 100644 --- a/src/amd/compiler/aco_optimizer_postRA.cpp +++ b/src/amd/compiler/aco_optimizer_postRA.cpp @@ -34,26 +34,65 @@ namespace { constexpr const size_t max_reg_cnt = 512; -enum { - not_written_in_block = -1, - clobbered = -2, - const_or_undef = -3, - written_by_multiple_instrs = -4, +struct Idx { + bool operator==(const Idx& other) const { return block == other.block && instr == other.instr; } + bool operator!=(const Idx& other) const { return !operator==(other); } + + bool found() const { return block != UINT32_MAX; } + + uint32_t block; + uint32_t instr; }; +Idx not_written_in_block{UINT32_MAX, 0}; +Idx clobbered{UINT32_MAX, 1}; +Idx const_or_undef{UINT32_MAX, 2}; +Idx written_by_multiple_instrs{UINT32_MAX, 3}; + +bool +is_instr_after(Idx second, Idx first) +{ + if (first == not_written_in_block && second != not_written_in_block) + return true; + + if (!first.found() || !second.found()) + return false; + + return second.block > first.block || (second.block == first.block && second.instr > first.instr); +} + struct pr_opt_ctx { Program* program; Block* current_block; - int current_instr_idx; + uint32_t current_instr_idx; std::vector uses; - std::array instr_idx_by_regs; + std::vector> instr_idx_by_regs; void reset_block(Block* block) { current_block = block; - current_instr_idx = -1; - std::fill(instr_idx_by_regs.begin(), instr_idx_by_regs.end(), not_written_in_block); + current_instr_idx = 0; + + if ((block->kind & block_kind_loop_header) || block->linear_preds.empty()) { + std::fill(instr_idx_by_regs[block->index].begin(), instr_idx_by_regs[block->index].end(), + not_written_in_block); + } else { + unsigned first_pred = block->linear_preds[0]; + for (unsigned i = 0; i < max_reg_cnt; i++) { + bool all_same = std::all_of( + std::next(block->linear_preds.begin()), block->linear_preds.end(), + [&](unsigned pred) + { return instr_idx_by_regs[pred][i] == instr_idx_by_regs[first_pred][i]; }); + + if (all_same) + instr_idx_by_regs[block->index][i] = instr_idx_by_regs[first_pred][i]; + else + instr_idx_by_regs[block->index][i] = not_written_in_block; + } + } } + + Instruction* get(Idx idx) { return program->blocks[idx.block].instructions[idx.instr].get(); } }; void @@ -65,36 +104,38 @@ save_reg_writes(pr_opt_ctx& ctx, aco_ptr& instr) unsigned dw_size = DIV_ROUND_UP(def.bytes(), 4u); unsigned r = def.physReg().reg(); - int idx = ctx.current_instr_idx; + Idx idx{ctx.current_block->index, ctx.current_instr_idx}; if (def.regClass().is_subdword()) idx = clobbered; assert(def.size() == dw_size || def.regClass().is_subdword()); - std::fill(&ctx.instr_idx_by_regs[r], &ctx.instr_idx_by_regs[r + dw_size], idx); + std::fill(&ctx.instr_idx_by_regs[ctx.current_block->index][r], + &ctx.instr_idx_by_regs[ctx.current_block->index][r + dw_size], idx); } } -int +Idx last_writer_idx(pr_opt_ctx& ctx, PhysReg physReg, RegClass rc) { /* Verify that all of the operand's registers are written by the same instruction. */ - int instr_idx = ctx.instr_idx_by_regs[physReg.reg()]; + Idx instr_idx = ctx.instr_idx_by_regs[ctx.current_block->index][physReg.reg()]; unsigned dw_size = DIV_ROUND_UP(rc.bytes(), 4u); unsigned r = physReg.reg(); - bool all_same = std::all_of(&ctx.instr_idx_by_regs[r], &ctx.instr_idx_by_regs[r + dw_size], - [instr_idx](int i) { return i == instr_idx; }); + bool all_same = std::all_of(&ctx.instr_idx_by_regs[ctx.current_block->index][r], + &ctx.instr_idx_by_regs[ctx.current_block->index][r + dw_size], + [instr_idx](Idx i) { return i == instr_idx; }); return all_same ? instr_idx : written_by_multiple_instrs; } -int +Idx last_writer_idx(pr_opt_ctx& ctx, const Operand& op) { if (op.isConstant() || op.isUndefined()) return const_or_undef; - int instr_idx = ctx.instr_idx_by_regs[op.physReg().reg()]; + Idx instr_idx = ctx.instr_idx_by_regs[ctx.current_block->index][op.physReg().reg()]; #ifndef NDEBUG /* Debug mode: */ @@ -129,21 +170,22 @@ try_apply_branch_vcc(pr_opt_ctx& ctx, aco_ptr& instr) instr->operands[0].physReg() != scc) return; - int op0_instr_idx = last_writer_idx(ctx, instr->operands[0]); - int last_vcc_wr_idx = last_writer_idx(ctx, vcc, ctx.program->lane_mask); - int last_exec_wr_idx = last_writer_idx(ctx, exec, ctx.program->lane_mask); + Idx op0_instr_idx = last_writer_idx(ctx, instr->operands[0]); + Idx last_vcc_wr_idx = last_writer_idx(ctx, vcc, ctx.program->lane_mask); + Idx last_exec_wr_idx = last_writer_idx(ctx, exec, ctx.program->lane_mask); /* We need to make sure: * - the operand register used by the branch, and VCC were both written in the current block * - VCC was NOT written after the operand register * - EXEC is sane and was NOT written after the operand register */ - if (op0_instr_idx < 0 || last_vcc_wr_idx < 0 || last_vcc_wr_idx > op0_instr_idx || - last_exec_wr_idx > last_vcc_wr_idx || last_exec_wr_idx < not_written_in_block) + if (!op0_instr_idx.found() || !last_vcc_wr_idx.found() || + !is_instr_after(last_vcc_wr_idx, last_exec_wr_idx) || + !is_instr_after(op0_instr_idx, last_vcc_wr_idx)) return; - aco_ptr& op0_instr = ctx.current_block->instructions[op0_instr_idx]; - aco_ptr& last_vcc_wr = ctx.current_block->instructions[last_vcc_wr_idx]; + Instruction* op0_instr = ctx.get(op0_instr_idx); + Instruction* last_vcc_wr = ctx.get(last_vcc_wr_idx); if ((op0_instr->opcode != aco_opcode::s_and_b64 /* wave64 */ && op0_instr->opcode != aco_opcode::s_and_b32 /* wave32 */) || @@ -192,12 +234,12 @@ try_optimize_scc_nocompare(pr_opt_ctx& ctx, aco_ptr& instr) return; /* Make sure both SCC and Operand 0 are written by the same instruction. */ - int wr_idx = last_writer_idx(ctx, instr->operands[0]); - int sccwr_idx = last_writer_idx(ctx, scc, s1); - if (wr_idx < 0 || wr_idx != sccwr_idx) + Idx wr_idx = last_writer_idx(ctx, instr->operands[0]); + Idx sccwr_idx = last_writer_idx(ctx, scc, s1); + if (!wr_idx.found() || wr_idx != sccwr_idx) return; - aco_ptr& wr_instr = ctx.current_block->instructions[wr_idx]; + Instruction* wr_instr = ctx.get(wr_idx); if (!wr_instr->isSALU() || wr_instr->definitions.size() < 2 || wr_instr->definitions[1].physReg() != scc) return; @@ -259,11 +301,11 @@ try_optimize_scc_nocompare(pr_opt_ctx& ctx, aco_ptr& instr) scc_op_idx = 2; } - int wr_idx = last_writer_idx(ctx, instr->operands[scc_op_idx]); - if (wr_idx < 0) + Idx wr_idx = last_writer_idx(ctx, instr->operands[scc_op_idx]); + if (!wr_idx.found()) return; - aco_ptr& wr_instr = ctx.current_block->instructions[wr_idx]; + Instruction* wr_instr = ctx.get(wr_idx); /* Check if we found the pattern above. */ if (wr_instr->opcode != aco_opcode::s_cmp_eq_u32 && @@ -299,14 +341,14 @@ try_optimize_scc_nocompare(pr_opt_ctx& ctx, aco_ptr& instr) void process_instruction(pr_opt_ctx& ctx, aco_ptr& instr) { - ctx.current_instr_idx++; - try_apply_branch_vcc(ctx, instr); try_optimize_scc_nocompare(ctx, instr); if (instr) save_reg_writes(ctx, instr); + + ctx.current_instr_idx++; } } // namespace @@ -317,6 +359,7 @@ optimize_postRA(Program* program) pr_opt_ctx ctx; ctx.program = program; ctx.uses = dead_code_analysis(program); + ctx.instr_idx_by_regs.resize(program->blocks.size()); /* Forward pass * Goes through each instruction exactly once, and can transform