From 7b92e11e16ad1e947035c2af40d54076457e81e0 Mon Sep 17 00:00:00 2001 From: Rhys Perry Date: Mon, 5 Aug 2024 12:30:06 +0100 Subject: [PATCH] aco: forget valu delays after certain s_waitcnt_depctr/LDSDIR MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit fossil-db (navi31): Totals from 55242 (69.58% of 79395) affected shaders: Instrs: 40507666 -> 40138006 (-0.91%); split: -0.91%, +0.00% CodeSize: 212516104 -> 211025880 (-0.70%); split: -0.70%, +0.00% Latency: 281643258 -> 281628053 (-0.01%); split: -0.01%, +0.00% InvThroughput: 46370668 -> 46369637 (-0.00%); split: -0.00%, +0.00% Signed-off-by: Rhys Perry Reviewed-by: Georg Lehmann Acked-by: Daniel Schürmann Part-of: --- src/amd/compiler/aco_insert_NOPs.cpp | 21 ++++----------------- src/amd/compiler/aco_insert_delay_alu.cpp | 16 ++++++++++------ src/amd/compiler/aco_ir.cpp | 13 +++++++++++++ src/amd/compiler/aco_ir.h | 2 ++ 4 files changed, 29 insertions(+), 23 deletions(-) diff --git a/src/amd/compiler/aco_insert_NOPs.cpp b/src/amd/compiler/aco_insert_NOPs.cpp index 08856fd0359..d6044e138f5 100644 --- a/src/amd/compiler/aco_insert_NOPs.cpp +++ b/src/amd/compiler/aco_insert_NOPs.cpp @@ -1136,19 +1136,6 @@ test_vgpr_bitset(std::bitset<256>& set, Operand op) } /* GFX11 */ -unsigned -parse_vdst_wait(aco_ptr& instr) -{ - if (instr->isVMEM() || instr->isFlatLike() || instr->isDS() || instr->isEXP()) - return 0; - else if (instr->isLDSDIR()) - return instr->ldsdir().wait_vdst; - else if (instr->opcode == aco_opcode::s_waitcnt_depctr) - return (instr->salu().imm >> 12) & 0xf; - else - return 15; -} - struct LdsDirectVALUHazardGlobalState { unsigned wait_vdst = 15; PhysReg vgpr; @@ -1188,7 +1175,7 @@ handle_lds_direct_valu_hazard_instr(LdsDirectVALUHazardGlobalState& global_state block_state.num_valu++; } - if (parse_vdst_wait(instr) == 0) + if (parse_vdst_wait(instr.get()) == 0) return true; block_state.num_instrs++; @@ -1310,7 +1297,7 @@ handle_valu_partial_forwarding_hazard_instr(VALUPartialForwardingHazardGlobalSta } block_state.num_valu_since_read++; - } else if (parse_vdst_wait(instr) == 0) { + } else if (parse_vdst_wait(instr.get()) == 0) { return true; } @@ -1407,7 +1394,7 @@ handle_instruction_gfx11(State& state, NOP_ctx_gfx11& ctx, aco_ptr& ctx.has_Vcmpx = false; } - unsigned va_vdst = parse_vdst_wait(instr); + unsigned va_vdst = parse_vdst_wait(instr.get()); unsigned vm_vsrc = 7; unsigned sa_sdst = 1; @@ -1608,7 +1595,7 @@ handle_instruction_gfx11(State& state, NOP_ctx_gfx11& ctx, aco_ptr& bool has_vdst0_since_valu_instr(bool& global_state, unsigned& block_state, aco_ptr& pred) { - if (parse_vdst_wait(pred) == 0) + if (parse_vdst_wait(pred.get()) == 0) return true; if (--block_state == 0) { diff --git a/src/amd/compiler/aco_insert_delay_alu.cpp b/src/amd/compiler/aco_insert_delay_alu.cpp index baf1a5c5f58..286b304297e 100644 --- a/src/amd/compiler/aco_insert_delay_alu.cpp +++ b/src/amd/compiler/aco_insert_delay_alu.cpp @@ -192,6 +192,16 @@ update_alu(delay_ctx& ctx, bool is_valu, bool is_trans, int cycles) void kill_alu(alu_delay_info& delay, Instruction* instr, delay_ctx& ctx) { + if (parse_vdst_wait(instr) == 0) { + std::map::iterator it = ctx.gpr_map.begin(); + while (it != ctx.gpr_map.end()) { + alu_delay_info& entry = it->second; + entry.valu_instrs = alu_delay_info::valu_nop; + entry.trans_instrs = alu_delay_info::trans_nop; + it = it->second.fixup() ? ctx.gpr_map.erase(it) : std::next(it); + } + } + if (instr->isVALU() || instr->isSALU()) check_alu(ctx, delay, instr); @@ -213,12 +223,6 @@ kill_alu(alu_delay_info& delay, Instruction* instr, delay_ctx& ctx) void gen_alu(Instruction* instr, delay_ctx& ctx) { - if (instr->isEXP() || instr->isDS() || instr->isMIMG() || instr->isFlatLike() || - instr->isMUBUF() || instr->isMTBUF()) { - ctx.gpr_map.clear(); - return; - } - Instruction_cycle_info cycle_info = get_cycle_info(*ctx.program, *instr); bool is_valu = instr->isVALU(); bool is_trans = instr->isTrans(); diff --git a/src/amd/compiler/aco_ir.cpp b/src/amd/compiler/aco_ir.cpp index 089dedd7848..35ba6691139 100644 --- a/src/amd/compiler/aco_ir.cpp +++ b/src/amd/compiler/aco_ir.cpp @@ -1398,6 +1398,19 @@ get_vmem_type(enum amd_gfx_level gfx_level, Instruction* instr) return 0; } +unsigned +parse_vdst_wait(Instruction* instr) +{ + if (instr->isVMEM() || instr->isFlatLike() || instr->isDS() || instr->isEXP()) + return 0; + else if (instr->isLDSDIR()) + return instr->ldsdir().wait_vdst; + else if (instr->opcode == aco_opcode::s_waitcnt_depctr) + return (instr->salu().imm >> 12) & 0xf; + else + return 15; +} + bool dealloc_vgprs(Program* program) { diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h index 5100ebe33c7..19dc72deb62 100644 --- a/src/amd/compiler/aco_ir.h +++ b/src/amd/compiler/aco_ir.h @@ -1853,6 +1853,8 @@ enum vmem_type : uint8_t { */ uint8_t get_vmem_type(enum amd_gfx_level gfx_level, Instruction* instr); +unsigned parse_vdst_wait(Instruction* instr); + enum block_kind { /* uniform indicates that leaving this block, * all actives lanes stay active */