From d9cdb3524a3d4e2ee7c2600a1d6b6764cb9cb791 Mon Sep 17 00:00:00 2001
From: Rhys Perry
Date: Mon, 22 May 2023 16:32:00 +0100
Subject: [PATCH] aco: fix update_alu(clear=true) for exports

For:
   v_mov_b32_e32 v0, 1.0
   exp mrtz v0, off, off, off
we should completely remove the ALU entry before creating the EXP's WaR
entry for v0. Otherwise, the two will be combined into an entry which will
wait for expcnt(0) for later uses of v0.

gen_alu() should also be before gen(), since gen_alu() performs the clear
while gen() creates the WaR entry.

fossil-db (gfx1100):
Totals from 3589 (2.69% of 133428) affected shaders:
Instrs: 5591041 -> 5589047 (-0.04%); split: -0.04%, +0.00%
CodeSize: 28580840 -> 28572864 (-0.03%); split: -0.03%, +0.00%
Latency: 65427923 -> 65427543 (-0.00%); split: -0.00%, +0.00%
InvThroughput: 11109079 -> 11109065 (-0.00%); split: -0.00%, +0.00%

Signed-off-by: Rhys Perry
Reviewed-by: Georg Lehmann
Part-of:
---
 src/amd/compiler/aco_insert_waitcnt.cpp | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/src/amd/compiler/aco_insert_waitcnt.cpp b/src/amd/compiler/aco_insert_waitcnt.cpp
index 0a87cccde28..8a8c6949b38 100644
--- a/src/amd/compiler/aco_insert_waitcnt.cpp
+++ b/src/amd/compiler/aco_insert_waitcnt.cpp
@@ -495,11 +495,12 @@ force_waitcnt(wait_ctx& ctx, wait_imm& imm)
 void
 update_alu(wait_ctx& ctx, bool is_valu, bool is_trans, bool clear, int cycles)
 {
-   for (std::pair& e : ctx.gpr_map) {
-      wait_entry& entry = e.second;
+   std::map::iterator it = ctx.gpr_map.begin();
+   while (it != ctx.gpr_map.end()) {
+      wait_entry& entry = it->second;
 
       if (clear) {
-         entry.delay = alu_delay_info();
+         entry.remove_counter(counter_alu);
       } else {
          entry.delay.valu_instrs += is_valu ? 1 : 0;
          entry.delay.trans_instrs += is_trans ? 1 : 0;
@@ -508,7 +509,14 @@ update_alu(wait_ctx& ctx, bool is_valu, bool is_trans, bool clear, int cycles)
          entry.delay.trans_cycles -= cycles;
 
          entry.delay.fixup();
+         if (it->second.delay.empty())
+            entry.remove_counter(counter_alu);
       }
+
+      if (!entry.counters)
+         it = ctx.gpr_map.erase(it);
+      else
+         it++;
    }
 }
 
@@ -992,9 +1000,9 @@ handle_block(Program* program, Block& block, wait_ctx& ctx)
       memory_sync_info sync_info = get_sync_info(instr.get());
       kill(queued_imm, queued_delay, instr.get(), ctx, sync_info);
 
-      gen(instr.get(), ctx);
       if (program->gfx_level >= GFX11)
          gen_alu(instr.get(), ctx);
+      gen(instr.get(), ctx);
 
       if (instr->format != Format::PSEUDO_BARRIER && !is_wait && !is_delay_alu) {
          if (instr->isVINTERP_INREG() && queued_imm.exp != wait_imm::unset_counter) {