From 47e0f468cf238089e16a804236e49c674abc5303 Mon Sep 17 00:00:00 2001 From: Rhys Perry Date: Thu, 25 Jul 2024 21:40:02 +0100 Subject: [PATCH] aco: workaround VALUReadSGPRHazard fossil-db (gfx1200): Totals from 65112 (82.01% of 79395) affected shaders: Instrs: 41732906 -> 42987198 (+3.01%); split: -0.00%, +3.01% CodeSize: 222451964 -> 226942644 (+2.02%); split: -0.01%, +2.03% Latency: 290411063 -> 290944688 (+0.18%); split: -0.00%, +0.18% InvThroughput: 45854913 -> 45910275 (+0.12%); split: -0.00%, +0.12% Signed-off-by: Rhys Perry Reviewed-by: Georg Lehmann Part-of: --- src/amd/compiler/aco_insert_NOPs.cpp | 91 +++++++++++++++++++++++++--- 1 file changed, 84 insertions(+), 7 deletions(-) diff --git a/src/amd/compiler/aco_insert_NOPs.cpp b/src/amd/compiler/aco_insert_NOPs.cpp index 8f9bc97d458..5bbf3f39a00 100644 --- a/src/amd/compiler/aco_insert_NOPs.cpp +++ b/src/amd/compiler/aco_insert_NOPs.cpp @@ -273,6 +273,10 @@ struct NOP_ctx_gfx11 { /* WMMAHazards */ std::bitset<256> vgpr_written_by_wmma; + /* VALUReadSGPRHazard */ + std::bitset sgpr_read_by_valu; /* SGPR pairs, excluding null, exec, m0 and scc */ + CounterMap<0, m0.reg(), 11> sgpr_read_by_valu_then_wr_by_salu; + void join(const NOP_ctx_gfx11& other) { has_Vcmpx |= other.has_Vcmpx; @@ -287,6 +291,8 @@ struct NOP_ctx_gfx11 { sgpr_read_by_valu_as_lanemask_then_wr_by_salu |= other.sgpr_read_by_valu_as_lanemask_then_wr_by_salu; vgpr_written_by_wmma |= other.vgpr_written_by_wmma; + sgpr_read_by_valu |= other.sgpr_read_by_valu; + sgpr_read_by_valu_then_wr_by_salu.join_min(other.sgpr_read_by_valu_then_wr_by_salu); } bool operator==(const NOP_ctx_gfx11& other) @@ -302,7 +308,9 @@ struct NOP_ctx_gfx11 { sgpr_read_by_valu_as_lanemask == other.sgpr_read_by_valu_as_lanemask && sgpr_read_by_valu_as_lanemask_then_wr_by_salu == other.sgpr_read_by_valu_as_lanemask_then_wr_by_salu && - vgpr_written_by_wmma == other.vgpr_written_by_wmma; + vgpr_written_by_wmma == other.vgpr_written_by_wmma && + sgpr_read_by_valu == other.sgpr_read_by_valu && + sgpr_read_by_valu_then_wr_by_salu == other.sgpr_read_by_valu_then_wr_by_salu; } }; @@ -1514,6 +1522,48 @@ handle_instruction_gfx11(State& state, NOP_ctx_gfx11& ctx, aco_ptr& } } } + } else { + /* VALUReadSGPRHazard + * VALU reads SGPR and later written by SALU cannot safely be read by VALU/SALU. + */ + if (instr->isVALU() || instr->isSALU()) { + unsigned expiry_count = instr->isSALU() ? 10 : 11; + for (Operand& op : instr->operands) { + if (sa_sdst == 0) + break; + + for (unsigned i = 0; i < op.size(); i++) { + unsigned reg = op.physReg() + i; + if (reg < ctx.sgpr_read_by_valu_then_wr_by_salu.size() && + ctx.sgpr_read_by_valu_then_wr_by_salu.get(reg) < expiry_count) { + bld.sopp(aco_opcode::s_waitcnt_depctr, 0xfffe); + sa_sdst = 0; + break; + } + } + } + } + + if (sa_sdst == 0) + ctx.sgpr_read_by_valu_then_wr_by_salu.reset(); + else if (instr->isSALU() && !instr->isSOPP()) + ctx.sgpr_read_by_valu_then_wr_by_salu.inc(); + + if (instr->isVALU()) { + for (const Operand& op : instr->operands) { + for (unsigned i = 0; i < DIV_ROUND_UP(op.size(), 2); i++) { + unsigned reg = (op.physReg() / 2) + i; + if (reg < ctx.sgpr_read_by_valu.size()) + ctx.sgpr_read_by_valu.set(reg); + } + } + } else if (instr->isSALU() && !instr->definitions.empty()) { + for (unsigned i = 0; i < instr->definitions[0].size(); i++) { + unsigned def_reg = instr->definitions[0].physReg() + i; + if ((def_reg / 2) < ctx.sgpr_read_by_valu.size() && ctx.sgpr_read_by_valu[def_reg / 2]) + ctx.sgpr_read_by_valu_then_wr_by_salu.set(def_reg); + } + } } /* LdsDirectVMEMHazard @@ -1670,6 +1720,15 @@ resolve_all_gfx11(State& state, NOP_ctx_gfx11& ctx, } } + /* VALUReadSGPRHazard */ + if (state.program->gfx_level >= GFX12) { + for (unsigned i = 0; i < ctx.sgpr_read_by_valu_then_wr_by_salu.size(); i++) { + if (ctx.sgpr_read_by_valu_then_wr_by_salu.get(i) < 11) + waitcnt_depctr &= 0xfffe; + } + ctx.sgpr_read_by_valu_then_wr_by_salu.reset(); + } + /* LdsDirectVMEMHazard */ if (ctx.vgpr_used_by_vmem_load.any() || ctx.vgpr_used_by_vmem_store.any() || ctx.vgpr_used_by_ds.any() || ctx.vgpr_used_by_vmem_sample.any() || @@ -1745,7 +1804,7 @@ handle_block(Program* program, Ctx& ctx, Block& block) template Handle, ResolveAll Resolve> void -mitigate_hazards(Program* program) +mitigate_hazards(Program* program, Ctx initial_ctx = Ctx()) { std::vector all_ctx(program->blocks.size()); std::stack> loop_header_indices; @@ -1754,6 +1813,9 @@ mitigate_hazards(Program* program) Block& block = program->blocks[i]; Ctx& ctx = all_ctx[i]; + if (i == 0 || (block.kind & block_kind_resume)) + ctx = initial_ctx; + if (block.kind & block_kind_loop_header) { loop_header_indices.push(i); } else if (block.kind & block_kind_loop_exit) { @@ -1851,14 +1913,29 @@ required_export_priority(Program* program) void insert_NOPs(Program* program) { - if (program->gfx_level >= GFX11) - mitigate_hazards(program); - else if (program->gfx_level >= GFX10_3) + if (program->gfx_level >= GFX11) { + NOP_ctx_gfx11 initial_ctx; + + bool has_previous_part = + program->is_epilog || program->info.vs.has_prolog || program->info.ps.has_prolog || + (program->info.merged_shader_compiled_separately && program->stage.sw != SWStage::VS && + program->stage.sw != SWStage::TES) || program->stage == raytracing_cs; + if (program->gfx_level >= GFX12 && has_previous_part) { + /* resolve_all_gfx11 can't resolve VALUReadSGPRHazard entirely. We have to assume that any + * SGPR might have been read by VALU if there was a previous shader part. + */ + initial_ctx.sgpr_read_by_valu.flip(); + } + + mitigate_hazards(program, + initial_ctx); + } else if (program->gfx_level >= GFX10_3) { ; /* no hazards/bugs to mitigate */ - else if (program->gfx_level >= GFX10) + } else if (program->gfx_level >= GFX10) { mitigate_hazards(program); - else + } else { mitigate_hazards(program); + } if (program->gfx_level == GFX11_5 && (program->stage.hw == AC_HW_NEXT_GEN_GEOMETRY_SHADER || program->stage.hw == AC_HW_PIXEL_SHADER))