From 65f95ae74e9c0e30bd14bfca347caeedce486bba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20Sch=C3=BCrmann?= Date: Tue, 17 Dec 2024 12:22:27 +0100 Subject: [PATCH] aco/insert_NOPs: implement VALU -> VALU case for VALUReadSGPRHazard on GFX12 Totals from 36918 (46.50% of 79395) affected shaders: (GFX1200) Instrs: 34997889 -> 35296429 (+0.85%); split: -0.00%, +0.85% CodeSize: 186161112 -> 187334364 (+0.63%); split: -0.00%, +0.63% Latency: 250265551 -> 250330784 (+0.03%); split: -0.00%, +0.03% InvThroughput: 41185298 -> 41192503 (+0.02%); split: -0.00%, +0.02% Part-of: --- src/amd/compiler/README-ISA.md | 21 ++++ src/amd/compiler/aco_insert_NOPs.cpp | 48 +++++++- src/amd/compiler/tests/test_insert_nops.cpp | 119 ++++++++++++++++++++ 3 files changed, 184 insertions(+), 4 deletions(-) diff --git a/src/amd/compiler/README-ISA.md b/src/amd/compiler/README-ISA.md index 55f77938c54..5baec8cba27 100644 --- a/src/amd/compiler/README-ISA.md +++ b/src/amd/compiler/README-ISA.md @@ -381,3 +381,24 @@ SALU writing then SALU or VALU reading a SGPR that was previously used as a lane Mitigated by: A VALU instruction reading a non-exec SGPR before the SALU write, or a sa_sdst=0 wait after the SALU write: `s_waitcnt_depctr 0xfffe` + +## RDNA4 / GFX12 hazards + +### VcmpxPermlaneHazard + +Same as GFX10 + +### LdsDirectVALUHazard +### LdsDirectVMEMHazard + +Same as GFX11 + +### VALUReadSGPRHazard + +Triggered by: +VALU reads an SGPR, then written by SALU cannot safely be read by SALU or VALU, or +VALU reads an SGPR, then written by VALU cannot safely be read by VALU. + +Mitigated by: +After the SALU write a sa_sdst=0 wait. After the VALU write a va_sdst=0 / va_vcc=0 wait. +It does not reset the first step. diff --git a/src/amd/compiler/aco_insert_NOPs.cpp b/src/amd/compiler/aco_insert_NOPs.cpp index 296b958da0c..de062be2c74 100644 --- a/src/amd/compiler/aco_insert_NOPs.cpp +++ b/src/amd/compiler/aco_insert_NOPs.cpp @@ -264,6 +264,7 @@ struct NOP_ctx_gfx11 { /* VALUReadSGPRHazard */ std::bitset sgpr_read_by_valu; /* SGPR pairs, excluding null, exec, m0 and scc */ + std::bitset sgpr_read_by_valu_then_wr_by_valu; RegCounterMap<11> sgpr_read_by_valu_then_wr_by_salu; void join(const NOP_ctx_gfx11& other) @@ -281,6 +282,7 @@ struct NOP_ctx_gfx11 { other.sgpr_read_by_valu_as_lanemask_then_wr_by_salu; vgpr_written_by_wmma |= other.vgpr_written_by_wmma; sgpr_read_by_valu |= other.sgpr_read_by_valu; + sgpr_read_by_valu_then_wr_by_valu |= other.sgpr_read_by_valu_then_wr_by_valu; sgpr_read_by_valu_then_wr_by_salu.join_min(other.sgpr_read_by_valu_then_wr_by_salu); } @@ -1405,7 +1407,8 @@ handle_instruction_gfx11(State& state, NOP_ctx_gfx11& ctx, aco_ptr& ctx.has_Vcmpx = false; } - unsigned va_vdst = parse_depctr_wait(instr.get()).va_vdst; + depctr_wait wait = parse_depctr_wait(instr.get()); + unsigned va_vdst = wait.va_vdst; unsigned vm_vsrc = 7; unsigned sa_sdst = 1; @@ -1543,8 +1546,19 @@ handle_instruction_gfx11(State& state, NOP_ctx_gfx11& ctx, aco_ptr& imm &= 0xfffe; sa_sdst = 0; } - if (instr->isVALU()) + if (instr->isVALU()) { ctx.sgpr_read_by_valu.set(reg / 2); + + /* s_wait_alu on va_sdst (if non-VCC SGPR) or va_vcc (if VCC SGPR) */ + if (ctx.sgpr_read_by_valu_then_wr_by_valu[reg]) { + bool is_vcc = reg == vcc || reg == vcc_hi; + imm &= is_vcc ? 0xfffd : 0xf1ff; + if (is_vcc) + wait.va_vcc = 0; + else + wait.va_sdst = 0; + } + } } } @@ -1557,8 +1571,24 @@ handle_instruction_gfx11(State& state, NOP_ctx_gfx11& ctx, aco_ptr& else if (instr->isSALU() && !instr->isSOPP()) ctx.sgpr_read_by_valu_then_wr_by_salu.inc(); - if (instr->isSALU() && !instr->definitions.empty()) { - assert(instr->definitions[0].size() <= 2); + if (wait.va_sdst == 0) { + std::bitset old = ctx.sgpr_read_by_valu_then_wr_by_valu; + ctx.sgpr_read_by_valu_then_wr_by_valu.reset(); + ctx.sgpr_read_by_valu_then_wr_by_valu[vcc] = old[vcc]; + ctx.sgpr_read_by_valu_then_wr_by_valu[vcc_hi] = old[vcc_hi]; + } + if (wait.va_vcc == 0) { + ctx.sgpr_read_by_valu_then_wr_by_valu[vcc] = false; + ctx.sgpr_read_by_valu_then_wr_by_valu[vcc_hi] = false; + } + + if (instr->isVALU() && !instr->definitions.empty()) { + PhysReg reg = instr->definitions[0].physReg(); + if (reg < m0 && ctx.sgpr_read_by_valu[reg / 2]) { + for (unsigned i = 0; i < instr->definitions[0].size(); i++) + ctx.sgpr_read_by_valu_then_wr_by_valu.set(reg + i); + } + } else if (instr->isSALU() && !instr->definitions.empty()) { PhysReg reg = instr->definitions[0].physReg(); if (reg < m0 && ctx.sgpr_read_by_valu[reg / 2]) { for (unsigned i = 0; i < instr->definitions[0].size(); i++) @@ -1727,6 +1757,16 @@ resolve_all_gfx11(State& state, NOP_ctx_gfx11& ctx, waitcnt_depctr &= 0xfffe; ctx.sgpr_read_by_valu_then_wr_by_salu.reset(); + if (ctx.sgpr_read_by_valu_then_wr_by_valu[vcc] || + ctx.sgpr_read_by_valu_then_wr_by_valu[vcc_hi]) { + waitcnt_depctr &= 0xfffd; + ctx.sgpr_read_by_valu_then_wr_by_valu[vcc] = false; + ctx.sgpr_read_by_valu_then_wr_by_valu[vcc_hi] = false; + } + if (ctx.sgpr_read_by_valu_then_wr_by_valu.any()) { + waitcnt_depctr &= 0xf1ff; + ctx.sgpr_read_by_valu_then_wr_by_valu.reset(); + } } /* LdsDirectVMEMHazard */ diff --git a/src/amd/compiler/tests/test_insert_nops.cpp b/src/amd/compiler/tests/test_insert_nops.cpp index f487d6484c1..f1e12524ca5 100644 --- a/src/amd/compiler/tests/test_insert_nops.cpp +++ b/src/amd/compiler/tests/test_insert_nops.cpp @@ -1515,6 +1515,7 @@ BEGIN_TEST(insert_nops.valu_read_sgpr.basic) bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand(exec_lo, s1)); bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand(m0, s1)); bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand(scc, s1)); + bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand(vcc, s1)); /* no hazard: SALU write missing */ //>> p_unit_test 0 @@ -1673,6 +1674,86 @@ BEGIN_TEST(insert_nops.valu_read_sgpr.basic) bld.sopp(aco_opcode::s_nop, 0); bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(64), s1), Operand(PhysReg(4), s1)); + /* VALU -> VALU non-VCC SGPR */ + //! p_unit_test 17 + //! s1: %0:s[4] = v_readfirstlane_b32 %0:v[0] + //! s_waitcnt_depctr va_sdst(0) + //! v1: %0:v[0] = v_mov_b32 %0:s[4] + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(17)); + bld.vop1(aco_opcode::v_readfirstlane_b32, Definition(PhysReg(4), s1), Operand(PhysReg(256), v1)); + bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand(PhysReg(4), s1)); + + /* VALU -> VALU VCC SGPR */ + //! p_unit_test 18 + //! s1: %0:vcc_hi = v_readfirstlane_b32 %0:v[0] + //! s_waitcnt_depctr va_vcc(0) + //! v1: %0:v[0] = v_mov_b32 %0:vcc_hi + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(18)); + bld.vop1(aco_opcode::v_readfirstlane_b32, Definition(vcc_hi, s1), Operand(PhysReg(256), v1)); + bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand(vcc_hi, s1)); + + /* va_sdst=0 from SALU reading an SGPR: hazard mitigated */ + //! p_unit_test 19 + //! s1: %0:s[4] = v_readfirstlane_b32 %0:v[0] + //! s1: %0:s[64] = s_mov_b32 %0:s[6] + //! v1: %0:v[0] = v_mov_b32 %0:s[4] + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(19)); + bld.vop1(aco_opcode::v_readfirstlane_b32, Definition(PhysReg(4), s1), Operand(PhysReg(256), v1)); + bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(64), s1), Operand(PhysReg(6), s1)); + bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand(PhysReg(4), s1)); + + /* va_vcc=0 from SALU reading VCC: hazard mitigated */ + //! p_unit_test 20 + //! s1: %0:vcc_hi = v_readfirstlane_b32 %0:v[0] + //! s1: %0:s[64] = s_mov_b32 %0:vcc_lo + //! v1: %0:v[0] = v_mov_b32 %0:vcc_hi + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(20)); + bld.vop1(aco_opcode::v_readfirstlane_b32, Definition(vcc_hi, s1), Operand(PhysReg(256), v1)); + bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(64), s1), Operand(vcc, s1)); + bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand(vcc_hi, s1)); + + /* VALU -> VALU read VCC and then SGPR */ + //! p_unit_test 21 + //! s1: %0:vcc_hi = v_readfirstlane_b32 %0:v[0] + //! s1: %0:s[4] = v_readfirstlane_b32 %0:v[0] + //! s_waitcnt_depctr va_vcc(0) + //! v1: %0:v[0] = v_mov_b32 %0:vcc_hi + //! s_waitcnt_depctr va_sdst(0) + //! v1: %0:v[0] = v_mov_b32 %0:s[4] + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(21)); + bld.vop1(aco_opcode::v_readfirstlane_b32, Definition(vcc_hi, s1), Operand(PhysReg(256), v1)); + bld.vop1(aco_opcode::v_readfirstlane_b32, Definition(PhysReg(4), s1), Operand(PhysReg(256), v1)); + bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand(vcc_hi, s1)); + bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand(PhysReg(4), s1)); + + /* VALU -> VALU read SGPR and then VCC */ + //! p_unit_test 22 + //! s1: %0:vcc_hi = v_readfirstlane_b32 %0:v[0] + //! s1: %0:s[4] = v_readfirstlane_b32 %0:v[0] + //! s_waitcnt_depctr va_sdst(0) + //! v1: %0:v[0] = v_mov_b32 %0:s[4] + //! s_waitcnt_depctr va_vcc(0) + //! v1: %0:v[0] = v_mov_b32 %0:vcc_hi + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(22)); + bld.vop1(aco_opcode::v_readfirstlane_b32, Definition(vcc_hi, s1), Operand(PhysReg(256), v1)); + bld.vop1(aco_opcode::v_readfirstlane_b32, Definition(PhysReg(4), s1), Operand(PhysReg(256), v1)); + bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand(PhysReg(4), s1)); + bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand(vcc_hi, s1)); + + /* VALU writes VCC and SALU writes SGPR */ + //! p_unit_test 23 + //! s1: %0:vcc_hi = v_readfirstlane_b32 %0:v[0] + //! s1: %0:s[4] = s_mov_b32 0 + //! s_waitcnt_depctr va_vcc(0) + //! v1: %0:v[0] = v_mov_b32 %0:vcc_hi + //! s_waitcnt_depctr sa_sdst(0) + //! v1: %0:v[0] = v_mov_b32 %0:s[4] + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(23)); + bld.vop1(aco_opcode::v_readfirstlane_b32, Definition(vcc_hi, s1), Operand(PhysReg(256), v1)); + bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(4), s1), Operand::zero(4)); + bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand(vcc_hi, s1)); + bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand(PhysReg(4), s1)); + finish_insert_nops_test(); END_TEST @@ -2121,5 +2202,43 @@ BEGIN_TEST(insert_nops.setpc_gfx12) bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(64), s1), Operand::zero(4)); bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8)); + //! p_unit_test 7 + //! v1: %0:v[0] = v_mov_b32 %0:s[4] + //! s1: %0:s[4] = v_readfirstlane_b32 %0:v[0] + //! s_waitcnt_depctr va_vdst(0) va_sdst(0) + //! s_setpc_b64 0 + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(7)); + bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand(PhysReg(4), s1)); + bld.vop1(aco_opcode::v_readfirstlane_b32, Definition(PhysReg(4), s1), Operand(PhysReg(256), v1)); + bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8)); + + //! p_unit_test 8 + //! v1: %0:v[0] = v_mov_b32 %0:vcc_lo + //! s1: %0:vcc_lo = v_readfirstlane_b32 %0:v[0] + //! s_waitcnt_depctr va_vdst(0) va_vcc(0) + //! s_setpc_b64 0 + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(8)); + bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand(PhysReg(vcc), s1)); + bld.vop1(aco_opcode::v_readfirstlane_b32, Definition(vcc, s1), Operand(PhysReg(256), v1)); + bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8)); + + //! p_unit_test 9 + //! v1: %0:v[0] = v_mov_b32 %0:s[4] + //! v1: %0:v[1] = v_mov_b32 %0:s[5] + //! v1: %0:v[2] = v_mov_b32 %0:vcc_lo + //! s1: %0:s[4] = s_mov_b32 0 + //! s1: %0:s[5] = v_readfirstlane_b32 %0:v[0] + //! s1: %0:vcc_lo = v_readfirstlane_b32 %0:v[1] + //! s_waitcnt_depctr va_vdst(0) va_sdst(0) va_vcc(0) sa_sdst(0) + //! s_setpc_b64 0 + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(9)); + bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand(PhysReg(4), s1)); + bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand(PhysReg(5), s1)); + bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(258), v1), Operand(PhysReg(vcc), s1)); + bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(4), s1), Operand::zero(4)); + bld.vop1(aco_opcode::v_readfirstlane_b32, Definition(PhysReg(5), s1), Operand(PhysReg(256), v1)); + bld.vop1(aco_opcode::v_readfirstlane_b32, Definition(vcc, s1), Operand(PhysReg(257), v1)); + bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8)); + finish_insert_nops_test(true); END_TEST