aco/insert_NOPs: implement VALU -> VALU case for VALUReadSGPRHazard on GFX12

Totals from 36918 (46.50% of 79395) affected shaders: (GFX1200)

Instrs: 34997889 -> 35296429 (+0.85%); split: -0.00%, +0.85%
CodeSize: 186161112 -> 187334364 (+0.63%); split: -0.00%, +0.63%
Latency: 250265551 -> 250330784 (+0.03%); split: -0.00%, +0.03%
InvThroughput: 41185298 -> 41192503 (+0.02%); split: -0.00%, +0.02%
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/32682>
This commit is contained in:
Daniel Schürmann 2024-12-17 12:22:27 +01:00 committed by Marge Bot
parent 6c7355f0e6
commit 65f95ae74e
3 changed files with 184 additions and 4 deletions

View file

@ -381,3 +381,24 @@ SALU writing then SALU or VALU reading a SGPR that was previously used as a lane
Mitigated by:
A VALU instruction reading a non-exec SGPR before the SALU write, or a sa_sdst=0 wait after the
SALU write: `s_waitcnt_depctr 0xfffe`
## RDNA4 / GFX12 hazards
### VcmpxPermlaneHazard
Same as GFX10
### LdsDirectVALUHazard
### LdsDirectVMEMHazard
Same as GFX11
### VALUReadSGPRHazard
Triggered by:
An SGPR that was read by VALU and then written by SALU cannot safely be read by SALU or VALU, or
an SGPR that was read by VALU and then written by VALU cannot safely be read by VALU.
Mitigated by:
A sa_sdst=0 wait after the SALU write, or a va_sdst=0 (non-VCC SGPR) / va_vcc=0 (VCC SGPR) wait
after the VALU write. The wait does not reset the first step (the SGPR still counts as read by VALU).

View file

@ -264,6 +264,7 @@ struct NOP_ctx_gfx11 {
/* VALUReadSGPRHazard */
std::bitset<m0.reg() / 2> sgpr_read_by_valu; /* SGPR pairs, excluding null, exec, m0 and scc */
std::bitset<m0.reg()> sgpr_read_by_valu_then_wr_by_valu;
RegCounterMap<11> sgpr_read_by_valu_then_wr_by_salu;
void join(const NOP_ctx_gfx11& other)
@ -281,6 +282,7 @@ struct NOP_ctx_gfx11 {
other.sgpr_read_by_valu_as_lanemask_then_wr_by_salu;
vgpr_written_by_wmma |= other.vgpr_written_by_wmma;
sgpr_read_by_valu |= other.sgpr_read_by_valu;
sgpr_read_by_valu_then_wr_by_valu |= other.sgpr_read_by_valu_then_wr_by_valu;
sgpr_read_by_valu_then_wr_by_salu.join_min(other.sgpr_read_by_valu_then_wr_by_salu);
}
@ -1405,7 +1407,8 @@ handle_instruction_gfx11(State& state, NOP_ctx_gfx11& ctx, aco_ptr<Instruction>&
ctx.has_Vcmpx = false;
}
unsigned va_vdst = parse_depctr_wait(instr.get()).va_vdst;
depctr_wait wait = parse_depctr_wait(instr.get());
unsigned va_vdst = wait.va_vdst;
unsigned vm_vsrc = 7;
unsigned sa_sdst = 1;
@ -1543,8 +1546,19 @@ handle_instruction_gfx11(State& state, NOP_ctx_gfx11& ctx, aco_ptr<Instruction>&
imm &= 0xfffe;
sa_sdst = 0;
}
if (instr->isVALU())
if (instr->isVALU()) {
ctx.sgpr_read_by_valu.set(reg / 2);
/* s_wait_alu on va_sdst (if non-VCC SGPR) or va_vcc (if VCC SGPR) */
if (ctx.sgpr_read_by_valu_then_wr_by_valu[reg]) {
bool is_vcc = reg == vcc || reg == vcc_hi;
imm &= is_vcc ? 0xfffd : 0xf1ff;
if (is_vcc)
wait.va_vcc = 0;
else
wait.va_sdst = 0;
}
}
}
}
@ -1557,8 +1571,24 @@ handle_instruction_gfx11(State& state, NOP_ctx_gfx11& ctx, aco_ptr<Instruction>&
else if (instr->isSALU() && !instr->isSOPP())
ctx.sgpr_read_by_valu_then_wr_by_salu.inc();
if (instr->isSALU() && !instr->definitions.empty()) {
assert(instr->definitions[0].size() <= 2);
if (wait.va_sdst == 0) {
std::bitset<m0.reg()> old = ctx.sgpr_read_by_valu_then_wr_by_valu;
ctx.sgpr_read_by_valu_then_wr_by_valu.reset();
ctx.sgpr_read_by_valu_then_wr_by_valu[vcc] = old[vcc];
ctx.sgpr_read_by_valu_then_wr_by_valu[vcc_hi] = old[vcc_hi];
}
if (wait.va_vcc == 0) {
ctx.sgpr_read_by_valu_then_wr_by_valu[vcc] = false;
ctx.sgpr_read_by_valu_then_wr_by_valu[vcc_hi] = false;
}
if (instr->isVALU() && !instr->definitions.empty()) {
PhysReg reg = instr->definitions[0].physReg();
if (reg < m0 && ctx.sgpr_read_by_valu[reg / 2]) {
for (unsigned i = 0; i < instr->definitions[0].size(); i++)
ctx.sgpr_read_by_valu_then_wr_by_valu.set(reg + i);
}
} else if (instr->isSALU() && !instr->definitions.empty()) {
PhysReg reg = instr->definitions[0].physReg();
if (reg < m0 && ctx.sgpr_read_by_valu[reg / 2]) {
for (unsigned i = 0; i < instr->definitions[0].size(); i++)
@ -1727,6 +1757,16 @@ resolve_all_gfx11(State& state, NOP_ctx_gfx11& ctx,
waitcnt_depctr &= 0xfffe;
ctx.sgpr_read_by_valu_then_wr_by_salu.reset();
if (ctx.sgpr_read_by_valu_then_wr_by_valu[vcc] ||
ctx.sgpr_read_by_valu_then_wr_by_valu[vcc_hi]) {
waitcnt_depctr &= 0xfffd;
ctx.sgpr_read_by_valu_then_wr_by_valu[vcc] = false;
ctx.sgpr_read_by_valu_then_wr_by_valu[vcc_hi] = false;
}
if (ctx.sgpr_read_by_valu_then_wr_by_valu.any()) {
waitcnt_depctr &= 0xf1ff;
ctx.sgpr_read_by_valu_then_wr_by_valu.reset();
}
}
/* LdsDirectVMEMHazard */

View file

@ -1515,6 +1515,7 @@ BEGIN_TEST(insert_nops.valu_read_sgpr.basic)
bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand(exec_lo, s1));
bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand(m0, s1));
bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand(scc, s1));
bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand(vcc, s1));
/* no hazard: SALU write missing */
//>> p_unit_test 0
@ -1673,6 +1674,86 @@ BEGIN_TEST(insert_nops.valu_read_sgpr.basic)
bld.sopp(aco_opcode::s_nop, 0);
bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(64), s1), Operand(PhysReg(4), s1));
/* VALU -> VALU non-VCC SGPR */
//! p_unit_test 17
//! s1: %0:s[4] = v_readfirstlane_b32 %0:v[0]
//! s_waitcnt_depctr va_sdst(0)
//! v1: %0:v[0] = v_mov_b32 %0:s[4]
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(17));
bld.vop1(aco_opcode::v_readfirstlane_b32, Definition(PhysReg(4), s1), Operand(PhysReg(256), v1));
bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand(PhysReg(4), s1));
/* VALU -> VALU VCC SGPR */
//! p_unit_test 18
//! s1: %0:vcc_hi = v_readfirstlane_b32 %0:v[0]
//! s_waitcnt_depctr va_vcc(0)
//! v1: %0:v[0] = v_mov_b32 %0:vcc_hi
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(18));
bld.vop1(aco_opcode::v_readfirstlane_b32, Definition(vcc_hi, s1), Operand(PhysReg(256), v1));
bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand(vcc_hi, s1));
/* va_sdst=0 from SALU reading an SGPR: hazard mitigated */
//! p_unit_test 19
//! s1: %0:s[4] = v_readfirstlane_b32 %0:v[0]
//! s1: %0:s[64] = s_mov_b32 %0:s[6]
//! v1: %0:v[0] = v_mov_b32 %0:s[4]
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(19));
bld.vop1(aco_opcode::v_readfirstlane_b32, Definition(PhysReg(4), s1), Operand(PhysReg(256), v1));
bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(64), s1), Operand(PhysReg(6), s1));
bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand(PhysReg(4), s1));
/* va_vcc=0 from SALU reading VCC: hazard mitigated */
//! p_unit_test 20
//! s1: %0:vcc_hi = v_readfirstlane_b32 %0:v[0]
//! s1: %0:s[64] = s_mov_b32 %0:vcc_lo
//! v1: %0:v[0] = v_mov_b32 %0:vcc_hi
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(20));
bld.vop1(aco_opcode::v_readfirstlane_b32, Definition(vcc_hi, s1), Operand(PhysReg(256), v1));
bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(64), s1), Operand(vcc, s1));
bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand(vcc_hi, s1));
/* VALU -> VALU read VCC and then SGPR */
//! p_unit_test 21
//! s1: %0:vcc_hi = v_readfirstlane_b32 %0:v[0]
//! s1: %0:s[4] = v_readfirstlane_b32 %0:v[0]
//! s_waitcnt_depctr va_vcc(0)
//! v1: %0:v[0] = v_mov_b32 %0:vcc_hi
//! s_waitcnt_depctr va_sdst(0)
//! v1: %0:v[0] = v_mov_b32 %0:s[4]
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(21));
bld.vop1(aco_opcode::v_readfirstlane_b32, Definition(vcc_hi, s1), Operand(PhysReg(256), v1));
bld.vop1(aco_opcode::v_readfirstlane_b32, Definition(PhysReg(4), s1), Operand(PhysReg(256), v1));
bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand(vcc_hi, s1));
bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand(PhysReg(4), s1));
/* VALU -> VALU read SGPR and then VCC */
//! p_unit_test 22
//! s1: %0:vcc_hi = v_readfirstlane_b32 %0:v[0]
//! s1: %0:s[4] = v_readfirstlane_b32 %0:v[0]
//! s_waitcnt_depctr va_sdst(0)
//! v1: %0:v[0] = v_mov_b32 %0:s[4]
//! s_waitcnt_depctr va_vcc(0)
//! v1: %0:v[0] = v_mov_b32 %0:vcc_hi
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(22));
bld.vop1(aco_opcode::v_readfirstlane_b32, Definition(vcc_hi, s1), Operand(PhysReg(256), v1));
bld.vop1(aco_opcode::v_readfirstlane_b32, Definition(PhysReg(4), s1), Operand(PhysReg(256), v1));
bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand(PhysReg(4), s1));
bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand(vcc_hi, s1));
/* VALU writes VCC and SALU writes SGPR */
//! p_unit_test 23
//! s1: %0:vcc_hi = v_readfirstlane_b32 %0:v[0]
//! s1: %0:s[4] = s_mov_b32 0
//! s_waitcnt_depctr va_vcc(0)
//! v1: %0:v[0] = v_mov_b32 %0:vcc_hi
//! s_waitcnt_depctr sa_sdst(0)
//! v1: %0:v[0] = v_mov_b32 %0:s[4]
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(23));
bld.vop1(aco_opcode::v_readfirstlane_b32, Definition(vcc_hi, s1), Operand(PhysReg(256), v1));
bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(4), s1), Operand::zero(4));
bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand(vcc_hi, s1));
bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand(PhysReg(4), s1));
finish_insert_nops_test();
END_TEST
@ -2121,5 +2202,43 @@ BEGIN_TEST(insert_nops.setpc_gfx12)
bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(64), s1), Operand::zero(4));
bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
//! p_unit_test 7
//! v1: %0:v[0] = v_mov_b32 %0:s[4]
//! s1: %0:s[4] = v_readfirstlane_b32 %0:v[0]
//! s_waitcnt_depctr va_vdst(0) va_sdst(0)
//! s_setpc_b64 0
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(7));
bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand(PhysReg(4), s1));
bld.vop1(aco_opcode::v_readfirstlane_b32, Definition(PhysReg(4), s1), Operand(PhysReg(256), v1));
bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
//! p_unit_test 8
//! v1: %0:v[0] = v_mov_b32 %0:vcc_lo
//! s1: %0:vcc_lo = v_readfirstlane_b32 %0:v[0]
//! s_waitcnt_depctr va_vdst(0) va_vcc(0)
//! s_setpc_b64 0
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(8));
bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand(PhysReg(vcc), s1));
bld.vop1(aco_opcode::v_readfirstlane_b32, Definition(vcc, s1), Operand(PhysReg(256), v1));
bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
//! p_unit_test 9
//! v1: %0:v[0] = v_mov_b32 %0:s[4]
//! v1: %0:v[1] = v_mov_b32 %0:s[5]
//! v1: %0:v[2] = v_mov_b32 %0:vcc_lo
//! s1: %0:s[4] = s_mov_b32 0
//! s1: %0:s[5] = v_readfirstlane_b32 %0:v[0]
//! s1: %0:vcc_lo = v_readfirstlane_b32 %0:v[1]
//! s_waitcnt_depctr va_vdst(0) va_sdst(0) va_vcc(0) sa_sdst(0)
//! s_setpc_b64 0
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(9));
bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand(PhysReg(4), s1));
bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand(PhysReg(5), s1));
bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(258), v1), Operand(PhysReg(vcc), s1));
bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(4), s1), Operand::zero(4));
bld.vop1(aco_opcode::v_readfirstlane_b32, Definition(PhysReg(5), s1), Operand(PhysReg(256), v1));
bld.vop1(aco_opcode::v_readfirstlane_b32, Definition(vcc, s1), Operand(PhysReg(257), v1));
bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
finish_insert_nops_test(true);
END_TEST