mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2025-12-24 13:10:10 +01:00
aco/insert_NOPs: implement VALU -> VALU case for VALUReadSGPRHazard on GFX12
Totals from 36918 (46.50% of 79395) affected shaders: (GFX1200) Instrs: 34997889 -> 35296429 (+0.85%); split: -0.00%, +0.85% CodeSize: 186161112 -> 187334364 (+0.63%); split: -0.00%, +0.63% Latency: 250265551 -> 250330784 (+0.03%); split: -0.00%, +0.03% InvThroughput: 41185298 -> 41192503 (+0.02%); split: -0.00%, +0.02% Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/32682>
This commit is contained in:
parent
6c7355f0e6
commit
65f95ae74e
3 changed files with 184 additions and 4 deletions
|
|
@ -381,3 +381,24 @@ SALU writing then SALU or VALU reading a SGPR that was previously used as a lane
|
|||
Mitigated by:
|
||||
A VALU instruction reading a non-exec SGPR before the SALU write, or a sa_sdst=0 wait after the
|
||||
SALU write: `s_waitcnt_depctr 0xfffe`
|
||||
|
||||
## RDNA4 / GFX12 hazards
|
||||
|
||||
### VcmpxPermlaneHazard
|
||||
|
||||
Same as GFX10
|
||||
|
||||
### LdsDirectVALUHazard
|
||||
### LdsDirectVMEMHazard
|
||||
|
||||
Same as GFX11
|
||||
|
||||
### VALUReadSGPRHazard
|
||||
|
||||
Triggered by:
|
||||
VALU reads an SGPR, then written by SALU cannot safely be read by SALU or VALU, or
|
||||
VALU reads an SGPR, then written by VALU cannot safely be read by VALU.
|
||||
|
||||
Mitigated by:
|
||||
After the SALU write a sa_sdst=0 wait. After the VALU write a va_sdst=0 / va_vcc=0 wait.
|
||||
It does not reset the first step.
|
||||
|
|
|
|||
|
|
@ -264,6 +264,7 @@ struct NOP_ctx_gfx11 {
|
|||
|
||||
/* VALUReadSGPRHazard */
|
||||
std::bitset<m0.reg() / 2> sgpr_read_by_valu; /* SGPR pairs, excluding null, exec, m0 and scc */
|
||||
std::bitset<m0.reg()> sgpr_read_by_valu_then_wr_by_valu;
|
||||
RegCounterMap<11> sgpr_read_by_valu_then_wr_by_salu;
|
||||
|
||||
void join(const NOP_ctx_gfx11& other)
|
||||
|
|
@ -281,6 +282,7 @@ struct NOP_ctx_gfx11 {
|
|||
other.sgpr_read_by_valu_as_lanemask_then_wr_by_salu;
|
||||
vgpr_written_by_wmma |= other.vgpr_written_by_wmma;
|
||||
sgpr_read_by_valu |= other.sgpr_read_by_valu;
|
||||
sgpr_read_by_valu_then_wr_by_valu |= other.sgpr_read_by_valu_then_wr_by_valu;
|
||||
sgpr_read_by_valu_then_wr_by_salu.join_min(other.sgpr_read_by_valu_then_wr_by_salu);
|
||||
}
|
||||
|
||||
|
|
@ -1405,7 +1407,8 @@ handle_instruction_gfx11(State& state, NOP_ctx_gfx11& ctx, aco_ptr<Instruction>&
|
|||
ctx.has_Vcmpx = false;
|
||||
}
|
||||
|
||||
unsigned va_vdst = parse_depctr_wait(instr.get()).va_vdst;
|
||||
depctr_wait wait = parse_depctr_wait(instr.get());
|
||||
unsigned va_vdst = wait.va_vdst;
|
||||
unsigned vm_vsrc = 7;
|
||||
unsigned sa_sdst = 1;
|
||||
|
||||
|
|
@ -1543,8 +1546,19 @@ handle_instruction_gfx11(State& state, NOP_ctx_gfx11& ctx, aco_ptr<Instruction>&
|
|||
imm &= 0xfffe;
|
||||
sa_sdst = 0;
|
||||
}
|
||||
if (instr->isVALU())
|
||||
if (instr->isVALU()) {
|
||||
ctx.sgpr_read_by_valu.set(reg / 2);
|
||||
|
||||
/* s_wait_alu on va_sdst (if non-VCC SGPR) or va_vcc (if VCC SGPR) */
|
||||
if (ctx.sgpr_read_by_valu_then_wr_by_valu[reg]) {
|
||||
bool is_vcc = reg == vcc || reg == vcc_hi;
|
||||
imm &= is_vcc ? 0xfffd : 0xf1ff;
|
||||
if (is_vcc)
|
||||
wait.va_vcc = 0;
|
||||
else
|
||||
wait.va_sdst = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -1557,8 +1571,24 @@ handle_instruction_gfx11(State& state, NOP_ctx_gfx11& ctx, aco_ptr<Instruction>&
|
|||
else if (instr->isSALU() && !instr->isSOPP())
|
||||
ctx.sgpr_read_by_valu_then_wr_by_salu.inc();
|
||||
|
||||
if (instr->isSALU() && !instr->definitions.empty()) {
|
||||
assert(instr->definitions[0].size() <= 2);
|
||||
if (wait.va_sdst == 0) {
|
||||
std::bitset<m0.reg()> old = ctx.sgpr_read_by_valu_then_wr_by_valu;
|
||||
ctx.sgpr_read_by_valu_then_wr_by_valu.reset();
|
||||
ctx.sgpr_read_by_valu_then_wr_by_valu[vcc] = old[vcc];
|
||||
ctx.sgpr_read_by_valu_then_wr_by_valu[vcc_hi] = old[vcc_hi];
|
||||
}
|
||||
if (wait.va_vcc == 0) {
|
||||
ctx.sgpr_read_by_valu_then_wr_by_valu[vcc] = false;
|
||||
ctx.sgpr_read_by_valu_then_wr_by_valu[vcc_hi] = false;
|
||||
}
|
||||
|
||||
if (instr->isVALU() && !instr->definitions.empty()) {
|
||||
PhysReg reg = instr->definitions[0].physReg();
|
||||
if (reg < m0 && ctx.sgpr_read_by_valu[reg / 2]) {
|
||||
for (unsigned i = 0; i < instr->definitions[0].size(); i++)
|
||||
ctx.sgpr_read_by_valu_then_wr_by_valu.set(reg + i);
|
||||
}
|
||||
} else if (instr->isSALU() && !instr->definitions.empty()) {
|
||||
PhysReg reg = instr->definitions[0].physReg();
|
||||
if (reg < m0 && ctx.sgpr_read_by_valu[reg / 2]) {
|
||||
for (unsigned i = 0; i < instr->definitions[0].size(); i++)
|
||||
|
|
@ -1727,6 +1757,16 @@ resolve_all_gfx11(State& state, NOP_ctx_gfx11& ctx,
|
|||
waitcnt_depctr &= 0xfffe;
|
||||
|
||||
ctx.sgpr_read_by_valu_then_wr_by_salu.reset();
|
||||
if (ctx.sgpr_read_by_valu_then_wr_by_valu[vcc] ||
|
||||
ctx.sgpr_read_by_valu_then_wr_by_valu[vcc_hi]) {
|
||||
waitcnt_depctr &= 0xfffd;
|
||||
ctx.sgpr_read_by_valu_then_wr_by_valu[vcc] = false;
|
||||
ctx.sgpr_read_by_valu_then_wr_by_valu[vcc_hi] = false;
|
||||
}
|
||||
if (ctx.sgpr_read_by_valu_then_wr_by_valu.any()) {
|
||||
waitcnt_depctr &= 0xf1ff;
|
||||
ctx.sgpr_read_by_valu_then_wr_by_valu.reset();
|
||||
}
|
||||
}
|
||||
|
||||
/* LdsDirectVMEMHazard */
|
||||
|
|
|
|||
|
|
@ -1515,6 +1515,7 @@ BEGIN_TEST(insert_nops.valu_read_sgpr.basic)
|
|||
bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand(exec_lo, s1));
|
||||
bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand(m0, s1));
|
||||
bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand(scc, s1));
|
||||
bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand(vcc, s1));
|
||||
|
||||
/* no hazard: SALU write missing */
|
||||
//>> p_unit_test 0
|
||||
|
|
@ -1673,6 +1674,86 @@ BEGIN_TEST(insert_nops.valu_read_sgpr.basic)
|
|||
bld.sopp(aco_opcode::s_nop, 0);
|
||||
bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(64), s1), Operand(PhysReg(4), s1));
|
||||
|
||||
/* VALU -> VALU non-VCC SGPR */
|
||||
//! p_unit_test 17
|
||||
//! s1: %0:s[4] = v_readfirstlane_b32 %0:v[0]
|
||||
//! s_waitcnt_depctr va_sdst(0)
|
||||
//! v1: %0:v[0] = v_mov_b32 %0:s[4]
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(17));
|
||||
bld.vop1(aco_opcode::v_readfirstlane_b32, Definition(PhysReg(4), s1), Operand(PhysReg(256), v1));
|
||||
bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand(PhysReg(4), s1));
|
||||
|
||||
/* VALU -> VALU VCC SGPR */
|
||||
//! p_unit_test 18
|
||||
//! s1: %0:vcc_hi = v_readfirstlane_b32 %0:v[0]
|
||||
//! s_waitcnt_depctr va_vcc(0)
|
||||
//! v1: %0:v[0] = v_mov_b32 %0:vcc_hi
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(18));
|
||||
bld.vop1(aco_opcode::v_readfirstlane_b32, Definition(vcc_hi, s1), Operand(PhysReg(256), v1));
|
||||
bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand(vcc_hi, s1));
|
||||
|
||||
/* va_sdst=0 from SALU reading an SGPR: hazard mitigated */
|
||||
//! p_unit_test 19
|
||||
//! s1: %0:s[4] = v_readfirstlane_b32 %0:v[0]
|
||||
//! s1: %0:s[64] = s_mov_b32 %0:s[6]
|
||||
//! v1: %0:v[0] = v_mov_b32 %0:s[4]
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(19));
|
||||
bld.vop1(aco_opcode::v_readfirstlane_b32, Definition(PhysReg(4), s1), Operand(PhysReg(256), v1));
|
||||
bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(64), s1), Operand(PhysReg(6), s1));
|
||||
bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand(PhysReg(4), s1));
|
||||
|
||||
/* va_vcc=0 from SALU reading VCC: hazard mitigated */
|
||||
//! p_unit_test 20
|
||||
//! s1: %0:vcc_hi = v_readfirstlane_b32 %0:v[0]
|
||||
//! s1: %0:s[64] = s_mov_b32 %0:vcc_lo
|
||||
//! v1: %0:v[0] = v_mov_b32 %0:vcc_hi
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(20));
|
||||
bld.vop1(aco_opcode::v_readfirstlane_b32, Definition(vcc_hi, s1), Operand(PhysReg(256), v1));
|
||||
bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(64), s1), Operand(vcc, s1));
|
||||
bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand(vcc_hi, s1));
|
||||
|
||||
/* VALU -> VALU read VCC and then SGPR */
|
||||
//! p_unit_test 21
|
||||
//! s1: %0:vcc_hi = v_readfirstlane_b32 %0:v[0]
|
||||
//! s1: %0:s[4] = v_readfirstlane_b32 %0:v[0]
|
||||
//! s_waitcnt_depctr va_vcc(0)
|
||||
//! v1: %0:v[0] = v_mov_b32 %0:vcc_hi
|
||||
//! s_waitcnt_depctr va_sdst(0)
|
||||
//! v1: %0:v[0] = v_mov_b32 %0:s[4]
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(21));
|
||||
bld.vop1(aco_opcode::v_readfirstlane_b32, Definition(vcc_hi, s1), Operand(PhysReg(256), v1));
|
||||
bld.vop1(aco_opcode::v_readfirstlane_b32, Definition(PhysReg(4), s1), Operand(PhysReg(256), v1));
|
||||
bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand(vcc_hi, s1));
|
||||
bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand(PhysReg(4), s1));
|
||||
|
||||
/* VALU -> VALU read SGPR and then VCC */
|
||||
//! p_unit_test 22
|
||||
//! s1: %0:vcc_hi = v_readfirstlane_b32 %0:v[0]
|
||||
//! s1: %0:s[4] = v_readfirstlane_b32 %0:v[0]
|
||||
//! s_waitcnt_depctr va_sdst(0)
|
||||
//! v1: %0:v[0] = v_mov_b32 %0:s[4]
|
||||
//! s_waitcnt_depctr va_vcc(0)
|
||||
//! v1: %0:v[0] = v_mov_b32 %0:vcc_hi
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(22));
|
||||
bld.vop1(aco_opcode::v_readfirstlane_b32, Definition(vcc_hi, s1), Operand(PhysReg(256), v1));
|
||||
bld.vop1(aco_opcode::v_readfirstlane_b32, Definition(PhysReg(4), s1), Operand(PhysReg(256), v1));
|
||||
bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand(PhysReg(4), s1));
|
||||
bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand(vcc_hi, s1));
|
||||
|
||||
/* VALU writes VCC and SALU writes SGPR */
|
||||
//! p_unit_test 23
|
||||
//! s1: %0:vcc_hi = v_readfirstlane_b32 %0:v[0]
|
||||
//! s1: %0:s[4] = s_mov_b32 0
|
||||
//! s_waitcnt_depctr va_vcc(0)
|
||||
//! v1: %0:v[0] = v_mov_b32 %0:vcc_hi
|
||||
//! s_waitcnt_depctr sa_sdst(0)
|
||||
//! v1: %0:v[0] = v_mov_b32 %0:s[4]
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(23));
|
||||
bld.vop1(aco_opcode::v_readfirstlane_b32, Definition(vcc_hi, s1), Operand(PhysReg(256), v1));
|
||||
bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(4), s1), Operand::zero(4));
|
||||
bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand(vcc_hi, s1));
|
||||
bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand(PhysReg(4), s1));
|
||||
|
||||
finish_insert_nops_test();
|
||||
END_TEST
|
||||
|
||||
|
|
@ -2121,5 +2202,43 @@ BEGIN_TEST(insert_nops.setpc_gfx12)
|
|||
bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(64), s1), Operand::zero(4));
|
||||
bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
|
||||
|
||||
//! p_unit_test 7
|
||||
//! v1: %0:v[0] = v_mov_b32 %0:s[4]
|
||||
//! s1: %0:s[4] = v_readfirstlane_b32 %0:v[0]
|
||||
//! s_waitcnt_depctr va_vdst(0) va_sdst(0)
|
||||
//! s_setpc_b64 0
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(7));
|
||||
bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand(PhysReg(4), s1));
|
||||
bld.vop1(aco_opcode::v_readfirstlane_b32, Definition(PhysReg(4), s1), Operand(PhysReg(256), v1));
|
||||
bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
|
||||
|
||||
//! p_unit_test 8
|
||||
//! v1: %0:v[0] = v_mov_b32 %0:vcc_lo
|
||||
//! s1: %0:vcc_lo = v_readfirstlane_b32 %0:v[0]
|
||||
//! s_waitcnt_depctr va_vdst(0) va_vcc(0)
|
||||
//! s_setpc_b64 0
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(8));
|
||||
bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand(PhysReg(vcc), s1));
|
||||
bld.vop1(aco_opcode::v_readfirstlane_b32, Definition(vcc, s1), Operand(PhysReg(256), v1));
|
||||
bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
|
||||
|
||||
//! p_unit_test 9
|
||||
//! v1: %0:v[0] = v_mov_b32 %0:s[4]
|
||||
//! v1: %0:v[1] = v_mov_b32 %0:s[5]
|
||||
//! v1: %0:v[2] = v_mov_b32 %0:vcc_lo
|
||||
//! s1: %0:s[4] = s_mov_b32 0
|
||||
//! s1: %0:s[5] = v_readfirstlane_b32 %0:v[0]
|
||||
//! s1: %0:vcc_lo = v_readfirstlane_b32 %0:v[1]
|
||||
//! s_waitcnt_depctr va_vdst(0) va_sdst(0) va_vcc(0) sa_sdst(0)
|
||||
//! s_setpc_b64 0
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(9));
|
||||
bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand(PhysReg(4), s1));
|
||||
bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand(PhysReg(5), s1));
|
||||
bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(258), v1), Operand(PhysReg(vcc), s1));
|
||||
bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(4), s1), Operand::zero(4));
|
||||
bld.vop1(aco_opcode::v_readfirstlane_b32, Definition(PhysReg(5), s1), Operand(PhysReg(256), v1));
|
||||
bld.vop1(aco_opcode::v_readfirstlane_b32, Definition(vcc, s1), Operand(PhysReg(257), v1));
|
||||
bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
|
||||
|
||||
finish_insert_nops_test(true);
|
||||
END_TEST
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue