aco: don't consider sa_sdst=0 before SALU write to fix VALUMaskWriteHazard

LLVM does but that's probably a bug.

fossil-db (navi31):
Totals from 311 (0.39% of 79395) affected shaders:
Instrs: 380453 -> 381075 (+0.16%)
CodeSize: 1961012 -> 1964744 (+0.19%)
Latency: 4799095 -> 4800313 (+0.03%)
InvThroughput: 958358 -> 958904 (+0.06%)
VALU: 242322 -> 242633 (+0.13%)

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Backport-to: 24.1
Backport-to: 24.2
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30818>
This commit is contained in:
Rhys Perry 2024-08-22 16:03:31 +01:00 committed by Marge Bot
parent 8f5ee70d85
commit b1ba7d1b99
3 changed files with 35 additions and 8 deletions

View file

@ -375,5 +375,5 @@ Triggered by:
SALU writing then SALU or VALU reading a SGPR that was previously used as a lane mask for a VALU.
Mitigated by:
A VALU instruction reading a non-exec SGPR before the SALU write, or a sa_sdst=0 wait:
`s_waitcnt_depctr 0xfffe`
A VALU instruction reading a non-exec SGPR before the SALU write, or a sa_sdst=0 wait after the
SALU write: `s_waitcnt_depctr 0xfffe`

View file

@ -1631,6 +1631,7 @@ resolve_all_gfx11(State& state, NOP_ctx_gfx11& ctx,
Builder bld(state.program, &new_instructions);
unsigned waitcnt_depctr = 0xffff;
bool valu_read_sgpr = false;
/* LdsDirectVALUHazard/VALUPartialForwardingHazard/VALUTransUseHazard */
bool has_vdst0_since_valu = true;
@ -1651,12 +1652,15 @@ resolve_all_gfx11(State& state, NOP_ctx_gfx11& ctx,
}
/* VALUMaskWriteHazard */
if (state.program->gfx_level < GFX12 && state.program->wave_size == 64 &&
(ctx.sgpr_read_by_valu_as_lanemask.any() ||
ctx.sgpr_read_by_valu_as_lanemask_then_wr_by_salu.any())) {
waitcnt_depctr &= 0xfffe;
ctx.sgpr_read_by_valu_as_lanemask.reset();
ctx.sgpr_read_by_valu_as_lanemask_then_wr_by_salu.reset();
if (state.program->gfx_level < GFX12 && state.program->wave_size == 64) {
if (ctx.sgpr_read_by_valu_as_lanemask_then_wr_by_salu.any()) {
waitcnt_depctr &= 0xfffe;
ctx.sgpr_read_by_valu_as_lanemask_then_wr_by_salu.reset();
}
if (ctx.sgpr_read_by_valu_as_lanemask.any()) {
valu_read_sgpr = true;
ctx.sgpr_read_by_valu_as_lanemask.reset();
}
}
/* LdsDirectVMEMHazard */
@ -1671,6 +1675,16 @@ resolve_all_gfx11(State& state, NOP_ctx_gfx11& ctx,
if (waitcnt_depctr != 0xffff)
bld.sopp(aco_opcode::s_waitcnt_depctr, waitcnt_depctr);
if (valu_read_sgpr) {
/* This has to be after the s_waitcnt_depctr so that the instruction is not involved in any
* other hazards. */
bld.vop3(aco_opcode::v_xor3_b32, Definition(PhysReg(256), v1), Operand(PhysReg(256), v1),
Operand(PhysReg(0), s1), Operand(PhysReg(0), s1));
/* workaround possible LdsDirectVALUHazard/VALUPartialForwardingHazard */
bld.sopp(aco_opcode::s_waitcnt_depctr, 0x0fff);
}
}
template <typename Ctx>

View file

@ -1646,9 +1646,22 @@ BEGIN_TEST(insert_nops.setpc_gfx11)
/* VALUMaskWriteHazard */
//! p_unit_test 4
//! v1: %0:v[0] = v_cndmask_b32 0, 0, %0:vcc
//! s1: %0:vcc_hi = s_mov_b32 0
//! s_waitcnt_depctr va_vdst(0) sa_sdst(0)
//! s_setpc_b64 0
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4));
bld.vop2(aco_opcode::v_cndmask_b32, Definition(PhysReg(256), v1), Operand::zero(),
Operand::zero(), Operand(vcc, s2));
bld.sop1(aco_opcode::s_mov_b32, Definition(vcc_hi, s1), Operand::c32(0));
bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
//! p_unit_test 8
//! v1: %0:v[0] = v_cndmask_b32 0, 0, %0:vcc
//! s_waitcnt_depctr va_vdst(0)
//! v1: %0:v[0] = v_xor3_b32 %0:v[0], %0:s[0], %0:s[0]
//! s_waitcnt_depctr va_vdst(0)
//! s_setpc_b64 0
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(8));
bld.vop2(aco_opcode::v_cndmask_b32, Definition(PhysReg(256), v1), Operand::zero(),
Operand::zero(), Operand(vcc, s2));
bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));