aco/gfx11: workaround VALUMaskWriteHazard

fossil-db (gfx1100):
Totals from 62812 (46.52% of 135032) affected shaders:
Instrs: 43971580 -> 44069887 (+0.22%)
CodeSize: 233473420 -> 233866648 (+0.17%)
Latency: 463487489 -> 463520688 (+0.01%); split: -0.00%, +0.01%
InvThroughput: 86505748 -> 86509679 (+0.00%); split: -0.00%, +0.00%

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/18273>
This commit is contained in:
Rhys Perry 2022-09-27 15:37:11 +01:00 committed by Marge Bot
parent 5806f0246f
commit 6f368431fd
3 changed files with 143 additions and 1 deletions

View file

@ -330,3 +330,11 @@ VALU between the second VGPR write and the current instruction.
Mitigated by:
A va_vdst=0 wait: `s_waitcnt_deptr 0x0fff`
### VALUMaskWriteHazard
Triggered by:
SALU writing then reading a SGPR that was previously used as a lane mask for a VALU.
Mitigated by:
A VALU instruction reading a SGPR or with literal, or a sa_sdst=0 wait: `s_waitcnt_depctr 0xfffe`

View file

@ -292,6 +292,10 @@ struct NOP_ctx_gfx11 {
VGPRCounterMap<15> valu_since_wr_by_trans;
VGPRCounterMap<2> trans_since_wr_by_trans;
/* VALUMaskWriteHazard */
std::bitset<128> sgpr_read_by_valu_as_lanemask;
std::bitset<128> sgpr_read_by_valu_as_lanemask_then_wr_by_salu;
void join(const NOP_ctx_gfx11& other)
{
has_Vcmpx |= other.has_Vcmpx;
@ -300,6 +304,9 @@ struct NOP_ctx_gfx11 {
vgpr_used_by_ds |= other.vgpr_used_by_ds;
valu_since_wr_by_trans.join_min(other.valu_since_wr_by_trans);
trans_since_wr_by_trans.join_min(other.trans_since_wr_by_trans);
sgpr_read_by_valu_as_lanemask |= other.sgpr_read_by_valu_as_lanemask;
sgpr_read_by_valu_as_lanemask_then_wr_by_salu |=
other.sgpr_read_by_valu_as_lanemask_then_wr_by_salu;
}
bool operator==(const NOP_ctx_gfx11& other)
@ -309,7 +316,10 @@ struct NOP_ctx_gfx11 {
vgpr_used_by_vmem_store == other.vgpr_used_by_vmem_store &&
vgpr_used_by_ds == other.vgpr_used_by_ds &&
valu_since_wr_by_trans == other.valu_since_wr_by_trans &&
trans_since_wr_by_trans == other.trans_since_wr_by_trans;
trans_since_wr_by_trans == other.trans_since_wr_by_trans &&
sgpr_read_by_valu_as_lanemask == other.sgpr_read_by_valu_as_lanemask &&
sgpr_read_by_valu_as_lanemask_then_wr_by_salu ==
other.sgpr_read_by_valu_as_lanemask_then_wr_by_salu;
}
};
@ -728,6 +738,24 @@ check_written_regs(const aco_ptr<Instruction>& instr, const std::bitset<N>& chec
});
}
template <std::size_t N>
bool
check_read_regs(const aco_ptr<Instruction>& instr, const std::bitset<N>& check_regs)
{
return std::any_of(instr->operands.begin(), instr->operands.end(),
[&check_regs](const Operand& op) -> bool
{
if (op.isConstant())
return false;
bool writes_any = false;
for (unsigned i = 0; i < op.size(); i++) {
unsigned op_reg = op.physReg() + i;
writes_any |= op_reg < check_regs.size() && check_regs[op_reg];
}
return writes_any;
});
}
template <std::size_t N>
void
mark_read_regs(const aco_ptr<Instruction>& instr, std::bitset<N>& reg_reads)
@ -1265,6 +1293,21 @@ handle_instruction_gfx11(State& state, NOP_ctx_gfx11& ctx, aco_ptr<Instruction>&
va_vdst = 0;
}
/* VALUMaskWriteHazard
* VALU reads SGPR as a lane mask and later written by SALU cannot safely be read by SALU.
*/
if (state.program->wave_size == 64 && instr->isSALU() &&
check_written_regs(instr, ctx.sgpr_read_by_valu_as_lanemask)) {
ctx.sgpr_read_by_valu_as_lanemask_then_wr_by_salu = ctx.sgpr_read_by_valu_as_lanemask;
ctx.sgpr_read_by_valu_as_lanemask.reset();
} else if (state.program->wave_size == 64 && instr->isSALU() &&
check_read_regs(instr, ctx.sgpr_read_by_valu_as_lanemask_then_wr_by_salu)) {
bld.sopp(aco_opcode::s_waitcnt_depctr, -1, 0xfffe);
ctx.sgpr_read_by_valu_as_lanemask_then_wr_by_salu.reset();
} else if (instr->opcode == aco_opcode::s_waitcnt_depctr && (instr->sopp().imm & 0x1) == 0) {
ctx.sgpr_read_by_valu_as_lanemask_then_wr_by_salu.reset();
}
va_vdst = std::min(va_vdst, parse_vdst_wait(instr));
if (va_vdst == 0) {
ctx.valu_since_wr_by_trans.reset();
@ -1286,6 +1329,28 @@ handle_instruction_gfx11(State& state, NOP_ctx_gfx11& ctx, aco_ptr<Instruction>&
ctx.trans_since_wr_by_trans.set(def.physReg(), def.bytes());
}
}
if (state.program->wave_size == 64) {
for (Operand& op : instr->operands) {
if (op.isLiteral() || (!op.isConstant() && op.physReg().reg() < 128))
ctx.sgpr_read_by_valu_as_lanemask.reset();
}
switch (instr->opcode) {
case aco_opcode::v_addc_co_u32:
case aco_opcode::v_subb_co_u32:
case aco_opcode::v_subbrev_co_u32:
case aco_opcode::v_cndmask_b16:
case aco_opcode::v_cndmask_b32:
case aco_opcode::v_div_fmas_f32:
case aco_opcode::v_div_fmas_f64:
if (instr->operands.back().physReg() != exec) {
ctx.sgpr_read_by_valu_as_lanemask.set(instr->operands.back().physReg().reg());
ctx.sgpr_read_by_valu_as_lanemask.set(instr->operands.back().physReg().reg() + 1);
}
break;
default: break;
}
}
}
/* LdsDirectVMEMHazard

View file

@ -920,3 +920,72 @@ BEGIN_TEST(insert_nops.valu_partial_forwarding.control_flow)
finish_insert_nops_test();
END_TEST
BEGIN_TEST(insert_nops.valu_mask_write)
if (!setup_cs(NULL, GFX11))
return;
/* Basic case. */
//>> p_unit_test 0
//! v1: %0:v[0] = v_cndmask_b32 0, 0, %0:s[0-1]
//! s1: %0:s[1] = s_mov_b32 0
//! s_waitcnt_depctr sa_sdst(0)
//! s1: %0:s[2] = s_mov_b32 %0:s[1]
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0));
bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(PhysReg(256), v1),
Operand::zero(), Operand::zero(), Operand(PhysReg(0), s2));
bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(1), s1), Operand::zero());
bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(2), s1), Operand(PhysReg(1), s1));
/* Mitigation. */
//! p_unit_test 1
//! v1: %0:v[0] = v_cndmask_b32 0, 0, %0:s[0-1]
//! v1: %0:v[1] = v_mov_b32 %0:s[1]
//! s1: %0:s[1] = s_mov_b32 0
//! s1: %0:s[2] = s_mov_b32 %0:s[1]
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1));
bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(PhysReg(256), v1),
Operand::zero(), Operand::zero(), Operand(PhysReg(0), s2));
bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand(PhysReg(1), s1));
bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(1), s1), Operand::zero());
bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(2), s1), Operand(PhysReg(1), s1));
//! p_unit_test 2
//! v1: %0:v[0] = v_cndmask_b32 0, 0, %0:s[0-1]
//! s1: %0:s[1] = s_mov_b32 0
//! s_waitcnt_depctr sa_sdst(0)
//! s1: %0:s[2] = s_mov_b32 %0:s[1]
//! s1: %0:s[2] = s_mov_b32 %0:s[1]
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2));
bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(PhysReg(256), v1),
Operand::zero(), Operand::zero(), Operand(PhysReg(0), s2));
bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(1), s1), Operand::zero());
bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(2), s1), Operand(PhysReg(1), s1));
bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(2), s1), Operand(PhysReg(1), s1));
//! p_unit_test 3
//! v1: %0:v[0] = v_cndmask_b32 0, 0, %0:s[0-1]
//! s1: %0:s[1] = s_mov_b32 0
//! s_waitcnt_depctr sa_sdst(0)
//! s1: %0:s[2] = s_mov_b32 %0:s[1]
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3));
bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(PhysReg(256), v1),
Operand::zero(), Operand::zero(), Operand(PhysReg(0), s2));
bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(1), s1), Operand::zero());
bld.sopp(aco_opcode::s_waitcnt_depctr, -1, 0xfffe);
bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(2), s1), Operand(PhysReg(1), s1));
/* Instruction which is both involved in the hazard and is a mitigation. */
//! p_unit_test 4
//! v1: %0:v[0] = v_cndmask_b32 %0:s[2], 0, %0:s[0-1]
//! s1: %0:s[1] = s_mov_b32 0
//! s_waitcnt_depctr sa_sdst(0)
//! s1: %0:s[2] = s_mov_b32 %0:s[1]
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4));
bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(PhysReg(256), v1),
Operand(PhysReg(2), s1), Operand::zero(), Operand(PhysReg(0), s2));
bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(1), s1), Operand::zero());
bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(2), s1), Operand(PhysReg(1), s1));
finish_insert_nops_test();
END_TEST