mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-02-20 12:20:29 +01:00
aco/gfx11: workaround VALUMaskWriteHazard
fossil-db (gfx1100): Totals from 62812 (46.52% of 135032) affected shaders: Instrs: 43971580 -> 44069887 (+0.22%) CodeSize: 233473420 -> 233866648 (+0.17%) Latency: 463487489 -> 463520688 (+0.01%); split: -0.00%, +0.01% InvThroughput: 86505748 -> 86509679 (+0.00%); split: -0.00%, +0.00% Signed-off-by: Rhys Perry <pendingchaos02@gmail.com> Reviewed-by: Daniel Schürmann <daniel@schuermann.dev> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/18273>
This commit is contained in:
parent
5806f0246f
commit
6f368431fd
3 changed files with 143 additions and 1 deletions
|
|
@ -330,3 +330,11 @@ VALU between the second VGPR write and the current instruction.
|
|||
|
||||
Mitigated by:
|
||||
A va_vdst=0 wait: `s_waitcnt_deptr 0x0fff`
|
||||
|
||||
### VALUMaskWriteHazard
|
||||
|
||||
Triggered by:
|
||||
SALU writing then reading a SGPR that was previously used as a lane mask for a VALU.
|
||||
|
||||
Mitigated by:
|
||||
A VALU instruction reading a SGPR or with literal, or a sa_sdst=0 wait: `s_waitcnt_depctr 0xfffe`
|
||||
|
|
|
|||
|
|
@ -292,6 +292,10 @@ struct NOP_ctx_gfx11 {
|
|||
VGPRCounterMap<15> valu_since_wr_by_trans;
|
||||
VGPRCounterMap<2> trans_since_wr_by_trans;
|
||||
|
||||
/* VALUMaskWriteHazard */
|
||||
std::bitset<128> sgpr_read_by_valu_as_lanemask;
|
||||
std::bitset<128> sgpr_read_by_valu_as_lanemask_then_wr_by_salu;
|
||||
|
||||
void join(const NOP_ctx_gfx11& other)
|
||||
{
|
||||
has_Vcmpx |= other.has_Vcmpx;
|
||||
|
|
@ -300,6 +304,9 @@ struct NOP_ctx_gfx11 {
|
|||
vgpr_used_by_ds |= other.vgpr_used_by_ds;
|
||||
valu_since_wr_by_trans.join_min(other.valu_since_wr_by_trans);
|
||||
trans_since_wr_by_trans.join_min(other.trans_since_wr_by_trans);
|
||||
sgpr_read_by_valu_as_lanemask |= other.sgpr_read_by_valu_as_lanemask;
|
||||
sgpr_read_by_valu_as_lanemask_then_wr_by_salu |=
|
||||
other.sgpr_read_by_valu_as_lanemask_then_wr_by_salu;
|
||||
}
|
||||
|
||||
bool operator==(const NOP_ctx_gfx11& other)
|
||||
|
|
@ -309,7 +316,10 @@ struct NOP_ctx_gfx11 {
|
|||
vgpr_used_by_vmem_store == other.vgpr_used_by_vmem_store &&
|
||||
vgpr_used_by_ds == other.vgpr_used_by_ds &&
|
||||
valu_since_wr_by_trans == other.valu_since_wr_by_trans &&
|
||||
trans_since_wr_by_trans == other.trans_since_wr_by_trans;
|
||||
trans_since_wr_by_trans == other.trans_since_wr_by_trans &&
|
||||
sgpr_read_by_valu_as_lanemask == other.sgpr_read_by_valu_as_lanemask &&
|
||||
sgpr_read_by_valu_as_lanemask_then_wr_by_salu ==
|
||||
other.sgpr_read_by_valu_as_lanemask_then_wr_by_salu;
|
||||
}
|
||||
};
|
||||
|
||||
|
|
@ -728,6 +738,24 @@ check_written_regs(const aco_ptr<Instruction>& instr, const std::bitset<N>& chec
|
|||
});
|
||||
}
|
||||
|
||||
template <std::size_t N>
|
||||
bool
|
||||
check_read_regs(const aco_ptr<Instruction>& instr, const std::bitset<N>& check_regs)
|
||||
{
|
||||
return std::any_of(instr->operands.begin(), instr->operands.end(),
|
||||
[&check_regs](const Operand& op) -> bool
|
||||
{
|
||||
if (op.isConstant())
|
||||
return false;
|
||||
bool writes_any = false;
|
||||
for (unsigned i = 0; i < op.size(); i++) {
|
||||
unsigned op_reg = op.physReg() + i;
|
||||
writes_any |= op_reg < check_regs.size() && check_regs[op_reg];
|
||||
}
|
||||
return writes_any;
|
||||
});
|
||||
}
|
||||
|
||||
template <std::size_t N>
|
||||
void
|
||||
mark_read_regs(const aco_ptr<Instruction>& instr, std::bitset<N>& reg_reads)
|
||||
|
|
@ -1265,6 +1293,21 @@ handle_instruction_gfx11(State& state, NOP_ctx_gfx11& ctx, aco_ptr<Instruction>&
|
|||
va_vdst = 0;
|
||||
}
|
||||
|
||||
/* VALUMaskWriteHazard
|
||||
* VALU reads SGPR as a lane mask and later written by SALU cannot safely be read by SALU.
|
||||
*/
|
||||
if (state.program->wave_size == 64 && instr->isSALU() &&
|
||||
check_written_regs(instr, ctx.sgpr_read_by_valu_as_lanemask)) {
|
||||
ctx.sgpr_read_by_valu_as_lanemask_then_wr_by_salu = ctx.sgpr_read_by_valu_as_lanemask;
|
||||
ctx.sgpr_read_by_valu_as_lanemask.reset();
|
||||
} else if (state.program->wave_size == 64 && instr->isSALU() &&
|
||||
check_read_regs(instr, ctx.sgpr_read_by_valu_as_lanemask_then_wr_by_salu)) {
|
||||
bld.sopp(aco_opcode::s_waitcnt_depctr, -1, 0xfffe);
|
||||
ctx.sgpr_read_by_valu_as_lanemask_then_wr_by_salu.reset();
|
||||
} else if (instr->opcode == aco_opcode::s_waitcnt_depctr && (instr->sopp().imm & 0x1) == 0) {
|
||||
ctx.sgpr_read_by_valu_as_lanemask_then_wr_by_salu.reset();
|
||||
}
|
||||
|
||||
va_vdst = std::min(va_vdst, parse_vdst_wait(instr));
|
||||
if (va_vdst == 0) {
|
||||
ctx.valu_since_wr_by_trans.reset();
|
||||
|
|
@ -1286,6 +1329,28 @@ handle_instruction_gfx11(State& state, NOP_ctx_gfx11& ctx, aco_ptr<Instruction>&
|
|||
ctx.trans_since_wr_by_trans.set(def.physReg(), def.bytes());
|
||||
}
|
||||
}
|
||||
|
||||
if (state.program->wave_size == 64) {
|
||||
for (Operand& op : instr->operands) {
|
||||
if (op.isLiteral() || (!op.isConstant() && op.physReg().reg() < 128))
|
||||
ctx.sgpr_read_by_valu_as_lanemask.reset();
|
||||
}
|
||||
switch (instr->opcode) {
|
||||
case aco_opcode::v_addc_co_u32:
|
||||
case aco_opcode::v_subb_co_u32:
|
||||
case aco_opcode::v_subbrev_co_u32:
|
||||
case aco_opcode::v_cndmask_b16:
|
||||
case aco_opcode::v_cndmask_b32:
|
||||
case aco_opcode::v_div_fmas_f32:
|
||||
case aco_opcode::v_div_fmas_f64:
|
||||
if (instr->operands.back().physReg() != exec) {
|
||||
ctx.sgpr_read_by_valu_as_lanemask.set(instr->operands.back().physReg().reg());
|
||||
ctx.sgpr_read_by_valu_as_lanemask.set(instr->operands.back().physReg().reg() + 1);
|
||||
}
|
||||
break;
|
||||
default: break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* LdsDirectVMEMHazard
|
||||
|
|
|
|||
|
|
@ -920,3 +920,72 @@ BEGIN_TEST(insert_nops.valu_partial_forwarding.control_flow)
|
|||
|
||||
finish_insert_nops_test();
|
||||
END_TEST
|
||||
|
||||
BEGIN_TEST(insert_nops.valu_mask_write)
|
||||
if (!setup_cs(NULL, GFX11))
|
||||
return;
|
||||
|
||||
/* Basic case. */
|
||||
//>> p_unit_test 0
|
||||
//! v1: %0:v[0] = v_cndmask_b32 0, 0, %0:s[0-1]
|
||||
//! s1: %0:s[1] = s_mov_b32 0
|
||||
//! s_waitcnt_depctr sa_sdst(0)
|
||||
//! s1: %0:s[2] = s_mov_b32 %0:s[1]
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0));
|
||||
bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(PhysReg(256), v1),
|
||||
Operand::zero(), Operand::zero(), Operand(PhysReg(0), s2));
|
||||
bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(1), s1), Operand::zero());
|
||||
bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(2), s1), Operand(PhysReg(1), s1));
|
||||
|
||||
/* Mitigation. */
|
||||
//! p_unit_test 1
|
||||
//! v1: %0:v[0] = v_cndmask_b32 0, 0, %0:s[0-1]
|
||||
//! v1: %0:v[1] = v_mov_b32 %0:s[1]
|
||||
//! s1: %0:s[1] = s_mov_b32 0
|
||||
//! s1: %0:s[2] = s_mov_b32 %0:s[1]
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1));
|
||||
bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(PhysReg(256), v1),
|
||||
Operand::zero(), Operand::zero(), Operand(PhysReg(0), s2));
|
||||
bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand(PhysReg(1), s1));
|
||||
bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(1), s1), Operand::zero());
|
||||
bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(2), s1), Operand(PhysReg(1), s1));
|
||||
|
||||
//! p_unit_test 2
|
||||
//! v1: %0:v[0] = v_cndmask_b32 0, 0, %0:s[0-1]
|
||||
//! s1: %0:s[1] = s_mov_b32 0
|
||||
//! s_waitcnt_depctr sa_sdst(0)
|
||||
//! s1: %0:s[2] = s_mov_b32 %0:s[1]
|
||||
//! s1: %0:s[2] = s_mov_b32 %0:s[1]
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2));
|
||||
bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(PhysReg(256), v1),
|
||||
Operand::zero(), Operand::zero(), Operand(PhysReg(0), s2));
|
||||
bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(1), s1), Operand::zero());
|
||||
bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(2), s1), Operand(PhysReg(1), s1));
|
||||
bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(2), s1), Operand(PhysReg(1), s1));
|
||||
|
||||
//! p_unit_test 3
|
||||
//! v1: %0:v[0] = v_cndmask_b32 0, 0, %0:s[0-1]
|
||||
//! s1: %0:s[1] = s_mov_b32 0
|
||||
//! s_waitcnt_depctr sa_sdst(0)
|
||||
//! s1: %0:s[2] = s_mov_b32 %0:s[1]
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3));
|
||||
bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(PhysReg(256), v1),
|
||||
Operand::zero(), Operand::zero(), Operand(PhysReg(0), s2));
|
||||
bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(1), s1), Operand::zero());
|
||||
bld.sopp(aco_opcode::s_waitcnt_depctr, -1, 0xfffe);
|
||||
bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(2), s1), Operand(PhysReg(1), s1));
|
||||
|
||||
/* Instruction which is both involved in the hazard and is a mitigation. */
|
||||
//! p_unit_test 4
|
||||
//! v1: %0:v[0] = v_cndmask_b32 %0:s[2], 0, %0:s[0-1]
|
||||
//! s1: %0:s[1] = s_mov_b32 0
|
||||
//! s_waitcnt_depctr sa_sdst(0)
|
||||
//! s1: %0:s[2] = s_mov_b32 %0:s[1]
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4));
|
||||
bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(PhysReg(256), v1),
|
||||
Operand(PhysReg(2), s1), Operand::zero(), Operand(PhysReg(0), s2));
|
||||
bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(1), s1), Operand::zero());
|
||||
bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(2), s1), Operand(PhysReg(1), s1));
|
||||
|
||||
finish_insert_nops_test();
|
||||
END_TEST
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue