mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2025-12-24 19:40:10 +01:00
aco: combine VALU lanemask hazard into VALUMaskWriteHazard
This is now basically the same as the original VALUMaskWriteHazard, except it now considers both VALU and SALU writes.

Now that it's a part of VALUMaskWriteHazard, differences from the original VALU lanemask workaround are:
- it includes SALU reads after the write
- it includes VALU writes and SALU/VALU reads after the write which are not lanemasks
- it combines s_waitcnt_depctr instructions when it's a read after both a SALU write and a VALU write
- non-exec VALU SGPR reads reset the SGPRs read by VALU as a lanemask
- exec SGPRs are ignored

resolve_all_gfx11() is also finished.

fossil-db (navi31):
Totals from 21538 (27.13% of 79377) affected shaders:
Instrs: 27628855 -> 27552972 (-0.27%); split: -0.30%, +0.03%
CodeSize: 145968448 -> 145667616 (-0.21%); split: -0.23%, +0.02%
Latency: 209537805 -> 209509519 (-0.01%); split: -0.02%, +0.00%
InvThroughput: 36304270 -> 36301624 (-0.01%); split: -0.01%, +0.00%

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Georg Lehmann <dadschoorse@gmail.com>
Closes: https://gitlab.freedesktop.org/mesa/mesa/-/issues/12623
Closes: https://gitlab.freedesktop.org/mesa/mesa/-/issues/11480
Backport-to: 25.0
Backport-to: 25.1
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/34529>
This commit is contained in:
parent
eee3c8eab8
commit
ce2be5ab8e
2 changed files with 66 additions and 92 deletions
|
|
@ -376,11 +376,13 @@ A va_vdst=0 wait: `s_waitcnt_depctr 0x0fff`
|
|||
### VALUMaskWriteHazard
|
||||
|
||||
Triggered by:
|
||||
SALU writing then SALU or VALU reading a SGPR that was previously used as a lane mask for a VALU.
|
||||
SALU or VALU writing then SALU or VALU reading a SGPR that was previously used as a lane mask for a
|
||||
VALU when using wave64.
|
||||
|
||||
Mitigated by:
|
||||
A VALU instruction reading a non-exec SGPR before the SALU write, or a sa_sdst=0 wait after the
|
||||
SALU write: `s_waitcnt_depctr 0xfffe`
|
||||
A VALU instruction reading a non-exec SGPR before the SGPR write, or a wait after the
|
||||
write: `s_waitcnt_depctr 0xfffe` for SALU, `s_waitcnt_depctr 0xf1ff` for non-VCC VALU and
|
||||
`s_waitcnt_depctr 0xfffd` for VCC VALU.
|
||||
|
||||
## RDNA4 / GFX12 hazards
|
||||
|
||||
|
|
|
|||
|
|
@ -258,8 +258,6 @@ struct NOP_ctx_gfx11 {
|
|||
/* VALUMaskWriteHazard */
|
||||
std::bitset<128> sgpr_read_by_valu_as_lanemask;
|
||||
std::bitset<128> sgpr_read_by_valu_as_lanemask_then_wr_by_salu;
|
||||
|
||||
std::bitset<128> sgpr_read_by_valu_as_lanemask2;
|
||||
std::bitset<128> sgpr_read_by_valu_as_lanemask_then_wr_by_valu;
|
||||
|
||||
/* WMMAHazards */
|
||||
|
|
@ -281,7 +279,6 @@ struct NOP_ctx_gfx11 {
|
|||
valu_since_wr_by_trans.join_min(other.valu_since_wr_by_trans);
|
||||
trans_since_wr_by_trans.join_min(other.trans_since_wr_by_trans);
|
||||
sgpr_read_by_valu_as_lanemask |= other.sgpr_read_by_valu_as_lanemask;
|
||||
sgpr_read_by_valu_as_lanemask2 |= other.sgpr_read_by_valu_as_lanemask2;
|
||||
sgpr_read_by_valu_as_lanemask_then_wr_by_salu |=
|
||||
other.sgpr_read_by_valu_as_lanemask_then_wr_by_salu;
|
||||
sgpr_read_by_valu_as_lanemask_then_wr_by_valu |=
|
||||
|
|
@ -303,7 +300,6 @@ struct NOP_ctx_gfx11 {
|
|||
valu_since_wr_by_trans == other.valu_since_wr_by_trans &&
|
||||
trans_since_wr_by_trans == other.trans_since_wr_by_trans &&
|
||||
sgpr_read_by_valu_as_lanemask == other.sgpr_read_by_valu_as_lanemask &&
|
||||
sgpr_read_by_valu_as_lanemask2 == other.sgpr_read_by_valu_as_lanemask2 &&
|
||||
sgpr_read_by_valu_as_lanemask_then_wr_by_salu ==
|
||||
other.sgpr_read_by_valu_as_lanemask_then_wr_by_salu &&
|
||||
sgpr_read_by_valu_as_lanemask_then_wr_by_valu ==
|
||||
|
|
@ -807,24 +803,6 @@ check_written_regs(const aco_ptr<Instruction>& instr, const std::bitset<N>& chec
|
|||
});
|
||||
}
|
||||
|
||||
/* Returns true if at least one non-constant operand of `instr` covers a register
 * whose bit is set in `check_regs`.
 *
 * Every register an operand spans is tested: physReg() + 0 .. physReg() + size() - 1.
 * Register indices at or beyond check_regs.size() are treated as "not set" rather
 * than being looked up in the bitset.
 */
template <std::size_t N>
|
||||
bool
|
||||
check_read_regs(const aco_ptr<Instruction>& instr, const std::bitset<N>& check_regs)
|
||||
{
|
||||
return std::any_of(instr->operands.begin(), instr->operands.end(),
|
||||
[&check_regs](const Operand& op) -> bool
|
||||
{
|
||||
/* Constant operands are never reported as a match. */
if (op.isConstant())
|
||||
return false;
|
||||
/* NOTE: despite its name, this flag records whether any *read* register is in check_regs. */
bool writes_any = false;
|
||||
for (unsigned i = 0; i < op.size(); i++) {
|
||||
unsigned op_reg = op.physReg() + i;
|
||||
/* The bounds check comes first so the bitset is never indexed out of range. */
writes_any |= op_reg < check_regs.size() && check_regs[op_reg];
|
||||
}
|
||||
return writes_any;
|
||||
});
|
||||
}
|
||||
|
||||
template <std::size_t N>
|
||||
void
|
||||
mark_read_regs(const aco_ptr<Instruction>& instr, std::bitset<N>& reg_reads)
|
||||
|
|
@ -1386,30 +1364,6 @@ handle_valu_partial_forwarding_hazard(State& state, aco_ptr<Instruction>& instr)
|
|||
return global_state.hazard_found;
|
||||
}
|
||||
|
||||
/* Decides whether `instr` is treated as a VALU instruction that reads a lane mask
 * and stores the operand in question in `*op`.
 *
 * - Non-VALU instructions: returns false, `*op` is left untouched.
 * - VOPD instructions: `*op` is always set to Operand(vcc, s1); the result is true
 *   only if either half (instr->opcode or vopd().opy) is v_dual_cndmask_b32.
 * - The opcodes listed in the switch: `*op` is set to the last operand; the result
 *   is true unless that operand is a constant.
 * - Any other opcode: returns false, `*op` is left untouched.
 *
 * Note that `*op` may be written even when the function returns false, so callers
 * should only rely on it after a true result.
 */
static bool
|
||||
instr_reads_lanemask(Instruction* instr, Operand* op)
|
||||
{
|
||||
if (!instr->isVALU())
|
||||
return false;
|
||||
if (instr->isVOPD()) {
|
||||
/* Written before the opcode check, i.e. also when the result below is false. */
*op = Operand(vcc, s1);
|
||||
return instr->opcode == aco_opcode::v_dual_cndmask_b32 ||
|
||||
instr->vopd().opy == aco_opcode::v_dual_cndmask_b32;
|
||||
}
|
||||
switch (instr->opcode) {
|
||||
case aco_opcode::v_addc_co_u32:
|
||||
case aco_opcode::v_subb_co_u32:
|
||||
case aco_opcode::v_subbrev_co_u32:
|
||||
case aco_opcode::v_cndmask_b16:
|
||||
case aco_opcode::v_cndmask_b32:
|
||||
case aco_opcode::v_div_fmas_f32:
|
||||
case aco_opcode::v_div_fmas_f64:
|
||||
/* For all of these opcodes the last operand is the one that gets reported. */
*op = instr->operands.back();
|
||||
/* A constant last operand does not count as a lane mask read. */
return !instr->operands.back().isConstant();
|
||||
default: return false;
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
handle_instruction_gfx11(State& state, NOP_ctx_gfx11& ctx, aco_ptr<Instruction>& instr,
|
||||
std::vector<aco_ptr<Instruction>>& new_instructions)
|
||||
|
|
@ -1497,33 +1451,39 @@ handle_instruction_gfx11(State& state, NOP_ctx_gfx11& ctx, aco_ptr<Instruction>&
|
|||
|
||||
if (state.program->gfx_level < GFX12) {
|
||||
/* VALUMaskWriteHazard
|
||||
* VALU reads SGPR as a lane mask and later written by SALU cannot safely be read by SALU or
|
||||
* VALU.
|
||||
* VALU reads SGPR as a lane mask and later written by SALU or VALU cannot safely be read by
|
||||
* SALU or VALU.
|
||||
*/
|
||||
if (state.program->wave_size == 64 && (instr->isSALU() || instr->isVALU()) &&
|
||||
check_read_regs(instr, ctx.sgpr_read_by_valu_as_lanemask_then_wr_by_salu)) {
|
||||
bld.sopp(aco_opcode::s_waitcnt_depctr, 0xfffe);
|
||||
sa_sdst = 0;
|
||||
}
|
||||
if (state.program->wave_size == 64 && (instr->isSALU() || instr->isVALU())) {
|
||||
uint16_t imm = 0xffff;
|
||||
|
||||
/* VALU reading a SGPR as a lane mask and later written as a lane mask shouldn't be read again
|
||||
* as a lane mask without a wait.
|
||||
*
|
||||
* TODO: this fixes #12623 and #11480, but needs further investigation as to why.
|
||||
*/
|
||||
Operand lanemask_op;
|
||||
if (instr_reads_lanemask(instr.get(), &lanemask_op)) {
|
||||
unsigned reg = lanemask_op.physReg().reg();
|
||||
if (ctx.sgpr_read_by_valu_as_lanemask_then_wr_by_valu[reg] ||
|
||||
(state.program->wave_size == 64 &&
|
||||
ctx.sgpr_read_by_valu_as_lanemask_then_wr_by_valu[reg + 1])) {
|
||||
bool is_vcc = reg == vcc || reg == vcc_hi;
|
||||
bld.sopp(aco_opcode::s_waitcnt_depctr, is_vcc ? 0xfffd : 0xf1ff);
|
||||
if (is_vcc)
|
||||
wait.va_vcc = 0;
|
||||
else
|
||||
wait.va_sdst = 0;
|
||||
for (Operand op : instr->operands) {
|
||||
if (op.physReg() >= state.program->dev.sgpr_limit)
|
||||
continue;
|
||||
|
||||
for (unsigned i = 0; i < op.size(); i++) {
|
||||
unsigned reg = op.physReg() + i;
|
||||
|
||||
/* s_waitcnt_depctr on sa_sdst */
|
||||
if (ctx.sgpr_read_by_valu_as_lanemask_then_wr_by_salu[reg]) {
|
||||
imm &= 0xfffe;
|
||||
sa_sdst = 0;
|
||||
}
|
||||
|
||||
/* s_waitcnt_depctr on va_sdst (if non-VCC SGPR) or va_vcc (if VCC SGPR) */
|
||||
if (ctx.sgpr_read_by_valu_as_lanemask_then_wr_by_valu[reg]) {
|
||||
bool is_vcc = reg == vcc || reg == vcc_hi;
|
||||
imm &= is_vcc ? 0xfffd : 0xf1ff;
|
||||
if (is_vcc)
|
||||
wait.va_vcc = 0;
|
||||
else
|
||||
wait.va_sdst = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (imm != 0xffff)
|
||||
bld.sopp(aco_opcode::s_waitcnt_depctr, imm);
|
||||
}
|
||||
|
||||
if (va_vdst == 0) {
|
||||
|
|
@ -1577,28 +1537,30 @@ handle_instruction_gfx11(State& state, NOP_ctx_gfx11& ctx, aco_ptr<Instruction>&
|
|||
if (!op.isConstant() && op.physReg().reg() < 126)
|
||||
ctx.sgpr_read_by_valu_as_lanemask.reset();
|
||||
}
|
||||
}
|
||||
|
||||
if (instr_reads_lanemask(instr.get(), &lanemask_op)) {
|
||||
unsigned reg = lanemask_op.physReg().reg();
|
||||
if (state.program->wave_size == 64 && reg != exec) {
|
||||
ctx.sgpr_read_by_valu_as_lanemask.set(reg);
|
||||
ctx.sgpr_read_by_valu_as_lanemask.set(reg + 1);
|
||||
if (!instr->definitions.empty() &&
|
||||
instr->definitions.back().getTemp().type() == RegType::sgpr &&
|
||||
check_written_regs(instr, ctx.sgpr_read_by_valu_as_lanemask)) {
|
||||
unsigned reg = instr->definitions.back().physReg().reg();
|
||||
for (unsigned i = 0; i < instr->definitions.back().size(); i++)
|
||||
ctx.sgpr_read_by_valu_as_lanemask_then_wr_by_valu[reg + i] = 1;
|
||||
}
|
||||
ctx.sgpr_read_by_valu_as_lanemask2.set(reg);
|
||||
if (state.program->wave_size == 64)
|
||||
ctx.sgpr_read_by_valu_as_lanemask2.set(reg + 1);
|
||||
}
|
||||
|
||||
if (instr->opcode != aco_opcode::v_readlane_b32_e64 &&
|
||||
instr->opcode != aco_opcode::v_readfirstlane_b32 &&
|
||||
!instr->definitions.empty() &&
|
||||
instr->definitions.back().getTemp().type() == RegType::sgpr) {
|
||||
unsigned reg = instr->definitions.back().physReg().reg();
|
||||
if (ctx.sgpr_read_by_valu_as_lanemask2[reg])
|
||||
ctx.sgpr_read_by_valu_as_lanemask_then_wr_by_valu[reg] = true;
|
||||
if (state.program->wave_size == 64 && ctx.sgpr_read_by_valu_as_lanemask2[reg + 1])
|
||||
ctx.sgpr_read_by_valu_as_lanemask_then_wr_by_valu[reg + 1] = true;
|
||||
switch (instr->opcode) {
|
||||
case aco_opcode::v_addc_co_u32:
|
||||
case aco_opcode::v_subb_co_u32:
|
||||
case aco_opcode::v_subbrev_co_u32:
|
||||
case aco_opcode::v_cndmask_b16:
|
||||
case aco_opcode::v_cndmask_b32:
|
||||
case aco_opcode::v_div_fmas_f32:
|
||||
case aco_opcode::v_div_fmas_f64:
|
||||
if (instr->operands.back().physReg() != exec) {
|
||||
ctx.sgpr_read_by_valu_as_lanemask.set(instr->operands.back().physReg().reg());
|
||||
ctx.sgpr_read_by_valu_as_lanemask.set(instr->operands.back().physReg().reg() + 1);
|
||||
}
|
||||
break;
|
||||
default: break;
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
|
|
@ -1818,6 +1780,16 @@ resolve_all_gfx11(State& state, NOP_ctx_gfx11& ctx,
|
|||
waitcnt_depctr &= 0xfffe;
|
||||
ctx.sgpr_read_by_valu_as_lanemask_then_wr_by_salu.reset();
|
||||
}
|
||||
if (ctx.sgpr_read_by_valu_as_lanemask_then_wr_by_valu[vcc] ||
|
||||
ctx.sgpr_read_by_valu_as_lanemask_then_wr_by_valu[vcc_hi]) {
|
||||
waitcnt_depctr &= 0xfffd;
|
||||
ctx.sgpr_read_by_valu_as_lanemask_then_wr_by_valu[vcc] = false;
|
||||
ctx.sgpr_read_by_valu_as_lanemask_then_wr_by_valu[vcc_hi] = false;
|
||||
}
|
||||
if (ctx.sgpr_read_by_valu_as_lanemask_then_wr_by_valu.any()) {
|
||||
waitcnt_depctr &= 0xf1ff;
|
||||
ctx.sgpr_read_by_valu_as_lanemask_then_wr_by_valu.reset();
|
||||
}
|
||||
if (ctx.sgpr_read_by_valu_as_lanemask.any()) {
|
||||
valu_read_sgpr = true;
|
||||
ctx.sgpr_read_by_valu_as_lanemask.reset();
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue