aco: work around VALUReadSGPRHazard

fossil-db (gfx1200):
Totals from 65112 (82.01% of 79395) affected shaders:
Instrs: 41732906 -> 42987198 (+3.01%); split: -0.00%, +3.01%
CodeSize: 222451964 -> 226942644 (+2.02%); split: -0.01%, +2.03%
Latency: 290411063 -> 290944688 (+0.18%); split: -0.00%, +0.18%
InvThroughput: 45854913 -> 45910275 (+0.12%); split: -0.00%, +0.12%

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Georg Lehmann <dadschoorse@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30478>
This commit is contained in:
Rhys Perry 2024-07-25 21:40:02 +01:00 committed by Marge Bot
parent 9ab0c4b047
commit 47e0f468cf

View file

@ -273,6 +273,10 @@ struct NOP_ctx_gfx11 {
/* WMMAHazards */
std::bitset<256> vgpr_written_by_wmma;
/* VALUReadSGPRHazard */
std::bitset<m0.reg() / 2> sgpr_read_by_valu; /* SGPR pairs, excluding null, exec, m0 and scc */
CounterMap<0, m0.reg(), 11> sgpr_read_by_valu_then_wr_by_salu;
void join(const NOP_ctx_gfx11& other)
{
has_Vcmpx |= other.has_Vcmpx;
@ -287,6 +291,8 @@ struct NOP_ctx_gfx11 {
sgpr_read_by_valu_as_lanemask_then_wr_by_salu |=
other.sgpr_read_by_valu_as_lanemask_then_wr_by_salu;
vgpr_written_by_wmma |= other.vgpr_written_by_wmma;
sgpr_read_by_valu |= other.sgpr_read_by_valu;
sgpr_read_by_valu_then_wr_by_salu.join_min(other.sgpr_read_by_valu_then_wr_by_salu);
}
bool operator==(const NOP_ctx_gfx11& other)
@ -302,7 +308,9 @@ struct NOP_ctx_gfx11 {
sgpr_read_by_valu_as_lanemask == other.sgpr_read_by_valu_as_lanemask &&
sgpr_read_by_valu_as_lanemask_then_wr_by_salu ==
other.sgpr_read_by_valu_as_lanemask_then_wr_by_salu &&
vgpr_written_by_wmma == other.vgpr_written_by_wmma;
vgpr_written_by_wmma == other.vgpr_written_by_wmma &&
sgpr_read_by_valu == other.sgpr_read_by_valu &&
sgpr_read_by_valu_then_wr_by_salu == other.sgpr_read_by_valu_then_wr_by_salu;
}
};
@ -1514,6 +1522,48 @@ handle_instruction_gfx11(State& state, NOP_ctx_gfx11& ctx, aco_ptr<Instruction>&
}
}
}
} else {
/* VALUReadSGPRHazard
* An SGPR that is read by VALU and later written by SALU cannot safely be read by
* VALU/SALU.
*/
if (instr->isVALU() || instr->isSALU()) {
unsigned expiry_count = instr->isSALU() ? 10 : 11;
for (Operand& op : instr->operands) {
if (sa_sdst == 0)
break;
for (unsigned i = 0; i < op.size(); i++) {
unsigned reg = op.physReg() + i;
if (reg < ctx.sgpr_read_by_valu_then_wr_by_salu.size() &&
ctx.sgpr_read_by_valu_then_wr_by_salu.get(reg) < expiry_count) {
bld.sopp(aco_opcode::s_waitcnt_depctr, 0xfffe);
sa_sdst = 0;
break;
}
}
}
}
if (sa_sdst == 0)
ctx.sgpr_read_by_valu_then_wr_by_salu.reset();
else if (instr->isSALU() && !instr->isSOPP())
ctx.sgpr_read_by_valu_then_wr_by_salu.inc();
if (instr->isVALU()) {
for (const Operand& op : instr->operands) {
for (unsigned i = 0; i < DIV_ROUND_UP(op.size(), 2); i++) {
unsigned reg = (op.physReg() / 2) + i;
if (reg < ctx.sgpr_read_by_valu.size())
ctx.sgpr_read_by_valu.set(reg);
}
}
} else if (instr->isSALU() && !instr->definitions.empty()) {
for (unsigned i = 0; i < instr->definitions[0].size(); i++) {
unsigned def_reg = instr->definitions[0].physReg() + i;
if ((def_reg / 2) < ctx.sgpr_read_by_valu.size() && ctx.sgpr_read_by_valu[def_reg / 2])
ctx.sgpr_read_by_valu_then_wr_by_salu.set(def_reg);
}
}
}
/* LdsDirectVMEMHazard
@ -1670,6 +1720,15 @@ resolve_all_gfx11(State& state, NOP_ctx_gfx11& ctx,
}
}
/* VALUReadSGPRHazard */
if (state.program->gfx_level >= GFX12) {
for (unsigned i = 0; i < ctx.sgpr_read_by_valu_then_wr_by_salu.size(); i++) {
if (ctx.sgpr_read_by_valu_then_wr_by_salu.get(i) < 11)
waitcnt_depctr &= 0xfffe;
}
ctx.sgpr_read_by_valu_then_wr_by_salu.reset();
}
/* LdsDirectVMEMHazard */
if (ctx.vgpr_used_by_vmem_load.any() || ctx.vgpr_used_by_vmem_store.any() ||
ctx.vgpr_used_by_ds.any() || ctx.vgpr_used_by_vmem_sample.any() ||
@ -1745,7 +1804,7 @@ handle_block(Program* program, Ctx& ctx, Block& block)
template <typename Ctx, HandleInstr<Ctx> Handle, ResolveAll<Ctx> Resolve>
void
mitigate_hazards(Program* program)
mitigate_hazards(Program* program, Ctx initial_ctx = Ctx())
{
std::vector<Ctx> all_ctx(program->blocks.size());
std::stack<unsigned, std::vector<unsigned>> loop_header_indices;
@ -1754,6 +1813,9 @@ mitigate_hazards(Program* program)
Block& block = program->blocks[i];
Ctx& ctx = all_ctx[i];
if (i == 0 || (block.kind & block_kind_resume))
ctx = initial_ctx;
if (block.kind & block_kind_loop_header) {
loop_header_indices.push(i);
} else if (block.kind & block_kind_loop_exit) {
@ -1851,14 +1913,29 @@ required_export_priority(Program* program)
void
insert_NOPs(Program* program)
{
if (program->gfx_level >= GFX11)
mitigate_hazards<NOP_ctx_gfx11, handle_instruction_gfx11, resolve_all_gfx11>(program);
else if (program->gfx_level >= GFX10_3)
if (program->gfx_level >= GFX11) {
NOP_ctx_gfx11 initial_ctx;
bool has_previous_part =
program->is_epilog || program->info.vs.has_prolog || program->info.ps.has_prolog ||
(program->info.merged_shader_compiled_separately && program->stage.sw != SWStage::VS &&
program->stage.sw != SWStage::TES) || program->stage == raytracing_cs;
if (program->gfx_level >= GFX12 && has_previous_part) {
/* resolve_all_gfx11 can't resolve VALUReadSGPRHazard entirely. We have to assume that any
* SGPR might have been read by VALU if there was a previous shader part.
*/
initial_ctx.sgpr_read_by_valu.flip();
}
mitigate_hazards<NOP_ctx_gfx11, handle_instruction_gfx11, resolve_all_gfx11>(program,
initial_ctx);
} else if (program->gfx_level >= GFX10_3) {
; /* no hazards/bugs to mitigate */
else if (program->gfx_level >= GFX10)
} else if (program->gfx_level >= GFX10) {
mitigate_hazards<NOP_ctx_gfx10, handle_instruction_gfx10, resolve_all_gfx10>(program);
else
} else {
mitigate_hazards<NOP_ctx_gfx6, handle_instruction_gfx6, resolve_all_gfx6>(program);
}
if (program->gfx_level == GFX11_5 && (program->stage.hw == AC_HW_NEXT_GEN_GEOMETRY_SHADER ||
program->stage.hw == AC_HW_PIXEL_SHADER))