mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-05-22 00:08:09 +02:00
aco/gfx11: workaround LdsDirectVMEMHazard
fossil-db (gfx1100):
Totals from 27217 (20.16% of 135032) affected shaders:
Instrs: 18010853 -> 18047277 (+0.20%)
CodeSize: 99369568 -> 99515264 (+0.15%)
Latency: 207454040 -> 207464932 (+0.01%); split: -0.00%, +0.01%
InvThroughput: 39810158 -> 39810628 (+0.00%); split: -0.00%, +0.00%

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/18273>
This commit is contained in:
parent
296b4d95a3
commit
98ee3e1468
3 changed files with 223 additions and 5 deletions
|
|
@ -302,3 +302,12 @@ LDSDIR instruction writing a VGPR soon after it's used by a VALU instruction.
|
|||
|
||||
Mitigated by:
|
||||
A vdst wait, preferably using the LDSDIR's field.
|
||||
|
||||
### LdsDirectVMEMHazard
|
||||
|
||||
Triggered by:
|
||||
LDSDIR instruction writing a VGPR after it's used by a VMEM/DS instruction.
|
||||
|
||||
Mitigated by:
|
||||
Waiting for the VMEM/DS instruction to finish, executing a VALU or export instruction, or
|
||||
`s_waitcnt_depctr 0xffe3`.
|
||||
|
|
|
|||
|
|
@ -198,9 +198,26 @@ struct NOP_ctx_gfx11 {
|
|||
/* VcmpxPermlaneHazard */
|
||||
bool has_Vcmpx = false;
|
||||
|
||||
void join(const NOP_ctx_gfx11& other) { has_Vcmpx |= other.has_Vcmpx; }
|
||||
/* LdsDirectVMEMHazard */
|
||||
std::bitset<256> vgpr_used_by_vmem_load;
|
||||
std::bitset<256> vgpr_used_by_vmem_store;
|
||||
std::bitset<256> vgpr_used_by_ds;
|
||||
|
||||
bool operator==(const NOP_ctx_gfx11& other) { return has_Vcmpx == other.has_Vcmpx; }
|
||||
void join(const NOP_ctx_gfx11& other)
|
||||
{
|
||||
has_Vcmpx |= other.has_Vcmpx;
|
||||
vgpr_used_by_vmem_load |= other.vgpr_used_by_vmem_load;
|
||||
vgpr_used_by_vmem_store |= other.vgpr_used_by_vmem_store;
|
||||
vgpr_used_by_ds |= other.vgpr_used_by_ds;
|
||||
}
|
||||
|
||||
bool operator==(const NOP_ctx_gfx11& other)
|
||||
{
|
||||
return has_Vcmpx == other.has_Vcmpx &&
|
||||
vgpr_used_by_vmem_load == other.vgpr_used_by_vmem_load &&
|
||||
vgpr_used_by_vmem_store == other.vgpr_used_by_vmem_store &&
|
||||
vgpr_used_by_ds == other.vgpr_used_by_ds;
|
||||
}
|
||||
};
|
||||
|
||||
int
|
||||
|
|
@ -866,6 +883,15 @@ handle_instruction_gfx10(State& state, NOP_ctx_gfx10& ctx, aco_ptr<Instruction>&
|
|||
}
|
||||
}
|
||||
|
||||
void
|
||||
fill_vgpr_bitset(std::bitset<256>& set, PhysReg reg, unsigned bytes)
|
||||
{
|
||||
if (reg.reg() < 256)
|
||||
return;
|
||||
for (unsigned i = 0; i < DIV_ROUND_UP(bytes, 4); i++)
|
||||
set.set(reg.reg() - 256 + i);
|
||||
}
|
||||
|
||||
/* GFX11 */
|
||||
unsigned
|
||||
parse_vdst_wait(aco_ptr<Instruction>& instr)
|
||||
|
|
@ -983,6 +1009,51 @@ handle_instruction_gfx11(State& state, NOP_ctx_gfx11& ctx, aco_ptr<Instruction>&
|
|||
LDSDIR_instruction* ldsdir = &instr->ldsdir();
|
||||
ldsdir->wait_vdst = MIN2(ldsdir->wait_vdst, count);
|
||||
}
|
||||
|
||||
/* LdsDirectVMEMHazard
|
||||
* Handle LDSDIR writing a VGPR after it's used by a VMEM/DS instruction.
|
||||
*/
|
||||
if (instr->isVMEM() || instr->isFlatLike()) {
   /* Record every VGPR this VMEM/FLAT-like instruction touches so that a later LDSDIR
    * write to one of them can be detected.
    *
    * An instruction with definitions is tracked as a load: both the VGPRs it writes and
    * the VGPRs it reads go into vgpr_used_by_vmem_load, the set cleared by
    * "s_waitcnt vmcnt(0)". An instruction without definitions is tracked as a store: its
    * operands go into vgpr_used_by_vmem_store, the set cleared by "s_waitcnt_vscnt 0".
    *
    * Load destinations must not be recorded in the store set: an s_waitcnt_vscnt 0 would
    * then drop them while the load is still being tracked by the other counter, and a
    * following LDSDIR write to the same VGPR would get no s_waitcnt_depctr.
    */
   std::bitset<256>& used_vgprs =
      instr->definitions.empty() ? ctx.vgpr_used_by_vmem_store : ctx.vgpr_used_by_vmem_load;

   for (Definition& def : instr->definitions)
      fill_vgpr_bitset(used_vgprs, def.physReg(), def.bytes());
   for (Operand& op : instr->operands)
      fill_vgpr_bitset(used_vgprs, op.physReg(), op.bytes());
}
|
||||
if (instr->isDS() || instr->isFlat()) {
|
||||
for (Definition& def : instr->definitions)
|
||||
fill_vgpr_bitset(ctx.vgpr_used_by_ds, def.physReg(), def.bytes());
|
||||
for (Operand& op : instr->operands)
|
||||
fill_vgpr_bitset(ctx.vgpr_used_by_ds, op.physReg(), op.bytes());
|
||||
}
|
||||
if (instr->isVALU() || instr->isVINTERP_INREG() || instr->isEXP() ||
|
||||
(instr->opcode == aco_opcode::s_waitcnt_depctr && ((instr->sopp().imm >> 2) & 0x7) == 0)) {
|
||||
ctx.vgpr_used_by_vmem_load.reset();
|
||||
ctx.vgpr_used_by_vmem_store.reset();
|
||||
ctx.vgpr_used_by_ds.reset();
|
||||
} else if (instr->opcode == aco_opcode::s_waitcnt) {
|
||||
wait_imm imm(GFX11, instr->sopp().imm);
|
||||
if (imm.vm == 0)
|
||||
ctx.vgpr_used_by_vmem_load.reset();
|
||||
if (imm.lgkm == 0)
|
||||
ctx.vgpr_used_by_ds.reset();
|
||||
} else if (instr->opcode == aco_opcode::s_waitcnt_vscnt && instr->sopk().imm == 0) {
|
||||
ctx.vgpr_used_by_vmem_store.reset();
|
||||
}
|
||||
if (instr->isLDSDIR()) {
|
||||
if (ctx.vgpr_used_by_vmem_load[instr->definitions[0].physReg().reg() - 256] ||
|
||||
ctx.vgpr_used_by_vmem_store[instr->definitions[0].physReg().reg() - 256] ||
|
||||
ctx.vgpr_used_by_ds[instr->definitions[0].physReg().reg() - 256]) {
|
||||
bld.sopp(aco_opcode::s_waitcnt_depctr, -1, 0xffe3);
|
||||
ctx.vgpr_used_by_vmem_load.reset();
|
||||
ctx.vgpr_used_by_vmem_store.reset();
|
||||
ctx.vgpr_used_by_ds.reset();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Ctx>
|
||||
|
|
|
|||
|
|
@ -25,10 +25,10 @@
|
|||
|
||||
using namespace aco;
|
||||
|
||||
void create_mubuf(unsigned offset)
|
||||
void create_mubuf(unsigned offset, PhysReg dst=PhysReg(256), PhysReg vaddr=PhysReg(256))
|
||||
{
|
||||
bld.mubuf(aco_opcode::buffer_load_dword, Definition(PhysReg(256), v1), Operand(PhysReg(0), s4),
|
||||
Operand(PhysReg(256), v1), Operand::zero(), offset, true);
|
||||
bld.mubuf(aco_opcode::buffer_load_dword, Definition(dst, v1), Operand(PhysReg(0), s4),
|
||||
Operand(vaddr, v1), Operand::zero(), offset, true);
|
||||
}
|
||||
|
||||
void create_mubuf_store(PhysReg src=PhysReg(256))
|
||||
|
|
@ -432,3 +432,141 @@ BEGIN_TEST(insert_nops.lds_direct_valu)
|
|||
|
||||
finish_insert_nops_test();
|
||||
END_TEST
|
||||
|
||||
BEGIN_TEST(insert_nops.lds_direct_vmem)
|
||||
if (!setup_cs(NULL, GFX11))
|
||||
return;
|
||||
|
||||
/* WaR: VMEM */
|
||||
//>> p_unit_test 0
|
||||
//! v1: %0:v[1] = buffer_load_dword %0:s[0-3], %0:v[0], 0 offen
|
||||
//! s_waitcnt_depctr vm_vsrc(0)
|
||||
//! v1: %0:v[0] = lds_direct_load %0:m0
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0));
|
||||
create_mubuf(0, PhysReg(257));
|
||||
bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
|
||||
|
||||
/* WaW: VMEM */
|
||||
//! p_unit_test 1
|
||||
//! v1: %0:v[0] = buffer_load_dword %0:s[0-3], %0:v[1], 0 offen
|
||||
//! s_waitcnt_depctr vm_vsrc(0)
|
||||
//! v1: %0:v[0] = lds_direct_load %0:m0
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1));
|
||||
create_mubuf(0, PhysReg(256), PhysReg(257));
|
||||
bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
|
||||
|
||||
/* no hazard: VMEM */
|
||||
//! p_unit_test 2
|
||||
//! v1: %0:v[1] = buffer_load_dword %0:s[0-3], %0:v[1], 0 offen
|
||||
//! v1: %0:v[0] = lds_direct_load %0:m0
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2));
|
||||
create_mubuf(0, PhysReg(257), PhysReg(257));
|
||||
bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
|
||||
|
||||
/* no hazard: VMEM with VALU in-between */
|
||||
//! p_unit_test 3
|
||||
//! v1: %0:v[1] = buffer_load_dword %0:s[0-3], %0:v[0], 0 offen
|
||||
//! v_nop
|
||||
//! v1: %0:v[0] = lds_direct_load %0:m0
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3));
|
||||
create_mubuf(0, PhysReg(257));
|
||||
bld.vop1(aco_opcode::v_nop);
|
||||
bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
|
||||
|
||||
/* WaR: LDS */
|
||||
//! p_unit_test 4
|
||||
//! v1: %0:v[1] = ds_read_b32 %0:v[0]
|
||||
//! s_waitcnt_depctr vm_vsrc(0)
|
||||
//! v1: %0:v[0] = lds_direct_load %0:m0
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4));
|
||||
bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1));
|
||||
bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
|
||||
|
||||
/* WaW: LDS */
|
||||
//! p_unit_test 5
|
||||
//! v1: %0:v[0] = ds_read_b32 %0:v[1]
|
||||
//! s_waitcnt_depctr vm_vsrc(0)
|
||||
//! v1: %0:v[0] = lds_direct_load %0:m0
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5));
|
||||
bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(256), v1), Operand(PhysReg(257), v1));
|
||||
bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
|
||||
|
||||
/* no hazard: LDS */
|
||||
//! p_unit_test 6
|
||||
//! v1: %0:v[1] = ds_read_b32 %0:v[1]
|
||||
//! v1: %0:v[0] = lds_direct_load %0:m0
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(6));
|
||||
bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(257), v1), Operand(PhysReg(257), v1));
|
||||
bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
|
||||
|
||||
/* no hazard: LDS with VALU in-between */
|
||||
//! p_unit_test 7
|
||||
//! v1: %0:v[1] = ds_read_b32 %0:v[0]
|
||||
//! v_nop
|
||||
//! v1: %0:v[0] = lds_direct_load %0:m0
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(7));
|
||||
bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1));
|
||||
bld.vop1(aco_opcode::v_nop);
|
||||
bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
|
||||
|
||||
/* no hazard: VMEM/LDS with the correct waitcnt in-between */
|
||||
//! p_unit_test 8
|
||||
//! v1: %0:v[1] = buffer_load_dword %0:s[0-3], %0:v[0], 0 offen
|
||||
//! s_waitcnt vmcnt(0)
|
||||
//! v1: %0:v[0] = lds_direct_load %0:m0
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(8));
|
||||
create_mubuf(0, PhysReg(257));
|
||||
bld.sopp(aco_opcode::s_waitcnt, -1, 0x3ff);
|
||||
bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
|
||||
|
||||
//! p_unit_test 9
|
||||
//! buffer_store_dword %0:s[0-3], %0:v[0], 0, %0:v[0] offen
|
||||
//! s1: %0:null = s_waitcnt_vscnt imm:0
|
||||
//! v1: %0:v[0] = lds_direct_load %0:m0
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(9));
|
||||
create_mubuf_store();
|
||||
bld.sopk(aco_opcode::s_waitcnt_vscnt, Definition(sgpr_null, s1), 0);
|
||||
bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
|
||||
|
||||
//! p_unit_test 10
|
||||
//! v1: %0:v[1] = ds_read_b32 %0:v[0]
|
||||
//! s_waitcnt lgkmcnt(0)
|
||||
//! v1: %0:v[0] = lds_direct_load %0:m0
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(10));
|
||||
bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1));
|
||||
bld.sopp(aco_opcode::s_waitcnt, -1, 0xfc0f);
|
||||
bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
|
||||
|
||||
/* VMEM/LDS with the wrong waitcnt in-between */
|
||||
//! p_unit_test 11
|
||||
//! v1: %0:v[1] = buffer_load_dword %0:s[0-3], %0:v[0], 0 offen
|
||||
//! s1: %0:null = s_waitcnt_vscnt imm:0
|
||||
//! s_waitcnt_depctr vm_vsrc(0)
|
||||
//! v1: %0:v[0] = lds_direct_load %0:m0
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(11));
|
||||
create_mubuf(0, PhysReg(257));
|
||||
bld.sopk(aco_opcode::s_waitcnt_vscnt, Definition(sgpr_null, s1), 0);
|
||||
bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
|
||||
|
||||
//! p_unit_test 12
|
||||
//! buffer_store_dword %0:s[0-3], %0:v[0], 0, %0:v[0] offen
|
||||
//! s_waitcnt lgkmcnt(0)
|
||||
//! s_waitcnt_depctr vm_vsrc(0)
|
||||
//! v1: %0:v[0] = lds_direct_load %0:m0
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(12));
|
||||
create_mubuf_store();
|
||||
bld.sopp(aco_opcode::s_waitcnt, -1, 0xfc0f);
|
||||
bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
|
||||
|
||||
//! p_unit_test 13
|
||||
//! v1: %0:v[1] = ds_read_b32 %0:v[0]
|
||||
//! s_waitcnt vmcnt(0)
|
||||
//! s_waitcnt_depctr vm_vsrc(0)
|
||||
//! v1: %0:v[0] = lds_direct_load %0:m0
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(13));
|
||||
bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1));
|
||||
bld.sopp(aco_opcode::s_waitcnt, -1, 0x3ff);
|
||||
bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
|
||||
|
||||
finish_insert_nops_test();
|
||||
END_TEST
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue