aco: support GFX12 in insert_NOPs

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Georg Lehmann <dadschoorse@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/29330>
This commit is contained in:
Rhys Perry 2024-05-03 12:05:00 +01:00 committed by Marge Bot
parent 4835dc0e7f
commit 872dda2bc5
2 changed files with 514 additions and 300 deletions

View file

@ -253,6 +253,8 @@ struct NOP_ctx_gfx11 {
/* LdsDirectVMEMHazard */ /* LdsDirectVMEMHazard */
std::bitset<256> vgpr_used_by_vmem_load; std::bitset<256> vgpr_used_by_vmem_load;
std::bitset<256> vgpr_used_by_vmem_sample;
std::bitset<256> vgpr_used_by_vmem_bvh;
std::bitset<256> vgpr_used_by_vmem_store; std::bitset<256> vgpr_used_by_vmem_store;
std::bitset<256> vgpr_used_by_ds; std::bitset<256> vgpr_used_by_ds;
@ -268,6 +270,8 @@ struct NOP_ctx_gfx11 {
{ {
has_Vcmpx |= other.has_Vcmpx; has_Vcmpx |= other.has_Vcmpx;
vgpr_used_by_vmem_load |= other.vgpr_used_by_vmem_load; vgpr_used_by_vmem_load |= other.vgpr_used_by_vmem_load;
vgpr_used_by_vmem_sample |= other.vgpr_used_by_vmem_sample;
vgpr_used_by_vmem_bvh |= other.vgpr_used_by_vmem_bvh;
vgpr_used_by_vmem_store |= other.vgpr_used_by_vmem_store; vgpr_used_by_vmem_store |= other.vgpr_used_by_vmem_store;
vgpr_used_by_ds |= other.vgpr_used_by_ds; vgpr_used_by_ds |= other.vgpr_used_by_ds;
valu_since_wr_by_trans.join_min(other.valu_since_wr_by_trans); valu_since_wr_by_trans.join_min(other.valu_since_wr_by_trans);
@ -281,6 +285,8 @@ struct NOP_ctx_gfx11 {
{ {
return has_Vcmpx == other.has_Vcmpx && return has_Vcmpx == other.has_Vcmpx &&
vgpr_used_by_vmem_load == other.vgpr_used_by_vmem_load && vgpr_used_by_vmem_load == other.vgpr_used_by_vmem_load &&
vgpr_used_by_vmem_sample == other.vgpr_used_by_vmem_sample &&
vgpr_used_by_vmem_bvh == other.vgpr_used_by_vmem_bvh &&
vgpr_used_by_vmem_store == other.vgpr_used_by_vmem_store && vgpr_used_by_vmem_store == other.vgpr_used_by_vmem_store &&
vgpr_used_by_ds == other.vgpr_used_by_ds && vgpr_used_by_ds == other.vgpr_used_by_ds &&
valu_since_wr_by_trans == other.valu_since_wr_by_trans && valu_since_wr_by_trans == other.valu_since_wr_by_trans &&
@ -1373,7 +1379,9 @@ handle_instruction_gfx11(State& state, NOP_ctx_gfx11& ctx, aco_ptr<Instruction>&
ctx.has_Vcmpx = true; ctx.has_Vcmpx = true;
} else if (ctx.has_Vcmpx && (instr->opcode == aco_opcode::v_permlane16_b32 || } else if (ctx.has_Vcmpx && (instr->opcode == aco_opcode::v_permlane16_b32 ||
instr->opcode == aco_opcode::v_permlanex16_b32 || instr->opcode == aco_opcode::v_permlanex16_b32 ||
instr->opcode == aco_opcode::v_permlane64_b32)) { instr->opcode == aco_opcode::v_permlane64_b32 ||
instr->opcode == aco_opcode::v_permlane16_var_b32 ||
instr->opcode == aco_opcode::v_permlanex16_var_b32)) {
ctx.has_Vcmpx = false; ctx.has_Vcmpx = false;
/* Unlike on GFX10, v_nop should resolve the hazard on GFX11. */ /* Unlike on GFX10, v_nop should resolve the hazard on GFX11. */
@ -1395,6 +1403,8 @@ handle_instruction_gfx11(State& state, NOP_ctx_gfx11& ctx, aco_ptr<Instruction>&
/* va_vdst already obtained through parse_vdst_wait(). */ /* va_vdst already obtained through parse_vdst_wait(). */
vm_vsrc = (instr->salu().imm >> 2) & 0x7; vm_vsrc = (instr->salu().imm >> 2) & 0x7;
sa_sdst = instr->salu().imm & 0x1; sa_sdst = instr->salu().imm & 0x1;
} else if (instr->isLDSDIR() && state.program->gfx_level >= GFX12) {
vm_vsrc = instr->ldsdir().wait_vsrc ? 7 : 0;
} }
if (instr->isLDSDIR()) { if (instr->isLDSDIR()) {
@ -1410,7 +1420,7 @@ handle_instruction_gfx11(State& state, NOP_ctx_gfx11& ctx, aco_ptr<Instruction>&
* VALU reads VGPR written by transcendental instruction without 6+ VALU or 2+ transcendental * VALU reads VGPR written by transcendental instruction without 6+ VALU or 2+ transcendental
* in-between. * in-between.
*/ */
if (va_vdst > 0 && instr->isVALU()) { if (state.program->gfx_level < GFX11_5 && va_vdst > 0 && instr->isVALU()) {
uint8_t num_valu = 15; uint8_t num_valu = 15;
uint8_t num_trans = 15; uint8_t num_trans = 15;
for (Operand& op : instr->operands) { for (Operand& op : instr->operands) {
@ -1427,11 +1437,13 @@ handle_instruction_gfx11(State& state, NOP_ctx_gfx11& ctx, aco_ptr<Instruction>&
} }
} }
if (va_vdst > 0 && handle_valu_partial_forwarding_hazard(state, instr)) { if (va_vdst > 0 && state.program->gfx_level < GFX12 &&
handle_valu_partial_forwarding_hazard(state, instr)) {
bld.sopp(aco_opcode::s_waitcnt_depctr, 0x0fff); bld.sopp(aco_opcode::s_waitcnt_depctr, 0x0fff);
va_vdst = 0; va_vdst = 0;
} }
if (state.program->gfx_level < GFX12) {
/* VALUMaskWriteHazard /* VALUMaskWriteHazard
* VALU reads SGPR as a lane mask and later written by SALU cannot safely be read by SALU. * VALU reads SGPR as a lane mask and later written by SALU cannot safely be read by SALU.
*/ */
@ -1489,19 +1501,29 @@ handle_instruction_gfx11(State& state, NOP_ctx_gfx11& ctx, aco_ptr<Instruction>&
} }
} }
} }
}
/* LdsDirectVMEMHazard /* LdsDirectVMEMHazard
* Handle LDSDIR writing a VGPR after it's used by a VMEM/DS instruction. * Handle LDSDIR writing a VGPR after it's used by a VMEM/DS instruction.
*/ */
if (instr->isVMEM() || instr->isFlatLike()) { if (instr->isVMEM() || instr->isFlatLike()) {
for (Definition& def : instr->definitions)
fill_vgpr_bitset(ctx.vgpr_used_by_vmem_load, def.physReg(), def.bytes());
if (instr->definitions.empty()) { if (instr->definitions.empty()) {
for (Operand& op : instr->operands) for (Operand& op : instr->operands)
fill_vgpr_bitset(ctx.vgpr_used_by_vmem_store, op.physReg(), op.bytes()); fill_vgpr_bitset(ctx.vgpr_used_by_vmem_store, op.physReg(), op.bytes());
} else { } else {
uint8_t vmem_type = state.program->gfx_level >= GFX12
? get_vmem_type(state.program->gfx_level, instr.get())
: vmem_nosampler;
std::bitset<256>* vgprs = &ctx.vgpr_used_by_vmem_load;
if (vmem_type == vmem_sampler)
vgprs = &ctx.vgpr_used_by_vmem_sample;
else if (vmem_type == vmem_bvh)
vgprs = &ctx.vgpr_used_by_vmem_bvh;
for (Definition& def : instr->definitions)
fill_vgpr_bitset(*vgprs, def.physReg(), def.bytes());
for (Operand& op : instr->operands) for (Operand& op : instr->operands)
fill_vgpr_bitset(ctx.vgpr_used_by_vmem_load, op.physReg(), op.bytes()); fill_vgpr_bitset(*vgprs, op.physReg(), op.bytes());
} }
} }
if (instr->isDS() || instr->isFlat()) { if (instr->isDS() || instr->isFlat()) {
@ -1513,11 +1535,17 @@ handle_instruction_gfx11(State& state, NOP_ctx_gfx11& ctx, aco_ptr<Instruction>&
wait_imm imm; wait_imm imm;
if (instr->isVALU() || instr->isEXP() || vm_vsrc == 0) { if (instr->isVALU() || instr->isEXP() || vm_vsrc == 0) {
ctx.vgpr_used_by_vmem_load.reset(); ctx.vgpr_used_by_vmem_load.reset();
ctx.vgpr_used_by_vmem_sample.reset();
ctx.vgpr_used_by_vmem_bvh.reset();
ctx.vgpr_used_by_vmem_store.reset(); ctx.vgpr_used_by_vmem_store.reset();
ctx.vgpr_used_by_ds.reset(); ctx.vgpr_used_by_ds.reset();
} else if (imm.unpack(state.program->gfx_level, instr.get())) { } else if (imm.unpack(state.program->gfx_level, instr.get())) {
if (imm.vm == 0) if (imm.vm == 0)
ctx.vgpr_used_by_vmem_load.reset(); ctx.vgpr_used_by_vmem_load.reset();
if (imm.sample == 0)
ctx.vgpr_used_by_vmem_sample.reset();
if (imm.bvh == 0)
ctx.vgpr_used_by_vmem_bvh.reset();
if (imm.lgkm == 0) if (imm.lgkm == 0)
ctx.vgpr_used_by_ds.reset(); ctx.vgpr_used_by_ds.reset();
if (imm.vs == 0) if (imm.vs == 0)
@ -1525,10 +1553,17 @@ handle_instruction_gfx11(State& state, NOP_ctx_gfx11& ctx, aco_ptr<Instruction>&
} }
if (instr->isLDSDIR()) { if (instr->isLDSDIR()) {
if (ctx.vgpr_used_by_vmem_load[instr->definitions[0].physReg().reg() - 256] || if (ctx.vgpr_used_by_vmem_load[instr->definitions[0].physReg().reg() - 256] ||
ctx.vgpr_used_by_vmem_sample[instr->definitions[0].physReg().reg() - 256] ||
ctx.vgpr_used_by_vmem_bvh[instr->definitions[0].physReg().reg() - 256] ||
ctx.vgpr_used_by_vmem_store[instr->definitions[0].physReg().reg() - 256] || ctx.vgpr_used_by_vmem_store[instr->definitions[0].physReg().reg() - 256] ||
ctx.vgpr_used_by_ds[instr->definitions[0].physReg().reg() - 256]) { ctx.vgpr_used_by_ds[instr->definitions[0].physReg().reg() - 256]) {
if (state.program->gfx_level >= GFX12)
instr->ldsdir().wait_vsrc = 0;
else
bld.sopp(aco_opcode::s_waitcnt_depctr, 0xffe3); bld.sopp(aco_opcode::s_waitcnt_depctr, 0xffe3);
ctx.vgpr_used_by_vmem_load.reset(); ctx.vgpr_used_by_vmem_load.reset();
ctx.vgpr_used_by_vmem_sample.reset();
ctx.vgpr_used_by_vmem_bvh.reset();
ctx.vgpr_used_by_vmem_store.reset(); ctx.vgpr_used_by_vmem_store.reset();
ctx.vgpr_used_by_ds.reset(); ctx.vgpr_used_by_ds.reset();
} }
@ -1591,7 +1626,7 @@ resolve_all_gfx11(State& state, NOP_ctx_gfx11& ctx,
} }
/* VALUMaskWriteHazard */ /* VALUMaskWriteHazard */
if (state.program->wave_size == 64 && if (state.program->gfx_level < GFX12 && state.program->wave_size == 64 &&
(ctx.sgpr_read_by_valu_as_lanemask.any() || (ctx.sgpr_read_by_valu_as_lanemask.any() ||
ctx.sgpr_read_by_valu_as_lanemask_then_wr_by_salu.any())) { ctx.sgpr_read_by_valu_as_lanemask_then_wr_by_salu.any())) {
waitcnt_depctr &= 0xfffe; waitcnt_depctr &= 0xfffe;
@ -1601,7 +1636,8 @@ resolve_all_gfx11(State& state, NOP_ctx_gfx11& ctx,
/* LdsDirectVMEMHazard */ /* LdsDirectVMEMHazard */
if (ctx.vgpr_used_by_vmem_load.any() || ctx.vgpr_used_by_vmem_store.any() || if (ctx.vgpr_used_by_vmem_load.any() || ctx.vgpr_used_by_vmem_store.any() ||
ctx.vgpr_used_by_ds.any()) { ctx.vgpr_used_by_ds.any() || ctx.vgpr_used_by_vmem_sample.any() ||
ctx.vgpr_used_by_vmem_bvh.any()) {
waitcnt_depctr &= 0xffe3; waitcnt_depctr &= 0xffe3;
ctx.vgpr_used_by_vmem_load.reset(); ctx.vgpr_used_by_vmem_load.reset();
ctx.vgpr_used_by_vmem_store.reset(); ctx.vgpr_used_by_vmem_store.reset();

View file

@ -40,6 +40,26 @@ create_mimg(bool nsa, unsigned addrs, unsigned instr_dwords)
bld.insert(std::move(mimg)); bld.insert(std::move(mimg));
} }
void
create_bvh()
{
aco_ptr<Instruction> instr{
create_instruction(aco_opcode::image_bvh64_intersect_ray, Format::MIMG, 8, 1)};
instr->definitions[0] = Definition(PhysReg(256), v4);
instr->operands[0] = Operand(PhysReg(0), s4);
instr->operands[1] = Operand(s4);
instr->operands[2] = Operand(v1);
instr->operands[3] = Operand(PhysReg(256 + 0), v2); /* node */
instr->operands[4] = Operand(PhysReg(256 + 2), v1); /* tmax */
instr->operands[5] = Operand(PhysReg(256 + 3), v3); /* origin */
instr->operands[6] = Operand(PhysReg(256 + 6), v3); /* dir */
instr->operands[7] = Operand(PhysReg(256 + 9), v3); /* inv dir */
instr->mimg().dmask = 0xf;
instr->mimg().unrm = true;
instr->mimg().r128 = true;
bld.insert(std::move(instr));
}
BEGIN_TEST(insert_nops.nsa_to_vmem_bug) BEGIN_TEST(insert_nops.nsa_to_vmem_bug)
if (!setup_cs(NULL, GFX10)) if (!setup_cs(NULL, GFX10))
return; return;
@ -299,8 +319,9 @@ BEGIN_TEST(insert_nops.vmem_to_scalar_write)
END_TEST END_TEST
BEGIN_TEST(insert_nops.lds_direct_valu) BEGIN_TEST(insert_nops.lds_direct_valu)
if (!setup_cs(NULL, GFX11)) for (amd_gfx_level gfx : {GFX11, GFX12}) {
return; if (!setup_cs(NULL, gfx))
continue;
/* WaW */ /* WaW */
//>> p_unit_test 0 //>> p_unit_test 0
@ -422,17 +443,20 @@ BEGIN_TEST(insert_nops.lds_direct_valu)
bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1)); bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
finish_insert_nops_test(); finish_insert_nops_test();
}
END_TEST END_TEST
BEGIN_TEST(insert_nops.lds_direct_vmem) BEGIN_TEST(insert_nops.lds_direct_vmem)
if (!setup_cs(NULL, GFX11)) for (amd_gfx_level gfx : {GFX11, GFX12}) {
return; if (!setup_cs(NULL, gfx))
continue;
/* WaR: VMEM */ /* WaR: VMEM */
//>> p_unit_test 0 //>> p_unit_test 0
//! v1: %0:v[1] = buffer_load_dword %0:s[0-3], %0:v[0], 0 offen //! v1: %0:v[1] = buffer_load_dword %0:s[0-3], %0:v[0], 0 offen
//! s_waitcnt_depctr vm_vsrc(0) //~gfx11! s_waitcnt_depctr vm_vsrc(0)
//! v1: %0:v[0] = lds_direct_load %0:m0 //~gfx11! v1: %0:v[0] = lds_direct_load %0:m0
//~gfx12! v1: %0:v[0] = lds_direct_load %0:m0 wait_vsrc:0
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0)); bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0));
create_mubuf(0, PhysReg(257)); create_mubuf(0, PhysReg(257));
bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1)); bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
@ -440,8 +464,9 @@ BEGIN_TEST(insert_nops.lds_direct_vmem)
/* WaW: VMEM */ /* WaW: VMEM */
//! p_unit_test 1 //! p_unit_test 1
//! v1: %0:v[0] = buffer_load_dword %0:s[0-3], %0:v[1], 0 offen //! v1: %0:v[0] = buffer_load_dword %0:s[0-3], %0:v[1], 0 offen
//! s_waitcnt_depctr vm_vsrc(0) //~gfx11! s_waitcnt_depctr vm_vsrc(0)
//! v1: %0:v[0] = lds_direct_load %0:m0 //~gfx11! v1: %0:v[0] = lds_direct_load %0:m0
//~gfx12! v1: %0:v[0] = lds_direct_load %0:m0 wait_vsrc:0
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1)); bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1));
create_mubuf(0, PhysReg(256), PhysReg(257)); create_mubuf(0, PhysReg(256), PhysReg(257));
bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1)); bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
@ -467,8 +492,9 @@ BEGIN_TEST(insert_nops.lds_direct_vmem)
/* WaR: LDS */ /* WaR: LDS */
//! p_unit_test 4 //! p_unit_test 4
//! v1: %0:v[1] = ds_read_b32 %0:v[0] //! v1: %0:v[1] = ds_read_b32 %0:v[0]
//! s_waitcnt_depctr vm_vsrc(0) //~gfx11! s_waitcnt_depctr vm_vsrc(0)
//! v1: %0:v[0] = lds_direct_load %0:m0 //~gfx11! v1: %0:v[0] = lds_direct_load %0:m0
//~gfx12! v1: %0:v[0] = lds_direct_load %0:m0 wait_vsrc:0
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4)); bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4));
bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1)); bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1));
bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1)); bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
@ -476,8 +502,9 @@ BEGIN_TEST(insert_nops.lds_direct_vmem)
/* WaW: LDS */ /* WaW: LDS */
//! p_unit_test 5 //! p_unit_test 5
//! v1: %0:v[0] = ds_read_b32 %0:v[1] //! v1: %0:v[0] = ds_read_b32 %0:v[1]
//! s_waitcnt_depctr vm_vsrc(0) //~gfx11! s_waitcnt_depctr vm_vsrc(0)
//! v1: %0:v[0] = lds_direct_load %0:m0 //~gfx11! v1: %0:v[0] = lds_direct_load %0:m0
//~gfx12! v1: %0:v[0] = lds_direct_load %0:m0 wait_vsrc:0
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5)); bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5));
bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(256), v1), Operand(PhysReg(257), v1)); bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(256), v1), Operand(PhysReg(257), v1));
bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1)); bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
@ -503,73 +530,184 @@ BEGIN_TEST(insert_nops.lds_direct_vmem)
/* no hazard: VMEM/LDS with the correct waitcnt in-between */ /* no hazard: VMEM/LDS with the correct waitcnt in-between */
//! p_unit_test 8 //! p_unit_test 8
//! v1: %0:v[1] = buffer_load_dword %0:s[0-3], %0:v[0], 0 offen //! v1: %0:v[1] = buffer_load_dword %0:s[0-3], %0:v[0], 0 offen
//! s_waitcnt vmcnt(0) //~gfx11! s_waitcnt vmcnt(0)
//~gfx12! s_wait_loadcnt imm:0
//! v1: %0:v[0] = lds_direct_load %0:m0 //! v1: %0:v[0] = lds_direct_load %0:m0
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(8)); bld.pseudo(aco_opcode::p_unit_test, Operand::c32(8));
create_mubuf(0, PhysReg(257)); create_mubuf(0, PhysReg(257));
if (gfx >= GFX12)
bld.sopp(aco_opcode::s_wait_loadcnt, 0);
else
bld.sopp(aco_opcode::s_waitcnt, 0x3ff); bld.sopp(aco_opcode::s_waitcnt, 0x3ff);
bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1)); bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
//! p_unit_test 9 //! p_unit_test 9
//! buffer_store_dword %0:s[0-3], %0:v[0], 0, %0:v[0] offen //! buffer_store_dword %0:s[0-3], %0:v[0], 0, %0:v[0] offen
//! s_waitcnt_vscnt %0:null imm:0 //~gfx11! s_waitcnt_vscnt %0:null imm:0
//~gfx12! s_wait_storecnt imm:0
//! v1: %0:v[0] = lds_direct_load %0:m0 //! v1: %0:v[0] = lds_direct_load %0:m0
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(9)); bld.pseudo(aco_opcode::p_unit_test, Operand::c32(9));
create_mubuf_store(); create_mubuf_store();
if (gfx >= GFX12)
bld.sopp(aco_opcode::s_wait_storecnt, 0);
else
bld.sopk(aco_opcode::s_waitcnt_vscnt, Operand(sgpr_null, s1), 0); bld.sopk(aco_opcode::s_waitcnt_vscnt, Operand(sgpr_null, s1), 0);
bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1)); bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
//! p_unit_test 10 //! p_unit_test 10
//! v1: %0:v[1] = ds_read_b32 %0:v[0] //! v1: %0:v[1] = ds_read_b32 %0:v[0]
//! s_waitcnt lgkmcnt(0) //~gfx11! s_waitcnt lgkmcnt(0)
//~gfx12! s_wait_dscnt imm:0
//! v1: %0:v[0] = lds_direct_load %0:m0 //! v1: %0:v[0] = lds_direct_load %0:m0
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(10)); bld.pseudo(aco_opcode::p_unit_test, Operand::c32(10));
bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1)); bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1));
if (gfx >= GFX12)
bld.sopp(aco_opcode::s_wait_dscnt, 0);
else
bld.sopp(aco_opcode::s_waitcnt, 0xfc0f); bld.sopp(aco_opcode::s_waitcnt, 0xfc0f);
bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1)); bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
/* VMEM/LDS with the wrong waitcnt in-between */ if (gfx >= GFX12) {
//! p_unit_test 11 //~gfx12! p_unit_test 11
//! v1: %0:v[1] = buffer_load_dword %0:s[0-3], %0:v[0], 0 offen //~gfx12! v1: %0:v[1] = image_load %0:s[0-7], s4: undef, v1: undef, %0:v[0-1] 2d
//! s_waitcnt_vscnt %0:null imm:0 //~gfx12! s_wait_loadcnt imm:0
//! s_waitcnt_depctr vm_vsrc(0) //~gfx12! v1: %0:v[0] = lds_direct_load %0:m0
//! v1: %0:v[0] = lds_direct_load %0:m0
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(11)); bld.pseudo(aco_opcode::p_unit_test, Operand::c32(11));
Instruction* instr =
bld.mimg(aco_opcode::image_load, Definition(PhysReg(257), v1), Operand(PhysReg(0), s8),
Operand(s4), Operand(v1), Operand(PhysReg(256), v2))
.instr;
instr->mimg().dmask = 0x1;
instr->mimg().dim = ac_image_2d;
bld.sopp(aco_opcode::s_wait_loadcnt, 0);
bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
//~gfx12! p_unit_test 12
//~gfx12! v1: %0:v[1] = image_sample %0:s[0-7], %0:s[0-3], v1: undef, %0:v[0-1] 2d
//~gfx12! s_wait_samplecnt imm:0
//~gfx12! v1: %0:v[0] = lds_direct_load %0:m0
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(12));
instr = bld.mimg(aco_opcode::image_sample, Definition(PhysReg(257), v1),
Operand(PhysReg(0), s8), Operand(PhysReg(0), s4), Operand(v1),
Operand(PhysReg(256), v2))
.instr;
instr->mimg().dmask = 0x1;
instr->mimg().dim = ac_image_2d;
bld.sopp(aco_opcode::s_wait_samplecnt, 0);
bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
//~gfx12! p_unit_test 13
//~gfx12! v4: %0:v[0-3] = image_bvh64_intersect_ray %0:s[0-3], s4: undef, v1: undef, %0:v[0-1], %0:v[2], %0:v[3-5], %0:v[6-8], %0:v[9-11] 1d unrm r128
//~gfx12! s_wait_bvhcnt imm:0
//~gfx12! v1: %0:v[0] = lds_direct_load %0:m0
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(13));
create_bvh();
bld.sopp(aco_opcode::s_wait_bvhcnt, 0);
bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
}
/* VMEM/LDS with the wrong waitcnt in-between */
//! p_unit_test 14
//! v1: %0:v[1] = buffer_load_dword %0:s[0-3], %0:v[0], 0 offen
//~gfx11! s_waitcnt_vscnt %0:null imm:0
//~gfx11! s_waitcnt_depctr vm_vsrc(0)
//~gfx11! v1: %0:v[0] = lds_direct_load %0:m0
//~gfx12! s_wait_storecnt imm:0
//~gfx12! v1: %0:v[0] = lds_direct_load %0:m0 wait_vsrc:0
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(14));
create_mubuf(0, PhysReg(257)); create_mubuf(0, PhysReg(257));
if (gfx >= GFX12)
bld.sopp(aco_opcode::s_wait_storecnt, 0);
else
bld.sopk(aco_opcode::s_waitcnt_vscnt, Operand(sgpr_null, s1), 0); bld.sopk(aco_opcode::s_waitcnt_vscnt, Operand(sgpr_null, s1), 0);
bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1)); bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
//! p_unit_test 12 //! p_unit_test 15
//! buffer_store_dword %0:s[0-3], %0:v[0], 0, %0:v[0] offen //! buffer_store_dword %0:s[0-3], %0:v[0], 0, %0:v[0] offen
//! s_waitcnt lgkmcnt(0) //~gfx11! s_waitcnt lgkmcnt(0)
//! s_waitcnt_depctr vm_vsrc(0) //~gfx11! s_waitcnt_depctr vm_vsrc(0)
//! v1: %0:v[0] = lds_direct_load %0:m0 //~gfx11! v1: %0:v[0] = lds_direct_load %0:m0
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(12)); //~gfx12! s_wait_dscnt imm:0
//~gfx12! v1: %0:v[0] = lds_direct_load %0:m0 wait_vsrc:0
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(15));
create_mubuf_store(); create_mubuf_store();
if (gfx >= GFX12)
bld.sopp(aco_opcode::s_wait_dscnt, 0);
else
bld.sopp(aco_opcode::s_waitcnt, 0xfc0f); bld.sopp(aco_opcode::s_waitcnt, 0xfc0f);
bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1)); bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
//! p_unit_test 13 //! p_unit_test 16
//! v1: %0:v[1] = ds_read_b32 %0:v[0] //! v1: %0:v[1] = ds_read_b32 %0:v[0]
//! s_waitcnt vmcnt(0) //~gfx11! s_waitcnt vmcnt(0)
//! s_waitcnt_depctr vm_vsrc(0) //~gfx11! s_waitcnt_depctr vm_vsrc(0)
//! v1: %0:v[0] = lds_direct_load %0:m0 //~gfx11! v1: %0:v[0] = lds_direct_load %0:m0
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(13)); //~gfx12! s_wait_loadcnt imm:0
//~gfx12! v1: %0:v[0] = lds_direct_load %0:m0 wait_vsrc:0
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(16));
bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1)); bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1));
if (gfx >= GFX12)
bld.sopp(aco_opcode::s_wait_loadcnt, 0);
else
bld.sopp(aco_opcode::s_waitcnt, 0x3ff); bld.sopp(aco_opcode::s_waitcnt, 0x3ff);
bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1)); bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
//! p_unit_test 14 //! p_unit_test 17
//! v1: %0:v[0] = buffer_load_dword %0:s[0-3], %0:v[1], 0 offen //! v1: %0:v[0] = buffer_load_dword %0:s[0-3], %0:v[1], 0 offen
//! s_waitcnt_vscnt %0:null imm:0 //~gfx11! s_waitcnt_vscnt %0:null imm:0
//! s_waitcnt_depctr vm_vsrc(0) //~gfx11! s_waitcnt_depctr vm_vsrc(0)
//! v1: %0:v[0] = lds_direct_load %0:m0 //~gfx11! v1: %0:v[0] = lds_direct_load %0:m0
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(14)); //~gfx12! s_wait_storecnt imm:0
//~gfx12! v1: %0:v[0] = lds_direct_load %0:m0 wait_vsrc:0
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(17));
create_mubuf(0, PhysReg(256), PhysReg(257)); create_mubuf(0, PhysReg(256), PhysReg(257));
if (gfx >= GFX12)
bld.sopp(aco_opcode::s_wait_storecnt, 0);
else
bld.sopk(aco_opcode::s_waitcnt_vscnt, Operand(sgpr_null, s1), 0); bld.sopk(aco_opcode::s_waitcnt_vscnt, Operand(sgpr_null, s1), 0);
bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1)); bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
if (gfx >= GFX12) {
//~gfx12! p_unit_test 18
//~gfx12! v1: %0:v[1] = image_load %0:s[0-7], s4: undef, v1: undef, %0:v[0-1] 2d
//~gfx12! s_wait_samplecnt imm:0
//~gfx12! v1: %0:v[0] = lds_direct_load %0:m0 wait_vsrc:0
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(18));
Instruction* instr =
bld.mimg(aco_opcode::image_load, Definition(PhysReg(257), v1), Operand(PhysReg(0), s8),
Operand(s4), Operand(v1), Operand(PhysReg(256), v2))
.instr;
instr->mimg().dmask = 0x1;
instr->mimg().dim = ac_image_2d;
bld.sopp(aco_opcode::s_wait_samplecnt, 0);
bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
//~gfx12! p_unit_test 19
//~gfx12! v1: %0:v[1] = image_sample %0:s[0-7], %0:s[0-3], v1: undef, %0:v[0-1] 2d
//~gfx12! s_wait_loadcnt imm:0
//~gfx12! v1: %0:v[0] = lds_direct_load %0:m0 wait_vsrc:0
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(19));
instr = bld.mimg(aco_opcode::image_sample, Definition(PhysReg(257), v1),
Operand(PhysReg(0), s8), Operand(PhysReg(0), s4), Operand(v1),
Operand(PhysReg(256), v2))
.instr;
instr->mimg().dmask = 0x1;
instr->mimg().dim = ac_image_2d;
bld.sopp(aco_opcode::s_wait_loadcnt, 0);
bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
//~gfx12! p_unit_test 20
//~gfx12! v4: %0:v[0-3] = image_bvh64_intersect_ray %0:s[0-3], s4: undef, v1: undef, %0:v[0-1], %0:v[2], %0:v[3-5], %0:v[6-8], %0:v[9-11] 1d unrm r128
//~gfx12! s_wait_loadcnt imm:0
//~gfx12! v1: %0:v[0] = lds_direct_load %0:m0 wait_vsrc:0
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(20));
create_bvh();
bld.sopp(aco_opcode::s_wait_loadcnt, 0);
bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
}
finish_insert_nops_test(); finish_insert_nops_test();
}
END_TEST END_TEST
BEGIN_TEST(insert_nops.valu_trans_use) BEGIN_TEST(insert_nops.valu_trans_use)
@ -1310,4 +1448,44 @@ BEGIN_TEST(insert_nops.setpc_gfx11)
bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8)); bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
finish_insert_nops_test(true); finish_insert_nops_test(true);
} END_TEST
BEGIN_TEST(insert_nops.setpc_gfx12)
/* Exercises what gets inserted in front of s_setpc_b64 on GFX12 when the
 * preceding instruction leaves a hazard pending. Each case emits a
 * p_unit_test marker, an optional hazard-producing instruction, and then
 * s_setpc_b64.
 * NOTE(review): the lines starting with //>> and //! are presumably the
 * expected-output patterns matched by the test harness - keep them verbatim
 * and in order.
 */
if (!setup_cs(NULL, GFX12))
return;
/* Baseline: nothing precedes the jump, so nothing is expected in between. */
//>> p_unit_test 0
//! s_setpc_b64 0
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0));
bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
/* LdsDirectVALUHazard */
/* A VALU compare reading v0 precedes the jump; the expectation is an
 * s_waitcnt_depctr va_vdst(0) before s_setpc_b64.
 */
//! p_unit_test 1
//! s2: %0:vcc = v_cmp_eq_u32 %0:v[0], 0
//! s_waitcnt_depctr va_vdst(0)
//! s_setpc_b64 0
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1));
bld.vopc_e64(aco_opcode::v_cmp_eq_u32, Definition(vcc, s2), Operand(PhysReg(256), v1),
Operand::zero());
bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
/* VcmpxPermlaneHazard */
/* A v_cmpx writing exec precedes the jump; the expectation is a v_nop
 * before s_setpc_b64.
 */
//! p_unit_test 2
//! s2: %0:exec = v_cmpx_eq_u32 0, 0
//! v_nop
//! s_setpc_b64 0
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2));
bld.vopc_e64(aco_opcode::v_cmpx_eq_u32, Definition(exec, s2), Operand::zero(), Operand::zero());
bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
/* LdsDirectVMEMHazard */
/* A DS load that both reads and writes v0 precedes the jump; the
 * expectation is an s_waitcnt_depctr vm_vsrc(0) before s_setpc_b64.
 */
//! p_unit_test 3
//! v1: %0:v[0] = ds_read_b32 %0:v[0]
//! s_waitcnt_depctr vm_vsrc(0)
//! s_setpc_b64 0
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3));
bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(256), v1), Operand(PhysReg(256), v1));
bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
/* NOTE(review): the meaning of the `true` argument is not visible in this
 * chunk - confirm against the helper's definition.
 */
finish_insert_nops_test(true);
END_TEST