aco: support GFX12 in insert_NOPs

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Georg Lehmann <dadschoorse@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/29330>
This commit is contained in:
Rhys Perry 2024-05-03 12:05:00 +01:00 committed by Marge Bot
parent 4835dc0e7f
commit 872dda2bc5
2 changed files with 514 additions and 300 deletions

View file

@ -253,6 +253,8 @@ struct NOP_ctx_gfx11 {
/* LdsDirectVMEMHazard */
std::bitset<256> vgpr_used_by_vmem_load;
std::bitset<256> vgpr_used_by_vmem_sample;
std::bitset<256> vgpr_used_by_vmem_bvh;
std::bitset<256> vgpr_used_by_vmem_store;
std::bitset<256> vgpr_used_by_ds;
@ -268,6 +270,8 @@ struct NOP_ctx_gfx11 {
{
has_Vcmpx |= other.has_Vcmpx;
vgpr_used_by_vmem_load |= other.vgpr_used_by_vmem_load;
vgpr_used_by_vmem_sample |= other.vgpr_used_by_vmem_sample;
vgpr_used_by_vmem_bvh |= other.vgpr_used_by_vmem_bvh;
vgpr_used_by_vmem_store |= other.vgpr_used_by_vmem_store;
vgpr_used_by_ds |= other.vgpr_used_by_ds;
valu_since_wr_by_trans.join_min(other.valu_since_wr_by_trans);
@ -281,6 +285,8 @@ struct NOP_ctx_gfx11 {
{
return has_Vcmpx == other.has_Vcmpx &&
vgpr_used_by_vmem_load == other.vgpr_used_by_vmem_load &&
vgpr_used_by_vmem_sample == other.vgpr_used_by_vmem_sample &&
vgpr_used_by_vmem_bvh == other.vgpr_used_by_vmem_bvh &&
vgpr_used_by_vmem_store == other.vgpr_used_by_vmem_store &&
vgpr_used_by_ds == other.vgpr_used_by_ds &&
valu_since_wr_by_trans == other.valu_since_wr_by_trans &&
@ -1373,7 +1379,9 @@ handle_instruction_gfx11(State& state, NOP_ctx_gfx11& ctx, aco_ptr<Instruction>&
ctx.has_Vcmpx = true;
} else if (ctx.has_Vcmpx && (instr->opcode == aco_opcode::v_permlane16_b32 ||
instr->opcode == aco_opcode::v_permlanex16_b32 ||
instr->opcode == aco_opcode::v_permlane64_b32)) {
instr->opcode == aco_opcode::v_permlane64_b32 ||
instr->opcode == aco_opcode::v_permlane16_var_b32 ||
instr->opcode == aco_opcode::v_permlanex16_var_b32)) {
ctx.has_Vcmpx = false;
/* Unlike on GFX10, v_nop should resolve the hazard on GFX11. */
@ -1395,6 +1403,8 @@ handle_instruction_gfx11(State& state, NOP_ctx_gfx11& ctx, aco_ptr<Instruction>&
/* va_vdst already obtained through parse_vdst_wait(). */
vm_vsrc = (instr->salu().imm >> 2) & 0x7;
sa_sdst = instr->salu().imm & 0x1;
} else if (instr->isLDSDIR() && state.program->gfx_level >= GFX12) {
vm_vsrc = instr->ldsdir().wait_vsrc ? 7 : 0;
}
if (instr->isLDSDIR()) {
@ -1410,7 +1420,7 @@ handle_instruction_gfx11(State& state, NOP_ctx_gfx11& ctx, aco_ptr<Instruction>&
* VALU reads VGPR written by transcendental instruction without 6+ VALU or 2+ transcendental
* in-between.
*/
if (va_vdst > 0 && instr->isVALU()) {
if (state.program->gfx_level < GFX11_5 && va_vdst > 0 && instr->isVALU()) {
uint8_t num_valu = 15;
uint8_t num_trans = 15;
for (Operand& op : instr->operands) {
@ -1427,65 +1437,68 @@ handle_instruction_gfx11(State& state, NOP_ctx_gfx11& ctx, aco_ptr<Instruction>&
}
}
if (va_vdst > 0 && handle_valu_partial_forwarding_hazard(state, instr)) {
if (va_vdst > 0 && state.program->gfx_level < GFX12 &&
handle_valu_partial_forwarding_hazard(state, instr)) {
bld.sopp(aco_opcode::s_waitcnt_depctr, 0x0fff);
va_vdst = 0;
}
/* VALUMaskWriteHazard
* VALU reads SGPR as a lane mask and later written by SALU cannot safely be read by SALU.
*/
if (state.program->wave_size == 64 && instr->isSALU() &&
check_written_regs(instr, ctx.sgpr_read_by_valu_as_lanemask)) {
ctx.sgpr_read_by_valu_as_lanemask_then_wr_by_salu = ctx.sgpr_read_by_valu_as_lanemask;
ctx.sgpr_read_by_valu_as_lanemask.reset();
} else if (state.program->wave_size == 64 && instr->isSALU() &&
check_read_regs(instr, ctx.sgpr_read_by_valu_as_lanemask_then_wr_by_salu)) {
bld.sopp(aco_opcode::s_waitcnt_depctr, 0xfffe);
sa_sdst = 0;
}
if (va_vdst == 0) {
ctx.valu_since_wr_by_trans.reset();
ctx.trans_since_wr_by_trans.reset();
}
if (sa_sdst == 0)
ctx.sgpr_read_by_valu_as_lanemask_then_wr_by_salu.reset();
if (instr->isVALU()) {
bool is_trans = instr->isTrans();
ctx.valu_since_wr_by_trans.inc();
if (is_trans)
ctx.trans_since_wr_by_trans.inc();
if (is_trans) {
for (Definition& def : instr->definitions) {
ctx.valu_since_wr_by_trans.set(def.physReg(), def.bytes());
ctx.trans_since_wr_by_trans.set(def.physReg(), def.bytes());
}
if (state.program->gfx_level < GFX12) {
/* VALUMaskWriteHazard
* VALU reads SGPR as a lane mask and later written by SALU cannot safely be read by SALU.
*/
if (state.program->wave_size == 64 && instr->isSALU() &&
check_written_regs(instr, ctx.sgpr_read_by_valu_as_lanemask)) {
ctx.sgpr_read_by_valu_as_lanemask_then_wr_by_salu = ctx.sgpr_read_by_valu_as_lanemask;
ctx.sgpr_read_by_valu_as_lanemask.reset();
} else if (state.program->wave_size == 64 && instr->isSALU() &&
check_read_regs(instr, ctx.sgpr_read_by_valu_as_lanemask_then_wr_by_salu)) {
bld.sopp(aco_opcode::s_waitcnt_depctr, 0xfffe);
sa_sdst = 0;
}
if (state.program->wave_size == 64) {
for (Operand& op : instr->operands) {
if (op.isLiteral() || (!op.isConstant() && op.physReg().reg() < 128))
ctx.sgpr_read_by_valu_as_lanemask.reset();
}
switch (instr->opcode) {
case aco_opcode::v_addc_co_u32:
case aco_opcode::v_subb_co_u32:
case aco_opcode::v_subbrev_co_u32:
case aco_opcode::v_cndmask_b16:
case aco_opcode::v_cndmask_b32:
case aco_opcode::v_div_fmas_f32:
case aco_opcode::v_div_fmas_f64:
if (instr->operands.back().physReg() != exec) {
ctx.sgpr_read_by_valu_as_lanemask.set(instr->operands.back().physReg().reg());
ctx.sgpr_read_by_valu_as_lanemask.set(instr->operands.back().physReg().reg() + 1);
if (va_vdst == 0) {
ctx.valu_since_wr_by_trans.reset();
ctx.trans_since_wr_by_trans.reset();
}
if (sa_sdst == 0)
ctx.sgpr_read_by_valu_as_lanemask_then_wr_by_salu.reset();
if (instr->isVALU()) {
bool is_trans = instr->isTrans();
ctx.valu_since_wr_by_trans.inc();
if (is_trans)
ctx.trans_since_wr_by_trans.inc();
if (is_trans) {
for (Definition& def : instr->definitions) {
ctx.valu_since_wr_by_trans.set(def.physReg(), def.bytes());
ctx.trans_since_wr_by_trans.set(def.physReg(), def.bytes());
}
}
if (state.program->wave_size == 64) {
for (Operand& op : instr->operands) {
if (op.isLiteral() || (!op.isConstant() && op.physReg().reg() < 128))
ctx.sgpr_read_by_valu_as_lanemask.reset();
}
switch (instr->opcode) {
case aco_opcode::v_addc_co_u32:
case aco_opcode::v_subb_co_u32:
case aco_opcode::v_subbrev_co_u32:
case aco_opcode::v_cndmask_b16:
case aco_opcode::v_cndmask_b32:
case aco_opcode::v_div_fmas_f32:
case aco_opcode::v_div_fmas_f64:
if (instr->operands.back().physReg() != exec) {
ctx.sgpr_read_by_valu_as_lanemask.set(instr->operands.back().physReg().reg());
ctx.sgpr_read_by_valu_as_lanemask.set(instr->operands.back().physReg().reg() + 1);
}
break;
default: break;
}
break;
default: break;
}
}
}
@ -1494,14 +1507,23 @@ handle_instruction_gfx11(State& state, NOP_ctx_gfx11& ctx, aco_ptr<Instruction>&
* Handle LDSDIR writing a VGPR after it's used by a VMEM/DS instruction.
*/
if (instr->isVMEM() || instr->isFlatLike()) {
for (Definition& def : instr->definitions)
fill_vgpr_bitset(ctx.vgpr_used_by_vmem_load, def.physReg(), def.bytes());
if (instr->definitions.empty()) {
for (Operand& op : instr->operands)
fill_vgpr_bitset(ctx.vgpr_used_by_vmem_store, op.physReg(), op.bytes());
} else {
uint8_t vmem_type = state.program->gfx_level >= GFX12
? get_vmem_type(state.program->gfx_level, instr.get())
: vmem_nosampler;
std::bitset<256>* vgprs = &ctx.vgpr_used_by_vmem_load;
if (vmem_type == vmem_sampler)
vgprs = &ctx.vgpr_used_by_vmem_sample;
else if (vmem_type == vmem_bvh)
vgprs = &ctx.vgpr_used_by_vmem_bvh;
for (Definition& def : instr->definitions)
fill_vgpr_bitset(*vgprs, def.physReg(), def.bytes());
for (Operand& op : instr->operands)
fill_vgpr_bitset(ctx.vgpr_used_by_vmem_load, op.physReg(), op.bytes());
fill_vgpr_bitset(*vgprs, op.physReg(), op.bytes());
}
}
if (instr->isDS() || instr->isFlat()) {
@ -1513,11 +1535,17 @@ handle_instruction_gfx11(State& state, NOP_ctx_gfx11& ctx, aco_ptr<Instruction>&
wait_imm imm;
if (instr->isVALU() || instr->isEXP() || vm_vsrc == 0) {
ctx.vgpr_used_by_vmem_load.reset();
ctx.vgpr_used_by_vmem_sample.reset();
ctx.vgpr_used_by_vmem_bvh.reset();
ctx.vgpr_used_by_vmem_store.reset();
ctx.vgpr_used_by_ds.reset();
} else if (imm.unpack(state.program->gfx_level, instr.get())) {
if (imm.vm == 0)
ctx.vgpr_used_by_vmem_load.reset();
if (imm.sample == 0)
ctx.vgpr_used_by_vmem_sample.reset();
if (imm.bvh == 0)
ctx.vgpr_used_by_vmem_bvh.reset();
if (imm.lgkm == 0)
ctx.vgpr_used_by_ds.reset();
if (imm.vs == 0)
@ -1525,10 +1553,17 @@ handle_instruction_gfx11(State& state, NOP_ctx_gfx11& ctx, aco_ptr<Instruction>&
}
if (instr->isLDSDIR()) {
if (ctx.vgpr_used_by_vmem_load[instr->definitions[0].physReg().reg() - 256] ||
ctx.vgpr_used_by_vmem_sample[instr->definitions[0].physReg().reg() - 256] ||
ctx.vgpr_used_by_vmem_bvh[instr->definitions[0].physReg().reg() - 256] ||
ctx.vgpr_used_by_vmem_store[instr->definitions[0].physReg().reg() - 256] ||
ctx.vgpr_used_by_ds[instr->definitions[0].physReg().reg() - 256]) {
bld.sopp(aco_opcode::s_waitcnt_depctr, 0xffe3);
if (state.program->gfx_level >= GFX12)
instr->ldsdir().wait_vsrc = 0;
else
bld.sopp(aco_opcode::s_waitcnt_depctr, 0xffe3);
ctx.vgpr_used_by_vmem_load.reset();
ctx.vgpr_used_by_vmem_sample.reset();
ctx.vgpr_used_by_vmem_bvh.reset();
ctx.vgpr_used_by_vmem_store.reset();
ctx.vgpr_used_by_ds.reset();
}
@ -1591,7 +1626,7 @@ resolve_all_gfx11(State& state, NOP_ctx_gfx11& ctx,
}
/* VALUMaskWriteHazard */
if (state.program->wave_size == 64 &&
if (state.program->gfx_level < GFX12 && state.program->wave_size == 64 &&
(ctx.sgpr_read_by_valu_as_lanemask.any() ||
ctx.sgpr_read_by_valu_as_lanemask_then_wr_by_salu.any())) {
waitcnt_depctr &= 0xfffe;
@ -1601,7 +1636,8 @@ resolve_all_gfx11(State& state, NOP_ctx_gfx11& ctx,
/* LdsDirectVMEMHazard */
if (ctx.vgpr_used_by_vmem_load.any() || ctx.vgpr_used_by_vmem_store.any() ||
ctx.vgpr_used_by_ds.any()) {
ctx.vgpr_used_by_ds.any() || ctx.vgpr_used_by_vmem_sample.any() ||
ctx.vgpr_used_by_vmem_bvh.any()) {
waitcnt_depctr &= 0xffe3;
ctx.vgpr_used_by_vmem_load.reset();
ctx.vgpr_used_by_vmem_store.reset();

View file

@ -40,6 +40,26 @@ create_mimg(bool nsa, unsigned addrs, unsigned instr_dwords)
bld.insert(std::move(mimg));
}
void
create_bvh()
{
aco_ptr<Instruction> instr{
create_instruction(aco_opcode::image_bvh64_intersect_ray, Format::MIMG, 8, 1)};
instr->definitions[0] = Definition(PhysReg(256), v4);
instr->operands[0] = Operand(PhysReg(0), s4);
instr->operands[1] = Operand(s4);
instr->operands[2] = Operand(v1);
instr->operands[3] = Operand(PhysReg(256 + 0), v2); /* node */
instr->operands[4] = Operand(PhysReg(256 + 2), v1); /* tmax */
instr->operands[5] = Operand(PhysReg(256 + 3), v3); /* origin */
instr->operands[6] = Operand(PhysReg(256 + 6), v3); /* dir */
instr->operands[7] = Operand(PhysReg(256 + 9), v3); /* inv dir */
instr->mimg().dmask = 0xf;
instr->mimg().unrm = true;
instr->mimg().r128 = true;
bld.insert(std::move(instr));
}
BEGIN_TEST(insert_nops.nsa_to_vmem_bug)
if (!setup_cs(NULL, GFX10))
return;
@ -299,277 +319,395 @@ BEGIN_TEST(insert_nops.vmem_to_scalar_write)
END_TEST
BEGIN_TEST(insert_nops.lds_direct_valu)
if (!setup_cs(NULL, GFX11))
return;
for (amd_gfx_level gfx : {GFX11, GFX12}) {
if (!setup_cs(NULL, gfx))
continue;
/* WaW */
//>> p_unit_test 0
//! v1: %0:v[0] = v_mov_b32 0
//! v1: %0:v[0] = lds_direct_load %0:m0 wait_vdst:0
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0));
bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
/* WaW */
//>> p_unit_test 0
//! v1: %0:v[0] = v_mov_b32 0
//! v1: %0:v[0] = lds_direct_load %0:m0 wait_vdst:0
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0));
bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
/* WaR */
//! p_unit_test 1
//! v1: %0:v[1] = v_mov_b32 %0:v[0]
//! v1: %0:v[0] = lds_direct_load %0:m0 wait_vdst:0
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1));
bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1));
bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
/* WaR */
//! p_unit_test 1
//! v1: %0:v[1] = v_mov_b32 %0:v[0]
//! v1: %0:v[0] = lds_direct_load %0:m0 wait_vdst:0
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1));
bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1));
bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
/* No hazard. */
//! p_unit_test 2
//! v1: %0:v[1] = v_mov_b32 0
//! v1: %0:v[0] = lds_direct_load %0:m0
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2));
bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand::zero());
bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
/* No hazard. */
//! p_unit_test 2
//! v1: %0:v[1] = v_mov_b32 0
//! v1: %0:v[0] = lds_direct_load %0:m0
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2));
bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand::zero());
bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
/* multiple hazards, nearest should be considered */
//! p_unit_test 3
//! v1: %0:v[1] = v_mov_b32 %0:v[0]
//! v1: %0:v[0] = v_mov_b32 0
//! v1: %0:v[0] = lds_direct_load %0:m0 wait_vdst:0
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3));
bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1));
bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
/* multiple hazards, nearest should be considered */
//! p_unit_test 3
//! v1: %0:v[1] = v_mov_b32 %0:v[0]
//! v1: %0:v[0] = v_mov_b32 0
//! v1: %0:v[0] = lds_direct_load %0:m0 wait_vdst:0
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3));
bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1));
bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
/* independent VALU increase wait_vdst */
//! p_unit_test 4
//! v1: %0:v[0] = v_mov_b32 0
//! v_nop
//! v1: %0:v[0] = lds_direct_load %0:m0 wait_vdst:1
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4));
bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
bld.vop1(aco_opcode::v_nop);
bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
//! p_unit_test 5
//! v1: %0:v[0] = v_mov_b32 0
//; for i in range(10): insert_pattern('v_nop')
//! v1: %0:v[0] = lds_direct_load %0:m0 wait_vdst:10
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5));
bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
for (unsigned i = 0; i < 10; i++)
/* independent VALU increase wait_vdst */
//! p_unit_test 4
//! v1: %0:v[0] = v_mov_b32 0
//! v_nop
//! v1: %0:v[0] = lds_direct_load %0:m0 wait_vdst:1
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4));
bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
bld.vop1(aco_opcode::v_nop);
bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
//! p_unit_test 6
//! v1: %0:v[0] = v_mov_b32 0
//; for i in range(20): insert_pattern('v_nop')
//! v1: %0:v[0] = lds_direct_load %0:m0
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(6));
bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
for (unsigned i = 0; i < 20; i++)
//! p_unit_test 5
//! v1: %0:v[0] = v_mov_b32 0
//; for i in range(10): insert_pattern('v_nop')
//! v1: %0:v[0] = lds_direct_load %0:m0 wait_vdst:10
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5));
bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
for (unsigned i = 0; i < 10; i++)
bld.vop1(aco_opcode::v_nop);
bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
//! p_unit_test 6
//! v1: %0:v[0] = v_mov_b32 0
//; for i in range(20): insert_pattern('v_nop')
//! v1: %0:v[0] = lds_direct_load %0:m0
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(6));
bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
for (unsigned i = 0; i < 20; i++)
bld.vop1(aco_opcode::v_nop);
bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
/* transcendental requires wait_vdst=0 */
//! p_unit_test 7
//! v1: %0:v[0] = v_mov_b32 0
//! v_nop
//! v1: %0:v[1] = v_sqrt_f32 %0:v[1]
//! v1: %0:v[0] = lds_direct_load %0:m0 wait_vdst:0
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(7));
bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
bld.vop1(aco_opcode::v_nop);
bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
bld.vop1(aco_opcode::v_sqrt_f32, Definition(PhysReg(257), v1), Operand(PhysReg(257), v1));
bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
/* transcendental requires wait_vdst=0 */
//! p_unit_test 7
//! v1: %0:v[0] = v_mov_b32 0
//! v_nop
//! v1: %0:v[1] = v_sqrt_f32 %0:v[1]
//! v1: %0:v[0] = lds_direct_load %0:m0 wait_vdst:0
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(7));
bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
bld.vop1(aco_opcode::v_nop);
bld.vop1(aco_opcode::v_sqrt_f32, Definition(PhysReg(257), v1), Operand(PhysReg(257), v1));
bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
//! p_unit_test 8
//! v1: %0:v[0] = v_sqrt_f32 %0:v[0]
//! v_nop
//! v1: %0:v[0] = lds_direct_load %0:m0 wait_vdst:0
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(8));
bld.vop1(aco_opcode::v_sqrt_f32, Definition(PhysReg(256), v1), Operand(PhysReg(256), v1));
bld.vop1(aco_opcode::v_nop);
bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
//! p_unit_test 8
//! v1: %0:v[0] = v_sqrt_f32 %0:v[0]
//! v_nop
//! v1: %0:v[0] = lds_direct_load %0:m0 wait_vdst:0
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(8));
bld.vop1(aco_opcode::v_sqrt_f32, Definition(PhysReg(256), v1), Operand(PhysReg(256), v1));
bld.vop1(aco_opcode::v_nop);
bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
/* transcendental is fine if it's before the instruction */
//! p_unit_test 9
//! v1: %0:v[1] = v_sqrt_f32 %0:v[1]
//! v1: %0:v[0] = v_mov_b32 0
//! v_nop
//! v1: %0:v[0] = lds_direct_load %0:m0 wait_vdst:1
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(9));
bld.vop1(aco_opcode::v_sqrt_f32, Definition(PhysReg(257), v1), Operand(PhysReg(257), v1));
bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
bld.vop1(aco_opcode::v_nop);
bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
/* transcendental is fine if it's before the instruction */
//! p_unit_test 9
//! v1: %0:v[1] = v_sqrt_f32 %0:v[1]
//! v1: %0:v[0] = v_mov_b32 0
//! v_nop
//! v1: %0:v[0] = lds_direct_load %0:m0 wait_vdst:1
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(9));
bld.vop1(aco_opcode::v_sqrt_f32, Definition(PhysReg(257), v1), Operand(PhysReg(257), v1));
bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
bld.vop1(aco_opcode::v_nop);
bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
/* non-VALU does not increase wait_vdst */
//! p_unit_test 10
//! v1: %0:v[0] = v_mov_b32 0
//! s1: %0:m0 = s_mov_b32 0
//! v1: %0:v[0] = lds_direct_load %0:m0 wait_vdst:0
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(10));
bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
bld.sop1(aco_opcode::s_mov_b32, Definition(m0, s1), Operand::zero());
bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
/* non-VALU does not increase wait_vdst */
//! p_unit_test 10
//! v1: %0:v[0] = v_mov_b32 0
//! s1: %0:m0 = s_mov_b32 0
//! v1: %0:v[0] = lds_direct_load %0:m0 wait_vdst:0
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(10));
bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
bld.sop1(aco_opcode::s_mov_b32, Definition(m0, s1), Operand::zero());
bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
/* consider instructions which wait on vdst */
//! p_unit_test 11
//! v1: %0:v[0] = v_mov_b32 0
//! v_nop
//! s_waitcnt_depctr va_vdst(0)
//! v1: %0:v[0] = lds_direct_load %0:m0
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(11));
bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
bld.vop1(aco_opcode::v_nop);
bld.sopp(aco_opcode::s_waitcnt_depctr, 0x0fff);
bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
/* consider instructions which wait on vdst */
//! p_unit_test 11
//! v1: %0:v[0] = v_mov_b32 0
//! v_nop
//! s_waitcnt_depctr va_vdst(0)
//! v1: %0:v[0] = lds_direct_load %0:m0
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(11));
bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
bld.vop1(aco_opcode::v_nop);
bld.sopp(aco_opcode::s_waitcnt_depctr, 0x0fff);
bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
finish_insert_nops_test();
finish_insert_nops_test();
}
END_TEST
BEGIN_TEST(insert_nops.lds_direct_vmem)
if (!setup_cs(NULL, GFX11))
return;
for (amd_gfx_level gfx : {GFX11, GFX12}) {
if (!setup_cs(NULL, gfx))
continue;
/* WaR: VMEM */
//>> p_unit_test 0
//! v1: %0:v[1] = buffer_load_dword %0:s[0-3], %0:v[0], 0 offen
//! s_waitcnt_depctr vm_vsrc(0)
//! v1: %0:v[0] = lds_direct_load %0:m0
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0));
create_mubuf(0, PhysReg(257));
bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
/* WaR: VMEM */
//>> p_unit_test 0
//! v1: %0:v[1] = buffer_load_dword %0:s[0-3], %0:v[0], 0 offen
//~gfx11! s_waitcnt_depctr vm_vsrc(0)
//~gfx11! v1: %0:v[0] = lds_direct_load %0:m0
//~gfx12! v1: %0:v[0] = lds_direct_load %0:m0 wait_vsrc:0
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0));
create_mubuf(0, PhysReg(257));
bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
/* WaW: VMEM */
//! p_unit_test 1
//! v1: %0:v[0] = buffer_load_dword %0:s[0-3], %0:v[1], 0 offen
//! s_waitcnt_depctr vm_vsrc(0)
//! v1: %0:v[0] = lds_direct_load %0:m0
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1));
create_mubuf(0, PhysReg(256), PhysReg(257));
bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
/* WaW: VMEM */
//! p_unit_test 1
//! v1: %0:v[0] = buffer_load_dword %0:s[0-3], %0:v[1], 0 offen
//~gfx11! s_waitcnt_depctr vm_vsrc(0)
//~gfx11! v1: %0:v[0] = lds_direct_load %0:m0
//~gfx12! v1: %0:v[0] = lds_direct_load %0:m0 wait_vsrc:0
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1));
create_mubuf(0, PhysReg(256), PhysReg(257));
bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
/* no hazard: VMEM */
//! p_unit_test 2
//! v1: %0:v[1] = buffer_load_dword %0:s[0-3], %0:v[1], 0 offen
//! v1: %0:v[0] = lds_direct_load %0:m0
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2));
create_mubuf(0, PhysReg(257), PhysReg(257));
bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
/* no hazard: VMEM */
//! p_unit_test 2
//! v1: %0:v[1] = buffer_load_dword %0:s[0-3], %0:v[1], 0 offen
//! v1: %0:v[0] = lds_direct_load %0:m0
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2));
create_mubuf(0, PhysReg(257), PhysReg(257));
bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
/* no hazard: VMEM with VALU in-between */
//! p_unit_test 3
//! v1: %0:v[1] = buffer_load_dword %0:s[0-3], %0:v[0], 0 offen
//! v_nop
//! v1: %0:v[0] = lds_direct_load %0:m0
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3));
create_mubuf(0, PhysReg(257));
bld.vop1(aco_opcode::v_nop);
bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
/* no hazard: VMEM with VALU in-between */
//! p_unit_test 3
//! v1: %0:v[1] = buffer_load_dword %0:s[0-3], %0:v[0], 0 offen
//! v_nop
//! v1: %0:v[0] = lds_direct_load %0:m0
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3));
create_mubuf(0, PhysReg(257));
bld.vop1(aco_opcode::v_nop);
bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
/* WaR: LDS */
//! p_unit_test 4
//! v1: %0:v[1] = ds_read_b32 %0:v[0]
//! s_waitcnt_depctr vm_vsrc(0)
//! v1: %0:v[0] = lds_direct_load %0:m0
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4));
bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1));
bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
/* WaR: LDS */
//! p_unit_test 4
//! v1: %0:v[1] = ds_read_b32 %0:v[0]
//~gfx11! s_waitcnt_depctr vm_vsrc(0)
//~gfx11! v1: %0:v[0] = lds_direct_load %0:m0
//~gfx12! v1: %0:v[0] = lds_direct_load %0:m0 wait_vsrc:0
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4));
bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1));
bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
/* WaW: LDS */
//! p_unit_test 5
//! v1: %0:v[0] = ds_read_b32 %0:v[1]
//! s_waitcnt_depctr vm_vsrc(0)
//! v1: %0:v[0] = lds_direct_load %0:m0
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5));
bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(256), v1), Operand(PhysReg(257), v1));
bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
/* WaW: LDS */
//! p_unit_test 5
//! v1: %0:v[0] = ds_read_b32 %0:v[1]
//~gfx11! s_waitcnt_depctr vm_vsrc(0)
//~gfx11! v1: %0:v[0] = lds_direct_load %0:m0
//~gfx12! v1: %0:v[0] = lds_direct_load %0:m0 wait_vsrc:0
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5));
bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(256), v1), Operand(PhysReg(257), v1));
bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
/* no hazard: LDS */
//! p_unit_test 6
//! v1: %0:v[1] = ds_read_b32 %0:v[1]
//! v1: %0:v[0] = lds_direct_load %0:m0
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(6));
bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(257), v1), Operand(PhysReg(257), v1));
bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
/* no hazard: LDS */
//! p_unit_test 6
//! v1: %0:v[1] = ds_read_b32 %0:v[1]
//! v1: %0:v[0] = lds_direct_load %0:m0
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(6));
bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(257), v1), Operand(PhysReg(257), v1));
bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
/* no hazard: LDS with VALU in-between */
//! p_unit_test 7
//! v1: %0:v[1] = ds_read_b32 %0:v[0]
//! v_nop
//! v1: %0:v[0] = lds_direct_load %0:m0
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(7));
bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1));
bld.vop1(aco_opcode::v_nop);
bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
/* no hazard: LDS with VALU in-between */
//! p_unit_test 7
//! v1: %0:v[1] = ds_read_b32 %0:v[0]
//! v_nop
//! v1: %0:v[0] = lds_direct_load %0:m0
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(7));
bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1));
bld.vop1(aco_opcode::v_nop);
bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
/* no hazard: VMEM/LDS with the correct waitcnt in-between */
//! p_unit_test 8
//! v1: %0:v[1] = buffer_load_dword %0:s[0-3], %0:v[0], 0 offen
//! s_waitcnt vmcnt(0)
//! v1: %0:v[0] = lds_direct_load %0:m0
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(8));
create_mubuf(0, PhysReg(257));
bld.sopp(aco_opcode::s_waitcnt, 0x3ff);
bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
/* no hazard: VMEM/LDS with the correct waitcnt in-between */
//! p_unit_test 8
//! v1: %0:v[1] = buffer_load_dword %0:s[0-3], %0:v[0], 0 offen
//~gfx11! s_waitcnt vmcnt(0)
//~gfx12! s_wait_loadcnt imm:0
//! v1: %0:v[0] = lds_direct_load %0:m0
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(8));
create_mubuf(0, PhysReg(257));
if (gfx >= GFX12)
bld.sopp(aco_opcode::s_wait_loadcnt, 0);
else
bld.sopp(aco_opcode::s_waitcnt, 0x3ff);
bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
//! p_unit_test 9
//! buffer_store_dword %0:s[0-3], %0:v[0], 0, %0:v[0] offen
//! s_waitcnt_vscnt %0:null imm:0
//! v1: %0:v[0] = lds_direct_load %0:m0
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(9));
create_mubuf_store();
bld.sopk(aco_opcode::s_waitcnt_vscnt, Operand(sgpr_null, s1), 0);
bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
//! p_unit_test 9
//! buffer_store_dword %0:s[0-3], %0:v[0], 0, %0:v[0] offen
//~gfx11! s_waitcnt_vscnt %0:null imm:0
//~gfx12! s_wait_storecnt imm:0
//! v1: %0:v[0] = lds_direct_load %0:m0
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(9));
create_mubuf_store();
if (gfx >= GFX12)
bld.sopp(aco_opcode::s_wait_storecnt, 0);
else
bld.sopk(aco_opcode::s_waitcnt_vscnt, Operand(sgpr_null, s1), 0);
bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
//! p_unit_test 10
//! v1: %0:v[1] = ds_read_b32 %0:v[0]
//! s_waitcnt lgkmcnt(0)
//! v1: %0:v[0] = lds_direct_load %0:m0
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(10));
bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1));
bld.sopp(aco_opcode::s_waitcnt, 0xfc0f);
bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
//! p_unit_test 10
//! v1: %0:v[1] = ds_read_b32 %0:v[0]
//~gfx11! s_waitcnt lgkmcnt(0)
//~gfx12! s_wait_dscnt imm:0
//! v1: %0:v[0] = lds_direct_load %0:m0
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(10));
bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1));
if (gfx >= GFX12)
bld.sopp(aco_opcode::s_wait_dscnt, 0);
else
bld.sopp(aco_opcode::s_waitcnt, 0xfc0f);
bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
/* VMEM/LDS with the wrong waitcnt in-between */
//! p_unit_test 11
//! v1: %0:v[1] = buffer_load_dword %0:s[0-3], %0:v[0], 0 offen
//! s_waitcnt_vscnt %0:null imm:0
//! s_waitcnt_depctr vm_vsrc(0)
//! v1: %0:v[0] = lds_direct_load %0:m0
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(11));
create_mubuf(0, PhysReg(257));
bld.sopk(aco_opcode::s_waitcnt_vscnt, Operand(sgpr_null, s1), 0);
bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
if (gfx >= GFX12) {
//~gfx12! p_unit_test 11
//~gfx12! v1: %0:v[1] = image_load %0:s[0-7], s4: undef, v1: undef, %0:v[0-1] 2d
//~gfx12! s_wait_loadcnt imm:0
//~gfx12! v1: %0:v[0] = lds_direct_load %0:m0
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(11));
Instruction* instr =
bld.mimg(aco_opcode::image_load, Definition(PhysReg(257), v1), Operand(PhysReg(0), s8),
Operand(s4), Operand(v1), Operand(PhysReg(256), v2))
.instr;
instr->mimg().dmask = 0x1;
instr->mimg().dim = ac_image_2d;
bld.sopp(aco_opcode::s_wait_loadcnt, 0);
bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
//! p_unit_test 12
//! buffer_store_dword %0:s[0-3], %0:v[0], 0, %0:v[0] offen
//! s_waitcnt lgkmcnt(0)
//! s_waitcnt_depctr vm_vsrc(0)
//! v1: %0:v[0] = lds_direct_load %0:m0
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(12));
create_mubuf_store();
bld.sopp(aco_opcode::s_waitcnt, 0xfc0f);
bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
//~gfx12! p_unit_test 12
//~gfx12! v1: %0:v[1] = image_sample %0:s[0-7], %0:s[0-3], v1: undef, %0:v[0-1] 2d
//~gfx12! s_wait_samplecnt imm:0
//~gfx12! v1: %0:v[0] = lds_direct_load %0:m0
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(12));
instr = bld.mimg(aco_opcode::image_sample, Definition(PhysReg(257), v1),
Operand(PhysReg(0), s8), Operand(PhysReg(0), s4), Operand(v1),
Operand(PhysReg(256), v2))
.instr;
instr->mimg().dmask = 0x1;
instr->mimg().dim = ac_image_2d;
bld.sopp(aco_opcode::s_wait_samplecnt, 0);
bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
//! p_unit_test 13
//! v1: %0:v[1] = ds_read_b32 %0:v[0]
//! s_waitcnt vmcnt(0)
//! s_waitcnt_depctr vm_vsrc(0)
//! v1: %0:v[0] = lds_direct_load %0:m0
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(13));
bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1));
bld.sopp(aco_opcode::s_waitcnt, 0x3ff);
bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
//~gfx12! p_unit_test 13
//~gfx12! v4: %0:v[0-3] = image_bvh64_intersect_ray %0:s[0-3], s4: undef, v1: undef, %0:v[0-1], %0:v[2], %0:v[3-5], %0:v[6-8], %0:v[9-11] 1d unrm r128
//~gfx12! s_wait_bvhcnt imm:0
//~gfx12! v1: %0:v[0] = lds_direct_load %0:m0
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(13));
create_bvh();
bld.sopp(aco_opcode::s_wait_bvhcnt, 0);
bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
}
//! p_unit_test 14
//! v1: %0:v[0] = buffer_load_dword %0:s[0-3], %0:v[1], 0 offen
//! s_waitcnt_vscnt %0:null imm:0
//! s_waitcnt_depctr vm_vsrc(0)
//! v1: %0:v[0] = lds_direct_load %0:m0
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(14));
create_mubuf(0, PhysReg(256), PhysReg(257));
bld.sopk(aco_opcode::s_waitcnt_vscnt, Operand(sgpr_null, s1), 0);
bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
/* VMEM/LDS with the wrong waitcnt in-between */
//! p_unit_test 14
//! v1: %0:v[1] = buffer_load_dword %0:s[0-3], %0:v[0], 0 offen
//~gfx11! s_waitcnt_vscnt %0:null imm:0
//~gfx11! s_waitcnt_depctr vm_vsrc(0)
//~gfx11! v1: %0:v[0] = lds_direct_load %0:m0
//~gfx12! s_wait_storecnt imm:0
//~gfx12! v1: %0:v[0] = lds_direct_load %0:m0 wait_vsrc:0
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(14));
create_mubuf(0, PhysReg(257));
if (gfx >= GFX12)
bld.sopp(aco_opcode::s_wait_storecnt, 0);
else
bld.sopk(aco_opcode::s_waitcnt_vscnt, Operand(sgpr_null, s1), 0);
bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
finish_insert_nops_test();
//! p_unit_test 15
//! buffer_store_dword %0:s[0-3], %0:v[0], 0, %0:v[0] offen
//~gfx11! s_waitcnt lgkmcnt(0)
//~gfx11! s_waitcnt_depctr vm_vsrc(0)
//~gfx11! v1: %0:v[0] = lds_direct_load %0:m0
//~gfx12! s_wait_dscnt imm:0
//~gfx12! v1: %0:v[0] = lds_direct_load %0:m0 wait_vsrc:0
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(15));
create_mubuf_store();
if (gfx >= GFX12)
bld.sopp(aco_opcode::s_wait_dscnt, 0);
else
bld.sopp(aco_opcode::s_waitcnt, 0xfc0f);
bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
//! p_unit_test 16
//! v1: %0:v[1] = ds_read_b32 %0:v[0]
//~gfx11! s_waitcnt vmcnt(0)
//~gfx11! s_waitcnt_depctr vm_vsrc(0)
//~gfx11! v1: %0:v[0] = lds_direct_load %0:m0
//~gfx12! s_wait_loadcnt imm:0
//~gfx12! v1: %0:v[0] = lds_direct_load %0:m0 wait_vsrc:0
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(16));
bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1));
if (gfx >= GFX12)
bld.sopp(aco_opcode::s_wait_loadcnt, 0);
else
bld.sopp(aco_opcode::s_waitcnt, 0x3ff);
bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
//! p_unit_test 17
//! v1: %0:v[0] = buffer_load_dword %0:s[0-3], %0:v[1], 0 offen
//~gfx11! s_waitcnt_vscnt %0:null imm:0
//~gfx11! s_waitcnt_depctr vm_vsrc(0)
//~gfx11! v1: %0:v[0] = lds_direct_load %0:m0
//~gfx12! s_wait_storecnt imm:0
//~gfx12! v1: %0:v[0] = lds_direct_load %0:m0 wait_vsrc:0
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(17));
create_mubuf(0, PhysReg(256), PhysReg(257));
if (gfx >= GFX12)
bld.sopp(aco_opcode::s_wait_storecnt, 0);
else
bld.sopk(aco_opcode::s_waitcnt_vscnt, Operand(sgpr_null, s1), 0);
bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
if (gfx >= GFX12) {
//~gfx12! p_unit_test 18
//~gfx12! v1: %0:v[1] = image_load %0:s[0-7], s4: undef, v1: undef, %0:v[0-1] 2d
//~gfx12! s_wait_samplecnt imm:0
//~gfx12! v1: %0:v[0] = lds_direct_load %0:m0 wait_vsrc:0
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(18));
Instruction* instr =
bld.mimg(aco_opcode::image_load, Definition(PhysReg(257), v1), Operand(PhysReg(0), s8),
Operand(s4), Operand(v1), Operand(PhysReg(256), v2))
.instr;
instr->mimg().dmask = 0x1;
instr->mimg().dim = ac_image_2d;
bld.sopp(aco_opcode::s_wait_samplecnt, 0);
bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
//~gfx12! p_unit_test 19
//~gfx12! v1: %0:v[1] = image_sample %0:s[0-7], %0:s[0-3], v1: undef, %0:v[0-1] 2d
//~gfx12! s_wait_loadcnt imm:0
//~gfx12! v1: %0:v[0] = lds_direct_load %0:m0 wait_vsrc:0
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(19));
instr = bld.mimg(aco_opcode::image_sample, Definition(PhysReg(257), v1),
Operand(PhysReg(0), s8), Operand(PhysReg(0), s4), Operand(v1),
Operand(PhysReg(256), v2))
.instr;
instr->mimg().dmask = 0x1;
instr->mimg().dim = ac_image_2d;
bld.sopp(aco_opcode::s_wait_loadcnt, 0);
bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
//~gfx12! p_unit_test 20
//~gfx12! v4: %0:v[0-3] = image_bvh64_intersect_ray %0:s[0-3], s4: undef, v1: undef, %0:v[0-1], %0:v[2], %0:v[3-5], %0:v[6-8], %0:v[9-11] 1d unrm r128
//~gfx12! s_wait_loadcnt imm:0
//~gfx12! v1: %0:v[0] = lds_direct_load %0:m0 wait_vsrc:0
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(20));
create_bvh();
bld.sopp(aco_opcode::s_wait_loadcnt, 0);
bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
}
finish_insert_nops_test();
}
END_TEST
BEGIN_TEST(insert_nops.valu_trans_use)
@ -1310,4 +1448,44 @@ BEGIN_TEST(insert_nops.setpc_gfx11)
bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
finish_insert_nops_test(true);
}
END_TEST
/* Checks what is emitted in front of s_setpc_b64 when targeting GFX12.
 * Each case builds a p_unit_test marker, optionally one instruction that
 * precedes the jump, and then s_setpc_b64 itself.
 * NOTE(review): the lines prefixed with `//>>` and `//!` look like
 * expected-output patterns read by the test harness rather than ordinary
 * comments -- keep them byte-identical when editing this test. */
BEGIN_TEST(insert_nops.setpc_gfx12)
/* Bail out of the test when setup_cs() returns false. */
if (!setup_cs(NULL, GFX12))
return;
/* Baseline: nothing precedes the jump, so s_setpc_b64 is expected directly
 * after the marker with no extra instruction in between. */
//>> p_unit_test 0
//! s_setpc_b64 0
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0));
bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
/* LdsDirectVALUHazard */
/* A VALU compare reading v[0] and writing vcc precedes the jump; an
 * s_waitcnt_depctr va_vdst(0) is expected between it and s_setpc_b64. */
//! p_unit_test 1
//! s2: %0:vcc = v_cmp_eq_u32 %0:v[0], 0
//! s_waitcnt_depctr va_vdst(0)
//! s_setpc_b64 0
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1));
bld.vopc_e64(aco_opcode::v_cmp_eq_u32, Definition(vcc, s2), Operand(PhysReg(256), v1),
Operand::zero());
bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
/* VcmpxPermlaneHazard */
/* A v_cmpx writing exec precedes the jump; a v_nop is expected between it
 * and s_setpc_b64. */
//! p_unit_test 2
//! s2: %0:exec = v_cmpx_eq_u32 0, 0
//! v_nop
//! s_setpc_b64 0
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2));
bld.vopc_e64(aco_opcode::v_cmpx_eq_u32, Definition(exec, s2), Operand::zero(), Operand::zero());
bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
/* LdsDirectVMEMHazard */
/* A ds_read_b32 using v[0] as both destination and address precedes the
 * jump; an s_waitcnt_depctr vm_vsrc(0) is expected before s_setpc_b64. */
//! p_unit_test 3
//! v1: %0:v[0] = ds_read_b32 %0:v[0]
//! s_waitcnt_depctr vm_vsrc(0)
//! s_setpc_b64 0
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3));
bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(256), v1), Operand(PhysReg(256), v1));
bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
/* NOTE(review): the meaning of the `true` argument is not visible in this
 * chunk -- confirm against the definition of finish_insert_nops_test(). */
finish_insert_nops_test(true);
END_TEST