diff --git a/src/amd/compiler/aco_insert_NOPs.cpp b/src/amd/compiler/aco_insert_NOPs.cpp index c2f9b680fa7..a47c6e147bb 100644 --- a/src/amd/compiler/aco_insert_NOPs.cpp +++ b/src/amd/compiler/aco_insert_NOPs.cpp @@ -253,6 +253,8 @@ struct NOP_ctx_gfx11 { /* LdsDirectVMEMHazard */ std::bitset<256> vgpr_used_by_vmem_load; + std::bitset<256> vgpr_used_by_vmem_sample; + std::bitset<256> vgpr_used_by_vmem_bvh; std::bitset<256> vgpr_used_by_vmem_store; std::bitset<256> vgpr_used_by_ds; @@ -268,6 +270,8 @@ struct NOP_ctx_gfx11 { { has_Vcmpx |= other.has_Vcmpx; vgpr_used_by_vmem_load |= other.vgpr_used_by_vmem_load; + vgpr_used_by_vmem_sample |= other.vgpr_used_by_vmem_sample; + vgpr_used_by_vmem_bvh |= other.vgpr_used_by_vmem_bvh; vgpr_used_by_vmem_store |= other.vgpr_used_by_vmem_store; vgpr_used_by_ds |= other.vgpr_used_by_ds; valu_since_wr_by_trans.join_min(other.valu_since_wr_by_trans); @@ -281,6 +285,8 @@ struct NOP_ctx_gfx11 { { return has_Vcmpx == other.has_Vcmpx && vgpr_used_by_vmem_load == other.vgpr_used_by_vmem_load && + vgpr_used_by_vmem_sample == other.vgpr_used_by_vmem_sample && + vgpr_used_by_vmem_bvh == other.vgpr_used_by_vmem_bvh && vgpr_used_by_vmem_store == other.vgpr_used_by_vmem_store && vgpr_used_by_ds == other.vgpr_used_by_ds && valu_since_wr_by_trans == other.valu_since_wr_by_trans && @@ -1373,7 +1379,9 @@ handle_instruction_gfx11(State& state, NOP_ctx_gfx11& ctx, aco_ptr& ctx.has_Vcmpx = true; } else if (ctx.has_Vcmpx && (instr->opcode == aco_opcode::v_permlane16_b32 || instr->opcode == aco_opcode::v_permlanex16_b32 || - instr->opcode == aco_opcode::v_permlane64_b32)) { + instr->opcode == aco_opcode::v_permlane64_b32 || + instr->opcode == aco_opcode::v_permlane16_var_b32 || + instr->opcode == aco_opcode::v_permlanex16_var_b32)) { ctx.has_Vcmpx = false; /* Unlike on GFX10, v_nop should resolve the hazard on GFX11. */ @@ -1395,6 +1403,8 @@ handle_instruction_gfx11(State& state, NOP_ctx_gfx11& ctx, aco_ptr& /* va_vdst already obtained through parse_vdst_wait(). */ vm_vsrc = (instr->salu().imm >> 2) & 0x7; sa_sdst = instr->salu().imm & 0x1; + } else if (instr->isLDSDIR() && state.program->gfx_level >= GFX12) { + vm_vsrc = instr->ldsdir().wait_vsrc ? 7 : 0; } if (instr->isLDSDIR()) { @@ -1410,7 +1420,7 @@ handle_instruction_gfx11(State& state, NOP_ctx_gfx11& ctx, aco_ptr& * VALU reads VGPR written by transcendental instruction without 6+ VALU or 2+ transcendental * in-between. */ - if (va_vdst > 0 && instr->isVALU()) { + if (state.program->gfx_level < GFX11_5 && va_vdst > 0 && instr->isVALU()) { uint8_t num_valu = 15; uint8_t num_trans = 15; for (Operand& op : instr->operands) { @@ -1427,65 +1437,68 @@ handle_instruction_gfx11(State& state, NOP_ctx_gfx11& ctx, aco_ptr& } } - if (va_vdst > 0 && handle_valu_partial_forwarding_hazard(state, instr)) { + if (va_vdst > 0 && state.program->gfx_level < GFX12 && + handle_valu_partial_forwarding_hazard(state, instr)) { bld.sopp(aco_opcode::s_waitcnt_depctr, 0x0fff); va_vdst = 0; } - /* VALUMaskWriteHazard - * VALU reads SGPR as a lane mask and later written by SALU cannot safely be read by SALU. - */ - if (state.program->wave_size == 64 && instr->isSALU() && - check_written_regs(instr, ctx.sgpr_read_by_valu_as_lanemask)) { - ctx.sgpr_read_by_valu_as_lanemask_then_wr_by_salu = ctx.sgpr_read_by_valu_as_lanemask; - ctx.sgpr_read_by_valu_as_lanemask.reset(); - } else if (state.program->wave_size == 64 && instr->isSALU() && - check_read_regs(instr, ctx.sgpr_read_by_valu_as_lanemask_then_wr_by_salu)) { - bld.sopp(aco_opcode::s_waitcnt_depctr, 0xfffe); - sa_sdst = 0; - } - - if (va_vdst == 0) { - ctx.valu_since_wr_by_trans.reset(); - ctx.trans_since_wr_by_trans.reset(); - } - - if (sa_sdst == 0) - ctx.sgpr_read_by_valu_as_lanemask_then_wr_by_salu.reset(); - - if (instr->isVALU()) { - bool is_trans = instr->isTrans(); - - ctx.valu_since_wr_by_trans.inc(); - if (is_trans) - ctx.trans_since_wr_by_trans.inc(); - - if (is_trans) { - for (Definition& def : instr->definitions) { - ctx.valu_since_wr_by_trans.set(def.physReg(), def.bytes()); - ctx.trans_since_wr_by_trans.set(def.physReg(), def.bytes()); - } + if (state.program->gfx_level < GFX12) { + /* VALUMaskWriteHazard + * VALU reads SGPR as a lane mask and later written by SALU cannot safely be read by SALU. + */ + if (state.program->wave_size == 64 && instr->isSALU() && + check_written_regs(instr, ctx.sgpr_read_by_valu_as_lanemask)) { + ctx.sgpr_read_by_valu_as_lanemask_then_wr_by_salu = ctx.sgpr_read_by_valu_as_lanemask; + ctx.sgpr_read_by_valu_as_lanemask.reset(); + } else if (state.program->wave_size == 64 && instr->isSALU() && + check_read_regs(instr, ctx.sgpr_read_by_valu_as_lanemask_then_wr_by_salu)) { + bld.sopp(aco_opcode::s_waitcnt_depctr, 0xfffe); + sa_sdst = 0; } - if (state.program->wave_size == 64) { - for (Operand& op : instr->operands) { - if (op.isLiteral() || (!op.isConstant() && op.physReg().reg() < 128)) - ctx.sgpr_read_by_valu_as_lanemask.reset(); - } - switch (instr->opcode) { - case aco_opcode::v_addc_co_u32: - case aco_opcode::v_subb_co_u32: - case aco_opcode::v_subbrev_co_u32: - case aco_opcode::v_cndmask_b16: - case aco_opcode::v_cndmask_b32: - case aco_opcode::v_div_fmas_f32: - case aco_opcode::v_div_fmas_f64: - if (instr->operands.back().physReg() != exec) { - ctx.sgpr_read_by_valu_as_lanemask.set(instr->operands.back().physReg().reg()); - ctx.sgpr_read_by_valu_as_lanemask.set(instr->operands.back().physReg().reg() + 1); + if (va_vdst == 0) { + ctx.valu_since_wr_by_trans.reset(); + ctx.trans_since_wr_by_trans.reset(); + } + + if (sa_sdst == 0) + ctx.sgpr_read_by_valu_as_lanemask_then_wr_by_salu.reset(); + + if (instr->isVALU()) { + bool is_trans = instr->isTrans(); + + ctx.valu_since_wr_by_trans.inc(); + if (is_trans) + ctx.trans_since_wr_by_trans.inc(); + + if (is_trans) { + for (Definition& def : instr->definitions) { + ctx.valu_since_wr_by_trans.set(def.physReg(), def.bytes()); + ctx.trans_since_wr_by_trans.set(def.physReg(), def.bytes()); + } + } + + if (state.program->wave_size == 64) { + for (Operand& op : instr->operands) { + if (op.isLiteral() || (!op.isConstant() && op.physReg().reg() < 128)) + ctx.sgpr_read_by_valu_as_lanemask.reset(); + } + switch (instr->opcode) { + case aco_opcode::v_addc_co_u32: + case aco_opcode::v_subb_co_u32: + case aco_opcode::v_subbrev_co_u32: + case aco_opcode::v_cndmask_b16: + case aco_opcode::v_cndmask_b32: + case aco_opcode::v_div_fmas_f32: + case aco_opcode::v_div_fmas_f64: + if (instr->operands.back().physReg() != exec) { + ctx.sgpr_read_by_valu_as_lanemask.set(instr->operands.back().physReg().reg()); + ctx.sgpr_read_by_valu_as_lanemask.set(instr->operands.back().physReg().reg() + 1); + } + break; + default: break; } - break; - default: break; } } } @@ -1494,14 +1507,23 @@ handle_instruction_gfx11(State& state, NOP_ctx_gfx11& ctx, aco_ptr& * Handle LDSDIR writing a VGPR after it's used by a VMEM/DS instruction. */ if (instr->isVMEM() || instr->isFlatLike()) { - for (Definition& def : instr->definitions) - fill_vgpr_bitset(ctx.vgpr_used_by_vmem_load, def.physReg(), def.bytes()); if (instr->definitions.empty()) { for (Operand& op : instr->operands) fill_vgpr_bitset(ctx.vgpr_used_by_vmem_store, op.physReg(), op.bytes()); } else { + uint8_t vmem_type = state.program->gfx_level >= GFX12 + ? get_vmem_type(state.program->gfx_level, instr.get()) + : vmem_nosampler; + std::bitset<256>* vgprs = &ctx.vgpr_used_by_vmem_load; + if (vmem_type == vmem_sampler) + vgprs = &ctx.vgpr_used_by_vmem_sample; + else if (vmem_type == vmem_bvh) + vgprs = &ctx.vgpr_used_by_vmem_bvh; + + for (Definition& def : instr->definitions) + fill_vgpr_bitset(*vgprs, def.physReg(), def.bytes()); for (Operand& op : instr->operands) - fill_vgpr_bitset(ctx.vgpr_used_by_vmem_load, op.physReg(), op.bytes()); + fill_vgpr_bitset(*vgprs, op.physReg(), op.bytes()); } } if (instr->isDS() || instr->isFlat()) { @@ -1513,11 +1535,17 @@ handle_instruction_gfx11(State& state, NOP_ctx_gfx11& ctx, aco_ptr& wait_imm imm; if (instr->isVALU() || instr->isEXP() || vm_vsrc == 0) { ctx.vgpr_used_by_vmem_load.reset(); + ctx.vgpr_used_by_vmem_sample.reset(); + ctx.vgpr_used_by_vmem_bvh.reset(); ctx.vgpr_used_by_vmem_store.reset(); ctx.vgpr_used_by_ds.reset(); } else if (imm.unpack(state.program->gfx_level, instr.get())) { if (imm.vm == 0) ctx.vgpr_used_by_vmem_load.reset(); + if (imm.sample == 0) + ctx.vgpr_used_by_vmem_sample.reset(); + if (imm.bvh == 0) + ctx.vgpr_used_by_vmem_bvh.reset(); if (imm.lgkm == 0) ctx.vgpr_used_by_ds.reset(); if (imm.vs == 0) @@ -1525,10 +1553,17 @@ handle_instruction_gfx11(State& state, NOP_ctx_gfx11& ctx, aco_ptr& } if (instr->isLDSDIR()) { if (ctx.vgpr_used_by_vmem_load[instr->definitions[0].physReg().reg() - 256] || + ctx.vgpr_used_by_vmem_sample[instr->definitions[0].physReg().reg() - 256] || + ctx.vgpr_used_by_vmem_bvh[instr->definitions[0].physReg().reg() - 256] || ctx.vgpr_used_by_vmem_store[instr->definitions[0].physReg().reg() - 256] || ctx.vgpr_used_by_ds[instr->definitions[0].physReg().reg() - 256]) { - bld.sopp(aco_opcode::s_waitcnt_depctr, 0xffe3); + if (state.program->gfx_level >= GFX12) + instr->ldsdir().wait_vsrc = 0; + else + bld.sopp(aco_opcode::s_waitcnt_depctr, 0xffe3); ctx.vgpr_used_by_vmem_load.reset(); + ctx.vgpr_used_by_vmem_sample.reset(); + ctx.vgpr_used_by_vmem_bvh.reset(); ctx.vgpr_used_by_vmem_store.reset(); ctx.vgpr_used_by_ds.reset(); } @@ -1591,7 +1626,7 @@ resolve_all_gfx11(State& state, NOP_ctx_gfx11& ctx, } /* VALUMaskWriteHazard */ - if (state.program->wave_size == 64 && + if (state.program->gfx_level < GFX12 && state.program->wave_size == 64 && (ctx.sgpr_read_by_valu_as_lanemask.any() || ctx.sgpr_read_by_valu_as_lanemask_then_wr_by_salu.any())) { waitcnt_depctr &= 0xfffe; @@ -1601,7 +1636,8 @@ resolve_all_gfx11(State& state, NOP_ctx_gfx11& ctx, /* LdsDirectVMEMHazard */ if (ctx.vgpr_used_by_vmem_load.any() || ctx.vgpr_used_by_vmem_store.any() || - ctx.vgpr_used_by_ds.any()) { + ctx.vgpr_used_by_ds.any() || ctx.vgpr_used_by_vmem_sample.any() || + ctx.vgpr_used_by_vmem_bvh.any()) { waitcnt_depctr &= 0xffe3; ctx.vgpr_used_by_vmem_load.reset(); ctx.vgpr_used_by_vmem_store.reset(); diff --git a/src/amd/compiler/tests/test_insert_nops.cpp b/src/amd/compiler/tests/test_insert_nops.cpp index f2a1df784f1..938f75c104f 100644 --- a/src/amd/compiler/tests/test_insert_nops.cpp +++ b/src/amd/compiler/tests/test_insert_nops.cpp @@ -40,6 +40,26 @@ create_mimg(bool nsa, unsigned addrs, unsigned instr_dwords) bld.insert(std::move(mimg)); } +void +create_bvh() +{ + aco_ptr instr{ + create_instruction(aco_opcode::image_bvh64_intersect_ray, Format::MIMG, 8, 1)}; + instr->definitions[0] = Definition(PhysReg(256), v4); + instr->operands[0] = Operand(PhysReg(0), s4); + instr->operands[1] = Operand(s4); + instr->operands[2] = Operand(v1); + instr->operands[3] = Operand(PhysReg(256 + 0), v2); /* node */ + instr->operands[4] = Operand(PhysReg(256 + 2), v1); /* tmax */ + instr->operands[5] = Operand(PhysReg(256 + 3), v3); /* origin */ + instr->operands[6] = Operand(PhysReg(256 + 6), v3); /* dir */ + instr->operands[7] = Operand(PhysReg(256 + 9), v3); /* inv dir */ + instr->mimg().dmask = 0xf; + instr->mimg().unrm = true; + instr->mimg().r128 = true; + bld.insert(std::move(instr)); +} + BEGIN_TEST(insert_nops.nsa_to_vmem_bug) if (!setup_cs(NULL, GFX10)) return; @@ -299,277 +319,395 @@ BEGIN_TEST(insert_nops.vmem_to_scalar_write) END_TEST BEGIN_TEST(insert_nops.lds_direct_valu) - if (!setup_cs(NULL, GFX11)) - return; + for (amd_gfx_level gfx : {GFX11, GFX12}) { + if (!setup_cs(NULL, gfx)) + continue; - /* WaW */ - //>> p_unit_test 0 - //! v1: %0:v[0] = v_mov_b32 0 - //! v1: %0:v[0] = lds_direct_load %0:m0 wait_vdst:0 - bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0)); - bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero()); - bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1)); + /* WaW */ + //>> p_unit_test 0 + //! v1: %0:v[0] = v_mov_b32 0 + //! v1: %0:v[0] = lds_direct_load %0:m0 wait_vdst:0 + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0)); + bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero()); + bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1)); - /* WaR */ - //! p_unit_test 1 - //! v1: %0:v[1] = v_mov_b32 %0:v[0] - //! v1: %0:v[0] = lds_direct_load %0:m0 wait_vdst:0 - bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1)); - bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1)); - bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1)); + /* WaR */ + //! p_unit_test 1 + //! v1: %0:v[1] = v_mov_b32 %0:v[0] + //! v1: %0:v[0] = lds_direct_load %0:m0 wait_vdst:0 + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1)); + bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1)); + bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1)); - /* No hazard. */ - //! p_unit_test 2 - //! v1: %0:v[1] = v_mov_b32 0 - //! v1: %0:v[0] = lds_direct_load %0:m0 - bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2)); - bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand::zero()); - bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1)); + /* No hazard. */ + //! p_unit_test 2 + //! v1: %0:v[1] = v_mov_b32 0 + //! v1: %0:v[0] = lds_direct_load %0:m0 + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2)); + bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand::zero()); + bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1)); - /* multiples hazards, nearest should be considered */ - //! p_unit_test 3 - //! v1: %0:v[1] = v_mov_b32 %0:v[0] - //! v1: %0:v[0] = v_mov_b32 0 - //! v1: %0:v[0] = lds_direct_load %0:m0 wait_vdst:0 - bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3)); - bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1)); - bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero()); - bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1)); + /* multiples hazards, nearest should be considered */ + //! p_unit_test 3 + //! v1: %0:v[1] = v_mov_b32 %0:v[0] + //! v1: %0:v[0] = v_mov_b32 0 + //! v1: %0:v[0] = lds_direct_load %0:m0 wait_vdst:0 + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3)); + bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1)); + bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero()); + bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1)); - /* independent VALU increase wait_vdst */ - //! p_unit_test 4 - //! v1: %0:v[0] = v_mov_b32 0 - //! v_nop - //! v1: %0:v[0] = lds_direct_load %0:m0 wait_vdst:1 - bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4)); - bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero()); - bld.vop1(aco_opcode::v_nop); - bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1)); - - //! p_unit_test 5 - //! v1: %0:v[0] = v_mov_b32 0 - //; for i in range(10): insert_pattern('v_nop') - //! v1: %0:v[0] = lds_direct_load %0:m0 wait_vdst:10 - bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5)); - bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero()); - for (unsigned i = 0; i < 10; i++) + /* independent VALU increase wait_vdst */ + //! p_unit_test 4 + //! v1: %0:v[0] = v_mov_b32 0 + //! v_nop + //! v1: %0:v[0] = lds_direct_load %0:m0 wait_vdst:1 + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4)); + bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero()); bld.vop1(aco_opcode::v_nop); - bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1)); + bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1)); - //! p_unit_test 6 - //! v1: %0:v[0] = v_mov_b32 0 - //; for i in range(20): insert_pattern('v_nop') - //! v1: %0:v[0] = lds_direct_load %0:m0 - bld.pseudo(aco_opcode::p_unit_test, Operand::c32(6)); - bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero()); - for (unsigned i = 0; i < 20; i++) + //! p_unit_test 5 + //! v1: %0:v[0] = v_mov_b32 0 + //; for i in range(10): insert_pattern('v_nop') + //! v1: %0:v[0] = lds_direct_load %0:m0 wait_vdst:10 + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5)); + bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero()); + for (unsigned i = 0; i < 10; i++) + bld.vop1(aco_opcode::v_nop); + bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1)); + + //! p_unit_test 6 + //! v1: %0:v[0] = v_mov_b32 0 + //; for i in range(20): insert_pattern('v_nop') + //! v1: %0:v[0] = lds_direct_load %0:m0 + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(6)); + bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero()); + for (unsigned i = 0; i < 20; i++) + bld.vop1(aco_opcode::v_nop); + bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1)); + + /* transcendental requires wait_vdst=0 */ + //! p_unit_test 7 + //! v1: %0:v[0] = v_mov_b32 0 + //! v_nop + //! v1: %0:v[1] = v_sqrt_f32 %0:v[1] + //! v1: %0:v[0] = lds_direct_load %0:m0 wait_vdst:0 + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(7)); + bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero()); bld.vop1(aco_opcode::v_nop); - bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1)); + bld.vop1(aco_opcode::v_sqrt_f32, Definition(PhysReg(257), v1), Operand(PhysReg(257), v1)); + bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1)); - /* transcendental requires wait_vdst=0 */ - //! p_unit_test 7 - //! v1: %0:v[0] = v_mov_b32 0 - //! v_nop - //! v1: %0:v[1] = v_sqrt_f32 %0:v[1] - //! v1: %0:v[0] = lds_direct_load %0:m0 wait_vdst:0 - bld.pseudo(aco_opcode::p_unit_test, Operand::c32(7)); - bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero()); - bld.vop1(aco_opcode::v_nop); - bld.vop1(aco_opcode::v_sqrt_f32, Definition(PhysReg(257), v1), Operand(PhysReg(257), v1)); - bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1)); + //! p_unit_test 8 + //! v1: %0:v[0] = v_sqrt_f32 %0:v[0] + //! v_nop + //! v1: %0:v[0] = lds_direct_load %0:m0 wait_vdst:0 + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(8)); + bld.vop1(aco_opcode::v_sqrt_f32, Definition(PhysReg(256), v1), Operand(PhysReg(256), v1)); + bld.vop1(aco_opcode::v_nop); + bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1)); - //! p_unit_test 8 - //! v1: %0:v[0] = v_sqrt_f32 %0:v[0] - //! v_nop - //! v1: %0:v[0] = lds_direct_load %0:m0 wait_vdst:0 - bld.pseudo(aco_opcode::p_unit_test, Operand::c32(8)); - bld.vop1(aco_opcode::v_sqrt_f32, Definition(PhysReg(256), v1), Operand(PhysReg(256), v1)); - bld.vop1(aco_opcode::v_nop); - bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1)); + /* transcendental is fine if it's before the instruction */ + //! p_unit_test 9 + //! v1: %0:v[1] = v_sqrt_f32 %0:v[1] + //! v1: %0:v[0] = v_mov_b32 0 + //! v_nop + //! v1: %0:v[0] = lds_direct_load %0:m0 wait_vdst:1 + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(9)); + bld.vop1(aco_opcode::v_sqrt_f32, Definition(PhysReg(257), v1), Operand(PhysReg(257), v1)); + bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero()); + bld.vop1(aco_opcode::v_nop); + bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1)); - /* transcendental is fine if it's before the instruction */ - //! p_unit_test 9 - //! v1: %0:v[1] = v_sqrt_f32 %0:v[1] - //! v1: %0:v[0] = v_mov_b32 0 - //! v_nop - //! v1: %0:v[0] = lds_direct_load %0:m0 wait_vdst:1 - bld.pseudo(aco_opcode::p_unit_test, Operand::c32(9)); - bld.vop1(aco_opcode::v_sqrt_f32, Definition(PhysReg(257), v1), Operand(PhysReg(257), v1)); - bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero()); - bld.vop1(aco_opcode::v_nop); - bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1)); + /* non-VALU does not increase wait_vdst */ + //! p_unit_test 10 + //! v1: %0:v[0] = v_mov_b32 0 + //! s1: %0:m0 = s_mov_b32 0 + //! v1: %0:v[0] = lds_direct_load %0:m0 wait_vdst:0 + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(10)); + bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero()); + bld.sop1(aco_opcode::s_mov_b32, Definition(m0, s1), Operand::zero()); + bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1)); - /* non-VALU does not increase wait_vdst */ - //! p_unit_test 10 - //! v1: %0:v[0] = v_mov_b32 0 - //! s1: %0:m0 = s_mov_b32 0 - //! v1: %0:v[0] = lds_direct_load %0:m0 wait_vdst:0 - bld.pseudo(aco_opcode::p_unit_test, Operand::c32(10)); - bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero()); - bld.sop1(aco_opcode::s_mov_b32, Definition(m0, s1), Operand::zero()); - bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1)); + /* consider instructions which wait on vdst */ + //! p_unit_test 11 + //! v1: %0:v[0] = v_mov_b32 0 + //! v_nop + //! s_waitcnt_depctr va_vdst(0) + //! v1: %0:v[0] = lds_direct_load %0:m0 + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(11)); + bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero()); + bld.vop1(aco_opcode::v_nop); + bld.sopp(aco_opcode::s_waitcnt_depctr, 0x0fff); + bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1)); - /* consider instructions which wait on vdst */ - //! p_unit_test 11 - //! v1: %0:v[0] = v_mov_b32 0 - //! v_nop - //! s_waitcnt_depctr va_vdst(0) - //! v1: %0:v[0] = lds_direct_load %0:m0 - bld.pseudo(aco_opcode::p_unit_test, Operand::c32(11)); - bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero()); - bld.vop1(aco_opcode::v_nop); - bld.sopp(aco_opcode::s_waitcnt_depctr, 0x0fff); - bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1)); - - finish_insert_nops_test(); + finish_insert_nops_test(); + } END_TEST BEGIN_TEST(insert_nops.lds_direct_vmem) - if (!setup_cs(NULL, GFX11)) - return; + for (amd_gfx_level gfx : {GFX11, GFX12}) { + if (!setup_cs(NULL, gfx)) + continue; - /* WaR: VMEM */ - //>> p_unit_test 0 - //! v1: %0:v[1] = buffer_load_dword %0:s[0-3], %0:v[0], 0 offen - //! s_waitcnt_depctr vm_vsrc(0) - //! v1: %0:v[0] = lds_direct_load %0:m0 - bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0)); - create_mubuf(0, PhysReg(257)); - bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1)); + /* WaR: VMEM */ + //>> p_unit_test 0 + //! v1: %0:v[1] = buffer_load_dword %0:s[0-3], %0:v[0], 0 offen + //~gfx11! s_waitcnt_depctr vm_vsrc(0) + //~gfx11! v1: %0:v[0] = lds_direct_load %0:m0 + //~gfx12! v1: %0:v[0] = lds_direct_load %0:m0 wait_vsrc:0 + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0)); + create_mubuf(0, PhysReg(257)); + bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1)); - /* WaW: VMEM */ - //! p_unit_test 1 - //! v1: %0:v[0] = buffer_load_dword %0:s[0-3], %0:v[1], 0 offen - //! s_waitcnt_depctr vm_vsrc(0) - //! v1: %0:v[0] = lds_direct_load %0:m0 - bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1)); - create_mubuf(0, PhysReg(256), PhysReg(257)); - bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1)); + /* WaW: VMEM */ + //! p_unit_test 1 + //! v1: %0:v[0] = buffer_load_dword %0:s[0-3], %0:v[1], 0 offen + //~gfx11! s_waitcnt_depctr vm_vsrc(0) + //~gfx11! v1: %0:v[0] = lds_direct_load %0:m0 + //~gfx12! v1: %0:v[0] = lds_direct_load %0:m0 wait_vsrc:0 + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1)); + create_mubuf(0, PhysReg(256), PhysReg(257)); + bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1)); - /* no hazard: VMEM */ - //! p_unit_test 2 - //! v1: %0:v[1] = buffer_load_dword %0:s[0-3], %0:v[1], 0 offen - //! v1: %0:v[0] = lds_direct_load %0:m0 - bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2)); - create_mubuf(0, PhysReg(257), PhysReg(257)); - bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1)); + /* no hazard: VMEM */ + //! p_unit_test 2 + //! v1: %0:v[1] = buffer_load_dword %0:s[0-3], %0:v[1], 0 offen + //! v1: %0:v[0] = lds_direct_load %0:m0 + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2)); + create_mubuf(0, PhysReg(257), PhysReg(257)); + bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1)); - /* no hazard: VMEM with VALU in-between */ - //! p_unit_test 3 - //! v1: %0:v[1] = buffer_load_dword %0:s[0-3], %0:v[0], 0 offen - //! v_nop - //! v1: %0:v[0] = lds_direct_load %0:m0 - bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3)); - create_mubuf(0, PhysReg(257)); - bld.vop1(aco_opcode::v_nop); - bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1)); + /* no hazard: VMEM with VALU in-between */ + //! p_unit_test 3 + //! v1: %0:v[1] = buffer_load_dword %0:s[0-3], %0:v[0], 0 offen + //! v_nop + //! v1: %0:v[0] = lds_direct_load %0:m0 + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3)); + create_mubuf(0, PhysReg(257)); + bld.vop1(aco_opcode::v_nop); + bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1)); - /* WaR: LDS */ - //! p_unit_test 4 - //! v1: %0:v[1] = ds_read_b32 %0:v[0] - //! s_waitcnt_depctr vm_vsrc(0) - //! v1: %0:v[0] = lds_direct_load %0:m0 - bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4)); - bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1)); - bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1)); + /* WaR: LDS */ + //! p_unit_test 4 + //! v1: %0:v[1] = ds_read_b32 %0:v[0] + //~gfx11! s_waitcnt_depctr vm_vsrc(0) + //~gfx11! v1: %0:v[0] = lds_direct_load %0:m0 + //~gfx12! v1: %0:v[0] = lds_direct_load %0:m0 wait_vsrc:0 + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4)); + bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1)); + bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1)); - /* WaW: LDS */ - //! p_unit_test 5 - //! v1: %0:v[0] = ds_read_b32 %0:v[1] - //! s_waitcnt_depctr vm_vsrc(0) - //! v1: %0:v[0] = lds_direct_load %0:m0 - bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5)); - bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(256), v1), Operand(PhysReg(257), v1)); - bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1)); + /* WaW: LDS */ + //! p_unit_test 5 + //! v1: %0:v[0] = ds_read_b32 %0:v[1] + //~gfx11! s_waitcnt_depctr vm_vsrc(0) + //~gfx11! v1: %0:v[0] = lds_direct_load %0:m0 + //~gfx12! v1: %0:v[0] = lds_direct_load %0:m0 wait_vsrc:0 + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5)); + bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(256), v1), Operand(PhysReg(257), v1)); + bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1)); - /* no hazard: LDS */ - //! p_unit_test 6 - //! v1: %0:v[1] = ds_read_b32 %0:v[1] - //! v1: %0:v[0] = lds_direct_load %0:m0 - bld.pseudo(aco_opcode::p_unit_test, Operand::c32(6)); - bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(257), v1), Operand(PhysReg(257), v1)); - bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1)); + /* no hazard: LDS */ + //! p_unit_test 6 + //! v1: %0:v[1] = ds_read_b32 %0:v[1] + //! v1: %0:v[0] = lds_direct_load %0:m0 + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(6)); + bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(257), v1), Operand(PhysReg(257), v1)); + bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1)); - /* no hazard: LDS with VALU in-between */ - //! p_unit_test 7 - //! v1: %0:v[1] = ds_read_b32 %0:v[0] - //! v_nop - //! v1: %0:v[0] = lds_direct_load %0:m0 - bld.pseudo(aco_opcode::p_unit_test, Operand::c32(7)); - bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1)); - bld.vop1(aco_opcode::v_nop); - bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1)); + /* no hazard: LDS with VALU in-between */ + //! p_unit_test 7 + //! v1: %0:v[1] = ds_read_b32 %0:v[0] + //! v_nop + //! v1: %0:v[0] = lds_direct_load %0:m0 + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(7)); + bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1)); + bld.vop1(aco_opcode::v_nop); + bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1)); - /* no hazard: VMEM/LDS with the correct waitcnt in-between */ - //! p_unit_test 8 - //! v1: %0:v[1] = buffer_load_dword %0:s[0-3], %0:v[0], 0 offen - //! s_waitcnt vmcnt(0) - //! v1: %0:v[0] = lds_direct_load %0:m0 - bld.pseudo(aco_opcode::p_unit_test, Operand::c32(8)); - create_mubuf(0, PhysReg(257)); - bld.sopp(aco_opcode::s_waitcnt, 0x3ff); - bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1)); + /* no hazard: VMEM/LDS with the correct waitcnt in-between */ + //! p_unit_test 8 + //! v1: %0:v[1] = buffer_load_dword %0:s[0-3], %0:v[0], 0 offen + //~gfx11! s_waitcnt vmcnt(0) + //~gfx12! s_wait_loadcnt imm:0 + //! v1: %0:v[0] = lds_direct_load %0:m0 + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(8)); + create_mubuf(0, PhysReg(257)); + if (gfx >= GFX12) + bld.sopp(aco_opcode::s_wait_loadcnt, 0); + else + bld.sopp(aco_opcode::s_waitcnt, 0x3ff); + bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1)); - //! p_unit_test 9 - //! buffer_store_dword %0:s[0-3], %0:v[0], 0, %0:v[0] offen - //! s_waitcnt_vscnt %0:null imm:0 - //! v1: %0:v[0] = lds_direct_load %0:m0 - bld.pseudo(aco_opcode::p_unit_test, Operand::c32(9)); - create_mubuf_store(); - bld.sopk(aco_opcode::s_waitcnt_vscnt, Operand(sgpr_null, s1), 0); - bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1)); + //! p_unit_test 9 + //! buffer_store_dword %0:s[0-3], %0:v[0], 0, %0:v[0] offen + //~gfx11! s_waitcnt_vscnt %0:null imm:0 + //~gfx12! s_wait_storecnt imm:0 + //! v1: %0:v[0] = lds_direct_load %0:m0 + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(9)); + create_mubuf_store(); + if (gfx >= GFX12) + bld.sopp(aco_opcode::s_wait_storecnt, 0); + else + bld.sopk(aco_opcode::s_waitcnt_vscnt, Operand(sgpr_null, s1), 0); + bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1)); - //! p_unit_test 10 - //! v1: %0:v[1] = ds_read_b32 %0:v[0] - //! s_waitcnt lgkmcnt(0) - //! v1: %0:v[0] = lds_direct_load %0:m0 - bld.pseudo(aco_opcode::p_unit_test, Operand::c32(10)); - bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1)); - bld.sopp(aco_opcode::s_waitcnt, 0xfc0f); - bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1)); + //! p_unit_test 10 + //! v1: %0:v[1] = ds_read_b32 %0:v[0] + //~gfx11! s_waitcnt lgkmcnt(0) + //~gfx12! s_wait_dscnt imm:0 + //! v1: %0:v[0] = lds_direct_load %0:m0 + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(10)); + bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1)); + if (gfx >= GFX12) + bld.sopp(aco_opcode::s_wait_dscnt, 0); + else + bld.sopp(aco_opcode::s_waitcnt, 0xfc0f); + bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1)); - /* VMEM/LDS with the wrong waitcnt in-between */ - //! p_unit_test 11 - //! v1: %0:v[1] = buffer_load_dword %0:s[0-3], %0:v[0], 0 offen - //! s_waitcnt_vscnt %0:null imm:0 - //! s_waitcnt_depctr vm_vsrc(0) - //! v1: %0:v[0] = lds_direct_load %0:m0 - bld.pseudo(aco_opcode::p_unit_test, Operand::c32(11)); - create_mubuf(0, PhysReg(257)); - bld.sopk(aco_opcode::s_waitcnt_vscnt, Operand(sgpr_null, s1), 0); - bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1)); + if (gfx >= GFX12) { + //~gfx12! p_unit_test 11 + //~gfx12! v1: %0:v[1] = image_load %0:s[0-7], s4: undef, v1: undef, %0:v[0-1] 2d + //~gfx12! s_wait_loadcnt imm:0 + //~gfx12! v1: %0:v[0] = lds_direct_load %0:m0 + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(11)); + Instruction* instr = + bld.mimg(aco_opcode::image_load, Definition(PhysReg(257), v1), Operand(PhysReg(0), s8), + Operand(s4), Operand(v1), Operand(PhysReg(256), v2)) + .instr; + instr->mimg().dmask = 0x1; + instr->mimg().dim = ac_image_2d; + bld.sopp(aco_opcode::s_wait_loadcnt, 0); + bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1)); - //! p_unit_test 12 - //! buffer_store_dword %0:s[0-3], %0:v[0], 0, %0:v[0] offen - //! s_waitcnt lgkmcnt(0) - //! s_waitcnt_depctr vm_vsrc(0) - //! v1: %0:v[0] = lds_direct_load %0:m0 - bld.pseudo(aco_opcode::p_unit_test, Operand::c32(12)); - create_mubuf_store(); - bld.sopp(aco_opcode::s_waitcnt, 0xfc0f); - bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1)); + //~gfx12! p_unit_test 12 + //~gfx12! v1: %0:v[1] = image_sample %0:s[0-7], %0:s[0-3], v1: undef, %0:v[0-1] 2d + //~gfx12! s_wait_samplecnt imm:0 + //~gfx12! v1: %0:v[0] = lds_direct_load %0:m0 + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(12)); + instr = bld.mimg(aco_opcode::image_sample, Definition(PhysReg(257), v1), + Operand(PhysReg(0), s8), Operand(PhysReg(0), s4), Operand(v1), + Operand(PhysReg(256), v2)) + .instr; + instr->mimg().dmask = 0x1; + instr->mimg().dim = ac_image_2d; + bld.sopp(aco_opcode::s_wait_samplecnt, 0); + bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1)); - //! p_unit_test 13 - //! v1: %0:v[1] = ds_read_b32 %0:v[0] - //! s_waitcnt vmcnt(0) - //! s_waitcnt_depctr vm_vsrc(0) - //! v1: %0:v[0] = lds_direct_load %0:m0 - bld.pseudo(aco_opcode::p_unit_test, Operand::c32(13)); - bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1)); - bld.sopp(aco_opcode::s_waitcnt, 0x3ff); - bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1)); + //~gfx12! p_unit_test 13 + //~gfx12! v4: %0:v[0-3] = image_bvh64_intersect_ray %0:s[0-3], s4: undef, v1: undef, %0:v[0-1], %0:v[2], %0:v[3-5], %0:v[6-8], %0:v[9-11] 1d unrm r128 + //~gfx12! s_wait_bvhcnt imm:0 + //~gfx12! v1: %0:v[0] = lds_direct_load %0:m0 + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(13)); + create_bvh(); + bld.sopp(aco_opcode::s_wait_bvhcnt, 0); + bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1)); + } - //! p_unit_test 14 - //! v1: %0:v[0] = buffer_load_dword %0:s[0-3], %0:v[1], 0 offen - //! s_waitcnt_vscnt %0:null imm:0 - //! s_waitcnt_depctr vm_vsrc(0) - //! v1: %0:v[0] = lds_direct_load %0:m0 - bld.pseudo(aco_opcode::p_unit_test, Operand::c32(14)); - create_mubuf(0, PhysReg(256), PhysReg(257)); - bld.sopk(aco_opcode::s_waitcnt_vscnt, Operand(sgpr_null, s1), 0); - bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1)); + /* VMEM/LDS with the wrong waitcnt in-between */ + //! p_unit_test 14 + //! v1: %0:v[1] = buffer_load_dword %0:s[0-3], %0:v[0], 0 offen + //~gfx11! s_waitcnt_vscnt %0:null imm:0 + //~gfx11! s_waitcnt_depctr vm_vsrc(0) + //~gfx11! v1: %0:v[0] = lds_direct_load %0:m0 + //~gfx12! s_wait_storecnt imm:0 + //~gfx12! v1: %0:v[0] = lds_direct_load %0:m0 wait_vsrc:0 + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(14)); + create_mubuf(0, PhysReg(257)); + if (gfx >= GFX12) + bld.sopp(aco_opcode::s_wait_storecnt, 0); + else + bld.sopk(aco_opcode::s_waitcnt_vscnt, Operand(sgpr_null, s1), 0); + bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1)); - finish_insert_nops_test(); + //! p_unit_test 15 + //! buffer_store_dword %0:s[0-3], %0:v[0], 0, %0:v[0] offen + //~gfx11! s_waitcnt lgkmcnt(0) + //~gfx11! s_waitcnt_depctr vm_vsrc(0) + //~gfx11! v1: %0:v[0] = lds_direct_load %0:m0 + //~gfx12! s_wait_dscnt imm:0 + //~gfx12! v1: %0:v[0] = lds_direct_load %0:m0 wait_vsrc:0 + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(15)); + create_mubuf_store(); + if (gfx >= GFX12) + bld.sopp(aco_opcode::s_wait_dscnt, 0); + else + bld.sopp(aco_opcode::s_waitcnt, 0xfc0f); + bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1)); + + //! p_unit_test 16 + //! v1: %0:v[1] = ds_read_b32 %0:v[0] + //~gfx11! s_waitcnt vmcnt(0) + //~gfx11! s_waitcnt_depctr vm_vsrc(0) + //~gfx11! v1: %0:v[0] = lds_direct_load %0:m0 + //~gfx12! s_wait_loadcnt imm:0 + //~gfx12! v1: %0:v[0] = lds_direct_load %0:m0 wait_vsrc:0 + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(16)); + bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1)); + if (gfx >= GFX12) + bld.sopp(aco_opcode::s_wait_loadcnt, 0); + else + bld.sopp(aco_opcode::s_waitcnt, 0x3ff); + bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1)); + + //! p_unit_test 17 + //! v1: %0:v[0] = buffer_load_dword %0:s[0-3], %0:v[1], 0 offen + //~gfx11! s_waitcnt_vscnt %0:null imm:0 + //~gfx11! s_waitcnt_depctr vm_vsrc(0) + //~gfx11! v1: %0:v[0] = lds_direct_load %0:m0 + //~gfx12! s_wait_storecnt imm:0 + //~gfx12! v1: %0:v[0] = lds_direct_load %0:m0 wait_vsrc:0 + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(17)); + create_mubuf(0, PhysReg(256), PhysReg(257)); + if (gfx >= GFX12) + bld.sopp(aco_opcode::s_wait_storecnt, 0); + else + bld.sopk(aco_opcode::s_waitcnt_vscnt, Operand(sgpr_null, s1), 0); + bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1)); + + if (gfx >= GFX12) { + //~gfx12! p_unit_test 18 + //~gfx12! v1: %0:v[1] = image_load %0:s[0-7], s4: undef, v1: undef, %0:v[0-1] 2d + //~gfx12! s_wait_samplecnt imm:0 + //~gfx12! v1: %0:v[0] = lds_direct_load %0:m0 wait_vsrc:0 + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(18)); + Instruction* instr = + bld.mimg(aco_opcode::image_load, Definition(PhysReg(257), v1), Operand(PhysReg(0), s8), + Operand(s4), Operand(v1), Operand(PhysReg(256), v2)) + .instr; + instr->mimg().dmask = 0x1; + instr->mimg().dim = ac_image_2d; + bld.sopp(aco_opcode::s_wait_samplecnt, 0); + bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1)); + + //~gfx12! p_unit_test 19 + //~gfx12! v1: %0:v[1] = image_sample %0:s[0-7], %0:s[0-3], v1: undef, %0:v[0-1] 2d + //~gfx12! s_wait_loadcnt imm:0 + //~gfx12! v1: %0:v[0] = lds_direct_load %0:m0 wait_vsrc:0 + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(19)); + instr = bld.mimg(aco_opcode::image_sample, Definition(PhysReg(257), v1), + Operand(PhysReg(0), s8), Operand(PhysReg(0), s4), Operand(v1), + Operand(PhysReg(256), v2)) + .instr; + instr->mimg().dmask = 0x1; + instr->mimg().dim = ac_image_2d; + bld.sopp(aco_opcode::s_wait_loadcnt, 0); + bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1)); + + //~gfx12! p_unit_test 20 + //~gfx12! v4: %0:v[0-3] = image_bvh64_intersect_ray %0:s[0-3], s4: undef, v1: undef, %0:v[0-1], %0:v[2], %0:v[3-5], %0:v[6-8], %0:v[9-11] 1d unrm r128 + //~gfx12! s_wait_loadcnt imm:0 + //~gfx12! v1: %0:v[0] = lds_direct_load %0:m0 wait_vsrc:0 + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(20)); + create_bvh(); + bld.sopp(aco_opcode::s_wait_loadcnt, 0); + bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1)); + } + + finish_insert_nops_test(); + } END_TEST BEGIN_TEST(insert_nops.valu_trans_use) @@ -1310,4 +1448,44 @@ BEGIN_TEST(insert_nops.setpc_gfx11) bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8)); finish_insert_nops_test(true); -} +END_TEST + +BEGIN_TEST(insert_nops.setpc_gfx12) + if (!setup_cs(NULL, GFX12)) + return; + + //>> p_unit_test 0 + //! s_setpc_b64 0 + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0)); + bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8)); + + /* LdsDirectVALUHazard */ + //! p_unit_test 1 + //! s2: %0:vcc = v_cmp_eq_u32 %0:v[0], 0 + //! s_waitcnt_depctr va_vdst(0) + //! s_setpc_b64 0 + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1)); + bld.vopc_e64(aco_opcode::v_cmp_eq_u32, Definition(vcc, s2), Operand(PhysReg(256), v1), + Operand::zero()); + bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8)); + + /* VcmpxPermlaneHazard */ + //! p_unit_test 2 + //! s2: %0:exec = v_cmpx_eq_u32 0, 0 + //! v_nop + //! s_setpc_b64 0 + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2)); + bld.vopc_e64(aco_opcode::v_cmpx_eq_u32, Definition(exec, s2), Operand::zero(), Operand::zero()); + bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8)); + + /* LdsDirectVMEMHazard */ + //! p_unit_test 3 + //! v1: %0:v[0] = ds_read_b32 %0:v[0] + //! s_waitcnt_depctr vm_vsrc(0) + //! s_setpc_b64 0 + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3)); + bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(256), v1), Operand(PhysReg(256), v1)); + bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8)); + + finish_insert_nops_test(true); +END_TEST