diff --git a/src/amd/compiler/aco_insert_NOPs.cpp b/src/amd/compiler/aco_insert_NOPs.cpp index a2142444110..8f8101743e2 100644 --- a/src/amd/compiler/aco_insert_NOPs.cpp +++ b/src/amd/compiler/aco_insert_NOPs.cpp @@ -1395,22 +1395,14 @@ handle_instruction_gfx11(State& state, NOP_ctx_gfx11& ctx, aco_ptr& } depctr_wait wait = parse_depctr_wait(instr.get()); - unsigned va_vdst = wait.va_vdst; - unsigned vm_vsrc = 7; - unsigned sa_sdst = 1; + if (debug_flags & DEBUG_FORCE_WAITDEPS) + wait = parse_depctr_wait(bld.sopp(aco_opcode::s_waitcnt_depctr, 0x0000)); + else if (instr->isLDSDIR() && state.program->gfx_level >= GFX12) + wait.vm_vsrc = instr->ldsdir().wait_vsrc ? 7 : 0; - if (debug_flags & DEBUG_FORCE_WAITDEPS) { - bld.sopp(aco_opcode::s_waitcnt_depctr, 0x0000); - va_vdst = 0; - vm_vsrc = 0; - sa_sdst = 0; - } else if (instr->opcode == aco_opcode::s_waitcnt_depctr) { - /* va_vdst already obtained through parse_depctr_wait(). */ - vm_vsrc = (instr->salu().imm >> 2) & 0x7; - sa_sdst = instr->salu().imm & 0x1; - } else if (instr->isLDSDIR() && state.program->gfx_level >= GFX12) { - vm_vsrc = instr->ldsdir().wait_vsrc ? 7 : 0; - } + unsigned va_vdst = wait.va_vdst; + unsigned vm_vsrc = wait.vm_vsrc; + unsigned sa_sdst = wait.sa_sdst; if (instr->isLDSDIR()) { unsigned count = handle_lds_direct_valu_hazard(state, instr); diff --git a/src/amd/compiler/tests/test_insert_nops.cpp b/src/amd/compiler/tests/test_insert_nops.cpp index f1e12524ca5..96b372bbe81 100644 --- a/src/amd/compiler/tests/test_insert_nops.cpp +++ b/src/amd/compiler/tests/test_insert_nops.cpp @@ -2224,18 +2224,28 @@ BEGIN_TEST(insert_nops.setpc_gfx12) //! p_unit_test 9 //! v1: %0:v[0] = v_mov_b32 %0:s[4] - //! v1: %0:v[1] = v_mov_b32 %0:s[5] //! v1: %0:v[2] = v_mov_b32 %0:vcc_lo - //! s1: %0:s[4] = s_mov_b32 0 - //! s1: %0:s[5] = v_readfirstlane_b32 %0:v[0] //! s1: %0:vcc_lo = v_readfirstlane_b32 %0:v[1] - //! s_waitcnt_depctr va_vdst(0) va_sdst(0) va_vcc(0) sa_sdst(0) + //! s1: %0:s[4] = s_mov_b32 0 + //! s_waitcnt_depctr va_vdst(0) va_vcc(0) sa_sdst(0) //! s_setpc_b64 0 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(9)); bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand(PhysReg(4), s1)); + bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(258), v1), Operand(PhysReg(vcc), s1)); + bld.vop1(aco_opcode::v_readfirstlane_b32, Definition(vcc, s1), Operand(PhysReg(257), v1)); + bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(4), s1), Operand::zero(4)); + bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8)); + + //! p_unit_test 10 + //! v1: %0:v[1] = v_mov_b32 %0:s[5] + //! v1: %0:v[2] = v_mov_b32 %0:vcc_lo + //! s1: %0:s[5] = v_readfirstlane_b32 %0:v[0] + //! s1: %0:vcc_lo = v_readfirstlane_b32 %0:v[1] + //! s_waitcnt_depctr va_vdst(0) va_sdst(0) va_vcc(0) + //! s_setpc_b64 0 + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(10)); bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand(PhysReg(5), s1)); bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(258), v1), Operand(PhysReg(vcc), s1)); - bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(4), s1), Operand::zero(4)); bld.vop1(aco_opcode::v_readfirstlane_b32, Definition(PhysReg(5), s1), Operand(PhysReg(256), v1)); bld.vop1(aco_opcode::v_readfirstlane_b32, Definition(vcc, s1), Operand(PhysReg(257), v1)); bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));