diff --git a/src/amd/compiler/aco_insert_waitcnt.cpp b/src/amd/compiler/aco_insert_waitcnt.cpp index e6263d6f191..345ba6047bd 100644 --- a/src/amd/compiler/aco_insert_waitcnt.cpp +++ b/src/amd/compiler/aco_insert_waitcnt.cpp @@ -666,43 +666,7 @@ void emit_waitcnt(wait_ctx& ctx, std::vector<aco_ptr<Instruction>>& instructions, wait_imm& imm) { Builder bld(ctx.program, &instructions); - - if (ctx.gfx_level >= GFX12) { - if (imm.vm != wait_imm::unset_counter && imm.lgkm != wait_imm::unset_counter) { - bld.sopp(aco_opcode::s_wait_loadcnt_dscnt, (imm.vm << 8) | imm.lgkm); - imm.vm = wait_imm::unset_counter; - imm.lgkm = wait_imm::unset_counter; - } - - if (imm.vs != wait_imm::unset_counter && imm.lgkm != wait_imm::unset_counter) { - bld.sopp(aco_opcode::s_wait_storecnt_dscnt, (imm.vs << 8) | imm.lgkm); - imm.vs = wait_imm::unset_counter; - imm.lgkm = wait_imm::unset_counter; - } - - aco_opcode op[wait_type_num]; - op[wait_type_exp] = aco_opcode::s_wait_expcnt; - op[wait_type_lgkm] = aco_opcode::s_wait_dscnt; - op[wait_type_vm] = aco_opcode::s_wait_loadcnt; - op[wait_type_vs] = aco_opcode::s_wait_storecnt; - op[wait_type_sample] = aco_opcode::s_wait_samplecnt; - op[wait_type_bvh] = aco_opcode::s_wait_bvhcnt; - op[wait_type_km] = aco_opcode::s_wait_kmcnt; - - for (unsigned i = 0; i < wait_type_num; i++) { - if (imm[i] != wait_imm::unset_counter) - bld.sopp(op[i], imm[i]); - } - } else { - if (imm.vs != wait_imm::unset_counter) { - assert(ctx.gfx_level >= GFX10); - bld.sopk(aco_opcode::s_waitcnt_vscnt, Operand(sgpr_null, s1), imm.vs); - imm.vs = wait_imm::unset_counter; - } - if (!imm.empty()) - bld.sopp(aco_opcode::s_waitcnt, imm.pack(ctx.gfx_level)); - } - imm = wait_imm(); + imm.build_waitcnt(bld); } bool diff --git a/src/amd/compiler/aco_ir.cpp b/src/amd/compiler/aco_ir.cpp index 2c0b17a82ca..022dd60e87c 100644 --- a/src/amd/compiler/aco_ir.cpp +++ b/src/amd/compiler/aco_ir.cpp @@ -1328,6 +1328,50 @@ wait_imm::print(FILE* output) const } } +void +wait_imm::build_waitcnt(Builder& bld) +{ + 
enum amd_gfx_level gfx_level = bld.program->gfx_level; + + if (gfx_level >= GFX12) { + if (vm != wait_imm::unset_counter && lgkm != wait_imm::unset_counter) { + bld.sopp(aco_opcode::s_wait_loadcnt_dscnt, (vm << 8) | lgkm); + vm = wait_imm::unset_counter; + lgkm = wait_imm::unset_counter; + } + + if (vs != wait_imm::unset_counter && lgkm != wait_imm::unset_counter) { + bld.sopp(aco_opcode::s_wait_storecnt_dscnt, (vs << 8) | lgkm); + vs = wait_imm::unset_counter; + lgkm = wait_imm::unset_counter; + } + + aco_opcode op[wait_type_num]; + op[wait_type_exp] = aco_opcode::s_wait_expcnt; + op[wait_type_lgkm] = aco_opcode::s_wait_dscnt; + op[wait_type_vm] = aco_opcode::s_wait_loadcnt; + op[wait_type_vs] = aco_opcode::s_wait_storecnt; + op[wait_type_sample] = aco_opcode::s_wait_samplecnt; + op[wait_type_bvh] = aco_opcode::s_wait_bvhcnt; + op[wait_type_km] = aco_opcode::s_wait_kmcnt; + + for (unsigned i = 0; i < wait_type_num; i++) { + if ((*this)[i] != wait_imm::unset_counter) + bld.sopp(op[i], (*this)[i]); + } + } else { + if (vs != wait_imm::unset_counter) { + assert(gfx_level >= GFX10); + bld.sopk(aco_opcode::s_waitcnt_vscnt, Operand(sgpr_null, s1), vs); + vs = wait_imm::unset_counter; + } + if (!empty()) + bld.sopp(aco_opcode::s_waitcnt, pack(gfx_level)); + } + + *this = wait_imm(); +} + bool should_form_clause(const Instruction* a, const Instruction* b) { diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h index 2925a2fcdeb..6848f721eb4 100644 --- a/src/amd/compiler/aco_ir.h +++ b/src/amd/compiler/aco_ir.h @@ -182,6 +182,7 @@ enum wait_type { }; struct Instruction; +class Builder; struct wait_imm { static const uint8_t unset_counter = 0xff; @@ -209,6 +210,8 @@ struct wait_imm { void print(FILE* output) const; + void build_waitcnt(Builder& bld); + uint8_t& operator[](size_t i) { assert(i < wait_type_num); diff --git a/src/amd/compiler/aco_lower_to_hw_instr.cpp b/src/amd/compiler/aco_lower_to_hw_instr.cpp index 0e18aa66069..9d7c895f195 100644 --- 
a/src/amd/compiler/aco_lower_to_hw_instr.cpp +++ b/src/amd/compiler/aco_lower_to_hw_instr.cpp @@ -2380,13 +2380,13 @@ lower_to_hw_instr(Program* program) * the waitcnt necessary before resuming overlapping waves as the normal * waitcnt insertion doesn't work in a discard early exit block. */ - if (program->gfx_level >= GFX10) - bld.sopk(aco_opcode::s_waitcnt_vscnt, Operand(sgpr_null, s1), 0); wait_imm pops_exit_wait_imm; + if (program->gfx_level >= GFX10) + pops_exit_wait_imm.vs = 0; pops_exit_wait_imm.vm = 0; if (program->has_smem_buffer_or_global_loads) pops_exit_wait_imm.lgkm = 0; - bld.sopp(aco_opcode::s_waitcnt, pops_exit_wait_imm.pack(program->gfx_level)); + pops_exit_wait_imm.build_waitcnt(bld); } if (discard_sends_pops_done) bld.sopp(aco_opcode::s_sendmsg, sendmsg_ordered_ps_done);