diff --git a/src/amd/compiler/aco_insert_waitcnt.cpp b/src/amd/compiler/aco_insert_waitcnt.cpp index b9d25db4f1c..277228f6b83 100644 --- a/src/amd/compiler/aco_insert_waitcnt.cpp +++ b/src/amd/compiler/aco_insert_waitcnt.cpp @@ -39,7 +39,7 @@ namespace { /* Instructions of the same event will finish in-order except for smem * and maybe flat. Instructions of different events may not finish in-order. */ -enum wait_event : uint16_t { +enum wait_event : uint32_t { event_smem = 1 << 0, event_lds = 1 << 1, event_gds = 1 << 2, @@ -53,10 +53,12 @@ enum wait_event : uint16_t { event_vmem_gpr_lock = 1 << 10, event_sendmsg = 1 << 11, event_ldsdir = 1 << 12, - event_valu = 1 << 13, - event_trans = 1 << 14, - event_salu = 1 << 15, - num_events = 16, + event_vmem_sample = 1 << 13, /* GFX12+ */ + event_vmem_bvh = 1 << 14, /* GFX12+ */ + event_valu = 1 << 15, + event_trans = 1 << 16, + event_salu = 1 << 17, + num_events = 18, }; enum counter_type : uint8_t { @@ -64,6 +66,9 @@ enum counter_type : uint8_t { counter_lgkm = 1 << wait_type_lgkm, counter_vm = 1 << wait_type_vm, counter_vs = 1 << wait_type_vs, + counter_sample = 1 << wait_type_sample, + counter_bvh = 1 << wait_type_bvh, + counter_km = 1 << wait_type_km, counter_alu = 1 << wait_type_num, num_counters = wait_type_num + 1, wait_counters = BITFIELD_MASK(wait_type_num), @@ -162,11 +167,11 @@ struct alu_delay_info { struct wait_entry { wait_imm imm; alu_delay_info delay; - uint16_t events; /* use wait_event notion */ + uint32_t events; /* use wait_event notion */ uint8_t counters; /* use counter_type notion */ bool wait_on_read : 1; bool logical : 1; - uint8_t vmem_types : 4; + uint8_t vmem_types : 4; /* use vmem_type notion. for counter_vm. */ wait_entry(wait_event event_, wait_imm imm_, alu_delay_info delay_, uint8_t counters_, bool logical_, bool wait_on_read_) @@ -243,6 +248,12 @@ struct target_info { events[wait_type_lgkm] = event_smem | event_lds | event_gds | event_flat | event_sendmsg; events[wait_type_vm] = event_vmem | event_flat; events[wait_type_vs] = event_vmem_store; + if (gfx_level >= GFX12) { + events[wait_type_sample] = event_vmem_sample; + events[wait_type_bvh] = event_vmem_bvh; + events[wait_type_km] = event_smem | event_sendmsg; + events[wait_type_lgkm] &= ~events[wait_type_km]; + } for (unsigned i = 0; i < wait_type_num; i++) { u_foreach_bit (j, events[i]) @@ -339,10 +350,12 @@ struct wait_ctx { }; uint8_t -get_vmem_type(Instruction* instr) +get_vmem_type(enum amd_gfx_level gfx_level, Instruction* instr) { if (instr->opcode == aco_opcode::image_bvh64_intersect_ray) return vmem_bvh; + else if (gfx_level >= GFX12 && instr->opcode == aco_opcode::image_msaa_load) + return vmem_sampler; else if (instr->isMIMG() && !instr->operands[1].isUndefined() && instr->operands[1].regClass() == s4) return vmem_sampler; @@ -351,6 +364,17 @@ get_vmem_type(Instruction* instr) return 0; } +wait_event +get_vmem_event(wait_ctx& ctx, Instruction* instr, uint8_t type) +{ + if (instr->definitions.empty() && ctx.gfx_level >= GFX10) + return event_vmem_store; + wait_event ev = event_vmem; + if (ctx.gfx_level >= GFX12 && type != vmem_nosampler) + ev = type == vmem_bvh ? event_vmem_bvh : event_vmem_sample; + return ev; +} + void check_instr(wait_ctx& ctx, wait_imm& wait, alu_delay_info& delay, Instruction* instr) { @@ -383,10 +407,11 @@ check_instr(wait_ctx& ctx, wait_imm& wait, alu_delay_info& delay, Instruction* i wait_imm reg_imm = it->second.imm; /* Vector Memory reads and writes return in the order they were issued */ - uint8_t vmem_type = get_vmem_type(instr); + uint8_t vmem_type = get_vmem_type(ctx.gfx_level, instr); if (vmem_type) { - wait_type type = (wait_type)(ffs(ctx.info->get_counters_for_event(event_vmem)) - 1); - if ((it->second.events & ctx.info->events[type]) == event_vmem && + uint32_t event = get_vmem_event(ctx, instr, vmem_type); + wait_type type = (wait_type)(ffs(ctx.info->get_counters_for_event(event)) - 1); + if ((it->second.events & ctx.info->events[type]) == event && (type != wait_type_vm || it->second.vmem_types == vmem_type)) reg_imm[type] = wait_imm::unset_counter; } @@ -701,7 +726,8 @@ insert_wait_entry(wait_ctx& ctx, PhysReg reg, RegClass rc, wait_event event, boo wait_entry new_entry(event, imm, delay, counters, !rc.is_linear() && !force_linear, wait_on_read); - new_entry.vmem_types |= vmem_types; + if (counters & counter_vm) + new_entry.vmem_types |= vmem_types; for (unsigned i = 0; i < rc.size(); i++) { auto it = ctx.gpr_map.emplace(PhysReg{reg.reg() + i}, new_entry); @@ -835,12 +861,13 @@ gen(Instruction* instr, wait_ctx& ctx) case Format::MIMG: case Format::GLOBAL: case Format::SCRATCH: { - wait_event ev = - !instr->definitions.empty() || ctx.gfx_level < GFX10 ? event_vmem : event_vmem_store; + uint8_t type = get_vmem_type(ctx.gfx_level, instr); + wait_event ev = get_vmem_event(ctx, instr, type); + update_counters(ctx, ev, get_sync_info(instr)); if (!instr->definitions.empty()) - insert_wait_entry(ctx, instr->definitions[0], ev, get_vmem_type(instr)); + insert_wait_entry(ctx, instr->definitions[0], ev, type); if (ctx.gfx_level == GFX6 && instr->format != Format::MIMG && instr->operands.size() == 4) { update_counters(ctx, event_vmem_gpr_lock); @@ -872,18 +899,42 @@ gen(Instruction* instr, wait_ctx& ctx) void emit_waitcnt(wait_ctx& ctx, std::vector>& instructions, wait_imm& imm) { - if (imm.vs != wait_imm::unset_counter) { - assert(ctx.gfx_level >= GFX10); - Instruction* waitcnt_vs = create_instruction(aco_opcode::s_waitcnt_vscnt, Format::SOPK, 1, 0); - waitcnt_vs->operands[0] = Operand(sgpr_null, s1); - waitcnt_vs->salu().imm = imm.vs; - instructions.emplace_back(waitcnt_vs); - imm.vs = wait_imm::unset_counter; - } - if (!imm.empty()) { - Instruction* waitcnt = create_instruction(aco_opcode::s_waitcnt, Format::SOPP, 0, 0); - waitcnt->salu().imm = imm.pack(ctx.gfx_level); - instructions.emplace_back(waitcnt); + Builder bld(ctx.program, &instructions); + + if (ctx.gfx_level >= GFX12) { + if (imm.vm != wait_imm::unset_counter && imm.lgkm != wait_imm::unset_counter) { + bld.sopp(aco_opcode::s_wait_loadcnt_dscnt, (imm.vm << 8) | imm.lgkm); + imm.vm = wait_imm::unset_counter; + imm.lgkm = wait_imm::unset_counter; + } + + if (imm.vs != wait_imm::unset_counter && imm.lgkm != wait_imm::unset_counter) { + bld.sopp(aco_opcode::s_wait_storecnt_dscnt, (imm.vs << 8) | imm.lgkm); + imm.vs = wait_imm::unset_counter; + imm.lgkm = wait_imm::unset_counter; + } + + aco_opcode op[wait_type_num]; + op[wait_type_exp] = aco_opcode::s_wait_expcnt; + op[wait_type_lgkm] = aco_opcode::s_wait_dscnt; + op[wait_type_vm] = aco_opcode::s_wait_loadcnt; + op[wait_type_vs] = aco_opcode::s_wait_storecnt; + op[wait_type_sample] = aco_opcode::s_wait_samplecnt; + op[wait_type_bvh] = aco_opcode::s_wait_bvhcnt; + op[wait_type_km] = aco_opcode::s_wait_kmcnt; + + for (unsigned i = 0; i < wait_type_num; i++) { + if (imm[i] != wait_imm::unset_counter) + bld.sopp(op[i], imm[i]); + } + } else { + if (imm.vs != wait_imm::unset_counter) { + assert(ctx.gfx_level >= GFX10); + bld.sopk(aco_opcode::s_waitcnt_vscnt, Operand(sgpr_null, s1), imm.vs); + imm.vs = wait_imm::unset_counter; + } + if (!imm.empty()) + bld.sopp(aco_opcode::s_waitcnt, imm.pack(ctx.gfx_level)); } imm = wait_imm(); } diff --git a/src/amd/compiler/tests/test_insert_waitcnt.cpp b/src/amd/compiler/tests/test_insert_waitcnt.cpp index 14c83198e53..edc34bfc7f8 100644 --- a/src/amd/compiler/tests/test_insert_waitcnt.cpp +++ b/src/amd/compiler/tests/test_insert_waitcnt.cpp @@ -179,3 +179,356 @@ BEGIN_TEST(insert_waitcnt.waw.mixed_vmem_lds.lds) finish_waitcnt_test(); END_TEST + +BEGIN_TEST(insert_waitcnt.waw.vmem_types) + for (amd_gfx_level gfx : {GFX11, GFX12}) { + if (!setup_cs(NULL, gfx)) + continue; + + Definition def_v4(PhysReg(260), v1); + Operand op_v0(PhysReg(256), v1); + Operand desc_s4(PhysReg(0), s4); + Operand desc_s8(PhysReg(8), s8); + + //>> p_unit_test 0 + //! v1: %0:v[4] = buffer_load_dword %0:s[0-3], %0:v[0], 0 + //! v1: %0:v[4] = buffer_load_dword %0:s[0-3], %0:v[0], 0 + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0)); + bld.mubuf(aco_opcode::buffer_load_dword, def_v4, desc_s4, op_v0, Operand::zero(), 0, false); + bld.mubuf(aco_opcode::buffer_load_dword, def_v4, desc_s4, op_v0, Operand::zero(), 0, false); + + //>> p_unit_test 1 + //! v1: %0:v[4] = buffer_load_dword %0:s[0-3], %0:v[0], 0 + //~gfx11! s_waitcnt vmcnt(0) + //~gfx12! s_wait_loadcnt imm:0 + //! v1: %0:v[4] = image_sample %0:s[8-15], %0:s[0-3], v1: undef, %0:v[0] 1d + bld.reset(program->create_and_insert_block()); + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1)); + bld.mubuf(aco_opcode::buffer_load_dword, def_v4, desc_s4, op_v0, Operand::zero(), 0, false); + bld.mimg(aco_opcode::image_sample, def_v4, desc_s8, desc_s4, Operand(v1), op_v0); + + //>> p_unit_test 2 + //! v1: %0:v[4] = buffer_load_dword %0:s[0-3], %0:v[0], 0 + //~gfx11! s_waitcnt vmcnt(0) + //~gfx12! s_wait_loadcnt imm:0 + //! v1: %0:v[4] = image_bvh64_intersect_ray %0:s[0-3], s4: undef, v1: undef, %0:v[16-26] 1d + bld.reset(program->create_and_insert_block()); + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2)); + bld.mubuf(aco_opcode::buffer_load_dword, def_v4, desc_s4, op_v0, Operand::zero(), 0, false); + bld.mimg(aco_opcode::image_bvh64_intersect_ray, def_v4, desc_s4, Operand(s4), Operand(v1), + Operand(PhysReg(272), RegClass::get(RegType::vgpr, 11 * 4))); + + //>> p_unit_test 3 + //! v1: %0:v[4] = image_sample %0:s[8-15], %0:s[0-3], v1: undef, %0:v[0] 1d + //! v1: %0:v[4] = image_sample %0:s[8-15], %0:s[0-3], v1: undef, %0:v[0] 1d + bld.reset(program->create_and_insert_block()); + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3)); + bld.mimg(aco_opcode::image_sample, def_v4, desc_s8, desc_s4, Operand(v1), op_v0); + bld.mimg(aco_opcode::image_sample, def_v4, desc_s8, desc_s4, Operand(v1), op_v0); + + //>> p_unit_test 4 + //! v1: %0:v[4] = image_sample %0:s[8-15], %0:s[0-3], v1: undef, %0:v[0] 1d + //~gfx11! s_waitcnt vmcnt(0) + //~gfx12! s_wait_samplecnt imm:0 + //! v1: %0:v[4] = buffer_load_dword %0:s[0-3], %0:v[0], 0 + bld.reset(program->create_and_insert_block()); + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4)); + bld.mimg(aco_opcode::image_sample, def_v4, desc_s8, desc_s4, Operand(v1), op_v0); + bld.mubuf(aco_opcode::buffer_load_dword, def_v4, desc_s4, op_v0, Operand::zero(), 0, false); + + //>> p_unit_test 5 + //! v1: %0:v[4] = image_sample %0:s[8-15], %0:s[0-3], v1: undef, %0:v[0] 1d + //~gfx11! s_waitcnt vmcnt(0) + //~gfx12! s_wait_samplecnt imm:0 + //! v1: %0:v[4] = image_bvh64_intersect_ray %0:s[0-3], s4: undef, v1: undef, %0:v[16-26] 1d + bld.reset(program->create_and_insert_block()); + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5)); + bld.mimg(aco_opcode::image_sample, def_v4, desc_s8, desc_s4, Operand(v1), op_v0); + bld.mimg(aco_opcode::image_bvh64_intersect_ray, def_v4, desc_s4, Operand(s4), Operand(v1), + Operand(PhysReg(272), RegClass::get(RegType::vgpr, 11 * 4))); + + //>> p_unit_test 6 + //! v1: %0:v[4] = image_bvh64_intersect_ray %0:s[0-3], s4: undef, v1: undef, %0:v[16-26] 1d + //! v1: %0:v[4] = image_bvh64_intersect_ray %0:s[0-3], s4: undef, v1: undef, %0:v[16-26] 1d + bld.reset(program->create_and_insert_block()); + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(6)); + bld.mimg(aco_opcode::image_bvh64_intersect_ray, def_v4, desc_s4, Operand(s4), Operand(v1), + Operand(PhysReg(272), RegClass::get(RegType::vgpr, 11 * 4))); + bld.mimg(aco_opcode::image_bvh64_intersect_ray, def_v4, desc_s4, Operand(s4), Operand(v1), + Operand(PhysReg(272), RegClass::get(RegType::vgpr, 11 * 4))); + + //>> p_unit_test 7 + //! v1: %0:v[4] = image_bvh64_intersect_ray %0:s[0-3], s4: undef, v1: undef, %0:v[16-26] 1d + //~gfx11! s_waitcnt vmcnt(0) + //~gfx12! s_wait_bvhcnt imm:0 + //! v1: %0:v[4] = buffer_load_dword %0:s[0-3], %0:v[0], 0 + bld.reset(program->create_and_insert_block()); + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(7)); + bld.mimg(aco_opcode::image_bvh64_intersect_ray, def_v4, desc_s4, Operand(s4), Operand(v1), + Operand(PhysReg(272), RegClass::get(RegType::vgpr, 11 * 4))); + bld.mubuf(aco_opcode::buffer_load_dword, def_v4, desc_s4, op_v0, Operand::zero(), 0, false); + + //>> p_unit_test 8 + //! v1: %0:v[4] = image_bvh64_intersect_ray %0:s[0-3], s4: undef, v1: undef, %0:v[16-26] 1d + //~gfx11! s_waitcnt vmcnt(0) + //~gfx12! s_wait_bvhcnt imm:0 + //! v1: %0:v[4] = image_sample %0:s[8-15], %0:s[0-3], v1: undef, %0:v[0] 1d + bld.reset(program->create_and_insert_block()); + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(8)); + bld.mimg(aco_opcode::image_bvh64_intersect_ray, def_v4, desc_s4, Operand(s4), Operand(v1), + Operand(PhysReg(272), RegClass::get(RegType::vgpr, 11 * 4))); + bld.mimg(aco_opcode::image_sample, def_v4, desc_s8, desc_s4, Operand(v1), op_v0); + + //>> BB9 + //! /* logical preds: / linear preds: / kind: */ + //! v1: %0:v[4] = buffer_load_dword %0:s[0-3], %0:v[0], 0 + bld.reset(program->create_and_insert_block()); + bld.mubuf(aco_opcode::buffer_load_dword, def_v4, desc_s4, op_v0, Operand::zero(), 0, false); + + //>> BB10 + //! /* logical preds: / linear preds: / kind: */ + //! v1: %0:v[4] = buffer_load_dword %0:s[0-3], %0:v[0], 0 + bld.reset(program->create_and_insert_block()); + bld.mubuf(aco_opcode::buffer_load_dword, def_v4, desc_s4, op_v0, Operand::zero(), 0, false); + + bld.reset(program->create_and_insert_block()); + program->blocks[11].linear_preds.push_back(9); + program->blocks[11].linear_preds.push_back(10); + program->blocks[11].logical_preds.push_back(9); + program->blocks[11].logical_preds.push_back(10); + + //>> BB11 + //! /* logical preds: BB9, BB10, / linear preds: BB9, BB10, / kind: uniform, */ + //! p_unit_test 9 + //! v1: %0:v[4] = buffer_load_dword %0:s[0-3], %0:v[0], 0 + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(9)); + bld.mubuf(aco_opcode::buffer_load_dword, def_v4, desc_s4, op_v0, Operand::zero(), 0, false); + + //>> BB12 + //! /* logical preds: / linear preds: / kind: */ + //! v1: %0:v[4] = image_sample %0:s[8-15], %0:s[0-3], v1: undef, %0:v[0] 1d + bld.reset(program->create_and_insert_block()); + bld.mimg(aco_opcode::image_sample, def_v4, desc_s8, desc_s4, Operand(v1), op_v0); + + //>> BB13 + //! /* logical preds: / linear preds: / kind: */ + //! v1: %0:v[4] = buffer_load_dword %0:s[0-3], %0:v[0], 0 + bld.reset(program->create_and_insert_block()); + bld.mubuf(aco_opcode::buffer_load_dword, def_v4, desc_s4, op_v0, Operand::zero(), 0, false); + + bld.reset(program->create_and_insert_block()); + program->blocks[14].linear_preds.push_back(12); + program->blocks[14].linear_preds.push_back(13); + program->blocks[14].logical_preds.push_back(12); + program->blocks[14].logical_preds.push_back(13); + + //>> BB14 + //! /* logical preds: BB12, BB13, / linear preds: BB12, BB13, / kind: uniform, */ + //! p_unit_test 10 + //~gfx11! s_waitcnt vmcnt(0) + //~gfx12! s_wait_samplecnt imm:0 + //! v1: %0:v[4] = buffer_load_dword %0:s[0-3], %0:v[0], 0 + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(10)); + bld.mubuf(aco_opcode::buffer_load_dword, def_v4, desc_s4, op_v0, Operand::zero(), 0, false); + + finish_waitcnt_test(); + } +END_TEST + +BEGIN_TEST(insert_waitcnt.vmem) + if (!setup_cs(NULL, GFX12)) + return; + + Definition def_v4(PhysReg(260), v1); + Definition def_v5(PhysReg(261), v1); + Definition def_v6(PhysReg(262), v1); + Definition def_v7(PhysReg(263), v1); + Definition def_v8(PhysReg(264), v1); + Definition def_v9(PhysReg(265), v1); + Operand op_v0(PhysReg(256), v1); + Operand op_v4(PhysReg(260), v1); + Operand op_v5(PhysReg(261), v1); + Operand op_v6(PhysReg(262), v1); + Operand op_v7(PhysReg(263), v1); + Operand op_v8(PhysReg(264), v1); + Operand op_v9(PhysReg(265), v1); + Operand desc_s4(PhysReg(0), s4); + Operand desc_s8(PhysReg(8), s8); + + //>> v1: %0:v[4] = buffer_load_dword %0:s[0-3], %0:v[0], 0 + //! v1: %0:v[5] = image_sample %0:s[8-15], %0:s[0-3], v1: undef, %0:v[0] 1d + //! v1: %0:v[6] = image_bvh64_intersect_ray %0:s[0-3], s4: undef, v1: undef, %0:v[16-26] 1d unrm r128 + bld.mubuf(aco_opcode::buffer_load_dword, def_v4, desc_s4, op_v0, Operand::zero(), 0, false); + bld.mimg(aco_opcode::image_sample, def_v5, desc_s8, desc_s4, Operand(v1), op_v0); + Instruction* instr = + bld.mimg(aco_opcode::image_bvh64_intersect_ray, def_v6, desc_s4, Operand(s4), Operand(v1), + Operand(PhysReg(272), RegClass::get(RegType::vgpr, 11 * 4))) + .instr; + instr->mimg().unrm = true; + instr->mimg().r128 = true; + + //! v1: %0:v[7] = image_load %0:s[8-15], s4: undef, v1: undef, %0:v[0] 1d + //! v1: %0:v[8] = image_sample %0:s[8-15], %0:s[0-3], v1: undef, %0:v[0] 1d + //! v1: %0:v[9] = image_bvh64_intersect_ray %0:s[0-3], s4: undef, v1: undef, %0:v[16-26] 1d unrm r128 + bld.mimg(aco_opcode::image_load, def_v7, desc_s8, Operand(s4), Operand(v1), op_v0, 0x1); + bld.mimg(aco_opcode::image_sample, def_v8, desc_s8, desc_s4, Operand(v1), op_v0); + instr = bld.mimg(aco_opcode::image_bvh64_intersect_ray, def_v9, desc_s4, Operand(s4), + Operand(v1), Operand(PhysReg(272), RegClass::get(RegType::vgpr, 11 * 4))) + .instr; + instr->mimg().unrm = true; + instr->mimg().r128 = true; + + //! s_wait_loadcnt imm:1 + //! p_unit_test 0, %0:v[4] + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0), op_v4); + //! s_wait_samplecnt imm:1 + //! p_unit_test 1, %0:v[5] + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1), op_v5); + //! s_wait_bvhcnt imm:1 + //! p_unit_test 2, %0:v[6] + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2), op_v6); + //! s_wait_loadcnt imm:0 + //! p_unit_test 3, %0:v[7] + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3), op_v7); + //! s_wait_samplecnt imm:0 + //! p_unit_test 4, %0:v[8] + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4), op_v8); + //! s_wait_bvhcnt imm:0 + //! p_unit_test 5, %0:v[9] + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5), op_v9); + + /* Despite not using a sampler, this uses samplecnt. */ + //! v1: %0:v[5] = image_msaa_load %0:s[8-15], s4: undef, v1: undef, %0:v[0] 1d + //! s_wait_samplecnt imm:0 + //! p_unit_test 6, %0:v[5] + bld.mimg(aco_opcode::image_msaa_load, def_v5, desc_s8, Operand(s4), Operand(v1), op_v0); + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(6), op_v5); + + finish_waitcnt_test(); +END_TEST + +BEGIN_TEST(insert_waitcnt.lds_smem) + for (amd_gfx_level gfx : {GFX11, GFX12}) { + if (!setup_cs(NULL, gfx)) + continue; + + Definition def_v4(PhysReg(260), v1); + Definition def_v5(PhysReg(261), v1); + Definition def_s4(PhysReg(4), s1); + Definition def_s5(PhysReg(5), s1); + Operand op_s0(PhysReg(0), s1); + Operand op_s4(PhysReg(4), s1); + Operand op_s5(PhysReg(5), s1); + Operand op_v0(PhysReg(256), v1); + Operand op_v4(PhysReg(260), v1); + Operand op_v5(PhysReg(261), v1); + Operand desc_s4(PhysReg(0), s4); + + //>> v1: %0:v[4] = ds_read_b32 %0:v[0] + //! s1: %0:s[4] = s_buffer_load_dword %0:s[0-3], %0:s[0] + //! v1: %0:v[5] = ds_read_b32 %0:v[0] + //! s1: %0:s[5] = s_buffer_load_dword %0:s[0-3], %0:s[0] + bld.ds(aco_opcode::ds_read_b32, def_v4, op_v0); + bld.smem(aco_opcode::s_buffer_load_dword, def_s4, desc_s4, op_s0); + bld.ds(aco_opcode::ds_read_b32, def_v5, op_v0); + bld.smem(aco_opcode::s_buffer_load_dword, def_s5, desc_s4, op_s0); + + //~gfx11! s_waitcnt lgkmcnt(1) + //~gfx12! s_wait_dscnt imm:1 + //! p_unit_test 0, %0:v[4] + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0), op_v4); + //~gfx11! s_waitcnt lgkmcnt(0) + //~gfx12! s_wait_kmcnt imm:0 + //! p_unit_test 1, %0:s[4] + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1), op_s4); + //~gfx12! s_wait_dscnt imm:0 + //! p_unit_test 2, %0:v[5] + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2), op_v5); + //! p_unit_test 3, %0:s[5] + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3), op_s5); + + finish_waitcnt_test(); + } +END_TEST + +BEGIN_TEST(insert_waitcnt.sendmsg_smem) + for (amd_gfx_level gfx : {GFX11, GFX12}) { + if (!setup_cs(NULL, gfx)) + continue; + + Definition def_s4(PhysReg(4), s1); + Definition def_s5(PhysReg(5), s1); + Definition def_s6(PhysReg(6), s1); + Definition def_s7(PhysReg(7), s1); + Operand op_s0(PhysReg(0), s1); + Operand op_s4(PhysReg(4), s1); + Operand op_s5(PhysReg(5), s1); + Operand op_s6(PhysReg(6), s1); + Operand op_s7(PhysReg(7), s1); + Operand desc_s4(PhysReg(0), s4); + + //>> s1: %0:s[4] = s_sendmsg_rtn_b32 3 sendmsg(rtn_get_realtime) + //! s1: %0:s[5] = s_buffer_load_dword %0:s[0-3], %0:s[0] + //! s1: %0:s[6] = s_sendmsg_rtn_b32 3 sendmsg(rtn_get_realtime) + //! s1: %0:s[7] = s_buffer_load_dword %0:s[0-3], %0:s[0] + bld.sop1(aco_opcode::s_sendmsg_rtn_b32, def_s4, Operand::c32(sendmsg_rtn_get_realtime)); + bld.smem(aco_opcode::s_buffer_load_dword, def_s5, desc_s4, op_s0); + bld.sop1(aco_opcode::s_sendmsg_rtn_b32, def_s6, Operand::c32(sendmsg_rtn_get_realtime)); + bld.smem(aco_opcode::s_buffer_load_dword, def_s7, desc_s4, op_s0); + + //~gfx12! s_wait_kmcnt imm:1 + //~gfx11! s_waitcnt lgkmcnt(1) + //! p_unit_test 0, %0:s[4] + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0), op_s4); + //~gfx12! s_wait_kmcnt imm:0 + //~gfx11! s_waitcnt lgkmcnt(0) + //! p_unit_test 1, %0:s[5] + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1), op_s5); + //! p_unit_test 2, %0:s[6] + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2), op_s6); + //! p_unit_test 3, %0:s[7] + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3), op_s7); + + finish_waitcnt_test(); + } +END_TEST + +BEGIN_TEST(insert_waitcnt.vmem_ds) + if (!setup_cs(NULL, GFX12)) + return; + + Definition def_v4(PhysReg(260), v1); + Definition def_v5(PhysReg(261), v1); + Operand op_v0(PhysReg(256), v1); + Operand op_v1(PhysReg(257), v1); + Operand op_v4(PhysReg(260), v1); + Operand op_v5(PhysReg(261), v1); + Operand desc_s4(PhysReg(0), s4); + + program->workgroup_size = 128; + program->wgp_mode = true; + + //>> v1: %0:v[4] = buffer_load_dword %0:s[0-3], %0:v[0], 0 + //! v1: %0:v[5] = ds_read_b32 %0:v[0] + bld.mubuf(aco_opcode::buffer_load_dword, def_v4, desc_s4, op_v0, Operand::zero(), 0, false); + bld.ds(aco_opcode::ds_read_b32, def_v5, op_v0); + + //! s_wait_loadcnt_dscnt dscnt(0) loadcnt(0) + //! p_unit_test 0, %0:v[4], %0:v[5] + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0), op_v4, op_v5); + + //! buffer_store_dword %0:s[0-3], %0:v[0], 0, %0:v[1] storage:buffer + //! v1: %0:v[5] = ds_write_b32 %0:v[0], %0:v[1] storage:shared + Instruction* instr = + bld.mubuf(aco_opcode::buffer_store_dword, desc_s4, op_v0, Operand::zero(), op_v1, 0, false) + .instr; + instr->mubuf().sync = memory_sync_info(storage_buffer); + instr = bld.ds(aco_opcode::ds_write_b32, def_v5, op_v0, op_v1).instr; + instr->ds().sync = memory_sync_info(storage_shared); + + //! s_wait_storecnt_dscnt dscnt(0) storecnt(0) + bld.barrier(aco_opcode::p_barrier, + memory_sync_info(storage_buffer | storage_shared, semantic_acqrel, scope_workgroup)); + + finish_waitcnt_test(); +END_TEST