/* * Copyright © 2022 Valve Corporation * * SPDX-License-Identifier: MIT */ #include "helpers.h" using namespace aco; BEGIN_TEST(insert_waitcnt.ds_ordered_count) if (!setup_cs(NULL, GFX10_3)) return; Operand def0(PhysReg(256), v1); Operand def1(PhysReg(257), v1); Operand def2(PhysReg(258), v1); Operand gds_base(PhysReg(259), v1); Operand chan_counter(PhysReg(260), v1); Operand m(m0, s1); Instruction* ds_instr; //>> ds_ordered_count %0:v[0], %0:v[3], %0:m0 offset0:3072 gds storage:gds semantics:volatile //! s_waitcnt lgkmcnt(0) ds_instr = bld.ds(aco_opcode::ds_ordered_count, def0, gds_base, m, 3072u, 0u, true); ds_instr->ds().sync = memory_sync_info(storage_gds, semantic_volatile); //! ds_add_rtn_u32 %0:v[1], %0:v[3], %0:v[4], %0:m0 gds storage:gds semantics:volatile,atomic,rmw ds_instr = bld.ds(aco_opcode::ds_add_rtn_u32, def1, gds_base, chan_counter, m, 0u, 0u, true); ds_instr->ds().sync = memory_sync_info(storage_gds, semantic_atomicrmw); //! s_waitcnt lgkmcnt(0) //! ds_ordered_count %0:v[2], %0:v[3], %0:m0 offset0:3840 gds storage:gds semantics:volatile ds_instr = bld.ds(aco_opcode::ds_ordered_count, def2, gds_base, m, 3840u, 0u, true); ds_instr->ds().sync = memory_sync_info(storage_gds, semantic_volatile); finish_waitcnt_test(); END_TEST BEGIN_TEST(insert_waitcnt.clause) if (!setup_cs(NULL, GFX11)) return; Definition def_v4(PhysReg(260), v1); Definition def_v5(PhysReg(261), v1); Definition def_v6(PhysReg(262), v1); Definition def_v7(PhysReg(263), v1); Operand op_v0(PhysReg(256), v1); Operand op_v4(PhysReg(260), v1); Operand op_v5(PhysReg(261), v1); Operand op_v6(PhysReg(262), v1); Operand op_v7(PhysReg(263), v1); Operand desc0(PhysReg(0), s4); //>> p_unit_test 0 bld.pseudo(aco_opcode::p_unit_test, Operand::zero()); //! v1: %0:v[4] = buffer_load_dword %0:s[0-3], %0:v[0], 0 //! v1: %0:v[5] = buffer_load_dword %0:s[0-3], %0:v[0], 0 //! v1: %0:v[6] = buffer_load_dword %0:s[0-3], %0:v[0], 0 //! v1: %0:v[7] = buffer_load_dword %0:s[0-3], %0:v[0], 0 bld.mubuf(aco_opcode::buffer_load_dword, def_v4, desc0, op_v0, Operand::zero(), 0, false); bld.mubuf(aco_opcode::buffer_load_dword, def_v5, desc0, op_v0, Operand::zero(), 0, false); bld.mubuf(aco_opcode::buffer_load_dword, def_v6, desc0, op_v0, Operand::zero(), 0, false); bld.mubuf(aco_opcode::buffer_load_dword, def_v7, desc0, op_v0, Operand::zero(), 0, false); //! s_waitcnt vmcnt(0) //! v1: %0:v[4] = buffer_load_dword %0:s[0-3], %0:v[4], 0 //! v1: %0:v[5] = buffer_load_dword %0:s[0-3], %0:v[5], 0 //! v1: %0:v[6] = buffer_load_dword %0:s[0-3], %0:v[6], 0 //! v1: %0:v[7] = buffer_load_dword %0:s[0-3], %0:v[7], 0 bld.mubuf(aco_opcode::buffer_load_dword, def_v4, desc0, op_v4, Operand::zero(), 0, false); bld.mubuf(aco_opcode::buffer_load_dword, def_v5, desc0, op_v5, Operand::zero(), 0, false); bld.mubuf(aco_opcode::buffer_load_dword, def_v6, desc0, op_v6, Operand::zero(), 0, false); bld.mubuf(aco_opcode::buffer_load_dword, def_v7, desc0, op_v7, Operand::zero(), 0, false); //! s_waitcnt vmcnt(0) //! buffer_store_dword %0:s[0-3], %0:v[0], 0, %0:v[4] //! buffer_store_dword %0:s[0-3], %0:v[0], 0, %0:v[5] //! buffer_store_dword %0:s[0-3], %0:v[0], 0, %0:v[6] //! buffer_store_dword %0:s[0-3], %0:v[0], 0, %0:v[7] bld.mubuf(aco_opcode::buffer_store_dword, desc0, op_v0, Operand::zero(), op_v4, 0, false); bld.mubuf(aco_opcode::buffer_store_dword, desc0, op_v0, Operand::zero(), op_v5, 0, false); bld.mubuf(aco_opcode::buffer_store_dword, desc0, op_v0, Operand::zero(), op_v6, 0, false); bld.mubuf(aco_opcode::buffer_store_dword, desc0, op_v0, Operand::zero(), op_v7, 0, false); //>> p_unit_test 1 bld.reset(program->create_and_insert_block()); bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1)); //! s4: %0:s[4-7] = s_load_dwordx4 %0:s[0-1], 0 bld.smem(aco_opcode::s_load_dwordx4, Definition(PhysReg(4), s4), Operand(PhysReg(0), s2), Operand::zero()); //! v1: %0:v[4] = buffer_load_dword %0:s[0-3], %0:v[0], 0 bld.mubuf(aco_opcode::buffer_load_dword, def_v4, desc0, op_v0, Operand::zero(), 0, false); //! s_waitcnt lgkmcnt(0) vmcnt(0) //! v1: %0:v[5] = buffer_load_dword %0:s[4-7], %0:v[4], 0 bld.mubuf(aco_opcode::buffer_load_dword, def_v5, Operand(PhysReg(4), s4), op_v4, Operand::zero(), 0, false); //>> p_unit_test 2 bld.reset(program->create_and_insert_block()); bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2)); //! v1: %0:v[4] = buffer_load_dword %0:s[0-3], %0:v[0], 0 bld.mubuf(aco_opcode::buffer_load_dword, def_v4, desc0, op_v0, Operand::zero(), 0, false); //! v_nop bld.vop1(aco_opcode::v_nop); //! v1: %0:v[4] = buffer_load_dword %0:s[0-3], %0:v[0], 0 bld.mubuf(aco_opcode::buffer_load_dword, def_v4, desc0, op_v0, Operand::zero(), 0, false); //! s_waitcnt vmcnt(0) //! v1: %0:v[5] = buffer_load_dword %0:s[0-3], %0:v[4], 0 bld.mubuf(aco_opcode::buffer_load_dword, def_v5, desc0, op_v4, Operand::zero(), 0, false); finish_waitcnt_test(); END_TEST BEGIN_TEST(insert_waitcnt.waw.mixed_vmem_lds.vmem) if (!setup_cs(NULL, GFX10)) return; Definition def_v4(PhysReg(260), v1); Operand op_v0(PhysReg(256), v1); Operand desc0(PhysReg(0), s4); //>> BB0 //! /* logical preds: / linear preds: / kind: top-level, */ //! v1: %0:v[4] = buffer_load_dword %0:s[0-3], %0:v[0], 0 bld.mubuf(aco_opcode::buffer_load_dword, def_v4, desc0, op_v0, Operand::zero(), 0, false); //>> BB1 //! /* logical preds: / linear preds: / kind: */ //! v1: %0:v[4] = ds_read_b32 %0:v[0] bld.reset(program->create_and_insert_block()); bld.ds(aco_opcode::ds_read_b32, def_v4, op_v0); bld.reset(program->create_and_insert_block()); program->blocks[2].linear_preds.push_back(0); program->blocks[2].linear_preds.push_back(1); program->blocks[2].logical_preds.push_back(0); program->blocks[2].logical_preds.push_back(1); //>> BB2 //! /* logical preds: BB0, BB1, / linear preds: BB0, BB1, / kind: uniform, */ //! s_waitcnt lgkmcnt(0) //! v1: %0:v[4] = buffer_load_dword %0:s[0-3], %0:v[0], 0 bld.mubuf(aco_opcode::buffer_load_dword, def_v4, desc0, op_v0, Operand::zero(), 0, false); finish_waitcnt_test(); END_TEST BEGIN_TEST(insert_waitcnt.waw.mixed_vmem_lds.lds) if (!setup_cs(NULL, GFX10)) return; Definition def_v4(PhysReg(260), v1); Operand op_v0(PhysReg(256), v1); Operand desc0(PhysReg(0), s4); //>> BB0 //! /* logical preds: / linear preds: / kind: top-level, */ //! v1: %0:v[4] = buffer_load_dword %0:s[0-3], %0:v[0], 0 bld.mubuf(aco_opcode::buffer_load_dword, def_v4, desc0, op_v0, Operand::zero(), 0, false); //>> BB1 //! /* logical preds: / linear preds: / kind: */ //! v1: %0:v[4] = ds_read_b32 %0:v[0] bld.reset(program->create_and_insert_block()); bld.ds(aco_opcode::ds_read_b32, def_v4, op_v0); bld.reset(program->create_and_insert_block()); program->blocks[2].linear_preds.push_back(0); program->blocks[2].linear_preds.push_back(1); program->blocks[2].logical_preds.push_back(0); program->blocks[2].logical_preds.push_back(1); //>> BB2 //! /* logical preds: BB0, BB1, / linear preds: BB0, BB1, / kind: uniform, */ //! s_waitcnt vmcnt(0) //! v1: %0:v[4] = ds_read_b32 %0:v[0] bld.ds(aco_opcode::ds_read_b32, def_v4, op_v0); finish_waitcnt_test(); END_TEST BEGIN_TEST(insert_waitcnt.waw.vmem_types) for (amd_gfx_level gfx : {GFX11, GFX12}) { if (!setup_cs(NULL, gfx)) continue; Definition def_v4(PhysReg(260), v1); Operand op_v0(PhysReg(256), v1); Operand desc_s4(PhysReg(0), s4); Operand desc_s8(PhysReg(8), s8); //>> p_unit_test 0 //! v1: %0:v[4] = buffer_load_dword %0:s[0-3], %0:v[0], 0 //~gfx12! s_wait_loadcnt imm:0 //! v1: %0:v[4] = buffer_load_dword %0:s[0-3], %0:v[0], 0 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0)); bld.mubuf(aco_opcode::buffer_load_dword, def_v4, desc_s4, op_v0, Operand::zero(), 0, false); bld.mubuf(aco_opcode::buffer_load_dword, def_v4, desc_s4, op_v0, Operand::zero(), 0, false); //>> p_unit_test 1 //! v1: %0:v[4] = buffer_load_dword %0:s[0-3], %0:v[0], 0 //~gfx11! s_waitcnt vmcnt(0) //~gfx12! s_wait_loadcnt imm:0 //! v1: %0:v[4] = image_sample %0:s[8-15], %0:s[0-3], v1: undef, %0:v[0] 1d bld.reset(program->create_and_insert_block()); bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1)); bld.mubuf(aco_opcode::buffer_load_dword, def_v4, desc_s4, op_v0, Operand::zero(), 0, false); bld.mimg(aco_opcode::image_sample, def_v4, desc_s8, desc_s4, Operand(v1), op_v0); //>> p_unit_test 2 //! v1: %0:v[4] = buffer_load_dword %0:s[0-3], %0:v[0], 0 //~gfx11! s_waitcnt vmcnt(0) //~gfx12! s_wait_loadcnt imm:0 //! v1: %0:v[4] = image_bvh64_intersect_ray %0:s[0-3], s4: undef, v1: undef, %0:v[16-26] 1d bld.reset(program->create_and_insert_block()); bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2)); bld.mubuf(aco_opcode::buffer_load_dword, def_v4, desc_s4, op_v0, Operand::zero(), 0, false); bld.mimg(aco_opcode::image_bvh64_intersect_ray, def_v4, desc_s4, Operand(s4), Operand(v1), Operand(PhysReg(272), RegClass::get(RegType::vgpr, 11 * 4))); //>> p_unit_test 3 //! v1: %0:v[4] = image_sample %0:s[8-15], %0:s[0-3], v1: undef, %0:v[0] 1d //~gfx12! s_wait_samplecnt imm:0 //! v1: %0:v[4] = image_sample %0:s[8-15], %0:s[0-3], v1: undef, %0:v[0] 1d bld.reset(program->create_and_insert_block()); bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3)); bld.mimg(aco_opcode::image_sample, def_v4, desc_s8, desc_s4, Operand(v1), op_v0); bld.mimg(aco_opcode::image_sample, def_v4, desc_s8, desc_s4, Operand(v1), op_v0); //>> p_unit_test 4 //! v1: %0:v[4] = image_sample %0:s[8-15], %0:s[0-3], v1: undef, %0:v[0] 1d //~gfx11! s_waitcnt vmcnt(0) //~gfx12! s_wait_samplecnt imm:0 //! v1: %0:v[4] = buffer_load_dword %0:s[0-3], %0:v[0], 0 bld.reset(program->create_and_insert_block()); bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4)); bld.mimg(aco_opcode::image_sample, def_v4, desc_s8, desc_s4, Operand(v1), op_v0); bld.mubuf(aco_opcode::buffer_load_dword, def_v4, desc_s4, op_v0, Operand::zero(), 0, false); //>> p_unit_test 5 //! v1: %0:v[4] = image_sample %0:s[8-15], %0:s[0-3], v1: undef, %0:v[0] 1d //~gfx11! s_waitcnt vmcnt(0) //~gfx12! s_wait_samplecnt imm:0 //! v1: %0:v[4] = image_bvh64_intersect_ray %0:s[0-3], s4: undef, v1: undef, %0:v[16-26] 1d bld.reset(program->create_and_insert_block()); bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5)); bld.mimg(aco_opcode::image_sample, def_v4, desc_s8, desc_s4, Operand(v1), op_v0); bld.mimg(aco_opcode::image_bvh64_intersect_ray, def_v4, desc_s4, Operand(s4), Operand(v1), Operand(PhysReg(272), RegClass::get(RegType::vgpr, 11 * 4))); //>> p_unit_test 6 //! v1: %0:v[4] = image_bvh64_intersect_ray %0:s[0-3], s4: undef, v1: undef, %0:v[16-26] 1d //~gfx12! s_wait_bvhcnt imm:0 //! v1: %0:v[4] = image_bvh64_intersect_ray %0:s[0-3], s4: undef, v1: undef, %0:v[16-26] 1d bld.reset(program->create_and_insert_block()); bld.pseudo(aco_opcode::p_unit_test, Operand::c32(6)); bld.mimg(aco_opcode::image_bvh64_intersect_ray, def_v4, desc_s4, Operand(s4), Operand(v1), Operand(PhysReg(272), RegClass::get(RegType::vgpr, 11 * 4))); bld.mimg(aco_opcode::image_bvh64_intersect_ray, def_v4, desc_s4, Operand(s4), Operand(v1), Operand(PhysReg(272), RegClass::get(RegType::vgpr, 11 * 4))); //>> p_unit_test 7 //! v1: %0:v[4] = image_bvh64_intersect_ray %0:s[0-3], s4: undef, v1: undef, %0:v[16-26] 1d //~gfx11! s_waitcnt vmcnt(0) //~gfx12! s_wait_bvhcnt imm:0 //! v1: %0:v[4] = buffer_load_dword %0:s[0-3], %0:v[0], 0 bld.reset(program->create_and_insert_block()); bld.pseudo(aco_opcode::p_unit_test, Operand::c32(7)); bld.mimg(aco_opcode::image_bvh64_intersect_ray, def_v4, desc_s4, Operand(s4), Operand(v1), Operand(PhysReg(272), RegClass::get(RegType::vgpr, 11 * 4))); bld.mubuf(aco_opcode::buffer_load_dword, def_v4, desc_s4, op_v0, Operand::zero(), 0, false); //>> p_unit_test 8 //! v1: %0:v[4] = image_bvh64_intersect_ray %0:s[0-3], s4: undef, v1: undef, %0:v[16-26] 1d //~gfx11! s_waitcnt vmcnt(0) //~gfx12! s_wait_bvhcnt imm:0 //! v1: %0:v[4] = image_sample %0:s[8-15], %0:s[0-3], v1: undef, %0:v[0] 1d bld.reset(program->create_and_insert_block()); bld.pseudo(aco_opcode::p_unit_test, Operand::c32(8)); bld.mimg(aco_opcode::image_bvh64_intersect_ray, def_v4, desc_s4, Operand(s4), Operand(v1), Operand(PhysReg(272), RegClass::get(RegType::vgpr, 11 * 4))); bld.mimg(aco_opcode::image_sample, def_v4, desc_s8, desc_s4, Operand(v1), op_v0); //>> BB9 //! /* logical preds: / linear preds: / kind: */ //! v1: %0:v[4] = buffer_load_dword %0:s[0-3], %0:v[0], 0 bld.reset(program->create_and_insert_block()); bld.mubuf(aco_opcode::buffer_load_dword, def_v4, desc_s4, op_v0, Operand::zero(), 0, false); //>> BB10 //! /* logical preds: / linear preds: / kind: */ //! v1: %0:v[4] = buffer_load_dword %0:s[0-3], %0:v[0], 0 bld.reset(program->create_and_insert_block()); bld.mubuf(aco_opcode::buffer_load_dword, def_v4, desc_s4, op_v0, Operand::zero(), 0, false); bld.reset(program->create_and_insert_block()); program->blocks[11].linear_preds.push_back(9); program->blocks[11].linear_preds.push_back(10); program->blocks[11].logical_preds.push_back(9); program->blocks[11].logical_preds.push_back(10); //>> BB11 //! /* logical preds: BB9, BB10, / linear preds: BB9, BB10, / kind: uniform, */ //! p_unit_test 9 //~gfx12! s_wait_loadcnt imm:0 //! v1: %0:v[4] = buffer_load_dword %0:s[0-3], %0:v[0], 0 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(9)); bld.mubuf(aco_opcode::buffer_load_dword, def_v4, desc_s4, op_v0, Operand::zero(), 0, false); //>> BB12 //! /* logical preds: / linear preds: / kind: */ //! v1: %0:v[4] = image_sample %0:s[8-15], %0:s[0-3], v1: undef, %0:v[0] 1d bld.reset(program->create_and_insert_block()); bld.mimg(aco_opcode::image_sample, def_v4, desc_s8, desc_s4, Operand(v1), op_v0); //>> BB13 //! /* logical preds: / linear preds: / kind: */ //! v1: %0:v[4] = buffer_load_dword %0:s[0-3], %0:v[0], 0 bld.reset(program->create_and_insert_block()); bld.mubuf(aco_opcode::buffer_load_dword, def_v4, desc_s4, op_v0, Operand::zero(), 0, false); bld.reset(program->create_and_insert_block()); program->blocks[14].linear_preds.push_back(12); program->blocks[14].linear_preds.push_back(13); program->blocks[14].logical_preds.push_back(12); program->blocks[14].logical_preds.push_back(13); //>> BB14 //! /* logical preds: BB12, BB13, / linear preds: BB12, BB13, / kind: uniform, */ //! p_unit_test 10 //~gfx11! s_waitcnt vmcnt(0) //~gfx12! s_wait_loadcnt imm:0 //~gfx12! s_wait_samplecnt imm:0 //! v1: %0:v[4] = buffer_load_dword %0:s[0-3], %0:v[0], 0 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(10)); bld.mubuf(aco_opcode::buffer_load_dword, def_v4, desc_s4, op_v0, Operand::zero(), 0, false); finish_waitcnt_test(); } END_TEST BEGIN_TEST(insert_waitcnt.waw.point_sample_accel) for (radeon_family family : {CHIP_GFX1150, CHIP_GFX1153}) { if (!setup_cs(NULL, GFX11_5, family, family == CHIP_GFX1153 ? "_3" : "_0")) continue; Definition def_v4(PhysReg(260), v1); Operand op_v0(PhysReg(256), v1); Operand desc_s4(PhysReg(0), s4); Operand desc_s8(PhysReg(8), s8); /* image_sample has point sample acceleration, but image_sample_b does not. Both are VMEM * sample instructions. */ //>> p_unit_test 0 //! v1: %0:v[4] = image_sample %0:s[8-15], %0:s[0-3], v1: undef, %0:v[0] 1d //~gfx11_5_0! s_waitcnt vmcnt(0) //! v1: %0:v[4] = image_sample %0:s[8-15], %0:s[0-3], v1: undef, %0:v[0] 1d bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0)); bld.mimg(aco_opcode::image_sample, def_v4, desc_s8, desc_s4, Operand(v1), op_v0); bld.mimg(aco_opcode::image_sample, def_v4, desc_s8, desc_s4, Operand(v1), op_v0); //>> p_unit_test 1 //! v1: %0:v[4] = image_sample_b %0:s[8-15], %0:s[0-3], v1: undef, %0:v[0] 1d //~gfx11_5_0! s_waitcnt vmcnt(0) //! v1: %0:v[4] = image_sample %0:s[8-15], %0:s[0-3], v1: undef, %0:v[0] 1d bld.reset(program->create_and_insert_block()); bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1)); bld.mimg(aco_opcode::image_sample_b, def_v4, desc_s8, desc_s4, Operand(v1), op_v0); bld.mimg(aco_opcode::image_sample, def_v4, desc_s8, desc_s4, Operand(v1), op_v0); //>> p_unit_test 2 //! v1: %0:v[4] = image_load %0:s[8-15], s4: undef, v1: undef, %0:v[0] 1d //! s_waitcnt vmcnt(0) //! v1: %0:v[4] = image_sample %0:s[8-15], %0:s[0-3], v1: undef, %0:v[0] 1d bld.reset(program->create_and_insert_block()); bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2)); bld.mimg(aco_opcode::image_load, def_v4, desc_s8, Operand(s4), Operand(v1), op_v0); bld.mimg(aco_opcode::image_sample, def_v4, desc_s8, desc_s4, Operand(v1), op_v0); //>> p_unit_test 3 //! v1: %0:v[4] = image_sample %0:s[8-15], %0:s[0-3], v1: undef, %0:v[0] 1d //~gfx11_5_0! s_waitcnt vmcnt(0) //! v1: %0:v[4] = image_sample_b %0:s[8-15], %0:s[0-3], v1: undef, %0:v[0] 1d bld.reset(program->create_and_insert_block()); bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3)); bld.mimg(aco_opcode::image_sample, def_v4, desc_s8, desc_s4, Operand(v1), op_v0); bld.mimg(aco_opcode::image_sample_b, def_v4, desc_s8, desc_s4, Operand(v1), op_v0); //>> p_unit_test 4 //! v1: %0:v[4] = image_sample %0:s[8-15], %0:s[0-3], v1: undef, %0:v[0] 1d //! s_waitcnt vmcnt(0) //! v1: %0:v[4] = image_load %0:s[8-15], s4: undef, v1: undef, %0:v[0] 1d bld.reset(program->create_and_insert_block()); bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4)); bld.mimg(aco_opcode::image_sample, def_v4, desc_s8, desc_s4, Operand(v1), op_v0); bld.mimg(aco_opcode::image_load, def_v4, desc_s8, Operand(s4), Operand(v1), op_v0); //>> p_unit_test 5 //! v1: %0:v[4] = image_sample_b %0:s[8-15], %0:s[0-3], v1: undef, %0:v[0] 1d //! v1: %0:v[4] = image_sample_b %0:s[8-15], %0:s[0-3], v1: undef, %0:v[0] 1d bld.reset(program->create_and_insert_block()); bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5)); bld.mimg(aco_opcode::image_sample_b, def_v4, desc_s8, desc_s4, Operand(v1), op_v0); bld.mimg(aco_opcode::image_sample_b, def_v4, desc_s8, desc_s4, Operand(v1), op_v0); //>> p_unit_test 5 //! v1: %0:v[4] = image_load %0:s[8-15], s4: undef, v1: undef, %0:v[0] 1d //! v1: %0:v[4] = image_load %0:s[8-15], s4: undef, v1: undef, %0:v[0] 1d bld.reset(program->create_and_insert_block()); bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5)); bld.mimg(aco_opcode::image_load, def_v4, desc_s8, Operand(s4), Operand(v1), op_v0); bld.mimg(aco_opcode::image_load, def_v4, desc_s8, Operand(s4), Operand(v1), op_v0); finish_waitcnt_test(); } END_TEST BEGIN_TEST(insert_waitcnt.vmem) if (!setup_cs(NULL, GFX12)) return; Definition def_v4(PhysReg(260), v1); Definition def_v5(PhysReg(261), v1); Definition def_v6(PhysReg(262), v1); Definition def_v7(PhysReg(263), v1); Definition def_v8(PhysReg(264), v1); Definition def_v9(PhysReg(265), v1); Operand op_v0(PhysReg(256), v1); Operand op_v4(PhysReg(260), v1); Operand op_v5(PhysReg(261), v1); Operand op_v6(PhysReg(262), v1); Operand op_v7(PhysReg(263), v1); Operand op_v8(PhysReg(264), v1); Operand op_v9(PhysReg(265), v1); Operand desc_s4(PhysReg(0), s4); Operand desc_s8(PhysReg(8), s8); //>> v1: %0:v[4] = buffer_load_dword %0:s[0-3], %0:v[0], 0 //! v1: %0:v[5] = image_sample %0:s[8-15], %0:s[0-3], v1: undef, %0:v[0] 1d //! v1: %0:v[6] = image_bvh64_intersect_ray %0:s[0-3], s4: undef, v1: undef, %0:v[16-26] 1d unrm r128 bld.mubuf(aco_opcode::buffer_load_dword, def_v4, desc_s4, op_v0, Operand::zero(), 0, false); bld.mimg(aco_opcode::image_sample, def_v5, desc_s8, desc_s4, Operand(v1), op_v0); Instruction* instr = bld.mimg(aco_opcode::image_bvh64_intersect_ray, def_v6, desc_s4, Operand(s4), Operand(v1), Operand(PhysReg(272), RegClass::get(RegType::vgpr, 11 * 4))) .instr; instr->mimg().unrm = true; instr->mimg().r128 = true; //! v1: %0:v[7] = image_load %0:s[8-15], s4: undef, v1: undef, %0:v[0] 1d //! v1: %0:v[8] = image_sample %0:s[8-15], %0:s[0-3], v1: undef, %0:v[0] 1d //! v1: %0:v[9] = image_bvh64_intersect_ray %0:s[0-3], s4: undef, v1: undef, %0:v[16-26] 1d unrm r128 bld.mimg(aco_opcode::image_load, def_v7, desc_s8, Operand(s4), Operand(v1), op_v0, 0x1); bld.mimg(aco_opcode::image_sample, def_v8, desc_s8, desc_s4, Operand(v1), op_v0); instr = bld.mimg(aco_opcode::image_bvh64_intersect_ray, def_v9, desc_s4, Operand(s4), Operand(v1), Operand(PhysReg(272), RegClass::get(RegType::vgpr, 11 * 4))) .instr; instr->mimg().unrm = true; instr->mimg().r128 = true; //! s_wait_loadcnt imm:1 //! p_unit_test 0, %0:v[4] bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0), op_v4); //! s_wait_samplecnt imm:1 //! p_unit_test 1, %0:v[5] bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1), op_v5); //! s_wait_bvhcnt imm:1 //! p_unit_test 2, %0:v[6] bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2), op_v6); //! s_wait_loadcnt imm:0 //! p_unit_test 3, %0:v[7] bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3), op_v7); //! s_wait_samplecnt imm:0 //! p_unit_test 4, %0:v[8] bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4), op_v8); //! s_wait_bvhcnt imm:0 //! p_unit_test 5, %0:v[9] bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5), op_v9); /* Despite not using a sampler, this uses samplecnt. */ //! v1: %0:v[5] = image_msaa_load %0:s[8-15], s4: undef, v1: undef, %0:v[0] 1d //! s_wait_samplecnt imm:0 //! p_unit_test 6, %0:v[5] bld.mimg(aco_opcode::image_msaa_load, def_v5, desc_s8, Operand(s4), Operand(v1), op_v0); bld.pseudo(aco_opcode::p_unit_test, Operand::c32(6), op_v5); finish_waitcnt_test(); END_TEST BEGIN_TEST(insert_waitcnt.lds_smem) for (amd_gfx_level gfx : {GFX11, GFX12}) { if (!setup_cs(NULL, gfx)) continue; Definition def_v4(PhysReg(260), v1); Definition def_v5(PhysReg(261), v1); Definition def_s4(PhysReg(4), s1); Definition def_s5(PhysReg(5), s1); Operand op_s0(PhysReg(0), s1); Operand op_s4(PhysReg(4), s1); Operand op_s5(PhysReg(5), s1); Operand op_v0(PhysReg(256), v1); Operand op_v4(PhysReg(260), v1); Operand op_v5(PhysReg(261), v1); Operand desc_s4(PhysReg(0), s4); //>> v1: %0:v[4] = ds_read_b32 %0:v[0] //! s1: %0:s[4] = s_buffer_load_dword %0:s[0-3], %0:s[0] //! v1: %0:v[5] = ds_read_b32 %0:v[0] //! s1: %0:s[5] = s_buffer_load_dword %0:s[0-3], %0:s[0] bld.ds(aco_opcode::ds_read_b32, def_v4, op_v0); bld.smem(aco_opcode::s_buffer_load_dword, def_s4, desc_s4, op_s0); bld.ds(aco_opcode::ds_read_b32, def_v5, op_v0); bld.smem(aco_opcode::s_buffer_load_dword, def_s5, desc_s4, op_s0); //~gfx11! s_waitcnt lgkmcnt(1) //~gfx12! s_wait_dscnt imm:1 //! p_unit_test 0, %0:v[4] bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0), op_v4); //~gfx11! s_waitcnt lgkmcnt(0) //~gfx12! s_wait_kmcnt imm:0 //! p_unit_test 1, %0:s[4] bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1), op_s4); //~gfx12! s_wait_dscnt imm:0 //! p_unit_test 2, %0:v[5] bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2), op_v5); //! p_unit_test 3, %0:s[5] bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3), op_s5); finish_waitcnt_test(); } END_TEST BEGIN_TEST(insert_waitcnt.sendmsg_smem) for (amd_gfx_level gfx : {GFX11, GFX12}) { if (!setup_cs(NULL, gfx)) continue; Definition def_s4(PhysReg(4), s1); Definition def_s5(PhysReg(5), s1); Definition def_s6(PhysReg(6), s1); Definition def_s7(PhysReg(7), s1); Operand op_s0(PhysReg(0), s1); Operand op_s4(PhysReg(4), s1); Operand op_s5(PhysReg(5), s1); Operand op_s6(PhysReg(6), s1); Operand op_s7(PhysReg(7), s1); Operand desc_s4(PhysReg(0), s4); //>> s1: %0:s[4] = s_sendmsg_rtn_b32 3 sendmsg(rtn_get_realtime) //! s1: %0:s[5] = s_buffer_load_dword %0:s[0-3], %0:s[0] //! s1: %0:s[6] = s_sendmsg_rtn_b32 3 sendmsg(rtn_get_realtime) //! s1: %0:s[7] = s_buffer_load_dword %0:s[0-3], %0:s[0] bld.sop1(aco_opcode::s_sendmsg_rtn_b32, def_s4, Operand::c32(sendmsg_rtn_get_realtime)); bld.smem(aco_opcode::s_buffer_load_dword, def_s5, desc_s4, op_s0); bld.sop1(aco_opcode::s_sendmsg_rtn_b32, def_s6, Operand::c32(sendmsg_rtn_get_realtime)); bld.smem(aco_opcode::s_buffer_load_dword, def_s7, desc_s4, op_s0); //~gfx12! s_wait_kmcnt imm:1 //~gfx11! s_waitcnt lgkmcnt(1) //! p_unit_test 0, %0:s[4] bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0), op_s4); //~gfx12! s_wait_kmcnt imm:0 //~gfx11! s_waitcnt lgkmcnt(0) //! p_unit_test 1, %0:s[5] bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1), op_s5); //! p_unit_test 2, %0:s[6] bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2), op_s6); //! p_unit_test 3, %0:s[7] bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3), op_s7); finish_waitcnt_test(); } END_TEST BEGIN_TEST(insert_waitcnt.vmem_ds) if (!setup_cs(NULL, GFX12)) return; Definition def_v4(PhysReg(260), v1); Definition def_v5(PhysReg(261), v1); Operand op_v0(PhysReg(256), v1); Operand op_v1(PhysReg(257), v1); Operand op_v4(PhysReg(260), v1); Operand op_v5(PhysReg(261), v1); Operand desc_s4(PhysReg(0), s4); program->workgroup_size = 128; program->wgp_mode = true; //>> v1: %0:v[4] = buffer_load_dword %0:s[0-3], %0:v[0], 0 //! v1: %0:v[5] = ds_read_b32 %0:v[0] bld.mubuf(aco_opcode::buffer_load_dword, def_v4, desc_s4, op_v0, Operand::zero(), 0, false); bld.ds(aco_opcode::ds_read_b32, def_v5, op_v0); //! s_wait_loadcnt_dscnt dscnt(0) loadcnt(0) //! p_unit_test 0, %0:v[4], %0:v[5] bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0), op_v4, op_v5); //! buffer_store_dword %0:s[0-3], %0:v[0], 0, %0:v[1] storage:buffer //! v1: %0:v[5] = ds_write_b32 %0:v[0], %0:v[1] storage:shared Instruction* instr = bld.mubuf(aco_opcode::buffer_store_dword, desc_s4, op_v0, Operand::zero(), op_v1, 0, false) .instr; instr->mubuf().sync = memory_sync_info(storage_buffer); instr = bld.ds(aco_opcode::ds_write_b32, def_v5, op_v0, op_v1).instr; instr->ds().sync = memory_sync_info(storage_shared); //! s_wait_storecnt_dscnt dscnt(0) storecnt(0) bld.barrier(aco_opcode::p_barrier, memory_sync_info(storage_buffer | storage_shared, semantic_acqrel, scope_workgroup), scope_workgroup); finish_waitcnt_test(); END_TEST BEGIN_TEST(insert_waitcnt.waw.vmem_ds_valu) for (amd_gfx_level gfx : {GFX10_3, GFX11, GFX12}) { if (!setup_cs(NULL, gfx)) continue; Definition def_v4(PhysReg(260), v1); Operand op_v0(PhysReg(256), v1); Operand desc_s4(PhysReg(0), s4); emit_divergent_if_else( program.get(), bld, Operand::c64(1), [&]() { //>> p_unit_test 1 //! v1: %0:v[4] = buffer_load_dword %0:s[0-3], %0:v[0], 0 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1)); bld.mubuf(aco_opcode::buffer_load_dword, def_v4, desc_s4, op_v0, Operand::zero(), 0, false); }, [&]() { //>> p_unit_test 2 //~gfx11! s_waitcnt vmcnt(0) //~gfx12! s_wait_loadcnt imm:0 //! v1: %0:v[4] = v_mov_b32 0 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2)); bld.vop1(aco_opcode::v_mov_b32, def_v4, Operand::zero()); }); //>> p_unit_test 3 //~gfx(10_3|11)! s_waitcnt vmcnt(0) //~gfx12! s_wait_loadcnt imm:0 //! p_unit_test %0:v[4] bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3)); bld.pseudo(aco_opcode::p_unit_test, Operand(PhysReg(260), v1)); emit_divergent_if_else( program.get(), bld, Operand::c64(1), [&]() { //>> p_unit_test 4 //! v1: %0:v[4] = ds_read_b32 %0:v[0] bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4)); bld.ds(aco_opcode::ds_read_b32, def_v4, op_v0); }, [&]() { //>> p_unit_test 5 //~gfx11! s_waitcnt lgkmcnt(0) //~gfx12! s_wait_dscnt imm:0 //! v1: %0:v[4] = v_mov_b32 0 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5)); bld.vop1(aco_opcode::v_mov_b32, def_v4, Operand::zero()); }); //>> p_unit_test 6 //~gfx(10_3|11)! s_waitcnt lgkmcnt(0) //~gfx12! s_wait_dscnt imm:0 //! p_unit_test %0:v[4] bld.pseudo(aco_opcode::p_unit_test, Operand::c32(6)); bld.pseudo(aco_opcode::p_unit_test, Operand(PhysReg(260), v1)); finish_waitcnt_test(); } END_TEST BEGIN_TEST(insert_waitcnt.waw.vmem_different_halves) if (!setup_cs(NULL, GFX12)) return; Definition def_v4_lo(PhysReg(260), v2b); Definition def_v4_hi(PhysReg(260).advance(2), v2b); Operand op_v0(PhysReg(256), v1); Operand desc_s4(PhysReg(0), s4); Operand desc_s8(PhysReg(8), s8); //>> p_unit_test 0 //! v2b: %0:v[4][0:16] = buffer_load_short_d16 %0:s[0-3], %0:v[0], 0 //! v2b: %0:v[4][16:32] = buffer_load_short_d16_hi %0:s[0-3], %0:v[0], 0 bld.pseudo(aco_opcode::p_unit_test, Operand::zero()); bld.mubuf(aco_opcode::buffer_load_short_d16, def_v4_lo, desc_s4, op_v0, Operand::zero(), 0, false); bld.mubuf(aco_opcode::buffer_load_short_d16_hi, def_v4_hi, desc_s4, op_v0, Operand::zero(), 0, false); //>> p_unit_test 1 //! v2b: %0:v[4][16:32] = buffer_load_short_d16_hi %0:s[0-3], %0:v[0], 0 //! v2b: %0:v[4][0:16] = buffer_load_short_d16 %0:s[0-3], %0:v[0], 0 bld.reset(program->create_and_insert_block()); bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1)); bld.mubuf(aco_opcode::buffer_load_short_d16_hi, def_v4_hi, desc_s4, op_v0, Operand::zero(), 0, false); bld.mubuf(aco_opcode::buffer_load_short_d16, def_v4_lo, desc_s4, op_v0, Operand::zero(), 0, false); //>> p_unit_test 2 //! v2b: %0:v[4][0:16] = buffer_load_short_d16 %0:s[0-3], %0:v[0], 0 //! s_wait_loadcnt imm:0 //! v2b: %0:v[4][0:16] = buffer_load_short_d16 %0:s[0-3], %0:v[0], 0 bld.reset(program->create_and_insert_block()); bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2)); bld.mubuf(aco_opcode::buffer_load_short_d16, def_v4_lo, desc_s4, op_v0, Operand::zero(), 0, false); bld.mubuf(aco_opcode::buffer_load_short_d16, def_v4_lo, desc_s4, op_v0, Operand::zero(), 0, false); //>> p_unit_test 3 //! v2b: %0:v[4][16:32] = buffer_load_short_d16_hi %0:s[0-3], %0:v[0], 0 //! s_wait_loadcnt imm:0 //! v2b: %0:v[4][16:32] = buffer_load_short_d16_hi %0:s[0-3], %0:v[0], 0 bld.reset(program->create_and_insert_block()); bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3)); bld.mubuf(aco_opcode::buffer_load_short_d16_hi, def_v4_hi, desc_s4, op_v0, Operand::zero(), 0, false); bld.mubuf(aco_opcode::buffer_load_short_d16_hi, def_v4_hi, desc_s4, op_v0, Operand::zero(), 0, false); //>> p_unit_test 4 //! v2b: %0:v[4][0:16] = image_sample %0:s[8-15], %0:s[0-3], v1: undef, %0:v[0] 1d d16 //! s_wait_samplecnt imm:0 //! v2b: %0:v[4][16:32] = buffer_load_short_d16_hi %0:s[0-3], %0:v[0], 0 bld.reset(program->create_and_insert_block()); bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4)); Instruction* instr = bld.mimg(aco_opcode::image_sample, def_v4_lo, desc_s8, desc_s4, Operand(v1), op_v0); instr->mimg().dmask = 0x1; instr->mimg().d16 = true; bld.mubuf(aco_opcode::buffer_load_short_d16_hi, def_v4_hi, desc_s4, op_v0, Operand::zero(), 0, false); //>> p_unit_test 5 //! v2b: %0:v[4][16:32] = buffer_load_short_d16_hi %0:s[0-3], %0:v[0], 0 //! s_wait_loadcnt imm:0 //! v2b: %0:v[4][0:16] = image_sample %0:s[8-15], %0:s[0-3], v1: undef, %0:v[0] 1d d16 bld.reset(program->create_and_insert_block()); bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5)); bld.mubuf(aco_opcode::buffer_load_short_d16_hi, def_v4_hi, desc_s4, op_v0, Operand::zero(), 0, false); instr = bld.mimg(aco_opcode::image_sample, def_v4_lo, desc_s8, desc_s4, Operand(v1), op_v0); instr->mimg().dmask = 0x1; instr->mimg().d16 = true; finish_waitcnt_test(); END_TEST BEGIN_TEST(insert_waitcnt.waw.vmem_different_lanes) for (amd_gfx_level gfx : {GFX10_3, GFX11, GFX12}) { if (!setup_cs(NULL, gfx)) continue; Definition def_v4(PhysReg(260), v1); Operand op_v0(PhysReg(256), v1); Operand desc_s4(PhysReg(0), s4); Operand desc_s8(PhysReg(8), s8); emit_divergent_if_else( program.get(), bld, Operand::c64(1), [&]() { //>> p_unit_test 1 //! v1: %0:v[4] = buffer_load_dword %0:s[0-3], %0:v[0], 0 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1)); bld.mubuf(aco_opcode::buffer_load_dword, def_v4, desc_s4, op_v0, Operand::zero(), 0, false); }, [&]() { //>> p_unit_test 2 //! v1: %0:v[4] = buffer_load_dword %0:s[0-3], %0:v[0], 0 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2)); bld.mubuf(aco_opcode::buffer_load_dword, def_v4, desc_s4, op_v0, Operand::zero(), 0, false); }); //>> p_unit_test 3 //~gfx(10_3|11)! s_waitcnt vmcnt(0) //~gfx12! s_wait_loadcnt imm:0 //! p_unit_test %0:v[4] bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3)); bld.pseudo(aco_opcode::p_unit_test, Operand(PhysReg(260), v1)); emit_divergent_if_else( program.get(), bld, Operand::c64(1), [&]() { //>> p_unit_test 4 //! v1: %0:v[4] = buffer_load_dword %0:s[0-3], %0:v[0], 0 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4)); bld.mubuf(aco_opcode::buffer_load_dword, def_v4, desc_s4, op_v0, Operand::zero(), 0, false); }, [&]() { //>> p_unit_test 5 //~gfx12! s_wait_loadcnt imm:0 //! v1: %0:v[4] = image_sample %0:s[8-15], %0:s[0-3], v1: undef, %0:v[0] 1d bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5)); bld.mimg(aco_opcode::image_sample, def_v4, desc_s8, desc_s4, Operand(v1), op_v0); }); //>> p_unit_test 6 //~gfx(10_3|11)! s_waitcnt vmcnt(0) //~gfx12! s_wait_loadcnt imm:0 //~gfx12! s_wait_samplecnt imm:0 //! p_unit_test %0:v[4] bld.pseudo(aco_opcode::p_unit_test, Operand::c32(6)); bld.pseudo(aco_opcode::p_unit_test, Operand(PhysReg(260), v1)); finish_waitcnt_test(); } END_TEST BEGIN_TEST(insert_waitcnt.divergent_branch.inc_counter) for (amd_gfx_level gfx : {GFX10_3, GFX11, GFX12}) { if (!setup_cs(NULL, gfx)) continue; Definition def_v4(PhysReg(260), v1); Definition def_v5(PhysReg(261), v1); Operand op_v0(PhysReg(256), v1); Operand desc_s4(PhysReg(0), s4); Operand desc_s8(PhysReg(8), s8); //>> v1: %0:v[4] = buffer_load_dword %0:s[0-3], %0:v[0], 0 bld.mubuf(aco_opcode::buffer_load_dword, def_v4, desc_s4, op_v0, Operand::zero(), 0, false); emit_divergent_if_else( program.get(), bld, Operand::c64(1), [&]() { //>> p_unit_test 1 //! v1: %0:v[5] = buffer_load_dword %0:s[0-3], %0:v[0], 0 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1)); bld.mubuf(aco_opcode::buffer_load_dword, def_v5, desc_s4, op_v0, Operand::zero(), 0, false); }, [&]() { //>> p_unit_test 2 //! v1: %0:v[5] = buffer_load_dword %0:s[0-3], %0:v[0], 0 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2)); bld.mubuf(aco_opcode::buffer_load_dword, def_v5, desc_s4, op_v0, Operand::zero(), 0, false); }); //>> p_unit_test 3 //~gfx(10_3|11)! s_waitcnt vmcnt(1) //~gfx12! s_wait_loadcnt imm:1 //! p_unit_test %0:v[4] bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3)); bld.pseudo(aco_opcode::p_unit_test, Operand(PhysReg(260), v1)); finish_waitcnt_test(); } END_TEST BEGIN_TEST(insert_waitcnt.divergent_branch.no_skip) for (amd_gfx_level gfx : {GFX10_3, GFX11, GFX12}) { if (!setup_cs(NULL, gfx)) continue; Definition def_v4(PhysReg(260), v1); Operand op_v0(PhysReg(256), v1); Operand desc_s4(PhysReg(0), s4); Operand desc_s8(PhysReg(8), s8); //>> v1: %0:v[4] = buffer_load_dword %0:s[0-3], %0:v[0], 0 bld.mubuf(aco_opcode::buffer_load_dword, def_v4, desc_s4, op_v0, Operand::zero(), 0, false); //>> p_unit_test 1 //~gfx(10_3|11)! s_waitcnt vmcnt(0) //~gfx12! s_wait_loadcnt imm:0 //! p_unit_test %0:v[4] bld.reset(program->create_and_insert_block()); program->blocks[1].linear_preds.push_back(0); program->blocks[1].logical_preds.push_back(0); bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1)); bld.pseudo(aco_opcode::p_unit_test, Operand(PhysReg(260), v1)); //>> p_unit_test 2 //! p_unit_test %0:v[4] bld.reset(program->create_and_insert_block()); program->blocks[2].linear_preds.push_back(1); program->blocks[2].logical_preds.push_back(1); program->blocks[2].logical_preds.push_back(0); bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2)); bld.pseudo(aco_opcode::p_unit_test, Operand(PhysReg(260), v1)); finish_waitcnt_test(); } END_TEST BEGIN_TEST(insert_waitcnt.flat.wait_zero) for (amd_gfx_level gfx : {GFX9, GFX10}) { if (!setup_cs(NULL, gfx)) continue; Definition dest0(PhysReg(260), v1); Definition dest1(PhysReg(261), v1); Operand offset(PhysReg(256), v1); Operand addr(PhysReg(256), v2); //>> p_unit_test 0 //! v1: %0:v[4] = global_load_dword %0:v[0-1], s1: undef //! v1: %0:v[5] = flat_load_dword %0:v[0-1], s1: undef may_use_lds //~gfx9! s_waitcnt vmcnt(0) //~gfx10! s_waitcnt vmcnt(1) //! p_unit_test %0:v[4] bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0)); bld.global(aco_opcode::global_load_dword, dest0, addr, Operand(s1)); bld.flat(aco_opcode::flat_load_dword, dest1, addr, Operand(s1)).instr->flat().may_use_lds = true; bld.pseudo(aco_opcode::p_unit_test, Operand(dest0.physReg(), v1)); //>> p_unit_test 1 //! v1: %0:v[4] = ds_read_b32 %0:v[0] //! v1: %0:v[5] = flat_load_dword %0:v[0-1], s1: undef may_use_lds //~gfx9! s_waitcnt lgkmcnt(0) //~gfx10! s_waitcnt lgkmcnt(1) //! p_unit_test %0:v[4] bld.reset(program->create_and_insert_block()); bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1)); bld.ds(aco_opcode::ds_read_b32, dest0, offset); bld.flat(aco_opcode::flat_load_dword, dest1, addr, Operand(s1)).instr->flat().may_use_lds = true; bld.pseudo(aco_opcode::p_unit_test, Operand(dest0.physReg(), v1)); //>> p_unit_test 2 //! v1: %0:v[4] = flat_load_dword %0:v[0-1], s1: undef may_use_lds //! v1: %0:v[5] = global_load_dword %0:v[0-1], s1: undef //~gfx9! s_waitcnt lgkmcnt(0) vmcnt(0) //~gfx10! s_waitcnt lgkmcnt(0) vmcnt(1) //! p_unit_test %0:v[4] bld.reset(program->create_and_insert_block()); bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2)); bld.flat(aco_opcode::flat_load_dword, dest0, addr, Operand(s1)).instr->flat().may_use_lds = true; bld.global(aco_opcode::global_load_dword, dest1, addr, Operand(s1)); bld.pseudo(aco_opcode::p_unit_test, Operand(dest0.physReg(), v1)); //>> p_unit_test 3 //! v1: %0:v[4] = flat_load_dword %0:v[0-1], s1: undef may_use_lds //! v1: %0:v[5] = ds_read_b32 %0:v[0] //~gfx9! s_waitcnt lgkmcnt(0) vmcnt(0) //~gfx10! s_waitcnt lgkmcnt(1) vmcnt(0) //! p_unit_test %0:v[4] bld.reset(program->create_and_insert_block()); bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3)); bld.flat(aco_opcode::flat_load_dword, dest0, addr, Operand(s1)).instr->flat().may_use_lds = true; bld.ds(aco_opcode::ds_read_b32, dest1, offset); bld.pseudo(aco_opcode::p_unit_test, Operand(dest0.physReg(), v1)); //>> p_unit_test 4 //! v1: %0:v[4] = flat_load_dword %0:v[0-1], s1: undef may_use_lds //! v1: %0:v[5] = flat_load_dword %0:v[0-1], s1: undef may_use_lds //~gfx9! s_waitcnt lgkmcnt(0) vmcnt(0) //~gfx10! s_waitcnt lgkmcnt(1) vmcnt(1) //! p_unit_test %0:v[4] bld.reset(program->create_and_insert_block()); bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4)); bld.flat(aco_opcode::flat_load_dword, dest0, addr, Operand(s1)).instr->flat().may_use_lds = true; bld.flat(aco_opcode::flat_load_dword, dest1, addr, Operand(s1)).instr->flat().may_use_lds = true; bld.pseudo(aco_opcode::p_unit_test, Operand(dest0.physReg(), v1)); //>> p_unit_test 5 //! v1: %0:v[4] = global_load_dword %0:v[0-1], s1: undef //! v1: %0:v[5] = flat_load_dword %0:v[0-1], s1: undef //! s_waitcnt vmcnt(1) //! p_unit_test %0:v[4] bld.reset(program->create_and_insert_block()); bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5)); bld.global(aco_opcode::global_load_dword, dest0, addr, Operand(s1)); bld.flat(aco_opcode::flat_load_dword, dest1, addr, Operand(s1)); bld.pseudo(aco_opcode::p_unit_test, Operand(dest0.physReg(), v1)); //>> p_unit_test 6 //! v1: %0:v[4] = flat_load_dword %0:v[0-1], s1: undef //! v1: %0:v[5] = global_load_dword %0:v[0-1], s1: undef //! s_waitcnt vmcnt(1) //! p_unit_test %0:v[4] bld.reset(program->create_and_insert_block()); bld.pseudo(aco_opcode::p_unit_test, Operand::c32(6)); bld.flat(aco_opcode::flat_load_dword, dest0, addr, Operand(s1)); bld.global(aco_opcode::global_load_dword, dest1, addr, Operand(s1)); bld.pseudo(aco_opcode::p_unit_test, Operand(dest0.physReg(), v1)); //>> p_unit_test 7 //! v1: %0:v[4] = flat_load_dword %0:v[0-1], s1: undef //! v1: %0:v[5] = flat_load_dword %0:v[0-1], s1: undef //! s_waitcnt vmcnt(1) //! p_unit_test %0:v[4] bld.reset(program->create_and_insert_block()); bld.pseudo(aco_opcode::p_unit_test, Operand::c32(7)); bld.flat(aco_opcode::flat_load_dword, dest0, addr, Operand(s1)); bld.flat(aco_opcode::flat_load_dword, dest1, addr, Operand(s1)); bld.pseudo(aco_opcode::p_unit_test, Operand(dest0.physReg(), v1)); //>> p_unit_test 8 //! v1: %0:v[4] = flat_load_dword %0:v[0-1], s1: undef //! v1: %0:v[5] = flat_load_dword %0:v[0-1], s1: undef may_use_lds //~gfx9! s_waitcnt vmcnt(0) //~gfx10! s_waitcnt vmcnt(1) //! p_unit_test %0:v[4] bld.reset(program->create_and_insert_block()); bld.pseudo(aco_opcode::p_unit_test, Operand::c32(8)); bld.flat(aco_opcode::flat_load_dword, dest0, addr, Operand(s1)); bld.flat(aco_opcode::flat_load_dword, dest1, addr, Operand(s1)).instr->flat().may_use_lds = true; bld.pseudo(aco_opcode::p_unit_test, Operand(dest0.physReg(), v1)); finish_waitcnt_test(); } END_TEST BEGIN_TEST(insert_waitcnt.flat.waw) for (amd_gfx_level gfx : {GFX9, GFX10}) { if (!setup_cs(NULL, gfx)) continue; /* Flat might use either LDS or VMEM, so WaW always needs a wait. */ Definition dest(PhysReg(260), v1); Operand offset(PhysReg(256), v1); Operand addr(PhysReg(256), v2); Operand desc_s4(PhysReg(0), s4); Operand desc_s8(PhysReg(8), s8); //>> p_unit_test 0 //! v1: %0:v[4] = global_load_dword %0:v[0-1], s1: undef //! s_waitcnt vmcnt(0) //! v1: %0:v[4] = flat_load_dword %0:v[0-1], s1: undef may_use_lds bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0)); bld.global(aco_opcode::global_load_dword, dest, addr, Operand(s1)); bld.flat(aco_opcode::flat_load_dword, dest, addr, Operand(s1)).instr->flat().may_use_lds = true; //>> p_unit_test 1 //! v1: %0:v[4] = flat_load_dword %0:v[0-1], s1: undef may_use_lds //! s_waitcnt lgkmcnt(0) vmcnt(0) //! v1: %0:v[4] = global_load_dword %0:v[0-1], s1: undef bld.reset(program->create_and_insert_block()); bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1)); bld.flat(aco_opcode::flat_load_dword, dest, addr, Operand(s1)).instr->flat().may_use_lds = true; bld.global(aco_opcode::global_load_dword, dest, addr, Operand(s1)); //>> p_unit_test 2 //! v1: %0:v[4] = ds_read_b32 %0:v[0] //! s_waitcnt lgkmcnt(0) //! v1: %0:v[4] = flat_load_dword %0:v[0-1], s1: undef may_use_lds bld.reset(program->create_and_insert_block()); bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2)); bld.ds(aco_opcode::ds_read_b32, dest, offset); bld.flat(aco_opcode::flat_load_dword, dest, addr, Operand(s1)).instr->flat().may_use_lds = true; //>> p_unit_test 3 //! v1: %0:v[4] = flat_load_dword %0:v[0-1], s1: undef may_use_lds //! s_waitcnt vmcnt(0) //! v1: %0:v[4] = ds_read_b32 %0:v[0] bld.reset(program->create_and_insert_block()); bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3)); bld.flat(aco_opcode::flat_load_dword, dest, addr, Operand(s1)).instr->flat().may_use_lds = true; bld.ds(aco_opcode::ds_read_b32, dest, offset); /* In theory, we don't need a wait here, but we don't optimize this. */ //>> p_unit_test 4 //! v1: %0:v[4] = flat_load_dword %0:v[0-1], s1: undef may_use_lds //! s_waitcnt lgkmcnt(0) vmcnt(0) //! v1: %0:v[4] = flat_load_dword %0:v[0-1], s1: undef may_use_lds bld.reset(program->create_and_insert_block()); bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4)); bld.flat(aco_opcode::flat_load_dword, dest, addr, Operand(s1)).instr->flat().may_use_lds = true; bld.flat(aco_opcode::flat_load_dword, dest, addr, Operand(s1)).instr->flat().may_use_lds = true; //>> p_unit_test 5 //! v1: %0:v[4] = flat_load_dword %0:v[0-1], s1: undef may_use_lds //! s_waitcnt lgkmcnt(0) vmcnt(0) //! v1: %0:v[4] = image_sample %0:s[8-15], %0:s[0-3], v1: undef, %0:v[0] 1d bld.reset(program->create_and_insert_block()); bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5)); bld.flat(aco_opcode::flat_load_dword, dest, addr, Operand(s1)).instr->flat().may_use_lds = true; bld.mimg(aco_opcode::image_sample, dest, desc_s8, desc_s4, Operand(v1), offset); //>> p_unit_test 6 //! v1: %0:v[4] = flat_load_dword %0:v[0-1], s1: undef //! s_waitcnt vmcnt(0) //! v1: %0:v[4] = image_sample %0:s[8-15], %0:s[0-3], v1: undef, %0:v[0] 1d bld.reset(program->create_and_insert_block()); bld.pseudo(aco_opcode::p_unit_test, Operand::c32(6)); bld.flat(aco_opcode::flat_load_dword, dest, addr, Operand(s1)); bld.mimg(aco_opcode::image_sample, dest, desc_s8, desc_s4, Operand(v1), offset); finish_waitcnt_test(); } END_TEST BEGIN_TEST(insert_waitcnt.flat.barrier) for (amd_gfx_level gfx : {GFX9, GFX10}) { if (!setup_cs(NULL, gfx)) continue; Definition dest0(PhysReg(260), v1); Definition dest1(PhysReg(261), v1); Operand addr(PhysReg(256), v2); Operand data(PhysReg(256), v1); //>> p_unit_test 0 //! v1: %0:v[4] = global_load_dword %0:v[0-1], s1: undef storage:buffer //! v1: %0:v[5] = flat_load_dword %0:v[0-1], s1: undef may_use_lds //~gfx9! s_waitcnt vmcnt(0) //~gfx10! s_waitcnt vmcnt(1) bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0)); bld.global(aco_opcode::global_load_dword, dest0, addr, Operand(s1), 0, memory_sync_info(storage_buffer)); bld.flat(aco_opcode::flat_load_dword, dest1, addr, Operand(s1)).instr->flat().may_use_lds = true; bld.barrier(aco_opcode::p_barrier, memory_sync_info(storage_buffer, semantic_acqrel, scope_device), scope_workgroup); //>> p_unit_test 1 //! v1: %0:v[5] = flat_load_dword %0:v[0-1], s1: undef may_use_lds storage:buffer //! v1: %0:v[4] = global_load_dword %0:v[0-1], s1: undef //~gfx9! s_waitcnt lgkmcnt(0) vmcnt(0) //~gfx10! s_waitcnt lgkmcnt(0) vmcnt(1) bld.reset(program->create_and_insert_block()); bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1)); bld.flat(aco_opcode::flat_load_dword, dest1, addr, Operand(s1), 0, memory_sync_info(storage_buffer)).instr->flat().may_use_lds = true; bld.global(aco_opcode::global_load_dword, dest0, addr, Operand(s1), 0); bld.barrier(aco_opcode::p_barrier, memory_sync_info(storage_buffer, semantic_acqrel, scope_device), scope_workgroup); //>> p_unit_test 2 //! flat_store_dword %0:v[0-1], s1: undef, %0:v[0] may_use_lds storage:buffer //~gfx9! s_waitcnt lgkmcnt(0) vmcnt(0) //~gfx10! s_waitcnt_vscnt %0:null imm:0 //~gfx10! s_waitcnt lgkmcnt(0) bld.reset(program->create_and_insert_block()); bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2)); bld.flat(aco_opcode::flat_store_dword, addr, Operand(s1), data, 0, memory_sync_info(storage_buffer)).instr->flat().may_use_lds = true; bld.barrier(aco_opcode::p_barrier, memory_sync_info(storage_buffer, semantic_acqrel, scope_device), scope_workgroup); finish_waitcnt_test(); } END_TEST static void barrier(unsigned storage, unsigned semantics, sync_scope scope, sync_scope exec_scope = scope_invocation) { bld.barrier(aco_opcode::p_barrier, memory_sync_info(storage, semantics, scope), exec_scope); } static Definition& load_global(unsigned semantics = 0, sync_scope scope = scope_invocation) { Definition dest0(PhysReg(260), v1); Operand addr(PhysReg(256), v2); return bld .global(aco_opcode::global_load_dword, dest0, addr, Operand(s1), 0, memory_sync_info(storage_buffer, semantics, scope)) .def(0); } static void store_global(unsigned semantics = 0, sync_scope scope = scope_invocation) { Operand addr(PhysReg(256), v2); Operand data(PhysReg(256), v1); bld.global(aco_opcode::global_store_dword, addr, Operand(s1), data, 0, memory_sync_info(storage_buffer, semantics, scope)); } static Definition& load_shared(unsigned semantics = 0, sync_scope scope = scope_invocation) { Definition dest0(PhysReg(260), v1); Operand offset(PhysReg(256), v1); Builder::Result res = bld.ds(aco_opcode::ds_read_b32, dest0, offset); res.instr->ds().sync = memory_sync_info(storage_shared, semantics, scope); return res.def(0); } static void store_shared(unsigned semantics = 0, sync_scope scope = scope_invocation) { Operand offset(PhysReg(256), v1); Operand data(PhysReg(256), v1); bld.ds(aco_opcode::ds_write_b32, offset, data).instr->ds().sync = memory_sync_info(storage_shared, semantics, scope); } struct barrier_test_variant { unsigned workgroup_size; bool wgp; const char* name; }; static const barrier_test_variant barrier_test_variants[] = { {64, false, "_wg64cu"}, {128, false, "_wg128cu"}, {64, true, "_wg64wgp"}, {128, true, "_wg128wgp"}, }; BEGIN_TEST(insert_waitcnt.barrier.release) for (barrier_test_variant var : barrier_test_variants) { if (!setup_cs(NULL, GFX10, CHIP_UNKNOWN, var.name)) continue; program->workgroup_size = var.workgroup_size; program->wgp_mode = var.wgp; Definition dest0(PhysReg(260), v1); Definition dest1(PhysReg(261), v1); Operand addr(PhysReg(256), v2); Operand offset(PhysReg(256), v1); Operand data(PhysReg(256), v1); /* global->global, device scope */ //>> p_unit_test 0 //! global_store_dword %0:v[0-1], s1: undef, %0:v[0] storage:buffer //! s_waitcnt_vscnt %0:null imm:0 //! global_store_dword %0:v[0-1], s1: undef, %0:v[0] storage:buffer semantics:atomic bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0)); store_global(); barrier(storage_buffer, semantic_release, scope_device); store_global(semantic_atomic); //>> p_unit_test 1 //! v1: %0:v[4] = global_load_dword %0:v[0-1], s1: undef storage:buffer //! s_waitcnt vmcnt(0) //! global_store_dword %0:v[0-1], s1: undef, %0:v[0] storage:buffer semantics:atomic bld.reset(program->create_and_insert_block()); bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1)); load_global(); barrier(storage_buffer, semantic_release, scope_device); store_global(semantic_atomic); /* global->global, workgroup scope */ //>> p_unit_test 2 //! global_store_dword %0:v[0-1], s1: undef, %0:v[0] storage:buffer //~gfx10_wg128wgp! s_waitcnt_vscnt %0:null imm:0 //! global_store_dword %0:v[0-1], s1: undef, %0:v[0] storage:buffer semantics:atomic bld.reset(program->create_and_insert_block()); bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2)); store_global(); barrier(storage_buffer, semantic_release, scope_workgroup); store_global(semantic_atomic); //>> p_unit_test 3 //! v1: %0:v[4] = global_load_dword %0:v[0-1], s1: undef storage:buffer //~gfx10_wg128wgp! s_waitcnt vmcnt(0) //! global_store_dword %0:v[0-1], s1: undef, %0:v[0] storage:buffer semantics:atomic bld.reset(program->create_and_insert_block()); bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3)); load_global(); barrier(storage_buffer, semantic_release, scope_workgroup); store_global(semantic_atomic); /* shared->shared */ //>> p_unit_test 4 //! ds_write_b32 %0:v[0], %0:v[0] storage:shared //! ds_write_b32 %0:v[0], %0:v[0] storage:shared semantics:atomic bld.reset(program->create_and_insert_block()); bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4)); store_shared(); barrier(storage_shared, semantic_release, scope_workgroup); store_shared(semantic_atomic); //>> p_unit_test 5 //! v1: %0:v[4] = ds_read_b32 %0:v[0] storage:shared //! ds_write_b32 %0:v[0], %0:v[0] storage:shared semantics:atomic bld.reset(program->create_and_insert_block()); bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5)); load_shared(); barrier(storage_shared, semantic_release, scope_workgroup); store_shared(semantic_atomic); /* shared->global */ //>> p_unit_test 6 //! ds_write_b32 %0:v[0], %0:v[0] storage:shared //~gfx10_wg128(cu|wgp)! s_waitcnt lgkmcnt(0) //! global_store_dword %0:v[0-1], s1: undef, %0:v[0] storage:buffer semantics:atomic bld.reset(program->create_and_insert_block()); bld.pseudo(aco_opcode::p_unit_test, Operand::c32(6)); store_shared(); barrier(storage_buffer | storage_shared, semantic_release, scope_workgroup); store_global(semantic_atomic); /* global->shared */ //>> p_unit_test 7 //! global_store_dword %0:v[0-1], s1: undef, %0:v[0] storage:buffer //~gfx10_wg128(cu|wgp)! s_waitcnt_vscnt %0:null imm:0 //! ds_write_b32 %0:v[0], %0:v[0] storage:shared semantics:atomic bld.reset(program->create_and_insert_block()); bld.pseudo(aco_opcode::p_unit_test, Operand::c32(7)); store_global(); barrier(storage_buffer | storage_shared, semantic_release, scope_workgroup); store_shared(semantic_atomic); /* global->global, device scope, release in the atomic */ //>> p_unit_test 8 //! global_store_dword %0:v[0-1], s1: undef, %0:v[0] storage:buffer //! s_waitcnt_vscnt %0:null imm:0 //! global_store_dword %0:v[0-1], s1: undef, %0:v[0] storage:buffer semantics:release,atomic scope:device bld.reset(program->create_and_insert_block()); bld.pseudo(aco_opcode::p_unit_test, Operand::c32(8)); store_global(); store_global(semantic_atomic | semantic_release, scope_device); /* global->global, workgroup scope, control barrier */ //>> p_unit_test 9 //! global_store_dword %0:v[0-1], s1: undef, %0:v[0] storage:buffer //~gfx10_wg128wgp! s_waitcnt_vscnt %0:null imm:0 //~gfx10_wg128cu! s_waitcnt_depctr vm_vsrc(0) //! s_barrier bld.reset(program->create_and_insert_block()); bld.pseudo(aco_opcode::p_unit_test, Operand::c32(9)); store_global(); barrier(storage_buffer, semantic_release, scope_workgroup); barrier(0, 0, scope_invocation, scope_workgroup); bld.sopp(aco_opcode::s_barrier); /* shared->shared, workgroup scope, control barrier */ //>> p_unit_test 10 //! ds_write_b32 %0:v[0], %0:v[0] storage:shared //~gfx10_wg128cu! s_waitcnt_depctr vm_vsrc(0) //~gfx10_wg128wgp! s_waitcnt lgkmcnt(0) //! s_barrier bld.reset(program->create_and_insert_block()); bld.pseudo(aco_opcode::p_unit_test, Operand::c32(10)); store_shared(); barrier(storage_shared, semantic_release, scope_workgroup); barrier(0, 0, scope_invocation, scope_workgroup); bld.sopp(aco_opcode::s_barrier); /* global->global, device scope, delayed waitcnt */ //>> p_unit_test 11 //! global_store_dword %0:v[0-1], s1: undef, %0:v[0] storage:buffer //! v_nop //! s_waitcnt_vscnt %0:null imm:0 //! global_store_dword %0:v[0-1], s1: undef, %0:v[0] storage:buffer semantics:atomic bld.reset(program->create_and_insert_block()); bld.pseudo(aco_opcode::p_unit_test, Operand::c32(11)); store_global(); barrier(storage_buffer, semantic_release, scope_device); bld.vop1(aco_opcode::v_nop); store_global(semantic_atomic); //>> p_unit_test 12 //! global_store_dword %0:v[0-1], s1: undef, %0:v[0] storage:buffer //! v_nop //! s_waitcnt_vscnt %0:null imm:0 //! v_nop //! global_store_dword %0:v[0-1], s1: undef, %0:v[0] storage:buffer semantics:atomic bld.reset(program->create_and_insert_block()); bld.pseudo(aco_opcode::p_unit_test, Operand::c32(12)); store_global(); barrier(storage_buffer, semantic_release, scope_device); bld.vop1(aco_opcode::v_nop); wait_imm vs_imm; vs_imm.vs = 0; vs_imm.build_waitcnt(bld); bld.vop1(aco_opcode::v_nop); store_global(semantic_atomic); //>> p_unit_test 13 //! global_store_dword %0:v[0-1], s1: undef, %0:v[0] storage:buffer //! global_store_dword %0:v[0-1], s1: undef, %0:v[0] storage:buffer semantics:private //! v_nop //! s_waitcnt_vscnt %0:null imm:1 //! global_store_dword %0:v[0-1], s1: undef, %0:v[0] storage:buffer semantics:atomic bld.reset(program->create_and_insert_block()); bld.pseudo(aco_opcode::p_unit_test, Operand::c32(13)); store_global(); barrier(storage_buffer, semantic_release, scope_device); store_global(semantic_private); bld.vop1(aco_opcode::v_nop); /* break up clause */ store_global(semantic_atomic); finish_waitcnt_test(); } END_TEST BEGIN_TEST(insert_waitcnt.barrier.acquire) for (barrier_test_variant var : barrier_test_variants) { if (!setup_cs(NULL, GFX10, CHIP_UNKNOWN, var.name)) continue; program->workgroup_size = var.workgroup_size; program->wgp_mode = var.wgp; Definition dest0(PhysReg(260), v1); Definition dest1(PhysReg(261), v1); Definition dest2(PhysReg(262), v1); Operand addr(PhysReg(256), v2); Operand offset(PhysReg(256), v1); Operand data(PhysReg(256), v1); /* global->global, device scope */ //>> p_unit_test 0 //! v1: %0:v[4] = global_load_dword %0:v[0-1], s1: undef storage:buffer semantics:atomic //! s_waitcnt vmcnt(0) //! v1: %0:v[5] = global_load_dword %0:v[0-1], s1: undef storage:buffer bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0)); load_global(semantic_atomic); barrier(storage_buffer, semantic_acquire, scope_device); load_global() = dest1; //>> p_unit_test 1 //! v1: %0:v[4] = global_load_dword %0:v[0-1], s1: undef storage:buffer semantics:atomic //! s_waitcnt vmcnt(0) //! global_store_dword %0:v[0-1], s1: undef, %0:v[0] storage:buffer bld.reset(program->create_and_insert_block()); bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1)); load_global(semantic_atomic); barrier(storage_buffer, semantic_acquire, scope_device); store_global(); /* global->global, workgroup scope */ //>> p_unit_test 2 //! v1: %0:v[4] = global_load_dword %0:v[0-1], s1: undef storage:buffer semantics:atomic //~gfx10_wg128wgp! s_waitcnt vmcnt(0) //! v1: %0:v[5] = global_load_dword %0:v[0-1], s1: undef storage:buffer bld.reset(program->create_and_insert_block()); bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2)); load_global(semantic_atomic); barrier(storage_buffer, semantic_acquire, scope_workgroup); load_global() = dest1; //>> p_unit_test 3 //! v1: %0:v[4] = global_load_dword %0:v[0-1], s1: undef storage:buffer semantics:atomic //~gfx10_wg128wgp! s_waitcnt vmcnt(0) //! global_store_dword %0:v[0-1], s1: undef, %0:v[0] storage:buffer bld.reset(program->create_and_insert_block()); bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3)); load_global(semantic_atomic); barrier(storage_buffer, semantic_acquire, scope_workgroup); store_global(); /* shared->shared */ //>> p_unit_test 4 //! v1: %0:v[4] = ds_read_b32 %0:v[0] storage:shared semantics:atomic //! v1: %0:v[5] = ds_read_b32 %0:v[0] storage:shared bld.reset(program->create_and_insert_block()); bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4)); load_shared(semantic_atomic); barrier(storage_shared, semantic_acquire, scope_workgroup); load_shared() = dest1; //>> p_unit_test 5 //! v1: %0:v[4] = ds_read_b32 %0:v[0] storage:shared semantics:atomic //! ds_write_b32 %0:v[0], %0:v[0] storage:shared bld.reset(program->create_and_insert_block()); bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5)); load_shared(semantic_atomic); barrier(storage_shared, semantic_acquire, scope_workgroup); store_shared(); /* shared->global */ //>> p_unit_test 6 //! v1: %0:v[4] = ds_read_b32 %0:v[0] storage:shared semantics:atomic //~gfx10_wg128(cu|wgp)! s_waitcnt lgkmcnt(0) //! v1: %0:v[5] = global_load_dword %0:v[0-1], s1: undef storage:buffer bld.reset(program->create_and_insert_block()); bld.pseudo(aco_opcode::p_unit_test, Operand::c32(6)); load_shared(semantic_atomic); barrier(storage_buffer | storage_shared, semantic_acquire, scope_workgroup); load_global() = dest1; /* global->shared */ //>> p_unit_test 7 //! v1: %0:v[4] = global_load_dword %0:v[0-1], s1: undef storage:buffer semantics:atomic //~gfx10_wg128(cu|wgp)! s_waitcnt vmcnt(0) //! v1: %0:v[5] = ds_read_b32 %0:v[0] storage:shared bld.reset(program->create_and_insert_block()); bld.pseudo(aco_opcode::p_unit_test, Operand::c32(7)); load_global(semantic_atomic); barrier(storage_buffer | storage_shared, semantic_acquire, scope_workgroup); load_shared() = dest1; /* global->global, device scope, acquire in the atomic */ //>> p_unit_test 8 //! v1: %0:v[4] = global_load_dword %0:v[0-1], s1: undef storage:buffer semantics:acquire,atomic scope:device //! s_waitcnt vmcnt(0) //! v1: %0:v[5] = global_load_dword %0:v[0-1], s1: undef storage:buffer bld.reset(program->create_and_insert_block()); bld.pseudo(aco_opcode::p_unit_test, Operand::c32(8)); load_global(semantic_atomic | semantic_acquire, scope_device); load_global() = dest1; /* global->global, workgroup scope, control barrier */ //>> p_unit_test 9 //! s_barrier //! v1: %0:v[4] = global_load_dword %0:v[0-1], s1: undef storage:buffer bld.reset(program->create_and_insert_block()); bld.pseudo(aco_opcode::p_unit_test, Operand::c32(9)); barrier(0, 0, scope_invocation, scope_workgroup); bld.sopp(aco_opcode::s_barrier); barrier(storage_buffer, semantic_acquire, scope_workgroup); load_global(); /* global->global, device scope, delayed waitcnt */ //>> p_unit_test 10 //! v1: %0:v[4] = global_load_dword %0:v[0-1], s1: undef storage:buffer semantics:atomic //! v_nop //! s_waitcnt vmcnt(0) //! v1: %0:v[5] = global_load_dword %0:v[0-1], s1: undef storage:buffer bld.reset(program->create_and_insert_block()); bld.pseudo(aco_opcode::p_unit_test, Operand::c32(10)); load_global(semantic_atomic); barrier(storage_buffer, semantic_acquire, scope_device); bld.vop1(aco_opcode::v_nop); load_global() = dest1; //>> p_unit_test 11 //! v1: %0:v[4] = global_load_dword %0:v[0-1], s1: undef storage:buffer semantics:atomic //! v_nop //! s_waitcnt vmcnt(0) //! v_nop //! v1: %0:v[5] = global_load_dword %0:v[0-1], s1: undef storage:buffer bld.reset(program->create_and_insert_block()); bld.pseudo(aco_opcode::p_unit_test, Operand::c32(11)); load_global(semantic_atomic); barrier(storage_buffer, semantic_acquire, scope_device); bld.vop1(aco_opcode::v_nop); wait_imm vm_imm; vm_imm.vm = 0; vm_imm.build_waitcnt(bld); bld.vop1(aco_opcode::v_nop); load_global() = dest1; //>> p_unit_test 12 //! v1: %0:v[4] = global_load_dword %0:v[0-1], s1: undef storage:buffer semantics:atomic //! v1: %0:v[5] = global_load_dword %0:v[0-1], s1: undef storage:buffer semantics:private //! v_nop //! s_waitcnt vmcnt(1) //! v1: %0:v[6] = global_load_dword %0:v[0-1], s1: undef storage:buffer bld.reset(program->create_and_insert_block()); bld.pseudo(aco_opcode::p_unit_test, Operand::c32(12)); load_global(semantic_atomic); barrier(storage_buffer, semantic_acquire, scope_device); load_global(semantic_private) = dest1; bld.vop1(aco_opcode::v_nop); /* break up clause */ load_global() = dest2; finish_waitcnt_test(); } END_TEST