diff --git a/src/amd/compiler/aco_insert_waitcnt.cpp b/src/amd/compiler/aco_insert_waitcnt.cpp index e56c876e783..16f51e2bac2 100644 --- a/src/amd/compiler/aco_insert_waitcnt.cpp +++ b/src/amd/compiler/aco_insert_waitcnt.cpp @@ -350,7 +350,7 @@ check_instr(wait_ctx& ctx, wait_imm& wait, Instruction* instr) * VMEM loads do not write the same lanes. Since GFX11, we track VMEM operations on the * linear CFG, so this is difficult */ uint8_t vmem_type = get_vmem_type(ctx.gfx_level, instr); - if (vmem_type && ctx.gfx_level < GFX12) { + if (vmem_type) { wait_event event = get_vmem_event(ctx, instr, vmem_type); wait_type type = (wait_type)(ffs(ctx.info->get_counters_for_event(event)) - 1); @@ -359,7 +359,13 @@ check_instr(wait_ctx& ctx, wait_imm& wait, Instruction* instr) bool type_matches = type != wait_type_vm || (it->second.vmem_types == vmem_type && util_bitcount(vmem_type) == 1); - if (event_matches && type_matches) + bool different_halves = false; + if (event == event_vmem && event_matches) { + uint32_t mask = (get_vmem_mask(ctx, instr) >> (j * 2)) & 0x3; + different_halves = !(mask & it->second.vm_mask); + } + + if ((event_matches && type_matches && ctx.gfx_level < GFX12) || different_halves) reg_imm[type] = wait_imm::unset_counter; } diff --git a/src/amd/compiler/tests/test_insert_waitcnt.cpp b/src/amd/compiler/tests/test_insert_waitcnt.cpp index 8afecdbb879..c0520bd62e7 100644 --- a/src/amd/compiler/tests/test_insert_waitcnt.cpp +++ b/src/amd/compiler/tests/test_insert_waitcnt.cpp @@ -612,3 +612,82 @@ BEGIN_TEST(insert_waitcnt.vmem_ds) finish_waitcnt_test(); END_TEST + +BEGIN_TEST(insert_waitcnt.waw.vmem_different_halves) + if (!setup_cs(NULL, GFX12)) + return; + + Definition def_v4_lo(PhysReg(260), v2b); + Definition def_v4_hi(PhysReg(260).advance(2), v2b); + Operand op_v0(PhysReg(256), v1); + Operand desc_s4(PhysReg(0), s4); + Operand desc_s8(PhysReg(8), s8); + + //>> p_unit_test 0 + //! v2b: %0:v[4][0:16] = buffer_load_short_d16 %0:s[0-3], %0:v[0], 0 + //! v2b: %0:v[4][16:32] = buffer_load_short_d16_hi %0:s[0-3], %0:v[0], 0 + bld.pseudo(aco_opcode::p_unit_test, Operand::zero()); + bld.mubuf(aco_opcode::buffer_load_short_d16, def_v4_lo, desc_s4, op_v0, Operand::zero(), 0, + false); + bld.mubuf(aco_opcode::buffer_load_short_d16_hi, def_v4_hi, desc_s4, op_v0, Operand::zero(), 0, + false); + + //>> p_unit_test 1 + //! v2b: %0:v[4][16:32] = buffer_load_short_d16_hi %0:s[0-3], %0:v[0], 0 + //! v2b: %0:v[4][0:16] = buffer_load_short_d16 %0:s[0-3], %0:v[0], 0 + bld.reset(program->create_and_insert_block()); + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1)); + bld.mubuf(aco_opcode::buffer_load_short_d16_hi, def_v4_hi, desc_s4, op_v0, Operand::zero(), 0, + false); + bld.mubuf(aco_opcode::buffer_load_short_d16, def_v4_lo, desc_s4, op_v0, Operand::zero(), 0, + false); + + //>> p_unit_test 2 + //! v2b: %0:v[4][0:16] = buffer_load_short_d16 %0:s[0-3], %0:v[0], 0 + //! s_wait_loadcnt imm:0 + //! v2b: %0:v[4][0:16] = buffer_load_short_d16 %0:s[0-3], %0:v[0], 0 + bld.reset(program->create_and_insert_block()); + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2)); + bld.mubuf(aco_opcode::buffer_load_short_d16, def_v4_lo, desc_s4, op_v0, Operand::zero(), 0, + false); + bld.mubuf(aco_opcode::buffer_load_short_d16, def_v4_lo, desc_s4, op_v0, Operand::zero(), 0, + false); + + //>> p_unit_test 3 + //! v2b: %0:v[4][16:32] = buffer_load_short_d16_hi %0:s[0-3], %0:v[0], 0 + //! s_wait_loadcnt imm:0 + //! v2b: %0:v[4][16:32] = buffer_load_short_d16_hi %0:s[0-3], %0:v[0], 0 + bld.reset(program->create_and_insert_block()); + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3)); + bld.mubuf(aco_opcode::buffer_load_short_d16_hi, def_v4_hi, desc_s4, op_v0, Operand::zero(), 0, + false); + bld.mubuf(aco_opcode::buffer_load_short_d16_hi, def_v4_hi, desc_s4, op_v0, Operand::zero(), 0, + false); + + //>> p_unit_test 4 + //! v2b: %0:v[4][0:16] = image_sample %0:s[8-15], %0:s[0-3], v1: undef, %0:v[0] 1d d16 + //! s_wait_samplecnt imm:0 + //! v2b: %0:v[4][16:32] = buffer_load_short_d16_hi %0:s[0-3], %0:v[0], 0 + bld.reset(program->create_and_insert_block()); + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4)); + Instruction* instr = + bld.mimg(aco_opcode::image_sample, def_v4_lo, desc_s8, desc_s4, Operand(v1), op_v0); + instr->mimg().dmask = 0x1; + instr->mimg().d16 = true; + bld.mubuf(aco_opcode::buffer_load_short_d16_hi, def_v4_hi, desc_s4, op_v0, Operand::zero(), 0, + false); + + //>> p_unit_test 5 + //! v2b: %0:v[4][16:32] = buffer_load_short_d16_hi %0:s[0-3], %0:v[0], 0 + //! s_wait_loadcnt imm:0 + //! v2b: %0:v[4][0:16] = image_sample %0:s[8-15], %0:s[0-3], v1: undef, %0:v[0] 1d d16 + bld.reset(program->create_and_insert_block()); + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5)); + bld.mubuf(aco_opcode::buffer_load_short_d16_hi, def_v4_hi, desc_s4, op_v0, Operand::zero(), 0, + false); + instr = bld.mimg(aco_opcode::image_sample, def_v4_lo, desc_s8, desc_s4, Operand(v1), op_v0); + instr->mimg().dmask = 0x1; + instr->mimg().d16 = true; + + finish_waitcnt_test(); +END_TEST