From 9649deb50e1e83d210966a6c98a23f21243c622b Mon Sep 17 00:00:00 2001
From: Rhys Perry <pendingchaos02@gmail.com>
Date: Tue, 29 Apr 2025 17:37:59 +0100
Subject: [PATCH] aco: skip waitcnt between two vmem writing different halves

fossil-db (gfx1201):
Totals from 4 (0.01% of 79653) affected shaders:
Instrs: 41374 -> 41380 (+0.01%); split: -0.01%, +0.02%
CodeSize: 238912 -> 238924 (+0.01%); split: -0.01%, +0.01%
Latency: 706714 -> 706410 (-0.04%)
InvThroughput: 352269 -> 352118 (-0.04%)
VClause: 803 -> 798 (-0.62%)

fossil-db (navi31):
Totals from 0 (0.00% of 79653) affected shaders:

fossil-db (navi21):
Totals from 0 (0.00% of 79653) affected shaders:

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Closes: https://gitlab.freedesktop.org/mesa/mesa/-/issues/13028
Reviewed-by: Georg Lehmann <dadschoorse@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/34978>
---
 src/amd/compiler/aco_insert_waitcnt.cpp       | 10 ++-
 .../compiler/tests/test_insert_waitcnt.cpp    | 79 +++++++++++++++++++
 2 files changed, 87 insertions(+), 2 deletions(-)

diff --git a/src/amd/compiler/aco_insert_waitcnt.cpp b/src/amd/compiler/aco_insert_waitcnt.cpp
index e56c876e783..16f51e2bac2 100644
--- a/src/amd/compiler/aco_insert_waitcnt.cpp
+++ b/src/amd/compiler/aco_insert_waitcnt.cpp
@@ -350,7 +350,7 @@ check_instr(wait_ctx& ctx, wait_imm& wait, Instruction* instr)
           * VMEM loads do not write the same lanes. Since GFX11, we track VMEM operations on the
           * linear CFG, so this is difficult */
          uint8_t vmem_type = get_vmem_type(ctx.gfx_level, instr);
-         if (vmem_type && ctx.gfx_level < GFX12) {
+         if (vmem_type) {
             wait_event event = get_vmem_event(ctx, instr, vmem_type);
             wait_type type = (wait_type)(ffs(ctx.info->get_counters_for_event(event)) - 1);
 
@@ -359,7 +359,13 @@ check_instr(wait_ctx& ctx, wait_imm& wait, Instruction* instr)
             bool type_matches = type != wait_type_vm || (it->second.vmem_types == vmem_type &&
                                                          util_bitcount(vmem_type) == 1);
 
-            if (event_matches && type_matches)
+            bool different_halves = false;
+            if (event == event_vmem && event_matches) {
+               uint32_t mask = (get_vmem_mask(ctx, instr) >> (j * 2)) & 0x3;
+               different_halves = !(mask & it->second.vm_mask);
+            }
+
+            if ((event_matches && type_matches && ctx.gfx_level < GFX12) || different_halves)
                reg_imm[type] = wait_imm::unset_counter;
          }
 
diff --git a/src/amd/compiler/tests/test_insert_waitcnt.cpp b/src/amd/compiler/tests/test_insert_waitcnt.cpp
index 8afecdbb879..c0520bd62e7 100644
--- a/src/amd/compiler/tests/test_insert_waitcnt.cpp
+++ b/src/amd/compiler/tests/test_insert_waitcnt.cpp
@@ -612,3 +612,82 @@ BEGIN_TEST(insert_waitcnt.vmem_ds)
 
    finish_waitcnt_test();
 END_TEST
+
+BEGIN_TEST(insert_waitcnt.waw.vmem_different_halves)
+   if (!setup_cs(NULL, GFX12)) /* write-after-write on the two 16-bit halves of v4, on GFX12 */
+      return; /* setup failed: nothing to test */
+
+   Definition def_v4_lo(PhysReg(260), v2b); /* printed as v[4][0:16] below */
+   Definition def_v4_hi(PhysReg(260).advance(2), v2b); /* printed as v[4][16:32] below */
+   Operand op_v0(PhysReg(256), v1); /* printed as v[0] below */
+   Operand desc_s4(PhysReg(0), s4); /* printed as s[0-3] below */
+   Operand desc_s8(PhysReg(8), s8); /* printed as s[8-15] below */
+
+   //>> p_unit_test 0
+   //! v2b: %0:v[4][0:16] = buffer_load_short_d16 %0:s[0-3], %0:v[0], 0
+   //! v2b: %0:v[4][16:32] = buffer_load_short_d16_hi %0:s[0-3], %0:v[0], 0
+   bld.pseudo(aco_opcode::p_unit_test, Operand::zero()); /* lo then hi: no wait expected */
+   bld.mubuf(aco_opcode::buffer_load_short_d16, def_v4_lo, desc_s4, op_v0, Operand::zero(), 0,
+             false);
+   bld.mubuf(aco_opcode::buffer_load_short_d16_hi, def_v4_hi, desc_s4, op_v0, Operand::zero(), 0,
+             false);
+
+   //>> p_unit_test 1
+   //! v2b: %0:v[4][16:32] = buffer_load_short_d16_hi %0:s[0-3], %0:v[0], 0
+   //! v2b: %0:v[4][0:16] = buffer_load_short_d16 %0:s[0-3], %0:v[0], 0
+   bld.reset(program->create_and_insert_block()); /* each case gets its own block */
+   bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1)); /* hi then lo: no wait expected */
+   bld.mubuf(aco_opcode::buffer_load_short_d16_hi, def_v4_hi, desc_s4, op_v0, Operand::zero(), 0,
+             false);
+   bld.mubuf(aco_opcode::buffer_load_short_d16, def_v4_lo, desc_s4, op_v0, Operand::zero(), 0,
+             false);
+
+   //>> p_unit_test 2
+   //! v2b: %0:v[4][0:16] = buffer_load_short_d16 %0:s[0-3], %0:v[0], 0
+   //! s_wait_loadcnt imm:0
+   //! v2b: %0:v[4][0:16] = buffer_load_short_d16 %0:s[0-3], %0:v[0], 0
+   bld.reset(program->create_and_insert_block());
+   bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2)); /* lo written twice: loadcnt wait */
+   bld.mubuf(aco_opcode::buffer_load_short_d16, def_v4_lo, desc_s4, op_v0, Operand::zero(), 0,
+             false);
+   bld.mubuf(aco_opcode::buffer_load_short_d16, def_v4_lo, desc_s4, op_v0, Operand::zero(), 0,
+             false);
+
+   //>> p_unit_test 3
+   //! v2b: %0:v[4][16:32] = buffer_load_short_d16_hi %0:s[0-3], %0:v[0], 0
+   //! s_wait_loadcnt imm:0
+   //! v2b: %0:v[4][16:32] = buffer_load_short_d16_hi %0:s[0-3], %0:v[0], 0
+   bld.reset(program->create_and_insert_block());
+   bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3)); /* hi written twice: loadcnt wait */
+   bld.mubuf(aco_opcode::buffer_load_short_d16_hi, def_v4_hi, desc_s4, op_v0, Operand::zero(), 0,
+             false);
+   bld.mubuf(aco_opcode::buffer_load_short_d16_hi, def_v4_hi, desc_s4, op_v0, Operand::zero(), 0,
+             false);
+
+   //>> p_unit_test 4
+   //! v2b: %0:v[4][0:16] = image_sample %0:s[8-15], %0:s[0-3],  v1: undef, %0:v[0] 1d d16
+   //! s_wait_samplecnt imm:0
+   //! v2b: %0:v[4][16:32] = buffer_load_short_d16_hi %0:s[0-3], %0:v[0], 0
+   bld.reset(program->create_and_insert_block());
+   bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4)); /* sample lo, load hi: samplecnt wait */
+   Instruction* instr =
+      bld.mimg(aco_opcode::image_sample, def_v4_lo, desc_s8, desc_s4, Operand(v1), op_v0);
+   instr->mimg().dmask = 0x1;
+   instr->mimg().d16 = true; /* appears as "d16" in the expected image_sample line */
+   bld.mubuf(aco_opcode::buffer_load_short_d16_hi, def_v4_hi, desc_s4, op_v0, Operand::zero(), 0,
+             false);
+
+   //>> p_unit_test 5
+   //! v2b: %0:v[4][16:32] = buffer_load_short_d16_hi %0:s[0-3], %0:v[0], 0
+   //! s_wait_loadcnt imm:0
+   //! v2b: %0:v[4][0:16] = image_sample %0:s[8-15], %0:s[0-3],  v1: undef, %0:v[0] 1d d16
+   bld.reset(program->create_and_insert_block());
+   bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5)); /* load hi, sample lo: loadcnt wait */
+   bld.mubuf(aco_opcode::buffer_load_short_d16_hi, def_v4_hi, desc_s4, op_v0, Operand::zero(), 0,
+             false);
+   instr = bld.mimg(aco_opcode::image_sample, def_v4_lo, desc_s8, desc_s4, Operand(v1), op_v0);
+   instr->mimg().dmask = 0x1;
+   instr->mimg().d16 = true; /* appears as "d16" in the expected image_sample line */
+
+   finish_waitcnt_test(); /* NOTE(review): presumably runs the pass and checks the //! lines */
+END_TEST