From 7c6be36cf4d2d5fdc5f5ca18cae6d7516ec5ce05 Mon Sep 17 00:00:00 2001
From: Rhys Perry <pendingchaos02@gmail.com>
Date: Mon, 18 May 2026 16:44:27 +0100
Subject: [PATCH] aco: don't emit waitcnts before subgroup-scope execution
 barriers
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This delays the waitcnt for has_attr_ring_wait_bug by a few instructions.

fossil-db (gfx1201):
Totals from 9 (0.00% of 208640) affected shaders:
Instrs: 19352 -> 19506 (+0.80%)
CodeSize: 101180 -> 101716 (+0.53%)
Latency: 660221 -> 678782 (+2.81%); split: -0.00%, +2.81%
InvThroughput: 95106 -> 97398 (+2.41%)

fossil-db (navi33):
Totals from 58834 (28.20% of 208626) affected shaders:
Instrs: 22424304 -> 22424571 (+0.00%)
CodeSize: 110198112 -> 110199184 (+0.00%)
Latency: 115894319 -> 126491124 (+9.14%); split: -0.00%, +9.14%
InvThroughput: 19424631 -> 19754358 (+1.70%); split: -0.00%, +1.70%

I don't think the stats are very accurate. This often seems to move the
s_waitcnt down into a divergent branch, but the wait still happens later
if the branch isn't taken, so the wait is counted twice.

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Georg Lehmann <dadschoorse@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/41364>
---
 src/amd/compiler/aco_insert_waitcnt.cpp | 4 +++-
 src/amd/compiler/aco_ir.cpp             | 4 ++--
 src/amd/compiler/aco_ir.h               | 3 ++-
 3 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/src/amd/compiler/aco_insert_waitcnt.cpp b/src/amd/compiler/aco_insert_waitcnt.cpp
index c3d5d2ba583..a72e76594e8 100644
--- a/src/amd/compiler/aco_insert_waitcnt.cpp
+++ b/src/amd/compiler/aco_insert_waitcnt.cpp
@@ -523,8 +523,10 @@ finish_barriers(wait_ctx& ctx, wait_imm& imm, depctr_wait& depctr, Instruction*
                 memory_sync_info sync)
 {
    if (ctx.bar_nonempty & (1 << barrier_info_release)) {
+      /* s_waitcnt instructions are subgroup-wide and no waitcnts are necessary within a subgroup,
+       * so we can delay this until we reach a workgroup or larger scope atomic. */
       uint16_t storage_release =
-         is_atomic_or_control_instr(ctx.program, instr, sync, semantic_release);
+         is_atomic_or_control_instr(ctx.program, instr, sync, semantic_release, scope_subgroup);
       u_foreach_bit (i, storage_release & ctx.bar[barrier_info_release].storage)
          finish_barrier_internal(ctx, imm, depctr, instr, &ctx.bar[barrier_info_release], i);
    }
diff --git a/src/amd/compiler/aco_ir.cpp b/src/amd/compiler/aco_ir.cpp
index ae03bf607e5..27d80c0bf8e 100644
--- a/src/amd/compiler/aco_ir.cpp
+++ b/src/amd/compiler/aco_ir.cpp
@@ -225,7 +225,7 @@ is_ordered_ps_done_sendmsg(const Instruction* instr)
 
 uint16_t
 is_atomic_or_control_instr(Program* program, const Instruction* instr, memory_sync_info sync,
-                           unsigned semantic)
+                           unsigned semantic, sync_scope ignore_scope)
 {
    bool is_acquire = semantic & semantic_acquire;
    bool is_release = semantic & semantic_release;
@@ -254,7 +254,7 @@ is_atomic_or_control_instr(Program* program, const Instruction* instr, memory_sy
       if (instr->opcode == aco_opcode::s_sethalt)
          return cls & ~storage_shared;
    }
-   return (instr->isBarrier() && instr->barrier().exec_scope > scope_invocation) ? cls : 0;
+   return (instr->isBarrier() && instr->barrier().exec_scope > ignore_scope) ? cls : 0;
 }
 
 memory_sync_info
diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h
index 2f1c58f5c61..7b6cb035398 100644
--- a/src/amd/compiler/aco_ir.h
+++ b/src/amd/compiler/aco_ir.h
@@ -2028,7 +2028,8 @@ bool is_wait_export_ready(amd_gfx_level gfx_level, const Instruction* instr);
 class Program;
 
 uint16_t is_atomic_or_control_instr(Program* program, const Instruction* instr,
-                                    memory_sync_info sync, unsigned semantic);
+                                    memory_sync_info sync, unsigned semantic,
+                                    sync_scope ignore_scope = scope_invocation);
 
 memory_sync_info get_sync_info(const Instruction* instr);