From 7c6be36cf4d2d5fdc5f5ca18cae6d7516ec5ce05 Mon Sep 17 00:00:00 2001 From: Rhys Perry Date: Mon, 18 May 2026 16:44:27 +0100 Subject: [PATCH] aco: don't emit waitcnts before subgroup-scope execution barriers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This delays the waitcnt for has_attr_ring_wait_bug by a few instructions. fossil-db (gfx1201): Totals from 9 (0.00% of 208640) affected shaders: Instrs: 19352 -> 19506 (+0.80%) CodeSize: 101180 -> 101716 (+0.53%) Latency: 660221 -> 678782 (+2.81%); split: -0.00%, +2.81% InvThroughput: 95106 -> 97398 (+2.41%) fossil-db (navi33): Totals from 58834 (28.20% of 208626) affected shaders: Instrs: 22424304 -> 22424571 (+0.00%) CodeSize: 110198112 -> 110199184 (+0.00%) Latency: 115894319 -> 126491124 (+9.14%); split: -0.00%, +9.14% InvThroughput: 19424631 -> 19754358 (+1.70%); split: -0.00%, +1.70% I don't think the stats are very accurate. This seems to often move the s_waitcnt down into a divergent branch, but the wait still happens later if the branch isn't taken, so the wait is counted twice. Signed-off-by: Rhys Perry Reviewed-by: Georg Lehmann Reviewed-by: Daniel Schürmann Part-of: --- src/amd/compiler/aco_insert_waitcnt.cpp | 4 +++- src/amd/compiler/aco_ir.cpp | 4 ++-- src/amd/compiler/aco_ir.h | 3 ++- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/src/amd/compiler/aco_insert_waitcnt.cpp b/src/amd/compiler/aco_insert_waitcnt.cpp index c3d5d2ba583..a72e76594e8 100644 --- a/src/amd/compiler/aco_insert_waitcnt.cpp +++ b/src/amd/compiler/aco_insert_waitcnt.cpp @@ -523,8 +523,10 @@ finish_barriers(wait_ctx& ctx, wait_imm& imm, depctr_wait& depctr, Instruction* memory_sync_info sync) { if (ctx.bar_nonempty & (1 << barrier_info_release)) { + /* s_waitcnt are subgroup-wide and no waitcnts are necessary within a subgroup, so we can + * delay this until we reach a workgroup or larger scope atomic. */ uint16_t storage_release = - is_atomic_or_control_instr(ctx.program, instr, sync, semantic_release); + is_atomic_or_control_instr(ctx.program, instr, sync, semantic_release, scope_subgroup); u_foreach_bit (i, storage_release & ctx.bar[barrier_info_release].storage) finish_barrier_internal(ctx, imm, depctr, instr, &ctx.bar[barrier_info_release], i); } diff --git a/src/amd/compiler/aco_ir.cpp b/src/amd/compiler/aco_ir.cpp index ae03bf607e5..27d80c0bf8e 100644 --- a/src/amd/compiler/aco_ir.cpp +++ b/src/amd/compiler/aco_ir.cpp @@ -225,7 +225,7 @@ is_ordered_ps_done_sendmsg(const Instruction* instr) uint16_t is_atomic_or_control_instr(Program* program, const Instruction* instr, memory_sync_info sync, - unsigned semantic) + unsigned semantic, sync_scope ignore_scope) { bool is_acquire = semantic & semantic_acquire; bool is_release = semantic & semantic_release; @@ -254,7 +254,7 @@ is_atomic_or_control_instr(Program* program, const Instruction* instr, memory_sy if (instr->opcode == aco_opcode::s_sethalt) return cls & ~storage_shared; } - return (instr->isBarrier() && instr->barrier().exec_scope > scope_invocation) ? cls : 0; + return (instr->isBarrier() && instr->barrier().exec_scope > ignore_scope) ? cls : 0; } memory_sync_info diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h index 2f1c58f5c61..7b6cb035398 100644 --- a/src/amd/compiler/aco_ir.h +++ b/src/amd/compiler/aco_ir.h @@ -2028,7 +2028,8 @@ bool is_wait_export_ready(amd_gfx_level gfx_level, const Instruction* instr); class Program; uint16_t is_atomic_or_control_instr(Program* program, const Instruction* instr, - memory_sync_info sync, unsigned semantic); + memory_sync_info sync, unsigned semantic, + sync_scope ignore_scope = scope_invocation); memory_sync_info get_sync_info(const Instruction* instr);