aco: don't emit waitcnts before subgroup-scope execution barriers

This delays the waitcnt for has_attr_ring_wait_bug by a few instructions.

fossil-db (gfx1201):
Totals from 9 (0.00% of 208640) affected shaders:
Instrs: 19352 -> 19506 (+0.80%)
CodeSize: 101180 -> 101716 (+0.53%)
Latency: 660221 -> 678782 (+2.81%); split: -0.00%, +2.81%
InvThroughput: 95106 -> 97398 (+2.41%)

fossil-db (navi33):
Totals from 58834 (28.20% of 208626) affected shaders:
Instrs: 22424304 -> 22424571 (+0.00%)
CodeSize: 110198112 -> 110199184 (+0.00%)
Latency: 115894319 -> 126491124 (+9.14%); split: -0.00%, +9.14%
InvThroughput: 19424631 -> 19754358 (+1.70%); split: -0.00%, +1.70%

I don't think the stats are very accurate. This change often seems to
move the s_waitcnt down into a divergent branch, but the wait still
happens later if the branch isn't taken, so the wait is counted twice.

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Georg Lehmann <dadschoorse@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/41364>
This commit is contained in:
Rhys Perry 2026-05-18 16:44:27 +01:00 committed by Marge Bot
parent 3676c3860e
commit 7c6be36cf4
3 changed files with 7 additions and 4 deletions

View file

@ -523,8 +523,10 @@ finish_barriers(wait_ctx& ctx, wait_imm& imm, depctr_wait& depctr, Instruction*
memory_sync_info sync)
{
if (ctx.bar_nonempty & (1 << barrier_info_release)) {
/* s_waitcnt is subgroup-wide, so no waitcnts are necessary within a subgroup and we can
* delay this until we reach a workgroup or larger scope atomic. */
uint16_t storage_release =
is_atomic_or_control_instr(ctx.program, instr, sync, semantic_release);
is_atomic_or_control_instr(ctx.program, instr, sync, semantic_release, scope_subgroup);
u_foreach_bit (i, storage_release & ctx.bar[barrier_info_release].storage)
finish_barrier_internal(ctx, imm, depctr, instr, &ctx.bar[barrier_info_release], i);
}

View file

@ -225,7 +225,7 @@ is_ordered_ps_done_sendmsg(const Instruction* instr)
uint16_t
is_atomic_or_control_instr(Program* program, const Instruction* instr, memory_sync_info sync,
unsigned semantic)
unsigned semantic, sync_scope ignore_scope)
{
bool is_acquire = semantic & semantic_acquire;
bool is_release = semantic & semantic_release;
@ -254,7 +254,7 @@ is_atomic_or_control_instr(Program* program, const Instruction* instr, memory_sy
if (instr->opcode == aco_opcode::s_sethalt)
return cls & ~storage_shared;
}
return (instr->isBarrier() && instr->barrier().exec_scope > scope_invocation) ? cls : 0;
return (instr->isBarrier() && instr->barrier().exec_scope > ignore_scope) ? cls : 0;
}
memory_sync_info

View file

@ -2028,7 +2028,8 @@ bool is_wait_export_ready(amd_gfx_level gfx_level, const Instruction* instr);
class Program;
uint16_t is_atomic_or_control_instr(Program* program, const Instruction* instr,
memory_sync_info sync, unsigned semantic);
memory_sync_info sync, unsigned semantic,
sync_scope ignore_scope = scope_invocation);
memory_sync_info get_sync_info(const Instruction* instr);