diff --git a/src/amd/compiler/aco_insert_waitcnt.cpp b/src/amd/compiler/aco_insert_waitcnt.cpp index c3d5d2ba583..a72e76594e8 100644 --- a/src/amd/compiler/aco_insert_waitcnt.cpp +++ b/src/amd/compiler/aco_insert_waitcnt.cpp @@ -523,8 +523,10 @@ finish_barriers(wait_ctx& ctx, wait_imm& imm, depctr_wait& depctr, Instruction* memory_sync_info sync) { if (ctx.bar_nonempty & (1 << barrier_info_release)) { + /* s_waitcnt instructions are subgroup-wide and no waitcnts are necessary within a subgroup, + * so we can delay this until we reach an atomic with workgroup or larger scope. */ uint16_t storage_release = - is_atomic_or_control_instr(ctx.program, instr, sync, semantic_release); + is_atomic_or_control_instr(ctx.program, instr, sync, semantic_release, scope_subgroup); u_foreach_bit (i, storage_release & ctx.bar[barrier_info_release].storage) finish_barrier_internal(ctx, imm, depctr, instr, &ctx.bar[barrier_info_release], i); } diff --git a/src/amd/compiler/aco_ir.cpp b/src/amd/compiler/aco_ir.cpp index ae03bf607e5..27d80c0bf8e 100644 --- a/src/amd/compiler/aco_ir.cpp +++ b/src/amd/compiler/aco_ir.cpp @@ -225,7 +225,7 @@ is_ordered_ps_done_sendmsg(const Instruction* instr) uint16_t is_atomic_or_control_instr(Program* program, const Instruction* instr, memory_sync_info sync, - unsigned semantic) + unsigned semantic, sync_scope ignore_scope) { bool is_acquire = semantic & semantic_acquire; bool is_release = semantic & semantic_release; @@ -254,7 +254,7 @@ is_atomic_or_control_instr(Program* program, const Instruction* instr, memory_sy if (instr->opcode == aco_opcode::s_sethalt) return cls & ~storage_shared; } - return (instr->isBarrier() && instr->barrier().exec_scope > scope_invocation) ? cls : 0; + return (instr->isBarrier() && instr->barrier().exec_scope > ignore_scope) ? 
cls : 0; } memory_sync_info diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h index 2f1c58f5c61..7b6cb035398 100644 --- a/src/amd/compiler/aco_ir.h +++ b/src/amd/compiler/aco_ir.h @@ -2028,7 +2028,8 @@ bool is_wait_export_ready(amd_gfx_level gfx_level, const Instruction* instr); class Program; uint16_t is_atomic_or_control_instr(Program* program, const Instruction* instr, - memory_sync_info sync, unsigned semantic); + memory_sync_info sync, unsigned semantic, + sync_scope ignore_scope = scope_invocation); memory_sync_info get_sync_info(const Instruction* instr);