aco: don't emit workgroup-scope p_barrier for single-wave workgroups

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Georg Lehmann <dadschoorse@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/41364>
This commit is contained in:
Rhys Perry 2026-05-18 16:41:54 +01:00 committed by Marge Bot
parent e6703f8e68
commit 49fb361c0a
4 changed files with 34 additions and 25 deletions

View file

@ -447,9 +447,7 @@ check_instr(wait_ctx& ctx, wait_imm& wait, Instruction* instr)
void
setup_barrier(wait_ctx& ctx, wait_imm& imm, memory_sync_info sync, bool is_acquire)
{
sync_scope subgroup_scope =
ctx.program->workgroup_size <= ctx.program->wave_size ? scope_workgroup : scope_subgroup;
if (sync.scope <= subgroup_scope)
if (sync.scope <= scope_subgroup)
return;
barrier_info& src = ctx.bar[is_acquire ? barrier_info_acquire_dep : barrier_info_release_dep];
@ -457,8 +455,8 @@ setup_barrier(wait_ctx& ctx, wait_imm& imm, memory_sync_info sync, bool is_acqui
wait_imm dst_imm;
uint16_t dst_events = 0;
u_foreach_bit (i, sync.storage & src.storage) {
/* LDS is private to the workgroup, so reduce the scope in that case. */
if (src.events[i] == event_lds && MIN2(sync.scope, scope_workgroup) <= subgroup_scope)
/* LDS is private to the workgroup, but sync.scope might be device scope. */
if (src.events[i] == event_lds && ctx.program->workgroup_size <= ctx.program->wave_size)
continue;
dst_imm.combine(src.imm[i]);

View file

@ -2955,8 +2955,7 @@ lower_to_hw_instr(Program* program)
/* Anything larger than a workgroup isn't possible. Anything
* smaller requires no instructions and this pseudo instruction
* would only be included to control optimizations. */
bool emit_s_barrier = barrier.exec_scope == scope_workgroup &&
program->workgroup_size > program->wave_size;
bool emit_s_barrier = barrier.exec_scope == scope_workgroup;
bld.insert(std::move(instr));
if (emit_s_barrier && ctx.program->gfx_level >= GFX12) {

View file

@ -2948,6 +2948,12 @@ emit_barrier(isel_context* ctx, nir_intrinsic_instr* instr)
assert(!(nir_semantics & (NIR_MEMORY_MAKE_AVAILABLE | NIR_MEMORY_MAKE_VISIBLE)));
assert(exec_scope != scope_workgroup || workgroup_scope_allowed);
if (ctx->program->workgroup_size <= ctx->program->wave_size) {
exec_scope = scope_subgroup;
if (mem_scope == scope_workgroup)
mem_scope = scope_subgroup;
}
bld.barrier(aco_opcode::p_barrier,
memory_sync_info((storage_class)storage, (memory_semantics)semantics, mem_scope),
exec_scope);

View file

@ -1215,6 +1215,9 @@ BEGIN_TEST(insert_waitcnt.barrier.release)
if (!setup_cs(NULL, GFX10, CHIP_UNKNOWN, var.name))
continue;
sync_scope workgroup_scope =
var.workgroup_size > program->wave_size ? scope_workgroup : scope_subgroup;
program->workgroup_size = var.workgroup_size;
program->wgp_mode = var.wgp;
@ -1252,7 +1255,7 @@ BEGIN_TEST(insert_waitcnt.barrier.release)
bld.reset(program->create_and_insert_block());
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2));
store_global();
barrier(storage_buffer, semantic_release, scope_workgroup);
barrier(storage_buffer, semantic_release, workgroup_scope);
store_global(semantic_atomic);
//>> p_unit_test 3
@ -1262,7 +1265,7 @@ BEGIN_TEST(insert_waitcnt.barrier.release)
bld.reset(program->create_and_insert_block());
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3));
load_global();
barrier(storage_buffer, semantic_release, scope_workgroup);
barrier(storage_buffer, semantic_release, workgroup_scope);
store_global(semantic_atomic);
/* shared->shared */
@ -1272,7 +1275,7 @@ BEGIN_TEST(insert_waitcnt.barrier.release)
bld.reset(program->create_and_insert_block());
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4));
store_shared();
barrier(storage_shared, semantic_release, scope_workgroup);
barrier(storage_shared, semantic_release, workgroup_scope);
store_shared(semantic_atomic);
//>> p_unit_test 5
@ -1281,7 +1284,7 @@ BEGIN_TEST(insert_waitcnt.barrier.release)
bld.reset(program->create_and_insert_block());
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5));
load_shared();
barrier(storage_shared, semantic_release, scope_workgroup);
barrier(storage_shared, semantic_release, workgroup_scope);
store_shared(semantic_atomic);
/* shared->global */
@ -1292,7 +1295,7 @@ BEGIN_TEST(insert_waitcnt.barrier.release)
bld.reset(program->create_and_insert_block());
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(6));
store_shared();
barrier(storage_buffer | storage_shared, semantic_release, scope_workgroup);
barrier(storage_buffer | storage_shared, semantic_release, workgroup_scope);
store_global(semantic_atomic);
/* global->shared */
@ -1303,7 +1306,7 @@ BEGIN_TEST(insert_waitcnt.barrier.release)
bld.reset(program->create_and_insert_block());
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(7));
store_global();
barrier(storage_buffer | storage_shared, semantic_release, scope_workgroup);
barrier(storage_buffer | storage_shared, semantic_release, workgroup_scope);
store_shared(semantic_atomic);
/* global->global, device scope, release in the atomic */
@ -1325,8 +1328,8 @@ BEGIN_TEST(insert_waitcnt.barrier.release)
bld.reset(program->create_and_insert_block());
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(9));
store_global();
barrier(storage_buffer, semantic_release, scope_workgroup);
barrier(0, 0, scope_invocation, scope_workgroup);
barrier(storage_buffer, semantic_release, workgroup_scope);
barrier(0, 0, scope_invocation, workgroup_scope);
bld.sopp(aco_opcode::s_barrier);
/* shared->shared, workgroup scope, control barrier */
@ -1338,8 +1341,8 @@ BEGIN_TEST(insert_waitcnt.barrier.release)
bld.reset(program->create_and_insert_block());
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(10));
store_shared();
barrier(storage_shared, semantic_release, scope_workgroup);
barrier(0, 0, scope_invocation, scope_workgroup);
barrier(storage_shared, semantic_release, workgroup_scope);
barrier(0, 0, scope_invocation, workgroup_scope);
bld.sopp(aco_opcode::s_barrier);
/* global->global, device scope, delayed waitcnt */
@ -1395,6 +1398,9 @@ BEGIN_TEST(insert_waitcnt.barrier.acquire)
if (!setup_cs(NULL, GFX10, CHIP_UNKNOWN, var.name))
continue;
sync_scope workgroup_scope =
var.workgroup_size > program->wave_size ? scope_workgroup : scope_subgroup;
program->workgroup_size = var.workgroup_size;
program->wgp_mode = var.wgp;
@ -1433,7 +1439,7 @@ BEGIN_TEST(insert_waitcnt.barrier.acquire)
bld.reset(program->create_and_insert_block());
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2));
load_global(semantic_atomic);
barrier(storage_buffer, semantic_acquire, scope_workgroup);
barrier(storage_buffer, semantic_acquire, workgroup_scope);
load_global() = dest1;
//>> p_unit_test 3
@ -1443,7 +1449,7 @@ BEGIN_TEST(insert_waitcnt.barrier.acquire)
bld.reset(program->create_and_insert_block());
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3));
load_global(semantic_atomic);
barrier(storage_buffer, semantic_acquire, scope_workgroup);
barrier(storage_buffer, semantic_acquire, workgroup_scope);
store_global();
/* shared->shared */
@ -1453,7 +1459,7 @@ BEGIN_TEST(insert_waitcnt.barrier.acquire)
bld.reset(program->create_and_insert_block());
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4));
load_shared(semantic_atomic);
barrier(storage_shared, semantic_acquire, scope_workgroup);
barrier(storage_shared, semantic_acquire, workgroup_scope);
load_shared() = dest1;
//>> p_unit_test 5
@ -1462,7 +1468,7 @@ BEGIN_TEST(insert_waitcnt.barrier.acquire)
bld.reset(program->create_and_insert_block());
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5));
load_shared(semantic_atomic);
barrier(storage_shared, semantic_acquire, scope_workgroup);
barrier(storage_shared, semantic_acquire, workgroup_scope);
store_shared();
/* shared->global */
@ -1473,7 +1479,7 @@ BEGIN_TEST(insert_waitcnt.barrier.acquire)
bld.reset(program->create_and_insert_block());
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(6));
load_shared(semantic_atomic);
barrier(storage_buffer | storage_shared, semantic_acquire, scope_workgroup);
barrier(storage_buffer | storage_shared, semantic_acquire, workgroup_scope);
load_global() = dest1;
/* global->shared */
@ -1484,7 +1490,7 @@ BEGIN_TEST(insert_waitcnt.barrier.acquire)
bld.reset(program->create_and_insert_block());
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(7));
load_global(semantic_atomic);
barrier(storage_buffer | storage_shared, semantic_acquire, scope_workgroup);
barrier(storage_buffer | storage_shared, semantic_acquire, workgroup_scope);
load_shared() = dest1;
/* global->global, device scope, acquire in the atomic */
@ -1503,9 +1509,9 @@ BEGIN_TEST(insert_waitcnt.barrier.acquire)
//! v1: %0:v[4] = global_load_dword %0:v[0-1], s1: undef storage:buffer
bld.reset(program->create_and_insert_block());
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(9));
barrier(0, 0, scope_invocation, scope_workgroup);
barrier(0, 0, scope_invocation, workgroup_scope);
bld.sopp(aco_opcode::s_barrier);
barrier(storage_buffer, semantic_acquire, scope_workgroup);
barrier(storage_buffer, semantic_acquire, workgroup_scope);
load_global();
/* global->global, device scope, delayed waitcnt */