From 49fb361c0a42cf93020bf4d20c9d181284006282 Mon Sep 17 00:00:00 2001 From: Rhys Perry Date: Mon, 18 May 2026 16:41:54 +0100 Subject: [PATCH] aco: don't emit workgroup-scope p_barrier for single-wave workgroups MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Rhys Perry Reviewed-by: Georg Lehmann Reviewed-by: Daniel Schürmann Part-of: --- src/amd/compiler/aco_insert_waitcnt.cpp | 8 ++-- src/amd/compiler/aco_lower_to_hw_instr.cpp | 3 +- .../aco_select_nir_intrinsics.cpp | 6 +++ .../compiler/tests/test_insert_waitcnt.cpp | 42 +++++++++++-------- 4 files changed, 34 insertions(+), 25 deletions(-) diff --git a/src/amd/compiler/aco_insert_waitcnt.cpp b/src/amd/compiler/aco_insert_waitcnt.cpp index 7bae3fb4428..022828c978e 100644 --- a/src/amd/compiler/aco_insert_waitcnt.cpp +++ b/src/amd/compiler/aco_insert_waitcnt.cpp @@ -447,9 +447,7 @@ check_instr(wait_ctx& ctx, wait_imm& wait, Instruction* instr) void setup_barrier(wait_ctx& ctx, wait_imm& imm, memory_sync_info sync, bool is_acquire) { - sync_scope subgroup_scope = - ctx.program->workgroup_size <= ctx.program->wave_size ? scope_workgroup : scope_subgroup; - if (sync.scope <= subgroup_scope) + if (sync.scope <= scope_subgroup) return; barrier_info& src = ctx.bar[is_acquire ? barrier_info_acquire_dep : barrier_info_release_dep]; @@ -457,8 +455,8 @@ setup_barrier(wait_ctx& ctx, wait_imm& imm, memory_sync_info sync, bool is_acqui wait_imm dst_imm; uint16_t dst_events = 0; u_foreach_bit (i, sync.storage & src.storage) { - /* LDS is private to the workgroup, so reduce the scope in that case. */ - if (src.events[i] == event_lds && MIN2(sync.scope, scope_workgroup) <= subgroup_scope) + /* LDS is private to the workgroup, but sync.scope might be device scope. */ + if (src.events[i] == event_lds && ctx.program->workgroup_size <= ctx.program->wave_size) continue; dst_imm.combine(src.imm[i]); diff --git a/src/amd/compiler/aco_lower_to_hw_instr.cpp b/src/amd/compiler/aco_lower_to_hw_instr.cpp index 8bc97931789..fdff62e8c11 100644 --- a/src/amd/compiler/aco_lower_to_hw_instr.cpp +++ b/src/amd/compiler/aco_lower_to_hw_instr.cpp @@ -2955,8 +2955,7 @@ lower_to_hw_instr(Program* program) /* Anything larger than a workgroup isn't possible. Anything * smaller requires no instructions and this pseudo instruction * would only be included to control optimizations. */ - bool emit_s_barrier = barrier.exec_scope == scope_workgroup && - program->workgroup_size > program->wave_size; + bool emit_s_barrier = barrier.exec_scope == scope_workgroup; bld.insert(std::move(instr)); if (emit_s_barrier && ctx.program->gfx_level >= GFX12) { diff --git a/src/amd/compiler/instruction_selection/aco_select_nir_intrinsics.cpp b/src/amd/compiler/instruction_selection/aco_select_nir_intrinsics.cpp index b6e6ad8e8c4..0d47fefbef8 100644 --- a/src/amd/compiler/instruction_selection/aco_select_nir_intrinsics.cpp +++ b/src/amd/compiler/instruction_selection/aco_select_nir_intrinsics.cpp @@ -2948,6 +2948,12 @@ emit_barrier(isel_context* ctx, nir_intrinsic_instr* instr) assert(!(nir_semantics & (NIR_MEMORY_MAKE_AVAILABLE | NIR_MEMORY_MAKE_VISIBLE))); assert(exec_scope != scope_workgroup || workgroup_scope_allowed); + if (ctx->program->workgroup_size <= ctx->program->wave_size) { + exec_scope = scope_subgroup; + if (mem_scope == scope_workgroup) + mem_scope = scope_subgroup; + } + bld.barrier(aco_opcode::p_barrier, memory_sync_info((storage_class)storage, (memory_semantics)semantics, mem_scope), exec_scope); diff --git a/src/amd/compiler/tests/test_insert_waitcnt.cpp b/src/amd/compiler/tests/test_insert_waitcnt.cpp index 86d60e6a2ea..9aa40a89764 100644 --- a/src/amd/compiler/tests/test_insert_waitcnt.cpp +++ b/src/amd/compiler/tests/test_insert_waitcnt.cpp @@ -1215,6 +1215,9 @@ BEGIN_TEST(insert_waitcnt.barrier.release) if (!setup_cs(NULL, GFX10, CHIP_UNKNOWN, var.name)) continue; + sync_scope workgroup_scope = + var.workgroup_size > program->wave_size ? scope_workgroup : scope_subgroup; + program->workgroup_size = var.workgroup_size; program->wgp_mode = var.wgp; @@ -1252,7 +1255,7 @@ BEGIN_TEST(insert_waitcnt.barrier.release) bld.reset(program->create_and_insert_block()); bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2)); store_global(); - barrier(storage_buffer, semantic_release, scope_workgroup); + barrier(storage_buffer, semantic_release, workgroup_scope); store_global(semantic_atomic); //>> p_unit_test 3 @@ -1262,7 +1265,7 @@ BEGIN_TEST(insert_waitcnt.barrier.release) bld.reset(program->create_and_insert_block()); bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3)); load_global(); - barrier(storage_buffer, semantic_release, scope_workgroup); + barrier(storage_buffer, semantic_release, workgroup_scope); store_global(semantic_atomic); /* shared->shared */ @@ -1272,7 +1275,7 @@ BEGIN_TEST(insert_waitcnt.barrier.release) bld.reset(program->create_and_insert_block()); bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4)); store_shared(); - barrier(storage_shared, semantic_release, scope_workgroup); + barrier(storage_shared, semantic_release, workgroup_scope); store_shared(semantic_atomic); //>> p_unit_test 5 @@ -1281,7 +1284,7 @@ BEGIN_TEST(insert_waitcnt.barrier.release) bld.reset(program->create_and_insert_block()); bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5)); load_shared(); - barrier(storage_shared, semantic_release, scope_workgroup); + barrier(storage_shared, semantic_release, workgroup_scope); store_shared(semantic_atomic); /* shared->global */ @@ -1292,7 +1295,7 @@ BEGIN_TEST(insert_waitcnt.barrier.release) bld.reset(program->create_and_insert_block()); bld.pseudo(aco_opcode::p_unit_test, Operand::c32(6)); store_shared(); - barrier(storage_buffer | storage_shared, semantic_release, scope_workgroup); + barrier(storage_buffer | storage_shared, semantic_release, workgroup_scope); store_global(semantic_atomic); /* global->shared */ @@ -1303,7 +1306,7 @@ BEGIN_TEST(insert_waitcnt.barrier.release) bld.reset(program->create_and_insert_block()); bld.pseudo(aco_opcode::p_unit_test, Operand::c32(7)); store_global(); - barrier(storage_buffer | storage_shared, semantic_release, scope_workgroup); + barrier(storage_buffer | storage_shared, semantic_release, workgroup_scope); store_shared(semantic_atomic); /* global->global, device scope, release in the atomic */ @@ -1325,8 +1328,8 @@ BEGIN_TEST(insert_waitcnt.barrier.release) bld.reset(program->create_and_insert_block()); bld.pseudo(aco_opcode::p_unit_test, Operand::c32(9)); store_global(); - barrier(storage_buffer, semantic_release, scope_workgroup); - barrier(0, 0, scope_invocation, scope_workgroup); + barrier(storage_buffer, semantic_release, workgroup_scope); + barrier(0, 0, scope_invocation, workgroup_scope); bld.sopp(aco_opcode::s_barrier); /* shared->shared, workgroup scope, control barrier */ @@ -1338,8 +1341,8 @@ BEGIN_TEST(insert_waitcnt.barrier.release) bld.reset(program->create_and_insert_block()); bld.pseudo(aco_opcode::p_unit_test, Operand::c32(10)); store_shared(); - barrier(storage_shared, semantic_release, scope_workgroup); - barrier(0, 0, scope_invocation, scope_workgroup); + barrier(storage_shared, semantic_release, workgroup_scope); + barrier(0, 0, scope_invocation, workgroup_scope); bld.sopp(aco_opcode::s_barrier); /* global->global, device scope, delayed waitcnt */ @@ -1395,6 +1398,9 @@ BEGIN_TEST(insert_waitcnt.barrier.acquire) if (!setup_cs(NULL, GFX10, CHIP_UNKNOWN, var.name)) continue; + sync_scope workgroup_scope = + var.workgroup_size > program->wave_size ? scope_workgroup : scope_subgroup; + program->workgroup_size = var.workgroup_size; program->wgp_mode = var.wgp; @@ -1433,7 +1439,7 @@ BEGIN_TEST(insert_waitcnt.barrier.acquire) bld.reset(program->create_and_insert_block()); bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2)); load_global(semantic_atomic); - barrier(storage_buffer, semantic_acquire, scope_workgroup); + barrier(storage_buffer, semantic_acquire, workgroup_scope); load_global() = dest1; //>> p_unit_test 3 @@ -1443,7 +1449,7 @@ BEGIN_TEST(insert_waitcnt.barrier.acquire) bld.reset(program->create_and_insert_block()); bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3)); load_global(semantic_atomic); - barrier(storage_buffer, semantic_acquire, scope_workgroup); + barrier(storage_buffer, semantic_acquire, workgroup_scope); store_global(); /* shared->shared */ @@ -1453,7 +1459,7 @@ BEGIN_TEST(insert_waitcnt.barrier.acquire) bld.reset(program->create_and_insert_block()); bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4)); load_shared(semantic_atomic); - barrier(storage_shared, semantic_acquire, scope_workgroup); + barrier(storage_shared, semantic_acquire, workgroup_scope); load_shared() = dest1; //>> p_unit_test 5 @@ -1462,7 +1468,7 @@ BEGIN_TEST(insert_waitcnt.barrier.acquire) bld.reset(program->create_and_insert_block()); bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5)); load_shared(semantic_atomic); - barrier(storage_shared, semantic_acquire, scope_workgroup); + barrier(storage_shared, semantic_acquire, workgroup_scope); store_shared(); /* shared->global */ @@ -1473,7 +1479,7 @@ BEGIN_TEST(insert_waitcnt.barrier.acquire) bld.reset(program->create_and_insert_block()); bld.pseudo(aco_opcode::p_unit_test, Operand::c32(6)); load_shared(semantic_atomic); - barrier(storage_buffer | storage_shared, semantic_acquire, scope_workgroup); + barrier(storage_buffer | storage_shared, semantic_acquire, workgroup_scope); load_global() = dest1; /* global->shared */ @@ -1484,7 +1490,7 @@ BEGIN_TEST(insert_waitcnt.barrier.acquire) bld.reset(program->create_and_insert_block()); bld.pseudo(aco_opcode::p_unit_test, Operand::c32(7)); load_global(semantic_atomic); - barrier(storage_buffer | storage_shared, semantic_acquire, scope_workgroup); + barrier(storage_buffer | storage_shared, semantic_acquire, workgroup_scope); load_shared() = dest1; /* global->global, device scope, acquire in the atomic */ @@ -1503,9 +1509,9 @@ BEGIN_TEST(insert_waitcnt.barrier.acquire) //! v1: %0:v[4] = global_load_dword %0:v[0-1], s1: undef storage:buffer bld.reset(program->create_and_insert_block()); bld.pseudo(aco_opcode::p_unit_test, Operand::c32(9)); - barrier(0, 0, scope_invocation, scope_workgroup); + barrier(0, 0, scope_invocation, workgroup_scope); bld.sopp(aco_opcode::s_barrier); - barrier(storage_buffer, semantic_acquire, scope_workgroup); + barrier(storage_buffer, semantic_acquire, workgroup_scope); load_global(); /* global->global, device scope, delayed waitcnt */