From 26b942c30668a1899cbc220c0d4aa58fbca8fb5f Mon Sep 17 00:00:00 2001 From: Rhys Perry Date: Mon, 18 May 2026 16:41:56 +0100 Subject: [PATCH] aco: use split barrier instructions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit fossil-db (gfx1201): Totals from 135 (0.06% of 208640) affected shaders: Instrs: 155940 -> 155932 (-0.01%); split: -0.02%, +0.02% CodeSize: 905460 -> 905432 (-0.00%); split: -0.02%, +0.01% Latency: 1910087 -> 1909703 (-0.02%); split: -0.02%, +0.00% InvThroughput: 886321 -> 886280 (-0.00%) Copies: 12025 -> 12024 (-0.01%) VALU: 89681 -> 89679 (-0.00%) VOPD: 177 -> 178 (+0.56%) Signed-off-by: Rhys Perry Reviewed-by: Georg Lehmann Reviewed-by: Daniel Schürmann Part-of: --- .../aco_instruction_selection.h | 1 + .../aco_isel_helpers.cpp | 19 +++++++++++++++++++ .../instruction_selection/aco_select_nir.cpp | 5 ++--- .../aco_select_nir_intrinsics.cpp | 16 +++++----------- 4 files changed, 27 insertions(+), 14 deletions(-) diff --git a/src/amd/compiler/instruction_selection/aco_instruction_selection.h b/src/amd/compiler/instruction_selection/aco_instruction_selection.h index de37f888103..e2b0ddf7d1f 100644 --- a/src/amd/compiler/instruction_selection/aco_instruction_selection.h +++ b/src/amd/compiler/instruction_selection/aco_instruction_selection.h @@ -301,6 +301,7 @@ struct aco_export_mrt { void create_fs_dual_src_export_gfx11(isel_context* ctx, const struct aco_export_mrt* mrt0, const struct aco_export_mrt* mrt1); Temp lanecount_to_mask(isel_context* ctx, Temp count, unsigned bit_offset); +void emit_barrier(Builder& bld, memory_sync_info sync, sync_scope exec_scope); void build_end_with_regs(isel_context* ctx, std::vector& regs); Instruction* add_startpgm(struct isel_context* ctx, bool is_callee = false); void finish_program(isel_context* ctx); diff --git a/src/amd/compiler/instruction_selection/aco_isel_helpers.cpp b/src/amd/compiler/instruction_selection/aco_isel_helpers.cpp index 7f62f98d524..bfb57d8d6a6 100644 --- a/src/amd/compiler/instruction_selection/aco_isel_helpers.cpp +++ b/src/amd/compiler/instruction_selection/aco_isel_helpers.cpp @@ -667,6 +667,25 @@ lanecount_to_mask(isel_context* ctx, Temp count, unsigned bit_offset) } } +void +emit_barrier(Builder& bld, memory_sync_info sync, sync_scope exec_scope) +{ + if (bld.program->workgroup_size <= bld.program->wave_size) { + exec_scope = scope_subgroup; + if (sync.scope == scope_workgroup) + sync.scope = scope_subgroup; + } + + if (bld.program->gfx_level >= GFX12 && exec_scope == scope_workgroup) { + memory_sync_info sync_release(sync.storage, sync.semantics & semantic_release, sync.scope); + memory_sync_info sync_acquire(sync.storage, sync.semantics & semantic_acquire, sync.scope); + bld.barrier(aco_opcode::p_barrier_signal, sync_release, exec_scope); + bld.barrier(aco_opcode::p_barrier_wait, sync_acquire, exec_scope); + } else { + bld.barrier(aco_opcode::p_barrier, sync, exec_scope); + } +} + void build_end_with_regs(isel_context* ctx, std::vector& regs) { diff --git a/src/amd/compiler/instruction_selection/aco_select_nir.cpp b/src/amd/compiler/instruction_selection/aco_select_nir.cpp index 9c7864bab5c..f8fcf4fda96 100644 --- a/src/amd/compiler/instruction_selection/aco_select_nir.cpp +++ b/src/amd/compiler/instruction_selection/aco_select_nir.cpp @@ -1522,9 +1522,8 @@ select_shader(isel_context& ctx, nir_shader* nir, const bool need_startpgm, cons ? scope_subgroup : scope_workgroup; - Builder(ctx.program, ctx.block) - .barrier(aco_opcode::p_barrier, memory_sync_info(storage_shared, semantic_acqrel, scope), - scope); + Builder bld(ctx.program, ctx.block); + emit_barrier(bld, memory_sync_info(storage_shared, semantic_acqrel, scope), scope); } nir_function_impl* func = nir_shader_get_entrypoint(nir); diff --git a/src/amd/compiler/instruction_selection/aco_select_nir_intrinsics.cpp b/src/amd/compiler/instruction_selection/aco_select_nir_intrinsics.cpp index 0d47fefbef8..e36ec44f212 100644 --- a/src/amd/compiler/instruction_selection/aco_select_nir_intrinsics.cpp +++ b/src/amd/compiler/instruction_selection/aco_select_nir_intrinsics.cpp @@ -2895,7 +2895,7 @@ translate_nir_scope(mesa_scope scope) } void -emit_barrier(isel_context* ctx, nir_intrinsic_instr* instr) +visit_barrier(isel_context* ctx, nir_intrinsic_instr* instr) { Builder bld(ctx->program, ctx->block); @@ -2948,15 +2948,9 @@ emit_barrier(isel_context* ctx, nir_intrinsic_instr* instr) assert(!(nir_semantics & (NIR_MEMORY_MAKE_AVAILABLE | NIR_MEMORY_MAKE_VISIBLE))); assert(exec_scope != scope_workgroup || workgroup_scope_allowed); - if (ctx->program->workgroup_size <= ctx->program->wave_size) { - exec_scope = scope_subgroup; - if (mem_scope == scope_workgroup) - mem_scope = scope_subgroup; - } - - bld.barrier(aco_opcode::p_barrier, - memory_sync_info((storage_class)storage, (memory_semantics)semantics, mem_scope), - exec_scope); + emit_barrier(bld, + memory_sync_info((storage_class)storage, (memory_semantics)semantics, mem_scope), + exec_scope); } /* The two 32 wide halves of a gfx10+ wave64 LDS instruction might be executed interleaved @@ -4056,7 +4050,7 @@ visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr) case nir_intrinsic_ssbo_atomic_swap: visit_atomic_ssbo(ctx, instr); break; case nir_intrinsic_load_scratch: visit_load_scratch(ctx, instr); break; case nir_intrinsic_store_scratch: visit_store_scratch(ctx, instr); break; - case nir_intrinsic_barrier: emit_barrier(ctx, instr); break; + case nir_intrinsic_barrier: visit_barrier(ctx, instr); break; case nir_intrinsic_ddx: case nir_intrinsic_ddy: case nir_intrinsic_ddx_fine: