aco: use split barrier instructions

fossil-db (gfx1201): Totals from 135 (0.06% of 208640) affected shaders: Instrs: 155940 -> 155932 (-0.01%); split: -0.02%, +0.02% CodeSize: 905460 -> 905432 (-0.00%); split: -0.02%, +0.01% Latency: 1910087 -> 1909703 (-0.02%); split: -0.02%, +0.00% InvThroughput: 886321 -> 886280 (-0.00%) Copies: 12025 -> 12024 (-0.01%) VALU: 89681 -> 89679 (-0.00%) VOPD: 177 -> 178 (+0.56%) Signed-off-by: Rhys Perry <pendingchaos02@gmail.com> Reviewed-by: Georg Lehmann <dadschoorse@gmail.com> Reviewed-by: Daniel Schürmann <daniel@schuermann.dev> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/41364>
2026-06-17 00:08:31 +02:00 · 2026-05-18 16:41:56 +01:00 · 2026-05-18 16:41:56 +01:00 · 26b942c306
commit 26b942c306
parent a95f841125
4 changed files with 27 additions and 14 deletions
--- a/src/amd/compiler/instruction_selection/aco_instruction_selection.h
+++ b/src/amd/compiler/instruction_selection/aco_instruction_selection.h
@ -301,6 +301,7 @@ struct aco_export_mrt {
 void create_fs_dual_src_export_gfx11(isel_context* ctx, const struct aco_export_mrt* mrt0,
                                     const struct aco_export_mrt* mrt1);
 Temp lanecount_to_mask(isel_context* ctx, Temp count, unsigned bit_offset);
+void emit_barrier(Builder& bld, memory_sync_info sync, sync_scope exec_scope);
 void build_end_with_regs(isel_context* ctx, std::vector<Operand>& regs);
 Instruction* add_startpgm(struct isel_context* ctx, bool is_callee = false);
 void finish_program(isel_context* ctx);
--- a/src/amd/compiler/instruction_selection/aco_isel_helpers.cpp
+++ b/src/amd/compiler/instruction_selection/aco_isel_helpers.cpp
@ -667,6 +667,25 @@ lanecount_to_mask(isel_context* ctx, Temp count, unsigned bit_offset)
   }
 }

+void
+emit_barrier(Builder& bld, memory_sync_info sync, sync_scope exec_scope)
+{
+   if (bld.program->workgroup_size <= bld.program->wave_size) {
+      exec_scope = scope_subgroup;
+      if (sync.scope == scope_workgroup)
+         sync.scope = scope_subgroup;
+   }
+
+   if (bld.program->gfx_level >= GFX12 && exec_scope == scope_workgroup) {
+      memory_sync_info sync_release(sync.storage, sync.semantics & semantic_release, sync.scope);
+      memory_sync_info sync_acquire(sync.storage, sync.semantics & semantic_acquire, sync.scope);
+      bld.barrier(aco_opcode::p_barrier_signal, sync_release, exec_scope);
+      bld.barrier(aco_opcode::p_barrier_wait, sync_acquire, exec_scope);
+   } else {
+      bld.barrier(aco_opcode::p_barrier, sync, exec_scope);
+   }
+}
+
 void
 build_end_with_regs(isel_context* ctx, std::vector<Operand>& regs)
 {
--- a/src/amd/compiler/instruction_selection/aco_select_nir.cpp
+++ b/src/amd/compiler/instruction_selection/aco_select_nir.cpp
@ -1522,9 +1522,8 @@ select_shader(isel_context& ctx, nir_shader* nir, const bool need_startpgm, cons
                                  ? scope_subgroup
                                  : scope_workgroup;

-      Builder(ctx.program, ctx.block)
-         .barrier(aco_opcode::p_barrier, memory_sync_info(storage_shared, semantic_acqrel, scope),
-                  scope);
+      Builder bld(ctx.program, ctx.block);
+      emit_barrier(bld, memory_sync_info(storage_shared, semantic_acqrel, scope), scope);
   }

   nir_function_impl* func = nir_shader_get_entrypoint(nir);
--- a/src/amd/compiler/instruction_selection/aco_select_nir_intrinsics.cpp
+++ b/src/amd/compiler/instruction_selection/aco_select_nir_intrinsics.cpp
@ -2895,7 +2895,7 @@ translate_nir_scope(mesa_scope scope)
 }

 void
-emit_barrier(isel_context* ctx, nir_intrinsic_instr* instr)
+visit_barrier(isel_context* ctx, nir_intrinsic_instr* instr)
 {
   Builder bld(ctx->program, ctx->block);

@ -2948,15 +2948,9 @@ emit_barrier(isel_context* ctx, nir_intrinsic_instr* instr)
   assert(!(nir_semantics & (NIR_MEMORY_MAKE_AVAILABLE | NIR_MEMORY_MAKE_VISIBLE)));
   assert(exec_scope != scope_workgroup || workgroup_scope_allowed);

-   if (ctx->program->workgroup_size <= ctx->program->wave_size) {
-      exec_scope = scope_subgroup;
-      if (mem_scope == scope_workgroup)
-         mem_scope = scope_subgroup;
-   }
-
-   bld.barrier(aco_opcode::p_barrier,
-               memory_sync_info((storage_class)storage, (memory_semantics)semantics, mem_scope),
-               exec_scope);
+   emit_barrier(bld,
+                memory_sync_info((storage_class)storage, (memory_semantics)semantics, mem_scope),
+                exec_scope);
 }

 /* The two 32 wide halves of a gfx10+ wave64 LDS instruction might be executed interleaved
@ -4056,7 +4050,7 @@ visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr)
   case nir_intrinsic_ssbo_atomic_swap: visit_atomic_ssbo(ctx, instr); break;
   case nir_intrinsic_load_scratch: visit_load_scratch(ctx, instr); break;
   case nir_intrinsic_store_scratch: visit_store_scratch(ctx, instr); break;
-   case nir_intrinsic_barrier: emit_barrier(ctx, instr); break;
+   case nir_intrinsic_barrier: visit_barrier(ctx, instr); break;
   case nir_intrinsic_ddx:
   case nir_intrinsic_ddy:
   case nir_intrinsic_ddx_fine: