diff --git a/src/amd/compiler/instruction_selection/aco_instruction_selection.h b/src/amd/compiler/instruction_selection/aco_instruction_selection.h index de37f888103..e2b0ddf7d1f 100644 --- a/src/amd/compiler/instruction_selection/aco_instruction_selection.h +++ b/src/amd/compiler/instruction_selection/aco_instruction_selection.h @@ -301,6 +301,7 @@ struct aco_export_mrt { void create_fs_dual_src_export_gfx11(isel_context* ctx, const struct aco_export_mrt* mrt0, const struct aco_export_mrt* mrt1); Temp lanecount_to_mask(isel_context* ctx, Temp count, unsigned bit_offset); +void emit_barrier(Builder& bld, memory_sync_info sync, sync_scope exec_scope); void build_end_with_regs(isel_context* ctx, std::vector& regs); Instruction* add_startpgm(struct isel_context* ctx, bool is_callee = false); void finish_program(isel_context* ctx); diff --git a/src/amd/compiler/instruction_selection/aco_isel_helpers.cpp b/src/amd/compiler/instruction_selection/aco_isel_helpers.cpp index 7f62f98d524..bfb57d8d6a6 100644 --- a/src/amd/compiler/instruction_selection/aco_isel_helpers.cpp +++ b/src/amd/compiler/instruction_selection/aco_isel_helpers.cpp @@ -667,6 +667,25 @@ lanecount_to_mask(isel_context* ctx, Temp count, unsigned bit_offset) } } +void +emit_barrier(Builder& bld, memory_sync_info sync, sync_scope exec_scope) +{ + if (bld.program->workgroup_size <= bld.program->wave_size) { + exec_scope = scope_subgroup; + if (sync.scope == scope_workgroup) + sync.scope = scope_subgroup; + } + + if (bld.program->gfx_level >= GFX12 && exec_scope == scope_workgroup) { + memory_sync_info sync_release(sync.storage, sync.semantics & semantic_release, sync.scope); + memory_sync_info sync_acquire(sync.storage, sync.semantics & semantic_acquire, sync.scope); + bld.barrier(aco_opcode::p_barrier_signal, sync_release, exec_scope); + bld.barrier(aco_opcode::p_barrier_wait, sync_acquire, exec_scope); + } else { + bld.barrier(aco_opcode::p_barrier, sync, exec_scope); + } +} + void build_end_with_regs(isel_context* ctx, std::vector& regs) { diff --git a/src/amd/compiler/instruction_selection/aco_select_nir.cpp b/src/amd/compiler/instruction_selection/aco_select_nir.cpp index 9c7864bab5c..f8fcf4fda96 100644 --- a/src/amd/compiler/instruction_selection/aco_select_nir.cpp +++ b/src/amd/compiler/instruction_selection/aco_select_nir.cpp @@ -1522,9 +1522,8 @@ select_shader(isel_context& ctx, nir_shader* nir, const bool need_startpgm, cons ? scope_subgroup : scope_workgroup; - Builder(ctx.program, ctx.block) - .barrier(aco_opcode::p_barrier, memory_sync_info(storage_shared, semantic_acqrel, scope), - scope); + Builder bld(ctx.program, ctx.block); + emit_barrier(bld, memory_sync_info(storage_shared, semantic_acqrel, scope), scope); } nir_function_impl* func = nir_shader_get_entrypoint(nir); diff --git a/src/amd/compiler/instruction_selection/aco_select_nir_intrinsics.cpp b/src/amd/compiler/instruction_selection/aco_select_nir_intrinsics.cpp index 0d47fefbef8..e36ec44f212 100644 --- a/src/amd/compiler/instruction_selection/aco_select_nir_intrinsics.cpp +++ b/src/amd/compiler/instruction_selection/aco_select_nir_intrinsics.cpp @@ -2895,7 +2895,7 @@ translate_nir_scope(mesa_scope scope) } void -emit_barrier(isel_context* ctx, nir_intrinsic_instr* instr) +visit_barrier(isel_context* ctx, nir_intrinsic_instr* instr) { Builder bld(ctx->program, ctx->block); @@ -2948,15 +2948,9 @@ emit_barrier(isel_context* ctx, nir_intrinsic_instr* instr) assert(!(nir_semantics & (NIR_MEMORY_MAKE_AVAILABLE | NIR_MEMORY_MAKE_VISIBLE))); assert(exec_scope != scope_workgroup || workgroup_scope_allowed); - if (ctx->program->workgroup_size <= ctx->program->wave_size) { - exec_scope = scope_subgroup; - if (mem_scope == scope_workgroup) - mem_scope = scope_subgroup; - } - - bld.barrier(aco_opcode::p_barrier, - memory_sync_info((storage_class)storage, (memory_semantics)semantics, mem_scope), - exec_scope); + emit_barrier(bld, + memory_sync_info((storage_class)storage, (memory_semantics)semantics, mem_scope), + exec_scope); } /* The two 32 wide halves of a gfx10+ wave64 LDS instruction might be executed interleaved @@ -4056,7 +4050,7 @@ visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr) case nir_intrinsic_ssbo_atomic_swap: visit_atomic_ssbo(ctx, instr); break; case nir_intrinsic_load_scratch: visit_load_scratch(ctx, instr); break; case nir_intrinsic_store_scratch: visit_store_scratch(ctx, instr); break; - case nir_intrinsic_barrier: emit_barrier(ctx, instr); break; + case nir_intrinsic_barrier: visit_barrier(ctx, instr); break; case nir_intrinsic_ddx: case nir_intrinsic_ddy: case nir_intrinsic_ddx_fine: