aco: use split barrier instructions

fossil-db (gfx1201):
Totals from 135 (0.06% of 208640) affected shaders:
Instrs: 155940 -> 155932 (-0.01%); split: -0.02%, +0.02%
CodeSize: 905460 -> 905432 (-0.00%); split: -0.02%, +0.01%
Latency: 1910087 -> 1909703 (-0.02%); split: -0.02%, +0.00%
InvThroughput: 886321 -> 886280 (-0.00%)
Copies: 12025 -> 12024 (-0.01%)
VALU: 89681 -> 89679 (-0.00%)
VOPD: 177 -> 178 (+0.56%)

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Georg Lehmann <dadschoorse@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/41364>
This commit is contained in:
Rhys Perry 2026-05-18 16:41:56 +01:00 committed by Marge Bot
parent a95f841125
commit 26b942c306
4 changed files with 27 additions and 14 deletions

View file

@ -301,6 +301,7 @@ struct aco_export_mrt {
void create_fs_dual_src_export_gfx11(isel_context* ctx, const struct aco_export_mrt* mrt0,
const struct aco_export_mrt* mrt1);
Temp lanecount_to_mask(isel_context* ctx, Temp count, unsigned bit_offset);
void emit_barrier(Builder& bld, memory_sync_info sync, sync_scope exec_scope);
void build_end_with_regs(isel_context* ctx, std::vector<Operand>& regs);
Instruction* add_startpgm(struct isel_context* ctx, bool is_callee = false);
void finish_program(isel_context* ctx);

View file

@ -667,6 +667,25 @@ lanecount_to_mask(isel_context* ctx, Temp count, unsigned bit_offset)
}
}
void
emit_barrier(Builder& bld, memory_sync_info sync, sync_scope exec_scope)
{
if (bld.program->workgroup_size <= bld.program->wave_size) {
exec_scope = scope_subgroup;
if (sync.scope == scope_workgroup)
sync.scope = scope_subgroup;
}
if (bld.program->gfx_level >= GFX12 && exec_scope == scope_workgroup) {
memory_sync_info sync_release(sync.storage, sync.semantics & semantic_release, sync.scope);
memory_sync_info sync_acquire(sync.storage, sync.semantics & semantic_acquire, sync.scope);
bld.barrier(aco_opcode::p_barrier_signal, sync_release, exec_scope);
bld.barrier(aco_opcode::p_barrier_wait, sync_acquire, exec_scope);
} else {
bld.barrier(aco_opcode::p_barrier, sync, exec_scope);
}
}
void
build_end_with_regs(isel_context* ctx, std::vector<Operand>& regs)
{

View file

@ -1522,9 +1522,8 @@ select_shader(isel_context& ctx, nir_shader* nir, const bool need_startpgm, cons
? scope_subgroup
: scope_workgroup;
Builder(ctx.program, ctx.block)
.barrier(aco_opcode::p_barrier, memory_sync_info(storage_shared, semantic_acqrel, scope),
scope);
Builder bld(ctx.program, ctx.block);
emit_barrier(bld, memory_sync_info(storage_shared, semantic_acqrel, scope), scope);
}
nir_function_impl* func = nir_shader_get_entrypoint(nir);

View file

@ -2895,7 +2895,7 @@ translate_nir_scope(mesa_scope scope)
}
void
emit_barrier(isel_context* ctx, nir_intrinsic_instr* instr)
visit_barrier(isel_context* ctx, nir_intrinsic_instr* instr)
{
Builder bld(ctx->program, ctx->block);
@ -2948,15 +2948,9 @@ emit_barrier(isel_context* ctx, nir_intrinsic_instr* instr)
assert(!(nir_semantics & (NIR_MEMORY_MAKE_AVAILABLE | NIR_MEMORY_MAKE_VISIBLE)));
assert(exec_scope != scope_workgroup || workgroup_scope_allowed);
if (ctx->program->workgroup_size <= ctx->program->wave_size) {
exec_scope = scope_subgroup;
if (mem_scope == scope_workgroup)
mem_scope = scope_subgroup;
}
bld.barrier(aco_opcode::p_barrier,
memory_sync_info((storage_class)storage, (memory_semantics)semantics, mem_scope),
exec_scope);
emit_barrier(bld,
memory_sync_info((storage_class)storage, (memory_semantics)semantics, mem_scope),
exec_scope);
}
/* The two 32 wide halves of a gfx10+ wave64 LDS instruction might be executed interleaved
@ -4056,7 +4050,7 @@ visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr)
case nir_intrinsic_ssbo_atomic_swap: visit_atomic_ssbo(ctx, instr); break;
case nir_intrinsic_load_scratch: visit_load_scratch(ctx, instr); break;
case nir_intrinsic_store_scratch: visit_store_scratch(ctx, instr); break;
case nir_intrinsic_barrier: emit_barrier(ctx, instr); break;
case nir_intrinsic_barrier: visit_barrier(ctx, instr); break;
case nir_intrinsic_ddx:
case nir_intrinsic_ddy:
case nir_intrinsic_ddx_fine: