aco: delay barrier waitcnt until they are needed

fossil-db (navi21):
Totals from 44 (0.06% of 79825) affected shaders:
Instrs: 16001 -> 15932 (-0.43%); split: -0.46%, +0.02%
CodeSize: 85800 -> 85548 (-0.29%); split: -0.30%, +0.01%
Latency: 190124 -> 173458 (-8.77%)
InvThroughput: 23605 -> 22756 (-3.60%)

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Georg Lehmann <dadschoorse@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/36491>
Authored by Rhys Perry on 2025-09-03 11:22:40 +01:00; committed by Marge Bot
parent 843acfa50b
commit 20cd5cf5f7
2 changed files with 61 additions and 15 deletions
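The core idea, sketched as standalone C++ before the diff (hypothetical, simplified types; not ACO's real interface): instead of combining a barrier's required waitcnt into the current instruction's wait immediately, the pass parks it in the new barrier_info_release/barrier_info_acquire slots and only combines it once an instruction that actually needs the ordering arrives, so independent instructions in between no longer stall behind the wait. The existing *_dep kinds keep accumulating what a barrier would have to wait on; the new kinds hold waits a barrier has already requested but that have not been emitted yet.

    #include <algorithm>
    #include <cstdint>

    /* Hypothetical model: lower counter value = stricter wait. */
    struct wait_imm_model {
       static constexpr uint32_t unset = UINT32_MAX;
       uint32_t vm = unset, lgkm = unset;

       void combine(const wait_imm_model& other)
       {
          vm = std::min(vm, other.vm);
          lgkm = std::min(lgkm, other.lgkm);
       }
    };

    struct deferred_barrier_model {
       wait_imm_model pending; /* parked instead of emitted at the barrier */

       /* At the barrier: record what would have been waited on. */
       void setup(const wait_imm_model& required) { pending.combine(required); }

       /* At each later instruction: emit the parked wait only when the
        * instruction actually needs the ordering (a non-private access for
        * acquire, an atomic/control-barrier-like instruction for release). */
       void finish(wait_imm_model& imm, bool needs_ordering)
       {
          if (needs_ordering)
             imm.combine(pending);
       }
    };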


@@ -171,6 +171,12 @@ enum barrier_info_kind {
    barrier_info_release_dep,
    /* Waits for all atomics */
    barrier_info_acquire_dep,
+   /* A wait that is to be emitted when an
+    * atomics/control_barriers/sendmsg_gs_done/position-primitive-export is encountered.
+    */
+   barrier_info_release,
+   /* A wait that is to be emitted when any non-private access is encountered. */
+   barrier_info_acquire,
    num_barrier_infos,
 };
@@ -428,18 +434,22 @@ check_instr(wait_ctx& ctx, wait_imm& wait, Instruction* instr)
    }
 }
 
+/* We delay the waitcnt for a barrier until it's needed. This can help hide the cost or let it be
+ * eliminated. */
 void
-perform_barrier(wait_ctx& ctx, wait_imm& imm, memory_sync_info sync, bool is_acquire)
+setup_barrier(wait_ctx& ctx, wait_imm& imm, memory_sync_info sync, bool is_acquire)
 {
    sync_scope subgroup_scope =
       ctx.program->workgroup_size <= ctx.program->wave_size ? scope_workgroup : scope_subgroup;
    if (sync.scope <= subgroup_scope)
       return;
 
-   barrier_info& bar = ctx.bar[is_acquire ? barrier_info_acquire_dep : barrier_info_release_dep];
+   barrier_info& src = ctx.bar[is_acquire ? barrier_info_acquire_dep : barrier_info_release_dep];
 
-   u_foreach_bit (i, sync.storage & bar.storage) {
-      uint16_t events = bar.events[i];
+   wait_imm dst_imm;
+   uint16_t dst_events = 0;
+   u_foreach_bit (i, sync.storage & src.storage) {
+      uint16_t events = src.events[i];
 
       /* LDS is private to the workgroup */
       if (MIN2(sync.scope, scope_workgroup) <= subgroup_scope)
@@ -448,10 +458,40 @@ perform_barrier(wait_ctx& ctx, wait_imm& imm, memory_sync_info sync, bool is_acquire)
       /* Until GFX11, in non-WGP, the L1 (L0 on GFX10+) cache keeps all memory operations
        * in-order for the same workgroup */
       if (ctx.gfx_level < GFX11 && !ctx.program->wgp_mode && sync.scope <= scope_workgroup)
-         events &= ~(event_vmem | event_vmem_store | event_smem);
+         events &= ~(event_vmem | event_vmem_store);
 
-      if (events)
-         imm.combine(bar.imm[i]);
+      if (events) {
+         dst_imm.combine(src.imm[i]);
+         dst_events |= src.events[i];
+      }
    }
+
+   if (!dst_events)
+      return;
+
+   /* Copy over wait into barrier_info_acquire/barrier_info_release */
+   unsigned dst_index = is_acquire ? barrier_info_acquire : barrier_info_release;
+   barrier_info& dst = ctx.bar[dst_index];
+   u_foreach_bit (i, sync.storage) {
+      dst.imm[i].combine(dst_imm);
+      dst.events[i] |= dst_events;
+   }
+   dst.storage |= sync.storage;
+   ctx.bar_nonempty |= 1 << dst_index;
 }
 
+void
+finish_barriers(wait_ctx& ctx, wait_imm& imm, Instruction* instr, memory_sync_info sync)
+{
+   if (ctx.bar_nonempty & (1 << barrier_info_release)) {
+      uint16_t storage_release =
+         is_atomic_or_control_instr(ctx.program, instr, sync, semantic_release);
+      u_foreach_bit (i, storage_release & ctx.bar[barrier_info_release].storage)
+         imm.combine(ctx.bar[barrier_info_release].imm[i]);
+   }
+   if (ctx.bar_nonempty & (1 << barrier_info_acquire)) {
+      uint16_t storage_acquire = (sync.semantics & semantic_private) ? 0 : sync.storage;
+      u_foreach_bit (i, storage_acquire & ctx.bar[barrier_info_acquire].storage)
+         imm.combine(ctx.bar[barrier_info_acquire].imm[i]);
+   }
+}
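finish_barriers flushes a parked wait only for the storage classes that the triggering access actually overlaps. A minimal standalone sketch of that masking (hypothetical types; Mesa's u_foreach_bit replaced with a portable bit loop):

    #include <cstdint>

    struct parked_waits_model {
       uint16_t storage = 0;  /* bit i set: a wait is parked for storage class i */
       uint32_t imm[16] = {}; /* parked wait per class; lower = stricter, only
                               * meaningful where the storage bit is set */

       /* Combine parked waits into imm_out for the overlapping classes only. */
       void flush(uint16_t access_storage, uint32_t& imm_out) const
       {
          for (uint16_t bits = access_storage & storage; bits; bits &= bits - 1) {
             unsigned i = __builtin_ctz(bits);
             if (imm[i] < imm_out)
                imm_out = imm[i];
          }
       }
    };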
@@ -558,13 +598,15 @@ kill(wait_imm& imm, Instruction* instr, wait_ctx& ctx, memory_sync_info sync_info)
    if (instr->opcode == aco_opcode::p_barrier) {
       if (instr->barrier().sync.semantics & semantic_release)
-         perform_barrier(ctx, imm, instr->barrier().sync, false);
+         setup_barrier(ctx, imm, instr->barrier().sync, false);
       if (instr->barrier().sync.semantics & semantic_acquire)
-         perform_barrier(ctx, imm, instr->barrier().sync, true);
+         setup_barrier(ctx, imm, instr->barrier().sync, true);
    } else if (sync_info.semantics & semantic_release) {
-      perform_barrier(ctx, imm, sync_info, false);
+      setup_barrier(ctx, imm, sync_info, false);
    }
 
+   finish_barriers(ctx, imm, instr, sync_info);
+
    if (!imm.empty()) {
       if (ctx.pending_flat_vm && imm.vm != wait_imm::unset_counter)
          imm.vm = 0;
@@ -647,6 +689,9 @@ update_barriers(wait_ctx& ctx, uint8_t counters, wait_event event, Instruction*
       uint16_t storage_acq = is_atomic_or_control_instr(ctx.program, instr, sync, semantic_acquire);
       update_barrier_info_for_event(ctx, counters, event, barrier_info_acquire_dep, storage_acq);
    }
+
+   update_barrier_info_for_event(ctx, counters, event, barrier_info_release, 0);
+   update_barrier_info_for_event(ctx, counters, event, barrier_info_acquire, 0);
 }
 
 void
@@ -915,7 +960,7 @@ handle_block(Program* program, Block& block, wait_ctx& ctx)
          new_instructions.emplace_back(std::move(instr));
 
          if (sync_info.semantics & semantic_acquire)
-            perform_barrier(ctx, queued_imm, sync_info, true);
+            setup_barrier(ctx, queued_imm, sync_info, true);
          if (is_ordered_count_acquire)
             queued_imm.combine(ctx.bar[barrier_info_release_dep].imm[ffs(storage_gds) - 1]);


@@ -610,7 +610,8 @@ BEGIN_TEST(insert_waitcnt.vmem_ds)
    //! s_wait_storecnt_dscnt dscnt(0) storecnt(0)
    bld.barrier(aco_opcode::p_barrier,
-               memory_sync_info(storage_buffer | storage_shared, semantic_acqrel, scope_workgroup));
+               memory_sync_info(storage_buffer | storage_shared, semantic_acqrel, scope_workgroup),
+               scope_workgroup);
 
    finish_waitcnt_test();
 END_TEST
@@ -1119,7 +1120,7 @@ BEGIN_TEST(insert_waitcnt.flat.barrier)
             memory_sync_info(storage_buffer));
    bld.flat(aco_opcode::flat_load_dword, dest1, addr, Operand(s1)).instr->flat().may_use_lds = true;
    bld.barrier(aco_opcode::p_barrier,
-               memory_sync_info(storage_buffer, semantic_acqrel, scope_device));
+               memory_sync_info(storage_buffer, semantic_acqrel, scope_device), scope_workgroup);
 
    //>> p_unit_test 1
    //! v1: %0:v[5] = flat_load_dword %0:v[0-1], s1: undef may_use_lds storage:buffer
@@ -1132,7 +1133,7 @@ BEGIN_TEST(insert_waitcnt.flat.barrier)
             memory_sync_info(storage_buffer)).instr->flat().may_use_lds = true;
    bld.global(aco_opcode::global_load_dword, dest0, addr, Operand(s1), 0);
    bld.barrier(aco_opcode::p_barrier,
-               memory_sync_info(storage_buffer, semantic_acqrel, scope_device));
+               memory_sync_info(storage_buffer, semantic_acqrel, scope_device), scope_workgroup);
 
    //>> p_unit_test 2
    //! flat_store_dword %0:v[0-1], s1: undef, %0:v[0] may_use_lds storage:buffer
@@ -1144,7 +1145,7 @@ BEGIN_TEST(insert_waitcnt.flat.barrier)
    bld.flat(aco_opcode::flat_store_dword, addr, Operand(s1), data, 0,
             memory_sync_info(storage_buffer)).instr->flat().may_use_lds = true;
    bld.barrier(aco_opcode::p_barrier,
-               memory_sync_info(storage_buffer, semantic_acqrel, scope_device));
+               memory_sync_info(storage_buffer, semantic_acqrel, scope_device), scope_workgroup);
 
    finish_waitcnt_test();
 }
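A note on the test churn: every bld.barrier() call gains a trailing scope argument. Reading it as the barrier's execution scope, distinct from the memory scope inside memory_sync_info, is an assumption from the call sites; this page doesn't show the builder's new signature.

    /* Assumed call shape after this change; the third argument is
     * presumed to be the execution scope of the barrier itself: */
    bld.barrier(aco_opcode::p_barrier,
                memory_sync_info(storage_buffer, semantic_acqrel, scope_device),
                scope_workgroup);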