aco/gfx10: skip waitcnts or use vm_vsrc(0) for workgroup vmem barriers

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Georg Lehmann <dadschoorse@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/36491>
This commit is contained in:
Rhys Perry 2025-09-02 15:39:19 +01:00 committed by Marge Bot
parent 145b178de2
commit ac882985c0
3 changed files with 58 additions and 14 deletions

View file

@ -479,37 +479,45 @@ setup_barrier(wait_ctx& ctx, wait_imm& imm, memory_sync_info sync, bool is_acqui
}
void
finish_barrier_internal(wait_ctx& ctx, wait_imm& imm, Instruction* instr, struct barrier_info* info,
unsigned storage_idx)
finish_barrier_internal(wait_ctx& ctx, wait_imm& imm, depctr_wait& depctr, Instruction* instr,
struct barrier_info* info, unsigned storage_idx)
{
uint16_t events = info->events[storage_idx];
bool vm_vsrc = false;
if (info->scope[storage_idx] <= scope_workgroup) {
bool is_vmem = instr->isVMEM() || (instr->isFlatLike() && !instr->flatlike().may_use_lds);
bool is_barrier = instr->isBarrier();
bool is_barrier = instr->isBarrier(); /* This is only called for control barriers. */
/* Until GFX11, in non-WGP, the L1 (L0 on GFX10+) cache keeps all memory operations
* in-order for the same workgroup */
if ((is_vmem || is_barrier) && ctx.gfx_level < GFX11 && !ctx.program->wgp_mode)
/* In non-WGP, the L1 (L0 on GFX10+) cache keeps all memory operations in-order for the same
* workgroup */
bool has_vmem_events = events & (event_vmem | event_vmem_store);
if (has_vmem_events && (is_vmem || is_barrier) && !ctx.program->wgp_mode) {
events &= ~(event_vmem | event_vmem_store);
vm_vsrc |= is_barrier && ctx.gfx_level >= GFX10;
}
}
if (events)
imm.combine(info->imm[storage_idx]);
if (vm_vsrc)
depctr.vm_vsrc = 0;
}
void
finish_barriers(wait_ctx& ctx, wait_imm& imm, Instruction* instr, memory_sync_info sync)
finish_barriers(wait_ctx& ctx, wait_imm& imm, depctr_wait& depctr, Instruction* instr,
memory_sync_info sync)
{
if (ctx.bar_nonempty & (1 << barrier_info_release)) {
uint16_t storage_release =
is_atomic_or_control_instr(ctx.program, instr, sync, semantic_release);
u_foreach_bit (i, storage_release & ctx.bar[barrier_info_release].storage)
finish_barrier_internal(ctx, imm, instr, &ctx.bar[barrier_info_release], i);
finish_barrier_internal(ctx, imm, depctr, instr, &ctx.bar[barrier_info_release], i);
}
if (ctx.bar_nonempty & (1 << barrier_info_acquire)) {
uint16_t storage_acquire = (sync.semantics & semantic_private) ? 0 : sync.storage;
u_foreach_bit (i, storage_acquire & ctx.bar[barrier_info_acquire].storage)
finish_barrier_internal(ctx, imm, instr, &ctx.bar[barrier_info_acquire], i);
finish_barrier_internal(ctx, imm, depctr, instr, &ctx.bar[barrier_info_acquire], i);
}
}
@ -548,7 +556,8 @@ update_barrier_info_for_wait(wait_ctx& ctx, unsigned idx, wait_imm imm)
}
void
kill(wait_imm& imm, Instruction* instr, wait_ctx& ctx, memory_sync_info sync_info)
kill(wait_imm& imm, depctr_wait& depctr, Instruction* instr, wait_ctx& ctx,
memory_sync_info sync_info)
{
if (instr->opcode == aco_opcode::s_setpc_b64 || (debug_flags & DEBUG_FORCE_WAITCNT)) {
/* Force emitting waitcnt states right after the instruction if there is
@ -585,7 +594,7 @@ kill(wait_imm& imm, Instruction* instr, wait_ctx& ctx, memory_sync_info sync_inf
setup_barrier(ctx, imm, sync_info, false);
}
finish_barriers(ctx, imm, instr, sync_info);
finish_barriers(ctx, imm, depctr, instr, sync_info);
if (!imm.empty()) {
if (ctx.pending_flat_vm && imm.vm != wait_imm::unset_counter)
@ -866,6 +875,14 @@ emit_waitcnt(wait_ctx& ctx, std::vector<aco_ptr<Instruction>>& instructions, wai
imm.build_waitcnt(bld);
}
void
emit_depctr(wait_ctx& ctx, std::vector<aco_ptr<Instruction>>& instructions, depctr_wait& depctr)
{
Builder bld(ctx.program, &instructions);
bld.sopp(aco_opcode::s_waitcnt_depctr, depctr.pack());
depctr = depctr_wait();
}
bool
check_clause_raw(std::bitset<512>& regs_written, Instruction* instr)
{
@ -892,15 +909,19 @@ handle_block(Program* program, Block& block, wait_ctx& ctx)
std::vector<aco_ptr<Instruction>> new_instructions;
wait_imm queued_imm;
depctr_wait queued_depctr;
size_t clause_end = 0;
for (size_t i = 0; i < block.instructions.size(); i++) {
aco_ptr<Instruction>& instr = block.instructions[i];
bool is_wait = queued_imm.unpack(ctx.gfx_level, instr.get());
bool is_wait = queued_imm.unpack(ctx.gfx_level, instr.get()) ||
instr->opcode == aco_opcode::s_waitcnt_depctr;
if (instr->opcode == aco_opcode::s_waitcnt_depctr)
queued_depctr = parse_depctr_wait(instr.get());
memory_sync_info sync_info = get_sync_info(instr.get());
kill(queued_imm, instr.get(), ctx, sync_info);
kill(queued_imm, queued_depctr, instr.get(), ctx, sync_info);
/* At the start of a possible clause, also emit waitcnts for each instruction to avoid
* splitting the clause.
@ -920,7 +941,7 @@ handle_block(Program* program, Block& block, wait_ctx& ctx)
if (!check_clause_raw(*regs_written, next))
break;
kill(queued_imm, next, ctx, get_sync_info(next));
kill(queued_imm, queued_depctr, next, ctx, get_sync_info(next));
}
}
@ -934,6 +955,8 @@ handle_block(Program* program, Block& block, wait_ctx& ctx)
if (!queued_imm.empty())
emit_waitcnt(ctx, new_instructions, queued_imm);
if (!queued_depctr.empty())
emit_depctr(ctx, new_instructions, queued_depctr);
bool is_ordered_count_acquire =
instr->opcode == aco_opcode::ds_ordered_count &&
@ -956,6 +979,8 @@ handle_block(Program* program, Block& block, wait_ctx& ctx)
if (!queued_imm.empty())
emit_waitcnt(ctx, new_instructions, queued_imm);
if (!queued_depctr.empty())
emit_depctr(ctx, new_instructions, queued_depctr);
block.instructions.swap(new_instructions);
}

View file

@ -1641,6 +1641,21 @@ parse_depctr_wait(const Instruction* instr)
return res;
}
uint16_t
depctr_wait::pack() const
{
uint16_t imm = 0;
imm |= (va_vdst & 0xf) << 12;
imm |= (va_sdst & 0x7) << 9;
imm |= (va_ssrc & 0x1) << 8;
imm |= (hold_cnt & 0x1) << 7;
imm |= 0x3 << 5; /* don't know what this is, if anything */
imm |= (vm_vsrc & 0x7) << 2;
imm |= (va_vcc & 0x1) << 1;
imm |= (sa_sdst & 0x1);
return imm;
}
bool
dealloc_vgprs(Program* program)
{

View file

@ -1976,6 +1976,10 @@ struct depctr_wait {
};
unsigned packed = -1;
};
bool empty() const { return packed == (unsigned)-1; }
uint16_t pack() const;
};
depctr_wait parse_depctr_wait(const Instruction* instr);