aco: simplify waitcnt insertion for flat access

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/35465>
This commit is contained in:
Rhys Perry 2025-06-09 14:54:46 +01:00 committed by Marge Bot
parent 6396a82695
commit d705b6198c
2 changed files with 212 additions and 41 deletions

View file

@ -45,7 +45,6 @@ enum wait_event : uint32_t {
event_gds = 1 << 2,
event_vmem = 1 << 3,
event_vmem_store = 1 << 4, /* GFX10+ */
event_flat = 1 << 5,
event_exp_pos = 1 << 6,
event_exp_param = 1 << 7,
event_exp_mrt_null = 1 << 8,
@ -102,9 +101,7 @@ struct wait_entry {
counters &= ~(1 << type);
imm[type] = wait_imm::unset_counter;
events &= ~type_events | event_flat;
if (!(counters & counter_lgkm) && !(counters & counter_vm))
events &= ~(type_events & event_flat);
events &= ~type_events;
logical_events &= events;
if (type == wait_type_vm)
@ -144,8 +141,8 @@ struct target_info {
events[wait_type_exp] = event_exp_pos | event_exp_param | event_exp_mrt_null |
event_gds_gpr_lock | event_vmem_gpr_lock | event_ldsdir;
events[wait_type_lgkm] = event_smem | event_lds | event_gds | event_flat | event_sendmsg;
events[wait_type_vm] = event_vmem | event_flat;
events[wait_type_lgkm] = event_smem | event_lds | event_gds | event_sendmsg;
events[wait_type_vm] = event_vmem;
events[wait_type_vs] = event_vmem_store;
if (gfx_level >= GFX12) {
events[wait_type_sample] = event_vmem_sample;
@ -159,7 +156,7 @@ struct target_info {
counters[j] |= (1 << i);
}
unordered_events = event_smem | (gfx_level < GFX10 ? event_flat : 0);
unordered_events = event_smem;
}
uint8_t get_counters_for_event(wait_event event) const { return counters[ffs(event) - 1]; }
@ -317,7 +314,7 @@ get_imm(wait_ctx& ctx, PhysReg reg, wait_entry& entry)
*/
if (ctx.gfx_level >= GFX11) {
uint32_t ds_vmem_events =
event_lds | event_gds | event_vmem | event_vmem_sample | event_vmem_bvh | event_flat;
event_lds | event_gds | event_vmem | event_vmem_sample | event_vmem_bvh;
events |= ds_vmem_events;
}
@ -523,11 +520,9 @@ kill(wait_imm& imm, Instruction* instr, wait_ctx& ctx, memory_sync_info sync_inf
for (unsigned j = 0; j < wait_type_num; j++) {
if (bar[j] != wait_imm::unset_counter && imm[j] <= bar[j]) {
bar[j] = wait_imm::unset_counter;
bar_ev &= ~ctx.info->events[j] | event_flat;
bar_ev &= ~ctx.info->events[j];
}
}
if (bar.vm == wait_imm::unset_counter && bar.lgkm == wait_imm::unset_counter)
bar_ev &= ~event_flat;
}
/* remove all gprs with higher counter from map */
@ -587,11 +582,6 @@ update_counters(wait_ctx& ctx, wait_event event, memory_sync_info sync = memory_
if (ctx.info->unordered_events & event)
return;
if (ctx.pending_flat_lgkm)
counters &= ~counter_lgkm;
if (ctx.pending_flat_vm)
counters &= ~counter_vm;
for (std::pair<const PhysReg, wait_entry>& e : ctx.gpr_map) {
wait_entry& entry = e.second;
@ -607,25 +597,6 @@ update_counters(wait_ctx& ctx, wait_event event, memory_sync_info sync = memory_
}
}
/* Records a flat load in the wait state for targets older than GFX10.
 *
 * Marks the lgkm and vm counters as non-zero, forwards the access to
 * update_barrier_imm() for both counters under event_flat, forces the vm/lgkm
 * wait immediates of every tracked register entry that uses those counters to
 * zero, and finally sets both pending_flat_* flags on the context.
 *
 * ctx:  wait state to update; must describe a pre-GFX10 target (asserted).
 * sync: memory synchronization info of the flat load, passed through to
 *       update_barrier_imm().
 */
void
update_counters_for_flat_load(wait_ctx& ctx, memory_sync_info sync = memory_sync_info())
{
   assert(ctx.gfx_level < GFX10);

   ctx.nonzero |= BITFIELD_BIT(wait_type_lgkm) | BITFIELD_BIT(wait_type_vm);

   update_barrier_imm(ctx, counter_vm | counter_lgkm, event_flat, sync);

   /* Iterate by reference: binding the map element by value would apply the
    * writes below to a temporary copy and leave the entries in gpr_map
    * untouched.
    */
   for (std::pair<const PhysReg, wait_entry>& e : ctx.gpr_map) {
      wait_entry& entry = e.second;
      if (entry.counters & counter_vm)
         entry.imm.vm = 0;
      if (entry.counters & counter_lgkm)
         entry.imm.lgkm = 0;
   }

   ctx.pending_flat_lgkm = true;
   ctx.pending_flat_vm = true;
}
void
insert_wait_entry(wait_ctx& ctx, PhysReg reg, RegClass rc, wait_event event, bool wait_on_read,
uint8_t vmem_types = 0, uint32_t vm_mask = 0)
@ -693,13 +664,19 @@ gen(Instruction* instr, wait_ctx& ctx)
}
case Format::FLAT: {
FLAT_instruction& flat = instr->flat();
if (ctx.gfx_level < GFX10 && !instr->definitions.empty())
update_counters_for_flat_load(ctx, flat.sync);
else
update_counters(ctx, event_flat, flat.sync);
wait_event vmem_ev = get_vmem_event(ctx, instr, vmem_nosampler);
update_counters(ctx, vmem_ev, flat.sync);
update_counters(ctx, event_lds, flat.sync);
if (!instr->definitions.empty())
insert_wait_entry(ctx, instr->definitions[0], event_flat);
if (!instr->definitions.empty()) {
insert_wait_entry(ctx, instr->definitions[0], vmem_ev, 0, get_vmem_mask(ctx, instr));
insert_wait_entry(ctx, instr->definitions[0], event_lds);
}
if (ctx.gfx_level < GFX10 && !instr->definitions.empty()) {
ctx.pending_flat_lgkm = true;
ctx.pending_flat_vm = true;
}
break;
}
case Format::SMEM: {

View file

@ -901,3 +901,197 @@ BEGIN_TEST(insert_waitcnt.divergent_branch.no_skip)
finish_waitcnt_test();
}
END_TEST
/* Checks the s_waitcnt emitted before reading the result of a load (v[4]) when
 * a flat load is involved, on GFX9 and GFX10. Each case runs in its own block,
 * issues two loads (into v[4] and v[5]) and then reads v[4] via p_unit_test.
 * The expected counter values are the ones given in the check lines; GFX9
 * always expects zero for the counters it waits on.
 * NOTE(review): presumably GFX9 needs zero because flat loads can complete out
 * of order there - confirm against the waitcnt pass.
 */
BEGIN_TEST(insert_waitcnt.flat.wait_zero)
for (amd_gfx_level gfx : {GFX9, GFX10}) {
if (!setup_cs(NULL, gfx))
continue;
Definition dest0(PhysReg(260), v1);
Definition dest1(PhysReg(261), v1);
Operand offset(PhysReg(256), v1);
Operand addr(PhysReg(256), v2);
/* Case 0: global load, then flat load; read the global load's result.
 * Only vmcnt is expected: 0 on GFX9, 1 on GFX10.
 */
//>> p_unit_test 0
//! v1: %0:v[4] = global_load_dword %0:v[0-1], s1: undef
//! v1: %0:v[5] = flat_load_dword %0:v[0-1], s1: undef
//~gfx9! s_waitcnt vmcnt(0)
//~gfx10! s_waitcnt vmcnt(1)
//! p_unit_test %0:v[4]
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0));
bld.global(aco_opcode::global_load_dword, dest0, addr, Operand(s1));
bld.flat(aco_opcode::flat_load_dword, dest1, addr, Operand(s1));
bld.pseudo(aco_opcode::p_unit_test, Operand(dest0.physReg(), v1));
/* Case 1: DS read, then flat load; read the DS result.
 * Only lgkmcnt is expected: 0 on GFX9, 1 on GFX10.
 */
//>> p_unit_test 1
//! v1: %0:v[4] = ds_read_b32 %0:v[0]
//! v1: %0:v[5] = flat_load_dword %0:v[0-1], s1: undef
//~gfx9! s_waitcnt lgkmcnt(0)
//~gfx10! s_waitcnt lgkmcnt(1)
//! p_unit_test %0:v[4]
bld.reset(program->create_and_insert_block());
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1));
bld.ds(aco_opcode::ds_read_b32, dest0, offset);
bld.flat(aco_opcode::flat_load_dword, dest1, addr, Operand(s1));
bld.pseudo(aco_opcode::p_unit_test, Operand(dest0.physReg(), v1));
/* Case 2: flat load, then global load; read the flat result.
 * Both counters are expected; on GFX10 vmcnt may stay at 1.
 */
//>> p_unit_test 2
//! v1: %0:v[4] = flat_load_dword %0:v[0-1], s1: undef
//! v1: %0:v[5] = global_load_dword %0:v[0-1], s1: undef
//~gfx9! s_waitcnt lgkmcnt(0) vmcnt(0)
//~gfx10! s_waitcnt lgkmcnt(0) vmcnt(1)
//! p_unit_test %0:v[4]
bld.reset(program->create_and_insert_block());
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2));
bld.flat(aco_opcode::flat_load_dword, dest0, addr, Operand(s1));
bld.global(aco_opcode::global_load_dword, dest1, addr, Operand(s1));
bld.pseudo(aco_opcode::p_unit_test, Operand(dest0.physReg(), v1));
/* Case 3: flat load, then DS read; read the flat result.
 * Both counters are expected; on GFX10 lgkmcnt may stay at 1.
 */
//>> p_unit_test 3
//! v1: %0:v[4] = flat_load_dword %0:v[0-1], s1: undef
//! v1: %0:v[5] = ds_read_b32 %0:v[0]
//~gfx9! s_waitcnt lgkmcnt(0) vmcnt(0)
//~gfx10! s_waitcnt lgkmcnt(1) vmcnt(0)
//! p_unit_test %0:v[4]
bld.reset(program->create_and_insert_block());
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3));
bld.flat(aco_opcode::flat_load_dword, dest0, addr, Operand(s1));
bld.ds(aco_opcode::ds_read_b32, dest1, offset);
bld.pseudo(aco_opcode::p_unit_test, Operand(dest0.physReg(), v1));
/* Case 4: two flat loads; read the first one's result.
 * Both counters are expected; on GFX10 both may stay at 1.
 */
//>> p_unit_test 4
//! v1: %0:v[4] = flat_load_dword %0:v[0-1], s1: undef
//! v1: %0:v[5] = flat_load_dword %0:v[0-1], s1: undef
//~gfx9! s_waitcnt lgkmcnt(0) vmcnt(0)
//~gfx10! s_waitcnt lgkmcnt(1) vmcnt(1)
//! p_unit_test %0:v[4]
bld.reset(program->create_and_insert_block());
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4));
bld.flat(aco_opcode::flat_load_dword, dest0, addr, Operand(s1));
bld.flat(aco_opcode::flat_load_dword, dest1, addr, Operand(s1));
bld.pseudo(aco_opcode::p_unit_test, Operand(dest0.physReg(), v1));
finish_waitcnt_test();
}
END_TEST
/* Checks the s_waitcnt emitted between two loads that write the same register
 * (v[4]) when at least one of them is a flat load, on GFX9 and GFX10. The
 * check lines carry no per-generation prefix, so both levels expect the same
 * output. Each case runs in its own block.
 */
BEGIN_TEST(insert_waitcnt.flat.waw)
for (amd_gfx_level gfx : {GFX9, GFX10}) {
if (!setup_cs(NULL, gfx))
continue;
/* Flat might use either LDS or VMEM, so WaW always needs a wait. */
Definition dest(PhysReg(260), v1);
Operand offset(PhysReg(256), v1);
Operand addr(PhysReg(256), v2);
Operand desc_s4(PhysReg(0), s4);
Operand desc_s8(PhysReg(8), s8);
/* Case 0: global load overwritten by a flat load; vmcnt(0) is expected. */
//>> p_unit_test 0
//! v1: %0:v[4] = global_load_dword %0:v[0-1], s1: undef
//! s_waitcnt vmcnt(0)
//! v1: %0:v[4] = flat_load_dword %0:v[0-1], s1: undef
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0));
bld.global(aco_opcode::global_load_dword, dest, addr, Operand(s1));
bld.flat(aco_opcode::flat_load_dword, dest, addr, Operand(s1));
/* Case 1: flat load overwritten by a global load; both counters are expected
 * to be waited to zero.
 */
//>> p_unit_test 1
//! v1: %0:v[4] = flat_load_dword %0:v[0-1], s1: undef
//! s_waitcnt lgkmcnt(0) vmcnt(0)
//! v1: %0:v[4] = global_load_dword %0:v[0-1], s1: undef
bld.reset(program->create_and_insert_block());
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1));
bld.flat(aco_opcode::flat_load_dword, dest, addr, Operand(s1));
bld.global(aco_opcode::global_load_dword, dest, addr, Operand(s1));
/* Case 2: DS read overwritten by a flat load; lgkmcnt(0) is expected. */
//>> p_unit_test 2
//! v1: %0:v[4] = ds_read_b32 %0:v[0]
//! s_waitcnt lgkmcnt(0)
//! v1: %0:v[4] = flat_load_dword %0:v[0-1], s1: undef
bld.reset(program->create_and_insert_block());
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2));
bld.ds(aco_opcode::ds_read_b32, dest, offset);
bld.flat(aco_opcode::flat_load_dword, dest, addr, Operand(s1));
/* Case 3: flat load overwritten by a DS read; only vmcnt(0) is expected.
 * NOTE(review): presumably no lgkmcnt wait is needed because the DS read is
 * ordered after the flat load's LDS path - confirm.
 */
//>> p_unit_test 3
//! v1: %0:v[4] = flat_load_dword %0:v[0-1], s1: undef
//! s_waitcnt vmcnt(0)
//! v1: %0:v[4] = ds_read_b32 %0:v[0]
bld.reset(program->create_and_insert_block());
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3));
bld.flat(aco_opcode::flat_load_dword, dest, addr, Operand(s1));
bld.ds(aco_opcode::ds_read_b32, dest, offset);
/* In theory, we don't need a wait here, but we don't optimize this. */
//>> p_unit_test 4
//! v1: %0:v[4] = flat_load_dword %0:v[0-1], s1: undef
//! s_waitcnt lgkmcnt(0) vmcnt(0)
//! v1: %0:v[4] = flat_load_dword %0:v[0-1], s1: undef
bld.reset(program->create_and_insert_block());
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4));
bld.flat(aco_opcode::flat_load_dword, dest, addr, Operand(s1));
bld.flat(aco_opcode::flat_load_dword, dest, addr, Operand(s1));
/* Case 5: flat load overwritten by an image sample; both counters are
 * expected to be waited to zero.
 */
//>> p_unit_test 5
//! v1: %0:v[4] = flat_load_dword %0:v[0-1], s1: undef
//! s_waitcnt lgkmcnt(0) vmcnt(0)
//! v1: %0:v[4] = image_sample %0:s[8-15], %0:s[0-3], v1: undef, %0:v[0] 1d
bld.reset(program->create_and_insert_block());
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5));
bld.flat(aco_opcode::flat_load_dword, dest, addr, Operand(s1));
bld.mimg(aco_opcode::image_sample, dest, desc_s8, desc_s4, Operand(v1), offset);
finish_waitcnt_test();
}
END_TEST
/* Checks the waits emitted for a p_barrier with buffer storage, acquire-release
 * semantics and device scope, when the preceding buffer-tagged access is, or is
 * followed by, a flat instruction. Runs on GFX9 and GFX10; each case uses its
 * own block and the expected output is given by the check lines.
 */
BEGIN_TEST(insert_waitcnt.flat.barrier)
for (amd_gfx_level gfx : {GFX9, GFX10}) {
if (!setup_cs(NULL, gfx))
continue;
Definition dest0(PhysReg(260), v1);
Definition dest1(PhysReg(261), v1);
Operand addr(PhysReg(256), v2);
Operand data(PhysReg(256), v1);
/* Case 0: buffer-tagged global load followed by an untagged flat load.
 * Only vmcnt is expected: 0 on GFX9, 1 on GFX10.
 */
//>> p_unit_test 0
//! v1: %0:v[4] = global_load_dword %0:v[0-1], s1: undef storage:buffer
//! v1: %0:v[5] = flat_load_dword %0:v[0-1], s1: undef
//~gfx9! s_waitcnt vmcnt(0)
//~gfx10! s_waitcnt vmcnt(1)
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0));
bld.global(aco_opcode::global_load_dword, dest0, addr, Operand(s1), 0,
memory_sync_info(storage_buffer));
bld.flat(aco_opcode::flat_load_dword, dest1, addr, Operand(s1));
bld.barrier(aco_opcode::p_barrier,
memory_sync_info(storage_buffer, semantic_acqrel, scope_device));
/* Case 1: buffer-tagged flat load followed by an untagged global load.
 * Both counters are expected; on GFX10 vmcnt may stay at 1.
 */
//>> p_unit_test 1
//! v1: %0:v[5] = flat_load_dword %0:v[0-1], s1: undef storage:buffer
//! v1: %0:v[4] = global_load_dword %0:v[0-1], s1: undef
//~gfx9! s_waitcnt lgkmcnt(0) vmcnt(0)
//~gfx10! s_waitcnt lgkmcnt(0) vmcnt(1)
bld.reset(program->create_and_insert_block());
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1));
bld.flat(aco_opcode::flat_load_dword, dest1, addr, Operand(s1), 0,
memory_sync_info(storage_buffer));
bld.global(aco_opcode::global_load_dword, dest0, addr, Operand(s1), 0);
bld.barrier(aco_opcode::p_barrier,
memory_sync_info(storage_buffer, semantic_acqrel, scope_device));
/* Case 2: buffer-tagged flat store. GFX9 expects lgkmcnt(0) vmcnt(0); GFX10
 * expects a separate s_waitcnt_vscnt with imm 0 plus lgkmcnt(0).
 */
//>> p_unit_test 2
//! flat_store_dword %0:v[0-1], s1: undef, %0:v[0] storage:buffer
//~gfx9! s_waitcnt lgkmcnt(0) vmcnt(0)
//~gfx10! s_waitcnt_vscnt %0:null imm:0
//~gfx10! s_waitcnt lgkmcnt(0)
bld.reset(program->create_and_insert_block());
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2));
bld.flat(aco_opcode::flat_store_dword, addr, Operand(s1), data, 0,
memory_sync_info(storage_buffer));
bld.barrier(aco_opcode::p_barrier,
memory_sync_info(storage_buffer, semantic_acqrel, scope_device));
finish_waitcnt_test();
}
END_TEST