From d705b6198c7d5b3af9ca1d69ba39228b747a73b0 Mon Sep 17 00:00:00 2001 From: Rhys Perry Date: Mon, 9 Jun 2025 14:54:46 +0100 Subject: [PATCH] aco: simplify waitcnt insertion for flat access MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Rhys Perry Reviewed-by: Daniel Schürmann Part-of: --- src/amd/compiler/aco_insert_waitcnt.cpp | 59 ++---- .../compiler/tests/test_insert_waitcnt.cpp | 194 ++++++++++++++++++ 2 files changed, 212 insertions(+), 41 deletions(-) diff --git a/src/amd/compiler/aco_insert_waitcnt.cpp b/src/amd/compiler/aco_insert_waitcnt.cpp index 248b7f51d83..aea382056cb 100644 --- a/src/amd/compiler/aco_insert_waitcnt.cpp +++ b/src/amd/compiler/aco_insert_waitcnt.cpp @@ -45,7 +45,6 @@ enum wait_event : uint32_t { event_gds = 1 << 2, event_vmem = 1 << 3, event_vmem_store = 1 << 4, /* GFX10+ */ - event_flat = 1 << 5, event_exp_pos = 1 << 6, event_exp_param = 1 << 7, event_exp_mrt_null = 1 << 8, @@ -102,9 +101,7 @@ struct wait_entry { counters &= ~(1 << type); imm[type] = wait_imm::unset_counter; - events &= ~type_events | event_flat; - if (!(counters & counter_lgkm) && !(counters & counter_vm)) - events &= ~(type_events & event_flat); + events &= ~type_events; logical_events &= events; if (type == wait_type_vm) @@ -144,8 +141,8 @@ struct target_info { events[wait_type_exp] = event_exp_pos | event_exp_param | event_exp_mrt_null | event_gds_gpr_lock | event_vmem_gpr_lock | event_ldsdir; - events[wait_type_lgkm] = event_smem | event_lds | event_gds | event_flat | event_sendmsg; - events[wait_type_vm] = event_vmem | event_flat; + events[wait_type_lgkm] = event_smem | event_lds | event_gds | event_sendmsg; + events[wait_type_vm] = event_vmem; events[wait_type_vs] = event_vmem_store; if (gfx_level >= GFX12) { events[wait_type_sample] = event_vmem_sample; @@ -159,7 +156,7 @@ struct target_info { counters[j] |= (1 << i); } - unordered_events = event_smem | (gfx_level < GFX10 ? 
event_flat : 0); + unordered_events = event_smem; } uint8_t get_counters_for_event(wait_event event) const { return counters[ffs(event) - 1]; } @@ -317,7 +314,7 @@ get_imm(wait_ctx& ctx, PhysReg reg, wait_entry& entry) */ if (ctx.gfx_level >= GFX11) { uint32_t ds_vmem_events = - event_lds | event_gds | event_vmem | event_vmem_sample | event_vmem_bvh | event_flat; + event_lds | event_gds | event_vmem | event_vmem_sample | event_vmem_bvh; events |= ds_vmem_events; } @@ -523,11 +520,9 @@ kill(wait_imm& imm, Instruction* instr, wait_ctx& ctx, memory_sync_info sync_inf for (unsigned j = 0; j < wait_type_num; j++) { if (bar[j] != wait_imm::unset_counter && imm[j] <= bar[j]) { bar[j] = wait_imm::unset_counter; - bar_ev &= ~ctx.info->events[j] | event_flat; + bar_ev &= ~ctx.info->events[j]; } } - if (bar.vm == wait_imm::unset_counter && bar.lgkm == wait_imm::unset_counter) - bar_ev &= ~event_flat; } /* remove all gprs with higher counter from map */ @@ -587,11 +582,6 @@ update_counters(wait_ctx& ctx, wait_event event, memory_sync_info sync = memory_ if (ctx.info->unordered_events & event) return; - if (ctx.pending_flat_lgkm) - counters &= ~counter_lgkm; - if (ctx.pending_flat_vm) - counters &= ~counter_vm; - for (std::pair<const PhysReg, wait_entry>& e : ctx.gpr_map) { wait_entry& entry = e.second; @@ -607,25 +597,6 @@ update_counters(wait_ctx& ctx, wait_event event, memory_sync_info sync = memory_ } } -void -update_counters_for_flat_load(wait_ctx& ctx, memory_sync_info sync = memory_sync_info()) -{ - assert(ctx.gfx_level < GFX10); - - ctx.nonzero |= BITFIELD_BIT(wait_type_lgkm) | BITFIELD_BIT(wait_type_vm); - - update_barrier_imm(ctx, counter_vm | counter_lgkm, event_flat, sync); - - for (std::pair<PhysReg, wait_entry> e : ctx.gpr_map) { - if (e.second.counters & counter_vm) - e.second.imm.vm = 0; - if (e.second.counters & counter_lgkm) - e.second.imm.lgkm = 0; - } - ctx.pending_flat_lgkm = true; - ctx.pending_flat_vm = true; -} - void insert_wait_entry(wait_ctx& ctx, PhysReg reg, RegClass rc, wait_event event, bool 
wait_on_read, uint8_t vmem_types = 0, uint32_t vm_mask = 0) @@ -693,13 +664,19 @@ gen(Instruction* instr, wait_ctx& ctx) } case Format::FLAT: { FLAT_instruction& flat = instr->flat(); - if (ctx.gfx_level < GFX10 && !instr->definitions.empty()) - update_counters_for_flat_load(ctx, flat.sync); - else - update_counters(ctx, event_flat, flat.sync); + wait_event vmem_ev = get_vmem_event(ctx, instr, vmem_nosampler); + update_counters(ctx, vmem_ev, flat.sync); + update_counters(ctx, event_lds, flat.sync); - if (!instr->definitions.empty()) - insert_wait_entry(ctx, instr->definitions[0], event_flat); + if (!instr->definitions.empty()) { + insert_wait_entry(ctx, instr->definitions[0], vmem_ev, 0, get_vmem_mask(ctx, instr)); + insert_wait_entry(ctx, instr->definitions[0], event_lds); + } + + if (ctx.gfx_level < GFX10 && !instr->definitions.empty()) { + ctx.pending_flat_lgkm = true; + ctx.pending_flat_vm = true; + } break; } case Format::SMEM: { diff --git a/src/amd/compiler/tests/test_insert_waitcnt.cpp b/src/amd/compiler/tests/test_insert_waitcnt.cpp index 5f9d19672f9..1e7a8554728 100644 --- a/src/amd/compiler/tests/test_insert_waitcnt.cpp +++ b/src/amd/compiler/tests/test_insert_waitcnt.cpp @@ -901,3 +901,197 @@ BEGIN_TEST(insert_waitcnt.divergent_branch.no_skip) finish_waitcnt_test(); } END_TEST + +BEGIN_TEST(insert_waitcnt.flat.wait_zero) + for (amd_gfx_level gfx : {GFX9, GFX10}) { + if (!setup_cs(NULL, gfx)) + continue; + + Definition dest0(PhysReg(260), v1); + Definition dest1(PhysReg(261), v1); + Operand offset(PhysReg(256), v1); + Operand addr(PhysReg(256), v2); + + //>> p_unit_test 0 + //! v1: %0:v[4] = global_load_dword %0:v[0-1], s1: undef + //! v1: %0:v[5] = flat_load_dword %0:v[0-1], s1: undef + //~gfx9! s_waitcnt vmcnt(0) + //~gfx10! s_waitcnt vmcnt(1) + //! 
p_unit_test %0:v[4] + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0)); + bld.global(aco_opcode::global_load_dword, dest0, addr, Operand(s1)); + bld.flat(aco_opcode::flat_load_dword, dest1, addr, Operand(s1)); + bld.pseudo(aco_opcode::p_unit_test, Operand(dest0.physReg(), v1)); + + //>> p_unit_test 1 + //! v1: %0:v[4] = ds_read_b32 %0:v[0] + //! v1: %0:v[5] = flat_load_dword %0:v[0-1], s1: undef + //~gfx9! s_waitcnt lgkmcnt(0) + //~gfx10! s_waitcnt lgkmcnt(1) + //! p_unit_test %0:v[4] + bld.reset(program->create_and_insert_block()); + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1)); + bld.ds(aco_opcode::ds_read_b32, dest0, offset); + bld.flat(aco_opcode::flat_load_dword, dest1, addr, Operand(s1)); + bld.pseudo(aco_opcode::p_unit_test, Operand(dest0.physReg(), v1)); + + //>> p_unit_test 2 + //! v1: %0:v[4] = flat_load_dword %0:v[0-1], s1: undef + //! v1: %0:v[5] = global_load_dword %0:v[0-1], s1: undef + //~gfx9! s_waitcnt lgkmcnt(0) vmcnt(0) + //~gfx10! s_waitcnt lgkmcnt(0) vmcnt(1) + //! p_unit_test %0:v[4] + bld.reset(program->create_and_insert_block()); + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2)); + bld.flat(aco_opcode::flat_load_dword, dest0, addr, Operand(s1)); + bld.global(aco_opcode::global_load_dword, dest1, addr, Operand(s1)); + bld.pseudo(aco_opcode::p_unit_test, Operand(dest0.physReg(), v1)); + + //>> p_unit_test 3 + //! v1: %0:v[4] = flat_load_dword %0:v[0-1], s1: undef + //! v1: %0:v[5] = ds_read_b32 %0:v[0] + //~gfx9! s_waitcnt lgkmcnt(0) vmcnt(0) + //~gfx10! s_waitcnt lgkmcnt(1) vmcnt(0) + //! p_unit_test %0:v[4] + bld.reset(program->create_and_insert_block()); + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3)); + bld.flat(aco_opcode::flat_load_dword, dest0, addr, Operand(s1)); + bld.ds(aco_opcode::ds_read_b32, dest1, offset); + bld.pseudo(aco_opcode::p_unit_test, Operand(dest0.physReg(), v1)); + + //>> p_unit_test 4 + //! v1: %0:v[4] = flat_load_dword %0:v[0-1], s1: undef + //! 
v1: %0:v[5] = flat_load_dword %0:v[0-1], s1: undef + //~gfx9! s_waitcnt lgkmcnt(0) vmcnt(0) + //~gfx10! s_waitcnt lgkmcnt(1) vmcnt(1) + //! p_unit_test %0:v[4] + bld.reset(program->create_and_insert_block()); + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4)); + bld.flat(aco_opcode::flat_load_dword, dest0, addr, Operand(s1)); + bld.flat(aco_opcode::flat_load_dword, dest1, addr, Operand(s1)); + bld.pseudo(aco_opcode::p_unit_test, Operand(dest0.physReg(), v1)); + + finish_waitcnt_test(); + } +END_TEST + +BEGIN_TEST(insert_waitcnt.flat.waw) + for (amd_gfx_level gfx : {GFX9, GFX10}) { + if (!setup_cs(NULL, gfx)) + continue; + + /* Flat might use either LDS or VMEM, so WaW always needs a wait. */ + Definition dest(PhysReg(260), v1); + Operand offset(PhysReg(256), v1); + Operand addr(PhysReg(256), v2); + Operand desc_s4(PhysReg(0), s4); + Operand desc_s8(PhysReg(8), s8); + + //>> p_unit_test 0 + //! v1: %0:v[4] = global_load_dword %0:v[0-1], s1: undef + //! s_waitcnt vmcnt(0) + //! v1: %0:v[4] = flat_load_dword %0:v[0-1], s1: undef + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0)); + bld.global(aco_opcode::global_load_dword, dest, addr, Operand(s1)); + bld.flat(aco_opcode::flat_load_dword, dest, addr, Operand(s1)); + + //>> p_unit_test 1 + //! v1: %0:v[4] = flat_load_dword %0:v[0-1], s1: undef + //! s_waitcnt lgkmcnt(0) vmcnt(0) + //! v1: %0:v[4] = global_load_dword %0:v[0-1], s1: undef + bld.reset(program->create_and_insert_block()); + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1)); + bld.flat(aco_opcode::flat_load_dword, dest, addr, Operand(s1)); + bld.global(aco_opcode::global_load_dword, dest, addr, Operand(s1)); + + //>> p_unit_test 2 + //! v1: %0:v[4] = ds_read_b32 %0:v[0] + //! s_waitcnt lgkmcnt(0) + //! 
v1: %0:v[4] = flat_load_dword %0:v[0-1], s1: undef + bld.reset(program->create_and_insert_block()); + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2)); + bld.ds(aco_opcode::ds_read_b32, dest, offset); + bld.flat(aco_opcode::flat_load_dword, dest, addr, Operand(s1)); + + //>> p_unit_test 3 + //! v1: %0:v[4] = flat_load_dword %0:v[0-1], s1: undef + //! s_waitcnt vmcnt(0) + //! v1: %0:v[4] = ds_read_b32 %0:v[0] + bld.reset(program->create_and_insert_block()); + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3)); + bld.flat(aco_opcode::flat_load_dword, dest, addr, Operand(s1)); + bld.ds(aco_opcode::ds_read_b32, dest, offset); + + /* In theory, we don't need a wait here, but we don't optimize this. */ + //>> p_unit_test 4 + //! v1: %0:v[4] = flat_load_dword %0:v[0-1], s1: undef + //! s_waitcnt lgkmcnt(0) vmcnt(0) + //! v1: %0:v[4] = flat_load_dword %0:v[0-1], s1: undef + bld.reset(program->create_and_insert_block()); + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4)); + bld.flat(aco_opcode::flat_load_dword, dest, addr, Operand(s1)); + bld.flat(aco_opcode::flat_load_dword, dest, addr, Operand(s1)); + + //>> p_unit_test 5 + //! v1: %0:v[4] = flat_load_dword %0:v[0-1], s1: undef + //! s_waitcnt lgkmcnt(0) vmcnt(0) + //! v1: %0:v[4] = image_sample %0:s[8-15], %0:s[0-3], v1: undef, %0:v[0] 1d + bld.reset(program->create_and_insert_block()); + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5)); + bld.flat(aco_opcode::flat_load_dword, dest, addr, Operand(s1)); + bld.mimg(aco_opcode::image_sample, dest, desc_s8, desc_s4, Operand(v1), offset); + + finish_waitcnt_test(); + } +END_TEST + +BEGIN_TEST(insert_waitcnt.flat.barrier) + for (amd_gfx_level gfx : {GFX9, GFX10}) { + if (!setup_cs(NULL, gfx)) + continue; + + Definition dest0(PhysReg(260), v1); + Definition dest1(PhysReg(261), v1); + Operand addr(PhysReg(256), v2); + Operand data(PhysReg(256), v1); + + //>> p_unit_test 0 + //! v1: %0:v[4] = global_load_dword %0:v[0-1], s1: undef storage:buffer + //! 
v1: %0:v[5] = flat_load_dword %0:v[0-1], s1: undef + //~gfx9! s_waitcnt vmcnt(0) + //~gfx10! s_waitcnt vmcnt(1) + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0)); + bld.global(aco_opcode::global_load_dword, dest0, addr, Operand(s1), 0, + memory_sync_info(storage_buffer)); + bld.flat(aco_opcode::flat_load_dword, dest1, addr, Operand(s1)); + bld.barrier(aco_opcode::p_barrier, + memory_sync_info(storage_buffer, semantic_acqrel, scope_device)); + + //>> p_unit_test 1 + //! v1: %0:v[5] = flat_load_dword %0:v[0-1], s1: undef storage:buffer + //! v1: %0:v[4] = global_load_dword %0:v[0-1], s1: undef + //~gfx9! s_waitcnt lgkmcnt(0) vmcnt(0) + //~gfx10! s_waitcnt lgkmcnt(0) vmcnt(1) + bld.reset(program->create_and_insert_block()); + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1)); + bld.flat(aco_opcode::flat_load_dword, dest1, addr, Operand(s1), 0, + memory_sync_info(storage_buffer)); + bld.global(aco_opcode::global_load_dword, dest0, addr, Operand(s1), 0); + bld.barrier(aco_opcode::p_barrier, + memory_sync_info(storage_buffer, semantic_acqrel, scope_device)); + + //>> p_unit_test 2 + //! flat_store_dword %0:v[0-1], s1: undef, %0:v[0] storage:buffer + //~gfx9! s_waitcnt lgkmcnt(0) vmcnt(0) + //~gfx10! s_waitcnt_vscnt %0:null imm:0 + //~gfx10! s_waitcnt lgkmcnt(0) + bld.reset(program->create_and_insert_block()); + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2)); + bld.flat(aco_opcode::flat_store_dword, addr, Operand(s1), data, 0, + memory_sync_info(storage_buffer)); + bld.barrier(aco_opcode::p_barrier, + memory_sync_info(storage_buffer, semantic_acqrel, scope_device)); + + finish_waitcnt_test(); + } +END_TEST