From 9649deb50e1e83d210966a6c98a23f21243c622b Mon Sep 17 00:00:00 2001
From: Rhys Perry
Date: Tue, 29 Apr 2025 17:37:59 +0100
Subject: [PATCH] aco: skip waitcnt between two vmem writing different halves

fossil-db (gfx1201):
Totals from 4 (0.01% of 79653) affected shaders:
Instrs: 41374 -> 41380 (+0.01%); split: -0.01%, +0.02%
CodeSize: 238912 -> 238924 (+0.01%); split: -0.01%, +0.01%
Latency: 706714 -> 706410 (-0.04%)
InvThroughput: 352269 -> 352118 (-0.04%)
VClause: 803 -> 798 (-0.62%)

fossil-db (navi31):
Totals from 0 (0.00% of 79653) affected shaders:

fossil-db (navi21):
Totals from 0 (0.00% of 79653) affected shaders:

Signed-off-by: Rhys Perry
Closes: https://gitlab.freedesktop.org/mesa/mesa/-/issues/13028
Reviewed-by: Georg Lehmann
Part-of:
---
 src/amd/compiler/aco_insert_waitcnt.cpp       | 10 ++-
 .../compiler/tests/test_insert_waitcnt.cpp    | 79 +++++++++++++++++++
 2 files changed, 87 insertions(+), 2 deletions(-)

diff --git a/src/amd/compiler/aco_insert_waitcnt.cpp b/src/amd/compiler/aco_insert_waitcnt.cpp
index e56c876e783..16f51e2bac2 100644
--- a/src/amd/compiler/aco_insert_waitcnt.cpp
+++ b/src/amd/compiler/aco_insert_waitcnt.cpp
@@ -350,7 +350,7 @@ check_instr(wait_ctx& ctx, wait_imm& wait, Instruction* instr)
           * VMEM loads do not write the same lanes. Since GFX11, we track VMEM operations on the
           * linear CFG, so this is difficult */
          uint8_t vmem_type = get_vmem_type(ctx.gfx_level, instr);
-         if (vmem_type && ctx.gfx_level < GFX12) {
+         if (vmem_type) {
             wait_event event = get_vmem_event(ctx, instr, vmem_type);
             wait_type type = (wait_type)(ffs(ctx.info->get_counters_for_event(event)) - 1);
 
@@ -359,7 +359,13 @@ check_instr(wait_ctx& ctx, wait_imm& wait, Instruction* instr)
             bool type_matches = type != wait_type_vm || (it->second.vmem_types == vmem_type &&
                                                          util_bitcount(vmem_type) == 1);
 
-            if (event_matches && type_matches)
+            bool different_halves = false;
+            if (event == event_vmem && event_matches) {
+               uint32_t mask = (get_vmem_mask(ctx, instr) >> (j * 2)) & 0x3;
+               different_halves = !(mask & it->second.vm_mask);
+            }
+
+            if ((event_matches && type_matches && ctx.gfx_level < GFX12) || different_halves)
                reg_imm[type] = wait_imm::unset_counter;
          }
 
diff --git a/src/amd/compiler/tests/test_insert_waitcnt.cpp b/src/amd/compiler/tests/test_insert_waitcnt.cpp
index 8afecdbb879..c0520bd62e7 100644
--- a/src/amd/compiler/tests/test_insert_waitcnt.cpp
+++ b/src/amd/compiler/tests/test_insert_waitcnt.cpp
@@ -612,3 +612,82 @@ BEGIN_TEST(insert_waitcnt.vmem_ds)
 
    finish_waitcnt_test();
 END_TEST
+
+BEGIN_TEST(insert_waitcnt.waw.vmem_different_halves)
+   if (!setup_cs(NULL, GFX12)) /* each case: two VMEM loads into v4, checks give expected waits */
+      return; /* setup_cs() reported failure: nothing to build */
+
+   Definition def_v4_lo(PhysReg(260), v2b); /* printed as v[4][0:16] in the checks below */
+   Definition def_v4_hi(PhysReg(260).advance(2), v2b); /* printed as v[4][16:32] in the checks */
+   Operand op_v0(PhysReg(256), v1);
+   Operand desc_s4(PhysReg(0), s4);
+   Operand desc_s8(PhysReg(8), s8);
+
+   //>> p_unit_test 0
+   //! v2b: %0:v[4][0:16] = buffer_load_short_d16 %0:s[0-3], %0:v[0], 0
+   //! v2b: %0:v[4][16:32] = buffer_load_short_d16_hi %0:s[0-3], %0:v[0], 0
+   bld.pseudo(aco_opcode::p_unit_test, Operand::zero()); /* lo then hi half: no wait expected */
+   bld.mubuf(aco_opcode::buffer_load_short_d16, def_v4_lo, desc_s4, op_v0, Operand::zero(), 0,
+             false);
+   bld.mubuf(aco_opcode::buffer_load_short_d16_hi, def_v4_hi, desc_s4, op_v0, Operand::zero(), 0,
+             false);
+
+   //>> p_unit_test 1
+   //! v2b: %0:v[4][16:32] = buffer_load_short_d16_hi %0:s[0-3], %0:v[0], 0
+   //! v2b: %0:v[4][0:16] = buffer_load_short_d16 %0:s[0-3], %0:v[0], 0
+   bld.reset(program->create_and_insert_block());
+   bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1)); /* hi then lo half: no wait expected */
+   bld.mubuf(aco_opcode::buffer_load_short_d16_hi, def_v4_hi, desc_s4, op_v0, Operand::zero(), 0,
+             false);
+   bld.mubuf(aco_opcode::buffer_load_short_d16, def_v4_lo, desc_s4, op_v0, Operand::zero(), 0,
+             false);
+
+   //>> p_unit_test 2
+   //! v2b: %0:v[4][0:16] = buffer_load_short_d16 %0:s[0-3], %0:v[0], 0
+   //! s_wait_loadcnt imm:0
+   //! v2b: %0:v[4][0:16] = buffer_load_short_d16 %0:s[0-3], %0:v[0], 0
+   bld.reset(program->create_and_insert_block());
+   bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2)); /* lo written twice: loadcnt wait */
+   bld.mubuf(aco_opcode::buffer_load_short_d16, def_v4_lo, desc_s4, op_v0, Operand::zero(), 0,
+             false);
+   bld.mubuf(aco_opcode::buffer_load_short_d16, def_v4_lo, desc_s4, op_v0, Operand::zero(), 0,
+             false);
+
+   //>> p_unit_test 3
+   //! v2b: %0:v[4][16:32] = buffer_load_short_d16_hi %0:s[0-3], %0:v[0], 0
+   //! s_wait_loadcnt imm:0
+   //! v2b: %0:v[4][16:32] = buffer_load_short_d16_hi %0:s[0-3], %0:v[0], 0
+   bld.reset(program->create_and_insert_block());
+   bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3)); /* hi written twice: loadcnt wait */
+   bld.mubuf(aco_opcode::buffer_load_short_d16_hi, def_v4_hi, desc_s4, op_v0, Operand::zero(), 0,
+             false);
+   bld.mubuf(aco_opcode::buffer_load_short_d16_hi, def_v4_hi, desc_s4, op_v0, Operand::zero(), 0,
+             false);
+
+   //>> p_unit_test 4
+   //! v2b: %0:v[4][0:16] = image_sample %0:s[8-15], %0:s[0-3], v1: undef, %0:v[0] 1d d16
+   //! s_wait_samplecnt imm:0
+   //! v2b: %0:v[4][16:32] = buffer_load_short_d16_hi %0:s[0-3], %0:v[0], 0
+   bld.reset(program->create_and_insert_block());
+   bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4)); /* sample then load: samplecnt wait */
+   Instruction* instr =
+      bld.mimg(aco_opcode::image_sample, def_v4_lo, desc_s8, desc_s4, Operand(v1), op_v0);
+   instr->mimg().dmask = 0x1;
+   instr->mimg().d16 = true;
+   bld.mubuf(aco_opcode::buffer_load_short_d16_hi, def_v4_hi, desc_s4, op_v0, Operand::zero(), 0,
+             false);
+
+   //>> p_unit_test 5
+   //! v2b: %0:v[4][16:32] = buffer_load_short_d16_hi %0:s[0-3], %0:v[0], 0
+   //! s_wait_loadcnt imm:0
+   //! v2b: %0:v[4][0:16] = image_sample %0:s[8-15], %0:s[0-3], v1: undef, %0:v[0] 1d d16
+   bld.reset(program->create_and_insert_block());
+   bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5)); /* load then sample: loadcnt wait */
+   bld.mubuf(aco_opcode::buffer_load_short_d16_hi, def_v4_hi, desc_s4, op_v0, Operand::zero(), 0,
+             false);
+   instr = bld.mimg(aco_opcode::image_sample, def_v4_lo, desc_s8, desc_s4, Operand(v1), op_v0);
+   instr->mimg().dmask = 0x1;
+   instr->mimg().d16 = true;
+
+   finish_waitcnt_test();
+END_TEST