aco: skip waitcnt between two vmem writing different lanes

fossil-db (gfx1201):
Totals from 1382 (1.74% of 79653) affected shaders:
Instrs: 6531704 -> 6523935 (-0.12%); split: -0.12%, +0.00%
CodeSize: 34992076 -> 34933568 (-0.17%); split: -0.17%, +0.01%
Latency: 70183360 -> 69616066 (-0.81%); split: -0.81%, +0.00%
InvThroughput: 11155445 -> 11068667 (-0.78%); split: -0.78%, +0.00%

fossil-db (navi31):
Totals from 46 (0.06% of 79653) affected shaders:
Instrs: 1833768 -> 1833732 (-0.00%)
CodeSize: 9468788 -> 9468716 (-0.00%)
Latency: 11683092 -> 11667865 (-0.13%)
InvThroughput: 2274377 -> 2272872 (-0.07%)

fossil-db (navi21):
Totals from 0 (0.00% of 79653) affected shaders:

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Georg Lehmann <dadschoorse@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/34978>
This commit is contained in:
Rhys Perry 2025-05-12 16:35:21 +01:00
parent 9649deb50e
commit c1f8537131
2 changed files with 72 additions and 4 deletions

View file

@ -346,9 +346,9 @@ check_instr(wait_ctx& ctx, wait_imm& wait, Instruction* instr)
/* Vector Memory reads and writes decrease the counter in the order they were issued.
* Before GFX12, they also write VGPRs in order if they're of the same type.
* TODO: We can do this for GFX12 and different types for GFX11 if we know that the two
* VMEM loads do not write the same lanes. Since GFX11, we track VMEM operations on the
* linear CFG, so this is difficult */
* We can do this for GFX12 and different types for GFX11 if we know that the two
* VMEM loads do not write the same register half or the same lanes.
*/
uint8_t vmem_type = get_vmem_type(ctx.gfx_level, instr);
if (vmem_type) {
wait_event event = get_vmem_event(ctx, instr, vmem_type);
@ -365,7 +365,10 @@ check_instr(wait_ctx& ctx, wait_imm& wait, Instruction* instr)
different_halves = !(mask & it->second.vm_mask);
}
if ((event_matches && type_matches && ctx.gfx_level < GFX12) || different_halves)
bool different_lanes = (it->second.logical_events & ctx.info->events[type]) == 0;
if ((event_matches && type_matches && ctx.gfx_level < GFX12) || different_halves ||
different_lanes)
reg_imm[type] = wait_imm::unset_counter;
}

View file

@ -691,3 +691,68 @@ BEGIN_TEST(insert_waitcnt.waw.vmem_different_halves)
finish_waitcnt_test();
END_TEST
/* Write-after-write test: VMEM loads that target the same VGPR are emitted on
 * opposite sides of a divergent if/else, and the register is read after the
 * branches rejoin. The body runs once for each of GFX10.3, GFX11 and GFX12.
 * The expectation comments next to each emit call are matched against the
 * generated output and are kept exactly as written.
 */
BEGIN_TEST(insert_waitcnt.waw.vmem_different_lanes)
for (amd_gfx_level gfx : {GFX10_3, GFX11, GFX12}) {
/* Move on to the next level when setup_cs() returns false. */
if (!setup_cs(NULL, gfx))
continue;
/* Fixed physical registers shared by every load below: the destination
 * (printed as v[4] in the expectations), the VGPR source (printed as v[0]),
 * and the 4-dword / 8-dword SGPR operands (printed as s[0-3] / s[8-15]).
 */
Definition def_v4(PhysReg(260), v1);
Operand op_v0(PhysReg(256), v1);
Operand desc_s4(PhysReg(0), s4);
Operand desc_s8(PhysReg(8), s8);
/* Case 1: both branches issue the same buffer_load_dword into v[4]. The
 * expectations list no wait in front of either load.
 */
emit_divergent_if_else(
program.get(), bld, Operand::c64(1),
[&]()
{
//>> p_unit_test 1
//! v1: %0:v[4] = buffer_load_dword %0:s[0-3], %0:v[0], 0
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1));
bld.mubuf(aco_opcode::buffer_load_dword, def_v4, desc_s4, op_v0, Operand::zero(), 0,
false);
},
[&]()
{
//>> p_unit_test 2
//! v1: %0:v[4] = buffer_load_dword %0:s[0-3], %0:v[0], 0
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2));
bld.mubuf(aco_opcode::buffer_load_dword, def_v4, desc_s4, op_v0, Operand::zero(), 0,
false);
});
/* Reading v[4] after the if/else is expected to be preceded by a wait:
 * vmcnt(0) on GFX10.3/GFX11, loadcnt 0 on GFX12.
 */
//>> p_unit_test 3
//~gfx(10_3|11)! s_waitcnt vmcnt(0)
//~gfx12! s_wait_loadcnt imm:0
//! p_unit_test %0:v[4]
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3));
bld.pseudo(aco_opcode::p_unit_test, Operand(PhysReg(260), v1));
/* Case 2: the first branch uses buffer_load_dword and the second uses
 * image_sample, both writing v[4].
 */
emit_divergent_if_else(
program.get(), bld, Operand::c64(1),
[&]()
{
//>> p_unit_test 4
//! v1: %0:v[4] = buffer_load_dword %0:s[0-3], %0:v[0], 0
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4));
bld.mubuf(aco_opcode::buffer_load_dword, def_v4, desc_s4, op_v0, Operand::zero(), 0,
false);
},
[&]()
{
/* Only GFX12 expects a wait (on loadcnt) in front of the sample here;
 * GFX10.3 and GFX11 expect none. NOTE(review): presumably because the
 * buffer load and the sample are tracked by different counters on
 * GFX12 - confirm against check_instr().
 */
//>> p_unit_test 5
//~gfx12! s_wait_loadcnt imm:0
//! v1: %0:v[4] = image_sample %0:s[8-15], %0:s[0-3], v1: undef, %0:v[0] 1d
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5));
bld.mimg(aco_opcode::image_sample, def_v4, desc_s8, desc_s4, Operand(v1), op_v0);
});
/* The final read of v[4] expects vmcnt(0) on GFX10.3/GFX11, and both
 * loadcnt 0 and samplecnt 0 on GFX12.
 */
//>> p_unit_test 6
//~gfx(10_3|11)! s_waitcnt vmcnt(0)
//~gfx12! s_wait_loadcnt imm:0
//~gfx12! s_wait_samplecnt imm:0
//! p_unit_test %0:v[4]
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(6));
bld.pseudo(aco_opcode::p_unit_test, Operand(PhysReg(260), v1));
finish_waitcnt_test();
}
END_TEST