aco: skip waitcnt between two vmem writing different halves

fossil-db (gfx1201):
Totals from 4 (0.01% of 79653) affected shaders:
Instrs: 41374 -> 41380 (+0.01%); split: -0.01%, +0.02%
CodeSize: 238912 -> 238924 (+0.01%); split: -0.01%, +0.01%
Latency: 706714 -> 706410 (-0.04%)
InvThroughput: 352269 -> 352118 (-0.04%)
VClause: 803 -> 798 (-0.62%)

fossil-db (navi31):
Totals from 0 (0.00% of 79653) affected shaders:

fossil-db (navi21):
Totals from 0 (0.00% of 79653) affected shaders:

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Closes: https://gitlab.freedesktop.org/mesa/mesa/-/issues/13028
Reviewed-by: Georg Lehmann <dadschoorse@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/34978>
This commit is contained in:
Rhys Perry 2025-04-29 17:37:59 +01:00
parent 9a38ad3ca7
commit 9649deb50e
2 changed files with 87 additions and 2 deletions

View file

@ -350,7 +350,7 @@ check_instr(wait_ctx& ctx, wait_imm& wait, Instruction* instr)
* VMEM loads do not write the same lanes. Since GFX11, we track VMEM operations on the
* linear CFG, so this is difficult */
uint8_t vmem_type = get_vmem_type(ctx.gfx_level, instr);
if (vmem_type && ctx.gfx_level < GFX12) {
if (vmem_type) {
wait_event event = get_vmem_event(ctx, instr, vmem_type);
wait_type type = (wait_type)(ffs(ctx.info->get_counters_for_event(event)) - 1);
@ -359,7 +359,13 @@ check_instr(wait_ctx& ctx, wait_imm& wait, Instruction* instr)
bool type_matches = type != wait_type_vm || (it->second.vmem_types == vmem_type &&
util_bitcount(vmem_type) == 1);
if (event_matches && type_matches)
bool different_halves = false;
if (event == event_vmem && event_matches) {
uint32_t mask = (get_vmem_mask(ctx, instr) >> (j * 2)) & 0x3;
different_halves = !(mask & it->second.vm_mask);
}
if ((event_matches && type_matches && ctx.gfx_level < GFX12) || different_halves)
reg_imm[type] = wait_imm::unset_counter;
}

View file

@ -612,3 +612,82 @@ BEGIN_TEST(insert_waitcnt.vmem_ds)
finish_waitcnt_test();
END_TEST
/* Write-after-write test for two VMEM instructions that target the 16-bit halves of the
 * same VGPR, run on GFX12. Each case emits a p_unit_test marker followed by two memory
 * instructions; the `//>>` / `//!` lines are the expected output patterns (presumably
 * consumed by the test harness -- keep them verbatim and in order).
 *
 * Cases:
 *   0, 1: buffer loads into different halves (lo/hi in either order); no wait is expected
 *         between them.
 *   2, 3: buffer loads into the same half; s_wait_loadcnt is expected between them.
 *   4, 5: an image_sample and a buffer load into different halves; a wait is still
 *         expected (samplecnt resp. loadcnt of the first instruction).
 */
BEGIN_TEST(insert_waitcnt.waw.vmem_different_halves)
/* Nothing to test if the GFX12 compute-shader setup fails. */
if (!setup_cs(NULL, GFX12))
return;
/* Two 16-bit definitions in the same register: the expectations below print them as
 * v[4][0:16] (low half) and v[4][16:32] (high half, byte offset 2). */
Definition def_v4_lo(PhysReg(260), v2b);
Definition def_v4_hi(PhysReg(260).advance(2), v2b);
/* Shared address operand (printed as v[0]) and resource descriptors (s[0-3], s[8-15]). */
Operand op_v0(PhysReg(256), v1);
Operand desc_s4(PhysReg(0), s4);
Operand desc_s8(PhysReg(8), s8);
/* Case 0: low-half load followed by high-half load; no wait expected in between. */
//>> p_unit_test 0
//! v2b: %0:v[4][0:16] = buffer_load_short_d16 %0:s[0-3], %0:v[0], 0
//! v2b: %0:v[4][16:32] = buffer_load_short_d16_hi %0:s[0-3], %0:v[0], 0
bld.pseudo(aco_opcode::p_unit_test, Operand::zero());
bld.mubuf(aco_opcode::buffer_load_short_d16, def_v4_lo, desc_s4, op_v0, Operand::zero(), 0,
false);
bld.mubuf(aco_opcode::buffer_load_short_d16_hi, def_v4_hi, desc_s4, op_v0, Operand::zero(), 0,
false);
/* Case 1: same as case 0 with the order swapped (high half first); still no wait.
 * Every case from here on starts in a freshly created block. */
//>> p_unit_test 1
//! v2b: %0:v[4][16:32] = buffer_load_short_d16_hi %0:s[0-3], %0:v[0], 0
//! v2b: %0:v[4][0:16] = buffer_load_short_d16 %0:s[0-3], %0:v[0], 0
bld.reset(program->create_and_insert_block());
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1));
bld.mubuf(aco_opcode::buffer_load_short_d16_hi, def_v4_hi, desc_s4, op_v0, Operand::zero(), 0,
false);
bld.mubuf(aco_opcode::buffer_load_short_d16, def_v4_lo, desc_s4, op_v0, Operand::zero(), 0,
false);
/* Case 2: two loads into the same (low) half; s_wait_loadcnt is expected between them. */
//>> p_unit_test 2
//! v2b: %0:v[4][0:16] = buffer_load_short_d16 %0:s[0-3], %0:v[0], 0
//! s_wait_loadcnt imm:0
//! v2b: %0:v[4][0:16] = buffer_load_short_d16 %0:s[0-3], %0:v[0], 0
bld.reset(program->create_and_insert_block());
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2));
bld.mubuf(aco_opcode::buffer_load_short_d16, def_v4_lo, desc_s4, op_v0, Operand::zero(), 0,
false);
bld.mubuf(aco_opcode::buffer_load_short_d16, def_v4_lo, desc_s4, op_v0, Operand::zero(), 0,
false);
/* Case 3: two loads into the same (high) half; s_wait_loadcnt is expected between them. */
//>> p_unit_test 3
//! v2b: %0:v[4][16:32] = buffer_load_short_d16_hi %0:s[0-3], %0:v[0], 0
//! s_wait_loadcnt imm:0
//! v2b: %0:v[4][16:32] = buffer_load_short_d16_hi %0:s[0-3], %0:v[0], 0
bld.reset(program->create_and_insert_block());
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3));
bld.mubuf(aco_opcode::buffer_load_short_d16_hi, def_v4_hi, desc_s4, op_v0, Operand::zero(), 0,
false);
bld.mubuf(aco_opcode::buffer_load_short_d16_hi, def_v4_hi, desc_s4, op_v0, Operand::zero(), 0,
false);
/* Case 4: d16 image_sample into the low half, then a buffer load into the high half.
 * A s_wait_samplecnt is expected even though the halves differ.
 * NOTE(review): presumably because the sample and the buffer load are tracked by
 * different events/counters -- confirm against check_instr. */
//>> p_unit_test 4
//! v2b: %0:v[4][0:16] = image_sample %0:s[8-15], %0:s[0-3], v1: undef, %0:v[0] 1d d16
//! s_wait_samplecnt imm:0
//! v2b: %0:v[4][16:32] = buffer_load_short_d16_hi %0:s[0-3], %0:v[0], 0
bld.reset(program->create_and_insert_block());
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4));
Instruction* instr =
bld.mimg(aco_opcode::image_sample, def_v4_lo, desc_s8, desc_s4, Operand(v1), op_v0);
/* Single-channel, 16-bit result so the sample matches the v2b definition. */
instr->mimg().dmask = 0x1;
instr->mimg().d16 = true;
bld.mubuf(aco_opcode::buffer_load_short_d16_hi, def_v4_hi, desc_s4, op_v0, Operand::zero(), 0,
false);
/* Case 5: reverse of case 4 -- buffer load into the high half, then a d16 image_sample
 * into the low half. A s_wait_loadcnt is expected before the sample. */
//>> p_unit_test 5
//! v2b: %0:v[4][16:32] = buffer_load_short_d16_hi %0:s[0-3], %0:v[0], 0
//! s_wait_loadcnt imm:0
//! v2b: %0:v[4][0:16] = image_sample %0:s[8-15], %0:s[0-3], v1: undef, %0:v[0] 1d d16
bld.reset(program->create_and_insert_block());
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5));
bld.mubuf(aco_opcode::buffer_load_short_d16_hi, def_v4_hi, desc_s4, op_v0, Operand::zero(), 0,
false);
instr = bld.mimg(aco_opcode::image_sample, def_v4_lo, desc_s8, desc_s4, Operand(v1), op_v0);
instr->mimg().dmask = 0x1;
instr->mimg().d16 = true;
finish_waitcnt_test();
END_TEST