mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-05-05 00:58:05 +02:00
aco: skip waitcnt between two vmem writing different lanes
fossil-db (gfx1201):
Totals from 1382 (1.74% of 79653) affected shaders:
Instrs: 6531704 -> 6523935 (-0.12%); split: -0.12%, +0.00%
CodeSize: 34992076 -> 34933568 (-0.17%); split: -0.17%, +0.01%
Latency: 70183360 -> 69616066 (-0.81%); split: -0.81%, +0.00%
InvThroughput: 11155445 -> 11068667 (-0.78%); split: -0.78%, +0.00%

fossil-db (navi31):
Totals from 46 (0.06% of 79653) affected shaders:
Instrs: 1833768 -> 1833732 (-0.00%)
CodeSize: 9468788 -> 9468716 (-0.00%)
Latency: 11683092 -> 11667865 (-0.13%)
InvThroughput: 2274377 -> 2272872 (-0.07%)

fossil-db (navi21):
Totals from 0 (0.00% of 79653) affected shaders:

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Georg Lehmann <dadschoorse@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/34978>
This commit is contained in:
parent
9649deb50e
commit
c1f8537131
2 changed files with 72 additions and 4 deletions
|
|
@ -346,9 +346,9 @@ check_instr(wait_ctx& ctx, wait_imm& wait, Instruction* instr)
|
|||
|
||||
/* Vector Memory reads and writes decrease the counter in the order they were issued.
|
||||
* Before GFX12, they also write VGPRs in order if they're of the same type.
|
||||
* TODO: We can do this for GFX12 and different types for GFX11 if we know that the two
|
||||
* VMEM loads do not write the same lanes. Since GFX11, we track VMEM operations on the
|
||||
* linear CFG, so this is difficult */
|
||||
* We can do this for GFX12 and different types for GFX11 if we know that the two
|
||||
* VMEM loads do not write the same register half or the same lanes.
|
||||
*/
|
||||
uint8_t vmem_type = get_vmem_type(ctx.gfx_level, instr);
|
||||
if (vmem_type) {
|
||||
wait_event event = get_vmem_event(ctx, instr, vmem_type);
|
||||
|
|
@ -365,7 +365,10 @@ check_instr(wait_ctx& ctx, wait_imm& wait, Instruction* instr)
|
|||
different_halves = !(mask & it->second.vm_mask);
|
||||
}
|
||||
|
||||
if ((event_matches && type_matches && ctx.gfx_level < GFX12) || different_halves)
|
||||
bool different_lanes = (it->second.logical_events & ctx.info->events[type]) == 0;
|
||||
|
||||
if ((event_matches && type_matches && ctx.gfx_level < GFX12) || different_halves ||
|
||||
different_lanes)
|
||||
reg_imm[type] = wait_imm::unset_counter;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -691,3 +691,68 @@ BEGIN_TEST(insert_waitcnt.waw.vmem_different_halves)
|
|||
|
||||
finish_waitcnt_test();
|
||||
END_TEST
|
||||
|
||||
/* Write-after-write test for VMEM loads that target the same VGPR from the two
 * sides of a divergent if/else. It runs once per gfx level in {GFX10_3, GFX11,
 * GFX12} and skips a level when setup_cs() fails.
 *
 * Part 1 (p_unit_test 1-3): both branches load into v[4] with buffer_load_dword.
 * The check comments list no wait between the two loads; a wait is only expected
 * before the later read of v[4].
 *
 * Part 2 (p_unit_test 4-6): one branch uses buffer_load_dword and the other uses
 * image_sample, both writing v[4]. On gfx12 the check comments expect a loadcnt
 * wait before the image_sample, and both loadcnt and samplecnt waits before the
 * final read of v[4].
 *
 * NOTE(review): the //>>, //! and //~ lines look like expected-output directives
 * consumed by the test harness rather than ordinary comments; do not reword them.
 */
BEGIN_TEST(insert_waitcnt.waw.vmem_different_lanes)
|
||||
for (amd_gfx_level gfx : {GFX10_3, GFX11, GFX12}) {
|
||||
if (!setup_cs(NULL, gfx))
|
||||
continue;
|
||||
|
||||
/* Destination register shared by every load below; the check lines print it as v[4]. */
Definition def_v4(PhysReg(260), v1);
|
||||
/* Address operand; the check lines print it as v[0]. */
Operand op_v0(PhysReg(256), v1);
|
||||
/* 4-dword SGPR operand starting at s0; the check lines print it as s[0-3]. */
Operand desc_s4(PhysReg(0), s4);
|
||||
/* 8-dword SGPR operand starting at s8; the check lines print it as s[8-15]. */
Operand desc_s8(PhysReg(8), s8);
|
||||
|
||||
/* Part 1: the same buffer_load_dword into v[4] in both branches. */
emit_divergent_if_else(
|
||||
program.get(), bld, Operand::c64(1),
|
||||
[&]()
|
||||
{
|
||||
//>> p_unit_test 1
|
||||
//! v1: %0:v[4] = buffer_load_dword %0:s[0-3], %0:v[0], 0
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1));
|
||||
bld.mubuf(aco_opcode::buffer_load_dword, def_v4, desc_s4, op_v0, Operand::zero(), 0,
|
||||
false);
|
||||
},
|
||||
[&]()
|
||||
{
|
||||
//>> p_unit_test 2
|
||||
//! v1: %0:v[4] = buffer_load_dword %0:s[0-3], %0:v[0], 0
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2));
|
||||
bld.mubuf(aco_opcode::buffer_load_dword, def_v4, desc_s4, op_v0, Operand::zero(), 0,
|
||||
false);
|
||||
});
|
||||
/* Reading v[4] after the if/else is expected to be preceded by a wait. */
//>> p_unit_test 3
|
||||
//~gfx(10_3|11)! s_waitcnt vmcnt(0)
|
||||
//~gfx12! s_wait_loadcnt imm:0
|
||||
//! p_unit_test %0:v[4]
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3));
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand(PhysReg(260), v1));
|
||||
|
||||
/* Part 2: buffer_load_dword in one branch, image_sample in the other, both into v[4]. */
emit_divergent_if_else(
|
||||
program.get(), bld, Operand::c64(1),
|
||||
[&]()
|
||||
{
|
||||
//>> p_unit_test 4
|
||||
//! v1: %0:v[4] = buffer_load_dword %0:s[0-3], %0:v[0], 0
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4));
|
||||
bld.mubuf(aco_opcode::buffer_load_dword, def_v4, desc_s4, op_v0, Operand::zero(), 0,
|
||||
false);
|
||||
},
|
||||
[&]()
|
||||
{
|
||||
//>> p_unit_test 5
|
||||
//~gfx12! s_wait_loadcnt imm:0
|
||||
//! v1: %0:v[4] = image_sample %0:s[8-15], %0:s[0-3], v1: undef, %0:v[0] 1d
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5));
|
||||
bld.mimg(aco_opcode::image_sample, def_v4, desc_s8, desc_s4, Operand(v1), op_v0);
|
||||
});
|
||||
/* On gfx12 the final read of v[4] is expected to wait on both loadcnt and samplecnt. */
//>> p_unit_test 6
|
||||
//~gfx(10_3|11)! s_waitcnt vmcnt(0)
|
||||
//~gfx12! s_wait_loadcnt imm:0
|
||||
//~gfx12! s_wait_samplecnt imm:0
|
||||
//! p_unit_test %0:v[4]
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(6));
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand(PhysReg(260), v1));
|
||||
|
||||
finish_waitcnt_test();
|
||||
}
|
||||
END_TEST
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue