mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-05-08 04:48:08 +02:00
aco: simplify waitcnt insertion for flat access
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/35465>
This commit is contained in:
parent 6396a82695
commit d705b6198c
2 changed files with 212 additions and 41 deletions
|
|
@ -45,7 +45,6 @@ enum wait_event : uint32_t {
|
|||
event_gds = 1 << 2,
|
||||
event_vmem = 1 << 3,
|
||||
event_vmem_store = 1 << 4, /* GFX10+ */
|
||||
event_flat = 1 << 5,
|
||||
event_exp_pos = 1 << 6,
|
||||
event_exp_param = 1 << 7,
|
||||
event_exp_mrt_null = 1 << 8,
|
||||
|
|
@ -102,9 +101,7 @@ struct wait_entry {
|
|||
counters &= ~(1 << type);
|
||||
imm[type] = wait_imm::unset_counter;
|
||||
|
||||
events &= ~type_events | event_flat;
|
||||
if (!(counters & counter_lgkm) && !(counters & counter_vm))
|
||||
events &= ~(type_events & event_flat);
|
||||
events &= ~type_events;
|
||||
|
||||
logical_events &= events;
|
||||
if (type == wait_type_vm)
|
||||
|
|
@ -144,8 +141,8 @@ struct target_info {
|
|||
|
||||
events[wait_type_exp] = event_exp_pos | event_exp_param | event_exp_mrt_null |
|
||||
event_gds_gpr_lock | event_vmem_gpr_lock | event_ldsdir;
|
||||
events[wait_type_lgkm] = event_smem | event_lds | event_gds | event_flat | event_sendmsg;
|
||||
events[wait_type_vm] = event_vmem | event_flat;
|
||||
events[wait_type_lgkm] = event_smem | event_lds | event_gds | event_sendmsg;
|
||||
events[wait_type_vm] = event_vmem;
|
||||
events[wait_type_vs] = event_vmem_store;
|
||||
if (gfx_level >= GFX12) {
|
||||
events[wait_type_sample] = event_vmem_sample;
|
||||
|
|
@ -159,7 +156,7 @@ struct target_info {
|
|||
counters[j] |= (1 << i);
|
||||
}
|
||||
|
||||
unordered_events = event_smem | (gfx_level < GFX10 ? event_flat : 0);
|
||||
unordered_events = event_smem;
|
||||
}
|
||||
|
||||
uint8_t get_counters_for_event(wait_event event) const { return counters[ffs(event) - 1]; }
|
||||
|
|
@ -317,7 +314,7 @@ get_imm(wait_ctx& ctx, PhysReg reg, wait_entry& entry)
|
|||
*/
|
||||
if (ctx.gfx_level >= GFX11) {
|
||||
uint32_t ds_vmem_events =
|
||||
event_lds | event_gds | event_vmem | event_vmem_sample | event_vmem_bvh | event_flat;
|
||||
event_lds | event_gds | event_vmem | event_vmem_sample | event_vmem_bvh;
|
||||
events |= ds_vmem_events;
|
||||
}
|
||||
|
||||
|
|
@ -523,11 +520,9 @@ kill(wait_imm& imm, Instruction* instr, wait_ctx& ctx, memory_sync_info sync_inf
|
|||
for (unsigned j = 0; j < wait_type_num; j++) {
|
||||
if (bar[j] != wait_imm::unset_counter && imm[j] <= bar[j]) {
|
||||
bar[j] = wait_imm::unset_counter;
|
||||
bar_ev &= ~ctx.info->events[j] | event_flat;
|
||||
bar_ev &= ~ctx.info->events[j];
|
||||
}
|
||||
}
|
||||
if (bar.vm == wait_imm::unset_counter && bar.lgkm == wait_imm::unset_counter)
|
||||
bar_ev &= ~event_flat;
|
||||
}
|
||||
|
||||
/* remove all gprs with higher counter from map */
|
||||
|
|
@ -587,11 +582,6 @@ update_counters(wait_ctx& ctx, wait_event event, memory_sync_info sync = memory_
|
|||
if (ctx.info->unordered_events & event)
|
||||
return;
|
||||
|
||||
if (ctx.pending_flat_lgkm)
|
||||
counters &= ~counter_lgkm;
|
||||
if (ctx.pending_flat_vm)
|
||||
counters &= ~counter_vm;
|
||||
|
||||
for (std::pair<const PhysReg, wait_entry>& e : ctx.gpr_map) {
|
||||
wait_entry& entry = e.second;
|
||||
|
||||
|
|
@ -607,25 +597,6 @@ update_counters(wait_ctx& ctx, wait_event event, memory_sync_info sync = memory_
|
|||
}
|
||||
}
|
||||
|
||||
void
|
||||
update_counters_for_flat_load(wait_ctx& ctx, memory_sync_info sync = memory_sync_info())
|
||||
{
|
||||
assert(ctx.gfx_level < GFX10);
|
||||
|
||||
ctx.nonzero |= BITFIELD_BIT(wait_type_lgkm) | BITFIELD_BIT(wait_type_vm);
|
||||
|
||||
update_barrier_imm(ctx, counter_vm | counter_lgkm, event_flat, sync);
|
||||
|
||||
for (std::pair<PhysReg, wait_entry> e : ctx.gpr_map) {
|
||||
if (e.second.counters & counter_vm)
|
||||
e.second.imm.vm = 0;
|
||||
if (e.second.counters & counter_lgkm)
|
||||
e.second.imm.lgkm = 0;
|
||||
}
|
||||
ctx.pending_flat_lgkm = true;
|
||||
ctx.pending_flat_vm = true;
|
||||
}
|
||||
|
||||
void
|
||||
insert_wait_entry(wait_ctx& ctx, PhysReg reg, RegClass rc, wait_event event, bool wait_on_read,
|
||||
uint8_t vmem_types = 0, uint32_t vm_mask = 0)
|
||||
|
|
@ -693,13 +664,19 @@ gen(Instruction* instr, wait_ctx& ctx)
|
|||
}
|
||||
case Format::FLAT: {
|
||||
FLAT_instruction& flat = instr->flat();
|
||||
if (ctx.gfx_level < GFX10 && !instr->definitions.empty())
|
||||
update_counters_for_flat_load(ctx, flat.sync);
|
||||
else
|
||||
update_counters(ctx, event_flat, flat.sync);
|
||||
wait_event vmem_ev = get_vmem_event(ctx, instr, vmem_nosampler);
|
||||
update_counters(ctx, vmem_ev, flat.sync);
|
||||
update_counters(ctx, event_lds, flat.sync);
|
||||
|
||||
if (!instr->definitions.empty())
|
||||
insert_wait_entry(ctx, instr->definitions[0], event_flat);
|
||||
if (!instr->definitions.empty()) {
|
||||
insert_wait_entry(ctx, instr->definitions[0], vmem_ev, 0, get_vmem_mask(ctx, instr));
|
||||
insert_wait_entry(ctx, instr->definitions[0], event_lds);
|
||||
}
|
||||
|
||||
if (ctx.gfx_level < GFX10 && !instr->definitions.empty()) {
|
||||
ctx.pending_flat_lgkm = true;
|
||||
ctx.pending_flat_vm = true;
|
||||
}
|
||||
break;
|
||||
}
|
||||
case Format::SMEM: {
|
||||
|
|
|
|||
|
|
@ -901,3 +901,197 @@ BEGIN_TEST(insert_waitcnt.divergent_branch.no_skip)
|
|||
finish_waitcnt_test();
|
||||
}
|
||||
END_TEST
|
||||
|
||||
BEGIN_TEST(insert_waitcnt.flat.wait_zero)
|
||||
for (amd_gfx_level gfx : {GFX9, GFX10}) {
|
||||
if (!setup_cs(NULL, gfx))
|
||||
continue;
|
||||
|
||||
Definition dest0(PhysReg(260), v1);
|
||||
Definition dest1(PhysReg(261), v1);
|
||||
Operand offset(PhysReg(256), v1);
|
||||
Operand addr(PhysReg(256), v2);
|
||||
|
||||
//>> p_unit_test 0
|
||||
//! v1: %0:v[4] = global_load_dword %0:v[0-1], s1: undef
|
||||
//! v1: %0:v[5] = flat_load_dword %0:v[0-1], s1: undef
|
||||
//~gfx9! s_waitcnt vmcnt(0)
|
||||
//~gfx10! s_waitcnt vmcnt(1)
|
||||
//! p_unit_test %0:v[4]
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0));
|
||||
bld.global(aco_opcode::global_load_dword, dest0, addr, Operand(s1));
|
||||
bld.flat(aco_opcode::flat_load_dword, dest1, addr, Operand(s1));
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand(dest0.physReg(), v1));
|
||||
|
||||
//>> p_unit_test 1
|
||||
//! v1: %0:v[4] = ds_read_b32 %0:v[0]
|
||||
//! v1: %0:v[5] = flat_load_dword %0:v[0-1], s1: undef
|
||||
//~gfx9! s_waitcnt lgkmcnt(0)
|
||||
//~gfx10! s_waitcnt lgkmcnt(1)
|
||||
//! p_unit_test %0:v[4]
|
||||
bld.reset(program->create_and_insert_block());
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1));
|
||||
bld.ds(aco_opcode::ds_read_b32, dest0, offset);
|
||||
bld.flat(aco_opcode::flat_load_dword, dest1, addr, Operand(s1));
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand(dest0.physReg(), v1));
|
||||
|
||||
//>> p_unit_test 2
|
||||
//! v1: %0:v[4] = flat_load_dword %0:v[0-1], s1: undef
|
||||
//! v1: %0:v[5] = global_load_dword %0:v[0-1], s1: undef
|
||||
//~gfx9! s_waitcnt lgkmcnt(0) vmcnt(0)
|
||||
//~gfx10! s_waitcnt lgkmcnt(0) vmcnt(1)
|
||||
//! p_unit_test %0:v[4]
|
||||
bld.reset(program->create_and_insert_block());
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2));
|
||||
bld.flat(aco_opcode::flat_load_dword, dest0, addr, Operand(s1));
|
||||
bld.global(aco_opcode::global_load_dword, dest1, addr, Operand(s1));
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand(dest0.physReg(), v1));
|
||||
|
||||
//>> p_unit_test 3
|
||||
//! v1: %0:v[4] = flat_load_dword %0:v[0-1], s1: undef
|
||||
//! v1: %0:v[5] = ds_read_b32 %0:v[0]
|
||||
//~gfx9! s_waitcnt lgkmcnt(0) vmcnt(0)
|
||||
//~gfx10! s_waitcnt lgkmcnt(1) vmcnt(0)
|
||||
//! p_unit_test %0:v[4]
|
||||
bld.reset(program->create_and_insert_block());
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3));
|
||||
bld.flat(aco_opcode::flat_load_dword, dest0, addr, Operand(s1));
|
||||
bld.ds(aco_opcode::ds_read_b32, dest1, offset);
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand(dest0.physReg(), v1));
|
||||
|
||||
//>> p_unit_test 4
|
||||
//! v1: %0:v[4] = flat_load_dword %0:v[0-1], s1: undef
|
||||
//! v1: %0:v[5] = flat_load_dword %0:v[0-1], s1: undef
|
||||
//~gfx9! s_waitcnt lgkmcnt(0) vmcnt(0)
|
||||
//~gfx10! s_waitcnt lgkmcnt(1) vmcnt(1)
|
||||
//! p_unit_test %0:v[4]
|
||||
bld.reset(program->create_and_insert_block());
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4));
|
||||
bld.flat(aco_opcode::flat_load_dword, dest0, addr, Operand(s1));
|
||||
bld.flat(aco_opcode::flat_load_dword, dest1, addr, Operand(s1));
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand(dest0.physReg(), v1));
|
||||
|
||||
finish_waitcnt_test();
|
||||
}
|
||||
END_TEST
|
||||
|
||||
BEGIN_TEST(insert_waitcnt.flat.waw)
|
||||
for (amd_gfx_level gfx : {GFX9, GFX10}) {
|
||||
if (!setup_cs(NULL, gfx))
|
||||
continue;
|
||||
|
||||
/* Flat might use either LDS or VMEM, so WaW always needs a wait. */
|
||||
Definition dest(PhysReg(260), v1);
|
||||
Operand offset(PhysReg(256), v1);
|
||||
Operand addr(PhysReg(256), v2);
|
||||
Operand desc_s4(PhysReg(0), s4);
|
||||
Operand desc_s8(PhysReg(8), s8);
|
||||
|
||||
//>> p_unit_test 0
|
||||
//! v1: %0:v[4] = global_load_dword %0:v[0-1], s1: undef
|
||||
//! s_waitcnt vmcnt(0)
|
||||
//! v1: %0:v[4] = flat_load_dword %0:v[0-1], s1: undef
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0));
|
||||
bld.global(aco_opcode::global_load_dword, dest, addr, Operand(s1));
|
||||
bld.flat(aco_opcode::flat_load_dword, dest, addr, Operand(s1));
|
||||
|
||||
//>> p_unit_test 1
|
||||
//! v1: %0:v[4] = flat_load_dword %0:v[0-1], s1: undef
|
||||
//! s_waitcnt lgkmcnt(0) vmcnt(0)
|
||||
//! v1: %0:v[4] = global_load_dword %0:v[0-1], s1: undef
|
||||
bld.reset(program->create_and_insert_block());
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1));
|
||||
bld.flat(aco_opcode::flat_load_dword, dest, addr, Operand(s1));
|
||||
bld.global(aco_opcode::global_load_dword, dest, addr, Operand(s1));
|
||||
|
||||
//>> p_unit_test 2
|
||||
//! v1: %0:v[4] = ds_read_b32 %0:v[0]
|
||||
//! s_waitcnt lgkmcnt(0)
|
||||
//! v1: %0:v[4] = flat_load_dword %0:v[0-1], s1: undef
|
||||
bld.reset(program->create_and_insert_block());
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2));
|
||||
bld.ds(aco_opcode::ds_read_b32, dest, offset);
|
||||
bld.flat(aco_opcode::flat_load_dword, dest, addr, Operand(s1));
|
||||
|
||||
//>> p_unit_test 3
|
||||
//! v1: %0:v[4] = flat_load_dword %0:v[0-1], s1: undef
|
||||
//! s_waitcnt vmcnt(0)
|
||||
//! v1: %0:v[4] = ds_read_b32 %0:v[0]
|
||||
bld.reset(program->create_and_insert_block());
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3));
|
||||
bld.flat(aco_opcode::flat_load_dword, dest, addr, Operand(s1));
|
||||
bld.ds(aco_opcode::ds_read_b32, dest, offset);
|
||||
|
||||
/* In theory, we don't need a wait here, but we don't optimize this. */
|
||||
//>> p_unit_test 4
|
||||
//! v1: %0:v[4] = flat_load_dword %0:v[0-1], s1: undef
|
||||
//! s_waitcnt lgkmcnt(0) vmcnt(0)
|
||||
//! v1: %0:v[4] = flat_load_dword %0:v[0-1], s1: undef
|
||||
bld.reset(program->create_and_insert_block());
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4));
|
||||
bld.flat(aco_opcode::flat_load_dword, dest, addr, Operand(s1));
|
||||
bld.flat(aco_opcode::flat_load_dword, dest, addr, Operand(s1));
|
||||
|
||||
//>> p_unit_test 5
|
||||
//! v1: %0:v[4] = flat_load_dword %0:v[0-1], s1: undef
|
||||
//! s_waitcnt lgkmcnt(0) vmcnt(0)
|
||||
//! v1: %0:v[4] = image_sample %0:s[8-15], %0:s[0-3], v1: undef, %0:v[0] 1d
|
||||
bld.reset(program->create_and_insert_block());
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5));
|
||||
bld.flat(aco_opcode::flat_load_dword, dest, addr, Operand(s1));
|
||||
bld.mimg(aco_opcode::image_sample, dest, desc_s8, desc_s4, Operand(v1), offset);
|
||||
|
||||
finish_waitcnt_test();
|
||||
}
|
||||
END_TEST
|
||||
|
||||
BEGIN_TEST(insert_waitcnt.flat.barrier)
|
||||
for (amd_gfx_level gfx : {GFX9, GFX10}) {
|
||||
if (!setup_cs(NULL, gfx))
|
||||
continue;
|
||||
|
||||
Definition dest0(PhysReg(260), v1);
|
||||
Definition dest1(PhysReg(261), v1);
|
||||
Operand addr(PhysReg(256), v2);
|
||||
Operand data(PhysReg(256), v1);
|
||||
|
||||
//>> p_unit_test 0
|
||||
//! v1: %0:v[4] = global_load_dword %0:v[0-1], s1: undef storage:buffer
|
||||
//! v1: %0:v[5] = flat_load_dword %0:v[0-1], s1: undef
|
||||
//~gfx9! s_waitcnt vmcnt(0)
|
||||
//~gfx10! s_waitcnt vmcnt(1)
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0));
|
||||
bld.global(aco_opcode::global_load_dword, dest0, addr, Operand(s1), 0,
|
||||
memory_sync_info(storage_buffer));
|
||||
bld.flat(aco_opcode::flat_load_dword, dest1, addr, Operand(s1));
|
||||
bld.barrier(aco_opcode::p_barrier,
|
||||
memory_sync_info(storage_buffer, semantic_acqrel, scope_device));
|
||||
|
||||
//>> p_unit_test 1
|
||||
//! v1: %0:v[5] = flat_load_dword %0:v[0-1], s1: undef storage:buffer
|
||||
//! v1: %0:v[4] = global_load_dword %0:v[0-1], s1: undef
|
||||
//~gfx9! s_waitcnt lgkmcnt(0) vmcnt(0)
|
||||
//~gfx10! s_waitcnt lgkmcnt(0) vmcnt(1)
|
||||
bld.reset(program->create_and_insert_block());
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1));
|
||||
bld.flat(aco_opcode::flat_load_dword, dest1, addr, Operand(s1), 0,
|
||||
memory_sync_info(storage_buffer));
|
||||
bld.global(aco_opcode::global_load_dword, dest0, addr, Operand(s1), 0);
|
||||
bld.barrier(aco_opcode::p_barrier,
|
||||
memory_sync_info(storage_buffer, semantic_acqrel, scope_device));
|
||||
|
||||
//>> p_unit_test 2
|
||||
//! flat_store_dword %0:v[0-1], s1: undef, %0:v[0] storage:buffer
|
||||
//~gfx9! s_waitcnt lgkmcnt(0) vmcnt(0)
|
||||
//~gfx10! s_waitcnt_vscnt %0:null imm:0
|
||||
//~gfx10! s_waitcnt lgkmcnt(0)
|
||||
bld.reset(program->create_and_insert_block());
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2));
|
||||
bld.flat(aco_opcode::flat_store_dword, addr, Operand(s1), data, 0,
|
||||
memory_sync_info(storage_buffer));
|
||||
bld.barrier(aco_opcode::p_barrier,
|
||||
memory_sync_info(storage_buffer, semantic_acqrel, scope_device));
|
||||
|
||||
finish_waitcnt_test();
|
||||
}
|
||||
END_TEST
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue