From 88f6d7f4bdf90bcfdb17e4aadddec3c855a12b13 Mon Sep 17 00:00:00 2001 From: Rhys Perry Date: Thu, 11 May 2023 17:08:39 +0100 Subject: [PATCH] aco/gfx11: fix VMEM/DS->VALU WaW/RaW hazard Previously, we could safely read/write unused lanes of VMEM/DS destination VGPRs without waiting for the load to finish. That doesn't seem to be the case on GFX11. fossil-db (gfx1100): Totals from 6698 (4.94% of 135636) affected shaders: Instrs: 11184274 -> 11199420 (+0.14%); split: -0.00%, +0.14% CodeSize: 57578344 -> 57638928 (+0.11%); split: -0.00%, +0.11% Latency: 198348808 -> 198382472 (+0.02%); split: -0.00%, +0.02% InvThroughput: 24376324 -> 24378439 (+0.01%); split: -0.00%, +0.01% VClause: 192420 -> 192559 (+0.07%) Signed-off-by: Rhys Perry Reviewed-by: Georg Lehmann Closes: https://gitlab.freedesktop.org/mesa/mesa/-/issues/8722 Closes: https://gitlab.freedesktop.org/mesa/mesa/-/issues/8239 Cc: mesa-stable Part-of: --- src/amd/compiler/README-ISA.md | 5 +++++ src/amd/compiler/aco_insert_waitcnt.cpp | 18 +++++++++++++----- 2 files changed, 18 insertions(+), 5 deletions(-) diff --git a/src/amd/compiler/README-ISA.md b/src/amd/compiler/README-ISA.md index 8139d7e6876..7e6329dc7fd 100644 --- a/src/amd/compiler/README-ISA.md +++ b/src/amd/compiler/README-ISA.md @@ -206,6 +206,11 @@ Currently, we don't do this. This leads to wrong bounds checking, using a VGPR offset fixes it. +## unused VMEM/DS destination lanes can't be used without waiting + +On GFX11, we can't safely read/write unused lanes of VMEM/DS destination +VGPRs without waiting for the load to finish. + ## GCN / GFX6 hazards ### VINTRP followed by a read with `v_readfirstlane` or `v_readlane` diff --git a/src/amd/compiler/aco_insert_waitcnt.cpp b/src/amd/compiler/aco_insert_waitcnt.cpp index 9b34fb00307..0a87cccde28 100644 --- a/src/amd/compiler/aco_insert_waitcnt.cpp +++ b/src/amd/compiler/aco_insert_waitcnt.cpp @@ -224,14 +224,15 @@ struct wait_entry { bool join(const wait_entry& other) { bool changed = (other.events & ~events) || (other.counters & ~counters) || - (other.wait_on_read && !wait_on_read) || (other.vmem_types & !vmem_types); + (other.wait_on_read && !wait_on_read) || (other.vmem_types & !vmem_types) || + (!other.logical && logical); events |= other.events; counters |= other.counters; changed |= imm.combine(other.imm); changed |= delay.combine(other.delay); wait_on_read |= other.wait_on_read; vmem_types |= other.vmem_types; - assert(logical == other.logical); + logical &= other.logical; return changed; } @@ -736,7 +737,7 @@ update_counters_for_flat_load(wait_ctx& ctx, memory_sync_info sync = memory_sync void insert_wait_entry(wait_ctx& ctx, PhysReg reg, RegClass rc, wait_event event, bool wait_on_read, - uint8_t vmem_types = 0, unsigned cycles = 0) + uint8_t vmem_types = 0, unsigned cycles = 0, bool force_linear = false) { uint16_t counters = get_counters_for_event(event); wait_imm imm; @@ -760,7 +761,7 @@ insert_wait_entry(wait_ctx& ctx, PhysReg reg, RegClass rc, wait_event event, boo delay.salu_cycles = cycles; } - wait_entry new_entry(event, imm, delay, !rc.is_linear(), wait_on_read); + wait_entry new_entry(event, imm, delay, !rc.is_linear() && !force_linear, wait_on_read); new_entry.vmem_types |= vmem_types; for (unsigned i = 0; i < rc.size(); i++) { @@ -781,7 +782,14 @@ void insert_wait_entry(wait_ctx& ctx, Definition def, wait_event event, uint8_t vmem_types = 0, unsigned cycles = 0) { - insert_wait_entry(ctx, def.physReg(), def.regClass(), event, true, vmem_types, cycles); + /* We can't safely write to unwritten destination VGPR lanes on GFX11 without waiting for + * the load to finish. + */ + bool force_linear = + ctx.gfx_level >= GFX11 && (event & (event_lds | event_gds | event_vmem | event_flat)); + + insert_wait_entry(ctx, def.physReg(), def.regClass(), event, true, vmem_types, cycles, + force_linear); } void