From 0e21cd9e154bc82b978f295ae2c773de0c818d2d Mon Sep 17 00:00:00 2001
From: Georg Lehmann <dadschoorse@gmail.com>
Date: Sat, 21 Sep 2024 15:02:23 +0200
Subject: [PATCH] aco/gfx10+: work around non uniform ds_append wave64 result

In wave64 for hw with native wave32, ds_append seems to be split in a load for
the low half and an atomic for the high half, and other LDS instructions can
be scheduled between the two.
Which means the result of the low half is unusable because it might be out of date.

I was only able to reproduce this issue in WGP mode, but be conservative and
apply the workaround in CU mode too.

Foz-DB Navi31:
Totals from 13 (0.02% of 79395) affected shaders:
Instrs: 7599 -> 7656 (+0.75%)
CodeSize: 39708 -> 39972 (+0.66%)
Latency: 83174 -> 83572 (+0.48%)
InvThroughput: 8271 -> 8357 (+1.04%)
Copies: 718 -> 717 (-0.14%)
VALU: 3689 -> 3703 (+0.38%)
SALU: 935 -> 965 (+3.21%)

Closes: https://gitlab.freedesktop.org/mesa/mesa/-/issues/11921
Fixes: 45e935800a8 ("aco: implement nir_shared_append/consume_amd")

Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/31301>
---
 src/amd/compiler/aco_instruction_selection.cpp | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp
index a043f8f3af4..03111d4ce35 100644
--- a/src/amd/compiler/aco_instruction_selection.cpp
+++ b/src/amd/compiler/aco_instruction_selection.cpp
@@ -7595,7 +7595,19 @@ visit_shared_append(isel_context* ctx, nir_intrinsic_instr* instr)
       ds = bld.ds(op, Definition(tmp), m, address);
    ds->ds().sync = memory_sync_info(storage_shared, semantic_atomicrmw);
 
-   bld.pseudo(aco_opcode::p_as_uniform, Definition(get_ssa_temp(ctx, &instr->def)), tmp);
+   /* In wave64 for hw with native wave32, ds_append seems to be split in a load for the low half
+    * and an atomic for the high half, and other LDS instructions can be scheduled between the two.
+    * Which means the result of the low half is unusable because it might be out of date.
+    */
+   if (ctx->program->gfx_level >= GFX10 && ctx->program->wave_size == 64 &&
+       ctx->program->workgroup_size > 64) {
+      Temp last_lane = bld.sop1(aco_opcode::s_flbit_i32_b64, bld.def(s1), Operand(exec, s2));
+      last_lane = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc), Operand::c32(63),
+                           last_lane);
+      bld.readlane(Definition(get_ssa_temp(ctx, &instr->def)), tmp, last_lane);
+   } else {
+      bld.pseudo(aco_opcode::p_as_uniform, Definition(get_ssa_temp(ctx, &instr->def)), tmp);
+   }
 }
 
 void