From 09b856c36744ec6243bbfe9473472e7d8a1eca2f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Timur=20Krist=C3=B3f?= <timur.kristof@gmail.com>
Date: Tue, 11 Nov 2025 06:45:40 +0100
Subject: [PATCH] ac/nir/ngg: Fix scratch space for NGG GS streamout
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

For GS streamout, we need the following LDS scratch space:

- Repacking streamout vertices takes 1 dword per 4 waves per stream
  (max 16 bytes for Wave64, max 32 bytes for Wave32)
- 1 dword per stream for buffer info
  (16 bytes)
- 1 dword per buffer for buffer info
  (16 bytes)

Previously, the space used for buffer info aliased with the
space for repacking the output vertices in ngg_gs_finale(),
and there was no barrier in between, which caused a race
condition, resulting in random failures.

Fix this by allocating a few more LDS dwords so that aliasing
is not required, which also allows us to remove an extra
workgroup barrier.

Cc: mesa-stable
Closes: https://gitlab.freedesktop.org/mesa/mesa/-/issues/12705
Signed-off-by: Timur Kristóf <timur.kristof@gmail.com>
Reviewed-by: Marek Olšák <marek.olsak@amd.com>
Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
(cherry picked from commit 8f99d736d09a88713b2aac969a793c3461e6f759)

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/38432>
---
 .pick_status.json                        |  2 +-
 src/amd/common/nir/ac_nir_lower_ngg.c    | 21 ++++++++++++++++++---
 src/amd/common/nir/ac_nir_lower_ngg_gs.c | 17 ++++++++---------
 3 files changed, 27 insertions(+), 13 deletions(-)

diff --git a/.pick_status.json b/.pick_status.json
index 15af04eec76..e5520e2e169 100644
--- a/.pick_status.json
+++ b/.pick_status.json
@@ -754,7 +754,7 @@
         "description": "ac/nir/ngg: Fix scratch space for NGG GS streamout",
         "nominated": true,
         "nomination_type": 1,
-        "resolution": 0,
+        "resolution": 1,
         "main_sha": null,
         "because_sha": null,
         "notes": null
diff --git a/src/amd/common/nir/ac_nir_lower_ngg.c b/src/amd/common/nir/ac_nir_lower_ngg.c
index 03eae2d8fb3..aa454239307 100644
--- a/src/amd/common/nir/ac_nir_lower_ngg.c
+++ b/src/amd/common/nir/ac_nir_lower_ngg.c
@@ -1817,10 +1817,25 @@ ac_ngg_get_scratch_lds_size(mesa_shader_stage stage,
    } else {
       assert(stage == MESA_SHADER_GEOMETRY);
 
+      /* Repacking output vertices at the end in ngg_gs_finale() uses 1 dword per 4 waves */
       scratch_lds_size = ALIGN(max_num_waves, 4u);
-      /* streamout take 8 dwords for buffer offset and emit vertex per stream */
-      if (streamout_enabled)
-         scratch_lds_size = MAX2(scratch_lds_size, 32);
+
+      /* For streamout:
+       * - Repacking streamout vertices takes 1 dword per 4 waves per stream
+       *   (max 16 bytes for Wave64, 32 bytes for Wave32)
+       * - 1 dword per stream for buffer info
+       *   (16 bytes)
+       * - 1 dword per buffer for buffer info
+       *   (16 bytes)
+       */
+      if (streamout_enabled) {
+         const unsigned num_streams = 4;
+         const unsigned num_so_buffers = 4;
+         const unsigned streamout_scratch_size =
+            num_streams * ALIGN(max_num_waves, 4u) + num_streams * 4 + num_so_buffers * 4;
+
+         scratch_lds_size += streamout_scratch_size;
+      }
    }
 
    return scratch_lds_size;
diff --git a/src/amd/common/nir/ac_nir_lower_ngg_gs.c b/src/amd/common/nir/ac_nir_lower_ngg_gs.c
index bc2765cbeac..abd2d7a5d81 100644
--- a/src/amd/common/nir/ac_nir_lower_ngg_gs.c
+++ b/src/amd/common/nir/ac_nir_lower_ngg_gs.c
@@ -660,6 +660,10 @@ ngg_gs_build_streamout(nir_builder *b, lower_ngg_gs_state *s)
    nir_def *export_seq[4] = {0};
    nir_def *out_vtx_primflag[4] = {0};
 
+   const unsigned scratch_stride = ALIGN(s->max_num_waves, 4);
+   const unsigned scratch_base_off = scratch_stride;
+   const unsigned num_streams = util_bitcount(info->streams_written);
+
    u_foreach_bit(stream, info->streams_written) {
       out_vtx_primflag[stream] =
          ngg_gs_load_out_vtx_primflag(b, stream, tid_in_tg, out_vtx_lds_addr, max_vtxcnt, s);
@@ -669,9 +673,8 @@ ngg_gs_build_streamout(nir_builder *b, lower_ngg_gs_state *s)
        */
       prim_live[stream] = nir_i2b(b, nir_iand_imm(b, out_vtx_primflag[stream], 1));
 
-      unsigned scratch_stride = ALIGN(s->max_num_waves, 4);
       nir_def *scratch_base =
-         nir_iadd_imm(b, s->lds_addr_gs_out_vtx, stream * scratch_stride);
+         nir_iadd_imm(b, s->lds_addr_gs_out_vtx, stream * scratch_stride + scratch_base_off);
 
       /* We want to export primitives to streamout buffer in sequence,
        * but not all vertices are alive or mark end of a primitive, so
@@ -697,18 +700,14 @@ ngg_gs_build_streamout(nir_builder *b, lower_ngg_gs_state *s)
       export_seq[stream] = rep.repacked_invocation_index;
    }
 
-   /* Workgroup barrier: wait for LDS scratch reads finish. */
-   nir_barrier(b, .execution_scope = SCOPE_WORKGROUP,
-                      .memory_scope = SCOPE_WORKGROUP,
-                      .memory_semantics = NIR_MEMORY_ACQ_REL,
-                      .memory_modes = nir_var_mem_shared);
-
    /* Get global buffer offset where this workgroup will stream out data to. */
    nir_def *emit_prim[4] = {0};
    nir_def *buffer_offsets[4] = {0};
    nir_def *so_buffer[4] = {0};
+   nir_def *buffer_info_scratch_base =
+      nir_iadd_imm_nuw(b, s->lds_addr_gs_out_vtx, num_streams * scratch_stride + scratch_base_off);
    ac_nir_ngg_build_streamout_buffer_info(b, info, s->options->hw_info->gfx_level, s->options->has_xfb_prim_query,
-                                   s->options->use_gfx12_xfb_intrinsic, s->lds_addr_gs_out_vtx, tid_in_tg,
+                                   s->options->use_gfx12_xfb_intrinsic, buffer_info_scratch_base, tid_in_tg,
                                    gen_prim, so_buffer, buffer_offsets, emit_prim);
 
    u_foreach_bit(stream, info->streams_written) {