radeonsi/gfx12: fix VS output corruption with streamout

We increased VS_EXPORT_COUNT to 8 for streamout in gfx10_shader_ngg, but we forgot to increase the attribute ring stride, causing all waves except the first one to get corrupted VS outputs.

Fixes: f703dfd1bb - radeonsi: add gfx12
Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30503>
2026-05-05 00:58:05 +02:00 · 2024-08-05 00:14:20 -04:00 · 2024-08-05 00:14:20 -04:00 · 0e27df4521
commit 0e27df4521
parent 8b3e02587e
3 changed files with 23 additions and 11 deletions
--- a/src/gallium/drivers/radeonsi/si_nir_lower_abi.c
+++ b/src/gallium/drivers/radeonsi/si_nir_lower_abi.c
@ -43,7 +43,7 @@ static nir_def *build_attr_ring_desc(nir_builder *b, struct si_shader *shader,
                                sel->info.base.vs.blit_sgprs_amd - 1) :
      ac_nir_load_arg(b, &args->ac, args->gs_attr_address);

-   unsigned stride = 16 * shader->info.nr_param_exports;
+   unsigned stride = 16 * si_shader_num_alloc_param_exports(shader);
   uint32_t desc[4];

   ac_build_attr_ring_descriptor(sel->screen->info.gfx_level,
--- a/src/gallium/drivers/radeonsi/si_shader.h
+++ b/src/gallium/drivers/radeonsi/si_shader.h
@ -1028,6 +1028,7 @@ void si_nir_late_opts(struct nir_shader *nir);
 char *si_finalize_nir(struct pipe_screen *screen, void *nirptr);

 /* si_state_shaders.cpp */
+unsigned si_shader_num_alloc_param_exports(struct si_shader *shader);
 unsigned si_determine_wave_size(struct si_screen *sscreen, struct si_shader *shader);
 void gfx9_get_gs_info(struct si_shader_selector *es, struct si_shader_selector *gs,
                      struct gfx9_gs_info *out);
--- a/src/gallium/drivers/radeonsi/si_state_shaders.cpp
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.cpp
@ -1391,6 +1391,26 @@ static unsigned si_get_vs_out_cntl(const struct si_shader_selector *sel,
                                             shader->info.nr_pos_exports > 1));
 }

+/* Return the number of param exports to allocate for \p shader. This starts at
+ * shader->info.nr_param_exports and is raised to at least 8 for streamout shaders on GFX12+.
+ */
+unsigned si_shader_num_alloc_param_exports(struct si_shader *shader)
+{
+   unsigned num_params = shader->info.nr_param_exports;
+
+   /* Since there is no alloc/dealloc mechanism for the 12-bit ordered IDs on GFX12, they can wrap
+    * around if there are more than 2^12 workgroups, causing 2 workgroups to get the same
+    * ordered ID, which can deadlock the "ordered add" loop.
+    *
+    * The recommended solution is to use the alloc/dealloc mechanism of the attribute ring to limit
+    * the number of workgroups in flight and thus the number of ordered IDs in flight.
+    */
+   if (shader->selector->screen->info.gfx_level >= GFX12 && si_shader_uses_streamout(shader))
+      num_params = MAX2(num_params, 8);
+
+   return num_params;
+}
+
 /**
 * Prepare the PM4 image for \p shader, which will run as a merged ESGS shader
 * in NGG mode.
@ -1541,16 +1561,7 @@ static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader
                                        gs_sel->info.writes_primid);

   if (sscreen->info.gfx_level >= GFX12) {
-      unsigned num_params = shader->info.nr_param_exports;
-
-      /* Since there is no alloc/dealloc mechanism for the 12-bit ordered IDs, they can wrap
-       * around if there are more than 2^12 workgroups, causing 2 workgroups to get the same
-       * ordered ID, which would break the streamout algorithm.
-       * The recommended solution is to use the alloc/dealloc mechanism of the attribute ring,
-       * which is enough to limit the range of ordered IDs that can be in flight.
-       */
-      if (si_shader_uses_streamout(shader))
-         num_params = MAX2(num_params, 8);
+      unsigned num_params = si_shader_num_alloc_param_exports(shader);

      shader->ngg.spi_shader_pgm_rsrc4_gs = S_00B220_SPI_SHADER_LATE_ALLOC_GS(127) |
                                            S_00B220_GLG_FORCE_DISABLE(1) |