ac/nir: allow smaller workgroups for GS

It's not good for performance, but it's possible to use for debugging. Running single-wave GS workgroups could work around any LDS race conditions. Setting the workgroup size to 64 reliably works around GLCTS *primitive_counter*line failures, indicating streamout data corruption with multi-wave GS workgroups. Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/38328>
2026-02-19 03:50:26 +01:00 · 2025-11-08 12:25:50 -05:00 · 2025-11-08 12:25:50 -05:00 · 9bd2c6dcb2
commit 9bd2c6dcb2
parent ff8df8712e
4 changed files with 22 additions and 14 deletions
--- a/src/amd/common/ac_shader_util.c
+++ b/src/amd/common/ac_shader_util.c
@ -1349,9 +1349,11 @@ ac_legacy_gs_compute_subgroup_info(enum mesa_prim input_prim, unsigned gs_vertic
 */
 bool
 ac_ngg_compute_subgroup_info(enum amd_gfx_level gfx_level, mesa_shader_stage es_stage, bool is_gs,
-                             enum mesa_prim input_prim, unsigned gs_vertices_out, unsigned gs_invocations,
-                             unsigned max_workgroup_size, unsigned wave_size, unsigned esgs_vertex_stride,
-                             unsigned ngg_lds_vertex_size, unsigned ngg_lds_scratch_size, bool tess_turns_off_ngg,
+                             enum mesa_prim input_prim, unsigned gs_vertices_out,
+                             unsigned gs_invocations, unsigned target_workgroup_size,
+                             unsigned max_workgroup_size, unsigned wave_size,
+                             unsigned esgs_vertex_stride, unsigned ngg_lds_vertex_size,
+                             unsigned ngg_lds_scratch_size, bool tess_turns_off_ngg,
                             unsigned max_esgs_lds_padding, ac_ngg_subgroup_info *out)
 {
   const unsigned gs_num_invocations = MAX2(gs_invocations, 1);
@ -1373,16 +1375,19 @@ ac_ngg_compute_subgroup_info(enum amd_gfx_level gfx_level, mesa_shader_stage es_
   bool max_vert_out_per_gs_instance = false;
   unsigned max_gsprims_base, max_esverts_base;

-   max_gsprims_base = max_esverts_base = max_workgroup_size;
+   /* In the worst case, we can run 1 GS invocation per workgroup. */
+   assert(!is_gs || gs_vertices_out <= max_workgroup_size);
+
+   max_gsprims_base = max_esverts_base = target_workgroup_size;

   if (is_gs) {
      bool force_multi_cycling = false;
      unsigned max_out_verts_per_gsprim = gs_vertices_out * gs_num_invocations;

 retry_select_mode:
-      if (max_out_verts_per_gsprim <= 256 && !force_multi_cycling) {
+      if (max_out_verts_per_gsprim <= max_workgroup_size && !force_multi_cycling) {
         if (max_out_verts_per_gsprim) {
-            max_gsprims_base = MIN2(max_gsprims_base, 256 / max_out_verts_per_gsprim);
+            max_gsprims_base = MIN2(max_gsprims_base, max_workgroup_size / max_out_verts_per_gsprim);
         }
      } else {
         /* Use special multi-cycling mode in which each GS
@ -1483,7 +1488,7 @@ retry_select_mode:
         : is_gs
              ? max_gsprims * gs_num_invocations * gs_vertices_out
              : max_esverts;
-   assert(max_out_vertices <= 256);
+   assert(max_out_vertices <= max_workgroup_size);

   out->hw_max_esverts = max_esverts;
   out->max_gsprims = max_gsprims;
@ -1504,6 +1509,6 @@ retry_select_mode:

   /* If asserts are disabled, we use the same conditions to return false */
   return max_esverts >= max_verts_per_prim && max_gsprims >= 1 &&
-          max_out_vertices <= 256 &&
+          max_out_vertices <= max_workgroup_size &&
          out->hw_max_esverts >= min_esverts;
 }
--- a/src/amd/common/ac_shader_util.h
+++ b/src/amd/common/ac_shader_util.h
@ -336,9 +336,11 @@ typedef struct {

 bool
 ac_ngg_compute_subgroup_info(enum amd_gfx_level gfx_level, mesa_shader_stage es_stage, bool is_gs,
-                             enum mesa_prim input_prim, unsigned gs_vertices_out, unsigned gs_invocations,
-                             unsigned max_workgroup_size, unsigned wave_size, unsigned esgs_vertex_stride,
-                             unsigned ngg_lds_vertex_size, unsigned ngg_lds_scratch_size, bool tess_turns_off_ngg,
+                             enum mesa_prim input_prim, unsigned gs_vertices_out,
+                             unsigned gs_invocations, unsigned target_workgroup_size,
+                             unsigned max_workgroup_size, unsigned wave_size,
+                             unsigned esgs_vertex_stride, unsigned ngg_lds_vertex_size,
+                             unsigned ngg_lds_scratch_size, bool tess_turns_off_ngg,
                             unsigned max_esgs_lds_padding, ac_ngg_subgroup_info *out);

 static unsigned inline
--- a/src/amd/vulkan/radv_shader_info.c
+++ b/src/amd/vulkan/radv_shader_info.c
@ -1277,8 +1277,8 @@ gfx10_get_ngg_info(const struct radv_device *device, struct radv_shader_info *es
   ac_ngg_subgroup_info info;

   ac_ngg_compute_subgroup_info(gfx_level, es_info->stage, !!gs_info, input_prim, gs_vertices_out, gs_num_invocations,
-                                128, stage_info->wave_size, es_info->esgs_itemsize, stage_info->ngg_lds_vertex_size,
-                                stage_info->ngg_lds_scratch_size, false, 0, &info);
+                                128, 256, stage_info->wave_size, es_info->esgs_itemsize,
+                                stage_info->ngg_lds_vertex_size, stage_info->ngg_lds_scratch_size, false, 0, &info);

   out->hw_max_esverts = info.hw_max_esverts;
   out->max_gsprims = info.max_gsprims;
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@ -1887,11 +1887,12 @@ bool si_create_shader_variant(struct si_screen *sscreen, struct ac_llvm_compiler
      const unsigned input_prim = si_get_input_prim(gs_sel, &shader->key, false);
      unsigned gs_vertices_out = gs_sel->stage == MESA_SHADER_GEOMETRY ? gs_sel->info.base.gs.vertices_out : 0;
      unsigned gs_invocations = gs_sel->stage == MESA_SHADER_GEOMETRY ? gs_sel->info.base.gs.invocations : 0;
+      unsigned max_workgroup_size = si_get_max_workgroup_size(shader);

      if (!ac_ngg_compute_subgroup_info(gs_sel->screen->info.gfx_level, es_sel->stage,
                                        gs_sel->stage == MESA_SHADER_GEOMETRY,
                                        input_prim, gs_vertices_out, gs_invocations,
-                                        si_get_max_workgroup_size(shader), shader->wave_size,
+                                        max_workgroup_size, max_workgroup_size, shader->wave_size,
                                        es_sel->info.esgs_vertex_stride, shader->info.ngg_lds_vertex_size,
                                        shader->info.ngg_lds_scratch_size, gs_sel->tess_turns_off_ngg,
                                        gs_sel->stage == MESA_SHADER_GEOMETRY ? 255 : 0, &shader->ngg.info)) {