radeonsi: remove a twice duplicated workaround for VERT_GRP_SIZE

This enables better lane occupancy.

Acked-by: Timur Kristóf <timur.kristof@gmail.com>
Acked-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/10813>
2026-03-07 05:30:25 +01:00 · 2021-05-10 20:31:19 -04:00 · 2021-05-10 20:31:19 -04:00 · a0fcd37731
commit a0fcd37731
parent c8e8979d6b
2 changed files with 13 additions and 30 deletions
--- a/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c
+++ b/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c
@@ -1964,16 +1964,6 @@ bool gfx10_ngg_calculate_subgroup_info(struct si_shader *shader)
      max_esverts_base = 128;
   }

-   /* Hardware has the following non-natural restrictions on the value
-    * of GE_CNTL.VERT_GRP_SIZE based on based on the primitive type of
-    * the draw:
-    *  - at most 252 for any line input primitive type
-    *  - at most 251 for any quad input primitive type
-    *  - at most 251 for triangle strips with adjacency (this happens to
-    *    be the natural limit for triangle *lists* with adjacency)
-    */
-   max_esverts_base = MIN2(max_esverts_base, 251 + max_verts_per_prim - 1);
-
   if (gs_stage == MESA_SHADER_GEOMETRY) {
      bool force_multi_cycling = false;
      unsigned max_out_verts_per_gsprim = gs_sel->info.base.gs.vertices_out * gs_num_invocations;
@@ -2125,18 +2115,7 @@ retry_select_mode:
      }
   }

-   /* On gfx10, the GE only checks against the maximum number of ES verts after
-    * allocating a full GS primitive. So we need to ensure that whenever
-    * this check passes, there is enough space for a full primitive without
-    * vertex reuse.
-    */
-   if (gs_sel->screen->info.chip_class == GFX10 &&
-       !(shader->key.opt.ngg_culling & (SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST |
-                                        SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP)))
-      shader->ngg.hw_max_esverts = max_esverts - max_verts_per_prim + 1;
-   else
-      shader->ngg.hw_max_esverts = max_esverts;
-
+   shader->ngg.hw_max_esverts = max_esverts;
   shader->ngg.max_gsprims = max_gsprims;
   shader->ngg.max_out_verts = max_out_vertices;
   shader->ngg.prim_amp_factor = prim_amp_factor;
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -1314,19 +1314,23 @@ static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader
                        S_03096C_VERT_GRP_SIZE(shader->ngg.hw_max_esverts) |
                        S_03096C_BREAK_WAVE_AT_EOI(break_wave_at_eoi);

-      /* Bug workaround for a possible hang with non-tessellation cases.
-       * Tessellation always sets GE_CNTL.VERT_GRP_SIZE = 0
+      /* On gfx10, the GE only checks against the maximum number of ES verts after
+       * allocating a full GS primitive. So we need to ensure that whenever
+       * this check passes, there is enough space for a full primitive without
+       * vertex reuse. VERT_GRP_SIZE=256 doesn't need this. We should always get 256
+       * if we have enough LDS.
       *
-       * Requirement: GE_CNTL.VERT_GRP_SIZE = VGT_GS_ONCHIP_CNTL.ES_VERTS_PER_SUBGRP - 5
+       * Tessellation is unaffected because it always sets GE_CNTL.VERT_GRP_SIZE = 0.
       */
      if ((sscreen->info.chip_class == GFX10) &&
          (es_stage == MESA_SHADER_VERTEX || gs_stage == MESA_SHADER_VERTEX) && /* = no tess */
-          shader->ngg.hw_max_esverts != 256) {
+          shader->ngg.hw_max_esverts != 256 &&
+          shader->ngg.hw_max_esverts > 5) {
+         /* This could be based on the input primitive type. 5 is the worst case
+          * for primitive types with adjacency.
+          */
         shader->ge_cntl &= C_03096C_VERT_GRP_SIZE;
-
-         if (shader->ngg.hw_max_esverts > 5) {
-            shader->ge_cntl |= S_03096C_VERT_GRP_SIZE(shader->ngg.hw_max_esverts - 5);
-         }
+         shader->ge_cntl |= S_03096C_VERT_GRP_SIZE(shader->ngg.hw_max_esverts - 5);
      }
   }