ac/nir/lower_ngg: return LDS size for NGG VS and TES from the pass

instead of computing it separately. This is better because ac_nir_lower_ngg_nogs knows the final LDS size anyway, and it will be easier to modify the size calculation this way. Reviewed-by: Timur Kristóf <timur.kristof@gmail.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/35351>
2026-05-05 20:28:04 +02:00 · 2025-05-28 08:30:31 -04:00 · 2025-05-28 08:30:31 -04:00 · d79f28e9b3
commit d79f28e9b3
parent 7fe603ad82
9 changed files with 37 additions and 60 deletions
--- a/src/amd/common/ac_shader_util.h
+++ b/src/amd/common/ac_shader_util.h
@ -306,16 +306,6 @@ uint32_t ac_compute_scratch_wavesize(const struct radeon_info *info, uint32_t by
 void ac_get_scratch_tmpring_size(const struct radeon_info *info, unsigned num_scratch_waves,
                                 unsigned bytes_per_wave, uint32_t *tmpring_size);

-unsigned
-ac_ngg_nogs_get_pervertex_lds_size(gl_shader_stage stage,
-                                   unsigned shader_num_outputs,
-                                   bool streamout_enabled,
-                                   bool export_prim_id,
-                                   bool has_user_edgeflags,
-                                   bool can_cull,
-                                   bool uses_instance_id,
-                                   bool uses_primitive_id);
-
 unsigned
 ac_ngg_get_scratch_lds_size(gl_shader_stage stage,
                            unsigned workgroup_size,
--- a/src/amd/common/nir/ac_nir.h
+++ b/src/amd/common/nir/ac_nir.h
@ -208,7 +208,8 @@ typedef struct {
 } ac_nir_lower_ngg_options;

 bool
-ac_nir_lower_ngg_nogs(nir_shader *shader, const ac_nir_lower_ngg_options *options);
+ac_nir_lower_ngg_nogs(nir_shader *shader, const ac_nir_lower_ngg_options *options,
+                      uint32_t *out_lds_vertex_size);

 bool
 ac_nir_lower_ngg_gs(nir_shader *shader, const ac_nir_lower_ngg_options *options);
--- a/src/amd/common/nir/ac_nir_lower_ngg.c
+++ b/src/amd/common/nir/ac_nir_lower_ngg.c
@ -1547,8 +1547,31 @@ ngg_nogs_gather_outputs(nir_builder *b, struct exec_list *cf_list, lower_ngg_nog
   }
 }

+static unsigned
+ac_ngg_nogs_get_pervertex_lds_size(gl_shader_stage stage,
+                                   unsigned shader_num_outputs,
+                                   bool streamout_enabled,
+                                   bool export_prim_id,
+                                   bool has_user_edgeflags,
+                                   bool can_cull,
+                                   bool uses_instance_id,
+                                   bool uses_tess_primitive_id)
+{
+   /* for culling time lds layout only */
+   unsigned culling_pervertex_lds_bytes = can_cull ?
+      ngg_nogs_get_culling_pervertex_lds_size(
+         stage, uses_instance_id, uses_tess_primitive_id, NULL) : 0;
+
+   unsigned pervertex_lds_bytes =
+      ngg_nogs_get_pervertex_lds_size(stage, shader_num_outputs, streamout_enabled,
+                                      export_prim_id, has_user_edgeflags);
+
+   return MAX2(culling_pervertex_lds_bytes, pervertex_lds_bytes);
+}
+
 bool
-ac_nir_lower_ngg_nogs(nir_shader *shader, const ac_nir_lower_ngg_options *options)
+ac_nir_lower_ngg_nogs(nir_shader *shader, const ac_nir_lower_ngg_options *options,
+                      uint32_t *out_lds_vertex_size)
 {
   nir_function_impl *impl = nir_shader_get_entrypoint(shader);
   assert(impl);
@ -1858,31 +1881,14 @@ ac_nir_lower_ngg_nogs(nir_shader *shader, const ac_nir_lower_ngg_options *option
      NIR_PASS(progress, shader, nir_opt_dead_cf);
   } while (progress);

+   *out_lds_vertex_size =
+      ac_ngg_nogs_get_pervertex_lds_size(shader->info.stage, shader->num_outputs, state.streamout_enabled,
+                                         options->export_primitive_id, state.has_user_edgeflags,
+                                         options->can_cull, state.deferred.uses_instance_id,
+                                         state.deferred.uses_tess_primitive_id);
   return true;
 }

-unsigned
-ac_ngg_nogs_get_pervertex_lds_size(gl_shader_stage stage,
-                                   unsigned shader_num_outputs,
-                                   bool streamout_enabled,
-                                   bool export_prim_id,
-                                   bool has_user_edgeflags,
-                                   bool can_cull,
-                                   bool uses_instance_id,
-                                   bool uses_primitive_id)
-{
-   /* for culling time lds layout only */
-   unsigned culling_pervertex_lds_bytes = can_cull ?
-      ngg_nogs_get_culling_pervertex_lds_size(
-         stage, uses_instance_id, uses_primitive_id, NULL) : 0;
-
-   unsigned pervertex_lds_bytes =
-      ngg_nogs_get_pervertex_lds_size(stage, shader_num_outputs, streamout_enabled,
-                                      export_prim_id, has_user_edgeflags);
-
-   return MAX2(culling_pervertex_lds_bytes, pervertex_lds_bytes);
-}
-
 unsigned
 ac_ngg_get_scratch_lds_size(gl_shader_stage stage,
                            unsigned workgroup_size,
--- a/src/amd/vulkan/radv_shader.c
+++ b/src/amd/vulkan/radv_shader.c
@ -814,7 +814,7 @@ radv_lower_ngg(struct radv_device *device, struct radv_shader_stage *ngg_stage,
      options.export_primitive_id_per_prim = info->outinfo.export_prim_id_per_primitive;
      options.instance_rate_inputs = gfx_state->vi.instance_rate_inputs << VERT_ATTRIB_GENERIC0;

-      NIR_PASS(_, nir, ac_nir_lower_ngg_nogs, &options);
+      NIR_PASS(_, nir, ac_nir_lower_ngg_nogs, &options, &ngg_stage->info.ngg_lds_vertex_size);
   } else if (nir->info.stage == MESA_SHADER_GEOMETRY) {
      assert(info->is_ngg);

--- a/src/amd/vulkan/radv_shader_info.c
+++ b/src/amd/vulkan/radv_shader_info.c
@ -1424,7 +1424,6 @@ static unsigned
 gfx10_get_ngg_scratch_lds_base(const struct radv_device *device, const struct radv_shader_info *es_info,
                               const struct radv_shader_info *gs_info, const struct gfx10_ngg_info *ngg_info)
 {
-   const struct radv_physical_device *pdev = radv_device_physical(device);
   uint32_t scratch_lds_base;

   if (gs_info) {
@ -1433,17 +1432,8 @@ gfx10_get_ngg_scratch_lds_base(const struct radv_device *device, const struct ra

      scratch_lds_base = ALIGN(esgs_ring_lds_bytes + gs_total_out_vtx_bytes, 8u /* for the repacking code */);
   } else {
-      const bool uses_instanceid = es_info->vs.needs_instance_id;
-      const bool uses_primitive_id = es_info->uses_prim_id;
-      const bool streamout_enabled = es_info->so.enabled_stream_buffers_mask && pdev->use_ngg_streamout;
-      const uint32_t num_outputs =
-         es_info->stage == MESA_SHADER_VERTEX ? es_info->vs.num_outputs : es_info->tes.num_outputs;
-      unsigned pervertex_lds_bytes = ac_ngg_nogs_get_pervertex_lds_size(
-         es_info->stage, num_outputs, streamout_enabled, es_info->outinfo.export_prim_id, false, /* user edge flag */
-         es_info->has_ngg_culling, uses_instanceid, uses_primitive_id);
-
      assert(ngg_info->hw_max_esverts <= 256);
-      unsigned total_es_lds_bytes = pervertex_lds_bytes * ngg_info->hw_max_esverts;
+      unsigned total_es_lds_bytes = es_info->ngg_lds_vertex_size * ngg_info->hw_max_esverts;

      scratch_lds_base = ALIGN(total_es_lds_bytes, 8u);
   }
--- a/src/amd/vulkan/radv_shader_info.h
+++ b/src/amd/vulkan/radv_shader_info.h
@ -100,6 +100,7 @@ struct radv_shader_info {
   bool has_xfb_query;
   uint32_t num_tess_patches;
   uint32_t esgs_itemsize; /* Only for VS or TES as ES */
+   uint32_t ngg_lds_vertex_size;
   struct radv_vs_output_info outinfo;
   unsigned workgroup_size;
   bool force_vrs_per_vertex;
--- a/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c
+++ b/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c
@ -110,19 +110,7 @@ retry_select_mode:
      }
   } else {
      /* VS and TES. */
-      bool uses_primitive_id = gs_sel->info.uses_primid;
-
-      if (gs_stage == MESA_SHADER_TESS_EVAL)
-         uses_primitive_id |= shader->key.ge.mono.u.vs_export_prim_id;
-
-      esvert_lds_size = ac_ngg_nogs_get_pervertex_lds_size(
-         gs_stage, gs_sel->info.num_outputs,
-         shader->info.num_streamout_vec4s != 0,
-         shader->key.ge.mono.u.vs_export_prim_id,
-         gfx10_ngg_writes_user_edgeflags(shader),
-         si_shader_culling_enabled(shader),
-         shader->info.uses_instance_id,
-         uses_primitive_id) / 4;
+      esvert_lds_size = shader->info.ngg_lds_vertex_size / 4;
   }

   unsigned max_gsprims = max_gsprims_base;
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@ -1172,7 +1172,7 @@ static void si_lower_ngg(struct si_shader *shader, nir_shader *nir,
      options.instance_rate_inputs = instance_rate_inputs;
      options.user_clip_plane_enable_mask = clip_plane_enable;

-      NIR_PASS_V(nir, ac_nir_lower_ngg_nogs, &options);
+      NIR_PASS_V(nir, ac_nir_lower_ngg_nogs, &options, &shader->info.ngg_lds_vertex_size);
   } else {
      assert(nir->info.stage == MESA_SHADER_GEOMETRY);

--- a/src/gallium/drivers/radeonsi/si_shader_info.h
+++ b/src/gallium/drivers/radeonsi/si_shader_info.h
@ -240,6 +240,7 @@ struct si_shader_variant_info {
   uint8_t num_streamout_vec4s;
   unsigned private_mem_vgprs;
   unsigned max_simd_waves;
+   uint32_t ngg_lds_vertex_size;
 };

 #endif