radv: rework the number of tess patches computation

This uses the same helper as RadeonSI, which seems more robust and more optimal (e.g. it reduces the number of patches to increase occupancy).

fossils-db (NAVI21):
Totals from 638 (0.80% of 79395) affected shaders:
MaxWaves: 13182 -> 13142 (-0.30%)
Instrs: 419446 -> 419322 (-0.03%); split: -0.08%, +0.05%
CodeSize: 2261408 -> 2261200 (-0.01%); split: -0.06%, +0.05%
VGPRs: 32560 -> 32600 (+0.12%)
LDS: 4648960 -> 5343232 (+14.93%); split: -1.67%, +16.61%
Latency: 4812105 -> 4811141 (-0.02%); split: -0.04%, +0.02%
InvThroughput: 1159924 -> 1153998 (-0.51%); split: -0.60%, +0.09%
VClause: 7837 -> 7871 (+0.43%); split: -0.36%, +0.79%
SClause: 9378 -> 9381 (+0.03%); split: -0.21%, +0.25%
Copies: 28451 -> 28211 (-0.84%); split: -0.97%, +0.13%
PreVGPRs: 25404 -> 25411 (+0.03%); split: -0.06%, +0.09%
VALU: 278086 -> 277975 (-0.04%); split: -0.11%, +0.07%
SALU: 43657 -> 43617 (-0.09%)

Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/28015>
2026-01-09 06:10:12 +01:00 · 2024-03-06 14:38:35 +01:00 · 2024-03-06 14:38:35 +01:00 · fb323ae46b
commit fb323ae46b
parent 758e6d9005
4 changed files with 58 additions and 93 deletions
--- a/src/amd/vulkan/radv_cmd_buffer.c
+++ b/src/amd/vulkan/radv_cmd_buffer.c
@ -2647,17 +2647,16 @@ radv_emit_patch_control_points(struct radv_cmd_buffer *cmd_buffer)
    */
   if (cmd_buffer->state.uses_dynamic_patch_control_points) {
      /* Compute the number of patches. */
-      cmd_buffer->state.tess_num_patches = get_tcs_num_patches(
-         d->vk.ts.patch_control_points, tcs->info.tcs.tcs_vertices_out, vs->info.vs.num_linked_outputs,
+      cmd_buffer->state.tess_num_patches = radv_get_tcs_num_patches(
+         pdev, d->vk.ts.patch_control_points, tcs->info.tcs.tcs_vertices_out, vs->info.vs.num_linked_outputs,
         tcs->info.tcs.num_lds_per_vertex_outputs, tcs->info.tcs.num_lds_per_patch_outputs,
-         tcs->info.tcs.num_linked_outputs, tcs->info.tcs.num_linked_patch_outputs, pdev->hs.tess_offchip_block_dw_size,
-         pdev->info.gfx_level, pdev->info.family);
+         tcs->info.tcs.num_linked_outputs, tcs->info.tcs.num_linked_patch_outputs);

      /* Compute the LDS size. */
      cmd_buffer->state.tess_lds_size =
-         calculate_tess_lds_size(pdev->info.gfx_level, d->vk.ts.patch_control_points, tcs->info.tcs.tcs_vertices_out,
-                                 vs->info.vs.num_linked_outputs, cmd_buffer->state.tess_num_patches,
-                                 tcs->info.tcs.num_lds_per_vertex_outputs, tcs->info.tcs.num_lds_per_patch_outputs);
+         radv_get_tess_lds_size(pdev, d->vk.ts.patch_control_points, tcs->info.tcs.tcs_vertices_out,
+                                vs->info.vs.num_linked_outputs, cmd_buffer->state.tess_num_patches,
+                                tcs->info.tcs.num_lds_per_vertex_outputs, tcs->info.tcs.num_lds_per_patch_outputs);
   }

   ls_hs_config = S_028B58_NUM_PATCHES(cmd_buffer->state.tess_num_patches) |
--- a/src/amd/vulkan/radv_shader.c
+++ b/src/amd/vulkan/radv_shader.c
@ -2955,6 +2955,45 @@ radv_get_user_sgpr(const struct radv_shader *shader, int idx)
   return &shader->info.user_sgprs_locs.shader_data[idx];
 }

+/**
+ * Compute the storage needed by one tessellation patch.
+ *
+ * The result is the sum of:
+ *  - the TCS input area: tcs_num_input_vertices times the per-vertex input
+ *    stride returned by get_tcs_input_vertex_stride(tcs_num_inputs),
+ *  - the per-vertex output area: tcs_num_output_vertices * tcs_num_lds_outputs
+ *    slots of 16 units each,
+ *  - the per-patch output area: tcs_num_lds_patch_outputs slots of 16 units each.
+ *
+ * NOTE(review): the unit is presumably bytes (16 = one vec4 of 32-bit
+ * components) -- confirm against get_tcs_input_vertex_stride().
+ *
+ * Despite the "lds" parameter names, the caller computing the VRAM patch size
+ * reuses this helper with the VRAM output counts.
+ */
+static uint32_t
+radv_get_tess_patch_size(uint32_t tcs_num_input_vertices, uint32_t tcs_num_output_vertices, uint32_t tcs_num_inputs,
+                         uint32_t tcs_num_lds_outputs, uint32_t tcs_num_lds_patch_outputs)
+{
+   const uint32_t input_vertex_size = get_tcs_input_vertex_stride(tcs_num_inputs);
+   const uint32_t input_patch_size = tcs_num_input_vertices * input_vertex_size;
+   const uint32_t lds_output_vertex_size = tcs_num_lds_outputs * 16;
+   const uint32_t lds_pervertex_output_patch_size = tcs_num_output_vertices * lds_output_vertex_size;
+   const uint32_t lds_output_patch_size = lds_pervertex_output_patch_size + tcs_num_lds_patch_outputs * 16;

+   return input_patch_size + lds_output_patch_size;
+}
+
+/**
+ * Compute the number of tessellation patches using the common
+ * ac_compute_num_tess_patches() helper.
+ *
+ * Two per-patch sizes are derived with radv_get_tess_patch_size() and handed
+ * to the helper together with the device info and pdev->ge_wave_size:
+ *  - lds_per_patch: TCS inputs plus the LDS per-vertex/per-patch outputs,
+ *  - vram_per_patch: the VRAM per-vertex/per-patch outputs.
+ *
+ * \param pdev                        physical device (provides info and ge_wave_size)
+ * \param tcs_num_input_vertices      number of TCS input vertices per patch
+ * \param tcs_num_output_vertices     number of TCS output vertices per patch
+ * \param tcs_num_inputs              number of TCS inputs per vertex
+ * \param tcs_num_lds_outputs         number of per-vertex outputs counted for LDS
+ * \param tcs_num_lds_patch_outputs   number of per-patch outputs counted for LDS
+ * \param tcs_num_vram_outputs        number of per-vertex outputs counted for VRAM
+ * \param tcs_num_vram_patch_outputs  number of per-patch outputs counted for VRAM
+ * \return the value returned by ac_compute_num_tess_patches().
+ */
+uint32_t
+radv_get_tcs_num_patches(const struct radv_physical_device *pdev, unsigned tcs_num_input_vertices,
+                         unsigned tcs_num_output_vertices, unsigned tcs_num_inputs, unsigned tcs_num_lds_outputs,
+                         unsigned tcs_num_lds_patch_outputs, unsigned tcs_num_vram_outputs,
+                         unsigned tcs_num_vram_patch_outputs)
+{
+   const uint32_t lds_per_patch = radv_get_tess_patch_size(
+      tcs_num_input_vertices, tcs_num_output_vertices, tcs_num_inputs, tcs_num_lds_outputs, tcs_num_lds_patch_outputs);
+   /* The input count is passed as 0 here; presumably this makes the input
+    * stride 0 so that only outputs contribute to the VRAM size -- TODO confirm
+    * that get_tcs_input_vertex_stride(0) returns 0.
+    */
+   const uint32_t vram_per_patch = radv_get_tess_patch_size(tcs_num_input_vertices, tcs_num_output_vertices, 0,
+                                                            tcs_num_vram_outputs, tcs_num_vram_patch_outputs);
+
+   /* NOTE(review): the meaning of the trailing 'false' argument is defined by
+    * ac_compute_num_tess_patches() and is not visible here -- confirm.
+    */
+   return ac_compute_num_tess_patches(&pdev->info, tcs_num_input_vertices, tcs_num_output_vertices, vram_per_patch,
+                                      lds_per_patch, pdev->ge_wave_size, false);
+}
+
+/**
+ * Compute the tessellation LDS size for tcs_num_patches patches.
+ *
+ * The per-patch size (TCS inputs plus LDS per-vertex/per-patch outputs) comes
+ * from radv_get_tess_patch_size(); the final value is whatever
+ * ac_compute_tess_lds_size() returns for that per-patch size and patch count.
+ *
+ * NOTE(review): callers store the result as a number of LDS blocks, so the
+ * return value is presumably in allocation-granule units rather than bytes --
+ * confirm against ac_compute_tess_lds_size().
+ */
+uint32_t
+radv_get_tess_lds_size(const struct radv_physical_device *pdev, uint32_t tcs_num_input_vertices,
+                       uint32_t tcs_num_output_vertices, uint32_t tcs_num_inputs, uint32_t tcs_num_patches,
+                       uint32_t tcs_num_lds_outputs, uint32_t tcs_num_lds_patch_outputs)
+{
+   const uint32_t lds_per_patch = radv_get_tess_patch_size(
+      tcs_num_input_vertices, tcs_num_output_vertices, tcs_num_inputs, tcs_num_lds_outputs, tcs_num_lds_patch_outputs);
+
+   return ac_compute_tess_lds_size(&pdev->info, lds_per_patch, tcs_num_patches);
+}
+
 VkResult
 radv_dump_shader_stats(struct radv_device *device, struct radv_pipeline *pipeline, struct radv_shader *shader,
                       gl_shader_stage stage, FILE *output)
--- a/src/amd/vulkan/radv_shader.h
+++ b/src/amd/vulkan/radv_shader.h
@ -642,85 +642,14 @@ get_tcs_input_vertex_stride(unsigned tcs_num_inputs)
   return stride;
 }

-static inline unsigned
-calculate_tess_lds_size(enum amd_gfx_level gfx_level, unsigned tcs_num_input_vertices, unsigned tcs_num_output_vertices,
-                        unsigned tcs_num_inputs, unsigned tcs_num_patches, unsigned tcs_num_outputs,
-                        unsigned tcs_num_patch_outputs)
-{
-   unsigned input_vertex_size = get_tcs_input_vertex_stride(tcs_num_inputs);
-   unsigned output_vertex_size = tcs_num_outputs * 16;
+uint32_t radv_get_tcs_num_patches(const struct radv_physical_device *pdev, unsigned tcs_num_input_vertices,
+                                  unsigned tcs_num_output_vertices, unsigned tcs_num_inputs,
+                                  unsigned tcs_num_lds_outputs, unsigned tcs_num_lds_patch_outputs,
+                                  unsigned tcs_num_vram_outputs, unsigned tcs_num_vram_patch_outputs);

-   unsigned input_patch_size = tcs_num_input_vertices * input_vertex_size;
-
-   unsigned pervertex_output_patch_size = tcs_num_output_vertices * output_vertex_size;
-   unsigned output_patch_size = pervertex_output_patch_size + tcs_num_patch_outputs * 16;
-
-   unsigned output_patch0_offset = input_patch_size * tcs_num_patches;
-
-   unsigned lds_size = output_patch0_offset + output_patch_size * tcs_num_patches;
-
-   if (gfx_level >= GFX7) {
-      assert(lds_size <= 65536);
-      lds_size = align(lds_size, 512) / 512;
-   } else {
-      assert(lds_size <= 32768);
-      lds_size = align(lds_size, 256) / 256;
-   }
-
-   return lds_size;
-}
-
-static inline unsigned
-get_tcs_num_patches(unsigned tcs_num_input_vertices, unsigned tcs_num_output_vertices, unsigned tcs_num_inputs,
-                    unsigned tcs_num_lds_outputs, unsigned tcs_num_lds_patch_outputs, unsigned tcs_num_vram_outputs,
-                    unsigned tcs_num_vram_patch_outputs, unsigned tess_offchip_block_dw_size,
-                    enum amd_gfx_level gfx_level, enum radeon_family family)
-{
-   uint32_t input_vertex_size = get_tcs_input_vertex_stride(tcs_num_inputs);
-   uint32_t input_patch_size = tcs_num_input_vertices * input_vertex_size;
-   uint32_t lds_output_vertex_size = tcs_num_lds_outputs * 16;
-   uint32_t lds_pervertex_output_patch_size = tcs_num_output_vertices * lds_output_vertex_size;
-   uint32_t lds_output_patch_size = lds_pervertex_output_patch_size + tcs_num_lds_patch_outputs * 16;
-
-   uint32_t vram_output_vertex_size = tcs_num_vram_outputs * 16;
-   uint32_t vram_pervertex_output_patch_size = tcs_num_output_vertices * vram_output_vertex_size;
-   uint32_t vram_output_patch_size = vram_pervertex_output_patch_size + tcs_num_vram_patch_outputs * 16;
-
-   /* Ensure that we only need one wave per SIMD so we don't need to check
-    * resource usage. Also ensures that the number of tcs in and out
-    * vertices per threadgroup are at most 256.
-    */
-   unsigned num_patches = 64 / MAX2(tcs_num_input_vertices, tcs_num_output_vertices) * 4;
-   /* Make sure that the data fits in LDS. This assumes the shaders only
-    * use LDS for the inputs and outputs.
-    */
-   unsigned hardware_lds_size = 32768;
-
-   /* Looks like STONEY hangs if we use more than 32 KiB LDS in a single
-    * threadgroup, even though there is more than 32 KiB LDS.
-    *
-    * Test: dEQP-VK.tessellation.shader_input_output.barrier
-    */
-   if (gfx_level >= GFX7 && family != CHIP_STONEY)
-      hardware_lds_size = 65536;
-
-   if (input_patch_size + lds_output_patch_size)
-      num_patches = MIN2(num_patches, hardware_lds_size / (input_patch_size + lds_output_patch_size));
-   /* Make sure the output data fits in the offchip buffer */
-   if (vram_output_patch_size)
-      num_patches = MIN2(num_patches, (tess_offchip_block_dw_size * 4) / vram_output_patch_size);
-   /* Not necessary for correctness, but improves performance. The
-    * specific value is taken from the proprietary driver.
-    */
-   num_patches = MIN2(num_patches, 40);
-
-   /* GFX6 bug workaround - limit LS-HS threadgroups to only one wave. */
-   if (gfx_level == GFX6) {
-      unsigned one_wave = 64 / MAX2(tcs_num_input_vertices, tcs_num_output_vertices);
-      num_patches = MIN2(num_patches, one_wave);
-   }
-   return num_patches;
-}
+uint32_t radv_get_tess_lds_size(const struct radv_physical_device *pdev, uint32_t tcs_num_input_vertices,
+                                uint32_t tcs_num_output_vertices, uint32_t tcs_num_inputs, uint32_t tcs_num_patches,
+                                uint32_t tcs_num_lds_outputs, uint32_t tcs_num_lds_patch_outputs);

 void radv_lower_ngg(struct radv_device *device, struct radv_shader_stage *ngg_stage,
                    const struct radv_graphics_state_key *gfx_state);
--- a/src/amd/vulkan/radv_shader_info.c
+++ b/src/amd/vulkan/radv_shader_info.c
@ -564,17 +564,15 @@ gather_shader_info_tcs(struct radv_device *device, const nir_shader *nir,

   if (gfx_state->ts.patch_control_points) {
      /* Number of tessellation patches per workgroup processed by the current pipeline. */
-      info->num_tess_patches = get_tcs_num_patches(
-         gfx_state->ts.patch_control_points, nir->info.tess.tcs_vertices_out, info->tcs.num_linked_inputs,
+      info->num_tess_patches = radv_get_tcs_num_patches(
+         pdev, gfx_state->ts.patch_control_points, nir->info.tess.tcs_vertices_out, info->tcs.num_linked_inputs,
         info->tcs.num_lds_per_vertex_outputs, info->tcs.num_lds_per_patch_outputs, info->tcs.num_linked_outputs,
-         info->tcs.num_linked_patch_outputs, pdev->hs.tess_offchip_block_dw_size, pdev->info.gfx_level,
-         pdev->info.family);
+         info->tcs.num_linked_patch_outputs);

      /* LDS size used by VS+TCS for storing TCS inputs and outputs. */
-      info->tcs.num_lds_blocks =
-         calculate_tess_lds_size(pdev->info.gfx_level, gfx_state->ts.patch_control_points,
-                                 nir->info.tess.tcs_vertices_out, info->tcs.num_linked_inputs, info->num_tess_patches,
-                                 info->tcs.num_lds_per_vertex_outputs, info->tcs.num_lds_per_patch_outputs);
+      info->tcs.num_lds_blocks = radv_get_tess_lds_size(
+         pdev, gfx_state->ts.patch_control_points, nir->info.tess.tcs_vertices_out, info->tcs.num_linked_inputs,
+         info->num_tess_patches, info->tcs.num_lds_per_vertex_outputs, info->tcs.num_lds_per_patch_outputs);
   }
 }