ac/nir/tess: adjust memory layout of TCS outputs to have aligned store offsets

There is a comment that explains it.

Acked-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/34780>
This commit is contained in:
Marek Olšák 2025-04-19 10:03:37 -04:00 committed by Marge Bot
parent 80236f2367
commit 534b282573
6 changed files with 100 additions and 55 deletions

View file

@ -926,10 +926,27 @@ unsigned ac_compute_ngg_workgroup_size(unsigned es_verts, unsigned gs_inst_prims
return CLAMP(workgroup_size, 1, 256);
}
/* Return the off-chip (VRAM) memory footprint in bytes of all TCS outputs for
 * one threadgroup, given the number of output control points per patch, the
 * number of per-vertex and per-patch outputs stored to memory, and the number
 * of patches in the threadgroup.
 *
 * Align each per-vertex and per-patch output to 16 vec4 elements = 256B. It's most optimal when
 * the 16 vec4 elements are written by 16 consecutive lanes.
 *
 * 256B is the granularity of interleaving memory channels, which means a single output store
 * in wave64 will cover 4 channels (1024B). If an output was only aligned to 128B, wave64 could
 * cover 5 channels (128B .. 1.125K) instead of 4, which could increase VMEM latency.
 */
static unsigned get_tcs_wg_output_mem_size(uint32_t num_tcs_output_cp, uint32_t num_mem_tcs_outputs,
                                           uint32_t num_mem_tcs_patch_outputs, uint32_t num_patches)
{
   /* Stride of one per-vertex output over the whole workgroup: 16B (one vec4)
    * per vertex, for every vertex of every patch, rounded up to 256B.
    */
   const unsigned pervertex_stride = align(16 * num_tcs_output_cp * num_patches, 256);
   /* Stride of one per-patch output: 16B per patch, rounded up to 256B. */
   const unsigned perpatch_stride = align(16 * num_patches, 256);

   return num_mem_tcs_outputs * pervertex_stride +
          num_mem_tcs_patch_outputs * perpatch_stride;
}
uint32_t ac_compute_num_tess_patches(const struct radeon_info *info, uint32_t num_tcs_input_cp,
uint32_t num_tcs_output_cp, uint32_t vram_per_patch,
uint32_t lds_per_patch, uint32_t wave_size,
bool tess_uses_primid)
uint32_t num_tcs_output_cp, uint32_t num_mem_tcs_outputs,
uint32_t num_mem_tcs_patch_outputs, uint32_t lds_per_patch,
uint32_t wave_size, bool tess_uses_primid)
{
/* The VGT HS block increments the patch ID unconditionally within a single threadgroup.
* This results in incorrect patch IDs when instanced draws are used.
@ -956,8 +973,24 @@ uint32_t ac_compute_num_tess_patches(const struct radeon_info *info, uint32_t nu
num_patches = MIN2(num_patches, 16); /* recommended */
/* Make sure the output data fits in the offchip buffer */
if (vram_per_patch)
num_patches = MIN2(num_patches, (info->hs_offchip_workgroup_dw_size * 4) / vram_per_patch);
unsigned mem_size = get_tcs_wg_output_mem_size(num_tcs_output_cp, num_mem_tcs_outputs,
num_mem_tcs_patch_outputs, num_patches);
if (mem_size > info->hs_offchip_workgroup_dw_size * 4) {
/* Find the number of patches that fit in memory. Each output is aligned separately,
* so this division won't return a precise result.
*/
num_patches = info->hs_offchip_workgroup_dw_size * 4 /
get_tcs_wg_output_mem_size(num_tcs_output_cp, num_mem_tcs_outputs,
num_mem_tcs_patch_outputs, 1);
assert(get_tcs_wg_output_mem_size(num_tcs_output_cp, num_mem_tcs_outputs,
num_mem_tcs_patch_outputs, num_patches) <=
info->hs_offchip_workgroup_dw_size * 4);
while (get_tcs_wg_output_mem_size(num_tcs_output_cp, num_mem_tcs_outputs,
num_mem_tcs_patch_outputs, num_patches + 1) <=
info->hs_offchip_workgroup_dw_size * 4)
num_patches++;
}
/* Make sure that the data fits in LDS. This assumes the shaders only
* use LDS for the inputs and outputs.

View file

@ -294,9 +294,9 @@ unsigned ac_compute_ngg_workgroup_size(unsigned es_verts, unsigned gs_inst_prims
unsigned max_vtx_out, unsigned prim_amp_factor);
uint32_t ac_compute_num_tess_patches(const struct radeon_info *info, uint32_t num_tcs_input_cp,
uint32_t num_tcs_output_cp, uint32_t vram_per_patch,
uint32_t lds_per_patch, uint32_t wave_size,
bool tess_uses_primid);
uint32_t num_tcs_output_cp, uint32_t num_mem_tcs_outputs,
uint32_t num_mem_tcs_patch_outputs, uint32_t lds_per_patch,
uint32_t wave_size, bool tess_uses_primid);
uint32_t ac_apply_cu_en(uint32_t value, uint32_t clear_mask, unsigned value_shift,
const struct radeon_info *info);

View file

@ -76,28 +76,31 @@
* ### VRAM layout used by TCS-TES I/O:
*
* ```
* attr 0 of patch 0 vertex 0 < "off-chip LDS" offset
* attr 0 of patch 0 vertex 0 < "off-chip LDS" offset, aligned to >= 4K
* attr 0 of patch 0 vertex 1
* attr 0 of patch 0 vertex 2
* ...
* attr 0 of patch 1 vertex 0
* attr 0 of patch 1 vertex 1
* attr 0 of patch 1 vertex 2 < hs_per_vertex_output_vmem_offset (attribute slot = 0, rel_patch_id = 1, vertex index = 1)
* attr 0 of patch 1 vertex 2 < hs_per_vertex_output_vmem_offset (attribute slot = 0, rel_patch_id = 1, vertex index = 2)
* ...
* attr 0 of patch 2 vertex 0
* attr 0 of patch 2 vertex 1
* attr 0 of patch 2 vertex 2
* ...
* attr 1 of patch 0 vertex 0
* [pad to 256B]
* attr 1 of patch 0 vertex 0 < aligned to 256B
* attr 1 of patch 0 vertex 1
* attr 1 of patch 0 vertex 2
* ...
* ...
* per-patch attr 0 of patch 0 < hs_out_patch_data_offset_amd
* [pad to 256B]
* per-patch attr 0 of patch 0 < hs_out_patch_data_offset_amd, aligned to 256B
* per-patch attr 0 of patch 1
* per-patch attr 0 of patch 2 < hs_per_patch_output_vmem_offset (attribute slot = 0, rel_patch_id = 2)
* ...
* per-patch attr 1 of patch 0
* [pad to 256B]
* per-patch attr 1 of patch 0 < aligned to 256B
* per-patch attr 1 of patch 1
* per-patch attr 1 of patch 2
* ...
@ -477,6 +480,8 @@ hs_per_vertex_output_vmem_offset(nir_builder *b, lower_tess_io_state *st, unsign
nir_def *tcs_num_patches = nir_load_tcs_num_patches_amd(b);
nir_def *attr_stride = nir_imul(b, tcs_num_patches, nir_imul_imm(b, out_vertices_per_patch, 16u));
/* Align the stride to 256B. */
attr_stride = nir_align_imm(b, attr_stride, 256);
nir_def *off =
ac_nir_calc_io_off(b, component, io_offset, attr_stride, 4u,
hs_output_vram_map_io_location(b->shader, true, location, st));
@ -495,10 +500,12 @@ hs_per_patch_output_vmem_offset(nir_builder *b, lower_tess_io_state *st, unsigne
{
nir_def *tcs_num_patches = nir_load_tcs_num_patches_amd(b);
nir_def *per_patch_data_offset = nir_load_hs_out_patch_data_offset_amd(b);
/* Align the stride to 256B. */
nir_def *attr_stride = nir_align_imm(b, nir_imul_imm(b, tcs_num_patches, 16u), 256);
nir_def *off =
io_offset
? ac_nir_calc_io_off(b, component, io_offset, nir_imul_imm(b, tcs_num_patches, 16u), 4u,
? ac_nir_calc_io_off(b, component, io_offset, attr_stride, 4u,
hs_output_vram_map_io_location(b->shader, false, location, st))
: nir_imm_int(b, 0);
@ -1023,8 +1030,7 @@ hs_store_tess_factors_for_tes(nir_builder *b, tess_levels tessfactors, lower_tes
const bool tes_reads_inner = st->tes_inputs_read & VARYING_BIT_TESS_LEVEL_INNER;
if (st->tcs_tess_level_outer_mask && tes_reads_outer) {
const unsigned tf_outer_loc = hs_output_vram_map_io_location(b->shader, false, VARYING_SLOT_TESS_LEVEL_OUTER, st);
nir_def *vmem_off_outer = hs_per_patch_output_vmem_offset(b, st, 0, 0, NULL, tf_outer_loc * 16);
nir_def *vmem_off_outer = hs_per_patch_output_vmem_offset(b, st, VARYING_SLOT_TESS_LEVEL_OUTER, 0, zero, 0);
nir_store_buffer_amd(b, tessfactors.outer, hs_ring_tess_offchip,
vmem_off_outer, offchip_offset, zero,
@ -1033,8 +1039,7 @@ hs_store_tess_factors_for_tes(nir_builder *b, tess_levels tessfactors, lower_tes
}
if (tessfactors.inner && st->tcs_tess_level_inner_mask && tes_reads_inner) {
const unsigned tf_inner_loc = hs_output_vram_map_io_location(b->shader, false, VARYING_SLOT_TESS_LEVEL_INNER, st);
nir_def *vmem_off_inner = hs_per_patch_output_vmem_offset(b, st, 0, 0, NULL, tf_inner_loc * 16);
nir_def *vmem_off_inner = hs_per_patch_output_vmem_offset(b, st, VARYING_SLOT_TESS_LEVEL_INNER, 0, zero, 0);
nir_store_buffer_amd(b, tessfactors.inner, hs_ring_tess_offchip,
vmem_off_inner, offchip_offset, zero,
@ -1328,8 +1333,8 @@ ac_nir_compute_tess_wg_info(const struct radeon_info *info, uint64_t outputs_rea
unsigned lds_per_patch = num_tcs_input_cp * lds_input_vertex_size +
num_tcs_output_cp * lds_output_vertex_size +
lds_perpatch_output_patch_size;
unsigned mem_per_patch = (num_tcs_output_cp * num_mem_tcs_outputs + num_mem_tcs_patch_outputs) * 16;
unsigned num_patches = ac_compute_num_tess_patches(info, num_tcs_input_cp, num_tcs_output_cp, mem_per_patch,
unsigned num_patches = ac_compute_num_tess_patches(info, num_tcs_input_cp, num_tcs_output_cp,
num_mem_tcs_outputs, num_mem_tcs_patch_outputs,
lds_per_patch, wave_size, tess_uses_primid);
unsigned lds_size = lds_per_patch * num_patches;

View file

@ -244,33 +244,31 @@ lower_abi_instr(nir_builder *b, nir_intrinsic_instr *intrin, void *state)
break;
}
case nir_intrinsic_load_hs_out_patch_data_offset_amd: {
nir_def *num_tcs_outputs, *out_vertices_per_patch;
if (stage == MESA_SHADER_TESS_CTRL) {
num_tcs_outputs = nir_imm_int(b, s->info->tcs.num_linked_outputs);
out_vertices_per_patch = nir_imm_int(b, s->info->tcs.tcs_vertices_out);
} else {
if (s->info->inputs_linked) {
out_vertices_per_patch = nir_imm_int(b, s->info->tes.tcs_vertices_out);
num_tcs_outputs = nir_imm_int(b, s->info->tes.num_linked_inputs);
} else {
nir_def *n = GET_SGPR_FIELD_NIR(s->args->tcs_offchip_layout, TCS_OFFCHIP_LAYOUT_OUT_PATCH_CP);
out_vertices_per_patch = nir_iadd_imm_nuw(b, n, 1);
num_tcs_outputs = GET_SGPR_FIELD_NIR(s->args->tcs_offchip_layout, TCS_OFFCHIP_LAYOUT_NUM_HS_OUTPUTS);
}
}
nir_def *per_vertex_output_patch_size =
nir_imul(b, out_vertices_per_patch, nir_imul_imm(b, num_tcs_outputs, 16u));
nir_def *num_patches, *out_vertices_per_patch, *num_tcs_mem_outputs;
if (s->info->num_tess_patches) {
unsigned num_patches = s->info->num_tess_patches;
replacement = nir_imul_imm(b, per_vertex_output_patch_size, num_patches);
num_patches = nir_imm_int(b, s->info->num_tess_patches);
} else {
nir_def *n = GET_SGPR_FIELD_NIR(s->args->tcs_offchip_layout, TCS_OFFCHIP_LAYOUT_NUM_PATCHES);
nir_def *num_patches = nir_iadd_imm_nuw(b, n, 1);
replacement = nir_imul(b, per_vertex_output_patch_size, num_patches);
num_patches = nir_iadd_imm_nuw(b, n, 1);
}
if (stage == MESA_SHADER_TESS_CTRL) {
out_vertices_per_patch = nir_imm_int(b, s->info->tcs.tcs_vertices_out);
num_tcs_mem_outputs = nir_imm_int(b, s->info->tcs.num_linked_outputs);
} else if (s->info->inputs_linked) {
out_vertices_per_patch = nir_imm_int(b, s->info->tes.tcs_vertices_out);
num_tcs_mem_outputs = nir_imm_int(b, s->info->tes.num_linked_inputs);
} else {
nir_def *n = GET_SGPR_FIELD_NIR(s->args->tcs_offchip_layout, TCS_OFFCHIP_LAYOUT_OUT_PATCH_CP);
out_vertices_per_patch = nir_iadd_imm_nuw(b, n, 1);
num_tcs_mem_outputs = GET_SGPR_FIELD_NIR(s->args->tcs_offchip_layout, TCS_OFFCHIP_LAYOUT_NUM_HS_OUTPUTS);
}
/* Compute the stride of a single output. */
nir_def *attr_stride = nir_imul(b, num_patches, nir_imul_imm(b, out_vertices_per_patch, 16));
attr_stride = nir_align_imm(b, attr_stride, 256);
replacement = nir_imul(b, attr_stride, num_tcs_mem_outputs);
break;
}
case nir_intrinsic_load_sample_positions_amd: {

View file

@ -1221,6 +1221,16 @@ nir_umod_imm(nir_builder *build, nir_def *x, uint64_t y)
}
}
/* Build NIR that rounds the integer SSA value "x" up to the next multiple of
 * "align". "align" must be a nonzero power of two; emits the classic
 * (x + align - 1) & ~(align - 1) sequence, or nothing when align == 1.
 */
static inline nir_def *
nir_align_imm(nir_builder *b, nir_def *x, uint64_t align)
{
   assert(util_is_power_of_two_nonzero64(align));

   /* Aligning to 1 is a no-op; don't emit any instructions. */
   if (align == 1)
      return x;

   const uint64_t mask = ~(align - 1);
   nir_def *biased = nir_iadd_imm(b, x, align - 1);
   return nir_iand_imm(b, biased, mask);
}
static inline nir_def *
nir_ibfe_imm(nir_builder *build, nir_def *x, uint32_t offset, uint32_t size)
{

View file

@ -309,24 +309,23 @@ static bool lower_intrinsic(nir_builder *b, nir_instr *instr, struct lower_abi_s
break;
}
case nir_intrinsic_load_hs_out_patch_data_offset_amd: {
nir_def *per_vtx_out_patch_size = NULL;
nir_def *tcs_num_patches =
nir_iadd_imm_nuw(b, ac_nir_unpack_arg(b, &args->ac, args->tcs_offchip_layout, 0, 7), 1);
nir_def *tcs_out_vertices, *num_tcs_mem_outputs;
if (stage == MESA_SHADER_TESS_CTRL) {
const unsigned num_hs_out = util_last_bit64(sel->info.tcs_outputs_written_for_tes);
const unsigned out_vtx_size = num_hs_out * 16;
const unsigned out_vtx_per_patch = b->shader->info.tess.tcs_vertices_out;
per_vtx_out_patch_size = nir_imm_int(b, out_vtx_size * out_vtx_per_patch);
tcs_out_vertices = nir_imm_int(b, b->shader->info.tess.tcs_vertices_out);
num_tcs_mem_outputs = nir_imm_int(b, util_last_bit64(sel->info.tcs_outputs_written_for_tes));
} else {
nir_def *num_hs_out = ac_nir_unpack_arg(b, &args->ac, args->tcs_offchip_layout, 23, 6);
nir_def *out_vtx_size = nir_ishl_imm(b, num_hs_out, 4);
nir_def *o = ac_nir_unpack_arg(b, &args->ac, args->tcs_offchip_layout, 7, 5);
nir_def *out_vtx_per_patch = nir_iadd_imm_nuw(b, o, 1);
per_vtx_out_patch_size = nir_imul(b, out_vtx_per_patch, out_vtx_size);
tcs_out_vertices =
nir_iadd_imm_nuw(b, ac_nir_unpack_arg(b, &args->ac, args->tcs_offchip_layout, 7, 5), 1);
num_tcs_mem_outputs = ac_nir_unpack_arg(b, &args->ac, args->tcs_offchip_layout, 23, 6);
}
nir_def *p = ac_nir_unpack_arg(b, &args->ac, args->tcs_offchip_layout, 0, 7);
nir_def *num_patches = nir_iadd_imm_nuw(b, p, 1);
replacement = nir_imul(b, per_vtx_out_patch_size, num_patches);
/* Compute the stride of a single output. */
nir_def *attr_stride = nir_imul(b, tcs_num_patches, nir_imul_imm(b, tcs_out_vertices, 16));
attr_stride = nir_align_imm(b, attr_stride, 256);
replacement = nir_imul(b, attr_stride, num_tcs_mem_outputs);
break;
}
case nir_intrinsic_load_clip_half_line_width_amd: {