ac/nir: reserve the first LDS vec4 for the HS tf0/1 group vote in TCS

Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/31673>
2026-05-05 09:38:07 +02:00 · 2024-09-29 17:30:49 -04:00 · 2024-09-29 17:30:49 -04:00 · f4eebb373c
commit f4eebb373c
parent fd5779c198
5 changed files with 38 additions and 10 deletions
--- a/src/amd/common/ac_nir.h
+++ b/src/amd/common/ac_nir.h
@@ -17,6 +17,9 @@
 extern "C" {
 #endif

+/* Reserve this size at the beginning of LDS for the tf0/1 shader message group vote. */
+#define AC_HS_MSG_VOTE_LDS_BYTES 16
+
 enum
 {
   /* SPI_PS_INPUT_CNTL_i.OFFSET[0:4] */
@@ -76,6 +79,7 @@ bool ac_nir_optimize_outputs(nir_shader *nir, bool sprite_tex_disallowed,
 void
 ac_nir_lower_ls_outputs_to_mem(nir_shader *ls,
                               ac_nir_map_io_driver_location map,
+                               enum amd_gfx_level gfx_level,
                               bool tcs_in_out_eq,
                               uint64_t tcs_inputs_read,
                               uint64_t tcs_temp_only_inputs);
@@ -83,6 +87,7 @@ ac_nir_lower_ls_outputs_to_mem(nir_shader *ls,
 void
 ac_nir_lower_hs_inputs_to_mem(nir_shader *shader,
                              ac_nir_map_io_driver_location map,
+                              enum amd_gfx_level gfx_level,
                              bool tcs_in_out_eq,
                              uint64_t tcs_temp_only_inputs);

--- a/src/amd/common/ac_nir_lower_tess_io_to_mem.c
+++ b/src/amd/common/ac_nir_lower_tess_io_to_mem.c
@@ -61,8 +61,8 @@
 * TCS per-vertex inputs for patch 1
 * TCS per-vertex inputs for patch 2  <─── hs_per_vertex_input_lds_offset (rel_patch_id = 2)
 * ...
- * TCS per-vertex outputs for patch 0 <─── output_patch0_offset
- * TCS per-patch outputs for patch 0  <─── output_patch0_patch_data_offset
+ * TCS per-vertex outputs for patch 0 <─── hs_output_lds_offset (rel_patch_id = 0, per-vertex)
+ * TCS per-patch outputs for patch 0  <─── hs_output_lds_offset (rel_patch_id = 0, per-patch)
 * TCS per-vertex outputs for patch 1
 * TCS per-patch outputs for patch 1
 * TCS per-vertex outputs for patch 2 <─── hs_output_lds_offset (rel_patch_id = 2, per-vertex)
@@ -284,6 +284,11 @@ lower_ls_output_store(nir_builder *b,
   unsigned write_mask = nir_intrinsic_write_mask(intrin);

   nir_def *off = nir_iadd_nuw(b, base_off_var, io_off);
+
+   /* The first vec4 is reserved for the tf0/1 shader message group vote. */
+   if (st->gfx_level >= GFX11)
+      off = nir_iadd_imm_nuw(b, off, AC_HS_MSG_VOTE_LDS_BYTES);
+
   AC_NIR_STORE_IO(b, intrin->src[0].ssa, 0, write_mask, io_sem.high_16bits,
                   nir_store_shared, off, .write_mask = store_write_mask, .base = store_const_offset);

@@ -354,8 +359,10 @@ hs_per_vertex_input_lds_offset(nir_builder *b,
   const unsigned mapped = ac_nir_map_io_location(io_sem.location, st->tcs_inputs_read & ~st->tcs_temp_only_inputs,
                                                  st->map_io);
   nir_def *io_offset = ac_nir_calc_io_off(b, instr, nir_imm_int(b, 16u), 4u, mapped);
+   nir_def *lds_offset = nir_iadd_nuw(b, nir_iadd_nuw(b, tcs_in_current_patch_offset, vertex_index_off), io_offset);

-   return nir_iadd_nuw(b, nir_iadd_nuw(b, tcs_in_current_patch_offset, vertex_index_off), io_offset);
+   /* The first LDS vec4 is reserved for the tf0/1 shader message group vote. */
+   return st->gfx_level >= GFX11 ? nir_iadd_imm_nuw(b, lds_offset, AC_HS_MSG_VOTE_LDS_BYTES) : lds_offset;
 }

 static unsigned
@@ -419,17 +426,21 @@ hs_output_lds_offset(nir_builder *b,
   nir_def *input_patch_size = nir_imul(b, tcs_in_vtxcnt, nir_load_lshs_vertex_stride_amd(b));
   nir_def *output_patch0_offset = nir_imul(b, input_patch_size, tcs_num_patches);
   nir_def *output_patch_offset = nir_iadd_nuw(b, patch_offset, output_patch0_offset);
+   nir_def *lds_offset;

   if (per_vertex) {
      nir_def *vertex_index = nir_get_io_arrayed_index_src(intrin)->ssa;
      nir_def *vertex_index_off = nir_imul_imm(b, vertex_index, output_vertex_size);

      off = nir_iadd_nuw(b, off, vertex_index_off);
-      return nir_iadd_nuw(b, off, output_patch_offset);
+      lds_offset = nir_iadd_nuw(b, off, output_patch_offset);
   } else {
      off = nir_iadd_imm_nuw(b, off, pervertex_output_patch_size);
-      return nir_iadd_nuw(b, off, output_patch_offset);
+      lds_offset = nir_iadd_nuw(b, off, output_patch_offset);
   }
+
+   /* The first LDS vec4 is reserved for the tf0/1 shader message group vote. */
+   return st->gfx_level >= GFX11 ? nir_iadd_imm_nuw(b, lds_offset, AC_HS_MSG_VOTE_LDS_BYTES) : lds_offset;
 }

 static unsigned
@@ -963,6 +974,7 @@ filter_any_input_access(const nir_instr *instr,
 void
 ac_nir_lower_ls_outputs_to_mem(nir_shader *shader,
                               ac_nir_map_io_driver_location map,
+                               enum amd_gfx_level gfx_level,
                               bool tcs_in_out_eq,
                               uint64_t tcs_inputs_read,
                               uint64_t tcs_temp_only_inputs)
@@ -970,6 +982,7 @@ ac_nir_lower_ls_outputs_to_mem(nir_shader *shader,
   assert(shader->info.stage == MESA_SHADER_VERTEX);

   lower_tess_io_state state = {
+      .gfx_level = gfx_level,
      .tcs_in_out_eq = tcs_in_out_eq,
      .tcs_inputs_read = tcs_inputs_read,
      .tcs_temp_only_inputs = tcs_in_out_eq ? tcs_temp_only_inputs : 0,
@@ -984,12 +997,14 @@ ac_nir_lower_ls_outputs_to_mem(nir_shader *shader,
 void
 ac_nir_lower_hs_inputs_to_mem(nir_shader *shader,
                              ac_nir_map_io_driver_location map,
+                              enum amd_gfx_level gfx_level,
                              bool tcs_in_out_eq,
                              uint64_t tcs_temp_only_inputs)
 {
   assert(shader->info.stage == MESA_SHADER_TESS_CTRL);

   lower_tess_io_state state = {
+      .gfx_level = gfx_level,
      .tcs_inputs_read = shader->info.inputs_read,
      .tcs_in_out_eq = tcs_in_out_eq,
      .tcs_temp_only_inputs = tcs_in_out_eq ? tcs_temp_only_inputs : 0,
--- a/src/amd/common/ac_shader_util.c
+++ b/src/amd/common/ac_shader_util.c
@@ -6,6 +6,7 @@

 #include "ac_shader_util.h"
 #include "ac_gpu_info.h"
+#include "ac_nir.h"

 #include "sid.h"
 #include "util/u_math.h"
@@ -1209,7 +1210,11 @@ uint32_t ac_compute_num_tess_patches(const struct radeon_info *info, uint32_t nu
 uint32_t
 ac_compute_tess_lds_size(const struct radeon_info *info, uint32_t lds_per_patch, uint32_t num_patches)
 {
-   const unsigned lds_size = lds_per_patch * num_patches;
+   unsigned lds_size = lds_per_patch * num_patches;
+
+   /* The first vec4 is reserved for the tf0/1 shader message group vote. */
+   if (info->gfx_level >= GFX11)
+      lds_size += AC_HS_MSG_VOTE_LDS_BYTES;

   assert(lds_size <= (info->gfx_level >= GFX9 ? 65536 : 32768));

--- a/src/amd/vulkan/nir/radv_nir_lower_io.c
+++ b/src/amd/vulkan/nir/radv_nir_lower_io.c
@@ -219,7 +219,7 @@ radv_nir_lower_io_to_mem(struct radv_device *device, struct radv_shader_stage *s

   if (nir->info.stage == MESA_SHADER_VERTEX) {
      if (info->vs.as_ls) {
-         NIR_PASS_V(nir, ac_nir_lower_ls_outputs_to_mem, map_output, info->vs.tcs_in_out_eq,
+         NIR_PASS_V(nir, ac_nir_lower_ls_outputs_to_mem, map_output, pdev->info.gfx_level, info->vs.tcs_in_out_eq,
                    info->vs.hs_inputs_read, info->vs.tcs_temp_only_input_mask);
         return true;
      } else if (info->vs.as_es) {
@@ -227,9 +227,10 @@ radv_nir_lower_io_to_mem(struct radv_device *device, struct radv_shader_stage *s
         return true;
      }
   } else if (nir->info.stage == MESA_SHADER_TESS_CTRL) {
-      NIR_PASS_V(nir, ac_nir_lower_hs_inputs_to_mem, map_input, info->vs.tcs_in_out_eq, info->vs.tcs_temp_only_input_mask);
-      NIR_PASS_V(nir, ac_nir_lower_hs_outputs_to_mem, map_output, pdev->info.gfx_level,
-                 info->tcs.tes_inputs_read, info->tcs.tes_patch_inputs_read, info->wave_size);
+      NIR_PASS_V(nir, ac_nir_lower_hs_inputs_to_mem, map_input, pdev->info.gfx_level, info->vs.tcs_in_out_eq,
+                 info->vs.tcs_temp_only_input_mask);
+      NIR_PASS_V(nir, ac_nir_lower_hs_outputs_to_mem, map_output, pdev->info.gfx_level, info->tcs.tes_inputs_read,
+                 info->tcs.tes_patch_inputs_read, info->wave_size);

      return true;
   } else if (nir->info.stage == MESA_SHADER_TESS_EVAL) {
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -1855,6 +1855,7 @@ static bool si_lower_io_to_mem(struct si_shader *shader, nir_shader *nir,
      if (key->ge.as_ls) {
         NIR_PASS_V(nir, ac_nir_lower_ls_outputs_to_mem,
                    is_gfx9_mono_tcs ? NULL : si_map_io_driver_location,
+                    sel->screen->info.gfx_level,
                    key->ge.opt.same_patch_vertices,
                    is_gfx9_mono_tcs ? next_sel->info.base.inputs_read : ~0ull,
                    tcs_vgpr_only_inputs);
@@ -1867,6 +1868,7 @@ static bool si_lower_io_to_mem(struct si_shader *shader, nir_shader *nir,
   } else if (nir->info.stage == MESA_SHADER_TESS_CTRL) {
      NIR_PASS_V(nir, ac_nir_lower_hs_inputs_to_mem,
                 is_gfx9_mono_tcs ? NULL : si_map_io_driver_location,
+                 sel->screen->info.gfx_level,
                 key->ge.opt.same_patch_vertices, sel->info.tcs_vgpr_only_inputs);

      /* Used by hs_emit_write_tess_factors() when monolithic shader. */