ac,radv,radeonsi: enable TCS input reads from VGPRs for all compatible loads

Cross-invocation TCS input access doesn't prevent same-invocation access. This improves shaders that use both for the same inputs. Also, if some components of a vec4 slot only use same-invocation access and other components only use cross-invocation access (it's possible after compaction), this takes the VGPR path for the components with same-invocation access, which didn't happen previously because all masks only describe whole vec4s. Reviewed-by: Timur Kristóf <timur.kristof@gmail.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/31673>
2025-12-24 15:20:10 +01:00 · 2024-10-02 16:48:39 -04:00 · 2024-10-02 16:48:39 -04:00 · 85c20def94
commit 85c20def94
parent 99a03dc9d5
16 changed files with 62 additions and 43 deletions
--- a/src/amd/common/ac_nir.h
+++ b/src/amd/common/ac_nir.h
@ -81,15 +81,16 @@ ac_nir_lower_ls_outputs_to_mem(nir_shader *ls,
                               ac_nir_map_io_driver_location map,
                               enum amd_gfx_level gfx_level,
                               bool tcs_in_out_eq,
-                               uint64_t tcs_inputs_read,
-                               uint64_t tcs_temp_only_inputs);
+                               uint64_t tcs_inputs_via_temp,
+                               uint64_t tcs_inputs_via_lds);

 void
 ac_nir_lower_hs_inputs_to_mem(nir_shader *shader,
                              ac_nir_map_io_driver_location map,
                              enum amd_gfx_level gfx_level,
                              bool tcs_in_out_eq,
-                              uint64_t tcs_temp_only_inputs);
+                              uint64_t tcs_inputs_via_temp,
+                              uint64_t tcs_inputs_via_lds);

 void
 ac_nir_lower_hs_outputs_to_mem(nir_shader *shader,
--- a/src/amd/common/ac_nir_lower_tess_io_to_mem.c
+++ b/src/amd/common/ac_nir_lower_tess_io_to_mem.c
@ -1168,19 +1168,24 @@ ac_nir_lower_ls_outputs_to_mem(nir_shader *shader,
                               ac_nir_map_io_driver_location map,
                               enum amd_gfx_level gfx_level,
                               bool tcs_in_out_eq,
-                               uint64_t tcs_inputs_read,
-                               uint64_t tcs_temp_only_inputs)
+                               uint64_t tcs_inputs_via_temp,
+                               uint64_t tcs_inputs_via_lds)
 {
   assert(shader->info.stage == MESA_SHADER_VERTEX);
   assert(gfx_level >= GFX9 || !tcs_in_out_eq);

   lower_tess_io_state state = {
      .gfx_level = gfx_level,
-      .tcs_inputs_via_temp = tcs_in_out_eq ? tcs_temp_only_inputs : 0,
-      .tcs_inputs_via_lds = tcs_inputs_read & (tcs_in_out_eq ? ~tcs_temp_only_inputs : ~0ull),
      .map_io = map,
   };

+   if (tcs_in_out_eq) {
+      state.tcs_inputs_via_temp = tcs_inputs_via_temp;
+      state.tcs_inputs_via_lds = tcs_inputs_via_lds;
+   } else {
+      state.tcs_inputs_via_lds = tcs_inputs_via_lds | tcs_inputs_via_temp;
+   }
+
   nir_shader_intrinsics_pass(shader, lower_ls_output_store,
                                nir_metadata_control_flow,
                                &state);
@ -1191,18 +1196,24 @@ ac_nir_lower_hs_inputs_to_mem(nir_shader *shader,
                              ac_nir_map_io_driver_location map,
                              enum amd_gfx_level gfx_level,
                              bool tcs_in_out_eq,
-                              uint64_t tcs_temp_only_inputs)
+                              uint64_t tcs_inputs_via_temp,
+                              uint64_t tcs_inputs_via_lds)
 {
   assert(shader->info.stage == MESA_SHADER_TESS_CTRL);
   assert(gfx_level >= GFX9 || !tcs_in_out_eq);

   lower_tess_io_state state = {
      .gfx_level = gfx_level,
-      .tcs_inputs_via_temp = tcs_in_out_eq ? tcs_temp_only_inputs : 0,
-      .tcs_inputs_via_lds = shader->info.inputs_read & (tcs_in_out_eq ? ~tcs_temp_only_inputs : ~0ull),
      .map_io = map,
   };

+   if (tcs_in_out_eq) {
+      state.tcs_inputs_via_temp = tcs_inputs_via_temp;
+      state.tcs_inputs_via_lds = tcs_inputs_via_lds;
+   } else {
+      state.tcs_inputs_via_lds = shader->info.inputs_read;
+   }
+
   nir_shader_lower_instructions(shader,
                                 filter_load_tcs_per_vertex_input,
                                 lower_hs_per_vertex_input_load,
--- a/src/amd/compiler/aco_instruction_selection.cpp
+++ b/src/amd/compiler/aco_instruction_selection.cpp
@ -11930,7 +11930,7 @@ select_program_merged(isel_context& ctx, const unsigned shader_count, nir_shader

      /* Skip s_barrier from TCS when VS outputs are not stored in the LDS. */
      const bool tcs_skip_barrier =
-         ctx.stage == vertex_tess_control_hs && ctx.tcs_temp_only_inputs == nir->info.inputs_read;
+         ctx.stage == vertex_tess_control_hs && !ctx.any_tcs_inputs_via_lds;

      /* A barrier is usually needed at the beginning of the second shader, with exceptions. */
      const bool need_barrier = i != 0 && !ngg_gs && !tcs_skip_barrier;
--- a/src/amd/compiler/aco_instruction_selection.h
+++ b/src/amd/compiler/aco_instruction_selection.h
@ -115,7 +115,7 @@ struct isel_context {
   Temp ttmp8;

   /* tessellation information */
-   uint64_t tcs_temp_only_inputs;
+   bool any_tcs_inputs_via_lds = false;
   bool tcs_in_out_eq = false;

   /* Fragment color output information */
--- a/src/amd/compiler/aco_instruction_selection_setup.cpp
+++ b/src/amd/compiler/aco_instruction_selection_setup.cpp
@ -228,7 +228,7 @@ void
 setup_tcs_info(isel_context* ctx)
 {
   ctx->tcs_in_out_eq = ctx->program->info.vs.tcs_in_out_eq;
-   ctx->tcs_temp_only_inputs = ctx->program->info.vs.tcs_temp_only_input_mask;
+   ctx->any_tcs_inputs_via_lds = ctx->program->info.vs.any_tcs_inputs_via_lds;
 }

 void
--- a/src/amd/compiler/aco_shader_info.h
+++ b/src/amd/compiler/aco_shader_info.h
@ -107,7 +107,7 @@ struct aco_shader_info {
   struct ac_arg epilog_pc; /* Vulkan only */
   struct {
      bool tcs_in_out_eq;
-      uint64_t tcs_temp_only_input_mask;
+      bool any_tcs_inputs_via_lds;
      bool has_prolog;
   } vs;
   struct {
--- a/src/amd/vulkan/nir/radv_nir_lower_io.c
+++ b/src/amd/vulkan/nir/radv_nir_lower_io.c
@ -220,7 +220,7 @@ radv_nir_lower_io_to_mem(struct radv_device *device, struct radv_shader_stage *s
   if (nir->info.stage == MESA_SHADER_VERTEX) {
      if (info->vs.as_ls) {
         NIR_PASS_V(nir, ac_nir_lower_ls_outputs_to_mem, map_output, pdev->info.gfx_level, info->vs.tcs_in_out_eq,
-                    info->vs.hs_inputs_read, info->vs.tcs_temp_only_input_mask);
+                    info->vs.tcs_inputs_via_temp, info->vs.tcs_inputs_via_lds);
         return true;
      } else if (info->vs.as_es) {
         NIR_PASS_V(nir, ac_nir_lower_es_outputs_to_mem, map_output, pdev->info.gfx_level, info->esgs_itemsize, info->gs_inputs_read);
@ -228,7 +228,7 @@ radv_nir_lower_io_to_mem(struct radv_device *device, struct radv_shader_stage *s
      }
   } else if (nir->info.stage == MESA_SHADER_TESS_CTRL) {
      NIR_PASS_V(nir, ac_nir_lower_hs_inputs_to_mem, map_input, pdev->info.gfx_level, info->vs.tcs_in_out_eq,
-                 info->vs.tcs_temp_only_input_mask);
+                 info->vs.tcs_inputs_via_temp, info->vs.tcs_inputs_via_lds);
      NIR_PASS_V(nir, ac_nir_lower_hs_outputs_to_mem, map_output, pdev->info.gfx_level, info->tcs.tes_inputs_read,
                 info->tcs.tes_patch_inputs_read, info->wave_size);

--- a/src/amd/vulkan/radv_aco_shader_info.h
+++ b/src/amd/vulkan/radv_aco_shader_info.h
@ -34,11 +34,11 @@ radv_aco_convert_shader_info(struct aco_shader_info *aco_info, const struct radv
   ASSIGN_FIELD(ps.has_epilog);
   ASSIGN_FIELD(merged_shader_compiled_separately);
   ASSIGN_FIELD(vs.tcs_in_out_eq);
-   ASSIGN_FIELD(vs.tcs_temp_only_input_mask);
   ASSIGN_FIELD(vs.has_prolog);
   ASSIGN_FIELD(tcs.num_lds_blocks);
   ASSIGN_FIELD(ps.num_inputs);
   ASSIGN_FIELD(cs.uses_full_subgroups);
+   aco_info->vs.any_tcs_inputs_via_lds = radv->vs.tcs_inputs_via_lds != 0;
   aco_info->ps.spi_ps_input_ena = radv->ps.spi_ps_input_ena;
   aco_info->ps.spi_ps_input_addr = radv->ps.spi_ps_input_addr;
   aco_info->ps.has_prolog = false;
--- a/src/amd/vulkan/radv_shader_info.c
+++ b/src/amd/vulkan/radv_shader_info.c
@ -588,7 +588,7 @@ gather_shader_info_vs(struct radv_device *device, const nir_shader *nir,
   }

   info->gs_inputs_read = ~0ULL;
-   info->vs.hs_inputs_read = ~0ULL;
+   info->vs.tcs_inputs_via_lds = ~0ULL;

   /* Use per-attribute vertex descriptors to prevent faults and for correct bounds checking. */
   info->vs.use_per_attribute_vb_descs = radv_use_per_attribute_vb_descs(nir, gfx_state, stage_key);
@ -1782,7 +1782,7 @@ radv_link_shaders_info(struct radv_device *device, struct radv_shader_stage *pro
      struct radv_shader_stage *vs_stage = producer;
      struct radv_shader_stage *tcs_stage = consumer;

-      vs_stage->info.vs.hs_inputs_read = tcs_stage->nir->info.inputs_read;
+      vs_stage->info.vs.tcs_inputs_via_lds = tcs_stage->nir->info.inputs_read;

      if (gfx_state->ts.patch_control_points) {
         vs_stage->info.workgroup_size =
@ -1804,11 +1804,16 @@ radv_link_shaders_info(struct radv_device *device, struct radv_shader_stage *pro
               gfx_state->ts.patch_control_points == tcs_stage->info.tcs.tcs_vertices_out &&
               vs_stage->nir->info.float_controls_execution_mode == tcs_stage->nir->info.float_controls_execution_mode;

-            if (vs_stage->info.vs.tcs_in_out_eq)
-               vs_stage->info.vs.tcs_temp_only_input_mask =
-                  tcs_stage->nir->info.inputs_read & vs_stage->nir->info.outputs_written &
-                  ~tcs_stage->nir->info.tess.tcs_cross_invocation_inputs_read &
-                  ~tcs_stage->nir->info.inputs_read_indirectly & ~vs_stage->nir->info.outputs_accessed_indirectly;
+            if (vs_stage->info.vs.tcs_in_out_eq) {
+               vs_stage->info.vs.tcs_inputs_via_temp = vs_stage->nir->info.outputs_written &
+                                                       ~vs_stage->nir->info.outputs_accessed_indirectly &
+                                                       tcs_stage->nir->info.tess.tcs_same_invocation_inputs_read;
+               vs_stage->info.vs.tcs_inputs_via_lds = tcs_stage->nir->info.tess.tcs_cross_invocation_inputs_read |
+                                                      (tcs_stage->nir->info.tess.tcs_same_invocation_inputs_read &
+                                                       tcs_stage->nir->info.inputs_read_indirectly) |
+                                                      (tcs_stage->nir->info.tess.tcs_same_invocation_inputs_read &
+                                                       vs_stage->nir->info.outputs_accessed_indirectly);
+            }
         }
      }
   }
--- a/src/amd/vulkan/radv_shader_info.h
+++ b/src/amd/vulkan/radv_shader_info.h
@ -118,7 +118,8 @@ struct radv_shader_info {
      bool as_es;
      bool as_ls;
      bool tcs_in_out_eq;
-      uint64_t tcs_temp_only_input_mask;
+      uint64_t tcs_inputs_via_temp;
+      uint64_t tcs_inputs_via_lds;
      uint8_t num_linked_outputs;
      bool needs_base_instance;
      bool use_per_attribute_vb_descs;
@ -128,7 +129,6 @@ struct radv_shader_info {
      bool dynamic_inputs;
      bool dynamic_num_verts_per_prim;
      uint32_t num_outputs; /* For NGG streamout only */
-      uint64_t hs_inputs_read; /* Mask of HS inputs read (only used by linked LS) */
   } vs;
   struct {
      uint8_t output_usage_mask[VARYING_SLOT_VAR31 + 1];
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@ -1857,8 +1857,8 @@ static bool si_lower_io_to_mem(struct si_shader *shader, nir_shader *nir,
                    is_gfx9_mono_tcs ? NULL : si_map_io_driver_location,
                    sel->screen->info.gfx_level,
                    key->ge.opt.same_patch_vertices,
-                    is_gfx9_mono_tcs ? next_sel->info.base.inputs_read : ~0ull,
-                    tcs_vgpr_only_inputs);
+                    is_gfx9_mono_tcs ? next_sel->info.tcs_inputs_via_temp : 0,
+                    is_gfx9_mono_tcs ? next_sel->info.tcs_inputs_via_lds : ~0ull);
         return true;
      } else if (key->ge.as_es) {
         NIR_PASS_V(nir, ac_nir_lower_es_outputs_to_mem, si_map_io_driver_location,
@ -1868,8 +1868,8 @@ static bool si_lower_io_to_mem(struct si_shader *shader, nir_shader *nir,
   } else if (nir->info.stage == MESA_SHADER_TESS_CTRL) {
      NIR_PASS_V(nir, ac_nir_lower_hs_inputs_to_mem,
                 is_gfx9_mono_tcs ? NULL : si_map_io_driver_location,
-                 sel->screen->info.gfx_level,
-                 key->ge.opt.same_patch_vertices, sel->info.tcs_vgpr_only_inputs);
+                 sel->screen->info.gfx_level, key->ge.opt.same_patch_vertices,
+                 sel->info.tcs_inputs_via_temp, sel->info.tcs_inputs_via_lds);

      /* Used by hs_emit_write_tess_factors() when monolithic shader. */
      if (nir->info.tess._primitive_mode == TESS_PRIMITIVE_UNSPECIFIED)
@ -3644,8 +3644,7 @@ nir_shader *si_get_prev_stage_nir_shader(struct si_shader *shader,

   si_init_shader_args(prev_shader, args);

-   nir_shader *nir = si_get_nir_shader(prev_shader, args, free_nir,
-                                       sel->info.tcs_vgpr_only_inputs, NULL);
+   nir_shader *nir = si_get_nir_shader(prev_shader, args, free_nir, 0, NULL);

   si_update_shader_binary_info(shader, nir);

--- a/src/gallium/drivers/radeonsi/si_shader.h
+++ b/src/gallium/drivers/radeonsi/si_shader.h
@ -482,7 +482,8 @@ struct si_shader_info {
   uint16_t enabled_streamout_buffer_mask;

   uint64_t inputs_read; /* "get_unique_index" bits */
-   uint64_t tcs_vgpr_only_inputs; /* TCS inputs that are only in VGPRs, not LDS. */
+   uint64_t tcs_inputs_via_temp;
+   uint64_t tcs_inputs_via_lds;

   /* For VS before {TCS, TES, GS} and TES before GS. */
   uint64_t ls_es_outputs_written;     /* "get_unique_index" bits */
--- a/src/gallium/drivers/radeonsi/si_shader_aco.c
+++ b/src/gallium/drivers/radeonsi/si_shader_aco.c
@ -87,7 +87,9 @@ si_fill_aco_shader_info(struct si_shader *shader, struct aco_shader_info *info,
   switch (stage) {
   case MESA_SHADER_TESS_CTRL:
      info->vs.tcs_in_out_eq = key->ge.opt.same_patch_vertices;
-      info->vs.tcs_temp_only_input_mask = sel->info.tcs_vgpr_only_inputs;
+      info->vs.any_tcs_inputs_via_lds = sel->info.tcs_inputs_via_lds ||
+                                        (!shader->key.ge.opt.same_patch_vertices &&
+                                         sel->info.tcs_inputs_via_temp);
      info->tcs.tcs_offchip_layout = args->tcs_offchip_layout;
      break;
   case MESA_SHADER_FRAGMENT:
--- a/src/gallium/drivers/radeonsi/si_shader_info.c
+++ b/src/gallium/drivers/radeonsi/si_shader_info.c
@ -702,9 +702,10 @@ void si_nir_scan_shader(struct si_screen *sscreen, const struct nir_shader *nir,
         assert(((info->esgs_vertex_stride / 4) & C_028AAC_ITEMSIZE) == 0);
      }

-      info->tcs_vgpr_only_inputs = ~info->base.tess.tcs_cross_invocation_inputs_read &
-                                   ~info->base.inputs_read_indirectly &
-                                   info->base.inputs_read;
+      info->tcs_inputs_via_temp = info->base.tess.tcs_same_invocation_inputs_read;
+      info->tcs_inputs_via_lds = info->base.tess.tcs_cross_invocation_inputs_read |
+                                 (info->base.tess.tcs_same_invocation_inputs_read &
+                                  info->base.inputs_read_indirectly);
   }

   if (nir->info.stage == MESA_SHADER_GEOMETRY) {
--- a/src/gallium/drivers/radeonsi/si_shader_llvm.c
+++ b/src/gallium/drivers/radeonsi/si_shader_llvm.c
@ -702,8 +702,7 @@ static bool si_llvm_translate_nir(struct si_shader_context *ctx, struct si_shade
      if (ctx->stage == MESA_SHADER_TESS_CTRL) {
         /* We need the barrier only if TCS inputs are read from LDS. */
         if (!shader->key.ge.opt.same_patch_vertices ||
-             shader->selector->info.base.inputs_read &
-             ~shader->selector->info.tcs_vgpr_only_inputs) {
+             shader->selector->info.tcs_inputs_via_lds) {
            ac_build_waitcnt(&ctx->ac, AC_WAIT_DS);

            /* If both input and output patches are wholly in one wave, we don't need a barrier.
--- a/src/gallium/drivers/radeonsi/si_state_shaders.cpp
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.cpp
@ -4661,11 +4661,11 @@ unsigned si_shader_lshs_vertex_stride(struct si_shader *ls)
      assert(tcs->selector->screen->info.gfx_level >= GFX9);

      if (tcs->is_monolithic) {
-         uint64_t lds_inputs_read = tcs->selector->info.base.inputs_read;
+         uint64_t lds_inputs_read = tcs->selector->info.tcs_inputs_via_lds;

-         /* Don't allocate LDS for inputs passed via VGPRs. */
-         if (tcs->key.ge.opt.same_patch_vertices)
-            lds_inputs_read &= ~tcs->selector->info.tcs_vgpr_only_inputs;
+         /* If the TCS in/out number of vertices is different, all inputs are passed via LDS. */
+         if (!tcs->key.ge.opt.same_patch_vertices)
+            lds_inputs_read |= tcs->selector->info.tcs_inputs_via_temp;

         /* NIR lowering passes pack LS outputs/HS inputs if the usage masks of both are known. */
         num_slots = util_bitcount64(lds_inputs_read);