aco: When LS and HS invocations are the same, pass LS outputs in temps.

We know that in this case, the LS and HS invocations are working on the exact same vertex, so it's safe to skip the LDS.

Totals:
VGPRS: 3960744 -> 3961844 (0.03 %)
Code Size: 254824300 -> 254764624 (-0.02 %) bytes
Max Waves: 1053748 -> 1053574 (-0.02 %)

Totals from affected shaders:
VGPRS: 26152 -> 27252 (4.21 %)
Code Size: 1496600 -> 1436924 (-3.99 %) bytes
Max Waves: 4860 -> 4686 (-3.58 %)

Signed-off-by: Timur Kristóf <timur.kristof@gmail.com>
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/4165>
2026-02-11 05:10:27 +01:00 · 2020-03-26 17:45:55 +01:00 · 2020-03-26 17:45:55 +01:00 · 798dd98d6e
commit 798dd98d6e
parent 0a91c086b8
1 changed files with 35 additions and 0 deletions
--- a/src/amd/compiler/aco_instruction_selection.cpp
+++ b/src/amd/compiler/aco_instruction_selection.cpp
@ -3329,6 +3329,34 @@ bool store_output_to_temps(isel_context *ctx, nir_intrinsic_instr *instr)
   return true;
 }

+bool load_input_from_temps(isel_context *ctx, nir_intrinsic_instr *instr, Temp dst)
+{
+   /* Tries to serve a TCS per-vertex input load from ctx->inputs.temps: returns true if dst was written, false if nothing was emitted.
+    * Only TCS per-vertex inputs are supported; they only match between the VS/TCS invocation id when the number of invocations is the same.
+    */
+   if (ctx->shader->info.stage != MESA_SHADER_TESS_CTRL || !ctx->tcs_in_out_eq)
+      return false;
+
+   nir_src *off_src = nir_get_io_offset_src(instr);
+   nir_src *vertex_index_src = nir_get_io_vertex_index_src(instr);
+   nir_instr *vertex_index_instr = vertex_index_src->ssa->parent_instr; /* instruction that produces the vertex index */
+   bool can_use_temps = nir_src_is_const(*off_src) &&
+                        vertex_index_instr->type == nir_instr_type_intrinsic &&
+                        nir_instr_as_intrinsic(vertex_index_instr)->intrinsic == nir_intrinsic_load_invocation_id;
+
+   if (!can_use_temps) /* requires a constant offset and a vertex index that comes directly from load_invocation_id */
+      return false;
+
+   unsigned idx = nir_intrinsic_base(instr) + nir_intrinsic_component(instr) + 4 * nir_src_as_uint(*off_src); /* slot in ctx->inputs.temps; each unit of constant offset advances it by 4 */
+   Temp *src = &ctx->inputs.temps[idx];
+   Temp vec = create_vec_from_array(ctx, src, dst.size(), dst.regClass().type(), 4u); /* presumably packs the temps starting at src into one vector of dst's size -- confirm against create_vec_from_array */
+   assert(vec.size() == dst.size());
+
+   Builder bld(ctx->program, ctx->block);
+   bld.copy(Definition(dst), vec); /* write the assembled vector into the load's destination */
+   return true;
+}
+
 void visit_store_ls_or_es_output(isel_context *ctx, nir_intrinsic_instr *instr)
 {
   Builder bld(ctx->program, ctx->block);
@ -3338,6 +3366,9 @@ void visit_store_ls_or_es_output(isel_context *ctx, nir_intrinsic_instr *instr)
   unsigned write_mask = nir_intrinsic_write_mask(instr);
   unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8u;

+   if (ctx->tcs_in_out_eq)
+      store_output_to_temps(ctx, instr);
+
   if (ctx->stage == vertex_es || ctx->stage == tess_eval_es) {
      /* GFX6-8: ES stage is not merged into GS, data is passed from ES to GS in VMEM. */
      Temp esgs_ring = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), ctx->program->private_segment_buffer, Operand(RING_ESGS_VS * 16u));
@ -3974,6 +4005,10 @@ void visit_load_tcs_per_vertex_input(isel_context *ctx, nir_intrinsic_instr *ins

   Builder bld(ctx->program, ctx->block);
   Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
+
+   if (load_input_from_temps(ctx, instr, dst))
+      return;
+
   std::pair<Temp, unsigned> offs = get_tcs_per_vertex_input_lds_offset(ctx, instr);
   unsigned elem_size_bytes = instr->dest.ssa.bit_size / 8;
   unsigned lds_align = calculate_lds_alignment(ctx, offs.second);