asahi: stop merging VS and TCS
unfortunately, shader stage merging is bogus when coherent images are used, so we need an unmerged path. i'd rather not maintain two paths, so let's just stop merging. as a bonus this makes ESO a lot easier, and lets us reuse the same VS for both VS->GS and VS->TCS.

Signed-off-by: Alyssa Rosenzweig <alyssa@rosenzweig.io>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/28483>
This commit is contained in:
parent 351698d165
commit 72ef80dfc8

10 changed files with 120 additions and 212 deletions
@@ -167,6 +167,22 @@ load_instance_id(nir_builder *b)
    return nir_channel(b, nir_load_global_invocation_id(b, 32), 1);
 }

+nir_def *
+agx_load_per_vertex_input(nir_builder *b, nir_intrinsic_instr *intr,
+                          nir_def *vertex)
+{
+   assert(intr->intrinsic == nir_intrinsic_load_per_vertex_input);
+   nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
+
+   nir_def *addr = libagx_vertex_output_address(
+      b, nir_load_vs_output_buffer_agx(b), nir_load_vs_outputs_agx(b), vertex,
+      nir_iadd_imm(b, intr->src[1].ssa, sem.location));
+
+   addr = nir_iadd_imm(b, addr, 4 * nir_intrinsic_component(intr));
+   return nir_load_global_constant(b, addr, 4, intr->def.num_components,
+                                   intr->def.bit_size);
+}
+
 static bool
 lower_gs_inputs(nir_builder *b, nir_intrinsic_instr *intr, void *_)
 {
@@ -174,9 +190,6 @@ lower_gs_inputs(nir_builder *b, nir_intrinsic_instr *intr, void *_)
       return false;

    b->cursor = nir_instr_remove(&intr->instr);
-   nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
-
-   nir_def *location = nir_iadd_imm(b, intr->src[1].ssa, sem.location);

    /* Calculate the vertex ID we're pulling, based on the topology class */
    nir_def *vert_in_prim = intr->src[0].ssa;
@@ -192,16 +205,7 @@ lower_gs_inputs(nir_builder *b, nir_intrinsic_instr *intr, void *_)
                                 load_geometry_param(b, input_vertices)),
       vertex);

-   /* Calculate the address of the input given the unrolled vertex ID */
-   nir_def *addr = libagx_vertex_output_address(
-      b, nir_load_geometry_param_buffer_agx(b), unrolled, location,
-      load_geometry_param(b, vs_outputs));
-
-   assert(intr->def.bit_size == 32);
-   addr = nir_iadd_imm(b, addr, nir_intrinsic_component(intr) * 4);
-
-   nir_def *val = nir_load_global_constant(b, addr, 4, intr->def.num_components,
-                                           intr->def.bit_size);
+   nir_def *val = agx_load_per_vertex_input(b, intr, unrolled);
    nir_def_rewrite_uses(&intr->def, val);
    return true;
 }
@@ -1312,9 +1316,13 @@ lower_vs_before_gs(nir_builder *b, nir_intrinsic_instr *intr, void *data)
    nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
    nir_def *location = nir_iadd_imm(b, intr->src[1].ssa, sem.location);

+   /* We inline the outputs_written because it's known at compile-time, even
+    * with shader objects. This lets us constant fold a bit of address math.
+    */
+   nir_def *mask = nir_imm_int64(b, b->shader->info.outputs_written);
+
    nir_def *addr = libagx_vertex_output_address(
-      b, nir_load_geometry_param_buffer_agx(b), calc_unrolled_id(b), location,
-      nir_imm_int64(b, b->shader->info.outputs_written));
+      b, nir_load_vs_output_buffer_agx(b), mask, calc_unrolled_id(b), location);

    assert(nir_src_bit_size(intr->src[0]) == 32);
    addr = nir_iadd_imm(b, addr, nir_intrinsic_component(intr) * 4);
@@ -1377,6 +1385,8 @@ agx_nir_gs_setup_indirect(nir_builder *b, const void *data)

    libagx_gs_setup_indirect(b, nir_load_geometry_param_buffer_agx(b),
                             nir_load_input_assembly_buffer_agx(b),
+                            nir_load_vs_output_buffer_ptr_agx(b),
+                            nir_load_vs_outputs_agx(b),
                             nir_imm_int(b, key->prim),
                             nir_channel(b, nir_load_local_invocation_id(b), 0));
 }
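The new agx_load_per_vertex_input helper reduces every cross-stage input read to one address computation against the shared output buffer. As a rough model of that math, here is an illustrative standalone C sketch; it is not the actual libagx_tcs_in_offs implementation, and the 16-byte vec4 slot size is an assumption inferred from the 4-byte component stride above:

#include <stdint.h>

/* Hypothetical model: address of (vertex, location, component) in the
 * VS->TCS / VS->GS output buffer. Only locations set in `mask` get a slot,
 * so the per-vertex stride is popcount(mask) slots of 16 bytes each.
 */
static uint64_t
model_vertex_output_address(uint64_t buffer, uint64_t mask, uint32_t vertex,
                            unsigned location, unsigned component)
{
   unsigned slots = __builtin_popcountll(mask);
   unsigned slot = __builtin_popcountll(mask & ((1ull << location) - 1));

   return buffer + ((uint64_t)vertex * slots + slot) * 16 + component * 4;
}

Because only the mask feeds the layout, the same addressing works whether the mask is folded to a constant (pipelines) or loaded as a sysval (shader objects).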
@@ -7,15 +7,11 @@
 #include <stdbool.h>
 #include <stdint.h>
 #include "nir.h"
 #include "shader_enums.h"

-struct nir_shader;
-enum mesa_prim;
-
-struct nir_instr;
 struct nir_builder;
 struct nir_variable;

 struct agx_lower_output_to_var_state {
    struct nir_variable *outputs[NUM_TOTAL_VARYING_SLOTS];
 };
@@ -23,6 +19,10 @@ struct agx_lower_output_to_var_state {
 bool agx_lower_output_to_var(struct nir_builder *b, struct nir_instr *instr,
                              void *data);

+struct nir_def *agx_load_per_vertex_input(struct nir_builder *b,
+                                          nir_intrinsic_instr *intr,
+                                          struct nir_def *vertex);
+
 struct nir_def *agx_vertex_id_for_topology_class(struct nir_builder *b,
                                                  struct nir_def *vert,
                                                  enum mesa_prim clas);
@@ -54,8 +54,7 @@ struct agx_unroll_restart_key {

 void agx_nir_unroll_restart(struct nir_builder *b, const void *key);

-bool agx_nir_lower_tcs(struct nir_shader *tcs, const struct nir_shader *vs,
-                       const struct nir_shader *libagx, uint8_t index_size_B);
+bool agx_nir_lower_tcs(struct nir_shader *tcs, const struct nir_shader *libagx);

 bool agx_nir_lower_tes(struct nir_shader *tes, const struct nir_shader *libagx);
@@ -16,11 +16,6 @@
 #include "nir_intrinsics_indices.h"
 #include "shader_enums.h"

-struct tcs_state {
-   struct agx_lower_output_to_var_state vs_vars;
-   uint64_t vs_outputs_written;
-};
-
 static nir_def *
 tcs_patch_id(nir_builder *b)
 {
@@ -99,23 +94,18 @@ lower_tes_load(nir_builder *b, nir_intrinsic_instr *intr)
 }

 static nir_def *
-tcs_load_input(nir_builder *b, nir_intrinsic_instr *intr,
-               struct tcs_state *state)
+tcs_load_input(nir_builder *b, nir_intrinsic_instr *intr)
 {
-   nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
+   nir_def *base = nir_imul(
+      b, tcs_unrolled_id(b),
+      libagx_tcs_patch_vertices_in(b, nir_load_tess_param_buffer_agx(b)));
+   nir_def *vertex = nir_iadd(b, base, intr->src[0].ssa);

-   nir_def *off = libagx_tcs_in_offset(
-      b, intr->src[0].ssa, nir_iadd_imm(b, intr->src[1].ssa, sem.location),
-      nir_imm_int64(b, state->vs_outputs_written));
-
-   off = nir_iadd_imm(b, off, 4 * nir_intrinsic_component(intr));
-
-   return nir_load_shared(b, intr->def.num_components, 32, off);
+   return agx_load_per_vertex_input(b, intr, vertex);
 }

 static nir_def *
-lower_tcs_impl(nir_builder *b, nir_intrinsic_instr *intr,
-               struct tcs_state *state)
+lower_tcs_impl(nir_builder *b, nir_intrinsic_instr *intr)
 {
    switch (intr->intrinsic) {
    case nir_intrinsic_barrier:
@@ -132,7 +122,7 @@ lower_tcs_impl(nir_builder *b, nir_intrinsic_instr *intr)
       return nir_channel(b, nir_load_local_invocation_id(b), 0);

    case nir_intrinsic_load_per_vertex_input:
-      return tcs_load_input(b, intr, state);
+      return tcs_load_input(b, intr);

    case nir_intrinsic_load_patch_vertices_in:
       return libagx_tcs_patch_vertices_in(b, nir_load_tess_param_buffer_agx(b));
@@ -179,7 +169,7 @@ lower_tcs(nir_builder *b, nir_intrinsic_instr *intr, void *data)
 {
    b->cursor = nir_before_instr(&intr->instr);

-   nir_def *repl = lower_tcs_impl(b, intr, data);
+   nir_def *repl = lower_tcs_impl(b, intr);
    if (!repl)
       return false;

@@ -208,102 +198,13 @@ link_libagx(nir_shader *nir, const nir_shader *libagx)
                               nir_address_format_62bit_generic);
 }

-/*
- * Predicate the TCS so the merged shader works when input patch size > output
- * patch size.
- */
-static bool
-agx_nir_predicate_tcs(nir_shader *tcs)
-{
-   nir_function_impl *entry = nir_shader_get_entrypoint(tcs);
-   nir_cf_list list;
-   nir_cf_extract(&list, nir_before_impl(entry), nir_after_impl(entry));
-
-   nir_builder b = nir_builder_at(nir_after_block(nir_start_block(entry)));
-   nir_def *input_vtx_id = nir_load_invocation_id(&b);
-   unsigned verts = tcs->info.tess.tcs_vertices_out;
-
-   nir_push_if(&b, nir_ult_imm(&b, input_vtx_id, verts));
-   {
-      nir_cf_reinsert(&list, b.cursor);
-   }
-   nir_pop_if(&b, NULL);
-
-   nir_metadata_preserve(entry, nir_metadata_none);
-   return false;
-}
-
 bool
-agx_nir_lower_tcs(nir_shader *tcs, const nir_shader *vs,
-                  const struct nir_shader *libagx, uint8_t index_size_B)
+agx_nir_lower_tcs(nir_shader *tcs, const struct nir_shader *libagx)
 {
-   agx_nir_predicate_tcs(tcs);
+   nir_shader_intrinsics_pass(
+      tcs, lower_tcs, nir_metadata_block_index | nir_metadata_dominance, NULL);

-   nir_function_impl *tcs_entry = nir_shader_get_entrypoint(tcs);
-
-   /* Link the vertex shader with the TCS. This assumes that all functions have
-    * been inlined in the vertex shader.
-    */
-   nir_function_impl *vs_entry = nir_shader_get_entrypoint(vs);
-   nir_function *vs_function = nir_function_create(tcs, "vertex");
-   vs_function->impl = nir_function_impl_clone(tcs, vs_entry);
-   vs_function->impl->function = vs_function;
-
-   /* Vertex shader outputs are staged to temporaries */
-   struct tcs_state state = {
-      .vs_outputs_written = vs->info.outputs_written & tcs->info.inputs_read,
-   };
-
-   u_foreach_bit64(slot, vs->info.outputs_written) {
-      const char *slot_name =
-         gl_varying_slot_name_for_stage(slot, MESA_SHADER_VERTEX);
-
-      state.vs_vars.outputs[slot] = nir_variable_create(
-         tcs, nir_var_shader_temp, glsl_uvec4_type(), slot_name);
-   }
-
-   nir_function_instructions_pass(
-      vs_function->impl, agx_lower_output_to_var,
-      nir_metadata_block_index | nir_metadata_dominance, &state.vs_vars);
-
-   /* Invoke the VS first for each vertex in the input patch */
-   nir_builder b_ = nir_builder_at(nir_before_impl(tcs_entry));
-   nir_builder *b = &b_;
-
-   nir_def *input_vtx_id = nir_load_invocation_id(b);
-   nir_push_if(b, nir_ult(b, input_vtx_id, nir_load_patch_vertices_in(b)));
-   {
-      nir_inline_function_impl(b, vs_function->impl, NULL, NULL);
-
-      /* To handle cross-invocation VS output reads, dump everything in
-       * shared local memory.
-       *
-       * TODO: Optimize to registers.
-       */
-      u_foreach_bit64(slot, state.vs_outputs_written) {
-         nir_def *off =
-            libagx_tcs_in_offset(b, input_vtx_id, nir_imm_int(b, slot),
-                                 nir_imm_int64(b, state.vs_outputs_written));
-
-         nir_store_shared(b, nir_load_var(b, state.vs_vars.outputs[slot]), off,
-                          .write_mask = nir_component_mask(4));
-      }
-   }
-   nir_pop_if(b, NULL);
-
-   /* Clean up after inlining VS into TCS */
-   exec_node_remove(&vs_function->node);
-   nir_lower_global_vars_to_local(tcs);
-
-   /* Lower I/A. TODO: Indirect multidraws */
-   agx_nir_lower_index_buffer(tcs, index_size_B, true);
-
-   /* Lower TCS outputs */
-   nir_shader_intrinsics_pass(tcs, lower_tcs,
-                              nir_metadata_block_index | nir_metadata_dominance,
-                              &state);
    link_libagx(tcs, libagx);
-   nir_metadata_preserve(b->impl, nir_metadata_none);
    return true;
 }
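With the merged path gone, the TCS no longer inlines the VS and stages its outputs in shared memory; tcs_load_input instead flattens (patch, vertex-in-patch) into a global vertex index and reads the VS output buffer through agx_load_per_vertex_input. A plain C sketch of that index math, mirroring the nir_imul/nir_iadd above (names here are illustrative):

#include <stdint.h>

/* Each patch owns patch_vertices_in consecutive vertices in the buffer,
 * so the flat vertex index is patch-major.
 */
static uint32_t
model_tcs_input_vertex(uint32_t unrolled_patch_id, uint32_t patch_vertices_in,
                       uint32_t vertex_in_patch)
{
   return unrolled_patch_id * patch_vertices_in + vertex_in_patch;
}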
@@ -425,8 +425,9 @@ libagx_build_gs_draw(global struct agx_geometry_params *p, bool indexed,

 void
 libagx_gs_setup_indirect(global struct agx_geometry_params *p,
-                         global struct agx_ia_state *ia, enum mesa_prim mode,
-                         uint local_id)
+                         global struct agx_ia_state *ia,
+                         global uintptr_t *vertex_buffer, uint64_t vs_outputs,
+                         enum mesa_prim mode, uint local_id)
 {
    global uint *in_draw = (global uint *)ia->draws;

@@ -463,13 +464,13 @@ libagx_gs_setup_indirect(global struct agx_geometry_params *p,
    global struct agx_geometry_state *state = p->state;

    uint vertex_buffer_size =
-      libagx_tcs_in_size(vertex_count * instance_count, p->vs_outputs);
+      libagx_tcs_in_size(vertex_count * instance_count, vs_outputs);

    p->count_buffer = (global uint *)(state->heap + state->heap_bottom);
    state->heap_bottom +=
       align(p->input_primitives * p->count_buffer_stride, 16);

-   p->vertex_buffer = (global uint *)(state->heap + state->heap_bottom);
+   *vertex_buffer = (uintptr_t)(state->heap + state->heap_bottom);
    state->heap_bottom += align(vertex_buffer_size, 4);
 }
@@ -518,9 +519,8 @@ libagx_is_provoking_last(global struct agx_ia_state *ia)
 }

 uintptr_t
-libagx_vertex_output_address(constant struct agx_geometry_params *p, uint vtx,
-                             gl_varying_slot location, uint64_t vs_outputs)
+libagx_vertex_output_address(uintptr_t buffer, uint64_t mask, uint vtx,
+                             gl_varying_slot location)
 {
-   return (uintptr_t)p->vertex_buffer +
-          libagx_tcs_in_offs(vtx, location, vs_outputs);
+   return buffer + libagx_tcs_in_offs(vtx, location, mask);
 }
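libagx_gs_setup_indirect now returns the vertex buffer through an explicit out-pointer instead of stashing it in the geometry params, but the allocation itself is still the same bump-pointer carve-out from the geometry state heap. A minimal C model of that pattern, with assumed and simplified semantics relative to the kernel above:

#include <stdint.h>

struct model_heap {
   uint8_t *base;
   uint32_t bottom;
};

/* Hand out the current heap bottom, then bump it by the size rounded up to
 * the alignment, mirroring the state->heap_bottom += align(...) lines above.
 */
static void *
model_heap_alloc(struct model_heap *h, uint32_t size, uint32_t align)
{
   void *ptr = h->base + h->bottom;
   h->bottom += (size + align - 1) & ~(align - 1);
   return ptr;
}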
@@ -85,9 +85,6 @@ struct agx_geometry_params {
    /* Address of associated indirect draw buffer */
    GLOBAL(uint) indirect_desc;

-   /* Address of vertex shader output buffer */
-   GLOBAL(uchar) vertex_buffer;
-
    /* Address of count buffer. For an indirect draw, this will be written by the
     * indirect setup kernel.
     */
@@ -113,9 +110,6 @@ struct agx_geometry_params {
    */
    GLOBAL(uchar) xfb_base[MAX_SO_BUFFERS];

-   /* Bitfield of VS outputs. TODO: Optimize linked shaders. */
-   uint64_t vs_outputs;
-
    /* Location-indexed mask of flat outputs, used for lowering GL edge flags. */
    uint64_t flat_outputs;

@@ -157,7 +151,7 @@ struct agx_geometry_params {
    */
    uint32_t input_topology;
 } PACKED;
-AGX_STATIC_ASSERT(sizeof(struct agx_geometry_params) == 83 * 4);
+AGX_STATIC_ASSERT(sizeof(struct agx_geometry_params) == 79 * 4);

 struct agx_tess_params {
    /* Persistent (cross-draw) geometry state */
@@ -1947,6 +1947,17 @@ barrier("fence_pbe_to_tex_pixel_agx")
 # Unknown fence used in the helper program on exit.
 barrier("fence_helper_exit_agx")

+# Pointer to the buffer passing outputs VS->TCS, VS->GS, or TES->GS linkage.
+system_value("vs_output_buffer_agx", 1, bit_sizes=[64])
+
+# Indirect for the above, used for indirect draws.
+system_value("vs_output_buffer_ptr_agx", 1, bit_sizes=[64])
+
+# Mask of VS->TCS, VS->GS, or TES->GS outputs. This is modelled as a sysval
+# directly so it can be dynamic with shader objects or constant folded with
+# pipelines (including GPL)
+system_value("vs_outputs_agx", 1, bit_sizes=[64])
+
 # Address of state for AGX input assembly lowering for geometry/tessellation
 system_value("input_assembly_buffer_agx", 1, bit_sizes=[64])
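The point of modelling the mask as a sysval is that one lowering can serve both binding models: a monolithic pipeline folds it to an immediate (as lower_vs_before_gs does in the first file), while separate shader objects read it at draw time. Sketched as NIR builder calls in a hypothetical helper (both callees exist; the helper and its flag are illustrative only):

static nir_def *
vs_outputs_mask(nir_builder *b, bool known_at_compile_time)
{
   /* Pipelines: outputs_written is baked in, so constant fold the mask */
   if (known_at_compile_time)
      return nir_imm_int64(b, b->shader->info.outputs_written);

   /* Shader objects: read the mask as a draw-time sysval */
   return nir_load_vs_outputs_agx(b);
}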
@@ -23,18 +23,6 @@
  * 2. Images (read/write interleaved)
  */

-/*
- * We only support VS/TCS merging, so we lower TCS samplers to bindless and let
- * VS have exclusive binding table access.
- *
- * This could be optimized but it should be good enough for now.
- */
-static bool
-agx_stage_needs_bindless(enum pipe_shader_type stage)
-{
-   return stage == MESA_SHADER_TESS_CTRL;
-}
-
 static bool
 lower_sampler(nir_builder *b, nir_tex_instr *tex)
 {
@@ -55,8 +43,7 @@ lower(nir_builder *b, nir_instr *instr, void *data)
 {
    bool *uses_bindless_samplers = data;
    bool progress = false;
-   bool force_bindless = agx_nir_needs_texture_crawl(instr) ||
-                         agx_stage_needs_bindless(b->shader->info.stage);
+   bool force_bindless = agx_nir_needs_texture_crawl(instr);
    b->cursor = nir_before_instr(instr);

    if (instr->type == nir_instr_type_intrinsic) {
@@ -132,8 +119,7 @@ lower(nir_builder *b, nir_instr *instr, void *data)
    } else if (instr->type == nir_instr_type_tex) {
       nir_tex_instr *tex = nir_instr_as_tex(instr);

-      if ((agx_stage_needs_bindless(b->shader->info.stage) ||
-           BITSET_COUNT(b->shader->info.samplers_used) > 16) &&
+      if ((BITSET_COUNT(b->shader->info.samplers_used) > 16) &&
          lower_sampler(b, tex)) {
          progress = true;
@@ -6,6 +6,7 @@
 #include "compiler/nir/nir_builder.h"
 #include "util/bitset.h"
 #include "util/u_dynarray.h"
+#include "agx_nir_lower_gs.h"
 #include "agx_state.h"
 #include "nir.h"
 #include "nir_builder_opcodes.h"
@@ -173,6 +174,13 @@ lower_intrinsic(nir_builder *b, nir_intrinsic_instr *intr,
       return load_sysval_root(b, 1, 64, &u->input_assembly);
    case nir_intrinsic_load_geometry_param_buffer_agx:
       return load_sysval_root(b, 1, 64, &u->geometry_params);
+   case nir_intrinsic_load_vs_output_buffer_agx:
+      return nir_load_global_constant(
+         b, load_sysval_root(b, 1, 64, &u->vertex_output_buffer_ptr), 8, 1, 64);
+   case nir_intrinsic_load_vs_output_buffer_ptr_agx:
+      return load_sysval_root(b, 1, 64, &u->vertex_output_buffer_ptr);
+   case nir_intrinsic_load_vs_outputs_agx:
+      return load_sysval_root(b, 1, 64, &u->vertex_outputs);
    case nir_intrinsic_load_tess_param_buffer_agx:
       return load_sysval_root(b, 1, 64, &u->tess_params);
    case nir_intrinsic_load_fixed_point_size_agx:
@@ -609,13 +609,6 @@ agx_bind_sampler_states(struct pipe_context *pctx, enum pipe_shader_type shader,
    }
 }

-/* See agx_stage_needs_bindless_sampler for explanation */
-static enum pipe_shader_type
-merged_stage(struct agx_context *ctx, enum pipe_shader_type stage)
-{
-   return stage == MESA_SHADER_TESS_CTRL ? MESA_SHADER_VERTEX : stage;
-}
-
 static enum agx_texture_dimension
 agx_translate_tex_dim(enum pipe_texture_target dim, unsigned samples)
 {
@@ -1906,21 +1899,7 @@ agx_compile_variant(struct agx_device *dev, struct pipe_context *pctx,
                          key->next.gs.index_size_B, &outputs);
       }
    } else if (nir->info.stage == MESA_SHADER_TESS_CTRL) {
-      struct asahi_tcs_shader_key *key = &key_->tcs;
-
-      /* TODO: Deduplicate this logic from the GS case! */
-      struct blob_reader vs_reader;
-      blob_reader_init(&vs_reader, linked_so->serialized_nir.data,
-                       linked_so->serialized_nir.size);
-      nir_shader *vs = nir_deserialize(NULL, &agx_nir_options, &vs_reader);
-
-      /* Apply the VS key to the VS before linking it in */
-      NIR_PASS_V(vs, lower_vbo, key->attribs);
-      NIR_PASS_V(vs, nir_lower_io_to_scalar, nir_var_shader_out, NULL, NULL);
-      NIR_PASS_V(vs, agx_nir_lower_sysvals, PIPE_SHADER_VERTEX, false);
-
-      NIR_PASS_V(nir, agx_nir_lower_tcs, vs, dev->libagx, key->index_size_B);
-      ralloc_free(vs);
+      NIR_PASS_V(nir, agx_nir_lower_tcs, dev->libagx);
    } else if (nir->info.stage == MESA_SHADER_GEOMETRY) {
       struct asahi_gs_shader_key *key = &key_->gs;

@@ -2482,10 +2461,11 @@ agx_update_vs(struct agx_context *ctx, unsigned index_size_B)
       rast_prim(ctx->batch->reduced_prim, ctx->rast->base.fill_front);

    struct asahi_vs_shader_key key = {
-      .next_stage = ctx->stage[PIPE_SHADER_TESS_EVAL].shader && !ctx->in_tess
-                       ? ASAHI_VS_TCS
-                    : ctx->stage[PIPE_SHADER_GEOMETRY].shader ? ASAHI_VS_GS
-                                                              : ASAHI_VS_FS,
+      .next_stage =
+         ((ctx->stage[PIPE_SHADER_TESS_EVAL].shader && !ctx->in_tess) ||
+          ctx->stage[PIPE_SHADER_GEOMETRY].shader)
+            ? ASAHI_VS_GS
+            : ASAHI_VS_FS,
    };

    if (key.next_stage == ASAHI_VS_FS) {
@@ -2840,9 +2820,6 @@ translate_sampler_state_count(struct agx_context *ctx,
                               struct agx_compiled_shader *cs,
                               enum pipe_shader_type stage)
 {
-   /* Get samplers from merged stage but get txf status from cs */
-   stage = merged_stage(ctx, stage);
-
    /* Clamp to binding table maximum, anything larger will be bindless */
    return agx_translate_sampler_state_count(
       MIN2(sampler_count(ctx, cs, stage), 16),
@@ -3054,13 +3031,10 @@ agx_sampler_heap_add(struct agx_device *dev, struct agx_sampler_heap *heap,

 static void
 agx_upload_samplers(struct agx_batch *batch, struct agx_compiled_shader *cs,
-                    enum pipe_shader_type orig_stage)
+                    enum pipe_shader_type stage)
 {
    struct agx_context *ctx = batch->ctx;

-   /* Get samplers from merged stage but get txf status from cs */
-   enum pipe_shader_type stage = merged_stage(ctx, orig_stage);
-
    unsigned nr_samplers = sampler_count(ctx, cs, stage);
    bool custom_borders = ctx->stage[stage].custom_borders;

@@ -3104,8 +3078,8 @@ agx_upload_samplers(struct agx_batch *batch, struct agx_compiled_shader *cs,
       out_sampler += sampler_length;
    }

-   batch->sampler_count[orig_stage] = nr_samplers;
-   batch->samplers[orig_stage] = T.gpu;
+   batch->sampler_count[stage] = nr_samplers;
+   batch->samplers[stage] = T.gpu;
 }

 static void
@@ -3131,8 +3105,7 @@ agx_update_descriptors(struct agx_batch *batch, struct agx_compiled_shader *cs)
    if (ctx->stage[stage].dirty & AGX_STAGE_DIRTY_SAMPLER)
       agx_set_sampler_uniforms(batch, stage);

-   if ((ctx->stage[stage].dirty & AGX_STAGE_DIRTY_SAMPLER) ||
-       (ctx->stage[merged_stage(ctx, stage)].dirty & AGX_STAGE_DIRTY_SAMPLER))
+   if (ctx->stage[stage].dirty & AGX_STAGE_DIRTY_SAMPLER)
       agx_upload_samplers(batch, cs, stage);

    struct agx_stage_uniforms *unif = &batch->stage_uniforms[stage];
@@ -3151,14 +3124,13 @@ agx_build_pipeline(struct agx_batch *batch, struct agx_compiled_shader *cs,
       agx_alloc_usc_control(&batch->pipeline_pool, cs->push_range_count + 2);

    enum pipe_shader_type stage = cs->stage;
-   enum pipe_shader_type merged = merged_stage(ctx, stage);

-   if (batch->texture_count[merged]) {
+   if (batch->texture_count[stage]) {
       agx_usc_pack(&b, TEXTURE, cfg) {
          cfg.start = 0;
          cfg.count =
-            MIN2(batch->texture_count[merged], AGX_NUM_TEXTURE_STATE_REGS);
-         cfg.buffer = batch->stage_uniforms[merged].texture_base;
+            MIN2(batch->texture_count[stage], AGX_NUM_TEXTURE_STATE_REGS);
+         cfg.buffer = batch->stage_uniforms[stage].texture_base;
       }
    }

@@ -4192,10 +4164,12 @@ agx_batch_geometry_params(struct agx_batch *batch, uint64_t input_index_buffer,
     * & count buffers. GPU calculates and allocates for indirect draws.
     */
    unsigned count_buffer_stride = batch->ctx->gs->gs_count_words * 4;
-   params.vs_outputs = batch->ctx->vs->info.outputs;
+   batch->uniforms.vertex_outputs = batch->ctx->vs->info.outputs;

    if (indirect) {
       params.count_buffer_stride = count_buffer_stride;
+      batch->uniforms.vertex_output_buffer_ptr =
+         agx_pool_alloc_aligned(&batch->pool, 8, 8).gpu;
    } else {
       params.gs_grid[0] =
          u_decomposed_prims_for_vertices(info->mode, draw->count);
@@ -4206,7 +4180,7 @@ agx_batch_geometry_params(struct agx_batch *batch, uint64_t input_index_buffer,
       params.input_vertices = draw->count;

       unsigned vb_size = libagx_tcs_in_size(draw->count * info->instance_count,
-                                            params.vs_outputs);
+                                            batch->uniforms.vertex_outputs);
       unsigned size = params.input_primitives * count_buffer_stride;

       if (size) {
@@ -4215,8 +4189,9 @@ agx_batch_geometry_params(struct agx_batch *batch, uint64_t input_index_buffer,
       }

       if (vb_size) {
-         params.vertex_buffer =
-            agx_pool_alloc_aligned(&batch->pool, vb_size, 4).gpu;
+         uint64_t addr = agx_pool_alloc_aligned(&batch->pool, vb_size, 4).gpu;
+         batch->uniforms.vertex_output_buffer_ptr =
+            agx_pool_upload(&batch->pool, &addr, 8);
       }
    }

@@ -4743,14 +4718,28 @@ agx_draw_patches(struct agx_context *ctx, const struct pipe_draw_info *info,
    agx_update_descriptors(batch, ctx->vs);
    agx_update_descriptors(batch, ctx->tcs);

+   batch->uniforms.vertex_outputs = ctx->vs->info.outputs;
+
+   unsigned vb_size = libagx_tcs_in_size(draws->count * info->instance_count,
+                                         batch->uniforms.vertex_outputs);
+   uint64_t addr = agx_pool_alloc_aligned(&batch->pool, vb_size, 4).gpu;
+   batch->uniforms.vertex_output_buffer_ptr =
+      agx_pool_upload(&batch->pool, &addr, 8);
+
+   struct pipe_grid_info vs_grid = {
+      .block = {1, 1, 1},
+      .grid = {draws->count, info->instance_count, 1},
+   };
+
+   agx_launch(batch, &vs_grid, ctx->vs, PIPE_SHADER_VERTEX);
+
    struct pipe_grid_info tcs_grid = {
-      .block = {MAX2(patch_vertices, tcs->tess.output_patch_size), 1, 1},
+      .block = {tcs->tess.output_patch_size, 1, 1},
       .grid = {in_patches, info->instance_count, 1},
       /* XXX */
       .variable_shared_mem = 32768,
    };

    agx_launch(batch, &tcs_grid, ctx->tcs, PIPE_SHADER_TESS_CTRL);
+   batch->uniforms.vertex_output_buffer_ptr = 0;

    agx_flush_all(ctx, "HACK");
    agx_sync_all(ctx, "HACK");
@@ -112,6 +112,16 @@ struct PACKED agx_draw_uniforms {
    /* Addresses for the results of pipeline statistics queries */
    uint64_t pipeline_statistics[PIPE_STAT_QUERY_MS_INVOCATIONS];

+   /* Pointer to base address of the VS->TCS, VS->GS, or TES->GS buffer.
+    * Indirected so it can be written to in an indirect setup kernel. G13
+    * appears to prefetch uniforms across dispatches, but does not pre-run
+    * preambles, so this indirection saves us from splitting the batch.
+    */
+   uint64_t vertex_output_buffer_ptr;
+
+   /* Mask of outputs flowing VS->TCS, VS->GS, or TES->GS . */
+   uint64_t vertex_outputs;
+
    /* Address of input assembly buffer if geom/tess is used, else 0 */
    uint64_t input_assembly;
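Note how cheap the indirection described in that comment is on the CPU side: for a direct draw the driver allocates the output buffer, uploads its 8-byte address, and points vertex_output_buffer_ptr at that copy; for an indirect draw it only allocates the 8-byte slot and lets the GPU setup kernel fill it. Condensed from the agx_batch_geometry_params hunk above, not new code, just the two paths side by side:

if (indirect) {
   /* GPU sizes and allocates the buffer: reserve only the pointer slot */
   batch->uniforms.vertex_output_buffer_ptr =
      agx_pool_alloc_aligned(&batch->pool, 8, 8).gpu;
} else {
   /* CPU knows vb_size: allocate the buffer and upload its address */
   uint64_t addr = agx_pool_alloc_aligned(&batch->pool, vb_size, 4).gpu;
   batch->uniforms.vertex_output_buffer_ptr =
      agx_pool_upload(&batch->pool, &addr, 8);
}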