mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2025-12-29 08:00:12 +01:00
asahi: Implement skeleton for tessellation
This implements a rough skeleton of what's needed for tessellation. It contains the relevant lowerings to merge the VS and TCS, running them as a compute kernel, and to lower the TES to a new VS (possibly merged in with a subsequent GS). This is sufficient for both standalone tessellation and tess + geom/xfb together. It does not yet contain a GPU accelerated tessellator, simply falling back to the CPU for that for now. Nevertheless the data structures are engineered with that end goal in mind, in particular to be able to tessellate all patches in parallel without needing any prefix sums etc (using simple watermark allocation for the heap). Work on fleshing out the skeleton continues in parallel. For now, this does pass the tests and lets the harder stuff get regression tested more easily. And merging early will ease rebase. Signed-off-by: Alyssa Rosenzweig <alyssa@rosenzweig.io> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/27616>
This commit is contained in:
parent
2d37d1b704
commit
9753cd44f7
18 changed files with 1285 additions and 137 deletions
|
|
@ -2959,7 +2959,8 @@ agx_preprocess_nir(nir_shader *nir, const nir_shader *libagx,
|
|||
out->inputs_flat_shaded = masks.flat;
|
||||
out->inputs_linear_shaded = masks.linear;
|
||||
}
|
||||
} else if (nir->info.stage == MESA_SHADER_VERTEX) {
|
||||
} else if (nir->info.stage == MESA_SHADER_VERTEX ||
|
||||
nir->info.stage == MESA_SHADER_TESS_EVAL) {
|
||||
out->has_edgeflags = nir->info.outputs_written & VARYING_BIT_EDGE;
|
||||
out->cull_distance_size = nir->info.cull_distance_array_size;
|
||||
|
||||
|
|
|
|||
|
|
@ -9,6 +9,7 @@
|
|||
#include "agx_compile.h"
|
||||
#include "agx_nir.h"
|
||||
#include "glsl_types.h"
|
||||
#include "shader_enums.h"
|
||||
|
||||
/*
|
||||
* Lower cull distance to discard. From the spec:
|
||||
|
|
@ -61,7 +62,9 @@ lower_write(nir_builder *b, nir_intrinsic_instr *intr, UNUSED void *data)
|
|||
bool
|
||||
agx_nir_lower_cull_distance_vs(nir_shader *s)
|
||||
{
|
||||
assert(s->info.stage == MESA_SHADER_VERTEX);
|
||||
assert(s->info.stage == MESA_SHADER_VERTEX ||
|
||||
s->info.stage == MESA_SHADER_TESS_EVAL);
|
||||
|
||||
assert(s->info.outputs_written & VARYING_BIT_CULL_DIST0);
|
||||
|
||||
nir_shader_intrinsics_pass(
|
||||
|
|
|
|||
|
|
@ -131,15 +131,10 @@ add_counter(nir_builder *b, nir_def *counter, nir_def *increment)
|
|||
}
|
||||
|
||||
/* Helpers for lowering I/O to variables */
|
||||
struct lower_output_to_var_state {
|
||||
nir_variable *outputs[NUM_TOTAL_VARYING_SLOTS];
|
||||
bool arrayed;
|
||||
};
|
||||
|
||||
static bool
|
||||
lower_output_to_var(nir_builder *b, nir_instr *instr, void *data)
|
||||
bool
|
||||
agx_lower_output_to_var(nir_builder *b, nir_instr *instr, void *data)
|
||||
{
|
||||
struct lower_output_to_var_state *state = data;
|
||||
struct agx_lower_output_to_var_state *state = data;
|
||||
if (instr->type != nir_instr_type_intrinsic)
|
||||
return false;
|
||||
|
||||
|
|
@ -201,7 +196,7 @@ load_instance_id(nir_builder *b)
|
|||
static bool
|
||||
lower_gs_inputs(nir_builder *b, nir_intrinsic_instr *intr, void *data)
|
||||
{
|
||||
struct lower_output_to_var_state *vs_state = data;
|
||||
struct agx_lower_output_to_var_state *vs_state = data;
|
||||
if (intr->intrinsic != nir_intrinsic_load_per_vertex_input)
|
||||
return false;
|
||||
|
||||
|
|
@ -249,7 +244,7 @@ lower_id_in_prim(nir_builder *b, nir_instr *instr, void *data)
|
|||
static void
|
||||
agx_nir_link_vs_gs(nir_shader *vs, nir_shader *gs)
|
||||
{
|
||||
struct lower_output_to_var_state state = {.arrayed = true};
|
||||
struct agx_lower_output_to_var_state state = {.arrayed = true};
|
||||
|
||||
/* Vertex shader outputs will be placed in arrays. Create those arrays. */
|
||||
u_foreach_bit64(slot, vs->info.outputs_written) {
|
||||
|
|
@ -278,7 +273,7 @@ agx_nir_link_vs_gs(nir_shader *vs, nir_shader *gs)
|
|||
|
||||
/* The vertex shader needs to be expressed in terms of that index */
|
||||
nir_function_instructions_pass(
|
||||
vs_function->impl, lower_output_to_var,
|
||||
vs_function->impl, agx_lower_output_to_var,
|
||||
nir_metadata_block_index | nir_metadata_dominance, &state);
|
||||
|
||||
nir_function_instructions_pass(
|
||||
|
|
@ -1144,7 +1139,7 @@ agx_nir_lower_gs(nir_shader *gs, nir_shader *vs, const nir_shader *libagx,
|
|||
*gs_count = NULL;
|
||||
|
||||
/* Geometry shader outputs are staged to temporaries */
|
||||
struct lower_output_to_var_state state = {.arrayed = false};
|
||||
struct agx_lower_output_to_var_state state = {.arrayed = false};
|
||||
|
||||
u_foreach_bit64(slot, gs->info.outputs_written) {
|
||||
const char *slot_name =
|
||||
|
|
@ -1165,7 +1160,7 @@ agx_nir_lower_gs(nir_shader *gs, nir_shader *vs, const nir_shader *libagx,
|
|||
gs_state.stride_B += size_B;
|
||||
}
|
||||
|
||||
NIR_PASS(_, gs, nir_shader_instructions_pass, lower_output_to_var,
|
||||
NIR_PASS(_, gs, nir_shader_instructions_pass, agx_lower_output_to_var,
|
||||
nir_metadata_block_index | nir_metadata_dominance, &state);
|
||||
|
||||
/* Set flatshade_first. For now this is always a constant, but in the future
|
||||
|
|
|
|||
|
|
@ -7,11 +7,25 @@
|
|||
#define __AGX_NIR_LOWER_GS_H
|
||||
|
||||
#include <stdbool.h>
|
||||
#include <stdint.h>
|
||||
#include "shader_enums.h"
|
||||
|
||||
struct nir_shader;
|
||||
struct agx_ia_key;
|
||||
enum mesa_prim;
|
||||
|
||||
struct nir_instr;
|
||||
struct nir_builder;
|
||||
struct nir_variable;
|
||||
|
||||
struct agx_lower_output_to_var_state {
|
||||
struct nir_variable *outputs[NUM_TOTAL_VARYING_SLOTS];
|
||||
bool arrayed;
|
||||
};
|
||||
|
||||
bool agx_lower_output_to_var(struct nir_builder *b, struct nir_instr *instr,
|
||||
void *data);
|
||||
|
||||
bool agx_nir_lower_ia(struct nir_shader *s, struct agx_ia_key *ia);
|
||||
|
||||
bool agx_nir_lower_multidraw(struct nir_shader *s, struct agx_ia_key *key);
|
||||
|
|
@ -33,4 +47,13 @@ struct nir_shader *agx_nir_unroll_restart(const struct nir_shader *libagx,
|
|||
enum mesa_prim prim,
|
||||
unsigned index_size_B);
|
||||
|
||||
bool agx_nir_lower_tcs(struct nir_shader *tcs, const struct nir_shader *vs,
|
||||
const struct nir_shader *libagx, uint8_t index_size_B);
|
||||
|
||||
bool agx_nir_lower_tes(struct nir_shader *tes, const struct nir_shader *libagx);
|
||||
|
||||
uint64_t agx_tcs_per_vertex_outputs(const struct nir_shader *nir);
|
||||
|
||||
unsigned agx_tcs_output_stride(const struct nir_shader *nir);
|
||||
|
||||
#endif
|
||||
|
|
|
|||
|
|
@ -12,6 +12,7 @@
|
|||
#include "nir.h"
|
||||
#include "nir_builder_opcodes.h"
|
||||
#include "nir_intrinsics.h"
|
||||
#include "shader_enums.h"
|
||||
|
||||
/*
|
||||
* This file implements input assembly in software for geometry/tessellation
|
||||
|
|
@ -27,14 +28,60 @@
|
|||
* This multidraw implementation kicks off the prefix sum and lowered draw.
|
||||
*/
|
||||
|
||||
/*
|
||||
* Sync with geometry.cl, this is preferred to avoid NIR needing to chew through
|
||||
* the massive switch statement (bad for compile time).
|
||||
*/
|
||||
static nir_def *
|
||||
vertex_id_for_topology(nir_builder *b, struct agx_ia_key *key)
|
||||
{
|
||||
nir_def *prim = nir_load_primitive_id(b);
|
||||
nir_def *vert = nir_load_vertex_id_in_primitive_agx(b);
|
||||
nir_def *flatshade_first = nir_imm_bool(b, key->flatshade_first);
|
||||
|
||||
switch (key->mode) {
|
||||
case MESA_PRIM_POINTS:
|
||||
case MESA_PRIM_LINES:
|
||||
case MESA_PRIM_TRIANGLES:
|
||||
case MESA_PRIM_LINES_ADJACENCY:
|
||||
case MESA_PRIM_TRIANGLES_ADJACENCY:
|
||||
return nir_iadd(
|
||||
b, nir_imul_imm(b, prim, mesa_vertices_per_prim(key->mode)), vert);
|
||||
|
||||
case MESA_PRIM_LINE_LOOP:
|
||||
return libagx_vertex_id_for_line_loop(b, prim, vert,
|
||||
nir_load_num_vertices(b));
|
||||
|
||||
case MESA_PRIM_LINE_STRIP:
|
||||
case MESA_PRIM_LINE_STRIP_ADJACENCY:
|
||||
return nir_iadd(b, prim, vert);
|
||||
|
||||
case MESA_PRIM_TRIANGLE_STRIP: {
|
||||
return nir_iadd(
|
||||
b, prim,
|
||||
libagx_map_vertex_in_tri_strip(b, prim, vert, flatshade_first));
|
||||
}
|
||||
|
||||
case MESA_PRIM_TRIANGLE_FAN:
|
||||
return libagx_vertex_id_for_tri_fan(b, prim, vert, flatshade_first);
|
||||
|
||||
case MESA_PRIM_TRIANGLE_STRIP_ADJACENCY:
|
||||
return libagx_vertex_id_for_tri_strip_adj(
|
||||
b, prim, vert, nir_load_num_vertices(b), flatshade_first);
|
||||
|
||||
case MESA_PRIM_PATCHES:
|
||||
return nir_iadd(b, nir_imul(b, prim, nir_load_patch_vertices_in(b)),
|
||||
nir_load_invocation_id(b));
|
||||
|
||||
default:
|
||||
unreachable("invalid mode");
|
||||
}
|
||||
}
|
||||
|
||||
static nir_def *
|
||||
load_vertex_id(nir_builder *b, struct agx_ia_key *key)
|
||||
{
|
||||
/* Tessellate by primitive mode */
|
||||
nir_def *id = libagx_vertex_id_for_topology(
|
||||
b, nir_imm_int(b, key->mode), nir_imm_bool(b, key->flatshade_first),
|
||||
nir_load_primitive_id(b), nir_load_vertex_id_in_primitive_agx(b),
|
||||
nir_load_num_vertices(b));
|
||||
nir_def *id = vertex_id_for_topology(b, key);
|
||||
|
||||
/* If drawing with an index buffer, pull the vertex ID. Otherwise, the
|
||||
* vertex ID is just the index as-is.
|
||||
|
|
|
|||
395
src/asahi/lib/agx_nir_lower_tess.c
Normal file
395
src/asahi/lib/agx_nir_lower_tess.c
Normal file
|
|
@ -0,0 +1,395 @@
|
|||
/*
|
||||
* Copyright 2023 Alyssa Rosenzweig
|
||||
* SPDX-License-Identifier: MIT
|
||||
*/
|
||||
|
||||
#include "shaders/geometry.h"
|
||||
#include "util/bitscan.h"
|
||||
#include "util/macros.h"
|
||||
#include "agx_nir_lower_gs.h"
|
||||
#include "glsl_types.h"
|
||||
#include "libagx_shaders.h"
|
||||
#include "nir.h"
|
||||
#include "nir_builder.h"
|
||||
#include "nir_builder_opcodes.h"
|
||||
#include "nir_intrinsics.h"
|
||||
#include "nir_intrinsics_indices.h"
|
||||
#include "shader_enums.h"
|
||||
|
||||
struct tcs_state {
|
||||
struct agx_lower_output_to_var_state vs_vars;
|
||||
uint64_t vs_outputs_written;
|
||||
};
|
||||
|
||||
static nir_def *
|
||||
tcs_patch_id(nir_builder *b)
|
||||
{
|
||||
return nir_channel(b, nir_load_workgroup_id(b), 0);
|
||||
}
|
||||
|
||||
static nir_def *
|
||||
tcs_instance_id(nir_builder *b)
|
||||
{
|
||||
return nir_channel(b, nir_load_workgroup_id(b), 1);
|
||||
}
|
||||
|
||||
static nir_def *
|
||||
tcs_unrolled_id(nir_builder *b)
|
||||
{
|
||||
nir_def *stride = nir_channel(b, nir_load_num_workgroups(b), 0);
|
||||
|
||||
return nir_iadd(b, nir_imul(b, tcs_instance_id(b), stride), tcs_patch_id(b));
|
||||
}
|
||||
|
||||
uint64_t
|
||||
agx_tcs_per_vertex_outputs(const nir_shader *nir)
|
||||
{
|
||||
return nir->info.outputs_written &
|
||||
~(VARYING_BIT_TESS_LEVEL_INNER | VARYING_BIT_TESS_LEVEL_OUTER |
|
||||
VARYING_BIT_BOUNDING_BOX0 | VARYING_BIT_BOUNDING_BOX1);
|
||||
}
|
||||
|
||||
unsigned
|
||||
agx_tcs_output_stride(const nir_shader *nir)
|
||||
{
|
||||
return libagx_tcs_out_stride(util_last_bit(nir->info.patch_outputs_written),
|
||||
nir->info.tess.tcs_vertices_out,
|
||||
agx_tcs_per_vertex_outputs(nir));
|
||||
}
|
||||
|
||||
static nir_def *
|
||||
tcs_out_addr(nir_builder *b, nir_intrinsic_instr *intr, nir_def *vertex_id)
|
||||
{
|
||||
nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
|
||||
|
||||
nir_def *offset = nir_get_io_offset_src(intr)->ssa;
|
||||
nir_def *addr = libagx_tcs_out_address(
|
||||
b, nir_load_tess_param_buffer_agx(b), tcs_unrolled_id(b), vertex_id,
|
||||
nir_iadd_imm(b, offset, sem.location),
|
||||
nir_imm_int(b, util_last_bit(b->shader->info.patch_outputs_written)),
|
||||
nir_imm_int(b, b->shader->info.tess.tcs_vertices_out),
|
||||
nir_imm_int64(b, agx_tcs_per_vertex_outputs(b->shader)));
|
||||
|
||||
addr = nir_iadd_imm(b, addr, nir_intrinsic_component(intr) * 4);
|
||||
|
||||
return addr;
|
||||
}
|
||||
|
||||
static nir_def *
|
||||
lower_tes_load(nir_builder *b, nir_intrinsic_instr *intr)
|
||||
{
|
||||
gl_varying_slot location = nir_intrinsic_io_semantics(intr).location;
|
||||
nir_src *offset_src = nir_get_io_offset_src(intr);
|
||||
|
||||
nir_def *vertex = nir_imm_int(b, 0);
|
||||
nir_def *offset = offset_src ? offset_src->ssa : nir_imm_int(b, 0);
|
||||
|
||||
if (intr->intrinsic == nir_intrinsic_load_per_vertex_input)
|
||||
vertex = intr->src[0].ssa;
|
||||
|
||||
nir_def *addr = libagx_tes_in_address(b, nir_load_tess_param_buffer_agx(b),
|
||||
nir_load_vertex_id(b), vertex,
|
||||
nir_iadd_imm(b, offset, location));
|
||||
|
||||
if (nir_intrinsic_has_component(intr))
|
||||
addr = nir_iadd_imm(b, addr, nir_intrinsic_component(intr) * 4);
|
||||
|
||||
return nir_load_global_constant(b, addr, 4, intr->def.num_components,
|
||||
intr->def.bit_size);
|
||||
}
|
||||
|
||||
static nir_def *
|
||||
tcs_load_input(nir_builder *b, nir_intrinsic_instr *intr,
|
||||
struct tcs_state *state)
|
||||
{
|
||||
nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
|
||||
|
||||
nir_def *off = libagx_tcs_in_offset(
|
||||
b, intr->src[0].ssa, nir_iadd_imm(b, intr->src[1].ssa, sem.location),
|
||||
nir_imm_int64(b, state->vs_outputs_written));
|
||||
|
||||
off = nir_iadd_imm(b, off, 4 * nir_intrinsic_component(intr));
|
||||
|
||||
return nir_load_shared(b, intr->def.num_components, 32, off);
|
||||
}
|
||||
|
||||
static nir_def *
|
||||
lower_tcs_impl(nir_builder *b, nir_intrinsic_instr *intr,
|
||||
struct tcs_state *state)
|
||||
{
|
||||
switch (intr->intrinsic) {
|
||||
case nir_intrinsic_barrier:
|
||||
/* A patch fits in a subgroup, so the barrier is unnecessary. */
|
||||
return NIR_LOWER_INSTR_PROGRESS_REPLACE;
|
||||
|
||||
case nir_intrinsic_load_primitive_id:
|
||||
return tcs_patch_id(b);
|
||||
|
||||
case nir_intrinsic_load_instance_id:
|
||||
return tcs_instance_id(b);
|
||||
|
||||
case nir_intrinsic_load_invocation_id:
|
||||
return nir_channel(b, nir_load_local_invocation_id(b), 0);
|
||||
|
||||
case nir_intrinsic_load_per_vertex_input:
|
||||
return tcs_load_input(b, intr, state);
|
||||
|
||||
case nir_intrinsic_load_patch_vertices_in:
|
||||
return libagx_tcs_patch_vertices_in(b, nir_load_tess_param_buffer_agx(b));
|
||||
|
||||
case nir_intrinsic_load_tess_level_outer_default:
|
||||
return libagx_tess_level_outer_default(b,
|
||||
nir_load_tess_param_buffer_agx(b));
|
||||
|
||||
case nir_intrinsic_load_tess_level_inner_default:
|
||||
return libagx_tess_level_inner_default(b,
|
||||
nir_load_tess_param_buffer_agx(b));
|
||||
|
||||
case nir_intrinsic_load_output: {
|
||||
nir_def *addr = tcs_out_addr(b, intr, nir_undef(b, 1, 32));
|
||||
return nir_load_global(b, addr, 4, intr->def.num_components,
|
||||
intr->def.bit_size);
|
||||
}
|
||||
|
||||
case nir_intrinsic_load_per_vertex_output: {
|
||||
nir_def *addr = tcs_out_addr(b, intr, intr->src[0].ssa);
|
||||
return nir_load_global(b, addr, 4, intr->def.num_components,
|
||||
intr->def.bit_size);
|
||||
}
|
||||
|
||||
case nir_intrinsic_store_output: {
|
||||
nir_store_global(b, tcs_out_addr(b, intr, nir_undef(b, 1, 32)), 4,
|
||||
intr->src[0].ssa, nir_intrinsic_write_mask(intr));
|
||||
return NIR_LOWER_INSTR_PROGRESS_REPLACE;
|
||||
}
|
||||
|
||||
case nir_intrinsic_store_per_vertex_output: {
|
||||
nir_store_global(b, tcs_out_addr(b, intr, intr->src[1].ssa), 4,
|
||||
intr->src[0].ssa, nir_intrinsic_write_mask(intr));
|
||||
return NIR_LOWER_INSTR_PROGRESS_REPLACE;
|
||||
}
|
||||
|
||||
default:
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
|
||||
static bool
|
||||
lower_tcs(nir_builder *b, nir_intrinsic_instr *intr, void *data)
|
||||
{
|
||||
b->cursor = nir_before_instr(&intr->instr);
|
||||
|
||||
nir_def *repl = lower_tcs_impl(b, intr, data);
|
||||
if (!repl)
|
||||
return false;
|
||||
|
||||
if (repl != NIR_LOWER_INSTR_PROGRESS_REPLACE)
|
||||
nir_def_rewrite_uses(&intr->def, repl);
|
||||
|
||||
nir_instr_remove(&intr->instr);
|
||||
return true;
|
||||
}
|
||||
|
||||
static void
|
||||
link_libagx(nir_shader *nir, const nir_shader *libagx)
|
||||
{
|
||||
nir_link_shader_functions(nir, libagx);
|
||||
NIR_PASS(_, nir, nir_inline_functions);
|
||||
nir_remove_non_entrypoints(nir);
|
||||
NIR_PASS(_, nir, nir_lower_indirect_derefs, nir_var_function_temp, 64);
|
||||
NIR_PASS(_, nir, nir_opt_dce);
|
||||
NIR_PASS(_, nir, nir_lower_vars_to_explicit_types, nir_var_function_temp,
|
||||
glsl_get_cl_type_size_align);
|
||||
NIR_PASS(_, nir, nir_opt_deref);
|
||||
NIR_PASS(_, nir, nir_lower_vars_to_ssa);
|
||||
NIR_PASS(_, nir, nir_lower_explicit_io,
|
||||
nir_var_shader_temp | nir_var_function_temp | nir_var_mem_shared |
|
||||
nir_var_mem_global,
|
||||
nir_address_format_62bit_generic);
|
||||
}
|
||||
|
||||
/*
|
||||
* Predicate the TCS so the merged shader works when input patch size > output
|
||||
* patch size.
|
||||
*/
|
||||
static bool
|
||||
agx_nir_predicate_tcs(nir_shader *tcs)
|
||||
{
|
||||
nir_function_impl *entry = nir_shader_get_entrypoint(tcs);
|
||||
nir_cf_list list;
|
||||
nir_cf_extract(&list, nir_before_impl(entry), nir_after_impl(entry));
|
||||
|
||||
nir_builder b = nir_builder_at(nir_after_block(nir_start_block(entry)));
|
||||
nir_def *input_vtx_id = nir_load_invocation_id(&b);
|
||||
unsigned verts = tcs->info.tess.tcs_vertices_out;
|
||||
|
||||
nir_push_if(&b, nir_ult_imm(&b, input_vtx_id, verts));
|
||||
{
|
||||
nir_cf_reinsert(&list, b.cursor);
|
||||
}
|
||||
nir_pop_if(&b, NULL);
|
||||
|
||||
nir_metadata_preserve(entry, nir_metadata_none);
|
||||
return false;
|
||||
}
|
||||
|
||||
bool
|
||||
agx_nir_lower_tcs(nir_shader *tcs, const nir_shader *vs,
|
||||
const struct nir_shader *libagx, uint8_t index_size_B)
|
||||
{
|
||||
agx_nir_predicate_tcs(tcs);
|
||||
|
||||
nir_function_impl *tcs_entry = nir_shader_get_entrypoint(tcs);
|
||||
|
||||
/* Link the vertex shader with the TCS. This assumes that all functions have
|
||||
* been inlined in the vertex shader.
|
||||
*/
|
||||
nir_function_impl *vs_entry = nir_shader_get_entrypoint(vs);
|
||||
nir_function *vs_function = nir_function_create(tcs, "vertex");
|
||||
vs_function->impl = nir_function_impl_clone(tcs, vs_entry);
|
||||
vs_function->impl->function = vs_function;
|
||||
|
||||
/* Vertex shader outputs are staged to temporaries */
|
||||
struct tcs_state state = {
|
||||
.vs_vars.arrayed = false,
|
||||
.vs_outputs_written = vs->info.outputs_written & tcs->info.inputs_read,
|
||||
};
|
||||
|
||||
u_foreach_bit64(slot, vs->info.outputs_written) {
|
||||
const char *slot_name =
|
||||
gl_varying_slot_name_for_stage(slot, MESA_SHADER_VERTEX);
|
||||
|
||||
state.vs_vars.outputs[slot] = nir_variable_create(
|
||||
tcs, nir_var_shader_temp, glsl_uvec4_type(), slot_name);
|
||||
}
|
||||
|
||||
nir_function_instructions_pass(
|
||||
vs_function->impl, agx_lower_output_to_var,
|
||||
nir_metadata_block_index | nir_metadata_dominance, &state.vs_vars);
|
||||
|
||||
/* Invoke the VS first for each vertex in the input patch */
|
||||
nir_builder b_ = nir_builder_at(nir_before_impl(tcs_entry));
|
||||
nir_builder *b = &b_;
|
||||
|
||||
nir_def *input_vtx_id = nir_load_invocation_id(b);
|
||||
nir_push_if(b, nir_ult(b, input_vtx_id, nir_load_patch_vertices_in(b)));
|
||||
{
|
||||
nir_inline_function_impl(b, vs_function->impl, NULL, NULL);
|
||||
|
||||
/* To handle cross-invocation VS output reads, dump everything in
|
||||
* shared local memory.
|
||||
*
|
||||
* TODO: Optimize to registers.
|
||||
*/
|
||||
u_foreach_bit64(slot, state.vs_outputs_written) {
|
||||
nir_def *off =
|
||||
libagx_tcs_in_offset(b, input_vtx_id, nir_imm_int(b, slot),
|
||||
nir_imm_int64(b, state.vs_outputs_written));
|
||||
|
||||
nir_store_shared(b, nir_load_var(b, state.vs_vars.outputs[slot]), off,
|
||||
.write_mask = nir_component_mask(4));
|
||||
}
|
||||
}
|
||||
nir_pop_if(b, NULL);
|
||||
|
||||
/* Clean up after inlining VS into TCS */
|
||||
exec_node_remove(&vs_function->node);
|
||||
nir_lower_global_vars_to_local(tcs);
|
||||
|
||||
/* Lower I/A. TODO: Indirect multidraws */
|
||||
agx_nir_lower_ia(tcs, &(struct agx_ia_key){
|
||||
.index_size = index_size_B,
|
||||
.mode = MESA_PRIM_PATCHES,
|
||||
});
|
||||
|
||||
/* Lower TCS outputs */
|
||||
nir_shader_intrinsics_pass(tcs, lower_tcs,
|
||||
nir_metadata_block_index | nir_metadata_dominance,
|
||||
&state);
|
||||
link_libagx(tcs, libagx);
|
||||
nir_metadata_preserve(b->impl, nir_metadata_none);
|
||||
return true;
|
||||
}
|
||||
|
||||
static nir_def *
|
||||
lower_tes_impl(nir_builder *b, nir_intrinsic_instr *intr, void *data)
|
||||
{
|
||||
switch (intr->intrinsic) {
|
||||
case nir_intrinsic_load_tess_coord_xy:
|
||||
return libagx_load_tess_coord(b, nir_load_tess_param_buffer_agx(b),
|
||||
nir_load_vertex_id(b));
|
||||
|
||||
case nir_intrinsic_load_primitive_id:
|
||||
return libagx_tes_patch_id(b, nir_load_tess_param_buffer_agx(b),
|
||||
nir_load_vertex_id(b));
|
||||
|
||||
case nir_intrinsic_load_input:
|
||||
case nir_intrinsic_load_per_vertex_input:
|
||||
case nir_intrinsic_load_tess_level_inner:
|
||||
case nir_intrinsic_load_tess_level_outer:
|
||||
return lower_tes_load(b, intr);
|
||||
|
||||
case nir_intrinsic_load_patch_vertices_in:
|
||||
return libagx_tes_patch_vertices_in(b, nir_load_tess_param_buffer_agx(b));
|
||||
|
||||
default:
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
|
||||
static bool
|
||||
lower_tes(nir_builder *b, nir_intrinsic_instr *intr, void *data)
|
||||
{
|
||||
b->cursor = nir_before_instr(&intr->instr);
|
||||
nir_def *repl = lower_tes_impl(b, intr, data);
|
||||
|
||||
if (repl) {
|
||||
nir_def_rewrite_uses(&intr->def, repl);
|
||||
nir_instr_remove(&intr->instr);
|
||||
return true;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
static int
|
||||
glsl_type_size(const struct glsl_type *type, bool bindless)
|
||||
{
|
||||
return glsl_count_attribute_slots(type, false);
|
||||
}
|
||||
|
||||
bool
|
||||
agx_nir_lower_tes(nir_shader *tes, const nir_shader *libagx)
|
||||
{
|
||||
nir_lower_tess_coord_z(
|
||||
tes, tes->info.tess._primitive_mode == TESS_PRIMITIVE_TRIANGLES);
|
||||
|
||||
nir_shader_intrinsics_pass(
|
||||
tes, lower_tes, nir_metadata_block_index | nir_metadata_dominance, NULL);
|
||||
|
||||
/* Points mode renders as points, make sure we write point size for the HW */
|
||||
if (tes->info.tess.point_mode &&
|
||||
!(tes->info.outputs_written & VARYING_BIT_PSIZ)) {
|
||||
|
||||
nir_function_impl *impl = nir_shader_get_entrypoint(tes);
|
||||
nir_builder b = nir_builder_at(nir_after_impl(impl));
|
||||
|
||||
nir_store_output(&b, nir_imm_float(&b, 1.0), nir_imm_int(&b, 0),
|
||||
.io_semantics.location = VARYING_SLOT_PSIZ,
|
||||
.write_mask = nir_component_mask(1), .range = 1);
|
||||
|
||||
tes->info.outputs_written |= VARYING_BIT_PSIZ;
|
||||
}
|
||||
|
||||
/* We lower to a HW VS, so update the shader info so the compiler does the
|
||||
* right thing.
|
||||
*/
|
||||
tes->info.stage = MESA_SHADER_VERTEX;
|
||||
memset(&tes->info.vs, 0, sizeof(tes->info.vs));
|
||||
tes->info.vs.tes_agx = true;
|
||||
|
||||
link_libagx(tes, libagx);
|
||||
nir_lower_idiv(tes, &(nir_lower_idiv_options){.allow_fp16 = true});
|
||||
nir_metadata_preserve(nir_shader_get_entrypoint(tes), nir_metadata_none);
|
||||
return true;
|
||||
}
|
||||
|
|
@ -17,6 +17,7 @@ libasahi_lib_files = files(
|
|||
'agx_nir_lower_ia.c',
|
||||
'agx_nir_lower_msaa.c',
|
||||
'agx_nir_lower_sample_intrinsics.c',
|
||||
'agx_nir_lower_tess.c',
|
||||
'agx_nir_lower_tilebuffer.c',
|
||||
'agx_nir_lower_vbo.c',
|
||||
'agx_nir_predicate_layer_id.c',
|
||||
|
|
@ -32,6 +33,8 @@ libagx_shader_files = files(
|
|||
'shaders/libagx.h',
|
||||
'shaders/geometry.cl',
|
||||
'shaders/geometry.h',
|
||||
'shaders/tessellation.cl',
|
||||
'shaders/tessellator.cl',
|
||||
'shaders/texture.cl',
|
||||
)
|
||||
|
||||
|
|
|
|||
|
|
@ -40,7 +40,80 @@ libagx_xfb_vertex_address(global struct agx_geometry_params *p, uint base_index,
|
|||
return (uintptr_t)(p->xfb_base[buffer]) + xfb_offset;
|
||||
}
|
||||
|
||||
/* TODO: Primitive restart */
|
||||
uint
|
||||
libagx_vertex_id_for_line_loop(uint prim, uint vert, uint num_prims)
|
||||
{
|
||||
/* (0, 1), (1, 2), (2, 0) */
|
||||
if (prim == (num_prims - 1) && vert == 1)
|
||||
return 0;
|
||||
else
|
||||
return prim + vert;
|
||||
}
|
||||
|
||||
uint
|
||||
libagx_vertex_id_for_tri_fan(uint prim, uint vert, bool flatshade_first)
|
||||
{
|
||||
/* Vulkan spec section 20.1.7 gives (i + 1, i + 2, 0) for a provoking
|
||||
* first. OpenGL instead wants (0, i + 1, i + 2) with a provoking last.
|
||||
* Piglit clipflat expects us to switch between these orders depending on
|
||||
* provoking vertex, to avoid trivializing the fan.
|
||||
*
|
||||
* Rotate accordingly.
|
||||
*/
|
||||
if (flatshade_first)
|
||||
vert = (vert + 1) % 3;
|
||||
|
||||
/* The simpler form assuming last is provoking. */
|
||||
return (vert == 0) ? 0 : prim + vert;
|
||||
}
|
||||
|
||||
uint
|
||||
libagx_vertex_id_for_tri_strip_adj(uint prim, uint vert, uint num_prims,
|
||||
bool flatshade_first)
|
||||
{
|
||||
/* See Vulkan spec section 20.1.11 "Triangle Strips With Adjacency".
|
||||
*
|
||||
* There are different cases for first/middle/last/only primitives and for
|
||||
* odd/even primitives. Determine which case we're in.
|
||||
*/
|
||||
bool last = prim == (num_prims - 1);
|
||||
bool first = prim == 0;
|
||||
bool even = (prim & 1) == 0;
|
||||
bool even_or_first = even || first;
|
||||
|
||||
/* When the last vertex is provoking, we rotate the primitives
|
||||
* accordingly. This seems required for OpenGL.
|
||||
*/
|
||||
if (!flatshade_first && !even_or_first) {
|
||||
vert = (vert + 4u) % 6u;
|
||||
}
|
||||
|
||||
/* Offsets per the spec. The spec lists 6 cases with 6 offsets. Luckily,
|
||||
* there are lots of patterns we can exploit, avoiding a full 6x6 LUT.
|
||||
*
|
||||
* Here we assume the first vertex is provoking, the Vulkan default.
|
||||
*/
|
||||
uint offsets[6] = {
|
||||
0,
|
||||
first ? 1 : (even ? -2 : 3),
|
||||
even_or_first ? 2 : 4,
|
||||
last ? 5 : 6,
|
||||
even_or_first ? 4 : 2,
|
||||
even_or_first ? 3 : -2,
|
||||
};
|
||||
|
||||
/* Ensure NIR can see thru the local array */
|
||||
uint offset = 0;
|
||||
for (uint i = 1; i < 6; ++i) {
|
||||
if (i == vert)
|
||||
offset = offsets[i];
|
||||
}
|
||||
|
||||
/* Finally add to the base of the primitive */
|
||||
return (prim * 2) + offset;
|
||||
}
|
||||
|
||||
/* Sync with agx_nir_lower_ia.c, this is for the restart unrolling */
|
||||
uint
|
||||
libagx_vertex_id_for_topology(enum mesa_prim mode, bool flatshade_first,
|
||||
uint prim, uint vert, uint num_prims)
|
||||
|
|
@ -50,24 +123,17 @@ libagx_vertex_id_for_topology(enum mesa_prim mode, bool flatshade_first,
|
|||
case MESA_PRIM_LINES:
|
||||
case MESA_PRIM_TRIANGLES:
|
||||
case MESA_PRIM_LINES_ADJACENCY:
|
||||
case MESA_PRIM_TRIANGLES_ADJACENCY: {
|
||||
case MESA_PRIM_TRIANGLES_ADJACENCY:
|
||||
/* Regular primitive: every N vertices defines a primitive */
|
||||
return (prim * mesa_vertices_per_prim(mode)) + vert;
|
||||
}
|
||||
|
||||
case MESA_PRIM_LINE_LOOP: {
|
||||
/* (0, 1), (1, 2), (2, 0) */
|
||||
if (prim == (num_prims - 1) && vert == 1)
|
||||
return 0;
|
||||
else
|
||||
return prim + vert;
|
||||
}
|
||||
case MESA_PRIM_LINE_LOOP:
|
||||
return libagx_vertex_id_for_line_loop(prim, vert, num_prims);
|
||||
|
||||
case MESA_PRIM_LINE_STRIP:
|
||||
case MESA_PRIM_LINE_STRIP_ADJACENCY: {
|
||||
case MESA_PRIM_LINE_STRIP_ADJACENCY:
|
||||
/* (i, i + 1) or (i, ..., i + 3) */
|
||||
return prim + vert;
|
||||
}
|
||||
|
||||
case MESA_PRIM_TRIANGLE_STRIP: {
|
||||
/* Order depends on the provoking vert.
|
||||
|
|
@ -80,66 +146,14 @@ libagx_vertex_id_for_topology(enum mesa_prim mode, bool flatshade_first,
|
|||
return prim + libagx_map_vertex_in_tri_strip(prim, vert, flatshade_first);
|
||||
}
|
||||
|
||||
case MESA_PRIM_TRIANGLE_FAN: {
|
||||
/* Vulkan spec section 20.1.7 gives (i + 1, i + 2, 0) for a provoking
|
||||
* first. OpenGL instead wants (0, i + 1, i + 2) with a provoking last.
|
||||
* Piglit clipflat expects us to switch between these orders depending on
|
||||
* provoking vertex, to avoid trivializing the fan.
|
||||
*
|
||||
* Rotate accordingly.
|
||||
*/
|
||||
if (flatshade_first)
|
||||
vert = (vert + 1) % 3;
|
||||
case MESA_PRIM_TRIANGLE_FAN:
|
||||
return libagx_vertex_id_for_tri_fan(prim, vert, flatshade_first);
|
||||
|
||||
/* The simpler form assuming last is provoking. */
|
||||
return (vert == 0) ? 0 : prim + vert;
|
||||
}
|
||||
|
||||
case MESA_PRIM_TRIANGLE_STRIP_ADJACENCY: {
|
||||
/* See Vulkan spec section 20.1.11 "Triangle Strips With Adjancency".
|
||||
*
|
||||
* There are different cases for first/middle/last/only primitives and for
|
||||
* odd/even primitives. Determine which case we're in.
|
||||
*/
|
||||
bool last = prim == (num_prims - 1);
|
||||
bool first = prim == 0;
|
||||
bool even = (prim & 1) == 0;
|
||||
bool even_or_first = even || first;
|
||||
|
||||
/* When the last vertex is provoking, we rotate the primitives
|
||||
* accordingly. This seems required for OpenGL.
|
||||
*/
|
||||
if (!flatshade_first && !even_or_first) {
|
||||
vert = (vert + 4u) % 6u;
|
||||
}
|
||||
|
||||
/* Offsets per the spec. The spec lists 6 cases with 6 offsets. Luckily,
|
||||
* there are lots of patterns we can exploit, avoiding a full 6x6 LUT.
|
||||
*
|
||||
* Here we assume the first vertex is provoking, the Vulkan default.
|
||||
*/
|
||||
uint offsets[6] = {
|
||||
0,
|
||||
first ? 1 : (even ? -2 : 3),
|
||||
even_or_first ? 2 : 4,
|
||||
last ? 5 : 6,
|
||||
even_or_first ? 4 : 2,
|
||||
even_or_first ? 3 : -2,
|
||||
};
|
||||
|
||||
/* Ensure NIR can see thru the local array */
|
||||
uint offset = 0;
|
||||
for (uint i = 1; i < 6; ++i) {
|
||||
if (i == vert)
|
||||
offset = offsets[i];
|
||||
}
|
||||
|
||||
/* Finally add to the base of the primitive */
|
||||
return (prim * 2) + offset;
|
||||
}
|
||||
case MESA_PRIM_TRIANGLE_STRIP_ADJACENCY:
|
||||
return libagx_vertex_id_for_tri_strip_adj(prim, vert, num_prims,
|
||||
flatshade_first);
|
||||
|
||||
default:
|
||||
/* Invalid */
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -8,13 +8,16 @@
|
|||
#include "libagx.h"
|
||||
|
||||
#ifndef __OPENCL_VERSION__
|
||||
#include "util/bitscan.h"
|
||||
#include "util/macros.h"
|
||||
#define GLOBAL(type_) uint64_t
|
||||
#define CONST(type_) uint64_t
|
||||
#define GLOBAL(type_) uint64_t
|
||||
#define CONST(type_) uint64_t
|
||||
#define libagx_popcount(x) util_bitcount64(x)
|
||||
#else
|
||||
#define PACKED
|
||||
#define GLOBAL(type_) global type_ *
|
||||
#define CONST(type_) constant type_ *
|
||||
#define GLOBAL(type_) global type_ *
|
||||
#define CONST(type_) constant type_ *
|
||||
#define libagx_popcount(x) popcount(x)
|
||||
#endif
|
||||
|
||||
#ifndef LIBAGX_GEOMETRY_H
|
||||
|
|
@ -156,4 +159,116 @@ struct agx_geometry_params {
|
|||
uint32_t count_buffer_stride;
|
||||
} PACKED;
|
||||
|
||||
struct agx_tess_params {
|
||||
/* Persistent (cross-draw) geometry state */
|
||||
GLOBAL(struct agx_geometry_state) state;
|
||||
|
||||
/* Patch coordinate offsets in patch_coord_buffer, indexed by patch ID. */
|
||||
GLOBAL(uint) patch_coord_offs;
|
||||
|
||||
/* Patch coordinate buffer, indexed as:
|
||||
*
|
||||
* patch_coord_offs[patch_ID] + vertex_in_patch
|
||||
*
|
||||
* Currently float2s, but we might be able to compact later?
|
||||
*/
|
||||
GLOBAL(float2) patch_coord_buffer;
|
||||
|
||||
/* Tessellation control shader output buffer, indexed by patch ID. */
|
||||
GLOBAL(uchar) tcs_buffer;
|
||||
|
||||
/* Bitfield of TCS per-vertex outputs */
|
||||
uint64_t tcs_per_vertex_outputs;
|
||||
|
||||
/* Default tess levels used in OpenGL when there is no TCS in the pipeline.
|
||||
* Unused in Vulkan and OpenGL ES.
|
||||
*/
|
||||
float tess_level_outer_default[4];
|
||||
float tess_level_inner_default[4];
|
||||
|
||||
/* Number of vertices in the input patch */
|
||||
uint input_patch_size;
|
||||
|
||||
/* Number of vertices in the TCS output patch */
|
||||
uint output_patch_size;
|
||||
|
||||
/* Number of patch constants written by TCS */
|
||||
uint tcs_patch_constants;
|
||||
|
||||
/* Number of input patches per instance of the VS/TCS */
|
||||
uint patches_per_instance;
|
||||
} PACKED;
|
||||
|
||||
/* TCS shared memory layout:
|
||||
*
|
||||
* vec4 vs_outputs[VERTICES_IN_INPUT_PATCH][TOTAL_VERTEX_OUTPUTS];
|
||||
*
|
||||
* TODO: compact.
|
||||
*/
|
||||
static inline ushort
|
||||
libagx_tcs_in_offs(uint vtx, gl_varying_slot location,
|
||||
uint64_t crosslane_vs_out_mask)
|
||||
{
|
||||
uint base = vtx * libagx_popcount(crosslane_vs_out_mask);
|
||||
uint offs = libagx_popcount(crosslane_vs_out_mask &
|
||||
(((uint64_t)(1) << location) - 1));
|
||||
|
||||
return (base + offs) * 16;
|
||||
}
|
||||
|
||||
static inline uint
|
||||
libagx_tcs_in_size(uint32_t vertices_in_patch, uint64_t crosslane_vs_out_mask)
|
||||
{
|
||||
return libagx_tcs_in_offs(vertices_in_patch - 1, VARYING_SLOT_VAR31,
|
||||
crosslane_vs_out_mask);
|
||||
}
|
||||
|
||||
/*
|
||||
* TCS out buffer layout, per-patch:
|
||||
*
|
||||
* float tess_level_outer[4];
|
||||
* float tess_level_inner[2];
|
||||
* vec4 patch_out[MAX_PATCH_OUTPUTS];
|
||||
* vec4 vtx_out[OUT_PATCH_SIZE][TOTAL_VERTEX_OUTPUTS];
|
||||
*
|
||||
* Vertex out are compacted based on the mask of written out. Patch
|
||||
* out are used as-is.
|
||||
*
|
||||
* Bounding boxes are ignored.
|
||||
*/
|
||||
static inline uint
|
||||
libagx_tcs_out_offs(uint vtx_id, gl_varying_slot location, uint nr_patch_out,
|
||||
uint out_patch_size, uint64_t vtx_out_mask)
|
||||
{
|
||||
uint off = 0;
|
||||
if (location == VARYING_SLOT_TESS_LEVEL_OUTER)
|
||||
return off;
|
||||
|
||||
off += 4 * sizeof(float);
|
||||
if (location == VARYING_SLOT_TESS_LEVEL_INNER)
|
||||
return off;
|
||||
|
||||
off += 2 * sizeof(float);
|
||||
if (location >= VARYING_SLOT_PATCH0)
|
||||
return off + (16 * (location - VARYING_SLOT_PATCH0));
|
||||
|
||||
/* Anything else is a per-vtx output */
|
||||
off += 16 * nr_patch_out;
|
||||
off += 16 * vtx_id * libagx_popcount(vtx_out_mask);
|
||||
|
||||
uint idx = libagx_popcount(vtx_out_mask & (((uint64_t)(1) << location) - 1));
|
||||
return off + (16 * idx);
|
||||
}
|
||||
|
||||
static inline uint
|
||||
libagx_tcs_out_stride(uint nr_patch_out, uint out_patch_size,
|
||||
uint64_t vtx_out_mask)
|
||||
{
|
||||
return libagx_tcs_out_offs(out_patch_size, VARYING_SLOT_VAR0, nr_patch_out,
|
||||
out_patch_size, vtx_out_mask);
|
||||
}
|
||||
|
||||
/* In a tess eval shader, stride for hw vertex ID */
|
||||
#define LIBAGX_TES_PATCH_ID_STRIDE 8192
|
||||
|
||||
#endif
|
||||
|
|
|
|||
92
src/asahi/lib/shaders/tessellation.cl
Normal file
92
src/asahi/lib/shaders/tessellation.cl
Normal file
|
|
@ -0,0 +1,92 @@
|
|||
/*
|
||||
* Copyright 2023 Alyssa Rosenzweig
|
||||
* SPDX-License-Identifier: MIT
|
||||
*/
|
||||
|
||||
#include "geometry.h"
|
||||
|
||||
uint
|
||||
libagx_tcs_patch_vertices_in(constant struct agx_tess_params *p)
|
||||
{
|
||||
return p->input_patch_size;
|
||||
}
|
||||
|
||||
uint
|
||||
libagx_tes_patch_vertices_in(constant struct agx_tess_params *p)
|
||||
{
|
||||
return p->output_patch_size;
|
||||
}
|
||||
|
||||
ushort
|
||||
libagx_tcs_in_offset(uint vtx, gl_varying_slot location,
|
||||
uint64_t crosslane_vs_out_mask)
|
||||
{
|
||||
return libagx_tcs_in_offs(vtx, location, crosslane_vs_out_mask);
|
||||
}
|
||||
|
||||
uintptr_t
|
||||
libagx_tcs_out_address(constant struct agx_tess_params *p, uint patch_id,
|
||||
uint vtx_id, gl_varying_slot location, uint nr_patch_out,
|
||||
uint out_patch_size, uint64_t vtx_out_mask)
|
||||
{
|
||||
uint stride =
|
||||
libagx_tcs_out_stride(nr_patch_out, out_patch_size, vtx_out_mask);
|
||||
|
||||
uint offs = libagx_tcs_out_offs(vtx_id, location, nr_patch_out,
|
||||
out_patch_size, vtx_out_mask);
|
||||
|
||||
return (uintptr_t)(p->tcs_buffer) + (patch_id * stride) + offs;
|
||||
}
|
||||
|
||||
static uint
|
||||
libagx_tes_unrolled_patch_id(uint raw_id)
|
||||
{
|
||||
return raw_id / LIBAGX_TES_PATCH_ID_STRIDE;
|
||||
}
|
||||
|
||||
uint
|
||||
libagx_tes_patch_id(constant struct agx_tess_params *p, uint raw_id)
|
||||
{
|
||||
return libagx_tes_unrolled_patch_id(raw_id) % p->patches_per_instance;
|
||||
}
|
||||
|
||||
static uint
|
||||
tes_vertex_id_in_patch(uint raw_id)
|
||||
{
|
||||
return raw_id % LIBAGX_TES_PATCH_ID_STRIDE;
|
||||
}
|
||||
|
||||
float2
|
||||
libagx_load_tess_coord(constant struct agx_tess_params *p, uint raw_id)
|
||||
{
|
||||
uint patch = libagx_tes_unrolled_patch_id(raw_id);
|
||||
uint vtx = tes_vertex_id_in_patch(raw_id);
|
||||
|
||||
return p->patch_coord_buffer[p->patch_coord_offs[patch] + vtx];
|
||||
}
|
||||
|
||||
uintptr_t
|
||||
libagx_tes_in_address(constant struct agx_tess_params *p, uint raw_id,
|
||||
uint vtx_id, gl_varying_slot location)
|
||||
{
|
||||
uint patch = libagx_tes_unrolled_patch_id(raw_id);
|
||||
|
||||
return libagx_tcs_out_address(p, patch, vtx_id, location,
|
||||
p->tcs_patch_constants, p->output_patch_size,
|
||||
p->tcs_per_vertex_outputs);
|
||||
}
|
||||
|
||||
float4
|
||||
libagx_tess_level_outer_default(constant struct agx_tess_params *p)
|
||||
{
|
||||
return (
|
||||
float4)(p->tess_level_outer_default[0], p->tess_level_outer_default[1],
|
||||
p->tess_level_outer_default[2], p->tess_level_outer_default[3]);
|
||||
}
|
||||
|
||||
float2
|
||||
libagx_tess_level_inner_default(constant struct agx_tess_params *p)
|
||||
{
|
||||
return (float2)(p->tess_level_inner_default[0],
|
||||
p->tess_level_inner_default[1]);
|
||||
}
|
||||
8
src/asahi/lib/shaders/tessellator.cl
Normal file
8
src/asahi/lib/shaders/tessellator.cl
Normal file
|
|
@ -0,0 +1,8 @@
|
|||
/*
|
||||
* Copyright 2023 Alyssa Rosenzweig
|
||||
* Copyright (c) Microsoft Corporation
|
||||
* SPDX-License-Identifier: MIT
|
||||
*/
|
||||
|
||||
#include "geometry.h"
|
||||
|
||||
|
|
@ -1899,6 +1899,9 @@ system_value("input_assembly_buffer_agx", 1, bit_sizes=[64])
|
|||
# Address of the parameter buffer for AGX geometry shaders
|
||||
system_value("geometry_param_buffer_agx", 1, bit_sizes=[64])
|
||||
|
||||
# Address of the parameter buffer for AGX tessellation shaders
|
||||
system_value("tess_param_buffer_agx", 1, bit_sizes=[64])
|
||||
|
||||
# Loads the vertex index within the current decomposed primitive. For a
|
||||
# triangle, this will be in [0, 2], where 2 is the last vertex. This is defined
|
||||
# only when the vertex shader is reinvoked for the same vertex in each
|
||||
|
|
|
|||
|
|
@ -323,6 +323,10 @@ agx_blitter_save(struct agx_context *ctx, struct blitter_context *blitter,
|
|||
util_blitter_save_vertex_elements(blitter, ctx->attributes);
|
||||
util_blitter_save_vertex_shader(blitter,
|
||||
ctx->stage[PIPE_SHADER_VERTEX].shader);
|
||||
util_blitter_save_tessctrl_shader(blitter,
|
||||
ctx->stage[PIPE_SHADER_TESS_CTRL].shader);
|
||||
util_blitter_save_tesseval_shader(blitter,
|
||||
ctx->stage[PIPE_SHADER_TESS_EVAL].shader);
|
||||
util_blitter_save_geometry_shader(blitter,
|
||||
ctx->stage[PIPE_SHADER_GEOMETRY].shader);
|
||||
util_blitter_save_rasterizer(blitter, ctx->rast);
|
||||
|
|
|
|||
|
|
@ -37,6 +37,8 @@ agx_disk_cache_compute_key(struct disk_cache *cache,
|
|||
key_size = sizeof(shader_key->vs);
|
||||
else if (uncompiled->type == PIPE_SHADER_GEOMETRY)
|
||||
key_size = sizeof(shader_key->gs);
|
||||
else if (uncompiled->type == PIPE_SHADER_TESS_CTRL)
|
||||
key_size = sizeof(shader_key->tcs);
|
||||
else if (uncompiled->type == PIPE_SHADER_FRAGMENT)
|
||||
key_size = sizeof(shader_key->fs);
|
||||
else if (uncompiled->type == PIPE_SHADER_COMPUTE)
|
||||
|
|
@ -68,8 +70,9 @@ agx_disk_cache_store(struct disk_cache *cache,
|
|||
if (!cache)
|
||||
return;
|
||||
|
||||
/* TODO: Support caching GS */
|
||||
if (uncompiled->type == PIPE_SHADER_GEOMETRY)
|
||||
/* TODO: Support caching GS/TCS */
|
||||
if (uncompiled->type == PIPE_SHADER_GEOMETRY ||
|
||||
uncompiled->type == PIPE_SHADER_TESS_CTRL)
|
||||
return;
|
||||
|
||||
assert(binary->bo->ptr.cpu != NULL && "shaders must be CPU mapped");
|
||||
|
|
|
|||
|
|
@ -11,6 +11,7 @@
|
|||
#include "nir_builder_opcodes.h"
|
||||
#include "nir_intrinsics.h"
|
||||
#include "nir_intrinsics_indices.h"
|
||||
#include "shader_enums.h"
|
||||
|
||||
#define AGX_TEXTURE_DESC_STRIDE 24
|
||||
|
||||
|
|
@ -89,8 +90,12 @@ load_sysval_indirect(nir_builder *b, unsigned dim, unsigned bitsize,
|
|||
static unsigned
|
||||
stage_table(nir_builder *b)
|
||||
{
|
||||
assert(b->shader->info.stage < PIPE_SHADER_TYPES);
|
||||
return AGX_SYSVAL_STAGE(b->shader->info.stage);
|
||||
gl_shader_stage stage = b->shader->info.stage;
|
||||
if (stage == MESA_SHADER_VERTEX && b->shader->info.vs.tes_agx)
|
||||
stage = MESA_SHADER_TESS_EVAL;
|
||||
|
||||
assert(stage < PIPE_SHADER_TYPES);
|
||||
return AGX_SYSVAL_STAGE(stage);
|
||||
}
|
||||
|
||||
static nir_def *
|
||||
|
|
@ -161,6 +166,8 @@ lower_intrinsic(nir_builder *b, nir_intrinsic_instr *intr,
|
|||
return load_sysval_root(b, 1, 64, &u->input_assembly);
|
||||
case nir_intrinsic_load_geometry_param_buffer_agx:
|
||||
return load_sysval_root(b, 1, 64, &u->geometry_params);
|
||||
case nir_intrinsic_load_tess_param_buffer_agx:
|
||||
return load_sysval_root(b, 1, 64, &u->tess_params);
|
||||
case nir_intrinsic_load_fixed_point_size_agx:
|
||||
return load_sysval_root(b, 1, 32, &u->fixed_point_size);
|
||||
case nir_intrinsic_load_tex_sprite_mask_agx:
|
||||
|
|
|
|||
|
|
@ -46,6 +46,7 @@
|
|||
#include "agx_public.h"
|
||||
#include "agx_state.h"
|
||||
#include "agx_tilebuffer.h"
|
||||
#include "shader_enums.h"
|
||||
|
||||
/* Fake values, pending UAPI upstreaming */
|
||||
#ifndef DRM_FORMAT_MOD_APPLE_TWIDDLED
|
||||
|
|
@ -1533,6 +1534,7 @@ agx_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
|
|||
case PIPE_CAP_FS_FINE_DERIVATIVE:
|
||||
case PIPE_CAP_CULL_DISTANCE_NOCOMBINE:
|
||||
case PIPE_CAP_NIR_COMPACT_ARRAYS:
|
||||
case PIPE_CAP_GLSL_TESS_LEVELS_AS_INPUTS:
|
||||
return 1;
|
||||
|
||||
case PIPE_CAP_CLIP_HALFZ:
|
||||
|
|
@ -1699,7 +1701,10 @@ agx_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
|
|||
case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT:
|
||||
return 4;
|
||||
|
||||
case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS:
|
||||
return 32;
|
||||
case PIPE_CAP_MAX_VARYINGS:
|
||||
/* TODO: Probably should bump to 32? */
|
||||
return 16;
|
||||
|
||||
case PIPE_CAP_FLATSHADE:
|
||||
|
|
@ -1726,7 +1731,8 @@ agx_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
|
|||
BITFIELD_BIT(MESA_PRIM_LINES_ADJACENCY) |
|
||||
BITFIELD_BIT(MESA_PRIM_LINE_STRIP_ADJACENCY) |
|
||||
BITFIELD_BIT(MESA_PRIM_TRIANGLES_ADJACENCY) |
|
||||
BITFIELD_BIT(MESA_PRIM_TRIANGLE_STRIP_ADJACENCY);
|
||||
BITFIELD_BIT(MESA_PRIM_TRIANGLE_STRIP_ADJACENCY) |
|
||||
BITFIELD_BIT(MESA_PRIM_PATCHES);
|
||||
|
||||
case PIPE_CAP_MAP_UNSYNCHRONIZED_THREAD_SAFE:
|
||||
return 1;
|
||||
|
|
@ -1790,6 +1796,8 @@ agx_get_shader_param(struct pipe_screen *pscreen, enum pipe_shader_type shader,
|
|||
case PIPE_SHADER_COMPUTE:
|
||||
case PIPE_SHADER_GEOMETRY:
|
||||
break;
|
||||
case PIPE_SHADER_TESS_CTRL:
|
||||
case PIPE_SHADER_TESS_EVAL:
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
|
|
@ -1836,7 +1844,11 @@ agx_get_shader_param(struct pipe_screen *pscreen, enum pipe_shader_type shader,
|
|||
return 1;
|
||||
|
||||
case PIPE_SHADER_CAP_INDIRECT_INPUT_ADDR:
|
||||
return shader == PIPE_SHADER_TESS_CTRL || shader == PIPE_SHADER_TESS_EVAL;
|
||||
|
||||
case PIPE_SHADER_CAP_INDIRECT_OUTPUT_ADDR:
|
||||
return shader == PIPE_SHADER_TESS_CTRL;
|
||||
|
||||
case PIPE_SHADER_CAP_INDIRECT_TEMP_ADDR:
|
||||
case PIPE_SHADER_CAP_SUBROUTINES:
|
||||
case PIPE_SHADER_CAP_TGSI_SQRT_SUPPORTED:
|
||||
|
|
|
|||
|
|
@ -31,6 +31,8 @@
|
|||
#include "pipe/p_defines.h"
|
||||
#include "pipe/p_screen.h"
|
||||
#include "pipe/p_state.h"
|
||||
#include "tessellator/p_tessellator.h"
|
||||
#include "util/bitscan.h"
|
||||
#include "util/bitset.h"
|
||||
#include "util/blend.h"
|
||||
#include "util/blob.h"
|
||||
|
|
@ -47,6 +49,7 @@
|
|||
#include "util/u_resource.h"
|
||||
#include "util/u_transfer.h"
|
||||
#include "util/u_upload_mgr.h"
|
||||
#include "agx_bo.h"
|
||||
#include "agx_device.h"
|
||||
#include "agx_disk_cache.h"
|
||||
#include "agx_nir_lower_gs.h"
|
||||
|
|
@ -185,6 +188,24 @@ agx_set_blend_color(struct pipe_context *pctx,
|
|||
ctx->dirty |= AGX_DIRTY_BLEND_COLOR;
|
||||
}
|
||||
|
||||
static void
|
||||
agx_set_patch_vertices(struct pipe_context *pctx, unsigned char n)
|
||||
{
|
||||
struct agx_context *ctx = agx_context(pctx);
|
||||
ctx->patch_vertices = n;
|
||||
}
|
||||
|
||||
static void
|
||||
agx_set_tess_state(struct pipe_context *pctx,
|
||||
const float default_outer_level[4],
|
||||
const float default_inner_level[2])
|
||||
{
|
||||
struct agx_context *ctx = agx_context(pctx);
|
||||
|
||||
memcpy(ctx->default_outer_level, default_outer_level, 4 * sizeof(float));
|
||||
memcpy(ctx->default_inner_level, default_inner_level, 2 * sizeof(float));
|
||||
}
|
||||
|
||||
static void *
|
||||
agx_create_blend_state(struct pipe_context *ctx,
|
||||
const struct pipe_blend_state *state)
|
||||
|
|
@ -586,6 +607,7 @@ static enum pipe_shader_type
|
|||
merged_stage(struct agx_context *ctx, enum pipe_shader_type stage)
|
||||
{
|
||||
switch (stage) {
|
||||
case MESA_SHADER_VERTEX:
|
||||
case MESA_SHADER_GEOMETRY:
|
||||
return ctx->stage[PIPE_SHADER_TESS_EVAL].shader ? MESA_SHADER_TESS_EVAL
|
||||
: MESA_SHADER_VERTEX;
|
||||
|
|
@ -1504,6 +1526,18 @@ asahi_fs_shader_key_equal(const void *a, const void *b)
|
|||
return memcmp(a, b, sizeof(struct asahi_fs_shader_key)) == 0;
|
||||
}
|
||||
|
||||
static uint32_t
|
||||
asahi_tcs_shader_key_hash(const void *key)
|
||||
{
|
||||
return _mesa_hash_data(key, sizeof(struct asahi_tcs_shader_key));
|
||||
}
|
||||
|
||||
static bool
|
||||
asahi_tcs_shader_key_equal(const void *a, const void *b)
|
||||
{
|
||||
return memcmp(a, b, sizeof(struct asahi_tcs_shader_key)) == 0;
|
||||
}
|
||||
|
||||
/* No compute variants */
|
||||
static uint32_t
|
||||
asahi_cs_shader_key_hash(const void *key)
|
||||
|
|
@ -1837,6 +1871,22 @@ agx_compile_variant(struct agx_device *dev, struct pipe_context *pctx,
|
|||
NIR_PASS(_, nir, nir_shader_intrinsics_pass, agx_nir_lower_clip_m1_1,
|
||||
nir_metadata_block_index | nir_metadata_dominance, NULL);
|
||||
}
|
||||
} else if (nir->info.stage == MESA_SHADER_TESS_CTRL) {
|
||||
struct asahi_tcs_shader_key *key = &key_->tcs;
|
||||
|
||||
/* TODO: Deduplicate this logic from the GS case! */
|
||||
struct blob_reader vs_reader;
|
||||
blob_reader_init(&vs_reader, linked_so->serialized_nir.data,
|
||||
linked_so->serialized_nir.size);
|
||||
nir_shader *vs = nir_deserialize(NULL, &agx_nir_options, &vs_reader);
|
||||
|
||||
/* Apply the VS key to the VS before linking it in */
|
||||
NIR_PASS_V(vs, agx_nir_lower_vbo, key->attribs);
|
||||
NIR_PASS_V(vs, nir_lower_io_to_scalar, nir_var_shader_out, NULL, NULL);
|
||||
NIR_PASS_V(vs, agx_nir_lower_sysvals, false);
|
||||
|
||||
NIR_PASS_V(nir, agx_nir_lower_tcs, vs, dev->libagx, key->index_size_B);
|
||||
ralloc_free(vs);
|
||||
} else if (nir->info.stage == MESA_SHADER_GEOMETRY) {
|
||||
struct asahi_gs_shader_key *key = &key_->gs;
|
||||
|
||||
|
|
@ -2040,6 +2090,8 @@ agx_get_shader_variant(struct agx_screen *screen, struct pipe_context *pctx,
|
|||
memcpy(cloned_key, key, sizeof(struct asahi_vs_shader_key));
|
||||
} else if (so->type == PIPE_SHADER_GEOMETRY) {
|
||||
memcpy(cloned_key, key, sizeof(struct asahi_gs_shader_key));
|
||||
} else if (so->type == PIPE_SHADER_TESS_CTRL) {
|
||||
memcpy(cloned_key, key, sizeof(struct asahi_tcs_shader_key));
|
||||
} else {
|
||||
assert(gl_shader_stage_is_compute(so->type));
|
||||
/* No key */
|
||||
|
|
@ -2057,8 +2109,6 @@ agx_shader_initialize(struct agx_device *dev, struct agx_uncompiled_shader *so,
|
|||
if (nir->info.stage == MESA_SHADER_KERNEL)
|
||||
nir->info.stage = MESA_SHADER_COMPUTE;
|
||||
|
||||
so->type = pipe_shader_type_from_mesa(nir->info.stage);
|
||||
|
||||
blob_init(&so->early_serialized_nir);
|
||||
nir_serialize(&so->early_serialized_nir, nir, true);
|
||||
|
||||
|
|
@ -2103,11 +2153,16 @@ agx_shader_initialize(struct agx_device *dev, struct agx_uncompiled_shader *so,
|
|||
nir_metadata_block_index | nir_metadata_dominance, NULL);
|
||||
}
|
||||
|
||||
if (nir->info.stage == MESA_SHADER_TESS_EVAL) {
|
||||
NIR_PASS(_, nir, agx_nir_lower_tes, dev->libagx);
|
||||
}
|
||||
|
||||
blob_init(&so->serialized_nir);
|
||||
nir_serialize(&so->serialized_nir, nir, true);
|
||||
_mesa_sha1_compute(so->serialized_nir.data, so->serialized_nir.size,
|
||||
so->nir_sha1);
|
||||
|
||||
so->type = pipe_shader_type_from_mesa(nir->info.stage);
|
||||
so->has_xfb_info = (nir->xfb_info != NULL);
|
||||
|
||||
static_assert(
|
||||
|
|
@ -2147,11 +2202,34 @@ agx_create_shader_state(struct pipe_context *pctx,
|
|||
} else if (nir->info.stage == MESA_SHADER_GEOMETRY) {
|
||||
so->variants = _mesa_hash_table_create(NULL, asahi_gs_shader_key_hash,
|
||||
asahi_gs_shader_key_equal);
|
||||
|
||||
} else if (nir->info.stage == MESA_SHADER_TESS_EVAL) {
|
||||
/* No variants */
|
||||
so->variants = _mesa_hash_table_create(NULL, asahi_cs_shader_key_hash,
|
||||
asahi_cs_shader_key_equal);
|
||||
} else if (nir->info.stage == MESA_SHADER_TESS_CTRL) {
|
||||
so->variants = _mesa_hash_table_create(NULL, asahi_tcs_shader_key_hash,
|
||||
asahi_tcs_shader_key_equal);
|
||||
} else {
|
||||
so->variants = _mesa_hash_table_create(so, asahi_fs_shader_key_hash,
|
||||
asahi_fs_shader_key_equal);
|
||||
}
|
||||
|
||||
if (nir->info.stage == MESA_SHADER_TESS_EVAL ||
|
||||
nir->info.stage == MESA_SHADER_TESS_CTRL) {
|
||||
|
||||
so->tess.ccw = nir->info.tess.ccw;
|
||||
so->tess.point_mode = nir->info.tess.point_mode;
|
||||
so->tess.spacing = nir->info.tess.spacing;
|
||||
so->tess.output_patch_size = nir->info.tess.tcs_vertices_out;
|
||||
so->tess.primitive = nir->info.tess._primitive_mode;
|
||||
so->tess.per_vertex_outputs = agx_tcs_per_vertex_outputs(nir);
|
||||
so->tess.nr_patch_outputs =
|
||||
util_last_bit(nir->info.patch_outputs_written);
|
||||
if (nir->info.stage == MESA_SHADER_TESS_CTRL)
|
||||
so->tess.output_stride = agx_tcs_output_stride(nir);
|
||||
}
|
||||
|
||||
agx_shader_initialize(dev, so, nir, ctx->support_lod_bias);
|
||||
|
||||
/* We're done with the NIR, throw it away */
|
||||
|
|
@ -2178,7 +2256,9 @@ agx_create_shader_state(struct pipe_context *pctx,
|
|||
}
|
||||
|
||||
case PIPE_SHADER_GEOMETRY:
|
||||
/* TODO: Geometry shaders with shader-db */
|
||||
case PIPE_SHADER_TESS_CTRL:
|
||||
case PIPE_SHADER_TESS_EVAL:
|
||||
/* TODO: Geometry/tessellation shaders with shader-db */
|
||||
return so;
|
||||
|
||||
case PIPE_SHADER_FRAGMENT:
|
||||
|
|
@ -2276,7 +2356,7 @@ agx_update_shader(struct agx_context *ctx, struct agx_compiled_shader **out,
|
|||
}
|
||||
|
||||
struct agx_uncompiled_shader *linked_so = NULL;
|
||||
if (stage == PIPE_SHADER_GEOMETRY)
|
||||
if (stage == PIPE_SHADER_TESS_CTRL || stage == PIPE_SHADER_GEOMETRY)
|
||||
linked_so = ctx->stage[PIPE_SHADER_VERTEX].shader;
|
||||
|
||||
struct agx_screen *screen = agx_screen(ctx->base.screen);
|
||||
|
|
@ -2351,6 +2431,30 @@ translate_ia_mode(enum mesa_prim prim)
|
|||
}
|
||||
}
|
||||
|
||||
static bool
|
||||
agx_update_tcs(struct agx_context *ctx, const struct pipe_draw_info *info)
|
||||
{
|
||||
assert(info->mode == MESA_PRIM_PATCHES);
|
||||
|
||||
/* We don't bother to dirty track yet, update! */
|
||||
struct asahi_tcs_shader_key key = {
|
||||
.index_size_B = info->index_size,
|
||||
};
|
||||
|
||||
memcpy(key.attribs, ctx->attributes,
|
||||
sizeof(key.attribs[0]) * AGX_MAX_ATTRIBS);
|
||||
|
||||
static_assert(sizeof(key.input_nir_sha1) ==
|
||||
sizeof(ctx->stage[PIPE_SHADER_VERTEX].shader->nir_sha1),
|
||||
"common size for shader sha-1");
|
||||
|
||||
memcpy(key.input_nir_sha1, ctx->stage[PIPE_SHADER_VERTEX].shader->nir_sha1,
|
||||
sizeof(key.input_nir_sha1));
|
||||
|
||||
return agx_update_shader(ctx, &ctx->tcs, PIPE_SHADER_TESS_CTRL,
|
||||
(union asahi_shader_key *)&key);
|
||||
}
|
||||
|
||||
/*
|
||||
* Triangle strips and fans are rotated based on the provoking vertex, but other
|
||||
* primitive types are not and do not need to know the provoking vertex.
|
||||
|
|
@ -2487,6 +2591,18 @@ agx_bind_gs_state(struct pipe_context *pctx, void *cso)
|
|||
agx_bind_shader_state(pctx, cso, PIPE_SHADER_GEOMETRY);
|
||||
}
|
||||
|
||||
static void
|
||||
agx_bind_tcs_state(struct pipe_context *pctx, void *cso)
|
||||
{
|
||||
agx_bind_shader_state(pctx, cso, PIPE_SHADER_TESS_CTRL);
|
||||
}
|
||||
|
||||
static void
|
||||
agx_bind_tes_state(struct pipe_context *pctx, void *cso)
|
||||
{
|
||||
agx_bind_shader_state(pctx, cso, PIPE_SHADER_TESS_EVAL);
|
||||
}
|
||||
|
||||
static void
|
||||
agx_bind_cs_state(struct pipe_context *pctx, void *cso)
|
||||
{
|
||||
|
|
@ -2850,7 +2966,7 @@ agx_build_pipeline(struct agx_batch *batch, struct agx_compiled_shader *cs,
|
|||
|
||||
if (stage == PIPE_SHADER_FRAGMENT) {
|
||||
agx_usc_tilebuffer(&b, &batch->tilebuffer_layout);
|
||||
} else if (stage == PIPE_SHADER_COMPUTE) {
|
||||
} else if (stage == PIPE_SHADER_COMPUTE || stage == PIPE_SHADER_TESS_CTRL) {
|
||||
unsigned size = cs->info.local_size + variable_shared_mem;
|
||||
|
||||
agx_usc_pack(&b, SHARED, cfg) {
|
||||
|
|
@ -4185,6 +4301,291 @@ util_draw_multi_upload_indirect(struct pipe_context *pctx,
|
|||
pctx->draw_vbo(pctx, info, 0, &indirect_, draws, 1);
|
||||
}
|
||||
|
||||
static void
|
||||
agx_upload_draw_params(struct agx_batch *batch,
|
||||
const struct pipe_draw_indirect_info *indirect,
|
||||
const struct pipe_draw_start_count_bias *draws,
|
||||
const struct pipe_draw_info *info)
|
||||
{
|
||||
if (indirect) {
|
||||
struct agx_resource *indirect_rsrc = agx_resource(indirect->buffer);
|
||||
uint64_t address = indirect_rsrc->bo->ptr.gpu + indirect->offset;
|
||||
agx_batch_reads(batch, indirect_rsrc);
|
||||
|
||||
/* To implement draw parameters, we use the last 2 words of the
|
||||
* indirect draw descriptor. Offset by 3 words for indexed draw (5
|
||||
* total) and 2 words for non-indexed (4 total). See the layouts of
|
||||
* indexed vs non-indexed draw descriptors.
|
||||
*
|
||||
* This gives us a consistent layout
|
||||
*
|
||||
* uint32_t first_vertex;
|
||||
* uint32_t base_instance;
|
||||
*
|
||||
* and we can implement load_first_vertex & load_base_instance without
|
||||
* checking for indexing.
|
||||
*/
|
||||
uint32_t offset = info->index_size ? 3 : 2;
|
||||
batch->uniforms.tables[AGX_SYSVAL_TABLE_PARAMS] = address + offset * 4;
|
||||
} else {
|
||||
/* Upload just those two words. */
|
||||
uint32_t params[2] = {
|
||||
info->index_size ? draws->index_bias : draws->start,
|
||||
info->start_instance,
|
||||
};
|
||||
|
||||
batch->uniforms.tables[AGX_SYSVAL_TABLE_PARAMS] =
|
||||
agx_pool_upload_aligned(&batch->pool, params, sizeof(params), 4);
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
agx_draw_patches(struct agx_context *ctx, const struct pipe_draw_info *info,
|
||||
unsigned drawid_offset,
|
||||
const struct pipe_draw_indirect_info *indirect,
|
||||
const struct pipe_draw_start_count_bias *draws,
|
||||
unsigned num_draws)
|
||||
{
|
||||
struct agx_device *dev = agx_device(ctx->base.screen);
|
||||
perf_debug(dev, "Tessellation");
|
||||
|
||||
struct agx_uncompiled_shader *tcs = ctx->stage[MESA_SHADER_TESS_CTRL].shader;
|
||||
struct agx_uncompiled_shader *tes = ctx->stage[MESA_SHADER_TESS_EVAL].shader;
|
||||
|
||||
assert(tes != NULL && "required with patches");
|
||||
|
||||
unsigned patch_vertices = ctx->patch_vertices;
|
||||
|
||||
/* OpenGL allows omitting the tcs, fill in a passthrough program if needed.
|
||||
* In principle, we could optimize this case, but I don't think it matters.
|
||||
*/
|
||||
bool unbind_tcs_when_done = false;
|
||||
if (!tcs) {
|
||||
struct agx_uncompiled_shader *vs = ctx->stage[MESA_SHADER_VERTEX].shader;
|
||||
|
||||
assert(patch_vertices >= 1 &&
|
||||
patch_vertices <= ARRAY_SIZE(vs->passthrough_tcs));
|
||||
|
||||
if (!vs->passthrough_tcs[patch_vertices - 1]) {
|
||||
struct blob_reader reader;
|
||||
blob_reader_init(&reader, vs->early_serialized_nir.data,
|
||||
vs->early_serialized_nir.size);
|
||||
nir_shader *vs_nir = nir_deserialize(NULL, &agx_nir_options, &reader);
|
||||
nir_shader *nir = nir_create_passthrough_tcs(&agx_nir_options, vs_nir,
|
||||
patch_vertices);
|
||||
ralloc_free(vs_nir);
|
||||
|
||||
/* Lower the tess level sysvals and gather info, since mesa/st won't do
|
||||
* either for us.
|
||||
*/
|
||||
NIR_PASS(_, nir, nir_lower_system_values);
|
||||
|
||||
nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir));
|
||||
|
||||
vs->passthrough_tcs[patch_vertices - 1] =
|
||||
pipe_shader_from_nir(&ctx->base, nir);
|
||||
}
|
||||
|
||||
tcs = vs->passthrough_tcs[patch_vertices - 1];
|
||||
ctx->base.bind_tcs_state(&ctx->base, tcs);
|
||||
unbind_tcs_when_done = true;
|
||||
}
|
||||
|
||||
unsigned in_vertices = draws->count;
|
||||
unsigned in_patches = in_vertices / patch_vertices;
|
||||
|
||||
if (in_patches == 0)
|
||||
return;
|
||||
|
||||
struct agx_batch *batch = agx_get_compute_batch(ctx);
|
||||
agx_batch_init_state(batch);
|
||||
|
||||
struct pipe_resource *heap =
|
||||
pipe_buffer_create(ctx->base.screen, PIPE_BIND_GLOBAL, PIPE_USAGE_DEFAULT,
|
||||
1024 * 1024 * 128);
|
||||
|
||||
uint64_t heap_gpu = agx_resource(heap)->bo->ptr.gpu;
|
||||
uint8_t *heap_cpu = agx_resource(heap)->bo->ptr.cpu;
|
||||
|
||||
unsigned unrolled_patch_count = in_patches * info->instance_count;
|
||||
|
||||
uint32_t heap_water = 0;
|
||||
uint32_t tcs_out_offs = heap_water;
|
||||
heap_water += ALIGN(unrolled_patch_count * tcs->tess.output_stride, 4);
|
||||
|
||||
agx_batch_writes(batch, agx_resource(heap), 0);
|
||||
|
||||
uint64_t ib = 0;
|
||||
size_t ib_extent = 0;
|
||||
|
||||
if (info->index_size)
|
||||
ib = agx_index_buffer_ptr(batch, info, draws, &ib_extent);
|
||||
|
||||
agx_upload_ia_params(batch, info, indirect, ib, ib_extent, 0);
|
||||
agx_upload_draw_params(batch, indirect, draws, info);
|
||||
|
||||
/* Setup parameters */
|
||||
struct agx_tess_params tess_params = {
|
||||
.tcs_buffer = heap_gpu + tcs_out_offs,
|
||||
.input_patch_size = patch_vertices,
|
||||
.output_patch_size = tcs->tess.output_patch_size,
|
||||
.tcs_patch_constants = tcs->tess.nr_patch_outputs,
|
||||
.tcs_per_vertex_outputs = tcs->tess.per_vertex_outputs,
|
||||
.patch_coord_buffer = heap_gpu,
|
||||
.patches_per_instance = in_patches,
|
||||
};
|
||||
|
||||
memcpy(&tess_params.tess_level_outer_default, ctx->default_outer_level,
|
||||
sizeof(ctx->default_outer_level));
|
||||
memcpy(&tess_params.tess_level_inner_default, ctx->default_inner_level,
|
||||
sizeof(ctx->default_inner_level));
|
||||
|
||||
batch->uniforms.tess_params =
|
||||
agx_pool_upload(&batch->pool, &tess_params, sizeof(tess_params));
|
||||
|
||||
/* Run VS+TCS as compute */
|
||||
agx_upload_vbos(batch);
|
||||
agx_update_vs(ctx);
|
||||
agx_update_tcs(ctx, info);
|
||||
/* XXX */
|
||||
ctx->stage[PIPE_SHADER_TESS_CTRL].dirty = ~0;
|
||||
ctx->stage[PIPE_SHADER_TESS_EVAL].dirty = ~0;
|
||||
agx_update_descriptors(batch, ctx->vs, PIPE_SHADER_VERTEX);
|
||||
agx_update_descriptors(batch, ctx->tcs, PIPE_SHADER_TESS_CTRL);
|
||||
|
||||
struct pipe_grid_info tcs_grid = {
|
||||
.block = {MAX2(patch_vertices, tcs->tess.output_patch_size), 1, 1},
|
||||
.grid = {in_patches, info->instance_count, 1},
|
||||
/* XXX */
|
||||
.variable_shared_mem = 32768,
|
||||
};
|
||||
|
||||
agx_launch(batch, &tcs_grid, ctx->tcs, PIPE_SHADER_TESS_CTRL);
|
||||
|
||||
agx_flush_all(ctx, "HACK");
|
||||
agx_sync_all(ctx, "HACK");
|
||||
|
||||
/* Setup batch */
|
||||
batch = agx_get_batch(ctx);
|
||||
|
||||
enum tess_primitive_mode mode =
|
||||
MAX2(tcs->tess.primitive, tes->tess.primitive);
|
||||
enum gl_tess_spacing spacing = MAX2(tcs->tess.spacing, tes->tess.spacing);
|
||||
|
||||
enum pipe_tess_spacing pspacing = spacing == TESS_SPACING_EQUAL
|
||||
? PIPE_TESS_SPACING_EQUAL
|
||||
: spacing == TESS_SPACING_FRACTIONAL_ODD
|
||||
? PIPE_TESS_SPACING_FRACTIONAL_ODD
|
||||
: PIPE_TESS_SPACING_FRACTIONAL_EVEN;
|
||||
|
||||
bool point_mode = MAX2(tcs->tess.point_mode, tes->tess.point_mode);
|
||||
enum mesa_prim in_prim = mode == TESS_PRIMITIVE_ISOLINES ? MESA_PRIM_LINES
|
||||
: mode == TESS_PRIMITIVE_QUADS
|
||||
? MESA_PRIM_QUADS
|
||||
: MESA_PRIM_TRIANGLES;
|
||||
enum mesa_prim out_prim = point_mode ? MESA_PRIM_POINTS
|
||||
: mode == TESS_PRIMITIVE_ISOLINES
|
||||
? MESA_PRIM_LINES
|
||||
: MESA_PRIM_TRIANGLES;
|
||||
|
||||
struct pipe_tessellator *tess =
|
||||
p_tess_init(in_prim, pspacing, tes->tess.ccw, point_mode);
|
||||
|
||||
struct pipe_tessellator_data data = {0};
|
||||
|
||||
/* Mem allocate */
|
||||
uint32_t patch_coord_offs_offs = heap_water;
|
||||
tess_params.patch_coord_offs = heap_gpu + heap_water;
|
||||
heap_water += align(4 * unrolled_patch_count, 4);
|
||||
|
||||
uint32_t draws_off = heap_water;
|
||||
uint32_t *patch_draws = (uint32_t *)(heap_cpu + heap_water);
|
||||
heap_water += align(sizeof(uint32_t) * 5 * unrolled_patch_count, 4);
|
||||
|
||||
uint32_t *patch_offs = (uint32_t *)(heap_cpu + patch_coord_offs_offs);
|
||||
|
||||
for (unsigned patch = 0; patch < unrolled_patch_count; ++patch) {
|
||||
float *addr =
|
||||
(float *)(heap_cpu + tcs_out_offs + tcs->tess.output_stride * patch);
|
||||
|
||||
struct pipe_tessellation_factors factors = {
|
||||
.outer_tf = {addr[0], addr[1], addr[2], addr[3]},
|
||||
.inner_tf = {addr[4], addr[5]},
|
||||
};
|
||||
p_tessellate(tess, &factors, &data);
|
||||
|
||||
/* Mem allocate indices */
|
||||
uint32_t index_off = heap_water;
|
||||
uint16_t *indices = (uint16_t *)(heap_cpu + heap_water);
|
||||
heap_water += align(sizeof(*indices) * data.num_indices, 4);
|
||||
|
||||
for (unsigned idx = 0; idx < data.num_indices; ++idx) {
|
||||
indices[idx] = data.indices[idx];
|
||||
}
|
||||
|
||||
/* Mem allocate patch coords */
|
||||
heap_water = align(heap_water, 8);
|
||||
patch_offs[patch] = heap_water / 8;
|
||||
float *patch_coords = (float *)(heap_cpu + heap_water);
|
||||
heap_water += align(8 * data.num_domain_points, 4);
|
||||
|
||||
for (unsigned p = 0; p < data.num_domain_points; ++p) {
|
||||
patch_coords[2 * p + 0] = data.domain_points_u[p];
|
||||
patch_coords[2 * p + 1] = data.domain_points_v[p];
|
||||
}
|
||||
assert(data.num_indices < 32768);
|
||||
assert(data.num_domain_points < 8192);
|
||||
|
||||
/* Generate a draw for the patch */
|
||||
uint32_t *desc = patch_draws + (patch * 5);
|
||||
|
||||
desc[0] = data.num_indices; /* count */
|
||||
desc[1] = 1; /* instance_count */
|
||||
desc[2] = index_off / sizeof(*indices); /* start */
|
||||
desc[3] = patch * LIBAGX_TES_PATCH_ID_STRIDE; /* index_bias */
|
||||
desc[4] = 0; /* start_instance */
|
||||
}
|
||||
p_tess_destroy(tess);
|
||||
|
||||
/* Run TES as VS */
|
||||
agx_batch_init_state(batch);
|
||||
void *vs_cso = ctx->stage[PIPE_SHADER_VERTEX].shader;
|
||||
ctx->base.bind_vs_state(&ctx->base,
|
||||
ctx->stage[PIPE_SHADER_TESS_EVAL].shader);
|
||||
agx_update_vs(ctx);
|
||||
agx_update_descriptors(batch, ctx->vs, PIPE_SHADER_TESS_EVAL);
|
||||
|
||||
struct pipe_draw_info draw_info = {
|
||||
.mode = out_prim,
|
||||
.index_size = 2,
|
||||
.index.resource = heap,
|
||||
.instance_count = 1,
|
||||
.view_mask = info->view_mask,
|
||||
};
|
||||
|
||||
/* Wrap the pool allocation in a fake resource for meta-Gallium use */
|
||||
struct pipe_draw_indirect_info copy_indirect = {
|
||||
.buffer = heap,
|
||||
.offset = draws_off,
|
||||
.stride = 5 * sizeof(uint32_t),
|
||||
.draw_count = in_patches * info->instance_count,
|
||||
};
|
||||
|
||||
batch->uniforms.tess_params =
|
||||
agx_pool_upload(&batch->pool, &tess_params, sizeof(tess_params));
|
||||
|
||||
ctx->base.draw_vbo(&ctx->base, &draw_info, 0, ©_indirect, NULL, 1);
|
||||
|
||||
/* Restore vertex state */
|
||||
ctx->base.bind_vs_state(&ctx->base, vs_cso);
|
||||
|
||||
pipe_resource_reference(&heap, NULL);
|
||||
|
||||
if (unbind_tcs_when_done) {
|
||||
ctx->base.bind_tcs_state(&ctx->base, NULL);
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
agx_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info,
|
||||
unsigned drawid_offset,
|
||||
|
|
@ -4205,7 +4606,7 @@ agx_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info,
|
|||
assert(drawid_offset == 0);
|
||||
assert(num_draws == 1);
|
||||
|
||||
util_draw_multi_upload_indirect(pctx, info, indirect, draws);
|
||||
util_draw_multi_unroll_indirect(pctx, info, indirect, draws);
|
||||
return;
|
||||
}
|
||||
|
||||
|
|
@ -4214,6 +4615,17 @@ agx_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info,
|
|||
return;
|
||||
}
|
||||
|
||||
/* TODO: stop cheating */
|
||||
if (info->mode == MESA_PRIM_PATCHES && indirect) {
|
||||
perf_debug_ctx(ctx, "indirect tessellation");
|
||||
util_draw_indirect(pctx, info, indirect);
|
||||
}
|
||||
|
||||
if (info->mode == MESA_PRIM_PATCHES) {
|
||||
agx_draw_patches(ctx, info, drawid_offset, indirect, draws, num_draws);
|
||||
return;
|
||||
}
|
||||
|
||||
if (agx_needs_passthrough_gs(ctx, info, indirect)) {
|
||||
agx_apply_passthrough_gs(ctx, info, drawid_offset, indirect, draws,
|
||||
num_draws);
|
||||
|
|
@ -4330,39 +4742,9 @@ agx_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info,
|
|||
enum mesa_prim mode = info->mode;
|
||||
|
||||
if (ctx->vs->info.uses_base_param || ctx->gs) {
|
||||
agx_upload_draw_params(batch, indirect, draws, info);
|
||||
|
||||
batch->uniforms.is_indexed_draw = (idx_size > 0);
|
||||
|
||||
if (indirect) {
|
||||
struct agx_resource *indirect_rsrc = agx_resource(indirect->buffer);
|
||||
uint64_t address = indirect_rsrc->bo->ptr.gpu + indirect->offset;
|
||||
agx_batch_reads(batch, indirect_rsrc);
|
||||
|
||||
/* To implement draw parameters, we use the last 2 words of the
|
||||
* indirect draw descriptor. Offset by 3 words for indexed draw (5
|
||||
* total) and 2 words for non-indexed (4 total). See the layouts of
|
||||
* indexed vs non-indexed draw descriptors.
|
||||
*
|
||||
* This gives us a consistent layout
|
||||
*
|
||||
* uint32_t first_vertex;
|
||||
* uint32_t base_instance;
|
||||
*
|
||||
* and we can implement load_first_vertex & load_base_instance without
|
||||
* checking for indexing.
|
||||
*/
|
||||
uint32_t offset = idx_size ? 3 : 2;
|
||||
batch->uniforms.tables[AGX_SYSVAL_TABLE_PARAMS] = address + offset * 4;
|
||||
} else {
|
||||
/* Upload just those two words. */
|
||||
uint32_t params[2] = {
|
||||
idx_size ? draws->index_bias : draws->start,
|
||||
info->start_instance,
|
||||
};
|
||||
|
||||
batch->uniforms.tables[AGX_SYSVAL_TABLE_PARAMS] =
|
||||
agx_pool_upload_aligned(&batch->pool, params, sizeof(params), 4);
|
||||
}
|
||||
|
||||
ctx->dirty |= AGX_DIRTY_VS;
|
||||
}
|
||||
|
||||
|
|
@ -4774,6 +5156,8 @@ agx_init_state_functions(struct pipe_context *ctx)
|
|||
ctx->create_vertex_elements_state = agx_create_vertex_elements;
|
||||
ctx->create_vs_state = agx_create_shader_state;
|
||||
ctx->create_gs_state = agx_create_shader_state;
|
||||
ctx->create_tcs_state = agx_create_shader_state;
|
||||
ctx->create_tes_state = agx_create_shader_state;
|
||||
ctx->create_compute_state = agx_create_compute_state;
|
||||
ctx->bind_blend_state = agx_bind_blend_state;
|
||||
ctx->bind_depth_stencil_alpha_state = agx_bind_zsa_state;
|
||||
|
|
@ -4783,6 +5167,8 @@ agx_init_state_functions(struct pipe_context *ctx)
|
|||
ctx->bind_vertex_elements_state = agx_bind_vertex_elements_state;
|
||||
ctx->bind_vs_state = agx_bind_vs_state;
|
||||
ctx->bind_gs_state = agx_bind_gs_state;
|
||||
ctx->bind_tcs_state = agx_bind_tcs_state;
|
||||
ctx->bind_tes_state = agx_bind_tes_state;
|
||||
ctx->bind_compute_state = agx_bind_cs_state;
|
||||
ctx->delete_blend_state = agx_delete_state;
|
||||
ctx->delete_depth_stencil_alpha_state = agx_delete_state;
|
||||
|
|
@ -4793,6 +5179,8 @@ agx_init_state_functions(struct pipe_context *ctx)
|
|||
ctx->delete_vertex_elements_state = agx_delete_state;
|
||||
ctx->delete_vs_state = agx_delete_shader_state;
|
||||
ctx->delete_gs_state = agx_delete_shader_state;
|
||||
ctx->delete_tcs_state = agx_delete_shader_state;
|
||||
ctx->delete_tes_state = agx_delete_shader_state;
|
||||
ctx->set_blend_color = agx_set_blend_color;
|
||||
ctx->set_clip_state = agx_set_clip_state;
|
||||
ctx->set_constant_buffer = agx_set_constant_buffer;
|
||||
|
|
@ -4801,6 +5189,7 @@ agx_init_state_functions(struct pipe_context *ctx)
|
|||
ctx->set_sampler_views = agx_set_sampler_views;
|
||||
ctx->set_framebuffer_state = agx_set_framebuffer_state;
|
||||
ctx->set_polygon_stipple = agx_set_polygon_stipple;
|
||||
ctx->set_patch_vertices = agx_set_patch_vertices;
|
||||
ctx->set_sample_mask = agx_set_sample_mask;
|
||||
ctx->set_scissor_states = agx_set_scissor_states;
|
||||
ctx->set_stencil_ref = agx_set_stencil_ref;
|
||||
|
|
@ -4813,4 +5202,5 @@ agx_init_state_functions(struct pipe_context *ctx)
|
|||
ctx->set_global_binding = agx_set_global_binding;
|
||||
ctx->texture_barrier = agx_texture_barrier;
|
||||
ctx->get_compute_state_info = agx_get_compute_state_info;
|
||||
ctx->set_tess_state = agx_set_tess_state;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -106,6 +106,9 @@ struct PACKED agx_draw_uniforms {
|
|||
/* Address of input assembly buffer if geom/tess is used, else 0 */
|
||||
uint64_t input_assembly;
|
||||
|
||||
/* Address of tessellation param buffer if tessellation is used, else 0 */
|
||||
uint64_t tess_params;
|
||||
|
||||
/* Address of geometry param buffer if geometry shaders are used, else 0 */
|
||||
uint64_t geometry_params;
|
||||
|
||||
|
|
@ -213,6 +216,7 @@ struct agx_uncompiled_shader {
|
|||
struct agx_uncompiled_shader_info info;
|
||||
struct hash_table *variants;
|
||||
struct agx_uncompiled_shader *passthrough_progs[MESA_PRIM_COUNT][3][2];
|
||||
struct agx_uncompiled_shader *passthrough_tcs[32];
|
||||
|
||||
uint32_t xfb_strides[4];
|
||||
bool has_xfb_info;
|
||||
|
|
@ -222,6 +226,18 @@ struct agx_uncompiled_shader {
|
|||
|
||||
/* Set on VS, passed to FS for linkage */
|
||||
unsigned base_varying;
|
||||
|
||||
/* Tessellation info */
|
||||
struct {
|
||||
uint64_t per_vertex_outputs;
|
||||
uint32_t output_stride;
|
||||
enum gl_tess_spacing spacing;
|
||||
enum tess_primitive_mode primitive;
|
||||
uint8_t output_patch_size;
|
||||
uint8_t nr_patch_outputs;
|
||||
bool ccw;
|
||||
bool point_mode;
|
||||
} tess;
|
||||
};
|
||||
|
||||
enum agx_stage_dirty {
|
||||
|
|
@ -407,6 +423,18 @@ struct asahi_fs_shader_key {
|
|||
enum pipe_format rt_formats[PIPE_MAX_COLOR_BUFS];
|
||||
};
|
||||
|
||||
struct asahi_tcs_shader_key {
|
||||
/* Input assembly key. Simplified because we know we're operating on patches.
|
||||
*/
|
||||
uint8_t index_size_B;
|
||||
|
||||
/* Vertex shader key */
|
||||
struct agx_attribute attribs[AGX_MAX_VBUFS];
|
||||
|
||||
/* Tessellation control shaders must be linked with a vertex shader. */
|
||||
uint8_t input_nir_sha1[20];
|
||||
};
|
||||
|
||||
struct asahi_gs_shader_key {
|
||||
/* Input assembly key */
|
||||
struct agx_ia_key ia;
|
||||
|
|
@ -426,6 +454,7 @@ struct asahi_gs_shader_key {
|
|||
|
||||
union asahi_shader_key {
|
||||
struct asahi_vs_shader_key vs;
|
||||
struct asahi_tcs_shader_key tcs;
|
||||
struct asahi_gs_shader_key gs;
|
||||
struct asahi_fs_shader_key fs;
|
||||
};
|
||||
|
|
@ -498,7 +527,7 @@ struct asahi_blitter {
|
|||
|
||||
struct agx_context {
|
||||
struct pipe_context base;
|
||||
struct agx_compiled_shader *vs, *fs, *gs;
|
||||
struct agx_compiled_shader *vs, *fs, *gs, *tcs, *tes;
|
||||
uint32_t dirty;
|
||||
|
||||
/* Heap for dynamic memory allocation for geometry/tessellation shaders */
|
||||
|
|
@ -527,6 +556,10 @@ struct agx_context {
|
|||
struct pipe_vertex_buffer vertex_buffers[PIPE_MAX_ATTRIBS];
|
||||
uint32_t vb_mask;
|
||||
|
||||
unsigned patch_vertices;
|
||||
float default_outer_level[4];
|
||||
float default_inner_level[2];
|
||||
|
||||
struct agx_stage stage[PIPE_SHADER_TYPES];
|
||||
struct agx_attribute *attributes;
|
||||
struct agx_rasterizer *rast;
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue