From d26ae4f455a2ebfb4bf4fba62dd93a050603b8e2 Mon Sep 17 00:00:00 2001
From: Alyssa Rosenzweig
Date: Fri, 5 Jul 2024 11:51:14 -0400
Subject: [PATCH] asahi,libagx: tessellate on device

Add OpenCL kernels implementing the tessellation algorithm on device. This is
an OpenCL C port of the D3D11 reference tessellator, originally written by
Microsoft in C++. There are significant differences compared to the CPU-based
reference implementation:

* Significant simplifications and clean up. The reference code did a lot of
  things in weird ways that would be inefficient on the GPU. I did a *lot* of
  work here to get good AGX assembly generated for the tessellation
  kernels... the first attempts were quite bad! Notably, everything is
  carefully written so that all private memory access is optimized out in
  NIR; the resulting kernels do not use scratch and do not spill on G13.

* Prefix sum variants. To implement geom+tess efficiently, we need to first
  calculate the count of indices generated by the tessellator, then prefix
  sum that, then tessellate using the prefix sum results, writing into one
  large index buffer for a single indirect draw. This isn't too bad: we
  already have most of the logic, and the guts of the prefix sum kernel are
  shared with geometry shaders.

* VDM generation variant. To implement tess alone, it's fastest to generate a
  hardware Index List word for each patch, adding an appropriate 32-bit index
  bias to the dynamically allocated U16 index buffers. Then from the CPU, we
  have the illusion of a single draw that we Stream Link to (with Return).
  This requires packing hardware control words from the tessellator kernel.
  Fortunately, we have GenXML available, so we just use agx_pack like we
  would in the driver.

Along the way, we pick up indirect tess support (this follows on naturally),
which gets rid of the other bit of tessellation-related cheating. Implementing
this requires reworking our internal agx_launch data structures, but that has
the nice side effect of speeding up GS invocations too (by fixing the
workgroup size).

Don't get me wrong. tessellator.cl is the single most unhinged file of my
career, featuring GenXML-based pack macros fed by dynamic memory allocation
fed by the inscrutable tessellation algorithm. But it works *really* well.
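Concretely, the geom+tess flow is three dispatches (pseudocode: launch() and
the argument spelling are stand-ins for the real agx_launch plumbing; the
counts show a made-up three-patch example):

    launch(tessellator, LIBAGX_TESS_MODE_COUNT);       /* counts = {3, 5, 2}  */
    launch(libagx_prefix_sum_tess);                    /* counts = {3, 8, 10} */
    launch(tessellator, LIBAGX_TESS_MODE_WITH_COUNTS); /* fill index buffer   */

    /* Patch i writes its indices at element counts[i - 1] (0 for patch 0),
     * and the last element (10) is the total that sizes the single indirect
     * draw.
     */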
Signed-off-by: Alyssa Rosenzweig Part-of: --- src/asahi/lib/agx_nir_lower_gs.c | 41 + src/asahi/lib/agx_nir_lower_gs.h | 22 + src/asahi/lib/meson.build | 1 + src/asahi/lib/shaders/geometry.cl | 53 +- src/asahi/lib/shaders/geometry.h | 41 - src/asahi/lib/shaders/tessellation.cl | 114 +- src/asahi/lib/shaders/tessellator.cl | 1748 ++++++++++++++++++++++++- src/asahi/lib/shaders/tessellator.h | 124 ++ src/gallium/drivers/asahi/agx_query.c | 4 +- src/gallium/drivers/asahi/agx_state.c | 711 +++++----- src/gallium/drivers/asahi/agx_state.h | 62 +- 11 files changed, 2531 insertions(+), 390 deletions(-) create mode 100644 src/asahi/lib/shaders/tessellator.h diff --git a/src/asahi/lib/agx_nir_lower_gs.c b/src/asahi/lib/agx_nir_lower_gs.c index 567aab61da4..e287afd1b4f 100644 --- a/src/asahi/lib/agx_nir_lower_gs.c +++ b/src/asahi/lib/agx_nir_lower_gs.c @@ -1534,6 +1534,13 @@ agx_nir_prefix_sum_gs(nir_builder *b, const void *data) nir_channel(b, nir_load_workgroup_id(b), 0)); } +void +agx_nir_prefix_sum_tess(nir_builder *b, const void *data) +{ + b->shader->info.workgroup_size[0] = 1024; + libagx_prefix_sum_tess(b, nir_load_preamble(b, 1, 64, .base = 0)); +} + void agx_nir_gs_setup_indirect(nir_builder *b, const void *data) { @@ -1564,3 +1571,37 @@ agx_nir_unroll_restart(nir_builder *b, const void *data) else unreachable("invalid index size"); } + +void +agx_nir_tessellate(nir_builder *b, const void *data) +{ + const struct agx_tessellator_key *key = data; + b->shader->info.workgroup_size[0] = 64; + + nir_def *params = nir_load_preamble(b, 1, 64, .base = 0); + nir_def *patch = nir_channel(b, nir_load_global_invocation_id(b, 32), 0); + nir_def *mode = nir_imm_int(b, key->mode); + nir_def *partitioning = nir_imm_int(b, key->partitioning); + nir_def *output_prim = nir_imm_int(b, key->output_primitive); + + if (key->prim == TESS_PRIMITIVE_ISOLINES) + libagx_tess_isoline(b, params, mode, partitioning, output_prim, patch); + else if (key->prim == TESS_PRIMITIVE_TRIANGLES) + libagx_tess_tri(b, params, mode, partitioning, output_prim, patch); + else if (key->prim == TESS_PRIMITIVE_QUADS) + libagx_tess_quad(b, params, mode, partitioning, output_prim, patch); + else + unreachable("invalid tess primitive"); +} + +void +agx_nir_tess_setup_indirect(nir_builder *b, const void *data) +{ + const struct agx_tess_setup_indirect_key *key = data; + + nir_def *params = nir_load_preamble(b, 1, 64, .base = 0); + nir_def *with_counts = nir_imm_bool(b, key->with_counts); + nir_def *point_mode = nir_imm_bool(b, key->point_mode); + + libagx_tess_setup_indirect(b, params, with_counts, point_mode); +} diff --git a/src/asahi/lib/agx_nir_lower_gs.h b/src/asahi/lib/agx_nir_lower_gs.h index f73084552f0..74fca3849ed 100644 --- a/src/asahi/lib/agx_nir_lower_gs.h +++ b/src/asahi/lib/agx_nir_lower_gs.h @@ -7,6 +7,7 @@ #include #include +#include "shaders/tessellator.h" #include "nir.h" #include "shader_enums.h" @@ -39,6 +40,8 @@ bool agx_nir_lower_gs(struct nir_shader *gs, const struct nir_shader *libagx, void agx_nir_prefix_sum_gs(struct nir_builder *b, const void *data); +void agx_nir_prefix_sum_tess(struct nir_builder *b, const void *data); + struct agx_gs_setup_indirect_key { enum mesa_prim prim; }; @@ -52,6 +55,23 @@ struct agx_unroll_restart_key { void agx_nir_unroll_restart(struct nir_builder *b, const void *key); +struct agx_tessellator_key { + enum tess_primitive_mode prim : 8; + enum libagx_tess_output_primitive output_primitive : 8; + enum libagx_tess_partitioning partitioning : 8; + enum libagx_tess_mode mode : 8; +}; 
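+ +/* Every field is an 8-bit enum, so the key packs into exactly 32 bits with + * no implicit padding (asserted below), which keeps it safe to hash or + * memcmp bytewise as a shader variant key. + */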
+static_assert(sizeof(struct agx_tessellator_key) == 4, "padded"); + +struct agx_tess_setup_indirect_key { + bool point_mode; + bool with_counts; + bool padding[2]; +}; +static_assert(sizeof(struct agx_tess_setup_indirect_key) == 4, "padded"); + +void agx_nir_tessellate(struct nir_builder *b, const void *key); + bool agx_nir_lower_tcs(struct nir_shader *tcs, const struct nir_shader *libagx); bool agx_nir_lower_tes(struct nir_shader *tes, const struct nir_shader *libagx); @@ -59,3 +79,5 @@ bool agx_nir_lower_tes(struct nir_shader *tes, const struct nir_shader *libagx); uint64_t agx_tcs_per_vertex_outputs(const struct nir_shader *nir); unsigned agx_tcs_output_stride(const struct nir_shader *nir); + +void agx_nir_tess_setup_indirect(struct nir_builder *b, const void *data); diff --git a/src/asahi/lib/meson.build b/src/asahi/lib/meson.build index 0d78932adc4..ced44d9f679 100644 --- a/src/asahi/lib/meson.build +++ b/src/asahi/lib/meson.build @@ -42,6 +42,7 @@ libagx_shader_files = files( 'shaders/query.h', 'shaders/tessellation.cl', 'shaders/tessellator.cl', + 'shaders/tessellator.h', 'shaders/texture.cl', 'shaders/helper.cl', ) diff --git a/src/asahi/lib/shaders/geometry.cl b/src/asahi/lib/shaders/geometry.cl index ec4e0f970e0..88468ef9cb2 100644 --- a/src/asahi/lib/shaders/geometry.cl +++ b/src/asahi/lib/shaders/geometry.cl @@ -4,6 +4,7 @@ * SPDX-License-Identifier: MIT */ +#include "shaders/tessellator.h" #include "geometry.h" /* Compatible with util/u_math.h */ @@ -282,6 +283,17 @@ first_true_thread_in_workgroup(bool cond, local uint *scratch) return (first_group * 32) + off; } +/* + * Allocate memory from the heap (thread-safe). Returns the offset into the + * heap. The allocation will be word-aligned. + */ +static inline uint +libagx_atomic_alloc(global struct agx_geometry_state *heap, uint size_B) +{ + return atomic_fetch_add((volatile atomic_uint *)(&heap->heap_bottom), + align(size_B, 8)); +} + /* * When unrolling the index buffer for a draw, we translate the old indirect * draws to new indirect draws. This routine allocates the new index buffer and @@ -300,12 +312,11 @@ setup_unroll_for_draw(global struct agx_restart_unroll_params *p, uint max_verts = max_prims * mesa_vertices_per_prim(mode); uint alloc_size = max_verts * index_size_B; - /* Allocate memory from the heap for the unrolled index buffer. Use an atomic - * since multiple threads may be running to handle multidraw in parallel. + /* Allocate unrolled index buffer. Atomic since multiple threads may be + * running to handle multidraw in parallel. */ global struct agx_geometry_state *heap = p->heap; - uint old_heap_bottom_B = atomic_fetch_add( - (volatile atomic_uint *)(&heap->heap_bottom), align(alloc_size, 4)); + uint old_heap_bottom_B = libagx_atomic_alloc(p->heap, alloc_size); /* Regardless of the input stride, we use tightly packed output draws */ global uint *out = &p->out_draws[5 * draw]; @@ -612,6 +623,40 @@ libagx_prefix_sum(global uint *buffer, uint len, uint words, uint word) } } +kernel void +libagx_prefix_sum_tess(global struct libagx_tess_args *p) +{ + libagx_prefix_sum(p->counts, p->nr_patches, 1 /* words */, 0 /* word */); + + /* After prefix summing, we know the total # of indices, so allocate the + * index buffer now. Elect a thread for the allocation. 
+ */ + barrier(CLK_LOCAL_MEM_FENCE); + if (get_local_id(0) != 0) + return; + + /* The last element of an inclusive prefix sum is the total sum */ + uint total = p->counts[p->nr_patches - 1]; + + /* Allocate 4-byte indices */ + uint32_t elsize_B = sizeof(uint32_t); + uint32_t size_B = total * elsize_B; + uint alloc_B = p->heap->heap_bottom; + p->heap->heap_bottom += size_B; + p->heap->heap_bottom = align(p->heap->heap_bottom, 8); + + p->index_buffer = (global uint32_t *)(((uintptr_t)p->heap->heap) + alloc_B); + + /* ...and now we can generate the API indexed draw */ + global uint32_t *desc = p->out_draws; + + desc[0] = total; /* count */ + desc[1] = 1; /* instance_count */ + desc[2] = alloc_B / elsize_B; /* start */ + desc[3] = 0; /* index_bias */ + desc[4] = 0; /* start_instance */ +} + uintptr_t libagx_vertex_output_address(uintptr_t buffer, uint64_t mask, uint vtx, gl_varying_slot location) diff --git a/src/asahi/lib/shaders/geometry.h b/src/asahi/lib/shaders/geometry.h index 745d17d6b26..b4a2b44f7d4 100644 --- a/src/asahi/lib/shaders/geometry.h +++ b/src/asahi/lib/shaders/geometry.h @@ -203,47 +203,6 @@ struct agx_geometry_params { } PACKED; AGX_STATIC_ASSERT(sizeof(struct agx_geometry_params) == 78 * 4); -struct agx_tess_params { - /* Persistent (cross-draw) geometry state */ - GLOBAL(struct agx_geometry_state) state; - - /* Patch coordinate offsets in patch_coord_buffer, indexed by patch ID. */ - GLOBAL(uint) patch_coord_offs; - - /* Patch coordinate buffer, indexed as: - * - * patch_coord_offs[patch_ID] + vertex_in_patch - * - * Currently float2s, but we might be able to compact later? - */ - GLOBAL(float2) patch_coord_buffer; - - /* Tessellation control shader output buffer, indexed by patch ID. */ - GLOBAL(uchar) tcs_buffer; - - /* Bitfield of TCS per-vertex outputs */ - uint64_t tcs_per_vertex_outputs; - - /* Default tess levels used in OpenGL when there is no TCS in the pipeline. - * Unused in Vulkan and OpenGL ES. 
- */ - float tess_level_outer_default[4]; - float tess_level_inner_default[4]; - - /* Number of vertices in the input patch */ - uint input_patch_size; - - /* Number of vertices in the TCS output patch */ - uint output_patch_size; - - /* Number of patch constants written by TCS */ - uint tcs_patch_constants; - - /* Number of input patches per instance of the VS/TCS */ - uint patches_per_instance; -} PACKED; -AGX_STATIC_ASSERT(sizeof(struct agx_tess_params) == 22 * 4); - /* TCS shared memory layout: * * vec4 vs_outputs[VERTICES_IN_INPUT_PATCH][TOTAL_VERTEX_OUTPUTS]; diff --git a/src/asahi/lib/shaders/tessellation.cl b/src/asahi/lib/shaders/tessellation.cl index c4d549b9ff7..5dfea8345a1 100644 --- a/src/asahi/lib/shaders/tessellation.cl +++ b/src/asahi/lib/shaders/tessellation.cl @@ -4,15 +4,17 @@ */ #include "geometry.h" +#include "tessellator.h" +#include uint -libagx_tcs_patch_vertices_in(constant struct agx_tess_params *p) +libagx_tcs_patch_vertices_in(constant struct libagx_tess_args *p) { return p->input_patch_size; } uint -libagx_tes_patch_vertices_in(constant struct agx_tess_params *p) +libagx_tes_patch_vertices_in(constant struct libagx_tess_args *p) { return p->output_patch_size; } @@ -25,7 +27,7 @@ libagx_tcs_in_offset(uint vtx, gl_varying_slot location, } uintptr_t -libagx_tcs_out_address(constant struct agx_tess_params *p, uint patch_id, +libagx_tcs_out_address(constant struct libagx_tess_args *p, uint patch_id, uint vtx_id, gl_varying_slot location, uint nr_patch_out, uint out_patch_size, uint64_t vtx_out_mask) { @@ -45,7 +47,7 @@ libagx_tes_unrolled_patch_id(uint raw_id) } uint -libagx_tes_patch_id(constant struct agx_tess_params *p, uint raw_id) +libagx_tes_patch_id(constant struct libagx_tess_args *p, uint raw_id) { return libagx_tes_unrolled_patch_id(raw_id) % p->patches_per_instance; } @@ -57,16 +59,20 @@ tes_vertex_id_in_patch(uint raw_id) } float2 -libagx_load_tess_coord(constant struct agx_tess_params *p, uint raw_id) +libagx_load_tess_coord(constant struct libagx_tess_args *p, uint raw_id) { uint patch = libagx_tes_unrolled_patch_id(raw_id); uint vtx = tes_vertex_id_in_patch(raw_id); - return p->patch_coord_buffer[p->patch_coord_offs[patch] + vtx]; + global struct libagx_tess_point *t = + &p->patch_coord_buffer[p->coord_allocs[patch] + vtx]; + + /* Written weirdly because NIR struggles with loads of structs */ + return *((global float2 *)t); } uintptr_t -libagx_tes_in_address(constant struct agx_tess_params *p, uint raw_id, +libagx_tes_in_address(constant struct libagx_tess_args *p, uint raw_id, uint vtx_id, gl_varying_slot location) { uint patch = libagx_tes_unrolled_patch_id(raw_id); @@ -77,7 +83,7 @@ libagx_tes_in_address(constant struct agx_tess_params *p, uint raw_id, } float4 -libagx_tess_level_outer_default(constant struct agx_tess_params *p) +libagx_tess_level_outer_default(constant struct libagx_tess_args *p) { return ( float4)(p->tess_level_outer_default[0], p->tess_level_outer_default[1], @@ -85,8 +91,98 @@ libagx_tess_level_outer_default(constant struct agx_tess_params *p) } float2 -libagx_tess_level_inner_default(constant struct agx_tess_params *p) +libagx_tess_level_inner_default(constant struct libagx_tess_args *p) { return (float2)(p->tess_level_inner_default[0], p->tess_level_inner_default[1]); } + +void +libagx_tess_setup_indirect(global struct libagx_tess_args *p, bool with_counts, + bool point_mode) +{ + uint count = p->indirect[0], instance_count = p->indirect[1]; + unsigned in_patches = count / p->input_patch_size; + + /* TCS invocation counter 
increments once per-patch */ + if (p->tcs_statistic) { + *(p->tcs_statistic) += in_patches; + } + + size_t draw_stride = + ((!with_counts && point_mode) ? 4 : 6) * sizeof(uint32_t); + + unsigned unrolled_patches = in_patches * instance_count; + + uint32_t alloc = 0; + uint32_t tcs_out_offs = alloc; + alloc += unrolled_patches * p->tcs_stride_el * 4; + + uint32_t patch_coord_offs = alloc; + alloc += unrolled_patches * 4; + + uint32_t count_offs = alloc; + if (with_counts) + alloc += unrolled_patches * sizeof(uint32_t); + + uint vb_offs = alloc; + uint vb_size = libagx_tcs_in_size(count * instance_count, p->vertex_outputs); + alloc += vb_size; + + /* Allocate all patch calculations in one go */ + global uchar *blob = p->heap->heap + p->heap->heap_bottom; + p->heap->heap_bottom += alloc; + + p->tcs_buffer = (global float *)(blob + tcs_out_offs); + p->patches_per_instance = in_patches; + p->coord_allocs = (global uint *)(blob + patch_coord_offs); + p->nr_patches = unrolled_patches; + + *(p->vertex_output_buffer_ptr) = (uintptr_t)(blob + vb_offs); + + if (with_counts) { + p->counts = (global uint32_t *)(blob + count_offs); + } else { +#if 0 + /* Arrange so we return after all generated draws. agx_pack would be nicer + * here but designated initializers lead to scratch access... + */ + global uint32_t *ret = + (global uint32_t *)(blob + draw_offs + + (draw_stride * unrolled_patches)); + + *ret = (AGX_VDM_BLOCK_TYPE_BARRIER << 29) | /* with return */ (1u << 27); +#endif + /* TODO */ + } + + /* VS grid size */ + p->grids[0] = count; + p->grids[1] = instance_count; + p->grids[2] = 1; + + /* VS workgroup size */ + p->grids[3] = 64; + p->grids[4] = 1; + p->grids[5] = 1; + + /* TCS grid size */ + p->grids[6] = in_patches * p->output_patch_size; + p->grids[7] = instance_count; + p->grids[8] = 1; + + /* TCS workgroup size */ + p->grids[9] = p->output_patch_size; + p->grids[10] = 1; + p->grids[11] = 1; + + /* Tess grid size */ + p->grids[12] = unrolled_patches; + p->grids[13] = 1; + p->grids[14] = 1; + + /* Tess workgroup size */ + p->grids[15] = 64; + p->grids[16] = 1; + p->grids[17] = 1; +} diff --git a/src/asahi/lib/shaders/tessellator.cl b/src/asahi/lib/shaders/tessellator.cl index 0a1fe63e66a..82079242f47 100644 --- a/src/asahi/lib/shaders/tessellator.cl +++ b/src/asahi/lib/shaders/tessellator.cl @@ -1,8 +1,1748 @@ /* - * Copyright 2023 Alyssa Rosenzweig - * Copyright (c) Microsoft Corporation - * SPDX-License-Identifier: MIT - */ + Copyright (c) Microsoft Corporation + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. +*/ #include "geometry.h" +#include "tessellator.h" +#include +#if 0 +#include +#include +#include +#include +#include "util/macros.h" +#define min(x, y) (x < y ? x : y) +#define max(x, y) (x > y ? x : y) +#define clz(x) (x ? __builtin_clz(x) : (8 * sizeof(x))) +#define clamp(x, y, z) (x < y ? y : x > z ? z : x) +#define align(x, y) ALIGN_POT(x, y) +#else +#define assert(x) +#endif + +#define LIBAGX_TESS_MIN_ISOLINE_DENSITY_TESSELLATION_FACTOR 1.0f +#define LIBAGX_TESS_MAX_ISOLINE_DENSITY_TESSELLATION_FACTOR 64.0f + +typedef unsigned int FXP; // fixed point number + +enum { + U = 0, // points on a tri patch + V = 1, +}; + +enum { + Ueq0 = 0, // edges on a tri patch + Veq0 = 1, + Weq0 = 2, +}; + +enum { + Ueq1 = 2, // edges on a quad patch: Ueq0, Veq0, Ueq1, Veq1 + Veq1 = 3, +}; + +#define QUAD_AXES 2 +#define QUAD_EDGES 4 +#define TRI_EDGES 3 + +// The interior can just use a simpler stitch. +typedef enum DIAGONALS { + DIAGONALS_INSIDE_TO_OUTSIDE, + DIAGONALS_INSIDE_TO_OUTSIDE_EXCEPT_MIDDLE, + DIAGONALS_MIRRORED +} DIAGONALS; + +typedef struct TESS_FACTOR_CONTEXT { + FXP fxpInvNumSegmentsOnFloorTessFactor; + FXP fxpInvNumSegmentsOnCeilTessFactor; + FXP fxpHalfTessFactorFraction; + int numHalfTessFactorPoints; + int splitPointOnFloorHalfTessFactor; +} TESS_FACTOR_CONTEXT; + +struct INDEX_PATCH_CONTEXT { + int insidePointIndexDeltaToRealValue; + int insidePointIndexBadValue; + int insidePointIndexReplacementValue; + int outsidePointIndexPatchBase; + int outsidePointIndexDeltaToRealValue; + int outsidePointIndexBadValue; + int outsidePointIndexReplacementValue; +}; + +struct INDEX_PATCH_CONTEXT2 { + int baseIndexToInvert; + int indexInversionEndPoint; + int cornerCaseBadValue; + int cornerCaseReplacementValue; +}; + +struct CHWTessellator { + enum libagx_tess_output_primitive outputPrimitive; + enum libagx_tess_mode mode; + uint index_bias; + + // array where we will store u/v's for the points we generate + global struct libagx_tess_point *Point; + + // array where we will store index topology + global void *Index; + + // A second index patch we have to do handles the leftover strip of quads in + // the middle of an odd quad patch after finishing all the concentric rings. 
+ // This also handles the leftover strip of points in the middle of an even + // quad patch, when stitching the row of triangles up the left side (V major + // quad) or bottom (U major quad) of the inner ring + bool bUsingPatchedIndices; + bool bUsingPatchedIndices2; + struct INDEX_PATCH_CONTEXT IndexPatchCtx; + struct INDEX_PATCH_CONTEXT2 IndexPatchCtx2; +}; + +#define FXP_INTEGER_BITS 15 +#define FXP_FRACTION_BITS 16 +#define FXP_FRACTION_MASK 0x0000ffff +#define FXP_INTEGER_MASK 0x7fff0000 +#define FXP_ONE (1 << FXP_FRACTION_BITS) +#define FXP_ONE_THIRD 0x00005555 +#define FXP_TWO_THIRDS 0x0000aaaa +#define FXP_ONE_HALF 0x00008000 + +static global float * +tess_factors(constant struct libagx_tess_args *p, uint patch) +{ + return p->tcs_buffer + (patch * p->tcs_stride_el); +} + +static inline uint +libagx_heap_alloc(global struct agx_geometry_state *heap, uint size_B) +{ + // TODO: drop align to 4 I think + return atomic_fetch_add((volatile atomic_uint *)(&heap->heap_bottom), + align(size_B, 8)); +} + +/* + * Generate an indexed draw for a patch with the computed number of indices. + * This allocates heap memory for the index buffer, returning the allocated + * memory. + */ +static global void * +libagx_draw(constant struct libagx_tess_args *p, enum libagx_tess_mode mode, + bool lines, uint patch, uint count) +{ + if (mode == LIBAGX_TESS_MODE_COUNT) { + p->counts[patch] = count; + } + + if (mode == LIBAGX_TESS_MODE_VDM) { + uint32_t elsize_B = sizeof(uint16_t); + uint32_t alloc_B = libagx_heap_alloc(p->heap, elsize_B * count); + uint64_t ib = ((uintptr_t)p->heap->heap) + alloc_B; + + global uint32_t *desc = p->out_draws + (patch * 6); + agx_pack(&desc[0], INDEX_LIST, cfg) { + cfg.index_buffer_hi = (ib >> 32); + cfg.primitive = lines ? AGX_PRIMITIVE_LINES : AGX_PRIMITIVE_TRIANGLES; + cfg.restart_enable = false; + cfg.index_size = AGX_INDEX_SIZE_U16; + cfg.index_buffer_size_present = true; + cfg.index_buffer_present = true; + cfg.index_count_present = true; + cfg.instance_count_present = true; + cfg.start_present = true; + cfg.unk_1_present = false; + cfg.indirect_buffer_present = false; + cfg.unk_2_present = false; + cfg.block_type = AGX_VDM_BLOCK_TYPE_INDEX_LIST; + } + + agx_pack(&desc[1], INDEX_LIST_BUFFER_LO, cfg) { + cfg.buffer_lo = ib & 0xffffffff; + } + + agx_pack(&desc[2], INDEX_LIST_COUNT, cfg) { + cfg.count = count; + } + + agx_pack(&desc[3], INDEX_LIST_INSTANCES, cfg) { + cfg.count = 1; + } + + agx_pack(&desc[4], INDEX_LIST_START, cfg) { + cfg.start = patch * LIBAGX_TES_PATCH_ID_STRIDE; + } + + agx_pack(&desc[5], INDEX_LIST_BUFFER_SIZE, cfg) { + cfg.size = align(count * 2, 4); + } + + return (global void *)ib; + } + + if (mode == LIBAGX_TESS_MODE_WITH_COUNTS) { + /* The index buffer is already allocated, get a pointer inside it. + * p->counts has had an inclusive prefix sum hence the subtraction. + */ + uint offset_el = p->counts[sub_sat(patch, 1u)]; + if (patch == 0) + offset_el = 0; + + return &p->index_buffer[offset_el]; + } + + return NULL; +} + +static void +libagx_draw_points(private struct CHWTessellator *ctx, + constant struct libagx_tess_args *p, uint patch, uint count) +{ + if (ctx->mode == LIBAGX_TESS_MODE_VDM) { + /* Generate a non-indexed draw for points mode tessellation. 
*/ + global uint32_t *desc = p->out_draws + (patch * 4); + agx_pack(&desc[0], INDEX_LIST, cfg) { + cfg.index_buffer_hi = 0; + cfg.primitive = AGX_PRIMITIVE_POINTS; + cfg.restart_enable = false; + cfg.index_size = 0; + cfg.index_buffer_size_present = false; + cfg.index_buffer_present = false; + cfg.index_count_present = true; + cfg.instance_count_present = true; + cfg.start_present = true; + cfg.unk_1_present = false; + cfg.indirect_buffer_present = false; + cfg.unk_2_present = false; + cfg.block_type = AGX_VDM_BLOCK_TYPE_INDEX_LIST; + } + + agx_pack(&desc[1], INDEX_LIST_COUNT, cfg) { + cfg.count = count; + } + + agx_pack(&desc[2], INDEX_LIST_INSTANCES, cfg) { + cfg.count = 1; + } + + agx_pack(&desc[3], INDEX_LIST_START, cfg) { + cfg.start = patch * LIBAGX_TES_PATCH_ID_STRIDE; + } + } else { + /* For points mode with a single draw, we need to generate a trivial index + * buffer to stuff in the patch ID in the right place. + */ + global uint32_t *indices = libagx_draw(p, ctx->mode, false, patch, count); + + if (ctx->mode == LIBAGX_TESS_MODE_COUNT) + return; + + for (int i = 0; i < count; ++i) { + indices[i] = ctx->index_bias + i; + } + } +} + +static void +libagx_draw_empty(constant struct libagx_tess_args *p, + enum libagx_tess_mode mode, + enum libagx_tess_output_primitive output_primitive, + uint patch) +{ + if (mode == LIBAGX_TESS_MODE_COUNT) { + p->counts[patch] = 0; + } else if (mode == LIBAGX_TESS_MODE_VDM) { + uint32_t words = (output_primitive == LIBAGX_TESS_OUTPUT_POINT) ? 4 : 6; + global uint32_t *desc = p->out_draws + (patch * words); + uint32_t nop_token = AGX_VDM_BLOCK_TYPE_BARRIER << 29; + + for (uint32_t i = 0; i < words; ++i) { + desc[i] = nop_token; + } + } +} + +/* + * Allocate heap memory for domain points for a patch. The allocation + * is recorded in the coord_allocs[] array, which is in elements. + */ +static global struct libagx_tess_point * +libagx_heap_alloc_points(constant struct libagx_tess_args *p, uint patch, + uint count) +{ + /* If we're recording statistics, increment now. The statistic is for + * tessellation evaluation shader invocations, which is equal to the number + * of domain points generated. + */ + if (p->statistic) { + atomic_fetch_add((volatile atomic_uint *)(p->statistic), count); + } + + uint32_t elsize_B = sizeof(struct libagx_tess_point); + uint32_t alloc_B = libagx_heap_alloc(p->heap, elsize_B * count); + uint32_t alloc_el = alloc_B / elsize_B; + + p->coord_allocs[patch] = alloc_el; + return (global struct libagx_tess_point *)(((uintptr_t)p->heap->heap) + + alloc_B); +} + +// Microsoft D3D11 Fixed Function Tessellator Reference - May 7, 2012 +// amar.patel@microsoft.com + +#define LIBAGX_TESS_MIN_ODD_TESSELLATION_FACTOR 1 +#define LIBAGX_TESS_MAX_ODD_TESSELLATION_FACTOR 63 +#define LIBAGX_TESS_MIN_EVEN_TESSELLATION_FACTOR 2 +#define LIBAGX_TESS_MAX_EVEN_TESSELLATION_FACTOR 64 + +// 2^(-16), min positive fixed point fraction +#define EPSILON 0.0000152587890625f +#define MIN_ODD_TESSFACTOR_PLUS_HALF_EPSILON \ + (LIBAGX_TESS_MIN_ODD_TESSELLATION_FACTOR + EPSILON / 2) + +static float clamp_factor(float factor, + enum libagx_tess_partitioning partitioning, + float maxf) +{ + float lower = (partitioning == LIBAGX_TESS_PARTITIONING_FRACTIONAL_EVEN) + ? LIBAGX_TESS_MIN_EVEN_TESSELLATION_FACTOR + : LIBAGX_TESS_MIN_ODD_TESSELLATION_FACTOR; + + float upper = (partitioning == LIBAGX_TESS_PARTITIONING_FRACTIONAL_ODD) + ? 
LIBAGX_TESS_MAX_ODD_TESSELLATION_FACTOR + : LIBAGX_TESS_MAX_EVEN_TESSELLATION_FACTOR; + + // If any TessFactor will end up > 1 after floatToFixed conversion later, + // then force the inside TessFactors to be > 1 so there is a picture frame. + if (partitioning == LIBAGX_TESS_PARTITIONING_FRACTIONAL_ODD && + maxf > MIN_ODD_TESSFACTOR_PLUS_HALF_EPSILON) { + + lower = LIBAGX_TESS_MIN_ODD_TESSELLATION_FACTOR + EPSILON; + } + + factor = clamp(factor, lower, upper); + + if (partitioning == LIBAGX_TESS_PARTITIONING_INTEGER) { + factor = ceil(factor); + } + + return factor; +} + + +static FXP +floatToFixed(const float input) +{ + return mad(input, FXP_ONE, 0.5f); +} + +static float +fixedToFloat(const FXP input) +{ + // Don't need to worry about special cases because the bounds are reasonable. + return ((float)input) / FXP_ONE; +} + +static bool +isOdd(const float input) +{ + return ((int)input) & 1; +} + +static FXP +fxpCeil(const FXP input) +{ + if (input & FXP_FRACTION_MASK) { + return (input & FXP_INTEGER_MASK) + FXP_ONE; + } + return input; +} + +static FXP +fxpFloor(const FXP input) +{ + return (input & FXP_INTEGER_MASK); +} + +static int +PatchIndexValue(private struct CHWTessellator *ctx, int index) +{ + if (ctx->bUsingPatchedIndices) { + // assumed remapped outside indices are > remapped inside vertices + if (index >= ctx->IndexPatchCtx.outsidePointIndexPatchBase) { + if (index == ctx->IndexPatchCtx.outsidePointIndexBadValue) + return ctx->IndexPatchCtx.outsidePointIndexReplacementValue; + else + return index + ctx->IndexPatchCtx.outsidePointIndexDeltaToRealValue; + } else { + if (index == ctx->IndexPatchCtx.insidePointIndexBadValue) + return ctx->IndexPatchCtx.insidePointIndexReplacementValue; + else + return index + ctx->IndexPatchCtx.insidePointIndexDeltaToRealValue; + } + } else if (ctx->bUsingPatchedIndices2) { + if (index >= ctx->IndexPatchCtx2.baseIndexToInvert) { + if (index == ctx->IndexPatchCtx2.cornerCaseBadValue) + return ctx->IndexPatchCtx2.cornerCaseReplacementValue; + else + return ctx->IndexPatchCtx2.indexInversionEndPoint - index; + } else if (index == ctx->IndexPatchCtx2.cornerCaseBadValue) { + return ctx->IndexPatchCtx2.cornerCaseReplacementValue; + } + } + + return index; +} + +static void +DefinePoint(global struct libagx_tess_point *out, FXP fxpU, FXP fxpV) +{ + out->u = fixedToFloat(fxpU); + out->v = fixedToFloat(fxpV); +} + +static void +DefineIndex(private struct CHWTessellator *ctx, int index, + int indexStorageOffset) +{ + int patched = PatchIndexValue(ctx, index); + + if (ctx->mode == LIBAGX_TESS_MODE_WITH_COUNTS) { + global uint32_t *indices = (global uint32_t *)ctx->Index; + indices[indexStorageOffset] = ctx->index_bias + patched; + } else { + global uint16_t *indices = (global uint16_t *)ctx->Index; + indices[indexStorageOffset] = patched; + } +} + +static void +DefineClockwiseTriangle(private struct CHWTessellator *ctx, int index0, + int index1, int index2, int indexStorageBaseOffset) +{ + // inputs a clockwise triangle, stores a CW or CCW triangle per state + bool cw = ctx->outputPrimitive == LIBAGX_TESS_OUTPUT_TRIANGLE_CW; + + DefineIndex(ctx, index0, indexStorageBaseOffset); + DefineIndex(ctx, cw ? index1 : index2, indexStorageBaseOffset + 1); + DefineIndex(ctx, cw ? index2 : index1, indexStorageBaseOffset + 2); +} + +static uint32_t +RemoveMSB(uint32_t val) +{ + uint32_t bit = val ?
(1 << (31 - clz(val))) : 0; + return val & ~bit; +} + +static int +NumPointsForTessFactor(bool odd, FXP fxpTessFactor) +{ + // Add epsilon for rounding and add 1 for odd + FXP f = fxpTessFactor + (odd ? (FXP_ONE + 1) : 1); + int r = fxpCeil(f / 2) >> (FXP_FRACTION_BITS - 1); + return odd ? r : r + 1; +} + +static void +ComputeTessFactorCtx(bool odd, FXP fxpTessFactor, + private TESS_FACTOR_CONTEXT *TessFactorCtx) +{ + // fxpHalfTessFactor == 1/2 if TessFactor is 1, + // but we're pretending we are even. + FXP fxpHalfTessFactor = (fxpTessFactor + 1 /*round*/) / 2; + if (odd || (fxpHalfTessFactor == FXP_ONE_HALF)) { + fxpHalfTessFactor += FXP_ONE_HALF; + } + FXP fxpFloorHalfTessFactor = fxpFloor(fxpHalfTessFactor); + FXP fxpCeilHalfTessFactor = fxpCeil(fxpHalfTessFactor); + TessFactorCtx->fxpHalfTessFactorFraction = fxpHalfTessFactor - fxpFloorHalfTessFactor; + TessFactorCtx->numHalfTessFactorPoints = + (fxpCeilHalfTessFactor >> FXP_FRACTION_BITS); // for EVEN, we don't include the point always + // fixed at the midpoint of the TessFactor + if (fxpCeilHalfTessFactor == fxpFloorHalfTessFactor) { + TessFactorCtx->splitPointOnFloorHalfTessFactor = + /*pick value to cause this to be ignored*/ TessFactorCtx->numHalfTessFactorPoints + 1; + } else if (odd) { + if (fxpFloorHalfTessFactor == FXP_ONE) { + TessFactorCtx->splitPointOnFloorHalfTessFactor = 0; + } else { + TessFactorCtx->splitPointOnFloorHalfTessFactor = + (RemoveMSB((fxpFloorHalfTessFactor >> FXP_FRACTION_BITS) - 1) << 1) + 1; + } + } else { + TessFactorCtx->splitPointOnFloorHalfTessFactor = + (RemoveMSB(fxpFloorHalfTessFactor >> FXP_FRACTION_BITS) << 1) + 1; + } + int numFloorSegments = (fxpFloorHalfTessFactor * 2) >> FXP_FRACTION_BITS; + int numCeilSegments = (fxpCeilHalfTessFactor * 2) >> FXP_FRACTION_BITS; + if (odd) { + numFloorSegments -= 1; + numCeilSegments -= 1; + } + TessFactorCtx->fxpInvNumSegmentsOnFloorTessFactor = + floatToFixed(1.0f / (float)numFloorSegments); + TessFactorCtx->fxpInvNumSegmentsOnCeilTessFactor = + floatToFixed(1.0f / (float)numCeilSegments); +} + +static FXP +PlacePointIn1D(private const TESS_FACTOR_CONTEXT *TessFactorCtx, bool odd, + int point) +{ + bool bFlip = point >= TessFactorCtx->numHalfTessFactorPoints; + + if (bFlip) { + point = (TessFactorCtx->numHalfTessFactorPoints << 1) - point - odd; + } + + // special casing middle since 16 bit fixed math below can't reproduce 0.5 exactly + if (point == TessFactorCtx->numHalfTessFactorPoints) + return FXP_ONE_HALF; + + unsigned int indexOnCeilHalfTessFactor = point; + unsigned int indexOnFloorHalfTessFactor = indexOnCeilHalfTessFactor; + if (point > TessFactorCtx->splitPointOnFloorHalfTessFactor) { + indexOnFloorHalfTessFactor -= 1; + } + // For the fixed point multiplies below, we know the results are <= 16 bits + // because the locations on the halfTessFactor are <= half the number of + // segments for the total TessFactor. 
So a number divided by a number that + // is at least twice as big will give a result no bigger than 0.5 (which in + // fixed point is 16 bits in our case) + FXP fxpLocationOnFloorHalfTessFactor = + indexOnFloorHalfTessFactor * TessFactorCtx->fxpInvNumSegmentsOnFloorTessFactor; + FXP fxpLocationOnCeilHalfTessFactor = + indexOnCeilHalfTessFactor * TessFactorCtx->fxpInvNumSegmentsOnCeilTessFactor; + + // Since we know the numbers calculated above are <= fixed point 0.5, and the + // equation below is just lerping between two values <= fixed point 0.5 + // (0x00008000), then we know that the final result before shifting by 16 bits + // is no larger than 0x80000000. Once we shift that down by 16, we get the + // result of lerping 2 numbers <= 0.5, which is obviously at most 0.5 + // (0x00008000) + FXP fxpLocation = + fxpLocationOnFloorHalfTessFactor * (FXP_ONE - TessFactorCtx->fxpHalfTessFactorFraction) + + fxpLocationOnCeilHalfTessFactor * (TessFactorCtx->fxpHalfTessFactorFraction); + fxpLocation = (fxpLocation + FXP_ONE_HALF /*round*/) >> FXP_FRACTION_BITS; // get back to n.16 + if (bFlip) { + fxpLocation = FXP_ONE - fxpLocation; + } + return fxpLocation; +} + +static void +StitchRegular(private struct CHWTessellator *ctx, bool bTrapezoid, + DIAGONALS diagonals, int baseIndexOffset, int numInsideEdgePoints, + int insideEdgePointBaseOffset, int outsideEdgePointBaseOffset) +{ + int insidePoint = insideEdgePointBaseOffset; + int outsidePoint = outsideEdgePointBaseOffset; + if (bTrapezoid) { + DefineClockwiseTriangle(ctx, outsidePoint, outsidePoint + 1, insidePoint, baseIndexOffset); + baseIndexOffset += 3; + outsidePoint++; + } + int p; + switch (diagonals) { + case DIAGONALS_INSIDE_TO_OUTSIDE: + // Diagonals pointing from inside edge forward towards outside edge + for (p = 0; p < numInsideEdgePoints - 1; p++) { + DefineClockwiseTriangle(ctx, insidePoint, outsidePoint, outsidePoint + 1, baseIndexOffset); + baseIndexOffset += 3; + + DefineClockwiseTriangle(ctx, insidePoint, outsidePoint + 1, insidePoint + 1, + baseIndexOffset); + baseIndexOffset += 3; + insidePoint++; + outsidePoint++; + } + break; + case DIAGONALS_INSIDE_TO_OUTSIDE_EXCEPT_MIDDLE: // Assumes ODD tessellation + // Diagonals pointing from outside edge forward towards inside edge + + // First half + for (p = 0; p < numInsideEdgePoints / 2 - 1; p++) { + DefineClockwiseTriangle(ctx, outsidePoint, outsidePoint + 1, insidePoint, baseIndexOffset); + baseIndexOffset += 3; + DefineClockwiseTriangle(ctx, insidePoint, outsidePoint + 1, insidePoint + 1, + baseIndexOffset); + baseIndexOffset += 3; + insidePoint++; + outsidePoint++; + } + + // Middle + DefineClockwiseTriangle(ctx, outsidePoint, insidePoint + 1, insidePoint, baseIndexOffset); + baseIndexOffset += 3; + DefineClockwiseTriangle(ctx, outsidePoint, outsidePoint + 1, insidePoint + 1, + baseIndexOffset); + baseIndexOffset += 3; + insidePoint++; + outsidePoint++; + p += 2; + + // Second half + for (; p < numInsideEdgePoints; p++) { + DefineClockwiseTriangle(ctx, outsidePoint, outsidePoint + 1, insidePoint, baseIndexOffset); + baseIndexOffset += 3; + DefineClockwiseTriangle(ctx, insidePoint, outsidePoint + 1, insidePoint + 1, + baseIndexOffset); + baseIndexOffset += 3; + insidePoint++; + outsidePoint++; + } + break; + case DIAGONALS_MIRRORED: + // First half, diagonals pointing from outside of outside edge to inside of + // inside edge + for (p = 0; p < numInsideEdgePoints / 2; p++) { + DefineClockwiseTriangle(ctx, outsidePoint, insidePoint + 1, insidePoint, baseIndexOffset); + 
baseIndexOffset += 3; + DefineClockwiseTriangle(ctx, outsidePoint, outsidePoint + 1, insidePoint + 1, + baseIndexOffset); + baseIndexOffset += 3; + insidePoint++; + outsidePoint++; + } + // Second half, diagonals pointing from inside of inside edge to outside of + // outside edge + for (; p < numInsideEdgePoints - 1; p++) { + DefineClockwiseTriangle(ctx, insidePoint, outsidePoint, outsidePoint + 1, baseIndexOffset); + baseIndexOffset += 3; + DefineClockwiseTriangle(ctx, insidePoint, outsidePoint + 1, insidePoint + 1, + baseIndexOffset); + baseIndexOffset += 3; + insidePoint++; + outsidePoint++; + } + break; + } + if (bTrapezoid) { + DefineClockwiseTriangle(ctx, outsidePoint, outsidePoint + 1, insidePoint, baseIndexOffset); + baseIndexOffset += 3; + } +} + +// loop_start and loop_end give optimal loop bounds for +// the stitching algorithm further below, for any given halfTessFactor. There +// is probably a better way to encode this... +// +// Return the FIRST entry in finalPointPositionTable which is less than +// halfTessFactor, except entry 0 and 1 which are set up to skip the loop. +static int +loop_start(int N) +{ + if (N < 2) + return 1; + else if (N == 2) + return 17; + else if (N < 5) + return 9; + else if (N < 9) + return 5; + else if (N < 17) + return 3; + else + return 2; +} + +// Return the LAST entry in finalPointPositionTable[] which is less than +// halfTessFactor, except entry 0 and 1 which are set up to skip the loop. +static int +loop_end(int N) +{ + if (N < 2) + return 0; + else if (N < 4) + return 17; + else if (N < 8) + return 25; + else if (N < 16) + return 29; + else if (N < 32) + return 31; + else + return 32; +} + +// Tables to assist in the stitching of 2 rows of points having arbitrary +// TessFactors. The stitching order is governed by Ruler Function vertex +// split ordering (see external documentation). +// +// The contents of the finalPointPositionTable are where vertex i [0..33] +// ends up on the half-edge at the max tessellation amount given +// ruler-function split order. Recall the other half of an edge is mirrored, +// so we only need to deal with one half. This table is used to decide when +// to advance a point on the interior or exterior. It supports odd TessFactor +// up to 65 and even TessFactor up to 64. + +/* TODO: Is this actually faster than a LUT? */ +static uint32_t +finalPointPositionTable(uint32_t x) +{ + if (x == 0) + return 0; + if (x == 1) + return 0x20; + + uint32_t shift; + if ((x & 1) == 0) { + shift = 1; + } else if ((x & 3) == 3) { + shift = 2; + } else if ((x & 7) == 5) { + shift = 3; + } else if (x != 17) { + shift = 4; + } else { + shift = 5; + } + + // SWAR vectorized right-shift of (0x20, x) + // We're calculating `min(0xf, 0x20 >> shift) + (x >> shift)`.
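+ // Worked example: x = 3 has (x & 3) == 3, so shift = 2. Then items_to_shift + // == 0x00200003 and shifted == 0x00080000, giving bias = min(0xf, 8) = 8 and + // a low half of 0, i.e. min(0xf, 0x20 >> 2) + (3 >> 2) = 8 as expected.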
+ uint32_t items_to_shift = x | (0x20 << 16); + uint32_t shifted = items_to_shift >> shift; + + uint32_t bias = min(0xfu, shifted >> 16); + return bias + (shifted & 0xffff); +} + +static void +StitchTransition(private struct CHWTessellator *ctx, int baseIndexOffset, + int insideEdgePointBaseOffset, + int insideNumHalfTessFactorPoints, + bool insideEdgeTessFactorOdd, int outsideEdgePointBaseOffset, + int outsideNumHalfTessFactorPoints, bool outsideTessFactorOdd) +{ + if (insideEdgeTessFactorOdd) { + insideNumHalfTessFactorPoints -= 1; + } + if (outsideTessFactorOdd) { + outsideNumHalfTessFactorPoints -= 1; + } + // Walk first half + int outsidePoint = outsideEdgePointBaseOffset; + int insidePoint = insideEdgePointBaseOffset; + + // iStart,iEnd are a small optimization so the loop below doesn't have to go + // from 0 up to 31 + int iStart = min(loop_start(insideNumHalfTessFactorPoints), + loop_start(outsideNumHalfTessFactorPoints)); + int iEnd = loop_end( + max(insideNumHalfTessFactorPoints, outsideNumHalfTessFactorPoints)); + + // since we don't start the loop at 0 below, we need a special case. + if (0 < outsideNumHalfTessFactorPoints) { + // Advance outside + DefineClockwiseTriangle(ctx, outsidePoint, outsidePoint + 1, insidePoint, + baseIndexOffset); + baseIndexOffset += 3; + outsidePoint++; + } + + for (int i = iStart; i <= iEnd; i++) { + int bound = finalPointPositionTable(i); + + if (bound < insideNumHalfTessFactorPoints) { + // Advance inside + DefineClockwiseTriangle(ctx, insidePoint, outsidePoint, + insidePoint + 1, baseIndexOffset); + baseIndexOffset += 3; + insidePoint++; + } + if (bound < outsideNumHalfTessFactorPoints) { + // Advance outside + DefineClockwiseTriangle(ctx, outsidePoint, outsidePoint + 1, + insidePoint, baseIndexOffset); + baseIndexOffset += 3; + outsidePoint++; + } + } + + if ((insideEdgeTessFactorOdd != outsideTessFactorOdd) || + insideEdgeTessFactorOdd) { + if (insideEdgeTessFactorOdd == outsideTessFactorOdd) { + // Quad in the middle + DefineClockwiseTriangle(ctx, insidePoint, outsidePoint, + insidePoint + 1, baseIndexOffset); + baseIndexOffset += 3; + DefineClockwiseTriangle(ctx, insidePoint + 1, outsidePoint, + outsidePoint + 1, baseIndexOffset); + baseIndexOffset += 3; + insidePoint++; + outsidePoint++; + } else if (!insideEdgeTessFactorOdd) { + // Triangle pointing inside + DefineClockwiseTriangle(ctx, insidePoint, outsidePoint, + outsidePoint + 1, baseIndexOffset); + baseIndexOffset += 3; + outsidePoint++; + } else { + // Triangle pointing outside + DefineClockwiseTriangle(ctx, insidePoint, outsidePoint, + insidePoint + 1, baseIndexOffset); + baseIndexOffset += 3; + insidePoint++; + } + } + + // Walk second half. + for (int i = iEnd; i >= iStart; i--) { + int bound = finalPointPositionTable(i); + + if (bound < outsideNumHalfTessFactorPoints) { + // Advance outside + DefineClockwiseTriangle(ctx, outsidePoint, outsidePoint + 1, + insidePoint, baseIndexOffset); + baseIndexOffset += 3; + outsidePoint++; + } + if (bound < insideNumHalfTessFactorPoints) { + // Advance inside + DefineClockwiseTriangle(ctx, insidePoint, outsidePoint, + insidePoint + 1, baseIndexOffset); + baseIndexOffset += 3; + insidePoint++; + } + } + // Below case is not needed if we didn't optimize loop above and made it run + // from 31 down to 0. 
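+ // (This mirrors the special case before the first half-walk: the loop + // bounds skip index 0, so the final outside advance is emitted explicitly.)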
+ if (0 < outsideNumHalfTessFactorPoints) { + DefineClockwiseTriangle(ctx, outsidePoint, outsidePoint + 1, insidePoint, + baseIndexOffset); + baseIndexOffset += 3; + outsidePoint++; + } +} + +void +libagx_tess_isoline(constant struct libagx_tess_args *p, + enum libagx_tess_mode mode, + enum libagx_tess_partitioning partitioning, + enum libagx_tess_output_primitive output_primitive, + uint patch) +{ + bool lineDensityOdd; + bool lineDetailOdd; + TESS_FACTOR_CONTEXT lineDensityTessFactorCtx; + TESS_FACTOR_CONTEXT lineDetailTessFactorCtx; + + global float *factors = tess_factors(p, patch); + float TessFactor_V_LineDensity = factors[0]; + float TessFactor_U_LineDetail = factors[1]; + + // Is the patch culled? NaN will pass. + if (!(TessFactor_V_LineDensity > 0) || !(TessFactor_U_LineDetail > 0)) { + libagx_draw_empty(p, mode, output_primitive, patch); + return; + } + + // Clamp edge TessFactors + TessFactor_V_LineDensity = + clamp(TessFactor_V_LineDensity, + LIBAGX_TESS_MIN_ISOLINE_DENSITY_TESSELLATION_FACTOR, + LIBAGX_TESS_MAX_ISOLINE_DENSITY_TESSELLATION_FACTOR); + TessFactor_U_LineDetail = + clamp_factor(TessFactor_U_LineDetail, partitioning, 0); + + // Process tessFactors + if (partitioning == LIBAGX_TESS_PARTITIONING_INTEGER) { + lineDetailOdd = isOdd(TessFactor_U_LineDetail); + } else { + lineDetailOdd = (partitioning == LIBAGX_TESS_PARTITIONING_FRACTIONAL_ODD); + } + + FXP fxpTessFactor_U_LineDetail = floatToFixed(TessFactor_U_LineDetail); + + ComputeTessFactorCtx(lineDetailOdd, fxpTessFactor_U_LineDetail, + &lineDetailTessFactorCtx); + int numPointsPerLine = + NumPointsForTessFactor(lineDetailOdd, fxpTessFactor_U_LineDetail); + + TessFactor_V_LineDensity = ceil(TessFactor_V_LineDensity); + lineDensityOdd = isOdd(TessFactor_V_LineDensity); + FXP fxpTessFactor_V_LineDensity = floatToFixed(TessFactor_V_LineDensity); + ComputeTessFactorCtx(lineDensityOdd, fxpTessFactor_V_LineDensity, + &lineDensityTessFactorCtx); + + // don't draw last line at V == 1. 
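+ // (A density factor of N yields N + 1 line positions; dropping the one at + // V == 1 leaves N lines, so an adjacent patch, whose first line sits at its + // own V == 0, does not draw the shared boundary twice.)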
+ int numLines = + NumPointsForTessFactor(lineDensityOdd, fxpTessFactor_V_LineDensity) - 1; + + /* Points */ + uint num_points = numPointsPerLine * numLines; + if (mode != LIBAGX_TESS_MODE_COUNT) { + global struct libagx_tess_point *points = + libagx_heap_alloc_points(p, patch, num_points); + + for (int line = 0, pointOffset = 0; line < numLines; line++) { + FXP fxpV = + PlacePointIn1D(&lineDensityTessFactorCtx, lineDensityOdd, line); + + for (int point = 0; point < numPointsPerLine; point++) { + FXP fxpU = + PlacePointIn1D(&lineDetailTessFactorCtx, lineDetailOdd, point); + + DefinePoint(&points[pointOffset++], fxpU, fxpV); + } + } + } + + struct CHWTessellator ctx; + ctx.mode = mode; + ctx.index_bias = patch * LIBAGX_TES_PATCH_ID_STRIDE; + + /* Connectivity */ + if (output_primitive != LIBAGX_TESS_OUTPUT_POINT) { + uint num_indices = numLines * (numPointsPerLine - 1) * 2; + ctx.Index = libagx_draw(p, mode, true, patch, num_indices); + + if (mode == LIBAGX_TESS_MODE_COUNT) + return; + + for (int line = 0, pointOffset = 0, indexOffset = 0; line < numLines; + line++) { + pointOffset++; + + for (int point = 1; point < numPointsPerLine; point++) { + DefineIndex(&ctx, pointOffset - 1, indexOffset++); + DefineIndex(&ctx, pointOffset, indexOffset++); + pointOffset++; + } + } + } else { + libagx_draw_points(&ctx, p, patch, num_points); + } +} + +void +libagx_tess_tri(constant struct libagx_tess_args *p, enum libagx_tess_mode mode, + + enum libagx_tess_partitioning partitioning, + enum libagx_tess_output_primitive output_primitive, uint patch) +{ + global float *factors = tess_factors(p, patch); + float tessFactor_Ueq0 = factors[0]; + float tessFactor_Veq0 = factors[1]; + float tessFactor_Weq0 = factors[2]; + float insideTessFactor_f = factors[4]; + + struct CHWTessellator ctx; + ctx.outputPrimitive = output_primitive; + ctx.Point = NULL; + ctx.Index = NULL; + ctx.mode = mode; + ctx.index_bias = patch * LIBAGX_TES_PATCH_ID_STRIDE; + ctx.bUsingPatchedIndices = false; + ctx.bUsingPatchedIndices2 = false; + + // Is the patch culled? NaN will pass. 
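+ // (Comparisons against NaN are false, so a NaN factor makes !(factor > 0) + // true and the patch is discarded, exactly like a zero or negative factor.)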
+ if (!(tessFactor_Ueq0 > 0) || !(tessFactor_Veq0 > 0) || + !(tessFactor_Weq0 > 0)) { + + libagx_draw_empty(p, mode, output_primitive, patch); + + return; + } + + FXP outsideTessFactor[TRI_EDGES]; + FXP insideTessFactor; + bool outsideTessFactorOdd[TRI_EDGES]; + bool insideTessFactorOdd; + TESS_FACTOR_CONTEXT outsideTessFactorCtx[TRI_EDGES]; + TESS_FACTOR_CONTEXT insideTessFactorCtx; + // Stuff below is just specific to the traversal order + // this code happens to use to generate points/lines + int numPointsForOutsideEdge[TRI_EDGES]; + int numPointsForInsideTessFactor; + int insideEdgePointBaseOffset; + + // Clamp TessFactors + tessFactor_Ueq0 = clamp_factor(tessFactor_Ueq0, partitioning, 0); + tessFactor_Veq0 = clamp_factor(tessFactor_Veq0, partitioning, 0); + tessFactor_Weq0 = clamp_factor(tessFactor_Weq0, partitioning, 0); + + float maxf = max(max(tessFactor_Ueq0, tessFactor_Veq0), tessFactor_Weq0); + insideTessFactor_f = clamp_factor(insideTessFactor_f, partitioning, maxf); + // Note the above clamps map NaN to the lower bound + + // Process tessFactors + float outsideTessFactor_f[TRI_EDGES] = {tessFactor_Ueq0, tessFactor_Veq0, + tessFactor_Weq0}; + if (partitioning == LIBAGX_TESS_PARTITIONING_INTEGER) { + for (int edge = 0; edge < TRI_EDGES; edge++) { + outsideTessFactorOdd[edge] = isOdd(outsideTessFactor_f[edge]); + } + insideTessFactorOdd = + isOdd(insideTessFactor_f) && (1.0f != insideTessFactor_f); + } else { + bool odd = (partitioning == LIBAGX_TESS_PARTITIONING_FRACTIONAL_ODD); + + for (int edge = 0; edge < TRI_EDGES; edge++) { + outsideTessFactorOdd[edge] = odd; + } + insideTessFactorOdd = odd; + } + + // Save fixed point TessFactors + for (int edge = 0; edge < TRI_EDGES; edge++) { + outsideTessFactor[edge] = floatToFixed(outsideTessFactor_f[edge]); + } + insideTessFactor = floatToFixed(insideTessFactor_f); + + if (partitioning != LIBAGX_TESS_PARTITIONING_FRACTIONAL_EVEN) { + // Special case if all TessFactors are 1 + if ((FXP_ONE == insideTessFactor) && + (FXP_ONE == outsideTessFactor[Ueq0]) && + (FXP_ONE == outsideTessFactor[Veq0]) && + (FXP_ONE == outsideTessFactor[Weq0])) { + + /* Just do minimum tess factor */ + if (mode == LIBAGX_TESS_MODE_COUNT) { + p->counts[patch] = 3; + return; + } + + global struct libagx_tess_point *points = + libagx_heap_alloc_points(p, patch, 3); + + DefinePoint(&points[0], 0, + FXP_ONE); // V=1 (beginning of Ueq0 edge VW) + DefinePoint(&points[1], 0, 0); // W=1 (beginning of Veq0 edge WU) + DefinePoint(&points[2], FXP_ONE, + 0); // U=1 (beginning of Weq0 edge UV) + + if (output_primitive != LIBAGX_TESS_OUTPUT_POINT) { + ctx.Index = libagx_draw(p, mode, false, patch, 3); + + DefineClockwiseTriangle(&ctx, 0, 1, 2, + /*indexStorageBaseOffset*/ 0); + } else { + libagx_draw_points(&ctx, p, patch, 3); + } + + return; + } + } + + // Compute per-TessFactor metadata + for (int edge = 0; edge < TRI_EDGES; edge++) { + ComputeTessFactorCtx(outsideTessFactorOdd[edge], outsideTessFactor[edge], + &outsideTessFactorCtx[edge]); + } + ComputeTessFactorCtx(insideTessFactorOdd, insideTessFactor, + &insideTessFactorCtx); + + // Compute some initial data. 
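+ // (Worked example, integer partitioning with all factors 4: each outside + // edge has 5 points, so 3*5 - 3 = 12 exterior points after sharing corners; + // the single interior ring adds 3*2 = 6 more plus the center point, 19 + // domain points in total.)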
+ int NumPoints = 0; + + // outside edge offsets and storage + for (int edge = 0; edge < TRI_EDGES; edge++) { + numPointsForOutsideEdge[edge] = NumPointsForTessFactor( + outsideTessFactorOdd[edge], outsideTessFactor[edge]); + NumPoints += numPointsForOutsideEdge[edge]; + } + NumPoints -= 3; + + // inside edge offsets + numPointsForInsideTessFactor = + NumPointsForTessFactor(insideTessFactorOdd, insideTessFactor); + { + int pointCountMin = insideTessFactorOdd ? 4 : 3; + // max() allows degenerate transition regions when inside TessFactor == 1 + numPointsForInsideTessFactor = + max(pointCountMin, numPointsForInsideTessFactor); + } + + insideEdgePointBaseOffset = NumPoints; + + // inside storage, including interior edges above + { + int interiorRings = (numPointsForInsideTessFactor >> 1) - 1; + int even = insideTessFactorOdd ? 0 : 1; + NumPoints += TRI_EDGES * (interiorRings * (interiorRings + even)) + even; + } + + /* GENERATE POINTS */ + if (mode != LIBAGX_TESS_MODE_COUNT) { + ctx.Point = libagx_heap_alloc_points(p, patch, NumPoints); + + // Generate exterior ring edge points, clockwise starting from point V + // (VW, the U==0 edge) + int pointOffset = 0; + for (int edge = 0; edge < TRI_EDGES; edge++) { + int odd = edge & 0x1; + int endPoint = numPointsForOutsideEdge[edge] - 1; + // don't include end, since next edge starts with it. + for (int p = 0; p < endPoint; p++, pointOffset++) { + // whether to reverse point order given we are defining V or U (W + // implicit): edge0, VW, has V decreasing, so reverse 1D points + // below edge1, WU, has U increasing, so don't reverse 1D points + // below edge2, UV, has U decreasing, so reverse 1D points below + int q = odd ? p : endPoint - p; + + FXP fxpParam = PlacePointIn1D(&outsideTessFactorCtx[edge], + outsideTessFactorOdd[edge], q); + if (edge == 0) { + DefinePoint(&ctx.Point[pointOffset], 0, fxpParam); + } else { + DefinePoint(&ctx.Point[pointOffset], fxpParam, + (edge == 2) ? FXP_ONE - fxpParam : 0); + } + } + } + + // Generate interior ring points, clockwise spiralling in + int numRings = (numPointsForInsideTessFactor >> 1); + for (int ring = 1; ring < numRings; ring++) { + int startPoint = ring; + int endPoint = numPointsForInsideTessFactor - 1 - startPoint; + + for (int edge = 0; edge < TRI_EDGES; edge++) { + int odd = edge & 0x1; + int perpendicularAxisPoint = startPoint; + FXP fxpPerpParam = + PlacePointIn1D(&insideTessFactorCtx, insideTessFactorOdd, + perpendicularAxisPoint); + + // Map location to the right size in + // barycentric space. We know this fixed + // point math won't over/underflow + fxpPerpParam *= FXP_TWO_THIRDS; + fxpPerpParam = (fxpPerpParam + FXP_ONE_HALF /*round*/) >> + FXP_FRACTION_BITS; // get back to n.16 + + // don't include end: next edge starts with it. + for (int p = startPoint; p < endPoint; p++, pointOffset++) { + // whether to reverse point given we are defining V or U (W + // implicit): edge0, VW, has V decreasing, so reverse 1D points + // below edge1, WU, has U increasing, so don't reverse 1D points + // below edge2, UV, has U decreasing, so reverse 1D points below + int q = odd ? 
p : endPoint - (p - startPoint); + + FXP fxpParam = + PlacePointIn1D(&insideTessFactorCtx, insideTessFactorOdd, q); + // edge0 VW, has perpendicular parameter U constant + // edge1 WU, has perpendicular parameter V constant + // edge2 UV, has perpendicular parameter W constant + // reciprocal is the rate of change of edge-parallel parameters + // as they are pushed into the triangle + const unsigned int deriv = 2; + + // we know this fixed point math won't over/underflow + FXP tmp = fxpParam - (fxpPerpParam + 1 /*round*/) / deriv; + + DefinePoint(&ctx.Point[pointOffset], + edge > 0 ? tmp : fxpPerpParam, + edge == 0 ? tmp + : edge == 1 ? fxpPerpParam + : FXP_ONE - tmp - fxpPerpParam); + } + } + } + if (!insideTessFactorOdd) { + // Last point is the point at the center. + DefinePoint(&ctx.Point[pointOffset], FXP_ONE_THIRD, FXP_ONE_THIRD); + } + } + + if (output_primitive == LIBAGX_TESS_OUTPUT_POINT) { + libagx_draw_points(&ctx, p, patch, NumPoints); + return; + } + + { + // Generate primitives for all the concentric rings, one side at a time + // for each ring +1 is so even tess includes the center point, which we + // want to now + int numRings = ((numPointsForInsideTessFactor + 1) >> 1); + + int NumIndices = 0; + { + assert(numRings >= 2 && "invariant"); + int OuterPoints = numPointsForOutsideEdge[0] + + numPointsForOutsideEdge[1] + + numPointsForOutsideEdge[2]; + + int numRings18 = numRings * 18; + NumIndices = ((numRings18 - 27) * numPointsForInsideTessFactor) + + (3 * OuterPoints) - (numRings18 * (numRings - 1)) + + (insideTessFactorOdd ? 3 : 0); + } + + // Generate the draw and allocate the index buffer now that we know the size + ctx.Index = libagx_draw(p, mode, false, patch, NumIndices); + + if (mode == LIBAGX_TESS_MODE_COUNT) + return; + + int insideOffset = insideEdgePointBaseOffset; + int outsideEdgePointBaseOffset = 0; + + NumIndices = 0; + for (int ring = 1; ring < numRings; ring++) { + int numPointsForInsideEdge = numPointsForInsideTessFactor - 2 * ring; + int edge0InsidePointBaseOffset = insideOffset; + int edge0OutsidePointBaseOffset = outsideEdgePointBaseOffset; + for (int edge = 0; edge < TRI_EDGES; edge++) { + int outsidePoints = ring == 1 ? 
numPointsForOutsideEdge[edge] + : (numPointsForInsideEdge + 2); + + int numTriangles = numPointsForInsideEdge + outsidePoints - 2; + + int insideBaseOffset; + int outsideBaseOffset; + if (edge == 2) { + ctx.IndexPatchCtx.insidePointIndexDeltaToRealValue = + insideOffset; + ctx.IndexPatchCtx.insidePointIndexBadValue = + numPointsForInsideEdge - 1; + ctx.IndexPatchCtx.insidePointIndexReplacementValue = + edge0InsidePointBaseOffset; + ctx.IndexPatchCtx.outsidePointIndexPatchBase = + ctx.IndexPatchCtx.insidePointIndexBadValue + + 1; // past inside patched index range + ctx.IndexPatchCtx.outsidePointIndexDeltaToRealValue = + outsideEdgePointBaseOffset - + ctx.IndexPatchCtx.outsidePointIndexPatchBase; + ctx.IndexPatchCtx.outsidePointIndexBadValue = + ctx.IndexPatchCtx.outsidePointIndexPatchBase + outsidePoints - + 1; + ctx.IndexPatchCtx.outsidePointIndexReplacementValue = + edge0OutsidePointBaseOffset; + ctx.bUsingPatchedIndices = true; + insideBaseOffset = 0; + outsideBaseOffset = ctx.IndexPatchCtx.outsidePointIndexPatchBase; + } else { + insideBaseOffset = insideOffset; + outsideBaseOffset = outsideEdgePointBaseOffset; + } + if (ring == 1) { + StitchTransition( + &ctx, /*baseIndexOffset: */ NumIndices, insideBaseOffset, + insideTessFactorCtx.numHalfTessFactorPoints, + insideTessFactorOdd, outsideBaseOffset, + outsideTessFactorCtx[edge].numHalfTessFactorPoints, + outsideTessFactorOdd[edge]); + } else { + StitchRegular(&ctx, /*bTrapezoid*/ true, DIAGONALS_MIRRORED, + /*baseIndexOffset: */ NumIndices, + numPointsForInsideEdge, insideBaseOffset, + outsideBaseOffset); + } + if (2 == edge) { + ctx.bUsingPatchedIndices = false; + } + NumIndices += numTriangles * 3; + outsideEdgePointBaseOffset += outsidePoints - 1; + insideOffset += numPointsForInsideEdge - 1; + } + } + if (insideTessFactorOdd) { + // Triangulate center (a single triangle) + DefineClockwiseTriangle(&ctx, outsideEdgePointBaseOffset, + outsideEdgePointBaseOffset + 1, + outsideEdgePointBaseOffset + 2, NumIndices); + NumIndices += 3; + } + } +} + +void +libagx_tess_quad(constant struct libagx_tess_args *p, + enum libagx_tess_mode mode, + enum libagx_tess_partitioning partitioning, + enum libagx_tess_output_primitive output_primitive, uint patch) +{ + global float *factors = tess_factors(p, patch); + + float tessFactor_Ueq0 = factors[0]; + float tessFactor_Veq0 = factors[1]; + float tessFactor_Ueq1 = factors[2]; + float tessFactor_Veq1 = factors[3]; + + float insideTessFactor_U = factors[4]; + float insideTessFactor_V = factors[5]; + + // TODO: fix designated initializer optimization in NIR + struct CHWTessellator ctx; + ctx.outputPrimitive = output_primitive; + ctx.Point = NULL; + ctx.Index = NULL; + ctx.mode = mode; + ctx.index_bias = patch * LIBAGX_TES_PATCH_ID_STRIDE; + ctx.bUsingPatchedIndices = false; + ctx.bUsingPatchedIndices2 = false; + + // Is the patch culled? 
+ if (!(tessFactor_Ueq0 > 0) || // NaN will pass + !(tessFactor_Veq0 > 0) || !(tessFactor_Ueq1 > 0) || + !(tessFactor_Veq1 > 0)) { + libagx_draw_empty(p, mode, output_primitive, patch); + return; + } + + FXP outsideTessFactor[QUAD_EDGES]; + FXP insideTessFactor[QUAD_AXES]; + bool outsideTessFactorOdd[QUAD_EDGES]; + bool insideTessFactorOdd[QUAD_AXES]; + TESS_FACTOR_CONTEXT outsideTessFactorCtx[QUAD_EDGES]; + TESS_FACTOR_CONTEXT insideTessFactorCtx[QUAD_AXES]; + // Stuff below is just specific to the traversal order + // this code happens to use to generate points/lines + int numPointsForOutsideEdge[QUAD_EDGES]; + int numPointsForInsideTessFactor[QUAD_AXES]; + int insideEdgePointBaseOffset; + + // Clamp edge TessFactors + tessFactor_Ueq0 = clamp_factor(tessFactor_Ueq0, partitioning, 0); + tessFactor_Veq0 = clamp_factor(tessFactor_Veq0, partitioning, 0); + tessFactor_Ueq1 = clamp_factor(tessFactor_Ueq1, partitioning, 0); + tessFactor_Veq1 = clamp_factor(tessFactor_Veq1, partitioning, 0); + + float maxf = max(max(max(tessFactor_Ueq0, tessFactor_Veq0), + max(tessFactor_Ueq1, tessFactor_Veq1)), + max(insideTessFactor_U, insideTessFactor_V)); + + insideTessFactor_U = clamp_factor(insideTessFactor_U, partitioning, maxf); + insideTessFactor_V = clamp_factor(insideTessFactor_V, partitioning, maxf); + // Note the above clamps map NaN to lowerBound + + // Process tessFactors + float outsideTessFactor_f[QUAD_EDGES] = {tessFactor_Ueq0, tessFactor_Veq0, + tessFactor_Ueq1, tessFactor_Veq1}; + float insideTessFactor_f[QUAD_AXES] = {insideTessFactor_U, + insideTessFactor_V}; + int edge, axis; + if (partitioning == LIBAGX_TESS_PARTITIONING_INTEGER) { + for (edge = 0; edge < QUAD_EDGES; edge++) { + outsideTessFactorOdd[edge] = isOdd(outsideTessFactor_f[edge]); + } + for (axis = 0; axis < QUAD_AXES; axis++) { + insideTessFactorOdd[axis] = isOdd(insideTessFactor_f[axis]) && + (1.0f != insideTessFactor_f[axis]); + } + } else { + bool odd = (partitioning == LIBAGX_TESS_PARTITIONING_FRACTIONAL_ODD); + + for (edge = 0; edge < QUAD_EDGES; edge++) { + outsideTessFactorOdd[edge] = odd; + } + insideTessFactorOdd[U] = insideTessFactorOdd[V] = odd; + } + + // Save fixed point TessFactors + for (edge = 0; edge < QUAD_EDGES; edge++) { + outsideTessFactor[edge] = floatToFixed(outsideTessFactor_f[edge]); + } + for (axis = 0; axis < QUAD_AXES; axis++) { + insideTessFactor[axis] = floatToFixed(insideTessFactor_f[axis]); + } + + if (partitioning != LIBAGX_TESS_PARTITIONING_FRACTIONAL_EVEN) { + // Special case if all TessFactors are 1 + if ((FXP_ONE == insideTessFactor[U]) && + (FXP_ONE == insideTessFactor[V]) && + (FXP_ONE == outsideTessFactor[Ueq0]) && + (FXP_ONE == outsideTessFactor[Veq0]) && + (FXP_ONE == outsideTessFactor[Ueq1]) && + (FXP_ONE == outsideTessFactor[Veq1])) { + + /* Just do minimum tess factor */ + if (output_primitive != LIBAGX_TESS_OUTPUT_POINT) { + ctx.Index = libagx_draw(p, mode, false, patch, 6); + if (mode == LIBAGX_TESS_MODE_COUNT) + return; + + DefineClockwiseTriangle(&ctx, 0, 1, 3, /*indexStorageOffset*/ 0); + DefineClockwiseTriangle(&ctx, 1, 2, 3, /*indexStorageOffset*/ 3); + } else { + libagx_draw_points(&ctx, p, patch, 4); + } + + global struct libagx_tess_point *points = + libagx_heap_alloc_points(p, patch, 4); + + DefinePoint(&points[0], 0, 0); + DefinePoint(&points[1], FXP_ONE, 0); + DefinePoint(&points[2], FXP_ONE, FXP_ONE); + DefinePoint(&points[3], 0, FXP_ONE); + return; + } + } + + // Compute TessFactor-specific metadata + for (int edge = 0; edge < QUAD_EDGES; edge++) { + 
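+      // (As in the reference tessellator, each factor is processed as two
+      // mirrored halves so points are placed symmetrically from both ends of
+      // the edge; the per-half point counts recorded here are what
+      // StitchTransition() consumes when stitching ring 1 below.)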
ComputeTessFactorCtx(outsideTessFactorOdd[edge], outsideTessFactor[edge], + &outsideTessFactorCtx[edge]); + } + + for (int axis = 0; axis < QUAD_AXES; axis++) { + ComputeTessFactorCtx(insideTessFactorOdd[axis], insideTessFactor[axis], + &insideTessFactorCtx[axis]); + } + + int NumPoints = 0; + + // outside edge offsets and storage + for (int edge = 0; edge < QUAD_EDGES; edge++) { + numPointsForOutsideEdge[edge] = NumPointsForTessFactor( + outsideTessFactorOdd[edge], outsideTessFactor[edge]); + NumPoints += numPointsForOutsideEdge[edge]; + } + NumPoints -= 4; + + // inside edge offsets + for (int axis = 0; axis < QUAD_AXES; axis++) { + numPointsForInsideTessFactor[axis] = NumPointsForTessFactor( + insideTessFactorOdd[axis], insideTessFactor[axis]); + int pointCountMin = insideTessFactorOdd[axis] ? 4 : 3; + // max() allows degenerate transition regions when inside TessFactor == 1 + numPointsForInsideTessFactor[axis] = + max(pointCountMin, numPointsForInsideTessFactor[axis]); + } + + insideEdgePointBaseOffset = NumPoints; + + // inside storage, including interior edges above + int numInteriorPoints = (numPointsForInsideTessFactor[U] - 2) * + (numPointsForInsideTessFactor[V] - 2); + NumPoints += numInteriorPoints; + + if (mode != LIBAGX_TESS_MODE_COUNT) { + ctx.Point = libagx_heap_alloc_points(p, patch, NumPoints); + + // Generate exterior ring edge points, clockwise from top-left + int pointOffset = 0; + for (int edge = 0; edge < QUAD_EDGES; edge++) { + int odd = edge & 0x1; + // don't include end, since next edge starts with it. + int endPoint = numPointsForOutsideEdge[edge] - 1; + for (int p = 0; p < endPoint; p++, pointOffset++) { + FXP fxpParam; + int q = + ((edge == 1) || (edge == 2)) ? p : endPoint - p; // reverse order + fxpParam = PlacePointIn1D(&outsideTessFactorCtx[edge], + outsideTessFactorOdd[edge], q); + if (odd) { + DefinePoint(&ctx.Point[pointOffset], fxpParam, + (edge == 3) ? FXP_ONE : 0); + } else { + DefinePoint(&ctx.Point[pointOffset], (edge == 2) ? FXP_ONE : 0, + fxpParam); + } + } + } + + // Generate interior ring points, clockwise from (U==0,V==1) (bottom-left) + // spiralling toward center + int minNumPointsForTessFactor = + min(numPointsForInsideTessFactor[U], numPointsForInsideTessFactor[V]); + // note for even tess we aren't counting center point here. + int numRings = (minNumPointsForTessFactor >> 1); + + for (int ring = 1; ring < numRings; ring++) { + int startPoint = ring; + int endPoint[QUAD_AXES] = { + numPointsForInsideTessFactor[U] - 1 - startPoint, + numPointsForInsideTessFactor[V] - 1 - startPoint, + }; + + for (int edge = 0; edge < QUAD_EDGES; edge++) { + int odd[QUAD_AXES] = {edge & 0x1, ((edge + 1) & 0x1)}; + int perpendicularAxisPoint = + (edge < 2) ? startPoint : endPoint[odd[0]]; + FXP fxpPerpParam = PlacePointIn1D(&insideTessFactorCtx[odd[0]], + insideTessFactorOdd[odd[0]], + perpendicularAxisPoint); + + for (int p = startPoint; p < endPoint[odd[1]]; p++, + pointOffset++) // don't include end: next edge starts with + // it. + { + int q = ((edge == 1) || (edge == 2)) + ? 
p + : endPoint[odd[1]] - (p - startPoint); + FXP fxpParam = PlacePointIn1D(&insideTessFactorCtx[odd[1]], + insideTessFactorOdd[odd[1]], q); + if (odd[1]) { + DefinePoint(&ctx.Point[pointOffset], fxpPerpParam, fxpParam); + } else { + DefinePoint(&ctx.Point[pointOffset], fxpParam, fxpPerpParam); + } + } + } + } + // For even tessellation, the inner "ring" is degenerate - a row of points + if ((numPointsForInsideTessFactor[U] > numPointsForInsideTessFactor[V]) && + !insideTessFactorOdd[V]) { + int startPoint = numRings; + int endPoint = numPointsForInsideTessFactor[U] - 1 - startPoint; + for (int p = startPoint; p <= endPoint; p++, pointOffset++) { + FXP fxpParam = PlacePointIn1D(&insideTessFactorCtx[U], + insideTessFactorOdd[U], p); + DefinePoint(&ctx.Point[pointOffset], fxpParam, FXP_ONE_HALF); + } + } else if ((numPointsForInsideTessFactor[V] >= + numPointsForInsideTessFactor[U]) && + !insideTessFactorOdd[U]) { + int startPoint = numRings; + int endPoint = numPointsForInsideTessFactor[V] - 1 - startPoint; + for (int p = endPoint; p >= startPoint; p--, pointOffset++) { + FXP fxpParam = PlacePointIn1D(&insideTessFactorCtx[V], + insideTessFactorOdd[V], p); + DefinePoint(&ctx.Point[pointOffset], FXP_ONE_HALF, fxpParam); + } + } + } + + if (output_primitive == LIBAGX_TESS_OUTPUT_POINT) { + libagx_draw_points(&ctx, p, patch, NumPoints); + return; + } + + /* CONNECTIVITY */ + { + // Generate primitives for all the concentric rings, one side at a time + // for each ring. +1 is so even tess includes the center point + int numPointRowsToCenter[QUAD_AXES] = { + (numPointsForInsideTessFactor[U] + 1) >> 1, + (numPointsForInsideTessFactor[V] + 1) >> 1, + }; + + int numRings = min(numPointRowsToCenter[U], numPointRowsToCenter[V]); + + /* Calculate # of indices so we can allocate */ + { + /* numPointsForInsideTessFactor >= 3 so numRings >= 2 */ + assert(numRings >= 2); + + /* Handle main case */ + int OuterPoints = + numPointsForOutsideEdge[0] + numPointsForOutsideEdge[1] + + numPointsForOutsideEdge[2] + numPointsForOutsideEdge[3]; + + int InnerPoints = + numPointsForInsideTessFactor[U] + numPointsForInsideTessFactor[V]; + + int NumIndices = (OuterPoints * 3) + (12 * numRings * InnerPoints) - + (InnerPoints * 18) - (24 * numRings * (numRings - 1)); + + /* Determine major/minor axes */ + bool U_major = + (numPointsForInsideTessFactor[U] > numPointsForInsideTessFactor[V]); + unsigned M = U_major ? U : V; + unsigned m = U_major ? V : U; + + /* Handle degenerate ring */ + if (insideTessFactorOdd[m]) { + assert(numPointsForInsideTessFactor[M] >= + numPointsForInsideTessFactor[m]); + + NumIndices += 12 * ((numPointsForInsideTessFactor[M] >> 1) - + (numPointsForInsideTessFactor[m] >> 1)); + NumIndices += (insideTessFactorOdd[M] ? 6 : 12); + } + + // Generate the draw and allocate the index buffer with the size + ctx.Index = libagx_draw(p, mode, false, patch, NumIndices); + } + + if (mode == LIBAGX_TESS_MODE_COUNT) + return; + + int degeneratePointRing[QUAD_AXES] = { + // Even partitioning causes degenerate row of points, + // which results in exceptions to the point ordering conventions + // when travelling around the rings counterclockwise. + !insideTessFactorOdd[V] ? numPointRowsToCenter[V] - 1 : -1, + !insideTessFactorOdd[U] ? 
numPointRowsToCenter[U] - 1 : -1, + }; + + int numPointsForOutsideEdge_[QUAD_EDGES] = { + numPointsForOutsideEdge[Ueq0], + numPointsForOutsideEdge[Veq0], + numPointsForOutsideEdge[Ueq1], + numPointsForOutsideEdge[Veq1], + }; + + int insideEdgePointBaseOffset_ = insideEdgePointBaseOffset; + int outsideEdgePointBaseOffset = 0; + + int NumIndices = 0; + + for (int ring = 1; ring < numRings; ring++) { + int numPointsForInsideEdge[QUAD_AXES] = { + numPointsForInsideTessFactor[U] - 2 * ring, + numPointsForInsideTessFactor[V] - 2 * ring}; + + int edge0InsidePointBaseOffset = insideEdgePointBaseOffset_; + int edge0OutsidePointBaseOffset = outsideEdgePointBaseOffset; + + for (int edge = 0; edge < QUAD_EDGES; edge++) { + int odd = (edge + 1) & 0x1; + + int numTriangles = + numPointsForInsideEdge[odd] + numPointsForOutsideEdge_[edge] - 2; + int insideBaseOffset; + int outsideBaseOffset; + + // We need to patch the indexing so Stitch() can think it sees 2 + // sequentially increasing rows of points, even though we have + // wrapped around to the end of the inner and outer ring's points, + // so the last point is really the first point for the ring. We make + // it so that when Stitch() calls AddIndex(), that function will do + // any necessary index adjustment. + if (edge == 3) { + if (ring == degeneratePointRing[odd]) { + ctx.IndexPatchCtx2.baseIndexToInvert = + insideEdgePointBaseOffset_ + 1; + ctx.IndexPatchCtx2.cornerCaseBadValue = + outsideEdgePointBaseOffset + + numPointsForOutsideEdge_[edge] - 1; + ctx.IndexPatchCtx2.cornerCaseReplacementValue = + edge0OutsidePointBaseOffset; + ctx.IndexPatchCtx2.indexInversionEndPoint = + (ctx.IndexPatchCtx2.baseIndexToInvert << 1) - 1; + insideBaseOffset = ctx.IndexPatchCtx2.baseIndexToInvert; + outsideBaseOffset = outsideEdgePointBaseOffset; + ctx.bUsingPatchedIndices2 = true; + } else { + ctx.IndexPatchCtx.insidePointIndexDeltaToRealValue = + insideEdgePointBaseOffset_; + ctx.IndexPatchCtx.insidePointIndexBadValue = + numPointsForInsideEdge[odd] - 1; + ctx.IndexPatchCtx.insidePointIndexReplacementValue = + edge0InsidePointBaseOffset; + ctx.IndexPatchCtx.outsidePointIndexPatchBase = + ctx.IndexPatchCtx.insidePointIndexBadValue + + 1; // past inside patched index range + ctx.IndexPatchCtx.outsidePointIndexDeltaToRealValue = + outsideEdgePointBaseOffset - + ctx.IndexPatchCtx.outsidePointIndexPatchBase; + ctx.IndexPatchCtx.outsidePointIndexBadValue = + ctx.IndexPatchCtx.outsidePointIndexPatchBase + + numPointsForOutsideEdge_[edge] - 1; + ctx.IndexPatchCtx.outsidePointIndexReplacementValue = + edge0OutsidePointBaseOffset; + + insideBaseOffset = 0; + outsideBaseOffset = + ctx.IndexPatchCtx.outsidePointIndexPatchBase; + ctx.bUsingPatchedIndices = true; + } + } else if ((edge == 2) && (ring == degeneratePointRing[odd])) { + ctx.IndexPatchCtx2.baseIndexToInvert = + insideEdgePointBaseOffset_; + ctx.IndexPatchCtx2.cornerCaseBadValue = -1; // unused + ctx.IndexPatchCtx2.cornerCaseReplacementValue = -1; // unused + ctx.IndexPatchCtx2.indexInversionEndPoint = + ctx.IndexPatchCtx2.baseIndexToInvert << 1; + insideBaseOffset = ctx.IndexPatchCtx2.baseIndexToInvert; + outsideBaseOffset = outsideEdgePointBaseOffset; + ctx.bUsingPatchedIndices2 = true; + } else { + insideBaseOffset = insideEdgePointBaseOffset_; + outsideBaseOffset = outsideEdgePointBaseOffset; + } + if (ring == 1) { + StitchTransition( + &ctx, /*baseIndexOffset: */ NumIndices, insideBaseOffset, + insideTessFactorCtx[odd].numHalfTessFactorPoints, + insideTessFactorOdd[odd], outsideBaseOffset, + 
outsideTessFactorCtx[edge].numHalfTessFactorPoints, + outsideTessFactorOdd[edge]); + } else { + StitchRegular(&ctx, /*bTrapezoid*/ true, DIAGONALS_MIRRORED, + /*baseIndexOffset: */ NumIndices, + numPointsForInsideEdge[odd], insideBaseOffset, + outsideBaseOffset); + } + ctx.bUsingPatchedIndices = false; + ctx.bUsingPatchedIndices2 = false; + NumIndices += numTriangles * 3; + outsideEdgePointBaseOffset += numPointsForOutsideEdge_[edge] - 1; + if ((edge == 2) && (ring == degeneratePointRing[odd])) { + insideEdgePointBaseOffset_ -= numPointsForInsideEdge[odd] - 1; + } else { + insideEdgePointBaseOffset_ += numPointsForInsideEdge[odd] - 1; + } + numPointsForOutsideEdge_[edge] = numPointsForInsideEdge[odd]; + } + } + + // Triangulate center - a row of quads if odd + // This triangulation may be producing diagonals that are asymmetric about + // the center of the patch in this region. + if ((numPointsForInsideTessFactor[U] > numPointsForInsideTessFactor[V]) && + insideTessFactorOdd[V]) { + ctx.bUsingPatchedIndices2 = true; + int stripNumQuads = (((numPointsForInsideTessFactor[U] >> 1) - + (numPointsForInsideTessFactor[V] >> 1)) + << 1) + + (insideTessFactorOdd[U] ? 1 : 2); + ctx.IndexPatchCtx2.baseIndexToInvert = + outsideEdgePointBaseOffset + stripNumQuads + 2; + ctx.IndexPatchCtx2.cornerCaseBadValue = + ctx.IndexPatchCtx2.baseIndexToInvert; + ctx.IndexPatchCtx2.cornerCaseReplacementValue = + outsideEdgePointBaseOffset; + ctx.IndexPatchCtx2.indexInversionEndPoint = + ctx.IndexPatchCtx2.baseIndexToInvert + + ctx.IndexPatchCtx2.baseIndexToInvert + stripNumQuads; + StitchRegular( + &ctx, /*bTrapezoid*/ false, DIAGONALS_INSIDE_TO_OUTSIDE, + /*baseIndexOffset: */ NumIndices, + /*numInsideEdgePoints:*/ stripNumQuads + 1, + /*insideEdgePointBaseOffset*/ ctx.IndexPatchCtx2.baseIndexToInvert, + outsideEdgePointBaseOffset + 1); + ctx.bUsingPatchedIndices2 = false; + NumIndices += stripNumQuads * 6; + } else if ((numPointsForInsideTessFactor[V] >= + numPointsForInsideTessFactor[U]) && + insideTessFactorOdd[U]) { + ctx.bUsingPatchedIndices2 = true; + int stripNumQuads = (((numPointsForInsideTessFactor[V] >> 1) - + (numPointsForInsideTessFactor[U] >> 1)) + << 1) + + (insideTessFactorOdd[V] ? 1 : 2); + ctx.IndexPatchCtx2.baseIndexToInvert = + outsideEdgePointBaseOffset + stripNumQuads + 1; + ctx.IndexPatchCtx2.cornerCaseBadValue = -1; // unused + ctx.IndexPatchCtx2.indexInversionEndPoint = + ctx.IndexPatchCtx2.baseIndexToInvert + + ctx.IndexPatchCtx2.baseIndexToInvert + stripNumQuads; + DIAGONALS diag = insideTessFactorOdd[V] + ? 
DIAGONALS_INSIDE_TO_OUTSIDE_EXCEPT_MIDDLE + : DIAGONALS_INSIDE_TO_OUTSIDE; + StitchRegular( + &ctx, /*bTrapezoid*/ false, diag, + /*baseIndexOffset: */ NumIndices, + /*numInsideEdgePoints:*/ stripNumQuads + 1, + /*insideEdgePointBaseOffset*/ ctx.IndexPatchCtx2.baseIndexToInvert, + outsideEdgePointBaseOffset); + ctx.bUsingPatchedIndices2 = false; + NumIndices += stripNumQuads * 6; + } + } +} diff --git a/src/asahi/lib/shaders/tessellator.h b/src/asahi/lib/shaders/tessellator.h new file mode 100644 index 00000000000..ec674f43764 --- /dev/null +++ b/src/asahi/lib/shaders/tessellator.h @@ -0,0 +1,124 @@ +/* + * Copyright 2024 Valve Corporation + * SPDX-License-Identifier: MIT + */ + +#pragma once + +#include "libagx.h" + +enum libagx_tess_partitioning { + LIBAGX_TESS_PARTITIONING_FRACTIONAL_ODD, + LIBAGX_TESS_PARTITIONING_FRACTIONAL_EVEN, + LIBAGX_TESS_PARTITIONING_INTEGER, +}; + +enum libagx_tess_output_primitive { + LIBAGX_TESS_OUTPUT_POINT, + LIBAGX_TESS_OUTPUT_TRIANGLE_CW, + LIBAGX_TESS_OUTPUT_TRIANGLE_CCW, +}; + +enum libagx_tess_mode { + /* Do not actually tessellate, just write the index counts */ + LIBAGX_TESS_MODE_COUNT, + + /* Tessellate using the count buffers to allocate indices */ + LIBAGX_TESS_MODE_WITH_COUNTS, + + /* Tessellate without count buffers by generating VDM index list words */ + LIBAGX_TESS_MODE_VDM, +}; + +struct libagx_tess_point { + float u; + float v; +}; +AGX_STATIC_ASSERT(sizeof(struct libagx_tess_point) == 8); + +struct libagx_tess_args { + /* Heap to allocate tessellator outputs in */ + GLOBAL(struct agx_geometry_state) heap; + + /* Patch coordinate buffer, indexed as: + * + * coord_allocs[patch_ID] + vertex_in_patch + */ + GLOBAL(struct libagx_tess_point) patch_coord_buffer; + + /* Per-patch index within the heap for the tess coords, written by the + * tessellator based on the allocated memory. + */ + GLOBAL(uint32_t) coord_allocs; + + /* Space for output draws from the tessellator. Either API draw calls or + * VDM control words, depending on the mode. */ + GLOBAL(uint32_t) out_draws; + + /* Tessellation control shader output buffer. */ + GLOBAL(float) tcs_buffer; + + /* Count buffer. # of indices per patch written here, then prefix summed. */ + GLOBAL(uint32_t) counts; + + /* Allocated index buffer for all patches, if we're prefix summing counts */ + GLOBAL(uint32_t) index_buffer; + + /* Address of the tess eval invocation counter for implementing pipeline + * statistics, if active. Zero if inactive. Incremented by tessellator. + */ + GLOBAL(uint32_t) statistic; + + /* Address of the tess control invocation counter for implementing pipeline + * statistics, if active. Zero if inactive. Incremented by indirect tess + * setup kernel. + */ + GLOBAL(uint32_t) tcs_statistic; + + /* For indirect draws with tessellation, the grid sizes. VS then TCS then + * tess. Allocated by the CPU and written by the tessellation + * setup indirect kernel. + */ + GLOBAL(uint32_t) grids; + + /* For indirect draws, the indirect draw descriptor. */ + GLOBAL(uint32_t) indirect; + + /* For indirect draws, the allocation for the vertex buffer. + * + * TODO: We could move these fields to an indirect setup kernel, not sure if + * it's worth it though... + */ + GLOBAL(uint64_t) vertex_output_buffer_ptr; + + /* For indirect draws, the bitfield of VS outputs */ + uint64_t vertex_outputs; + + /* Bitfield of TCS per-vertex outputs */ + uint64_t tcs_per_vertex_outputs; + + /* Default tess levels used in OpenGL when there is no TCS in the pipeline. + * Unused in Vulkan and OpenGL ES. 
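+    * (Both of those APIs require a tessellation control shader to be bound,
+    * so the defaults can never apply there.)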
+    */
+   float tess_level_outer_default[4];
+   float tess_level_inner_default[2];
+
+   /* Number of vertices in the input patch */
+   uint32_t input_patch_size;
+
+   /* Number of vertices in the TCS output patch */
+   uint32_t output_patch_size;
+
+   /* Number of patch constants written by TCS */
+   uint32_t tcs_patch_constants;
+
+   /* Number of input patches per instance of the VS/TCS */
+   uint32_t patches_per_instance;
+
+   /* Stride between tessellation factors in the TCS output buffer. */
+   uint32_t tcs_stride_el;
+
+   /* Number of patches being tessellated */
+   uint32_t nr_patches;
+} PACKED;
+AGX_STATIC_ASSERT(sizeof(struct libagx_tess_args) == 40 * 4);
diff --git a/src/gallium/drivers/asahi/agx_query.c b/src/gallium/drivers/asahi/agx_query.c
index 58835b77d8e..65fc3731e17 100644
--- a/src/gallium/drivers/asahi/agx_query.c
+++ b/src/gallium/drivers/asahi/agx_query.c
@@ -553,8 +553,8 @@ agx_get_query_result_resource_gpu(struct agx_context *ctx,
    ctx->base.set_constant_buffer(&ctx->base, PIPE_SHADER_COMPUTE, 0, false,
                                  &cb);
 
-   struct pipe_grid_info grid = {.block = {1, 1, 1}, .grid = {1, 1, 1}};
-   agx_launch(batch, &grid, cs, NULL, PIPE_SHADER_COMPUTE);
+   struct agx_grid grid = agx_grid_direct(1, 1, 1, 1, 1, 1);
+   agx_launch(batch, &grid, cs, NULL, PIPE_SHADER_COMPUTE, 0);
 
    /* take_ownership=true so do not unreference */
    ctx->base.set_constant_buffer(&ctx->base, PIPE_SHADER_COMPUTE, 0, true,
diff --git a/src/gallium/drivers/asahi/agx_state.c b/src/gallium/drivers/asahi/agx_state.c
index 9b655713e7f..5e26b2696d7 100644
--- a/src/gallium/drivers/asahi/agx_state.c
+++ b/src/gallium/drivers/asahi/agx_state.c
@@ -16,6 +16,7 @@
 #include "asahi/lib/agx_nir_passes.h"
 #include "asahi/lib/agx_ppp.h"
 #include "asahi/lib/agx_usc.h"
+#include "asahi/lib/shaders/tessellator.h"
 #include "compiler/nir/nir.h"
 #include "compiler/nir/nir_serialize.h"
 #include "compiler/shader_enums.h"
@@ -32,7 +33,6 @@
 #include "pipe/p_defines.h"
 #include "pipe/p_screen.h"
 #include "pipe/p_state.h"
-#include "tessellator/p_tessellator.h"
 #include "util/bitscan.h"
 #include "util/bitset.h"
 #include "util/blend.h"
@@ -3047,7 +3047,7 @@ agx_build_pipeline(struct agx_batch *batch, struct agx_compiled_shader *cs,
 
 static uint32_t
 agx_build_internal_usc(struct agx_batch *batch, struct agx_compiled_shader *cs,
-                       void *data, size_t data_size)
+                       uint64_t data)
 {
    size_t usc_size = agx_usc_size(12);
@@ -3056,8 +3056,7 @@ agx_build_internal_usc(struct agx_batch *batch, struct agx_compiled_shader *cs,
 
    struct agx_usc_builder b = agx_usc_builder(t.cpu, usc_size);
 
-   uint64_t ptr = agx_pool_upload(&batch->pool, data, data_size);
-   agx_usc_uniform(&b, 0, 4, agx_pool_upload(&batch->pool, &ptr, 8));
+   agx_usc_uniform(&b, 0, 4, agx_pool_upload(&batch->pool, &data, 8));
 
    agx_usc_immediates(&b, batch, cs);
 
    assert(cs->b.info.scratch_size == 0 && "internal kernels don't spill");
@@ -3093,16 +3092,26 @@ agx_build_internal_usc(struct agx_batch *batch, struct agx_compiled_shader *cs,
    return t.gpu;
 }
 
-void
-agx_launch_with_data(struct agx_batch *batch, const struct pipe_grid_info *info,
-                     meta_shader_builder_t builder, void *key, size_t key_size,
-                     void *data, size_t data_size)
+static void
+agx_launch_with_uploaded_data(struct agx_batch *batch,
+                              const struct agx_grid *grid,
+                              meta_shader_builder_t builder, void *key,
+                              size_t key_size, uint64_t data)
 {
    struct agx_compiled_shader *cs = agx_build_meta_shader_internal(
       batch->ctx, builder, key, key_size, false, false, 0, true);
 
-   uint32_t usc = agx_build_internal_usc(batch, cs, data, data_size);
-
agx_launch_internal(batch, info, cs, PIPE_SHADER_COMPUTE, usc); + uint32_t usc = agx_build_internal_usc(batch, cs, data); + agx_launch_internal(batch, grid, cs, PIPE_SHADER_COMPUTE, usc); +} + +void +agx_launch_with_data(struct agx_batch *batch, const struct agx_grid *grid, + meta_shader_builder_t builder, void *key, size_t key_size, + void *data, size_t data_size) +{ + uint64_t upload = agx_pool_upload_aligned(&batch->pool, data, data_size, 4); + agx_launch_with_uploaded_data(batch, grid, builder, key, key_size, upload); } struct asahi_bg_eot @@ -4053,6 +4062,17 @@ agx_batch_geometry_params(struct agx_batch *batch, uint64_t input_index_buffer, 8, &batch->geom_params_bo); } +static uint64_t +agx_indirect_buffer_ptr(struct agx_batch *batch, + const struct pipe_draw_indirect_info *indirect) +{ + assert(indirect->buffer && "drawauto already handled"); + + struct agx_resource *rsrc = agx_resource(indirect->buffer); + agx_batch_reads(batch, rsrc); + return rsrc->bo->ptr.gpu + indirect->offset; +} + static void agx_launch_gs_prerast(struct agx_batch *batch, const struct pipe_draw_info *info, @@ -4084,23 +4104,14 @@ agx_launch_gs_prerast(struct agx_batch *batch, assert(!info->primitive_restart && "should have been lowered"); - struct pipe_grid_info grid_vs = {.block = {1, 1, 1}}; - struct pipe_grid_info grid_gs = {.block = {1, 1, 1}}; - struct agx_resource grid_indirect_rsrc = {.bo = batch->geom_params_bo}; + struct agx_grid grid_vs, grid_gs; /* Setup grids */ if (indirect) { - assert(indirect->buffer && "drawauto already handled"); - struct agx_gs_setup_indirect_key key = { .prim = info->mode, }; - const struct pipe_grid_info grid_setup = { - .block = {1, 1, 1}, - .grid = {1, 1, 1}, - }; - uint64_t ib = 0; size_t ib_extent = 0; @@ -4109,13 +4120,10 @@ agx_launch_gs_prerast(struct agx_batch *batch, &ib_extent); } - struct agx_resource *rsrc = agx_resource(indirect->buffer); - agx_batch_reads(batch, rsrc); - struct agx_gs_setup_indirect_params gsi = { .index_buffer = ib, .index_buffer_range_el = ib_extent / info->index_size, - .draw = rsrc->bo->ptr.gpu + indirect->offset, + .draw = agx_indirect_buffer_ptr(batch, indirect), .vertex_buffer = batch->uniforms.vertex_output_buffer_ptr, .ia = batch->uniforms.input_assembly, .geom = batch->uniforms.geometry_params, @@ -4123,62 +4131,50 @@ agx_launch_gs_prerast(struct agx_batch *batch, .index_size_B = info->index_size, }; + const struct agx_grid grid_setup = agx_grid_direct(1, 1, 1, 1, 1, 1); agx_launch_with_data(batch, &grid_setup, agx_nir_gs_setup_indirect, &key, sizeof(key), &gsi, sizeof(gsi)); - /* Wrap the pool allocation in a fake resource for meta-Gallium use */ - assert(batch->geom_params_bo != NULL); - grid_vs.indirect = &grid_indirect_rsrc.base; - grid_gs.indirect = &grid_indirect_rsrc.base; + uint64_t gp = batch->uniforms.geometry_params; - unsigned param_offs = - (batch->uniforms.geometry_params - grid_indirect_rsrc.bo->ptr.gpu); + grid_vs = agx_grid_indirect( + gp + offsetof(struct agx_geometry_params, vs_grid), 1, 1, 1); - grid_vs.indirect_offset = - param_offs + offsetof(struct agx_geometry_params, vs_grid); - - grid_gs.indirect_offset = - param_offs + offsetof(struct agx_geometry_params, gs_grid); + grid_gs = agx_grid_indirect( + gp + offsetof(struct agx_geometry_params, gs_grid), 1, 1, 1); } else { - grid_vs.grid[0] = draws->count; - grid_vs.grid[1] = info->instance_count; - grid_vs.grid[2] = 1; + grid_vs = + agx_grid_direct(draws->count, info->instance_count, 1, 64, 1, 1); - grid_gs.grid[0] = - u_decomposed_prims_for_vertices(info->mode, 
draws->count); - grid_gs.grid[1] = info->instance_count; - grid_gs.grid[2] = 1; + grid_gs = agx_grid_direct( + u_decomposed_prims_for_vertices(info->mode, draws->count), + info->instance_count, 1, 64, 1, 1); } /* Launch the vertex shader first */ - agx_launch(batch, &grid_vs, ctx->vs, ctx->linked.vs, ctx->vs->stage); + agx_launch(batch, &grid_vs, ctx->vs, ctx->linked.vs, ctx->vs->stage, 0); /* If there is a count shader, launch it and prefix sum the results. */ if (gs->gs_count) { perf_debug(dev, "Geometry shader count"); - agx_launch(batch, &grid_gs, gs->gs_count, NULL, PIPE_SHADER_GEOMETRY); + agx_launch(batch, &grid_gs, gs->gs_count, NULL, PIPE_SHADER_GEOMETRY, 0); unsigned words = gs->gs_count_words; - agx_launch(batch, - &(const struct pipe_grid_info){ - .block = {1024, 1, 1}, - .grid = {gs->gs_count_words, 1, 1}, - }, + struct agx_grid grid = + agx_grid_direct(1024 * gs->gs_count_words, 1, 1, 1024, 1, 1); + + agx_launch(batch, &grid, agx_build_meta_shader(ctx, agx_nir_prefix_sum_gs, &words, sizeof(words)), - NULL, PIPE_SHADER_COMPUTE); + NULL, PIPE_SHADER_COMPUTE, 0); } /* Pre-GS shader */ - agx_launch(batch, - &(const struct pipe_grid_info){ - .block = {1, 1, 1}, - .grid = {1, 1, 1}, - }, - gs->pre_gs, NULL, PIPE_SHADER_COMPUTE); + struct agx_grid grid = agx_grid_direct(1, 1, 1, 1, 1, 1); + agx_launch(batch, &grid, gs->pre_gs, NULL, PIPE_SHADER_COMPUTE, 0); /* Pre-rast geometry shader */ - agx_launch(batch, &grid_gs, gs, NULL, PIPE_SHADER_GEOMETRY); + agx_launch(batch, &grid_gs, gs, NULL, PIPE_SHADER_GEOMETRY, 0); } static void @@ -4249,10 +4245,8 @@ agx_draw_without_restart(struct agx_batch *batch, }; /* Unroll the index buffer for each draw */ - const struct pipe_grid_info grid_setup = { - .block = {1024, 1, 1}, - .grid = {indirect->draw_count, 1, 1}, - }; + const struct agx_grid grid_setup = + agx_grid_direct(1024 * indirect->draw_count, 1, 1, 1024, 1, 1); agx_launch_with_data(batch, &grid_setup, agx_nir_unroll_restart, &key, sizeof(key), &unroll, sizeof(unroll)); @@ -4310,8 +4304,9 @@ agx_needs_passthrough_gs(struct agx_context *ctx, return true; } - /* TODO: this is sloppy, we should add a VDM kernel for this. */ - if (indirect && ctx->active_queries && ctx->prims_generated[0]) { + /* TODO: this is really sloppy, we should add a VDM kernel for this. */ + if ((indirect || info->mode == MESA_PRIM_PATCHES) && ctx->active_queries && + ctx->prims_generated[0]) { perf_debug_ctx(ctx, "Using passthrough GS due to indirect prim query"); return true; } @@ -4333,8 +4328,11 @@ agx_needs_passthrough_gs(struct agx_context *ctx, /* Transform feedback is layered on geometry shaders, so if transform * feedback is used, we need a GS. */ - if (ctx->stage[PIPE_SHADER_VERTEX].shader->has_xfb_info && - ctx->streamout.num_targets) { + struct agx_uncompiled_shader *last_vtx = + ctx->stage[PIPE_SHADER_TESS_EVAL].shader + ?: ctx->stage[PIPE_SHADER_VERTEX].shader; + + if (last_vtx->has_xfb_info && ctx->streamout.num_targets) { *xfb_only = true; return true; } @@ -4343,6 +4341,20 @@ agx_needs_passthrough_gs(struct agx_context *ctx, return false; } +static enum mesa_prim +agx_tess_output_prim(struct agx_uncompiled_shader *tcs, + struct agx_uncompiled_shader *tes) +{ + if ((tcs && tcs->tess.point_mode) || tes->tess.point_mode) { + return MESA_PRIM_POINTS; + } else if (TESS_PRIMITIVE_ISOLINES == + MAX2(tcs ? 
tcs->tess.primitive : 0, tes->tess.primitive)) { + return MESA_PRIM_LINES; + } else { + return MESA_PRIM_TRIANGLES; + } +} + static struct agx_uncompiled_shader * agx_get_passthrough_gs(struct agx_context *ctx, struct agx_uncompiled_shader *prev_cso, @@ -4350,11 +4362,16 @@ agx_get_passthrough_gs(struct agx_context *ctx, { bool edgeflags = has_edgeflags(ctx, mode); + if (mode == MESA_PRIM_PATCHES) { + mode = agx_tess_output_prim(ctx->stage[MESA_SHADER_TESS_CTRL].shader, + ctx->stage[MESA_SHADER_TESS_EVAL].shader); + } + /* Only handle the polygon mode when edge flags are in use, because * nir_passthrough_gs doesn't handle transform feedback + polygon mode - * properly. Technically this can break edge flags + transform feedback but - * that's firmly in "doctor, it hurts when I do this" territory, and I'm not - * sure that's even possible to hit. TODO: Reevaluate. + * properly. Technically this can break edge flags + transform feedback + * but that's firmly in "doctor, it hurts when I do this" territory, and + * I'm not sure that's even possible to hit. TODO: Reevaluate. */ unsigned poly_mode = edgeflags ? ctx->rast->base.fill_front : PIPE_POLYGON_MODE_FILL; @@ -4525,35 +4542,39 @@ agx_draw_patches(struct agx_context *ctx, const struct pipe_draw_info *info, unbind_tcs_when_done = true; } - unsigned in_vertices = draws->count; - unsigned in_patches = in_vertices / patch_vertices; + enum tess_primitive_mode mode = + MAX2(tcs->tess.primitive, tes->tess.primitive); + enum gl_tess_spacing spacing = MAX2(tcs->tess.spacing, tes->tess.spacing); - if (in_patches == 0) - return; + enum pipe_tess_spacing pspacing = spacing == TESS_SPACING_EQUAL + ? PIPE_TESS_SPACING_EQUAL + : spacing == TESS_SPACING_FRACTIONAL_ODD + ? PIPE_TESS_SPACING_FRACTIONAL_ODD + : PIPE_TESS_SPACING_FRACTIONAL_EVEN; - /* TCS invocation counter increments once per-patch */ - agx_query_increment_cpu( - ctx, ctx->pipeline_statistics[PIPE_STAT_QUERY_HS_INVOCATIONS], - in_patches); + bool point_mode = MAX2(tcs->tess.point_mode, tes->tess.point_mode); + enum mesa_prim out_prim = agx_tess_output_prim(tcs, tes); - struct agx_batch *batch = agx_get_compute_batch(ctx); + enum libagx_tess_partitioning partitioning = + (enum libagx_tess_partitioning)pspacing; + + enum libagx_tess_output_primitive prim = + point_mode ? LIBAGX_TESS_OUTPUT_POINT + : !tes->tess.ccw ? LIBAGX_TESS_OUTPUT_TRIANGLE_CCW + : LIBAGX_TESS_OUTPUT_TRIANGLE_CW; + + struct agx_bo *draw_bo = NULL; + bool with_counts = + indirect || ctx->stage[MESA_SHADER_GEOMETRY].shader != NULL; + size_t draw_stride = + ((!with_counts && point_mode) ? 
4 : 6) * sizeof(uint32_t); + + struct agx_batch *batch = agx_get_batch(ctx); agx_batch_init_state(batch); - struct pipe_resource *heap = - pipe_buffer_create(ctx->base.screen, PIPE_BIND_GLOBAL, PIPE_USAGE_DEFAULT, - 1024 * 1024 * 128); - - uint64_t heap_gpu = agx_resource(heap)->bo->ptr.gpu; - uint8_t *heap_cpu = agx_resource(heap)->bo->ptr.cpu; - - unsigned unrolled_patch_count = in_patches * info->instance_count; - - uint32_t heap_water = 0; - uint32_t tcs_out_offs = heap_water; - heap_water += ALIGN(unrolled_patch_count * tcs->tess.output_stride, 4); - - agx_batch_writes(batch, agx_resource(heap), 0); - batch->incoherent_writes = true; + if (!batch->cdm.bo) { + batch->cdm = agx_encoder_allocate(batch, dev); + } uint64_t ib = 0; size_t ib_extent = 0; @@ -4573,25 +4594,29 @@ agx_draw_patches(struct agx_context *ctx, const struct pipe_draw_info *info, agx_upload_draw_params(batch, indirect, draws, info); /* Setup parameters */ - struct agx_tess_params tess_params = { - .tcs_buffer = heap_gpu + tcs_out_offs, + uint64_t geom_state = agx_batch_geometry_state(batch); + assert((tcs->tess.output_stride & 3) == 0 && "must be aligned"); + + struct libagx_tess_args args = { + .heap = geom_state, + .tcs_stride_el = tcs->tess.output_stride / 4, + .statistic = agx_get_query_address( + batch, ctx->pipeline_statistics[PIPE_STAT_QUERY_DS_INVOCATIONS]), .input_patch_size = patch_vertices, .output_patch_size = tcs->tess.output_patch_size, .tcs_patch_constants = tcs->tess.nr_patch_outputs, .tcs_per_vertex_outputs = tcs->tess.per_vertex_outputs, - .patch_coord_buffer = heap_gpu, - .patches_per_instance = in_patches, + .patch_coord_buffer = agx_resource(ctx->heap)->bo->ptr.gpu, }; - memcpy(&tess_params.tess_level_outer_default, ctx->default_outer_level, + memcpy(&args.tess_level_outer_default, ctx->default_outer_level, sizeof(ctx->default_outer_level)); - memcpy(&tess_params.tess_level_inner_default, ctx->default_inner_level, + memcpy(&args.tess_level_inner_default, ctx->default_inner_level, sizeof(ctx->default_inner_level)); - batch->uniforms.tess_params = - agx_pool_upload(&batch->pool, &tess_params, sizeof(tess_params)); + struct agx_grid vs_grid, tcs_grid, tess_grid; + unsigned tess_wg_size = 64; - /* Run VS+TCS as compute */ agx_upload_vbos(batch); agx_update_vs(ctx, info->index_size); agx_update_tcs(ctx, info); @@ -4605,153 +4630,184 @@ agx_draw_patches(struct agx_context *ctx, const struct pipe_draw_info *info, batch->uniforms.vertex_outputs = ctx->vs->b.info.outputs; - unsigned vb_size = libagx_tcs_in_size(draws->count * info->instance_count, - batch->uniforms.vertex_outputs); - uint64_t addr = agx_pool_alloc_aligned(&batch->pool, vb_size, 4).gpu; - batch->uniforms.vertex_output_buffer_ptr = - agx_pool_upload(&batch->pool, &addr, 8); + if (indirect == NULL) { + unsigned in_patches = draws->count / patch_vertices; + if (in_patches == 0) + return; - struct pipe_grid_info vs_grid = { - .block = {1, 1, 1}, - .grid = {draws->count, info->instance_count, 1}, - }; + /* TCS invocation counter increments once per-patch */ + agx_query_increment_cpu( + ctx, ctx->pipeline_statistics[PIPE_STAT_QUERY_HS_INVOCATIONS], + in_patches); - agx_launch(batch, &vs_grid, ctx->vs, ctx->linked.vs, PIPE_SHADER_VERTEX); + unsigned unrolled_patches = in_patches * info->instance_count; - struct pipe_grid_info tcs_grid = { - .block = {tcs->tess.output_patch_size, 1, 1}, - .grid = {in_patches, info->instance_count, 1}, - }; + uint32_t alloc = 0; + uint32_t tcs_out_offs = alloc; + alloc += unrolled_patches * tcs->tess.output_stride; 
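+
+      /* Carve the rest of the linear allocation: patch coordinate
+       * allocations, then counts (only when prefix summing), then the
+       * output draw(s).
+       */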
- agx_launch(batch, &tcs_grid, ctx->tcs, NULL, PIPE_SHADER_TESS_CTRL); + uint32_t patch_coord_offs = alloc; + alloc += unrolled_patches * 4; + + uint32_t count_offs = alloc; + if (with_counts) + alloc += unrolled_patches * sizeof(uint32_t); + + uint32_t draw_offs = alloc; + + if (with_counts) { + alloc += draw_stride; + } else { + /* Padding added because VDM overreads */ + alloc += + (draw_stride * unrolled_patches) + (AGX_VDM_BARRIER_LENGTH + 0x800); + } + + struct agx_ptr blob = + agx_pool_alloc_aligned_with_bo(&batch->pool, alloc, 4, &draw_bo); + + args.tcs_buffer = blob.gpu + tcs_out_offs; + args.patches_per_instance = in_patches; + args.coord_allocs = blob.gpu + patch_coord_offs; + args.nr_patches = unrolled_patches; + args.out_draws = blob.gpu + draw_offs; + + if (with_counts) { + args.counts = blob.gpu + count_offs; + } else { + /* Arrange so we return after all generated draws */ + uint8_t *ret = + (uint8_t *)blob.cpu + draw_offs + (draw_stride * unrolled_patches); + + agx_pack(ret, VDM_BARRIER, cfg) { + cfg.returns = true; + } + } + + unsigned vb_size = libagx_tcs_in_size(draws->count * info->instance_count, + batch->uniforms.vertex_outputs); + uint64_t addr = agx_pool_alloc_aligned(&batch->pool, vb_size, 4).gpu; + batch->uniforms.vertex_output_buffer_ptr = + agx_pool_upload(&batch->pool, &addr, 8); + + vs_grid = + agx_grid_direct(draws->count, info->instance_count, 1, 64, 1, 1); + + tcs_grid = agx_grid_direct(in_patches * tcs->tess.output_patch_size, + info->instance_count, 1, + tcs->tess.output_patch_size, 1, 1); + + tess_grid = agx_grid_direct(unrolled_patches, 1, 1, tess_wg_size, 1, 1); + } else if (indirect) { + args.tcs_statistic = agx_get_query_address( + batch, ctx->pipeline_statistics[PIPE_STAT_QUERY_HS_INVOCATIONS]); + + args.indirect = agx_indirect_buffer_ptr(batch, indirect); + + /* Allocate 3x indirect global+local grids for VS/TCS/tess */ + uint32_t grid_stride = sizeof(uint32_t) * 6; + args.grids = agx_pool_alloc_aligned(&batch->pool, grid_stride * 3, 4).gpu; + + vs_grid = agx_grid_indirect_local(args.grids + 0 * grid_stride); + tcs_grid = agx_grid_indirect_local(args.grids + 1 * grid_stride); + tess_grid = agx_grid_indirect_local(args.grids + 2 * grid_stride); + + args.vertex_outputs = ctx->vs->b.info.outputs; + args.vertex_output_buffer_ptr = + agx_pool_alloc_aligned(&batch->pool, 8, 8).gpu; + + batch->uniforms.vertex_output_buffer_ptr = args.vertex_output_buffer_ptr; + + if (with_counts) { + args.out_draws = agx_pool_alloc_aligned_with_bo( + &batch->pool, draw_stride, 4, &draw_bo) + .gpu; + } else { + unreachable("need an extra indirection..."); + } + } + + uint64_t state = + agx_pool_upload_aligned(&batch->pool, &args, sizeof(args), 4); + + if (indirect) { + const struct agx_grid indirect_grid = agx_grid_direct(1, 1, 1, 1, 1, 1); + struct agx_tess_setup_indirect_key indirect_key = { + .point_mode = point_mode, + .with_counts = with_counts, + }; + + agx_launch_with_uploaded_data(batch, &indirect_grid, + agx_nir_tess_setup_indirect, &indirect_key, + sizeof(indirect_key), state); + } + + batch->uniforms.tess_params = state; + + agx_launch(batch, &vs_grid, ctx->vs, ctx->linked.vs, PIPE_SHADER_VERTEX, 0); + agx_launch(batch, &tcs_grid, ctx->tcs, NULL, PIPE_SHADER_TESS_CTRL, 0); batch->uniforms.vertex_output_buffer_ptr = 0; - agx_flush_all(ctx, "HACK"); - agx_sync_all(ctx, "HACK"); + struct agx_tessellator_key key = { + .prim = mode, + .output_primitive = prim, + .partitioning = partitioning, + }; - /* Setup batch */ - batch = agx_get_batch(ctx); + if (with_counts) { 
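+      /* Counted path: a COUNT pass writes per-patch index counts, the
+       * prefix sum turns them into index buffer offsets, and the
+       * WITH_COUNTS pass below then writes the actual indices for a single
+       * indirect draw.
+       */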
+ /* Generate counts */ + key.mode = LIBAGX_TESS_MODE_COUNT; + agx_launch_with_uploaded_data(batch, &tess_grid, agx_nir_tessellate, &key, + sizeof(key), state); - enum tess_primitive_mode mode = - MAX2(tcs->tess.primitive, tes->tess.primitive); - enum gl_tess_spacing spacing = MAX2(tcs->tess.spacing, tes->tess.spacing); + /* Prefix sum counts, allocating index buffer space. */ + const struct agx_grid prefix_sum_grid = + agx_grid_direct(1024, 1, 1, 1024, 1, 1); - enum pipe_tess_spacing pspacing = spacing == TESS_SPACING_EQUAL - ? PIPE_TESS_SPACING_EQUAL - : spacing == TESS_SPACING_FRACTIONAL_ODD - ? PIPE_TESS_SPACING_FRACTIONAL_ODD - : PIPE_TESS_SPACING_FRACTIONAL_EVEN; + agx_launch_with_uploaded_data(batch, &prefix_sum_grid, + agx_nir_prefix_sum_tess, NULL, 0, state); - bool point_mode = MAX2(tcs->tess.point_mode, tes->tess.point_mode); - enum mesa_prim in_prim = mode == TESS_PRIMITIVE_ISOLINES ? MESA_PRIM_LINES - : mode == TESS_PRIMITIVE_QUADS - ? MESA_PRIM_QUADS - : MESA_PRIM_TRIANGLES; - enum mesa_prim out_prim = point_mode ? MESA_PRIM_POINTS - : mode == TESS_PRIMITIVE_ISOLINES - ? MESA_PRIM_LINES - : MESA_PRIM_TRIANGLES; - - struct pipe_tessellator *tess = - p_tess_init(in_prim, pspacing, tes->tess.ccw, point_mode); - - struct pipe_tessellator_data data = {0}; - - /* Mem allocate */ - uint32_t patch_coord_offs_offs = heap_water; - tess_params.patch_coord_offs = heap_gpu + heap_water; - heap_water += align(4 * unrolled_patch_count, 4); - - uint32_t draws_off = heap_water; - uint32_t *patch_draws = (uint32_t *)(heap_cpu + heap_water); - heap_water += align(sizeof(uint32_t) * 5 * unrolled_patch_count, 4); - - uint32_t *patch_offs = (uint32_t *)(heap_cpu + patch_coord_offs_offs); - - for (unsigned patch = 0; patch < unrolled_patch_count; ++patch) { - float *addr = - (float *)(heap_cpu + tcs_out_offs + tcs->tess.output_stride * patch); - - struct pipe_tessellation_factors factors = { - .outer_tf = {addr[0], addr[1], addr[2], addr[3]}, - .inner_tf = {addr[4], addr[5]}, - }; - p_tessellate(tess, &factors, &data); - - /* Mem allocate indices */ - uint32_t index_off = heap_water; - uint16_t *indices = (uint16_t *)(heap_cpu + heap_water); - heap_water += align(sizeof(*indices) * data.num_indices, 4); - - for (unsigned idx = 0; idx < data.num_indices; ++idx) { - indices[idx] = data.indices[idx]; - } - - /* Mem allocate patch coords */ - heap_water = align(heap_water, 8); - patch_offs[patch] = heap_water / 8; - float *patch_coords = (float *)(heap_cpu + heap_water); - heap_water += align(8 * data.num_domain_points, 4); - - for (unsigned p = 0; p < data.num_domain_points; ++p) { - patch_coords[2 * p + 0] = data.domain_points_u[p]; - patch_coords[2 * p + 1] = data.domain_points_v[p]; - } - assert(data.num_indices < 32768); - assert(data.num_domain_points < 8192); - - /* Generate a draw for the patch */ - uint32_t *desc = patch_draws + (patch * 5); - - desc[0] = data.num_indices; /* count */ - desc[1] = 1; /* instance_count */ - desc[2] = index_off / sizeof(*indices); /* start */ - desc[3] = patch * LIBAGX_TES_PATCH_ID_STRIDE; /* index_bias */ - desc[4] = 0; /* start_instance */ - - /* TES invocation counter increments once per tessellated vertex */ - agx_query_increment_cpu( - ctx, ctx->pipeline_statistics[PIPE_STAT_QUERY_DS_INVOCATIONS], - data.num_domain_points); + key.mode = LIBAGX_TESS_MODE_WITH_COUNTS; + } else { + key.mode = LIBAGX_TESS_MODE_VDM; } - p_tess_destroy(tess); + + /* Now we can tessellate */ + agx_launch_with_uploaded_data(batch, &tess_grid, agx_nir_tessellate, &key, + sizeof(key), 
state);
 
    /* Run TES as VS */
    void *vs_cso = ctx->stage[PIPE_SHADER_VERTEX].shader;
    void *tes_cso = ctx->stage[PIPE_SHADER_TESS_EVAL].shader;
    ctx->base.bind_vs_state(&ctx->base, tes_cso);
    ctx->in_tess = true;
+   ctx->in_generated_vdm = !with_counts;
 
    struct pipe_draw_info draw_info = {
       .mode = out_prim,
-      .index_size = 2,
-      .index.resource = heap,
+      .index_size = with_counts ? 4 : (point_mode ? 0 : 2),
+      .index.resource = (!with_counts && point_mode) ? NULL : ctx->heap,
       .instance_count = 1,
       .view_mask = info->view_mask,
    };
 
    /* Wrap the pool allocation in a fake resource for meta-Gallium use */
-   struct pipe_draw_indirect_info copy_indirect = {
-      .buffer = heap,
-      .offset = draws_off,
-      .stride = 5 * sizeof(uint32_t),
-      .draw_count = in_patches * info->instance_count,
-   };
+   struct agx_resource indirect_rsrc = {.bo = draw_bo};
 
-   /* Tess param upload is deferred to draw_vbo since the batch may change
-    * within draw_vbo for various reasons, so we can't upload it to the batch
-    * upfront.
-    */
-   memcpy(&ctx->tess_params, &tess_params, sizeof(tess_params));
+   struct pipe_draw_indirect_info copy_indirect = {
+      .buffer = &indirect_rsrc.base,
+      .offset = args.out_draws - draw_bo->ptr.gpu,
+      .stride = draw_stride,
+      .draw_count = 1,
+   };
 
    ctx->base.draw_vbo(&ctx->base, &draw_info, 0, &copy_indirect, NULL, 1);
 
    /* Restore vertex state */
    ctx->base.bind_vs_state(&ctx->base, vs_cso);
+   ctx->in_generated_vdm = false;
    ctx->in_tess = false;
 
-   pipe_resource_reference(&heap, NULL);
-
    if (unbind_tcs_when_done) {
       ctx->base.bind_tcs_state(&ctx->base, NULL);
    }
@@ -4860,13 +4916,6 @@ agx_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info,
       return;
    }
 
-   /* TODO: stop cheating */
-   if (info->mode == MESA_PRIM_PATCHES && indirect) {
-      perf_debug_ctx(ctx, "indirect tessellation");
-      util_draw_indirect(pctx, info, drawid_offset, indirect);
-      return;
-   }
-
    /* TODO: stop cheating */
    if (ctx->active_queries && !ctx->active_draw_without_restart &&
        (ctx->pipeline_statistics[PIPE_STAT_QUERY_IA_VERTICES] ||
@@ -4878,11 +4927,6 @@ agx_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info,
       return;
    }
 
-   if (info->mode == MESA_PRIM_PATCHES) {
-      agx_draw_patches(ctx, info, drawid_offset, indirect, draws, num_draws);
-      return;
-   }
-
    bool xfb_passthrough = false;
    if (agx_needs_passthrough_gs(ctx, info, indirect, &xfb_passthrough)) {
       agx_apply_passthrough_gs(ctx, info, drawid_offset, indirect, draws,
@@ -4890,6 +4934,11 @@ agx_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info,
       return;
    }
 
+   if (info->mode == MESA_PRIM_PATCHES) {
+      agx_draw_patches(ctx, info, drawid_offset, indirect, draws, num_draws);
+      return;
+   }
+
    agx_legalize_feedback_loops(ctx);
 
    /* Only the rasterization stream counts */
@@ -5017,11 +5066,6 @@ agx_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info,
        IS_DIRTY(BLEND_COLOR) || IS_DIRTY(QUERY) || IS_DIRTY(POLY_STIPPLE) ||
        IS_DIRTY(RS) || IS_DIRTY(PRIM) || ctx->in_tess) {
 
-      if (ctx->in_tess) {
-         batch->uniforms.tess_params = agx_pool_upload(
-            &batch->pool, &ctx->tess_params, sizeof(ctx->tess_params));
-      }
-
       if (IS_DIRTY(VERTEX)) {
         agx_upload_vbos(batch);
       }
@@ -5143,63 +5187,75 @@ agx_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info,
 
    uint8_t *out = agx_encode_state(batch, batch->vdm.current);
 
-   if (info->index_size) {
-      agx_push(out, VDM_STATE, cfg)
-         cfg.restart_index_present = true;
-
-      agx_push(out, VDM_STATE_RESTART_INDEX, cfg)
-         cfg.value = info->restart_index;
-   }
-
-   agx_push(out, INDEX_LIST, cfg) {
-      cfg.primitive =
agx_primitive_for_pipe(info->mode); - - if (indirect != NULL) { - cfg.indirect_buffer_present = true; - } else { - cfg.instance_count_present = true; - cfg.index_count_present = true; - cfg.start_present = true; - } - - if (info->index_size) { - cfg.restart_enable = info->primitive_restart; - cfg.index_buffer_hi = (ib >> 32); - cfg.index_size = agx_translate_index_size(info->index_size); - cfg.index_buffer_present = true; - cfg.index_buffer_size_present = true; - } - } - - if (info->index_size) { - agx_push(out, INDEX_LIST_BUFFER_LO, cfg) { - cfg.buffer_lo = ib & BITFIELD_MASK(32); - } - } - - if (indirect) { + if (ctx->in_generated_vdm) { struct agx_resource *indirect_rsrc = agx_resource(indirect->buffer); uint64_t address = indirect_rsrc->bo->ptr.gpu + indirect->offset; - agx_push(out, INDEX_LIST_INDIRECT_BUFFER, cfg) { - cfg.address_hi = address >> 32; - cfg.address_lo = address & BITFIELD_MASK(32); + agx_push(out, VDM_STREAM_LINK, cfg) { + cfg.target_lo = address & BITFIELD_MASK(32); + cfg.target_hi = address >> 32; + cfg.with_return = true; } } else { - agx_push(out, INDEX_LIST_COUNT, cfg) - cfg.count = draws->count; - agx_push(out, INDEX_LIST_INSTANCES, cfg) - cfg.count = info->instance_count; + if (info->index_size && info->primitive_restart) { + agx_push(out, VDM_STATE, cfg) + cfg.restart_index_present = true; - agx_push(out, INDEX_LIST_START, cfg) { - cfg.start = info->index_size ? draws->index_bias : draws->start; + agx_push(out, VDM_STATE_RESTART_INDEX, cfg) + cfg.value = info->restart_index; } - } - if (info->index_size) { - agx_push(out, INDEX_LIST_BUFFER_SIZE, cfg) { - cfg.size = ib_extent; + agx_push(out, INDEX_LIST, cfg) { + cfg.primitive = agx_primitive_for_pipe(info->mode); + + if (indirect != NULL) { + cfg.indirect_buffer_present = true; + } else { + cfg.instance_count_present = true; + cfg.index_count_present = true; + cfg.start_present = true; + } + + if (info->index_size) { + cfg.restart_enable = info->primitive_restart; + cfg.index_buffer_hi = (ib >> 32); + cfg.index_size = agx_translate_index_size(info->index_size); + cfg.index_buffer_present = true; + cfg.index_buffer_size_present = true; + } + } + + if (info->index_size) { + agx_push(out, INDEX_LIST_BUFFER_LO, cfg) { + cfg.buffer_lo = ib & BITFIELD_MASK(32); + } + } + + if (indirect) { + struct agx_resource *indirect_rsrc = agx_resource(indirect->buffer); + uint64_t address = indirect_rsrc->bo->ptr.gpu + indirect->offset; + + agx_push(out, INDEX_LIST_INDIRECT_BUFFER, cfg) { + cfg.address_hi = address >> 32; + cfg.address_lo = address & BITFIELD_MASK(32); + } + } else { + agx_push(out, INDEX_LIST_COUNT, cfg) + cfg.count = draws->count; + + agx_push(out, INDEX_LIST_INSTANCES, cfg) + cfg.count = info->instance_count; + + agx_push(out, INDEX_LIST_START, cfg) { + cfg.start = info->index_size ? 
draws->index_bias : draws->start; + } + } + + if (info->index_size) { + agx_push(out, INDEX_LIST_BUFFER_SIZE, cfg) { + cfg.size = ib_extent; + } } } @@ -5242,7 +5298,7 @@ agx_texture_barrier(struct pipe_context *pipe, unsigned flags) } void -agx_launch_internal(struct agx_batch *batch, const struct pipe_grid_info *info, +agx_launch_internal(struct agx_batch *batch, const struct agx_grid *grid, struct agx_compiled_shader *cs, enum pipe_shader_type stage, uint32_t usc) { @@ -5253,11 +5309,7 @@ agx_launch_internal(struct agx_batch *batch, const struct pipe_grid_info *info, uint8_t *out = batch->cdm.current; agx_push(out, CDM_LAUNCH_WORD_0, cfg) { - if (info->indirect) - cfg.mode = AGX_CDM_MODE_INDIRECT_GLOBAL; - else - cfg.mode = AGX_CDM_MODE_DIRECT; - + cfg.mode = grid->mode; cfg.uniform_register_count = cs->b.info.push_count; cfg.preshader_register_count = cs->b.info.nr_preamble_gprs; cfg.texture_state_register_count = agx_nr_tex_descriptors(batch, cs); @@ -5275,32 +5327,25 @@ agx_launch_internal(struct agx_batch *batch, const struct pipe_grid_info *info, ; } - if (info->indirect) { - struct agx_resource *indirect = agx_resource(info->indirect); - uint64_t addr = indirect->bo->ptr.gpu + info->indirect_offset; - - agx_push(out, CDM_INDIRECT, cfg) { - cfg.address_hi = addr >> 32; - cfg.address_lo = addr & BITFIELD64_MASK(32); + if (grid->mode == AGX_CDM_MODE_DIRECT) { + agx_push(out, CDM_GLOBAL_SIZE, cfg) { + cfg.x = grid->global[0]; + cfg.y = grid->global[1]; + cfg.z = grid->global[2]; } } else { - uint32_t size[3]; - for (unsigned d = 0; d < 3; ++d) { - size[d] = ((info->grid[d] - 1) * info->block[d]) + - (info->last_block[d] ?: info->block[d]); - } - - agx_push(out, CDM_GLOBAL_SIZE, cfg) { - cfg.x = size[0]; - cfg.y = size[1]; - cfg.z = size[2]; + agx_push(out, CDM_INDIRECT, cfg) { + cfg.address_hi = grid->indirect >> 32; + cfg.address_lo = grid->indirect & BITFIELD64_MASK(32); } } - agx_push(out, CDM_LOCAL_SIZE, cfg) { - cfg.x = info->block[0]; - cfg.y = info->block[1]; - cfg.z = info->block[2]; + if (grid->mode != AGX_CDM_MODE_INDIRECT_LOCAL) { + agx_push(out, CDM_LOCAL_SIZE, cfg) { + cfg.x = grid->local[0]; + cfg.y = grid->local[1]; + cfg.z = grid->local[2]; + } } agx_push(out, CDM_BARRIER, cfg) { @@ -5352,9 +5397,9 @@ agx_launch_internal(struct agx_batch *batch, const struct pipe_grid_info *info, } void -agx_launch(struct agx_batch *batch, const struct pipe_grid_info *info, +agx_launch(struct agx_batch *batch, const struct agx_grid *grid, struct agx_compiled_shader *cs, struct agx_linked_shader *linked, - enum pipe_shader_type stage) + enum pipe_shader_type stage, unsigned variable_shared_mem) { struct agx_context *ctx = batch->ctx; @@ -5362,18 +5407,17 @@ agx_launch(struct agx_batch *batch, const struct pipe_grid_info *info, * available in GPU memory. This is either the indirect buffer, or just a * buffer we upload ourselves if not indirect. 
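* For direct launches the workgroup counts are computed below by dividing
* the global size by the local size before uploading.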
*/ - if (info->indirect) { - struct agx_resource *indirect = agx_resource(info->indirect); - agx_batch_reads(batch, indirect); + if (grid->mode == AGX_CDM_MODE_DIRECT) { + uint32_t groups[3] = { + grid->global[0] / grid->local[0], + grid->global[1] / grid->local[1], + grid->global[2] / grid->local[2], + }; batch->uniforms.tables[AGX_SYSVAL_TABLE_GRID] = - indirect->bo->ptr.gpu + info->indirect_offset; + agx_pool_upload_aligned(&batch->pool, groups, sizeof(groups), 4); } else { - static_assert(sizeof(info->grid) == 12, - "matches indirect dispatch buffer"); - - batch->uniforms.tables[AGX_SYSVAL_TABLE_GRID] = agx_pool_upload_aligned( - &batch->pool, info->grid, sizeof(info->grid), 4); + batch->uniforms.tables[AGX_SYSVAL_TABLE_GRID] = grid->indirect; } util_dynarray_foreach(&ctx->global_buffers, struct pipe_resource *, res) { @@ -5403,11 +5447,10 @@ agx_launch(struct agx_batch *batch, const struct pipe_grid_info *info, } #endif - uint32_t usc = - agx_build_pipeline(batch, cs, linked, PIPE_SHADER_COMPUTE, - info->variable_shared_mem, subgroups_per_core); + uint32_t usc = agx_build_pipeline(batch, cs, linked, PIPE_SHADER_COMPUTE, + variable_shared_mem, subgroups_per_core); - agx_launch_internal(batch, info, cs, stage, usc); + agx_launch_internal(batch, grid, cs, stage, usc); } static void @@ -5454,7 +5497,29 @@ agx_launch_grid(struct pipe_context *pipe, const struct pipe_grid_info *info) struct agx_compiled_shader *cs = _mesa_hash_table_next_entry(uncompiled->variants, NULL)->data; - agx_launch(batch, info, cs, NULL, PIPE_SHADER_COMPUTE); + struct agx_grid grid = { + .local[0] = info->block[0], + .local[1] = info->block[1], + .local[2] = info->block[2], + }; + + if (info->indirect) { + struct agx_resource *indirect = agx_resource(info->indirect); + agx_batch_reads(batch, indirect); + + grid.mode = AGX_CDM_MODE_INDIRECT_GLOBAL; + grid.indirect = indirect->bo->ptr.gpu + info->indirect_offset; + } else { + grid.mode = AGX_CDM_MODE_DIRECT; + + for (unsigned d = 0; d < 3; ++d) { + grid.global[d] = ((info->grid[d] - 1) * info->block[d]) + + (info->last_block[d] ?: info->block[d]); + } + } + + agx_launch(batch, &grid, cs, NULL, PIPE_SHADER_COMPUTE, + info->variable_shared_mem); /* TODO: Dirty tracking? */ agx_dirty_all(ctx); diff --git a/src/gallium/drivers/asahi/agx_state.h b/src/gallium/drivers/asahi/agx_state.h index 3d139a3c71b..949e343cecc 100644 --- a/src/gallium/drivers/asahi/agx_state.h +++ b/src/gallium/drivers/asahi/agx_state.h @@ -682,8 +682,8 @@ struct agx_context { struct util_debug_callback debug; bool is_noop; - struct agx_tess_params tess_params; bool in_tess; + bool in_generated_vdm; struct blitter_context *blitter; struct asahi_blitter compute_blitter; @@ -781,19 +781,67 @@ struct agx_compiled_shader *agx_build_meta_shader(struct agx_context *ctx, meta_shader_builder_t builder, void *data, size_t data_size); -void agx_launch_with_data(struct agx_batch *batch, - const struct pipe_grid_info *info, +struct agx_grid { + /* Tag for the union */ + enum agx_cdm_mode mode; + + /* If mode != INDIRECT_LOCAL, the local size */ + uint32_t local[3]; + + union { + /* If mode == DIRECT, the global size. This is *not* multiplied by the + * local size, differing from the API definition but matching AGX. 
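+    * For example, global = {256, 1, 1} with local = {64, 1, 1} runs 256
+    * invocations in four workgroups.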
+ */ + uint32_t global[3]; + + /* Address of the indirect buffer if mode != DIRECT */ + uint64_t indirect; + }; +}; + +static inline const struct agx_grid +agx_grid_direct(uint32_t global_x, uint32_t global_y, uint32_t global_z, + uint32_t local_x, uint32_t local_y, uint32_t local_z) +{ + return (struct agx_grid){ + .mode = AGX_CDM_MODE_DIRECT, + .global = {global_x, global_y, global_z}, + .local = {local_x, local_y, local_z}, + }; +} + +static inline const struct agx_grid +agx_grid_indirect(uint64_t indirect, uint32_t local_x, uint32_t local_y, + uint32_t local_z) +{ + return (struct agx_grid){ + .mode = AGX_CDM_MODE_INDIRECT_GLOBAL, + .local = {local_x, local_y, local_z}, + .indirect = indirect, + }; +} + +static inline const struct agx_grid +agx_grid_indirect_local(uint64_t indirect) +{ + return (struct agx_grid){ + .mode = AGX_CDM_MODE_INDIRECT_LOCAL, + .indirect = indirect, + }; +} + +void agx_launch_with_data(struct agx_batch *batch, const struct agx_grid *grid, meta_shader_builder_t builder, void *key, size_t key_size, void *data, size_t data_size); -void agx_launch_internal(struct agx_batch *batch, - const struct pipe_grid_info *info, +void agx_launch_internal(struct agx_batch *batch, const struct agx_grid *grid, struct agx_compiled_shader *cs, enum pipe_shader_type stage, uint32_t usc); -void agx_launch(struct agx_batch *batch, const struct pipe_grid_info *info, +void agx_launch(struct agx_batch *batch, const struct agx_grid *grid, struct agx_compiled_shader *cs, - struct agx_linked_shader *linked, enum pipe_shader_type stage); + struct agx_linked_shader *linked, enum pipe_shader_type stage, + unsigned variable_shared_mem); void agx_init_query_functions(struct pipe_context *ctx);