diff --git a/meson.build b/meson.build index c7ed21a0961..ec978d02cdd 100644 --- a/meson.build +++ b/meson.build @@ -845,6 +845,10 @@ endif with_llvm = with_llvm \ .enable_if(with_clc, error_message : 'CLC requires LLVM') +with_poly = [ + with_gallium_asahi, with_asahi_vk, with_tools.contains('asahi'), +].contains(true) + dep_clc = null_dep if with_clc dep_clc = dependency('libclc') diff --git a/src/.clang-format b/src/.clang-format index beb26c8d3b2..099120f2cca 100644 --- a/src/.clang-format +++ b/src/.clang-format @@ -237,7 +237,9 @@ ForEachMacros: - agx_foreach_reg_dest - agx_foreach_successor - foreach_next_use - - libagx_foreach_xfb + +# poly + - poly_foreach_xfb # radv - PHASE diff --git a/src/asahi/lib/agx_helpers.h b/src/asahi/lib/agx_helpers.h index f9aa1220aeb..bf7dfb9a6da 100644 --- a/src/asahi/lib/agx_helpers.h +++ b/src/asahi/lib/agx_helpers.h @@ -316,16 +316,6 @@ agx_fill_decompress_args(struct ail_layout *layout, unsigned layer, agx_fill_decompress_args(layout, layer, level, ptr, images), \ util_logbase2(layout->sample_count_sa)) -#define libagx_tessellate(context, grid, barrier, prim, mode, state) \ - if (prim == TESS_PRIMITIVE_QUADS) { \ - libagx_tess_quad(context, grid, barrier, state, mode); \ - } else if (prim == TESS_PRIMITIVE_TRIANGLES) { \ - libagx_tess_tri(context, grid, barrier, state, mode); \ - } else { \ - assert(prim == TESS_PRIMITIVE_ISOLINES); \ - libagx_tess_isoline(context, grid, barrier, state, mode); \ - } - struct agx_border_packed; void agx_pack_border(struct agx_border_packed *out, const uint32_t in[4], diff --git a/src/asahi/lib/agx_nir_lower_gs.h b/src/asahi/lib/agx_nir_lower_gs.h deleted file mode 100644 index e29705a9491..00000000000 --- a/src/asahi/lib/agx_nir_lower_gs.h +++ /dev/null @@ -1,61 +0,0 @@ -/* - * Copyright 2023 Alyssa Rosenzweig - * SPDX-License-Identifier: MIT - */ - -#pragma once - -#include -#include -#include "libagx/geometry.h" -#include "nir.h" -#include "shader_enums.h" - -struct nir_def *agx_load_per_vertex_input(struct nir_builder *b, - nir_intrinsic_instr *intr, - struct nir_def *vertex); - -nir_def *agx_nir_load_vertex_id(struct nir_builder *b, nir_def *id, - unsigned index_size_B); - -bool agx_nir_lower_sw_vs(struct nir_shader *s, unsigned index_size_B); - -bool agx_nir_lower_vs_before_gs(struct nir_shader *vs); - -struct agx_gs_info { - /* Output primitive mode for geometry shaders */ - enum mesa_prim mode; - - /* Number of words per primitive in the count buffer */ - unsigned count_words; - - /* Per-input primitive stride of the output index buffer */ - unsigned max_indices; - - /* Whether the GS includes transform feedback at a compile-time level */ - bool xfb; - - /* Whether a prefix sum is required on the count outputs. 
Implies xfb */ - bool prefix_sum; - - /* Whether the GS writes to a stream other than stream #0 */ - bool multistream; - - /* Shape of the rasterization draw, named by the instance ID */ - enum agx_gs_shape shape; - - /* Static topology used if shape = AGX_GS_SHAPE_STATIC_INDEXED */ - uint8_t topology[64]; -}; - -bool agx_nir_lower_gs(struct nir_shader *gs, struct nir_shader **gs_count, - struct nir_shader **gs_copy, struct nir_shader **pre_gs, - struct agx_gs_info *info); - -bool agx_nir_lower_tcs(struct nir_shader *tcs); - -bool agx_nir_lower_tes(struct nir_shader *tes, bool to_hw_vs); - -uint64_t agx_tcs_per_vertex_outputs(const struct nir_shader *nir); - -unsigned agx_tcs_output_stride(const struct nir_shader *nir); diff --git a/src/asahi/lib/agx_nir_prolog_epilog.c b/src/asahi/lib/agx_nir_prolog_epilog.c index fe2d63e505d..254555f5f4c 100644 --- a/src/asahi/lib/agx_nir_prolog_epilog.c +++ b/src/asahi/lib/agx_nir_prolog_epilog.c @@ -5,11 +5,12 @@ */ #include "gallium/include/pipe/p_defines.h" +#include "poly/cl/libpoly.h" +#include "poly/nir/poly_nir_lower_gs.h" #include "util/format/u_formats.h" #include "agx_abi.h" #include "agx_linker.h" #include "agx_nir.h" -#include "agx_nir_lower_gs.h" #include "agx_nir_lower_vbo.h" #include "agx_pack.h" #include "agx_tilebuffer.h" @@ -149,11 +150,11 @@ lower_adjacency(nir_builder *b, nir_intrinsic_instr *intr, void *data) nir_def *id = nir_load_vertex_id(b); if (key->adjacency == MESA_PRIM_LINES_ADJACENCY) { - id = libagx_map_to_line_adj(b, id); + id = poly_map_to_line_adj(b, id); } else if (key->adjacency == MESA_PRIM_TRIANGLE_STRIP_ADJACENCY) { - id = libagx_map_to_tri_strip_adj(b, id); + id = poly_map_to_tri_strip_adj(b, id); } else if (key->adjacency == MESA_PRIM_LINE_STRIP_ADJACENCY) { - id = libagx_map_to_line_strip_adj(b, id); + id = poly_map_to_line_strip_adj(b, id); } else if (key->adjacency == MESA_PRIM_TRIANGLES_ADJACENCY) { /* Sequence (0, 2, 4), (6, 8, 10), ... 
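       * Each adjacency triangle spans six raw vertices, of which only the
       * even ones form the rendered triangle, so doubling the flat vertex ID
       * (3t + k -> 6t + 2k) yields exactly the corner sequence above.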
*/ id = nir_imul_imm(b, id, 2); @@ -161,7 +162,7 @@ lower_adjacency(nir_builder *b, nir_intrinsic_instr *intr, void *data) UNREACHABLE("unknown"); } - id = agx_nir_load_vertex_id(b, id, key->sw_index_size_B); + id = poly_nir_load_vertex_id(b, id, key->sw_index_size_B); nir_def_replace(&intr->def, id); return true; @@ -215,7 +216,7 @@ agx_nir_vs_prolog(nir_builder *b, const void *key_) } if (!key->hw) { - agx_nir_lower_sw_vs(b->shader, key->sw_index_size_B); + poly_nir_lower_sw_vs(b->shader, key->sw_index_size_B); } else if (key->adjacency) { nir_shader_intrinsics_pass(b->shader, lower_adjacency, nir_metadata_control_flow, (void *)key); diff --git a/src/asahi/lib/meson.build b/src/asahi/lib/meson.build index c9e46f41418..e50a70eff51 100644 --- a/src/asahi/lib/meson.build +++ b/src/asahi/lib/meson.build @@ -11,11 +11,8 @@ libasahi_lib_files = files( 'agx_linker.c', 'agx_bg_eot.c', 'agx_tilebuffer.c', - 'agx_nir_lower_gs.c', - 'agx_nir_lower_ia.c', 'agx_nir_lower_msaa.c', 'agx_nir_lower_sample_intrinsics.c', - 'agx_nir_lower_tess.c', 'agx_nir_lower_tilebuffer.c', 'agx_nir_lower_uvs.c', 'agx_nir_lower_vbo.c', @@ -66,8 +63,8 @@ libasahi_lib = static_library( include_directories : [inc_asahi, inc_virtio_gpu, inc_virtio_vdrm], c_args : [no_override_init_args, '-Wno-c2x-extensions'], gnu_symbol_visibility : 'hidden', - link_with: [libasahi_decode, libvdrm], - dependencies: [dep_libdrm, dep_valgrind, idep_nir, idep_mesautil, idep_libagx], + link_with: [libasahi_decode, libvdrm, libpoly_nir], + dependencies: [dep_libdrm, dep_valgrind, idep_nir, idep_mesautil, idep_libagx, idep_libpoly], build_by_default : false, ) diff --git a/src/asahi/libagx/draws.cl b/src/asahi/libagx/draws.cl index 88e88fadb2b..9c39d856691 100644 --- a/src/asahi/libagx/draws.cl +++ b/src/asahi/libagx/draws.cl @@ -4,8 +4,8 @@ */ #include "asahi/lib/agx_abi.h" #include "compiler/libcl/libcl_vk.h" +#include "poly/geometry.h" #include "agx_pack.h" -#include "geometry.h" #include "libagx_dgc.h" /* @@ -36,7 +36,7 @@ libagx_predicate_indirect(global uint32_t *out, constant uint32_t *in, KERNEL(1) libagx_draw_without_adj(global VkDrawIndirectCommand *out, global VkDrawIndirectCommand *in, - global struct agx_ia_state *ia, uint64_t index_buffer, + global struct poly_ia_state *ia, uint64_t index_buffer, uint64_t index_buffer_range_el, int index_size_B, enum mesa_prim prim) { @@ -49,11 +49,11 @@ libagx_draw_without_adj(global VkDrawIndirectCommand *out, if (index_size_B) { uint offs = in->firstVertex; - ia->index_buffer = libagx_index_buffer( - index_buffer, index_buffer_range_el, offs, index_size_B); + ia->index_buffer = poly_index_buffer(index_buffer, index_buffer_range_el, + offs, index_size_B); ia->index_buffer_range_el = - libagx_index_buffer_range_el(index_buffer_range_el, offs); + poly_index_buffer_range_el(index_buffer_range_el, offs); } } @@ -122,8 +122,7 @@ libagx_memset_small(global uchar *dst, uchar b, int len, uint tid) * TODO: Handle multiple draws in parallel. 
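 *
 * In outline: one lane allocates a shadow index buffer from the heap and
 * broadcasts the pointer across the subgroup, the in-bounds indices are
 * copied into it, and the draw is rewritten to read the shadow copy from
 * offset 0.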
*/ KERNEL(32) -libagx_draw_robust_index(global uint32_t *vdm, - global struct agx_heap *heap, +libagx_draw_robust_index(global uint32_t *vdm, global struct poly_heap *heap, constant VkDrawIndexedIndirectCommand *cmd, uint64_t in_buf_ptr, uint32_t in_buf_range_B, ushort restart, enum agx_primitive topology, @@ -163,7 +162,7 @@ libagx_draw_robust_index(global uint32_t *vdm, /* Allocate memory for the shadow index buffer */ global uchar *padded; if (first) { - padded = agx_heap_alloc_nonatomic(heap, out_size_B); + padded = poly_heap_alloc_nonatomic(heap, out_size_B); } padded = (global uchar *)sub_group_broadcast((uintptr_t)padded, 0); @@ -172,7 +171,7 @@ libagx_draw_robust_index(global uint32_t *vdm, draw.start = 0; /* Clone the index buffer. The destination is aligned as a post-condition - * of agx_heap_alloc_nonatomic. + * of poly_heap_alloc_nonatomic. */ libagx_memcpy_to_aligned((global uint *)padded, in_buf, in_size_B, tid, 32); diff --git a/src/asahi/libagx/geometry.cl b/src/asahi/libagx/geometry.cl index bc72b487f5c..037b9dc061e 100644 --- a/src/asahi/libagx/geometry.cl +++ b/src/asahi/libagx/geometry.cl @@ -4,15 +4,11 @@ * SPDX-License-Identifier: MIT */ -#include "asahi/lib/agx_abi.h" #include "compiler/libcl/libcl_vk.h" +#include "poly/geometry.h" +#include "poly/tessellator.h" #include "util/macros.h" #include "util/u_math.h" -#include "geometry.h" -#include "query.h" -#include "tessellator.h" - -uint64_t nir_ro_to_rw_poly(uint64_t address); /* Swap the two non-provoking vertices in odd triangles. This generates a vertex * ID list with a consistent winding order. @@ -32,54 +28,6 @@ map_vertex_in_tri_strip(uint prim, uint vert, bool flatshade_first) return (provoking || even) ? vert : ((3 - pv) - vert); } -static inline uint -xfb_prim(uint id, uint n, uint copy) -{ - return sub_sat(id, n - 1u) + copy; -} - -/* - * Determine whether an output vertex has an n'th copy in the transform feedback - * buffer. This is written weirdly to let constant folding remove unnecessary - * stores when length is known statically. 
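- *
- * For example, in a triangle strip (n = 3) of length 5 there are three
- * primitives: vertex 2 is shared by primitives 0, 1 and 2, so copies 0, 1
- * and 2 all exist, while vertex 0 exists only for copy 0.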
- */ -bool -libagx_xfb_vertex_copy_in_strip(uint n, uint id, uint length, uint copy) -{ - uint prim = xfb_prim(id, n, copy); - - int num_prims = length - (n - 1); - return copy == 0 || (prim < num_prims && id >= copy && copy < num_prims); -} - -uint -libagx_xfb_vertex_offset(uint n, uint invocation_base_prim, - uint strip_base_prim, uint id_in_strip, uint copy, - bool flatshade_first) -{ - uint prim = xfb_prim(id_in_strip, n, copy); - uint vert_0 = min(id_in_strip, n - 1); - uint vert = vert_0 - copy; - - if (n == 3) { - vert = map_vertex_in_tri_strip(prim, vert, flatshade_first); - } - - /* Tally up in the whole buffer */ - uint base_prim = invocation_base_prim + strip_base_prim; - uint base_vertex = base_prim * n; - return base_vertex + (prim * n) + vert; -} - -uint64_t -libagx_xfb_vertex_address(constant struct agx_geometry_params *p, uint index, - uint buffer, uint stride, uint output_offset) -{ - uint xfb_offset = (index * stride) + output_offset; - - return (uintptr_t)(p->xfb_base[buffer]) + xfb_offset; -} - static uint vertex_id_for_line_loop(uint prim, uint vert, uint num_prims) { @@ -90,20 +38,6 @@ vertex_id_for_line_loop(uint prim, uint vert, uint num_prims) return prim + vert; } -uint -libagx_vertex_id_for_line_class(enum mesa_prim mode, uint prim, uint vert, - uint num_prims) -{ - /* Line list, line strip, or line loop */ - if (mode == MESA_PRIM_LINE_LOOP && prim == (num_prims - 1) && vert == 1) - return 0; - - if (mode == MESA_PRIM_LINES) - prim *= 2; - - return prim + vert; -} - static uint vertex_id_for_tri_fan(uint prim, uint vert, bool flatshade_first) { @@ -122,44 +56,6 @@ vertex_id_for_tri_fan(uint prim, uint vert, bool flatshade_first) return (vert == 0) ? 0 : prim + vert; } -uint -libagx_vertex_id_for_tri_class(enum mesa_prim mode, uint prim, uint vert, - bool flatshade_first) -{ - if (flatshade_first && mode == MESA_PRIM_TRIANGLE_FAN) { - vert = vert + 1; - vert = (vert == 3) ? 0 : vert; - } - - if (mode == MESA_PRIM_TRIANGLE_FAN && vert == 0) - return 0; - - if (mode == MESA_PRIM_TRIANGLES) - prim *= 3; - - /* Triangle list, triangle strip, or triangle fan */ - if (mode == MESA_PRIM_TRIANGLE_STRIP) { - unsigned pv = flatshade_first ? 0 : 2; - - bool even = (prim & 1) == 0; - bool provoking = vert == pv; - - vert = ((provoking || even) ? vert : ((3 - pv) - vert)); - } - - return prim + vert; -} - -uint -libagx_vertex_id_for_line_adj_class(enum mesa_prim mode, uint prim, uint vert) -{ - /* Line list adj or line strip adj */ - if (mode == MESA_PRIM_LINES_ADJACENCY) - prim *= 4; - - return prim + vert; -} - static uint vertex_id_for_tri_strip_adj(uint prim, uint vert, uint num_prims, bool flatshade_first) @@ -206,18 +102,6 @@ vertex_id_for_tri_strip_adj(uint prim, uint vert, uint num_prims, return (prim * 2) + offset; } -uint -libagx_vertex_id_for_tri_adj_class(enum mesa_prim mode, uint prim, uint vert, - uint nr, bool flatshade_first) -{ - /* Tri adj list or tri adj strip */ - if (mode == MESA_PRIM_TRIANGLE_STRIP_ADJACENCY) { - return vertex_id_for_tri_strip_adj(prim, vert, nr, flatshade_first); - } else { - return (6 * prim) + vert; - } -} - static uint vertex_id_for_topology(enum mesa_prim mode, bool flatshade_first, uint prim, uint vert, uint num_prims) @@ -262,127 +146,6 @@ vertex_id_for_topology(enum mesa_prim mode, bool flatshade_first, uint prim, } } -uint -libagx_map_to_line_adj(uint id) -{ - /* Sequence (1, 2), (5, 6), (9, 10), ... 
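-    * i.e. flat IDs 0, 1, 2, 3 map to 1, 2, 5, 6: pair p = id/2 starts at
-    * raw vertex 4p + 1 and the low bit selects the second vertex.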
*/ - return ((id & ~1) * 2) + (id & 1) + 1; -} - -uint -libagx_map_to_line_strip_adj(uint id) -{ - /* Sequence (1, 2), (2, 3), (4, 5), .. */ - uint prim = id / 2; - uint vert = id & 1; - return prim + vert + 1; -} - -uint -libagx_map_to_tri_strip_adj(uint id) -{ - /* Sequence (0, 2, 4), (2, 6, 4), (4, 6, 8), (6, 10, 8) - * - * Although tri strips with adjacency have 6 cases in general, after - * disregarding the vertices only available in a geometry shader, there are - * only even/odd cases. In other words, it's just a triangle strip subject to - * extra padding. - * - * Dividing through by two, the sequence is: - * - * (0, 1, 2), (1, 3, 2), (2, 3, 4), (3, 5, 4) - */ - uint prim = id / 3; - uint vtx = id % 3; - - /* Flip the winding order of odd triangles */ - if ((prim % 2) == 1) { - if (vtx == 1) - vtx = 2; - else if (vtx == 2) - vtx = 1; - } - - return 2 * (prim + vtx); -} - -static void -store_index(uintptr_t index_buffer, uint index_size_B, uint id, uint value) -{ - global uint32_t *out_32 = (global uint32_t *)index_buffer; - global uint16_t *out_16 = (global uint16_t *)index_buffer; - global uint8_t *out_8 = (global uint8_t *)index_buffer; - - if (index_size_B == 4) - out_32[id] = value; - else if (index_size_B == 2) - out_16[id] = value; - else - out_8[id] = value; -} - -static uint -load_index(uintptr_t index_buffer, uint32_t index_buffer_range_el, uint id, - uint index_size) -{ - bool oob = id >= index_buffer_range_el; - - /* If the load would be out-of-bounds, load the first element which is - * assumed valid. If the application index buffer is empty with robustness2, - * index_buffer will point to a zero sink where only the first is valid. - */ - if (oob) { - id = 0; - } - - uint el; - if (index_size == 1) { - el = ((constant uint8_t *)index_buffer)[id]; - } else if (index_size == 2) { - el = ((constant uint16_t *)index_buffer)[id]; - } else { - el = ((constant uint32_t *)index_buffer)[id]; - } - - /* D3D robustness semantics. TODO: Optimize? 
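-    * (The clamped load above keeps the access in bounds; the fixup below
-    * then discards its result, so an out-of-bounds index always reads 0.)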
*/ - if (oob) { - el = 0; - } - - return el; -} - -uint -libagx_load_index_buffer(constant struct agx_ia_state *p, uint id, - uint index_size) -{ - return load_index(p->index_buffer, p->index_buffer_range_el, id, index_size); -} - -static void -increment_counters(global uint32_t *a, global uint32_t *b, global uint32_t *c, - uint count) -{ - global uint32_t *ptr[] = {a, b, c}; - - for (uint i = 0; i < 3; ++i) { - if (ptr[i]) { - *(ptr[i]) += count; - } - } -} - -static unsigned -decomposed_prims_for_vertices_with_tess(enum mesa_prim prim, int vertices, - unsigned verts_per_patch) -{ - if (prim >= MESA_PRIM_PATCHES) { - return vertices / verts_per_patch; - } else { - return u_decomposed_prims_for_vertices(prim, vertices); - } -} - KERNEL(1) libagx_increment_ia(global uint32_t *ia_vertices, global uint32_t *ia_primitives, @@ -390,13 +153,8 @@ libagx_increment_ia(global uint32_t *ia_vertices, global uint32_t *c_invs, constant uint32_t *draw, enum mesa_prim prim, unsigned verts_per_patch) { - increment_counters(ia_vertices, vs_invocations, NULL, draw[0] * draw[1]); - - uint prims = - decomposed_prims_for_vertices_with_tess(prim, draw[0], verts_per_patch) * - draw[1]; - - increment_counters(ia_primitives, c_prims, c_invs, prims); + poly_increment_ia(ia_vertices, ia_primitives, vs_invocations, c_prims, + c_invs, draw, prim, verts_per_patch); } KERNEL(1024) @@ -418,8 +176,8 @@ libagx_increment_ia_restart(global uint32_t *ia_vertices, /* Count non-restart indices */ for (uint i = tid; i < count; i += 1024) { - uint index = load_index(index_buffer, index_buffer_range_el, start + i, - index_size_B); + uint index = poly_load_index(index_buffer, index_buffer_range_el, + start + i, index_size_B); if (index != restart_index) partial++; @@ -433,7 +191,8 @@ libagx_increment_ia_restart(global uint32_t *ia_vertices, /* Elect a single thread from the workgroup to increment the counters */ if (tid == 0) { - increment_counters(ia_vertices, vs_invocations, NULL, scratch * draw[1]); + poly_increment_counters(ia_vertices, vs_invocations, NULL, + scratch * draw[1]); } /* TODO: We should vectorize this */ @@ -441,22 +200,22 @@ libagx_increment_ia_restart(global uint32_t *ia_vertices, uint accum = 0; int last_restart = -1; for (uint i = 0; i < count; ++i) { - uint index = load_index(index_buffer, index_buffer_range_el, start + i, - index_size_B); + uint index = poly_load_index(index_buffer, index_buffer_range_el, + start + i, index_size_B); if (index == restart_index) { - accum += decomposed_prims_for_vertices_with_tess( + accum += poly_decomposed_prims_for_vertices_with_tess( prim, i - last_restart - 1, verts_per_patch); last_restart = i; } } { - accum += decomposed_prims_for_vertices_with_tess( + accum += poly_decomposed_prims_for_vertices_with_tess( prim, count - last_restart - 1, verts_per_patch); } - increment_counters(ia_primitives, c_prims, c_invs, accum * draw[1]); + poly_increment_counters(ia_primitives, c_prims, c_invs, accum * draw[1]); } } @@ -483,7 +242,7 @@ first_true_thread_in_workgroup(bool cond, local uint *scratch) * sets up most of the new draw descriptor. */ static global void * -setup_unroll_for_draw(global struct agx_heap *heap, constant uint *in_draw, +setup_unroll_for_draw(global struct poly_heap *heap, constant uint *in_draw, global uint *out, enum mesa_prim mode, uint index_size_B) { /* Determine an upper bound on the memory required for the index buffer. @@ -499,7 +258,7 @@ setup_unroll_for_draw(global struct agx_heap *heap, constant uint *in_draw, * TODO: For multidraw, should be atomic. 
But multidraw+unroll isn't * currently wired up in any driver. */ - uint old_heap_bottom_B = agx_heap_alloc_nonatomic_offs(heap, alloc_size); + uint old_heap_bottom_B = poly_heap_alloc_nonatomic_offs(heap, alloc_size); /* Setup most of the descriptor. Count will be determined after unroll. */ out[1] = in_draw[1]; /* instance count */ @@ -512,14 +271,14 @@ setup_unroll_for_draw(global struct agx_heap *heap, constant uint *in_draw, } KERNEL(1024) -libagx_unroll_restart(global struct agx_heap *heap, uint64_t index_buffer, +libagx_unroll_restart(global struct poly_heap *heap, uint64_t index_buffer, constant uint *in_draw, global uint32_t *out_draw, uint32_t max_draws, uint32_t restart_index, uint32_t index_buffer_size_el, uint32_t index_size_log2, uint32_t flatshade_first, uint mode__11) { uint32_t index_size_B = 1 << index_size_log2; - enum mesa_prim mode = libagx_uncompact_prim(mode__11); + enum mesa_prim mode = poly_uncompact_prim(mode__11); uint tid = cl_local_id.x; uint count = in_draw[0]; @@ -531,7 +290,7 @@ libagx_unroll_restart(global struct agx_heap *heap, uint64_t index_buffer, barrier(CLK_LOCAL_MEM_FENCE); - uintptr_t in_ptr = (uintptr_t)(libagx_index_buffer( + uintptr_t in_ptr = (uintptr_t)(poly_index_buffer( index_buffer, index_buffer_size_el, in_draw[2], index_size_B)); local uint scratch[32]; @@ -545,8 +304,8 @@ libagx_unroll_restart(global struct agx_heap *heap, uint64_t index_buffer, for (;;) { uint idx = next_restart + tid; bool restart = - idx >= count || load_index(in_ptr, index_buffer_size_el, idx, - index_size_B) == restart_index; + idx >= count || poly_load_index(in_ptr, index_buffer_size_el, idx, + index_size_B) == restart_index; uint next_offs = first_true_thread_in_workgroup(restart, scratch); @@ -566,10 +325,10 @@ libagx_unroll_restart(global struct agx_heap *heap, uint64_t index_buffer, uint offset = needle + id; uint x = ((out_prims_base + i) * per_prim) + vtx; - uint y = - load_index(in_ptr, index_buffer_size_el, offset, index_size_B); + uint y = poly_load_index(in_ptr, index_buffer_size_el, offset, + index_size_B); - store_index(out_ptr, index_size_B, x, y); + poly_store_index(out_ptr, index_size_B, x, y); } } @@ -581,216 +340,39 @@ libagx_unroll_restart(global struct agx_heap *heap, uint64_t index_buffer, out_draw[0] = out_prims * per_prim; } -static uint -setup_xfb_buffer(global struct agx_geometry_params *p, uint i, uint stride, - uint max_output_end, uint vertices_per_prim) -{ - uint xfb_offset = *(p->xfb_offs_ptrs[i]); - p->xfb_base[i] = p->xfb_base_original[i] + xfb_offset; - - /* Let output_end = output_offset + output_size. 
- * - * Primitive P will write up to (but not including) offset: - * - * xfb_offset + ((P - 1) * (verts_per_prim * stride)) - * + ((verts_per_prim - 1) * stride) - * + output_end - * - * To fit all outputs for P, that value must be less than the XFB - * buffer size for the output with maximal output_end, as everything - * else is constant here across outputs within a buffer/primitive: - * - * floor(P) <= (stride + size - xfb_offset - output_end) - * // (stride * verts_per_prim) - */ - int numer_s = p->xfb_size[i] + (stride - max_output_end) - xfb_offset; - uint numer = max(numer_s, 0); - return numer / (stride * vertices_per_prim); -} - -void -libagx_write_strip(GLOBAL uint32_t *index_buffer, uint32_t inv_index_offset, - uint32_t prim_index_offset, uint32_t vertex_offset, - uint32_t verts_in_prim, uint3 info) -{ - _libagx_write_strip(index_buffer, inv_index_offset + prim_index_offset, - vertex_offset, verts_in_prim, info.x, info.y, info.z); -} - -void -libagx_pad_index_gs(global int *index_buffer, uint inv_index_offset, - uint nr_indices, uint alloc) -{ - for (uint i = nr_indices; i < alloc; ++i) { - index_buffer[inv_index_offset + i] = -1; - } -} - KERNEL(1) libagx_gs_setup_indirect( uint64_t index_buffer, constant uint *draw, global uintptr_t *vertex_buffer /* output */, - global struct agx_ia_state *ia /* output */, - global struct agx_geometry_params *p /* output */, - global struct agx_heap *heap, + global struct poly_ia_state *ia /* output */, + global struct poly_geometry_params *p /* output */, + global struct poly_heap *heap, uint64_t vs_outputs /* Vertex (TES) output mask */, uint32_t index_size_B /* 0 if no index bffer */, uint32_t index_buffer_range_el, uint32_t prim /* Input primitive type, enum mesa_prim */, - int is_prefix_summing, uint max_indices, enum agx_gs_shape shape) + int is_prefix_summing, uint max_indices, enum poly_gs_shape shape) { - /* Determine the (primitives, instances) grid size. */ - uint vertex_count = draw[0]; - uint instance_count = draw[1]; - - ia->verts_per_instance = vertex_count; - - /* Calculate number of primitives input into the GS */ - uint prim_per_instance = u_decomposed_prims_for_vertices(prim, vertex_count); - p->input_primitives = prim_per_instance * instance_count; - - /* Invoke VS as (vertices, instances); GS as (primitives, instances) */ - p->vs_grid[0] = vertex_count; - p->vs_grid[1] = instance_count; - - p->gs_grid[0] = prim_per_instance; - p->gs_grid[1] = instance_count; - - p->primitives_log2 = util_logbase2_ceil(prim_per_instance); - - /* If indexing is enabled, the third word is the offset into the index buffer - * in elements. Apply that offset now that we have it. For a hardware - * indirect draw, the hardware would do this for us, but for software input - * assembly we need to do it ourselves. 
- */ - if (index_size_B) { - ia->index_buffer = libagx_index_buffer( - index_buffer, index_buffer_range_el, draw[2], index_size_B); - - ia->index_buffer_range_el = - libagx_index_buffer_range_el(index_buffer_range_el, draw[2]); - } - - /* We need to allocate VS and GS count buffers, do so now */ - uint vertex_buffer_size = - libagx_tcs_in_size(vertex_count * instance_count, vs_outputs); - - if (is_prefix_summing) { - p->count_buffer = agx_heap_alloc_nonatomic( - heap, p->input_primitives * p->count_buffer_stride); - } - - p->input_buffer = - (uintptr_t)agx_heap_alloc_nonatomic(heap, vertex_buffer_size); - *vertex_buffer = p->input_buffer; - - p->input_mask = vs_outputs; - - /* Allocate the index buffer and write the draw consuming it */ - global VkDrawIndexedIndirectCommand *cmd = (global void *)p->indirect_desc; - - *cmd = (VkDrawIndexedIndirectCommand){ - .indexCount = agx_gs_rast_vertices(shape, max_indices, prim_per_instance, - instance_count), - .instanceCount = agx_gs_rast_instances(shape, max_indices, - prim_per_instance, instance_count), - }; - - if (shape == AGX_GS_SHAPE_DYNAMIC_INDEXED) { - cmd->firstIndex = - agx_heap_alloc_nonatomic_offs(heap, cmd->indexCount * 4) / 4; - - p->output_index_buffer = - (global uint *)(heap->base + (cmd->firstIndex * 4)); - } -} - -/* - * Returns (work_group_scan_inclusive_add(x), work_group_sum(x)). Implemented - * manually with subgroup ops and local memory since Mesa doesn't do those - * lowerings yet. - */ -static uint2 -libagx_work_group_scan_inclusive_add(uint x, local uint *scratch) -{ - uint sg_id = get_sub_group_id(); - - /* Partial prefix sum of the subgroup */ - uint sg = sub_group_scan_inclusive_add(x); - - /* Reduction (sum) for the subgroup */ - uint sg_sum = sub_group_broadcast(sg, 31); - - /* Write out all the subgroups sums */ - barrier(CLK_LOCAL_MEM_FENCE); - scratch[sg_id] = sg_sum; - barrier(CLK_LOCAL_MEM_FENCE); - - /* Read all the subgroup sums. Thread T in subgroup G reads the sum of all - * threads in subgroup T. - */ - uint other_sum = scratch[get_sub_group_local_id()]; - - /* Exclusive sum the subgroup sums to get the total before the current group, - * which can be added to the total for the current group. - */ - uint other_sums = sub_group_scan_exclusive_add(other_sum); - uint base = sub_group_broadcast(other_sums, sg_id); - uint prefix = base + sg; - - /* Reduce the workgroup using the prefix sum we already did */ - uint reduction = sub_group_broadcast(other_sums + other_sum, 31); - - return (uint2)(prefix, reduction); -} - -static void -_libagx_prefix_sum(local uint *scratch, global uint *buffer, uint len, - uint words, uint word) -{ - uint tid = cl_local_id.x; - - /* Main loop: complete workgroups processing 1024 values at once */ - uint i, count = 0; - uint len_remainder = len % 1024; - uint len_rounded_down = len - len_remainder; - - for (i = tid; i < len_rounded_down; i += 1024) { - global uint *ptr = &buffer[(i * words) + word]; - uint value = *ptr; - uint2 sums = libagx_work_group_scan_inclusive_add(value, scratch); - - *ptr = count + sums[0]; - count += sums[1]; - } - - /* The last iteration is special since we won't have a full subgroup unless - * the length is divisible by the subgroup size, and we don't advance count. - */ - global uint *ptr = &buffer[(i * words) + word]; - uint value = (tid < len_remainder) ? 
*ptr : 0; - uint scan = libagx_work_group_scan_inclusive_add(value, scratch)[0]; - - if (tid < len_remainder) { - *ptr = count + scan; - } + poly_gs_setup_indirect(index_buffer, draw, vertex_buffer, ia, p, heap, + vs_outputs, index_size_B, index_buffer_range_el, prim, + is_prefix_summing, max_indices, shape); } KERNEL(1024) -libagx_prefix_sum_geom(constant struct agx_geometry_params *p) +libagx_prefix_sum_geom(constant struct poly_geometry_params *p) { local uint scratch[32]; - _libagx_prefix_sum(scratch, p->count_buffer, p->input_primitives, - p->count_buffer_stride / 4, cl_group_id.x); + poly_prefix_sum(scratch, p->count_buffer, p->input_primitives, + p->count_buffer_stride / 4, cl_group_id.x, 1024); } KERNEL(1024) -libagx_prefix_sum_tess(global struct libagx_tess_args *p, global uint *c_prims, +libagx_prefix_sum_tess(global struct poly_tess_args *p, global uint *c_prims, global uint *c_invs, uint increment_stats__2) { local uint scratch[32]; - _libagx_prefix_sum(scratch, p->counts, p->nr_patches, 1 /* words */, - 0 /* word */); + poly_prefix_sum(scratch, p->counts, p->nr_patches, 1 /* words */, + 0 /* word */, 1024); /* After prefix summing, we know the total # of indices, so allocate the * index buffer now. Elect a thread for the allocation. @@ -805,7 +387,7 @@ libagx_prefix_sum_tess(global struct libagx_tess_args *p, global uint *c_prims, /* Allocate 4-byte indices */ uint32_t elsize_B = sizeof(uint32_t); uint32_t size_B = total * elsize_B; - uint alloc_B = agx_heap_alloc_nonatomic_offs(p->heap, size_B); + uint alloc_B = poly_heap_alloc_nonatomic_offs(p->heap, size_B); p->index_buffer = (global uint32_t *)(((uintptr_t)p->heap->base) + alloc_B); /* ...and now we can generate the API indexed draw */ @@ -818,7 +400,7 @@ libagx_prefix_sum_tess(global struct libagx_tess_args *p, global uint *c_prims, desc[4] = 0; /* start_instance */ /* If necessary, increment clipper statistics too. This is only used when - * there's no geometry shader following us. See agx_nir_lower_gs.c for more + * there's no geometry shader following us. See poly_nir_lower_gs.c for more * info on the emulation. We just need to calculate the # of primitives * tessellated. */ @@ -827,150 +409,6 @@ libagx_prefix_sum_tess(global struct libagx_tess_args *p, global uint *c_prims, : p->isolines ? (total / 2) : (total / 3); - increment_counters(c_prims, c_invs, NULL, prims); + poly_increment_counters(c_prims, c_invs, NULL, prims); } } - -uintptr_t -libagx_vertex_output_address(uintptr_t buffer, uint64_t mask, uint vtx, - gl_varying_slot location) -{ - /* Written like this to let address arithmetic work */ - return buffer + ((uintptr_t)libagx_tcs_in_offs_el(vtx, location, mask)) * 16; -} - -uintptr_t -libagx_geometry_input_address(constant struct agx_geometry_params *p, uint vtx, - gl_varying_slot location) -{ - return libagx_vertex_output_address(p->input_buffer, p->input_mask, vtx, - location); -} - -unsigned -libagx_input_vertices(constant struct agx_ia_state *ia) -{ - return ia->verts_per_instance; -} - -global uint * -libagx_load_xfb_count_address(constant struct agx_geometry_params *p, int index, - int count_words, uint unrolled_id) -{ - return &p->count_buffer[(unrolled_id * count_words) + index]; -} - -uint -libagx_previous_xfb_primitives(global struct agx_geometry_params *p, - int static_count, int count_index, - int count_words, bool prefix_sum, - uint unrolled_id) -{ - if (static_count >= 0) { - /* If the number of outputted vertices per invocation is known statically, - * we can calculate the base. 
- */ - return unrolled_id * static_count; - } else { - /* Otherwise, load from the count buffer buffer. Note that the sums are - * inclusive, so index 0 is nonzero. This requires a little fixup here. We - * use a saturating unsigned subtraction so we don't read out-of-bounds. - * - * If we didn't prefix sum, there's only one element. - */ - uint prim_minus_1 = prefix_sum ? sub_sat(unrolled_id, 1u) : 0; - uint count = p->count_buffer[(prim_minus_1 * count_words) + count_index]; - - return unrolled_id == 0 ? 0 : count; - } -} - -/* Like u_foreach_bit, specialized for XFB to enable loop unrolling */ -#define libagx_foreach_xfb(word, index) \ - for (uint i = 0; i < 4; ++i) \ - if (word & BITFIELD_BIT(i)) - -void -libagx_pre_gs(global struct agx_geometry_params *p, uint streams, - uint buffers_written, uint4 buffer_to_stream, int4 count_index, - uint4 stride, uint4 output_end, int4 static_count, - uint invocations, uint vertices_per_prim, - global uint *gs_invocations, global uint *gs_primitives, - global uint *c_primitives, global uint *c_invocations) -{ - unsigned count_words = !!(count_index[0] >= 0) + !!(count_index[1] >= 0) + - !!(count_index[2] >= 0) + !!(count_index[3] >= 0); - bool prefix_sum = count_words && buffers_written; - uint unrolled_in_prims = p->input_primitives; - - /* Determine the number of primitives generated in each stream */ - uint4 in_prims = 0; - libagx_foreach_xfb(streams, i) { - in_prims[i] = libagx_previous_xfb_primitives( - p, static_count[i], count_index[i], count_words, prefix_sum, - unrolled_in_prims); - - *(p->prims_generated_counter[i]) += in_prims[i]; - } - - uint4 prims = in_prims; - uint emitted_prims = prims[0] + prims[1] + prims[2] + prims[3]; - - if (buffers_written) { - libagx_foreach_xfb(buffers_written, i) { - uint max_prims = - setup_xfb_buffer(p, i, stride[i], output_end[i], vertices_per_prim); - - unsigned stream = buffer_to_stream[i]; - prims[stream] = min(prims[stream], max_prims); - } - - int4 overflow = prims < in_prims; - - libagx_foreach_xfb(streams, i) { - p->xfb_verts[i] = prims[i] * vertices_per_prim; - - *(p->xfb_overflow[i]) += (bool)overflow[i]; - *(p->xfb_prims_generated_counter[i]) += prims[i]; - } - - *(p->xfb_any_overflow) += any(overflow); - - /* Update XFB counters */ - libagx_foreach_xfb(buffers_written, i) { - uint32_t prim_stride_B = stride[i] * vertices_per_prim; - unsigned stream = buffer_to_stream[i]; - - global uint *ptr = p->xfb_offs_ptrs[i]; - - ptr = (global uint *)nir_ro_to_rw_poly((uint64_t)ptr); - *ptr += prims[stream] * prim_stride_B; - } - } - - /* The geometry shader is invoked once per primitive (after unrolling - * primitive restart). From the spec: - * - * In case of instanced geometry shaders (see section 11.3.4.2) the - * geometry shader invocations count is incremented for each separate - * instanced invocation. - */ - *gs_invocations += unrolled_in_prims * invocations; - *gs_primitives += emitted_prims; - - /* Clipper queries are not well-defined, so we can emulate them in lots of - * silly ways. We need the hardware counters to implement them properly. For - * now, just consider all primitives emitted as passing through the clipper. - * This satisfies spec text: - * - * The number of primitives that reach the primitive clipping stage. - * - * and - * - * If at least one vertex of the primitive lies inside the clipping - * volume, the counter is incremented by one or more. Otherwise, the - * counter is incremented by zero or more. 
- */ - *c_primitives += emitted_prims; - *c_invocations += emitted_prims; -} diff --git a/src/asahi/libagx/geometry.h b/src/asahi/libagx/geometry.h deleted file mode 100644 index 870f6489ca4..00000000000 --- a/src/asahi/libagx/geometry.h +++ /dev/null @@ -1,410 +0,0 @@ -/* - * Copyright 2023 Alyssa Rosenzweig - * Copyright 2023 Valve Corporation - * SPDX-License-Identifier: MIT - */ - -#include "asahi/lib/agx_abi.h" -#include "compiler/libcl/libcl.h" -#include "compiler/shader_enums.h" - -#include "util/bitscan.h" -#include "util/u_math.h" - -#pragma once - -#define MAX_SO_BUFFERS 4 -#define MAX_VERTEX_STREAMS 4 - -enum agx_gs_shape { - /* Indexed, where indices are encoded as: - * - * round_to_pot(max_indices) * round_to_pot(input_primitives) * - * * instance_count - * - * invoked for max_indices * input_primitives * instance_count indices. - * - * This is used with any dynamic topology. No hardware instancing used. - */ - AGX_GS_SHAPE_DYNAMIC_INDEXED, - - /* Indexed with a static index buffer. Indices ranges up to max_indices. - * Hardware instance count = input_primitives * software instance count. - */ - AGX_GS_SHAPE_STATIC_INDEXED, - - /* Non-indexed. Dispatched as: - * - * (max_indices, input_primitives * instance count). - */ - AGX_GS_SHAPE_STATIC_PER_PRIM, - - /* Non-indexed. Dispatched as: - * - * (max_indices * input_primitives, instance count). - */ - AGX_GS_SHAPE_STATIC_PER_INSTANCE, -}; - -static inline unsigned -agx_gs_rast_vertices(enum agx_gs_shape shape, unsigned max_indices, - unsigned input_primitives, unsigned instance_count) -{ - switch (shape) { - case AGX_GS_SHAPE_DYNAMIC_INDEXED: - return max_indices * input_primitives * instance_count; - - case AGX_GS_SHAPE_STATIC_INDEXED: - case AGX_GS_SHAPE_STATIC_PER_PRIM: - return max_indices; - - case AGX_GS_SHAPE_STATIC_PER_INSTANCE: - return max_indices * input_primitives; - } - - UNREACHABLE("invalid shape"); -} - -static inline unsigned -agx_gs_rast_instances(enum agx_gs_shape shape, unsigned max_indices, - unsigned input_primitives, unsigned instance_count) -{ - switch (shape) { - case AGX_GS_SHAPE_DYNAMIC_INDEXED: - return 1; - - case AGX_GS_SHAPE_STATIC_INDEXED: - case AGX_GS_SHAPE_STATIC_PER_PRIM: - return input_primitives * instance_count; - - case AGX_GS_SHAPE_STATIC_PER_INSTANCE: - return instance_count; - } - - UNREACHABLE("invalid shape"); -} - -static inline bool -agx_gs_indexed(enum agx_gs_shape shape) -{ - return shape == AGX_GS_SHAPE_DYNAMIC_INDEXED || - shape == AGX_GS_SHAPE_STATIC_INDEXED; -} - -static inline unsigned -agx_gs_index_size(enum agx_gs_shape shape) -{ - switch (shape) { - case AGX_GS_SHAPE_DYNAMIC_INDEXED: - return 4; - case AGX_GS_SHAPE_STATIC_INDEXED: - return 1; - default: - return 0; - } -} - -/* Heap to allocate from. */ -struct agx_heap { - DEVICE(uchar) base; - uint32_t bottom, size; -} PACKED; -static_assert(sizeof(struct agx_heap) == 4 * 4); - -#ifdef __OPENCL_VERSION__ -static inline uint -_agx_heap_alloc_offs(global struct agx_heap *heap, uint size_B, bool atomic) -{ - size_B = align(size_B, 16); - - uint offs; - if (atomic) { - offs = atomic_fetch_add((volatile atomic_uint *)(&heap->bottom), size_B); - } else { - offs = heap->bottom; - heap->bottom = offs + size_B; - } - - /* Use printf+abort because assert is stripped from release builds. 
*/ - if (heap->bottom >= heap->size) { - printf( - "FATAL: GPU heap overflow, allocating size %u, at offset %u, heap size %u!", - size_B, offs, heap->size); - - abort(); - } - - return offs; -} - -static inline uint -agx_heap_alloc_nonatomic_offs(global struct agx_heap *heap, uint size_B) -{ - return _agx_heap_alloc_offs(heap, size_B, false); -} - -static inline uint -agx_heap_alloc_atomic_offs(global struct agx_heap *heap, uint size_B) -{ - return _agx_heap_alloc_offs(heap, size_B, true); -} - -static inline global void * -agx_heap_alloc_nonatomic(global struct agx_heap *heap, uint size_B) -{ - return heap->base + agx_heap_alloc_nonatomic_offs(heap, size_B); -} - -uint64_t nir_load_ro_sink_address_poly(void); - -static inline uint64_t -libagx_index_buffer(uint64_t index_buffer, uint size_el, uint offset_el, - uint elsize_B) -{ - if (offset_el < size_el) - return index_buffer + (offset_el * elsize_B); - else - return nir_load_ro_sink_address_poly(); -} -#endif - -struct agx_ia_state { - /* Index buffer if present. */ - uint64_t index_buffer; - - /* Size of the bound index buffer for bounds checking */ - uint32_t index_buffer_range_el; - - /* Number of vertices per instance. Written by CPU for direct draw, indirect - * setup kernel for indirect. This is used for VS->GS and VS->TCS indexing. - */ - uint32_t verts_per_instance; -} PACKED; -static_assert(sizeof(struct agx_ia_state) == 4 * 4); - -static inline uint -libagx_index_buffer_range_el(uint size_el, uint offset_el) -{ - return offset_el < size_el ? (size_el - offset_el) : 0; -} - -struct agx_geometry_params { - /* Address of associated indirect draw buffer */ - DEVICE(uint) indirect_desc; - - /* Address of count buffer. For an indirect draw, this will be written by the - * indirect setup kernel. - */ - DEVICE(uint) count_buffer; - - /* Address of the primitives generated counters */ - DEVICE(uint) prims_generated_counter[MAX_VERTEX_STREAMS]; - DEVICE(uint) xfb_prims_generated_counter[MAX_VERTEX_STREAMS]; - DEVICE(uint) xfb_overflow[MAX_VERTEX_STREAMS]; - DEVICE(uint) xfb_any_overflow; - - /* Pointers to transform feedback buffer offsets in bytes */ - DEVICE(uint) xfb_offs_ptrs[MAX_SO_BUFFERS]; - - /* Output index buffer, allocated by pre-GS. */ - DEVICE(uint) output_index_buffer; - - /* Address of transform feedback buffer in general, supplied by the CPU. */ - DEVICE(uchar) xfb_base_original[MAX_SO_BUFFERS]; - - /* Address of transform feedback for the current primitive. Written by pre-GS - * program. - */ - DEVICE(uchar) xfb_base[MAX_SO_BUFFERS]; - - /* Address and present mask for the input to the geometry shader. These will - * reflect the vertex shader for VS->GS or instead the tessellation - * evaluation shader for TES->GS. - */ - uint64_t input_buffer; - uint64_t input_mask; - - /* Location-indexed mask of flat outputs, used for lowering GL edge flags. */ - uint64_t flat_outputs; - - uint32_t xfb_size[MAX_SO_BUFFERS]; - - /* Number of vertices emitted by transform feedback per stream. Written by - * the pre-GS program. - */ - uint32_t xfb_verts[MAX_VERTEX_STREAMS]; - - /* Within an indirect GS draw, the grids used to dispatch the VS/GS written - * out by the GS indirect setup kernel or the CPU for a direct draw. This is - * the "indirect local" format: first 3 is in threads, second 3 is in grid - * blocks. This lets us use nontrivial workgroups with indirect draws without - * needing any predication. 
- */ - uint32_t vs_grid[6]; - uint32_t gs_grid[6]; - - /* Number of input primitives across all instances, calculated by the CPU for - * a direct draw or the GS indirect setup kernel for an indirect draw. - */ - uint32_t input_primitives; - - /* Number of input primitives per instance, rounded up to a power-of-two and - * with the base-2 log taken. This is used to partition the output vertex IDs - * efficiently. - */ - uint32_t primitives_log2; - - /* Number of bytes output by the GS count shader per input primitive (may be - * 0), written by CPU and consumed by indirect draw setup shader for - * allocating counts. - */ - uint32_t count_buffer_stride; - - /* Dynamic input topology. Must be compatible with the geometry shader's - * layout() declared input class. - */ - uint32_t input_topology; -} PACKED; -static_assert(sizeof(struct agx_geometry_params) == 86 * 4); - -/* TCS shared memory layout: - * - * vec4 vs_outputs[VERTICES_IN_INPUT_PATCH][TOTAL_VERTEX_OUTPUTS]; - * - * TODO: compact. - */ -static inline uint -libagx_tcs_in_offs_el(uint vtx, gl_varying_slot location, - uint64_t crosslane_vs_out_mask) -{ - uint base = vtx * util_bitcount64(crosslane_vs_out_mask); - uint offs = util_bitcount64(crosslane_vs_out_mask & - (((uint64_t)(1) << location) - 1)); - - return base + offs; -} - -static inline uint -libagx_tcs_in_offs(uint vtx, gl_varying_slot location, - uint64_t crosslane_vs_out_mask) -{ - return libagx_tcs_in_offs_el(vtx, location, crosslane_vs_out_mask) * 16; -} - -static inline uint -libagx_tcs_in_size(uint32_t vertices_in_patch, uint64_t crosslane_vs_out_mask) -{ - return vertices_in_patch * util_bitcount64(crosslane_vs_out_mask) * 16; -} - -/* - * TCS out buffer layout, per-patch: - * - * float tess_level_outer[4]; - * float tess_level_inner[2]; - * vec4 patch_out[MAX_PATCH_OUTPUTS]; - * vec4 vtx_out[OUT_PATCH_SIZE][TOTAL_VERTEX_OUTPUTS]; - * - * Vertex out are compacted based on the mask of written out. Patch - * out are used as-is. - * - * Bounding boxes are ignored. 
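- *
- * For example, with 2 patch outputs and 3 per-vertex outputs, the
- * per-vertex block starts at element 6 + 4*2 = 14 and vertex v's output
- * with compacted index i sits at element 14 + 12*v + 4*i.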
- */ -static inline uint -libagx_tcs_out_offs_el(uint vtx_id, gl_varying_slot location, uint nr_patch_out, - uint64_t vtx_out_mask) -{ - uint off = 0; - if (location == VARYING_SLOT_TESS_LEVEL_OUTER) - return off; - - off += 4; - if (location == VARYING_SLOT_TESS_LEVEL_INNER) - return off; - - off += 2; - if (location >= VARYING_SLOT_PATCH0) - return off + (4 * (location - VARYING_SLOT_PATCH0)); - - /* Anything else is a per-vtx output */ - off += 4 * nr_patch_out; - off += 4 * vtx_id * util_bitcount64(vtx_out_mask); - - uint idx = util_bitcount64(vtx_out_mask & (((uint64_t)(1) << location) - 1)); - return off + (4 * idx); -} - -static inline uint -libagx_tcs_out_offs(uint vtx_id, gl_varying_slot location, uint nr_patch_out, - uint64_t vtx_out_mask) -{ - return libagx_tcs_out_offs_el(vtx_id, location, nr_patch_out, vtx_out_mask) * - 4; -} - -static inline uint -libagx_tcs_out_stride_el(uint nr_patch_out, uint out_patch_size, - uint64_t vtx_out_mask) -{ - return libagx_tcs_out_offs_el(out_patch_size, 0, nr_patch_out, vtx_out_mask); -} - -static inline uint -libagx_tcs_out_stride(uint nr_patch_out, uint out_patch_size, - uint64_t vtx_out_mask) -{ - return libagx_tcs_out_stride_el(nr_patch_out, out_patch_size, vtx_out_mask) * - 4; -} - -/* In a tess eval shader, stride for hw vertex ID */ -#define LIBAGX_TES_PATCH_ID_STRIDE 8192 - -static uint -libagx_compact_prim(enum mesa_prim prim) -{ - static_assert(MESA_PRIM_QUAD_STRIP == MESA_PRIM_QUADS + 1); - static_assert(MESA_PRIM_POLYGON == MESA_PRIM_QUADS + 2); - -#ifndef __OPENCL_VERSION__ - assert(prim != MESA_PRIM_QUADS); - assert(prim != MESA_PRIM_QUAD_STRIP); - assert(prim != MESA_PRIM_POLYGON); - assert(prim != MESA_PRIM_PATCHES); -#endif - - return (prim >= MESA_PRIM_QUADS) ? (prim - 3) : prim; -} - -static enum mesa_prim -libagx_uncompact_prim(uint packed) -{ - return (packed >= MESA_PRIM_QUADS) ? (packed + 3) : packed; -} - -/* - * Write a strip into a 32-bit index buffer. This is the sequence: - * - * (b, b + 1, b + 2, ..., b + n - 1, -1) where -1 is the restart index - * - * For points, we write index buffers without restart just for remapping. 
- */ -static inline void -_libagx_write_strip(GLOBAL uint32_t *index_buffer, uint32_t index_offset, - uint32_t vertex_offset, uint32_t verts_in_prim, - uint32_t stream, uint32_t stream_multiplier, uint32_t n) -{ - bool restart = n > 1; - if (verts_in_prim < n) - return; - - GLOBAL uint32_t *out = &index_buffer[index_offset]; - - /* Write out indices for the strip */ - for (uint32_t i = 0; i < verts_in_prim; ++i) { - out[i] = (vertex_offset + i) * stream_multiplier + stream; - } - - if (restart) - out[verts_in_prim] = -1; -} diff --git a/src/asahi/libagx/meson.build b/src/asahi/libagx/meson.build index b772415dbbc..70d249d58b5 100644 --- a/src/asahi/libagx/meson.build +++ b/src/asahi/libagx/meson.build @@ -21,6 +21,7 @@ libagx_spv = custom_target( libagx_shader_files, '--', '-I' + join_paths(meson.project_source_root(), 'include'), '-I' + join_paths(meson.project_source_root(), 'src/compiler/libcl'), + '-I' + join_paths(meson.project_source_root(), 'src/poly/cl'), '-I' + join_paths(meson.current_source_dir(), '.'), '-I' + join_paths(meson.current_source_dir(), '../../'), '-I' + join_paths(meson.current_source_dir(), 'shaders'), diff --git a/src/asahi/libagx/tessellation.cl b/src/asahi/libagx/tessellation.cl index 244158f3d38..a84eed823a3 100644 --- a/src/asahi/libagx/tessellation.cl +++ b/src/asahi/libagx/tessellation.cl @@ -3,148 +3,14 @@ * SPDX-License-Identifier: MIT */ -#include "geometry.h" -#include "tessellator.h" -#include - -uint -libagx_tcs_patch_vertices_in(constant struct libagx_tess_args *p) -{ - return p->input_patch_size; -} - -uint -libagx_tes_patch_vertices_in(constant struct libagx_tess_args *p) -{ - return p->output_patch_size; -} - -uint -libagx_tcs_unrolled_id(constant struct libagx_tess_args *p, uint3 wg_id) -{ - return (wg_id.y * p->patches_per_instance) + wg_id.x; -} - -uint64_t -libagx_tes_buffer(constant struct libagx_tess_args *p) -{ - return p->tes_buffer; -} - -/* - * Helper to lower indexing for a tess eval shader ran as a compute shader. This - * handles the tess+geom case. This is simpler than the general input assembly - * lowering, as we know: - * - * 1. the index buffer is U32 - * 2. the index is in bounds - * - * Therefore we do a simple load. No bounds checking needed. - */ -uint32_t -libagx_load_tes_index(constant struct libagx_tess_args *p, uint32_t index) -{ - /* Swap second and third vertices of each triangle to flip winding order - * dynamically if needed. 
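-    * e.g. when p->ccw is set, the triangle (a, b, c) is fetched as
-    * (a, c, b).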
- */ - if (p->ccw) { - uint id = index % 3; - - if (id == 1) - index++; - else if (id == 2) - index--; - } - - return p->index_buffer[index]; -} - -ushort -libagx_tcs_in_offset(uint vtx, gl_varying_slot location, - uint64_t crosslane_vs_out_mask) -{ - return libagx_tcs_in_offs(vtx, location, crosslane_vs_out_mask); -} - -uintptr_t -libagx_tcs_out_address(constant struct libagx_tess_args *p, uint patch_id, - uint vtx_id, gl_varying_slot location, uint nr_patch_out, - uint out_patch_size, uint64_t vtx_out_mask) -{ - uint stride_el = - libagx_tcs_out_stride_el(nr_patch_out, out_patch_size, vtx_out_mask); - - uint offs_el = - libagx_tcs_out_offs_el(vtx_id, location, nr_patch_out, vtx_out_mask); - - offs_el += patch_id * stride_el; - - /* Written to match the AGX addressing mode */ - return (uintptr_t)(p->tcs_buffer) + (((uintptr_t)offs_el) << 2); -} - -static uint -libagx_tes_unrolled_patch_id(uint raw_id) -{ - return raw_id / LIBAGX_TES_PATCH_ID_STRIDE; -} - -uint -libagx_tes_patch_id(constant struct libagx_tess_args *p, uint raw_id) -{ - return libagx_tes_unrolled_patch_id(raw_id) % p->patches_per_instance; -} - -static uint -tes_vertex_id_in_patch(uint raw_id) -{ - return raw_id % LIBAGX_TES_PATCH_ID_STRIDE; -} - -float2 -libagx_load_tess_coord(constant struct libagx_tess_args *p, uint raw_id) -{ - uint patch = libagx_tes_unrolled_patch_id(raw_id); - uint vtx = tes_vertex_id_in_patch(raw_id); - - global struct libagx_tess_point *t = - &p->patch_coord_buffer[p->coord_allocs[patch] + vtx]; - - /* Written weirdly because NIR struggles with loads of structs */ - uint2 fixed = *((global uint2 *)t); - - /* Convert fixed point to float */ - return convert_float2(fixed) / (1u << 16); -} - -uintptr_t -libagx_tes_in_address(constant struct libagx_tess_args *p, uint raw_id, - uint vtx_id, gl_varying_slot location) -{ - uint patch = libagx_tes_unrolled_patch_id(raw_id); - - return libagx_tcs_out_address(p, patch, vtx_id, location, - p->tcs_patch_constants, p->output_patch_size, - p->tcs_per_vertex_outputs); -} - -float4 -libagx_tess_level_outer_default(constant struct libagx_tess_args *p) -{ - return vload4(0, p->tess_level_outer_default); -} - -float2 -libagx_tess_level_inner_default(constant struct libagx_tess_args *p) -{ - return vload2(0, p->tess_level_inner_default); -} +#include "poly/geometry.h" +#include "poly/tessellator.h" KERNEL(1) libagx_tess_setup_indirect( - global struct libagx_tess_args *p, + global struct poly_tess_args *p, global uint32_t *grids /* output: VS then TCS then tess */, - global struct agx_ia_state *ia /* output */, global uint32_t *indirect, + global struct poly_ia_state *ia /* output */, global uint32_t *indirect, global uint64_t *vertex_output_buffer_ptr, uint64_t in_index_buffer, uint32_t in_index_buffer_range_el, uint32_t in_index_size_B, uint64_t vertex_outputs /* bitfield */, @@ -174,11 +40,11 @@ libagx_tess_setup_indirect( alloc += unrolled_patches * sizeof(uint32_t); uint vb_offs = alloc; - uint vb_size = libagx_tcs_in_size(count * instance_count, vertex_outputs); + uint vb_size = poly_tcs_in_size(count * instance_count, vertex_outputs); alloc += vb_size; /* Allocate all patch calculations in one go */ - global uchar *blob = agx_heap_alloc_nonatomic(p->heap, alloc); + global uchar *blob = poly_heap_alloc_nonatomic(p->heap, alloc); p->tcs_buffer = (global float *)(blob + tcs_out_offs); p->patches_per_instance = in_patches; @@ -201,11 +67,11 @@ libagx_tess_setup_indirect( */ if (in_index_size_B) { ia->index_buffer = - libagx_index_buffer(in_index_buffer, 
in_index_buffer_range_el, - indirect[2], in_index_size_B); + poly_index_buffer(in_index_buffer, in_index_buffer_range_el, + indirect[2], in_index_size_B); ia->index_buffer_range_el = - libagx_index_buffer_range_el(in_index_buffer_range_el, indirect[2]); + poly_index_buffer_range_el(in_index_buffer_range_el, indirect[2]); } /* VS grid size */ diff --git a/src/asahi/libagx/tessellator.cl b/src/asahi/libagx/tessellator.cl index 957230e422d..6dcbdd8320e 100644 --- a/src/asahi/libagx/tessellator.cl +++ b/src/asahi/libagx/tessellator.cl @@ -19,1594 +19,26 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +#include "poly/cl/tessellator.h" -#include "util/u_math.h" -#include "geometry.h" -#include "tessellator.h" - -#define LIBAGX_TESS_MIN_ISOLINE_DENSITY_TESSELLATION_FACTOR 1.0f -#define LIBAGX_TESS_MAX_ISOLINE_DENSITY_TESSELLATION_FACTOR 64.0f - -typedef unsigned int FXP; // fixed point number - -enum { - U = 0, // points on a tri patch - V = 1, -}; - -enum { - Ueq0 = 0, // edges on a tri patch - Veq0 = 1, - Weq0 = 2, -}; - -enum { - Ueq1 = 2, // edges on a quad patch: Ueq0, Veq0, Ueq1, Veq1 - Veq1 = 3, -}; - -#define QUAD_AXES 2 -#define QUAD_EDGES 4 -#define TRI_EDGES 3 - -// The interior can just use a simpler stitch. -typedef enum DIAGONALS { - DIAGONALS_INSIDE_TO_OUTSIDE, - DIAGONALS_INSIDE_TO_OUTSIDE_EXCEPT_MIDDLE, - DIAGONALS_MIRRORED -} DIAGONALS; - -typedef struct TESS_FACTOR_CONTEXT { - FXP fxpInvNumSegmentsOnFloorTessFactor; - FXP fxpInvNumSegmentsOnCeilTessFactor; - FXP fxpHalfTessFactorFraction; - int numHalfTessFactorPoints; - int splitPointOnFloorHalfTessFactor; -} TESS_FACTOR_CONTEXT; - -struct INDEX_PATCH_CONTEXT { - int insidePointIndexDeltaToRealValue; - int insidePointIndexBadValue; - int insidePointIndexReplacementValue; - int outsidePointIndexPatchBase; - int outsidePointIndexDeltaToRealValue; - int outsidePointIndexBadValue; - int outsidePointIndexReplacementValue; -}; - -struct INDEX_PATCH_CONTEXT2 { - int baseIndexToInvert; - int indexInversionEndPoint; - int cornerCaseBadValue; - int cornerCaseReplacementValue; -}; - -struct CHWTessellator { - enum libagx_tess_mode mode; - uint index_bias; - - // array where we will store u/v's for the points we generate - global struct libagx_tess_point *Point; - - // array where we will store index topology - global void *Index; - - // A second index patch we have to do handles the leftover strip of quads in - // the middle of an odd quad patch after finishing all the concentric rings. - // This also handles the leftover strip of points in the middle of an even - // quad patch, when stitching the row of triangles up the left side (V major - // quad) or bottom (U major quad) of the inner ring - bool bUsingPatchedIndices; - bool bUsingPatchedIndices2; - struct INDEX_PATCH_CONTEXT IndexPatchCtx; - struct INDEX_PATCH_CONTEXT2 IndexPatchCtx2; -}; - -#define FXP_INTEGER_BITS 15 -#define FXP_FRACTION_BITS 16 -#define FXP_FRACTION_MASK 0x0000ffff -#define FXP_INTEGER_MASK 0x7fff0000 -#define FXP_ONE (1 << FXP_FRACTION_BITS) -#define FXP_ONE_THIRD 0x00005555 -#define FXP_TWO_THIRDS 0x0000aaaa -#define FXP_ONE_HALF 0x00008000 - -static global float * -tess_factors(constant struct libagx_tess_args *p, uint patch) +KERNEL(64) +libagx_tess_isoline(constant struct poly_tess_args *p, + enum poly_tess_mode mode__2) { - return p->tcs_buffer + (patch * p->tcs_stride_el); -} - -/* - * Generate an indexed draw for a patch with the computed number of indices. 
- * This allocates heap memory for the index buffer, returning the allocated - * memory. - */ -static global void * -libagx_draw(constant struct libagx_tess_args *p, enum libagx_tess_mode mode, - bool lines, uint patch, uint count) -{ - if (mode == LIBAGX_TESS_MODE_COUNT) { - p->counts[patch] = count; - } - - if (mode == LIBAGX_TESS_MODE_WITH_COUNTS) { - /* The index buffer is already allocated, get a pointer inside it. - * p->counts has had an inclusive prefix sum hence the subtraction. - */ - uint offset_el = p->counts[sub_sat(patch, 1u)]; - if (patch == 0) - offset_el = 0; - - return &p->index_buffer[offset_el]; - } - - return NULL; -} - -static void -libagx_draw_points(private struct CHWTessellator *ctx, - constant struct libagx_tess_args *p, uint patch, uint count) -{ - /* For points mode with a single draw, we need to generate a trivial index - * buffer to stuff in the patch ID in the right place. - */ - global uint32_t *indices = libagx_draw(p, ctx->mode, false, patch, count); - - if (ctx->mode == LIBAGX_TESS_MODE_COUNT) - return; - - for (int i = 0; i < count; ++i) { - indices[i] = ctx->index_bias + i; - } -} - -static void -libagx_draw_empty(constant struct libagx_tess_args *p, - enum libagx_tess_mode mode, - uint patch) -{ - if (mode == LIBAGX_TESS_MODE_COUNT) { - p->counts[patch] = 0; - } -} - -/* - * Allocate heap memory for domain points for a patch. The allocation - * is recorded in the coord_allocs[] array, which is in elements. - */ -static global struct libagx_tess_point * -libagx_heap_alloc_points(constant struct libagx_tess_args *p, uint patch, - uint count) -{ - /* If we're recording statistics, increment now. The statistic is for - * tessellation evaluation shader invocations, which is equal to the number - * of domain points generated. - */ - if (p->statistic) { - atomic_fetch_add((volatile atomic_uint *)(p->statistic), count); - } - - uint32_t elsize_B = sizeof(struct libagx_tess_point); - uint32_t alloc_B = agx_heap_alloc_atomic_offs(p->heap, elsize_B * count); - uint32_t alloc_el = alloc_B / elsize_B; - - p->coord_allocs[patch] = alloc_el; - return (global struct libagx_tess_point *)(((uintptr_t)p->heap->base) + - alloc_B); -} - -// Microsoft D3D11 Fixed Function Tessellator Reference - May 7, 2012 -// amar.patel@microsoft.com - -#define LIBAGX_TESS_MIN_ODD_TESSELLATION_FACTOR 1 -#define LIBAGX_TESS_MAX_ODD_TESSELLATION_FACTOR 63 -#define LIBAGX_TESS_MIN_EVEN_TESSELLATION_FACTOR 2 -#define LIBAGX_TESS_MAX_EVEN_TESSELLATION_FACTOR 64 - -// 2^(-16), min positive fixed point fraction -#define EPSILON 0.0000152587890625f -#define MIN_ODD_TESSFACTOR_PLUS_HALF_EPSILON \ - (LIBAGX_TESS_MIN_ODD_TESSELLATION_FACTOR + EPSILON / 2) - -static float clamp_factor(float factor, - enum libagx_tess_partitioning partitioning, - float maxf) -{ - float lower = (partitioning == LIBAGX_TESS_PARTITIONING_FRACTIONAL_EVEN) - ? LIBAGX_TESS_MIN_EVEN_TESSELLATION_FACTOR - : LIBAGX_TESS_MIN_ODD_TESSELLATION_FACTOR; - - float upper = (partitioning == LIBAGX_TESS_PARTITIONING_FRACTIONAL_ODD) - ? LIBAGX_TESS_MAX_ODD_TESSELLATION_FACTOR - : LIBAGX_TESS_MAX_EVEN_TESSELLATION_FACTOR; - - // If any TessFactor will end up > 1 after floatToFixed conversion later, - // then force the inside TessFactors to be > 1 so there is a picture frame. 
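-   // e.g. a fractional_odd factor of exactly 1.0 keeps the lower clamp at
-   // 1.0, but once any factor exceeds 1 + EPSILON/2 the lower clamp becomes
-   // 1 + EPSILON, forcing the picture frame to exist.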
- if (partitioning == LIBAGX_TESS_PARTITIONING_FRACTIONAL_ODD && - maxf > MIN_ODD_TESSFACTOR_PLUS_HALF_EPSILON) { - - lower = LIBAGX_TESS_MIN_ODD_TESSELLATION_FACTOR + EPSILON; - } - - factor = clamp(factor, lower, upper); - - if (partitioning == LIBAGX_TESS_PARTITIONING_INTEGER) { - factor = ceil(factor); - } - - return factor; -} - - -static FXP -floatToFixed(const float input) -{ - return mad(input, FXP_ONE, 0.5f); -} - -static bool -isOdd(const float input) -{ - return ((int)input) & 1; -} - -static FXP -fxpCeil(const FXP input) -{ - if (input & FXP_FRACTION_MASK) { - return (input & FXP_INTEGER_MASK) + FXP_ONE; - } - return input; -} - -static FXP -fxpFloor(const FXP input) -{ - return (input & FXP_INTEGER_MASK); -} - -static int -PatchIndexValue(private struct CHWTessellator *ctx, int index) -{ - if (ctx->bUsingPatchedIndices) { - // assumed remapped outide indices are > remapped inside vertices - if (index >= ctx->IndexPatchCtx.outsidePointIndexPatchBase) { - if (index == ctx->IndexPatchCtx.outsidePointIndexBadValue) - return ctx->IndexPatchCtx.outsidePointIndexReplacementValue; - else - return index + ctx->IndexPatchCtx.outsidePointIndexDeltaToRealValue; - } else { - if (index == ctx->IndexPatchCtx.insidePointIndexBadValue) - return ctx->IndexPatchCtx.insidePointIndexReplacementValue; - else - return index + ctx->IndexPatchCtx.insidePointIndexDeltaToRealValue; - } - } else if (ctx->bUsingPatchedIndices2) { - if (index == ctx->IndexPatchCtx2.cornerCaseBadValue) { - return ctx->IndexPatchCtx2.cornerCaseReplacementValue; - } else if (index >= ctx->IndexPatchCtx2.baseIndexToInvert) { - return ctx->IndexPatchCtx2.indexInversionEndPoint - index; - } - } - - return index; -} - -static void -DefinePoint(global struct libagx_tess_point *out, FXP fxpU, FXP fxpV) -{ - out->u = fxpU; - out->v = fxpV; -} - -static void -DefineIndex(private struct CHWTessellator *ctx, int index, - int indexStorageOffset) -{ - global uint32_t *indices = (global uint32_t *)ctx->Index; - indices[indexStorageOffset] = ctx->index_bias + PatchIndexValue(ctx, index); -} - -static void -DefineTriangle(private struct CHWTessellator *ctx, int index0, int index1, - int index2, int indexStorageBaseOffset) -{ - index0 = PatchIndexValue(ctx, index0); - index1 = PatchIndexValue(ctx, index1); - index2 = PatchIndexValue(ctx, index2); - - vstore3(ctx->index_bias + (uint3)(index0, index1, index2), 0, - (global uint *)ctx->Index + indexStorageBaseOffset); -} - -static uint32_t -RemoveMSB(uint32_t val) -{ - uint32_t bit = val ? (1 << (31 - clz(val))) : 0; - return val & ~bit; -} - -static int -NumPointsForTessFactor(bool odd, FXP fxpTessFactor) -{ - // Add epsilon for rounding and add 1 for odd - FXP f = fxpTessFactor + (odd ? (FXP_ONE + 1) : 1); - int r = fxpCeil(f / 2) >> (FXP_FRACTION_BITS - 1); - return odd ? r : r + 1; -} - -static void -ComputeTessFactorCtx(bool odd, FXP fxpTessFactor, - private TESS_FACTOR_CONTEXT *TessFactorCtx) -{ - // fxpHalfTessFactor == 1/2 if TessFactor is 1, - // but we're pretending we are even. 
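// (For reference, NumPointsForTessFactor() above maps a TessFactor of N to
// N + 1 edge points, one per segment boundary: an even factor of 4.0 yields
// 5 points, an odd factor of 3.0 yields 4.)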
- FXP fxpHalfTessFactor = (fxpTessFactor + 1 /*round*/) / 2; - if (odd || (fxpHalfTessFactor == FXP_ONE_HALF)) { - fxpHalfTessFactor += FXP_ONE_HALF; - } - FXP fxpFloorHalfTessFactor = fxpFloor(fxpHalfTessFactor); - FXP fxpCeilHalfTessFactor = fxpCeil(fxpHalfTessFactor); - TessFactorCtx->fxpHalfTessFactorFraction = fxpHalfTessFactor - fxpFloorHalfTessFactor; - TessFactorCtx->numHalfTessFactorPoints = - (fxpCeilHalfTessFactor >> FXP_FRACTION_BITS); // for EVEN, we don't include the point always - // fixed at the midpoint of the TessFactor - if (fxpCeilHalfTessFactor == fxpFloorHalfTessFactor) { - TessFactorCtx->splitPointOnFloorHalfTessFactor = - /*pick value to cause this to be ignored*/ TessFactorCtx->numHalfTessFactorPoints + 1; - } else if (odd) { - if (fxpFloorHalfTessFactor == FXP_ONE) { - TessFactorCtx->splitPointOnFloorHalfTessFactor = 0; - } else { - TessFactorCtx->splitPointOnFloorHalfTessFactor = - (RemoveMSB((fxpFloorHalfTessFactor >> FXP_FRACTION_BITS) - 1) << 1) + 1; - } - } else { - TessFactorCtx->splitPointOnFloorHalfTessFactor = - (RemoveMSB(fxpFloorHalfTessFactor >> FXP_FRACTION_BITS) << 1) + 1; - } - int numFloorSegments = (fxpFloorHalfTessFactor * 2) >> FXP_FRACTION_BITS; - int numCeilSegments = (fxpCeilHalfTessFactor * 2) >> FXP_FRACTION_BITS; - if (odd) { - numFloorSegments -= 1; - numCeilSegments -= 1; - } - TessFactorCtx->fxpInvNumSegmentsOnFloorTessFactor = - floatToFixed(1.0f / (float)numFloorSegments); - TessFactorCtx->fxpInvNumSegmentsOnCeilTessFactor = - floatToFixed(1.0f / (float)numCeilSegments); -} - -static FXP -PlacePointIn1D(private const TESS_FACTOR_CONTEXT *TessFactorCtx, bool odd, - int point) -{ - bool bFlip = point >= TessFactorCtx->numHalfTessFactorPoints; - - if (bFlip) { - point = (TessFactorCtx->numHalfTessFactorPoints << 1) - point - odd; - } - - // special casing middle since 16 bit fixed math below can't reproduce 0.5 exactly - if (point == TessFactorCtx->numHalfTessFactorPoints) - return FXP_ONE_HALF; - - unsigned int indexOnCeilHalfTessFactor = point; - unsigned int indexOnFloorHalfTessFactor = indexOnCeilHalfTessFactor; - if (point > TessFactorCtx->splitPointOnFloorHalfTessFactor) { - indexOnFloorHalfTessFactor -= 1; - } - // For the fixed point multiplies below, we know the results are <= 16 bits - // because the locations on the halfTessFactor are <= half the number of - // segments for the total TessFactor. So a number divided by a number that - // is at least twice as big will give a result no bigger than 0.5 (which in - // fixed point is 16 bits in our case) - FXP fxpLocationOnFloorHalfTessFactor = - indexOnFloorHalfTessFactor * TessFactorCtx->fxpInvNumSegmentsOnFloorTessFactor; - FXP fxpLocationOnCeilHalfTessFactor = - indexOnCeilHalfTessFactor * TessFactorCtx->fxpInvNumSegmentsOnCeilTessFactor; - - // Since we know the numbers calculated above are <= fixed point 0.5, and the - // equation below is just lerping between two values <= fixed point 0.5 - // (0x00008000), then we know that the final result before shifting by 16 bits - // is no larger than 0x80000000. 
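// (That is, fxpLocation below is a convex blend of the floor- and
// ceil-TessFactor positions weighted by fxpHalfTessFactorFraction: both
// positions are at most 0x8000 and the two weights sum to FXP_ONE.)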
Once we shift that down by 16, we get the - // result of lerping 2 numbers <= 0.5, which is obviously at most 0.5 - // (0x00008000) - FXP fxpLocation = - fxpLocationOnFloorHalfTessFactor * (FXP_ONE - TessFactorCtx->fxpHalfTessFactorFraction) + - fxpLocationOnCeilHalfTessFactor * (TessFactorCtx->fxpHalfTessFactorFraction); - fxpLocation = (fxpLocation + FXP_ONE_HALF /*round*/) >> FXP_FRACTION_BITS; // get back to n.16 - if (bFlip) { - fxpLocation = FXP_ONE - fxpLocation; - } - return fxpLocation; -} - -static void -StitchRegular(private struct CHWTessellator *ctx, bool bTrapezoid, - DIAGONALS diagonals, int baseIndexOffset, int numInsideEdgePoints, - int insideEdgePointBaseOffset, int outsideEdgePointBaseOffset) -{ - int insidePoint = insideEdgePointBaseOffset; - int outsidePoint = outsideEdgePointBaseOffset; - if (bTrapezoid) { - DefineTriangle(ctx, outsidePoint, outsidePoint + 1, insidePoint, - baseIndexOffset); - baseIndexOffset += 3; - outsidePoint++; - } - int p; - switch (diagonals) { - case DIAGONALS_INSIDE_TO_OUTSIDE: - // Diagonals pointing from inside edge forward towards outside edge - for (p = 0; p < numInsideEdgePoints - 1; p++) { - DefineTriangle(ctx, insidePoint, outsidePoint, outsidePoint + 1, - baseIndexOffset); - baseIndexOffset += 3; - - DefineTriangle(ctx, insidePoint, outsidePoint + 1, insidePoint + 1, - baseIndexOffset); - baseIndexOffset += 3; - insidePoint++; - outsidePoint++; - } - break; - case DIAGONALS_INSIDE_TO_OUTSIDE_EXCEPT_MIDDLE: // Assumes ODD tessellation - // Diagonals pointing from outside edge forward towards inside edge - - // First half - for (p = 0; p < numInsideEdgePoints / 2 - 1; p++) { - DefineTriangle(ctx, outsidePoint, outsidePoint + 1, insidePoint, - baseIndexOffset); - baseIndexOffset += 3; - DefineTriangle(ctx, insidePoint, outsidePoint + 1, insidePoint + 1, - baseIndexOffset); - baseIndexOffset += 3; - insidePoint++; - outsidePoint++; - } - - // Middle - DefineTriangle(ctx, outsidePoint, insidePoint + 1, insidePoint, - baseIndexOffset); - baseIndexOffset += 3; - DefineTriangle(ctx, outsidePoint, outsidePoint + 1, insidePoint + 1, - baseIndexOffset); - baseIndexOffset += 3; - insidePoint++; - outsidePoint++; - p += 2; - - // Second half - for (; p < numInsideEdgePoints; p++) { - DefineTriangle(ctx, outsidePoint, outsidePoint + 1, insidePoint, - baseIndexOffset); - baseIndexOffset += 3; - DefineTriangle(ctx, insidePoint, outsidePoint + 1, insidePoint + 1, - baseIndexOffset); - baseIndexOffset += 3; - insidePoint++; - outsidePoint++; - } - break; - case DIAGONALS_MIRRORED: - // First half, diagonals pointing from outside of outside edge to inside of - // inside edge - for (p = 0; p < numInsideEdgePoints / 2; p++) { - DefineTriangle(ctx, outsidePoint, insidePoint + 1, insidePoint, - baseIndexOffset); - baseIndexOffset += 3; - DefineTriangle(ctx, outsidePoint, outsidePoint + 1, insidePoint + 1, - baseIndexOffset); - baseIndexOffset += 3; - insidePoint++; - outsidePoint++; - } - // Second half, diagonals pointing from inside of inside edge to outside of - // outside edge - for (; p < numInsideEdgePoints - 1; p++) { - DefineTriangle(ctx, insidePoint, outsidePoint, outsidePoint + 1, - baseIndexOffset); - baseIndexOffset += 3; - DefineTriangle(ctx, insidePoint, outsidePoint + 1, insidePoint + 1, - baseIndexOffset); - baseIndexOffset += 3; - insidePoint++; - outsidePoint++; - } - break; - } - if (bTrapezoid) { - DefineTriangle(ctx, outsidePoint, outsidePoint + 1, insidePoint, - baseIndexOffset); - baseIndexOffset += 3; - } -} - -// loop_start and 
loop_end give optimal loop bounds for
-// the stitching algorithm further below, for any given halfTessFactor. There
-// is probably a better way to encode this...
-//
-// Return the FIRST entry in finalPointPositionTable which is less than
-// halfTessFactor, except entry 0 and 1 which are set up to skip the loop.
-static int
-loop_start(int N)
-{
-   if (N < 2)
-      return 1;
-   else if (N == 2)
-      return 17;
-   else if (N < 5)
-      return 9;
-   else if (N < 9)
-      return 5;
-   else if (N < 17)
-      return 3;
-   else
-      return 2;
-}
-
-// Return the LAST entry in finalPointPositionTable[] which is less than
-// halfTessFactor, except entry 0 and 1 which are set up to skip the loop.
-static int
-loop_end(int N)
-{
-   if (N < 2)
-      return 0;
-   else if (N < 4)
-      return 17;
-   else if (N < 8)
-      return 25;
-   else if (N < 16)
-      return 29;
-   else if (N < 32)
-      return 31;
-   else
-      return 32;
-}
-
-// Tables to assist in the stitching of 2 rows of points having arbitrary
-// TessFactors. The stitching order is governed by Ruler Function vertex
-// split ordering (see external documentation).
-//
-// The contents of the finalPointPositionTable are where vertex i [0..33]
-// ends up on the half-edge at the max tessellation amount given
-// ruler-function split order. Recall the other half of an edge is mirrored,
-// so we only need to deal with one half. This table is used to decide when
-// to advance a point on the interior or exterior. It supports odd TessFactor
-// up to 65 and even TessFactor up to 64.
-
-/* TODO: Is this actually faster than a LUT? */
-static uint32_t
-finalPointPositionTable(uint32_t x)
-{
-   if (x == 0)
-      return 0;
-   if (x == 1)
-      return 0x20;
-
-   uint32_t shift;
-   if ((x & 1) == 0) {
-      shift = 1;
-   } else if ((x & 3) == 3) {
-      shift = 2;
-   } else if ((x & 7) == 5) {
-      shift = 3;
-   } else if (x != 17) {
-      shift = 4;
-   } else {
-      shift = 5;
-   }
-
-   // SWAR vectorized right-shift of (0x20, x)
-   // We're calculating `min(0xf, 0x20 >> shift) + (x >> shift)`.
-   uint32_t items_to_shift = x | (0x20 << 16);
-   uint32_t shifted = items_to_shift >> shift;
-
-   uint32_t bias = min(0xfu, shifted >> 16);
-   return bias + (shifted & 0xffff);
-}
-
-static void
-StitchTransition(private struct CHWTessellator *ctx, int baseIndexOffset,
-                 int insideEdgePointBaseOffset,
-                 int insideNumHalfTessFactorPoints,
-                 bool insideEdgeTessFactorOdd, int outsideEdgePointBaseOffset,
-                 int outsideNumHalfTessFactorPoints, bool outsideTessFactorOdd)
-{
-   if (insideEdgeTessFactorOdd) {
-      insideNumHalfTessFactorPoints -= 1;
-   }
-   if (outsideTessFactorOdd) {
-      outsideNumHalfTessFactorPoints -= 1;
-   }
-   // Walk first half
-   int outsidePoint = outsideEdgePointBaseOffset;
-   int insidePoint = insideEdgePointBaseOffset;
-
-   // iStart,iEnd are a small optimization so the loop below doesn't have to go
-   // from 0 up to 31
-   int iStart = min(loop_start(insideNumHalfTessFactorPoints),
-                    loop_start(outsideNumHalfTessFactorPoints));
-   int iEnd = loop_end(
-      max(insideNumHalfTessFactorPoints, outsideNumHalfTessFactorPoints));
-
-   // since we don't start the loop at 0 below, we need a special case.
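// (Evaluating finalPointPositionTable() above gives {0, 32, 16, 8, 17, 4,
// 18, 9, 19, 2, ...}; each iteration of the loop below advances the inside
// point, the outside point, or both, whenever the table value is under the
// corresponding numHalfTessFactorPoints, interleaving the two rows in
// ruler order.)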
- if (0 < outsideNumHalfTessFactorPoints) { - // Advance outside - DefineTriangle(ctx, outsidePoint, outsidePoint + 1, insidePoint, - baseIndexOffset); - baseIndexOffset += 3; - outsidePoint++; - } - - for (int i = iStart; i <= iEnd; i++) { - int bound = finalPointPositionTable(i); - - if (bound < insideNumHalfTessFactorPoints) { - // Advance inside - DefineTriangle(ctx, insidePoint, outsidePoint, insidePoint + 1, - baseIndexOffset); - baseIndexOffset += 3; - insidePoint++; - } - if (bound < outsideNumHalfTessFactorPoints) { - // Advance outside - DefineTriangle(ctx, outsidePoint, outsidePoint + 1, insidePoint, - baseIndexOffset); - baseIndexOffset += 3; - outsidePoint++; - } - } - - if ((insideEdgeTessFactorOdd != outsideTessFactorOdd) || - insideEdgeTessFactorOdd) { - if (insideEdgeTessFactorOdd == outsideTessFactorOdd) { - // Quad in the middle - DefineTriangle(ctx, insidePoint, outsidePoint, insidePoint + 1, - baseIndexOffset); - baseIndexOffset += 3; - DefineTriangle(ctx, insidePoint + 1, outsidePoint, outsidePoint + 1, - baseIndexOffset); - baseIndexOffset += 3; - insidePoint++; - outsidePoint++; - } else if (!insideEdgeTessFactorOdd) { - // Triangle pointing inside - DefineTriangle(ctx, insidePoint, outsidePoint, outsidePoint + 1, - baseIndexOffset); - baseIndexOffset += 3; - outsidePoint++; - } else { - // Triangle pointing outside - DefineTriangle(ctx, insidePoint, outsidePoint, insidePoint + 1, - baseIndexOffset); - baseIndexOffset += 3; - insidePoint++; - } - } - - // Walk second half. - for (int i = iEnd; i >= iStart; i--) { - int bound = finalPointPositionTable(i); - - if (bound < outsideNumHalfTessFactorPoints) { - // Advance outside - DefineTriangle(ctx, outsidePoint, outsidePoint + 1, insidePoint, - baseIndexOffset); - baseIndexOffset += 3; - outsidePoint++; - } - if (bound < insideNumHalfTessFactorPoints) { - // Advance inside - DefineTriangle(ctx, insidePoint, outsidePoint, insidePoint + 1, - baseIndexOffset); - baseIndexOffset += 3; - insidePoint++; - } - } - // Below case is not needed if we didn't optimize loop above and made it run - // from 31 down to 0. - if (0 < outsideNumHalfTessFactorPoints) { - DefineTriangle(ctx, outsidePoint, outsidePoint + 1, insidePoint, - baseIndexOffset); - baseIndexOffset += 3; - outsidePoint++; - } + uint patch = cl_global_id.x; + poly_tess_isoline_process(p, patch, mode__2); } KERNEL(64) -libagx_tess_isoline(constant struct libagx_tess_args *p, - enum libagx_tess_mode mode__2) +libagx_tess_tri(constant struct poly_tess_args *p, enum poly_tess_mode mode__2) { - enum libagx_tess_mode mode = mode__2; uint patch = cl_global_id.x; - enum libagx_tess_partitioning partitioning = p->partitioning; - - bool lineDensityOdd; - bool lineDetailOdd; - TESS_FACTOR_CONTEXT lineDensityTessFactorCtx; - TESS_FACTOR_CONTEXT lineDetailTessFactorCtx; - - global float *factors = tess_factors(p, patch); - float TessFactor_V_LineDensity = factors[0]; - float TessFactor_U_LineDetail = factors[1]; - - // Is the patch culled? NaN will pass. 
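// (Deliberately !(factor > 0) rather than factor <= 0: comparisons against
// NaN are false, so a NaN tess factor also culls the patch.)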
- if (!(TessFactor_V_LineDensity > 0) || !(TessFactor_U_LineDetail > 0)) { - libagx_draw_empty(p, mode, patch); - return; - } - - // Clamp edge TessFactors - TessFactor_V_LineDensity = - clamp(TessFactor_V_LineDensity, - LIBAGX_TESS_MIN_ISOLINE_DENSITY_TESSELLATION_FACTOR, - LIBAGX_TESS_MAX_ISOLINE_DENSITY_TESSELLATION_FACTOR); - TessFactor_U_LineDetail = - clamp_factor(TessFactor_U_LineDetail, partitioning, 0); - - // Process tessFactors - if (partitioning == LIBAGX_TESS_PARTITIONING_INTEGER) { - lineDetailOdd = isOdd(TessFactor_U_LineDetail); - } else { - lineDetailOdd = (partitioning == LIBAGX_TESS_PARTITIONING_FRACTIONAL_ODD); - } - - FXP fxpTessFactor_U_LineDetail = floatToFixed(TessFactor_U_LineDetail); - - ComputeTessFactorCtx(lineDetailOdd, fxpTessFactor_U_LineDetail, - &lineDetailTessFactorCtx); - int numPointsPerLine = - NumPointsForTessFactor(lineDetailOdd, fxpTessFactor_U_LineDetail); - - TessFactor_V_LineDensity = ceil(TessFactor_V_LineDensity); - lineDensityOdd = isOdd(TessFactor_V_LineDensity); - FXP fxpTessFactor_V_LineDensity = floatToFixed(TessFactor_V_LineDensity); - ComputeTessFactorCtx(lineDensityOdd, fxpTessFactor_V_LineDensity, - &lineDensityTessFactorCtx); - - // don't draw last line at V == 1. - int numLines = - NumPointsForTessFactor(lineDensityOdd, fxpTessFactor_V_LineDensity) - 1; - - /* Points */ - uint num_points = numPointsPerLine * numLines; - if (mode != LIBAGX_TESS_MODE_COUNT) { - global struct libagx_tess_point *points = - libagx_heap_alloc_points(p, patch, num_points); - - for (int line = 0, pointOffset = 0; line < numLines; line++) { - FXP fxpV = - PlacePointIn1D(&lineDensityTessFactorCtx, lineDensityOdd, line); - - for (int point = 0; point < numPointsPerLine; point++) { - FXP fxpU = - PlacePointIn1D(&lineDetailTessFactorCtx, lineDetailOdd, point); - - DefinePoint(&points[pointOffset++], fxpU, fxpV); - } - } - } - - struct CHWTessellator ctx = { - .mode = mode, - .index_bias = patch * LIBAGX_TES_PATCH_ID_STRIDE, - }; - - /* Connectivity */ - if (!p->points_mode) { - uint num_indices = numLines * (numPointsPerLine - 1) * 2; - ctx.Index = libagx_draw(p, mode, true, patch, num_indices); - - if (mode == LIBAGX_TESS_MODE_COUNT) - return; - - for (int line = 0, pointOffset = 0, indexOffset = 0; line < numLines; - line++) { - pointOffset++; - - for (int point = 1; point < numPointsPerLine; point++) { - DefineIndex(&ctx, pointOffset - 1, indexOffset++); - DefineIndex(&ctx, pointOffset, indexOffset++); - pointOffset++; - } - } - } else { - libagx_draw_points(&ctx, p, patch, num_points); - } + poly_tess_tri_process(p, patch, mode__2); } KERNEL(64) -libagx_tess_tri(constant struct libagx_tess_args *p, - enum libagx_tess_mode mode__2) +libagx_tess_quad(constant struct poly_tess_args *p, enum poly_tess_mode mode__2) { - enum libagx_tess_mode mode = mode__2; uint patch = cl_global_id.x; - enum libagx_tess_partitioning partitioning = p->partitioning; - - global float *factors = tess_factors(p, patch); - float tessFactor_Ueq0 = factors[0]; - float tessFactor_Veq0 = factors[1]; - float tessFactor_Weq0 = factors[2]; - float insideTessFactor_f = factors[4]; - - struct CHWTessellator ctx = { - .mode = mode, - .index_bias = patch * LIBAGX_TES_PATCH_ID_STRIDE, - }; - - // Is the patch culled? NaN will pass. 
- if (!(tessFactor_Ueq0 > 0) || !(tessFactor_Veq0 > 0) || - !(tessFactor_Weq0 > 0)) { - - libagx_draw_empty(p, mode, patch); - - return; - } - - FXP outsideTessFactor[TRI_EDGES]; - FXP insideTessFactor; - bool outsideTessFactorOdd[TRI_EDGES]; - bool insideTessFactorOdd; - TESS_FACTOR_CONTEXT outsideTessFactorCtx[TRI_EDGES]; - TESS_FACTOR_CONTEXT insideTessFactorCtx; - // Stuff below is just specific to the traversal order - // this code happens to use to generate points/lines - int numPointsForOutsideEdge[TRI_EDGES]; - int numPointsForInsideTessFactor; - int insideEdgePointBaseOffset; - - // Clamp TessFactors - tessFactor_Ueq0 = clamp_factor(tessFactor_Ueq0, partitioning, 0); - tessFactor_Veq0 = clamp_factor(tessFactor_Veq0, partitioning, 0); - tessFactor_Weq0 = clamp_factor(tessFactor_Weq0, partitioning, 0); - - float maxf = max(max(tessFactor_Ueq0, tessFactor_Veq0), tessFactor_Weq0); - insideTessFactor_f = clamp_factor(insideTessFactor_f, partitioning, maxf); - // Note the above clamps map NaN to the lower bound - - // Process tessFactors - float outsideTessFactor_f[TRI_EDGES] = {tessFactor_Ueq0, tessFactor_Veq0, - tessFactor_Weq0}; - if (partitioning == LIBAGX_TESS_PARTITIONING_INTEGER) { - for (int edge = 0; edge < TRI_EDGES; edge++) { - outsideTessFactorOdd[edge] = isOdd(outsideTessFactor_f[edge]); - } - insideTessFactorOdd = - isOdd(insideTessFactor_f) && (1.0f != insideTessFactor_f); - } else { - bool odd = (partitioning == LIBAGX_TESS_PARTITIONING_FRACTIONAL_ODD); - - for (int edge = 0; edge < TRI_EDGES; edge++) { - outsideTessFactorOdd[edge] = odd; - } - insideTessFactorOdd = odd; - } - - // Save fixed point TessFactors - for (int edge = 0; edge < TRI_EDGES; edge++) { - outsideTessFactor[edge] = floatToFixed(outsideTessFactor_f[edge]); - } - insideTessFactor = floatToFixed(insideTessFactor_f); - - if (partitioning != LIBAGX_TESS_PARTITIONING_FRACTIONAL_EVEN) { - // Special case if all TessFactors are 1 - if ((FXP_ONE == insideTessFactor) && - (FXP_ONE == outsideTessFactor[Ueq0]) && - (FXP_ONE == outsideTessFactor[Veq0]) && - (FXP_ONE == outsideTessFactor[Weq0])) { - - /* Just do minimum tess factor */ - if (mode == LIBAGX_TESS_MODE_COUNT) { - p->counts[patch] = 3; - return; - } - - global struct libagx_tess_point *points = - libagx_heap_alloc_points(p, patch, 3); - - DefinePoint(&points[0], 0, - FXP_ONE); // V=1 (beginning of Ueq0 edge VW) - DefinePoint(&points[1], 0, 0); // W=1 (beginning of Veq0 edge WU) - DefinePoint(&points[2], FXP_ONE, - 0); // U=1 (beginning of Weq0 edge UV) - - if (!p->points_mode) { - ctx.Index = libagx_draw(p, mode, false, patch, 3); - - DefineTriangle(&ctx, 0, 1, 2, - /*indexStorageBaseOffset*/ 0); - } else { - libagx_draw_points(&ctx, p, patch, 3); - } - - return; - } - } - - // Compute per-TessFactor metadata - for (int edge = 0; edge < TRI_EDGES; edge++) { - ComputeTessFactorCtx(outsideTessFactorOdd[edge], outsideTessFactor[edge], - &outsideTessFactorCtx[edge]); - } - ComputeTessFactorCtx(insideTessFactorOdd, insideTessFactor, - &insideTessFactorCtx); - - // Compute some initial data. - int NumPoints = 0; - - // outside edge offsets and storage - for (int edge = 0; edge < TRI_EDGES; edge++) { - numPointsForOutsideEdge[edge] = NumPointsForTessFactor( - outsideTessFactorOdd[edge], outsideTessFactor[edge]); - NumPoints += numPointsForOutsideEdge[edge]; - } - NumPoints -= 3; - - // inside edge offsets - numPointsForInsideTessFactor = - NumPointsForTessFactor(insideTessFactorOdd, insideTessFactor); - { - int pointCountMin = insideTessFactorOdd ? 
4 : 3; - // max() allows degenerate transition regions when inside TessFactor == 1 - numPointsForInsideTessFactor = - max(pointCountMin, numPointsForInsideTessFactor); - } - - insideEdgePointBaseOffset = NumPoints; - - // inside storage, including interior edges above - { - int interiorRings = (numPointsForInsideTessFactor >> 1) - 1; - int even = insideTessFactorOdd ? 0 : 1; - NumPoints += TRI_EDGES * (interiorRings * (interiorRings + even)) + even; - } - - /* GENERATE POINTS */ - if (mode != LIBAGX_TESS_MODE_COUNT) { - ctx.Point = libagx_heap_alloc_points(p, patch, NumPoints); - - // Generate exterior ring edge points, clockwise starting from point V - // (VW, the U==0 edge) - int pointOffset = 0; - for (int edge = 0; edge < TRI_EDGES; edge++) { - int odd = edge & 0x1; - int endPoint = numPointsForOutsideEdge[edge] - 1; - // don't include end, since next edge starts with it. - for (int p = 0; p < endPoint; p++, pointOffset++) { - // whether to reverse point order given we are defining V or U (W - // implicit): edge0, VW, has V decreasing, so reverse 1D points - // below edge1, WU, has U increasing, so don't reverse 1D points - // below edge2, UV, has U decreasing, so reverse 1D points below - int q = odd ? p : endPoint - p; - - FXP fxpParam = PlacePointIn1D(&outsideTessFactorCtx[edge], - outsideTessFactorOdd[edge], q); - DefinePoint(&ctx.Point[pointOffset], (edge == 0) ? 0 : fxpParam, - (edge == 0) ? fxpParam - : (edge == 2) ? FXP_ONE - fxpParam - : 0); - } - } - - // Generate interior ring points, clockwise spiralling in - int numRings = (numPointsForInsideTessFactor >> 1); - for (int ring = 1; ring < numRings; ring++) { - int startPoint = ring; - int endPoint = numPointsForInsideTessFactor - 1 - startPoint; - - int perpendicularAxisPoint = startPoint; - FXP fxpPerpParam = PlacePointIn1D( - &insideTessFactorCtx, insideTessFactorOdd, perpendicularAxisPoint); - - // Map location to the right size in - // barycentric space. We know this fixed - // point math won't over/underflow - fxpPerpParam *= FXP_TWO_THIRDS; - fxpPerpParam = (fxpPerpParam + FXP_ONE_HALF /*round*/) >> - FXP_FRACTION_BITS; // get back to n.16 - - for (int edge = 0; edge < TRI_EDGES; edge++) { - int odd = edge & 0x1; - - // don't include end: next edge starts with it. - for (int p = startPoint; p < endPoint; p++, pointOffset++) { - // whether to reverse point given we are defining V or U (W - // implicit): edge0, VW, has V decreasing, so reverse 1D points - // below edge1, WU, has U increasing, so don't reverse 1D points - // below edge2, UV, has U decreasing, so reverse 1D points below - int q = odd ? p : endPoint - (p - startPoint); - - FXP fxpParam = - PlacePointIn1D(&insideTessFactorCtx, insideTessFactorOdd, q); - // edge0 VW, has perpendicular parameter U constant - // edge1 WU, has perpendicular parameter V constant - // edge2 UV, has perpendicular parameter W constant - // reciprocal is the rate of change of edge-parallel parameters - // as they are pushed into the triangle - const unsigned int deriv = 2; - - // we know this fixed point math won't over/underflow - FXP tmp = fxpParam - (fxpPerpParam + 1 /*round*/) / deriv; - - DefinePoint(&ctx.Point[pointOffset], - edge > 0 ? tmp : fxpPerpParam, - edge == 0 ? tmp - : edge == 1 ? fxpPerpParam - : FXP_ONE - tmp - fxpPerpParam); - } - } - } - if (!insideTessFactorOdd) { - // Last point is the point at the center. 
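// (u = v = 1/3 with the implicit w = 1/3, the centroid of the barycentric
// domain; FXP_ONE_THIRD is 0x5555, the closest 16-bit fraction to 1/3.)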
- DefinePoint(&ctx.Point[pointOffset], FXP_ONE_THIRD, FXP_ONE_THIRD); - } - } - - if (p->points_mode) { - libagx_draw_points(&ctx, p, patch, NumPoints); - return; - } - - { - // Generate primitives for all the concentric rings, one side at a time - // for each ring +1 is so even tess includes the center point, which we - // want to now - int numRings = ((numPointsForInsideTessFactor + 1) >> 1); - - int NumIndices = 0; - { - int OuterPoints = numPointsForOutsideEdge[0] + - numPointsForOutsideEdge[1] + - numPointsForOutsideEdge[2]; - - int numRings18 = numRings * 18; - NumIndices = ((numRings18 - 27) * numPointsForInsideTessFactor) + - (3 * OuterPoints) - (numRings18 * (numRings - 1)) + - (insideTessFactorOdd ? 3 : 0); - } - - // Generate the draw and allocate the index buffer now that we know the size - ctx.Index = libagx_draw(p, mode, false, patch, NumIndices); - - if (mode == LIBAGX_TESS_MODE_COUNT) - return; - - int insideOffset = insideEdgePointBaseOffset; - int outsideEdgePointBaseOffset = 0; - - NumIndices = 0; - for (int ring = 1; ring < numRings; ring++) { - int numPointsForInsideEdge = numPointsForInsideTessFactor - 2 * ring; - int edge0InsidePointBaseOffset = insideOffset; - int edge0OutsidePointBaseOffset = outsideEdgePointBaseOffset; - for (int edge = 0; edge < TRI_EDGES; edge++) { - int outsidePoints = ring == 1 ? numPointsForOutsideEdge[edge] - : (numPointsForInsideEdge + 2); - - int numTriangles = numPointsForInsideEdge + outsidePoints - 2; - - int insideBaseOffset; - int outsideBaseOffset; - if (edge == 2) { - ctx.IndexPatchCtx.insidePointIndexDeltaToRealValue = - insideOffset; - ctx.IndexPatchCtx.insidePointIndexBadValue = - numPointsForInsideEdge - 1; - ctx.IndexPatchCtx.insidePointIndexReplacementValue = - edge0InsidePointBaseOffset; - ctx.IndexPatchCtx.outsidePointIndexPatchBase = - ctx.IndexPatchCtx.insidePointIndexBadValue + - 1; // past inside patched index range - ctx.IndexPatchCtx.outsidePointIndexDeltaToRealValue = - outsideEdgePointBaseOffset - - ctx.IndexPatchCtx.outsidePointIndexPatchBase; - ctx.IndexPatchCtx.outsidePointIndexBadValue = - ctx.IndexPatchCtx.outsidePointIndexPatchBase + outsidePoints - - 1; - ctx.IndexPatchCtx.outsidePointIndexReplacementValue = - edge0OutsidePointBaseOffset; - ctx.bUsingPatchedIndices = true; - insideBaseOffset = 0; - outsideBaseOffset = ctx.IndexPatchCtx.outsidePointIndexPatchBase; - } else { - insideBaseOffset = insideOffset; - outsideBaseOffset = outsideEdgePointBaseOffset; - } - if (ring == 1) { - StitchTransition( - &ctx, /*baseIndexOffset: */ NumIndices, insideBaseOffset, - insideTessFactorCtx.numHalfTessFactorPoints, - insideTessFactorOdd, outsideBaseOffset, - outsideTessFactorCtx[edge].numHalfTessFactorPoints, - outsideTessFactorOdd[edge]); - } else { - StitchRegular(&ctx, /*bTrapezoid*/ true, DIAGONALS_MIRRORED, - /*baseIndexOffset: */ NumIndices, - numPointsForInsideEdge, insideBaseOffset, - outsideBaseOffset); - } - if (2 == edge) { - ctx.bUsingPatchedIndices = false; - } - NumIndices += numTriangles * 3; - outsideEdgePointBaseOffset += outsidePoints - 1; - insideOffset += numPointsForInsideEdge - 1; - } - } - if (insideTessFactorOdd) { - // Triangulate center (a single triangle) - DefineTriangle(&ctx, outsideEdgePointBaseOffset, - outsideEdgePointBaseOffset + 1, - outsideEdgePointBaseOffset + 2, NumIndices); - NumIndices += 3; - } - } -} - -KERNEL(64) -libagx_tess_quad(constant struct libagx_tess_args *p, - enum libagx_tess_mode mode__2) -{ - enum libagx_tess_mode mode = mode__2; - uint patch = cl_global_id.x; - enum 
libagx_tess_partitioning partitioning = p->partitioning; - global float *factors = tess_factors(p, patch); - - float tessFactor_Ueq0 = factors[0]; - float tessFactor_Veq0 = factors[1]; - float tessFactor_Ueq1 = factors[2]; - float tessFactor_Veq1 = factors[3]; - - float insideTessFactor_U = factors[4]; - float insideTessFactor_V = factors[5]; - - struct CHWTessellator ctx = { - .mode = mode, - .index_bias = patch * LIBAGX_TES_PATCH_ID_STRIDE, - }; - - // Is the patch culled? - if (!(tessFactor_Ueq0 > 0) || // NaN will pass - !(tessFactor_Veq0 > 0) || !(tessFactor_Ueq1 > 0) || - !(tessFactor_Veq1 > 0)) { - libagx_draw_empty(p, mode, patch); - return; - } - - FXP outsideTessFactor[QUAD_EDGES]; - FXP insideTessFactor[QUAD_AXES]; - bool outsideTessFactorOdd[QUAD_EDGES]; - bool insideTessFactorOdd[QUAD_AXES]; - TESS_FACTOR_CONTEXT outsideTessFactorCtx[QUAD_EDGES]; - TESS_FACTOR_CONTEXT insideTessFactorCtx[QUAD_AXES]; - // Stuff below is just specific to the traversal order - // this code happens to use to generate points/lines - int numPointsForOutsideEdge[QUAD_EDGES]; - int numPointsForInsideTessFactor[QUAD_AXES]; - int insideEdgePointBaseOffset; - - // Clamp edge TessFactors - tessFactor_Ueq0 = clamp_factor(tessFactor_Ueq0, partitioning, 0); - tessFactor_Veq0 = clamp_factor(tessFactor_Veq0, partitioning, 0); - tessFactor_Ueq1 = clamp_factor(tessFactor_Ueq1, partitioning, 0); - tessFactor_Veq1 = clamp_factor(tessFactor_Veq1, partitioning, 0); - - float maxf = max(max(max(tessFactor_Ueq0, tessFactor_Veq0), - max(tessFactor_Ueq1, tessFactor_Veq1)), - max(insideTessFactor_U, insideTessFactor_V)); - - insideTessFactor_U = clamp_factor(insideTessFactor_U, partitioning, maxf); - insideTessFactor_V = clamp_factor(insideTessFactor_V, partitioning, maxf); - // Note the above clamps map NaN to lowerBound - - // Process tessFactors - float outsideTessFactor_f[QUAD_EDGES] = {tessFactor_Ueq0, tessFactor_Veq0, - tessFactor_Ueq1, tessFactor_Veq1}; - float insideTessFactor_f[QUAD_AXES] = {insideTessFactor_U, - insideTessFactor_V}; - if (partitioning == LIBAGX_TESS_PARTITIONING_INTEGER) { - for (int edge = 0; edge < QUAD_EDGES; edge++) { - outsideTessFactorOdd[edge] = isOdd(outsideTessFactor_f[edge]); - } - for (int axis = 0; axis < QUAD_AXES; axis++) { - insideTessFactorOdd[axis] = isOdd(insideTessFactor_f[axis]) && - (1.0f != insideTessFactor_f[axis]); - } - } else { - bool odd = (partitioning == LIBAGX_TESS_PARTITIONING_FRACTIONAL_ODD); - - for (int edge = 0; edge < QUAD_EDGES; edge++) { - outsideTessFactorOdd[edge] = odd; - } - insideTessFactorOdd[U] = insideTessFactorOdd[V] = odd; - } - - // Save fixed point TessFactors - for (int edge = 0; edge < QUAD_EDGES; edge++) { - outsideTessFactor[edge] = floatToFixed(outsideTessFactor_f[edge]); - } - for (int axis = 0; axis < QUAD_AXES; axis++) { - insideTessFactor[axis] = floatToFixed(insideTessFactor_f[axis]); - } - - if (partitioning != LIBAGX_TESS_PARTITIONING_FRACTIONAL_EVEN) { - // Special case if all TessFactors are 1 - if ((FXP_ONE == insideTessFactor[U]) && - (FXP_ONE == insideTessFactor[V]) && - (FXP_ONE == outsideTessFactor[Ueq0]) && - (FXP_ONE == outsideTessFactor[Veq0]) && - (FXP_ONE == outsideTessFactor[Ueq1]) && - (FXP_ONE == outsideTessFactor[Veq1])) { - - /* Just do minimum tess factor */ - if (!p->points_mode) { - ctx.Index = libagx_draw(p, mode, false, patch, 6); - if (mode == LIBAGX_TESS_MODE_COUNT) - return; - - DefineTriangle(&ctx, 0, 1, 3, /*indexStorageOffset*/ 0); - DefineTriangle(&ctx, 1, 2, 3, /*indexStorageOffset*/ 3); - } else { - 
libagx_draw_points(&ctx, p, patch, 4); - if (mode == LIBAGX_TESS_MODE_COUNT) - return; - } - - global struct libagx_tess_point *points = - libagx_heap_alloc_points(p, patch, 4); - - DefinePoint(&points[0], 0, 0); - DefinePoint(&points[1], FXP_ONE, 0); - DefinePoint(&points[2], FXP_ONE, FXP_ONE); - DefinePoint(&points[3], 0, FXP_ONE); - return; - } - } - - // Compute TessFactor-specific metadata - for (int edge = 0; edge < QUAD_EDGES; edge++) { - ComputeTessFactorCtx(outsideTessFactorOdd[edge], outsideTessFactor[edge], - &outsideTessFactorCtx[edge]); - } - - for (int axis = 0; axis < QUAD_AXES; axis++) { - ComputeTessFactorCtx(insideTessFactorOdd[axis], insideTessFactor[axis], - &insideTessFactorCtx[axis]); - } - - int NumPoints = 0; - - // outside edge offsets and storage - for (int edge = 0; edge < QUAD_EDGES; edge++) { - numPointsForOutsideEdge[edge] = NumPointsForTessFactor( - outsideTessFactorOdd[edge], outsideTessFactor[edge]); - NumPoints += numPointsForOutsideEdge[edge]; - } - NumPoints -= 4; - - // inside edge offsets - for (int axis = 0; axis < QUAD_AXES; axis++) { - numPointsForInsideTessFactor[axis] = NumPointsForTessFactor( - insideTessFactorOdd[axis], insideTessFactor[axis]); - int pointCountMin = insideTessFactorOdd[axis] ? 4 : 3; - // max() allows degenerate transition regions when inside TessFactor == 1 - numPointsForInsideTessFactor[axis] = - max(pointCountMin, numPointsForInsideTessFactor[axis]); - } - - insideEdgePointBaseOffset = NumPoints; - - // inside storage, including interior edges above - int numInteriorPoints = (numPointsForInsideTessFactor[U] - 2) * - (numPointsForInsideTessFactor[V] - 2); - NumPoints += numInteriorPoints; - - if (mode != LIBAGX_TESS_MODE_COUNT) { - ctx.Point = libagx_heap_alloc_points(p, patch, NumPoints); - - // Generate exterior ring edge points, clockwise from top-left - int pointOffset = 0; - for (int edge = 0; edge < QUAD_EDGES; edge++) { - int odd = edge & 0x1; - // don't include end, since next edge starts with it. - int endPoint = numPointsForOutsideEdge[edge] - 1; - for (int p = 0; p < endPoint; p++, pointOffset++) { - int q = - ((edge == 1) || (edge == 2)) ? p : endPoint - p; // reverse order - FXP fxpParam = PlacePointIn1D(&outsideTessFactorCtx[edge], - outsideTessFactorOdd[edge], q); - - FXP u = odd ? fxpParam : ((edge == 2) ? FXP_ONE : 0); - FXP v = odd ? ((edge == 3) ? FXP_ONE : 0) : fxpParam; - DefinePoint(&ctx.Point[pointOffset], u, v); - } - } - - // Generate interior ring points, clockwise from (U==0,V==1) (bottom-left) - // spiralling toward center - int minNumPointsForTessFactor = - min(numPointsForInsideTessFactor[U], numPointsForInsideTessFactor[V]); - // note for even tess we aren't counting center point here. - int numRings = (minNumPointsForTessFactor >> 1); - - for (int ring = 1; ring < numRings; ring++) { - int startPoint = ring; - int endPoint[QUAD_AXES] = { - numPointsForInsideTessFactor[U] - 1 - startPoint, - numPointsForInsideTessFactor[V] - 1 - startPoint, - }; - - for (int edge = 0; edge < QUAD_EDGES; edge++) { - int odd[QUAD_AXES] = {edge & 0x1, ((edge + 1) & 0x1)}; - int perpendicularAxisPoint = - (edge < 2) ? startPoint : endPoint[odd[0]]; - FXP fxpPerpParam = PlacePointIn1D(&insideTessFactorCtx[odd[0]], - insideTessFactorOdd[odd[0]], - perpendicularAxisPoint); - - for (int p = startPoint; p < endPoint[odd[1]]; p++, - pointOffset++) // don't include end: next edge starts with - // it. - { - bool odd_ = odd[1]; - int q = ((edge == 1) || (edge == 2)) - ? 
p - : endPoint[odd_] - (p - startPoint); - FXP fxpParam = PlacePointIn1D(&insideTessFactorCtx[odd_], - insideTessFactorOdd[odd_], q); - DefinePoint(&ctx.Point[pointOffset], - odd_ ? fxpPerpParam : fxpParam, - odd_ ? fxpParam : fxpPerpParam); - } - } - } - // For even tessellation, the inner "ring" is degenerate - a row of points - if ((numPointsForInsideTessFactor[U] > numPointsForInsideTessFactor[V]) && - !insideTessFactorOdd[V]) { - int startPoint = numRings; - int endPoint = numPointsForInsideTessFactor[U] - 1 - startPoint; - for (int p = startPoint; p <= endPoint; p++, pointOffset++) { - FXP fxpParam = PlacePointIn1D(&insideTessFactorCtx[U], - insideTessFactorOdd[U], p); - DefinePoint(&ctx.Point[pointOffset], fxpParam, FXP_ONE_HALF); - } - } else if ((numPointsForInsideTessFactor[V] >= - numPointsForInsideTessFactor[U]) && - !insideTessFactorOdd[U]) { - int startPoint = numRings; - int endPoint = numPointsForInsideTessFactor[V] - 1 - startPoint; - for (int p = endPoint; p >= startPoint; p--, pointOffset++) { - FXP fxpParam = PlacePointIn1D(&insideTessFactorCtx[V], - insideTessFactorOdd[V], p); - DefinePoint(&ctx.Point[pointOffset], FXP_ONE_HALF, fxpParam); - } - } - } - - if (p->points_mode) { - libagx_draw_points(&ctx, p, patch, NumPoints); - return; - } - - /* CONNECTIVITY */ - { - // Generate primitives for all the concentric rings, one side at a time - // for each ring. +1 is so even tess includes the center point - int numPointRowsToCenter[QUAD_AXES] = { - (numPointsForInsideTessFactor[U] + 1) >> 1, - (numPointsForInsideTessFactor[V] + 1) >> 1, - }; - - int numRings = min(numPointRowsToCenter[U], numPointRowsToCenter[V]); - - /* Calculate # of indices so we can allocate */ - { - /* Handle main case */ - int OuterPoints = - numPointsForOutsideEdge[0] + numPointsForOutsideEdge[1] + - numPointsForOutsideEdge[2] + numPointsForOutsideEdge[3]; - - int InnerPoints = - numPointsForInsideTessFactor[U] + numPointsForInsideTessFactor[V]; - - int NumIndices = (OuterPoints * 3) + (12 * numRings * InnerPoints) - - (InnerPoints * 18) - (24 * numRings * (numRings - 1)); - - /* Determine major/minor axes */ - bool U_major = - (numPointsForInsideTessFactor[U] > numPointsForInsideTessFactor[V]); - unsigned M = U_major ? U : V; - unsigned m = U_major ? V : U; - - /* Handle degenerate ring */ - if (insideTessFactorOdd[m]) { - NumIndices += 12 * ((numPointsForInsideTessFactor[M] >> 1) - - (numPointsForInsideTessFactor[m] >> 1)); - NumIndices += (insideTessFactorOdd[M] ? 6 : 12); - } - - // Generate the draw and allocate the index buffer with the size - ctx.Index = libagx_draw(p, mode, false, patch, NumIndices); - } - - if (mode == LIBAGX_TESS_MODE_COUNT) - return; - - int degeneratePointRing[QUAD_AXES] = { - // Even partitioning causes degenerate row of points, - // which results in exceptions to the point ordering conventions - // when travelling around the rings counterclockwise. - !insideTessFactorOdd[V] ? numPointRowsToCenter[V] - 1 : -1, - !insideTessFactorOdd[U] ? 
numPointRowsToCenter[U] - 1 : -1, - }; - - int numPointsForOutsideEdge_[QUAD_EDGES] = { - numPointsForOutsideEdge[Ueq0], - numPointsForOutsideEdge[Veq0], - numPointsForOutsideEdge[Ueq1], - numPointsForOutsideEdge[Veq1], - }; - - int insideEdgePointBaseOffset_ = insideEdgePointBaseOffset; - int outsideEdgePointBaseOffset = 0; - - int NumIndices = 0; - - for (int ring = 1; ring < numRings; ring++) { - int numPointsForInsideEdge[QUAD_AXES] = { - numPointsForInsideTessFactor[U] - 2 * ring, - numPointsForInsideTessFactor[V] - 2 * ring}; - - int edge0InsidePointBaseOffset = insideEdgePointBaseOffset_; - int edge0OutsidePointBaseOffset = outsideEdgePointBaseOffset; - - for (int edge = 0; edge < QUAD_EDGES; edge++) { - int odd = (edge + 1) & 0x1; - - int numTriangles = - numPointsForInsideEdge[odd] + numPointsForOutsideEdge_[edge] - 2; - int insideBaseOffset; - int outsideBaseOffset; - - // We need to patch the indexing so Stitch() can think it sees 2 - // sequentially increasing rows of points, even though we have - // wrapped around to the end of the inner and outer ring's points, - // so the last point is really the first point for the ring. We make - // it so that when Stitch() calls AddIndex(), that function will do - // any necessary index adjustment. - if (edge == 3) { - if (ring == degeneratePointRing[odd]) { - ctx.IndexPatchCtx2.baseIndexToInvert = - insideEdgePointBaseOffset_ + 1; - ctx.IndexPatchCtx2.cornerCaseBadValue = - outsideEdgePointBaseOffset + - numPointsForOutsideEdge_[edge] - 1; - ctx.IndexPatchCtx2.cornerCaseReplacementValue = - edge0OutsidePointBaseOffset; - ctx.IndexPatchCtx2.indexInversionEndPoint = - (ctx.IndexPatchCtx2.baseIndexToInvert << 1) - 1; - insideBaseOffset = ctx.IndexPatchCtx2.baseIndexToInvert; - outsideBaseOffset = outsideEdgePointBaseOffset; - ctx.bUsingPatchedIndices2 = true; - } else { - ctx.IndexPatchCtx.insidePointIndexDeltaToRealValue = - insideEdgePointBaseOffset_; - ctx.IndexPatchCtx.insidePointIndexBadValue = - numPointsForInsideEdge[odd] - 1; - ctx.IndexPatchCtx.insidePointIndexReplacementValue = - edge0InsidePointBaseOffset; - ctx.IndexPatchCtx.outsidePointIndexPatchBase = - ctx.IndexPatchCtx.insidePointIndexBadValue + - 1; // past inside patched index range - ctx.IndexPatchCtx.outsidePointIndexDeltaToRealValue = - outsideEdgePointBaseOffset - - ctx.IndexPatchCtx.outsidePointIndexPatchBase; - ctx.IndexPatchCtx.outsidePointIndexBadValue = - ctx.IndexPatchCtx.outsidePointIndexPatchBase + - numPointsForOutsideEdge_[edge] - 1; - ctx.IndexPatchCtx.outsidePointIndexReplacementValue = - edge0OutsidePointBaseOffset; - - insideBaseOffset = 0; - outsideBaseOffset = - ctx.IndexPatchCtx.outsidePointIndexPatchBase; - ctx.bUsingPatchedIndices = true; - } - } else if ((edge == 2) && (ring == degeneratePointRing[odd])) { - ctx.IndexPatchCtx2.baseIndexToInvert = - insideEdgePointBaseOffset_; - ctx.IndexPatchCtx2.cornerCaseBadValue = -1; // unused - ctx.IndexPatchCtx2.cornerCaseReplacementValue = -1; // unused - ctx.IndexPatchCtx2.indexInversionEndPoint = - ctx.IndexPatchCtx2.baseIndexToInvert << 1; - insideBaseOffset = ctx.IndexPatchCtx2.baseIndexToInvert; - outsideBaseOffset = outsideEdgePointBaseOffset; - ctx.bUsingPatchedIndices2 = true; - } else { - insideBaseOffset = insideEdgePointBaseOffset_; - outsideBaseOffset = outsideEdgePointBaseOffset; - } - if (ring == 1) { - StitchTransition( - &ctx, /*baseIndexOffset: */ NumIndices, insideBaseOffset, - insideTessFactorCtx[odd].numHalfTessFactorPoints, - insideTessFactorOdd[odd], outsideBaseOffset, - 
outsideTessFactorCtx[edge].numHalfTessFactorPoints, - outsideTessFactorOdd[edge]); - } else { - StitchRegular(&ctx, /*bTrapezoid*/ true, DIAGONALS_MIRRORED, - /*baseIndexOffset: */ NumIndices, - numPointsForInsideEdge[odd], insideBaseOffset, - outsideBaseOffset); - } - ctx.bUsingPatchedIndices = false; - ctx.bUsingPatchedIndices2 = false; - NumIndices += numTriangles * 3; - outsideEdgePointBaseOffset += numPointsForOutsideEdge_[edge] - 1; - if ((edge == 2) && (ring == degeneratePointRing[odd])) { - insideEdgePointBaseOffset_ -= numPointsForInsideEdge[odd] - 1; - } else { - insideEdgePointBaseOffset_ += numPointsForInsideEdge[odd] - 1; - } - numPointsForOutsideEdge_[edge] = numPointsForInsideEdge[odd]; - } - } - - // Triangulate center - a row of quads if odd - // This triangulation may be producing diagonals that are asymmetric about - // the center of the patch in this region. - if ((numPointsForInsideTessFactor[U] > numPointsForInsideTessFactor[V]) && - insideTessFactorOdd[V]) { - ctx.bUsingPatchedIndices2 = true; - int stripNumQuads = (((numPointsForInsideTessFactor[U] >> 1) - - (numPointsForInsideTessFactor[V] >> 1)) - << 1) + - (insideTessFactorOdd[U] ? 1 : 2); - ctx.IndexPatchCtx2.baseIndexToInvert = - outsideEdgePointBaseOffset + stripNumQuads + 2; - ctx.IndexPatchCtx2.cornerCaseBadValue = - ctx.IndexPatchCtx2.baseIndexToInvert; - ctx.IndexPatchCtx2.cornerCaseReplacementValue = - outsideEdgePointBaseOffset; - ctx.IndexPatchCtx2.indexInversionEndPoint = - ctx.IndexPatchCtx2.baseIndexToInvert + - ctx.IndexPatchCtx2.baseIndexToInvert + stripNumQuads; - StitchRegular( - &ctx, /*bTrapezoid*/ false, DIAGONALS_INSIDE_TO_OUTSIDE, - /*baseIndexOffset: */ NumIndices, - /*numInsideEdgePoints:*/ stripNumQuads + 1, - /*insideEdgePointBaseOffset*/ ctx.IndexPatchCtx2.baseIndexToInvert, - outsideEdgePointBaseOffset + 1); - ctx.bUsingPatchedIndices2 = false; - NumIndices += stripNumQuads * 6; - } else if ((numPointsForInsideTessFactor[V] >= - numPointsForInsideTessFactor[U]) && - insideTessFactorOdd[U]) { - ctx.bUsingPatchedIndices2 = true; - int stripNumQuads = (((numPointsForInsideTessFactor[V] >> 1) - - (numPointsForInsideTessFactor[U] >> 1)) - << 1) + - (insideTessFactorOdd[V] ? 1 : 2); - ctx.IndexPatchCtx2.baseIndexToInvert = - outsideEdgePointBaseOffset + stripNumQuads + 1; - ctx.IndexPatchCtx2.cornerCaseBadValue = -1; // unused - ctx.IndexPatchCtx2.indexInversionEndPoint = - ctx.IndexPatchCtx2.baseIndexToInvert + - ctx.IndexPatchCtx2.baseIndexToInvert + stripNumQuads; - DIAGONALS diag = insideTessFactorOdd[V] - ? 
DIAGONALS_INSIDE_TO_OUTSIDE_EXCEPT_MIDDLE - : DIAGONALS_INSIDE_TO_OUTSIDE; - StitchRegular( - &ctx, /*bTrapezoid*/ false, diag, - /*baseIndexOffset: */ NumIndices, - /*numInsideEdgePoints:*/ stripNumQuads + 1, - /*insideEdgePointBaseOffset*/ ctx.IndexPatchCtx2.baseIndexToInvert, - outsideEdgePointBaseOffset); - ctx.bUsingPatchedIndices2 = false; - NumIndices += stripNumQuads * 6; - } - } + poly_tess_quad_process(p, patch, mode__2); } diff --git a/src/asahi/libagx/tessellator.h b/src/asahi/libagx/tessellator.h index 5841d5578f1..4cf8ab01938 100644 --- a/src/asahi/libagx/tessellator.h +++ b/src/asahi/libagx/tessellator.h @@ -5,104 +5,14 @@ #pragma once -#include "compiler/libcl/libcl.h" +#include "poly/tessellator.h" -enum libagx_tess_partitioning { - LIBAGX_TESS_PARTITIONING_FRACTIONAL_ODD, - LIBAGX_TESS_PARTITIONING_FRACTIONAL_EVEN, - LIBAGX_TESS_PARTITIONING_INTEGER, -}; - -enum libagx_tess_mode { - /* Do not actually tessellate, just write the index counts */ - LIBAGX_TESS_MODE_COUNT, - - /* Tessellate using the count buffers to allocate indices */ - LIBAGX_TESS_MODE_WITH_COUNTS, -}; - -struct libagx_tess_point { - uint32_t u; - uint32_t v; -}; -static_assert(sizeof(struct libagx_tess_point) == 8); - -struct libagx_tess_args { - /* Heap to allocate tessellator outputs in */ - DEVICE(struct agx_heap) heap; - - /* Patch coordinate buffer, indexed as: - * - * coord_allocs[patch_ID] + vertex_in_patch - */ - DEVICE(struct libagx_tess_point) patch_coord_buffer; - - /* Per-patch index within the heap for the tess coords, written by the - * tessellator based on the allocated memory. - */ - DEVICE(uint32_t) coord_allocs; - - /* Space for output draws from the tessellator. API draw calls. */ - DEVICE(uint32_t) out_draws; - - /* Tessellation control shader output buffer. */ - DEVICE(float) tcs_buffer; - - /* Count buffer. # of indices per patch written here, then prefix summed. */ - DEVICE(uint32_t) counts; - - /* Allocated index buffer for all patches, if we're prefix summing counts */ - DEVICE(uint32_t) index_buffer; - - /* Address of the tess eval invocation counter for implementing pipeline - * statistics, if active. Zero if inactive. Incremented by tessellator. - */ - DEVICE(uint32_t) statistic; - - /* When geom+tess used together, the buffer containing TES outputs (executed - * as a hardware compute shader). - */ - uint64_t tes_buffer; - - /* Bitfield of TCS per-vertex outputs */ - uint64_t tcs_per_vertex_outputs; - - /* Default tess levels used in OpenGL when there is no TCS in the pipeline. - * Unused in Vulkan and OpenGL ES. - */ - float tess_level_outer_default[4]; - float tess_level_inner_default[2]; - - /* Number of vertices in the input patch */ - uint32_t input_patch_size; - - /* Number of vertices in the TCS output patch */ - uint32_t output_patch_size; - - /* Number of patch constants written by TCS */ - uint32_t tcs_patch_constants; - - /* Number of input patches per instance of the VS/TCS */ - uint32_t patches_per_instance; - - /* Stride between tessellation facotrs in the TCS output buffer. */ - uint32_t tcs_stride_el; - - /* Number of patches being tessellated */ - uint32_t nr_patches; - - /* Partitioning and points mode. These affect per-patch setup code but not - * the hot tessellation loop so we make them dynamic to reduce tessellator - * variants. - */ - enum libagx_tess_partitioning partitioning; - uint32_t points_mode; - uint32_t isolines; - - /* When fed into a geometry shader, triangles should be counter-clockwise. 
- * The tessellator always produces clockwise triangles, but we can swap - * dynamically in the TES. - */ - uint32_t ccw; -} PACKED; -static_assert(sizeof(struct libagx_tess_args) == 36 * 4); +#define libagx_tessellate(context, grid, barrier, prim, mode, state) \ + if (prim == TESS_PRIMITIVE_QUADS) { \ + libagx_tess_quad(context, grid, barrier, state, mode); \ + } else if (prim == TESS_PRIMITIVE_TRIANGLES) { \ + libagx_tess_tri(context, grid, barrier, state, mode); \ + } else { \ + assert(prim == TESS_PRIMITIVE_ISOLINES); \ + libagx_tess_isoline(context, grid, barrier, state, mode); \ + } diff --git a/src/asahi/vulkan/hk_cmd_dispatch.c b/src/asahi/vulkan/hk_cmd_dispatch.c index 9ff1006134e..b8eab93c4d6 100644 --- a/src/asahi/vulkan/hk_cmd_dispatch.c +++ b/src/asahi/vulkan/hk_cmd_dispatch.c @@ -5,10 +5,10 @@ * SPDX-License-Identifier: MIT */ #include "libagx/query.h" +#include "poly/nir/poly_nir_lower_gs.h" #include "vulkan/vulkan_core.h" #include "agx_helpers.h" #include "agx_linker.h" -#include "agx_nir_lower_gs.h" #include "agx_pack.h" #include "agx_scratch.h" #include "agx_tilebuffer.h" diff --git a/src/asahi/vulkan/hk_cmd_draw.c b/src/asahi/vulkan/hk_cmd_draw.c index 2c07b0fa168..a31098eb6fa 100644 --- a/src/asahi/vulkan/hk_cmd_draw.c +++ b/src/asahi/vulkan/hk_cmd_draw.c @@ -5,6 +5,7 @@ * SPDX-License-Identifier: MIT */ #include +#include "poly/nir/poly_nir_lower_gs.h" #include "agx_abi.h" #include "agx_bg_eot.h" #include "agx_bo.h" @@ -13,7 +14,6 @@ #include "agx_device.h" #include "agx_helpers.h" #include "agx_linker.h" -#include "agx_nir_lower_gs.h" #include "agx_nir_lower_vbo.h" #include "agx_ppp.h" #include "agx_tilebuffer.h" @@ -31,10 +31,10 @@ #include "asahi/genxml/agx_pack.h" #include "asahi/libagx/compression.h" -#include "asahi/libagx/geometry.h" #include "asahi/libagx/libagx.h" #include "asahi/libagx/query.h" #include "asahi/libagx/tessellator.h" +#include "poly/geometry.h" #include "util/blend.h" #include "util/format/format_utils.h" #include "util/format/u_formats.h" @@ -1007,9 +1007,9 @@ hk_heap(struct hk_cmd_buffer *cmd) * the CPU as rodata, even though the GPU uses it for scratch internally. 
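 * The allocator effectively bump-allocates by atomically advancing
 * poly_heap::bottom, which is why zeroing that single field below is enough
 * to free every per-draw tessellator allocation at once.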
*/ off_t off = dev->rodata.heap - dev->rodata.bo->va->addr; - struct agx_heap *map = agx_bo_map(dev->rodata.bo) + off; + struct poly_heap *map = agx_bo_map(dev->rodata.bo) + off; - *map = (struct agx_heap){ + *map = (struct poly_heap){ .base = dev->heap->va->addr, .size = size, }; @@ -1021,7 +1021,7 @@ hk_heap(struct hk_cmd_buffer *cmd) uint64_t addr = dev->rodata.heap; /* Zeroing the allocated index frees everything */ - hk_queue_write(cmd, addr + offsetof(struct agx_heap, bottom), 0, + hk_queue_write(cmd, addr + offsetof(struct poly_heap, bottom), 0, true /* after gfx */); cmd->uses_heap = true; @@ -1045,7 +1045,7 @@ hk_upload_ia_params(struct hk_cmd_buffer *cmd, struct agx_draw draw) { assert(!agx_is_indirect(draw.b) && "indirect params written by GPU"); - struct agx_ia_state ia = {.verts_per_instance = draw.b.count[0]}; + struct poly_ia_state ia = {.verts_per_instance = draw.b.count[0]}; if (draw.indexed) { unsigned index_size_B = agx_index_size_to_B(draw.index_size); @@ -1115,7 +1115,7 @@ hk_upload_geometry_params(struct hk_cmd_buffer *cmd, struct agx_draw draw) mode = u_decomposed_prim(mode); } - struct agx_geometry_params params = { + struct poly_geometry_params params = { .flat_outputs = fs->info.fs.interp.flat, .input_topology = mode, @@ -1174,7 +1174,7 @@ hk_upload_geometry_params(struct hk_cmd_buffer *cmd, struct agx_draw draw) params.vs_grid[4] = params.gs_grid[4] = 1; params.vs_grid[5] = params.gs_grid[5] = 1; - struct agx_gs_info *gsi = &count->info.gs; + struct poly_gs_info *gsi = &count->info.gs; if (indirect) { /* TODO: size */ @@ -1183,7 +1183,7 @@ hk_upload_geometry_params(struct hk_cmd_buffer *cmd, struct agx_draw draw) params.indirect_desc = cmd->geom_indirect; params.vs_grid[2] = params.gs_grid[2] = 1; - if (gsi->shape == AGX_GS_SHAPE_DYNAMIC_INDEXED) { + if (gsi->shape == POLY_GS_SHAPE_DYNAMIC_INDEXED) { /* Need to allocate heap if we haven't yet */ hk_heap(cmd); @@ -1191,7 +1191,7 @@ hk_upload_geometry_params(struct hk_cmd_buffer *cmd, struct agx_draw draw) cmd->geom_index_count = dev->heap->size; } else { cmd->geom_index_count = - agx_gs_rast_vertices(gsi->shape, gsi->max_indices, 1, 0); + poly_gs_rast_vertices(gsi->shape, gsi->max_indices, 1, 0); } } else { uint32_t verts = draw.b.count[0], instances = draw.b.count[1]; @@ -1207,13 +1207,13 @@ hk_upload_geometry_params(struct hk_cmd_buffer *cmd, struct agx_draw draw) params.count_buffer = hk_pool_alloc(cmd, size, 4).gpu; } - cmd->geom_index_count = agx_gs_rast_vertices( + cmd->geom_index_count = poly_gs_rast_vertices( gsi->shape, gsi->max_indices, params.gs_grid[0], instances); - cmd->geom_instance_count = agx_gs_rast_instances( + cmd->geom_instance_count = poly_gs_rast_instances( gsi->shape, gsi->max_indices, params.gs_grid[0], instances); - if (gsi->shape == AGX_GS_SHAPE_DYNAMIC_INDEXED) { + if (gsi->shape == POLY_GS_SHAPE_DYNAMIC_INDEXED) { params.output_index_buffer = hk_pool_alloc(cmd, cmd->geom_index_count * 4, 4).gpu; @@ -1221,7 +1221,7 @@ hk_upload_geometry_params(struct hk_cmd_buffer *cmd, struct agx_draw draw) } } - if (gsi->shape == AGX_GS_SHAPE_STATIC_INDEXED) { + if (gsi->shape == POLY_GS_SHAPE_STATIC_INDEXED) { cmd->geom_index_buffer = hk_pool_upload(cmd, count->info.gs.topology, gsi->max_indices * 4, 4); } @@ -1231,7 +1231,7 @@ hk_upload_geometry_params(struct hk_cmd_buffer *cmd, struct agx_draw draw) } static void -hk_upload_tess_params(struct hk_cmd_buffer *cmd, struct libagx_tess_args *out, +hk_upload_tess_params(struct hk_cmd_buffer *cmd, struct poly_tess_args *out, struct agx_draw draw) { struct 
hk_device *dev = hk_cmd_buffer_device(cmd); @@ -1239,14 +1239,14 @@ hk_upload_tess_params(struct hk_cmd_buffer *cmd, struct libagx_tess_args *out, struct hk_graphics_state *gfx = &cmd->state.gfx; struct hk_shader *tcs = hk_only_variant(gfx->shaders[MESA_SHADER_TESS_CTRL]); - enum libagx_tess_partitioning partitioning = + enum poly_tess_partitioning partitioning = gfx->tess.info.spacing == TESS_SPACING_EQUAL - ? LIBAGX_TESS_PARTITIONING_INTEGER + ? POLY_TESS_PARTITIONING_INTEGER : gfx->tess.info.spacing == TESS_SPACING_FRACTIONAL_ODD - ? LIBAGX_TESS_PARTITIONING_FRACTIONAL_ODD - : LIBAGX_TESS_PARTITIONING_FRACTIONAL_EVEN; + ? POLY_TESS_PARTITIONING_FRACTIONAL_ODD + : POLY_TESS_PARTITIONING_FRACTIONAL_EVEN; - struct libagx_tess_args args = { + struct poly_tess_args args = { .heap = hk_heap(cmd), .tcs_stride_el = tcs->info.tess.tcs_output_stride / 4, .statistic = hk_pipeline_stat_addr( @@ -1428,7 +1428,7 @@ hk_draw_without_restart(struct hk_cmd_buffer *cmd, struct agx_draw draw, libagx_unroll_restart_struct(cmd, agx_1d(1024 * draw_count), AGX_BARRIER_ALL | AGX_PREGFX, ia, - libagx_compact_prim(prim)); + poly_compact_prim(prim)); return agx_draw_indexed_indirect(ia.out_draw, dev->heap->va->addr, dev->heap->size, draw.index_size, @@ -1485,7 +1485,7 @@ hk_launch_gs_prerast(struct hk_cmd_buffer *cmd, struct hk_cs *cs, if (cmd->state.gfx.shaders[MESA_SHADER_TESS_EVAL]) { gsi.vertex_buffer = desc->root.draw.tess_params + - offsetof(struct libagx_tess_args, tes_buffer); + offsetof(struct poly_tess_args, tes_buffer); } else { gsi.vertex_buffer = desc->root.root_desc_addr + offsetof(struct hk_root_descriptor_table, @@ -1501,10 +1501,10 @@ hk_launch_gs_prerast(struct hk_cmd_buffer *cmd, struct hk_cs *cs, AGX_BARRIER_ALL | AGX_PREGFX, gsi); grid_vs = agx_grid_indirect_local( - geometry_params + offsetof(struct agx_geometry_params, vs_grid)); + geometry_params + offsetof(struct poly_geometry_params, vs_grid)); grid_gs = agx_grid_indirect_local( - geometry_params + offsetof(struct agx_geometry_params, gs_grid)); + geometry_params + offsetof(struct poly_geometry_params, gs_grid)); } else { grid_vs = grid_gs = draw.b; grid_gs.count[0] = u_decomposed_prims_for_vertices(mode, draw.b.count[0]); @@ -1554,9 +1554,9 @@ hk_launch_gs_prerast(struct hk_cmd_buffer *cmd, struct hk_cs *cs, /* Pre-rast geometry shader */ hk_dispatch_with_local_size(cmd, cs, main, grid_gs, wg); - if (agx_gs_indexed(count->info.gs.shape)) { + if (poly_gs_indexed(count->info.gs.shape)) { enum agx_index_size index_size = - agx_translate_index_size(agx_gs_index_size(count->info.gs.shape)); + agx_translate_index_size(poly_gs_index_size(count->info.gs.shape)); if (agx_is_indirect(draw.b)) { return agx_draw_indexed_indirect( @@ -1661,13 +1661,13 @@ hk_launch_tess(struct hk_cmd_buffer *cmd, struct hk_cs *cs, /* First generate counts, then prefix sum them, and then tessellate. 
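 * The same tessellator entry point runs twice: the POLY_TESS_MODE_COUNT pass
 * only writes per-patch index counts, the prefix sum turns those counts into
 * allocation offsets, and the POLY_TESS_MODE_WITH_COUNTS pass then emits the
 * actual indices into the space those offsets describe.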
*/ libagx_tessellate(cmd, grid_tess, AGX_BARRIER_ALL | AGX_PREGFX, info.mode, - LIBAGX_TESS_MODE_COUNT, state); + POLY_TESS_MODE_COUNT, state); libagx_prefix_sum_tess(cmd, agx_1d(1024), AGX_BARRIER_ALL | AGX_PREGFX, state, c_prims, c_inv, c_prims || c_inv); libagx_tessellate(cmd, grid_tess, AGX_BARRIER_ALL | AGX_PREGFX, info.mode, - LIBAGX_TESS_MODE_WITH_COUNTS, state); + POLY_TESS_MODE_WITH_COUNTS, state); return agx_draw_indexed_indirect(gfx->tess.out_draws, dev->heap->va->addr, dev->heap->size, AGX_INDEX_SIZE_U32, false); @@ -2219,8 +2219,9 @@ hk_flush_index(struct hk_cmd_buffer *cmd, struct hk_cs *cs) uint32_t index = cmd->state.gfx.index.restart; if (gs) { - enum agx_gs_shape shape = gs->variants[HK_GS_VARIANT_COUNT].info.gs.shape; - index = BITFIELD_MASK(8 * agx_gs_index_size(shape)); + enum poly_gs_shape shape = + gs->variants[HK_GS_VARIANT_COUNT].info.gs.shape; + index = BITFIELD_MASK(8 * poly_gs_index_size(shape)); } /* VDM State updates are relatively expensive, so only emit them when the @@ -3061,7 +3062,7 @@ hk_flush_dynamic_state(struct hk_cmd_buffer *cmd, struct hk_cs *cs, bool indirect = agx_is_indirect(draw.b) || draw.restart; desc->root.draw.input_assembly = - indirect ? hk_pool_alloc(cmd, sizeof(struct agx_ia_state), 4).gpu + indirect ? hk_pool_alloc(cmd, sizeof(struct poly_ia_state), 4).gpu : hk_upload_ia_params(cmd, draw); desc->root_dirty = true; } @@ -3078,7 +3079,7 @@ hk_flush_dynamic_state(struct hk_cmd_buffer *cmd, struct hk_cs *cs, if (!indirect) { uint32_t verts = draw.b.count[0], instances = draw.b.count[1]; unsigned vb_size = - libagx_tcs_in_size(verts * instances, vs->b.info.outputs); + poly_tcs_in_size(verts * instances, vs->b.info.outputs); /* Allocate if there are any outputs, or use the null sink to trap * reads if there aren't. 
Those reads are undefined but should not @@ -3094,7 +3095,7 @@ hk_flush_dynamic_state(struct hk_cmd_buffer *cmd, struct hk_cs *cs, struct agx_ptr tess_args = {0}; if (gfx->shaders[MESA_SHADER_TESS_EVAL]) { - tess_args = hk_pool_alloc(cmd, sizeof(struct libagx_tess_args), 4); + tess_args = hk_pool_alloc(cmd, sizeof(struct poly_tess_args), 4); gfx->descriptors.root.draw.tess_params = tess_args.gpu; gfx->descriptors.root_dirty = true; } diff --git a/src/asahi/vulkan/hk_device.c b/src/asahi/vulkan/hk_device.c index 66a22934bb2..117ae784839 100644 --- a/src/asahi/vulkan/hk_device.c +++ b/src/asahi/vulkan/hk_device.c @@ -19,8 +19,8 @@ #include "asahi/genxml/agx_pack.h" #include "asahi/lib/agx_bo.h" #include "asahi/lib/agx_device.h" -#include "asahi/libagx/geometry.h" #include "compiler/nir/nir_builder.h" +#include "poly/geometry.h" #include "util/hash_table.h" #include "util/ralloc.h" #include "util/simple_mtx.h" @@ -86,7 +86,7 @@ hk_upload_rodata(struct hk_device *dev) */ offs = align(offs, sizeof(uint64_t)); dev->rodata.heap = dev->rodata.bo->va->addr + offs; - offs += sizeof(struct agx_heap); + offs += sizeof(struct poly_heap); return VK_SUCCESS; } diff --git a/src/asahi/vulkan/hk_shader.c b/src/asahi/vulkan/hk_shader.c index 0cce56c0ec4..229887c6b46 100644 --- a/src/asahi/vulkan/hk_shader.c +++ b/src/asahi/vulkan/hk_shader.c @@ -8,10 +8,10 @@ */ #include "hk_shader.h" +#include "poly/nir/poly_nir_lower_gs.h" #include "agx_debug.h" #include "agx_device.h" #include "agx_helpers.h" -#include "agx_nir_lower_gs.h" #include "agx_nir_lower_vbo.h" #include "glsl_types.h" #include "hk_instance.h" @@ -1114,13 +1114,13 @@ hk_compile_nir(struct hk_device *dev, const VkAllocationCallbacks *pAllocator, shader->info.tess.tcs_output_patch_size = nir->info.tess.tcs_vertices_out; shader->info.tess.tcs_per_vertex_outputs = - agx_tcs_per_vertex_outputs(nir); + poly_tcs_per_vertex_outputs(nir); shader->info.tess.tcs_nr_patch_outputs = util_last_bit(nir->info.patch_outputs_written); - shader->info.tess.tcs_output_stride = agx_tcs_output_stride(nir); + shader->info.tess.tcs_output_stride = poly_tcs_output_stride(nir); } else { /* This destroys info so it needs to happen after the gather */ - NIR_PASS(_, nir, agx_nir_lower_tes, hw); + NIR_PASS(_, nir, poly_nir_lower_tes, hw); } } @@ -1137,7 +1137,7 @@ hk_compile_nir(struct hk_device *dev, const VkAllocationCallbacks *pAllocator, if (hw) { hk_lower_hw_vs(nir, shader, kill_psiz); } else { - NIR_PASS(_, nir, agx_nir_lower_vs_before_gs); + NIR_PASS(_, nir, poly_nir_lower_vs_before_gs); nir->info.stage = MESA_SHADER_COMPUTE; memset(&nir->info.cs, 0, sizeof(nir->info.cs)); nir->xfb_info = NULL; @@ -1335,7 +1335,7 @@ hk_compile_shader(struct hk_device *dev, struct vk_shader_compile_info *info, hk_populate_vs_key(&key_tmp.vs, state); key = &key_tmp; } else if (sw_stage == MESA_SHADER_TESS_CTRL) { - NIR_PASS(_, nir, agx_nir_lower_tcs); + NIR_PASS(_, nir, poly_nir_lower_tcs); } /* Compile all variants up front */ @@ -1345,7 +1345,7 @@ hk_compile_shader(struct hk_device *dev, struct vk_shader_compile_info *info, nir_shader *count = NULL, *rast = NULL, *pre_gs = NULL; - NIR_PASS(_, nir, agx_nir_lower_gs, &count, &rast, &pre_gs, + NIR_PASS(_, nir, poly_nir_lower_gs, &count, &rast, &pre_gs, &count_variant->info.gs); agx_preprocess_nir(count); diff --git a/src/asahi/vulkan/hk_shader.h b/src/asahi/vulkan/hk_shader.h index 836c7fbffaa..36712f30f33 100644 --- a/src/asahi/vulkan/hk_shader.h +++ b/src/asahi/vulkan/hk_shader.h @@ -8,9 +8,9 @@ #pragma once #include 
"asahi/compiler/agx_compile.h" +#include "poly/nir/poly_nir_lower_gs.h" #include "util/macros.h" #include "agx_linker.h" -#include "agx_nir_lower_gs.h" #include "agx_nir_lower_vbo.h" #include "agx_pack.h" #include "agx_usc.h" @@ -94,7 +94,7 @@ struct hk_shader_info { struct hk_tess_info info; } tess; - struct agx_gs_info gs; + struct poly_gs_info gs; /* Used to initialize the union for other stages */ uint8_t _pad[32]; diff --git a/src/gallium/drivers/asahi/agx_nir_lower_sysvals.c b/src/gallium/drivers/asahi/agx_nir_lower_sysvals.c index 5061ecde35d..c1ebac6c431 100644 --- a/src/gallium/drivers/asahi/agx_nir_lower_sysvals.c +++ b/src/gallium/drivers/asahi/agx_nir_lower_sysvals.c @@ -5,10 +5,10 @@ #include "compiler/nir/nir_builder.h" #include "pipe/p_defines.h" +#include "poly/nir/poly_nir_lower_gs.h" #include "util/bitset.h" #include "util/u_dynarray.h" #include "agx_abi.h" -#include "agx_nir_lower_gs.h" #include "agx_state.h" #include "nir.h" #include "nir_builder_opcodes.h" diff --git a/src/gallium/drivers/asahi/agx_state.c b/src/gallium/drivers/asahi/agx_state.c index 91489b117ac..438bf63d8fa 100644 --- a/src/gallium/drivers/asahi/agx_state.c +++ b/src/gallium/drivers/asahi/agx_state.c @@ -34,6 +34,8 @@ #include "pipe/p_defines.h" #include "pipe/p_screen.h" #include "pipe/p_state.h" +#include "poly/geometry.h" +#include "poly/nir/poly_nir_lower_gs.h" #include "util/bitscan.h" #include "util/bitset.h" #include "util/blend.h" @@ -57,10 +59,8 @@ #include "agx_disk_cache.h" #include "agx_linker.h" #include "agx_nir.h" -#include "agx_nir_lower_gs.h" #include "agx_nir_lower_vbo.h" #include "agx_tilebuffer.h" -#include "geometry.h" #include "libagx.h" #include "libagx_dgc.h" #include "libagx_shaders.h" @@ -1544,7 +1544,7 @@ agx_compile_variant(struct agx_device *dev, struct pipe_context *pctx, nir_shader *nir = nir_deserialize(NULL, &agx_nir_options, &reader); /* Auxiliary programs */ - struct agx_gs_info gs_info = {0}; + struct poly_gs_info gs_info = {0}; uint64_t outputs = 0; struct agx_fs_epilog_link_info epilog_key = {false}; nir_shader *gs_count = NULL; @@ -1564,7 +1564,7 @@ agx_compile_variant(struct agx_device *dev, struct pipe_context *pctx, struct asahi_vs_shader_key *key = &key_->vs; if (nir->info.vs.tes_poly) { - NIR_PASS(_, nir, agx_nir_lower_tes, key->hw); + NIR_PASS(_, nir, poly_nir_lower_tes, key->hw); } else { NIR_PASS(_, nir, agx_nir_gather_vs_inputs, attrib_components_read); NIR_PASS(_, nir, agx_nir_lower_vs_input_to_prolog); @@ -1580,7 +1580,7 @@ agx_compile_variant(struct agx_device *dev, struct pipe_context *pctx, NIR_PASS(_, nir, agx_nir_lower_cull_distance_vs); NIR_PASS(_, nir, agx_nir_lower_uvs, &uvs); } else { - NIR_PASS(_, nir, agx_nir_lower_vs_before_gs); + NIR_PASS(_, nir, poly_nir_lower_vs_before_gs); /* Turn into a compute shader now that we're free of vertexisms */ nir->info.stage = MESA_SHADER_COMPUTE; @@ -1589,9 +1589,9 @@ agx_compile_variant(struct agx_device *dev, struct pipe_context *pctx, outputs = nir->info.outputs_written; } } else if (nir->info.stage == MESA_SHADER_TESS_CTRL) { - NIR_PASS(_, nir, agx_nir_lower_tcs); + NIR_PASS(_, nir, poly_nir_lower_tcs); } else if (nir->info.stage == MESA_SHADER_GEOMETRY) { - NIR_PASS(_, nir, agx_nir_lower_gs, &gs_count, &gs_copy, &pre_gs, + NIR_PASS(_, nir, poly_nir_lower_gs, &gs_count, &gs_copy, &pre_gs, &gs_info); agx_preprocess_nir(gs_count); @@ -1932,11 +1932,11 @@ agx_create_shader_state(struct pipe_context *pctx, so->tess.spacing = nir->info.tess.spacing; so->tess.output_patch_size = 
nir->info.tess.tcs_vertices_out; so->tess.primitive = nir->info.tess._primitive_mode; - so->tess.per_vertex_outputs = agx_tcs_per_vertex_outputs(nir); + so->tess.per_vertex_outputs = poly_tcs_per_vertex_outputs(nir); so->tess.nr_patch_outputs = util_last_bit(nir->info.patch_outputs_written); if (nir->info.stage == MESA_SHADER_TESS_CTRL) - so->tess.output_stride = agx_tcs_output_stride(nir); + so->tess.output_stride = poly_tcs_output_stride(nir); } else if (nir->info.stage == MESA_SHADER_GEOMETRY) { so->gs_mode = nir->info.gs.output_primitive; } @@ -3903,7 +3903,7 @@ agx_batch_heap(struct agx_batch *batch) PIPE_USAGE_DEFAULT, size); } - struct agx_heap heap = { + struct poly_heap heap = { .base = agx_resource(ctx->heap)->bo->va->addr, .size = size, }; @@ -3924,7 +3924,7 @@ agx_batch_geometry_params(struct agx_batch *batch, uint64_t input_index_buffer, const struct pipe_draw_start_count_bias *draw, const struct pipe_draw_indirect_info *indirect) { - struct agx_ia_state ia = { + struct poly_ia_state ia = { .index_buffer = input_index_buffer, .index_buffer_range_el = index_buffer_size_B / info->index_size, .verts_per_instance = draw ? draw->count : 0, @@ -3933,7 +3933,7 @@ agx_batch_geometry_params(struct agx_batch *batch, uint64_t input_index_buffer, batch->uniforms.input_assembly = agx_pool_upload_aligned(&batch->pool, &ia, sizeof(ia), 8); - struct agx_geometry_params params = { + struct poly_geometry_params params = { .indirect_desc = batch->geom_indirect, .flat_outputs = batch->ctx->stage[MESA_SHADER_FRAGMENT].shader->info.inputs_flat_shaded, @@ -4017,8 +4017,8 @@ agx_batch_geometry_params(struct agx_batch *batch, uint64_t input_index_buffer, params.input_primitives = params.gs_grid[0] * info->instance_count; - unsigned vb_size = libagx_tcs_in_size(draw->count * info->instance_count, - batch->uniforms.vertex_outputs); + unsigned vb_size = poly_tcs_in_size(draw->count * info->instance_count, + batch->uniforms.vertex_outputs); unsigned size = params.input_primitives * params.count_buffer_stride; if (size && prefix_sum) { @@ -4034,8 +4034,8 @@ agx_batch_geometry_params(struct agx_batch *batch, uint64_t input_index_buffer, params.input_buffer = addr; } - struct agx_gs_info *gsi = &batch->ctx->gs->gs; - if (gsi->shape == AGX_GS_SHAPE_DYNAMIC_INDEXED) { + struct poly_gs_info *gsi = &batch->ctx->gs->gs; + if (gsi->shape == POLY_GS_SHAPE_DYNAMIC_INDEXED) { unsigned idx_size = params.input_primitives * gsi->max_indices; params.output_index_buffer = @@ -4125,10 +4125,10 @@ agx_launch_gs_prerast(struct agx_batch *batch, libagx_gs_setup_indirect_struct(batch, agx_1d(1), AGX_BARRIER_ALL, gsi); grid_vs = agx_grid_indirect_local( - gp + offsetof(struct agx_geometry_params, vs_grid)); + gp + offsetof(struct poly_geometry_params, vs_grid)); grid_gs = agx_grid_indirect_local( - gp + offsetof(struct agx_geometry_params, gs_grid)); + gp + offsetof(struct poly_geometry_params, gs_grid)); } else { grid_vs = agx_3d(draws->count, info->instance_count, 1); @@ -4246,7 +4246,7 @@ agx_draw_without_restart(struct agx_batch *batch, /* Unroll the index buffer for each draw */ libagx_unroll_restart_struct(batch, agx_1d(1024 * indirect->draw_count), AGX_BARRIER_ALL, unroll, - libagx_compact_prim(info->mode)); + poly_compact_prim(info->mode)); /* Now draw the results without restart */ struct pipe_draw_info new_info = { @@ -4538,8 +4538,8 @@ agx_draw_patches(struct agx_context *ctx, const struct pipe_draw_info *info, bool point_mode = MAX2(tcs->tess.point_mode, tes->tess.point_mode); enum mesa_prim out_prim = 
agx_tess_output_prim(tcs, tes); - enum libagx_tess_partitioning partitioning = - (enum libagx_tess_partitioning)pspacing; + enum poly_tess_partitioning partitioning = + (enum poly_tess_partitioning)pspacing; struct agx_bo *draw_bo = NULL; size_t draw_stride = 5 * sizeof(uint32_t); @@ -4557,7 +4557,7 @@ agx_draw_patches(struct agx_context *ctx, const struct pipe_draw_info *info, if (info->index_size) ib = agx_index_buffer_ptr(batch, info, draws, &ib_extent); - struct agx_ia_state ia = { + struct poly_ia_state ia = { .index_buffer = ib, .index_buffer_range_el = ib_extent, .verts_per_instance = draws ? draws->count : 0, @@ -4572,7 +4572,7 @@ agx_draw_patches(struct agx_context *ctx, const struct pipe_draw_info *info, uint64_t heap = agx_batch_heap(batch); assert((tcs->tess.output_stride & 3) == 0 && "must be aligned"); - struct libagx_tess_args args = { + struct poly_tess_args args = { .heap = heap, .tcs_stride_el = tcs->tess.output_stride / 4, .statistic = agx_get_query_address( @@ -4644,8 +4644,8 @@ agx_draw_patches(struct agx_context *ctx, const struct pipe_draw_info *info, args.out_draws = blob.gpu + draw_offs; args.counts = blob.gpu + count_offs; - unsigned vb_size = libagx_tcs_in_size(draws->count * info->instance_count, - batch->uniforms.vertex_outputs); + unsigned vb_size = poly_tcs_in_size(draws->count * info->instance_count, + batch->uniforms.vertex_outputs); uint64_t addr = agx_pool_alloc_aligned(&batch->pool, vb_size, 4).gpu; batch->uniforms.vertex_output_buffer_ptr = agx_pool_upload(&batch->pool, &addr, 8); @@ -4716,11 +4716,11 @@ agx_draw_patches(struct agx_context *ctx, const struct pipe_draw_info *info, /* Generate counts, then prefix sum them, then finally tessellate. */ libagx_tessellate(batch, tess_grid, AGX_BARRIER_ALL, mode, - LIBAGX_TESS_MODE_COUNT, state); + POLY_TESS_MODE_COUNT, state); libagx_prefix_sum_tess(batch, agx_1d(1024), AGX_BARRIER_ALL, state, c_prims, c_invs, c_prims || c_invs); libagx_tessellate(batch, tess_grid, AGX_BARRIER_ALL, mode, - LIBAGX_TESS_MODE_WITH_COUNTS, state); + POLY_TESS_MODE_WITH_COUNTS, state); /* Face culling state needs to be specialized for tess */ ctx->dirty |= AGX_DIRTY_RS; @@ -5141,12 +5141,12 @@ agx_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info, agx_launch_gs_prerast(batch, info, draws, indirect); /* Setup to rasterize the GS results */ - struct agx_gs_info *gsi = &ctx->gs->gs; + struct poly_gs_info *gsi = &ctx->gs->gs; info_gs = (struct pipe_draw_info){ .mode = gsi->mode, - .index_size = agx_gs_index_size(gsi->shape), - .primitive_restart = agx_gs_indexed(gsi->shape), - .restart_index = agx_gs_index_size(gsi->shape) == 1 ? 0xFF : ~0, + .index_size = poly_gs_index_size(gsi->shape), + .primitive_restart = poly_gs_indexed(gsi->shape), + .restart_index = poly_gs_index_size(gsi->shape) == 1 ? 
0xFF : ~0, .index.resource = &index_rsrc.base, .instance_count = 1, }; @@ -5167,11 +5167,11 @@ agx_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info, u_decomposed_prims_for_vertices(info->mode, draws->count); draw_gs = (struct pipe_draw_start_count_bias){ - .count = agx_gs_rast_vertices(gsi->shape, gsi->max_indices, prims, - info->instance_count), + .count = poly_gs_rast_vertices(gsi->shape, gsi->max_indices, prims, + info->instance_count), }; - info_gs.instance_count = agx_gs_rast_instances( + info_gs.instance_count = poly_gs_rast_instances( gsi->shape, gsi->max_indices, prims, info->instance_count); draws = &draw_gs; @@ -5184,10 +5184,10 @@ agx_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info, batch->reduced_prim = u_reduced_prim(info->mode); ctx->dirty |= AGX_DIRTY_PRIM; - if (gsi->shape == AGX_GS_SHAPE_DYNAMIC_INDEXED) { + if (gsi->shape == POLY_GS_SHAPE_DYNAMIC_INDEXED) { ib = batch->geom_index; ib_extent = index_rsrc.bo->size - (batch->geom_index - ib); - } else if (gsi->shape == AGX_GS_SHAPE_STATIC_INDEXED) { + } else if (gsi->shape == POLY_GS_SHAPE_STATIC_INDEXED) { ib = agx_pool_upload(&batch->pool, gsi->topology, gsi->max_indices); ib_extent = gsi->max_indices; } diff --git a/src/gallium/drivers/asahi/agx_state.h b/src/gallium/drivers/asahi/agx_state.h index 31c88ee989e..32b21bf4ffc 100644 --- a/src/gallium/drivers/asahi/agx_state.h +++ b/src/gallium/drivers/asahi/agx_state.h @@ -18,13 +18,14 @@ #include "asahi/lib/agx_tilebuffer.h" #include "asahi/lib/agx_uvs.h" #include "asahi/lib/pool.h" -#include "asahi/libagx/geometry.h" #include "compiler/shader_enums.h" #include "gallium/auxiliary/util/u_blitter.h" #include "gallium/include/pipe/p_context.h" #include "gallium/include/pipe/p_screen.h" #include "gallium/include/pipe/p_state.h" #include "pipe/p_defines.h" +#include "poly/geometry.h" +#include "poly/nir/poly_nir_lower_gs.h" #include "util/bitset.h" #include "util/disk_cache.h" #include "util/hash_table.h" @@ -32,7 +33,6 @@ #include "util/u_range.h" #include "agx_bg_eot.h" #include "agx_helpers.h" -#include "agx_nir_lower_gs.h" #include "agx_nir_texture.h" #ifdef __GLIBC__ @@ -248,7 +248,7 @@ struct agx_compiled_shader { struct agx_compiled_shader *gs_count, *pre_gs; struct agx_compiled_shader *gs_copy; - struct agx_gs_info gs; + struct poly_gs_info gs; /* Logical shader stage used for descriptor access. This may differ from the * physical shader stage of the compiled shader, for example when executing a diff --git a/src/meson.build b/src/meson.build index fdfb2dc246f..d08e26466da 100644 --- a/src/meson.build +++ b/src/meson.build @@ -53,6 +53,9 @@ if with_gallium_or_lvp or with_gbm or with_platform_wayland subdir('loader') endif subdir('compiler') +if with_poly + subdir('poly') +endif if with_tools.contains('drm-shim') subdir('drm-shim') endif diff --git a/src/poly/.clang-format b/src/poly/.clang-format new file mode 100644 index 00000000000..6fc36ba4cca --- /dev/null +++ b/src/poly/.clang-format @@ -0,0 +1,8 @@ + +BasedOnStyle: InheritParentConfig +DisableFormat: false + +AlignConsecutiveBitFields: Consecutive +ColumnLimit: 80 +BreakStringLiterals: false +SpaceBeforeParens: ControlStatementsExceptControlMacros diff --git a/src/poly/cl/geometry.cl b/src/poly/cl/geometry.cl new file mode 100644 index 00000000000..b1ae4ba1620 --- /dev/null +++ b/src/poly/cl/geometry.cl @@ -0,0 +1,501 @@ +/* + * Copyright 2023 Alyssa Rosenzweig + * Copyright 2023 Valve Corporation + * Copyright 2025 Collabora Ltd. 
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "compiler/libcl/libcl_vk.h"
+#include "poly/geometry.h"
+#include "poly/tessellator.h"
+#include "util/macros.h"
+#include "util/u_math.h"
+
+uint64_t nir_ro_to_rw_poly(uint64_t address);
+
+/* Swap the two non-provoking vertices in odd triangles. This generates a vertex
+ * ID list with a consistent winding order.
+ *
+ * Holding prim and flatshade_first constant, the map : [0, 1, 2] -> [0, 1, 2]
+ * is its own inverse. It is hence used for both vertex fetch and transform
+ * feedback.
+ */
+static uint
+map_vertex_in_tri_strip(uint prim, uint vert, bool flatshade_first)
+{
+   unsigned pv = flatshade_first ? 0 : 2;
+
+   bool even = (prim & 1) == 0;
+   bool provoking = vert == pv;
+
+   return (provoking || even) ? vert : ((3 - pv) - vert);
+}
+
+static inline uint
+xfb_prim(uint id, uint n, uint copy)
+{
+   return sub_sat(id, n - 1u) + copy;
+}
+
+/*
+ * Determine whether an output vertex has an n'th copy in the transform feedback
+ * buffer. This is written weirdly to let constant folding remove unnecessary
+ * stores when length is known statically.
+ */
+bool
+poly_xfb_vertex_copy_in_strip(uint n, uint id, uint length, uint copy)
+{
+   uint prim = xfb_prim(id, n, copy);
+
+   int num_prims = length - (n - 1);
+   return copy == 0 || (prim < num_prims && id >= copy && copy < num_prims);
+}
+
+uint
+poly_xfb_vertex_offset(uint n, uint invocation_base_prim, uint strip_base_prim,
+                       uint id_in_strip, uint copy, bool flatshade_first)
+{
+   uint prim = xfb_prim(id_in_strip, n, copy);
+   uint vert_0 = min(id_in_strip, n - 1);
+   uint vert = vert_0 - copy;
+
+   if (n == 3) {
+      vert = map_vertex_in_tri_strip(prim, vert, flatshade_first);
+   }
+
+   /* Tally up in the whole buffer */
+   uint base_prim = invocation_base_prim + strip_base_prim;
+   uint base_vertex = base_prim * n;
+   return base_vertex + (prim * n) + vert;
+}
+
+uint64_t
+poly_xfb_vertex_address(constant struct poly_geometry_params *p, uint index,
+                        uint buffer, uint stride, uint output_offset)
+{
+   uint xfb_offset = (index * stride) + output_offset;
+
+   return (uintptr_t)(p->xfb_base[buffer]) + xfb_offset;
+}
+
+static uint
+vertex_id_for_line_loop(uint prim, uint vert, uint num_prims)
+{
+   /* (0, 1), (1, 2), (2, 0) */
+   if (prim == (num_prims - 1) && vert == 1)
+      return 0;
+   else
+      return prim + vert;
+}
+
+uint
+poly_vertex_id_for_line_class(enum mesa_prim mode, uint prim, uint vert,
+                              uint num_prims)
+{
+   /* Line list, line strip, or line loop */
+   if (mode == MESA_PRIM_LINE_LOOP && prim == (num_prims - 1) && vert == 1)
+      return 0;
+
+   if (mode == MESA_PRIM_LINES)
+      prim *= 2;
+
+   return prim + vert;
+}
+
+static uint
+vertex_id_for_tri_fan(uint prim, uint vert, bool flatshade_first)
+{
+   /* Vulkan spec section 20.1.7 gives (i + 1, i + 2, 0) for a provoking
+    * first. OpenGL instead wants (0, i + 1, i + 2) with a provoking last.
+    * Piglit clipflat expects us to switch between these orders depending on
+    * provoking vertex, to avoid trivializing the fan.
+    *
+    * Rotate accordingly.
+    */
+   if (flatshade_first) {
+      vert = (vert == 2) ? 0 : (vert + 1);
+   }
+
+   /* The simpler form assuming last is provoking. */
+   return (vert == 0) ? 0 : prim + vert;
+}
+
+uint
+poly_vertex_id_for_tri_class(enum mesa_prim mode, uint prim, uint vert,
+                             bool flatshade_first)
+{
+   if (flatshade_first && mode == MESA_PRIM_TRIANGLE_FAN) {
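+      /* Rotate (0, 1, 2) to (1, 2, 0); combined with the vert == 0 check
+       * below, prim i then emits the Vulkan fan order (i + 1, i + 2, 0).
+       */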
+      vert = vert + 1;
+      vert = (vert == 3) ? 0 : vert;
+   }
+
+   if (mode == MESA_PRIM_TRIANGLE_FAN && vert == 0)
+      return 0;
+
+   if (mode == MESA_PRIM_TRIANGLES)
+      prim *= 3;
+
+   /* Triangle list, triangle strip, or triangle fan */
+   if (mode == MESA_PRIM_TRIANGLE_STRIP) {
+      unsigned pv = flatshade_first ? 0 : 2;
+
+      bool even = (prim & 1) == 0;
+      bool provoking = vert == pv;
+
+      vert = ((provoking || even) ? vert : ((3 - pv) - vert));
+   }
+
+   return prim + vert;
+}
+
+uint
+poly_vertex_id_for_line_adj_class(enum mesa_prim mode, uint prim, uint vert)
+{
+   /* Line list adj or line strip adj */
+   if (mode == MESA_PRIM_LINES_ADJACENCY)
+      prim *= 4;
+
+   return prim + vert;
+}
+
+static uint
+vertex_id_for_tri_strip_adj(uint prim, uint vert, uint num_prims,
+                            bool flatshade_first)
+{
+   /* See Vulkan spec section 20.1.11 "Triangle Strips With Adjacency".
+    *
+    * There are different cases for first/middle/last/only primitives and for
+    * odd/even primitives. Determine which case we're in.
+    */
+   bool last = prim == (num_prims - 1);
+   bool first = prim == 0;
+   bool even = (prim & 1) == 0;
+   bool even_or_first = even || first;
+
+   /* When the last vertex is provoking, we rotate the primitives
+    * accordingly. This seems required for OpenGL.
+    */
+   if (!flatshade_first && !even_or_first) {
+      vert = (vert + 4u) % 6u;
+   }
+
+   /* Offsets per the spec. The spec lists 6 cases with 6 offsets. Luckily,
+    * there are lots of patterns we can exploit, avoiding a full 6x6 LUT.
+    *
+    * Here we assume the first vertex is provoking, the Vulkan default.
+    */
+   uint offsets[6] = {
+      0,
+      first ? 1 : (even ? -2 : 3),
+      even_or_first ? 2 : 4,
+      last ? 5 : 6,
+      even_or_first ? 4 : 2,
+      even_or_first ? 3 : -2,
+   };
+
+   /* Ensure NIR can see thru the local array */
+   uint offset = 0;
+   for (uint i = 1; i < 6; ++i) {
+      if (i == vert)
+         offset = offsets[i];
+   }
+
+   /* Finally add to the base of the primitive */
+   return (prim * 2) + offset;
+}
+
+uint
+poly_vertex_id_for_tri_adj_class(enum mesa_prim mode, uint prim, uint vert,
+                                 uint nr, bool flatshade_first)
+{
+   /* Tri adj list or tri adj strip */
+   if (mode == MESA_PRIM_TRIANGLE_STRIP_ADJACENCY) {
+      return vertex_id_for_tri_strip_adj(prim, vert, nr, flatshade_first);
+   } else {
+      return (6 * prim) + vert;
+   }
+}
+
+static uint
+vertex_id_for_topology(enum mesa_prim mode, bool flatshade_first, uint prim,
+                       uint vert, uint num_prims)
+{
+   switch (mode) {
+   case MESA_PRIM_POINTS:
+   case MESA_PRIM_LINES:
+   case MESA_PRIM_TRIANGLES:
+   case MESA_PRIM_LINES_ADJACENCY:
+   case MESA_PRIM_TRIANGLES_ADJACENCY:
+      /* Regular primitive: every N vertices defines a primitive */
+      return (prim * mesa_vertices_per_prim(mode)) + vert;
+
+   case MESA_PRIM_LINE_LOOP:
+      return vertex_id_for_line_loop(prim, vert, num_prims);
+
+   case MESA_PRIM_LINE_STRIP:
+   case MESA_PRIM_LINE_STRIP_ADJACENCY:
+      /* (i, i + 1) or (i, ..., i + 3) */
+      return prim + vert;
+
+   case MESA_PRIM_TRIANGLE_STRIP: {
+      /* Order depends on the provoking vert.
+       *
+       * First: (0, 1, 2), (1, 3, 2), (2, 3, 4).
+       * Last:  (0, 1, 2), (2, 1, 3), (2, 3, 4).
+       *
+       * Pull the (maybe swapped) vert from the corresponding primitive
+       */
+      return prim + map_vertex_in_tri_strip(prim, vert, flatshade_first);
+   }
+
+   case MESA_PRIM_TRIANGLE_FAN:
+      return vertex_id_for_tri_fan(prim, vert, flatshade_first);
+
+   case MESA_PRIM_TRIANGLE_STRIP_ADJACENCY:
+      return vertex_id_for_tri_strip_adj(prim, vert, num_prims,
+                                         flatshade_first);
+
+   default:
+      return 0;
+   }
+}
+
+uint
+poly_map_to_line_adj(uint id)
+{
+   /* Sequence (1, 2), (5, 6), (9, 10), ...
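+    *
+    * (ids 0..5 map to 1, 2, 5, 6, 9, 10: each line keeps the two interior
+    * vertices of a 4-vertex lines-adjacency primitive.)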
+    */
+   return ((id & ~1) * 2) + (id & 1) + 1;
+}
+
+uint
+poly_map_to_line_strip_adj(uint id)
+{
+   /* Sequence (1, 2), (2, 3), (3, 4), .. */
+   uint prim = id / 2;
+   uint vert = id & 1;
+   return prim + vert + 1;
+}
+
+uint
+poly_map_to_tri_strip_adj(uint id)
+{
+   /* Sequence (0, 2, 4), (2, 6, 4), (4, 6, 8), (6, 10, 8)
+    *
+    * Although tri strips with adjacency have 6 cases in general, after
+    * disregarding the vertices only available in a geometry shader, there are
+    * only even/odd cases. In other words, it's just a triangle strip subject to
+    * extra padding.
+    *
+    * Dividing through by two, the sequence is:
+    *
+    * (0, 1, 2), (1, 3, 2), (2, 3, 4), (3, 5, 4)
+    */
+   uint prim = id / 3;
+   uint vtx = id % 3;
+
+   /* Flip the winding order of odd triangles */
+   if ((prim % 2) == 1) {
+      if (vtx == 1)
+         vtx = 2;
+      else if (vtx == 2)
+         vtx = 1;
+   }
+
+   return 2 * (prim + vtx);
+}
+
+uint
+poly_load_index_buffer(constant struct poly_ia_state *p, uint id,
+                       uint index_size)
+{
+   return poly_load_index(p->index_buffer, p->index_buffer_range_el, id,
+                          index_size);
+}
+
+static uint
+setup_xfb_buffer(global struct poly_geometry_params *p, uint i, uint stride,
+                 uint max_output_end, uint vertices_per_prim)
+{
+   uint xfb_offset = *(p->xfb_offs_ptrs[i]);
+   p->xfb_base[i] = p->xfb_base_original[i] + xfb_offset;
+
+   /* Let output_end = output_offset + output_size.
+    *
+    * Primitive P will write up to (but not including) offset:
+    *
+    *    xfb_offset + ((P - 1) * (verts_per_prim * stride))
+    *               + ((verts_per_prim - 1) * stride)
+    *               + output_end
+    *
+    * To fit all outputs for P, that value must be less than the XFB
+    * buffer size for the output with maximal output_end, as everything
+    * else is constant here across outputs within a buffer/primitive:
+    *
+    *    floor(P) <= (stride + size - xfb_offset - output_end)
+    *                   // (stride * verts_per_prim)
+    */
+   int numer_s = p->xfb_size[i] + (stride - max_output_end) - xfb_offset;
+   uint numer = max(numer_s, 0);
+   return numer / (stride * vertices_per_prim);
+}
+
+void
+poly_write_strip(GLOBAL uint32_t *index_buffer, uint32_t inv_index_offset,
+                 uint32_t prim_index_offset, uint32_t vertex_offset,
+                 uint32_t verts_in_prim, uint3 info)
+{
+   _poly_write_strip(index_buffer, inv_index_offset + prim_index_offset,
+                     vertex_offset, verts_in_prim, info.x, info.y, info.z);
+}
+
+void
+poly_pad_index_gs(global int *index_buffer, uint inv_index_offset,
+                  uint nr_indices, uint alloc)
+{
+   for (uint i = nr_indices; i < alloc; ++i) {
+      index_buffer[inv_index_offset + i] = -1;
+   }
+}
+
+uintptr_t
+poly_vertex_output_address(uintptr_t buffer, uint64_t mask, uint vtx,
+                           gl_varying_slot location)
+{
+   /* Written like this to let address arithmetic work */
+   return buffer + ((uintptr_t)poly_tcs_in_offs_el(vtx, location, mask)) * 16;
+}
+
+uintptr_t
+poly_geometry_input_address(constant struct poly_geometry_params *p, uint vtx,
+                            gl_varying_slot location)
+{
+   return poly_vertex_output_address(p->input_buffer, p->input_mask, vtx,
+                                     location);
+}
+
+unsigned
+poly_input_vertices(constant struct poly_ia_state *ia)
+{
+   return ia->verts_per_instance;
+}
+
+global uint *
+poly_load_xfb_count_address(constant struct poly_geometry_params *p, int index,
+                            int count_words, uint unrolled_id)
+{
+   return &p->count_buffer[(unrolled_id * count_words) + index];
+}
+
+uint
+poly_previous_xfb_primitives(global struct poly_geometry_params *p,
+                             int static_count, int count_index, int count_words,
+                             bool prefix_sum, uint unrolled_id)
+{
+   if (static_count >= 0) {
+      /* If the number of outputted vertices per invocation is known
+       * statically, we can calculate the base.
+       */
+      return unrolled_id * static_count;
+   } else {
+      /* Otherwise, load from the count buffer. Note that the sums are
+       * inclusive, so index 0 is nonzero. This requires a little fixup here. We
+       * use a saturating unsigned subtraction so we don't read out-of-bounds.
+       *
+       * If we didn't prefix sum, there's only one element.
+       */
+      uint prim_minus_1 = prefix_sum ? sub_sat(unrolled_id, 1u) : 0;
+      uint count = p->count_buffer[(prim_minus_1 * count_words) + count_index];
+
+      return unrolled_id == 0 ? 0 : count;
+   }
+}
+
+/* Like u_foreach_bit, specialized for XFB to enable loop unrolling */
+#define poly_foreach_xfb(word, index)                                          \
+   for (uint index = 0; index < 4; ++index)                                    \
+      if (word & BITFIELD_BIT(index))
+
+void
+poly_pre_gs(global struct poly_geometry_params *p, uint streams,
+            uint buffers_written, uint4 buffer_to_stream, int4 count_index,
+            uint4 stride, uint4 output_end, int4 static_count, uint invocations,
+            uint vertices_per_prim, global uint *gs_invocations,
+            global uint *gs_primitives, global uint *c_primitives,
+            global uint *c_invocations)
+{
+   unsigned count_words = !!(count_index[0] >= 0) + !!(count_index[1] >= 0) +
+                          !!(count_index[2] >= 0) + !!(count_index[3] >= 0);
+   bool prefix_sum = count_words && buffers_written;
+   uint unrolled_in_prims = p->input_primitives;
+
+   /* Determine the number of primitives generated in each stream */
+   uint4 in_prims = 0;
+   poly_foreach_xfb(streams, i) {
+      in_prims[i] = poly_previous_xfb_primitives(p, static_count[i],
+                                                 count_index[i], count_words,
+                                                 prefix_sum, unrolled_in_prims);
+
+      *(p->prims_generated_counter[i]) += in_prims[i];
+   }
+
+   uint4 prims = in_prims;
+   uint emitted_prims = prims[0] + prims[1] + prims[2] + prims[3];
+
+   if (buffers_written) {
+      poly_foreach_xfb(buffers_written, i) {
+         uint max_prims =
+            setup_xfb_buffer(p, i, stride[i], output_end[i], vertices_per_prim);
+
+         unsigned stream = buffer_to_stream[i];
+         prims[stream] = min(prims[stream], max_prims);
+      }
+
+      int4 overflow = prims < in_prims;
+
+      poly_foreach_xfb(streams, i) {
+         p->xfb_verts[i] = prims[i] * vertices_per_prim;
+
+         *(p->xfb_overflow[i]) += (bool)overflow[i];
+         *(p->xfb_prims_generated_counter[i]) += prims[i];
+      }
+
+      *(p->xfb_any_overflow) += any(overflow);
+
+      /* Update XFB counters */
+      poly_foreach_xfb(buffers_written, i) {
+         uint32_t prim_stride_B = stride[i] * vertices_per_prim;
+         unsigned stream = buffer_to_stream[i];
+
+         global uint *ptr = p->xfb_offs_ptrs[i];
+
+         ptr = (global uint *)nir_ro_to_rw_poly((uint64_t)ptr);
+         *ptr += prims[stream] * prim_stride_B;
+      }
+   }
+
+   /* The geometry shader is invoked once per primitive (after unrolling
+    * primitive restart). From the spec:
+    *
+    *    In case of instanced geometry shaders (see section 11.3.4.2) the
+    *    geometry shader invocations count is incremented for each separate
+    *    instanced invocation.
+    */
+   *gs_invocations += unrolled_in_prims * invocations;
+   *gs_primitives += emitted_prims;
+
+   /* Clipper queries are not well-defined, so we can emulate them in lots of
+    * silly ways. We need the hardware counters to implement them properly. For
+    * now, just consider all primitives emitted as passing through the clipper.
+    * This satisfies spec text:
+    *
+    *    The number of primitives that reach the primitive clipping stage.
+    *
+    * and
+    *
+    *    If at least one vertex of the primitive lies inside the clipping
+    *    volume, the counter is incremented by one or more. Otherwise, the
+    *    counter is incremented by zero or more.
+    */
+   *c_primitives += emitted_prims;
+   *c_invocations += emitted_prims;
+}
diff --git a/src/poly/cl/meson.build b/src/poly/cl/meson.build
new file mode 100644
index 00000000000..286dcf1c90e
--- /dev/null
+++ b/src/poly/cl/meson.build
@@ -0,0 +1,35 @@
+# Copyright 2024 Valve Corporation
+# Copyright © 2025 Collabora Ltd.
+# SPDX-License-Identifier: MIT
+
+libpoly_shader_files = files(
+  'geometry.cl',
+  'tessellation.cl',
+)
+
+libpoly_shaders_spv = custom_target(
+  input : libpoly_shader_files,
+  output : 'libpoly.spv',
+  command : [
+    prog_mesa_clc, '-o', '@OUTPUT@', '--depfile', '@DEPFILE@',
+    libpoly_shader_files, '--',
+    '-I' + join_paths(meson.project_source_root(), 'include'),
+    '-I' + join_paths(meson.project_source_root(), 'src/compiler/libcl'),
+    '-I' + join_paths(meson.current_source_dir(), '.'),
+    '-I' + join_paths(meson.current_source_dir(), '../../'),
+    cl_args,
+  ],
+  depends : [],
+  depfile : 'libpoly_shaders.h.d',
+)
+
+libpoly_shaders = custom_target(
+  input : libpoly_shaders_spv,
+  output : ['libpoly.cpp', 'libpoly.h'],
+  command : [prog_vtn_bindgen2, libpoly_shaders_spv, '@OUTPUT0@', '@OUTPUT1@'],
)
+
+idep_libpoly = declare_dependency(
+  sources : [libpoly_shaders],
+  include_directories : include_directories('.'),
+)
diff --git a/src/poly/cl/tessellation.cl b/src/poly/cl/tessellation.cl
new file mode 100644
index 00000000000..ed37ca889ba
--- /dev/null
+++ b/src/poly/cl/tessellation.cl
@@ -0,0 +1,133 @@
+/*
+ * Copyright 2023 Alyssa Rosenzweig
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "poly/geometry.h"
+#include "poly/tessellator.h"
+
+uint
+poly_tcs_patch_vertices_in(constant struct poly_tess_args *p)
+{
+   return p->input_patch_size;
+}
+
+uint
+poly_tes_patch_vertices_in(constant struct poly_tess_args *p)
+{
+   return p->output_patch_size;
+}
+
+uint
+poly_tcs_unrolled_id(constant struct poly_tess_args *p, uint3 wg_id)
+{
+   return (wg_id.y * p->patches_per_instance) + wg_id.x;
+}
+
+uint64_t
+poly_tes_buffer(constant struct poly_tess_args *p)
+{
+   return p->tes_buffer;
+}
+
+/*
+ * Helper to lower indexing for a tess eval shader run as a compute shader. This
+ * handles the tess+geom case. This is simpler than the general input assembly
+ * lowering, as we know:
+ *
+ * 1. the index buffer is U32
+ * 2. the index is in bounds
+ *
+ * Therefore we do a simple load. No bounds checking needed.
+ */
+uint32_t
+poly_load_tes_index(constant struct poly_tess_args *p, uint32_t index)
+{
+   /* Swap second and third vertices of each triangle to flip winding order
+    * dynamically if needed.
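+    *
+    * (Illustrative: with ccw set, indices 0..5 fetch index buffer elements
+    * 0, 2, 1, 3, 5, 4, swapping the last two vertices of each triangle.)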
+ */ + if (p->ccw) { + uint id = index % 3; + + if (id == 1) + index++; + else if (id == 2) + index--; + } + + return p->index_buffer[index]; +} + +uintptr_t +poly_tcs_out_address(constant struct poly_tess_args *p, uint patch_id, + uint vtx_id, gl_varying_slot location, uint nr_patch_out, + uint out_patch_size, uint64_t vtx_out_mask) +{ + uint stride_el = + poly_tcs_out_stride_el(nr_patch_out, out_patch_size, vtx_out_mask); + + uint offs_el = + poly_tcs_out_offs_el(vtx_id, location, nr_patch_out, vtx_out_mask); + + offs_el += patch_id * stride_el; + + /* Written to match the AGX addressing mode */ + return (uintptr_t)(p->tcs_buffer) + (((uintptr_t)offs_el) << 2); +} + +static uint +tes_unrolled_patch_id(uint raw_id) +{ + return raw_id / POLY_TES_PATCH_ID_STRIDE; +} + +uint +poly_tes_patch_id(constant struct poly_tess_args *p, uint raw_id) +{ + return tes_unrolled_patch_id(raw_id) % p->patches_per_instance; +} + +static uint +tes_vertex_id_in_patch(uint raw_id) +{ + return raw_id % POLY_TES_PATCH_ID_STRIDE; +} + +float2 +poly_load_tess_coord(constant struct poly_tess_args *p, uint raw_id) +{ + uint patch = tes_unrolled_patch_id(raw_id); + uint vtx = tes_vertex_id_in_patch(raw_id); + + global struct poly_tess_point *t = + &p->patch_coord_buffer[p->coord_allocs[patch] + vtx]; + + /* Written weirdly because NIR struggles with loads of structs */ + uint2 fixed = *((global uint2 *)t); + + /* Convert fixed point to float */ + return convert_float2(fixed) / (1u << 16); +} + +uintptr_t +poly_tes_in_address(constant struct poly_tess_args *p, uint raw_id, uint vtx_id, + gl_varying_slot location) +{ + uint patch = tes_unrolled_patch_id(raw_id); + + return poly_tcs_out_address(p, patch, vtx_id, location, + p->tcs_patch_constants, p->output_patch_size, + p->tcs_per_vertex_outputs); +} + +float4 +poly_tess_level_outer_default(constant struct poly_tess_args *p) +{ + return vload4(0, p->tess_level_outer_default); +} + +float2 +poly_tess_level_inner_default(constant struct poly_tess_args *p) +{ + return vload2(0, p->tess_level_inner_default); +} diff --git a/src/poly/cl/tessellator.h b/src/poly/cl/tessellator.h new file mode 100644 index 00000000000..0dbe5b76d52 --- /dev/null +++ b/src/poly/cl/tessellator.h @@ -0,0 +1,1609 @@ +/* + Copyright (c) Microsoft Corporation + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. 
+*/ + +#ifndef __OPENCL_VERSION__ +#error "Tessellator should only be imported by OpenCL C code" +#endif + +#include "poly/geometry.h" +#include "poly/tessellator.h" +#include "util/u_math.h" + +#pragma once + +#define POLY_TESS_MIN_ISOLINE_DENSITY_TESSELLATION_FACTOR 1.0f +#define POLY_TESS_MAX_ISOLINE_DENSITY_TESSELLATION_FACTOR 64.0f + +typedef unsigned int FXP; // fixed point number + +enum { + U = 0, // points on a tri patch + V = 1, +}; + +enum { + Ueq0 = 0, // edges on a tri patch + Veq0 = 1, + Weq0 = 2, +}; + +enum { + Ueq1 = 2, // edges on a quad patch: Ueq0, Veq0, Ueq1, Veq1 + Veq1 = 3, +}; + +#define QUAD_AXES 2 +#define QUAD_EDGES 4 +#define TRI_EDGES 3 + +// The interior can just use a simpler stitch. +typedef enum DIAGONALS { + DIAGONALS_INSIDE_TO_OUTSIDE, + DIAGONALS_INSIDE_TO_OUTSIDE_EXCEPT_MIDDLE, + DIAGONALS_MIRRORED +} DIAGONALS; + +typedef struct TESS_FACTOR_CONTEXT { + FXP fxpInvNumSegmentsOnFloorTessFactor; + FXP fxpInvNumSegmentsOnCeilTessFactor; + FXP fxpHalfTessFactorFraction; + int numHalfTessFactorPoints; + int splitPointOnFloorHalfTessFactor; +} TESS_FACTOR_CONTEXT; + +struct INDEX_PATCH_CONTEXT { + int insidePointIndexDeltaToRealValue; + int insidePointIndexBadValue; + int insidePointIndexReplacementValue; + int outsidePointIndexPatchBase; + int outsidePointIndexDeltaToRealValue; + int outsidePointIndexBadValue; + int outsidePointIndexReplacementValue; +}; + +struct INDEX_PATCH_CONTEXT2 { + int baseIndexToInvert; + int indexInversionEndPoint; + int cornerCaseBadValue; + int cornerCaseReplacementValue; +}; + +struct CHWTessellator { + enum poly_tess_mode mode; + uint index_bias; + + // array where we will store u/v's for the points we generate + global struct poly_tess_point *Point; + + // array where we will store index topology + global void *Index; + + // A second index patch we have to do handles the leftover strip of quads in + // the middle of an odd quad patch after finishing all the concentric rings. + // This also handles the leftover strip of points in the middle of an even + // quad patch, when stitching the row of triangles up the left side (V major + // quad) or bottom (U major quad) of the inner ring + bool bUsingPatchedIndices; + bool bUsingPatchedIndices2; + struct INDEX_PATCH_CONTEXT IndexPatchCtx; + struct INDEX_PATCH_CONTEXT2 IndexPatchCtx2; +}; + +#define FXP_INTEGER_BITS 15 +#define FXP_FRACTION_BITS 16 +#define FXP_FRACTION_MASK 0x0000ffff +#define FXP_INTEGER_MASK 0x7fff0000 +#define FXP_ONE (1 << FXP_FRACTION_BITS) +#define FXP_ONE_THIRD 0x00005555 +#define FXP_TWO_THIRDS 0x0000aaaa +#define FXP_ONE_HALF 0x00008000 + +static inline global float * +tess_factors(constant struct poly_tess_args *p, uint patch) +{ + return p->tcs_buffer + (patch * p->tcs_stride_el); +} + +/* + * Generate an indexed draw for a patch with the computed number of indices. + * This allocates heap memory for the index buffer, returning the allocated + * memory. + */ +static inline global void * +poly_draw(constant struct poly_tess_args *p, enum poly_tess_mode mode, + bool lines, uint patch, uint count) +{ + if (mode == POLY_TESS_MODE_COUNT) { + p->counts[patch] = count; + } + + if (mode == POLY_TESS_MODE_WITH_COUNTS) { + /* The index buffer is already allocated, get a pointer inside it. + * p->counts has had an inclusive prefix sum hence the subtraction. 
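+       *
+       * (Illustrative: raw per-patch counts (6, 0, 3) become (6, 6, 9) after
+       * the inclusive sum, so patch 0 starts at element 0, patch 1 at
+       * counts[0] = 6, and patch 2 at counts[1] = 6.)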
+ */ + uint offset_el = p->counts[sub_sat(patch, 1u)]; + if (patch == 0) + offset_el = 0; + + return &p->index_buffer[offset_el]; + } + + return NULL; +} + +static inline void +poly_draw_points(private struct CHWTessellator *ctx, + constant struct poly_tess_args *p, uint patch, uint count) +{ + /* For points mode with a single draw, we need to generate a trivial index + * buffer to stuff in the patch ID in the right place. + */ + global uint32_t *indices = poly_draw(p, ctx->mode, false, patch, count); + + if (ctx->mode == POLY_TESS_MODE_COUNT) + return; + + for (int i = 0; i < count; ++i) { + indices[i] = ctx->index_bias + i; + } +} + +static inline void +poly_draw_empty(constant struct poly_tess_args *p, enum poly_tess_mode mode, + uint patch) +{ + if (mode == POLY_TESS_MODE_COUNT) { + p->counts[patch] = 0; + } +} + +/* + * Allocate heap memory for domain points for a patch. The allocation + * is recorded in the coord_allocs[] array, which is in elements. + */ +static inline global struct poly_tess_point * +poly_heap_alloc_points(constant struct poly_tess_args *p, uint patch, + uint count) +{ + /* If we're recording statistics, increment now. The statistic is for + * tessellation evaluation shader invocations, which is equal to the number + * of domain points generated. + */ + if (p->statistic) { + atomic_fetch_add((volatile atomic_uint *)(p->statistic), count); + } + + uint32_t elsize_B = sizeof(struct poly_tess_point); + uint32_t alloc_B = poly_heap_alloc_atomic_offs(p->heap, elsize_B * count); + uint32_t alloc_el = alloc_B / elsize_B; + + p->coord_allocs[patch] = alloc_el; + return (global struct poly_tess_point *)(((uintptr_t)p->heap->base) + + alloc_B); +} + +// Microsoft D3D11 Fixed Function Tessellator Reference - May 7, 2012 +// amar.patel@microsoft.com + +#define POLY_TESS_MIN_ODD_TESSELLATION_FACTOR 1 +#define POLY_TESS_MAX_ODD_TESSELLATION_FACTOR 63 +#define POLY_TESS_MIN_EVEN_TESSELLATION_FACTOR 2 +#define POLY_TESS_MAX_EVEN_TESSELLATION_FACTOR 64 + +// 2^(-16), min positive fixed point fraction +#define EPSILON 0.0000152587890625f +#define MIN_ODD_TESSFACTOR_PLUS_HALF_EPSILON \ + (POLY_TESS_MIN_ODD_TESSELLATION_FACTOR + EPSILON / 2) + +static inline float +clamp_factor(float factor, enum poly_tess_partitioning partitioning, float maxf) +{ + float lower = (partitioning == POLY_TESS_PARTITIONING_FRACTIONAL_EVEN) + ? POLY_TESS_MIN_EVEN_TESSELLATION_FACTOR + : POLY_TESS_MIN_ODD_TESSELLATION_FACTOR; + + float upper = (partitioning == POLY_TESS_PARTITIONING_FRACTIONAL_ODD) + ? POLY_TESS_MAX_ODD_TESSELLATION_FACTOR + : POLY_TESS_MAX_EVEN_TESSELLATION_FACTOR; + + // If any TessFactor will end up > 1 after floatToFixed conversion later, + // then force the inside TessFactors to be > 1 so there is a picture frame. 
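+   // (Illustrative: with fractional_odd partitioning and maxf = 1.5, an
+   // inside factor of 1.0 is clamped to 1.0 + EPSILON, which still converts
+   // to a fixed-point value > 1, so an interior ring is generated.)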
+   if (partitioning == POLY_TESS_PARTITIONING_FRACTIONAL_ODD &&
+       maxf > MIN_ODD_TESSFACTOR_PLUS_HALF_EPSILON) {
+
+      lower = POLY_TESS_MIN_ODD_TESSELLATION_FACTOR + EPSILON;
+   }
+
+   factor = clamp(factor, lower, upper);
+
+   if (partitioning == POLY_TESS_PARTITIONING_INTEGER) {
+      factor = ceil(factor);
+   }
+
+   return factor;
+}
+
+static inline FXP
+floatToFixed(const float input)
+{
+   return mad(input, FXP_ONE, 0.5f);
+}
+
+static inline bool
+isOdd(const float input)
+{
+   return ((int)input) & 1;
+}
+
+static inline FXP
+fxpCeil(const FXP input)
+{
+   if (input & FXP_FRACTION_MASK) {
+      return (input & FXP_INTEGER_MASK) + FXP_ONE;
+   }
+   return input;
+}
+
+static inline FXP
+fxpFloor(const FXP input)
+{
+   return (input & FXP_INTEGER_MASK);
+}
+
+static inline int
+PatchIndexValue(private struct CHWTessellator *ctx, int index)
+{
+   if (ctx->bUsingPatchedIndices) {
+      // assumed remapped outside indices are > remapped inside vertices
+      if (index >= ctx->IndexPatchCtx.outsidePointIndexPatchBase) {
+         if (index == ctx->IndexPatchCtx.outsidePointIndexBadValue)
+            return ctx->IndexPatchCtx.outsidePointIndexReplacementValue;
+         else
+            return index + ctx->IndexPatchCtx.outsidePointIndexDeltaToRealValue;
+      } else {
+         if (index == ctx->IndexPatchCtx.insidePointIndexBadValue)
+            return ctx->IndexPatchCtx.insidePointIndexReplacementValue;
+         else
+            return index + ctx->IndexPatchCtx.insidePointIndexDeltaToRealValue;
+      }
+   } else if (ctx->bUsingPatchedIndices2) {
+      if (index == ctx->IndexPatchCtx2.cornerCaseBadValue) {
+         return ctx->IndexPatchCtx2.cornerCaseReplacementValue;
+      } else if (index >= ctx->IndexPatchCtx2.baseIndexToInvert) {
+         return ctx->IndexPatchCtx2.indexInversionEndPoint - index;
+      }
+   }
+
+   return index;
+}
+
+static inline void
+DefinePoint(global struct poly_tess_point *out, FXP fxpU, FXP fxpV)
+{
+   out->u = fxpU;
+   out->v = fxpV;
+}
+
+static inline void
+DefineIndex(private struct CHWTessellator *ctx, int index,
+            int indexStorageOffset)
+{
+   global uint32_t *indices = (global uint32_t *)ctx->Index;
+   indices[indexStorageOffset] = ctx->index_bias + PatchIndexValue(ctx, index);
+}
+
+static inline void
+DefineTriangle(private struct CHWTessellator *ctx, int index0, int index1,
+               int index2, int indexStorageBaseOffset)
+{
+   index0 = PatchIndexValue(ctx, index0);
+   index1 = PatchIndexValue(ctx, index1);
+   index2 = PatchIndexValue(ctx, index2);
+
+   vstore3(ctx->index_bias + (uint3)(index0, index1, index2), 0,
+           (global uint *)ctx->Index + indexStorageBaseOffset);
+}
+
+static inline uint32_t
+RemoveMSB(uint32_t val)
+{
+   uint32_t bit = val ? (1 << (31 - clz(val))) : 0;
+   return val & ~bit;
+}
+
+static inline int
+NumPointsForTessFactor(bool odd, FXP fxpTessFactor)
+{
+   // Add epsilon for rounding and add 1 for odd
+   FXP f = fxpTessFactor + (odd ? (FXP_ONE + 1) : 1);
+   int r = fxpCeil(f / 2) >> (FXP_FRACTION_BITS - 1);
+   return odd ? r : r + 1;
+}
+
+static inline void
+ComputeTessFactorCtx(bool odd, FXP fxpTessFactor,
+                     private TESS_FACTOR_CONTEXT *TessFactorCtx)
+{
+   // fxpHalfTessFactor == 1/2 if TessFactor is 1,
+   // but we're pretending we are even.
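+   // (Worked example, for intuition: an even TessFactor of 4.0 gives
+   // fxpHalfTessFactor == 2.0 exactly, so the fraction is zero,
+   // numHalfTessFactorPoints == 2, and both halves resolve to 4 segments;
+   // PlacePointIn1D() then yields the uniform positions 0, 1/4, 1/2, 3/4, 1.)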
+ FXP fxpHalfTessFactor = (fxpTessFactor + 1 /*round*/) / 2; + if (odd || (fxpHalfTessFactor == FXP_ONE_HALF)) { + fxpHalfTessFactor += FXP_ONE_HALF; + } + FXP fxpFloorHalfTessFactor = fxpFloor(fxpHalfTessFactor); + FXP fxpCeilHalfTessFactor = fxpCeil(fxpHalfTessFactor); + TessFactorCtx->fxpHalfTessFactorFraction = fxpHalfTessFactor - fxpFloorHalfTessFactor; + TessFactorCtx->numHalfTessFactorPoints = + (fxpCeilHalfTessFactor >> FXP_FRACTION_BITS); // for EVEN, we don't include the point always + // fixed at the midpoint of the TessFactor + if (fxpCeilHalfTessFactor == fxpFloorHalfTessFactor) { + TessFactorCtx->splitPointOnFloorHalfTessFactor = + /*pick value to cause this to be ignored*/ TessFactorCtx->numHalfTessFactorPoints + 1; + } else if (odd) { + if (fxpFloorHalfTessFactor == FXP_ONE) { + TessFactorCtx->splitPointOnFloorHalfTessFactor = 0; + } else { + TessFactorCtx->splitPointOnFloorHalfTessFactor = + (RemoveMSB((fxpFloorHalfTessFactor >> FXP_FRACTION_BITS) - 1) << 1) + 1; + } + } else { + TessFactorCtx->splitPointOnFloorHalfTessFactor = + (RemoveMSB(fxpFloorHalfTessFactor >> FXP_FRACTION_BITS) << 1) + 1; + } + int numFloorSegments = (fxpFloorHalfTessFactor * 2) >> FXP_FRACTION_BITS; + int numCeilSegments = (fxpCeilHalfTessFactor * 2) >> FXP_FRACTION_BITS; + if (odd) { + numFloorSegments -= 1; + numCeilSegments -= 1; + } + TessFactorCtx->fxpInvNumSegmentsOnFloorTessFactor = + floatToFixed(1.0f / (float)numFloorSegments); + TessFactorCtx->fxpInvNumSegmentsOnCeilTessFactor = + floatToFixed(1.0f / (float)numCeilSegments); +} + +static inline FXP +PlacePointIn1D(private const TESS_FACTOR_CONTEXT *TessFactorCtx, bool odd, + int point) +{ + bool bFlip = point >= TessFactorCtx->numHalfTessFactorPoints; + + if (bFlip) { + point = (TessFactorCtx->numHalfTessFactorPoints << 1) - point - odd; + } + + // special casing middle since 16 bit fixed math below can't reproduce 0.5 exactly + if (point == TessFactorCtx->numHalfTessFactorPoints) + return FXP_ONE_HALF; + + unsigned int indexOnCeilHalfTessFactor = point; + unsigned int indexOnFloorHalfTessFactor = indexOnCeilHalfTessFactor; + if (point > TessFactorCtx->splitPointOnFloorHalfTessFactor) { + indexOnFloorHalfTessFactor -= 1; + } + // For the fixed point multiplies below, we know the results are <= 16 bits + // because the locations on the halfTessFactor are <= half the number of + // segments for the total TessFactor. So a number divided by a number that + // is at least twice as big will give a result no bigger than 0.5 (which in + // fixed point is 16 bits in our case) + FXP fxpLocationOnFloorHalfTessFactor = + indexOnFloorHalfTessFactor * TessFactorCtx->fxpInvNumSegmentsOnFloorTessFactor; + FXP fxpLocationOnCeilHalfTessFactor = + indexOnCeilHalfTessFactor * TessFactorCtx->fxpInvNumSegmentsOnCeilTessFactor; + + // Since we know the numbers calculated above are <= fixed point 0.5, and the + // equation below is just lerping between two values <= fixed point 0.5 + // (0x00008000), then we know that the final result before shifting by 16 bits + // is no larger than 0x80000000. 
Once we shift that down by 16, we get the + // result of lerping 2 numbers <= 0.5, which is obviously at most 0.5 + // (0x00008000) + FXP fxpLocation = + fxpLocationOnFloorHalfTessFactor * (FXP_ONE - TessFactorCtx->fxpHalfTessFactorFraction) + + fxpLocationOnCeilHalfTessFactor * (TessFactorCtx->fxpHalfTessFactorFraction); + fxpLocation = (fxpLocation + FXP_ONE_HALF /*round*/) >> FXP_FRACTION_BITS; // get back to n.16 + if (bFlip) { + fxpLocation = FXP_ONE - fxpLocation; + } + return fxpLocation; +} + +static inline void +StitchRegular(private struct CHWTessellator *ctx, bool bTrapezoid, + DIAGONALS diagonals, int baseIndexOffset, int numInsideEdgePoints, + int insideEdgePointBaseOffset, int outsideEdgePointBaseOffset) +{ + int insidePoint = insideEdgePointBaseOffset; + int outsidePoint = outsideEdgePointBaseOffset; + if (bTrapezoid) { + DefineTriangle(ctx, outsidePoint, outsidePoint + 1, insidePoint, + baseIndexOffset); + baseIndexOffset += 3; + outsidePoint++; + } + int p; + switch (diagonals) { + case DIAGONALS_INSIDE_TO_OUTSIDE: + // Diagonals pointing from inside edge forward towards outside edge + for (p = 0; p < numInsideEdgePoints - 1; p++) { + DefineTriangle(ctx, insidePoint, outsidePoint, outsidePoint + 1, + baseIndexOffset); + baseIndexOffset += 3; + + DefineTriangle(ctx, insidePoint, outsidePoint + 1, insidePoint + 1, + baseIndexOffset); + baseIndexOffset += 3; + insidePoint++; + outsidePoint++; + } + break; + case DIAGONALS_INSIDE_TO_OUTSIDE_EXCEPT_MIDDLE: // Assumes ODD tessellation + // Diagonals pointing from outside edge forward towards inside edge + + // First half + for (p = 0; p < numInsideEdgePoints / 2 - 1; p++) { + DefineTriangle(ctx, outsidePoint, outsidePoint + 1, insidePoint, + baseIndexOffset); + baseIndexOffset += 3; + DefineTriangle(ctx, insidePoint, outsidePoint + 1, insidePoint + 1, + baseIndexOffset); + baseIndexOffset += 3; + insidePoint++; + outsidePoint++; + } + + // Middle + DefineTriangle(ctx, outsidePoint, insidePoint + 1, insidePoint, + baseIndexOffset); + baseIndexOffset += 3; + DefineTriangle(ctx, outsidePoint, outsidePoint + 1, insidePoint + 1, + baseIndexOffset); + baseIndexOffset += 3; + insidePoint++; + outsidePoint++; + p += 2; + + // Second half + for (; p < numInsideEdgePoints; p++) { + DefineTriangle(ctx, outsidePoint, outsidePoint + 1, insidePoint, + baseIndexOffset); + baseIndexOffset += 3; + DefineTriangle(ctx, insidePoint, outsidePoint + 1, insidePoint + 1, + baseIndexOffset); + baseIndexOffset += 3; + insidePoint++; + outsidePoint++; + } + break; + case DIAGONALS_MIRRORED: + // First half, diagonals pointing from outside of outside edge to inside of + // inside edge + for (p = 0; p < numInsideEdgePoints / 2; p++) { + DefineTriangle(ctx, outsidePoint, insidePoint + 1, insidePoint, + baseIndexOffset); + baseIndexOffset += 3; + DefineTriangle(ctx, outsidePoint, outsidePoint + 1, insidePoint + 1, + baseIndexOffset); + baseIndexOffset += 3; + insidePoint++; + outsidePoint++; + } + // Second half, diagonals pointing from inside of inside edge to outside of + // outside edge + for (; p < numInsideEdgePoints - 1; p++) { + DefineTriangle(ctx, insidePoint, outsidePoint, outsidePoint + 1, + baseIndexOffset); + baseIndexOffset += 3; + DefineTriangle(ctx, insidePoint, outsidePoint + 1, insidePoint + 1, + baseIndexOffset); + baseIndexOffset += 3; + insidePoint++; + outsidePoint++; + } + break; + } + if (bTrapezoid) { + DefineTriangle(ctx, outsidePoint, outsidePoint + 1, insidePoint, + baseIndexOffset); + baseIndexOffset += 3; + } +} + +// loop_start 
and loop_end give optimal loop bounds for
+// the stitching algorithm further below, for any given halfTessFactor. There
+// is probably a better way to encode this...
+//
+// Return the FIRST entry in finalPointPositionTable which is less than
+// halfTessFactor, except entry 0 and 1 which are set up to skip the loop.
+static inline int
+loop_start(int N)
+{
+   if (N < 2)
+      return 1;
+   else if (N == 2)
+      return 17;
+   else if (N < 5)
+      return 9;
+   else if (N < 9)
+      return 5;
+   else if (N < 17)
+      return 3;
+   else
+      return 2;
+}
+
+// Return the LAST entry in finalPointPositionTable[] which is less than
+// halfTessFactor, except entry 0 and 1 which are set up to skip the loop.
+static int
+loop_end(int N)
+{
+   if (N < 2)
+      return 0;
+   else if (N < 4)
+      return 17;
+   else if (N < 8)
+      return 25;
+   else if (N < 16)
+      return 29;
+   else if (N < 32)
+      return 31;
+   else
+      return 32;
+}
+
+// Tables to assist in the stitching of 2 rows of points having arbitrary
+// TessFactors. The stitching order is governed by Ruler Function vertex
+// split ordering (see external documentation).
+//
+// The contents of the finalPointPositionTable are where vertex i [0..33]
+// ends up on the half-edge at the max tessellation amount given
+// ruler-function split order. Recall the other half of an edge is mirrored,
+// so we only need to deal with one half. This table is used to decide when
+// to advance a point on the interior or exterior. It supports odd TessFactor
+// up to 65 and even TessFactor up to 64.
+
+/* TODO: Is this actually faster than a LUT? */
+static inline uint32_t
+finalPointPositionTable(uint32_t x)
+{
+   if (x == 0)
+      return 0;
+   if (x == 1)
+      return 0x20;
+
+   uint32_t shift;
+   if ((x & 1) == 0) {
+      shift = 1;
+   } else if ((x & 3) == 3) {
+      shift = 2;
+   } else if ((x & 7) == 5) {
+      shift = 3;
+   } else if (x != 17) {
+      shift = 4;
+   } else {
+      shift = 5;
+   }
+
+   // SWAR vectorized right-shift of (0x20, x)
+   // We're calculating `min(0xf, 0x20 >> shift) + (x >> shift)`.
+   uint32_t items_to_shift = x | (0x20 << 16);
+   uint32_t shifted = items_to_shift >> shift;
+
+   uint32_t bias = min(0xfu, shifted >> 16);
+   return bias + (shifted & 0xffff);
+}
+
+static inline void
+StitchTransition(private struct CHWTessellator *ctx, int baseIndexOffset,
+                 int insideEdgePointBaseOffset,
+                 int insideNumHalfTessFactorPoints,
+                 bool insideEdgeTessFactorOdd, int outsideEdgePointBaseOffset,
+                 int outsideNumHalfTessFactorPoints, bool outsideTessFactorOdd)
+{
+   if (insideEdgeTessFactorOdd) {
+      insideNumHalfTessFactorPoints -= 1;
+   }
+   if (outsideTessFactorOdd) {
+      outsideNumHalfTessFactorPoints -= 1;
+   }
+   // Walk first half
+   int outsidePoint = outsideEdgePointBaseOffset;
+   int insidePoint = insideEdgePointBaseOffset;
+
+   // iStart,iEnd are a small optimization so the loop below doesn't have to go
+   // from 0 up to 31
+   int iStart = min(loop_start(insideNumHalfTessFactorPoints),
+                    loop_start(outsideNumHalfTessFactorPoints));
+   int iEnd = loop_end(
+      max(insideNumHalfTessFactorPoints, outsideNumHalfTessFactorPoints));
+
+   // since we don't start the loop at 0 below, we need a special case.
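+   // (finalPointPositionTable maps i == 0 to 0, which would advance on every
+   // iteration, and i == 1 to 32, which never can for the supported factors;
+   // hence iStart >= 1 and the explicit first outside-edge advance here.)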
+ if (0 < outsideNumHalfTessFactorPoints) { + // Advance outside + DefineTriangle(ctx, outsidePoint, outsidePoint + 1, insidePoint, + baseIndexOffset); + baseIndexOffset += 3; + outsidePoint++; + } + + for (int i = iStart; i <= iEnd; i++) { + int bound = finalPointPositionTable(i); + + if (bound < insideNumHalfTessFactorPoints) { + // Advance inside + DefineTriangle(ctx, insidePoint, outsidePoint, insidePoint + 1, + baseIndexOffset); + baseIndexOffset += 3; + insidePoint++; + } + if (bound < outsideNumHalfTessFactorPoints) { + // Advance outside + DefineTriangle(ctx, outsidePoint, outsidePoint + 1, insidePoint, + baseIndexOffset); + baseIndexOffset += 3; + outsidePoint++; + } + } + + if ((insideEdgeTessFactorOdd != outsideTessFactorOdd) || + insideEdgeTessFactorOdd) { + if (insideEdgeTessFactorOdd == outsideTessFactorOdd) { + // Quad in the middle + DefineTriangle(ctx, insidePoint, outsidePoint, insidePoint + 1, + baseIndexOffset); + baseIndexOffset += 3; + DefineTriangle(ctx, insidePoint + 1, outsidePoint, outsidePoint + 1, + baseIndexOffset); + baseIndexOffset += 3; + insidePoint++; + outsidePoint++; + } else if (!insideEdgeTessFactorOdd) { + // Triangle pointing inside + DefineTriangle(ctx, insidePoint, outsidePoint, outsidePoint + 1, + baseIndexOffset); + baseIndexOffset += 3; + outsidePoint++; + } else { + // Triangle pointing outside + DefineTriangle(ctx, insidePoint, outsidePoint, insidePoint + 1, + baseIndexOffset); + baseIndexOffset += 3; + insidePoint++; + } + } + + // Walk second half. + for (int i = iEnd; i >= iStart; i--) { + int bound = finalPointPositionTable(i); + + if (bound < outsideNumHalfTessFactorPoints) { + // Advance outside + DefineTriangle(ctx, outsidePoint, outsidePoint + 1, insidePoint, + baseIndexOffset); + baseIndexOffset += 3; + outsidePoint++; + } + if (bound < insideNumHalfTessFactorPoints) { + // Advance inside + DefineTriangle(ctx, insidePoint, outsidePoint, insidePoint + 1, + baseIndexOffset); + baseIndexOffset += 3; + insidePoint++; + } + } + // Below case is not needed if we didn't optimize loop above and made it run + // from 31 down to 0. + if (0 < outsideNumHalfTessFactorPoints) { + DefineTriangle(ctx, outsidePoint, outsidePoint + 1, insidePoint, + baseIndexOffset); + baseIndexOffset += 3; + outsidePoint++; + } +} + +static inline void +poly_tess_isoline_process(constant struct poly_tess_args *p, uint32_t patch, + enum poly_tess_mode mode) +{ + enum poly_tess_partitioning partitioning = p->partitioning; + + bool lineDensityOdd; + bool lineDetailOdd; + TESS_FACTOR_CONTEXT lineDensityTessFactorCtx; + TESS_FACTOR_CONTEXT lineDetailTessFactorCtx; + + global float *factors = tess_factors(p, patch); + float TessFactor_V_LineDensity = factors[0]; + float TessFactor_U_LineDetail = factors[1]; + + // Is the patch culled? NaN will pass. 
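+   // (!(x > 0) is also true when x is NaN, so a NaN tess factor culls the
+   // patch rather than drawing with a garbage factor.)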
+ if (!(TessFactor_V_LineDensity > 0) || !(TessFactor_U_LineDetail > 0)) { + poly_draw_empty(p, mode, patch); + return; + } + + // Clamp edge TessFactors + TessFactor_V_LineDensity = + clamp(TessFactor_V_LineDensity, + POLY_TESS_MIN_ISOLINE_DENSITY_TESSELLATION_FACTOR, + POLY_TESS_MAX_ISOLINE_DENSITY_TESSELLATION_FACTOR); + TessFactor_U_LineDetail = + clamp_factor(TessFactor_U_LineDetail, partitioning, 0); + + // Process tessFactors + if (partitioning == POLY_TESS_PARTITIONING_INTEGER) { + lineDetailOdd = isOdd(TessFactor_U_LineDetail); + } else { + lineDetailOdd = (partitioning == POLY_TESS_PARTITIONING_FRACTIONAL_ODD); + } + + FXP fxpTessFactor_U_LineDetail = floatToFixed(TessFactor_U_LineDetail); + + ComputeTessFactorCtx(lineDetailOdd, fxpTessFactor_U_LineDetail, + &lineDetailTessFactorCtx); + int numPointsPerLine = + NumPointsForTessFactor(lineDetailOdd, fxpTessFactor_U_LineDetail); + + TessFactor_V_LineDensity = ceil(TessFactor_V_LineDensity); + lineDensityOdd = isOdd(TessFactor_V_LineDensity); + FXP fxpTessFactor_V_LineDensity = floatToFixed(TessFactor_V_LineDensity); + ComputeTessFactorCtx(lineDensityOdd, fxpTessFactor_V_LineDensity, + &lineDensityTessFactorCtx); + + // don't draw last line at V == 1. + int numLines = + NumPointsForTessFactor(lineDensityOdd, fxpTessFactor_V_LineDensity) - 1; + + /* Points */ + uint num_points = numPointsPerLine * numLines; + if (mode != POLY_TESS_MODE_COUNT) { + global struct poly_tess_point *points = + poly_heap_alloc_points(p, patch, num_points); + + for (int line = 0, pointOffset = 0; line < numLines; line++) { + FXP fxpV = + PlacePointIn1D(&lineDensityTessFactorCtx, lineDensityOdd, line); + + for (int point = 0; point < numPointsPerLine; point++) { + FXP fxpU = + PlacePointIn1D(&lineDetailTessFactorCtx, lineDetailOdd, point); + + DefinePoint(&points[pointOffset++], fxpU, fxpV); + } + } + } + + struct CHWTessellator ctx = { + .mode = mode, + .index_bias = patch * POLY_TES_PATCH_ID_STRIDE, + }; + + /* Connectivity */ + if (!p->points_mode) { + uint num_indices = numLines * (numPointsPerLine - 1) * 2; + ctx.Index = poly_draw(p, mode, true, patch, num_indices); + + if (mode == POLY_TESS_MODE_COUNT) + return; + + for (int line = 0, pointOffset = 0, indexOffset = 0; line < numLines; + line++) { + pointOffset++; + + for (int point = 1; point < numPointsPerLine; point++) { + DefineIndex(&ctx, pointOffset - 1, indexOffset++); + DefineIndex(&ctx, pointOffset, indexOffset++); + pointOffset++; + } + } + } else { + poly_draw_points(&ctx, p, patch, num_points); + } +} + +static inline void +poly_tess_tri_process(constant struct poly_tess_args *p, uint32_t patch, + enum poly_tess_mode mode) +{ + enum poly_tess_partitioning partitioning = p->partitioning; + + global float *factors = tess_factors(p, patch); + float tessFactor_Ueq0 = factors[0]; + float tessFactor_Veq0 = factors[1]; + float tessFactor_Weq0 = factors[2]; + float insideTessFactor_f = factors[4]; + + struct CHWTessellator ctx = { + .mode = mode, + .index_bias = patch * POLY_TES_PATCH_ID_STRIDE, + }; + + // Is the patch culled? NaN will pass. 
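+   // Only the three outer factors participate in culling; the inside factor
+   // (factors[4]; factors[3] is unused for triangles) is instead clamped
+   // below, where NaN maps to the lower bound.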
+ if (!(tessFactor_Ueq0 > 0) || !(tessFactor_Veq0 > 0) || + !(tessFactor_Weq0 > 0)) { + + poly_draw_empty(p, mode, patch); + + return; + } + + FXP outsideTessFactor[TRI_EDGES]; + FXP insideTessFactor; + bool outsideTessFactorOdd[TRI_EDGES]; + bool insideTessFactorOdd; + TESS_FACTOR_CONTEXT outsideTessFactorCtx[TRI_EDGES]; + TESS_FACTOR_CONTEXT insideTessFactorCtx; + // Stuff below is just specific to the traversal order + // this code happens to use to generate points/lines + int numPointsForOutsideEdge[TRI_EDGES]; + int numPointsForInsideTessFactor; + int insideEdgePointBaseOffset; + + // Clamp TessFactors + tessFactor_Ueq0 = clamp_factor(tessFactor_Ueq0, partitioning, 0); + tessFactor_Veq0 = clamp_factor(tessFactor_Veq0, partitioning, 0); + tessFactor_Weq0 = clamp_factor(tessFactor_Weq0, partitioning, 0); + + float maxf = max(max(tessFactor_Ueq0, tessFactor_Veq0), tessFactor_Weq0); + insideTessFactor_f = clamp_factor(insideTessFactor_f, partitioning, maxf); + // Note the above clamps map NaN to the lower bound + + // Process tessFactors + float outsideTessFactor_f[TRI_EDGES] = {tessFactor_Ueq0, tessFactor_Veq0, + tessFactor_Weq0}; + if (partitioning == POLY_TESS_PARTITIONING_INTEGER) { + for (int edge = 0; edge < TRI_EDGES; edge++) { + outsideTessFactorOdd[edge] = isOdd(outsideTessFactor_f[edge]); + } + insideTessFactorOdd = + isOdd(insideTessFactor_f) && (1.0f != insideTessFactor_f); + } else { + bool odd = (partitioning == POLY_TESS_PARTITIONING_FRACTIONAL_ODD); + + for (int edge = 0; edge < TRI_EDGES; edge++) { + outsideTessFactorOdd[edge] = odd; + } + insideTessFactorOdd = odd; + } + + // Save fixed point TessFactors + for (int edge = 0; edge < TRI_EDGES; edge++) { + outsideTessFactor[edge] = floatToFixed(outsideTessFactor_f[edge]); + } + insideTessFactor = floatToFixed(insideTessFactor_f); + + if (partitioning != POLY_TESS_PARTITIONING_FRACTIONAL_EVEN) { + // Special case if all TessFactors are 1 + if ((FXP_ONE == insideTessFactor) && + (FXP_ONE == outsideTessFactor[Ueq0]) && + (FXP_ONE == outsideTessFactor[Veq0]) && + (FXP_ONE == outsideTessFactor[Weq0])) { + + /* Just do minimum tess factor */ + if (mode == POLY_TESS_MODE_COUNT) { + p->counts[patch] = 3; + return; + } + + global struct poly_tess_point *points = + poly_heap_alloc_points(p, patch, 3); + + DefinePoint(&points[0], 0, + FXP_ONE); // V=1 (beginning of Ueq0 edge VW) + DefinePoint(&points[1], 0, 0); // W=1 (beginning of Veq0 edge WU) + DefinePoint(&points[2], FXP_ONE, + 0); // U=1 (beginning of Weq0 edge UV) + + if (!p->points_mode) { + ctx.Index = poly_draw(p, mode, false, patch, 3); + + DefineTriangle(&ctx, 0, 1, 2, + /*indexStorageBaseOffset*/ 0); + } else { + poly_draw_points(&ctx, p, patch, 3); + } + + return; + } + } + + // Compute per-TessFactor metadata + for (int edge = 0; edge < TRI_EDGES; edge++) { + ComputeTessFactorCtx(outsideTessFactorOdd[edge], outsideTessFactor[edge], + &outsideTessFactorCtx[edge]); + } + ComputeTessFactorCtx(insideTessFactorOdd, insideTessFactor, + &insideTessFactorCtx); + + // Compute some initial data. + int NumPoints = 0; + + // outside edge offsets and storage + for (int edge = 0; edge < TRI_EDGES; edge++) { + numPointsForOutsideEdge[edge] = NumPointsForTessFactor( + outsideTessFactorOdd[edge], outsideTessFactor[edge]); + NumPoints += numPointsForOutsideEdge[edge]; + } + NumPoints -= 3; + + // inside edge offsets + numPointsForInsideTessFactor = + NumPointsForTessFactor(insideTessFactorOdd, insideTessFactor); + { + int pointCountMin = insideTessFactorOdd ? 
4 : 3;
+      // max() allows degenerate transition regions when inside TessFactor == 1
+      numPointsForInsideTessFactor =
+         max(pointCountMin, numPointsForInsideTessFactor);
+   }
+
+   insideEdgePointBaseOffset = NumPoints;
+
+   // inside storage, including interior edges above
+   {
+      int interiorRings = (numPointsForInsideTessFactor >> 1) - 1;
+      int even = insideTessFactorOdd ? 0 : 1;
+      NumPoints += TRI_EDGES * (interiorRings * (interiorRings + even)) + even;
+   }
+
+   /* GENERATE POINTS */
+   if (mode != POLY_TESS_MODE_COUNT) {
+      ctx.Point = poly_heap_alloc_points(p, patch, NumPoints);
+
+      // Generate exterior ring edge points, clockwise starting from point V
+      // (VW, the U==0 edge)
+      int pointOffset = 0;
+      for (int edge = 0; edge < TRI_EDGES; edge++) {
+         int odd = edge & 0x1;
+         int endPoint = numPointsForOutsideEdge[edge] - 1;
+         // don't include end, since next edge starts with it.
+         for (int p = 0; p < endPoint; p++, pointOffset++) {
+            // whether to reverse point order given we are defining V or U (W
+            // implicit): edge0, VW, has V decreasing, so reverse 1D points
+            // below; edge1, WU, has U increasing, so don't reverse 1D points
+            // below; edge2, UV, has U decreasing, so reverse 1D points below
+            int q = odd ? p : endPoint - p;
+
+            FXP fxpParam = PlacePointIn1D(&outsideTessFactorCtx[edge],
+                                          outsideTessFactorOdd[edge], q);
+            DefinePoint(&ctx.Point[pointOffset], (edge == 0) ? 0 : fxpParam,
+                        (edge == 0)   ? fxpParam
+                        : (edge == 2) ? FXP_ONE - fxpParam
+                                      : 0);
+         }
+      }
+
+      // Generate interior ring points, clockwise spiralling in
+      int numRings = (numPointsForInsideTessFactor >> 1);
+      for (int ring = 1; ring < numRings; ring++) {
+         int startPoint = ring;
+         int endPoint = numPointsForInsideTessFactor - 1 - startPoint;
+
+         int perpendicularAxisPoint = startPoint;
+         FXP fxpPerpParam = PlacePointIn1D(
+            &insideTessFactorCtx, insideTessFactorOdd, perpendicularAxisPoint);
+
+         // Map location to the right size in
+         // barycentric space. We know this fixed
+         // point math won't over/underflow
+         fxpPerpParam *= FXP_TWO_THIRDS;
+         fxpPerpParam = (fxpPerpParam + FXP_ONE_HALF /*round*/) >>
+                        FXP_FRACTION_BITS; // get back to n.16
+
+         for (int edge = 0; edge < TRI_EDGES; edge++) {
+            int odd = edge & 0x1;
+
+            // don't include end: next edge starts with it.
+            for (int p = startPoint; p < endPoint; p++, pointOffset++) {
+               // whether to reverse point given we are defining V or U (W
+               // implicit): edge0, VW, has V decreasing, so reverse 1D points
+               // below; edge1, WU, has U increasing, so don't reverse 1D
+               // points below; edge2, UV, has U decreasing, so reverse 1D
+               // points below
+               int q = odd ? p : endPoint - (p - startPoint);
+
+               FXP fxpParam =
+                  PlacePointIn1D(&insideTessFactorCtx, insideTessFactorOdd, q);
+               // edge0 VW, has perpendicular parameter U constant
+               // edge1 WU, has perpendicular parameter V constant
+               // edge2 UV, has perpendicular parameter W constant
+               // reciprocal is the rate of change of edge-parallel parameters
+               // as they are pushed into the triangle
+               const unsigned int deriv = 2;
+
+               // we know this fixed point math won't over/underflow
+               FXP tmp = fxpParam - (fxpPerpParam + 1 /*round*/) / deriv;
+
+               DefinePoint(&ctx.Point[pointOffset],
+                           edge > 0 ? tmp : fxpPerpParam,
+                           edge == 0   ? tmp
+                           : edge == 1 ? fxpPerpParam
+                                       : FXP_ONE - tmp - fxpPerpParam);
+            }
+         }
+      }
+      if (!insideTessFactorOdd) {
+         // Last point is the point at the center.
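+         // (u, v) = (1/3, 1/3) with w = 1 - u - v = 1/3 implicit: for even
+         // inside factors the innermost "ring" collapses to the barycentric
+         // center rather than a ring of points.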
+         DefinePoint(&ctx.Point[pointOffset], FXP_ONE_THIRD, FXP_ONE_THIRD);
+      }
+   }
+
+   if (p->points_mode) {
+      poly_draw_points(&ctx, p, patch, NumPoints);
+      return;
+   }
+
+   {
+      // Generate primitives for all the concentric rings, one side at a time
+      // for each ring. The +1 is so even tess includes the center point, which
+      // we want now
+      int numRings = ((numPointsForInsideTessFactor + 1) >> 1);
+
+      int NumIndices = 0;
+      {
+         int OuterPoints = numPointsForOutsideEdge[0] +
+                           numPointsForOutsideEdge[1] +
+                           numPointsForOutsideEdge[2];
+
+         int numRings18 = numRings * 18;
+         NumIndices = ((numRings18 - 27) * numPointsForInsideTessFactor) +
+                      (3 * OuterPoints) - (numRings18 * (numRings - 1)) +
+                      (insideTessFactorOdd ? 3 : 0);
+      }
+
+      // Generate the draw and allocate the index buffer now that we know the size
+      ctx.Index = poly_draw(p, mode, false, patch, NumIndices);
+
+      if (mode == POLY_TESS_MODE_COUNT)
+         return;
+
+      int insideOffset = insideEdgePointBaseOffset;
+      int outsideEdgePointBaseOffset = 0;
+
+      NumIndices = 0;
+      for (int ring = 1; ring < numRings; ring++) {
+         int numPointsForInsideEdge = numPointsForInsideTessFactor - 2 * ring;
+         int edge0InsidePointBaseOffset = insideOffset;
+         int edge0OutsidePointBaseOffset = outsideEdgePointBaseOffset;
+         for (int edge = 0; edge < TRI_EDGES; edge++) {
+            int outsidePoints = ring == 1 ? numPointsForOutsideEdge[edge]
+                                          : (numPointsForInsideEdge + 2);
+
+            int numTriangles = numPointsForInsideEdge + outsidePoints - 2;
+
+            int insideBaseOffset;
+            int outsideBaseOffset;
+            if (edge == 2) {
+               ctx.IndexPatchCtx.insidePointIndexDeltaToRealValue =
+                  insideOffset;
+               ctx.IndexPatchCtx.insidePointIndexBadValue =
+                  numPointsForInsideEdge - 1;
+               ctx.IndexPatchCtx.insidePointIndexReplacementValue =
+                  edge0InsidePointBaseOffset;
+               ctx.IndexPatchCtx.outsidePointIndexPatchBase =
+                  ctx.IndexPatchCtx.insidePointIndexBadValue +
+                  1; // past inside patched index range
+               ctx.IndexPatchCtx.outsidePointIndexDeltaToRealValue =
+                  outsideEdgePointBaseOffset -
+                  ctx.IndexPatchCtx.outsidePointIndexPatchBase;
+               ctx.IndexPatchCtx.outsidePointIndexBadValue =
+                  ctx.IndexPatchCtx.outsidePointIndexPatchBase + outsidePoints -
+                  1;
+               ctx.IndexPatchCtx.outsidePointIndexReplacementValue =
+                  edge0OutsidePointBaseOffset;
+               ctx.bUsingPatchedIndices = true;
+               insideBaseOffset = 0;
+               outsideBaseOffset = ctx.IndexPatchCtx.outsidePointIndexPatchBase;
+            } else {
+               insideBaseOffset = insideOffset;
+               outsideBaseOffset = outsideEdgePointBaseOffset;
+            }
+            if (ring == 1) {
+               StitchTransition(
+                  &ctx, /*baseIndexOffset: */ NumIndices, insideBaseOffset,
+                  insideTessFactorCtx.numHalfTessFactorPoints,
+                  insideTessFactorOdd, outsideBaseOffset,
+                  outsideTessFactorCtx[edge].numHalfTessFactorPoints,
+                  outsideTessFactorOdd[edge]);
+            } else {
+               StitchRegular(&ctx, /*bTrapezoid*/ true, DIAGONALS_MIRRORED,
+                             /*baseIndexOffset: */ NumIndices,
+                             numPointsForInsideEdge, insideBaseOffset,
+                             outsideBaseOffset);
+            }
+            if (2 == edge) {
+               ctx.bUsingPatchedIndices = false;
+            }
+            NumIndices += numTriangles * 3;
+            outsideEdgePointBaseOffset += outsidePoints - 1;
+            insideOffset += numPointsForInsideEdge - 1;
+         }
+      }
+      if (insideTessFactorOdd) {
+         // Triangulate center (a single triangle)
+         DefineTriangle(&ctx, outsideEdgePointBaseOffset,
+                        outsideEdgePointBaseOffset + 1,
+                        outsideEdgePointBaseOffset + 2, NumIndices);
+         NumIndices += 3;
+      }
+   }
+}
+
+static inline void
+poly_tess_quad_process(constant struct poly_tess_args *p, uint32_t patch,
+                       enum poly_tess_mode mode)
+{
+   enum poly_tess_partitioning partitioning = p->partitioning;
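+
+   /* The reads below assume the 4-outer + 2-inner factor layout: factors[0..3]
+    * are the Ueq0/Veq0/Ueq1/Veq1 edges and factors[4..5] the inside U/V
+    * factors.
+    */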
+ global float *factors = tess_factors(p, patch); + + float tessFactor_Ueq0 = factors[0]; + float tessFactor_Veq0 = factors[1]; + float tessFactor_Ueq1 = factors[2]; + float tessFactor_Veq1 = factors[3]; + + float insideTessFactor_U = factors[4]; + float insideTessFactor_V = factors[5]; + + struct CHWTessellator ctx = { + .mode = mode, + .index_bias = patch * POLY_TES_PATCH_ID_STRIDE, + }; + + // Is the patch culled? + if (!(tessFactor_Ueq0 > 0) || // NaN will pass + !(tessFactor_Veq0 > 0) || !(tessFactor_Ueq1 > 0) || + !(tessFactor_Veq1 > 0)) { + poly_draw_empty(p, mode, patch); + return; + } + + FXP outsideTessFactor[QUAD_EDGES]; + FXP insideTessFactor[QUAD_AXES]; + bool outsideTessFactorOdd[QUAD_EDGES]; + bool insideTessFactorOdd[QUAD_AXES]; + TESS_FACTOR_CONTEXT outsideTessFactorCtx[QUAD_EDGES]; + TESS_FACTOR_CONTEXT insideTessFactorCtx[QUAD_AXES]; + // Stuff below is just specific to the traversal order + // this code happens to use to generate points/lines + int numPointsForOutsideEdge[QUAD_EDGES]; + int numPointsForInsideTessFactor[QUAD_AXES]; + int insideEdgePointBaseOffset; + + // Clamp edge TessFactors + tessFactor_Ueq0 = clamp_factor(tessFactor_Ueq0, partitioning, 0); + tessFactor_Veq0 = clamp_factor(tessFactor_Veq0, partitioning, 0); + tessFactor_Ueq1 = clamp_factor(tessFactor_Ueq1, partitioning, 0); + tessFactor_Veq1 = clamp_factor(tessFactor_Veq1, partitioning, 0); + + float maxf = max(max(max(tessFactor_Ueq0, tessFactor_Veq0), + max(tessFactor_Ueq1, tessFactor_Veq1)), + max(insideTessFactor_U, insideTessFactor_V)); + + insideTessFactor_U = clamp_factor(insideTessFactor_U, partitioning, maxf); + insideTessFactor_V = clamp_factor(insideTessFactor_V, partitioning, maxf); + // Note the above clamps map NaN to lowerBound + + // Process tessFactors + float outsideTessFactor_f[QUAD_EDGES] = {tessFactor_Ueq0, tessFactor_Veq0, + tessFactor_Ueq1, tessFactor_Veq1}; + float insideTessFactor_f[QUAD_AXES] = {insideTessFactor_U, + insideTessFactor_V}; + if (partitioning == POLY_TESS_PARTITIONING_INTEGER) { + for (int edge = 0; edge < QUAD_EDGES; edge++) { + outsideTessFactorOdd[edge] = isOdd(outsideTessFactor_f[edge]); + } + for (int axis = 0; axis < QUAD_AXES; axis++) { + insideTessFactorOdd[axis] = isOdd(insideTessFactor_f[axis]) && + (1.0f != insideTessFactor_f[axis]); + } + } else { + bool odd = (partitioning == POLY_TESS_PARTITIONING_FRACTIONAL_ODD); + + for (int edge = 0; edge < QUAD_EDGES; edge++) { + outsideTessFactorOdd[edge] = odd; + } + insideTessFactorOdd[U] = insideTessFactorOdd[V] = odd; + } + + // Save fixed point TessFactors + for (int edge = 0; edge < QUAD_EDGES; edge++) { + outsideTessFactor[edge] = floatToFixed(outsideTessFactor_f[edge]); + } + for (int axis = 0; axis < QUAD_AXES; axis++) { + insideTessFactor[axis] = floatToFixed(insideTessFactor_f[axis]); + } + + if (partitioning != POLY_TESS_PARTITIONING_FRACTIONAL_EVEN) { + // Special case if all TessFactors are 1 + if ((FXP_ONE == insideTessFactor[U]) && + (FXP_ONE == insideTessFactor[V]) && + (FXP_ONE == outsideTessFactor[Ueq0]) && + (FXP_ONE == outsideTessFactor[Veq0]) && + (FXP_ONE == outsideTessFactor[Ueq1]) && + (FXP_ONE == outsideTessFactor[Veq1])) { + + /* Just do minimum tess factor */ + if (!p->points_mode) { + ctx.Index = poly_draw(p, mode, false, patch, 6); + if (mode == POLY_TESS_MODE_COUNT) + return; + + DefineTriangle(&ctx, 0, 1, 3, /*indexStorageOffset*/ 0); + DefineTriangle(&ctx, 1, 2, 3, /*indexStorageOffset*/ 3); + } else { + poly_draw_points(&ctx, p, patch, 4); + if (mode == POLY_TESS_MODE_COUNT) + 
return; + } + + global struct poly_tess_point *points = + poly_heap_alloc_points(p, patch, 4); + + DefinePoint(&points[0], 0, 0); + DefinePoint(&points[1], FXP_ONE, 0); + DefinePoint(&points[2], FXP_ONE, FXP_ONE); + DefinePoint(&points[3], 0, FXP_ONE); + return; + } + } + + // Compute TessFactor-specific metadata + for (int edge = 0; edge < QUAD_EDGES; edge++) { + ComputeTessFactorCtx(outsideTessFactorOdd[edge], outsideTessFactor[edge], + &outsideTessFactorCtx[edge]); + } + + for (int axis = 0; axis < QUAD_AXES; axis++) { + ComputeTessFactorCtx(insideTessFactorOdd[axis], insideTessFactor[axis], + &insideTessFactorCtx[axis]); + } + + int NumPoints = 0; + + // outside edge offsets and storage + for (int edge = 0; edge < QUAD_EDGES; edge++) { + numPointsForOutsideEdge[edge] = NumPointsForTessFactor( + outsideTessFactorOdd[edge], outsideTessFactor[edge]); + NumPoints += numPointsForOutsideEdge[edge]; + } + NumPoints -= 4; + + // inside edge offsets + for (int axis = 0; axis < QUAD_AXES; axis++) { + numPointsForInsideTessFactor[axis] = NumPointsForTessFactor( + insideTessFactorOdd[axis], insideTessFactor[axis]); + int pointCountMin = insideTessFactorOdd[axis] ? 4 : 3; + // max() allows degenerate transition regions when inside TessFactor == 1 + numPointsForInsideTessFactor[axis] = + max(pointCountMin, numPointsForInsideTessFactor[axis]); + } + + insideEdgePointBaseOffset = NumPoints; + + // inside storage, including interior edges above + int numInteriorPoints = (numPointsForInsideTessFactor[U] - 2) * + (numPointsForInsideTessFactor[V] - 2); + NumPoints += numInteriorPoints; + + if (mode != POLY_TESS_MODE_COUNT) { + ctx.Point = poly_heap_alloc_points(p, patch, NumPoints); + + // Generate exterior ring edge points, clockwise from top-left + int pointOffset = 0; + for (int edge = 0; edge < QUAD_EDGES; edge++) { + int odd = edge & 0x1; + // don't include end, since next edge starts with it. + int endPoint = numPointsForOutsideEdge[edge] - 1; + for (int p = 0; p < endPoint; p++, pointOffset++) { + int q = + ((edge == 1) || (edge == 2)) ? p : endPoint - p; // reverse order + FXP fxpParam = PlacePointIn1D(&outsideTessFactorCtx[edge], + outsideTessFactorOdd[edge], q); + + FXP u = odd ? fxpParam : ((edge == 2) ? FXP_ONE : 0); + FXP v = odd ? ((edge == 3) ? FXP_ONE : 0) : fxpParam; + DefinePoint(&ctx.Point[pointOffset], u, v); + } + } + + // Generate interior ring points, clockwise from (U==0,V==1) (bottom-left) + // spiralling toward center + int minNumPointsForTessFactor = + min(numPointsForInsideTessFactor[U], numPointsForInsideTessFactor[V]); + // note for even tess we aren't counting center point here. + int numRings = (minNumPointsForTessFactor >> 1); + + for (int ring = 1; ring < numRings; ring++) { + int startPoint = ring; + int endPoint[QUAD_AXES] = { + numPointsForInsideTessFactor[U] - 1 - startPoint, + numPointsForInsideTessFactor[V] - 1 - startPoint, + }; + + for (int edge = 0; edge < QUAD_EDGES; edge++) { + int odd[QUAD_AXES] = {edge & 0x1, ((edge + 1) & 0x1)}; + int perpendicularAxisPoint = + (edge < 2) ? startPoint : endPoint[odd[0]]; + FXP fxpPerpParam = PlacePointIn1D(&insideTessFactorCtx[odd[0]], + insideTessFactorOdd[odd[0]], + perpendicularAxisPoint); + + for (int p = startPoint; p < endPoint[odd[1]]; p++, + pointOffset++) // don't include end: next edge starts with + // it. + { + bool odd_ = odd[1]; + int q = ((edge == 1) || (edge == 2)) + ? 
p + : endPoint[odd_] - (p - startPoint); + FXP fxpParam = PlacePointIn1D(&insideTessFactorCtx[odd_], + insideTessFactorOdd[odd_], q); + DefinePoint(&ctx.Point[pointOffset], + odd_ ? fxpPerpParam : fxpParam, + odd_ ? fxpParam : fxpPerpParam); + } + } + } + // For even tessellation, the inner "ring" is degenerate - a row of points + if ((numPointsForInsideTessFactor[U] > numPointsForInsideTessFactor[V]) && + !insideTessFactorOdd[V]) { + int startPoint = numRings; + int endPoint = numPointsForInsideTessFactor[U] - 1 - startPoint; + for (int p = startPoint; p <= endPoint; p++, pointOffset++) { + FXP fxpParam = PlacePointIn1D(&insideTessFactorCtx[U], + insideTessFactorOdd[U], p); + DefinePoint(&ctx.Point[pointOffset], fxpParam, FXP_ONE_HALF); + } + } else if ((numPointsForInsideTessFactor[V] >= + numPointsForInsideTessFactor[U]) && + !insideTessFactorOdd[U]) { + int startPoint = numRings; + int endPoint = numPointsForInsideTessFactor[V] - 1 - startPoint; + for (int p = endPoint; p >= startPoint; p--, pointOffset++) { + FXP fxpParam = PlacePointIn1D(&insideTessFactorCtx[V], + insideTessFactorOdd[V], p); + DefinePoint(&ctx.Point[pointOffset], FXP_ONE_HALF, fxpParam); + } + } + } + + if (p->points_mode) { + poly_draw_points(&ctx, p, patch, NumPoints); + return; + } + + /* CONNECTIVITY */ + { + // Generate primitives for all the concentric rings, one side at a time + // for each ring. +1 is so even tess includes the center point + int numPointRowsToCenter[QUAD_AXES] = { + (numPointsForInsideTessFactor[U] + 1) >> 1, + (numPointsForInsideTessFactor[V] + 1) >> 1, + }; + + int numRings = min(numPointRowsToCenter[U], numPointRowsToCenter[V]); + + /* Calculate # of indices so we can allocate */ + { + /* Handle main case */ + int OuterPoints = + numPointsForOutsideEdge[0] + numPointsForOutsideEdge[1] + + numPointsForOutsideEdge[2] + numPointsForOutsideEdge[3]; + + int InnerPoints = + numPointsForInsideTessFactor[U] + numPointsForInsideTessFactor[V]; + + int NumIndices = (OuterPoints * 3) + (12 * numRings * InnerPoints) - + (InnerPoints * 18) - (24 * numRings * (numRings - 1)); + + /* Determine major/minor axes */ + bool U_major = + (numPointsForInsideTessFactor[U] > numPointsForInsideTessFactor[V]); + unsigned M = U_major ? U : V; + unsigned m = U_major ? V : U; + + /* Handle degenerate ring */ + if (insideTessFactorOdd[m]) { + NumIndices += 12 * ((numPointsForInsideTessFactor[M] >> 1) - + (numPointsForInsideTessFactor[m] >> 1)); + NumIndices += (insideTessFactorOdd[M] ? 6 : 12); + } + + // Generate the draw and allocate the index buffer with the size + ctx.Index = poly_draw(p, mode, false, patch, NumIndices); + } + + if (mode == POLY_TESS_MODE_COUNT) + return; + + int degeneratePointRing[QUAD_AXES] = { + // Even partitioning causes degenerate row of points, + // which results in exceptions to the point ordering conventions + // when travelling around the rings counterclockwise. + !insideTessFactorOdd[V] ? numPointRowsToCenter[V] - 1 : -1, + !insideTessFactorOdd[U] ? 
numPointRowsToCenter[U] - 1 : -1, + }; + + int numPointsForOutsideEdge_[QUAD_EDGES] = { + numPointsForOutsideEdge[Ueq0], + numPointsForOutsideEdge[Veq0], + numPointsForOutsideEdge[Ueq1], + numPointsForOutsideEdge[Veq1], + }; + + int insideEdgePointBaseOffset_ = insideEdgePointBaseOffset; + int outsideEdgePointBaseOffset = 0; + + int NumIndices = 0; + + for (int ring = 1; ring < numRings; ring++) { + int numPointsForInsideEdge[QUAD_AXES] = { + numPointsForInsideTessFactor[U] - 2 * ring, + numPointsForInsideTessFactor[V] - 2 * ring}; + + int edge0InsidePointBaseOffset = insideEdgePointBaseOffset_; + int edge0OutsidePointBaseOffset = outsideEdgePointBaseOffset; + + for (int edge = 0; edge < QUAD_EDGES; edge++) { + int odd = (edge + 1) & 0x1; + + int numTriangles = + numPointsForInsideEdge[odd] + numPointsForOutsideEdge_[edge] - 2; + int insideBaseOffset; + int outsideBaseOffset; + + // We need to patch the indexing so Stitch() can think it sees 2 + // sequentially increasing rows of points, even though we have + // wrapped around to the end of the inner and outer ring's points, + // so the last point is really the first point for the ring. We make + // it so that when Stitch() calls AddIndex(), that function will do + // any necessary index adjustment. + if (edge == 3) { + if (ring == degeneratePointRing[odd]) { + ctx.IndexPatchCtx2.baseIndexToInvert = + insideEdgePointBaseOffset_ + 1; + ctx.IndexPatchCtx2.cornerCaseBadValue = + outsideEdgePointBaseOffset + + numPointsForOutsideEdge_[edge] - 1; + ctx.IndexPatchCtx2.cornerCaseReplacementValue = + edge0OutsidePointBaseOffset; + ctx.IndexPatchCtx2.indexInversionEndPoint = + (ctx.IndexPatchCtx2.baseIndexToInvert << 1) - 1; + insideBaseOffset = ctx.IndexPatchCtx2.baseIndexToInvert; + outsideBaseOffset = outsideEdgePointBaseOffset; + ctx.bUsingPatchedIndices2 = true; + } else { + ctx.IndexPatchCtx.insidePointIndexDeltaToRealValue = + insideEdgePointBaseOffset_; + ctx.IndexPatchCtx.insidePointIndexBadValue = + numPointsForInsideEdge[odd] - 1; + ctx.IndexPatchCtx.insidePointIndexReplacementValue = + edge0InsidePointBaseOffset; + ctx.IndexPatchCtx.outsidePointIndexPatchBase = + ctx.IndexPatchCtx.insidePointIndexBadValue + + 1; // past inside patched index range + ctx.IndexPatchCtx.outsidePointIndexDeltaToRealValue = + outsideEdgePointBaseOffset - + ctx.IndexPatchCtx.outsidePointIndexPatchBase; + ctx.IndexPatchCtx.outsidePointIndexBadValue = + ctx.IndexPatchCtx.outsidePointIndexPatchBase + + numPointsForOutsideEdge_[edge] - 1; + ctx.IndexPatchCtx.outsidePointIndexReplacementValue = + edge0OutsidePointBaseOffset; + + insideBaseOffset = 0; + outsideBaseOffset = + ctx.IndexPatchCtx.outsidePointIndexPatchBase; + ctx.bUsingPatchedIndices = true; + } + } else if ((edge == 2) && (ring == degeneratePointRing[odd])) { + ctx.IndexPatchCtx2.baseIndexToInvert = + insideEdgePointBaseOffset_; + ctx.IndexPatchCtx2.cornerCaseBadValue = -1; // unused + ctx.IndexPatchCtx2.cornerCaseReplacementValue = -1; // unused + ctx.IndexPatchCtx2.indexInversionEndPoint = + ctx.IndexPatchCtx2.baseIndexToInvert << 1; + insideBaseOffset = ctx.IndexPatchCtx2.baseIndexToInvert; + outsideBaseOffset = outsideEdgePointBaseOffset; + ctx.bUsingPatchedIndices2 = true; + } else { + insideBaseOffset = insideEdgePointBaseOffset_; + outsideBaseOffset = outsideEdgePointBaseOffset; + } + if (ring == 1) { + StitchTransition( + &ctx, /*baseIndexOffset: */ NumIndices, insideBaseOffset, + insideTessFactorCtx[odd].numHalfTessFactorPoints, + insideTessFactorOdd[odd], outsideBaseOffset, + 
outsideTessFactorCtx[edge].numHalfTessFactorPoints, + outsideTessFactorOdd[edge]); + } else { + StitchRegular(&ctx, /*bTrapezoid*/ true, DIAGONALS_MIRRORED, + /*baseIndexOffset: */ NumIndices, + numPointsForInsideEdge[odd], insideBaseOffset, + outsideBaseOffset); + } + ctx.bUsingPatchedIndices = false; + ctx.bUsingPatchedIndices2 = false; + NumIndices += numTriangles * 3; + outsideEdgePointBaseOffset += numPointsForOutsideEdge_[edge] - 1; + if ((edge == 2) && (ring == degeneratePointRing[odd])) { + insideEdgePointBaseOffset_ -= numPointsForInsideEdge[odd] - 1; + } else { + insideEdgePointBaseOffset_ += numPointsForInsideEdge[odd] - 1; + } + numPointsForOutsideEdge_[edge] = numPointsForInsideEdge[odd]; + } + } + + // Triangulate center - a row of quads if odd + // This triangulation may be producing diagonals that are asymmetric about + // the center of the patch in this region. + if ((numPointsForInsideTessFactor[U] > numPointsForInsideTessFactor[V]) && + insideTessFactorOdd[V]) { + ctx.bUsingPatchedIndices2 = true; + int stripNumQuads = (((numPointsForInsideTessFactor[U] >> 1) - + (numPointsForInsideTessFactor[V] >> 1)) + << 1) + + (insideTessFactorOdd[U] ? 1 : 2); + ctx.IndexPatchCtx2.baseIndexToInvert = + outsideEdgePointBaseOffset + stripNumQuads + 2; + ctx.IndexPatchCtx2.cornerCaseBadValue = + ctx.IndexPatchCtx2.baseIndexToInvert; + ctx.IndexPatchCtx2.cornerCaseReplacementValue = + outsideEdgePointBaseOffset; + ctx.IndexPatchCtx2.indexInversionEndPoint = + ctx.IndexPatchCtx2.baseIndexToInvert + + ctx.IndexPatchCtx2.baseIndexToInvert + stripNumQuads; + StitchRegular( + &ctx, /*bTrapezoid*/ false, DIAGONALS_INSIDE_TO_OUTSIDE, + /*baseIndexOffset: */ NumIndices, + /*numInsideEdgePoints:*/ stripNumQuads + 1, + /*insideEdgePointBaseOffset*/ ctx.IndexPatchCtx2.baseIndexToInvert, + outsideEdgePointBaseOffset + 1); + ctx.bUsingPatchedIndices2 = false; + NumIndices += stripNumQuads * 6; + } else if ((numPointsForInsideTessFactor[V] >= + numPointsForInsideTessFactor[U]) && + insideTessFactorOdd[U]) { + ctx.bUsingPatchedIndices2 = true; + int stripNumQuads = (((numPointsForInsideTessFactor[V] >> 1) - + (numPointsForInsideTessFactor[U] >> 1)) + << 1) + + (insideTessFactorOdd[V] ? 1 : 2); + ctx.IndexPatchCtx2.baseIndexToInvert = + outsideEdgePointBaseOffset + stripNumQuads + 1; + ctx.IndexPatchCtx2.cornerCaseBadValue = -1; // unused + ctx.IndexPatchCtx2.indexInversionEndPoint = + ctx.IndexPatchCtx2.baseIndexToInvert + + ctx.IndexPatchCtx2.baseIndexToInvert + stripNumQuads; + DIAGONALS diag = insideTessFactorOdd[V] + ? 
DIAGONALS_INSIDE_TO_OUTSIDE_EXCEPT_MIDDLE
+                             : DIAGONALS_INSIDE_TO_OUTSIDE;
+         StitchRegular(
+            &ctx, /*bTrapezoid*/ false, diag,
+            /*baseIndexOffset: */ NumIndices,
+            /*numInsideEdgePoints:*/ stripNumQuads + 1,
+            /*insideEdgePointBaseOffset*/ ctx.IndexPatchCtx2.baseIndexToInvert,
+            outsideEdgePointBaseOffset);
+         ctx.bUsingPatchedIndices2 = false;
+         NumIndices += stripNumQuads * 6;
+      }
+   }
+}
diff --git a/src/poly/geometry.h b/src/poly/geometry.h
new file mode 100644
index 00000000000..4048b956307
--- /dev/null
+++ b/src/poly/geometry.h
@@ -0,0 +1,641 @@
+/*
+ * Copyright 2023 Alyssa Rosenzweig
+ * Copyright 2023 Valve Corporation
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "compiler/libcl/libcl.h"
+#include "compiler/shader_enums.h"
+
+#include "util/bitscan.h"
+#include "util/u_math.h"
+
+#ifdef __OPENCL_VERSION__
+#include "compiler/libcl/libcl_vk.h"
+#endif
+
+#pragma once
+
+#define POLY_MAX_SO_BUFFERS 4
+#define POLY_MAX_VERTEX_STREAMS 4
+
+enum poly_gs_shape {
+   /* Indexed, where indices are encoded as:
+    *
+    *    round_to_pot(max_indices) * round_to_pot(input_primitives)
+    *       * instance_count
+    *
+    * invoked for max_indices * input_primitives * instance_count indices.
+    *
+    * This is used with any dynamic topology. No hardware instancing used.
+    */
+   POLY_GS_SHAPE_DYNAMIC_INDEXED,
+
+   /* Indexed with a static index buffer. Indices range up to max_indices.
+    * Hardware instance count = input_primitives * software instance count.
+    */
+   POLY_GS_SHAPE_STATIC_INDEXED,
+
+   /* Non-indexed. Dispatched as:
+    *
+    *    (max_indices, input_primitives * instance count).
+    */
+   POLY_GS_SHAPE_STATIC_PER_PRIM,
+
+   /* Non-indexed. Dispatched as:
+    *
+    *    (max_indices * input_primitives, instance count).
+    */
+   POLY_GS_SHAPE_STATIC_PER_INSTANCE,
+};
+
+static inline unsigned
+poly_gs_rast_vertices(enum poly_gs_shape shape, unsigned max_indices,
+                      unsigned input_primitives, unsigned instance_count)
+{
+   switch (shape) {
+   case POLY_GS_SHAPE_DYNAMIC_INDEXED:
+      return max_indices * input_primitives * instance_count;
+
+   case POLY_GS_SHAPE_STATIC_INDEXED:
+   case POLY_GS_SHAPE_STATIC_PER_PRIM:
+      return max_indices;
+
+   case POLY_GS_SHAPE_STATIC_PER_INSTANCE:
+      return max_indices * input_primitives;
+   }
+
+   UNREACHABLE("invalid shape");
+}
+
+static inline unsigned
+poly_gs_rast_instances(enum poly_gs_shape shape, unsigned max_indices,
+                       unsigned input_primitives, unsigned instance_count)
+{
+   switch (shape) {
+   case POLY_GS_SHAPE_DYNAMIC_INDEXED:
+      return 1;
+
+   case POLY_GS_SHAPE_STATIC_INDEXED:
+   case POLY_GS_SHAPE_STATIC_PER_PRIM:
+      return input_primitives * instance_count;
+
+   case POLY_GS_SHAPE_STATIC_PER_INSTANCE:
+      return instance_count;
+   }
+
+   UNREACHABLE("invalid shape");
+}
+
+static inline bool
+poly_gs_indexed(enum poly_gs_shape shape)
+{
+   return shape == POLY_GS_SHAPE_DYNAMIC_INDEXED ||
+          shape == POLY_GS_SHAPE_STATIC_INDEXED;
+}
+
+static inline unsigned
+poly_gs_index_size(enum poly_gs_shape shape)
+{
+   switch (shape) {
+   case POLY_GS_SHAPE_DYNAMIC_INDEXED:
+      return 4;
+   case POLY_GS_SHAPE_STATIC_INDEXED:
+      return 1;
+   default:
+      return 0;
+   }
+}
+
+/* Heap to allocate from.
*/ +struct poly_heap { + DEVICE(uchar) base; + uint32_t bottom, size; +} PACKED; +static_assert(sizeof(struct poly_heap) == 4 * 4); + +#ifdef __OPENCL_VERSION__ +static inline uint +_poly_heap_alloc_offs(global struct poly_heap *heap, uint size_B, bool atomic) +{ + size_B = align(size_B, 16); + + uint offs; + if (atomic) { + offs = atomic_fetch_add((volatile atomic_uint *)(&heap->bottom), size_B); + } else { + offs = heap->bottom; + heap->bottom = offs + size_B; + } + + /* Use printf+abort because assert is stripped from release builds. */ + if (heap->bottom >= heap->size) { + printf( + "FATAL: GPU heap overflow, allocating size %u, at offset %u, heap size %u!", + size_B, offs, heap->size); + + abort(); + } + + return offs; +} + +static inline uint +poly_heap_alloc_nonatomic_offs(global struct poly_heap *heap, uint size_B) +{ + return _poly_heap_alloc_offs(heap, size_B, false); +} + +static inline uint +poly_heap_alloc_atomic_offs(global struct poly_heap *heap, uint size_B) +{ + return _poly_heap_alloc_offs(heap, size_B, true); +} + +static inline global void * +poly_heap_alloc_nonatomic(global struct poly_heap *heap, uint size_B) +{ + return heap->base + poly_heap_alloc_nonatomic_offs(heap, size_B); +} + +uint64_t nir_load_ro_sink_address_poly(void); + +static inline uint64_t +poly_index_buffer(uint64_t index_buffer, uint size_el, uint offset_el, + uint elsize_B) +{ + if (offset_el < size_el) + return index_buffer + (offset_el * elsize_B); + else + return nir_load_ro_sink_address_poly(); +} +#endif + +struct poly_ia_state { + /* Index buffer if present. */ + uint64_t index_buffer; + + /* Size of the bound index buffer for bounds checking */ + uint32_t index_buffer_range_el; + + /* Number of vertices per instance. Written by CPU for direct draw, indirect + * setup kernel for indirect. This is used for VS->GS and VS->TCS indexing. + */ + uint32_t verts_per_instance; +} PACKED; +static_assert(sizeof(struct poly_ia_state) == 4 * 4); + +static inline uint +poly_index_buffer_range_el(uint size_el, uint offset_el) +{ + return offset_el < size_el ? (size_el - offset_el) : 0; +} + +struct poly_geometry_params { + /* Address of associated indirect draw buffer */ + DEVICE(uint) indirect_desc; + + /* Address of count buffer. For an indirect draw, this will be written by the + * indirect setup kernel. + */ + DEVICE(uint) count_buffer; + + /* Address of the primitives generated counters */ + DEVICE(uint) prims_generated_counter[POLY_MAX_VERTEX_STREAMS]; + DEVICE(uint) xfb_prims_generated_counter[POLY_MAX_VERTEX_STREAMS]; + DEVICE(uint) xfb_overflow[POLY_MAX_VERTEX_STREAMS]; + DEVICE(uint) xfb_any_overflow; + + /* Pointers to transform feedback buffer offsets in bytes */ + DEVICE(uint) xfb_offs_ptrs[POLY_MAX_SO_BUFFERS]; + + /* Output index buffer, allocated by pre-GS. */ + DEVICE(uint) output_index_buffer; + + /* Address of transform feedback buffer in general, supplied by the CPU. */ + DEVICE(uchar) xfb_base_original[POLY_MAX_SO_BUFFERS]; + + /* Address of transform feedback for the current primitive. Written by pre-GS + * program. + */ + DEVICE(uchar) xfb_base[POLY_MAX_SO_BUFFERS]; + + /* Address and present mask for the input to the geometry shader. These will + * reflect the vertex shader for VS->GS or instead the tessellation + * evaluation shader for TES->GS. + */ + uint64_t input_buffer; + uint64_t input_mask; + + /* Location-indexed mask of flat outputs, used for lowering GL edge flags. 
*/
+   uint64_t flat_outputs;
+
+   uint32_t xfb_size[POLY_MAX_SO_BUFFERS];
+
+   /* Number of vertices emitted by transform feedback per stream. Written by
+    * the pre-GS program.
+    */
+   uint32_t xfb_verts[POLY_MAX_VERTEX_STREAMS];
+
+   /* Within an indirect GS draw, the grids used to dispatch the VS/GS written
+    * out by the GS indirect setup kernel or the CPU for a direct draw. This is
+    * the "indirect local" format: the first 3 entries are in threads, the
+    * second 3 in grid blocks. This lets us use nontrivial workgroups with
+    * indirect draws without needing any predication.
+    */
+   uint32_t vs_grid[6];
+   uint32_t gs_grid[6];
+
+   /* Number of input primitives across all instances, calculated by the CPU
+    * for a direct draw or the GS indirect setup kernel for an indirect draw.
+    */
+   uint32_t input_primitives;
+
+   /* Number of input primitives per instance, rounded up to a power-of-two and
+    * with the base-2 log taken. This is used to partition the output vertex
+    * IDs efficiently.
+    */
+   uint32_t primitives_log2;
+
+   /* Number of bytes output by the GS count shader per input primitive (may be
+    * 0), written by CPU and consumed by indirect draw setup shader for
+    * allocating counts.
+    */
+   uint32_t count_buffer_stride;
+
+   /* Dynamic input topology. Must be compatible with the geometry shader's
+    * layout() declared input class.
+    */
+   uint32_t input_topology;
+} PACKED;
+static_assert(sizeof(struct poly_geometry_params) == 86 * 4);
+
+/* TCS shared memory layout:
+ *
+ *    vec4 vs_outputs[VERTICES_IN_INPUT_PATCH][TOTAL_VERTEX_OUTPUTS];
+ *
+ * TODO: compact.
+ */
+static inline uint
+poly_tcs_in_offs_el(uint vtx, gl_varying_slot location,
+                    uint64_t crosslane_vs_out_mask)
+{
+   uint base = vtx * util_bitcount64(crosslane_vs_out_mask);
+   uint offs = util_bitcount64(crosslane_vs_out_mask &
+                               (((uint64_t)(1) << location) - 1));
+
+   return base + offs;
+}
+
+static inline uint
+poly_tcs_in_size(uint32_t vertices_in_patch, uint64_t crosslane_vs_out_mask)
+{
+   return vertices_in_patch * util_bitcount64(crosslane_vs_out_mask) * 16;
+}
+
+/*
+ * TCS out buffer layout, per-patch:
+ *
+ *    float tess_level_outer[4];
+ *    float tess_level_inner[2];
+ *    vec4 patch_out[MAX_PATCH_OUTPUTS];
+ *    vec4 vtx_out[OUT_PATCH_SIZE][TOTAL_VERTEX_OUTPUTS];
+ *
+ * Vertex outputs are compacted based on the mask of written outputs. Patch
+ * outputs are used as-is.
+ *
+ * Bounding boxes are ignored.
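+ *
+ * Worked example (hypothetical numbers): with nr_patch_out == 2 and a
+ * vtx_out_mask covering only POS and VAR0, vertex 1's VAR0 lands at element
+ * (32-bit word) 4 + 2 + 4*2 + 4*1*2 + 4*1 == 26, i.e. byte offset 104.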
+ */
+static inline uint
+poly_tcs_out_offs_el(uint vtx_id, gl_varying_slot location, uint nr_patch_out,
+                     uint64_t vtx_out_mask)
+{
+   uint off = 0;
+   if (location == VARYING_SLOT_TESS_LEVEL_OUTER)
+      return off;
+
+   off += 4;
+   if (location == VARYING_SLOT_TESS_LEVEL_INNER)
+      return off;
+
+   off += 2;
+   if (location >= VARYING_SLOT_PATCH0)
+      return off + (4 * (location - VARYING_SLOT_PATCH0));
+
+   /* Anything else is a per-vtx output */
+   off += 4 * nr_patch_out;
+   off += 4 * vtx_id * util_bitcount64(vtx_out_mask);
+
+   uint idx = util_bitcount64(vtx_out_mask & (((uint64_t)(1) << location) - 1));
+   return off + (4 * idx);
+}
+
+static inline uint
+poly_tcs_out_stride_el(uint nr_patch_out, uint out_patch_size,
+                       uint64_t vtx_out_mask)
+{
+   return poly_tcs_out_offs_el(out_patch_size, VARYING_SLOT_POS, nr_patch_out,
+                               vtx_out_mask);
+}
+
+static inline uint
+poly_tcs_out_stride(uint nr_patch_out, uint out_patch_size,
+                    uint64_t vtx_out_mask)
+{
+   return poly_tcs_out_stride_el(nr_patch_out, out_patch_size, vtx_out_mask) *
+          4;
+}
+
+/* In a tess eval shader, stride for hw vertex ID */
+#define POLY_TES_PATCH_ID_STRIDE 8192
+
+static inline uint
+poly_compact_prim(enum mesa_prim prim)
+{
+   static_assert(MESA_PRIM_QUAD_STRIP == MESA_PRIM_QUADS + 1);
+   static_assert(MESA_PRIM_POLYGON == MESA_PRIM_QUADS + 2);
+
+#ifndef __OPENCL_VERSION__
+   assert(prim != MESA_PRIM_QUADS);
+   assert(prim != MESA_PRIM_QUAD_STRIP);
+   assert(prim != MESA_PRIM_POLYGON);
+   assert(prim != MESA_PRIM_PATCHES);
+#endif
+
+   return (prim >= MESA_PRIM_QUADS) ? (prim - 3) : prim;
+}
+
+static inline enum mesa_prim
+poly_uncompact_prim(uint packed)
+{
+   if (packed >= MESA_PRIM_QUADS)
+      return (enum mesa_prim)(packed + 3);
+
+   return (enum mesa_prim)packed;
+}
+
+/*
+ * Write a strip into a 32-bit index buffer. This is the sequence:
+ *
+ *    (b, b + 1, b + 2, ..., b + n - 1, -1) where -1 is the restart index
+ *
+ * For points, we write index buffers without restart just for remapping.
+ */
+static inline void
+_poly_write_strip(GLOBAL uint32_t *index_buffer, uint32_t index_offset,
+                  uint32_t vertex_offset, uint32_t verts_in_prim,
+                  uint32_t stream, uint32_t stream_multiplier, uint32_t n)
+{
+   bool restart = n > 1;
+   if (verts_in_prim < n)
+      return;
+
+   GLOBAL uint32_t *out = &index_buffer[index_offset];
+
+   /* Write out indices for the strip */
+   for (uint32_t i = 0; i < verts_in_prim; ++i) {
+      out[i] = (vertex_offset + i) * stream_multiplier + stream;
+   }
+
+   if (restart)
+      out[verts_in_prim] = -1;
+}
+
+static inline unsigned
+poly_decomposed_prims_for_vertices_with_tess(enum mesa_prim prim, int vertices,
+                                             unsigned verts_per_patch)
+{
+   if (prim >= MESA_PRIM_PATCHES) {
+      return vertices / verts_per_patch;
+   } else {
+      return u_decomposed_prims_for_vertices(prim, vertices);
+   }
+}
+
+#ifdef __OPENCL_VERSION__
+/*
+ * Returns (work_group_scan_inclusive_add(x), work_group_sum(x)). Implemented
+ * manually with subgroup ops and local memory since Mesa doesn't do those
+ * lowerings yet.
+ */
+static inline uint2
+poly_work_group_scan_inclusive_add(uint x, local uint *scratch)
+{
+   uint sg_id = get_sub_group_id();
+
+   /* Partial prefix sum of the subgroup */
+   uint sg = sub_group_scan_inclusive_add(x);
+
+   /* Reduction (sum) for the subgroup */
+   uint sg_sum = sub_group_broadcast(sg, 31);
+
+   /* Write out all the subgroup sums */
+   barrier(CLK_LOCAL_MEM_FENCE);
+   scratch[sg_id] = sg_sum;
+   barrier(CLK_LOCAL_MEM_FENCE);
+
+   /* Read all the subgroup sums. Thread T in subgroup G reads the sum of all
+    * threads in subgroup T.
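+    *
+    * Worked example (assuming unused scratch entries read as zero): if the
+    * active subgroup sums are {3, 5, 2}, the exclusive scan below gives bases
+    * {0, 3, 8}, so a thread in subgroup 1 with intra-subgroup inclusive sum s
+    * returns (3 + s, 10).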
+    */
+   uint other_sum = scratch[get_sub_group_local_id()];
+
+   /* Exclusive sum the subgroup sums to get the total before the current group,
+    * which can be added to the total for the current group.
+    */
+   uint other_sums = sub_group_scan_exclusive_add(other_sum);
+   uint base = sub_group_broadcast(other_sums, sg_id);
+   uint prefix = base + sg;
+
+   /* Reduce the workgroup using the prefix sum we already did */
+   uint reduction = sub_group_broadcast(other_sums + other_sum, 31);
+
+   return (uint2)(prefix, reduction);
+}
+
+static inline void
+poly_prefix_sum(local uint *scratch, global uint *buffer, uint len, uint words,
+                uint word, uint wg_count)
+{
+   uint tid = cl_local_id.x;
+
+   /* Main loop: complete workgroups processing multiple values at once */
+   uint i, count = 0;
+   uint len_remainder = len % wg_count;
+   uint len_rounded_down = len - len_remainder;
+
+   for (i = tid; i < len_rounded_down; i += wg_count) {
+      global uint *ptr = &buffer[(i * words) + word];
+      uint value = *ptr;
+      uint2 sums = poly_work_group_scan_inclusive_add(value, scratch);
+
+      *ptr = count + sums[0];
+      count += sums[1];
+   }
+
+   /* The last iteration is special since we won't have a full subgroup unless
+    * the length is divisible by the subgroup size, and we don't advance count.
+    */
+   global uint *ptr = &buffer[(i * words) + word];
+   uint value = (tid < len_remainder) ? *ptr : 0;
+   uint scan = poly_work_group_scan_inclusive_add(value, scratch)[0];
+
+   if (tid < len_remainder) {
+      *ptr = count + scan;
+   }
+}
+
+static inline void
+poly_increment_counters(global uint32_t *a, global uint32_t *b,
+                        global uint32_t *c, uint count)
+{
+   global uint32_t *ptr[] = {a, b, c};
+
+   for (uint i = 0; i < 3; ++i) {
+      if (ptr[i]) {
+         *(ptr[i]) += count;
+      }
+   }
+}
+
+static inline void
+poly_increment_ia(global uint32_t *ia_vertices, global uint32_t *ia_primitives,
+                  global uint32_t *vs_invocations, global uint32_t *c_prims,
+                  global uint32_t *c_invs, constant uint32_t *draw,
+                  enum mesa_prim prim, unsigned verts_per_patch)
+{
+   poly_increment_counters(ia_vertices, vs_invocations, NULL,
+                           draw[0] * draw[1]);
+
+   uint prims = poly_decomposed_prims_for_vertices_with_tess(prim, draw[0],
+                                                             verts_per_patch) *
+                draw[1];
+
+   poly_increment_counters(ia_primitives, c_prims, c_invs, prims);
+}
+
+static inline void
+poly_gs_setup_indirect(uint64_t index_buffer, constant uint *draw,
+                       global uintptr_t *vertex_buffer /* output */,
+                       global struct poly_ia_state *ia /* output */,
+                       global struct poly_geometry_params *p /* output */,
+                       global struct poly_heap *heap,
+                       uint64_t vs_outputs /* Vertex (TES) output mask */,
+                       uint32_t index_size_B /* 0 if no index buffer */,
+                       uint32_t index_buffer_range_el,
+                       uint32_t prim /* Input primitive type, enum mesa_prim */,
+                       int is_prefix_summing, uint max_indices,
+                       enum poly_gs_shape shape)
+{
+   /* Determine the (primitives, instances) grid size.
*/ + uint vertex_count = draw[0]; + uint instance_count = draw[1]; + + ia->verts_per_instance = vertex_count; + + /* Calculate number of primitives input into the GS */ + uint prim_per_instance = u_decomposed_prims_for_vertices(prim, vertex_count); + p->input_primitives = prim_per_instance * instance_count; + + /* Invoke VS as (vertices, instances); GS as (primitives, instances) */ + p->vs_grid[0] = vertex_count; + p->vs_grid[1] = instance_count; + + p->gs_grid[0] = prim_per_instance; + p->gs_grid[1] = instance_count; + + p->primitives_log2 = util_logbase2_ceil(prim_per_instance); + + /* If indexing is enabled, the third word is the offset into the index buffer + * in elements. Apply that offset now that we have it. For a hardware + * indirect draw, the hardware would do this for us, but for software input + * assembly we need to do it ourselves. + */ + if (index_size_B) { + ia->index_buffer = poly_index_buffer(index_buffer, index_buffer_range_el, + draw[2], index_size_B); + + ia->index_buffer_range_el = + poly_index_buffer_range_el(index_buffer_range_el, draw[2]); + } + + /* We need to allocate VS and GS count buffers, do so now */ + uint vertex_buffer_size = + poly_tcs_in_size(vertex_count * instance_count, vs_outputs); + + if (is_prefix_summing) { + p->count_buffer = poly_heap_alloc_nonatomic( + heap, p->input_primitives * p->count_buffer_stride); + } + + p->input_buffer = + (uintptr_t)poly_heap_alloc_nonatomic(heap, vertex_buffer_size); + *vertex_buffer = p->input_buffer; + + p->input_mask = vs_outputs; + + /* Allocate the index buffer and write the draw consuming it */ + global VkDrawIndexedIndirectCommand *cmd = (global void *)p->indirect_desc; + + *cmd = (VkDrawIndexedIndirectCommand){ + .indexCount = poly_gs_rast_vertices(shape, max_indices, prim_per_instance, + instance_count), + .instanceCount = poly_gs_rast_instances( + shape, max_indices, prim_per_instance, instance_count), + }; + + if (shape == POLY_GS_SHAPE_DYNAMIC_INDEXED) { + cmd->firstIndex = + poly_heap_alloc_nonatomic_offs(heap, cmd->indexCount * 4) / 4; + + p->output_index_buffer = + (global uint *)(heap->base + (cmd->firstIndex * 4)); + } +} + +static uint +poly_load_index(uintptr_t index_buffer, uint32_t index_buffer_range_el, uint id, + uint index_size) +{ + bool oob = id >= index_buffer_range_el; + + /* If the load would be out-of-bounds, load the first element which is + * assumed valid. If the application index buffer is empty with robustness2, + * index_buffer will point to a zero sink where only the first is valid. + */ + if (oob) { + id = 0; + } + + uint el; + if (index_size == 1) { + el = ((constant uint8_t *)index_buffer)[id]; + } else if (index_size == 2) { + el = ((constant uint16_t *)index_buffer)[id]; + } else { + el = ((constant uint32_t *)index_buffer)[id]; + } + + /* D3D robustness semantics. TODO: Optimize? */ + if (oob) { + el = 0; + } + + return el; +} + +static void +poly_store_index(uintptr_t index_buffer, uint index_size_B, uint id, uint value) +{ + global uint32_t *out_32 = (global uint32_t *)index_buffer; + global uint16_t *out_16 = (global uint16_t *)index_buffer; + global uint8_t *out_8 = (global uint8_t *)index_buffer; + + if (index_size_B == 4) + out_32[id] = value; + else if (index_size_B == 2) + out_16[id] = value; + else + out_8[id] = value; +} + +#endif diff --git a/src/poly/meson.build b/src/poly/meson.build new file mode 100644 index 00000000000..3f0e75b6ec5 --- /dev/null +++ b/src/poly/meson.build @@ -0,0 +1,9 @@ +# Copyright © 2025 Collabora Ltd. 
+# SPDX-License-Identifier: MIT + +inc_poly = include_directories([ + '.', 'nir' +]) + +subdir('cl') +subdir('nir') diff --git a/src/poly/nir/meson.build b/src/poly/nir/meson.build new file mode 100644 index 00000000000..5560f5c860f --- /dev/null +++ b/src/poly/nir/meson.build @@ -0,0 +1,18 @@ +# Copyright © 2025 Collabora Ltd. +# SPDX-License-Identifier: MIT + +libpoly_nir_files = files( + 'poly_nir_lower_gs.c', + 'poly_nir_lower_ia.c', + 'poly_nir_lower_tess.c', +) + +libpoly_nir = static_library( + 'libpoly_nir', + [libpoly_nir_files], + include_directories : [inc_poly], + c_args : [no_override_init_args, '-Wno-c2x-extensions'], + gnu_symbol_visibility : 'hidden', + dependencies: [idep_nir, idep_mesautil, idep_libpoly], + build_by_default : false, +) diff --git a/src/asahi/lib/agx_nir_lower_gs.c b/src/poly/nir/poly_nir_lower_gs.c similarity index 92% rename from src/asahi/lib/agx_nir_lower_gs.c rename to src/poly/nir/poly_nir_lower_gs.c index 30630bedbf5..79e2fb9b038 100644 --- a/src/asahi/lib/agx_nir_lower_gs.c +++ b/src/poly/nir/poly_nir_lower_gs.c @@ -5,11 +5,11 @@ * SPDX-License-Identifier: MIT */ -#include "agx_nir_lower_gs.h" +#include "poly/nir/poly_nir_lower_gs.h" #include "compiler/nir/nir_builder.h" #include "gallium/include/pipe/p_defines.h" -#include "libagx/geometry.h" -#include "libagx/libagx.h" +#include "poly/cl/libpoly.h" +#include "poly/geometry.h" #include "util/bitscan.h" #include "util/list.h" #include "util/macros.h" @@ -85,7 +85,7 @@ rewrite_intrinsics(nir_builder *b, nir_intrinsic_instr *intr, void *state_) } static bool -agx_nir_lower_gs_intrinsics(nir_shader *shader) +lower_gs_intrinsics(nir_shader *shader) { struct state state; nir_function_impl *impl = nir_shader_get_entrypoint(shader); @@ -158,16 +158,16 @@ agx_nir_lower_gs_intrinsics(nir_shader *shader) } struct lower_gs_state { - int static_count[MAX_VERTEX_STREAMS]; + int static_count[POLY_MAX_VERTEX_STREAMS]; /* The index of each counter in the count buffer, or -1 if it's not in the * count buffer. * * Invariant: info->count_words == sum(count_index[i] >= 0). 
*/ - int count_index[MAX_VERTEX_STREAMS]; + int count_index[POLY_MAX_VERTEX_STREAMS]; - struct agx_gs_info *info; + struct poly_gs_info *info; }; /* Helpers for loading from the geometry state buffer */ @@ -184,8 +184,8 @@ load_geometry_param_offset(nir_builder *b, uint32_t offset, uint8_t bytes) #define load_geometry_param(b, field) \ load_geometry_param_offset( \ - b, offsetof(struct agx_geometry_params, field), \ - sizeof(((struct agx_geometry_params *)0)->field)) + b, offsetof(struct poly_geometry_params, field), \ + sizeof(((struct poly_geometry_params *)0)->field)) /* Helpers for lowering I/O to variables */ struct lower_output_to_var_state { @@ -257,18 +257,18 @@ vertex_id_for_topology_class(nir_builder *b, nir_def *vert, enum mesa_prim cls) return prim; case MESA_PRIM_LINES: - return libagx_vertex_id_for_line_class(b, topology, prim, vert, nr); + return poly_vertex_id_for_line_class(b, topology, prim, vert, nr); case MESA_PRIM_TRIANGLES: - return libagx_vertex_id_for_tri_class(b, topology, prim, vert, - flatshade_first); + return poly_vertex_id_for_tri_class(b, topology, prim, vert, + flatshade_first); case MESA_PRIM_LINES_ADJACENCY: - return libagx_vertex_id_for_line_adj_class(b, topology, prim, vert); + return poly_vertex_id_for_line_adj_class(b, topology, prim, vert); case MESA_PRIM_TRIANGLES_ADJACENCY: - return libagx_vertex_id_for_tri_adj_class(b, topology, prim, vert, nr, - flatshade_first); + return poly_vertex_id_for_tri_adj_class(b, topology, prim, vert, nr, + flatshade_first); default: UNREACHABLE("invalid topology class"); @@ -276,8 +276,8 @@ vertex_id_for_topology_class(nir_builder *b, nir_def *vert, enum mesa_prim cls) } nir_def * -agx_load_per_vertex_input(nir_builder *b, nir_intrinsic_instr *intr, - nir_def *vertex) +poly_load_per_vertex_input(nir_builder *b, nir_intrinsic_instr *intr, + nir_def *vertex) { assert(intr->intrinsic == nir_intrinsic_load_per_vertex_input); nir_io_semantics sem = nir_intrinsic_io_semantics(intr); @@ -287,15 +287,15 @@ agx_load_per_vertex_input(nir_builder *b, nir_intrinsic_instr *intr, if (b->shader->info.stage == MESA_SHADER_GEOMETRY) { /* GS may be preceded by VS or TES so specified as param */ - addr = libagx_geometry_input_address( + addr = poly_geometry_input_address( b, nir_load_geometry_param_buffer_poly(b), vertex, location); } else { assert(b->shader->info.stage == MESA_SHADER_TESS_CTRL); /* TCS always preceded by VS so we use the VS state directly */ - addr = libagx_vertex_output_address(b, nir_load_vs_output_buffer_poly(b), - nir_load_vs_outputs_poly(b), vertex, - location); + addr = poly_vertex_output_address(b, nir_load_vs_output_buffer_poly(b), + nir_load_vs_outputs_poly(b), vertex, + location); } addr = nir_iadd_imm(b, addr, 4 * nir_intrinsic_component(intr)); @@ -320,7 +320,7 @@ lower_gs_inputs(nir_builder *b, nir_intrinsic_instr *intr, void *_) nir_def *unrolled = nir_iadd(b, nir_imul(b, nir_load_instance_id(b), verts), vertex); - nir_def *val = agx_load_per_vertex_input(b, intr, unrolled); + nir_def *val = poly_load_per_vertex_input(b, intr, unrolled); nir_def_replace(&intr->def, val); return true; } @@ -377,10 +377,10 @@ write_xfb_counts(nir_builder *b, nir_intrinsic_instr *intr, nir_def *id = state->info->prefix_sum ? 
calc_unrolled_id(b) : nir_imm_int(b, 0); - nir_def *addr = libagx_load_xfb_count_address( - b, nir_load_geometry_param_buffer_poly(b), - nir_imm_int(b, state->count_index[stream]), - nir_imm_int(b, state->info->count_words), id); + nir_def *addr = + poly_load_xfb_count_address(b, nir_load_geometry_param_buffer_poly(b), + nir_imm_int(b, state->count_index[stream]), + nir_imm_int(b, state->info->count_words), id); if (state->info->prefix_sum) { nir_store_global(b, addr, 4, intr->src[2].ssa, nir_component_mask(1)); @@ -656,7 +656,7 @@ create_gs_rast_shader(const nir_shader *gs, const struct lower_gs_state *state) raw_vertex_id = nir_udiv_imm(b, raw_vertex_id, stream_multiplier(gs)); switch (state->info->shape) { - case AGX_GS_SHAPE_DYNAMIC_INDEXED: { + case POLY_GS_SHAPE_DYNAMIC_INDEXED: { unsigned stride = output_vertex_id_pot_stride(gs); nir_def *unrolled = nir_udiv_imm(b, raw_vertex_id, stride); @@ -669,8 +669,8 @@ create_gs_rast_shader(const nir_shader *gs, const struct lower_gs_state *state) break; } - case AGX_GS_SHAPE_STATIC_INDEXED: - case AGX_GS_SHAPE_STATIC_PER_PRIM: { + case POLY_GS_SHAPE_STATIC_INDEXED: + case POLY_GS_SHAPE_STATIC_PER_PRIM: { nir_def *stride = load_geometry_param(b, gs_grid[0]); rs.output_id = raw_vertex_id; @@ -679,7 +679,7 @@ create_gs_rast_shader(const nir_shader *gs, const struct lower_gs_state *state) break; } - case AGX_GS_SHAPE_STATIC_PER_INSTANCE: { + case POLY_GS_SHAPE_STATIC_PER_INSTANCE: { unsigned stride = MAX2(state->info->max_indices, 1); rs.output_id = nir_umod_imm(b, raw_vertex_id, stride); @@ -733,8 +733,8 @@ create_gs_rast_shader(const nir_shader *gs, const struct lower_gs_state *state) for (unsigned p_ = 0; p_ < n_; ++p_) { nir_def *p = nir_imm_int(b, p_); - nir_push_if(b, libagx_xfb_vertex_copy_in_strip(b, n, id_in_strip, - strip_length, p)); + nir_push_if(b, poly_xfb_vertex_copy_in_strip(b, n, id_in_strip, + strip_length, p)); /* Write XFB for each output */ for (unsigned i = 0; i < xfb->output_count; ++i) { @@ -746,14 +746,14 @@ create_gs_rast_shader(const nir_shader *gs, const struct lower_gs_state *state) * base for this invocation for the stream plus the offset within * this invocation. 
          */
-         nir_def *invocation_base = libagx_previous_xfb_primitives(
+         nir_def *invocation_base = poly_previous_xfb_primitives(
             b, nir_load_geometry_param_buffer_poly(b),
             nir_imm_int(b, state->static_count[stream]),
             nir_imm_int(b, state->count_index[stream]),
             nir_imm_int(b, state->info->count_words),
             nir_imm_bool(b, state->info->prefix_sum), unrolled);
 
-         nir_def *index = libagx_xfb_vertex_offset(
+         nir_def *index = poly_xfb_vertex_offset(
             b, n, invocation_base, base, id_in_strip, p,
             nir_inot(b, nir_i2b(b, nir_load_provoking_last(b))));
 
@@ -776,7 +776,7 @@ create_gs_rast_shader(const nir_shader *gs, const struct lower_gs_state *state)
           */
          value = nir_pad_vector_imm_int(b, value, 0, 4);
 
-         nir_def *addr = libagx_xfb_vertex_address(
+         nir_def *addr = poly_xfb_vertex_address(
             b, nir_load_geometry_param_buffer_poly(b), index,
             nir_imm_int(b, buffer), nir_imm_int(b, stride),
             nir_imm_int(b, output.offset));
 
@@ -842,12 +842,12 @@ lower_gs_instr(nir_builder *b, nir_intrinsic_instr *intr, void *state_)
 
    switch (intr->intrinsic) {
    case nir_intrinsic_set_vertex_and_primitive_count: {
-      if (state->info->shape != AGX_GS_SHAPE_DYNAMIC_INDEXED)
+      if (state->info->shape != POLY_GS_SHAPE_DYNAMIC_INDEXED)
         break;
 
       /* All streams are merged, just pick a single instruction */
       if (nir_intrinsic_stream_id(intr) == 0) {
-         libagx_pad_index_gs(
+         poly_pad_index_gs(
             b, load_geometry_param(b, output_index_buffer),
             nir_imul_imm(b, calc_unrolled_id(b), state->info->max_indices),
             intr->src[1].ssa, nir_imm_int(b, state->info->max_indices));
@@ -857,10 +857,10 @@ lower_gs_instr(nir_builder *b, nir_intrinsic_instr *intr, void *state_)
    }
 
    case nir_intrinsic_emit_primitive_poly: {
-      if (state->info->shape != AGX_GS_SHAPE_DYNAMIC_INDEXED)
+      if (state->info->shape != POLY_GS_SHAPE_DYNAMIC_INDEXED)
          break;
 
-      libagx_write_strip(
+      poly_write_strip(
          b, load_geometry_param(b, output_index_buffer),
         nir_imul_imm(b, calc_unrolled_id(b), state->info->max_indices),
         intr->src[0].ssa,
@@ -903,14 +903,14 @@ collect_components(nir_builder *b, nir_intrinsic_instr *intr, void *data)
    return true;
 }
 
-struct agx_xfb_key {
+struct poly_xfb_key {
    uint8_t streams;
    uint8_t buffers_written;
    uint8_t buffer_to_stream[NIR_MAX_XFB_BUFFERS];
    int8_t count_index[4];
    uint16_t stride[NIR_MAX_XFB_BUFFERS];
    uint16_t output_end[NIR_MAX_XFB_BUFFERS];
-   int16_t static_count[MAX_VERTEX_STREAMS];
+   int16_t static_count[POLY_MAX_VERTEX_STREAMS];
    uint16_t invocations;
    uint16_t vertices_per_prim;
 };
@@ -921,14 +921,14 @@ struct agx_xfb_key {
  * transform feedback offsets and counters as applicable.
  */
 static nir_shader *
-create_pre_gs(struct agx_xfb_key *key,
+create_pre_gs(struct poly_xfb_key *key,
               const nir_shader_compiler_options *options)
 {
    nir_builder b_ = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE,
                                                    options, "Pre-GS patch up");
    nir_builder *b = &b_;
 
-   libagx_pre_gs(
+   poly_pre_gs(
       b, nir_load_geometry_param_buffer_poly(b), nir_imm_int(b, key->streams),
       nir_imm_int(b, key->buffers_written),
       nir_imm_ivec4(b, key->buffer_to_stream[0], key->buffer_to_stream[1],
@@ -1033,7 +1033,7 @@ calculate_max_indices(enum mesa_prim prim, unsigned verts)
 }
 
 struct topology_ctx {
-   struct agx_gs_info *info;
+   struct poly_gs_info *info;
    uint32_t topology[384];
 };
 
@@ -1041,7 +1041,7 @@ static bool
 evaluate_topology(nir_builder *b, nir_intrinsic_instr *intr, void *data)
 {
    struct topology_ctx *ctx = data;
-   struct agx_gs_info *info = ctx->info;
+   struct poly_gs_info *info = ctx->info;
 
    if (intr->intrinsic != nir_intrinsic_emit_primitive_poly)
       return false;
@@ -1050,7 +1050,7 @@ evaluate_topology(nir_builder *b, nir_intrinsic_instr *intr, void *data)
     * if-statements interleaved with other stuff).
    */
    if (intr->instr.block != nir_start_block(b->impl)) {
-      info->shape = AGX_GS_SHAPE_DYNAMIC_INDEXED;
+      info->shape = POLY_GS_SHAPE_DYNAMIC_INDEXED;
       return false;
    }
 
@@ -1058,11 +1058,11 @@ evaluate_topology(nir_builder *b, nir_intrinsic_instr *intr, void *data)
    if (!nir_src_is_const(intr->src[0]) || !nir_src_is_const(intr->src[1]) ||
       !nir_src_is_const(intr->src[2])) {
-      info->shape = AGX_GS_SHAPE_DYNAMIC_INDEXED;
+      info->shape = POLY_GS_SHAPE_DYNAMIC_INDEXED;
       return false;
    }
 
-   _libagx_write_strip(
+   _poly_write_strip(
       ctx->topology, nir_src_as_uint(intr->src[0]),
       nir_src_as_uint(intr->src[1]), nir_src_as_uint(intr->src[2]),
       nir_intrinsic_stream_id(intr), stream_multiplier(b->shader),
@@ -1076,7 +1076,7 @@ evaluate_topology(nir_builder *b, nir_intrinsic_instr *intr, void *data)
  * 0, 1, 2, -1, 3, 4, 5, ...
  */
 static bool
-match_list_topology(struct agx_gs_info *info, uint32_t count,
+match_list_topology(struct poly_gs_info *info, uint32_t count,
                     uint32_t *topology, bool has_restart)
 {
    unsigned count_with_restart = count + has_restart;
@@ -1095,7 +1095,7 @@ match_list_topology(struct agx_gs_info *info, uint32_t count,
    }
 
    /* If we match, rewrite the topology and drop indexing */
-   info->shape = AGX_GS_SHAPE_STATIC_PER_INSTANCE;
+   info->shape = POLY_GS_SHAPE_STATIC_PER_INSTANCE;
    info->mode = u_decomposed_prim(info->mode);
    info->max_indices =
       ((info->max_indices + has_restart) / count_with_restart) * count;
@@ -1131,12 +1131,12 @@ is_strip_topology(uint32_t *indices, uint32_t index_count)
  * VS(compute) + GS(vertex) sequences without auxiliary programs.
  */
 static void
-optimize_static_topology(struct agx_gs_info *info, nir_shader *gs)
+optimize_static_topology(struct poly_gs_info *info, nir_shader *gs)
 {
    struct topology_ctx ctx = {.info = info};
    bool has_restart = info->mode != MESA_PRIM_POINTS;
 
    nir_shader_intrinsics_pass(gs, evaluate_topology, nir_metadata_all, &ctx);
-   if (info->shape == AGX_GS_SHAPE_DYNAMIC_INDEXED)
+   if (info->shape == POLY_GS_SHAPE_DYNAMIC_INDEXED)
       return;
 
    /* We can always drop the trailing restart index */
@@ -1150,7 +1150,7 @@ optimize_static_topology(struct agx_gs_info *info, nir_shader *gs)
 
    /* Try to pattern match a strip topology */
    if (is_strip_topology(ctx.topology, info->max_indices)) {
-      info->shape = AGX_GS_SHAPE_STATIC_PER_PRIM;
+      info->shape = POLY_GS_SHAPE_STATIC_PER_PRIM;
       return;
    }
 
@@ -1161,7 +1161,7 @@ optimize_static_topology(struct agx_gs_info *info, nir_shader *gs)
    * XXX: check if this holds with streams.
    */
    if (info->max_indices >= ARRAY_SIZE(info->topology)) {
-      info->shape = AGX_GS_SHAPE_DYNAMIC_INDEXED;
+      info->shape = POLY_GS_SHAPE_DYNAMIC_INDEXED;
       return;
    }
 
@@ -1170,12 +1170,12 @@ optimize_static_topology(struct agx_gs_info *info, nir_shader *gs)
       info->topology[i] = ctx.topology[i];
    }
 
-   info->shape = AGX_GS_SHAPE_STATIC_INDEXED;
+   info->shape = POLY_GS_SHAPE_STATIC_INDEXED;
 }
 
 bool
-agx_nir_lower_gs(nir_shader *gs, nir_shader **gs_count, nir_shader **gs_copy,
-                 nir_shader **pre_gs, struct agx_gs_info *info)
+poly_nir_lower_gs(nir_shader *gs, nir_shader **gs_count, nir_shader **gs_copy,
+                  nir_shader **pre_gs, struct poly_gs_info *info)
 {
    /* Lower I/O as assumed by the rest of GS lowering */
    if (gs->xfb_info != NULL) {
@@ -1212,7 +1212,7 @@ agx_nir_lower_gs(nir_shader *gs, nir_shader **gs_count, nir_shader **gs_copy,
    /* Lower geometry shader writes to contain all of the required counts, so we
     * know where in the various buffers we should write vertices.
     */
-   NIR_PASS(_, gs, agx_nir_lower_gs_intrinsics);
+   NIR_PASS(_, gs, lower_gs_intrinsics);
 
    /* Clean up after all that lowering we did */
    bool progress = false;
@@ -1241,7 +1241,7 @@ agx_nir_lower_gs(nir_shader *gs, nir_shader **gs_count, nir_shader **gs_copy,
    */
    struct lower_gs_state gs_state = {.info = info};
 
-   *info = (struct agx_gs_info){
+   *info = (struct poly_gs_info){
       .mode = gs->info.gs.output_primitive,
       .xfb = gs->xfb_info != NULL,
      .shape = -1,
@@ -1252,10 +1252,13 @@ agx_nir_lower_gs(nir_shader *gs, nir_shader **gs_count, nir_shader **gs_copy,
    nir_gs_count_vertices_and_primitives(gs, NULL, static_indices,
                                         gs_state.static_count, 4);
 
+   STATIC_ASSERT(ARRAY_SIZE(gs_state.count_index) ==
+                 ARRAY_SIZE(gs_state.static_count));
+
    /* Anything we don't know statically will be tracked by the count buffer.
     * Determine the layout for it.
    */
-   for (unsigned i = 0; i < MAX_VERTEX_STREAMS; ++i) {
+   for (unsigned i = 0; i < ARRAY_SIZE(gs_state.count_index); ++i) {
       gs_state.count_index[i] = (gs_state.static_count[i] < 0) ?
         info->count_words++ : -1;
    }
 
@@ -1272,7 +1275,7 @@ agx_nir_lower_gs(nir_shader *gs, nir_shader **gs_count, nir_shader **gs_copy,
    if (static_indices[0] >= 0) {
       optimize_static_topology(info, gs);
    } else {
-      info->shape = AGX_GS_SHAPE_DYNAMIC_INDEXED;
+      info->shape = POLY_GS_SHAPE_DYNAMIC_INDEXED;
    }
 
    *gs_copy = create_gs_rast_shader(gs, &gs_state);
@@ -1344,20 +1347,22 @@ agx_nir_lower_gs(nir_shader *gs, nir_shader **gs_count, nir_shader **gs_copy,
    /* Gather information required for transform feedback / query programs */
    struct nir_xfb_info *xfb = gs->xfb_info;
 
-   struct agx_xfb_key key = {
+   struct poly_xfb_key key = {
      .streams = gs->info.gs.active_stream_mask,
      .invocations = gs->info.gs.invocations,
      .vertices_per_prim = nir_verts_in_output_prim(gs),
    };
 
-   for (unsigned i = 0; i < 4; ++i) {
+   STATIC_ASSERT(ARRAY_SIZE(key.buffer_to_stream) == ARRAY_SIZE(key.stride));
+
+   for (unsigned i = 0; i < ARRAY_SIZE(gs_state.count_index); ++i) {
      key.count_index[i] = gs_state.count_index[i];
      key.static_count[i] = gs_state.static_count[i];
    }
 
    if (xfb) {
      key.buffers_written = xfb->buffers_written;
 
-      for (unsigned i = 0; i < 4; ++i) {
+      for (unsigned i = 0; i < ARRAY_SIZE(key.buffer_to_stream); ++i) {
        key.buffer_to_stream[i] = xfb->buffer_to_stream[i];
        key.stride[i] = xfb->buffers[i].stride;
      }
@@ -1409,14 +1414,13 @@ lower_vs_before_gs(nir_builder *b, nir_intrinsic_instr *intr, void *data)
    nir_def *buffer, *nr_verts, *instance_id, *primitive_id;
    if (b->shader->info.stage == MESA_SHADER_VERTEX) {
       buffer = nir_load_vs_output_buffer_poly(b);
-      nr_verts =
-         libagx_input_vertices(b, nir_load_input_assembly_buffer_poly(b));
+      nr_verts = poly_input_vertices(b, nir_load_input_assembly_buffer_poly(b));
    } else {
       assert(b->shader->info.stage == MESA_SHADER_TESS_EVAL);
 
       /* Instancing is unrolled during tessellation so nr_verts is ignored.
       */
      nr_verts = nir_imm_int(b, 0);
-      buffer = libagx_tes_buffer(b, nir_load_tess_param_buffer_poly(b));
+      buffer = poly_tes_buffer(b, nir_load_tess_param_buffer_poly(b));
    }
 
    if (b->shader->info.stage == MESA_SHADER_VERTEX &&
@@ -1431,7 +1435,7 @@ lower_vs_before_gs(nir_builder *b, nir_intrinsic_instr *intr, void *data)
    nir_def *linear_id =
       nir_iadd(b, nir_imul(b, instance_id, nr_verts), primitive_id);
 
-   nir_def *addr = libagx_vertex_output_address(
+   nir_def *addr = poly_vertex_output_address(
       b, buffer, nir_imm_int64(b, b->shader->info.outputs_written), linear_id,
      location);
 
@@ -1444,7 +1448,7 @@ lower_vs_before_gs(nir_builder *b, nir_intrinsic_instr *intr, void *data)
 }
 
 bool
-agx_nir_lower_vs_before_gs(struct nir_shader *vs)
+poly_nir_lower_vs_before_gs(struct nir_shader *vs)
 {
    /* Lower vertex stores to memory stores */
    return nir_shader_intrinsics_pass(vs, lower_vs_before_gs,
diff --git a/src/poly/nir/poly_nir_lower_gs.h b/src/poly/nir/poly_nir_lower_gs.h
new file mode 100644
index 00000000000..75727a661aa
--- /dev/null
+++ b/src/poly/nir/poly_nir_lower_gs.h
@@ -0,0 +1,61 @@
+/*
+ * Copyright 2023 Alyssa Rosenzweig
+ * SPDX-License-Identifier: MIT
+ */
+
+#pragma once
+
+#include <stdbool.h>
+#include <stdint.h>
+#include "poly/geometry.h"
+#include "nir.h"
+#include "shader_enums.h"
+
+struct nir_def *poly_load_per_vertex_input(struct nir_builder *b,
+                                           nir_intrinsic_instr *intr,
+                                           struct nir_def *vertex);
+
+nir_def *poly_nir_load_vertex_id(struct nir_builder *b, nir_def *id,
+                                 unsigned index_size_B);
+
+bool poly_nir_lower_sw_vs(struct nir_shader *s, unsigned index_size_B);
+
+bool poly_nir_lower_vs_before_gs(struct nir_shader *vs);
+
+struct poly_gs_info {
+   /* Output primitive mode for geometry shaders */
+   enum mesa_prim mode;
+
+   /* Number of words per primitive in the count buffer */
+   unsigned count_words;
+
+   /* Per-input primitive stride of the output index buffer */
+   unsigned max_indices;
+
+   /* Whether the GS includes transform feedback at a compile-time level */
+   bool xfb;
+
+   /* Whether a prefix sum is required on the count outputs. Implies xfb */
+   bool prefix_sum;
+
+   /* Whether the GS writes to a stream other than stream #0 */
+   bool multistream;
+
+   /* Shape of the rasterization draw, named by the instance ID */
+   enum poly_gs_shape shape;
+
+   /* Static topology used if shape = POLY_GS_SHAPE_STATIC_INDEXED */
+   uint8_t topology[64];
+};
+
+bool poly_nir_lower_gs(struct nir_shader *gs, struct nir_shader **gs_count,
+                       struct nir_shader **gs_copy, struct nir_shader **pre_gs,
+                       struct poly_gs_info *info);
+
+bool poly_nir_lower_tcs(struct nir_shader *tcs);
+
+bool poly_nir_lower_tes(struct nir_shader *tes, bool to_hw_vs);
+
+uint64_t poly_tcs_per_vertex_outputs(const struct nir_shader *nir);
+
+unsigned poly_tcs_output_stride(const struct nir_shader *nir);
diff --git a/src/asahi/lib/agx_nir_lower_ia.c b/src/poly/nir/poly_nir_lower_ia.c
similarity index 70%
rename from src/asahi/lib/agx_nir_lower_ia.c
rename to src/poly/nir/poly_nir_lower_ia.c
index f0c0c45ab52..14bf7e704d7 100644
--- a/src/asahi/lib/agx_nir_lower_ia.c
+++ b/src/poly/nir/poly_nir_lower_ia.c
@@ -4,25 +4,30 @@
  */
 
 #include "compiler/nir/nir_builder.h"
-#include "libagx/geometry.h"
-#include "libagx/libagx.h"
-#include "agx_nir_lower_gs.h"
+#include "poly/cl/libpoly.h"
+#include "poly/geometry.h"
 #include "nir.h"
 
+/* XXX: Remove me later */
+nir_def *poly_nir_load_vertex_id(struct nir_builder *b, nir_def *id,
+                                 unsigned index_size_B);
+
+bool poly_nir_lower_sw_vs(struct nir_shader *s, unsigned index_size_B);
+
 /*
  * This file implements basic input assembly in software. It runs on software
  * vertex shaders, as part of geometry/tessellation lowering. It does not apply
  * the topology, which happens in the geometry shader.
  */
 nir_def *
-agx_nir_load_vertex_id(nir_builder *b, nir_def *id, unsigned index_size_B)
+poly_nir_load_vertex_id(nir_builder *b, nir_def *id, unsigned index_size_B)
 {
    /* If drawing with an index buffer, pull the vertex ID. Otherwise, the
    * vertex ID is just the index as-is.
    */
    if (index_size_B) {
       nir_def *ia = nir_load_input_assembly_buffer_poly(b);
-      id = libagx_load_index_buffer(b, ia, id, nir_imm_int(b, index_size_B));
+      id = poly_load_index_buffer(b, ia, id, nir_imm_int(b, index_size_B));
    }
 
    /* Add the "start", either an index bias or a base vertex. This must happen
@@ -39,7 +44,8 @@ lower(nir_builder *b, nir_intrinsic_instr *intr, void *data)
 
    if (intr->intrinsic == nir_intrinsic_load_vertex_id) {
       nir_def *id = nir_channel(b, nir_load_global_invocation_id(b, 32), 0);
-      nir_def_replace(&intr->def, agx_nir_load_vertex_id(b, id, *index_size_B));
+      nir_def_replace(&intr->def,
+                      poly_nir_load_vertex_id(b, id, *index_size_B));
       return true;
    } else if (intr->intrinsic == nir_intrinsic_load_instance_id) {
       nir_def_replace(&intr->def,
@@ -51,7 +57,7 @@ lower(nir_builder *b, nir_intrinsic_instr *intr, void *data)
 }
 
 bool
-agx_nir_lower_sw_vs(nir_shader *s, unsigned index_size_B)
+poly_nir_lower_sw_vs(nir_shader *s, unsigned index_size_B)
 {
    return nir_shader_intrinsics_pass(s, lower, nir_metadata_control_flow,
                                      &index_size_B);
diff --git a/src/asahi/lib/agx_nir_lower_tess.c b/src/poly/nir/poly_nir_lower_tess.c
similarity index 78%
rename from src/asahi/lib/agx_nir_lower_tess.c
rename to src/poly/nir/poly_nir_lower_tess.c
index d765d10dda1..70aa21eeeb3 100644
--- a/src/asahi/lib/agx_nir_lower_tess.c
+++ b/src/poly/nir/poly_nir_lower_tess.c
@@ -3,11 +3,11 @@
  * SPDX-License-Identifier: MIT
  */
 
-#include "libagx/geometry.h"
-#include "libagx/libagx.h"
+#include "poly/cl/libpoly.h"
+#include "poly/geometry.h"
+#include "poly/nir/poly_nir_lower_gs.h"
 #include "util/bitscan.h"
 #include "util/macros.h"
-#include "agx_nir_lower_gs.h"
 #include "nir.h"
 #include "nir_builder.h"
 #include "nir_builder_opcodes.h"
@@ -18,12 +18,12 @@
 static nir_def *
 tcs_unrolled_id(nir_builder *b)
 {
-   return libagx_tcs_unrolled_id(b, nir_load_tess_param_buffer_poly(b),
-                                 nir_load_workgroup_id(b));
+   return poly_tcs_unrolled_id(b, nir_load_tess_param_buffer_poly(b),
+                               nir_load_workgroup_id(b));
 }
 
 uint64_t
-agx_tcs_per_vertex_outputs(const nir_shader *nir)
+poly_tcs_per_vertex_outputs(const nir_shader *nir)
 {
    return nir->info.outputs_written &
          ~(VARYING_BIT_TESS_LEVEL_INNER | VARYING_BIT_TESS_LEVEL_OUTER |
@@ -31,11 +31,11 @@ agx_tcs_per_vertex_outputs(const nir_shader *nir)
 }
 
 unsigned
-agx_tcs_output_stride(const nir_shader *nir)
+poly_tcs_output_stride(const nir_shader *nir)
 {
-   return libagx_tcs_out_stride(util_last_bit(nir->info.patch_outputs_written),
-                                nir->info.tess.tcs_vertices_out,
-                                agx_tcs_per_vertex_outputs(nir));
+   return poly_tcs_out_stride(util_last_bit(nir->info.patch_outputs_written),
+                              nir->info.tess.tcs_vertices_out,
+                              poly_tcs_per_vertex_outputs(nir));
 }
 
 static nir_def *
@@ -44,12 +44,12 @@ tcs_out_addr(nir_builder *b, nir_intrinsic_instr *intr, nir_def *vertex_id)
    nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
    nir_def *offset = nir_get_io_offset_src(intr)->ssa;
 
-   nir_def *addr = libagx_tcs_out_address(
+   nir_def *addr = poly_tcs_out_address(
       b, nir_load_tess_param_buffer_poly(b), tcs_unrolled_id(b), vertex_id,
       nir_iadd_imm(b, offset, sem.location),
       nir_imm_int(b, util_last_bit(b->shader->info.patch_outputs_written)),
       nir_imm_int(b, b->shader->info.tess.tcs_vertices_out),
-      nir_imm_int64(b, agx_tcs_per_vertex_outputs(b->shader)));
+      nir_imm_int64(b, poly_tcs_per_vertex_outputs(b->shader)));
 
    addr = nir_iadd_imm(b, addr, nir_intrinsic_component(intr) * 4);
 
@@ -68,9 +68,9 @@ lower_tes_load(nir_builder *b, nir_intrinsic_instr *intr)
    if (intr->intrinsic == nir_intrinsic_load_per_vertex_input)
       vertex = intr->src[0].ssa;
 
-   nir_def *addr = libagx_tes_in_address(b, nir_load_tess_param_buffer_poly(b),
-                                         nir_load_vertex_id(b), vertex,
-                                         nir_iadd_imm(b, offset, location));
+   nir_def *addr = poly_tes_in_address(b, nir_load_tess_param_buffer_poly(b),
+                                       nir_load_vertex_id(b), vertex,
+                                       nir_iadd_imm(b, offset, location));
 
    if (nir_intrinsic_has_component(intr))
       addr = nir_iadd_imm(b, addr, nir_intrinsic_component(intr) * 4);
@@ -84,10 +84,10 @@ tcs_load_input(nir_builder *b, nir_intrinsic_instr *intr)
 {
    nir_def *base = nir_imul(
       b, tcs_unrolled_id(b),
-      libagx_tcs_patch_vertices_in(b, nir_load_tess_param_buffer_poly(b)));
+      poly_tcs_patch_vertices_in(b, nir_load_tess_param_buffer_poly(b)));
    nir_def *vertex = nir_iadd(b, base, intr->src[0].ssa);
 
-   return agx_load_per_vertex_input(b, intr, vertex);
+   return poly_load_per_vertex_input(b, intr, vertex);
 }
 
 static nir_def *
@@ -114,16 +114,15 @@ lower_tcs_impl(nir_builder *b, nir_intrinsic_instr *intr)
       return tcs_load_input(b, intr);
 
    case nir_intrinsic_load_patch_vertices_in:
-      return libagx_tcs_patch_vertices_in(b,
-                                          nir_load_tess_param_buffer_poly(b));
+      return poly_tcs_patch_vertices_in(b, nir_load_tess_param_buffer_poly(b));
 
    case nir_intrinsic_load_tess_level_outer_default:
-      return libagx_tess_level_outer_default(
-         b, nir_load_tess_param_buffer_poly(b));
+      return poly_tess_level_outer_default(b,
+                                           nir_load_tess_param_buffer_poly(b));
 
    case nir_intrinsic_load_tess_level_inner_default:
-      return libagx_tess_level_inner_default(
-         b, nir_load_tess_param_buffer_poly(b));
+      return poly_tess_level_inner_default(b,
+                                           nir_load_tess_param_buffer_poly(b));
 
    case nir_intrinsic_load_output: {
       nir_def *addr = tcs_out_addr(b, intr, nir_undef(b, 1, 32));
@@ -176,7 +175,7 @@ lower_tcs(nir_builder *b, nir_intrinsic_instr *intr, void *data)
 }
 
 bool
-agx_nir_lower_tcs(nir_shader *tcs)
+poly_nir_lower_tcs(nir_shader *tcs)
 {
    return nir_shader_intrinsics_pass(tcs, lower_tcs, nir_metadata_control_flow,
                                      NULL);
@@ -187,12 +186,12 @@ lower_tes_impl(nir_builder *b, nir_intrinsic_instr *intr, void *data)
 {
    switch (intr->intrinsic) {
    case nir_intrinsic_load_tess_coord_xy:
-      return libagx_load_tess_coord(b, nir_load_tess_param_buffer_poly(b),
-                                    nir_load_vertex_id(b));
+      return poly_load_tess_coord(b, nir_load_tess_param_buffer_poly(b),
+                                  nir_load_vertex_id(b));
 
    case nir_intrinsic_load_primitive_id:
-      return libagx_tes_patch_id(b, nir_load_tess_param_buffer_poly(b),
-                                 nir_load_vertex_id(b));
+      return poly_tes_patch_id(b, nir_load_tess_param_buffer_poly(b),
+                               nir_load_vertex_id(b));
 
    case nir_intrinsic_load_input:
    case nir_intrinsic_load_per_vertex_input:
@@ -201,8 +200,7 @@ lower_tes_impl(nir_builder *b, nir_intrinsic_instr *intr, void *data)
       return lower_tes_load(b, intr);
 
    case nir_intrinsic_load_patch_vertices_in:
-      return libagx_tes_patch_vertices_in(b,
-                                          nir_load_tess_param_buffer_poly(b));
+      return poly_tes_patch_vertices_in(b, nir_load_tess_param_buffer_poly(b));
 
    default:
       return NULL;
@@ -232,12 +230,12 @@ lower_tes_indexing(nir_builder *b, nir_intrinsic_instr *intr, void *data)
    b->cursor = nir_before_instr(&intr->instr);
    nir_def *p = nir_load_tess_param_buffer_poly(b);
    nir_def *id = nir_channel(b, nir_load_global_invocation_id(b, 32), 0);
-   nir_def_replace(&intr->def, libagx_load_tes_index(b, p, id));
+   nir_def_replace(&intr->def, poly_load_tes_index(b, p, id));
    return true;
 }
 
 bool
-agx_nir_lower_tes(nir_shader *tes, bool to_hw_vs)
+poly_nir_lower_tes(nir_shader *tes, bool to_hw_vs)
 {
    nir_lower_tess_coord_z(
       tes, tes->info.tess._primitive_mode == TESS_PRIMITIVE_TRIANGLES);
diff --git a/src/poly/tessellator.h b/src/poly/tessellator.h
new file mode 100644
index 00000000000..f8b722bfef3
--- /dev/null
+++ b/src/poly/tessellator.h
@@ -0,0 +1,108 @@
+/*
+ * Copyright 2024 Valve Corporation
+ * SPDX-License-Identifier: MIT
+ */
+
+#pragma once
+
+#include "compiler/libcl/libcl.h"
+
+enum poly_tess_partitioning {
+   POLY_TESS_PARTITIONING_FRACTIONAL_ODD,
+   POLY_TESS_PARTITIONING_FRACTIONAL_EVEN,
+   POLY_TESS_PARTITIONING_INTEGER,
+};
+
+enum poly_tess_mode {
+   /* Do not actually tessellate, just write the index counts */
+   POLY_TESS_MODE_COUNT,
+
+   /* Tessellate using the count buffers to allocate indices */
+   POLY_TESS_MODE_WITH_COUNTS,
+};
+
+struct poly_tess_point {
+   uint32_t u;
+   uint32_t v;
+};
+static_assert(sizeof(struct poly_tess_point) == 8);
+
+struct poly_tess_args {
+   /* Heap to allocate tessellator outputs in */
+   DEVICE(struct poly_heap) heap;
+
+   /* Patch coordinate buffer, indexed as:
+    *
+    *    coord_allocs[patch_ID] + vertex_in_patch
+    */
+   DEVICE(struct poly_tess_point) patch_coord_buffer;
+
+   /* Per-patch index within the heap for the tess coords, written by the
+    * tessellator based on the allocated memory.
+    */
+   DEVICE(uint32_t) coord_allocs;
+
+   /* Space for output draws from the tessellator. API draw calls. */
+   DEVICE(uint32_t) out_draws;
+
+   /* Tessellation control shader output buffer. */
+   DEVICE(float) tcs_buffer;
+
+   /* Count buffer. # of indices per patch written here, then prefix summed. */
+   DEVICE(uint32_t) counts;
+
+   /* Allocated index buffer for all patches, if we're prefix summing counts */
+   DEVICE(uint32_t) index_buffer;
+
+   /* Address of the tess eval invocation counter for implementing pipeline
+    * statistics, if active. Zero if inactive. Incremented by tessellator.
+    */
+   DEVICE(uint32_t) statistic;
+
+   /* When geom+tess used together, the buffer containing TES outputs (executed
+    * as a hardware compute shader).
+    */
+   uint64_t tes_buffer;
+
+   /* Bitfield of TCS per-vertex outputs */
+   uint64_t tcs_per_vertex_outputs;
+
+   /* Default tess levels used in OpenGL when there is no TCS in the pipeline.
+    * Unused in Vulkan and OpenGL ES.
+    */
+   float tess_level_outer_default[4];
+   float tess_level_inner_default[2];
+
+   /* Number of vertices in the input patch */
+   uint32_t input_patch_size;
+
+   /* Number of vertices in the TCS output patch */
+   uint32_t output_patch_size;
+
+   /* Number of patch constants written by TCS */
+   uint32_t tcs_patch_constants;
+
+   /* Number of input patches per instance of the VS/TCS */
+   uint32_t patches_per_instance;
+
+   /* Stride between tessellation factors in the TCS output buffer. */
+   uint32_t tcs_stride_el;
+
+   /* Number of patches being tessellated */
+   uint32_t nr_patches;
+
+   /* Partitioning and points mode. These affect per-patch setup code but not
+    * the hot tessellation loop so we make them dynamic to reduce tessellator
+    * variants.
+    */
+   enum poly_tess_partitioning partitioning;
+   uint32_t points_mode;
+   uint32_t isolines;
+
+   /* When fed into a geometry shader, triangles should be counter-clockwise.
+    * The tessellator always produces clockwise triangles, but we can swap
+    * dynamically in the TES.
+    */
+   uint32_t ccw;
+} PACKED;
+static_assert(sizeof(struct poly_tess_args) == 36 * 4);
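
A quick sanity check on the closing static_assert: assuming 8-byte DEVICE pointers and PACKED eliminating padding, struct poly_tess_args is eight device pointers (64 bytes), two uint64_t fields (16), six floats (24), and ten 32-bit fields counting the enum as 4 bytes (40), for a total of 144 bytes = 36 * 4, matching the assertion.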
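For context on the count buffer layout in poly_nir_lower_gs() above: a stream whose vertex/primitive count is known at compile time needs no runtime counter, so only streams with static_count < 0 are assigned a word in the count buffer, and count_words ends up as the per-primitive stride. A minimal standalone sketch of that slot assignment (plain C; MAX_STREAMS stands in for POLY_MAX_VERTEX_STREAMS and the values are made up, not taken from this patch):

#include <stdio.h>

#define MAX_STREAMS 4

int
main(void)
{
   /* static_count[i] < 0 means the count for stream i is only known at
    * runtime, mirroring gs_state.static_count in the pass.
    */
   int static_count[MAX_STREAMS] = {-1, 6, -1, 0};
   int count_index[MAX_STREAMS];
   int count_words = 0;

   for (unsigned i = 0; i < MAX_STREAMS; ++i)
      count_index[i] = (static_count[i] < 0) ? count_words++ : -1;

   for (unsigned i = 0; i < MAX_STREAMS; ++i)
      printf("stream %u: static=%d, count slot=%d\n", i, static_count[i],
             count_index[i]);

   printf("count_words = %d\n", count_words);
   return 0;
}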
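Similarly, match_list_topology() above recognizes the repeating pattern 0, 1, 2, -1, 3, 4, 5, ... so the restart indices can be dropped and the draw converted to a plain list. A standalone illustration of that pattern check (a sketch of the idea rather than the pass's exact code; it assumes 0xffffffff as the restart index):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool
is_list_topology(const uint32_t *ix, uint32_t len, uint32_t count,
                 bool has_restart)
{
   /* Each block is `count` consecutive indices plus an optional restart */
   uint32_t block = count + has_restart;

   for (uint32_t i = 0; i < len; ++i) {
      bool restart = has_restart && (i % block) == count;
      uint32_t expected = restart ? ~0u : (i / block) * count + (i % block);

      if (ix[i] != expected)
         return false;
   }

   return true;
}

int
main(void)
{
   /* Two triangles separated by a restart: matches for count = 3 */
   const uint32_t ix[] = {0, 1, 2, ~0u, 3, 4, 5};
   printf("%d\n", is_list_topology(ix, 7, 3, true));
   return 0;
}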
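Finally, on the software input assembly in poly_nir_lower_ia.c: poly_nir_load_vertex_id() emits NIR that fetches an index_size_B-byte index from the index buffer (for non-indexed draws the invocation index is used directly), then adds the base vertex / index bias on top. The CPU-side equivalent of just the fetch, for illustration only (the real code builds NIR and goes through the input assembly state buffer):

#include <stdint.h>
#include <stdio.h>

static uint32_t
fetch_index(const void *ib, uint32_t id, unsigned index_size_B)
{
   switch (index_size_B) {
   case 1: return ((const uint8_t *)ib)[id];
   case 2: return ((const uint16_t *)ib)[id];
   case 4: return ((const uint32_t *)ib)[id];
   default: return id; /* non-indexed draw: the ID is the vertex ID */
   }
}

int
main(void)
{
   const uint16_t ib[] = {7, 8, 9};
   printf("%u\n", fetch_index(ib, 1, 2)); /* prints 8 */
   return 0;
}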