poly: Migrate AGX's GS/TESS emulation to common code

This moves most of the code to a new home: src/poly.
Most of the precompiled kernel logic that could be moved is now provided by poly.
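
For example, the shared GS/tessellation helpers keep their logic and only
change prefix as they move, as seen throughout the diff below (illustrative,
not exhaustive):

    struct agx_heap              -> struct poly_heap
    struct agx_ia_state          -> struct poly_ia_state
    struct agx_geometry_params   -> struct poly_geometry_params
    enum agx_gs_shape            -> enum poly_gs_shape
    libagx_index_buffer()        -> poly_index_buffer()
    agx_heap_alloc_nonatomic()   -> poly_heap_alloc_nonatomic()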

Signed-off-by: Mary Guillemard <mary.guillemard@collabora.com>
Acked-by: Alyssa Rosenzweig <alyssa.rosenzweig@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/37914>
Mary Guillemard 2025-10-06 12:57:04 +02:00 committed by Marge Bot
parent 8048004238
commit b2accf86d1
35 changed files with 3421 additions and 3117 deletions


@ -845,6 +845,10 @@ endif
with_llvm = with_llvm \
.enable_if(with_clc, error_message : 'CLC requires LLVM')
with_poly = [
with_gallium_asahi, with_asahi_vk, with_tools.contains('asahi'),
].contains(true)
dep_clc = null_dep
if with_clc
dep_clc = dependency('libclc')


@ -237,7 +237,9 @@ ForEachMacros:
- agx_foreach_reg_dest
- agx_foreach_successor
- foreach_next_use
- libagx_foreach_xfb
# poly
- poly_foreach_xfb
# radv
- PHASE


@ -316,16 +316,6 @@ agx_fill_decompress_args(struct ail_layout *layout, unsigned layer,
agx_fill_decompress_args(layout, layer, level, ptr, images), \
util_logbase2(layout->sample_count_sa))
#define libagx_tessellate(context, grid, barrier, prim, mode, state) \
if (prim == TESS_PRIMITIVE_QUADS) { \
libagx_tess_quad(context, grid, barrier, state, mode); \
} else if (prim == TESS_PRIMITIVE_TRIANGLES) { \
libagx_tess_tri(context, grid, barrier, state, mode); \
} else { \
assert(prim == TESS_PRIMITIVE_ISOLINES); \
libagx_tess_isoline(context, grid, barrier, state, mode); \
}
struct agx_border_packed;
void agx_pack_border(struct agx_border_packed *out, const uint32_t in[4],


@ -1,61 +0,0 @@
/*
* Copyright 2023 Alyssa Rosenzweig
* SPDX-License-Identifier: MIT
*/
#pragma once
#include <stdbool.h>
#include <stdint.h>
#include "libagx/geometry.h"
#include "nir.h"
#include "shader_enums.h"
struct nir_def *agx_load_per_vertex_input(struct nir_builder *b,
nir_intrinsic_instr *intr,
struct nir_def *vertex);
nir_def *agx_nir_load_vertex_id(struct nir_builder *b, nir_def *id,
unsigned index_size_B);
bool agx_nir_lower_sw_vs(struct nir_shader *s, unsigned index_size_B);
bool agx_nir_lower_vs_before_gs(struct nir_shader *vs);
struct agx_gs_info {
/* Output primitive mode for geometry shaders */
enum mesa_prim mode;
/* Number of words per primitive in the count buffer */
unsigned count_words;
/* Per-input primitive stride of the output index buffer */
unsigned max_indices;
/* Whether the GS includes transform feedback at a compile-time level */
bool xfb;
/* Whether a prefix sum is required on the count outputs. Implies xfb */
bool prefix_sum;
/* Whether the GS writes to a stream other than stream #0 */
bool multistream;
/* Shape of the rasterization draw, named by the instance ID */
enum agx_gs_shape shape;
/* Static topology used if shape = AGX_GS_SHAPE_STATIC_INDEXED */
uint8_t topology[64];
};
bool agx_nir_lower_gs(struct nir_shader *gs, struct nir_shader **gs_count,
struct nir_shader **gs_copy, struct nir_shader **pre_gs,
struct agx_gs_info *info);
bool agx_nir_lower_tcs(struct nir_shader *tcs);
bool agx_nir_lower_tes(struct nir_shader *tes, bool to_hw_vs);
uint64_t agx_tcs_per_vertex_outputs(const struct nir_shader *nir);
unsigned agx_tcs_output_stride(const struct nir_shader *nir);


@ -5,11 +5,12 @@
*/
#include "gallium/include/pipe/p_defines.h"
#include "poly/cl/libpoly.h"
#include "poly/nir/poly_nir_lower_gs.h"
#include "util/format/u_formats.h"
#include "agx_abi.h"
#include "agx_linker.h"
#include "agx_nir.h"
#include "agx_nir_lower_gs.h"
#include "agx_nir_lower_vbo.h"
#include "agx_pack.h"
#include "agx_tilebuffer.h"
@ -149,11 +150,11 @@ lower_adjacency(nir_builder *b, nir_intrinsic_instr *intr, void *data)
nir_def *id = nir_load_vertex_id(b);
if (key->adjacency == MESA_PRIM_LINES_ADJACENCY) {
id = libagx_map_to_line_adj(b, id);
id = poly_map_to_line_adj(b, id);
} else if (key->adjacency == MESA_PRIM_TRIANGLE_STRIP_ADJACENCY) {
id = libagx_map_to_tri_strip_adj(b, id);
id = poly_map_to_tri_strip_adj(b, id);
} else if (key->adjacency == MESA_PRIM_LINE_STRIP_ADJACENCY) {
id = libagx_map_to_line_strip_adj(b, id);
id = poly_map_to_line_strip_adj(b, id);
} else if (key->adjacency == MESA_PRIM_TRIANGLES_ADJACENCY) {
/* Sequence (0, 2, 4), (6, 8, 10), ... */
id = nir_imul_imm(b, id, 2);
@ -161,7 +162,7 @@ lower_adjacency(nir_builder *b, nir_intrinsic_instr *intr, void *data)
UNREACHABLE("unknown");
}
id = agx_nir_load_vertex_id(b, id, key->sw_index_size_B);
id = poly_nir_load_vertex_id(b, id, key->sw_index_size_B);
nir_def_replace(&intr->def, id);
return true;
@ -215,7 +216,7 @@ agx_nir_vs_prolog(nir_builder *b, const void *key_)
}
if (!key->hw) {
agx_nir_lower_sw_vs(b->shader, key->sw_index_size_B);
poly_nir_lower_sw_vs(b->shader, key->sw_index_size_B);
} else if (key->adjacency) {
nir_shader_intrinsics_pass(b->shader, lower_adjacency,
nir_metadata_control_flow, (void *)key);


@ -11,11 +11,8 @@ libasahi_lib_files = files(
'agx_linker.c',
'agx_bg_eot.c',
'agx_tilebuffer.c',
'agx_nir_lower_gs.c',
'agx_nir_lower_ia.c',
'agx_nir_lower_msaa.c',
'agx_nir_lower_sample_intrinsics.c',
'agx_nir_lower_tess.c',
'agx_nir_lower_tilebuffer.c',
'agx_nir_lower_uvs.c',
'agx_nir_lower_vbo.c',
@ -66,8 +63,8 @@ libasahi_lib = static_library(
include_directories : [inc_asahi, inc_virtio_gpu, inc_virtio_vdrm],
c_args : [no_override_init_args, '-Wno-c2x-extensions'],
gnu_symbol_visibility : 'hidden',
link_with: [libasahi_decode, libvdrm],
dependencies: [dep_libdrm, dep_valgrind, idep_nir, idep_mesautil, idep_libagx],
link_with: [libasahi_decode, libvdrm, libpoly_nir],
dependencies: [dep_libdrm, dep_valgrind, idep_nir, idep_mesautil, idep_libagx, idep_libpoly],
build_by_default : false,
)


@ -4,8 +4,8 @@
*/
#include "asahi/lib/agx_abi.h"
#include "compiler/libcl/libcl_vk.h"
#include "poly/geometry.h"
#include "agx_pack.h"
#include "geometry.h"
#include "libagx_dgc.h"
/*
@ -36,7 +36,7 @@ libagx_predicate_indirect(global uint32_t *out, constant uint32_t *in,
KERNEL(1)
libagx_draw_without_adj(global VkDrawIndirectCommand *out,
global VkDrawIndirectCommand *in,
global struct agx_ia_state *ia, uint64_t index_buffer,
global struct poly_ia_state *ia, uint64_t index_buffer,
uint64_t index_buffer_range_el, int index_size_B,
enum mesa_prim prim)
{
@ -49,11 +49,11 @@ libagx_draw_without_adj(global VkDrawIndirectCommand *out,
if (index_size_B) {
uint offs = in->firstVertex;
ia->index_buffer = libagx_index_buffer(
index_buffer, index_buffer_range_el, offs, index_size_B);
ia->index_buffer = poly_index_buffer(index_buffer, index_buffer_range_el,
offs, index_size_B);
ia->index_buffer_range_el =
libagx_index_buffer_range_el(index_buffer_range_el, offs);
poly_index_buffer_range_el(index_buffer_range_el, offs);
}
}
@ -122,8 +122,7 @@ libagx_memset_small(global uchar *dst, uchar b, int len, uint tid)
* TODO: Handle multiple draws in parallel.
*/
KERNEL(32)
libagx_draw_robust_index(global uint32_t *vdm,
global struct agx_heap *heap,
libagx_draw_robust_index(global uint32_t *vdm, global struct poly_heap *heap,
constant VkDrawIndexedIndirectCommand *cmd,
uint64_t in_buf_ptr, uint32_t in_buf_range_B,
ushort restart, enum agx_primitive topology,
@ -163,7 +162,7 @@ libagx_draw_robust_index(global uint32_t *vdm,
/* Allocate memory for the shadow index buffer */
global uchar *padded;
if (first) {
padded = agx_heap_alloc_nonatomic(heap, out_size_B);
padded = poly_heap_alloc_nonatomic(heap, out_size_B);
}
padded = (global uchar *)sub_group_broadcast((uintptr_t)padded, 0);
@ -172,7 +171,7 @@ libagx_draw_robust_index(global uint32_t *vdm,
draw.start = 0;
/* Clone the index buffer. The destination is aligned as a post-condition
* of agx_heap_alloc_nonatomic.
* of poly_heap_alloc_nonatomic.
*/
libagx_memcpy_to_aligned((global uint *)padded, in_buf, in_size_B, tid,
32);


@ -4,15 +4,11 @@
* SPDX-License-Identifier: MIT
*/
#include "asahi/lib/agx_abi.h"
#include "compiler/libcl/libcl_vk.h"
#include "poly/geometry.h"
#include "poly/tessellator.h"
#include "util/macros.h"
#include "util/u_math.h"
#include "geometry.h"
#include "query.h"
#include "tessellator.h"
uint64_t nir_ro_to_rw_poly(uint64_t address);
/* Swap the two non-provoking vertices in odd triangles. This generates a vertex
* ID list with a consistent winding order.
@ -32,54 +28,6 @@ map_vertex_in_tri_strip(uint prim, uint vert, bool flatshade_first)
return (provoking || even) ? vert : ((3 - pv) - vert);
}
static inline uint
xfb_prim(uint id, uint n, uint copy)
{
return sub_sat(id, n - 1u) + copy;
}
/*
* Determine whether an output vertex has an n'th copy in the transform feedback
* buffer. This is written weirdly to let constant folding remove unnecessary
* stores when length is known statically.
*/
bool
libagx_xfb_vertex_copy_in_strip(uint n, uint id, uint length, uint copy)
{
uint prim = xfb_prim(id, n, copy);
int num_prims = length - (n - 1);
return copy == 0 || (prim < num_prims && id >= copy && copy < num_prims);
}
uint
libagx_xfb_vertex_offset(uint n, uint invocation_base_prim,
uint strip_base_prim, uint id_in_strip, uint copy,
bool flatshade_first)
{
uint prim = xfb_prim(id_in_strip, n, copy);
uint vert_0 = min(id_in_strip, n - 1);
uint vert = vert_0 - copy;
if (n == 3) {
vert = map_vertex_in_tri_strip(prim, vert, flatshade_first);
}
/* Tally up in the whole buffer */
uint base_prim = invocation_base_prim + strip_base_prim;
uint base_vertex = base_prim * n;
return base_vertex + (prim * n) + vert;
}
uint64_t
libagx_xfb_vertex_address(constant struct agx_geometry_params *p, uint index,
uint buffer, uint stride, uint output_offset)
{
uint xfb_offset = (index * stride) + output_offset;
return (uintptr_t)(p->xfb_base[buffer]) + xfb_offset;
}
static uint
vertex_id_for_line_loop(uint prim, uint vert, uint num_prims)
{
@ -90,20 +38,6 @@ vertex_id_for_line_loop(uint prim, uint vert, uint num_prims)
return prim + vert;
}
uint
libagx_vertex_id_for_line_class(enum mesa_prim mode, uint prim, uint vert,
uint num_prims)
{
/* Line list, line strip, or line loop */
if (mode == MESA_PRIM_LINE_LOOP && prim == (num_prims - 1) && vert == 1)
return 0;
if (mode == MESA_PRIM_LINES)
prim *= 2;
return prim + vert;
}
static uint
vertex_id_for_tri_fan(uint prim, uint vert, bool flatshade_first)
{
@ -122,44 +56,6 @@ vertex_id_for_tri_fan(uint prim, uint vert, bool flatshade_first)
return (vert == 0) ? 0 : prim + vert;
}
uint
libagx_vertex_id_for_tri_class(enum mesa_prim mode, uint prim, uint vert,
bool flatshade_first)
{
if (flatshade_first && mode == MESA_PRIM_TRIANGLE_FAN) {
vert = vert + 1;
vert = (vert == 3) ? 0 : vert;
}
if (mode == MESA_PRIM_TRIANGLE_FAN && vert == 0)
return 0;
if (mode == MESA_PRIM_TRIANGLES)
prim *= 3;
/* Triangle list, triangle strip, or triangle fan */
if (mode == MESA_PRIM_TRIANGLE_STRIP) {
unsigned pv = flatshade_first ? 0 : 2;
bool even = (prim & 1) == 0;
bool provoking = vert == pv;
vert = ((provoking || even) ? vert : ((3 - pv) - vert));
}
return prim + vert;
}
uint
libagx_vertex_id_for_line_adj_class(enum mesa_prim mode, uint prim, uint vert)
{
/* Line list adj or line strip adj */
if (mode == MESA_PRIM_LINES_ADJACENCY)
prim *= 4;
return prim + vert;
}
static uint
vertex_id_for_tri_strip_adj(uint prim, uint vert, uint num_prims,
bool flatshade_first)
@ -206,18 +102,6 @@ vertex_id_for_tri_strip_adj(uint prim, uint vert, uint num_prims,
return (prim * 2) + offset;
}
uint
libagx_vertex_id_for_tri_adj_class(enum mesa_prim mode, uint prim, uint vert,
uint nr, bool flatshade_first)
{
/* Tri adj list or tri adj strip */
if (mode == MESA_PRIM_TRIANGLE_STRIP_ADJACENCY) {
return vertex_id_for_tri_strip_adj(prim, vert, nr, flatshade_first);
} else {
return (6 * prim) + vert;
}
}
static uint
vertex_id_for_topology(enum mesa_prim mode, bool flatshade_first, uint prim,
uint vert, uint num_prims)
@ -262,127 +146,6 @@ vertex_id_for_topology(enum mesa_prim mode, bool flatshade_first, uint prim,
}
}
uint
libagx_map_to_line_adj(uint id)
{
/* Sequence (1, 2), (5, 6), (9, 10), ... */
return ((id & ~1) * 2) + (id & 1) + 1;
}
uint
libagx_map_to_line_strip_adj(uint id)
{
/* Sequence (1, 2), (2, 3), (4, 5), .. */
uint prim = id / 2;
uint vert = id & 1;
return prim + vert + 1;
}
uint
libagx_map_to_tri_strip_adj(uint id)
{
/* Sequence (0, 2, 4), (2, 6, 4), (4, 6, 8), (6, 10, 8)
*
* Although tri strips with adjacency have 6 cases in general, after
* disregarding the vertices only available in a geometry shader, there are
* only even/odd cases. In other words, it's just a triangle strip subject to
* extra padding.
*
* Dividing through by two, the sequence is:
*
* (0, 1, 2), (1, 3, 2), (2, 3, 4), (3, 5, 4)
*/
uint prim = id / 3;
uint vtx = id % 3;
/* Flip the winding order of odd triangles */
if ((prim % 2) == 1) {
if (vtx == 1)
vtx = 2;
else if (vtx == 2)
vtx = 1;
}
return 2 * (prim + vtx);
}
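
As a sanity check of the sequence documented above, here is a minimal
standalone C sketch (illustration only; it simply reimplements the mapping and
prints the first four triangles):

#include <stdio.h>

/* Host-side mirror of libagx_map_to_tri_strip_adj, for illustration. */
static unsigned
map_to_tri_strip_adj(unsigned id)
{
   unsigned prim = id / 3;
   unsigned vtx = id % 3;

   /* Flip the winding order of odd triangles */
   if ((prim % 2) == 1) {
      if (vtx == 1)
         vtx = 2;
      else if (vtx == 2)
         vtx = 1;
   }

   return 2 * (prim + vtx);
}

int
main(void)
{
   /* Prints: (0, 2, 4) (2, 6, 4) (4, 6, 8) (6, 10, 8) */
   for (unsigned id = 0; id < 12; id += 3) {
      printf("(%u, %u, %u) ", map_to_tri_strip_adj(id),
             map_to_tri_strip_adj(id + 1), map_to_tri_strip_adj(id + 2));
   }
   printf("\n");
   return 0;
}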
static void
store_index(uintptr_t index_buffer, uint index_size_B, uint id, uint value)
{
global uint32_t *out_32 = (global uint32_t *)index_buffer;
global uint16_t *out_16 = (global uint16_t *)index_buffer;
global uint8_t *out_8 = (global uint8_t *)index_buffer;
if (index_size_B == 4)
out_32[id] = value;
else if (index_size_B == 2)
out_16[id] = value;
else
out_8[id] = value;
}
static uint
load_index(uintptr_t index_buffer, uint32_t index_buffer_range_el, uint id,
uint index_size)
{
bool oob = id >= index_buffer_range_el;
/* If the load would be out-of-bounds, load the first element which is
* assumed valid. If the application index buffer is empty with robustness2,
* index_buffer will point to a zero sink where only the first is valid.
*/
if (oob) {
id = 0;
}
uint el;
if (index_size == 1) {
el = ((constant uint8_t *)index_buffer)[id];
} else if (index_size == 2) {
el = ((constant uint16_t *)index_buffer)[id];
} else {
el = ((constant uint32_t *)index_buffer)[id];
}
/* D3D robustness semantics. TODO: Optimize? */
if (oob) {
el = 0;
}
return el;
}
uint
libagx_load_index_buffer(constant struct agx_ia_state *p, uint id,
uint index_size)
{
return load_index(p->index_buffer, p->index_buffer_range_el, id, index_size);
}
static void
increment_counters(global uint32_t *a, global uint32_t *b, global uint32_t *c,
uint count)
{
global uint32_t *ptr[] = {a, b, c};
for (uint i = 0; i < 3; ++i) {
if (ptr[i]) {
*(ptr[i]) += count;
}
}
}
static unsigned
decomposed_prims_for_vertices_with_tess(enum mesa_prim prim, int vertices,
unsigned verts_per_patch)
{
if (prim >= MESA_PRIM_PATCHES) {
return vertices / verts_per_patch;
} else {
return u_decomposed_prims_for_vertices(prim, vertices);
}
}
KERNEL(1)
libagx_increment_ia(global uint32_t *ia_vertices,
global uint32_t *ia_primitives,
@ -390,13 +153,8 @@ libagx_increment_ia(global uint32_t *ia_vertices,
global uint32_t *c_invs, constant uint32_t *draw,
enum mesa_prim prim, unsigned verts_per_patch)
{
increment_counters(ia_vertices, vs_invocations, NULL, draw[0] * draw[1]);
uint prims =
decomposed_prims_for_vertices_with_tess(prim, draw[0], verts_per_patch) *
draw[1];
increment_counters(ia_primitives, c_prims, c_invs, prims);
poly_increment_ia(ia_vertices, ia_primitives, vs_invocations, c_prims,
c_invs, draw, prim, verts_per_patch);
}
KERNEL(1024)
@ -418,8 +176,8 @@ libagx_increment_ia_restart(global uint32_t *ia_vertices,
/* Count non-restart indices */
for (uint i = tid; i < count; i += 1024) {
uint index = load_index(index_buffer, index_buffer_range_el, start + i,
index_size_B);
uint index = poly_load_index(index_buffer, index_buffer_range_el,
start + i, index_size_B);
if (index != restart_index)
partial++;
@ -433,7 +191,8 @@ libagx_increment_ia_restart(global uint32_t *ia_vertices,
/* Elect a single thread from the workgroup to increment the counters */
if (tid == 0) {
increment_counters(ia_vertices, vs_invocations, NULL, scratch * draw[1]);
poly_increment_counters(ia_vertices, vs_invocations, NULL,
scratch * draw[1]);
}
/* TODO: We should vectorize this */
@ -441,22 +200,22 @@ libagx_increment_ia_restart(global uint32_t *ia_vertices,
uint accum = 0;
int last_restart = -1;
for (uint i = 0; i < count; ++i) {
uint index = load_index(index_buffer, index_buffer_range_el, start + i,
index_size_B);
uint index = poly_load_index(index_buffer, index_buffer_range_el,
start + i, index_size_B);
if (index == restart_index) {
accum += decomposed_prims_for_vertices_with_tess(
accum += poly_decomposed_prims_for_vertices_with_tess(
prim, i - last_restart - 1, verts_per_patch);
last_restart = i;
}
}
{
accum += decomposed_prims_for_vertices_with_tess(
accum += poly_decomposed_prims_for_vertices_with_tess(
prim, count - last_restart - 1, verts_per_patch);
}
increment_counters(ia_primitives, c_prims, c_invs, accum * draw[1]);
poly_increment_counters(ia_primitives, c_prims, c_invs, accum * draw[1]);
}
}
@ -483,7 +242,7 @@ first_true_thread_in_workgroup(bool cond, local uint *scratch)
* sets up most of the new draw descriptor.
*/
static global void *
setup_unroll_for_draw(global struct agx_heap *heap, constant uint *in_draw,
setup_unroll_for_draw(global struct poly_heap *heap, constant uint *in_draw,
global uint *out, enum mesa_prim mode, uint index_size_B)
{
/* Determine an upper bound on the memory required for the index buffer.
@ -499,7 +258,7 @@ setup_unroll_for_draw(global struct agx_heap *heap, constant uint *in_draw,
* TODO: For multidraw, should be atomic. But multidraw+unroll isn't
* currently wired up in any driver.
*/
uint old_heap_bottom_B = agx_heap_alloc_nonatomic_offs(heap, alloc_size);
uint old_heap_bottom_B = poly_heap_alloc_nonatomic_offs(heap, alloc_size);
/* Setup most of the descriptor. Count will be determined after unroll. */
out[1] = in_draw[1]; /* instance count */
@ -512,14 +271,14 @@ setup_unroll_for_draw(global struct agx_heap *heap, constant uint *in_draw,
}
KERNEL(1024)
libagx_unroll_restart(global struct agx_heap *heap, uint64_t index_buffer,
libagx_unroll_restart(global struct poly_heap *heap, uint64_t index_buffer,
constant uint *in_draw, global uint32_t *out_draw,
uint32_t max_draws, uint32_t restart_index,
uint32_t index_buffer_size_el, uint32_t index_size_log2,
uint32_t flatshade_first, uint mode__11)
{
uint32_t index_size_B = 1 << index_size_log2;
enum mesa_prim mode = libagx_uncompact_prim(mode__11);
enum mesa_prim mode = poly_uncompact_prim(mode__11);
uint tid = cl_local_id.x;
uint count = in_draw[0];
@ -531,7 +290,7 @@ libagx_unroll_restart(global struct agx_heap *heap, uint64_t index_buffer,
barrier(CLK_LOCAL_MEM_FENCE);
uintptr_t in_ptr = (uintptr_t)(libagx_index_buffer(
uintptr_t in_ptr = (uintptr_t)(poly_index_buffer(
index_buffer, index_buffer_size_el, in_draw[2], index_size_B));
local uint scratch[32];
@ -545,8 +304,8 @@ libagx_unroll_restart(global struct agx_heap *heap, uint64_t index_buffer,
for (;;) {
uint idx = next_restart + tid;
bool restart =
idx >= count || load_index(in_ptr, index_buffer_size_el, idx,
index_size_B) == restart_index;
idx >= count || poly_load_index(in_ptr, index_buffer_size_el, idx,
index_size_B) == restart_index;
uint next_offs = first_true_thread_in_workgroup(restart, scratch);
@ -566,10 +325,10 @@ libagx_unroll_restart(global struct agx_heap *heap, uint64_t index_buffer,
uint offset = needle + id;
uint x = ((out_prims_base + i) * per_prim) + vtx;
uint y =
load_index(in_ptr, index_buffer_size_el, offset, index_size_B);
uint y = poly_load_index(in_ptr, index_buffer_size_el, offset,
index_size_B);
store_index(out_ptr, index_size_B, x, y);
poly_store_index(out_ptr, index_size_B, x, y);
}
}
@ -581,216 +340,39 @@ libagx_unroll_restart(global struct agx_heap *heap, uint64_t index_buffer,
out_draw[0] = out_prims * per_prim;
}
static uint
setup_xfb_buffer(global struct agx_geometry_params *p, uint i, uint stride,
uint max_output_end, uint vertices_per_prim)
{
uint xfb_offset = *(p->xfb_offs_ptrs[i]);
p->xfb_base[i] = p->xfb_base_original[i] + xfb_offset;
/* Let output_end = output_offset + output_size.
*
* Primitive P will write up to (but not including) offset:
*
* xfb_offset + ((P - 1) * (verts_per_prim * stride))
* + ((verts_per_prim - 1) * stride)
* + output_end
*
* To fit all outputs for P, that value must be less than the XFB
* buffer size for the output with maximal output_end, as everything
* else is constant here across outputs within a buffer/primitive:
*
* floor(P) <= (stride + size - xfb_offset - output_end)
* // (stride * verts_per_prim)
*/
int numer_s = p->xfb_size[i] + (stride - max_output_end) - xfb_offset;
uint numer = max(numer_s, 0);
return numer / (stride * vertices_per_prim);
}
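
A hedged numeric check of the bound above (made-up values, comment only):

/* Example: stride = 16 B (one vec4 per vertex), vertices_per_prim = 3,
 * max_output_end = 16, xfb_offset = 0, xfb_size = 160:
 *
 *    numer          = 160 + (16 - 16) - 0 = 160
 *    max primitives = 160 / (16 * 3)      = 3
 *
 * Three triangles write 3 * 3 * 16 = 144 bytes, which fits in 160 bytes;
 * a fourth would run up to 192 bytes and is correctly excluded.
 */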
void
libagx_write_strip(GLOBAL uint32_t *index_buffer, uint32_t inv_index_offset,
uint32_t prim_index_offset, uint32_t vertex_offset,
uint32_t verts_in_prim, uint3 info)
{
_libagx_write_strip(index_buffer, inv_index_offset + prim_index_offset,
vertex_offset, verts_in_prim, info.x, info.y, info.z);
}
void
libagx_pad_index_gs(global int *index_buffer, uint inv_index_offset,
uint nr_indices, uint alloc)
{
for (uint i = nr_indices; i < alloc; ++i) {
index_buffer[inv_index_offset + i] = -1;
}
}
KERNEL(1)
libagx_gs_setup_indirect(
uint64_t index_buffer, constant uint *draw,
global uintptr_t *vertex_buffer /* output */,
global struct agx_ia_state *ia /* output */,
global struct agx_geometry_params *p /* output */,
global struct agx_heap *heap,
global struct poly_ia_state *ia /* output */,
global struct poly_geometry_params *p /* output */,
global struct poly_heap *heap,
uint64_t vs_outputs /* Vertex (TES) output mask */,
uint32_t index_size_B /* 0 if no index buffer */,
uint32_t index_buffer_range_el,
uint32_t prim /* Input primitive type, enum mesa_prim */,
int is_prefix_summing, uint max_indices, enum agx_gs_shape shape)
int is_prefix_summing, uint max_indices, enum poly_gs_shape shape)
{
/* Determine the (primitives, instances) grid size. */
uint vertex_count = draw[0];
uint instance_count = draw[1];
ia->verts_per_instance = vertex_count;
/* Calculate number of primitives input into the GS */
uint prim_per_instance = u_decomposed_prims_for_vertices(prim, vertex_count);
p->input_primitives = prim_per_instance * instance_count;
/* Invoke VS as (vertices, instances); GS as (primitives, instances) */
p->vs_grid[0] = vertex_count;
p->vs_grid[1] = instance_count;
p->gs_grid[0] = prim_per_instance;
p->gs_grid[1] = instance_count;
p->primitives_log2 = util_logbase2_ceil(prim_per_instance);
/* If indexing is enabled, the third word is the offset into the index buffer
* in elements. Apply that offset now that we have it. For a hardware
* indirect draw, the hardware would do this for us, but for software input
* assembly we need to do it ourselves.
*/
if (index_size_B) {
ia->index_buffer = libagx_index_buffer(
index_buffer, index_buffer_range_el, draw[2], index_size_B);
ia->index_buffer_range_el =
libagx_index_buffer_range_el(index_buffer_range_el, draw[2]);
}
/* We need to allocate VS and GS count buffers, do so now */
uint vertex_buffer_size =
libagx_tcs_in_size(vertex_count * instance_count, vs_outputs);
if (is_prefix_summing) {
p->count_buffer = agx_heap_alloc_nonatomic(
heap, p->input_primitives * p->count_buffer_stride);
}
p->input_buffer =
(uintptr_t)agx_heap_alloc_nonatomic(heap, vertex_buffer_size);
*vertex_buffer = p->input_buffer;
p->input_mask = vs_outputs;
/* Allocate the index buffer and write the draw consuming it */
global VkDrawIndexedIndirectCommand *cmd = (global void *)p->indirect_desc;
*cmd = (VkDrawIndexedIndirectCommand){
.indexCount = agx_gs_rast_vertices(shape, max_indices, prim_per_instance,
instance_count),
.instanceCount = agx_gs_rast_instances(shape, max_indices,
prim_per_instance, instance_count),
};
if (shape == AGX_GS_SHAPE_DYNAMIC_INDEXED) {
cmd->firstIndex =
agx_heap_alloc_nonatomic_offs(heap, cmd->indexCount * 4) / 4;
p->output_index_buffer =
(global uint *)(heap->base + (cmd->firstIndex * 4));
}
}
/*
* Returns (work_group_scan_inclusive_add(x), work_group_sum(x)). Implemented
* manually with subgroup ops and local memory since Mesa doesn't do those
* lowerings yet.
*/
static uint2
libagx_work_group_scan_inclusive_add(uint x, local uint *scratch)
{
uint sg_id = get_sub_group_id();
/* Partial prefix sum of the subgroup */
uint sg = sub_group_scan_inclusive_add(x);
/* Reduction (sum) for the subgroup */
uint sg_sum = sub_group_broadcast(sg, 31);
/* Write out all the subgroups sums */
barrier(CLK_LOCAL_MEM_FENCE);
scratch[sg_id] = sg_sum;
barrier(CLK_LOCAL_MEM_FENCE);
/* Read all the subgroup sums. Thread T in subgroup G reads the sum of all
* threads in subgroup T.
*/
uint other_sum = scratch[get_sub_group_local_id()];
/* Exclusive sum the subgroup sums to get the total before the current group,
* which can be added to the total for the current group.
*/
uint other_sums = sub_group_scan_exclusive_add(other_sum);
uint base = sub_group_broadcast(other_sums, sg_id);
uint prefix = base + sg;
/* Reduce the workgroup using the prefix sum we already did */
uint reduction = sub_group_broadcast(other_sums + other_sum, 31);
return (uint2)(prefix, reduction);
}
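
For context, the sizes here line up as follows (an inference from the
surrounding code, not stated in the change):

/* The callers run 1024-thread workgroups with 32-wide subgroups (hence the
 * broadcasts from lane 31), so at most 32 subgroup sums need to be staged,
 * matching the callers' local uint scratch[32].
 */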
static void
_libagx_prefix_sum(local uint *scratch, global uint *buffer, uint len,
uint words, uint word)
{
uint tid = cl_local_id.x;
/* Main loop: complete workgroups processing 1024 values at once */
uint i, count = 0;
uint len_remainder = len % 1024;
uint len_rounded_down = len - len_remainder;
for (i = tid; i < len_rounded_down; i += 1024) {
global uint *ptr = &buffer[(i * words) + word];
uint value = *ptr;
uint2 sums = libagx_work_group_scan_inclusive_add(value, scratch);
*ptr = count + sums[0];
count += sums[1];
}
/* The last iteration is special since we won't have a full subgroup unless
* the length is divisible by the subgroup size, and we don't advance count.
*/
global uint *ptr = &buffer[(i * words) + word];
uint value = (tid < len_remainder) ? *ptr : 0;
uint scan = libagx_work_group_scan_inclusive_add(value, scratch)[0];
if (tid < len_remainder) {
*ptr = count + scan;
}
poly_gs_setup_indirect(index_buffer, draw, vertex_buffer, ia, p, heap,
vs_outputs, index_size_B, index_buffer_range_el, prim,
is_prefix_summing, max_indices, shape);
}
KERNEL(1024)
libagx_prefix_sum_geom(constant struct agx_geometry_params *p)
libagx_prefix_sum_geom(constant struct poly_geometry_params *p)
{
local uint scratch[32];
_libagx_prefix_sum(scratch, p->count_buffer, p->input_primitives,
p->count_buffer_stride / 4, cl_group_id.x);
poly_prefix_sum(scratch, p->count_buffer, p->input_primitives,
p->count_buffer_stride / 4, cl_group_id.x, 1024);
}
KERNEL(1024)
libagx_prefix_sum_tess(global struct libagx_tess_args *p, global uint *c_prims,
libagx_prefix_sum_tess(global struct poly_tess_args *p, global uint *c_prims,
global uint *c_invs, uint increment_stats__2)
{
local uint scratch[32];
_libagx_prefix_sum(scratch, p->counts, p->nr_patches, 1 /* words */,
0 /* word */);
poly_prefix_sum(scratch, p->counts, p->nr_patches, 1 /* words */,
0 /* word */, 1024);
/* After prefix summing, we know the total # of indices, so allocate the
* index buffer now. Elect a thread for the allocation.
@ -805,7 +387,7 @@ libagx_prefix_sum_tess(global struct libagx_tess_args *p, global uint *c_prims,
/* Allocate 4-byte indices */
uint32_t elsize_B = sizeof(uint32_t);
uint32_t size_B = total * elsize_B;
uint alloc_B = agx_heap_alloc_nonatomic_offs(p->heap, size_B);
uint alloc_B = poly_heap_alloc_nonatomic_offs(p->heap, size_B);
p->index_buffer = (global uint32_t *)(((uintptr_t)p->heap->base) + alloc_B);
/* ...and now we can generate the API indexed draw */
@ -818,7 +400,7 @@ libagx_prefix_sum_tess(global struct libagx_tess_args *p, global uint *c_prims,
desc[4] = 0; /* start_instance */
/* If necessary, increment clipper statistics too. This is only used when
* there's no geometry shader following us. See agx_nir_lower_gs.c for more
* there's no geometry shader following us. See poly_nir_lower_gs.c for more
* info on the emulation. We just need to calculate the # of primitives
* tessellated.
*/
@ -827,150 +409,6 @@ libagx_prefix_sum_tess(global struct libagx_tess_args *p, global uint *c_prims,
: p->isolines ? (total / 2)
: (total / 3);
increment_counters(c_prims, c_invs, NULL, prims);
poly_increment_counters(c_prims, c_invs, NULL, prims);
}
}
uintptr_t
libagx_vertex_output_address(uintptr_t buffer, uint64_t mask, uint vtx,
gl_varying_slot location)
{
/* Written like this to let address arithmetic work */
return buffer + ((uintptr_t)libagx_tcs_in_offs_el(vtx, location, mask)) * 16;
}
uintptr_t
libagx_geometry_input_address(constant struct agx_geometry_params *p, uint vtx,
gl_varying_slot location)
{
return libagx_vertex_output_address(p->input_buffer, p->input_mask, vtx,
location);
}
unsigned
libagx_input_vertices(constant struct agx_ia_state *ia)
{
return ia->verts_per_instance;
}
global uint *
libagx_load_xfb_count_address(constant struct agx_geometry_params *p, int index,
int count_words, uint unrolled_id)
{
return &p->count_buffer[(unrolled_id * count_words) + index];
}
uint
libagx_previous_xfb_primitives(global struct agx_geometry_params *p,
int static_count, int count_index,
int count_words, bool prefix_sum,
uint unrolled_id)
{
if (static_count >= 0) {
/* If the number of outputted vertices per invocation is known statically,
* we can calculate the base.
*/
return unrolled_id * static_count;
} else {
/* Otherwise, load from the count buffer. Note that the sums are
* inclusive, so index 0 is nonzero. This requires a little fixup here. We
* use a saturating unsigned subtraction so we don't read out-of-bounds.
*
* If we didn't prefix sum, there's only one element.
*/
uint prim_minus_1 = prefix_sum ? sub_sat(unrolled_id, 1u) : 0;
uint count = p->count_buffer[(prim_minus_1 * count_words) + count_index];
return unrolled_id == 0 ? 0 : count;
}
}
/* Like u_foreach_bit, specialized for XFB to enable loop unrolling */
#define libagx_foreach_xfb(word, index) \
for (uint i = 0; i < 4; ++i) \
if (word & BITFIELD_BIT(i))
void
libagx_pre_gs(global struct agx_geometry_params *p, uint streams,
uint buffers_written, uint4 buffer_to_stream, int4 count_index,
uint4 stride, uint4 output_end, int4 static_count,
uint invocations, uint vertices_per_prim,
global uint *gs_invocations, global uint *gs_primitives,
global uint *c_primitives, global uint *c_invocations)
{
unsigned count_words = !!(count_index[0] >= 0) + !!(count_index[1] >= 0) +
!!(count_index[2] >= 0) + !!(count_index[3] >= 0);
bool prefix_sum = count_words && buffers_written;
uint unrolled_in_prims = p->input_primitives;
/* Determine the number of primitives generated in each stream */
uint4 in_prims = 0;
libagx_foreach_xfb(streams, i) {
in_prims[i] = libagx_previous_xfb_primitives(
p, static_count[i], count_index[i], count_words, prefix_sum,
unrolled_in_prims);
*(p->prims_generated_counter[i]) += in_prims[i];
}
uint4 prims = in_prims;
uint emitted_prims = prims[0] + prims[1] + prims[2] + prims[3];
if (buffers_written) {
libagx_foreach_xfb(buffers_written, i) {
uint max_prims =
setup_xfb_buffer(p, i, stride[i], output_end[i], vertices_per_prim);
unsigned stream = buffer_to_stream[i];
prims[stream] = min(prims[stream], max_prims);
}
int4 overflow = prims < in_prims;
libagx_foreach_xfb(streams, i) {
p->xfb_verts[i] = prims[i] * vertices_per_prim;
*(p->xfb_overflow[i]) += (bool)overflow[i];
*(p->xfb_prims_generated_counter[i]) += prims[i];
}
*(p->xfb_any_overflow) += any(overflow);
/* Update XFB counters */
libagx_foreach_xfb(buffers_written, i) {
uint32_t prim_stride_B = stride[i] * vertices_per_prim;
unsigned stream = buffer_to_stream[i];
global uint *ptr = p->xfb_offs_ptrs[i];
ptr = (global uint *)nir_ro_to_rw_poly((uint64_t)ptr);
*ptr += prims[stream] * prim_stride_B;
}
}
/* The geometry shader is invoked once per primitive (after unrolling
* primitive restart). From the spec:
*
* In case of instanced geometry shaders (see section 11.3.4.2) the
* geometry shader invocations count is incremented for each separate
* instanced invocation.
*/
*gs_invocations += unrolled_in_prims * invocations;
*gs_primitives += emitted_prims;
/* Clipper queries are not well-defined, so we can emulate them in lots of
* silly ways. We need the hardware counters to implement them properly. For
* now, just consider all primitives emitted as passing through the clipper.
* This satisfies spec text:
*
* The number of primitives that reach the primitive clipping stage.
*
* and
*
* If at least one vertex of the primitive lies inside the clipping
* volume, the counter is incremented by one or more. Otherwise, the
* counter is incremented by zero or more.
*/
*c_primitives += emitted_prims;
*c_invocations += emitted_prims;
}


@ -1,410 +0,0 @@
/*
* Copyright 2023 Alyssa Rosenzweig
* Copyright 2023 Valve Corporation
* SPDX-License-Identifier: MIT
*/
#include "asahi/lib/agx_abi.h"
#include "compiler/libcl/libcl.h"
#include "compiler/shader_enums.h"
#include "util/bitscan.h"
#include "util/u_math.h"
#pragma once
#define MAX_SO_BUFFERS 4
#define MAX_VERTEX_STREAMS 4
enum agx_gs_shape {
/* Indexed, where indices are encoded as:
*
* round_to_pot(max_indices) * round_to_pot(input_primitives) *
* * instance_count
*
* invoked for max_indices * input_primitives * instance_count indices.
*
* This is used with any dynamic topology. No hardware instancing used.
*/
AGX_GS_SHAPE_DYNAMIC_INDEXED,
/* Indexed with a static index buffer. Indices ranges up to max_indices.
* Hardware instance count = input_primitives * software instance count.
*/
AGX_GS_SHAPE_STATIC_INDEXED,
/* Non-indexed. Dispatched as:
*
* (max_indices, input_primitives * instance count).
*/
AGX_GS_SHAPE_STATIC_PER_PRIM,
/* Non-indexed. Dispatched as:
*
* (max_indices * input_primitives, instance count).
*/
AGX_GS_SHAPE_STATIC_PER_INSTANCE,
};
static inline unsigned
agx_gs_rast_vertices(enum agx_gs_shape shape, unsigned max_indices,
unsigned input_primitives, unsigned instance_count)
{
switch (shape) {
case AGX_GS_SHAPE_DYNAMIC_INDEXED:
return max_indices * input_primitives * instance_count;
case AGX_GS_SHAPE_STATIC_INDEXED:
case AGX_GS_SHAPE_STATIC_PER_PRIM:
return max_indices;
case AGX_GS_SHAPE_STATIC_PER_INSTANCE:
return max_indices * input_primitives;
}
UNREACHABLE("invalid shape");
}
static inline unsigned
agx_gs_rast_instances(enum agx_gs_shape shape, unsigned max_indices,
unsigned input_primitives, unsigned instance_count)
{
switch (shape) {
case AGX_GS_SHAPE_DYNAMIC_INDEXED:
return 1;
case AGX_GS_SHAPE_STATIC_INDEXED:
case AGX_GS_SHAPE_STATIC_PER_PRIM:
return input_primitives * instance_count;
case AGX_GS_SHAPE_STATIC_PER_INSTANCE:
return instance_count;
}
UNREACHABLE("invalid shape");
}
static inline bool
agx_gs_indexed(enum agx_gs_shape shape)
{
return shape == AGX_GS_SHAPE_DYNAMIC_INDEXED ||
shape == AGX_GS_SHAPE_STATIC_INDEXED;
}
static inline unsigned
agx_gs_index_size(enum agx_gs_shape shape)
{
switch (shape) {
case AGX_GS_SHAPE_DYNAMIC_INDEXED:
return 4;
case AGX_GS_SHAPE_STATIC_INDEXED:
return 1;
default:
return 0;
}
}
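
A hedged worked example tying the shape descriptions to the helpers above
(hypothetical numbers):

/* Example: shape = AGX_GS_SHAPE_STATIC_INDEXED, max_indices = 6,
 * input_primitives = 100, instance_count = 2:
 *
 *    agx_gs_rast_vertices()  -> 6         (one pass over the static indices)
 *    agx_gs_rast_instances() -> 100 * 2   (hardware instancing covers prims)
 *    agx_gs_indexed()        -> true
 *    agx_gs_index_size()     -> 1 byte per index
 */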
/* Heap to allocate from. */
struct agx_heap {
DEVICE(uchar) base;
uint32_t bottom, size;
} PACKED;
static_assert(sizeof(struct agx_heap) == 4 * 4);
#ifdef __OPENCL_VERSION__
static inline uint
_agx_heap_alloc_offs(global struct agx_heap *heap, uint size_B, bool atomic)
{
size_B = align(size_B, 16);
uint offs;
if (atomic) {
offs = atomic_fetch_add((volatile atomic_uint *)(&heap->bottom), size_B);
} else {
offs = heap->bottom;
heap->bottom = offs + size_B;
}
/* Use printf+abort because assert is stripped from release builds. */
if (heap->bottom >= heap->size) {
printf(
"FATAL: GPU heap overflow, allocating size %u, at offset %u, heap size %u!",
size_B, offs, heap->size);
abort();
}
return offs;
}
static inline uint
agx_heap_alloc_nonatomic_offs(global struct agx_heap *heap, uint size_B)
{
return _agx_heap_alloc_offs(heap, size_B, false);
}
static inline uint
agx_heap_alloc_atomic_offs(global struct agx_heap *heap, uint size_B)
{
return _agx_heap_alloc_offs(heap, size_B, true);
}
static inline global void *
agx_heap_alloc_nonatomic(global struct agx_heap *heap, uint size_B)
{
return heap->base + agx_heap_alloc_nonatomic_offs(heap, size_B);
}
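
A small worked example of the bump allocator above (made-up sizes, comment
only):

/* Starting from bottom = 0:
 *
 *    agx_heap_alloc_nonatomic_offs(heap, 20) -> returns 0,  bottom becomes 32
 *    agx_heap_alloc_nonatomic_offs(heap, 8)  -> returns 32, bottom becomes 48
 *
 * Every request is rounded up to a multiple of 16 first, which is why callers
 * (e.g. the shadow index buffer copy earlier in this diff) can rely on
 * 16-byte-aligned allocations.
 */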
uint64_t nir_load_ro_sink_address_poly(void);
static inline uint64_t
libagx_index_buffer(uint64_t index_buffer, uint size_el, uint offset_el,
uint elsize_B)
{
if (offset_el < size_el)
return index_buffer + (offset_el * elsize_B);
else
return nir_load_ro_sink_address_poly();
}
#endif
struct agx_ia_state {
/* Index buffer if present. */
uint64_t index_buffer;
/* Size of the bound index buffer for bounds checking */
uint32_t index_buffer_range_el;
/* Number of vertices per instance. Written by CPU for direct draw, indirect
* setup kernel for indirect. This is used for VS->GS and VS->TCS indexing.
*/
uint32_t verts_per_instance;
} PACKED;
static_assert(sizeof(struct agx_ia_state) == 4 * 4);
static inline uint
libagx_index_buffer_range_el(uint size_el, uint offset_el)
{
return offset_el < size_el ? (size_el - offset_el) : 0;
}
struct agx_geometry_params {
/* Address of associated indirect draw buffer */
DEVICE(uint) indirect_desc;
/* Address of count buffer. For an indirect draw, this will be written by the
* indirect setup kernel.
*/
DEVICE(uint) count_buffer;
/* Address of the primitives generated counters */
DEVICE(uint) prims_generated_counter[MAX_VERTEX_STREAMS];
DEVICE(uint) xfb_prims_generated_counter[MAX_VERTEX_STREAMS];
DEVICE(uint) xfb_overflow[MAX_VERTEX_STREAMS];
DEVICE(uint) xfb_any_overflow;
/* Pointers to transform feedback buffer offsets in bytes */
DEVICE(uint) xfb_offs_ptrs[MAX_SO_BUFFERS];
/* Output index buffer, allocated by pre-GS. */
DEVICE(uint) output_index_buffer;
/* Address of transform feedback buffer in general, supplied by the CPU. */
DEVICE(uchar) xfb_base_original[MAX_SO_BUFFERS];
/* Address of transform feedback for the current primitive. Written by pre-GS
* program.
*/
DEVICE(uchar) xfb_base[MAX_SO_BUFFERS];
/* Address and present mask for the input to the geometry shader. These will
* reflect the vertex shader for VS->GS or instead the tessellation
* evaluation shader for TES->GS.
*/
uint64_t input_buffer;
uint64_t input_mask;
/* Location-indexed mask of flat outputs, used for lowering GL edge flags. */
uint64_t flat_outputs;
uint32_t xfb_size[MAX_SO_BUFFERS];
/* Number of vertices emitted by transform feedback per stream. Written by
* the pre-GS program.
*/
uint32_t xfb_verts[MAX_VERTEX_STREAMS];
/* Within an indirect GS draw, the grids used to dispatch the VS/GS written
* out by the GS indirect setup kernel or the CPU for a direct draw. This is
* the "indirect local" format: first 3 is in threads, second 3 is in grid
* blocks. This lets us use nontrivial workgroups with indirect draws without
* needing any predication.
*/
uint32_t vs_grid[6];
uint32_t gs_grid[6];
/* Number of input primitives across all instances, calculated by the CPU for
* a direct draw or the GS indirect setup kernel for an indirect draw.
*/
uint32_t input_primitives;
/* Number of input primitives per instance, rounded up to a power-of-two and
* with the base-2 log taken. This is used to partition the output vertex IDs
* efficiently.
*/
uint32_t primitives_log2;
/* Number of bytes output by the GS count shader per input primitive (may be
* 0), written by CPU and consumed by indirect draw setup shader for
* allocating counts.
*/
uint32_t count_buffer_stride;
/* Dynamic input topology. Must be compatible with the geometry shader's
* layout() declared input class.
*/
uint32_t input_topology;
} PACKED;
static_assert(sizeof(struct agx_geometry_params) == 86 * 4);
/* TCS shared memory layout:
*
* vec4 vs_outputs[VERTICES_IN_INPUT_PATCH][TOTAL_VERTEX_OUTPUTS];
*
* TODO: compact.
*/
static inline uint
libagx_tcs_in_offs_el(uint vtx, gl_varying_slot location,
uint64_t crosslane_vs_out_mask)
{
uint base = vtx * util_bitcount64(crosslane_vs_out_mask);
uint offs = util_bitcount64(crosslane_vs_out_mask &
(((uint64_t)(1) << location) - 1));
return base + offs;
}
static inline uint
libagx_tcs_in_offs(uint vtx, gl_varying_slot location,
uint64_t crosslane_vs_out_mask)
{
return libagx_tcs_in_offs_el(vtx, location, crosslane_vs_out_mask) * 16;
}
static inline uint
libagx_tcs_in_size(uint32_t vertices_in_patch, uint64_t crosslane_vs_out_mask)
{
return vertices_in_patch * util_bitcount64(crosslane_vs_out_mask) * 16;
}
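
A hedged worked example for the input-layout helpers above (the mask and slot
numbers are hypothetical):

/* Example: crosslane_vs_out_mask has three bits set, say {0, 4, 7}.
 *
 *    libagx_tcs_in_offs_el(vtx = 2, location = 7, mask)
 *       = 2 * 3                            (base: full vertices before us)
 *       + popcount(mask & ((1 << 7) - 1))  (outputs below slot 7) = 2
 *       = 8 elements, i.e. byte offset 8 * 16 = 128
 *
 *    libagx_tcs_in_size(vertices_in_patch = 4, mask) = 4 * 3 * 16 = 192 bytes
 */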
/*
* TCS out buffer layout, per-patch:
*
* float tess_level_outer[4];
* float tess_level_inner[2];
* vec4 patch_out[MAX_PATCH_OUTPUTS];
* vec4 vtx_out[OUT_PATCH_SIZE][TOTAL_VERTEX_OUTPUTS];
*
* Vertex out are compacted based on the mask of written out. Patch
* out are used as-is.
*
* Bounding boxes are ignored.
*/
static inline uint
libagx_tcs_out_offs_el(uint vtx_id, gl_varying_slot location, uint nr_patch_out,
uint64_t vtx_out_mask)
{
uint off = 0;
if (location == VARYING_SLOT_TESS_LEVEL_OUTER)
return off;
off += 4;
if (location == VARYING_SLOT_TESS_LEVEL_INNER)
return off;
off += 2;
if (location >= VARYING_SLOT_PATCH0)
return off + (4 * (location - VARYING_SLOT_PATCH0));
/* Anything else is a per-vtx output */
off += 4 * nr_patch_out;
off += 4 * vtx_id * util_bitcount64(vtx_out_mask);
uint idx = util_bitcount64(vtx_out_mask & (((uint64_t)(1) << location) - 1));
return off + (4 * idx);
}
static inline uint
libagx_tcs_out_offs(uint vtx_id, gl_varying_slot location, uint nr_patch_out,
uint64_t vtx_out_mask)
{
return libagx_tcs_out_offs_el(vtx_id, location, nr_patch_out, vtx_out_mask) *
4;
}
static inline uint
libagx_tcs_out_stride_el(uint nr_patch_out, uint out_patch_size,
uint64_t vtx_out_mask)
{
return libagx_tcs_out_offs_el(out_patch_size, 0, nr_patch_out, vtx_out_mask);
}
static inline uint
libagx_tcs_out_stride(uint nr_patch_out, uint out_patch_size,
uint64_t vtx_out_mask)
{
return libagx_tcs_out_stride_el(nr_patch_out, out_patch_size, vtx_out_mask) *
4;
}
/* In a tess eval shader, stride for hw vertex ID */
#define LIBAGX_TES_PATCH_ID_STRIDE 8192
static uint
libagx_compact_prim(enum mesa_prim prim)
{
static_assert(MESA_PRIM_QUAD_STRIP == MESA_PRIM_QUADS + 1);
static_assert(MESA_PRIM_POLYGON == MESA_PRIM_QUADS + 2);
#ifndef __OPENCL_VERSION__
assert(prim != MESA_PRIM_QUADS);
assert(prim != MESA_PRIM_QUAD_STRIP);
assert(prim != MESA_PRIM_POLYGON);
assert(prim != MESA_PRIM_PATCHES);
#endif
return (prim >= MESA_PRIM_QUADS) ? (prim - 3) : prim;
}
static enum mesa_prim
libagx_uncompact_prim(uint packed)
{
return (packed >= MESA_PRIM_QUADS) ? (packed + 3) : packed;
}
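
A short note on the packing above (partly an inference; the enum values are
not restated here):

/* Quads, quad strips and polygons are never fed to these kernels, so every
 * primitive type at or above MESA_PRIM_QUADS shifts down by three and shifts
 * back on uncompaction:
 *
 *    libagx_uncompact_prim(libagx_compact_prim(p)) == p   for supported p
 *
 * This leaves 11 packed values, which appears to be why callers such as
 * libagx_unroll_restart take the packed topology as `uint mode__11`.
 */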
/*
* Write a strip into a 32-bit index buffer. This is the sequence:
*
* (b, b + 1, b + 2, ..., b + n - 1, -1) where -1 is the restart index
*
* For points, we write index buffers without restart just for remapping.
*/
static inline void
_libagx_write_strip(GLOBAL uint32_t *index_buffer, uint32_t index_offset,
uint32_t vertex_offset, uint32_t verts_in_prim,
uint32_t stream, uint32_t stream_multiplier, uint32_t n)
{
bool restart = n > 1;
if (verts_in_prim < n)
return;
GLOBAL uint32_t *out = &index_buffer[index_offset];
/* Write out indices for the strip */
for (uint32_t i = 0; i < verts_in_prim; ++i) {
out[i] = (vertex_offset + i) * stream_multiplier + stream;
}
if (restart)
out[verts_in_prim] = -1;
}
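
A hedged example of the strip writer above (made-up arguments, comment only):

/* _libagx_write_strip(out, index_offset = 0, vertex_offset = 10,
 *                     verts_in_prim = 3, stream = 0, stream_multiplier = 1,
 *                     n = 3)
 *
 * writes out[0..3] = {10, 11, 12, -1}: the three strip vertices followed by
 * the restart index (since n > 1). If verts_in_prim were less than n, nothing
 * would be written.
 */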


@ -21,6 +21,7 @@ libagx_spv = custom_target(
libagx_shader_files, '--',
'-I' + join_paths(meson.project_source_root(), 'include'),
'-I' + join_paths(meson.project_source_root(), 'src/compiler/libcl'),
'-I' + join_paths(meson.project_source_root(), 'src/poly/cl'),
'-I' + join_paths(meson.current_source_dir(), '.'),
'-I' + join_paths(meson.current_source_dir(), '../../'),
'-I' + join_paths(meson.current_source_dir(), 'shaders'),


@ -3,148 +3,14 @@
* SPDX-License-Identifier: MIT
*/
#include "geometry.h"
#include "tessellator.h"
#include <agx_pack.h>
uint
libagx_tcs_patch_vertices_in(constant struct libagx_tess_args *p)
{
return p->input_patch_size;
}
uint
libagx_tes_patch_vertices_in(constant struct libagx_tess_args *p)
{
return p->output_patch_size;
}
uint
libagx_tcs_unrolled_id(constant struct libagx_tess_args *p, uint3 wg_id)
{
return (wg_id.y * p->patches_per_instance) + wg_id.x;
}
uint64_t
libagx_tes_buffer(constant struct libagx_tess_args *p)
{
return p->tes_buffer;
}
/*
* Helper to lower indexing for a tess eval shader ran as a compute shader. This
* handles the tess+geom case. This is simpler than the general input assembly
* lowering, as we know:
*
* 1. the index buffer is U32
* 2. the index is in bounds
*
* Therefore we do a simple load. No bounds checking needed.
*/
uint32_t
libagx_load_tes_index(constant struct libagx_tess_args *p, uint32_t index)
{
/* Swap second and third vertices of each triangle to flip winding order
* dynamically if needed.
*/
if (p->ccw) {
uint id = index % 3;
if (id == 1)
index++;
else if (id == 2)
index--;
}
return p->index_buffer[index];
}
ushort
libagx_tcs_in_offset(uint vtx, gl_varying_slot location,
uint64_t crosslane_vs_out_mask)
{
return libagx_tcs_in_offs(vtx, location, crosslane_vs_out_mask);
}
uintptr_t
libagx_tcs_out_address(constant struct libagx_tess_args *p, uint patch_id,
uint vtx_id, gl_varying_slot location, uint nr_patch_out,
uint out_patch_size, uint64_t vtx_out_mask)
{
uint stride_el =
libagx_tcs_out_stride_el(nr_patch_out, out_patch_size, vtx_out_mask);
uint offs_el =
libagx_tcs_out_offs_el(vtx_id, location, nr_patch_out, vtx_out_mask);
offs_el += patch_id * stride_el;
/* Written to match the AGX addressing mode */
return (uintptr_t)(p->tcs_buffer) + (((uintptr_t)offs_el) << 2);
}
static uint
libagx_tes_unrolled_patch_id(uint raw_id)
{
return raw_id / LIBAGX_TES_PATCH_ID_STRIDE;
}
uint
libagx_tes_patch_id(constant struct libagx_tess_args *p, uint raw_id)
{
return libagx_tes_unrolled_patch_id(raw_id) % p->patches_per_instance;
}
static uint
tes_vertex_id_in_patch(uint raw_id)
{
return raw_id % LIBAGX_TES_PATCH_ID_STRIDE;
}
float2
libagx_load_tess_coord(constant struct libagx_tess_args *p, uint raw_id)
{
uint patch = libagx_tes_unrolled_patch_id(raw_id);
uint vtx = tes_vertex_id_in_patch(raw_id);
global struct libagx_tess_point *t =
&p->patch_coord_buffer[p->coord_allocs[patch] + vtx];
/* Written weirdly because NIR struggles with loads of structs */
uint2 fixed = *((global uint2 *)t);
/* Convert fixed point to float */
return convert_float2(fixed) / (1u << 16);
}
uintptr_t
libagx_tes_in_address(constant struct libagx_tess_args *p, uint raw_id,
uint vtx_id, gl_varying_slot location)
{
uint patch = libagx_tes_unrolled_patch_id(raw_id);
return libagx_tcs_out_address(p, patch, vtx_id, location,
p->tcs_patch_constants, p->output_patch_size,
p->tcs_per_vertex_outputs);
}
float4
libagx_tess_level_outer_default(constant struct libagx_tess_args *p)
{
return vload4(0, p->tess_level_outer_default);
}
float2
libagx_tess_level_inner_default(constant struct libagx_tess_args *p)
{
return vload2(0, p->tess_level_inner_default);
}
#include "poly/geometry.h"
#include "poly/tessellator.h"
KERNEL(1)
libagx_tess_setup_indirect(
global struct libagx_tess_args *p,
global struct poly_tess_args *p,
global uint32_t *grids /* output: VS then TCS then tess */,
global struct agx_ia_state *ia /* output */, global uint32_t *indirect,
global struct poly_ia_state *ia /* output */, global uint32_t *indirect,
global uint64_t *vertex_output_buffer_ptr, uint64_t in_index_buffer,
uint32_t in_index_buffer_range_el, uint32_t in_index_size_B,
uint64_t vertex_outputs /* bitfield */,
@ -174,11 +40,11 @@ libagx_tess_setup_indirect(
alloc += unrolled_patches * sizeof(uint32_t);
uint vb_offs = alloc;
uint vb_size = libagx_tcs_in_size(count * instance_count, vertex_outputs);
uint vb_size = poly_tcs_in_size(count * instance_count, vertex_outputs);
alloc += vb_size;
/* Allocate all patch calculations in one go */
global uchar *blob = agx_heap_alloc_nonatomic(p->heap, alloc);
global uchar *blob = poly_heap_alloc_nonatomic(p->heap, alloc);
p->tcs_buffer = (global float *)(blob + tcs_out_offs);
p->patches_per_instance = in_patches;
@ -201,11 +67,11 @@ libagx_tess_setup_indirect(
*/
if (in_index_size_B) {
ia->index_buffer =
libagx_index_buffer(in_index_buffer, in_index_buffer_range_el,
indirect[2], in_index_size_B);
poly_index_buffer(in_index_buffer, in_index_buffer_range_el,
indirect[2], in_index_size_B);
ia->index_buffer_range_el =
libagx_index_buffer_range_el(in_index_buffer_range_el, indirect[2]);
poly_index_buffer_range_el(in_index_buffer_range_el, indirect[2]);
}
/* VS grid size */

File diff suppressed because it is too large


@ -5,104 +5,14 @@
#pragma once
#include "compiler/libcl/libcl.h"
#include "poly/tessellator.h"
enum libagx_tess_partitioning {
LIBAGX_TESS_PARTITIONING_FRACTIONAL_ODD,
LIBAGX_TESS_PARTITIONING_FRACTIONAL_EVEN,
LIBAGX_TESS_PARTITIONING_INTEGER,
};
enum libagx_tess_mode {
/* Do not actually tessellate, just write the index counts */
LIBAGX_TESS_MODE_COUNT,
/* Tessellate using the count buffers to allocate indices */
LIBAGX_TESS_MODE_WITH_COUNTS,
};
struct libagx_tess_point {
uint32_t u;
uint32_t v;
};
static_assert(sizeof(struct libagx_tess_point) == 8);
struct libagx_tess_args {
/* Heap to allocate tessellator outputs in */
DEVICE(struct agx_heap) heap;
/* Patch coordinate buffer, indexed as:
*
* coord_allocs[patch_ID] + vertex_in_patch
*/
DEVICE(struct libagx_tess_point) patch_coord_buffer;
/* Per-patch index within the heap for the tess coords, written by the
* tessellator based on the allocated memory.
*/
DEVICE(uint32_t) coord_allocs;
/* Space for output draws from the tessellator. API draw calls. */
DEVICE(uint32_t) out_draws;
/* Tessellation control shader output buffer. */
DEVICE(float) tcs_buffer;
/* Count buffer. # of indices per patch written here, then prefix summed. */
DEVICE(uint32_t) counts;
/* Allocated index buffer for all patches, if we're prefix summing counts */
DEVICE(uint32_t) index_buffer;
/* Address of the tess eval invocation counter for implementing pipeline
* statistics, if active. Zero if inactive. Incremented by tessellator.
*/
DEVICE(uint32_t) statistic;
/* When geom+tess used together, the buffer containing TES outputs (executed
* as a hardware compute shader).
*/
uint64_t tes_buffer;
/* Bitfield of TCS per-vertex outputs */
uint64_t tcs_per_vertex_outputs;
/* Default tess levels used in OpenGL when there is no TCS in the pipeline.
* Unused in Vulkan and OpenGL ES.
*/
float tess_level_outer_default[4];
float tess_level_inner_default[2];
/* Number of vertices in the input patch */
uint32_t input_patch_size;
/* Number of vertices in the TCS output patch */
uint32_t output_patch_size;
/* Number of patch constants written by TCS */
uint32_t tcs_patch_constants;
/* Number of input patches per instance of the VS/TCS */
uint32_t patches_per_instance;
/* Stride between tessellation factors in the TCS output buffer. */
uint32_t tcs_stride_el;
/* Number of patches being tessellated */
uint32_t nr_patches;
/* Partitioning and points mode. These affect per-patch setup code but not
* the hot tessellation loop so we make them dynamic to reduce tessellator
* variants.
*/
enum libagx_tess_partitioning partitioning;
uint32_t points_mode;
uint32_t isolines;
/* When fed into a geometry shader, triangles should be counter-clockwise.
* The tessellator always produces clockwise triangles, but we can swap
* dynamically in the TES.
*/
uint32_t ccw;
} PACKED;
static_assert(sizeof(struct libagx_tess_args) == 36 * 4);
#define libagx_tessellate(context, grid, barrier, prim, mode, state) \
if (prim == TESS_PRIMITIVE_QUADS) { \
libagx_tess_quad(context, grid, barrier, state, mode); \
} else if (prim == TESS_PRIMITIVE_TRIANGLES) { \
libagx_tess_tri(context, grid, barrier, state, mode); \
} else { \
assert(prim == TESS_PRIMITIVE_ISOLINES); \
libagx_tess_isoline(context, grid, barrier, state, mode); \
}


@ -5,10 +5,10 @@
* SPDX-License-Identifier: MIT
*/
#include "libagx/query.h"
#include "poly/nir/poly_nir_lower_gs.h"
#include "vulkan/vulkan_core.h"
#include "agx_helpers.h"
#include "agx_linker.h"
#include "agx_nir_lower_gs.h"
#include "agx_pack.h"
#include "agx_scratch.h"
#include "agx_tilebuffer.h"


@ -5,6 +5,7 @@
* SPDX-License-Identifier: MIT
*/
#include <assert.h>
#include "poly/nir/poly_nir_lower_gs.h"
#include "agx_abi.h"
#include "agx_bg_eot.h"
#include "agx_bo.h"
@ -13,7 +14,6 @@
#include "agx_device.h"
#include "agx_helpers.h"
#include "agx_linker.h"
#include "agx_nir_lower_gs.h"
#include "agx_nir_lower_vbo.h"
#include "agx_ppp.h"
#include "agx_tilebuffer.h"
@ -31,10 +31,10 @@
#include "asahi/genxml/agx_pack.h"
#include "asahi/libagx/compression.h"
#include "asahi/libagx/geometry.h"
#include "asahi/libagx/libagx.h"
#include "asahi/libagx/query.h"
#include "asahi/libagx/tessellator.h"
#include "poly/geometry.h"
#include "util/blend.h"
#include "util/format/format_utils.h"
#include "util/format/u_formats.h"
@ -1007,9 +1007,9 @@ hk_heap(struct hk_cmd_buffer *cmd)
* the CPU as rodata, even though the GPU uses it for scratch internally.
*/
off_t off = dev->rodata.heap - dev->rodata.bo->va->addr;
struct agx_heap *map = agx_bo_map(dev->rodata.bo) + off;
struct poly_heap *map = agx_bo_map(dev->rodata.bo) + off;
*map = (struct agx_heap){
*map = (struct poly_heap){
.base = dev->heap->va->addr,
.size = size,
};
@ -1021,7 +1021,7 @@ hk_heap(struct hk_cmd_buffer *cmd)
uint64_t addr = dev->rodata.heap;
/* Zeroing the allocated index frees everything */
hk_queue_write(cmd, addr + offsetof(struct agx_heap, bottom), 0,
hk_queue_write(cmd, addr + offsetof(struct poly_heap, bottom), 0,
true /* after gfx */);
cmd->uses_heap = true;
@ -1045,7 +1045,7 @@ hk_upload_ia_params(struct hk_cmd_buffer *cmd, struct agx_draw draw)
{
assert(!agx_is_indirect(draw.b) && "indirect params written by GPU");
struct agx_ia_state ia = {.verts_per_instance = draw.b.count[0]};
struct poly_ia_state ia = {.verts_per_instance = draw.b.count[0]};
if (draw.indexed) {
unsigned index_size_B = agx_index_size_to_B(draw.index_size);
@ -1115,7 +1115,7 @@ hk_upload_geometry_params(struct hk_cmd_buffer *cmd, struct agx_draw draw)
mode = u_decomposed_prim(mode);
}
struct agx_geometry_params params = {
struct poly_geometry_params params = {
.flat_outputs = fs->info.fs.interp.flat,
.input_topology = mode,
@ -1174,7 +1174,7 @@ hk_upload_geometry_params(struct hk_cmd_buffer *cmd, struct agx_draw draw)
params.vs_grid[4] = params.gs_grid[4] = 1;
params.vs_grid[5] = params.gs_grid[5] = 1;
struct agx_gs_info *gsi = &count->info.gs;
struct poly_gs_info *gsi = &count->info.gs;
if (indirect) {
/* TODO: size */
@ -1183,7 +1183,7 @@ hk_upload_geometry_params(struct hk_cmd_buffer *cmd, struct agx_draw draw)
params.indirect_desc = cmd->geom_indirect;
params.vs_grid[2] = params.gs_grid[2] = 1;
if (gsi->shape == AGX_GS_SHAPE_DYNAMIC_INDEXED) {
if (gsi->shape == POLY_GS_SHAPE_DYNAMIC_INDEXED) {
/* Need to allocate heap if we haven't yet */
hk_heap(cmd);
@ -1191,7 +1191,7 @@ hk_upload_geometry_params(struct hk_cmd_buffer *cmd, struct agx_draw draw)
cmd->geom_index_count = dev->heap->size;
} else {
cmd->geom_index_count =
agx_gs_rast_vertices(gsi->shape, gsi->max_indices, 1, 0);
poly_gs_rast_vertices(gsi->shape, gsi->max_indices, 1, 0);
}
} else {
uint32_t verts = draw.b.count[0], instances = draw.b.count[1];
@ -1207,13 +1207,13 @@ hk_upload_geometry_params(struct hk_cmd_buffer *cmd, struct agx_draw draw)
params.count_buffer = hk_pool_alloc(cmd, size, 4).gpu;
}
cmd->geom_index_count = agx_gs_rast_vertices(
cmd->geom_index_count = poly_gs_rast_vertices(
gsi->shape, gsi->max_indices, params.gs_grid[0], instances);
cmd->geom_instance_count = agx_gs_rast_instances(
cmd->geom_instance_count = poly_gs_rast_instances(
gsi->shape, gsi->max_indices, params.gs_grid[0], instances);
if (gsi->shape == AGX_GS_SHAPE_DYNAMIC_INDEXED) {
if (gsi->shape == POLY_GS_SHAPE_DYNAMIC_INDEXED) {
params.output_index_buffer =
hk_pool_alloc(cmd, cmd->geom_index_count * 4, 4).gpu;
@ -1221,7 +1221,7 @@ hk_upload_geometry_params(struct hk_cmd_buffer *cmd, struct agx_draw draw)
}
}
if (gsi->shape == AGX_GS_SHAPE_STATIC_INDEXED) {
if (gsi->shape == POLY_GS_SHAPE_STATIC_INDEXED) {
cmd->geom_index_buffer =
hk_pool_upload(cmd, count->info.gs.topology, gsi->max_indices * 4, 4);
}
@ -1231,7 +1231,7 @@ hk_upload_geometry_params(struct hk_cmd_buffer *cmd, struct agx_draw draw)
}
static void
hk_upload_tess_params(struct hk_cmd_buffer *cmd, struct libagx_tess_args *out,
hk_upload_tess_params(struct hk_cmd_buffer *cmd, struct poly_tess_args *out,
struct agx_draw draw)
{
struct hk_device *dev = hk_cmd_buffer_device(cmd);
@ -1239,14 +1239,14 @@ hk_upload_tess_params(struct hk_cmd_buffer *cmd, struct libagx_tess_args *out,
struct hk_graphics_state *gfx = &cmd->state.gfx;
struct hk_shader *tcs = hk_only_variant(gfx->shaders[MESA_SHADER_TESS_CTRL]);
enum libagx_tess_partitioning partitioning =
enum poly_tess_partitioning partitioning =
gfx->tess.info.spacing == TESS_SPACING_EQUAL
? LIBAGX_TESS_PARTITIONING_INTEGER
? POLY_TESS_PARTITIONING_INTEGER
: gfx->tess.info.spacing == TESS_SPACING_FRACTIONAL_ODD
? LIBAGX_TESS_PARTITIONING_FRACTIONAL_ODD
: LIBAGX_TESS_PARTITIONING_FRACTIONAL_EVEN;
? POLY_TESS_PARTITIONING_FRACTIONAL_ODD
: POLY_TESS_PARTITIONING_FRACTIONAL_EVEN;
struct libagx_tess_args args = {
struct poly_tess_args args = {
.heap = hk_heap(cmd),
.tcs_stride_el = tcs->info.tess.tcs_output_stride / 4,
.statistic = hk_pipeline_stat_addr(
@ -1428,7 +1428,7 @@ hk_draw_without_restart(struct hk_cmd_buffer *cmd, struct agx_draw draw,
libagx_unroll_restart_struct(cmd, agx_1d(1024 * draw_count),
AGX_BARRIER_ALL | AGX_PREGFX, ia,
libagx_compact_prim(prim));
poly_compact_prim(prim));
return agx_draw_indexed_indirect(ia.out_draw, dev->heap->va->addr,
dev->heap->size, draw.index_size,
@ -1485,7 +1485,7 @@ hk_launch_gs_prerast(struct hk_cmd_buffer *cmd, struct hk_cs *cs,
if (cmd->state.gfx.shaders[MESA_SHADER_TESS_EVAL]) {
gsi.vertex_buffer = desc->root.draw.tess_params +
offsetof(struct libagx_tess_args, tes_buffer);
offsetof(struct poly_tess_args, tes_buffer);
} else {
gsi.vertex_buffer = desc->root.root_desc_addr +
offsetof(struct hk_root_descriptor_table,
@ -1501,10 +1501,10 @@ hk_launch_gs_prerast(struct hk_cmd_buffer *cmd, struct hk_cs *cs,
AGX_BARRIER_ALL | AGX_PREGFX, gsi);
grid_vs = agx_grid_indirect_local(
geometry_params + offsetof(struct agx_geometry_params, vs_grid));
geometry_params + offsetof(struct poly_geometry_params, vs_grid));
grid_gs = agx_grid_indirect_local(
geometry_params + offsetof(struct agx_geometry_params, gs_grid));
geometry_params + offsetof(struct poly_geometry_params, gs_grid));
} else {
grid_vs = grid_gs = draw.b;
grid_gs.count[0] = u_decomposed_prims_for_vertices(mode, draw.b.count[0]);
@ -1554,9 +1554,9 @@ hk_launch_gs_prerast(struct hk_cmd_buffer *cmd, struct hk_cs *cs,
/* Pre-rast geometry shader */
hk_dispatch_with_local_size(cmd, cs, main, grid_gs, wg);
if (agx_gs_indexed(count->info.gs.shape)) {
if (poly_gs_indexed(count->info.gs.shape)) {
enum agx_index_size index_size =
agx_translate_index_size(agx_gs_index_size(count->info.gs.shape));
agx_translate_index_size(poly_gs_index_size(count->info.gs.shape));
if (agx_is_indirect(draw.b)) {
return agx_draw_indexed_indirect(
@ -1661,13 +1661,13 @@ hk_launch_tess(struct hk_cmd_buffer *cmd, struct hk_cs *cs,
/* First generate counts, then prefix sum them, and then tessellate. */
libagx_tessellate(cmd, grid_tess, AGX_BARRIER_ALL | AGX_PREGFX, info.mode,
LIBAGX_TESS_MODE_COUNT, state);
POLY_TESS_MODE_COUNT, state);
libagx_prefix_sum_tess(cmd, agx_1d(1024), AGX_BARRIER_ALL | AGX_PREGFX,
state, c_prims, c_inv, c_prims || c_inv);
libagx_tessellate(cmd, grid_tess, AGX_BARRIER_ALL | AGX_PREGFX, info.mode,
LIBAGX_TESS_MODE_WITH_COUNTS, state);
POLY_TESS_MODE_WITH_COUNTS, state);
return agx_draw_indexed_indirect(gfx->tess.out_draws, dev->heap->va->addr,
dev->heap->size, AGX_INDEX_SIZE_U32, false);
@ -2219,8 +2219,9 @@ hk_flush_index(struct hk_cmd_buffer *cmd, struct hk_cs *cs)
uint32_t index = cmd->state.gfx.index.restart;
if (gs) {
enum agx_gs_shape shape = gs->variants[HK_GS_VARIANT_COUNT].info.gs.shape;
index = BITFIELD_MASK(8 * agx_gs_index_size(shape));
enum poly_gs_shape shape =
gs->variants[HK_GS_VARIANT_COUNT].info.gs.shape;
index = BITFIELD_MASK(8 * poly_gs_index_size(shape));
}
/* VDM State updates are relatively expensive, so only emit them when the
@ -3061,7 +3062,7 @@ hk_flush_dynamic_state(struct hk_cmd_buffer *cmd, struct hk_cs *cs,
bool indirect = agx_is_indirect(draw.b) || draw.restart;
desc->root.draw.input_assembly =
indirect ? hk_pool_alloc(cmd, sizeof(struct agx_ia_state), 4).gpu
indirect ? hk_pool_alloc(cmd, sizeof(struct poly_ia_state), 4).gpu
: hk_upload_ia_params(cmd, draw);
desc->root_dirty = true;
}
@ -3078,7 +3079,7 @@ hk_flush_dynamic_state(struct hk_cmd_buffer *cmd, struct hk_cs *cs,
if (!indirect) {
uint32_t verts = draw.b.count[0], instances = draw.b.count[1];
unsigned vb_size =
libagx_tcs_in_size(verts * instances, vs->b.info.outputs);
poly_tcs_in_size(verts * instances, vs->b.info.outputs);
/* Allocate if there are any outputs, or use the null sink to trap
* reads if there aren't. Those reads are undefined but should not
@ -3094,7 +3095,7 @@ hk_flush_dynamic_state(struct hk_cmd_buffer *cmd, struct hk_cs *cs,
struct agx_ptr tess_args = {0};
if (gfx->shaders[MESA_SHADER_TESS_EVAL]) {
tess_args = hk_pool_alloc(cmd, sizeof(struct libagx_tess_args), 4);
tess_args = hk_pool_alloc(cmd, sizeof(struct poly_tess_args), 4);
gfx->descriptors.root.draw.tess_params = tess_args.gpu;
gfx->descriptors.root_dirty = true;
}

View file

@ -19,8 +19,8 @@
#include "asahi/genxml/agx_pack.h"
#include "asahi/lib/agx_bo.h"
#include "asahi/lib/agx_device.h"
#include "asahi/libagx/geometry.h"
#include "compiler/nir/nir_builder.h"
#include "poly/geometry.h"
#include "util/hash_table.h"
#include "util/ralloc.h"
#include "util/simple_mtx.h"
@ -86,7 +86,7 @@ hk_upload_rodata(struct hk_device *dev)
*/
offs = align(offs, sizeof(uint64_t));
dev->rodata.heap = dev->rodata.bo->va->addr + offs;
offs += sizeof(struct agx_heap);
offs += sizeof(struct poly_heap);
return VK_SUCCESS;
}

View file

@ -8,10 +8,10 @@
*/
#include "hk_shader.h"
#include "poly/nir/poly_nir_lower_gs.h"
#include "agx_debug.h"
#include "agx_device.h"
#include "agx_helpers.h"
#include "agx_nir_lower_gs.h"
#include "agx_nir_lower_vbo.h"
#include "glsl_types.h"
#include "hk_instance.h"
@ -1114,13 +1114,13 @@ hk_compile_nir(struct hk_device *dev, const VkAllocationCallbacks *pAllocator,
shader->info.tess.tcs_output_patch_size =
nir->info.tess.tcs_vertices_out;
shader->info.tess.tcs_per_vertex_outputs =
agx_tcs_per_vertex_outputs(nir);
poly_tcs_per_vertex_outputs(nir);
shader->info.tess.tcs_nr_patch_outputs =
util_last_bit(nir->info.patch_outputs_written);
shader->info.tess.tcs_output_stride = agx_tcs_output_stride(nir);
shader->info.tess.tcs_output_stride = poly_tcs_output_stride(nir);
} else {
/* This destroys info so it needs to happen after the gather */
NIR_PASS(_, nir, agx_nir_lower_tes, hw);
NIR_PASS(_, nir, poly_nir_lower_tes, hw);
}
}
@ -1137,7 +1137,7 @@ hk_compile_nir(struct hk_device *dev, const VkAllocationCallbacks *pAllocator,
if (hw) {
hk_lower_hw_vs(nir, shader, kill_psiz);
} else {
NIR_PASS(_, nir, agx_nir_lower_vs_before_gs);
NIR_PASS(_, nir, poly_nir_lower_vs_before_gs);
nir->info.stage = MESA_SHADER_COMPUTE;
memset(&nir->info.cs, 0, sizeof(nir->info.cs));
nir->xfb_info = NULL;
@ -1335,7 +1335,7 @@ hk_compile_shader(struct hk_device *dev, struct vk_shader_compile_info *info,
hk_populate_vs_key(&key_tmp.vs, state);
key = &key_tmp;
} else if (sw_stage == MESA_SHADER_TESS_CTRL) {
NIR_PASS(_, nir, agx_nir_lower_tcs);
NIR_PASS(_, nir, poly_nir_lower_tcs);
}
/* Compile all variants up front */
@ -1345,7 +1345,7 @@ hk_compile_shader(struct hk_device *dev, struct vk_shader_compile_info *info,
nir_shader *count = NULL, *rast = NULL, *pre_gs = NULL;
NIR_PASS(_, nir, agx_nir_lower_gs, &count, &rast, &pre_gs,
NIR_PASS(_, nir, poly_nir_lower_gs, &count, &rast, &pre_gs,
&count_variant->info.gs);
agx_preprocess_nir(count);

View file

@ -8,9 +8,9 @@
#pragma once
#include "asahi/compiler/agx_compile.h"
#include "poly/nir/poly_nir_lower_gs.h"
#include "util/macros.h"
#include "agx_linker.h"
#include "agx_nir_lower_gs.h"
#include "agx_nir_lower_vbo.h"
#include "agx_pack.h"
#include "agx_usc.h"
@ -94,7 +94,7 @@ struct hk_shader_info {
struct hk_tess_info info;
} tess;
struct agx_gs_info gs;
struct poly_gs_info gs;
/* Used to initialize the union for other stages */
uint8_t _pad[32];

View file

@ -5,10 +5,10 @@
#include "compiler/nir/nir_builder.h"
#include "pipe/p_defines.h"
#include "poly/nir/poly_nir_lower_gs.h"
#include "util/bitset.h"
#include "util/u_dynarray.h"
#include "agx_abi.h"
#include "agx_nir_lower_gs.h"
#include "agx_state.h"
#include "nir.h"
#include "nir_builder_opcodes.h"

View file

@ -34,6 +34,8 @@
#include "pipe/p_defines.h"
#include "pipe/p_screen.h"
#include "pipe/p_state.h"
#include "poly/geometry.h"
#include "poly/nir/poly_nir_lower_gs.h"
#include "util/bitscan.h"
#include "util/bitset.h"
#include "util/blend.h"
@ -57,10 +59,8 @@
#include "agx_disk_cache.h"
#include "agx_linker.h"
#include "agx_nir.h"
#include "agx_nir_lower_gs.h"
#include "agx_nir_lower_vbo.h"
#include "agx_tilebuffer.h"
#include "geometry.h"
#include "libagx.h"
#include "libagx_dgc.h"
#include "libagx_shaders.h"
@ -1544,7 +1544,7 @@ agx_compile_variant(struct agx_device *dev, struct pipe_context *pctx,
nir_shader *nir = nir_deserialize(NULL, &agx_nir_options, &reader);
/* Auxiliary programs */
struct agx_gs_info gs_info = {0};
struct poly_gs_info gs_info = {0};
uint64_t outputs = 0;
struct agx_fs_epilog_link_info epilog_key = {false};
nir_shader *gs_count = NULL;
@ -1564,7 +1564,7 @@ agx_compile_variant(struct agx_device *dev, struct pipe_context *pctx,
struct asahi_vs_shader_key *key = &key_->vs;
if (nir->info.vs.tes_poly) {
NIR_PASS(_, nir, agx_nir_lower_tes, key->hw);
NIR_PASS(_, nir, poly_nir_lower_tes, key->hw);
} else {
NIR_PASS(_, nir, agx_nir_gather_vs_inputs, attrib_components_read);
NIR_PASS(_, nir, agx_nir_lower_vs_input_to_prolog);
@ -1580,7 +1580,7 @@ agx_compile_variant(struct agx_device *dev, struct pipe_context *pctx,
NIR_PASS(_, nir, agx_nir_lower_cull_distance_vs);
NIR_PASS(_, nir, agx_nir_lower_uvs, &uvs);
} else {
NIR_PASS(_, nir, agx_nir_lower_vs_before_gs);
NIR_PASS(_, nir, poly_nir_lower_vs_before_gs);
/* Turn into a compute shader now that we're free of vertexisms */
nir->info.stage = MESA_SHADER_COMPUTE;
@ -1589,9 +1589,9 @@ agx_compile_variant(struct agx_device *dev, struct pipe_context *pctx,
outputs = nir->info.outputs_written;
}
} else if (nir->info.stage == MESA_SHADER_TESS_CTRL) {
NIR_PASS(_, nir, agx_nir_lower_tcs);
NIR_PASS(_, nir, poly_nir_lower_tcs);
} else if (nir->info.stage == MESA_SHADER_GEOMETRY) {
NIR_PASS(_, nir, agx_nir_lower_gs, &gs_count, &gs_copy, &pre_gs,
NIR_PASS(_, nir, poly_nir_lower_gs, &gs_count, &gs_copy, &pre_gs,
&gs_info);
agx_preprocess_nir(gs_count);
@ -1932,11 +1932,11 @@ agx_create_shader_state(struct pipe_context *pctx,
so->tess.spacing = nir->info.tess.spacing;
so->tess.output_patch_size = nir->info.tess.tcs_vertices_out;
so->tess.primitive = nir->info.tess._primitive_mode;
so->tess.per_vertex_outputs = agx_tcs_per_vertex_outputs(nir);
so->tess.per_vertex_outputs = poly_tcs_per_vertex_outputs(nir);
so->tess.nr_patch_outputs =
util_last_bit(nir->info.patch_outputs_written);
if (nir->info.stage == MESA_SHADER_TESS_CTRL)
so->tess.output_stride = agx_tcs_output_stride(nir);
so->tess.output_stride = poly_tcs_output_stride(nir);
} else if (nir->info.stage == MESA_SHADER_GEOMETRY) {
so->gs_mode = nir->info.gs.output_primitive;
}
@ -3903,7 +3903,7 @@ agx_batch_heap(struct agx_batch *batch)
PIPE_USAGE_DEFAULT, size);
}
struct agx_heap heap = {
struct poly_heap heap = {
.base = agx_resource(ctx->heap)->bo->va->addr,
.size = size,
};
@ -3924,7 +3924,7 @@ agx_batch_geometry_params(struct agx_batch *batch, uint64_t input_index_buffer,
const struct pipe_draw_start_count_bias *draw,
const struct pipe_draw_indirect_info *indirect)
{
struct agx_ia_state ia = {
struct poly_ia_state ia = {
.index_buffer = input_index_buffer,
.index_buffer_range_el = index_buffer_size_B / info->index_size,
.verts_per_instance = draw ? draw->count : 0,
@ -3933,7 +3933,7 @@ agx_batch_geometry_params(struct agx_batch *batch, uint64_t input_index_buffer,
batch->uniforms.input_assembly =
agx_pool_upload_aligned(&batch->pool, &ia, sizeof(ia), 8);
struct agx_geometry_params params = {
struct poly_geometry_params params = {
.indirect_desc = batch->geom_indirect,
.flat_outputs =
batch->ctx->stage[MESA_SHADER_FRAGMENT].shader->info.inputs_flat_shaded,
@ -4017,8 +4017,8 @@ agx_batch_geometry_params(struct agx_batch *batch, uint64_t input_index_buffer,
params.input_primitives = params.gs_grid[0] * info->instance_count;
unsigned vb_size = libagx_tcs_in_size(draw->count * info->instance_count,
batch->uniforms.vertex_outputs);
unsigned vb_size = poly_tcs_in_size(draw->count * info->instance_count,
batch->uniforms.vertex_outputs);
unsigned size = params.input_primitives * params.count_buffer_stride;
if (size && prefix_sum) {
@ -4034,8 +4034,8 @@ agx_batch_geometry_params(struct agx_batch *batch, uint64_t input_index_buffer,
params.input_buffer = addr;
}
struct agx_gs_info *gsi = &batch->ctx->gs->gs;
if (gsi->shape == AGX_GS_SHAPE_DYNAMIC_INDEXED) {
struct poly_gs_info *gsi = &batch->ctx->gs->gs;
if (gsi->shape == POLY_GS_SHAPE_DYNAMIC_INDEXED) {
unsigned idx_size = params.input_primitives * gsi->max_indices;
params.output_index_buffer =
@ -4125,10 +4125,10 @@ agx_launch_gs_prerast(struct agx_batch *batch,
libagx_gs_setup_indirect_struct(batch, agx_1d(1), AGX_BARRIER_ALL, gsi);
grid_vs = agx_grid_indirect_local(
gp + offsetof(struct agx_geometry_params, vs_grid));
gp + offsetof(struct poly_geometry_params, vs_grid));
grid_gs = agx_grid_indirect_local(
gp + offsetof(struct agx_geometry_params, gs_grid));
gp + offsetof(struct poly_geometry_params, gs_grid));
} else {
grid_vs = agx_3d(draws->count, info->instance_count, 1);
@ -4246,7 +4246,7 @@ agx_draw_without_restart(struct agx_batch *batch,
/* Unroll the index buffer for each draw */
libagx_unroll_restart_struct(batch, agx_1d(1024 * indirect->draw_count),
AGX_BARRIER_ALL, unroll,
libagx_compact_prim(info->mode));
poly_compact_prim(info->mode));
/* Now draw the results without restart */
struct pipe_draw_info new_info = {
@ -4538,8 +4538,8 @@ agx_draw_patches(struct agx_context *ctx, const struct pipe_draw_info *info,
bool point_mode = MAX2(tcs->tess.point_mode, tes->tess.point_mode);
enum mesa_prim out_prim = agx_tess_output_prim(tcs, tes);
enum libagx_tess_partitioning partitioning =
(enum libagx_tess_partitioning)pspacing;
enum poly_tess_partitioning partitioning =
(enum poly_tess_partitioning)pspacing;
struct agx_bo *draw_bo = NULL;
size_t draw_stride = 5 * sizeof(uint32_t);
@ -4557,7 +4557,7 @@ agx_draw_patches(struct agx_context *ctx, const struct pipe_draw_info *info,
if (info->index_size)
ib = agx_index_buffer_ptr(batch, info, draws, &ib_extent);
struct agx_ia_state ia = {
struct poly_ia_state ia = {
.index_buffer = ib,
.index_buffer_range_el = ib_extent,
.verts_per_instance = draws ? draws->count : 0,
@ -4572,7 +4572,7 @@ agx_draw_patches(struct agx_context *ctx, const struct pipe_draw_info *info,
uint64_t heap = agx_batch_heap(batch);
assert((tcs->tess.output_stride & 3) == 0 && "must be aligned");
struct libagx_tess_args args = {
struct poly_tess_args args = {
.heap = heap,
.tcs_stride_el = tcs->tess.output_stride / 4,
.statistic = agx_get_query_address(
@ -4644,8 +4644,8 @@ agx_draw_patches(struct agx_context *ctx, const struct pipe_draw_info *info,
args.out_draws = blob.gpu + draw_offs;
args.counts = blob.gpu + count_offs;
unsigned vb_size = libagx_tcs_in_size(draws->count * info->instance_count,
batch->uniforms.vertex_outputs);
unsigned vb_size = poly_tcs_in_size(draws->count * info->instance_count,
batch->uniforms.vertex_outputs);
uint64_t addr = agx_pool_alloc_aligned(&batch->pool, vb_size, 4).gpu;
batch->uniforms.vertex_output_buffer_ptr =
agx_pool_upload(&batch->pool, &addr, 8);
@ -4716,11 +4716,11 @@ agx_draw_patches(struct agx_context *ctx, const struct pipe_draw_info *info,
/* Generate counts, then prefix sum them, then finally tessellate. */
libagx_tessellate(batch, tess_grid, AGX_BARRIER_ALL, mode,
LIBAGX_TESS_MODE_COUNT, state);
POLY_TESS_MODE_COUNT, state);
libagx_prefix_sum_tess(batch, agx_1d(1024), AGX_BARRIER_ALL, state, c_prims,
c_invs, c_prims || c_invs);
libagx_tessellate(batch, tess_grid, AGX_BARRIER_ALL, mode,
LIBAGX_TESS_MODE_WITH_COUNTS, state);
POLY_TESS_MODE_WITH_COUNTS, state);
/* Face culling state needs to be specialized for tess */
ctx->dirty |= AGX_DIRTY_RS;
@ -5141,12 +5141,12 @@ agx_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info,
agx_launch_gs_prerast(batch, info, draws, indirect);
/* Setup to rasterize the GS results */
struct agx_gs_info *gsi = &ctx->gs->gs;
struct poly_gs_info *gsi = &ctx->gs->gs;
info_gs = (struct pipe_draw_info){
.mode = gsi->mode,
.index_size = agx_gs_index_size(gsi->shape),
.primitive_restart = agx_gs_indexed(gsi->shape),
.restart_index = agx_gs_index_size(gsi->shape) == 1 ? 0xFF : ~0,
.index_size = poly_gs_index_size(gsi->shape),
.primitive_restart = poly_gs_indexed(gsi->shape),
.restart_index = poly_gs_index_size(gsi->shape) == 1 ? 0xFF : ~0,
.index.resource = &index_rsrc.base,
.instance_count = 1,
};
@ -5167,11 +5167,11 @@ agx_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info,
u_decomposed_prims_for_vertices(info->mode, draws->count);
draw_gs = (struct pipe_draw_start_count_bias){
.count = agx_gs_rast_vertices(gsi->shape, gsi->max_indices, prims,
info->instance_count),
.count = poly_gs_rast_vertices(gsi->shape, gsi->max_indices, prims,
info->instance_count),
};
info_gs.instance_count = agx_gs_rast_instances(
info_gs.instance_count = poly_gs_rast_instances(
gsi->shape, gsi->max_indices, prims, info->instance_count);
draws = &draw_gs;
@ -5184,10 +5184,10 @@ agx_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info,
batch->reduced_prim = u_reduced_prim(info->mode);
ctx->dirty |= AGX_DIRTY_PRIM;
if (gsi->shape == AGX_GS_SHAPE_DYNAMIC_INDEXED) {
if (gsi->shape == POLY_GS_SHAPE_DYNAMIC_INDEXED) {
ib = batch->geom_index;
ib_extent = index_rsrc.bo->size - (batch->geom_index - ib);
} else if (gsi->shape == AGX_GS_SHAPE_STATIC_INDEXED) {
} else if (gsi->shape == POLY_GS_SHAPE_STATIC_INDEXED) {
ib = agx_pool_upload(&batch->pool, gsi->topology, gsi->max_indices);
ib_extent = gsi->max_indices;
}

View file

@ -18,13 +18,14 @@
#include "asahi/lib/agx_tilebuffer.h"
#include "asahi/lib/agx_uvs.h"
#include "asahi/lib/pool.h"
#include "asahi/libagx/geometry.h"
#include "compiler/shader_enums.h"
#include "gallium/auxiliary/util/u_blitter.h"
#include "gallium/include/pipe/p_context.h"
#include "gallium/include/pipe/p_screen.h"
#include "gallium/include/pipe/p_state.h"
#include "pipe/p_defines.h"
#include "poly/geometry.h"
#include "poly/nir/poly_nir_lower_gs.h"
#include "util/bitset.h"
#include "util/disk_cache.h"
#include "util/hash_table.h"
@ -32,7 +33,6 @@
#include "util/u_range.h"
#include "agx_bg_eot.h"
#include "agx_helpers.h"
#include "agx_nir_lower_gs.h"
#include "agx_nir_texture.h"
#ifdef __GLIBC__
@ -248,7 +248,7 @@ struct agx_compiled_shader {
struct agx_compiled_shader *gs_count, *pre_gs;
struct agx_compiled_shader *gs_copy;
struct agx_gs_info gs;
struct poly_gs_info gs;
/* Logical shader stage used for descriptor access. This may differ from the
* physical shader stage of the compiled shader, for example when executing a

View file

@ -53,6 +53,9 @@ if with_gallium_or_lvp or with_gbm or with_platform_wayland
subdir('loader')
endif
subdir('compiler')
if with_poly
subdir('poly')
endif
if with_tools.contains('drm-shim')
subdir('drm-shim')
endif

8
src/poly/.clang-format Normal file
View file

@ -0,0 +1,8 @@
BasedOnStyle: InheritParentConfig
DisableFormat: false
AlignConsecutiveBitFields: Consecutive
ColumnLimit: 80
BreakStringLiterals: false
SpaceBeforeParens: ControlStatementsExceptControlMacros

501
src/poly/cl/geometry.cl Normal file
View file

@ -0,0 +1,501 @@
/*
* Copyright 2023 Alyssa Rosenzweig
* Copyright 2023 Valve Corporation
* Copyright 2025 Collabora Ltd.
* SPDX-License-Identifier: MIT
*/
#include "compiler/libcl/libcl_vk.h"
#include "poly/geometry.h"
#include "poly/tessellator.h"
#include "util/macros.h"
#include "util/u_math.h"
uint64_t nir_ro_to_rw_poly(uint64_t address);
/* Swap the two non-provoking vertices in odd triangles. This generates a vertex
* ID list with a consistent winding order.
*
* Holding prim and flatshade_first constant, the map : [0, 1, 2] -> [0, 1, 2]
* is its own inverse. It is hence used for both vertex fetch and transform
* feedback.
*/
static uint
map_vertex_in_tri_strip(uint prim, uint vert, bool flatshade_first)
{
unsigned pv = flatshade_first ? 0 : 2;
bool even = (prim & 1) == 0;
bool provoking = vert == pv;
return (provoking || even) ? vert : ((3 - pv) - vert);
}
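/* As a worked example (values chosen purely for illustration): with
* flatshade_first = false the provoking vertex is index 2, so odd primitives
* swap vertices 0 and 1. Strip primitive 1 then has local order (1, 0, 2),
* i.e. strip vertices (2, 1, 3), giving the decomposition
* (0, 1, 2), (2, 1, 3), (2, 3, 4). With flatshade_first = true, vertices 1
* and 2 swap instead, giving (0, 1, 2), (1, 3, 2), (2, 3, 4).
*/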
static inline uint
xfb_prim(uint id, uint n, uint copy)
{
return sub_sat(id, n - 1u) + copy;
}
/*
* Determine whether an output vertex has an n'th copy in the transform feedback
* buffer. This is written weirdly to let constant folding remove unnecessary
* stores when length is known statically.
*/
bool
poly_xfb_vertex_copy_in_strip(uint n, uint id, uint length, uint copy)
{
uint prim = xfb_prim(id, n, copy);
int num_prims = length - (n - 1);
return copy == 0 || (prim < num_prims && id >= copy && copy < num_prims);
}
uint
poly_xfb_vertex_offset(uint n, uint invocation_base_prim, uint strip_base_prim,
uint id_in_strip, uint copy, bool flatshade_first)
{
uint prim = xfb_prim(id_in_strip, n, copy);
uint vert_0 = min(id_in_strip, n - 1);
uint vert = vert_0 - copy;
if (n == 3) {
vert = map_vertex_in_tri_strip(prim, vert, flatshade_first);
}
/* Tally up in the whole buffer */
uint base_prim = invocation_base_prim + strip_base_prim;
uint base_vertex = base_prim * n;
return base_vertex + (prim * n) + vert;
}
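/* Worked example, assuming a triangle strip (n = 3) with both base
* primitives zero: strip vertex 3 completes primitive 1, so copy = 0 returns
* (1 * 3) + 2 = 5, the last slot of the second expanded triangle. The same
* vertex reappears in primitive 2, so copy = 1 returns (2 * 3) + 1 = 7
* (with flatshade_first = false, vertex 1 of an even primitive is not
* remapped).
*/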
uint64_t
poly_xfb_vertex_address(constant struct poly_geometry_params *p, uint index,
uint buffer, uint stride, uint output_offset)
{
uint xfb_offset = (index * stride) + output_offset;
return (uintptr_t)(p->xfb_base[buffer]) + xfb_offset;
}
static uint
vertex_id_for_line_loop(uint prim, uint vert, uint num_prims)
{
/* (0, 1), (1, 2), (2, 0) */
if (prim == (num_prims - 1) && vert == 1)
return 0;
else
return prim + vert;
}
uint
poly_vertex_id_for_line_class(enum mesa_prim mode, uint prim, uint vert,
uint num_prims)
{
/* Line list, line strip, or line loop */
if (mode == MESA_PRIM_LINE_LOOP && prim == (num_prims - 1) && vert == 1)
return 0;
if (mode == MESA_PRIM_LINES)
prim *= 2;
return prim + vert;
}
static uint
vertex_id_for_tri_fan(uint prim, uint vert, bool flatshade_first)
{
/* Vulkan spec section 20.1.7 gives (i + 1, i + 2, 0) for a provoking
* first. OpenGL instead wants (0, i + 1, i + 2) with a provoking last.
* Piglit clipflat expects us to switch between these orders depending on
* provoking vertex, to avoid trivializing the fan.
*
* Rotate accordingly.
*/
if (flatshade_first) {
vert = (vert == 2) ? 0 : (vert + 1);
}
/* The simpler form assuming last is provoking. */
return (vert == 0) ? 0 : prim + vert;
}
uint
poly_vertex_id_for_tri_class(enum mesa_prim mode, uint prim, uint vert,
bool flatshade_first)
{
if (flatshade_first && mode == MESA_PRIM_TRIANGLE_FAN) {
vert = vert + 1;
vert = (vert == 3) ? 0 : vert;
}
if (mode == MESA_PRIM_TRIANGLE_FAN && vert == 0)
return 0;
if (mode == MESA_PRIM_TRIANGLES)
prim *= 3;
/* Triangle list, triangle strip, or triangle fan */
if (mode == MESA_PRIM_TRIANGLE_STRIP) {
unsigned pv = flatshade_first ? 0 : 2;
bool even = (prim & 1) == 0;
bool provoking = vert == pv;
vert = ((provoking || even) ? vert : ((3 - pv) - vert));
}
return prim + vert;
}
uint
poly_vertex_id_for_line_adj_class(enum mesa_prim mode, uint prim, uint vert)
{
/* Line list adj or line strip adj */
if (mode == MESA_PRIM_LINES_ADJACENCY)
prim *= 4;
return prim + vert;
}
static uint
vertex_id_for_tri_strip_adj(uint prim, uint vert, uint num_prims,
bool flatshade_first)
{
/* See Vulkan spec section 20.1.11 "Triangle Strips With Adjacency".
*
* There are different cases for first/middle/last/only primitives and for
* odd/even primitives. Determine which case we're in.
*/
bool last = prim == (num_prims - 1);
bool first = prim == 0;
bool even = (prim & 1) == 0;
bool even_or_first = even || first;
/* When the last vertex is provoking, we rotate the primitives
* accordingly. This seems required for OpenGL.
*/
if (!flatshade_first && !even_or_first) {
vert = (vert + 4u) % 6u;
}
/* Offsets per the spec. The spec lists 6 cases with 6 offsets. Luckily,
* there are lots of patterns we can exploit, avoiding a full 6x6 LUT.
*
* Here we assume the first vertex is provoking, the Vulkan default.
*/
uint offsets[6] = {
0,
first ? 1 : (even ? -2 : 3),
even_or_first ? 2 : 4,
last ? 5 : 6,
even_or_first ? 4 : 2,
even_or_first ? 3 : -2,
};
/* Ensure NIR can see thru the local array */
uint offset = 0;
for (uint i = 1; i < 6; ++i) {
if (i == vert)
offset = offsets[i];
}
/* Finally add to the base of the primitive */
return (prim * 2) + offset;
}
uint
poly_vertex_id_for_tri_adj_class(enum mesa_prim mode, uint prim, uint vert,
uint nr, bool flatshade_first)
{
/* Tri adj list or tri adj strip */
if (mode == MESA_PRIM_TRIANGLE_STRIP_ADJACENCY) {
return vertex_id_for_tri_strip_adj(prim, vert, nr, flatshade_first);
} else {
return (6 * prim) + vert;
}
}
static uint
vertex_id_for_topology(enum mesa_prim mode, bool flatshade_first, uint prim,
uint vert, uint num_prims)
{
switch (mode) {
case MESA_PRIM_POINTS:
case MESA_PRIM_LINES:
case MESA_PRIM_TRIANGLES:
case MESA_PRIM_LINES_ADJACENCY:
case MESA_PRIM_TRIANGLES_ADJACENCY:
/* Regular primitive: every N vertices defines a primitive */
return (prim * mesa_vertices_per_prim(mode)) + vert;
case MESA_PRIM_LINE_LOOP:
return vertex_id_for_line_loop(prim, vert, num_prims);
case MESA_PRIM_LINE_STRIP:
case MESA_PRIM_LINE_STRIP_ADJACENCY:
/* (i, i + 1) or (i, ..., i + 3) */
return prim + vert;
case MESA_PRIM_TRIANGLE_STRIP: {
/* Order depends on the provoking vert.
*
* First: (0, 1, 2), (1, 3, 2), (2, 3, 4).
* Last: (0, 1, 2), (2, 1, 3), (2, 3, 4).
*
* Pull the (maybe swapped) vert from the corresponding primitive
*/
return prim + map_vertex_in_tri_strip(prim, vert, flatshade_first);
}
case MESA_PRIM_TRIANGLE_FAN:
return vertex_id_for_tri_fan(prim, vert, flatshade_first);
case MESA_PRIM_TRIANGLE_STRIP_ADJACENCY:
return vertex_id_for_tri_strip_adj(prim, vert, num_prims,
flatshade_first);
default:
return 0;
}
}
uint
poly_map_to_line_adj(uint id)
{
/* Sequence (1, 2), (5, 6), (9, 10), ... */
return ((id & ~1) * 2) + (id & 1) + 1;
}
uint
poly_map_to_line_strip_adj(uint id)
{
/* Sequence (1, 2), (2, 3), (4, 5), .. */
uint prim = id / 2;
uint vert = id & 1;
return prim + vert + 1;
}
uint
poly_map_to_tri_strip_adj(uint id)
{
/* Sequence (0, 2, 4), (2, 6, 4), (4, 6, 8), (6, 10, 8)
*
* Although tri strips with adjacency have 6 cases in general, after
* disregarding the vertices only available in a geometry shader, there are
* only even/odd cases. In other words, it's just a triangle strip subject to
* extra padding.
*
* Dividing through by two, the sequence is:
*
* (0, 1, 2), (1, 3, 2), (2, 3, 4), (3, 5, 4)
*/
uint prim = id / 3;
uint vtx = id % 3;
/* Flip the winding order of odd triangles */
if ((prim % 2) == 1) {
if (vtx == 1)
vtx = 2;
else if (vtx == 2)
vtx = 1;
}
return 2 * (prim + vtx);
}
uint
poly_load_index_buffer(constant struct poly_ia_state *p, uint id,
uint index_size)
{
return poly_load_index(p->index_buffer, p->index_buffer_range_el, id,
index_size);
}
static uint
setup_xfb_buffer(global struct poly_geometry_params *p, uint i, uint stride,
uint max_output_end, uint vertices_per_prim)
{
uint xfb_offset = *(p->xfb_offs_ptrs[i]);
p->xfb_base[i] = p->xfb_base_original[i] + xfb_offset;
/* Let output_end = output_offset + output_size.
*
* Primitive P will write up to (but not including) offset:
*
* xfb_offset + ((P - 1) * (verts_per_prim * stride))
* + ((verts_per_prim - 1) * stride)
* + output_end
*
* To fit all outputs for P, that value must be less than the XFB
* buffer size for the output with maximal output_end, as everything
* else is constant here across outputs within a buffer/primitive:
*
* floor(P) <= (stride + size - xfb_offset - output_end)
* // (stride * verts_per_prim)
*/
int numer_s = p->xfb_size[i] + (stride - max_output_end) - xfb_offset;
uint numer = max(numer_s, 0);
return numer / (stride * vertices_per_prim);
}
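/* Numeric sketch with made-up values: xfb_size = 1010 B, stride = 16 B,
* vertices_per_prim = 3, max_output_end = 16, xfb_offset = 40. Then
* numer = 1010 + (16 - 16) - 40 = 970 and the result is 970 / 48 = 20
* primitives: the 20th ends at 40 + 19*48 + 2*16 + 16 = 1000 bytes and fits,
* while a 21st would need 1048 bytes and does not.
*/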
void
poly_write_strip(GLOBAL uint32_t *index_buffer, uint32_t inv_index_offset,
uint32_t prim_index_offset, uint32_t vertex_offset,
uint32_t verts_in_prim, uint3 info)
{
_poly_write_strip(index_buffer, inv_index_offset + prim_index_offset,
vertex_offset, verts_in_prim, info.x, info.y, info.z);
}
void
poly_pad_index_gs(global int *index_buffer, uint inv_index_offset,
uint nr_indices, uint alloc)
{
for (uint i = nr_indices; i < alloc; ++i) {
index_buffer[inv_index_offset + i] = -1;
}
}
uintptr_t
poly_vertex_output_address(uintptr_t buffer, uint64_t mask, uint vtx,
gl_varying_slot location)
{
/* Written like this to let address arithmetic work */
return buffer + ((uintptr_t)poly_tcs_in_offs_el(vtx, location, mask)) * 16;
}
uintptr_t
poly_geometry_input_address(constant struct poly_geometry_params *p, uint vtx,
gl_varying_slot location)
{
return poly_vertex_output_address(p->input_buffer, p->input_mask, vtx,
location);
}
unsigned
poly_input_vertices(constant struct poly_ia_state *ia)
{
return ia->verts_per_instance;
}
global uint *
poly_load_xfb_count_address(constant struct poly_geometry_params *p, int index,
int count_words, uint unrolled_id)
{
return &p->count_buffer[(unrolled_id * count_words) + index];
}
uint
poly_previous_xfb_primitives(global struct poly_geometry_params *p,
int static_count, int count_index, int count_words,
bool prefix_sum, uint unrolled_id)
{
if (static_count >= 0) {
/* If the number of outputted vertices per invocation is known statically,
* we can calculate the base.
*/
return unrolled_id * static_count;
} else {
/* Otherwise, load from the count buffer. Note that the sums are
* inclusive, so index 0 is nonzero. This requires a little fixup here. We
* use a saturating unsigned subtraction so we don't read out-of-bounds.
*
* If we didn't prefix sum, there's only one element.
*/
uint prim_minus_1 = prefix_sum ? sub_sat(unrolled_id, 1u) : 0;
uint count = p->count_buffer[(prim_minus_1 * count_words) + count_index];
return unrolled_id == 0 ? 0 : count;
}
}
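/* For instance, with static_count = -1, prefix_sum = true, count_words = 2
* and count_index = 0, invocation 5 reads count_buffer[4 * 2 + 0], the
* inclusive sum through primitive 4, i.e. the number of primitives emitted
* before it; invocation 0 simply returns 0.
*/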
/* Like u_foreach_bit, specialized for XFB to enable loop unrolling */
#define poly_foreach_xfb(word, index) \
for (uint i = 0; i < 4; ++i) \
if (word & BITFIELD_BIT(i))
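/* Note that the loop variable is hard-coded to i, so the index argument must
* be spelled i at every call site (as it is below); the parameter exists only
* for readability.
*/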
void
poly_pre_gs(global struct poly_geometry_params *p, uint streams,
uint buffers_written, uint4 buffer_to_stream, int4 count_index,
uint4 stride, uint4 output_end, int4 static_count, uint invocations,
uint vertices_per_prim, global uint *gs_invocations,
global uint *gs_primitives, global uint *c_primitives,
global uint *c_invocations)
{
unsigned count_words = !!(count_index[0] >= 0) + !!(count_index[1] >= 0) +
!!(count_index[2] >= 0) + !!(count_index[3] >= 0);
bool prefix_sum = count_words && buffers_written;
uint unrolled_in_prims = p->input_primitives;
/* Determine the number of primitives generated in each stream */
uint4 in_prims = 0;
poly_foreach_xfb(streams, i) {
in_prims[i] = poly_previous_xfb_primitives(p, static_count[i],
count_index[i], count_words,
prefix_sum, unrolled_in_prims);
*(p->prims_generated_counter[i]) += in_prims[i];
}
uint4 prims = in_prims;
uint emitted_prims = prims[0] + prims[1] + prims[2] + prims[3];
if (buffers_written) {
poly_foreach_xfb(buffers_written, i) {
uint max_prims =
setup_xfb_buffer(p, i, stride[i], output_end[i], vertices_per_prim);
unsigned stream = buffer_to_stream[i];
prims[stream] = min(prims[stream], max_prims);
}
int4 overflow = prims < in_prims;
poly_foreach_xfb(streams, i) {
p->xfb_verts[i] = prims[i] * vertices_per_prim;
*(p->xfb_overflow[i]) += (bool)overflow[i];
*(p->xfb_prims_generated_counter[i]) += prims[i];
}
*(p->xfb_any_overflow) += any(overflow);
/* Update XFB counters */
poly_foreach_xfb(buffers_written, i) {
uint32_t prim_stride_B = stride[i] * vertices_per_prim;
unsigned stream = buffer_to_stream[i];
global uint *ptr = p->xfb_offs_ptrs[i];
ptr = (global uint *)nir_ro_to_rw_poly((uint64_t)ptr);
*ptr += prims[stream] * prim_stride_B;
}
}
/* The geometry shader is invoked once per primitive (after unrolling
* primitive restart). From the spec:
*
* In case of instanced geometry shaders (see section 11.3.4.2) the
* geometry shader invocations count is incremented for each separate
* instanced invocation.
*/
*gs_invocations += unrolled_in_prims * invocations;
*gs_primitives += emitted_prims;
/* Clipper queries are not well-defined, so we can emulate them in lots of
* silly ways. We need the hardware counters to implement them properly. For
* now, just consider all primitives emitted as passing through the clipper.
* This satisfies spec text:
*
* The number of primitives that reach the primitive clipping stage.
*
* and
*
* If at least one vertex of the primitive lies inside the clipping
* volume, the counter is incremented by one or more. Otherwise, the
* counter is incremented by zero or more.
*/
*c_primitives += emitted_prims;
*c_invocations += emitted_prims;
}

35
src/poly/cl/meson.build Normal file
View file

@ -0,0 +1,35 @@
# Copyright 2024 Valve Corporation
# Copyright © 2025 Collabora Ltd.
# SPDX-License-Identifier: MIT
libpoly_shader_files = files(
'geometry.cl',
'tessellation.cl',
)
libpoly_shaders_spv = custom_target(
input : libpoly_shader_files,
output : 'libpoly.spv',
command : [
prog_mesa_clc, '-o', '@OUTPUT@', '--depfile', '@DEPFILE@',
libpoly_shader_files, '--',
'-I' + join_paths(meson.project_source_root(), 'include'),
'-I' + join_paths(meson.project_source_root(), 'src/compiler/libcl'),
'-I' + join_paths(meson.current_source_dir(), '.'),
'-I' + join_paths(meson.current_source_dir(), '../../'),
cl_args,
],
depends : [],
depfile : 'libpoly_shaders.h.d',
)
libpoly_shaders = custom_target(
input : libpoly_shaders_spv,
output : ['libpoly.cpp', 'libpoly.h'],
command : [prog_vtn_bindgen2, libpoly_shaders_spv, '@OUTPUT0@', '@OUTPUT1@'],
)
idep_libpoly = declare_dependency(
sources : [libpoly_shaders],
include_directories : include_directories('.'),
)

133
src/poly/cl/tessellation.cl Normal file
View file

@ -0,0 +1,133 @@
/*
* Copyright 2023 Alyssa Rosenzweig
* SPDX-License-Identifier: MIT
*/
#include "poly/geometry.h"
#include "poly/tessellator.h"
uint
poly_tcs_patch_vertices_in(constant struct poly_tess_args *p)
{
return p->input_patch_size;
}
uint
poly_tes_patch_vertices_in(constant struct poly_tess_args *p)
{
return p->output_patch_size;
}
uint
poly_tcs_unrolled_id(constant struct poly_tess_args *p, uint3 wg_id)
{
return (wg_id.y * p->patches_per_instance) + wg_id.x;
}
uint64_t
poly_tes_buffer(constant struct poly_tess_args *p)
{
return p->tes_buffer;
}
/*
* Helper to lower indexing for a tess eval shader run as a compute shader. This
* handles the tess+geom case. This is simpler than the general input assembly
* lowering, as we know:
*
* 1. the index buffer is U32
* 2. the index is in bounds
*
* Therefore we do a simple load. No bounds checking needed.
*/
uint32_t
poly_load_tes_index(constant struct poly_tess_args *p, uint32_t index)
{
/* Swap second and third vertices of each triangle to flip winding order
* dynamically if needed.
*/
if (p->ccw) {
uint id = index % 3;
if (id == 1)
index++;
else if (id == 2)
index--;
}
return p->index_buffer[index];
}
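/* Example: with p->ccw set, fetches for triangles written as
* (0, 1, 2), (3, 4, 5) become (0, 2, 1), (3, 5, 4), flipping each triangle's
* winding without touching the index buffer itself.
*/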
uintptr_t
poly_tcs_out_address(constant struct poly_tess_args *p, uint patch_id,
uint vtx_id, gl_varying_slot location, uint nr_patch_out,
uint out_patch_size, uint64_t vtx_out_mask)
{
uint stride_el =
poly_tcs_out_stride_el(nr_patch_out, out_patch_size, vtx_out_mask);
uint offs_el =
poly_tcs_out_offs_el(vtx_id, location, nr_patch_out, vtx_out_mask);
offs_el += patch_id * stride_el;
/* Written to match the AGX addressing mode */
return (uintptr_t)(p->tcs_buffer) + (((uintptr_t)offs_el) << 2);
}
static uint
tes_unrolled_patch_id(uint raw_id)
{
return raw_id / POLY_TES_PATCH_ID_STRIDE;
}
uint
poly_tes_patch_id(constant struct poly_tess_args *p, uint raw_id)
{
return tes_unrolled_patch_id(raw_id) % p->patches_per_instance;
}
static uint
tes_vertex_id_in_patch(uint raw_id)
{
return raw_id % POLY_TES_PATCH_ID_STRIDE;
}
float2
poly_load_tess_coord(constant struct poly_tess_args *p, uint raw_id)
{
uint patch = tes_unrolled_patch_id(raw_id);
uint vtx = tes_vertex_id_in_patch(raw_id);
global struct poly_tess_point *t =
&p->patch_coord_buffer[p->coord_allocs[patch] + vtx];
/* Written weirdly because NIR struggles with loads of structs */
uint2 fixed = *((global uint2 *)t);
/* Convert fixed point to float */
return convert_float2(fixed) / (1u << 16);
}
uintptr_t
poly_tes_in_address(constant struct poly_tess_args *p, uint raw_id, uint vtx_id,
gl_varying_slot location)
{
uint patch = tes_unrolled_patch_id(raw_id);
return poly_tcs_out_address(p, patch, vtx_id, location,
p->tcs_patch_constants, p->output_patch_size,
p->tcs_per_vertex_outputs);
}
float4
poly_tess_level_outer_default(constant struct poly_tess_args *p)
{
return vload4(0, p->tess_level_outer_default);
}
float2
poly_tess_level_inner_default(constant struct poly_tess_args *p)
{
return vload2(0, p->tess_level_inner_default);
}

1609
src/poly/cl/tessellator.h Normal file

File diff suppressed because it is too large

641
src/poly/geometry.h Normal file
View file

@ -0,0 +1,641 @@
/*
* Copyright 2023 Alyssa Rosenzweig
* Copyright 2023 Valve Corporation
* SPDX-License-Identifier: MIT
*/
#include "compiler/libcl/libcl.h"
#include "compiler/shader_enums.h"
#include "util/bitscan.h"
#include "util/u_math.h"
#ifdef __OPENCL_VERSION__
#include "compiler/libcl/libcl_vk.h"
#endif
#pragma once
#define POLY_MAX_SO_BUFFERS 4
#define POLY_MAX_VERTEX_STREAMS 4
enum poly_gs_shape {
/* Indexed, where indices are encoded as:
*
* round_to_pot(max_indices) * round_to_pot(input_primitives) *
*    instance_count
*
* invoked for max_indices * input_primitives * instance_count indices.
*
* This is used with any dynamic topology. No hardware instancing used.
*/
POLY_GS_SHAPE_DYNAMIC_INDEXED,
/* Indexed with a static index buffer. Indices range up to max_indices.
* Hardware instance count = input_primitives * software instance count.
*/
POLY_GS_SHAPE_STATIC_INDEXED,
/* Non-indexed. Dispatched as:
*
* (max_indices, input_primitives * instance count).
*/
POLY_GS_SHAPE_STATIC_PER_PRIM,
/* Non-indexed. Dispatched as:
*
* (max_indices * input_primitives, instance count).
*/
POLY_GS_SHAPE_STATIC_PER_INSTANCE,
};
static inline unsigned
poly_gs_rast_vertices(enum poly_gs_shape shape, unsigned max_indices,
unsigned input_primitives, unsigned instance_count)
{
switch (shape) {
case POLY_GS_SHAPE_DYNAMIC_INDEXED:
return max_indices * input_primitives * instance_count;
case POLY_GS_SHAPE_STATIC_INDEXED:
case POLY_GS_SHAPE_STATIC_PER_PRIM:
return max_indices;
case POLY_GS_SHAPE_STATIC_PER_INSTANCE:
return max_indices * input_primitives;
}
UNREACHABLE("invalid shape");
}
static inline unsigned
poly_gs_rast_instances(enum poly_gs_shape shape, unsigned max_indices,
unsigned input_primitives, unsigned instance_count)
{
switch (shape) {
case POLY_GS_SHAPE_DYNAMIC_INDEXED:
return 1;
case POLY_GS_SHAPE_STATIC_INDEXED:
case POLY_GS_SHAPE_STATIC_PER_PRIM:
return input_primitives * instance_count;
case POLY_GS_SHAPE_STATIC_PER_INSTANCE:
return instance_count;
}
UNREACHABLE("invalid shape");
}
static inline bool
poly_gs_indexed(enum poly_gs_shape shape)
{
return shape == POLY_GS_SHAPE_DYNAMIC_INDEXED ||
shape == POLY_GS_SHAPE_STATIC_INDEXED;
}
static inline unsigned
poly_gs_index_size(enum poly_gs_shape shape)
{
switch (shape) {
case POLY_GS_SHAPE_DYNAMIC_INDEXED:
return 4;
case POLY_GS_SHAPE_STATIC_INDEXED:
return 1;
default:
return 0;
}
}
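/* Dispatch sketch with assumed values max_indices = 6, input_primitives = 100
* and instance_count = 2:
*
*    DYNAMIC_INDEXED:     1200 vertices, 1 instance, 32-bit indices
*    STATIC_INDEXED:      6 vertices, 200 instances, 8-bit indices
*    STATIC_PER_PRIM:     6 vertices, 200 instances, non-indexed
*    STATIC_PER_INSTANCE: 600 vertices, 2 instances, non-indexed
*/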
/* Heap to allocate from. */
struct poly_heap {
DEVICE(uchar) base;
uint32_t bottom, size;
} PACKED;
static_assert(sizeof(struct poly_heap) == 4 * 4);
#ifdef __OPENCL_VERSION__
static inline uint
_poly_heap_alloc_offs(global struct poly_heap *heap, uint size_B, bool atomic)
{
size_B = align(size_B, 16);
uint offs;
if (atomic) {
offs = atomic_fetch_add((volatile atomic_uint *)(&heap->bottom), size_B);
} else {
offs = heap->bottom;
heap->bottom = offs + size_B;
}
/* Use printf+abort because assert is stripped from release builds. */
if (heap->bottom >= heap->size) {
printf(
"FATAL: GPU heap overflow, allocating size %u, at offset %u, heap size %u!",
size_B, offs, heap->size);
abort();
}
return offs;
}
static inline uint
poly_heap_alloc_nonatomic_offs(global struct poly_heap *heap, uint size_B)
{
return _poly_heap_alloc_offs(heap, size_B, false);
}
static inline uint
poly_heap_alloc_atomic_offs(global struct poly_heap *heap, uint size_B)
{
return _poly_heap_alloc_offs(heap, size_B, true);
}
static inline global void *
poly_heap_alloc_nonatomic(global struct poly_heap *heap, uint size_B)
{
return heap->base + poly_heap_alloc_nonatomic_offs(heap, size_B);
}
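/* Usage sketch (the heap itself is set up by the driver, e.g. via
* agx_batch_heap, and count is a caller-supplied element count):
*
*    global uint *scratch =
*       (global uint *)poly_heap_alloc_nonatomic(heap, count * sizeof(uint));
*
* Sizes are rounded up to 16 bytes, so successive allocations stay 16-byte
* aligned; the atomic variant is for threads allocating concurrently.
*/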
uint64_t nir_load_ro_sink_address_poly(void);
static inline uint64_t
poly_index_buffer(uint64_t index_buffer, uint size_el, uint offset_el,
uint elsize_B)
{
if (offset_el < size_el)
return index_buffer + (offset_el * elsize_B);
else
return nir_load_ro_sink_address_poly();
}
#endif
struct poly_ia_state {
/* Index buffer if present. */
uint64_t index_buffer;
/* Size of the bound index buffer for bounds checking */
uint32_t index_buffer_range_el;
/* Number of vertices per instance. Written by CPU for direct draw, indirect
* setup kernel for indirect. This is used for VS->GS and VS->TCS indexing.
*/
uint32_t verts_per_instance;
} PACKED;
static_assert(sizeof(struct poly_ia_state) == 4 * 4);
static inline uint
poly_index_buffer_range_el(uint size_el, uint offset_el)
{
return offset_el < size_el ? (size_el - offset_el) : 0;
}
struct poly_geometry_params {
/* Address of associated indirect draw buffer */
DEVICE(uint) indirect_desc;
/* Address of count buffer. For an indirect draw, this will be written by the
* indirect setup kernel.
*/
DEVICE(uint) count_buffer;
/* Address of the primitives generated counters */
DEVICE(uint) prims_generated_counter[POLY_MAX_VERTEX_STREAMS];
DEVICE(uint) xfb_prims_generated_counter[POLY_MAX_VERTEX_STREAMS];
DEVICE(uint) xfb_overflow[POLY_MAX_VERTEX_STREAMS];
DEVICE(uint) xfb_any_overflow;
/* Pointers to transform feedback buffer offsets in bytes */
DEVICE(uint) xfb_offs_ptrs[POLY_MAX_SO_BUFFERS];
/* Output index buffer, allocated by pre-GS. */
DEVICE(uint) output_index_buffer;
/* Address of transform feedback buffer in general, supplied by the CPU. */
DEVICE(uchar) xfb_base_original[POLY_MAX_SO_BUFFERS];
/* Address of transform feedback for the current primitive. Written by pre-GS
* program.
*/
DEVICE(uchar) xfb_base[POLY_MAX_SO_BUFFERS];
/* Address and present mask for the input to the geometry shader. These will
* reflect the vertex shader for VS->GS or instead the tessellation
* evaluation shader for TES->GS.
*/
uint64_t input_buffer;
uint64_t input_mask;
/* Location-indexed mask of flat outputs, used for lowering GL edge flags. */
uint64_t flat_outputs;
uint32_t xfb_size[POLY_MAX_SO_BUFFERS];
/* Number of vertices emitted by transform feedback per stream. Written by
* the pre-GS program.
*/
uint32_t xfb_verts[POLY_MAX_VERTEX_STREAMS];
/* Within an indirect GS draw, the grids used to dispatch the VS/GS written
* out by the GS indirect setup kernel or the CPU for a direct draw. This is
* the "indirect local" format: first 3 is in threads, second 3 is in grid
* blocks. This lets us use nontrivial workgroups with indirect draws without
* needing any predication.
*/
uint32_t vs_grid[6];
uint32_t gs_grid[6];
/* Number of input primitives across all instances, calculated by the CPU for
* a direct draw or the GS indirect setup kernel for an indirect draw.
*/
uint32_t input_primitives;
/* Number of input primitives per instance, rounded up to a power-of-two and
* with the base-2 log taken. This is used to partition the output vertex IDs
* efficiently.
*/
uint32_t primitives_log2;
/* Number of bytes output by the GS count shader per input primitive (may be
* 0), written by CPU and consumed by indirect draw setup shader for
* allocating counts.
*/
uint32_t count_buffer_stride;
/* Dynamic input topology. Must be compatible with the geometry shader's
* layout() declared input class.
*/
uint32_t input_topology;
} PACKED;
static_assert(sizeof(struct poly_geometry_params) == 86 * 4);
/* TCS shared memory layout:
*
* vec4 vs_outputs[VERTICES_IN_INPUT_PATCH][TOTAL_VERTEX_OUTPUTS];
*
* TODO: compact.
*/
static inline uint
poly_tcs_in_offs_el(uint vtx, gl_varying_slot location,
uint64_t crosslane_vs_out_mask)
{
uint base = vtx * util_bitcount64(crosslane_vs_out_mask);
uint offs = util_bitcount64(crosslane_vs_out_mask &
(((uint64_t)(1) << location) - 1));
return base + offs;
}
static inline uint
poly_tcs_in_size(uint32_t vertices_in_patch, uint64_t crosslane_vs_out_mask)
{
return vertices_in_patch * util_bitcount64(crosslane_vs_out_mask) * 16;
}
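/* Example, assuming a mask with POS, PSIZ and VAR0 written (three bits set):
* each vertex occupies 3 vec4s, vertex v starts at element 3 * v, VAR0 lands
* at element 3 * v + 2 (two written slots below it), and 100 vertices need
* poly_tcs_in_size(100, mask) = 100 * 3 * 16 = 4800 bytes.
*/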
/*
* TCS out buffer layout, per-patch:
*
* float tess_level_outer[4];
* float tess_level_inner[2];
* vec4 patch_out[MAX_PATCH_OUTPUTS];
* vec4 vtx_out[OUT_PATCH_SIZE][TOTAL_VERTEX_OUTPUTS];
*
* Per-vertex outputs are compacted based on the mask of outputs written.
* Patch outputs are used as-is.
*
* Bounding boxes are ignored.
*/
static inline uint
poly_tcs_out_offs_el(uint vtx_id, gl_varying_slot location, uint nr_patch_out,
uint64_t vtx_out_mask)
{
uint off = 0;
if (location == VARYING_SLOT_TESS_LEVEL_OUTER)
return off;
off += 4;
if (location == VARYING_SLOT_TESS_LEVEL_INNER)
return off;
off += 2;
if (location >= VARYING_SLOT_PATCH0)
return off + (4 * (location - VARYING_SLOT_PATCH0));
/* Anything else is a per-vtx output */
off += 4 * nr_patch_out;
off += 4 * vtx_id * util_bitcount64(vtx_out_mask);
uint idx = util_bitcount64(vtx_out_mask & (((uint64_t)(1) << location) - 1));
return off + (4 * idx);
}
static inline uint
poly_tcs_out_stride_el(uint nr_patch_out, uint out_patch_size,
uint64_t vtx_out_mask)
{
return poly_tcs_out_offs_el(out_patch_size, VARYING_SLOT_POS, nr_patch_out,
vtx_out_mask);
}
static inline uint
poly_tcs_out_stride(uint nr_patch_out, uint out_patch_size,
uint64_t vtx_out_mask)
{
return poly_tcs_out_stride_el(nr_patch_out, out_patch_size, vtx_out_mask) *
4;
}
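/* Example, assuming nr_patch_out = 2, out_patch_size = 3 and four bits set in
* vtx_out_mask: each patch holds 4 + 2 + (4 * 2) + (4 * 3 * 4) = 62 elements,
* so poly_tcs_out_stride() is 248 bytes, and the lowest written slot of
* vertex 1 sits at element 14 + (4 * 1 * 4) = 30, i.e. byte 120 into the
* patch.
*/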
/* In a tess eval shader, stride for hw vertex ID */
#define POLY_TES_PATCH_ID_STRIDE 8192
static inline uint
poly_compact_prim(enum mesa_prim prim)
{
static_assert(MESA_PRIM_QUAD_STRIP == MESA_PRIM_QUADS + 1);
static_assert(MESA_PRIM_POLYGON == MESA_PRIM_QUADS + 2);
#ifndef __OPENCL_VERSION__
assert(prim != MESA_PRIM_QUADS);
assert(prim != MESA_PRIM_QUAD_STRIP);
assert(prim != MESA_PRIM_POLYGON);
assert(prim != MESA_PRIM_PATCHES);
#endif
return (prim >= MESA_PRIM_QUADS) ? (prim - 3) : prim;
}
static inline enum mesa_prim
poly_uncompact_prim(uint packed)
{
if (packed >= MESA_PRIM_QUADS)
return (enum mesa_prim)(packed + 3);
return (enum mesa_prim)packed;
}
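/* For example, MESA_PRIM_LINES_ADJACENCY compacts down onto the slot vacated
* by MESA_PRIM_QUADS (7) and MESA_PRIM_TRIANGLE_STRIP_ADJACENCY compacts to
* 10, so a compacted topology fits in 4 bits; poly_uncompact_prim() adds the
* 3 back for anything at or above the (never-compacted) quad range.
*/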
/*
* Write a strip into a 32-bit index buffer. This is the sequence:
*
* (b, b + 1, b + 2, ..., b + n - 1, -1) where -1 is the restart index
*
* For points, we write index buffers without restart just for remapping.
*/
static inline void
_poly_write_strip(GLOBAL uint32_t *index_buffer, uint32_t index_offset,
uint32_t vertex_offset, uint32_t verts_in_prim,
uint32_t stream, uint32_t stream_multiplier, uint32_t n)
{
bool restart = n > 1;
if (verts_in_prim < n)
return;
GLOBAL uint32_t *out = &index_buffer[index_offset];
/* Write out indices for the strip */
for (uint32_t i = 0; i < verts_in_prim; ++i) {
out[i] = (vertex_offset + i) * stream_multiplier + stream;
}
if (restart)
out[verts_in_prim] = -1;
}
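/* Example: index_offset = 10, vertex_offset = 100, verts_in_prim = 4,
* stream = 1, stream_multiplier = 4, n = 3 writes
* index_buffer[10..14] = {401, 405, 409, 413, ~0u}, the trailing ~0u being
* the restart index since n > 1.
*/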
static inline unsigned
poly_decomposed_prims_for_vertices_with_tess(enum mesa_prim prim, int vertices,
unsigned verts_per_patch)
{
if (prim >= MESA_PRIM_PATCHES) {
return vertices / verts_per_patch;
} else {
return u_decomposed_prims_for_vertices(prim, vertices);
}
}
#ifdef __OPENCL_VERSION__
/*
* Returns (work_group_scan_inclusive_add(x), work_group_sum(x)). Implemented
* manually with subgroup ops and local memory since Mesa doesn't do those
* lowerings yet.
*/
static inline uint2
poly_work_group_scan_inclusive_add(uint x, local uint *scratch)
{
uint sg_id = get_sub_group_id();
/* Partial prefix sum of the subgroup */
uint sg = sub_group_scan_inclusive_add(x);
/* Reduction (sum) for the subgroup */
uint sg_sum = sub_group_broadcast(sg, 31);
/* Write out all the subgroups sums */
barrier(CLK_LOCAL_MEM_FENCE);
scratch[sg_id] = sg_sum;
barrier(CLK_LOCAL_MEM_FENCE);
/* Read all the subgroup sums. Thread T in subgroup G reads the sum of all
* threads in subgroup T.
*/
uint other_sum = scratch[get_sub_group_local_id()];
/* Exclusive sum the subgroup sums to get the total before the current group,
* which can be added to the total for the current group.
*/
uint other_sums = sub_group_scan_exclusive_add(other_sum);
uint base = sub_group_broadcast(other_sums, sg_id);
uint prefix = base + sg;
/* Reduce the workgroup using the prefix sum we already did */
uint reduction = sub_group_broadcast(other_sums + other_sum, 31);
return (uint2)(prefix, reduction);
}
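/* This is written for the launch shape the prefix-sum kernels appear to use:
* a subgroup size of 32 and a full 1024-thread workgroup (32 subgroups), with
* scratch holding one uint per subgroup. E.g. if every thread passes x = 1,
* thread t gets prefix = t + 1 and reduction = 1024.
*/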
static inline void
poly_prefix_sum(local uint *scratch, global uint *buffer, uint len, uint words,
uint word, uint wg_count)
{
uint tid = cl_local_id.x;
/* Main loop: complete workgroups processing multiple values at once */
uint i, count = 0;
uint len_remainder = len % wg_count;
uint len_rounded_down = len - len_remainder;
for (i = tid; i < len_rounded_down; i += wg_count) {
global uint *ptr = &buffer[(i * words) + word];
uint value = *ptr;
uint2 sums = poly_work_group_scan_inclusive_add(value, scratch);
*ptr = count + sums[0];
count += sums[1];
}
/* The last iteration is special since we won't have a full workgroup unless
* the length is divisible by the workgroup size, and we don't advance count.
*/
global uint *ptr = &buffer[(i * words) + word];
uint value = (tid < len_remainder) ? *ptr : 0;
uint scan = poly_work_group_scan_inclusive_add(value, scratch)[0];
if (tid < len_remainder) {
*ptr = count + scan;
}
}
static inline void
poly_increment_counters(global uint32_t *a, global uint32_t *b,
global uint32_t *c, uint count)
{
global uint32_t *ptr[] = {a, b, c};
for (uint i = 0; i < 3; ++i) {
if (ptr[i]) {
*(ptr[i]) += count;
}
}
}
static inline void
poly_increment_ia(global uint32_t *ia_vertices, global uint32_t *ia_primitives,
global uint32_t *vs_invocations, global uint32_t *c_prims,
global uint32_t *c_invs, constant uint32_t *draw,
enum mesa_prim prim, unsigned verts_per_patch)
{
poly_increment_counters(ia_vertices, vs_invocations, NULL,
draw[0] * draw[1]);
uint prims = poly_decomposed_prims_for_vertices_with_tess(prim, draw[0],
verts_per_patch) *
draw[1];
poly_increment_counters(ia_primitives, c_prims, c_invs, prims);
}
static inline void
poly_gs_setup_indirect(uint64_t index_buffer, constant uint *draw,
global uintptr_t *vertex_buffer /* output */,
global struct poly_ia_state *ia /* output */,
global struct poly_geometry_params *p /* output */,
global struct poly_heap *heap,
uint64_t vs_outputs /* Vertex (TES) output mask */,
uint32_t index_size_B /* 0 if no index buffer */,
uint32_t index_buffer_range_el,
uint32_t prim /* Input primitive type, enum mesa_prim */,
int is_prefix_summing, uint max_indices,
enum poly_gs_shape shape)
{
/* Determine the (primitives, instances) grid size. */
uint vertex_count = draw[0];
uint instance_count = draw[1];
ia->verts_per_instance = vertex_count;
/* Calculate number of primitives input into the GS */
uint prim_per_instance = u_decomposed_prims_for_vertices(prim, vertex_count);
p->input_primitives = prim_per_instance * instance_count;
/* Invoke VS as (vertices, instances); GS as (primitives, instances) */
p->vs_grid[0] = vertex_count;
p->vs_grid[1] = instance_count;
p->gs_grid[0] = prim_per_instance;
p->gs_grid[1] = instance_count;
p->primitives_log2 = util_logbase2_ceil(prim_per_instance);
/* If indexing is enabled, the third word is the offset into the index buffer
* in elements. Apply that offset now that we have it. For a hardware
* indirect draw, the hardware would do this for us, but for software input
* assembly we need to do it ourselves.
*/
if (index_size_B) {
ia->index_buffer = poly_index_buffer(index_buffer, index_buffer_range_el,
draw[2], index_size_B);
ia->index_buffer_range_el =
poly_index_buffer_range_el(index_buffer_range_el, draw[2]);
}
/* We need to allocate VS and GS count buffers, do so now */
uint vertex_buffer_size =
poly_tcs_in_size(vertex_count * instance_count, vs_outputs);
if (is_prefix_summing) {
p->count_buffer = poly_heap_alloc_nonatomic(
heap, p->input_primitives * p->count_buffer_stride);
}
p->input_buffer =
(uintptr_t)poly_heap_alloc_nonatomic(heap, vertex_buffer_size);
*vertex_buffer = p->input_buffer;
p->input_mask = vs_outputs;
/* Allocate the index buffer and write the draw consuming it */
global VkDrawIndexedIndirectCommand *cmd = (global void *)p->indirect_desc;
*cmd = (VkDrawIndexedIndirectCommand){
.indexCount = poly_gs_rast_vertices(shape, max_indices, prim_per_instance,
instance_count),
.instanceCount = poly_gs_rast_instances(
shape, max_indices, prim_per_instance, instance_count),
};
if (shape == POLY_GS_SHAPE_DYNAMIC_INDEXED) {
cmd->firstIndex =
poly_heap_alloc_nonatomic_offs(heap, cmd->indexCount * 4) / 4;
p->output_index_buffer =
(global uint *)(heap->base + (cmd->firstIndex * 4));
}
}
static uint
poly_load_index(uintptr_t index_buffer, uint32_t index_buffer_range_el, uint id,
uint index_size)
{
bool oob = id >= index_buffer_range_el;
/* If the load would be out-of-bounds, load the first element which is
* assumed valid. If the application index buffer is empty with robustness2,
* index_buffer will point to a zero sink where only the first is valid.
*/
if (oob) {
id = 0;
}
uint el;
if (index_size == 1) {
el = ((constant uint8_t *)index_buffer)[id];
} else if (index_size == 2) {
el = ((constant uint16_t *)index_buffer)[id];
} else {
el = ((constant uint32_t *)index_buffer)[id];
}
/* D3D robustness semantics. TODO: Optimize? */
if (oob) {
el = 0;
}
return el;
}
static void
poly_store_index(uintptr_t index_buffer, uint index_size_B, uint id, uint value)
{
global uint32_t *out_32 = (global uint32_t *)index_buffer;
global uint16_t *out_16 = (global uint16_t *)index_buffer;
global uint8_t *out_8 = (global uint8_t *)index_buffer;
if (index_size_B == 4)
out_32[id] = value;
else if (index_size_B == 2)
out_16[id] = value;
else
out_8[id] = value;
}
#endif

9
src/poly/meson.build Normal file
View file

@ -0,0 +1,9 @@
# Copyright © 2025 Collabora Ltd.
# SPDX-License-Identifier: MIT
inc_poly = include_directories([
'.', 'nir'
])
subdir('cl')
subdir('nir')

18
src/poly/nir/meson.build Normal file
View file

@ -0,0 +1,18 @@
# Copyright © 2025 Collabora Ltd.
# SPDX-License-Identifier: MIT
libpoly_nir_files = files(
'poly_nir_lower_gs.c',
'poly_nir_lower_ia.c',
'poly_nir_lower_tess.c',
)
libpoly_nir = static_library(
'libpoly_nir',
[libpoly_nir_files],
include_directories : [inc_poly],
c_args : [no_override_init_args, '-Wno-c2x-extensions'],
gnu_symbol_visibility : 'hidden',
dependencies: [idep_nir, idep_mesautil, idep_libpoly],
build_by_default : false,
)

View file

@ -5,11 +5,11 @@
* SPDX-License-Identifier: MIT
*/
#include "agx_nir_lower_gs.h"
#include "poly/nir/poly_nir_lower_gs.h"
#include "compiler/nir/nir_builder.h"
#include "gallium/include/pipe/p_defines.h"
#include "libagx/geometry.h"
#include "libagx/libagx.h"
#include "poly/cl/libpoly.h"
#include "poly/geometry.h"
#include "util/bitscan.h"
#include "util/list.h"
#include "util/macros.h"
@ -85,7 +85,7 @@ rewrite_intrinsics(nir_builder *b, nir_intrinsic_instr *intr, void *state_)
}
static bool
agx_nir_lower_gs_intrinsics(nir_shader *shader)
lower_gs_intrinsics(nir_shader *shader)
{
struct state state;
nir_function_impl *impl = nir_shader_get_entrypoint(shader);
@ -158,16 +158,16 @@ agx_nir_lower_gs_intrinsics(nir_shader *shader)
}
struct lower_gs_state {
int static_count[MAX_VERTEX_STREAMS];
int static_count[POLY_MAX_VERTEX_STREAMS];
/* The index of each counter in the count buffer, or -1 if it's not in the
* count buffer.
*
* Invariant: info->count_words == sum(count_index[i] >= 0).
*/
int count_index[MAX_VERTEX_STREAMS];
int count_index[POLY_MAX_VERTEX_STREAMS];
struct agx_gs_info *info;
struct poly_gs_info *info;
};
/* Helpers for loading from the geometry state buffer */
@ -184,8 +184,8 @@ load_geometry_param_offset(nir_builder *b, uint32_t offset, uint8_t bytes)
#define load_geometry_param(b, field) \
load_geometry_param_offset( \
b, offsetof(struct agx_geometry_params, field), \
sizeof(((struct agx_geometry_params *)0)->field))
b, offsetof(struct poly_geometry_params, field), \
sizeof(((struct poly_geometry_params *)0)->field))
/* Helpers for lowering I/O to variables */
struct lower_output_to_var_state {
@ -257,18 +257,18 @@ vertex_id_for_topology_class(nir_builder *b, nir_def *vert, enum mesa_prim cls)
return prim;
case MESA_PRIM_LINES:
return libagx_vertex_id_for_line_class(b, topology, prim, vert, nr);
return poly_vertex_id_for_line_class(b, topology, prim, vert, nr);
case MESA_PRIM_TRIANGLES:
return libagx_vertex_id_for_tri_class(b, topology, prim, vert,
flatshade_first);
return poly_vertex_id_for_tri_class(b, topology, prim, vert,
flatshade_first);
case MESA_PRIM_LINES_ADJACENCY:
return libagx_vertex_id_for_line_adj_class(b, topology, prim, vert);
return poly_vertex_id_for_line_adj_class(b, topology, prim, vert);
case MESA_PRIM_TRIANGLES_ADJACENCY:
return libagx_vertex_id_for_tri_adj_class(b, topology, prim, vert, nr,
flatshade_first);
return poly_vertex_id_for_tri_adj_class(b, topology, prim, vert, nr,
flatshade_first);
default:
UNREACHABLE("invalid topology class");
@ -276,8 +276,8 @@ vertex_id_for_topology_class(nir_builder *b, nir_def *vert, enum mesa_prim cls)
}
nir_def *
agx_load_per_vertex_input(nir_builder *b, nir_intrinsic_instr *intr,
nir_def *vertex)
poly_load_per_vertex_input(nir_builder *b, nir_intrinsic_instr *intr,
nir_def *vertex)
{
assert(intr->intrinsic == nir_intrinsic_load_per_vertex_input);
nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
@ -287,15 +287,15 @@ agx_load_per_vertex_input(nir_builder *b, nir_intrinsic_instr *intr,
if (b->shader->info.stage == MESA_SHADER_GEOMETRY) {
/* GS may be preceded by VS or TES so specified as param */
addr = libagx_geometry_input_address(
addr = poly_geometry_input_address(
b, nir_load_geometry_param_buffer_poly(b), vertex, location);
} else {
assert(b->shader->info.stage == MESA_SHADER_TESS_CTRL);
/* TCS always preceded by VS so we use the VS state directly */
addr = libagx_vertex_output_address(b, nir_load_vs_output_buffer_poly(b),
nir_load_vs_outputs_poly(b), vertex,
location);
addr = poly_vertex_output_address(b, nir_load_vs_output_buffer_poly(b),
nir_load_vs_outputs_poly(b), vertex,
location);
}
addr = nir_iadd_imm(b, addr, 4 * nir_intrinsic_component(intr));
@ -320,7 +320,7 @@ lower_gs_inputs(nir_builder *b, nir_intrinsic_instr *intr, void *_)
nir_def *unrolled =
nir_iadd(b, nir_imul(b, nir_load_instance_id(b), verts), vertex);
nir_def *val = agx_load_per_vertex_input(b, intr, unrolled);
nir_def *val = poly_load_per_vertex_input(b, intr, unrolled);
nir_def_replace(&intr->def, val);
return true;
}
@ -377,10 +377,10 @@ write_xfb_counts(nir_builder *b, nir_intrinsic_instr *intr,
nir_def *id =
state->info->prefix_sum ? calc_unrolled_id(b) : nir_imm_int(b, 0);
nir_def *addr = libagx_load_xfb_count_address(
b, nir_load_geometry_param_buffer_poly(b),
nir_imm_int(b, state->count_index[stream]),
nir_imm_int(b, state->info->count_words), id);
nir_def *addr =
poly_load_xfb_count_address(b, nir_load_geometry_param_buffer_poly(b),
nir_imm_int(b, state->count_index[stream]),
nir_imm_int(b, state->info->count_words), id);
if (state->info->prefix_sum) {
nir_store_global(b, addr, 4, intr->src[2].ssa, nir_component_mask(1));
@ -656,7 +656,7 @@ create_gs_rast_shader(const nir_shader *gs, const struct lower_gs_state *state)
raw_vertex_id = nir_udiv_imm(b, raw_vertex_id, stream_multiplier(gs));
switch (state->info->shape) {
case AGX_GS_SHAPE_DYNAMIC_INDEXED: {
case POLY_GS_SHAPE_DYNAMIC_INDEXED: {
unsigned stride = output_vertex_id_pot_stride(gs);
nir_def *unrolled = nir_udiv_imm(b, raw_vertex_id, stride);
@ -669,8 +669,8 @@ create_gs_rast_shader(const nir_shader *gs, const struct lower_gs_state *state)
break;
}
case AGX_GS_SHAPE_STATIC_INDEXED:
case AGX_GS_SHAPE_STATIC_PER_PRIM: {
case POLY_GS_SHAPE_STATIC_INDEXED:
case POLY_GS_SHAPE_STATIC_PER_PRIM: {
nir_def *stride = load_geometry_param(b, gs_grid[0]);
rs.output_id = raw_vertex_id;
@ -679,7 +679,7 @@ create_gs_rast_shader(const nir_shader *gs, const struct lower_gs_state *state)
break;
}
case AGX_GS_SHAPE_STATIC_PER_INSTANCE: {
case POLY_GS_SHAPE_STATIC_PER_INSTANCE: {
unsigned stride = MAX2(state->info->max_indices, 1);
rs.output_id = nir_umod_imm(b, raw_vertex_id, stride);
@ -733,8 +733,8 @@ create_gs_rast_shader(const nir_shader *gs, const struct lower_gs_state *state)
for (unsigned p_ = 0; p_ < n_; ++p_) {
nir_def *p = nir_imm_int(b, p_);
nir_push_if(b, libagx_xfb_vertex_copy_in_strip(b, n, id_in_strip,
strip_length, p));
nir_push_if(b, poly_xfb_vertex_copy_in_strip(b, n, id_in_strip,
strip_length, p));
/* Write XFB for each output */
for (unsigned i = 0; i < xfb->output_count; ++i) {
@ -746,14 +746,14 @@ create_gs_rast_shader(const nir_shader *gs, const struct lower_gs_state *state)
* base for this invocation for the stream plus the offset within
* this invocation.
*/
nir_def *invocation_base = libagx_previous_xfb_primitives(
nir_def *invocation_base = poly_previous_xfb_primitives(
b, nir_load_geometry_param_buffer_poly(b),
nir_imm_int(b, state->static_count[stream]),
nir_imm_int(b, state->count_index[stream]),
nir_imm_int(b, state->info->count_words),
nir_imm_bool(b, state->info->prefix_sum), unrolled);
nir_def *index = libagx_xfb_vertex_offset(
nir_def *index = poly_xfb_vertex_offset(
b, n, invocation_base, base, id_in_strip, p,
nir_inot(b, nir_i2b(b, nir_load_provoking_last(b))));
@ -776,7 +776,7 @@ create_gs_rast_shader(const nir_shader *gs, const struct lower_gs_state *state)
*/
value = nir_pad_vector_imm_int(b, value, 0, 4);
nir_def *addr = libagx_xfb_vertex_address(
nir_def *addr = poly_xfb_vertex_address(
b, nir_load_geometry_param_buffer_poly(b), index,
nir_imm_int(b, buffer), nir_imm_int(b, stride),
nir_imm_int(b, output.offset));
@ -842,12 +842,12 @@ lower_gs_instr(nir_builder *b, nir_intrinsic_instr *intr, void *state_)
switch (intr->intrinsic) {
case nir_intrinsic_set_vertex_and_primitive_count: {
if (state->info->shape != AGX_GS_SHAPE_DYNAMIC_INDEXED)
if (state->info->shape != POLY_GS_SHAPE_DYNAMIC_INDEXED)
break;
/* All streams are merged, just pick a single instruction */
if (nir_intrinsic_stream_id(intr) == 0) {
libagx_pad_index_gs(
poly_pad_index_gs(
b, load_geometry_param(b, output_index_buffer),
nir_imul_imm(b, calc_unrolled_id(b), state->info->max_indices),
intr->src[1].ssa, nir_imm_int(b, state->info->max_indices));
@ -857,10 +857,10 @@ lower_gs_instr(nir_builder *b, nir_intrinsic_instr *intr, void *state_)
}
case nir_intrinsic_emit_primitive_poly: {
if (state->info->shape != AGX_GS_SHAPE_DYNAMIC_INDEXED)
if (state->info->shape != POLY_GS_SHAPE_DYNAMIC_INDEXED)
break;
libagx_write_strip(
poly_write_strip(
b, load_geometry_param(b, output_index_buffer),
nir_imul_imm(b, calc_unrolled_id(b), state->info->max_indices),
intr->src[0].ssa,
@ -903,14 +903,14 @@ collect_components(nir_builder *b, nir_intrinsic_instr *intr, void *data)
return true;
}
struct agx_xfb_key {
struct poly_xfb_key {
uint8_t streams;
uint8_t buffers_written;
uint8_t buffer_to_stream[NIR_MAX_XFB_BUFFERS];
int8_t count_index[4];
uint16_t stride[NIR_MAX_XFB_BUFFERS];
uint16_t output_end[NIR_MAX_XFB_BUFFERS];
int16_t static_count[MAX_VERTEX_STREAMS];
int16_t static_count[POLY_MAX_VERTEX_STREAMS];
uint16_t invocations;
uint16_t vertices_per_prim;
};
@ -921,14 +921,14 @@ struct agx_xfb_key {
* transform feedback offsets and counters as applicable.
*/
static nir_shader *
create_pre_gs(struct agx_xfb_key *key,
create_pre_gs(struct poly_xfb_key *key,
const nir_shader_compiler_options *options)
{
nir_builder b_ = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, options,
"Pre-GS patch up");
nir_builder *b = &b_;
libagx_pre_gs(
poly_pre_gs(
b, nir_load_geometry_param_buffer_poly(b), nir_imm_int(b, key->streams),
nir_imm_int(b, key->buffers_written),
nir_imm_ivec4(b, key->buffer_to_stream[0], key->buffer_to_stream[1],
@ -1033,7 +1033,7 @@ calculate_max_indices(enum mesa_prim prim, unsigned verts)
}
struct topology_ctx {
struct agx_gs_info *info;
struct poly_gs_info *info;
uint32_t topology[384];
};
@ -1041,7 +1041,7 @@ static bool
evaluate_topology(nir_builder *b, nir_intrinsic_instr *intr, void *data)
{
struct topology_ctx *ctx = data;
struct agx_gs_info *info = ctx->info;
struct poly_gs_info *info = ctx->info;
if (intr->intrinsic != nir_intrinsic_emit_primitive_poly)
return false;
@ -1050,7 +1050,7 @@ evaluate_topology(nir_builder *b, nir_intrinsic_instr *intr, void *data)
* if-statements interleaved with other stuff).
*/
if (intr->instr.block != nir_start_block(b->impl)) {
info->shape = AGX_GS_SHAPE_DYNAMIC_INDEXED;
info->shape = POLY_GS_SHAPE_DYNAMIC_INDEXED;
return false;
}
@ -1058,11 +1058,11 @@ evaluate_topology(nir_builder *b, nir_intrinsic_instr *intr, void *data)
if (!nir_src_is_const(intr->src[0]) || !nir_src_is_const(intr->src[1]) ||
!nir_src_is_const(intr->src[2])) {
info->shape = AGX_GS_SHAPE_DYNAMIC_INDEXED;
info->shape = POLY_GS_SHAPE_DYNAMIC_INDEXED;
return false;
}
_libagx_write_strip(
_poly_write_strip(
ctx->topology, nir_src_as_uint(intr->src[0]),
nir_src_as_uint(intr->src[1]), nir_src_as_uint(intr->src[2]),
nir_intrinsic_stream_id(intr), stream_multiplier(b->shader),
@ -1076,7 +1076,7 @@ evaluate_topology(nir_builder *b, nir_intrinsic_instr *intr, void *data)
* 0, 1, 2, -1, 3, 4, 5, ...
*/
static bool
match_list_topology(struct agx_gs_info *info, uint32_t count,
match_list_topology(struct poly_gs_info *info, uint32_t count,
uint32_t *topology, bool has_restart)
{
unsigned count_with_restart = count + has_restart;
@ -1095,7 +1095,7 @@ match_list_topology(struct agx_gs_info *info, uint32_t count,
}
/* If we match, rewrite the topology and drop indexing */
info->shape = AGX_GS_SHAPE_STATIC_PER_INSTANCE;
info->shape = POLY_GS_SHAPE_STATIC_PER_INSTANCE;
info->mode = u_decomposed_prim(info->mode);
info->max_indices =
((info->max_indices + has_restart) / count_with_restart) * count;
@ -1131,12 +1131,12 @@ is_strip_topology(uint32_t *indices, uint32_t index_count)
* VS(compute) + GS(vertex) sequences without auxiliary programs.
*/
static void
optimize_static_topology(struct agx_gs_info *info, nir_shader *gs)
optimize_static_topology(struct poly_gs_info *info, nir_shader *gs)
{
struct topology_ctx ctx = {.info = info};
bool has_restart = info->mode != MESA_PRIM_POINTS;
nir_shader_intrinsics_pass(gs, evaluate_topology, nir_metadata_all, &ctx);
if (info->shape == AGX_GS_SHAPE_DYNAMIC_INDEXED)
if (info->shape == POLY_GS_SHAPE_DYNAMIC_INDEXED)
return;
/* We can always drop the trailing restart index */
@ -1150,7 +1150,7 @@ optimize_static_topology(struct agx_gs_info *info, nir_shader *gs)
/* Try to pattern match a strip topology */
if (is_strip_topology(ctx.topology, info->max_indices)) {
info->shape = AGX_GS_SHAPE_STATIC_PER_PRIM;
info->shape = POLY_GS_SHAPE_STATIC_PER_PRIM;
return;
}
@ -1161,7 +1161,7 @@ optimize_static_topology(struct agx_gs_info *info, nir_shader *gs)
* XXX: check if this holds with streams.
*/
if (info->max_indices >= ARRAY_SIZE(info->topology)) {
info->shape = AGX_GS_SHAPE_DYNAMIC_INDEXED;
info->shape = POLY_GS_SHAPE_DYNAMIC_INDEXED;
return;
}
@ -1170,12 +1170,12 @@ optimize_static_topology(struct agx_gs_info *info, nir_shader *gs)
info->topology[i] = ctx.topology[i];
}
info->shape = AGX_GS_SHAPE_STATIC_INDEXED;
info->shape = POLY_GS_SHAPE_STATIC_INDEXED;
}
bool
agx_nir_lower_gs(nir_shader *gs, nir_shader **gs_count, nir_shader **gs_copy,
nir_shader **pre_gs, struct agx_gs_info *info)
poly_nir_lower_gs(nir_shader *gs, nir_shader **gs_count, nir_shader **gs_copy,
nir_shader **pre_gs, struct poly_gs_info *info)
{
/* Lower I/O as assumed by the rest of GS lowering */
if (gs->xfb_info != NULL) {
@ -1212,7 +1212,7 @@ agx_nir_lower_gs(nir_shader *gs, nir_shader **gs_count, nir_shader **gs_copy,
/* Lower geometry shader writes to contain all of the required counts, so we
* know where in the various buffers we should write vertices.
*/
NIR_PASS(_, gs, agx_nir_lower_gs_intrinsics);
NIR_PASS(_, gs, lower_gs_intrinsics);
/* Clean up after all that lowering we did */
bool progress = false;
@ -1241,7 +1241,7 @@ agx_nir_lower_gs(nir_shader *gs, nir_shader **gs_count, nir_shader **gs_copy,
*/
struct lower_gs_state gs_state = {.info = info};
*info = (struct agx_gs_info){
*info = (struct poly_gs_info){
.mode = gs->info.gs.output_primitive,
.xfb = gs->xfb_info != NULL,
.shape = -1,
@ -1252,10 +1252,13 @@ agx_nir_lower_gs(nir_shader *gs, nir_shader **gs_count, nir_shader **gs_copy,
nir_gs_count_vertices_and_primitives(gs, NULL, static_indices,
gs_state.static_count, 4);
STATIC_ASSERT(ARRAY_SIZE(gs_state.count_index) ==
ARRAY_SIZE(gs_state.static_count));
/* Anything we don't know statically will be tracked by the count buffer.
* Determine the layout for it.
*/
for (unsigned i = 0; i < MAX_VERTEX_STREAMS; ++i) {
for (unsigned i = 0; i < ARRAY_SIZE(gs_state.count_index); ++i) {
gs_state.count_index[i] =
(gs_state.static_count[i] < 0) ? info->count_words++ : -1;
}
@ -1272,7 +1275,7 @@ agx_nir_lower_gs(nir_shader *gs, nir_shader **gs_count, nir_shader **gs_copy,
if (static_indices[0] >= 0) {
optimize_static_topology(info, gs);
} else {
info->shape = AGX_GS_SHAPE_DYNAMIC_INDEXED;
info->shape = POLY_GS_SHAPE_DYNAMIC_INDEXED;
}
*gs_copy = create_gs_rast_shader(gs, &gs_state);
@ -1344,20 +1347,22 @@ agx_nir_lower_gs(nir_shader *gs, nir_shader **gs_count, nir_shader **gs_copy,
/* Gather information required for transform feedback / query programs */
struct nir_xfb_info *xfb = gs->xfb_info;
struct agx_xfb_key key = {
struct poly_xfb_key key = {
.streams = gs->info.gs.active_stream_mask,
.invocations = gs->info.gs.invocations,
.vertices_per_prim = nir_verts_in_output_prim(gs),
};
for (unsigned i = 0; i < 4; ++i) {
STATIC_ASSERT(ARRAY_SIZE(key.buffer_to_stream) == ARRAY_SIZE(key.stride));
for (unsigned i = 0; i < ARRAY_SIZE(gs_state.count_index); ++i) {
key.count_index[i] = gs_state.count_index[i];
key.static_count[i] = gs_state.static_count[i];
}
if (xfb) {
key.buffers_written = xfb->buffers_written;
for (unsigned i = 0; i < 4; ++i) {
for (unsigned i = 0; i < ARRAY_SIZE(key.buffer_to_stream); ++i) {
key.buffer_to_stream[i] = xfb->buffer_to_stream[i];
key.stride[i] = xfb->buffers[i].stride;
}
@ -1409,14 +1414,13 @@ lower_vs_before_gs(nir_builder *b, nir_intrinsic_instr *intr, void *data)
nir_def *buffer, *nr_verts, *instance_id, *primitive_id;
if (b->shader->info.stage == MESA_SHADER_VERTEX) {
buffer = nir_load_vs_output_buffer_poly(b);
nr_verts =
libagx_input_vertices(b, nir_load_input_assembly_buffer_poly(b));
nr_verts = poly_input_vertices(b, nir_load_input_assembly_buffer_poly(b));
} else {
assert(b->shader->info.stage == MESA_SHADER_TESS_EVAL);
/* Instancing is unrolled during tessellation so nr_verts is ignored. */
nr_verts = nir_imm_int(b, 0);
buffer = libagx_tes_buffer(b, nir_load_tess_param_buffer_poly(b));
buffer = poly_tes_buffer(b, nir_load_tess_param_buffer_poly(b));
}
if (b->shader->info.stage == MESA_SHADER_VERTEX &&
@ -1431,7 +1435,7 @@ lower_vs_before_gs(nir_builder *b, nir_intrinsic_instr *intr, void *data)
nir_def *linear_id =
nir_iadd(b, nir_imul(b, instance_id, nr_verts), primitive_id);
nir_def *addr = libagx_vertex_output_address(
nir_def *addr = poly_vertex_output_address(
b, buffer, nir_imm_int64(b, b->shader->info.outputs_written), linear_id,
location);
@ -1444,7 +1448,7 @@ lower_vs_before_gs(nir_builder *b, nir_intrinsic_instr *intr, void *data)
}
bool
agx_nir_lower_vs_before_gs(struct nir_shader *vs)
poly_nir_lower_vs_before_gs(struct nir_shader *vs)
{
/* Lower vertex stores to memory stores */
return nir_shader_intrinsics_pass(vs, lower_vs_before_gs,

View file

@ -0,0 +1,61 @@
/*
* Copyright 2023 Alyssa Rosenzweig
* SPDX-License-Identifier: MIT
*/
#pragma once
#include <stdbool.h>
#include <stdint.h>
#include "poly/geometry.h"
#include "nir.h"
#include "shader_enums.h"
struct nir_def *poly_load_per_vertex_input(struct nir_builder *b,
nir_intrinsic_instr *intr,
struct nir_def *vertex);
nir_def *poly_nir_load_vertex_id(struct nir_builder *b, nir_def *id,
unsigned index_size_B);
bool poly_nir_lower_sw_vs(struct nir_shader *s, unsigned index_size_B);
bool poly_nir_lower_vs_before_gs(struct nir_shader *vs);
struct poly_gs_info {
/* Output primitive mode for geometry shaders */
enum mesa_prim mode;
/* Number of words per primitive in the count buffer */
unsigned count_words;
/* Per-input primitive stride of the output index buffer */
unsigned max_indices;
/* Whether the GS includes transform feedback at a compile-time level */
bool xfb;
/* Whether a prefix sum is required on the count outputs. Implies xfb */
bool prefix_sum;
/* Whether the GS writes to a stream other than stream #0 */
bool multistream;
/* Shape of the rasterization draw, named by the instance ID */
enum poly_gs_shape shape;
/* Static topology used if shape = POLY_GS_SHAPE_STATIC_INDEXED */
uint8_t topology[64];
};
bool poly_nir_lower_gs(struct nir_shader *gs, struct nir_shader **gs_count,
struct nir_shader **gs_copy, struct nir_shader **pre_gs,
struct poly_gs_info *info);
bool poly_nir_lower_tcs(struct nir_shader *tcs);
bool poly_nir_lower_tes(struct nir_shader *tes, bool to_hw_vs);
uint64_t poly_tcs_per_vertex_outputs(const struct nir_shader *nir);
unsigned poly_tcs_output_stride(const struct nir_shader *nir);
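To show how these entry points fit together, a minimal driver-side sketch follows; only the poly_nir_lower_gs() signature and the struct poly_gs_info fields are taken from the declarations above, while the wrapper function, variable names, and buffer handling are assumptions.

#include "poly/nir/poly_nir_lower_gs.h"

/* Hypothetical driver-side wrapper; only the poly_* names are from the
 * header above, everything else is illustrative. */
static void
example_lower_gs(nir_shader *gs_nir)
{
   nir_shader *gs_count = NULL, *gs_copy = NULL, *pre_gs = NULL;
   struct poly_gs_info info;

   poly_nir_lower_gs(gs_nir, &gs_count, &gs_copy, &pre_gs, &info);

   /* The driver compiles gs_nir plus the returned helper shaders and uses
    * info.count_words / info.max_indices to size the count and output
    * index buffers for the draw.
    */
   if (info.shape == POLY_GS_SHAPE_STATIC_INDEXED) {
      /* info.topology[] can be uploaded once as a static index buffer. */
   }
}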

View file

@ -4,25 +4,30 @@
*/
#include "compiler/nir/nir_builder.h"
#include "libagx/geometry.h"
#include "libagx/libagx.h"
#include "agx_nir_lower_gs.h"
#include "poly/cl/libpoly.h"
#include "poly/geometry.h"
#include "nir.h"
/* XXX: Remove me later */
nir_def *poly_nir_load_vertex_id(struct nir_builder *b, nir_def *id,
unsigned index_size_B);
bool poly_nir_lower_sw_vs(struct nir_shader *s, unsigned index_size_B);
/*
* This file implements basic input assembly in software. It runs on software
* vertex shaders, as part of geometry/tessellation lowering. It does not apply
* the topology, which happens in the geometry shader.
*/
nir_def *
agx_nir_load_vertex_id(nir_builder *b, nir_def *id, unsigned index_size_B)
poly_nir_load_vertex_id(nir_builder *b, nir_def *id, unsigned index_size_B)
{
/* If drawing with an index buffer, pull the vertex ID. Otherwise, the
* vertex ID is just the index as-is.
*/
if (index_size_B) {
nir_def *ia = nir_load_input_assembly_buffer_poly(b);
id = libagx_load_index_buffer(b, ia, id, nir_imm_int(b, index_size_B));
id = poly_load_index_buffer(b, ia, id, nir_imm_int(b, index_size_B));
}
/* Add the "start", either an index bias or a base vertex. This must happen
@ -39,7 +44,8 @@ lower(nir_builder *b, nir_intrinsic_instr *intr, void *data)
if (intr->intrinsic == nir_intrinsic_load_vertex_id) {
nir_def *id = nir_channel(b, nir_load_global_invocation_id(b, 32), 0);
nir_def_replace(&intr->def, agx_nir_load_vertex_id(b, id, *index_size_B));
nir_def_replace(&intr->def,
poly_nir_load_vertex_id(b, id, *index_size_B));
return true;
} else if (intr->intrinsic == nir_intrinsic_load_instance_id) {
nir_def_replace(&intr->def,
@ -51,7 +57,7 @@ lower(nir_builder *b, nir_intrinsic_instr *intr, void *data)
}
bool
agx_nir_lower_sw_vs(nir_shader *s, unsigned index_size_B)
poly_nir_lower_sw_vs(nir_shader *s, unsigned index_size_B)
{
return nir_shader_intrinsics_pass(s, lower, nir_metadata_control_flow,
&index_size_B);
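A rough sketch of where this pass might be invoked from a driver; only poly_nir_lower_sw_vs() itself comes from this file, the surrounding call site is an assumption.

#include "poly/nir/poly_nir_lower_gs.h"

/* Hypothetical driver-side call; index_size_B is 0 for non-indexed draws,
 * otherwise the index size in bytes (1, 2 or 4), matching the pass above. */
static void
example_lower_sw_vs(nir_shader *vs_nir, unsigned index_size_B)
{
   NIR_PASS(_, vs_nir, poly_nir_lower_sw_vs, index_size_B);
}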

View file

@ -3,11 +3,11 @@
* SPDX-License-Identifier: MIT
*/
#include "libagx/geometry.h"
#include "libagx/libagx.h"
#include "poly/cl/libpoly.h"
#include "poly/geometry.h"
#include "poly/nir/poly_nir_lower_gs.h"
#include "util/bitscan.h"
#include "util/macros.h"
#include "agx_nir_lower_gs.h"
#include "nir.h"
#include "nir_builder.h"
#include "nir_builder_opcodes.h"
@ -18,12 +18,12 @@
static nir_def *
tcs_unrolled_id(nir_builder *b)
{
return libagx_tcs_unrolled_id(b, nir_load_tess_param_buffer_poly(b),
nir_load_workgroup_id(b));
return poly_tcs_unrolled_id(b, nir_load_tess_param_buffer_poly(b),
nir_load_workgroup_id(b));
}
uint64_t
agx_tcs_per_vertex_outputs(const nir_shader *nir)
poly_tcs_per_vertex_outputs(const nir_shader *nir)
{
return nir->info.outputs_written &
~(VARYING_BIT_TESS_LEVEL_INNER | VARYING_BIT_TESS_LEVEL_OUTER |
@ -31,11 +31,11 @@ agx_tcs_per_vertex_outputs(const nir_shader *nir)
}
unsigned
agx_tcs_output_stride(const nir_shader *nir)
poly_tcs_output_stride(const nir_shader *nir)
{
return libagx_tcs_out_stride(util_last_bit(nir->info.patch_outputs_written),
nir->info.tess.tcs_vertices_out,
agx_tcs_per_vertex_outputs(nir));
return poly_tcs_out_stride(util_last_bit(nir->info.patch_outputs_written),
nir->info.tess.tcs_vertices_out,
poly_tcs_per_vertex_outputs(nir));
}
static nir_def *
@ -44,12 +44,12 @@ tcs_out_addr(nir_builder *b, nir_intrinsic_instr *intr, nir_def *vertex_id)
nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
nir_def *offset = nir_get_io_offset_src(intr)->ssa;
nir_def *addr = libagx_tcs_out_address(
nir_def *addr = poly_tcs_out_address(
b, nir_load_tess_param_buffer_poly(b), tcs_unrolled_id(b), vertex_id,
nir_iadd_imm(b, offset, sem.location),
nir_imm_int(b, util_last_bit(b->shader->info.patch_outputs_written)),
nir_imm_int(b, b->shader->info.tess.tcs_vertices_out),
nir_imm_int64(b, agx_tcs_per_vertex_outputs(b->shader)));
nir_imm_int64(b, poly_tcs_per_vertex_outputs(b->shader)));
addr = nir_iadd_imm(b, addr, nir_intrinsic_component(intr) * 4);
@ -68,9 +68,9 @@ lower_tes_load(nir_builder *b, nir_intrinsic_instr *intr)
if (intr->intrinsic == nir_intrinsic_load_per_vertex_input)
vertex = intr->src[0].ssa;
nir_def *addr = libagx_tes_in_address(b, nir_load_tess_param_buffer_poly(b),
nir_load_vertex_id(b), vertex,
nir_iadd_imm(b, offset, location));
nir_def *addr = poly_tes_in_address(b, nir_load_tess_param_buffer_poly(b),
nir_load_vertex_id(b), vertex,
nir_iadd_imm(b, offset, location));
if (nir_intrinsic_has_component(intr))
addr = nir_iadd_imm(b, addr, nir_intrinsic_component(intr) * 4);
@ -84,10 +84,10 @@ tcs_load_input(nir_builder *b, nir_intrinsic_instr *intr)
{
nir_def *base = nir_imul(
b, tcs_unrolled_id(b),
libagx_tcs_patch_vertices_in(b, nir_load_tess_param_buffer_poly(b)));
poly_tcs_patch_vertices_in(b, nir_load_tess_param_buffer_poly(b)));
nir_def *vertex = nir_iadd(b, base, intr->src[0].ssa);
return agx_load_per_vertex_input(b, intr, vertex);
return poly_load_per_vertex_input(b, intr, vertex);
}
static nir_def *
@ -114,16 +114,15 @@ lower_tcs_impl(nir_builder *b, nir_intrinsic_instr *intr)
return tcs_load_input(b, intr);
case nir_intrinsic_load_patch_vertices_in:
return libagx_tcs_patch_vertices_in(b,
nir_load_tess_param_buffer_poly(b));
return poly_tcs_patch_vertices_in(b, nir_load_tess_param_buffer_poly(b));
case nir_intrinsic_load_tess_level_outer_default:
return libagx_tess_level_outer_default(
b, nir_load_tess_param_buffer_poly(b));
return poly_tess_level_outer_default(b,
nir_load_tess_param_buffer_poly(b));
case nir_intrinsic_load_tess_level_inner_default:
return libagx_tess_level_inner_default(
b, nir_load_tess_param_buffer_poly(b));
return poly_tess_level_inner_default(b,
nir_load_tess_param_buffer_poly(b));
case nir_intrinsic_load_output: {
nir_def *addr = tcs_out_addr(b, intr, nir_undef(b, 1, 32));
@ -176,7 +175,7 @@ lower_tcs(nir_builder *b, nir_intrinsic_instr *intr, void *data)
}
bool
agx_nir_lower_tcs(nir_shader *tcs)
poly_nir_lower_tcs(nir_shader *tcs)
{
return nir_shader_intrinsics_pass(tcs, lower_tcs, nir_metadata_control_flow,
NULL);
@ -187,12 +186,12 @@ lower_tes_impl(nir_builder *b, nir_intrinsic_instr *intr, void *data)
{
switch (intr->intrinsic) {
case nir_intrinsic_load_tess_coord_xy:
return libagx_load_tess_coord(b, nir_load_tess_param_buffer_poly(b),
nir_load_vertex_id(b));
return poly_load_tess_coord(b, nir_load_tess_param_buffer_poly(b),
nir_load_vertex_id(b));
case nir_intrinsic_load_primitive_id:
return libagx_tes_patch_id(b, nir_load_tess_param_buffer_poly(b),
nir_load_vertex_id(b));
return poly_tes_patch_id(b, nir_load_tess_param_buffer_poly(b),
nir_load_vertex_id(b));
case nir_intrinsic_load_input:
case nir_intrinsic_load_per_vertex_input:
@ -201,8 +200,7 @@ lower_tes_impl(nir_builder *b, nir_intrinsic_instr *intr, void *data)
return lower_tes_load(b, intr);
case nir_intrinsic_load_patch_vertices_in:
return libagx_tes_patch_vertices_in(b,
nir_load_tess_param_buffer_poly(b));
return poly_tes_patch_vertices_in(b, nir_load_tess_param_buffer_poly(b));
default:
return NULL;
@ -232,12 +230,12 @@ lower_tes_indexing(nir_builder *b, nir_intrinsic_instr *intr, void *data)
b->cursor = nir_before_instr(&intr->instr);
nir_def *p = nir_load_tess_param_buffer_poly(b);
nir_def *id = nir_channel(b, nir_load_global_invocation_id(b, 32), 0);
nir_def_replace(&intr->def, libagx_load_tes_index(b, p, id));
nir_def_replace(&intr->def, poly_load_tes_index(b, p, id));
return true;
}
bool
agx_nir_lower_tes(nir_shader *tes, bool to_hw_vs)
poly_nir_lower_tes(nir_shader *tes, bool to_hw_vs)
{
nir_lower_tess_coord_z(
tes, tes->info.tess._primitive_mode == TESS_PRIMITIVE_TRIANGLES);
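The tessellation lowering entry points could plausibly be wired up as in the following sketch; only the poly_nir_lower_tcs/poly_nir_lower_tes and poly_tcs_output_stride declarations are real, while the wrapper and the reading of to_hw_vs are assumptions.

#include "poly/nir/poly_nir_lower_gs.h"

/* Hypothetical driver-side wiring; to_hw_vs is presumably true when the
 * TES runs as a hardware vertex shader and false when it feeds a geometry
 * shader as a compute job. */
static void
example_lower_tess(nir_shader *tcs_nir, nir_shader *tes_nir, bool to_hw_vs)
{
   NIR_PASS(_, tcs_nir, poly_nir_lower_tcs);
   NIR_PASS(_, tes_nir, poly_nir_lower_tes, to_hw_vs);

   /* Stride reported by the common code, used by the driver when
    * allocating the TCS output buffer. */
   unsigned tcs_stride = poly_tcs_output_stride(tcs_nir);
   (void)tcs_stride;
}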

108
src/poly/tessellator.h Normal file
View file

@ -0,0 +1,108 @@
/*
* Copyright 2024 Valve Corporation
* SPDX-License-Identifier: MIT
*/
#pragma once
#include "compiler/libcl/libcl.h"
enum poly_tess_partitioning {
POLY_TESS_PARTITIONING_FRACTIONAL_ODD,
POLY_TESS_PARTITIONING_FRACTIONAL_EVEN,
POLY_TESS_PARTITIONING_INTEGER,
};
enum poly_tess_mode {
/* Do not actually tessellate, just write the index counts */
POLY_TESS_MODE_COUNT,
/* Tessellate using the count buffers to allocate indices */
POLY_TESS_MODE_WITH_COUNTS,
};
struct poly_tess_point {
uint32_t u;
uint32_t v;
};
static_assert(sizeof(struct poly_tess_point) == 8);
struct poly_tess_args {
/* Heap to allocate tessellator outputs in */
DEVICE(struct poly_heap) heap;
/* Patch coordinate buffer, indexed as:
*
* coord_allocs[patch_ID] + vertex_in_patch
*/
DEVICE(struct poly_tess_point) patch_coord_buffer;
/* Per-patch index within the heap for the tess coords, written by the
* tessellator based on the allocated memory.
*/
DEVICE(uint32_t) coord_allocs;
/* Space for output draws from the tessellator. API draw calls. */
DEVICE(uint32_t) out_draws;
/* Tessellation control shader output buffer. */
DEVICE(float) tcs_buffer;
/* Count buffer. # of indices per patch written here, then prefix summed. */
DEVICE(uint32_t) counts;
/* Allocated index buffer for all patches, if we're prefix summing counts */
DEVICE(uint32_t) index_buffer;
/* Address of the tess eval invocation counter for implementing pipeline
* statistics, if active. Zero if inactive. Incremented by tessellator.
*/
DEVICE(uint32_t) statistic;
/* When geom+tess used together, the buffer containing TES outputs (executed
* as a hardware compute shader).
*/
uint64_t tes_buffer;
/* Bitfield of TCS per-vertex outputs */
uint64_t tcs_per_vertex_outputs;
/* Default tess levels used in OpenGL when there is no TCS in the pipeline.
* Unused in Vulkan and OpenGL ES.
*/
float tess_level_outer_default[4];
float tess_level_inner_default[2];
/* Number of vertices in the input patch */
uint32_t input_patch_size;
/* Number of vertices in the TCS output patch */
uint32_t output_patch_size;
/* Number of patch constants written by TCS */
uint32_t tcs_patch_constants;
/* Number of input patches per instance of the VS/TCS */
uint32_t patches_per_instance;
/* Stride between tessellation factors in the TCS output buffer. */
uint32_t tcs_stride_el;
/* Number of patches being tessellated */
uint32_t nr_patches;
/* Partitioning and points mode. These affect per-patch setup code but not
* the hot tessellation loop so we make them dynamic to reduce tessellator
* variants.
*/
enum poly_tess_partitioning partitioning;
uint32_t points_mode;
uint32_t isolines;
/* When fed into a geometry shader, triangles should be counter-clockwise.
* The tessellator always produces clockwise triangles, but we can swap
* dynamically in the TES.
*/
uint32_t ccw;
} PACKED;
static_assert(sizeof(struct poly_tess_args) == 36 * 4);
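As a rough illustration, a driver might fill in the dynamic parts of this structure along these lines; the field names come from the struct above, while the chosen values (integer partitioning, GL default levels of 1.0) are assumptions for a simple no-TCS, triangle-domain case.

#include <stdint.h>
#include "poly/tessellator.h"

/* Hypothetical driver-side setup; only the struct layout is taken from the
 * header above. */
static void
example_fill_tess_args(struct poly_tess_args *args, uint32_t nr_patches)
{
   args->partitioning = POLY_TESS_PARTITIONING_INTEGER;
   args->points_mode = 0;
   args->isolines = 0;
   args->ccw = 0;
   args->nr_patches = nr_patches;

   /* OpenGL defaults when no TCS is in the pipeline (unused in Vulkan and
    * OpenGL ES, per the comment above). */
   for (unsigned i = 0; i < 4; ++i)
      args->tess_level_outer_default[i] = 1.0f;
   for (unsigned i = 0; i < 2; ++i)
      args->tess_level_inner_default[i] = 1.0f;
}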