From 68048759f0ae2e685eced36e83a33d8f69e35925 Mon Sep 17 00:00:00 2001
From: Aitor Camacho <aitor@lunarg.com>
Date: Sat, 18 Apr 2026 12:53:15 +0900
Subject: [PATCH] kk: Implement tessellation

Same approach as HK for tessellation. This change also handles instance_id
lowering: instance_id_includes_base_index is not taken into account by
several other passes that use the instance id. Those passes expect the
instance id to exclude the base instance, so the option is dropped and a
pass is added that subtracts the base instance instead.

Signed-off-by: Aitor Camacho <aitor@lunarg.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/41038>
---
 src/kosmickrisp/bridge/mtl_encoder.h          |   3 +
 src/kosmickrisp/bridge/mtl_encoder.m          |  10 +
 src/kosmickrisp/bridge/mtl_types.h            |   6 +
 src/kosmickrisp/bridge/stubs/mtl_encoder.c    |   6 +
 src/kosmickrisp/bridge/vk_to_mtl_map.c        |  13 +
 src/kosmickrisp/bridge/vk_to_mtl_map.h        |   2 +
 .../compiler/msl_nir_lower_common.c           |  22 +
 src/kosmickrisp/compiler/nir_to_msl.h         |   2 +-
 src/kosmickrisp/libkk/kk_geometry.cl          |  43 ++
 src/kosmickrisp/libkk/kk_tessellation.cl      |  93 +++
 src/kosmickrisp/libkk/kk_tessellator.cl       |  31 +
 src/kosmickrisp/libkk/kk_tessellator.h        |  20 +
 src/kosmickrisp/libkk/meson.build             |   3 +
 src/kosmickrisp/vulkan/kk_cmd_buffer.h        |  13 +-
 src/kosmickrisp/vulkan/kk_cmd_draw.c          | 600 ++++++++++++++----
 .../vulkan/kk_nir_lower_descriptors.c         |  36 +-
 src/kosmickrisp/vulkan/kk_nir_lower_vbo.c     |  26 +-
 src/kosmickrisp/vulkan/kk_physical_device.c   |   1 +
 src/kosmickrisp/vulkan/kk_shader.c            |  10 +-
 src/kosmickrisp/vulkan/kk_shader.h            |   7 +-
 20 files changed, 797 insertions(+), 150 deletions(-)
 create mode 100644 src/kosmickrisp/libkk/kk_geometry.cl
 create mode 100644 src/kosmickrisp/libkk/kk_tessellation.cl
 create mode 100644 src/kosmickrisp/libkk/kk_tessellator.cl
 create mode 100644 src/kosmickrisp/libkk/kk_tessellator.h

diff --git a/src/kosmickrisp/bridge/mtl_encoder.h b/src/kosmickrisp/bridge/mtl_encoder.h
index 21dc2223d78..beed5d5fbc5 100644
--- a/src/kosmickrisp/bridge/mtl_encoder.h
+++ b/src/kosmickrisp/bridge/mtl_encoder.h
@@ -61,6 +61,9 @@ void mtl_dispatch_threadgroups_with_indirect_buffer(
    mtl_compute_encoder *encoder, mtl_buffer *buffer, uint32_t offset,
    struct mtl_size local_size);
 
+void mtl_memory_barrier_with_scope(mtl_compute_encoder *encoder,
+                                   enum mtl_barrier_scope scope);
+
 /* MTLRenderEncoder */
 mtl_render_encoder *mtl_new_render_command_encoder_with_descriptor(
    mtl_command_buffer *command_buffer, mtl_render_pass_descriptor *descriptor);
diff --git a/src/kosmickrisp/bridge/mtl_encoder.m b/src/kosmickrisp/bridge/mtl_encoder.m
index 1561e7d66c4..dd5037ba84f 100644
--- a/src/kosmickrisp/bridge/mtl_encoder.m
+++ b/src/kosmickrisp/bridge/mtl_encoder.m
@@ -234,6 +234,16 @@ mtl_dispatch_threadgroups_with_indirect_buffer(mtl_compute_encoder *encoder,
    }
 }
 
+void
+mtl_memory_barrier_with_scope(mtl_compute_encoder *encoder,
+                              enum mtl_barrier_scope scope)
+{
+   @autoreleasepool {
+      id<MTLComputeCommandEncoder> enc = (id<MTLComputeCommandEncoder>)encoder;
+      [enc memoryBarrierWithScope:(MTLBarrierScope)scope];
+   }
+}
+
 /* MTLRenderEncoder */
 
 /* Encoder commands */
diff --git a/src/kosmickrisp/bridge/mtl_types.h b/src/kosmickrisp/bridge/mtl_types.h
index aa4eedf7760..ee76cfdcc16 100644
--- a/src/kosmickrisp/bridge/mtl_types.h
+++ b/src/kosmickrisp/bridge/mtl_types.h
@@ -220,6 +220,12 @@ enum mtl_depth_clip_mode {
    MTL_DEPTH_CLIP_MODE_CLAMP = 1,
 };
 
+enum mtl_barrier_scope { /* Bit values are cast as-is to MTLBarrierScope */
+   MTL_BARRIER_SCOPE_BUFFERS = 1 << 0,
+   MTL_BARRIER_SCOPE_TEXTURES = 1 << 1,
+   MTL_BARRIER_SCOPE_RENDER_TARGETS = 1 << 2,
+};
+
 /** STRUCTURES */
 struct mtl_range {
    size_t offset;
diff --git a/src/kosmickrisp/bridge/stubs/mtl_encoder.c b/src/kosmickrisp/bridge/stubs/mtl_encoder.c
index cbf6e7ea2d0..931733ebbbf 100644
--- a/src/kosmickrisp/bridge/stubs/mtl_encoder.c
+++ b/src/kosmickrisp/bridge/stubs/mtl_encoder.c
@@ -59,6 +59,12 @@ mtl_copy_from_texture_to_texture(mtl_blit_encoder *blit_enc_handle,
 {
 }
 
+void
+mtl_memory_barrier_with_scope(mtl_compute_encoder *encoder,
+                              enum mtl_barrier_scope scope)
+{
+}
+
 /* MTLComputeEncoder */
 mtl_compute_encoder *
 mtl_new_compute_command_encoder(mtl_command_buffer *cmd_buffer)
diff --git a/src/kosmickrisp/bridge/vk_to_mtl_map.c b/src/kosmickrisp/bridge/vk_to_mtl_map.c
index 1299ee76f84..068ea721ef0 100644
--- a/src/kosmickrisp/bridge/vk_to_mtl_map.c
+++ b/src/kosmickrisp/bridge/vk_to_mtl_map.c
@@ -250,3 +250,16 @@ index_size_in_bytes_to_mtl_index_type(unsigned bytes)
       UNREACHABLE("Unsupported byte size for index");
    }
 }
+
+unsigned
+mtl_index_type_to_size_B(enum mtl_index_type type)
+{
+   switch (type) {
+   case MTL_INDEX_TYPE_UINT16:
+      return 2u;
+   case MTL_INDEX_TYPE_UINT32:
+      return 4u;
+   default:
+      UNREACHABLE("Unhandled index type");
+   }
+}
diff --git a/src/kosmickrisp/bridge/vk_to_mtl_map.h b/src/kosmickrisp/bridge/vk_to_mtl_map.h
index 151f64bc671..d80e790cf1a 100644
--- a/src/kosmickrisp/bridge/vk_to_mtl_map.h
+++ b/src/kosmickrisp/bridge/vk_to_mtl_map.h
@@ -76,4 +76,6 @@ enum mtl_cull_mode vk_front_face_to_mtl_cull_mode(enum VkCullModeFlagBits mode);
 
 enum mtl_index_type index_size_in_bytes_to_mtl_index_type(unsigned bytes);
 
+unsigned mtl_index_type_to_size_B(enum mtl_index_type type);
+
 #endif /* KK_MTL_TO_VK_MAP_H */
diff --git a/src/kosmickrisp/compiler/msl_nir_lower_common.c b/src/kosmickrisp/compiler/msl_nir_lower_common.c
index 2829c338872..a756ac947d6 100644
--- a/src/kosmickrisp/compiler/msl_nir_lower_common.c
+++ b/src/kosmickrisp/compiler/msl_nir_lower_common.c
@@ -574,3 +574,25 @@ msl_nir_lower_clip_cull_distance(nir_shader *nir, unsigned num_cull_distances)
    else
       NIR_PASS(_, nir, msl_nir_lower_clip_cull_distance_vs);
 }
+
+/* Rewrites the uses of load_instance_id to (instance_id - base_instance) and
+ * marks base_instance as a system value read by the shader. */
+static bool
+lower_instance_id(nir_builder *b, nir_intrinsic_instr *intr, UNUSED void *data)
+{
+   if (intr->intrinsic != nir_intrinsic_load_instance_id)
+      return false;
+   b->cursor = nir_after_instr(&intr->instr);
+   nir_def *rebased = nir_isub(b, &intr->def, nir_load_base_instance(b));
+   nir_def_rewrite_uses_after(&intr->def, rebased);
+   BITSET_SET(b->shader->info.system_values_read, SYSTEM_VALUE_BASE_INSTANCE);
+   return true;
+}
+
+/* Runs lower_instance_id over every intrinsic in the shader. */
+bool
+msl_nir_lower_instance_id(nir_shader *nir)
+{
+   return nir_shader_intrinsics_pass(nir, lower_instance_id,
+                                     nir_metadata_control_flow, NULL);
+}
diff --git a/src/kosmickrisp/compiler/nir_to_msl.h b/src/kosmickrisp/compiler/nir_to_msl.h
index c2a6d6647a0..2d5f19c0a79 100644
--- a/src/kosmickrisp/compiler/nir_to_msl.h
+++ b/src/kosmickrisp/compiler/nir_to_msl.h
@@ -79,6 +79,7 @@ bool msl_nir_fake_guard_for_discards(struct nir_shader *nir);
 bool msl_nir_lower_sample_shading(nir_shader *nir);
 void msl_nir_lower_clip_cull_distance(nir_shader *nir,
                                       unsigned num_cull_distances);
+bool msl_nir_lower_instance_id(nir_shader *nir);
 
 bool msl_gather_uses_per_draw_data(nir_shader *nir);
 
@@ -94,7 +95,6 @@ static const nir_shader_compiler_options kk_nir_options = {
    .lower_insert_byte = true,
    .lower_fmod = true,
    .discard_is_demote = true,
-   .instance_id_includes_base_index = true,
    .lower_device_index_to_zero = true,
    .lower_pack_64_2x32_split = true,
    .lower_unpack_64_2x32_split = true,
diff --git a/src/kosmickrisp/libkk/kk_geometry.cl b/src/kosmickrisp/libkk/kk_geometry.cl
new file mode 100644
index 00000000000..c54e0059e0e
--- /dev/null
+++ b/src/kosmickrisp/libkk/kk_geometry.cl
@@ -0,0 +1,43 @@
+/*
+ * Copyright 2026 LunarG, Inc.
+ * Copyright 2026 Google LLC
+ * Copyright 2023 Alyssa Rosenzweig
+ * Copyright 2023 Valve Corporation
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "compiler/libcl/libcl_vk.h"
+#include "poly/geometry.h"
+#include "poly/tessellator.h"
+
+/* Local invocation 0 only: makes p->counts an inclusive prefix sum, allocates
+ * the index buffer from p->heap and writes one draw to p->out_draws. */
+KERNEL(1)
+libkk_prefix_sum_tess(global struct poly_tess_params *p)
+{
+   if (cl_local_id.x != 0)
+      return;
+
+   /* The last element of an inclusive prefix sum is the total sum */
+   uint running = 0;
+
+   for (uint32_t patch = 0u; patch < p->nr_patches; ++patch) {
+      running += p->counts[patch];
+      p->counts[patch] = running;
+   }
+
+   /* Allocate 4-byte indices */
+   const uint32_t index_size_B = sizeof(uint32_t);
+   uint heap_offs_B = poly_heap_alloc_offs(p->heap, running * index_size_B);
+   p->index_buffer =
+      (global uint32_t *)(((uintptr_t)p->heap->base) + heap_offs_B);
+
+   /* ...and now we can generate the API indexed draw */
+   global uint32_t *draw = p->out_draws;
+
+   draw[0] = running;                    /* count */
+   draw[1] = 1;                          /* instance_count */
+   draw[2] = heap_offs_B / index_size_B; /* start */
+   draw[3] = 0;                          /* index_bias */
+   draw[4] = 0;                          /* start_instance */
+}
diff --git a/src/kosmickrisp/libkk/kk_tessellation.cl b/src/kosmickrisp/libkk/kk_tessellation.cl
new file mode 100644
index 00000000000..bb2d06e1695
--- /dev/null
+++ b/src/kosmickrisp/libkk/kk_tessellation.cl
@@ -0,0 +1,93 @@
+/*
+ * Copyright 2026 LunarG, Inc.
+ * Copyright 2026 Google LLC
+ * Copyright 2023 Alyssa Rosenzweig
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "poly/geometry.h"
+#include "poly/tessellator.h"
+
+/* GPU-side setup of an indirect tessellated draw: carves the draw's buffers
+ * out of p->heap and writes the VS/TCS/tessellator grid sizes into grids.
+ */
+KERNEL(1)
+libkk_tess_setup_indirect(
+   global struct poly_tess_params *p,
+   global uint32_t *grids /* output: VS then TCS then tess */,
+   global struct poly_vertex_params *vp /* output */, global uint32_t *indirect,
+   uint64_t in_index_buffer, uint32_t in_index_buffer_range_el,
+   uint32_t in_index_size_B, uint64_t vertex_outputs /* bitfield */,
+
+   /* Tess control invocation counter if active, else zero */
+   global uint32_t *tcs_statistic)
+{
+   uint count = indirect[0], instance_count = indirect[1];
+   unsigned in_patches = count / p->input_patch_size;
+   unsigned unrolled_patches = in_patches * instance_count;
+
+   /* TCS invocation counter increments once per-patch */
+   if (tcs_statistic)
+      *tcs_statistic += in_patches;
+
+   uint32_t alloc = 0;
+   uint32_t tcs_out_offs = alloc;
+   alloc += unrolled_patches * p->tcs_stride_el * 4;
+
+   uint32_t patch_coord_offs = alloc;
+   alloc += unrolled_patches * 4;
+
+   uint32_t count_offs = alloc;
+   alloc += unrolled_patches * sizeof(uint32_t);
+
+   uint vb_offs = alloc;
+   uint vb_size = poly_tcs_in_size(count * instance_count, vertex_outputs);
+   alloc += vb_size;
+
+   /* Allocate all patch calculations in one go */
+   global uchar *blob = poly_heap_alloc(p->heap, alloc);
+
+   p->tcs_buffer = (global float *)(blob + tcs_out_offs);
+   p->patches_per_instance = in_patches;
+   p->coord_allocs = (global uint *)(blob + patch_coord_offs);
+   p->nr_patches = unrolled_patches;
+   p->counts = (global uint32_t *)(blob + count_offs);
+
+   /* vp is treated as nullable, so every write to it sits behind the check */
+   if (vp) {
+      vp->output_buffer = (uintptr_t)(blob + vb_offs);
+      vp->outputs = vertex_outputs;
+      vp->verts_per_instance = count;
+   }
+
+   /* If indexing is enabled, the third word is the offset into the index buffer
+    * in elements. Apply that offset now that we have it. For a hardware
+    * indirect draw, the hardware would do this for us, but for software input
+    * assembly we need to do it ourselves.
+    *
+    * XXX: Deduplicate?
+    */
+   if (vp && in_index_size_B) {
+      /* TODO_KOSMICKRISP Use poly_index_buffer and implement
+       * load_ro_sink_address_poly */
+      vp->index_buffer = in_index_buffer + (indirect[2] * in_index_size_B);
+
+      vp->index_buffer_range_el =
+         poly_index_buffer_range_el(in_index_buffer_range_el, indirect[2]);
+   }
+
+   /* VS grid size */
+   grids[0] = count;
+   grids[1] = instance_count;
+   grids[2] = 1;
+
+   /* TCS grid size */
+   grids[3] = in_patches * p->output_patch_size;
+   grids[4] = instance_count;
+   grids[5] = 1;
+
+   /* Tess grid size */
+   grids[6] = unrolled_patches;
+   grids[7] = 1;
+   grids[8] = 1;
+}
diff --git a/src/kosmickrisp/libkk/kk_tessellator.cl b/src/kosmickrisp/libkk/kk_tessellator.cl
new file mode 100644
index 00000000000..fc6e8dad59a
--- /dev/null
+++ b/src/kosmickrisp/libkk/kk_tessellator.cl
@@ -0,0 +1,31 @@
+/*
+ * Copyright 2026 LunarG, Inc.
+ * Copyright 2026 Google LLC
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "poly/cl/tessellator.h"
+
+/* Runs poly_tess_isoline_process on the patch selected by the global x id. */
+KERNEL(1)
+libkk_tess_isoline(constant struct poly_tess_params *p,
+                   enum poly_tess_mode tess_mode)
+{
+   poly_tess_isoline_process(p, cl_global_id.x, tess_mode);
+}
+
+/* Runs poly_tess_tri_process on the patch selected by the global x id. */
+KERNEL(1)
+libkk_tess_tri(constant struct poly_tess_params *p,
+               enum poly_tess_mode tess_mode)
+{
+   poly_tess_tri_process(p, cl_global_id.x, tess_mode);
+}
+
+/* Runs poly_tess_quad_process on the patch selected by the global x id. */
+KERNEL(1)
+libkk_tess_quad(constant struct poly_tess_params *p,
+                enum poly_tess_mode tess_mode)
+{
+   poly_tess_quad_process(p, cl_global_id.x, tess_mode);
+}
diff --git a/src/kosmickrisp/libkk/kk_tessellator.h b/src/kosmickrisp/libkk/kk_tessellator.h
new file mode 100644
index 00000000000..5d687f40a3a
--- /dev/null
+++ b/src/kosmickrisp/libkk/kk_tessellator.h
@@ -0,0 +1,20 @@
+/*
+ * Copyright 2026 LunarG, Inc.
+ * Copyright 2026 Google LLC
+ * Copyright 2024 Valve Corporation
+ * SPDX-License-Identifier: MIT
+ */
+
+#pragma once
+
+#include "poly/tessellator.h"
+
+#define libkk_tessellate(context, grid, barrier, prim, mode, state)            \
+   if (prim == TESS_PRIMITIVE_QUADS) {                                         \
+      libkk_tess_quad(context, grid, barrier, state, mode);                    \
+   } else if (prim == TESS_PRIMITIVE_TRIANGLES) {                              \
+      libkk_tess_tri(context, grid, barrier, state, mode);                     \
+   } else {                                                                    \
+      assert(prim == TESS_PRIMITIVE_ISOLINES);                                 \
+      libkk_tess_isoline(context, grid, barrier, state, mode);                 \
+   }
diff --git a/src/kosmickrisp/libkk/meson.build b/src/kosmickrisp/libkk/meson.build
index fe4d3c4d907..04e17525255 100644
--- a/src/kosmickrisp/libkk/meson.build
+++ b/src/kosmickrisp/libkk/meson.build
@@ -4,7 +4,10 @@
 
 libkk_shader_files = files(
   'kk_draws.cl',
+  'kk_geometry.cl',
   'kk_query.cl',
+  'kk_tessellation.cl',
+  'kk_tessellator.cl',
 )
 
 libkk_spv = custom_target(
diff --git a/src/kosmickrisp/vulkan/kk_cmd_buffer.h b/src/kosmickrisp/vulkan/kk_cmd_buffer.h
index de7f1783ea0..c14acbc87ae 100644
--- a/src/kosmickrisp/vulkan/kk_cmd_buffer.h
+++ b/src/kosmickrisp/vulkan/kk_cmd_buffer.h
@@ -48,8 +48,9 @@ struct kk_root_descriptor_table {
 
          float blend_constant[4];
          float clip_z_coeff;
-         uint32_t base_vertex;
          uint32_t index_size;
+         uint64_t base_vertex_addr;
+         uint64_t base_instance_addr;
       } draw;
       struct {
          uint32_t base_group[3];
@@ -158,6 +159,16 @@ struct kk_graphics_state {
       mtl_buffer *handles[KK_MAX_VBUFS];
    } vb;
 
+   /* Tessellation state */
+   struct {
+      /* Grid buffer for when the draw is indirect */
+      struct kk_ptr indirect_ptr;
+      mtl_buffer *out_draws_buffer;
+      uint64_t out_draws_offset;
+      struct kk_tess_info info;
+      enum mesa_prim prim;
+   } tess;
+
    /* Needed by vk_command_buffer::dynamic_graphics_state */
    struct vk_vertex_input_state _dynamic_vi;
    struct vk_sample_locations_state _dynamic_sl;
diff --git a/src/kosmickrisp/vulkan/kk_cmd_draw.c b/src/kosmickrisp/vulkan/kk_cmd_draw.c
index 558beb48bf4..50b357d5ddf 100644
--- a/src/kosmickrisp/vulkan/kk_cmd_draw.c
+++ b/src/kosmickrisp/vulkan/kk_cmd_draw.c
@@ -19,7 +19,10 @@
 #include "kosmickrisp/bridge/mtl_bridge.h"
 #include "kosmickrisp/bridge/vk_to_mtl_map.h"
 
+#include "kosmickrisp/libkk/kk_tessellator.h"
+
 #include "poly/geometry.h"
+#include "poly/tessellator.h"
 
 #include "vulkan/runtime/vk_render_pass.h"
 #include "vulkan/util/vk_format.h"
@@ -782,6 +785,255 @@ kk_flush_pipeline(struct kk_cmd_buffer *cmd)
       if (gfx->depth_stencil_state)
          mtl_set_depth_stencil_state(enc, gfx->depth_stencil_state);
    }
+
+   /* Merge tess info before GS construction since that depends on
+    * gfx->tess.prim
+    */
+   if ((IS_SHADER_DIRTY(TESS_CTRL) || IS_SHADER_DIRTY(TESS_EVAL)) &&
+       cmd->state.shaders[MESA_SHADER_TESS_CTRL]) {
+      struct kk_shader *tesc = cmd->state.shaders[MESA_SHADER_TESS_CTRL];
+      struct kk_shader *tese = cmd->state.shaders[MESA_SHADER_TESS_EVAL];
+
+      gfx->tess.info =
+         kk_tess_info_merge(tese->info.tess.info, tesc->info.tess.info);
+
+      /* Determine primitive based on the merged state */
+      if (gfx->tess.info.points) {
+         gfx->tess.prim = MESA_PRIM_POINTS;
+      } else if (gfx->tess.info.mode == TESS_PRIMITIVE_ISOLINES) {
+         gfx->tess.prim = MESA_PRIM_LINES;
+      } else {
+         gfx->tess.prim = MESA_PRIM_TRIANGLES;
+      }
+   }
+}
+
+/* One-time setup of the device heap: allocates a 128 MiB BO and writes the
+ * poly_heap header at its start; the usable range begins right after it. */
+static void
+kk_init_heap(const void *data)
+{
+   struct kk_device *dev = kk_cmd_buffer_device((struct kk_cmd_buffer *)data);
+   const size_t heap_size_B = 128 * 1024 * 1024;
+
+   kk_alloc_bo(dev, &dev->vk.base, heap_size_B, 0, &dev->heap);
+
+   /* TODO_KOSMICKRISP Self-contained until we have rodata at the device. */
+   struct poly_heap *header = (struct poly_heap *)dev->heap->cpu;
+   *header = (struct poly_heap){
+      .base = dev->heap->gpu + sizeof(struct poly_heap),
+      .size = heap_size_B - sizeof(struct poly_heap),
+   };
+}
+
+/* Returns the GPU address of the device heap, creating it on first use. The
+ * first call per command buffer also zeroes poly_heap::bottom via kk_cmd_write.
+ */
+static uint64_t
+kk_heap(struct kk_cmd_buffer *cmd)
+{
+   struct kk_device *dev = kk_cmd_buffer_device(cmd);
+
+   util_call_once_data(&dev->heap_init_once, kk_init_heap, cmd);
+   const uint64_t heap_addr = dev->heap->gpu;
+
+   /* We need to free all allocations after each command buffer execution */
+   if (!cmd->uses_heap) {
+      /* Zeroing the allocated index frees everything */
+      kk_cmd_write(cmd, (struct libkk_imm_write){
+                           heap_addr + offsetof(struct poly_heap, bottom), 0});
+      cmd->uses_heap = true;
+   }
+   return heap_addr;
+}
+
+enum kk_predicate_op : uint16_t {
+   /* value > draw_id */
+   KK_PREDICATE_GT_DRAW_ID,
+   /* value == 0 */
+   KK_PREDICATE_EQ_ZERO,
+   /* value != 0 */
+   KK_PREDICATE_NEQ_ZERO,
+};
+
+struct kk_draw_command {
+   enum mesa_prim prim;
+   /* Mask of stages that need per-draw data uploaded */
+   uint32_t upload_mask;
+   mtl_buffer *index_buffer;
+   uint64_t index_buffer_offset;
+   uint64_t index_buffer_range_B;
+   uint64_t index_buffer_size_B;
+   uint32_t restart_index;
+   uint8_t index_buffer_el_size_B;
+   bool indirect;
+   bool indexed;
+   bool restart;
+   uint32_t predicate_count;
+   enum kk_predicate_op predicate_op[2];
+   uint32_t draw_count;
+   uint32_t pad_;
+   uint64_t predicate_addr[2];
+
+   union {
+      struct {
+         mtl_buffer *buffer;
+         uint64_t offset;
+         uint32_t stride;
+      } indirect_command;
+      /* These arrays will be >1 when draw_count is >1 as this struct is
+       * dynamically allocated. */
+      VkDrawIndirectCommand draws[1];
+      VkDrawIndexedIndirectCommand indexed_draws[1];
+   };
+};
+static_assert(sizeof(struct kk_draw_command) == 104u, "Packed struct");
+
+struct kk_draw_data {
+   /* For non-indirect, 0 is vertex/index count, 1 instance count and 2 first
+    * instance */
+   struct kk_grid grid;
+   struct {
+      mtl_buffer *buffer; /* NULL when the draw is not indexed */
+      uint64_t offset;    /* bytes, added to the buffer's GPU address */
+      uint64_t range;     /* bytes; divided by the index size for elements */
+      enum mtl_index_type type;
+   } index;
+   uint32_t vertex_offset;
+   enum mtl_primitive_type primitive_type;
+};
+
+/* Fills a poly_vertex_params for this draw and uploads it, returning the GPU
+ * address of the upload. Direct draws also get their vertex output buffer
+ * here. The VS output mask is mirrored into the root descriptor table. */
+static uint64_t
+kk_upload_vertex_params(struct kk_cmd_buffer *cmd, struct kk_draw_data data)
+{
+   struct kk_descriptor_state *descriptors = &cmd->state.gfx.descriptors;
+   struct kk_shader *vs = cmd->state.shaders[MESA_SHADER_VERTEX];
+   const uint32_t workgroup[3] = {1, 1, 1};
+   const bool is_indirect = kk_grid_is_indirect(data.grid);
+
+   struct poly_vertex_params params;
+   poly_vertex_params_init(&params, 0, workgroup);
+
+   /* XXX: We should deduplicate this logic */
+   if (!is_indirect)
+      poly_vertex_params_set_draw(&params, data.grid.size.x, data.grid.size.y);
+
+   if (data.index.buffer) {
+      const unsigned index_size_B = mtl_index_type_to_size_B(data.index.type);
+
+      params.index_buffer =
+         mtl_buffer_get_gpu_address(data.index.buffer) + data.index.offset;
+      params.index_buffer_range_el = data.index.range / index_size_B;
+   }
+
+   params.outputs = vs->info.vs.outputs_written;
+
+   if (!is_indirect) {
+      uint32_t verts = data.grid.size.x, instances = data.grid.size.y;
+      unsigned vb_size =
+         poly_tcs_in_size(verts * instances, vs->info.vs.outputs_written);
+
+      /* Allocate if there are any outputs, otherwise store a zero address.
+       * Reads through it are undefined but should not fault. Affects:
+       *
+       *    dEQP-VK.pipeline.monolithic.no_position.explicit_declarations.basic.single_view.v0_g1
+       *
+       * NOTE(review): address 0 is used here, not a dedicated null sink;
+       * confirm that such reads cannot fault.
+       */
+      params.output_buffer = vb_size ? kk_pool_alloc(cmd, vb_size, 4).gpu : 0u;
+   }
+
+   descriptors->root.draw.vertex_outputs = params.outputs;
+
+   return kk_pool_upload(cmd, &params, sizeof(params), 8).gpu;
+}
+
+static void
+kk_upload_tess_params(struct kk_cmd_buffer *cmd, struct poly_tess_params *out,
+                      struct kk_draw_data draw)
+{
+   struct kk_device *dev = kk_cmd_buffer_device(cmd);
+   struct vk_dynamic_graphics_state *dyn = &cmd->vk.dynamic_graphics_state;
+   struct kk_graphics_state *gfx = &cmd->state.gfx;
+   struct kk_shader *tcs = cmd->state.shaders[MESA_SHADER_TESS_CTRL];
+
+   enum poly_tess_partitioning partitioning =
+      gfx->tess.info.spacing == TESS_SPACING_EQUAL
+         ? POLY_TESS_PARTITIONING_INTEGER
+      : gfx->tess.info.spacing == TESS_SPACING_FRACTIONAL_ODD
+         ? POLY_TESS_PARTITIONING_FRACTIONAL_ODD
+         : POLY_TESS_PARTITIONING_FRACTIONAL_EVEN;
+
+   struct poly_tess_params args = {
+      .heap = kk_heap(cmd),
+      .tcs_stride_el = tcs->info.tess.tcs_output_stride / 4,
+      .statistic = 0u,
+      .input_patch_size = dyn->ts.patch_control_points,
+      .output_patch_size = tcs->info.tess.tcs_output_patch_size,
+      .tcs_patch_constants = tcs->info.tess.tcs_nr_patch_outputs,
+      .tcs_per_vertex_outputs = tcs->info.tess.tcs_per_vertex_outputs,
+      .partitioning = partitioning,
+      .points_mode = gfx->tess.info.points,
+      .isolines = gfx->tess.info.mode == TESS_PRIMITIVE_ISOLINES,
+   };
+
+   if (!args.points_mode && gfx->tess.info.mode != TESS_PRIMITIVE_ISOLINES) {
+      args.ccw = gfx->tess.info.ccw;
+      args.ccw ^=
+         dyn->ts.domain_origin == VK_TESSELLATION_DOMAIN_ORIGIN_LOWER_LEFT;
+   }
+
+   uint32_t draw_stride_el = 5; /* dwords per output draw */
+   size_t draw_stride_B = draw_stride_el * sizeof(uint32_t);
+
+   /* dev->heap was created by the kk_heap() call in the initializer above */
+   /* TODO_KOSMICKRISP Self-contained until we have rodata at the device. */
+   args.patch_coord_buffer = dev->heap->gpu + sizeof(struct poly_heap);
+
+   if (!kk_grid_is_indirect(draw.grid)) {
+      unsigned in_patches = draw.grid.size.x / args.input_patch_size;
+      unsigned unrolled_patches = in_patches * draw.grid.size.y;
+
+      uint32_t alloc = 0;
+      uint32_t tcs_out_offs = alloc;
+      alloc += unrolled_patches * args.tcs_stride_el * sizeof(uint32_t);
+
+      uint32_t patch_coord_offs = alloc;
+      alloc += unrolled_patches * sizeof(uint32_t);
+
+      uint32_t count_offs = alloc;
+      alloc += unrolled_patches * sizeof(uint32_t);
+
+      /* Single API draw, stored after the per-patch arrays */
+      uint32_t draw_offs = alloc;
+      alloc += draw_stride_B;
+
+      struct kk_ptr ptr = kk_pool_alloc(cmd, alloc, 4);
+      gfx->tess.out_draws_buffer = ptr.buffer;
+      gfx->tess.out_draws_offset = ptr.offset + draw_offs;
+      uint64_t addr = ptr.gpu;
+      args.tcs_buffer = addr + tcs_out_offs;
+      args.patches_per_instance = in_patches;
+      args.coord_allocs = addr + patch_coord_offs;
+      args.nr_patches = unrolled_patches;
+      args.out_draws = addr + draw_offs;
+      args.counts = addr + count_offs;
+   } else {
+      /* Allocate 3 indirect grids of 3 dwords each for VS/TCS/tess */
+      uint32_t grid_stride = sizeof(uint32_t) * 3;
+      gfx->tess.indirect_ptr = kk_pool_alloc(cmd, grid_stride * 3, 4);
+
+      struct kk_ptr ptr = kk_pool_alloc(cmd, draw_stride_B, 4);
+      gfx->tess.out_draws_buffer = ptr.buffer;
+      gfx->tess.out_draws_offset = ptr.offset;
+      args.out_draws = ptr.gpu;
+   }
+
+   memcpy(out, &args, sizeof(args));
 }
 
 static void
@@ -835,10 +1087,17 @@ kk_flush_dynamic_state(struct kk_cmd_buffer *cmd)
       desc->root_dirty = true;
    }
 
-   if (IS_DIRTY(RS_FRONT_FACE)) {
-      mtl_set_front_face_winding(
-         enc, vk_front_face_to_mtl_winding(
-                 cmd->vk.dynamic_graphics_state.rs.front_face));
+   if (IS_DIRTY(RS_FRONT_FACE) || IS_DIRTY(TS_DOMAIN_ORIGIN) ||
+       IS_SHADER_DIRTY(TESS_CTRL) || IS_SHADER_DIRTY(TESS_EVAL)) {
+      bool front_face_ccw = dyn->rs.front_face != VK_FRONT_FACE_CLOCKWISE;
+      if (cmd->state.shaders[MESA_SHADER_TESS_EVAL]) {
+         front_face_ccw ^= gfx->tess.info.ccw;
+         front_face_ccw ^=
+            dyn->ts.domain_origin == VK_TESSELLATION_DOMAIN_ORIGIN_LOWER_LEFT;
+      }
+      mtl_set_front_face_winding(enc, front_face_ccw
+                                         ? MTL_WINDING_COUNTER_CLOCKWISE
+                                         : MTL_WINDING_CLOCKWISE);
    }
 
    if (IS_DIRTY(RS_DEPTH_BIAS_FACTORS) || IS_DIRTY(RS_DEPTH_BIAS_ENABLE)) {
@@ -928,87 +1187,6 @@ kk_flush_gfx_state(struct kk_cmd_buffer *cmd)
 #undef IS_SHADER_DIRTY
 #undef IS_DIRTY
 
-enum kk_predicate_op : uint16_t {
-   /* value > draw_id */
-   KK_PREDICATE_GT_DRAW_ID,
-   /* value == 0 */
-   KK_PREDICATE_EQ_ZERO,
-   /* value != 0 */
-   KK_PREDICATE_NEQ_ZERO,
-};
-
-struct kk_draw_command {
-   enum mesa_prim prim;
-   /* Mask of stages that need per-draw data uploaded */
-   uint32_t upload_mask;
-   mtl_buffer *index_buffer;
-   uint64_t index_buffer_offset;
-   uint64_t index_buffer_range_B;
-   uint64_t index_buffer_size_B;
-   uint32_t restart_index;
-   uint8_t index_buffer_el_size_B;
-   bool indirect;
-   bool indexed;
-   bool restart;
-   uint32_t predicate_count;
-   enum kk_predicate_op predicate_op[2];
-   uint32_t draw_count;
-   uint32_t pad_;
-   uint64_t predicate_addr[2];
-
-   union {
-      struct {
-         mtl_buffer *buffer;
-         uint64_t offset;
-         uint32_t stride;
-      } indirect_command;
-      /* These arrays will be >1 when draw_count is >1 as this struct is
-       * dynamically allocated. */
-      VkDrawIndirectCommand draws[1];
-      VkDrawIndexedIndirectCommand indexed_draws[1];
-   };
-};
-static_assert(sizeof(struct kk_draw_command) == 104u, "Packed struct");
-
-static void
-kk_init_heap(const void *data)
-{
-   struct kk_cmd_buffer *cmd = (struct kk_cmd_buffer *)data;
-   struct kk_device *dev = kk_cmd_buffer_device(cmd);
-
-   size_t size = 128 * 1024 * 1024;
-   kk_alloc_bo(dev, &dev->vk.base, size, 0, &dev->heap);
-
-   struct poly_heap *map = (struct poly_heap *)dev->heap->cpu;
-
-   /* TODO_KOSMICKRISP Self-contained until we have rodata at the device. */
-   *map = (struct poly_heap){
-      .base = dev->heap->gpu + sizeof(struct poly_heap),
-      .size = size - sizeof(struct poly_heap),
-   };
-}
-
-static uint64_t
-kk_heap(struct kk_cmd_buffer *cmd)
-{
-   struct kk_device *dev = kk_cmd_buffer_device(cmd);
-
-   util_call_once_data(&dev->heap_init_once, kk_init_heap, cmd);
-
-   /* We need to free all allocations after each command buffer execution */
-   if (!cmd->uses_heap) {
-      uint64_t addr = dev->heap->gpu;
-
-      /* Zeroing the allocated index frees everything */
-      kk_cmd_write(cmd, (struct libkk_imm_write){
-                           addr + offsetof(struct poly_heap, bottom), 0});
-
-      cmd->uses_heap = true;
-   }
-
-   return dev->heap->gpu;
-}
-
 /* Returns true if the draw was successfully converted. */
 static bool
 kk_convert_to_indirect_draw(struct kk_cmd_buffer *cmd,
@@ -1193,19 +1371,6 @@ build_per_draw_upload_mask(struct kk_cmd_buffer *cmd)
    return mask;
 }
 
-struct kk_draw_data {
-   /* For non-indirect, 0 is vertex/index count, 1 instance count and 2 first
-    * instance */
-   struct kk_grid grid;
-   struct {
-      mtl_buffer *buffer;
-      uint64_t offset;
-      enum mtl_index_type type;
-   } index;
-   uint32_t vertex_offset;
-   enum mtl_primitive_type primitive_type;
-};
-
 static void
 kk_dispatch_draw(mtl_render_encoder *enc, struct kk_draw_data data)
 {
@@ -1225,9 +1390,12 @@ kk_dispatch_draw(mtl_render_encoder *enc, struct kk_draw_data data)
                                      data.index.offset, data.grid.size.y,
                                      data.vertex_offset, data.grid.size.z);
       } else {
-         mtl_draw_primitives(enc, data.primitive_type, data.vertex_offset,
-                             data.grid.size.x, data.grid.size.y,
-                             data.grid.size.z);
+         /* Avoid Metal validation error. Empty draws from tessellation will
+          * have values set to 0. */
+         if (data.grid.size.x != 0 && data.grid.size.y != 0)
+            mtl_draw_primitives(enc, data.primitive_type, data.vertex_offset,
+                                data.grid.size.x, data.grid.size.y,
+                                data.grid.size.z);
       }
    }
 }
@@ -1357,6 +1525,128 @@ kk_upload_per_draw_data(struct kk_cmd_buffer *cmd, uint32_t upload_mask,
    }
 }
 
+static void
+kk_dispatch_compute(mtl_compute_encoder *enc, struct kk_grid grid,
+                    struct mtl_size local_size)
+{
+   if (grid.mode != KK_GRID_DIRECT) /* Grid lives in grid.indirect */
+      mtl_dispatch_threadgroups_with_indirect_buffer(enc, grid.indirect,
+                                                     grid.offset, local_size);
+   else /* Grid is known on the CPU, pass grid.size as is */
+      mtl_dispatch_threads(enc, grid.size, local_size);
+}
+
+/* Emulate tessellation for `draw` with compute: launch the VS and TCS, then
+ * count, prefix sum and tessellate. Returns the indexed indirect draw reading
+ * gfx->tess.out_draws_buffer, or an empty draw if a direct draw has no work. */
+static struct kk_draw_data
+kk_launch_tess(struct kk_cmd_buffer *cmd, struct kk_draw_data draw,
+               uint32_t draw_id)
+{
+   struct kk_device *dev = kk_cmd_buffer_device(cmd);
+   struct kk_graphics_state *gfx = &cmd->state.gfx;
+   struct kk_grid grid_vs, grid_tcs, grid_tess;
+   struct kk_shader *vs = cmd->state.shaders[MESA_SHADER_VERTEX];
+   struct kk_shader *tcs = cmd->state.shaders[MESA_SHADER_TESS_CTRL];
+
+   struct vk_dynamic_graphics_state *dyn = &cmd->vk.dynamic_graphics_state;
+   uint32_t input_patch_size = dyn->ts.patch_control_points;
+   uint64_t state = gfx->descriptors.root.draw.tess_params;
+
+   /* Setup grids */
+   if (kk_grid_is_indirect(draw.grid)) {
+      struct libkk_tess_setup_indirect_args args = {
+         .p = state,
+         .grids = gfx->tess.indirect_ptr.gpu,
+         .indirect =
+            mtl_buffer_get_gpu_address(draw.grid.indirect) + draw.grid.offset,
+         .vp = gfx->descriptors.root.draw.vertex_params,
+         .vertex_outputs = vs->info.vs.outputs_written,
+         .tcs_statistic = 0,
+      };
+
+      if (draw.index.buffer) {
+         args.in_index_buffer =
+            mtl_buffer_get_gpu_address(draw.index.buffer) + draw.index.offset;
+         args.in_index_size_B = mtl_index_type_to_size_B(draw.index.type);
+         args.in_index_buffer_range_el =
+            draw.index.range / args.in_index_size_B;
+      }
+
+      libkk_tess_setup_indirect_struct(cmd, kk_grid_1d(1), true, args);
+
+      uint32_t grid_stride = sizeof(uint32_t) * 3;
+      grid_vs =
+         kk_grid_indirect(gfx->tess.indirect_ptr.buffer,
+                          gfx->tess.indirect_ptr.offset + 0u * grid_stride);
+      grid_tcs =
+         kk_grid_indirect(gfx->tess.indirect_ptr.buffer,
+                          gfx->tess.indirect_ptr.offset + 1u * grid_stride);
+      grid_tess =
+         kk_grid_indirect(gfx->tess.indirect_ptr.buffer,
+                          gfx->tess.indirect_ptr.offset + 2u * grid_stride);
+   } else {
+      /* Empty dispatches fail Metal validation: bail out on empty draws. */
+      if (draw.grid.size.x == 0u || draw.grid.size.y == 0u)
+         return (struct kk_draw_data){.grid = kk_grid_1d(0u)};
+      uint32_t patches = draw.grid.size.x / input_patch_size;
+      grid_vs = grid_tcs = kk_grid_2d(draw.grid.size.x, draw.grid.size.y);
+
+      grid_tcs.size.x = patches * tcs->info.tess.tcs_output_patch_size;
+      grid_tess = kk_grid_1d(patches * draw.grid.size.y);
+   }
+
+   /* First launch the VS and TCS */
+   mtl_compute_encoder *enc = kk_encoder_pre_gfx_encoder(cmd);
+   {
+      mtl_compute_pipeline_state *pipeline = vs->pipeline.gfx.pre_render[0];
+      struct mtl_size local_size = {64, 1, 1};
+      mtl_compute_set_pipeline_state(enc, pipeline);
+      mtl_compute_set_buffer(enc, gfx->descriptors.root.root_buffer.buffer,
+                             gfx->descriptors.root.root_buffer.offset, 0u);
+
+      struct kk_per_draw_data shader_data = {.draw_id = draw_id};
+      struct kk_ptr shader_data_gpu =
+         kk_pool_upload(cmd, &shader_data, sizeof(shader_data), 8u);
+      mtl_compute_set_buffer(enc, shader_data_gpu.buffer,
+                             shader_data_gpu.offset, 2);
+      kk_dispatch_compute(enc, grid_vs, local_size);
+      /* TODO_KOSMICKRISP Maybe too big of a barrier? We could definitely just
+       * barrier the buffers we know we modify. */
+      mtl_memory_barrier_with_scope(enc, MTL_BARRIER_SCOPE_BUFFERS);
+   }
+   {
+      mtl_compute_pipeline_state *pipeline = vs->pipeline.gfx.pre_render[1];
+      struct mtl_size local_size = {tcs->info.tess.tcs_output_patch_size, 1, 1};
+      /* No complete patch: skip the empty dispatch, return an empty draw. */
+      if (grid_tcs.mode == KK_GRID_DIRECT && grid_tcs.size.x == 0u)
+         return (struct kk_draw_data){.grid = kk_grid_1d(0u)};
+      mtl_compute_set_pipeline_state(enc, pipeline);
+      kk_dispatch_compute(enc, grid_tcs, local_size);
+      mtl_memory_barrier_with_scope(enc, MTL_BARRIER_SCOPE_BUFFERS);
+   }
+
+   /* First generate counts, then prefix sum them, and then tessellate. */
+   libkk_tessellate(cmd, grid_tess, true, gfx->tess.info.mode,
+                    POLY_TESS_MODE_COUNT, state);
+   mtl_memory_barrier_with_scope(enc, MTL_BARRIER_SCOPE_BUFFERS);
+
+   libkk_prefix_sum_tess(cmd, kk_grid_1d(1u), true, state);
+   mtl_memory_barrier_with_scope(enc, MTL_BARRIER_SCOPE_BUFFERS);
+
+   libkk_tessellate(cmd, grid_tess, true, gfx->tess.info.mode,
+                    POLY_TESS_MODE_WITH_COUNTS, state);
+   mtl_memory_barrier_with_scope(enc, MTL_BARRIER_SCOPE_BUFFERS);
+
+   draw.grid =
+      kk_grid_indirect(gfx->tess.out_draws_buffer, gfx->tess.out_draws_offset);
+   draw.index.buffer = dev->heap->map;
+   draw.index.offset = sizeof(struct poly_heap);
+   draw.index.type = MTL_INDEX_TYPE_UINT32;
+   draw.primitive_type = mesa_prim_to_mtl_primitive_type(gfx->tess.prim);
+   return draw;
+}
+
 /* When the current draw contains stages not present in Metal such as
  * tessellation, this step will launch required emulation when needed and build
  * the per draw data required to launch the Metal draw. */
@@ -1364,30 +1654,99 @@ static struct kk_draw_data
 build_draw_data(struct kk_cmd_buffer *cmd, struct kk_draw_command *data,
                 uint32_t draw_id)
 {
+   bool tess = cmd->state.shaders[MESA_SHADER_TESS_EVAL];
    struct kk_draw_data draw = {
       .index.buffer = data->index_buffer,
       .index.offset = data->index_buffer_offset,
       .index.type = data->indexed ? index_size_in_bytes_to_mtl_index_type(
                                        data->index_buffer_el_size_B)
                                   : 0u,
-      .primitive_type = mesa_prim_to_mtl_primitive_type(data->prim),
+      .index.range = data->index_buffer_range_B,
+      .primitive_type = tess ? 0u : mesa_prim_to_mtl_primitive_type(data->prim),
    };
 
+   uint64_t first_vertex_gpu = 0u;
+   uint64_t base_instance_gpu = 0u;
    if (data->indirect) {
-      draw.grid = kk_grid_indirect(data->indirect_command.buffer,
-                                   data->indirect_command.offset +
-                                      draw_id * data->indirect_command.stride);
+      uint64_t indirect_offset = data->indirect_command.offset +
+                                 draw_id * data->indirect_command.stride;
+      draw.grid =
+         kk_grid_indirect(data->indirect_command.buffer, indirect_offset);
+
+      if (tess) {
+         uint64_t first_vertex_offset =
+            data->indexed ? offsetof(VkDrawIndexedIndirectCommand, vertexOffset)
+                          : offsetof(VkDrawIndirectCommand, firstVertex);
+         uint64_t base_instance_offset =
+            data->indexed
+               ? offsetof(VkDrawIndexedIndirectCommand, firstInstance)
+               : offsetof(VkDrawIndirectCommand, firstInstance);
+         first_vertex_gpu =
+            mtl_buffer_get_gpu_address(data->indirect_command.buffer) +
+            indirect_offset + first_vertex_offset;
+         base_instance_gpu =
+            mtl_buffer_get_gpu_address(data->indirect_command.buffer) +
+            indirect_offset + base_instance_offset;
+      }
    } else if (data->indexed) {
-      VkDrawIndexedIndirectCommand cmd = data->indexed_draws[draw_id];
-      draw.grid =
-         kk_grid_3d(cmd.indexCount, cmd.instanceCount, cmd.firstInstance);
-      draw.vertex_offset = cmd.vertexOffset;
-      draw.index.offset += cmd.firstIndex * data->index_buffer_el_size_B;
+      VkDrawIndexedIndirectCommand draw_cmd = data->indexed_draws[draw_id];
+      draw.grid = kk_grid_3d(draw_cmd.indexCount, draw_cmd.instanceCount,
+                             draw_cmd.firstInstance);
+      draw.vertex_offset = draw_cmd.vertexOffset;
+      draw.index.offset += draw_cmd.firstIndex * data->index_buffer_el_size_B;
+
+      if (tess) {
+         first_vertex_gpu = kk_pool_upload(cmd, &draw_cmd.vertexOffset,
+                                           sizeof(draw_cmd.vertexOffset), 4u)
+                               .gpu;
+         base_instance_gpu = kk_pool_upload(cmd, &draw_cmd.firstInstance,
+                                            sizeof(draw_cmd.firstInstance), 4u)
+                                .gpu;
+      }
    } else {
-      VkDrawIndirectCommand cmd = data->draws[draw_id];
-      draw.grid =
-         kk_grid_3d(cmd.vertexCount, cmd.instanceCount, cmd.firstInstance);
-      draw.vertex_offset = cmd.firstVertex;
+      VkDrawIndirectCommand draw_cmd = data->draws[draw_id];
+      draw.grid = kk_grid_3d(draw_cmd.vertexCount, draw_cmd.instanceCount,
+                             draw_cmd.firstInstance);
+      draw.vertex_offset = draw_cmd.firstVertex;
+
+      if (tess) {
+         first_vertex_gpu = kk_pool_upload(cmd, &draw_cmd.firstVertex,
+                                           sizeof(draw_cmd.firstVertex), 4u)
+                               .gpu;
+         base_instance_gpu = kk_pool_upload(cmd, &draw_cmd.firstInstance,
+                                            sizeof(draw_cmd.firstInstance), 4u)
+                                .gpu;
+      }
+   }
+
+   /* Emulate tessellation. */
+   if (tess) {
+      struct kk_ptr tess_args = {};
+      struct kk_graphics_state *gfx = &cmd->state.gfx;
+      struct kk_descriptor_state *desc = &gfx->descriptors;
+      if (cmd->state.shaders[MESA_SHADER_TESS_EVAL]) {
+         gfx->descriptors.root.draw.index_size = data->index_buffer_el_size_B;
+         gfx->descriptors.root.draw.base_vertex_addr = first_vertex_gpu;
+         gfx->descriptors.root.draw.base_instance_addr = base_instance_gpu;
+         desc->root.draw.vertex_params = kk_upload_vertex_params(cmd, draw);
+         tess_args = kk_pool_alloc(cmd, sizeof(struct poly_tess_params), 4);
+         gfx->descriptors.root.draw.tess_params = tess_args.gpu;
+         gfx->descriptors.root_dirty = true;
+      }
+
+      if (desc->root_dirty) {
+         kk_upload_descriptor_root(cmd, VK_PIPELINE_BIND_POINT_GRAPHICS);
+         struct kk_ptr root_buffer = desc->root.root_buffer;
+         mtl_set_vertex_buffer(kk_render_encoder(cmd), root_buffer.buffer,
+                               root_buffer.offset, 0);
+         mtl_set_fragment_buffer(kk_render_encoder(cmd), root_buffer.buffer,
+                                 root_buffer.offset, 0);
+         if (tess_args.gpu) {
+            kk_upload_tess_params(cmd, tess_args.cpu, draw);
+         }
+      }
+
+      draw = kk_launch_tess(cmd, draw, draw_id);
    }
 
    return draw;
@@ -1406,11 +1765,14 @@ kk_draw(struct kk_cmd_buffer *cmd, struct kk_draw_command *data)
    if (data->predicate_count > 0 && !kk_predicate_draws(cmd, data))
       return;
 
-   /* Unroll geometry. Skip draw if we fail. */
-   bool requires_unroll = data->prim == MESA_PRIM_TRIANGLE_FAN ||
-                          requires_index_promotion(data) ||
-                          requires_unroll_restart(cmd, data) ||
-                          requires_index_robustness(cmd, data);
+   bool tess = cmd->state.shaders[MESA_SHADER_TESS_EVAL];
+
+   /* Unroll geometry. Skip draw if we fail. No need to unroll if tessellation
+    * is present since it also handles unrolling. */
+   bool requires_unroll = !tess && (data->prim == MESA_PRIM_TRIANGLE_FAN ||
+                                    requires_index_promotion(data) ||
+                                    requires_unroll_restart(cmd, data) ||
+                                    requires_index_robustness(cmd, data));
    if (requires_unroll && !kk_unroll_geometry(cmd, data))
       return;
 
diff --git a/src/kosmickrisp/vulkan/kk_nir_lower_descriptors.c b/src/kosmickrisp/vulkan/kk_nir_lower_descriptors.c
index eb376ded9f0..db348fb3469 100644
--- a/src/kosmickrisp/vulkan/kk_nir_lower_descriptors.c
+++ b/src/kosmickrisp/vulkan/kk_nir_lower_descriptors.c
@@ -816,9 +816,39 @@ lower_poly(struct nir_builder *b, nir_intrinsic_instr *intrin, void *data)
    case nir_intrinsic_load_index_size_poly:
       return lower_sysval_to_root_table(b, intrin, draw.index_size);
    case nir_intrinsic_load_first_vertex:
-      if (*(bool *)data)
-         return lower_sysval_to_root_table(b, intrin, draw.base_vertex);
-      FALLTHROUGH;
+      /* Lower only compute shaders */
+      if (*(bool *)data) {
+         uint32_t root_table_offset =
+            kk_root_descriptor_offset(draw.base_vertex_addr);
+         b->cursor = nir_instr_remove(&intrin->instr);
+         assert((root_table_offset & 3) == 0 && "aligned");
+
+         nir_def *addr = load_root(b, intrin->def.num_components, 64u,
+                                   nir_imm_int(b, root_table_offset), 4);
+
+         nir_def *val = nir_load_global(b, 1u, intrin->def.bit_size, addr);
+
+         nir_def_rewrite_uses(&intrin->def, val);
+         return true;
+      }
+      return false;
+   case nir_intrinsic_load_base_instance:
+      /* Lower only compute shaders */
+      if (*(bool *)data) {
+         uint32_t root_table_offset =
+            kk_root_descriptor_offset(draw.base_instance_addr);
+         b->cursor = nir_instr_remove(&intrin->instr);
+         assert((root_table_offset & 3) == 0 && "aligned");
+
+         nir_def *addr = load_root(b, intrin->def.num_components, 64u,
+                                   nir_imm_int(b, root_table_offset), 4);
+
+         nir_def *val = nir_load_global(b, 1u, intrin->def.bit_size, addr);
+
+         nir_def_rewrite_uses(&intrin->def, val);
+         return true;
+      }
+      return false;
    default:
       return false;
    }
diff --git a/src/kosmickrisp/vulkan/kk_nir_lower_vbo.c b/src/kosmickrisp/vulkan/kk_nir_lower_vbo.c
index 5fbffc376cf..6d56e5669ce 100644
--- a/src/kosmickrisp/vulkan/kk_nir_lower_vbo.c
+++ b/src/kosmickrisp/vulkan/kk_nir_lower_vbo.c
@@ -17,9 +17,6 @@
 
 struct ctx {
    struct kk_attribute *attribs;
-   bool requires_vertex_id;
-   bool requires_instance_id;
-   bool requires_base_instance;
    bool requires_robustness2;
 };
 
@@ -165,22 +162,19 @@ pass(struct nir_builder *b, nir_intrinsic_instr *intr, void *data)
    nir_def *el;
    if (attrib.instanced) {
       if (attrib.divisor > 0) {
-         /* Metal's instance_id has base_instance included */
-         nir_def *instance_id =
-            nir_isub(b, nir_load_instance_id(b), nir_load_base_instance(b));
-         el = nir_udiv_imm(b, instance_id, attrib.divisor);
-         ctx->requires_instance_id = true;
+         el = nir_udiv_imm(b, nir_load_instance_id(b), attrib.divisor);
+         BITSET_SET(b->shader->info.system_values_read,
+                    SYSTEM_VALUE_INSTANCE_ID);
       } else
          el = nir_imm_int(b, 0);
 
       el = nir_iadd(b, el, nir_load_base_instance(b));
-      ctx->requires_base_instance = true;
 
       BITSET_SET(b->shader->info.system_values_read,
                  SYSTEM_VALUE_BASE_INSTANCE);
    } else {
       el = nir_load_vertex_id(b);
-      ctx->requires_vertex_id = true;
+      BITSET_SET(b->shader->info.system_values_read, SYSTEM_VALUE_VERTEX_ID);
    }
 
    /* Load the pointer of the buffer from the argument buffer */
@@ -283,14 +277,6 @@ kk_nir_lower_vbo(nir_shader *nir, struct kk_attribute *attribs,
       .attribs = attribs,
       .requires_robustness2 = robustness2,
    };
-   bool progress =
-      nir_shader_intrinsics_pass(nir, pass, nir_metadata_control_flow, &ctx);
-
-   if (ctx.requires_instance_id)
-      BITSET_SET(nir->info.system_values_read, SYSTEM_VALUE_INSTANCE_ID);
-   if (ctx.requires_base_instance)
-      BITSET_SET(nir->info.system_values_read, SYSTEM_VALUE_BASE_INSTANCE);
-   if (ctx.requires_vertex_id)
-      BITSET_SET(nir->info.system_values_read, SYSTEM_VALUE_VERTEX_ID);
-   return progress;
+   return nir_shader_intrinsics_pass(nir, pass, nir_metadata_control_flow,
+                                     &ctx);
 }
diff --git a/src/kosmickrisp/vulkan/kk_physical_device.c b/src/kosmickrisp/vulkan/kk_physical_device.c
index 93d04d665ef..b54c9e07b07 100644
--- a/src/kosmickrisp/vulkan/kk_physical_device.c
+++ b/src/kosmickrisp/vulkan/kk_physical_device.c
@@ -227,6 +227,7 @@ kk_get_device_features(
       .shaderStorageImageReadWithoutFormat = true,
       .shaderStorageImageWriteWithoutFormat = true,
       .shaderUniformBufferArrayDynamicIndexing = true,
+      .tessellationShader = true,
       .textureCompressionASTC_LDR = true,
       .textureCompressionBC = true,
       .textureCompressionETC2 = true,
diff --git a/src/kosmickrisp/vulkan/kk_shader.c b/src/kosmickrisp/vulkan/kk_shader.c
index 6d94ec8a2e0..b6498a1fbfa 100644
--- a/src/kosmickrisp/vulkan/kk_shader.c
+++ b/src/kosmickrisp/vulkan/kk_shader.c
@@ -660,10 +660,10 @@ gather_shader_info(struct kk_shader *shader, nir_shader *nir,
 {
    shader->info.stage = nir->info.stage;
    shader->info.uses_per_draw_data = msl_gather_uses_per_draw_data(nir);
+   shader->info.num_cull_distances = nir->info.cull_distance_array_size;
    if (nir->info.stage == MESA_SHADER_VERTEX) {
       nir_shader_intrinsics_pass(nir, gather_vs_inputs, nir_metadata_all,
                                  &shader->info.vs.attribs_read);
-      shader->info.vs.num_cull_distances = nir->info.cull_distance_array_size;
       shader->info.vs.outputs_written = nir->info.outputs_written;
    } else if (nir->info.stage == MESA_SHADER_FRAGMENT) {
       /* Some meta shaders like vk-meta-resolve will have depth_layout as NONE
@@ -761,7 +761,7 @@ kk_compile_shader(struct kk_device *dev, nir_shader *nir,
    gather_shader_info(shader, nir, state);
 
    unsigned num_cull_distances =
-      prev_stage ? prev_stage->info.vs.num_cull_distances : 0;
+      prev_stage ? prev_stage->info.num_cull_distances : 0;
    msl_nir_lower_clip_cull_distance(nir, num_cull_distances);
 
    /* When using poly to emulate tessellation, vertex and tess control shaders
@@ -778,7 +778,11 @@ kk_compile_shader(struct kk_device *dev, nir_shader *nir,
          memset(&nir->info.cs, 0, sizeof(nir->info.cs));
          nir->xfb_info = NULL;
          NIR_PASS(_, nir, poly_nir_lower_sw_vs);
-      }
+      } else
+         /* Metal's instance_id contains base_instance. When the emulation path
+          * is taken, since we launch compute, they correctly get translated.
+          * For the non-emulated path we need to subtract base_instance... */
+         NIR_PASS(_, nir, msl_nir_lower_instance_id);
    } else if (stage == MESA_SHADER_TESS_CTRL) {
       NIR_PASS(_, nir, poly_nir_lower_tcs);
 
diff --git a/src/kosmickrisp/vulkan/kk_shader.h b/src/kosmickrisp/vulkan/kk_shader.h
index 3a57b0864de..a45bd6cd7d3 100644
--- a/src/kosmickrisp/vulkan/kk_shader.h
+++ b/src/kosmickrisp/vulkan/kk_shader.h
@@ -48,6 +48,10 @@ kk_tess_info_merge(struct kk_tess_info a, struct kk_tess_info b)
 struct kk_shader_info {
    mesa_shader_stage stage;
    bool uses_per_draw_data;
+
+   /* Required for fragment shader cull distance discards. */
+   uint8_t num_cull_distances;
+
    union {
       /* Vertex shader is the pipeline, store all relevant data here. */
       struct {
@@ -62,9 +66,6 @@ struct kk_shader_info {
          uint32_t sample_count;
          uint64_t outputs_written;
 
-         /* Required for fragment shader cull distance discards. */
-         uint8_t num_cull_distances;
-
          /* Data needed for serialization. */
          enum mtl_primitive_topology_class topology;
          enum mtl_pixel_format rt_formats[MAX_DRAW_BUFFERS];