diff --git a/src/kosmickrisp/clc/kk_clc.c b/src/kosmickrisp/clc/kk_clc.c
new file mode 100644
index 00000000000..5025be656e3
--- /dev/null
+++ b/src/kosmickrisp/clc/kk_clc.c
@@ -0,0 +1,325 @@
+/*
+ * Copyright 2026 LunarG, Inc.
+ * Copyright 2026 Google LLC
+ * Copyright 2023 Alyssa Rosenzweig
+ * Copyright 2020 Intel Corporation
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "kosmickrisp/compiler/nir_to_msl.h"
+
+#include "compiler/glsl_types.h"
+#include "compiler/spirv/nir_spirv.h"
+#include "nir.h"
+#include "nir_builder.h"
+#include "nir_builder_opcodes.h"
+#include "nir_intrinsics.h"
+#include "nir_precompiled.h"
+#include "shader_enums.h"
+
+#include <fcntl.h>
+#include <inttypes.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+#include "util/macros.h"
+#include <sys/mman.h>
+
+static const struct spirv_to_nir_options spirv_options = {
+   .environment = NIR_SPIRV_OPENCL,
+   .shared_addr_format = nir_address_format_62bit_generic,
+   .global_addr_format = nir_address_format_62bit_generic,
+   .temp_addr_format = nir_address_format_62bit_generic,
+   .constant_addr_format = nir_address_format_64bit_global,
+   .create_library = true,
+   .printf = true,
+};
+
+/* Standard optimization loop */
+static void
+optimize(nir_shader *nir)
+{
+   bool progress;
+   do {
+      progress = false;
+
+      NIR_PASS(progress, nir, nir_split_var_copies);
+      NIR_PASS(progress, nir, nir_split_struct_vars, nir_var_function_temp);
+      NIR_PASS(progress, nir, nir_lower_var_copies);
+      NIR_PASS(progress, nir, nir_lower_vars_to_ssa);
+
+      NIR_PASS(progress, nir, nir_opt_copy_prop);
+      NIR_PASS(progress, nir, nir_opt_remove_phis);
+      NIR_PASS(progress, nir, nir_lower_all_phis_to_scalar);
+      NIR_PASS(progress, nir, nir_opt_dce);
+      NIR_PASS(progress, nir, nir_opt_dead_cf);
+      NIR_PASS(progress, nir, nir_opt_cse);
+
+      nir_opt_peephole_select_options peephole_select_options = {
+         .limit = 64,
+         .expensive_alu_ok = true,
+      };
+      NIR_PASS(progress, nir, nir_opt_peephole_select,
+               &peephole_select_options);
+      NIR_PASS(progress, nir, nir_opt_phi_precision);
+      NIR_PASS(progress, nir, nir_opt_algebraic);
+      NIR_PASS(progress, nir, nir_opt_constant_folding);
+
+      NIR_PASS(progress, nir, nir_opt_deref);
+      NIR_PASS(progress, nir, nir_opt_copy_prop_vars);
+      NIR_PASS(progress, nir, nir_opt_undef);
+      NIR_PASS(progress, nir, nir_lower_undef_to_zero);
+
+      NIR_PASS(progress, nir, nir_opt_shrink_vectors, true);
+      NIR_PASS(progress, nir, nir_opt_loop_unroll);
+
+   } while (progress);
+}
+
+static nir_shader *
+compile(void *memctx, const uint32_t *spirv, size_t spirv_size)
+{
+   const nir_shader_compiler_options *nir_options = &kk_nir_options;
+
+   assert(spirv_size % 4 == 0);
+   nir_shader *nir =
+      spirv_to_nir(spirv, spirv_size / 4, NULL, 0, MESA_SHADER_KERNEL,
+                   "library", &spirv_options, nir_options);
+   nir_validate_shader(nir, "after spirv_to_nir");
+   ralloc_steal(memctx, nir);
+
+   nir_fixup_is_exported(nir);
+
+   NIR_PASS(_, nir, nir_lower_system_values);
+   NIR_PASS(_, nir, nir_lower_calls_to_builtins);
+
+   nir_lower_compute_system_values_options cs = {.global_id_is_32bit = true};
+   NIR_PASS(_, nir, nir_lower_compute_system_values, &cs);
+
+   /* We have to lower away local constant initializers right before we
+    * inline functions.  That way they get properly initialized at the top
+    * of the function and not at the top of its caller.
+    */
+   NIR_PASS(_, nir, nir_lower_variable_initializers, nir_var_function_temp);
+   NIR_PASS(_, nir, nir_lower_returns);
+   NIR_PASS(_, nir, nir_inline_functions);
+   nir_remove_non_exported(nir);
+   NIR_PASS(_, nir, nir_opt_copy_prop);
+   NIR_PASS(_, nir, nir_opt_deref);
+
+   /* We can't deal with constant data, get rid of it */
+   nir_lower_constant_to_temp(nir);
+
+   /* We can go ahead and lower the rest of the constant initializers.  We do
+    * this here so that nir_remove_dead_variables and split_per_member_structs
+    * below see the corresponding stores.
+    */
+   NIR_PASS(_, nir, nir_lower_variable_initializers, ~0);
+
+   /* LLVM loves take advantage of the fact that vec3s in OpenCL are 16B
+    * aligned and so it can just read/write them as vec4s.  This results in a
+    * LOT of vec4->vec3 casts on loads and stores.  One solution to this
+    * problem is to get rid of all vec3 variables.
+    */
+   NIR_PASS(_, nir, nir_lower_vec3_to_vec4,
+            nir_var_shader_temp | nir_var_function_temp | nir_var_mem_shared |
+               nir_var_mem_global | nir_var_mem_constant);
+
+   /* We assign explicit types early so that the optimizer can take advantage
+    * of that information and hopefully get rid of some of our memcpys.
+    */
+   NIR_PASS(_, nir, nir_lower_vars_to_explicit_types,
+            nir_var_uniform | nir_var_shader_temp | nir_var_function_temp |
+               nir_var_mem_shared | nir_var_mem_global,
+            glsl_get_cl_type_size_align);
+
+   optimize(nir);
+
+   NIR_PASS(_, nir, nir_remove_dead_variables, nir_var_all, NULL);
+
+   /* Lower again, this time after dead-variables to get more compact variable
+    * layouts.
+    */
+   NIR_PASS(_, nir, nir_lower_vars_to_explicit_types,
+            nir_var_shader_temp | nir_var_function_temp | nir_var_mem_shared |
+               nir_var_mem_global | nir_var_mem_constant,
+            glsl_get_cl_type_size_align);
+   assert(nir->constant_data_size == 0);
+
+   NIR_PASS(_, nir, nir_lower_memcpy);
+
+   NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_mem_constant,
+            nir_address_format_64bit_global);
+
+   NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_uniform,
+            nir_address_format_32bit_offset_as_64bit);
+
+   /* Note: we cannot lower explicit I/O here, because we need derefs in tact
+    * for function calls into the library to work.
+    */
+
+   NIR_PASS(_, nir, nir_lower_convert_alu_types, NULL);
+   NIR_PASS(_, nir, nir_opt_if, 0);
+   NIR_PASS(_, nir, nir_opt_idiv_const, 16);
+
+   msl_lower_textures(nir);
+   msl_lower_nir_late(nir);
+
+   optimize(nir);
+
+   return nir;
+}
+
+static void
+print_shader(FILE *fp, const char *name, const char *suffix, uint32_t variant,
+             const char *msl, nir_shader *nir)
+{
+   uint32_t msl_length = strlen(msl) + 1;
+   uint32_t workgroup_count_size = sizeof(nir->info.workgroup_size);
+   size_t sz_B = workgroup_count_size + msl_length;
+   size_t sz_el = DIV_ROUND_UP(sz_B, 4);
+   uint32_t *mem = calloc(sz_el, 4);
+
+   memcpy(mem, nir->info.workgroup_size, workgroup_count_size);
+   memcpy((uint8_t *)mem + workgroup_count_size, msl, msl_length);
+
+   nir_precomp_print_blob(fp, name, suffix, variant, mem, sz_B, true);
+   free(mem);
+}
+
+static nir_def *
+load_kernel_input(nir_builder *b, unsigned num_components, unsigned bit_size,
+                  unsigned offset_B)
+{
+   nir_def *root = nir_load_buffer_ptr_kk(b, 1, 64, .binding = 0);
+
+   /* We've bound the address of the root descriptor, index in. */
+   nir_def *addr = nir_iadd(b, root, nir_imm_int64(b, offset_B));
+   return nir_build_load_global_constant(b, num_components, bit_size, addr,
+                                         .align_mul = bit_size,
+                                         .access = ACCESS_CAN_SPECULATE);
+}
+
+static int
+type_size_vec4(const struct glsl_type *type, bool bindless)
+{
+   return glsl_count_attribute_slots(type, false);
+}
+
+int
+main(int argc, char **argv)
+{
+   if (argc != 4) {
+      fprintf(stderr, "Usage: %s [input spir-v] [output header] [output C]\n",
+              argv[0]);
+      return 1;
+   }
+
+   const char *infile = argv[1];
+   const char *outh_file = argv[2];
+   const char *outc_file = argv[3];
+
+   void *mem_ctx = ralloc_context(NULL);
+
+   int fd = open(infile, O_RDONLY);
+   if (fd < 0) {
+      fprintf(stderr, "Failed to open %s\n", infile);
+      ralloc_free(mem_ctx);
+      return 1;
+   }
+
+   off_t spirv_len = lseek(fd, 0, SEEK_END);
+   const void *spirv_map = mmap(NULL, spirv_len, PROT_READ, MAP_PRIVATE, fd, 0);
+   close(fd);
+   if (spirv_map == MAP_FAILED) {
+      fprintf(stderr, "Failed to mmap the file: errno=%d, %s\n", errno,
+              strerror(errno));
+      ralloc_free(mem_ctx);
+      return 1;
+   }
+
+   FILE *fp_h = fopen(outh_file, "w");
+   FILE *fp_c = fopen(outc_file, "w");
+   glsl_type_singleton_init_or_ref();
+
+   nir_precomp_print_header(fp_c, fp_h, "KosmicKrisp Contributors",
+                            "libkk_shaders.h");
+
+   nir_shader *nir = compile(mem_ctx, spirv_map, spirv_len);
+
+   /* load_preamble works at 16-bit granularity */
+   struct nir_precomp_opts opt = {.arg_align_B = 2};
+
+   const char *target = "AppleSilicon";
+
+   nir_foreach_entrypoint(libfunc, nir) {
+      libfunc->pass_flags = 0;
+      unsigned nr_vars = nir_precomp_nr_variants(libfunc);
+
+      nir_precomp_print_layout_struct(fp_h, &opt, libfunc);
+
+      for (unsigned v = 0; v < nr_vars; ++v) {
+         nir_shader *s = nir_precompiled_build_variant(
+            libfunc, MESA_SHADER_COMPUTE, v, &kk_nir_options, &opt,
+            load_kernel_input);
+
+         nir_link_shader_functions(s, nir);
+         NIR_PASS(_, s, nir_inline_functions);
+         nir_remove_non_entrypoints(s);
+         NIR_PASS(_, s, nir_opt_deref);
+         NIR_PASS(_, s, nir_lower_vars_to_ssa);
+         NIR_PASS(_, s, nir_remove_dead_derefs);
+         NIR_PASS(_, s, nir_remove_dead_variables,
+                  nir_var_function_temp | nir_var_shader_temp, NULL);
+         NIR_PASS(_, s, nir_lower_vars_to_explicit_types,
+                  nir_var_shader_temp | nir_var_function_temp,
+                  glsl_get_cl_type_size_align);
+
+         NIR_PASS(_, s, nir_lower_vars_to_explicit_types, nir_var_mem_shared,
+                  glsl_get_cl_type_size_align);
+
+         NIR_PASS(_, s, nir_lower_explicit_io, nir_var_mem_shared,
+                  nir_address_format_62bit_generic);
+
+         msl_preprocess_nir(s);
+         msl_optimize_nir(nir);
+
+         NIR_PASS(_, s, nir_opt_deref);
+         NIR_PASS(_, s, nir_lower_vars_to_ssa);
+
+         NIR_PASS(_, s, nir_lower_explicit_io,
+                  nir_var_shader_temp | nir_var_function_temp |
+                     nir_var_mem_shared | nir_var_mem_global,
+                  nir_address_format_62bit_generic);
+
+         /* nir_lower_explicit_io can create phis we need to get rid of */
+         NIR_PASS(_, s, nir_convert_from_ssa, true, true);
+         NIR_PASS(_, s, nir_trivialize_registers);
+
+         /* nir_lower_explicit_io will create unpack_64 we need to lower */
+         NIR_PASS(_, s, nir_opt_algebraic);
+
+         nir_shader_gather_info(s, nir_shader_get_entrypoint(s));
+         struct nir_to_msl_options options = {};
+         const char *msl = nir_to_msl(s, &options);
+         print_shader(fp_c, libfunc->name, target, v, msl, s);
+         ralloc_free((void *)msl);
+
+         ralloc_free(s);
+      }
+   }
+
+   nir_precomp_print_program_enum(fp_h, nir, "libkk");
+   nir_precomp_print_dispatch_macros(fp_h, &opt, nir);
+
+   /* For each target, generate a table mapping programs to binaries */
+
+   nir_precomp_print_extern_binary_map(fp_h, "libkk", target);
+   nir_precomp_print_binary_map(fp_c, nir, "libkk", target, NULL);
+
+   glsl_type_singleton_decref();
+   fclose(fp_c);
+   fclose(fp_h);
+   ralloc_free(mem_ctx);
+   return 0;
+}
diff --git a/src/kosmickrisp/clc/kk_precompiled_shader.h b/src/kosmickrisp/clc/kk_precompiled_shader.h
new file mode 100644
index 00000000000..fa30779a351
--- /dev/null
+++ b/src/kosmickrisp/clc/kk_precompiled_shader.h
@@ -0,0 +1,23 @@
+/*
+ * Copyright 2026 LunarG, Inc.
+ * Copyright 2026 Google LLC
+ * SPDX-License-Identifier: MIT
+ */
+
+#ifndef KK_PRECOMPILED_SHADER_H
+#define KK_PRECOMPILED_SHADER_H 1
+
+#include "kosmickrisp/bridge/mtl_types.h"
+
+#include <stdint.h>
+
+struct kk_precompiled_info {
+   uint16_t workgroup_size[3];
+};
+
+struct kk_precompiled_shader {
+   struct kk_precompiled_info info;
+   mtl_compute_pipeline_state *pipeline;
+};
+
+#endif /* KK_PRECOMPILED_SHADER_H_ */
diff --git a/src/kosmickrisp/clc/meson.build b/src/kosmickrisp/clc/meson.build
new file mode 100644
index 00000000000..7edd8c2ae68
--- /dev/null
+++ b/src/kosmickrisp/clc/meson.build
@@ -0,0 +1,13 @@
+# Copyright 2026 LunarG, Inc.
+# Copyright 2026 Google LLC
+# SPDX-License-Identifier: MIT
+
+prog_kk_clc = executable(
+  'kk_clc',
+  ['kk_clc.c'],
+  link_with : [libmsl_compiler],
+  include_directories : [inc_include, inc_src],
+  c_args : [pre_args, no_override_init_args],
+  link_args : [ld_args_build_id],
+  dependencies : [idep_vtn, idep_nir, idep_mesautil],
+)
\ No newline at end of file
diff --git a/src/kosmickrisp/compiler/nir_to_msl.c b/src/kosmickrisp/compiler/nir_to_msl.c
index ad0a0241916..03e621a7a85 100644
--- a/src/kosmickrisp/compiler/nir_to_msl.c
+++ b/src/kosmickrisp/compiler/nir_to_msl.c
@@ -1986,11 +1986,13 @@ msl_preprocess_nir(struct nir_shader *nir)
             nir_var_function_temp | nir_var_shader_in | nir_var_shader_out);
    NIR_PASS(_, nir, nir_lower_alu_to_scalar, kk_scalarize_filter, NULL);
 
+   /* If we do 256 here MSL compiler crashes with
+    * dEQP-VK.graphicsfuzz.stable-binarysearch-tree-nested-if-and-conditional */
+   NIR_PASS(_, nir, nir_lower_vars_to_scratch, 32,
+            glsl_get_natural_size_align_bytes, glsl_get_word_size_align_bytes);
    NIR_PASS(_, nir, nir_lower_indirect_derefs_to_if_else_trees,
-            nir_var_shader_in | nir_var_shader_out, UINT32_MAX);
-   NIR_PASS(_, nir, nir_lower_vars_to_scratch, 0,
-            glsl_get_natural_size_align_bytes,
-            glsl_get_natural_size_align_bytes);
+            nir_var_function_temp | nir_var_shader_in | nir_var_shader_out,
+            UINT32_MAX);
 
    nir_lower_compute_system_values_options csv_options = {
       .has_base_global_invocation_id = 0,
@@ -2043,7 +2045,7 @@ msl_optimize_nir(struct nir_shader *nir)
    NIR_PASS(_, nir, nir_lower_load_const_to_scalar);
    NIR_PASS(_, nir, msl_nir_lower_algebraic_late);
    NIR_PASS(_, nir, nir_convert_from_ssa, true, false);
-   nir_trivialize_registers(nir);
+   NIR_PASS(_, nir, nir_trivialize_registers);
    NIR_PASS(_, nir, nir_opt_copy_prop);
 
    return progress;
diff --git a/src/kosmickrisp/compiler/nir_to_msl.h b/src/kosmickrisp/compiler/nir_to_msl.h
index c7b11040f5e..b5542de5ffd 100644
--- a/src/kosmickrisp/compiler/nir_to_msl.h
+++ b/src/kosmickrisp/compiler/nir_to_msl.h
@@ -72,3 +72,37 @@ bool msl_nir_vs_io_types(nir_shader *nir);
 bool msl_nir_fake_guard_for_discards(struct nir_shader *nir);
 bool msl_nir_lower_sample_shading(nir_shader *nir);
 void msl_lower_nir_late(nir_shader *nir);
+
+static const nir_shader_compiler_options kk_nir_options = {
+   .lower_fdph = true,
+   .has_fsub = true,
+   .has_isub = true,
+   .lower_extract_word = true,
+   .lower_extract_byte = true,
+   .lower_insert_word = true,
+   .lower_insert_byte = true,
+   .lower_fmod = true,
+   .discard_is_demote = true,
+   .instance_id_includes_base_index = true,
+   .lower_device_index_to_zero = true,
+   .lower_pack_64_2x32_split = true,
+   .lower_unpack_64_2x32_split = true,
+   .lower_pack_64_2x32 = true,
+   .lower_pack_half_2x16 = true,
+   .lower_pack_split = true,
+   .lower_unpack_half_2x16 = true,
+   .has_cs_global_id = true,
+   .lower_fquantize2f16 = true,
+   .lower_scmp = true,
+   .lower_ifind_msb = true,
+   .lower_ufind_msb = true,
+   .lower_find_lsb = true,
+   .has_uclz = true,
+   .lower_mul_2x32_64 = true,
+   .lower_uadd_carry = true,
+   .lower_usub_borrow = true,
+   /* Metal does not support double. */
+   .lower_doubles_options = (nir_lower_doubles_options)(~0),
+   .lower_int64_options = nir_lower_ufind_msb64 | nir_lower_subgroup_shuffle64,
+   .io_options = nir_io_mediump_is_32bit,
+};
diff --git a/src/kosmickrisp/vulkan/cl/kk_query.cl b/src/kosmickrisp/libkk/kk_query.cl
similarity index 98%
rename from src/kosmickrisp/vulkan/cl/kk_query.cl
rename to src/kosmickrisp/libkk/kk_query.cl
index ef24ab8ab7b..3189b430eb5 100644
--- a/src/kosmickrisp/vulkan/cl/kk_query.cl
+++ b/src/kosmickrisp/libkk/kk_query.cl
@@ -10,13 +10,13 @@
 
 #include "kk_query.h"
 
-void
+KERNEL(1)
 libkk_write_u64(global struct libkk_imm_write *write_array)
 {
    *write_array[cl_group_id.x].address = write_array[cl_group_id.x].value;
 }
 
-void
+KERNEL(1)
 libkk_copy_queries(global uint64_t *availability, global uint64_t *results,
                    global uint16_t *oq_index, uint64_t dst_addr,
                    uint64_t dst_stride, uint32_t first_query,
diff --git a/src/kosmickrisp/vulkan/cl/kk_query.h b/src/kosmickrisp/libkk/kk_query.h
similarity index 100%
rename from src/kosmickrisp/vulkan/cl/kk_query.h
rename to src/kosmickrisp/libkk/kk_query.h
diff --git a/src/kosmickrisp/vulkan/cl/kk_triangle_fan.cl b/src/kosmickrisp/libkk/kk_triangle_fan.cl
similarity index 99%
rename from src/kosmickrisp/vulkan/cl/kk_triangle_fan.cl
rename to src/kosmickrisp/libkk/kk_triangle_fan.cl
index bc2e250d072..7e53ed0f1ad 100644
--- a/src/kosmickrisp/vulkan/cl/kk_triangle_fan.cl
+++ b/src/kosmickrisp/libkk/kk_triangle_fan.cl
@@ -79,7 +79,7 @@ libkk_vertex_id_for_tri_strip_adj(uint prim, uint vert, uint num_prims,
     *
     * Here we assume the first vertex is provoking, the Vulkan default.
     */
-   uint offsets[6] = {
+   const uint offsets[6] = {
       0,
       first ? 1 : (even ? -2 : 3),
       even_or_first ? 2 : 4,
@@ -212,9 +212,8 @@ first_true_thread_in_workgroup(bool cond, local uint *scratch)
    return (first_group * 32) + off;
 }
 
-// TODO_KOSMICKRISP
 // KERNEL(1024)
-void
+KERNEL(1)
 libkk_unroll_geometry_and_restart(
    constant uint8_t *index_buffer, global uint8_t *out_ptr,
    constant uint32_t *in_draw, global uint32_t *out_draw,
diff --git a/src/kosmickrisp/libkk/meson.build b/src/kosmickrisp/libkk/meson.build
new file mode 100644
index 00000000000..7bbd55b2dc7
--- /dev/null
+++ b/src/kosmickrisp/libkk/meson.build
@@ -0,0 +1,54 @@
+# Copyright 2026 LunarG, Inc.
+# Copyright 2026 Google LLC
+# SPDX-License-Identifier: MIT
+
+libkk_shader_files = files(
+  'kk_triangle_fan.cl',
+  'kk_query.cl',
+)
+
+libkk_spv = custom_target(
+  input : libkk_shader_files,
+  output : 'libkk.spv',
+  command : [
+    prog_mesa_clc, '-o', '@OUTPUT@', '--depfile', '@DEPFILE@',
+    libkk_shader_files, '--',
+    '-I' + join_paths(meson.project_source_root(), 'include'),
+    '-I' + join_paths(meson.project_source_root(), 'src/compiler/libcl'),
+    '-I' + join_paths(meson.current_source_dir(), '.'),
+    '-I' + join_paths(meson.current_source_dir(), '../../'),
+    '-I' + join_paths(meson.current_source_dir(), 'shaders'),
+    cl_args,
+    ],
+  depfile : 'libkk_shaders.h.d',
+)
+
+libkk = custom_target(
+  input : libkk_spv,
+  output : ['libkk.cpp', 'libkk.h'],
+  command : [prog_vtn_bindgen2, libkk_spv, '@OUTPUT0@', '@OUTPUT1@'],
+)
+
+idep_libkk = declare_dependency(
+  sources : [libkk],
+  include_directories : include_directories('.'),
+)
+
+libkk_shaders = custom_target(
+  input : libkk_spv,
+  output : ['libkk_shaders.h', 'libkk_shaders.c'],
+  command : [prog_kk_clc, libkk_spv, '@OUTPUT0@', '@OUTPUT1@']
+)
+
+# TODO_KOSMICKRISP We can get rid of this library once we remove the generated entrypoints for the trampoline
+libkk_shaders_lib = static_library(
+  'libkk_shaders',
+  sources : [libkk_shaders],
+  dependencies : [idep_mesautil],
+)
+
+idep_libkk_shaders_h = declare_dependency(
+  sources : [libkk_shaders[0]],
+  include_directories : include_directories('.'),
+  link_with : libkk_shaders_lib,
+)
diff --git a/src/kosmickrisp/meson.build b/src/kosmickrisp/meson.build
index fe54329e132..2bc318325e4 100644
--- a/src/kosmickrisp/meson.build
+++ b/src/kosmickrisp/meson.build
@@ -5,6 +5,8 @@
 subdir('bridge')
 subdir('compiler')
 subdir('util')
+subdir('clc')
+subdir('libkk')
 subdir('vulkan')
 
 executable(
diff --git a/src/kosmickrisp/vulkan/kk_cmd_buffer.c b/src/kosmickrisp/vulkan/kk_cmd_buffer.c
index e15d3e5b20b..62893767f20 100644
--- a/src/kosmickrisp/vulkan/kk_cmd_buffer.c
+++ b/src/kosmickrisp/vulkan/kk_cmd_buffer.c
@@ -421,6 +421,31 @@ kk_cmd_buffer_flush_push_descriptors(struct kk_cmd_buffer *cmd,
    desc->push_dirty = 0;
 }
 
+void
+kk_dispatch_precomp(struct kk_cmd_buffer *cmd, struct mtl_size grid,
+                    bool pre_gfx, enum libkk_program idx, void *data,
+                    size_t data_size)
+{
+   struct kk_device *dev = kk_cmd_buffer_device(cmd);
+   struct kk_precompiled_shader *prog = &dev->precompiled_cache.shaders[idx];
+
+   mtl_compute_encoder *encoder =
+      pre_gfx ? kk_encoder_pre_gfx_encoder(cmd) : kk_compute_encoder(cmd);
+
+   struct kk_bo *bo = kk_cmd_allocate_buffer(cmd, data_size, 4u);
+   memcpy(bo->cpu, data, data_size);
+
+   mtl_compute_set_buffer(encoder, bo->map, 0, 0);
+   mtl_compute_set_pipeline_state(encoder, prog->pipeline);
+
+   struct mtl_size local_size = {
+      .x = prog->info.workgroup_size[0],
+      .y = prog->info.workgroup_size[1],
+      .z = prog->info.workgroup_size[2],
+   };
+   mtl_dispatch_threads(encoder, grid, local_size);
+}
+
 void
 kk_cmd_write(struct kk_cmd_buffer *cmd, mtl_buffer *buffer, uint64_t addr,
              uint64_t value)
diff --git a/src/kosmickrisp/vulkan/kk_cmd_buffer.h b/src/kosmickrisp/vulkan/kk_cmd_buffer.h
index ed81ac4013a..7a12e79f79b 100644
--- a/src/kosmickrisp/vulkan/kk_cmd_buffer.h
+++ b/src/kosmickrisp/vulkan/kk_cmd_buffer.h
@@ -254,14 +254,13 @@ uint64_t kk_upload_descriptor_root(struct kk_cmd_buffer *cmd,
 void kk_cmd_buffer_flush_push_descriptors(struct kk_cmd_buffer *cmd,
                                           struct kk_descriptor_state *desc);
 
+void kk_dispatch_precomp(struct kk_cmd_buffer *cmd, struct mtl_size grid,
+                         bool pre_gfx, enum libkk_program idx, void *data,
+                         size_t data_size);
+
+#define MESA_DISPATCH_PRECOMP kk_dispatch_precomp
+
 void kk_cmd_write(struct kk_cmd_buffer *cmd, mtl_buffer *buffer, uint64_t addr,
                   uint64_t value);
 
-void kk_cmd_dispatch_pipeline(struct kk_cmd_buffer *cmd,
-                              mtl_compute_encoder *encoder,
-                              mtl_compute_pipeline_state *pipeline,
-                              const void *push_data, size_t push_size,
-                              uint32_t groupCountX, uint32_t groupCountY,
-                              uint32_t groupCountZ);
-
 #endif
diff --git a/src/kosmickrisp/vulkan/kk_cmd_dispatch.c b/src/kosmickrisp/vulkan/kk_cmd_dispatch.c
index 2a66ad47794..00558e60b85 100644
--- a/src/kosmickrisp/vulkan/kk_cmd_dispatch.c
+++ b/src/kosmickrisp/vulkan/kk_cmd_dispatch.c
@@ -18,44 +18,6 @@
 
 #include "vk_common_entrypoints.h"
 
-void
-kk_cmd_dispatch_pipeline(struct kk_cmd_buffer *cmd,
-                         mtl_compute_encoder *encoder,
-                         mtl_compute_pipeline_state *pipeline,
-                         const void *push_data, size_t push_size,
-                         uint32_t groupCountX, uint32_t groupCountY,
-                         uint32_t groupCountZ)
-{
-   struct kk_root_descriptor_table *root = NULL;
-   struct kk_bo *bo = kk_cmd_allocate_buffer(cmd, sizeof(*root), 8u);
-   /* kk_cmd_allocate_buffer already sets the error, we can just exit */
-   if (!bo)
-      return;
-
-   root = bo->cpu;
-   assert(push_size <= sizeof(root->push));
-   memcpy(root->push, push_data, push_size);
-   root->cs.base_group[0] = 1; /* TODO_KOSMICKRISP This is hard-coded because we
-                                  know this is the size we create them with */
-   root->cs.base_group[1] = 1;
-   root->cs.base_group[2] = 1;
-
-   mtl_compute_set_buffer(encoder, bo->map, 0, 0);
-   mtl_compute_set_pipeline_state(encoder, pipeline);
-
-   struct mtl_size grid_size = {
-      .x = groupCountX,
-      .y = groupCountY,
-      .z = groupCountZ,
-   };
-   struct mtl_size local_size = {
-      .x = 1,
-      .y = 1,
-      .z = 1,
-   };
-   mtl_dispatch_threads(encoder, grid_size, local_size);
-}
-
 VKAPI_ATTR void VKAPI_CALL
 kk_CmdDispatch(VkCommandBuffer commandBuffer, uint32_t groupCountX,
                uint32_t groupCountY, uint32_t groupCountZ)
diff --git a/src/kosmickrisp/vulkan/kk_device.h b/src/kosmickrisp/vulkan/kk_device.h
index 77e84eb0b20..1553e0956d8 100644
--- a/src/kosmickrisp/vulkan/kk_device.h
+++ b/src/kosmickrisp/vulkan/kk_device.h
@@ -15,6 +15,9 @@
 
 #include "kosmickrisp/bridge/mtl_types.h"
 
+#include "kosmickrisp/clc/kk_precompiled_shader.h"
+#include "libkk_shaders.h"
+
 #include "util/u_dynarray.h"
 
 #include "vk_device.h"
@@ -25,13 +28,6 @@ struct kk_bo;
 struct kk_physical_device;
 struct vk_pipeline_cache;
 
-enum kk_device_lib_pipeline {
-   KK_LIB_IMM_WRITE = 0,
-   KK_LIB_COPY_QUERY,
-   KK_LIB_TRIANGLE_FAN,
-   KK_LIB_COUNT,
-};
-
 struct kk_residency_set {
    simple_mtx_t mutex;
    mtl_residency_set *handle;
@@ -75,6 +71,10 @@ struct kk_sampler_heap {
    struct hash_table *ht;
 };
 
+struct kk_precompiled_cache {
+   struct kk_precompiled_shader shaders[LIBKK_NUM_PROGRAMS];
+};
+
 struct kk_device {
    struct vk_device vk;
 
@@ -91,7 +91,7 @@ struct kk_device {
     * recording as required by Metal. */
    struct kk_residency_set residency_set;
 
-   mtl_compute_pipeline_state *lib_pipelines[KK_LIB_COUNT];
+   struct kk_precompiled_cache precompiled_cache;
 
    struct kk_queue queue;
 
@@ -103,14 +103,6 @@ struct kk_device {
 
 VK_DEFINE_HANDLE_CASTS(kk_device, vk.base, VkDevice, VK_OBJECT_TYPE_DEVICE)
 
-static inline mtl_compute_pipeline_state *
-kk_device_lib_pipeline(const struct kk_device *dev,
-                       enum kk_device_lib_pipeline pipeline)
-{
-   assert(pipeline < KK_LIB_COUNT);
-   return dev->lib_pipelines[pipeline];
-}
-
 static inline struct kk_physical_device *
 kk_device_physical(const struct kk_device *dev)
 {
diff --git a/src/kosmickrisp/vulkan/kk_device_lib.c b/src/kosmickrisp/vulkan/kk_device_lib.c
index 9d2afbf9e45..8de55346da4 100644
--- a/src/kosmickrisp/vulkan/kk_device_lib.c
+++ b/src/kosmickrisp/vulkan/kk_device_lib.c
@@ -8,184 +8,65 @@
 
 #include "kk_shader.h"
 
-#include "kkcl.h"
+#include "kosmickrisp/clc/kk_precompiled_shader.h"
+
+#include "libkk_shaders.h"
 
 #include "kosmickrisp/bridge/mtl_bridge.h"
 
 #include "nir/nir.h"
 #include "nir/nir_builder.h"
 
-static nir_def *
-load_struct_var(nir_builder *b, nir_variable *var, uint32_t field)
-{
-   nir_deref_instr *deref =
-      nir_build_deref_struct(b, nir_build_deref_var(b, var), field);
-   return nir_load_deref(b, deref);
-}
-
-static nir_shader *
-create_imm_write_shader()
-{
-   nir_builder build = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, NULL,
-                                                      "kk-meta-imm-write-u64");
-   nir_builder *b = &build;
-
-   struct glsl_struct_field push_fields[] = {
-      {.type = glsl_uint64_t_type(), .name = "buffer_address", .offset = 0},
-   };
-   const struct glsl_type *push_iface_type = glsl_interface_type(
-      push_fields, ARRAY_SIZE(push_fields), GLSL_INTERFACE_PACKING_STD140,
-      false /* row_major */, "push");
-   nir_variable *push = nir_variable_create(b->shader, nir_var_mem_push_const,
-                                            push_iface_type, "push");
-
-   b->shader->info.workgroup_size[0] = 1;
-   b->shader->info.workgroup_size[1] = 1;
-   b->shader->info.workgroup_size[2] = 1;
-
-   libkk_write_u64(b, load_struct_var(b, push, 0));
-
-   return build.shader;
-}
-
-static nir_shader *
-create_copy_query_shader()
-{
-   nir_builder build = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, NULL,
-                                                      "kk-meta-copy-queries");
-   nir_builder *b = &build;
-
-   struct glsl_struct_field push_fields[] = {
-      {.type = glsl_uint64_t_type(), .name = "availability", .offset = 0},
-      {.type = glsl_uint64_t_type(), .name = "results", .offset = 8},
-      {.type = glsl_uint64_t_type(), .name = "indices", .offset = 16},
-      {.type = glsl_uint64_t_type(), .name = "dst_addr", .offset = 24},
-      {.type = glsl_uint64_t_type(), .name = "dst_stride", .offset = 32},
-      {.type = glsl_uint_type(), .name = "first_query", .offset = 40},
-      {.type = glsl_uint_type(), .name = "flags", .offset = 44},
-      {.type = glsl_uint16_t_type(), .name = "reports_per_query", .offset = 48},
-   };
-   /* TODO_KOSMICKRISP Don't use push constants and directly bind the buffer to
-    * the binding index. This requires compiler work first to remove the
-    * hard-coded buffer0 value. Same applies to other creation functions.
-    */
-   const struct glsl_type *push_iface_type = glsl_interface_type(
-      push_fields, ARRAY_SIZE(push_fields), GLSL_INTERFACE_PACKING_STD140,
-      false /* row_major */, "push");
-   nir_variable *push = nir_variable_create(b->shader, nir_var_mem_push_const,
-                                            push_iface_type, "push");
-
-   b->shader->info.workgroup_size[0] = 1;
-   b->shader->info.workgroup_size[1] = 1;
-   b->shader->info.workgroup_size[2] = 1;
-
-   libkk_copy_queries(b, load_struct_var(b, push, 0),
-                      load_struct_var(b, push, 1), load_struct_var(b, push, 2),
-                      load_struct_var(b, push, 3), load_struct_var(b, push, 4),
-                      load_struct_var(b, push, 5), load_struct_var(b, push, 6),
-                      load_struct_var(b, push, 7));
-
-   return build.shader;
-}
-
-static nir_shader *
-create_triangle_fan_shader()
-{
-   nir_builder build = nir_builder_init_simple_shader(
-      MESA_SHADER_COMPUTE, NULL, "kk-device-unroll-geomtry-and-restart");
-   nir_builder *b = &build;
-
-   struct glsl_struct_field push_fields[] = {
-      {.type = glsl_uint64_t_type(), .name = "index_buffer", .offset = 0},
-      {.type = glsl_uint64_t_type(), .name = "out_ptr", .offset = 8},
-      {.type = glsl_uint64_t_type(), .name = "indirect_in", .offset = 16},
-      {.type = glsl_uint64_t_type(), .name = "indirect_out", .offset = 24},
-      {.type = glsl_uint_type(), .name = "restart_index", .offset = 32},
-      {.type = glsl_uint_type(), .name = "index_buffer_size_el", .offset = 36},
-      {.type = glsl_uint_type(), .name = "in_el_size_B,", .offset = 40},
-      {.type = glsl_uint_type(), .name = "out_el_size_B,", .offset = 44},
-      {.type = glsl_uint_type(), .name = "flatshade_first", .offset = 48},
-      {.type = glsl_uint_type(), .name = "mode", .offset = 52},
-   };
-   const struct glsl_type *push_iface_type = glsl_interface_type(
-      push_fields, ARRAY_SIZE(push_fields), GLSL_INTERFACE_PACKING_STD140,
-      false /* row_major */, "push");
-   nir_variable *push = nir_variable_create(b->shader, nir_var_mem_push_const,
-                                            push_iface_type, "push");
-
-   b->shader->info.workgroup_size[0] = 1;
-   b->shader->info.workgroup_size[1] = 1;
-   b->shader->info.workgroup_size[2] = 1;
-
-   libkk_unroll_geometry_and_restart(
-      b, load_struct_var(b, push, 0), load_struct_var(b, push, 1),
-      load_struct_var(b, push, 2), load_struct_var(b, push, 3),
-      load_struct_var(b, push, 4), load_struct_var(b, push, 5),
-      load_struct_var(b, push, 6), load_struct_var(b, push, 7),
-      load_struct_var(b, push, 8), load_struct_var(b, push, 9));
-
-   return build.shader;
-}
-
-static struct {
-   enum kk_device_lib_pipeline ndx;
-   nir_shader *(*create_shader_fn)();
-} lib_shaders[KK_LIB_COUNT] = {
-   {KK_LIB_IMM_WRITE, create_imm_write_shader},
-   {KK_LIB_COPY_QUERY, create_copy_query_shader},
-   {KK_LIB_TRIANGLE_FAN, create_triangle_fan_shader},
-};
-static_assert(ARRAY_SIZE(lib_shaders) == KK_LIB_COUNT,
-              "Device lib shader count and created shader count mismatch");
-
-VkResult
-kk_device_init_lib(struct kk_device *dev)
+static VkResult
+build_precompiled_shaders(struct kk_device *dev)
 {
    VkResult result = VK_SUCCESS;
    uint32_t i = 0u;
-   for (; i < KK_LIB_COUNT; ++i) {
-      nir_shader *s = lib_shaders[i].create_shader_fn();
-      if (!s)
-         goto fail;
+   for (; i < LIBKK_NUM_PROGRAMS; ++i) {
+      const uint32_t *bin = libkk_AppleSilicon[i];
+      const struct kk_precompiled_info *info = (void *)bin;
+      const char *msl = (const char *)bin + sizeof(*info);
 
-      struct kk_shader *shader = NULL;
-      result = kk_compile_nir_shader(dev, s, &dev->vk.alloc, &shader);
-      if (result != VK_SUCCESS)
-         goto fail;
-
-      mtl_library *library = mtl_new_library(dev->mtl_handle, shader->msl_code);
+      mtl_library *library = mtl_new_library(dev->mtl_handle, msl);
       if (library == NULL)
          goto fail;
 
-      uint32_t local_size_threads = shader->info.cs.local_size.x *
-                                    shader->info.cs.local_size.y *
-                                    shader->info.cs.local_size.z;
+      struct kk_precompiled_shader *shader = &dev->precompiled_cache.shaders[i];
+
+      /* TODO_KOSMICKRISP Do not hardcode the entrypoint */
       mtl_function *function =
-         mtl_new_function_with_name(library, shader->entrypoint_name);
-      dev->lib_pipelines[i] = mtl_new_compute_pipeline_state(
+         mtl_new_function_with_name(library, "main_entrypoint");
+      uint32_t local_size_threads = info->workgroup_size[0] *
+                                    info->workgroup_size[1] *
+                                    info->workgroup_size[2];
+      shader->pipeline = mtl_new_compute_pipeline_state(
          dev->mtl_handle, function, local_size_threads);
       mtl_release(function);
       mtl_release(library);
 
-      /* We no longer need the shader. Although it may be useful to keep it
-       * alive for the info maybe? */
-      shader->vk.ops->destroy(&dev->vk, &shader->vk, &dev->vk.alloc);
-
-      if (!dev->lib_pipelines[i])
+      if (!shader->pipeline)
          goto fail;
+
+      memcpy(&shader->info, info, sizeof(*info));
    }
 
    return result;
 
 fail:
    for (uint32_t j = 0u; j < i; ++j)
-      mtl_release(dev->lib_pipelines[j]);
+      mtl_release(dev->precompiled_cache.shaders[j].pipeline);
    return vk_error(dev, result);
 }
 
+VkResult
+kk_device_init_lib(struct kk_device *dev)
+{
+   return build_precompiled_shaders(dev);
+}
+
 void
 kk_device_finish_lib(struct kk_device *dev)
 {
-   for (uint32_t i = 0; i < KK_LIB_COUNT; ++i)
-      mtl_release(dev->lib_pipelines[i]);
+   for (uint32_t i = 0; i < LIBKK_NUM_PROGRAMS; ++i)
+      mtl_release(dev->precompiled_cache.shaders[i].pipeline);
 }
diff --git a/src/kosmickrisp/vulkan/kk_encoder.c b/src/kosmickrisp/vulkan/kk_encoder.c
index 2790e33ec80..35774bd85c9 100644
--- a/src/kosmickrisp/vulkan/kk_encoder.c
+++ b/src/kosmickrisp/vulkan/kk_encoder.c
@@ -13,7 +13,8 @@
 #include "kosmickrisp/bridge/mtl_bridge.h"
 #include "kosmickrisp/bridge/vk_to_mtl_map.h"
 
-#include "cl/kk_query.h"
+#include "kosmickrisp/libkk/kk_query.h"
+#include "libkk_shaders.h"
 
 static void
 kk_encoder_start_internal(struct kk_encoder_internal *encoder,
@@ -134,8 +135,6 @@ upload_queue_writes(struct kk_cmd_buffer *cmd)
        enc->copy_query_pool_result_infos.size == 0u)
       return;
 
-   struct kk_device *dev = kk_cmd_buffer_device(cmd);
-   mtl_compute_encoder *compute = kk_compute_encoder(cmd);
    uint32_t count = util_dynarray_num_elements(&enc->imm_writes, uint64_t) / 2u;
    if (count != 0) {
       struct kk_bo *bo = kk_cmd_allocate_buffer(cmd, enc->imm_writes.size, 8u);
@@ -143,27 +142,23 @@ upload_queue_writes(struct kk_cmd_buffer *cmd)
       if (!bo)
          return;
       memcpy(bo->cpu, enc->imm_writes.data, enc->imm_writes.size);
-      struct kk_imm_write_push push_data = {
-         .buffer_address = bo->gpu,
-         .count = count,
-      };
-      kk_cmd_dispatch_pipeline(cmd, compute,
-                               kk_device_lib_pipeline(dev, KK_LIB_IMM_WRITE),
-                               &push_data, sizeof(push_data), count, 1, 1);
+      struct mtl_size grid = {count, 1, 1};
+      libkk_write_u64(cmd, grid, false, bo->gpu);
       enc->imm_writes.size = 0u;
    }
 
    count = util_dynarray_num_elements(&enc->copy_query_pool_result_infos,
-                                      struct kk_copy_query_pool_results_info);
+                                      struct libkk_copy_queries_args);
    if (count != 0u) {
       for (uint32_t i = 0u; i < count; ++i) {
          struct kk_copy_query_pool_results_info *push_data =
             util_dynarray_element(&enc->copy_query_pool_result_infos,
                                   struct kk_copy_query_pool_results_info, i);
 
-         kk_cmd_dispatch_pipeline(
-            cmd, compute, kk_device_lib_pipeline(dev, KK_LIB_COPY_QUERY),
-            push_data, sizeof(*push_data), push_data->query_count, 1, 1);
+         struct mtl_size grid = {push_data->query_count, 1, 1};
+         const struct libkk_copy_queries_args *data =
+            (const struct libkk_copy_queries_args *)push_data;
+         libkk_copy_queries_struct(cmd, grid, false, *data);
       }
       enc->copy_query_pool_result_infos.size = 0u;
    }
@@ -330,7 +325,7 @@ kk_blit_encoder(struct kk_cmd_buffer *cmd)
    return (mtl_blit_encoder *)encoder->encoder;
 }
 
-static mtl_compute_encoder *
+mtl_compute_encoder *
 kk_encoder_pre_gfx_encoder(struct kk_cmd_buffer *cmd)
 {
    struct kk_encoder *encoder = cmd->encoder;
@@ -352,26 +347,12 @@ kk_encoder_pre_gfx_encoder(struct kk_cmd_buffer *cmd)
    return encoder->pre_gfx.encoder;
 }
 
-struct kk_triangle_fan_info {
-   uint64_t index_buffer;
-   uint64_t out_ptr;
-   uint64_t in_draw;
-   uint64_t out_draw;
-   uint32_t restart_index;
-   uint32_t index_buffer_size_el;
-   uint32_t in_el_size_B;
-   uint32_t out_el_size_B;
-   uint32_t flatshade_first;
-   uint32_t mode;
-};
-
 static void
-kk_encoder_render_triangle_fan_common(struct kk_cmd_buffer *cmd,
-                                      struct kk_triangle_fan_info *info,
-                                      mtl_buffer *indirect, mtl_buffer *index,
-                                      uint32_t index_count,
-                                      uint32_t in_el_size_B,
-                                      uint32_t out_el_size_B)
+kk_encoder_render_triangle_fan_common(
+   struct kk_cmd_buffer *cmd,
+   struct libkk_unroll_geometry_and_restart_args *info, mtl_buffer *indirect,
+   mtl_buffer *index, uint32_t index_count, uint32_t in_el_size_B,
+   uint32_t out_el_size_B)
 {
    uint32_t index_buffer_size_B = index_count * out_el_size_B;
    uint32_t buffer_size_B =
@@ -387,12 +368,9 @@ kk_encoder_render_triangle_fan_common(struct kk_cmd_buffer *cmd,
    info->in_el_size_B = in_el_size_B;
    info->out_el_size_B = out_el_size_B;
    info->flatshade_first = true;
-   mtl_compute_encoder *encoder = kk_encoder_pre_gfx_encoder(cmd);
 
-   struct kk_device *dev = kk_cmd_buffer_device(cmd);
-   kk_cmd_dispatch_pipeline(cmd, encoder,
-                            kk_device_lib_pipeline(dev, KK_LIB_TRIANGLE_FAN),
-                            info, sizeof(*info), 1u, 1u, 1u);
+   struct mtl_size grid = {1, 1, 1};
+   libkk_unroll_geometry_and_restart_struct(cmd, grid, true, *info);
 
    enum mtl_index_type index_type =
       index_size_in_bytes_to_mtl_index_type(out_el_size_B);
@@ -411,7 +389,7 @@ kk_encoder_render_triangle_fan_indirect(struct kk_cmd_buffer *cmd,
       u_decomposed_prims_for_vertices(mode, cmd->state.gfx.vb.max_vertices) *
       mesa_vertices_per_prim(mode);
    uint32_t el_size_B = decomposed_index_count < UINT16_MAX ? 2u : 4u;
-   struct kk_triangle_fan_info info = {
+   struct libkk_unroll_geometry_and_restart_args info = {
       .in_draw = mtl_buffer_get_gpu_address(indirect) + offset,
       .restart_index = UINT32_MAX, /* No restart */
       .mode = mode,
@@ -437,7 +415,7 @@ kk_encoder_render_triangle_fan_indexed_indirect(struct kk_cmd_buffer *cmd,
       u_decomposed_prims_for_vertices(mode, max_index_count) *
       mesa_vertices_per_prim(mode);
 
-   struct kk_triangle_fan_info info = {
+   struct libkk_unroll_geometry_and_restart_args info = {
       .index_buffer = mtl_buffer_get_gpu_address(cmd->state.gfx.index.handle) +
                       cmd->state.gfx.index.offset,
       .in_draw = mtl_buffer_get_gpu_address(indirect) + offset,
diff --git a/src/kosmickrisp/vulkan/kk_encoder.h b/src/kosmickrisp/vulkan/kk_encoder.h
index eb820ba604f..8708141b4b7 100644
--- a/src/kosmickrisp/vulkan/kk_encoder.h
+++ b/src/kosmickrisp/vulkan/kk_encoder.h
@@ -101,6 +101,8 @@ mtl_compute_encoder *kk_compute_encoder(struct kk_cmd_buffer *cmd);
 
 mtl_blit_encoder *kk_blit_encoder(struct kk_cmd_buffer *cmd);
 
+mtl_compute_encoder *kk_encoder_pre_gfx_encoder(struct kk_cmd_buffer *cmd);
+
 void upload_queue_writes(struct kk_cmd_buffer *cmd);
 
 void kk_encoder_render_triangle_fan_indirect(struct kk_cmd_buffer *cmd,
diff --git a/src/kosmickrisp/vulkan/kk_query_pool.c b/src/kosmickrisp/vulkan/kk_query_pool.c
index 987b9299893..11b2bcf4bdd 100644
--- a/src/kosmickrisp/vulkan/kk_query_pool.c
+++ b/src/kosmickrisp/vulkan/kk_query_pool.c
@@ -17,7 +17,6 @@
 #include "kk_entrypoints.h"
 #include "kk_physical_device.h"
 #include "kk_query_table.h"
-#include "kkcl.h"
 
 struct kk_query_report {
    uint64_t value;
diff --git a/src/kosmickrisp/vulkan/kk_shader.c b/src/kosmickrisp/vulkan/kk_shader.c
index a6413f90934..c337ee7f1c7 100644
--- a/src/kosmickrisp/vulkan/kk_shader.c
+++ b/src/kosmickrisp/vulkan/kk_shader.c
@@ -33,41 +33,7 @@ static const nir_shader_compiler_options *
 kk_get_nir_options(struct vk_physical_device *vk_pdev, mesa_shader_stage stage,
                    UNUSED const struct vk_pipeline_robustness_state *rs)
 {
-   static nir_shader_compiler_options options = {
-      .lower_fdph = true,
-      .has_fsub = true,
-      .has_isub = true,
-      .lower_extract_word = true,
-      .lower_extract_byte = true,
-      .lower_insert_word = true,
-      .lower_insert_byte = true,
-      .lower_fmod = true,
-      .discard_is_demote = true,
-      .instance_id_includes_base_index = true,
-      .lower_device_index_to_zero = true,
-      .lower_pack_64_2x32_split = true,
-      .lower_unpack_64_2x32_split = true,
-      .lower_pack_64_2x32 = true,
-      .lower_pack_half_2x16 = true,
-      .lower_pack_split = true,
-      .lower_unpack_half_2x16 = true,
-      .has_cs_global_id = true,
-      .lower_fquantize2f16 = true,
-      .lower_scmp = true,
-      .lower_ifind_msb = true,
-      .lower_ufind_msb = true,
-      .lower_find_lsb = true,
-      .has_uclz = true,
-      .lower_mul_2x32_64 = true,
-      .lower_uadd_carry = true,
-      .lower_usub_borrow = true,
-      /* Metal does not support double. */
-      .lower_doubles_options = (nir_lower_doubles_options)(~0),
-      .lower_int64_options =
-         nir_lower_ufind_msb64 | nir_lower_subgroup_shuffle64,
-      .io_options = nir_io_mediump_is_32bit,
-   };
-   return &options;
+   return &kk_nir_options;
 }
 
 /* TODO_KOSMICKRISP Once we support robustness2, update these values. */
diff --git a/src/kosmickrisp/vulkan/meson.build b/src/kosmickrisp/vulkan/meson.build
index 41bf66d3eae..7ba3b3f5bc5 100644
--- a/src/kosmickrisp/vulkan/meson.build
+++ b/src/kosmickrisp/vulkan/meson.build
@@ -18,6 +18,7 @@ kk_device_dispatch_table_dependencies_list = [
   idep_vulkan_util,
   idep_vulkan_wsi,
   idep_vulkan_wsi_headers,
+  idep_libkk_shaders_h,
 ]
 
 libkk_device_dispatch_table = static_library(
@@ -100,11 +101,6 @@ kk_files = files(
   'kk_wsi.c',
 )
 
-kkcl_files = files(
-  'cl/kk_query.cl',
-  'cl/kk_triangle_fan.cl',
-)
-
 kk_entrypoints = custom_target(
   'kk_entrypoints',
   input : [vk_entrypoints_gen, vk_api_xml],
@@ -119,25 +115,6 @@ kk_entrypoints = custom_target(
 
 relative_dir = fs.relative_to(meson.global_source_root(), meson.global_build_root())
 
-kkcl_spv = custom_target(
-  input : kkcl_files,
-  output : 'kkcl.spv',
-  command : [
-    prog_mesa_clc, '-o', '@OUTPUT@', '--depfile', '@DEPFILE@', kkcl_files, '--',
-    '-I' + join_paths(meson.project_source_root(), 'src/compiler/libcl'),
-    '-I' + join_paths(meson.current_source_dir(), '.'),
-    '-I' + join_paths(meson.project_source_root(), 'src'),
-    cl_args,
-    ],
-  depfile : 'libkk_shaders.h.d',
-)
-
-kkcl = custom_target(
-  input : kkcl_spv,
-  output : ['kkcl.cpp', 'kkcl.h'],
-  command : [prog_vtn_bindgen2, '@INPUT@', '@OUTPUT@'],
-)
-
 kk_deps = [
   idep_nir,
   idep_mesautil,
@@ -148,7 +125,9 @@ kk_deps = [
   kk_device_dispatch_table_dependencies,
   idep_kk_device_dispatch_table,
   idep_msl_to_nir,
-  idep_mtl_bridge
+  idep_mtl_bridge,
+  idep_libkk,
+  idep_libkk_shaders_h,
 ]
 
 libkk = static_library(
@@ -156,7 +135,6 @@ libkk = static_library(
   [
     kk_files,
     kk_entrypoints,
-    kkcl,
     sha1_h,
   ],
   include_directories : [