diff --git a/src/gallium/drivers/zink/zink_compiler.c b/src/gallium/drivers/zink/zink_compiler.c index 50414a51037..75a16f41545 100644 --- a/src/gallium/drivers/zink/zink_compiler.c +++ b/src/gallium/drivers/zink/zink_compiler.c @@ -3879,9 +3879,25 @@ remove_interpolate_at_sample(struct nir_builder *b, nir_intrinsic_instr *interp, return true; } +static void +zink_optimized_st_emulation_passes(nir_shader *nir, struct zink_shader *zs, + const struct zink_st_variant_key *key) +{ + if (!nir->info.io_lowered) + return; +} + +static void +zink_emulation_passes(nir_shader *nir, struct zink_shader *zs) +{ + if (!nir->info.io_lowered) + return; +} + struct zink_shader_object -zink_shader_compile(struct zink_screen *screen, bool can_shobj, struct zink_shader *zs, - nir_shader *nir, const struct zink_shader_key *key, const void *extra_data, struct zink_program *pg) +zink_shader_compile(struct zink_screen *screen, bool can_shobj, struct zink_shader *zs, nir_shader *nir, + const struct zink_shader_key *key, const struct zink_st_variant_key *st_key, + bool compile_uber, const void *extra_data, struct zink_program *pg) { bool need_optimize = true; bool inlined_uniforms = false; @@ -3891,8 +3907,18 @@ zink_shader_compile(struct zink_screen *screen, bool can_shobj, struct zink_shad NIR_PASS(_, nir, nir_lower_sample_shading); } + if (compile_uber) { + zink_emulation_passes(nir, zs); + need_optimize = true; + } + else if (st_key) { + zink_optimized_st_emulation_passes(nir, zs, st_key); + need_optimize = true; + } + NIR_PASS(_, nir, add_derefs); NIR_PASS(_, nir, nir_lower_fragcolor, nir->info.fs.color_is_dual_source ? 
1 : 8); + if (key) { if (key->inline_uniforms) { NIR_PASS(_, nir, nir_inline_uniforms, @@ -4077,7 +4103,7 @@ zink_shader_compile(struct zink_screen *screen, bool can_shobj, struct zink_shad } struct zink_shader_object -zink_shader_compile_separate(struct zink_screen *screen, struct zink_shader *zs) +zink_shader_compile_separate(struct zink_screen *screen, struct zink_shader *zs, bool compile_uber) { nir_shader *nir = zs->nir; /* TODO: maybe compile multiple variants for different set counts for compact mode? */ @@ -4107,6 +4133,10 @@ zink_shader_compile_separate(struct zink_screen *screen, struct zink_shader *zs) default: break; } } + + if (compile_uber) + zink_emulation_passes(nir, zs); + NIR_PASS(_, nir, add_derefs); NIR_PASS(_, nir, nir_lower_fragcolor, nir->info.fs.color_is_dual_source ? 1 : 8); if (screen->driconf.inline_uniforms) { @@ -4114,6 +4144,7 @@ zink_shader_compile_separate(struct zink_screen *screen, struct zink_shader *zs) NIR_PASS(_, nir, rewrite_bo_access, screen); NIR_PASS(_, nir, remove_bo_access, zs); } + optimize_nir(nir, zs, true); zink_descriptor_shader_init(screen, zs); nir_shader *nir_clone = NULL; @@ -4128,7 +4159,7 @@ zink_shader_compile_separate(struct zink_screen *screen, struct zink_shader *zs) zs->non_fs.generated_tcs = zink_shader_tcs_create(screen, 32); zink_shader_tcs_init(screen, zs->non_fs.generated_tcs, nir_clone, &nir_tcs); nir_tcs->info.separate_shader = true; - zs->non_fs.generated_tcs->precompile.obj = zink_shader_compile_separate(screen, zs->non_fs.generated_tcs); + zs->non_fs.generated_tcs->precompile.obj = zink_shader_compile_separate(screen, zs->non_fs.generated_tcs, compile_uber); ralloc_free(nir_tcs); zs->non_fs.generated_tcs->nir = NULL; } @@ -6448,11 +6479,13 @@ gfx_shader_prune(struct zink_screen *screen, struct zink_shader *shader) prog->base.removed = true; simple_mtx_unlock(lock); - for (int i = 0; i < ARRAY_SIZE(prog->pipelines); ++i) { - hash_table_foreach(&prog->pipelines[i], table_entry) { - struct 
zink_gfx_pipeline_cache_entry *pc_entry = table_entry->data; + for (int r = 0; r < ARRAY_SIZE(prog->pipelines); ++r) { + for (int i = 0; i < ARRAY_SIZE(prog->pipelines[0]); ++i) { + hash_table_foreach(&prog->pipelines[r][i], table_entry) { + struct zink_gfx_pipeline_cache_entry *pc_entry = table_entry->data; - util_queue_fence_wait(&pc_entry->fence); + util_queue_fence_wait(&pc_entry->fence); + } } } } @@ -6468,7 +6501,10 @@ gfx_shader_prune(struct zink_screen *screen, struct zink_shader *shader) prog->shaders[MESA_SHADER_GEOMETRY]->non_fs.parent == shader) { prog->shaders[MESA_SHADER_GEOMETRY] = NULL; } - zink_gfx_program_reference(screen, &prog, NULL); + + /* variant programs are owned and destroyed by their parent */ + if (!prog->is_variant_program) + zink_gfx_program_reference(screen, &prog, NULL); return true; } diff --git a/src/gallium/drivers/zink/zink_compiler.h b/src/gallium/drivers/zink/zink_compiler.h index bec8fae913e..0c33a493b38 100644 --- a/src/gallium/drivers/zink/zink_compiler.h +++ b/src/gallium/drivers/zink/zink_compiler.h @@ -63,9 +63,11 @@ void zink_compiler_assign_io(struct zink_screen *screen, nir_shader *producer, nir_shader *consumer); /* pass very large shader key data with extra_data */ struct zink_shader_object -zink_shader_compile(struct zink_screen *screen, bool can_shobj, struct zink_shader *zs, nir_shader *nir, const struct zink_shader_key *key, const void *extra_data, struct zink_program *pg); +zink_shader_compile(struct zink_screen *screen, bool can_shobj, struct zink_shader *zs, nir_shader *nir, + const struct zink_shader_key *key, const struct zink_st_variant_key *st_key, + bool compile_uber, const void *extra_data, struct zink_program *pg); struct zink_shader_object -zink_shader_compile_separate(struct zink_screen *screen, struct zink_shader *zs); +zink_shader_compile_separate(struct zink_screen *screen, struct zink_shader *zs, bool compile_uber); struct zink_shader * zink_shader_create(struct zink_screen *screen, struct 
nir_shader *nir); void diff --git a/src/gallium/drivers/zink/zink_context.c b/src/gallium/drivers/zink/zink_context.c index 889699d80f2..413d46de30a 100644 --- a/src/gallium/drivers/zink/zink_context.c +++ b/src/gallium/drivers/zink/zink_context.c @@ -3721,8 +3721,8 @@ zink_update_descriptor_refs(struct zink_context *ctx, bool compute) res->obj->unordered_read = false; } } - if (ctx->curr_program) - zink_batch_reference_program(ctx, &ctx->curr_program->base); + if (ctx->curr_program_uber || ctx->curr_program) + zink_batch_reference_program(ctx, &ctx->curr_program_uber->base); } if (ctx->di.bindless_refs_dirty) { ctx->di.bindless_refs_dirty = false; diff --git a/src/gallium/drivers/zink/zink_draw.cpp b/src/gallium/drivers/zink/zink_draw.cpp index 7476d19edbb..a28024f4d67 100644 --- a/src/gallium/drivers/zink/zink_draw.cpp +++ b/src/gallium/drivers/zink/zink_draw.cpp @@ -265,11 +265,11 @@ update_gfx_pipeline(struct zink_context *ctx, struct zink_batch_state *bs, enum zink_gfx_program_update(ctx); bool pipeline_changed = false; VkPipeline pipeline = VK_NULL_HANDLE; - if (!ctx->curr_program->base.uses_shobj) { + if (!(ctx->gfx_pipeline_state.uber_required ? 
ctx->curr_program_uber : ctx->curr_program)->base.uses_shobj) { if (screen->info.have_EXT_graphics_pipeline_library) - pipeline = zink_get_gfx_pipeline(ctx, ctx->curr_program, &ctx->gfx_pipeline_state, mode); + pipeline = zink_get_gfx_pipeline(ctx, ctx->curr_program_uber, ctx->curr_program, &ctx->gfx_pipeline_state, mode); else - pipeline = zink_get_gfx_pipeline(ctx, ctx->curr_program, &ctx->gfx_pipeline_state, mode); + pipeline = zink_get_gfx_pipeline(ctx, ctx->curr_program, ctx->curr_program, &ctx->gfx_pipeline_state, mode); assert(pipeline); pipeline_changed = prev_pipeline != pipeline || ctx->shobj_draw; if (BATCH_CHANGED || pipeline_changed) @@ -285,7 +285,8 @@ update_gfx_pipeline(struct zink_context *ctx, struct zink_batch_state *bs, enum VK_SHADER_STAGE_FRAGMENT_BIT, }; /* always rebind all stages */ - VKCTX(CmdBindShadersEXT)(bs->cmdbuf, ZINK_GFX_SHADER_COUNT, stages, ctx->curr_program->objects); + VKCTX(CmdBindShadersEXT)(bs->cmdbuf, ZINK_GFX_SHADER_COUNT, stages, + ctx->gfx_pipeline_state.uber_required ? 
ctx->curr_program_uber->objects : ctx->curr_program->objects); if (screen->info.have_EXT_mesh_shader) { /* must always unbind mesh stages */ VkShaderStageFlagBits mesh_stages[] = { @@ -994,9 +995,9 @@ update_mesh_pipeline(struct zink_context *ctx, struct zink_batch_state *bs) VkPipeline pipeline = VK_NULL_HANDLE; if (!ctx->mesh_program->base.uses_shobj) { if (screen->info.have_EXT_graphics_pipeline_library) - pipeline = zink_get_gfx_pipeline(ctx, ctx->mesh_program, &ctx->gfx_pipeline_state, MESA_PRIM_COUNT); + pipeline = zink_get_gfx_pipeline(ctx, ctx->mesh_program, ctx->mesh_program, &ctx->gfx_pipeline_state, MESA_PRIM_COUNT); else - pipeline = zink_get_gfx_pipeline(ctx, ctx->mesh_program, &ctx->gfx_pipeline_state, MESA_PRIM_COUNT); + pipeline = zink_get_gfx_pipeline(ctx, ctx->mesh_program, ctx->mesh_program, &ctx->gfx_pipeline_state, MESA_PRIM_COUNT); assert(pipeline); pipeline_changed = prev_pipeline != pipeline || ctx->shobj_draw; if (BATCH_CHANGED || pipeline_changed) diff --git a/src/gallium/drivers/zink/zink_pipeline.c b/src/gallium/drivers/zink/zink_pipeline.c index a5363a98481..1114bf1186e 100644 --- a/src/gallium/drivers/zink/zink_pipeline.c +++ b/src/gallium/drivers/zink/zink_pipeline.c @@ -884,10 +884,10 @@ create_gfx_pipeline_library(struct zink_screen *screen, struct zink_shader_objec } VkPipeline -zink_create_gfx_pipeline_library(struct zink_screen *screen, struct zink_gfx_program *prog) +zink_create_gfx_pipeline_library(struct zink_screen *screen, struct zink_shader_object *objs, struct zink_gfx_program *prog) { u_rwlock_wrlock(&prog->base.pipeline_cache_lock); - VkPipeline pipeline = create_gfx_pipeline_library(screen, prog->objs, prog->stages_present, prog->base.layout, prog->base.pipeline_cache); + VkPipeline pipeline = create_gfx_pipeline_library(screen, objs, prog->stages_present, prog->base.layout, prog->base.pipeline_cache); u_rwlock_wrunlock(&prog->base.pipeline_cache_lock); return pipeline; } diff --git 
a/src/gallium/drivers/zink/zink_pipeline.h b/src/gallium/drivers/zink/zink_pipeline.h index 7b050f15efb..aa66cf0dc21 100644 --- a/src/gallium/drivers/zink/zink_pipeline.h +++ b/src/gallium/drivers/zink/zink_pipeline.h @@ -58,7 +58,7 @@ zink_create_gfx_pipeline_input(struct zink_screen *screen, const uint8_t *binding_map, VkPrimitiveTopology primitive_topology); VkPipeline -zink_create_gfx_pipeline_library(struct zink_screen *screen, struct zink_gfx_program *prog); +zink_create_gfx_pipeline_library(struct zink_screen *screen, struct zink_shader_object *objs, struct zink_gfx_program *prog); VkPipeline zink_create_gfx_pipeline_output(struct zink_screen *screen, struct zink_gfx_pipeline_state *state); VkPipeline diff --git a/src/gallium/drivers/zink/zink_program.c b/src/gallium/drivers/zink/zink_program.c index 00b1ee702a5..e63f4735a73 100644 --- a/src/gallium/drivers/zink/zink_program.c +++ b/src/gallium/drivers/zink/zink_program.c @@ -41,6 +41,7 @@ #include "nir_serialize.h" #include "nir.h" #include "nir/nir_draw_helpers.h" +#include "util/u_queue.h" /* for pipeline cache */ #define XXH_INLINE_ALL @@ -48,9 +49,34 @@ static void gfx_program_precompile_job(void *data, void *gdata, int thread_index); +static void +precompile_variant_job(void *data, void *gdata, int thread_index); +static void +precompile_variant_separate_shader_job(void *data, void *gdata, int thread_index); struct zink_gfx_program * create_gfx_program_separable(struct zink_context *ctx, struct zink_shader **stages, unsigned vertices_per_patch, bool is_mesh); +struct precompile_variant_data { + struct zink_gfx_program *prog; + struct zink_gfx_pipeline_state state; +}; + +struct precompile_separate_variant_data { + struct zink_program *prog; + struct zink_shader_module *zm; + struct zink_shader *zs; + struct blob *blob; + bool uses_shobj; + struct zink_shader_key key; + struct zink_st_variant_key st_key; + bool has_key; +}; + +struct program_variant_key { + uint32_t key, st_key; + struct 
zink_gfx_program *prog; +}; + void debug_describe_zink_gfx_program(char *buf, const struct zink_gfx_program *ptr) { @@ -69,9 +95,9 @@ shader_key_matches_tcs_nongenerated(const struct zink_shader_module *zm, const s if (zm->num_uniforms != num_uniforms || zm->has_nonseamless != !!key->base.nonseamless_cube_mask || zm->needs_zs_shader_swizzle != key->base.needs_zs_shader_swizzle) return false; - const uint32_t nonseamless_size = zm->has_nonseamless ? sizeof(uint32_t) : 0; - return (!nonseamless_size || !memcmp(zm->key + zm->key_size, &key->base.nonseamless_cube_mask, nonseamless_size)) && - (!num_uniforms || !memcmp(zm->key + zm->key_size + nonseamless_size, + const uint32_t nonseamless_size = zm->has_nonseamless ? sizeof(union zink_st_small_key) : 0; + return (!nonseamless_size || !memcmp(zm->key + zm->key_size + sizeof(union zink_st_small_key), &key->base.nonseamless_cube_mask, nonseamless_size)) && + (!num_uniforms || !memcmp(zm->key + zm->key_size + sizeof(union zink_st_small_key) + nonseamless_size, key->base.inlined_uniform_values, zm->num_uniforms * sizeof(uint32_t))); } @@ -84,13 +110,13 @@ shader_key_matches(const struct zink_shader_module *zm, if (has_inline) { if (zm->num_uniforms != num_uniforms || (num_uniforms && - memcmp(zm->key + zm->key_size + nonseamless_size, + memcmp(zm->key + zm->key_size + sizeof(union zink_st_small_key) + nonseamless_size, key->base.inlined_uniform_values, zm->num_uniforms * sizeof(uint32_t)))) return false; } if (!has_nonseamless) { if (zm->has_nonseamless != !!key->base.nonseamless_cube_mask || - (nonseamless_size && memcmp(zm->key + zm->key_size, &key->base.nonseamless_cube_mask, nonseamless_size))) + (nonseamless_size && memcmp(zm->key + zm->key_size + sizeof(union zink_st_small_key), &key->base.nonseamless_cube_mask, nonseamless_size))) return false; } if (zm->needs_zs_shader_swizzle != key->base.needs_zs_shader_swizzle) @@ -142,18 +168,19 @@ create_shader_module_for_stage(struct zink_context *ctx, struct zink_screen *scr 
const bool is_nongenerated_tcs = stage == MESA_SHADER_TESS_CTRL && !zs->non_fs.is_generated; const bool shadow_needs_shader_swizzle = key->base.needs_zs_shader_swizzle || (stage == MESA_SHADER_FRAGMENT && key->key.fs.base.shadow_needs_shader_swizzle); - zm = malloc(sizeof(struct zink_shader_module) + key->size + + zm = malloc(sizeof(struct zink_shader_module) + sizeof(union zink_st_small_key) + key->size + (!has_nonseamless ? nonseamless_size : 0) + inline_size * sizeof(uint32_t) + (shadow_needs_shader_swizzle ? sizeof(struct zink_zs_swizzle_key) : 0)); if (!zm) { return NULL; } + util_queue_fence_init(&zm->fence); unsigned patch_vertices = state->shader_keys.key[MESA_SHADER_TESS_CTRL].key.tcs.patch_vertices; if (stage == MESA_SHADER_TESS_CTRL && zs->non_fs.is_generated && zs->spirv) { assert(ctx); //TODO async zm->obj = zink_shader_tcs_compile(screen, zs, patch_vertices, prog->base.uses_shobj, &prog->base); } else { - zm->obj = zink_shader_compile(screen, prog->base.uses_shobj, zs, zink_shader_blob_deserialize(screen, &prog->blobs[stage]), key, &ctx->di.zs_swizzle[stage], &prog->base); + zm->obj = zink_shader_compile(screen, prog->base.uses_shobj, zs, zink_shader_blob_deserialize(screen, &prog->blobs[stage]), key, &state->st_key, false, &ctx->di.zs_swizzle[stage], &prog->base); } if (!zm->obj.mod) { FREE(zm); @@ -168,20 +195,22 @@ create_shader_module_for_stage(struct zink_context *ctx, struct zink_screen *scr zm->key_size = 0; memset(zm->key, 0, key->size); } + uint16_t st_val = state->st_key.small_key.val; + memcpy(zm->key + key->size, &st_val, sizeof(st_val)); if (!has_nonseamless && nonseamless_size) { /* nonseamless mask gets added to base key if it exists */ - memcpy(zm->key + key->size, &key->base.nonseamless_cube_mask, nonseamless_size); + memcpy(zm->key + key->size + sizeof(st_val), &key->base.nonseamless_cube_mask, nonseamless_size); } zm->needs_zs_shader_swizzle = shadow_needs_shader_swizzle; zm->has_nonseamless = has_nonseamless ? 
0 : !!nonseamless_size; if (inline_size) - memcpy(zm->key + key->size + nonseamless_size, key->base.inlined_uniform_values, inline_size * sizeof(uint32_t)); + memcpy(zm->key + key->size + sizeof(st_val) + nonseamless_size, key->base.inlined_uniform_values, inline_size * sizeof(uint32_t)); if (stage == MESA_SHADER_TESS_CTRL && zs->non_fs.is_generated) zm->hash = patch_vertices; else zm->hash = shader_module_hash(zm); if (unlikely(shadow_needs_shader_swizzle)) { - memcpy(zm->key + key->size + nonseamless_size + inline_size * sizeof(uint32_t), &ctx->di.zs_swizzle[stage], sizeof(struct zink_zs_swizzle_key)); + memcpy(zm->key + key->size + sizeof(st_val) + nonseamless_size + inline_size * sizeof(uint32_t), &ctx->di.zs_swizzle[stage], sizeof(struct zink_zs_swizzle_key)); zm->hash ^= _mesa_hash_data(&ctx->di.zs_swizzle[stage], sizeof(struct zink_zs_swizzle_key)); } zm->default_variant = !shadow_needs_shader_swizzle && !inline_size && !util_dynarray_contains(&prog->shader_cache[stage][0][0], void*); @@ -219,9 +248,12 @@ get_shader_module_for_stage(struct zink_context *ctx, struct zink_screen *screen continue; if (!shader_key_matches(iter, key, inline_size, has_inline, has_nonseamless)) continue; + uint16_t st_val = state->st_key.small_key.val; + if (memcmp(iter->key + iter->key_size, &st_val, sizeof(st_val))) + continue; if (unlikely(shadow_needs_shader_swizzle)) { /* shadow swizzle data needs a manual compare since it's so fat */ - if (memcmp(iter->key + iter->key_size + nonseamless_size + iter->num_uniforms * sizeof(uint32_t), + if (memcmp(iter->key + iter->key_size + sizeof(st_val) + nonseamless_size + iter->num_uniforms * sizeof(uint32_t), &ctx->di.zs_swizzle[stage], sizeof(struct zink_zs_swizzle_key))) continue; } @@ -241,7 +273,8 @@ ALWAYS_INLINE static struct zink_shader_module * create_shader_module_for_stage_optimal(struct zink_context *ctx, struct zink_screen *screen, struct zink_shader *zs, struct zink_gfx_program *prog, mesa_shader_stage stage, - struct 
zink_gfx_pipeline_state *state) + struct zink_gfx_pipeline_state *state, + bool unpopulated, bool compile_uber) { struct zink_shader_module *zm; uint16_t *key; @@ -258,23 +291,30 @@ create_shader_module_for_stage_optimal(struct zink_context *ctx, struct zink_scr key = NULL; } size_t key_size = sizeof(uint16_t); - zm = calloc(1, sizeof(struct zink_shader_module) + (key ? key_size : 0) + (unlikely(shadow_needs_shader_swizzle) ? sizeof(struct zink_zs_swizzle_key) : 0)); + zm = calloc(1, sizeof(struct zink_shader_module) + + sizeof(union zink_st_small_key) + + (key ? key_size : 0) + + (unlikely(shadow_needs_shader_swizzle) ? sizeof(struct zink_zs_swizzle_key) : 0)); if (!zm) { return NULL; } - if (stage == MESA_SHADER_TESS_CTRL && zs->non_fs.is_generated && zs->spirv) { - assert(ctx || screen->info.dynamic_state2_feats.extendedDynamicState2PatchControlPoints); - unsigned patch_vertices = 3; - if (ctx) { - struct zink_tcs_key *tcs = (struct zink_tcs_key*)key; - patch_vertices = tcs->patch_vertices; - } - zm->obj = zink_shader_tcs_compile(screen, zs, patch_vertices, prog->base.uses_shobj, &prog->base); - } else { - zm->obj = zink_shader_compile(screen, prog->base.uses_shobj, zs, zink_shader_blob_deserialize(screen, &prog->blobs[stage]), - (struct zink_shader_key*)key, shadow_needs_shader_swizzle ? 
&ctx->di.zs_swizzle[stage] : NULL, &prog->base); + util_queue_fence_init(&zm->fence); + if (!unpopulated) { + if (stage == MESA_SHADER_TESS_CTRL && zs->non_fs.is_generated && zs->spirv) { + assert(ctx || screen->info.dynamic_state2_feats.extendedDynamicState2PatchControlPoints); + unsigned patch_vertices = 3; + if (ctx) { + struct zink_tcs_key *tcs = (struct zink_tcs_key*)key; + patch_vertices = tcs->patch_vertices; + } + zm->obj = zink_shader_tcs_compile(screen, zs, patch_vertices, prog->base.uses_shobj, &prog->base); + } else { + zm->obj = zink_shader_compile(screen, prog->base.uses_shobj, zs, zink_shader_blob_deserialize(screen, &prog->blobs[stage]), + (struct zink_shader_key*)key, &state->st_key, compile_uber, + shadow_needs_shader_swizzle ? &ctx->di.zs_swizzle[stage] : NULL, &prog->base); + } } - if (!zm->obj.mod) { + if (!zm->obj.mod && !unpopulated) { FREE(zm); return NULL; } @@ -288,9 +328,17 @@ create_shader_module_for_stage_optimal(struct zink_context *ctx, struct zink_scr *data = (*key) & mask; if (unlikely(shadow_needs_shader_swizzle)) memcpy(&data[1], &ctx->di.zs_swizzle[stage], sizeof(struct zink_zs_swizzle_key)); + uint16_t st_val = state->st_key.small_key.val; + uint8_t *p = (uint8_t*)&data[1]; + if (unlikely(shadow_needs_shader_swizzle)) + p += sizeof(struct zink_zs_swizzle_key); + memcpy(p, &st_val, sizeof(st_val)); } zm->default_variant = !util_dynarray_contains(&prog->shader_cache[stage][0][0], void*); - util_dynarray_append(&prog->shader_cache[stage][0][0], zm); + if (!compile_uber) + util_dynarray_append(&prog->shader_cache[stage][0][0], zm); + else + util_dynarray_append(&prog->uber_modules, zm); return zm; } @@ -298,7 +346,7 @@ ALWAYS_INLINE static struct zink_shader_module * get_shader_module_for_stage_optimal_key(struct zink_context *ctx, struct zink_screen *screen, struct zink_shader *zs, struct zink_gfx_program *prog, mesa_shader_stage stage, - struct zink_gfx_pipeline_state *state, uint16_t *key) + struct zink_gfx_pipeline_state *state, 
uint16_t *key, uint16_t *st_key) { /* non-generated tcs won't use the shader key */ const bool is_nongenerated_tcs = stage == MESA_SHADER_TESS_CTRL && !zs->non_fs.is_generated; @@ -324,6 +372,12 @@ get_shader_module_for_stage_optimal_key(struct zink_context *ctx, struct zink_sc if (memcmp(iter->key + sizeof(uint16_t), &ctx->di.zs_swizzle[stage], sizeof(struct zink_zs_swizzle_key))) continue; } + uint16_t st_val = *st_key; + uint8_t *p = iter->key + sizeof(union zink_st_small_key); + if (unlikely(shadow_needs_shader_swizzle)) + p += sizeof(struct zink_zs_swizzle_key); + if (memcmp(p, &st_val, sizeof(st_val))) + continue; } if (i > 0) { struct zink_shader_module *zero = pzm[0]; @@ -360,16 +414,18 @@ get_shader_module_for_stage_optimal(struct zink_context *ctx, struct zink_screen mesa_shader_stage stage, struct zink_gfx_pipeline_state *state) { - uint16_t *key; + uint16_t *key, st_key; key = get_shader_module_optimal_key(ctx, prog, zs, stage); + st_key = state->st_key.small_key.val; - return get_shader_module_for_stage_optimal_key(ctx, screen, zs, prog, stage, state, key); + return get_shader_module_for_stage_optimal_key(ctx, screen, zs, prog, stage, state, key, &st_key); } static void zink_destroy_shader_module(struct zink_screen *screen, struct zink_shader_module *zm) { + util_queue_fence_wait(&zm->fence); if (zm->shobj) VKSCR(DestroyShaderEXT)(screen->dev, zm->obj.obj, NULL); else @@ -480,7 +536,7 @@ generate_gfx_program_modules(struct zink_context *ctx, struct zink_screen *scree } static void -generate_gfx_program_modules_optimal(struct zink_context *ctx, struct zink_screen *screen, struct zink_gfx_program *prog, struct zink_gfx_pipeline_state *state) +generate_gfx_program_modules_optimal(struct zink_context *ctx, struct zink_screen *screen, struct zink_gfx_program *prog, struct zink_gfx_pipeline_state *state, bool compile_uber) { assert(!prog->objs[MESA_SHADER_VERTEX].mod && !prog->objs[MESA_SHADER_MESH].mod); for (unsigned i = 0; i < MESA_SHADER_MESH_STAGES; 
i++) { @@ -489,7 +545,7 @@ generate_gfx_program_modules_optimal(struct zink_context *ctx, struct zink_scree assert(prog->shaders[i]); - struct zink_shader_module *zm = create_shader_module_for_stage_optimal(ctx, screen, prog->shaders[i], prog, i, state); + struct zink_shader_module *zm = create_shader_module_for_stage_optimal(ctx, screen, prog->shaders[i], prog, i, state, false, compile_uber); prog->objs[i] = zm->obj; prog->objects[i] = zm->obj.obj; } @@ -498,21 +554,11 @@ generate_gfx_program_modules_optimal(struct zink_context *ctx, struct zink_scree state->modules_changed = true; else state->mesh_modules_changed = true; - prog->last_variant_hash = prog->shaders[MESA_SHADER_MESH] ? state->mesh_optimal_key : state->optimal_key; -} -static uint32_t -hash_pipeline_lib_generated_tcs(const void *key) -{ - const struct zink_gfx_library_key *gkey = key; - return gkey->optimal_key; -} - - -static bool -equals_pipeline_lib_generated_tcs(const void *a, const void *b) -{ - return !memcmp(a, b, sizeof(uint32_t)); + if (!compile_uber) { + prog->last_variant_hash = prog->shaders[MESA_SHADER_MESH] ? 
state->mesh_optimal_key : state->optimal_key; + prog->st_key = state->st_key.small_key.val; + } } static uint32_t @@ -530,25 +576,6 @@ equals_pipeline_lib_mesh(const void *a, const void *b) return ak->optimal_key == bk->optimal_key; } -static uint32_t -hash_pipeline_lib(const void *key) -{ - const struct zink_gfx_library_key *gkey = key; - /* remove generated tcs bits */ - return zink_shader_key_optimal_no_tcs(gkey->optimal_key); -} - -static bool -equals_pipeline_lib(const void *a, const void *b) -{ - const struct zink_gfx_library_key *ak = a; - const struct zink_gfx_library_key *bk = b; - /* remove generated tcs bits */ - uint32_t val_a = zink_shader_key_optimal_no_tcs(ak->optimal_key); - uint32_t val_b = zink_shader_key_optimal_no_tcs(bk->optimal_key); - return val_a == val_b; -} - uint32_t hash_gfx_input_dynamic(const void *key) { @@ -673,7 +700,7 @@ zink_gfx_program_update(struct zink_context *ctx) update_gfx_program(ctx, prog); } else { ctx->dirty_gfx_stages |= ctx->shader_stages; - prog = zink_create_gfx_program(ctx, ctx->gfx_stages, ctx->gfx_pipeline_state.dyn_state2.vertices_per_patch, hash, false); + prog = zink_create_gfx_program(ctx, ctx->gfx_stages, ctx->gfx_pipeline_state.dyn_state2.vertices_per_patch, hash, false, false); zink_screen_get_pipeline_cache(zink_screen(ctx->base.screen), &prog->base, false); _mesa_hash_table_insert_pre_hashed(ht, hash, prog->shaders, prog); prog->base.removed = false; @@ -682,7 +709,8 @@ zink_gfx_program_update(struct zink_context *ctx) simple_mtx_unlock(&ctx->program_lock[zink_program_cache_stages(ctx->shader_stages)]); if (prog && prog != ctx->curr_program) zink_batch_reference_program(ctx, &prog->base); - ctx->curr_program = prog; + ctx->curr_program_uber = ctx->curr_program = prog; + ctx->gfx_pipeline_state.uber_required = false; ctx->gfx_pipeline_state.final_hash ^= ctx->curr_program->last_variant_hash; ctx->gfx_dirty = false; } else if (ctx->dirty_gfx_stages) { @@ -695,41 +723,72 @@ zink_gfx_program_update(struct 
zink_context *ctx) ctx->dirty_gfx_stages = 0; } -ALWAYS_INLINE static bool -update_gfx_shader_module_optimal(struct zink_context *ctx, struct zink_gfx_program *prog, mesa_shader_stage pstage) +ALWAYS_INLINE static void +gfx_program_cache_populate_queue(struct zink_context *ctx, struct zink_gfx_program *prog, mesa_shader_stage pstage, struct zink_shader_module *zm) +{ + struct zink_screen *screen = zink_screen(ctx->base.screen); + if (screen->info.have_EXT_graphics_pipeline_library) + util_queue_fence_wait(&prog->base.cache_fence); + struct precompile_separate_variant_data *data = CALLOC_STRUCT(precompile_separate_variant_data); + data->prog = &prog->base; + data->zs = prog->shaders[pstage]; + data->blob = &prog->blobs[pstage]; + data->uses_shobj = prog->base.uses_shobj; + data->zm = zm; + struct zink_shader_key* keyp = (struct zink_shader_key*)get_shader_module_optimal_key(ctx, prog, data->zs, pstage); + if (keyp) + data->key = *keyp; + data->has_key = !!keyp; + data->st_key = ctx->gfx_pipeline_state.st_key; + if (zink_debug & ZINK_DEBUG_NOBGC) { + precompile_variant_separate_shader_job(data, screen, 0); + } else { + util_queue_add_job(&screen->cache_get_thread, data, &zm->fence, precompile_variant_separate_shader_job, NULL, 0); + } +} + +ALWAYS_INLINE static struct zink_shader_module * +update_or_queue_gfx_shader_module_optimal(struct zink_context *ctx, struct zink_gfx_program *prog, mesa_shader_stage pstage, bool async) { struct zink_screen *screen = zink_screen(ctx->base.screen); if (screen->info.have_EXT_graphics_pipeline_library) util_queue_fence_wait(&prog->base.cache_fence); struct zink_shader_module *zm = get_shader_module_for_stage_optimal(ctx, screen, prog->shaders[pstage], prog, pstage, &ctx->gfx_pipeline_state); + bool entry_found = !!zm; + bool async_done = zm && util_queue_fence_is_signalled(&zm->fence); if (!zm) { - zm = create_shader_module_for_stage_optimal(ctx, screen, prog->shaders[pstage], prog, pstage, &ctx->gfx_pipeline_state); + zm = 
create_shader_module_for_stage_optimal(ctx, screen, prog->shaders[pstage], prog, pstage, &ctx->gfx_pipeline_state, async, false); perf_debug(ctx, "zink[gfx_compile]: %s shader variant required\n", _mesa_shader_stage_to_string(pstage)); } - - bool changed = prog->objs[pstage].mod != zm->obj.mod; - prog->objs[pstage] = zm->obj; - prog->objects[pstage] = zm->obj.obj; - return changed; + if (!async || async_done) { + return zm; + } else { + if (!entry_found) + gfx_program_cache_populate_queue(ctx, prog, pstage, zm); + } + return NULL; } -static void -update_gfx_program_optimal(struct zink_context *ctx, struct zink_gfx_program *prog) +static bool +update_gfx_program_optimal(struct zink_context *ctx, struct zink_gfx_program *prog, struct zink_gfx_program *variant_prog, bool async) { + bool async_done = true; + struct zink_shader_module *zms[3] = {0}; const union zink_shader_key_optimal *key = (union zink_shader_key_optimal*)&ctx->gfx_pipeline_state.optimal_key; const union zink_shader_key_optimal *last_prog_key = (union zink_shader_key_optimal*)&prog->last_variant_hash; - if (key->vs_bits != last_prog_key->vs_bits) { - assert(!prog->is_separable); - bool changed = update_gfx_shader_module_optimal(ctx, prog, ctx->last_vertex_stage->info.stage); - ctx->gfx_pipeline_state.modules_changed |= changed; + bool st_key_diff = ctx->gfx_pipeline_state.st_key.small_key.val != prog->st_key; + if (st_key_diff || key->vs_bits != last_prog_key->vs_bits) { + assert(!variant_prog->is_separable); + zms[0] = update_or_queue_gfx_shader_module_optimal(ctx, prog, ctx->last_vertex_stage->info.stage, async); + async_done &= !!zms[0]; } const bool shadow_needs_shader_swizzle = last_prog_key->fs.shadow_needs_shader_swizzle && (ctx->dirty_gfx_stages & BITFIELD_BIT(MESA_SHADER_FRAGMENT)); - if (key->fs_bits != last_prog_key->fs_bits || + if (st_key_diff || key->fs_bits != last_prog_key->fs_bits || /* always recheck shadow swizzles since they aren't directly part of the key */ 
unlikely(shadow_needs_shader_swizzle)) { - assert(!prog->is_separable); - bool changed = update_gfx_shader_module_optimal(ctx, prog, MESA_SHADER_FRAGMENT); - ctx->gfx_pipeline_state.modules_changed |= changed; + assert(!variant_prog->is_separable); + zms[1] = update_or_queue_gfx_shader_module_optimal(ctx, prog, MESA_SHADER_FRAGMENT, async); + async_done &= !!zms[1]; if (unlikely(shadow_needs_shader_swizzle)) { struct zink_shader_module **pzm = prog->shader_cache[MESA_SHADER_FRAGMENT][0][0].data; ctx->gfx_pipeline_state.shadow = (struct zink_zs_swizzle_key*)pzm[0]->key + sizeof(uint16_t); @@ -737,11 +796,77 @@ update_gfx_program_optimal(struct zink_context *ctx, struct zink_gfx_program *pr } if (prog->shaders[MESA_SHADER_TESS_CTRL] && prog->shaders[MESA_SHADER_TESS_CTRL]->non_fs.is_generated && key->tcs_bits != last_prog_key->tcs_bits) { - assert(!prog->is_separable); - bool changed = update_gfx_shader_module_optimal(ctx, prog, MESA_SHADER_TESS_CTRL); - ctx->gfx_pipeline_state.modules_changed |= changed; + assert(!variant_prog->is_separable); + zms[2] = update_or_queue_gfx_shader_module_optimal(ctx, prog, MESA_SHADER_TESS_CTRL, async); + async_done &= !!zms[2]; } - prog->last_variant_hash = ctx->gfx_pipeline_state.optimal_key; + mesa_shader_stage stages[] = {ctx->last_vertex_stage->info.stage, MESA_SHADER_FRAGMENT, MESA_SHADER_TESS_CTRL}; + if (async_done) { + for (int i = 0;i < 3; i++) { + if (!zms[i]) + continue; + variant_prog->objs[stages[i]] = zms[i]->obj; + variant_prog->objects[stages[i]] = zms[i]->obj.obj; + } + variant_prog->last_variant_hash = prog->last_variant_hash = ctx->gfx_pipeline_state.optimal_key; + variant_prog->st_key = prog->st_key = ctx->gfx_pipeline_state.st_key.small_key.val; + } + return async_done; +} + +static bool +update_gfx_program_missing_shaders(struct zink_context *ctx, struct zink_gfx_program *prog, + struct zink_gfx_program *variant_prog, bool async) +{ + bool async_done = true; + for (int rstage = 0; rstage < MESA_SHADER_COMPUTE; 
rstage++) { + assert(!!variant_prog->shaders[rstage] == !!prog->shaders[rstage]); + if (variant_prog->shaders[rstage] && !variant_prog->objs[rstage].mod) { + assert(!variant_prog->is_separable); + struct zink_shader_module *mod = update_or_queue_gfx_shader_module_optimal(ctx, prog, rstage, async); + async_done &= !!mod; + if (mod) { + bool changed = variant_prog->objs[rstage].mod != mod->obj.mod; + variant_prog->objs[rstage] = mod->obj; + variant_prog->objects[rstage] = mod->obj.obj; + ctx->gfx_pipeline_state.modules_changed |= changed; + } + } + } + return async_done; +} + +static void +copy_gfx_program_missing_shaders(struct zink_context *ctx, struct zink_gfx_program *base_prog, + struct zink_gfx_program *variant_prog) +{ + for (int rstage = 0; rstage < MESA_SHADER_COMPUTE; rstage++) { + assert(!!variant_prog->shaders[rstage] == !!base_prog->shaders[rstage]); + if (variant_prog->shaders[rstage] && !variant_prog->objs[rstage].mod) { + bool changed = variant_prog->objs[rstage].mod != base_prog->objs[rstage].mod; + variant_prog->objs[rstage] = base_prog->objs[rstage]; + variant_prog->objects[rstage] = base_prog->objects[rstage]; + ctx->gfx_pipeline_state.modules_changed |= changed; + } + } +} + +ALWAYS_INLINE static bool +update_gfx_shader_module_mesh(struct zink_context *ctx, struct zink_gfx_program *prog, mesa_shader_stage pstage) +{ + struct zink_screen *screen = zink_screen(ctx->base.screen); + if (screen->info.have_EXT_graphics_pipeline_library) + util_queue_fence_wait(&prog->base.cache_fence); + struct zink_shader_module *zm = get_shader_module_for_stage_optimal(ctx, screen, prog->shaders[pstage], prog, pstage, &ctx->gfx_pipeline_state); + if (!zm) { + zm = create_shader_module_for_stage_optimal(ctx, screen, prog->shaders[pstage], prog, pstage, &ctx->gfx_pipeline_state, false, false); + perf_debug(ctx, "zink[gfx_compile]: %s shader variant required\n", _mesa_shader_stage_to_string(pstage)); + } + + bool changed = prog->objs[pstage].mod != zm->obj.mod; + 
prog->objs[pstage] = zm->obj; + prog->objects[pstage] = zm->obj.obj; + return changed; } static void @@ -754,7 +879,7 @@ update_mesh_program_optimal(struct zink_context *ctx, struct zink_gfx_program *p /* always recheck shadow swizzles since they aren't directly part of the key */ unlikely(shadow_needs_shader_swizzle)) { assert(!prog->is_separable); - bool changed = update_gfx_shader_module_optimal(ctx, prog, MESA_SHADER_FRAGMENT); + bool changed = update_gfx_shader_module_mesh(ctx, prog, MESA_SHADER_FRAGMENT); ctx->gfx_pipeline_state.modules_changed |= changed; if (unlikely(shadow_needs_shader_swizzle)) { struct zink_shader_module **pzm = prog->shader_cache[MESA_SHADER_FRAGMENT][0][0].data; @@ -771,7 +896,7 @@ replace_separable_prog(struct zink_context *ctx, struct hash_entry *entry, struc struct zink_gfx_program *real = prog->full_prog ? prog->full_prog : /* this will be NULL with ZINK_DEBUG_NOOPT */ - zink_create_gfx_program(ctx, ctx->gfx_stages, ctx->gfx_pipeline_state.dyn_state2.vertices_per_patch, ctx->gfx_hash, false); + zink_create_gfx_program(ctx, ctx->gfx_stages, ctx->gfx_pipeline_state.dyn_state2.vertices_per_patch, ctx->gfx_hash, false, false); entry->data = real; entry->key = real->shaders; real->base.removed = false; @@ -780,12 +905,116 @@ replace_separable_prog(struct zink_context *ctx, struct hash_entry *entry, struc return real; } +static uint32_t +hash_gfx_program(const void *key) +{ + const uint32_t *k = key; + + return XXH32(k, sizeof(uint32_t[2]), 0); +} + +static bool +equals_program_variant(const void *a, const void *b) +{ + const struct program_variant_key *ak = a; + const struct program_variant_key *bk = b; + uint32_t val_a = ak->key; + uint32_t val_b = bk->key; + uint32_t val_a_st = ak->st_key; + uint32_t val_b_st = bk->st_key; + return val_a == val_b && val_a_st == val_b_st; +} + +#define CURR_KEY_PROGRAM(ctx) (ctx->gfx_pipeline_state.uber_required ? 
ctx->curr_program_uber: ctx->curr_program) + +static void +async_variant_program_update(struct zink_context *ctx, bool can_use_uber, bool needs_emulation) +{ + struct zink_screen *screen = zink_screen(ctx->base.screen); + bool needs_uber = false; + if (!ctx->curr_program_uber->is_separable && (!ctx->curr_program_uber->base_variant || needs_emulation)) { + struct program_variant_key prog_variant_key = {0}; + prog_variant_key.key = ctx->gfx_pipeline_state.shader_keys_optimal.key.val;//ctx->gfx_pipeline_state.optimal_key; + prog_variant_key.st_key = ctx->gfx_pipeline_state.st_key.small_key.val; + struct set_entry * variant_entry = _mesa_set_search(&ctx->curr_program_uber->variants, &prog_variant_key); + struct zink_gfx_program *variant; + if (!variant_entry) { + variant = zink_create_gfx_program(ctx, ctx->gfx_stages, 0, ctx->curr_program_uber->gfx_hash, false, true); + variant->base.uses_shobj = ctx->curr_program_uber->base.uses_shobj; + util_queue_fence_init(&variant->base.cache_fence); + struct program_variant_key *prog_variant_key_p = MALLOC(sizeof(struct program_variant_key)); + memcpy(prog_variant_key_p, &prog_variant_key, sizeof(struct program_variant_key)); + prog_variant_key_p->prog = variant; + variant->uber_variant = ctx->curr_program_uber; + _mesa_set_add(&ctx->curr_program_uber->variants, prog_variant_key_p); + needs_uber = true; + } else + variant = ((struct program_variant_key *)variant_entry->key)->prog; + /* fetches shader modules from cache and starts async compilation on a miss */ + bool async_done = update_gfx_program_optimal(ctx, ctx->curr_program_uber, variant, can_use_uber); + assert(can_use_uber || async_done); + if (async_done) { + if (ctx->curr_program_uber->base_variant) + copy_gfx_program_missing_shaders(ctx, ctx->curr_program_uber->base_variant, variant); + else + async_done = update_gfx_program_missing_shaders(ctx, ctx->curr_program_uber, variant, can_use_uber); + } + assert(can_use_uber || async_done); + needs_uber &= !async_done; + + if 
(async_done && !variant->started_compiling) { + /* Modules are ready but the program isn't. Start a job for it. */ + struct precompile_variant_data *data = CALLOC_STRUCT(precompile_variant_data); + data->prog = variant; + data->state = ctx->gfx_pipeline_state; + if (can_use_uber && !(zink_debug & ZINK_DEBUG_NOBGC)) + util_queue_add_job(&screen->cache_get_thread, data, &variant->base.cache_fence, precompile_variant_job, NULL, 0); + else + precompile_variant_job(data, screen, 0); + variant->started_compiling = true; + } + if (!can_use_uber) + util_queue_fence_wait(&variant->base.cache_fence); + bool variant_prog_ready = variant->started_compiling && + (!can_use_uber || util_queue_fence_is_signalled(&variant->base.cache_fence)); + assert(can_use_uber || variant_prog_ready); + if(variant_prog_ready) { + /* variant prog is ready, use it */ + if (ctx->curr_program != variant) { + ctx->gfx_pipeline_state.modules_changed = true; + ctx->curr_program = variant; + } + assert(async_done); + if (!needs_emulation) + ctx->curr_program_uber->base_variant = variant; + } + needs_uber |= !async_done || !variant_prog_ready; + } else if (ctx->curr_program_uber->base_variant && !needs_emulation) { + ctx->curr_program = ctx->curr_program_uber->base_variant; + ctx->curr_program_uber->last_variant_hash = ctx->curr_program->last_variant_hash; + ctx->curr_program_uber->st_key = ctx->curr_program->st_key; + needs_uber = false; + } else if (ctx->curr_program_uber->is_separable) { + assert(can_use_uber); + ctx->curr_program = ctx->curr_program_uber; + needs_uber = true; + } + if (ctx->gfx_pipeline_state.uber_required != needs_uber) { + ctx->gfx_pipeline_state.modules_changed = true; + ctx->gfx_pipeline_state.uber_required = needs_uber; + } + + if (needs_uber || !ctx->curr_program_uber) + ctx->curr_program = ctx->curr_program_uber; +} + void zink_gfx_program_update_optimal(struct zink_context *ctx) { MESA_TRACE_FUNC(); struct zink_screen *screen = zink_screen(ctx->base.screen); 
assert(!ctx->gfx_stages[MESA_SHADER_TESS_CTRL] || !ctx->gfx_stages[MESA_SHADER_TESS_CTRL]->non_fs.is_generated); + struct zink_gfx_program *old_prog = ctx->curr_program_uber; if (ctx->gfx_dirty) { struct zink_gfx_program *prog = NULL; ctx->gfx_pipeline_state.optimal_key = zink_sanitize_optimal_key(ctx->gfx_stages, ctx->gfx_pipeline_state.shader_keys_optimal.key.val); @@ -794,72 +1023,93 @@ zink_gfx_program_update_optimal(struct zink_context *ctx) simple_mtx_lock(&ctx->program_lock[zink_program_cache_stages(ctx->shader_stages)]); struct hash_entry *entry = _mesa_hash_table_search_pre_hashed(ht, hash, ctx->gfx_stages); - if (ctx->curr_program) - ctx->gfx_pipeline_state.final_hash ^= ctx->curr_program->last_variant_hash; + if (CURR_KEY_PROGRAM(ctx)) { + ctx->gfx_pipeline_state.final_hash ^= CURR_KEY_PROGRAM(ctx)->last_variant_hash; + ctx->gfx_pipeline_state.final_hash ^= CURR_KEY_PROGRAM(ctx)->st_key; + } + bool needs_emulation = needs_st_emulation(ctx) || (ctx->gfx_pipeline_state.optimal_key != ZINK_SHADER_KEY_OPTIMAL_DEFAULT); + bool can_use_uber = zink_can_use_uber(ctx); if (entry) { prog = (struct zink_gfx_program*)entry->data; - bool must_replace = prog->base.uses_shobj ? !zink_can_use_shader_objects(ctx) : (prog->is_separable && !zink_can_use_pipeline_libs(ctx)); - if (prog->is_separable) { - /* shader variants can't be handled by separable programs: sync and compile */ - if (!ZINK_SHADER_KEY_OPTIMAL_IS_DEFAULT(ctx->gfx_pipeline_state.optimal_key) || must_replace) + if (prog->is_separable && !(zink_debug & ZINK_DEBUG_NOOPT)) { + /* if uber cannot be used we need to compile the variant synchrously, + * so we need the full prog: sync and compile */ + if (!can_use_uber) util_queue_fence_wait(&prog->base.cache_fence); /* If the optimized linked pipeline is done compiling, swap it into place. 
*/ - if (util_queue_fence_is_signalled(&prog->base.cache_fence) && - /* but only if needed for ZINK_DEBUG=noopt */ - (!(zink_debug & ZINK_DEBUG_NOOPT) || !ZINK_SHADER_KEY_OPTIMAL_IS_DEFAULT(ctx->gfx_pipeline_state.optimal_key) || must_replace)) { + if (util_queue_fence_is_signalled(&prog->base.cache_fence)) { prog = replace_separable_prog(ctx, entry, prog); } - } else if (must_replace) { - /* this is a non-separable, incompatible prog which needs replacement */ - struct zink_gfx_program *real = zink_create_gfx_program(ctx, ctx->gfx_stages, ctx->gfx_pipeline_state.dyn_state2.vertices_per_patch, ctx->gfx_hash, false); - generate_gfx_program_modules_optimal(ctx, screen, real, &ctx->gfx_pipeline_state); - entry->data = real; - entry->key = real->shaders; - real->base.removed = false; - prog->base.removed = true; - prog = real; - } else if (!prog->base.precompile_done) { - util_queue_fence_wait(&prog->base.cache_fence); } - update_gfx_program_optimal(ctx, prog); + ctx->curr_program_uber = prog; + async_variant_program_update(ctx, can_use_uber, needs_emulation); } else { ctx->dirty_gfx_stages |= ctx->shader_stages; prog = create_gfx_program_separable(ctx, ctx->gfx_stages, ctx->gfx_pipeline_state.dyn_state2.vertices_per_patch, false); + ctx->gfx_pipeline_state.uber_required = true; prog->base.removed = false; _mesa_hash_table_insert_pre_hashed(ht, hash, prog->shaders, prog); if (!prog->is_separable) { - zink_screen_get_pipeline_cache(screen, &prog->base, false); perf_debug(ctx, "zink[gfx_compile]: new program created (probably legacy GL features in use)\n"); - generate_gfx_program_modules_optimal(ctx, screen, prog, &ctx->gfx_pipeline_state); + prog->is_uber_program = true; + { + struct zink_gfx_pipeline_state state = {0}; + state.shader_keys_optimal.key.vs_base.last_vertex_stage = true; + state.shader_keys_optimal.key.tcs.patch_vertices = 3; //random guess, generated tcs precompile is hard + state.optimal_key = state.shader_keys_optimal.key.val; + 
generate_gfx_program_modules_optimal(NULL, screen, prog, &state, prog->is_uber_program); + zink_screen_get_pipeline_cache(screen, &prog->base, true); + if (!prog->base.uses_shobj) { + simple_mtx_lock(&prog->libs->lock); + zink_create_pipeline_lib(screen, prog, &state, prog->is_uber_program); + simple_mtx_unlock(&prog->libs->lock); + } + zink_screen_update_pipeline_cache(screen, &prog->base, true); + } + if (needs_emulation && !can_use_uber) { + ctx->curr_program_uber = prog; + async_variant_program_update(ctx, can_use_uber, needs_emulation); + } } } simple_mtx_unlock(&ctx->program_lock[zink_program_cache_stages(ctx->shader_stages)]); - if (prog && prog != ctx->curr_program) - zink_batch_reference_program(ctx, &prog->base); - ctx->curr_program = prog; - ctx->gfx_pipeline_state.final_hash ^= ctx->curr_program->last_variant_hash; + ctx->curr_program_uber = prog; + if (ctx->gfx_pipeline_state.uber_required) + ctx->curr_program = prog; + if (ctx->curr_program_uber && ctx->curr_program_uber != old_prog) + { + assert(!ctx->curr_program_uber->is_variant_program); + zink_batch_reference_program(ctx, &ctx->curr_program_uber->base); + } + ctx->gfx_pipeline_state.final_hash ^= CURR_KEY_PROGRAM(ctx)->last_variant_hash; + ctx->gfx_pipeline_state.final_hash ^= CURR_KEY_PROGRAM(ctx)->st_key; } else if (ctx->dirty_gfx_stages) { /* remove old hash */ ctx->gfx_pipeline_state.optimal_key = zink_sanitize_optimal_key(ctx->gfx_stages, ctx->gfx_pipeline_state.shader_keys_optimal.key.val); - ctx->gfx_pipeline_state.final_hash ^= ctx->curr_program->last_variant_hash; - - bool must_replace = ctx->curr_program->base.uses_shobj ? 
!zink_can_use_shader_objects(ctx) : (ctx->curr_program->is_separable && !zink_can_use_pipeline_libs(ctx)); - if (must_replace || (ctx->curr_program->is_separable && !ZINK_SHADER_KEY_OPTIMAL_IS_DEFAULT(ctx->gfx_pipeline_state.optimal_key))) { - struct zink_gfx_program *prog = ctx->curr_program; - - util_queue_fence_wait(&prog->base.cache_fence); - /* shader variants can't be handled by separable programs: sync and compile */ - perf_debug(ctx, "zink[gfx_compile]: non-default shader variant required with separate shader object program\n"); - struct hash_table *ht = &ctx->program_cache[zink_program_cache_stages(ctx->shader_stages)]; - const uint32_t hash = ctx->gfx_hash; - simple_mtx_lock(&ctx->program_lock[zink_program_cache_stages(ctx->shader_stages)]); - struct hash_entry *entry = _mesa_hash_table_search_pre_hashed(ht, hash, ctx->gfx_stages); - ctx->curr_program = replace_separable_prog(ctx, entry, prog); - simple_mtx_unlock(&ctx->program_lock[zink_program_cache_stages(ctx->shader_stages)]); + ctx->gfx_pipeline_state.final_hash ^= CURR_KEY_PROGRAM(ctx)->last_variant_hash; + ctx->gfx_pipeline_state.final_hash ^= CURR_KEY_PROGRAM(ctx)->st_key; + bool needs_emulation = needs_st_emulation(ctx) || (ctx->gfx_pipeline_state.optimal_key != ZINK_SHADER_KEY_OPTIMAL_DEFAULT); + bool can_use_uber = zink_can_use_uber(ctx); + if (ctx->curr_program->is_separable && !(zink_debug & ZINK_DEBUG_NOOPT)) { + struct zink_gfx_program *prog = ctx->curr_program_uber; + if (needs_emulation || ctx->curr_program_uber->is_separable) { + if (!can_use_uber) + util_queue_fence_wait(&prog->base.cache_fence); + perf_debug(ctx, "zink[gfx_compile]: non-default shader variant required with separate shader object program\n"); + if (util_queue_fence_is_signalled(&prog->base.cache_fence)) { + struct hash_table *ht = &ctx->program_cache[zink_program_cache_stages(ctx->shader_stages)]; + const uint32_t hash = ctx->gfx_hash; + simple_mtx_lock(&ctx->program_lock[zink_program_cache_stages(ctx->shader_stages)]); 
+ struct hash_entry *entry = _mesa_hash_table_search_pre_hashed(ht, hash, ctx->gfx_stages); + ctx->curr_program_uber = replace_separable_prog(ctx, entry, prog); + simple_mtx_unlock(&ctx->program_lock[zink_program_cache_stages(ctx->shader_stages)]); + } + } } - update_gfx_program_optimal(ctx, ctx->curr_program); + async_variant_program_update(ctx, can_use_uber, needs_emulation); /* apply new hash */ - ctx->gfx_pipeline_state.final_hash ^= ctx->curr_program->last_variant_hash; + ctx->gfx_pipeline_state.final_hash ^= CURR_KEY_PROGRAM(ctx)->last_variant_hash; + ctx->gfx_pipeline_state.final_hash ^= CURR_KEY_PROGRAM(ctx)->st_key; } ctx->dirty_gfx_stages = 0; ctx->gfx_dirty = false; @@ -898,8 +1148,8 @@ zink_mesh_program_update_optimal(struct zink_context *ctx) } } else if (must_replace) { /* this is a non-separable, incompatible prog which needs replacement */ - struct zink_gfx_program *real = zink_create_gfx_program(ctx, ctx->gfx_stages, 0, ctx->mesh_hash, true); - generate_gfx_program_modules_optimal(ctx, screen, real, &ctx->gfx_pipeline_state); + struct zink_gfx_program *real = zink_create_gfx_program(ctx, ctx->gfx_stages, 0, ctx->mesh_hash, true, false); + generate_gfx_program_modules_optimal(ctx, screen, real, &ctx->gfx_pipeline_state, false); entry->data = real; entry->key = real->shaders; real->base.removed = false; @@ -917,7 +1167,7 @@ zink_mesh_program_update_optimal(struct zink_context *ctx) if (!prog->is_separable) { zink_screen_get_pipeline_cache(screen, &prog->base, false); perf_debug(ctx, "zink[gfx_compile]: new program created (probably legacy GL features in use)\n"); - generate_gfx_program_modules_optimal(ctx, screen, prog, &ctx->gfx_pipeline_state); + generate_gfx_program_modules_optimal(ctx, screen, prog, &ctx->gfx_pipeline_state, false); } } simple_mtx_unlock(lock); @@ -960,8 +1210,10 @@ optimized_compile_job(void *data, void *gdata, int thread_index) VkPrimitiveTopology vkmode = is_mesh ? 
VK_PRIMITIVE_TOPOLOGY_MAX_ENUM : zink_primitive_topology(pc_entry->state.gfx_prim_mode); if (pc_entry->gpl.gkey) pipeline = zink_create_gfx_pipeline_combined(screen, pc_entry->prog, pc_entry->gpl.ikey ? pc_entry->gpl.ikey->pipeline : VK_NULL_HANDLE, &pc_entry->gpl.gkey->pipeline, 1, pc_entry->gpl.okey->pipeline, true, false); - else - pipeline = zink_create_gfx_pipeline(screen, pc_entry->prog, pc_entry->prog->objs, &pc_entry->state, pc_entry->state.element_state->binding_map, vkmode, true); + else { + struct zink_shader_object *objs = pc_entry->prog->objs; + pipeline = zink_create_gfx_pipeline(screen, pc_entry->prog, objs, &pc_entry->state, pc_entry->state.element_state->binding_map, vkmode, true); + } if (pipeline) { pc_entry->gpl.unoptimized_pipeline = pc_entry->pipeline; pc_entry->pipeline = pipeline; @@ -1009,10 +1261,12 @@ zink_program_finish(struct zink_context *ctx, struct zink_program *pg) if (pg->is_compute) return; struct zink_gfx_program *prog = (struct zink_gfx_program*)pg; - for (int i = 0; i < ARRAY_SIZE(prog->pipelines); ++i) { - hash_table_foreach(&prog->pipelines[i], entry) { - struct zink_gfx_pipeline_cache_entry *pc_entry = entry->data; - util_queue_fence_wait(&pc_entry->fence); + for (int r = 0; r < ARRAY_SIZE(prog->pipelines); ++r) { + for (int i = 0; i < ARRAY_SIZE(prog->pipelines[0]); ++i) { + hash_table_foreach(&prog->pipelines[r][i], entry) { + struct zink_gfx_pipeline_cache_entry *pc_entry = entry->data; + util_queue_fence_wait(&pc_entry->fence); + } } } } @@ -1073,7 +1327,7 @@ update_cs_shader_module(struct zink_context *ctx, struct zink_compute_program *c return; } zm->shobj = false; - zm->obj = zink_shader_compile(screen, false, zs, zink_shader_blob_deserialize(screen, &comp->shader->blob), key, zs_swizzle_size ? &ctx->di.zs_swizzle[MESA_SHADER_COMPUTE] : NULL, &comp->base); + zm->obj = zink_shader_compile(screen, false, zs, zink_shader_blob_deserialize(screen, &comp->shader->blob), key, NULL, false, zs_swizzle_size ? 
&ctx->di.zs_swizzle[MESA_SHADER_COMPUTE] : NULL, &comp->base); if (!zm->obj.spirv) { FREE(zm); return; @@ -1198,7 +1452,11 @@ zink_gfx_lib_cache_unref(struct zink_screen *screen, struct zink_gfx_lib_cache * { if (!p_atomic_dec_zero(&libs->refcount)) return; - + if (libs->lib) { + struct zink_gfx_library_key *gkey = libs->lib; + VKSCR(DestroyPipeline)(screen->dev, gkey->pipeline, NULL); + FREE(gkey); + } simple_mtx_destroy(&libs->lock); set_foreach_remove(&libs->libs, he) { struct zink_gfx_library_key *gkey = (void*)he->key; @@ -1217,10 +1475,6 @@ create_lib_cache(struct zink_gfx_program *prog, bool generated_tcs) if (generated_tcs) libs->stages_present &= ~BITFIELD_BIT(MESA_SHADER_TESS_CTRL); simple_mtx_init(&libs->lock, mtx_plain); - if (generated_tcs) - _mesa_set_init(&libs->libs, NULL, hash_pipeline_lib_generated_tcs, equals_pipeline_lib_generated_tcs); - else - _mesa_set_init(&libs->libs, NULL, hash_pipeline_lib, equals_pipeline_lib); return libs; } @@ -1229,6 +1483,8 @@ find_or_create_lib_cache(struct zink_screen *screen, struct zink_gfx_program *pr { unsigned stages_present = prog->stages_present; bool generated_tcs = prog->shaders[MESA_SHADER_TESS_CTRL] && prog->shaders[MESA_SHADER_TESS_CTRL]->non_fs.is_generated; + if (prog->is_variant_program) + return create_lib_cache(prog, generated_tcs); if (generated_tcs) stages_present &= ~BITFIELD_BIT(MESA_SHADER_TESS_CTRL); unsigned idx = zink_program_cache_stages(stages_present); @@ -1307,7 +1563,7 @@ gfx_program_create(struct zink_context *ctx, struct zink_shader **stages, unsigned vertices_per_patch, uint32_t gfx_hash, - bool is_mesh) + bool is_mesh, bool variant) { struct zink_screen *screen = zink_screen(ctx->base.screen); struct zink_gfx_program *prog = create_program(ctx, false); @@ -1317,6 +1573,7 @@ gfx_program_create(struct zink_context *ctx, prog->gfx_hash = gfx_hash; prog->base.removed = true; prog->optimal_keys = screen->optimal_keys; + prog->is_variant_program = variant; for (int i = is_mesh ? 
MESA_SHADER_FRAGMENT : 0; i < (is_mesh ? MESA_SHADER_MESH_STAGES : MESA_SHADER_STAGES); ++i) { util_dynarray_init(&prog->shader_cache[i][0][0], prog->base.ralloc_ctx); @@ -1331,6 +1588,7 @@ gfx_program_create(struct zink_context *ctx, prog->needs_inlining |= prog->shaders[i]->needs_inlining; } } + util_dynarray_init(&prog->uber_modules, prog->base.ralloc_ctx); if (stages[MESA_SHADER_TESS_EVAL] && !stages[MESA_SHADER_TESS_CTRL]) { util_queue_fence_wait(&stages[MESA_SHADER_TESS_EVAL]->precompile.fence); if (!prog->shaders[MESA_SHADER_TESS_EVAL]->non_fs.generated_tcs) @@ -1340,13 +1598,17 @@ gfx_program_create(struct zink_context *ctx, } prog->stages_remaining = prog->stages_present; for (int i = 0; i < MESA_SHADER_MESH_STAGES; ++i) { - if (prog->shaders[i]) { + if (prog->shaders[i] && !variant) { simple_mtx_lock(&prog->shaders[i]->lock); _mesa_set_add(prog->shaders[i]->programs, prog); simple_mtx_unlock(&prog->shaders[i]->lock); zink_gfx_program_reference(screen, NULL, prog); } } + + if (variant) + zink_gfx_program_reference(screen, NULL, prog); + p_atomic_dec(&prog->base.reference.count); if (is_mesh) @@ -1360,8 +1622,12 @@ gfx_program_create(struct zink_context *ctx, prog->has_edgeflags = prog->shaders[MESA_SHADER_VERTEX] && prog->shaders[MESA_SHADER_VERTEX]->has_edgeflags; - for (int i = 0; i < ARRAY_SIZE(prog->pipelines); ++i) { - _mesa_hash_table_init(&prog->pipelines[i], prog->base.ralloc_ctx, NULL, zink_get_gfx_pipeline_eq_func(screen, prog)); + _mesa_set_init(&prog->variants, prog->base.ralloc_ctx, hash_gfx_program, equals_program_variant); + + for (int r = 0; r < ARRAY_SIZE(prog->pipelines); ++r) { + for (int i = 0; i < ARRAY_SIZE(prog->pipelines[0]); ++i) { + _mesa_hash_table_init(&prog->pipelines[r][i], prog->base.ralloc_ctx, NULL, zink_get_gfx_pipeline_eq_func(screen, prog)); + } } return prog; @@ -1436,9 +1702,9 @@ zink_create_gfx_program(struct zink_context *ctx, struct zink_shader **stages, unsigned vertices_per_patch, uint32_t gfx_hash, - bool 
is_mesh) + bool is_mesh, bool variant) { - struct zink_gfx_program *prog = gfx_program_create(ctx, stages, vertices_per_patch, gfx_hash, is_mesh); + struct zink_gfx_program *prog = gfx_program_create(ctx, stages, vertices_per_patch, gfx_hash, is_mesh, variant); if (prog) prog = gfx_program_init(ctx, prog); return prog; @@ -1454,7 +1720,8 @@ create_linked_separable_job(void *data, void *gdata, int thread_index) /* this is a dead program */ if (prog->base.removed) return; - prog->full_prog = gfx_program_create(prog->base.ctx, prog->shaders, 0, prog->gfx_hash, !!prog->shaders[MESA_SHADER_MESH]); + prog->full_prog = gfx_program_create(prog->base.ctx, prog->shaders, 0, prog->gfx_hash, !!prog->shaders[MESA_SHADER_MESH], false); + prog->full_prog->is_uber_program = prog->is_uber_program; /* block gfx_shader_prune in the main thread */ util_queue_fence_reset(&prog->full_prog->base.cache_fence); /* add an ownership ref */ @@ -1479,15 +1746,16 @@ create_gfx_program_separable(struct zink_context *ctx, struct zink_shader **stag uint32_t hash = is_mesh ? ctx->mesh_hash : ctx->gfx_hash; if (!is_separate || /* TODO: maybe try variants? 
grimace */
+       /* TODO allow if uber is usable */
       !is_default || !can_gpl)
-      return zink_create_gfx_program(ctx, stages, vertices_per_patch, hash, is_mesh);
+      return zink_create_gfx_program(ctx, stages, vertices_per_patch, hash, is_mesh, false);
 
    for (unsigned i = 0; i < MESA_SHADER_MESH_STAGES; i++) {
       /* ensure async shader creation is done */
       if (stages[i]) {
         util_queue_fence_wait(&stages[i]->precompile.fence);
-         if (!stages[i]->precompile.obj.mod)
-            return zink_create_gfx_program(ctx, stages, vertices_per_patch, hash, is_mesh);
+         if (!stages[i]->precompile.obj.mod && !stages[i]->precompile.obj.obj)
+            return zink_create_gfx_program(ctx, stages, vertices_per_patch, hash, is_mesh, false);
       }
    }
 
@@ -1496,6 +1764,7 @@ create_gfx_program_separable(struct zink_context *ctx, struct zink_shader **stag
       goto fail;
 
    prog->is_separable = true;
+   prog->is_uber_program = true;
    prog->gfx_hash = hash;
    prog->base.uses_shobj = screen->info.have_EXT_shader_object &&
                            ((stages[MESA_SHADER_VERTEX] && !stages[MESA_SHADER_VERTEX]->info.view_mask) ||
@@ -1535,8 +1804,12 @@ create_gfx_program_separable(struct zink_context *ctx, struct zink_shader **stag
     */
    p_atomic_add(&prog->base.reference.count, refs - 1);
 
-   for (int i = 0; i < ARRAY_SIZE(prog->pipelines); ++i) {
-      _mesa_hash_table_init(&prog->pipelines[i], prog->base.ralloc_ctx, NULL, zink_get_gfx_pipeline_eq_func(screen, prog));
+   _mesa_set_init(&prog->variants, prog->base.ralloc_ctx, hash_gfx_program, equals_program_variant);
+
+   for (int r = 0; r < ARRAY_SIZE(prog->pipelines); ++r) {
+      for (int i = 0; i < ARRAY_SIZE(prog->pipelines[0]); ++i) {
+         _mesa_hash_table_init(&prog->pipelines[r][i], prog->base.ralloc_ctx, NULL, zink_get_gfx_pipeline_eq_func(screen, prog));
+      }
    }
 
    for (int i = 0; i < MESA_SHADER_MESH_STAGES; ++i) {
@@ -1557,18 +1830,25 @@ create_gfx_program_separable(struct zink_context *ctx, struct zink_shader **stag
    prog->base.layout = zink_pipeline_layout_create(screen, prog->base.dsl, prog->base.num_dsl, false,
VK_PIPELINE_LAYOUT_CREATE_INDEPENDENT_SETS_BIT_EXT); prog->last_variant_hash = is_mesh ? ctx->gfx_pipeline_state.mesh_optimal_key : ctx->gfx_pipeline_state.optimal_key; + prog->st_key = ctx->gfx_pipeline_state.st_key.small_key.val; if (!prog->base.uses_shobj) { - VkPipeline libs[] = {stages[MESA_SHADER_VERTEX]->precompile.gpl, stages[MESA_SHADER_FRAGMENT]->precompile.gpl}; - struct zink_gfx_library_key *gkey = CALLOC_STRUCT(zink_gfx_library_key); - if (!gkey) { - mesa_loge("ZINK: failed to allocate gkey!"); - goto fail; + if (!is_mesh) { + VkPipeline uber_libs[] = {stages[MESA_SHADER_VERTEX]->precompile.gpl, stages[MESA_SHADER_FRAGMENT]->precompile.gpl}; + prog->libs->lib = CALLOC_STRUCT(zink_gfx_library_key); + prog->libs->lib->pipeline = zink_create_gfx_pipeline_combined(screen, prog, VK_NULL_HANDLE, uber_libs, 2, VK_NULL_HANDLE, false, false); + } else { + VkPipeline libs[] = {stages[MESA_SHADER_VERTEX]->precompile.gpl, stages[MESA_SHADER_FRAGMENT]->precompile.gpl}; + struct zink_gfx_library_key *gkey = CALLOC_STRUCT(zink_gfx_library_key); + if (!gkey) { + mesa_loge("ZINK: failed to allocate gkey!"); + goto fail; + } + gkey->optimal_key = prog->last_variant_hash; + assert(gkey->optimal_key); + gkey->pipeline = zink_create_gfx_pipeline_combined(screen, prog, VK_NULL_HANDLE, libs, 2, VK_NULL_HANDLE, false, false); + _mesa_set_add(&prog->libs->libs, gkey); } - gkey->optimal_key = prog->last_variant_hash; - assert(gkey->optimal_key); - gkey->pipeline = zink_create_gfx_pipeline_combined(screen, prog, VK_NULL_HANDLE, libs, 2, VK_NULL_HANDLE, false, false); - _mesa_set_add(&prog->libs->libs, gkey); } if (!(zink_debug & ZINK_DEBUG_NOOPT)) @@ -1722,7 +2002,7 @@ precompile_compute_job(void *data, void *gdata, int thread_index) comp->curr = comp->module = CALLOC_STRUCT(zink_shader_module); assert(comp->module); comp->module->shobj = false; - comp->module->obj = zink_shader_compile(screen, false, comp->shader, comp->nir, NULL, NULL, &comp->base); + comp->module->obj = 
zink_shader_compile(screen, false, comp->shader, comp->nir, NULL, NULL, false, NULL, &comp->base); /* comp->nir will be freed by zink_shader_compile */ comp->nir = NULL; assert(comp->module->obj.spirv); @@ -1869,21 +2149,41 @@ zink_destroy_gfx_program(struct zink_screen *screen, { if (prog->is_separable) zink_gfx_program_reference(screen, &prog->full_prog, NULL); - for (int i = 0; i < ARRAY_SIZE(prog->pipelines); ++i) { - hash_table_foreach(&prog->pipelines[i], entry) { - struct zink_gfx_pipeline_cache_entry *pc_entry = entry->data; + for (int r = 0; r < ARRAY_SIZE(prog->pipelines); ++r) { + for (int i = 0; i < ARRAY_SIZE(prog->pipelines[0]); ++i) { + hash_table_foreach(&prog->pipelines[r][i], entry) { + struct zink_gfx_pipeline_cache_entry *pc_entry = entry->data; - util_queue_fence_wait(&pc_entry->fence); - VKSCR(DestroyPipeline)(screen->dev, pc_entry->pipeline, NULL); - VKSCR(DestroyPipeline)(screen->dev, pc_entry->gpl.unoptimized_pipeline, NULL); - free(pc_entry); + util_queue_fence_wait(&pc_entry->fence); + VKSCR(DestroyPipeline)(screen->dev, pc_entry->pipeline, NULL); + VKSCR(DestroyPipeline)(screen->dev, pc_entry->gpl.unoptimized_pipeline, NULL); + free(pc_entry); + } } } + /* wait for all async compilation jobs */ + for (unsigned stage = 0; stage < ZINK_GFX_SHADER_COUNT; stage++) { + struct util_dynarray *shader_cache = &prog->shader_cache[stage][0][0]; + unsigned count = util_dynarray_num_elements(shader_cache, struct zink_shader_module *); + struct zink_shader_module **pzm = shader_cache->data; + for (unsigned i = 0; i < count; i++) { + struct zink_shader_module *iter = pzm[i]; + util_queue_fence_wait(&iter->fence); + } + } + + set_foreach(&prog->variants, entry) { + struct program_variant_key *prog_variant_key = (void*)entry->key; + assert(prog_variant_key->prog->is_variant_program); + zink_destroy_gfx_program(screen, prog_variant_key->prog); + FREE(prog_variant_key); + } + deinit_program(screen, &prog->base); for (int i = 0; i < MESA_SHADER_MESH_STAGES; 
++i) { - if (prog->shaders[i]) { + if (prog->shaders[i] && !prog->is_variant_program) { _mesa_set_remove_key(prog->shaders[i]->programs, prog); prog->shaders[i] = NULL; } @@ -1895,6 +2195,10 @@ zink_destroy_gfx_program(struct zink_screen *screen, blob_finish(&prog->blobs[i]); } } + while (util_dynarray_contains(&prog->uber_modules, void*)) { + struct zink_shader_module *zm = util_dynarray_pop(&prog->uber_modules, struct zink_shader_module*); + zink_destroy_shader_module(screen, zm); + } if (prog->libs) zink_gfx_lib_cache_unref(screen, prog->libs); @@ -2046,8 +2350,11 @@ bind_gfx_stage(struct zink_context *ctx, mesa_shader_stage stage, struct zink_sh zink_descriptors_init_bindless(ctx); } else { if (stage < MESA_SHADER_COMPUTE) { - if (ctx->curr_program) - ctx->gfx_pipeline_state.final_hash ^= ctx->curr_program->last_variant_hash; + if (ctx->curr_program_uber) { + ctx->gfx_pipeline_state.final_hash ^= CURR_KEY_PROGRAM(ctx)->last_variant_hash; + ctx->gfx_pipeline_state.final_hash ^= ctx->curr_program_uber->st_key; + } + ctx->curr_program_uber = NULL; ctx->curr_program = NULL; } if (stage == MESA_SHADER_FRAGMENT || stage > MESA_SHADER_COMPUTE) { @@ -2391,7 +2698,7 @@ zink_delete_cs_shader_state(struct pipe_context *pctx, void *cso) /* caller must lock prog->libs->lock */ struct zink_gfx_library_key * -zink_create_pipeline_lib(struct zink_screen *screen, struct zink_gfx_program *prog, struct zink_gfx_pipeline_state *state) +zink_create_pipeline_lib(struct zink_screen *screen, struct zink_gfx_program *prog, struct zink_gfx_pipeline_state *state, bool compile_uber) { struct zink_gfx_library_key *gkey = CALLOC_STRUCT(zink_gfx_library_key); bool is_mesh = !prog->shaders[MESA_SHADER_VERTEX]; @@ -2401,11 +2708,15 @@ zink_create_pipeline_lib(struct zink_screen *screen, struct zink_gfx_program *pr } gkey->optimal_key = !is_mesh ? 
state->optimal_key : state->mesh_optimal_key; + gkey->st_key = state->st_key.small_key.val; assert(is_mesh || gkey->optimal_key); for (unsigned i = 0; i < MESA_SHADER_MESH_STAGES; i++) gkey->modules[i] = prog->objs[i].mod; - gkey->pipeline = zink_create_gfx_pipeline_library(screen, prog); - _mesa_set_add(&prog->libs->libs, gkey); + gkey->pipeline = zink_create_gfx_pipeline_library(screen, prog->objs, prog); + if (is_mesh) + _mesa_set_add(&prog->libs->libs, gkey); + else + prog->libs->lib = gkey; return gkey; } @@ -2433,6 +2744,26 @@ print_exe_stages(VkShaderStageFlags stages) UNREACHABLE("unhandled combination of stages!"); } +static void +precompile_variant_job(void *data, void *gdata, int thread_index) +{ + struct zink_screen *screen = gdata; + struct precompile_variant_data *precompile_data = data; + struct zink_gfx_program *prog = precompile_data->prog; + struct zink_gfx_pipeline_state *state = &precompile_data->state; + + //generate_gfx_program_modules_optimal(NULL, screen, prog, state); + zink_screen_get_pipeline_cache(screen, &prog->base, true); + if (!screen->info.have_EXT_shader_object) { + simple_mtx_lock(&prog->libs->lock); + zink_create_pipeline_lib(screen, prog, state, false); + simple_mtx_unlock(&prog->libs->lock); + } + zink_screen_update_pipeline_cache(screen, &prog->base, true); + + FREE(data); +} + static void gfx_program_precompile_job(void *data, void *gdata, int thread_index) { @@ -2446,11 +2777,11 @@ gfx_program_precompile_job(void *data, void *gdata, int thread_index) state.shader_keys_optimal.key.vs_base.last_vertex_stage = true; state.shader_keys_optimal.key.tcs.patch_vertices = 3; //random guess, generated tcs precompile is hard state.optimal_key = state.shader_keys_optimal.key.val; - generate_gfx_program_modules_optimal(NULL, screen, prog, &state); + generate_gfx_program_modules_optimal(NULL, screen, prog, &state, prog->is_uber_program); zink_screen_get_pipeline_cache(screen, &prog->base, true); if (!prog->base.uses_shobj) { 
simple_mtx_lock(&prog->libs->lock); - zink_create_pipeline_lib(screen, prog, &state); + zink_create_pipeline_lib(screen, prog, &state, prog->is_uber_program); simple_mtx_unlock(&prog->libs->lock); } prog->base.precompile_done = true; @@ -2494,17 +2825,18 @@ zink_link_gfx_shader(struct pipe_context *pctx, void **shaders) simple_mtx_unlock(lock); return; } - struct zink_gfx_program *prog = gfx_program_create(ctx, zshaders, 3, hash, is_mesh); + struct zink_gfx_program *prog = gfx_program_create(ctx, zshaders, 3, hash, is_mesh, false); u_foreach_bit(i, shader_stages) assert(prog->shaders[i]); _mesa_hash_table_insert_pre_hashed(ht, hash, prog->shaders, prog); prog->base.removed = false; + prog->is_uber_program = true; simple_mtx_unlock(lock); if (zink_debug & ZINK_DEBUG_SHADERDB) { struct zink_screen *screen = zink_screen(pctx->screen); gfx_program_init(ctx, prog); if (screen->optimal_keys) - generate_gfx_program_modules_optimal(ctx, screen, prog, &ctx->gfx_pipeline_state); + generate_gfx_program_modules_optimal(ctx, screen, prog, &ctx->gfx_pipeline_state, false); else generate_gfx_program_modules(ctx, screen, prog, &ctx->gfx_pipeline_state); VkPipeline pipeline = zink_create_gfx_pipeline(screen, prog, prog->objs, &ctx->gfx_pipeline_state, @@ -2535,7 +2867,7 @@ zink_delete_shader_state(struct pipe_context *pctx, void *cso) static void precompile_separate_shader(struct zink_shader *zs, struct zink_screen *screen) { - zs->precompile.obj = zink_shader_compile_separate(screen, zs); + zs->precompile.obj = zink_shader_compile_separate(screen, zs, zs->is_uber); if (!screen->info.have_EXT_shader_object) { struct zink_shader_object objs[MESA_SHADER_MESH_STAGES] = {0}; objs[zs->info.stage].mod = zs->precompile.obj.mod; @@ -2543,6 +2875,20 @@ precompile_separate_shader(struct zink_shader *zs, struct zink_screen *screen) } } +static void +precompile_variant_separate_shader_job(void *data, void *gdata, int thread_index) +{ + struct zink_screen *screen = gdata; + struct 
precompile_separate_variant_data *precompile_data = data; + + nir_shader *nir = zink_shader_blob_deserialize(screen, precompile_data->blob); + precompile_data->zm->obj = zink_shader_compile(screen, precompile_data->uses_shobj, precompile_data->zs, nir, + precompile_data->has_key ? &precompile_data->key: NULL, + &precompile_data->st_key, + false, NULL, precompile_data->prog); + FREE(data); +} + static void gfx_shader_init_job(void *data, void *gdata, int thread_index) { @@ -2581,6 +2927,7 @@ zink_create_gfx_shader_state(struct pipe_context *pctx, const struct pipe_shader zink_descriptor_util_init_fbfetch(zink_context(pctx)); struct zink_shader *zs = zink_shader_create(zink_screen(pctx->screen), nir); + zs->is_uber = true; if (zink_debug & ZINK_DEBUG_NOBGC) gfx_shader_init_job(zs, screen, 0); else @@ -2593,6 +2940,8 @@ static void zink_delete_cached_shader_state(struct pipe_context *pctx, void *cso) { struct zink_screen *screen = zink_screen(pctx->screen); + // HACK this is oversyncing but we have no way of knowing which jobs use this zink_shader + util_queue_finish(&screen->cache_get_thread); util_shader_reference(pctx, &screen->shaders, &cso, NULL); } diff --git a/src/gallium/drivers/zink/zink_program.h b/src/gallium/drivers/zink/zink_program.h index 3d5ffc170c9..6e2b0a202a6 100644 --- a/src/gallium/drivers/zink/zink_program.h +++ b/src/gallium/drivers/zink/zink_program.h @@ -128,7 +128,7 @@ zink_mesh_program_update_optimal(struct zink_context *ctx); struct zink_gfx_library_key * -zink_create_pipeline_lib(struct zink_screen *screen, struct zink_gfx_program *prog, struct zink_gfx_pipeline_state *state); +zink_create_pipeline_lib(struct zink_screen *screen, struct zink_gfx_program *prog, struct zink_gfx_pipeline_state *state, bool compile_uber); uint32_t hash_gfx_output(const void *key); uint32_t hash_gfx_output_ds3(const void *key); uint32_t hash_gfx_input(const void *key); @@ -159,7 +159,7 @@ zink_create_gfx_program(struct zink_context *ctx, struct zink_shader
**stages, unsigned vertices_per_patch, uint32_t gfx_hash, - bool is_mesh); + bool is_mesh, bool variant); void zink_destroy_gfx_program(struct zink_screen *screen, @@ -405,6 +405,27 @@ zink_set_zs_needs_shader_swizzle_key(struct zink_context *ctx, mesa_shader_stage zink_set_shader_key_base(ctx, pstage)->needs_zs_shader_swizzle = enable; } +static inline const union zink_st_small_key * +zink_get_st_small_key(struct zink_context *ctx) +{ + assert(zink_screen(ctx->base.screen)->optimal_keys); + return &ctx->gfx_pipeline_state.st_key.small_key; +} + +static inline union zink_st_small_key * +zink_set_st_small_key(struct zink_context *ctx) +{ + ctx->dirty_gfx_stages |= ctx->shader_stages & (MESA_SHADER_VERTEX | MESA_SHADER_GEOMETRY | MESA_SHADER_FRAGMENT); + assert(zink_screen(ctx->base.screen)->optimal_keys); + return &ctx->gfx_pipeline_state.st_key.small_key; +} + +static inline bool +needs_st_emulation(struct zink_context *ctx) +{ + return ctx->gfx_pipeline_state.st_key.small_key.val != 0; +} + ALWAYS_INLINE static bool zink_can_use_pipeline_libs(const struct zink_context *ctx) { @@ -464,6 +485,14 @@ zink_can_use_shader_objects_mesh(const struct zink_context *ctx) !ctx->fb_state.viewmask; } +ALWAYS_INLINE static bool +zink_can_use_uber(struct zink_context *ctx) +{ + bool generated_tcs = ctx->gfx_stages[MESA_SHADER_TESS_EVAL] && !ctx->gfx_stages[MESA_SHADER_TESS_CTRL]; + return zink_shader_key_optimal_no_tcs(ctx->gfx_pipeline_state.optimal_key) == ZINK_SHADER_KEY_OPTIMAL_DEFAULT && + zink_can_use_pipeline_libs(ctx) && (!generated_tcs || ctx->gfx_pipeline_state.dyn_state2.vertices_per_patch == 3); +} + bool zink_set_rasterizer_discard(struct zink_context *ctx, bool disable); void diff --git a/src/gallium/drivers/zink/zink_program_state.hpp b/src/gallium/drivers/zink/zink_program_state.hpp index a175072e45b..3f38271dab2 100644 --- a/src/gallium/drivers/zink/zink_program_state.hpp +++ b/src/gallium/drivers/zink/zink_program_state.hpp @@ -102,6 +102,7 @@ template 
VkPipeline zink_get_gfx_pipeline(struct zink_context *ctx, struct zink_gfx_program *prog, + struct zink_gfx_program *variant_prog, struct zink_gfx_pipeline_state *state, enum mesa_prim mode) { @@ -113,7 +114,7 @@ zink_get_gfx_pipeline(struct zink_context *ctx, const unsigned idx = IS_MESH || screen->info.dynamic_state3_props.dynamicPrimitiveTopologyUnrestricted ? 0 : get_pipeline_idx= ZINK_DYNAMIC_STATE>(mode, vkmode); - assert(idx <= ARRAY_SIZE(prog->pipelines)); + assert(idx <= ARRAY_SIZE(prog->pipelines[0])); if (IS_MESH) { if (!state->mesh_dirty && !state->mesh_modules_changed) return state->mesh_pipeline; @@ -144,27 +145,28 @@ zink_get_gfx_pipeline(struct zink_context *ctx, } } /* extra safety asserts for optimal path to catch refactoring bugs */ - if (prog->optimal_keys) { + if (variant_prog->optimal_keys) { ASSERTED const union zink_shader_key_optimal *opt = (union zink_shader_key_optimal*)&prog->last_variant_hash; ASSERTED union zink_shader_key_optimal sanitized = {}; if (IS_MESH) { sanitized.val = zink_sanitize_optimal_key_mesh(ctx->gfx_stages, ctx->gfx_pipeline_state.shader_keys_optimal.key.val); + assert(opt->val == sanitized.val); assert(state->mesh_optimal_key == sanitized.val); - } else { + } else if (!state->uber_required) { sanitized.val = zink_sanitize_optimal_key(ctx->gfx_stages, ctx->gfx_pipeline_state.shader_keys_optimal.key.val); + assert(opt->val == sanitized.val); assert(state->optimal_key == sanitized.val); } - assert(opt->val == sanitized.val); } if (IS_MESH) { state->mesh_modules_changed = false; - if (prog->last_finalized_hash[idx] == state->mesh_final_hash && - !prog->inline_variants && likely(prog->last_pipeline[idx]) && + if (prog->last_finalized_hash[0][idx] == state->mesh_final_hash && + !prog->inline_variants && likely(prog->last_pipeline[0][idx]) && /* this data is too big to compare in the fast-path */ likely(!prog->shaders[MESA_SHADER_FRAGMENT]->fs.legacy_shadow_mask)) { - state->mesh_pipeline = 
prog->last_pipeline[idx]->pipeline; + state->mesh_pipeline = prog->last_pipeline[0][idx]->pipeline; return state->mesh_pipeline; } } else { @@ -202,23 +204,22 @@ zink_get_gfx_pipeline(struct zink_context *ctx, /* shortcut for reusing previous pipeline across program changes */ if (DYNAMIC_STATE == ZINK_DYNAMIC_VERTEX_INPUT || DYNAMIC_STATE == ZINK_DYNAMIC_VERTEX_INPUT2) { - if (prog->last_finalized_hash[idx] == state->final_hash && - !prog->inline_variants && likely(prog->last_pipeline[idx]) && - /* this data is too big to compare in the fast-path */ - likely(!prog->shaders[MESA_SHADER_FRAGMENT]->fs.legacy_shadow_mask)) { - state->pipeline = prog->last_pipeline[idx]->pipeline; + if (variant_prog->last_finalized_hash[state->uber_required][idx] == state->final_hash && + !variant_prog->inline_variants && likely(variant_prog->last_pipeline[state->uber_required][idx]) && + /* this data is too big to compare in the fast-path */ + likely(!variant_prog->shaders[MESA_SHADER_FRAGMENT]->fs.legacy_shadow_mask)) { + state->pipeline = variant_prog->last_pipeline[state->uber_required][idx]->pipeline; return state->pipeline; } } } - unsigned final_hash = IS_MESH ? state->mesh_final_hash : state->final_hash; - entry = _mesa_hash_table_search_pre_hashed(&prog->pipelines[idx], final_hash, state); + entry = _mesa_hash_table_search_pre_hashed(&variant_prog->pipelines[state->uber_required][idx], final_hash, state); if (!entry) { bool can_gpl = IS_MESH ? 
zink_can_use_pipeline_libs_mesh(ctx) : zink_can_use_pipeline_libs(ctx); /* always wait on async precompile/cache fence */ - util_queue_fence_wait(&prog->base.cache_fence); + util_queue_fence_wait(&variant_prog->base.cache_fence); struct zink_gfx_pipeline_cache_entry *pc_entry = CALLOC_STRUCT(zink_gfx_pipeline_cache_entry); if (!pc_entry) return VK_NULL_HANDLE; @@ -227,28 +228,47 @@ zink_get_gfx_pipeline(struct zink_context *ctx, */ memcpy(&pc_entry->state, state, sizeof(*state)); pc_entry->state.rendering_info.pColorAttachmentFormats = pc_entry->state.rendering_formats; - pc_entry->prog = prog; + pc_entry->prog = state->uber_required ? prog : variant_prog; /* init the optimized background compile fence */ util_queue_fence_init(&pc_entry->fence); - entry = _mesa_hash_table_insert_pre_hashed(&prog->pipelines[idx], final_hash, pc_entry, pc_entry); - if (prog->base.uses_shobj && !prog->is_separable) { - memcpy(pc_entry->shobjs, prog->objs, sizeof(prog->objs)); + entry = _mesa_hash_table_insert_pre_hashed(&variant_prog->pipelines[state->uber_required][idx], final_hash, pc_entry, pc_entry); + if (variant_prog->base.uses_shobj && !variant_prog->is_separable) { + memcpy(pc_entry->shobjs, variant_prog->objs, sizeof(variant_prog->objs)); zink_gfx_program_compile_queue(ctx, pc_entry); } else if (HAVE_LIB && can_gpl) { uint32_t optimal_key = IS_MESH ? 
ctx->gfx_pipeline_state.mesh_optimal_key : ctx->gfx_pipeline_state.optimal_key; /* this is the graphics pipeline library path: find/construct all partial pipelines */ - simple_mtx_lock(&prog->libs->lock); - struct set_entry *he = _mesa_set_search(&prog->libs->libs, &optimal_key); struct zink_gfx_library_key *gkey; - if (he) { - gkey = (struct zink_gfx_library_key *)he->key; + if (IS_MESH) { + simple_mtx_lock(&prog->libs->lock); + struct set_entry *he = _mesa_set_search(&prog->libs->libs, &optimal_key); + if (he) { + gkey = (struct zink_gfx_library_key *)he->key; + } else { + assert(!prog->is_separable); + gkey = zink_create_pipeline_lib(screen, prog, &ctx->gfx_pipeline_state, false); + } + simple_mtx_unlock(&prog->libs->lock); } else { - assert(!prog->is_separable); - gkey = zink_create_pipeline_lib(screen, prog, &ctx->gfx_pipeline_state); + if (state->uber_required) { + simple_mtx_lock(&prog->libs->lock); + assert(prog->libs->lib); + gkey = prog->libs->lib; + simple_mtx_unlock(&prog->libs->lock); + } else { + simple_mtx_lock(&variant_prog->libs->lock); + if (variant_prog->libs->lib) { + gkey = variant_prog->libs->lib; + assert(gkey->optimal_key == optimal_key); + assert(gkey->st_key == state->st_key.small_key.val); + } else { + assert(!variant_prog->is_separable); + gkey = zink_create_pipeline_lib(screen, variant_prog, &ctx->gfx_pipeline_state, false); + } + simple_mtx_unlock(&variant_prog->libs->lock); + } } - simple_mtx_unlock(&prog->libs->lock); - struct zink_gfx_input_key *ikey = IS_MESH ? NULL : - DYNAMIC_STATE == ZINK_DYNAMIC_VERTEX_INPUT ? + struct zink_gfx_input_key *ikey = DYNAMIC_STATE == ZINK_DYNAMIC_VERTEX_INPUT ? zink_find_or_create_input_dynamic(ctx, vkmode) : zink_find_or_create_input(ctx, vkmode); struct zink_gfx_output_key *okey = DYNAMIC_STATE >= ZINK_DYNAMIC_STATE3 && screen->have_full_ds3 ? 
@@ -259,29 +279,30 @@ zink_get_gfx_pipeline(struct zink_context *ctx, pc_entry->gpl.gkey = gkey; pc_entry->gpl.okey = okey; /* try to hit optimized compile cache first if possible */ - if (!prog->is_separable) - pc_entry->pipeline = zink_create_gfx_pipeline_combined(screen, prog, ikey ? ikey->pipeline : VK_NULL_HANDLE, &gkey->pipeline, 1, okey->pipeline, true, true); + if (!variant_prog->is_separable) + pc_entry->pipeline = zink_create_gfx_pipeline_combined(screen, variant_prog, ikey ? ikey->pipeline : VK_NULL_HANDLE, &gkey->pipeline, 1, okey->pipeline, true, true); if (!pc_entry->pipeline) { /* create the non-optimized pipeline first using fast-linking to avoid stuttering */ - pc_entry->pipeline = zink_create_gfx_pipeline_combined(screen, prog, ikey ? ikey->pipeline : VK_NULL_HANDLE, &gkey->pipeline, 1, okey->pipeline, false, false); - if (!prog->is_separable) + pc_entry->pipeline = zink_create_gfx_pipeline_combined(screen, variant_prog, ikey ? ikey->pipeline : VK_NULL_HANDLE, &gkey->pipeline, 1, okey->pipeline, false, false); + if (!variant_prog->is_separable) /* trigger async optimized pipeline compile if this was the fast-linked unoptimized pipeline */ zink_gfx_program_compile_queue(ctx, pc_entry); } } else { + struct zink_shader_object *objs = state->uber_required ? 
prog->objs : variant_prog->objs; /* optimize by default only when expecting precompiles in order to reduce stuttering */ if (DYNAMIC_STATE != ZINK_DYNAMIC_VERTEX_INPUT2 && DYNAMIC_STATE != ZINK_DYNAMIC_VERTEX_INPUT && !IS_MESH) - pc_entry->pipeline = zink_create_gfx_pipeline(screen, prog, prog->objs, state, state->element_state->binding_map, vkmode, !HAVE_LIB); + pc_entry->pipeline = zink_create_gfx_pipeline(screen, variant_prog, objs, state, state->element_state->binding_map, vkmode, !HAVE_LIB); else - pc_entry->pipeline = zink_create_gfx_pipeline(screen, prog, prog->objs, state, NULL, vkmode, !HAVE_LIB); - if (HAVE_LIB && !prog->is_separable) + pc_entry->pipeline = zink_create_gfx_pipeline(screen, variant_prog, objs, state, NULL, vkmode, !HAVE_LIB); + if (HAVE_LIB && !variant_prog->is_separable) /* trigger async optimized pipeline compile if this was an unoptimized pipeline */ zink_gfx_program_compile_queue(ctx, pc_entry); } if (pc_entry->pipeline == VK_NULL_HANDLE) return VK_NULL_HANDLE; - zink_screen_update_pipeline_cache(screen, &prog->base, false); + zink_screen_update_pipeline_cache(screen, &variant_prog->base, false); } struct zink_gfx_pipeline_cache_entry *cache_entry = (struct zink_gfx_pipeline_cache_entry *)entry->data; @@ -291,8 +312,8 @@ zink_get_gfx_pipeline(struct zink_context *ctx, state->pipeline = cache_entry->pipeline; /* update states for fastpath */ if (DYNAMIC_STATE >= ZINK_DYNAMIC_VERTEX_INPUT) { - prog->last_finalized_hash[idx] = final_hash; - prog->last_pipeline[idx] = cache_entry; + variant_prog->last_finalized_hash[state->uber_required][idx] = final_hash; + variant_prog->last_pipeline[state->uber_required][idx] = cache_entry; } return IS_MESH ? 
state->mesh_pipeline : state->pipeline; } @@ -355,6 +376,8 @@ equals_gfx_pipeline_state(const void *a, const void *b) if (STAGE_MASK & STAGE_MASK_OPTIMAL) { if (sa->optimal_key != sb->optimal_key) return false; + if (sa->st_key.small_key.val != sb->st_key.small_key.val) + return false; if (STAGE_MASK & STAGE_MASK_OPTIMAL_SHADOW) { if (sa->shadow != sb->shadow) return false; diff --git a/src/gallium/drivers/zink/zink_shader_keys.h b/src/gallium/drivers/zink/zink_shader_keys.h index ea883007387..a76a6556cc4 100644 --- a/src/gallium/drivers/zink/zink_shader_keys.h +++ b/src/gallium/drivers/zink/zink_shader_keys.h @@ -28,6 +28,41 @@ #include "compiler/shader_info.h" +union zink_st_small_key { + struct { + /** for ARB_color_buffer_float */ + uint8_t clamp_color:1; + /* for user-defined clip-planes */ + uint8_t lower_ucp:1; + /* Whether st_variant::driver_shader is for the draw module, + * not for the driver. + */ + uint8_t is_draw_shader:1; + uint8_t lower_flatshade:1; + uint8_t lower_alpha_test:1; + uint16_t pad: 11; // from here not key + }; + uint16_t val; +}; + +struct zink_st_variant_key +{ + union zink_st_small_key small_key; + + uint8_t ucp_enables: 8; + + unsigned lower_alpha_func:3; + + + uint32_t pad2: 5; //next array aligned to uint32 for easy access + + /* bitmask of sampler units; PIPE_CAP_GL_CLAMP */ + uint32_t gl_clamp[3]; + + /* needs more than 128 bytes */ + struct pipe_clip_state ucp_state; +}; + struct zink_vs_key_base { bool last_vertex_stage : 1; bool clip_halfz : 1; diff --git a/src/gallium/drivers/zink/zink_state.c b/src/gallium/drivers/zink/zink_state.c index 24175aceeed..2415a65351d 100644 --- a/src/gallium/drivers/zink/zink_state.c +++ b/src/gallium/drivers/zink/zink_state.c @@ -722,6 +722,21 @@ zink_bind_rasterizer_state(struct pipe_context *pctx, void *cso) if (!screen->optimal_keys) zink_update_gs_key_rectangular_line(ctx); + + if (screen->optimal_keys) { + struct zink_st_variant_key *key = &ctx->gfx_pipeline_state.st_key; + + if 
((zink_get_st_small_key(ctx)->clamp_color) != ctx->rast_state->base.clamp_fragment_color) + zink_set_st_small_key(ctx)->clamp_color = ctx->rast_state->base.clamp_fragment_color; + + if ((zink_get_st_small_key(ctx)->lower_flatshade) != ctx->rast_state->base.flatshade) + zink_set_st_small_key(ctx)->lower_flatshade = ctx->rast_state->base.flatshade; + + key->ucp_enables = ctx->rast_state->base.clip_plane_enable; + + if ((zink_get_st_small_key(ctx)->lower_ucp) != !!key->ucp_enables) + zink_set_st_small_key(ctx)->lower_ucp = !!key->ucp_enables; + } } } diff --git a/src/gallium/drivers/zink/zink_types.h b/src/gallium/drivers/zink/zink_types.h index c2305ece7f7..bddbb9e87c4 100644 --- a/src/gallium/drivers/zink/zink_types.h +++ b/src/gallium/drivers/zink/zink_types.h @@ -795,6 +795,7 @@ struct zink_shader_object { VkShaderModule mod; }; struct spirv_shader *spirv; + VkPipeline gpl; }; struct zink_shader { @@ -824,6 +825,7 @@ struct zink_shader { bool has_uniforms; bool has_edgeflags; bool needs_inlining; + bool is_uber; struct spirv_shader *spirv; struct { @@ -932,6 +934,7 @@ struct zink_gfx_pipeline_state { uint32_t vertex_strides[PIPE_MAX_ATTRIBS]; struct zink_vertex_elements_hw_state *element_state; struct zink_zs_swizzle_key *shadow; + bool uber_required; // emulation needed && !async compilation done enum mesa_prim shader_rast_prim, rast_prim; /* reduced type or max for unknown */ union { struct { @@ -942,6 +945,7 @@ struct zink_gfx_pipeline_state { union zink_shader_key_optimal key; } shader_keys_optimal; }; + struct zink_st_variant_key st_key; struct zink_blend_state *blend_state; VkFormat rendering_formats[PIPE_MAX_COLOR_BUFS]; VkPipelineRenderingCreateInfo rendering_info; @@ -1008,6 +1012,7 @@ enum zink_gfx_push_constant_member { */ struct zink_shader_module { struct zink_shader_object obj; + struct util_queue_fence fence; uint32_t hash; bool shobj; bool default_variant; @@ -1049,6 +1054,7 @@ typedef bool (*equals_gfx_pipeline_state_func)(const void *a, const 
void *b); struct zink_gfx_library_key { uint32_t optimal_key; //equals_pipeline_lib_optimal + uint32_t st_key; VkShaderModule modules[MESA_SHADER_MESH_STAGES]; VkPipeline pipeline; }; @@ -1115,6 +1121,13 @@ struct zink_gfx_lib_cache { simple_mtx_t lock; struct set libs; //zink_gfx_library_key -> VkPipeline + struct zink_gfx_library_key *lib; //zink_gfx_library_key -> VkPipeline +}; + +struct zink_gfx_program_variant_key { + uint32_t optimal_key; //equals_pipeline_lib_optimal + uint32_t st_key; + struct zink_gfx_program *prog; }; struct zink_gfx_program { @@ -1135,21 +1148,29 @@ struct zink_gfx_program { uint32_t module_hash[MESA_SHADER_MESH_STAGES]; struct blob blobs[MESA_SHADER_MESH_STAGES]; struct util_dynarray shader_cache[MESA_SHADER_MESH_STAGES][2][2]; //normal, nonseamless cubes, inline uniforms + struct util_dynarray uber_modules; + struct set variants; + struct zink_gfx_program *base_variant; //quick access to base variant (only !NULL when done compiling) + struct zink_gfx_program *uber_variant; unsigned inlined_variant_count[MESA_SHADER_MESH_STAGES]; uint32_t default_variant_hash; uint8_t inline_variants; //which stages are using inlined uniforms bool needs_inlining; // whether this program requires some uniforms to be inlined bool has_edgeflags; bool optimal_keys; + bool started_compiling; + bool is_uber_program; + bool is_variant_program; /* separable */ struct zink_gfx_program *full_prog; - struct hash_table pipelines[11]; // [number of draw modes we support] + struct hash_table pipelines[2][11]; // [uber_emulation][number of draw modes we support] uint32_t last_variant_hash; + uint32_t st_key; - uint32_t last_finalized_hash[4]; //[primtype idx] - struct zink_gfx_pipeline_cache_entry *last_pipeline[4]; //[primtype idx] + uint32_t last_finalized_hash[2][4]; //[uber_emulation][primtype idx] + struct zink_gfx_pipeline_cache_entry *last_pipeline[2][4]; //[uber_emulation][primtype idx] struct zink_gfx_lib_cache *libs; }; @@ -1787,6 +1808,7 @@ struct
zink_context { simple_mtx_t program_lock[8]; uint32_t gfx_hash; struct zink_gfx_program *curr_program; + struct zink_gfx_program *curr_program_uber; struct set gfx_inputs; struct set gfx_outputs;