From f2ecf95fa7665819c95fbd3121a7a90d1e2ec602 Mon Sep 17 00:00:00 2001 From: antonino Date: Fri, 28 Jul 2023 11:11:15 +0200 Subject: [PATCH] zink: uber shaders logic Introduce the logic to implement uber shaders. The way variants is handled changes significantly: a uber program is expected to be compiling asynchronously and is used whenever possible. Specialized variant shaders are compiled asynchronously, though they might be compiled synchronously if the uber program can't be used. Each variant is a separate program as that simplifies gpl/obj caching. A new key is introduced, the st_key, that keeps track of the state of the features emulated by the uber shader. This is split in a dynamic part, always sent through push constants, and a more compact part that is used as a key for caching optimized variants. --- src/gallium/drivers/zink/zink_compiler.c | 54 +- src/gallium/drivers/zink/zink_compiler.h | 6 +- src/gallium/drivers/zink/zink_context.c | 4 +- src/gallium/drivers/zink/zink_draw.cpp | 13 +- src/gallium/drivers/zink/zink_pipeline.c | 4 +- src/gallium/drivers/zink/zink_pipeline.h | 2 +- src/gallium/drivers/zink/zink_program.c | 727 +++++++++++++----- src/gallium/drivers/zink/zink_program.h | 33 +- .../drivers/zink/zink_program_state.hpp | 99 ++- src/gallium/drivers/zink/zink_shader_keys.h | 35 + src/gallium/drivers/zink/zink_state.c | 15 + src/gallium/drivers/zink/zink_types.h | 28 +- 12 files changed, 766 insertions(+), 254 deletions(-) diff --git a/src/gallium/drivers/zink/zink_compiler.c b/src/gallium/drivers/zink/zink_compiler.c index 50414a51037..75a16f41545 100644 --- a/src/gallium/drivers/zink/zink_compiler.c +++ b/src/gallium/drivers/zink/zink_compiler.c @@ -3879,9 +3879,25 @@ remove_interpolate_at_sample(struct nir_builder *b, nir_intrinsic_instr *interp, return true; } +static void +zink_optimized_st_emulation_passes(nir_shader *nir, struct zink_shader *zs, + const struct zink_st_variant_key *key) +{ + if (!nir->info.io_lowered) + return; +} + 
+static void +zink_emulation_passes(nir_shader *nir, struct zink_shader *zs) +{ + if (!nir->info.io_lowered) + return; +} + struct zink_shader_object -zink_shader_compile(struct zink_screen *screen, bool can_shobj, struct zink_shader *zs, - nir_shader *nir, const struct zink_shader_key *key, const void *extra_data, struct zink_program *pg) +zink_shader_compile(struct zink_screen *screen, bool can_shobj, struct zink_shader *zs, nir_shader *nir, + const struct zink_shader_key *key, const struct zink_st_variant_key *st_key, + bool compile_uber, const void *extra_data, struct zink_program *pg) { bool need_optimize = true; bool inlined_uniforms = false; @@ -3891,8 +3907,18 @@ zink_shader_compile(struct zink_screen *screen, bool can_shobj, struct zink_shad NIR_PASS(_, nir, nir_lower_sample_shading); } + if (compile_uber) { + zink_emulation_passes(nir, zs); + need_optimize = true; + } + else if (st_key) { + zink_optimized_st_emulation_passes(nir, zs, st_key); + need_optimize = true; + } + NIR_PASS(_, nir, add_derefs); NIR_PASS(_, nir, nir_lower_fragcolor, nir->info.fs.color_is_dual_source ? 1 : 8); + if (key) { if (key->inline_uniforms) { NIR_PASS(_, nir, nir_inline_uniforms, @@ -4077,7 +4103,7 @@ zink_shader_compile(struct zink_screen *screen, bool can_shobj, struct zink_shad } struct zink_shader_object -zink_shader_compile_separate(struct zink_screen *screen, struct zink_shader *zs) +zink_shader_compile_separate(struct zink_screen *screen, struct zink_shader *zs, bool compile_uber) { nir_shader *nir = zs->nir; /* TODO: maybe compile multiple variants for different set counts for compact mode? */ @@ -4107,6 +4133,10 @@ zink_shader_compile_separate(struct zink_screen *screen, struct zink_shader *zs) default: break; } } + + if (compile_uber) + zink_emulation_passes(nir, zs); + NIR_PASS(_, nir, add_derefs); NIR_PASS(_, nir, nir_lower_fragcolor, nir->info.fs.color_is_dual_source ? 
1 : 8); if (screen->driconf.inline_uniforms) { @@ -4114,6 +4144,7 @@ zink_shader_compile_separate(struct zink_screen *screen, struct zink_shader *zs) NIR_PASS(_, nir, rewrite_bo_access, screen); NIR_PASS(_, nir, remove_bo_access, zs); } + optimize_nir(nir, zs, true); zink_descriptor_shader_init(screen, zs); nir_shader *nir_clone = NULL; @@ -4128,7 +4159,7 @@ zink_shader_compile_separate(struct zink_screen *screen, struct zink_shader *zs) zs->non_fs.generated_tcs = zink_shader_tcs_create(screen, 32); zink_shader_tcs_init(screen, zs->non_fs.generated_tcs, nir_clone, &nir_tcs); nir_tcs->info.separate_shader = true; - zs->non_fs.generated_tcs->precompile.obj = zink_shader_compile_separate(screen, zs->non_fs.generated_tcs); + zs->non_fs.generated_tcs->precompile.obj = zink_shader_compile_separate(screen, zs->non_fs.generated_tcs, compile_uber); ralloc_free(nir_tcs); zs->non_fs.generated_tcs->nir = NULL; } @@ -6448,11 +6479,13 @@ gfx_shader_prune(struct zink_screen *screen, struct zink_shader *shader) prog->base.removed = true; simple_mtx_unlock(lock); - for (int i = 0; i < ARRAY_SIZE(prog->pipelines); ++i) { - hash_table_foreach(&prog->pipelines[i], table_entry) { - struct zink_gfx_pipeline_cache_entry *pc_entry = table_entry->data; + for (int r = 0; r < ARRAY_SIZE(prog->pipelines); ++r) { + for (int i = 0; i < ARRAY_SIZE(prog->pipelines[0]); ++i) { + hash_table_foreach(&prog->pipelines[r][i], table_entry) { + struct zink_gfx_pipeline_cache_entry *pc_entry = table_entry->data; - util_queue_fence_wait(&pc_entry->fence); + util_queue_fence_wait(&pc_entry->fence); + } } } } @@ -6468,7 +6501,10 @@ gfx_shader_prune(struct zink_screen *screen, struct zink_shader *shader) prog->shaders[MESA_SHADER_GEOMETRY]->non_fs.parent == shader) { prog->shaders[MESA_SHADER_GEOMETRY] = NULL; } - zink_gfx_program_reference(screen, &prog, NULL); + + /* variant programs are owned and destroyed by their parent */ + if (!prog->is_variant_program) + zink_gfx_program_reference(screen, &prog, 
NULL); return true; } diff --git a/src/gallium/drivers/zink/zink_compiler.h b/src/gallium/drivers/zink/zink_compiler.h index bec8fae913e..0c33a493b38 100644 --- a/src/gallium/drivers/zink/zink_compiler.h +++ b/src/gallium/drivers/zink/zink_compiler.h @@ -63,9 +63,11 @@ void zink_compiler_assign_io(struct zink_screen *screen, nir_shader *producer, nir_shader *consumer); /* pass very large shader key data with extra_data */ struct zink_shader_object -zink_shader_compile(struct zink_screen *screen, bool can_shobj, struct zink_shader *zs, nir_shader *nir, const struct zink_shader_key *key, const void *extra_data, struct zink_program *pg); +zink_shader_compile(struct zink_screen *screen, bool can_shobj, struct zink_shader *zs, nir_shader *nir, + const struct zink_shader_key *key, const struct zink_st_variant_key *st_key, + bool compile_uber, const void *extra_data, struct zink_program *pg); struct zink_shader_object -zink_shader_compile_separate(struct zink_screen *screen, struct zink_shader *zs); +zink_shader_compile_separate(struct zink_screen *screen, struct zink_shader *zs, bool compile_uber); struct zink_shader * zink_shader_create(struct zink_screen *screen, struct nir_shader *nir); void diff --git a/src/gallium/drivers/zink/zink_context.c b/src/gallium/drivers/zink/zink_context.c index 889699d80f2..413d46de30a 100644 --- a/src/gallium/drivers/zink/zink_context.c +++ b/src/gallium/drivers/zink/zink_context.c @@ -3721,8 +3721,8 @@ zink_update_descriptor_refs(struct zink_context *ctx, bool compute) res->obj->unordered_read = false; } } - if (ctx->curr_program) - zink_batch_reference_program(ctx, &ctx->curr_program->base); + if (ctx->curr_program_uber || ctx->curr_program) + zink_batch_reference_program(ctx, &ctx->curr_program_uber->base); } if (ctx->di.bindless_refs_dirty) { ctx->di.bindless_refs_dirty = false; diff --git a/src/gallium/drivers/zink/zink_draw.cpp b/src/gallium/drivers/zink/zink_draw.cpp index 7476d19edbb..a28024f4d67 100644 --- 
a/src/gallium/drivers/zink/zink_draw.cpp +++ b/src/gallium/drivers/zink/zink_draw.cpp @@ -265,11 +265,11 @@ update_gfx_pipeline(struct zink_context *ctx, struct zink_batch_state *bs, enum zink_gfx_program_update(ctx); bool pipeline_changed = false; VkPipeline pipeline = VK_NULL_HANDLE; - if (!ctx->curr_program->base.uses_shobj) { + if (!(ctx->gfx_pipeline_state.uber_required ? ctx->curr_program_uber : ctx->curr_program)->base.uses_shobj) { if (screen->info.have_EXT_graphics_pipeline_library) - pipeline = zink_get_gfx_pipeline(ctx, ctx->curr_program, &ctx->gfx_pipeline_state, mode); + pipeline = zink_get_gfx_pipeline(ctx, ctx->curr_program_uber, ctx->curr_program, &ctx->gfx_pipeline_state, mode); else - pipeline = zink_get_gfx_pipeline(ctx, ctx->curr_program, &ctx->gfx_pipeline_state, mode); + pipeline = zink_get_gfx_pipeline(ctx, ctx->curr_program, ctx->curr_program, &ctx->gfx_pipeline_state, mode); assert(pipeline); pipeline_changed = prev_pipeline != pipeline || ctx->shobj_draw; if (BATCH_CHANGED || pipeline_changed) @@ -285,7 +285,8 @@ update_gfx_pipeline(struct zink_context *ctx, struct zink_batch_state *bs, enum VK_SHADER_STAGE_FRAGMENT_BIT, }; /* always rebind all stages */ - VKCTX(CmdBindShadersEXT)(bs->cmdbuf, ZINK_GFX_SHADER_COUNT, stages, ctx->curr_program->objects); + VKCTX(CmdBindShadersEXT)(bs->cmdbuf, ZINK_GFX_SHADER_COUNT, stages, + ctx->gfx_pipeline_state.uber_required ? 
ctx->curr_program_uber->objects : ctx->curr_program->objects); if (screen->info.have_EXT_mesh_shader) { /* must always unbind mesh stages */ VkShaderStageFlagBits mesh_stages[] = { @@ -994,9 +995,9 @@ update_mesh_pipeline(struct zink_context *ctx, struct zink_batch_state *bs) VkPipeline pipeline = VK_NULL_HANDLE; if (!ctx->mesh_program->base.uses_shobj) { if (screen->info.have_EXT_graphics_pipeline_library) - pipeline = zink_get_gfx_pipeline(ctx, ctx->mesh_program, &ctx->gfx_pipeline_state, MESA_PRIM_COUNT); + pipeline = zink_get_gfx_pipeline(ctx, ctx->mesh_program, ctx->mesh_program, &ctx->gfx_pipeline_state, MESA_PRIM_COUNT); else - pipeline = zink_get_gfx_pipeline(ctx, ctx->mesh_program, &ctx->gfx_pipeline_state, MESA_PRIM_COUNT); + pipeline = zink_get_gfx_pipeline(ctx, ctx->mesh_program, ctx->mesh_program, &ctx->gfx_pipeline_state, MESA_PRIM_COUNT); assert(pipeline); pipeline_changed = prev_pipeline != pipeline || ctx->shobj_draw; if (BATCH_CHANGED || pipeline_changed) diff --git a/src/gallium/drivers/zink/zink_pipeline.c b/src/gallium/drivers/zink/zink_pipeline.c index a5363a98481..1114bf1186e 100644 --- a/src/gallium/drivers/zink/zink_pipeline.c +++ b/src/gallium/drivers/zink/zink_pipeline.c @@ -884,10 +884,10 @@ create_gfx_pipeline_library(struct zink_screen *screen, struct zink_shader_objec } VkPipeline -zink_create_gfx_pipeline_library(struct zink_screen *screen, struct zink_gfx_program *prog) +zink_create_gfx_pipeline_library(struct zink_screen *screen, struct zink_shader_object *objs, struct zink_gfx_program *prog) { u_rwlock_wrlock(&prog->base.pipeline_cache_lock); - VkPipeline pipeline = create_gfx_pipeline_library(screen, prog->objs, prog->stages_present, prog->base.layout, prog->base.pipeline_cache); + VkPipeline pipeline = create_gfx_pipeline_library(screen, objs, prog->stages_present, prog->base.layout, prog->base.pipeline_cache); u_rwlock_wrunlock(&prog->base.pipeline_cache_lock); return pipeline; } diff --git 
a/src/gallium/drivers/zink/zink_pipeline.h b/src/gallium/drivers/zink/zink_pipeline.h index 7b050f15efb..aa66cf0dc21 100644 --- a/src/gallium/drivers/zink/zink_pipeline.h +++ b/src/gallium/drivers/zink/zink_pipeline.h @@ -58,7 +58,7 @@ zink_create_gfx_pipeline_input(struct zink_screen *screen, const uint8_t *binding_map, VkPrimitiveTopology primitive_topology); VkPipeline -zink_create_gfx_pipeline_library(struct zink_screen *screen, struct zink_gfx_program *prog); +zink_create_gfx_pipeline_library(struct zink_screen *screen, struct zink_shader_object *objs, struct zink_gfx_program *prog); VkPipeline zink_create_gfx_pipeline_output(struct zink_screen *screen, struct zink_gfx_pipeline_state *state); VkPipeline diff --git a/src/gallium/drivers/zink/zink_program.c b/src/gallium/drivers/zink/zink_program.c index 00b1ee702a5..e63f4735a73 100644 --- a/src/gallium/drivers/zink/zink_program.c +++ b/src/gallium/drivers/zink/zink_program.c @@ -41,6 +41,7 @@ #include "nir_serialize.h" #include "nir.h" #include "nir/nir_draw_helpers.h" +#include "util/u_queue.h" /* for pipeline cache */ #define XXH_INLINE_ALL @@ -48,9 +49,34 @@ static void gfx_program_precompile_job(void *data, void *gdata, int thread_index); +static void +precompile_variant_job(void *data, void *gdata, int thread_index); +static void +precompile_variant_separate_shader_job(void *data, void *gdata, int thread_index); struct zink_gfx_program * create_gfx_program_separable(struct zink_context *ctx, struct zink_shader **stages, unsigned vertices_per_patch, bool is_mesh); +struct precompile_variant_data { + struct zink_gfx_program *prog; + struct zink_gfx_pipeline_state state; +}; + +struct precompile_separate_variant_data { + struct zink_program *prog; + struct zink_shader_module *zm; + struct zink_shader *zs; + struct blob *blob; + bool uses_shobj; + struct zink_shader_key key; + struct zink_st_variant_key st_key; + bool has_key; +}; + +struct program_variant_key { + uint32_t key, st_key; + struct 
zink_gfx_program *prog; +}; + void debug_describe_zink_gfx_program(char *buf, const struct zink_gfx_program *ptr) { @@ -69,9 +95,9 @@ shader_key_matches_tcs_nongenerated(const struct zink_shader_module *zm, const s if (zm->num_uniforms != num_uniforms || zm->has_nonseamless != !!key->base.nonseamless_cube_mask || zm->needs_zs_shader_swizzle != key->base.needs_zs_shader_swizzle) return false; - const uint32_t nonseamless_size = zm->has_nonseamless ? sizeof(uint32_t) : 0; - return (!nonseamless_size || !memcmp(zm->key + zm->key_size, &key->base.nonseamless_cube_mask, nonseamless_size)) && - (!num_uniforms || !memcmp(zm->key + zm->key_size + nonseamless_size, + const uint32_t nonseamless_size = zm->has_nonseamless ? sizeof(union zink_st_small_key) : 0; + return (!nonseamless_size || !memcmp(zm->key + zm->key_size + sizeof(union zink_st_small_key), &key->base.nonseamless_cube_mask, nonseamless_size)) && + (!num_uniforms || !memcmp(zm->key + zm->key_size + sizeof(union zink_st_small_key) + nonseamless_size, key->base.inlined_uniform_values, zm->num_uniforms * sizeof(uint32_t))); } @@ -84,13 +110,13 @@ shader_key_matches(const struct zink_shader_module *zm, if (has_inline) { if (zm->num_uniforms != num_uniforms || (num_uniforms && - memcmp(zm->key + zm->key_size + nonseamless_size, + memcmp(zm->key + zm->key_size + sizeof(union zink_st_small_key) + nonseamless_size, key->base.inlined_uniform_values, zm->num_uniforms * sizeof(uint32_t)))) return false; } if (!has_nonseamless) { if (zm->has_nonseamless != !!key->base.nonseamless_cube_mask || - (nonseamless_size && memcmp(zm->key + zm->key_size, &key->base.nonseamless_cube_mask, nonseamless_size))) + (nonseamless_size && memcmp(zm->key + zm->key_size + sizeof(union zink_st_small_key), &key->base.nonseamless_cube_mask, nonseamless_size))) return false; } if (zm->needs_zs_shader_swizzle != key->base.needs_zs_shader_swizzle) @@ -142,18 +168,19 @@ create_shader_module_for_stage(struct zink_context *ctx, struct zink_screen *scr 
const bool is_nongenerated_tcs = stage == MESA_SHADER_TESS_CTRL && !zs->non_fs.is_generated; const bool shadow_needs_shader_swizzle = key->base.needs_zs_shader_swizzle || (stage == MESA_SHADER_FRAGMENT && key->key.fs.base.shadow_needs_shader_swizzle); - zm = malloc(sizeof(struct zink_shader_module) + key->size + + zm = malloc(sizeof(struct zink_shader_module) + sizeof(union zink_st_small_key) + key->size + (!has_nonseamless ? nonseamless_size : 0) + inline_size * sizeof(uint32_t) + (shadow_needs_shader_swizzle ? sizeof(struct zink_zs_swizzle_key) : 0)); if (!zm) { return NULL; } + util_queue_fence_init(&zm->fence); unsigned patch_vertices = state->shader_keys.key[MESA_SHADER_TESS_CTRL].key.tcs.patch_vertices; if (stage == MESA_SHADER_TESS_CTRL && zs->non_fs.is_generated && zs->spirv) { assert(ctx); //TODO async zm->obj = zink_shader_tcs_compile(screen, zs, patch_vertices, prog->base.uses_shobj, &prog->base); } else { - zm->obj = zink_shader_compile(screen, prog->base.uses_shobj, zs, zink_shader_blob_deserialize(screen, &prog->blobs[stage]), key, &ctx->di.zs_swizzle[stage], &prog->base); + zm->obj = zink_shader_compile(screen, prog->base.uses_shobj, zs, zink_shader_blob_deserialize(screen, &prog->blobs[stage]), key, &state->st_key, false, &ctx->di.zs_swizzle[stage], &prog->base); } if (!zm->obj.mod) { FREE(zm); @@ -168,20 +195,22 @@ create_shader_module_for_stage(struct zink_context *ctx, struct zink_screen *scr zm->key_size = 0; memset(zm->key, 0, key->size); } + uint16_t st_val = state->st_key.small_key.val; + memcpy(zm->key + key->size, &st_val, sizeof(st_val)); if (!has_nonseamless && nonseamless_size) { /* nonseamless mask gets added to base key if it exists */ - memcpy(zm->key + key->size, &key->base.nonseamless_cube_mask, nonseamless_size); + memcpy(zm->key + key->size + sizeof(st_val), &key->base.nonseamless_cube_mask, nonseamless_size); } zm->needs_zs_shader_swizzle = shadow_needs_shader_swizzle; zm->has_nonseamless = has_nonseamless ? 
0 : !!nonseamless_size; if (inline_size) - memcpy(zm->key + key->size + nonseamless_size, key->base.inlined_uniform_values, inline_size * sizeof(uint32_t)); + memcpy(zm->key + key->size + sizeof(st_val) + nonseamless_size, key->base.inlined_uniform_values, inline_size * sizeof(uint32_t)); if (stage == MESA_SHADER_TESS_CTRL && zs->non_fs.is_generated) zm->hash = patch_vertices; else zm->hash = shader_module_hash(zm); if (unlikely(shadow_needs_shader_swizzle)) { - memcpy(zm->key + key->size + nonseamless_size + inline_size * sizeof(uint32_t), &ctx->di.zs_swizzle[stage], sizeof(struct zink_zs_swizzle_key)); + memcpy(zm->key + key->size + sizeof(st_val) + nonseamless_size + inline_size * sizeof(uint32_t), &ctx->di.zs_swizzle[stage], sizeof(struct zink_zs_swizzle_key)); zm->hash ^= _mesa_hash_data(&ctx->di.zs_swizzle[stage], sizeof(struct zink_zs_swizzle_key)); } zm->default_variant = !shadow_needs_shader_swizzle && !inline_size && !util_dynarray_contains(&prog->shader_cache[stage][0][0], void*); @@ -219,9 +248,12 @@ get_shader_module_for_stage(struct zink_context *ctx, struct zink_screen *screen continue; if (!shader_key_matches(iter, key, inline_size, has_inline, has_nonseamless)) continue; + uint16_t st_val = state->st_key.small_key.val; + if (memcmp(iter->key + iter->key_size, &st_val, sizeof(st_val))) + continue; if (unlikely(shadow_needs_shader_swizzle)) { /* shadow swizzle data needs a manual compare since it's so fat */ - if (memcmp(iter->key + iter->key_size + nonseamless_size + iter->num_uniforms * sizeof(uint32_t), + if (memcmp(iter->key + iter->key_size + sizeof(st_val) + nonseamless_size + iter->num_uniforms * sizeof(uint32_t), &ctx->di.zs_swizzle[stage], sizeof(struct zink_zs_swizzle_key))) continue; } @@ -241,7 +273,8 @@ ALWAYS_INLINE static struct zink_shader_module * create_shader_module_for_stage_optimal(struct zink_context *ctx, struct zink_screen *screen, struct zink_shader *zs, struct zink_gfx_program *prog, mesa_shader_stage stage, - struct 
zink_gfx_pipeline_state *state) + struct zink_gfx_pipeline_state *state, + bool unpopulated, bool compile_uber) { struct zink_shader_module *zm; uint16_t *key; @@ -258,23 +291,30 @@ create_shader_module_for_stage_optimal(struct zink_context *ctx, struct zink_scr key = NULL; } size_t key_size = sizeof(uint16_t); - zm = calloc(1, sizeof(struct zink_shader_module) + (key ? key_size : 0) + (unlikely(shadow_needs_shader_swizzle) ? sizeof(struct zink_zs_swizzle_key) : 0)); + zm = calloc(1, sizeof(struct zink_shader_module) + + sizeof(union zink_st_small_key) + + (key ? key_size : 0) + + (unlikely(shadow_needs_shader_swizzle) ? sizeof(struct zink_zs_swizzle_key) : 0)); if (!zm) { return NULL; } - if (stage == MESA_SHADER_TESS_CTRL && zs->non_fs.is_generated && zs->spirv) { - assert(ctx || screen->info.dynamic_state2_feats.extendedDynamicState2PatchControlPoints); - unsigned patch_vertices = 3; - if (ctx) { - struct zink_tcs_key *tcs = (struct zink_tcs_key*)key; - patch_vertices = tcs->patch_vertices; - } - zm->obj = zink_shader_tcs_compile(screen, zs, patch_vertices, prog->base.uses_shobj, &prog->base); - } else { - zm->obj = zink_shader_compile(screen, prog->base.uses_shobj, zs, zink_shader_blob_deserialize(screen, &prog->blobs[stage]), - (struct zink_shader_key*)key, shadow_needs_shader_swizzle ? 
&ctx->di.zs_swizzle[stage] : NULL, &prog->base); + util_queue_fence_init(&zm->fence); + if (!unpopulated) { + if (stage == MESA_SHADER_TESS_CTRL && zs->non_fs.is_generated && zs->spirv) { + assert(ctx || screen->info.dynamic_state2_feats.extendedDynamicState2PatchControlPoints); + unsigned patch_vertices = 3; + if (ctx) { + struct zink_tcs_key *tcs = (struct zink_tcs_key*)key; + patch_vertices = tcs->patch_vertices; + } + zm->obj = zink_shader_tcs_compile(screen, zs, patch_vertices, prog->base.uses_shobj, &prog->base); + } else { + zm->obj = zink_shader_compile(screen, prog->base.uses_shobj, zs, zink_shader_blob_deserialize(screen, &prog->blobs[stage]), + (struct zink_shader_key*)key, &state->st_key, compile_uber, + shadow_needs_shader_swizzle ? &ctx->di.zs_swizzle[stage] : NULL, &prog->base); + } } - if (!zm->obj.mod) { + if (!zm->obj.mod && !unpopulated) { FREE(zm); return NULL; } @@ -288,9 +328,17 @@ create_shader_module_for_stage_optimal(struct zink_context *ctx, struct zink_scr *data = (*key) & mask; if (unlikely(shadow_needs_shader_swizzle)) memcpy(&data[1], &ctx->di.zs_swizzle[stage], sizeof(struct zink_zs_swizzle_key)); + uint16_t st_val = state->st_key.small_key.val; + uint8_t *p = (uint8_t*)&data[1]; + if (unlikely(shadow_needs_shader_swizzle)) + p += sizeof(struct zink_zs_swizzle_key); + memcpy(p, &st_val, sizeof(st_val)); } zm->default_variant = !util_dynarray_contains(&prog->shader_cache[stage][0][0], void*); - util_dynarray_append(&prog->shader_cache[stage][0][0], zm); + if (!compile_uber) + util_dynarray_append(&prog->shader_cache[stage][0][0], zm); + else + util_dynarray_append(&prog->uber_modules, zm); return zm; } @@ -298,7 +346,7 @@ ALWAYS_INLINE static struct zink_shader_module * get_shader_module_for_stage_optimal_key(struct zink_context *ctx, struct zink_screen *screen, struct zink_shader *zs, struct zink_gfx_program *prog, mesa_shader_stage stage, - struct zink_gfx_pipeline_state *state, uint16_t *key) + struct zink_gfx_pipeline_state *state, 
uint16_t *key, uint16_t *st_key) { /* non-generated tcs won't use the shader key */ const bool is_nongenerated_tcs = stage == MESA_SHADER_TESS_CTRL && !zs->non_fs.is_generated; @@ -324,6 +372,12 @@ get_shader_module_for_stage_optimal_key(struct zink_context *ctx, struct zink_sc if (memcmp(iter->key + sizeof(uint16_t), &ctx->di.zs_swizzle[stage], sizeof(struct zink_zs_swizzle_key))) continue; } + uint16_t st_val = *st_key; + uint8_t *p = iter->key + sizeof(union zink_st_small_key); + if (unlikely(shadow_needs_shader_swizzle)) + p += sizeof(struct zink_zs_swizzle_key); + if (memcmp(p, &st_val, sizeof(st_val))) + continue; } if (i > 0) { struct zink_shader_module *zero = pzm[0]; @@ -360,16 +414,18 @@ get_shader_module_for_stage_optimal(struct zink_context *ctx, struct zink_screen mesa_shader_stage stage, struct zink_gfx_pipeline_state *state) { - uint16_t *key; + uint16_t *key, st_key; key = get_shader_module_optimal_key(ctx, prog, zs, stage); + st_key = state->st_key.small_key.val; - return get_shader_module_for_stage_optimal_key(ctx, screen, zs, prog, stage, state, key); + return get_shader_module_for_stage_optimal_key(ctx, screen, zs, prog, stage, state, key, &st_key); } static void zink_destroy_shader_module(struct zink_screen *screen, struct zink_shader_module *zm) { + util_queue_fence_wait(&zm->fence); if (zm->shobj) VKSCR(DestroyShaderEXT)(screen->dev, zm->obj.obj, NULL); else @@ -480,7 +536,7 @@ generate_gfx_program_modules(struct zink_context *ctx, struct zink_screen *scree } static void -generate_gfx_program_modules_optimal(struct zink_context *ctx, struct zink_screen *screen, struct zink_gfx_program *prog, struct zink_gfx_pipeline_state *state) +generate_gfx_program_modules_optimal(struct zink_context *ctx, struct zink_screen *screen, struct zink_gfx_program *prog, struct zink_gfx_pipeline_state *state, bool compile_uber) { assert(!prog->objs[MESA_SHADER_VERTEX].mod && !prog->objs[MESA_SHADER_MESH].mod); for (unsigned i = 0; i < MESA_SHADER_MESH_STAGES; 
i++) { @@ -489,7 +545,7 @@ generate_gfx_program_modules_optimal(struct zink_context *ctx, struct zink_scree assert(prog->shaders[i]); - struct zink_shader_module *zm = create_shader_module_for_stage_optimal(ctx, screen, prog->shaders[i], prog, i, state); + struct zink_shader_module *zm = create_shader_module_for_stage_optimal(ctx, screen, prog->shaders[i], prog, i, state, false, compile_uber); prog->objs[i] = zm->obj; prog->objects[i] = zm->obj.obj; } @@ -498,21 +554,11 @@ generate_gfx_program_modules_optimal(struct zink_context *ctx, struct zink_scree state->modules_changed = true; else state->mesh_modules_changed = true; - prog->last_variant_hash = prog->shaders[MESA_SHADER_MESH] ? state->mesh_optimal_key : state->optimal_key; -} -static uint32_t -hash_pipeline_lib_generated_tcs(const void *key) -{ - const struct zink_gfx_library_key *gkey = key; - return gkey->optimal_key; -} - - -static bool -equals_pipeline_lib_generated_tcs(const void *a, const void *b) -{ - return !memcmp(a, b, sizeof(uint32_t)); + if (!compile_uber) { + prog->last_variant_hash = prog->shaders[MESA_SHADER_MESH] ? 
state->mesh_optimal_key : state->optimal_key; + prog->st_key = state->st_key.small_key.val; + } } static uint32_t @@ -530,25 +576,6 @@ equals_pipeline_lib_mesh(const void *a, const void *b) return ak->optimal_key == bk->optimal_key; } -static uint32_t -hash_pipeline_lib(const void *key) -{ - const struct zink_gfx_library_key *gkey = key; - /* remove generated tcs bits */ - return zink_shader_key_optimal_no_tcs(gkey->optimal_key); -} - -static bool -equals_pipeline_lib(const void *a, const void *b) -{ - const struct zink_gfx_library_key *ak = a; - const struct zink_gfx_library_key *bk = b; - /* remove generated tcs bits */ - uint32_t val_a = zink_shader_key_optimal_no_tcs(ak->optimal_key); - uint32_t val_b = zink_shader_key_optimal_no_tcs(bk->optimal_key); - return val_a == val_b; -} - uint32_t hash_gfx_input_dynamic(const void *key) { @@ -673,7 +700,7 @@ zink_gfx_program_update(struct zink_context *ctx) update_gfx_program(ctx, prog); } else { ctx->dirty_gfx_stages |= ctx->shader_stages; - prog = zink_create_gfx_program(ctx, ctx->gfx_stages, ctx->gfx_pipeline_state.dyn_state2.vertices_per_patch, hash, false); + prog = zink_create_gfx_program(ctx, ctx->gfx_stages, ctx->gfx_pipeline_state.dyn_state2.vertices_per_patch, hash, false, false); zink_screen_get_pipeline_cache(zink_screen(ctx->base.screen), &prog->base, false); _mesa_hash_table_insert_pre_hashed(ht, hash, prog->shaders, prog); prog->base.removed = false; @@ -682,7 +709,8 @@ zink_gfx_program_update(struct zink_context *ctx) simple_mtx_unlock(&ctx->program_lock[zink_program_cache_stages(ctx->shader_stages)]); if (prog && prog != ctx->curr_program) zink_batch_reference_program(ctx, &prog->base); - ctx->curr_program = prog; + ctx->curr_program_uber = ctx->curr_program = prog; + ctx->gfx_pipeline_state.uber_required = false; ctx->gfx_pipeline_state.final_hash ^= ctx->curr_program->last_variant_hash; ctx->gfx_dirty = false; } else if (ctx->dirty_gfx_stages) { @@ -695,41 +723,72 @@ zink_gfx_program_update(struct 
zink_context *ctx) ctx->dirty_gfx_stages = 0; } -ALWAYS_INLINE static bool -update_gfx_shader_module_optimal(struct zink_context *ctx, struct zink_gfx_program *prog, mesa_shader_stage pstage) +ALWAYS_INLINE static void +gfx_program_cache_populate_queue(struct zink_context *ctx, struct zink_gfx_program *prog, mesa_shader_stage pstage, struct zink_shader_module *zm) +{ + struct zink_screen *screen = zink_screen(ctx->base.screen); + if (screen->info.have_EXT_graphics_pipeline_library) + util_queue_fence_wait(&prog->base.cache_fence); + struct precompile_separate_variant_data *data = CALLOC_STRUCT(precompile_separate_variant_data); + data->prog = &prog->base; + data->zs = prog->shaders[pstage]; + data->blob = &prog->blobs[pstage]; + data->uses_shobj = prog->base.uses_shobj; + data->zm = zm; + struct zink_shader_key* keyp = (struct zink_shader_key*)get_shader_module_optimal_key(ctx, prog, data->zs, pstage); + if (keyp) + data->key = *keyp; + data->has_key = !!keyp; + data->st_key = ctx->gfx_pipeline_state.st_key; + if (zink_debug & ZINK_DEBUG_NOBGC) { + precompile_variant_separate_shader_job(data, screen, 0); + } else { + util_queue_add_job(&screen->cache_get_thread, data, &zm->fence, precompile_variant_separate_shader_job, NULL, 0); + } +} + +ALWAYS_INLINE static struct zink_shader_module * +update_or_queue_gfx_shader_module_optimal(struct zink_context *ctx, struct zink_gfx_program *prog, mesa_shader_stage pstage, bool async) { struct zink_screen *screen = zink_screen(ctx->base.screen); if (screen->info.have_EXT_graphics_pipeline_library) util_queue_fence_wait(&prog->base.cache_fence); struct zink_shader_module *zm = get_shader_module_for_stage_optimal(ctx, screen, prog->shaders[pstage], prog, pstage, &ctx->gfx_pipeline_state); + bool entry_found = !!zm; + bool async_done = zm && util_queue_fence_is_signalled(&zm->fence); if (!zm) { - zm = create_shader_module_for_stage_optimal(ctx, screen, prog->shaders[pstage], prog, pstage, &ctx->gfx_pipeline_state); + zm = 
create_shader_module_for_stage_optimal(ctx, screen, prog->shaders[pstage], prog, pstage, &ctx->gfx_pipeline_state, async, false); perf_debug(ctx, "zink[gfx_compile]: %s shader variant required\n", _mesa_shader_stage_to_string(pstage)); } - - bool changed = prog->objs[pstage].mod != zm->obj.mod; - prog->objs[pstage] = zm->obj; - prog->objects[pstage] = zm->obj.obj; - return changed; + if (!async || async_done) { + return zm; + } else { + if (!entry_found) + gfx_program_cache_populate_queue(ctx, prog, pstage, zm); + } + return NULL; } -static void -update_gfx_program_optimal(struct zink_context *ctx, struct zink_gfx_program *prog) +static bool +update_gfx_program_optimal(struct zink_context *ctx, struct zink_gfx_program *prog, struct zink_gfx_program *variant_prog, bool async) { + bool async_done = true; + struct zink_shader_module *zms[3] = {0}; const union zink_shader_key_optimal *key = (union zink_shader_key_optimal*)&ctx->gfx_pipeline_state.optimal_key; const union zink_shader_key_optimal *last_prog_key = (union zink_shader_key_optimal*)&prog->last_variant_hash; - if (key->vs_bits != last_prog_key->vs_bits) { - assert(!prog->is_separable); - bool changed = update_gfx_shader_module_optimal(ctx, prog, ctx->last_vertex_stage->info.stage); - ctx->gfx_pipeline_state.modules_changed |= changed; + bool st_key_diff = ctx->gfx_pipeline_state.st_key.small_key.val != prog->st_key; + if (st_key_diff || key->vs_bits != last_prog_key->vs_bits) { + assert(!variant_prog->is_separable); + zms[0] = update_or_queue_gfx_shader_module_optimal(ctx, prog, ctx->last_vertex_stage->info.stage, async); + async_done &= !!zms[0]; } const bool shadow_needs_shader_swizzle = last_prog_key->fs.shadow_needs_shader_swizzle && (ctx->dirty_gfx_stages & BITFIELD_BIT(MESA_SHADER_FRAGMENT)); - if (key->fs_bits != last_prog_key->fs_bits || + if (st_key_diff || key->fs_bits != last_prog_key->fs_bits || /* always recheck shadow swizzles since they aren't directly part of the key */ 
unlikely(shadow_needs_shader_swizzle)) { - assert(!prog->is_separable); - bool changed = update_gfx_shader_module_optimal(ctx, prog, MESA_SHADER_FRAGMENT); - ctx->gfx_pipeline_state.modules_changed |= changed; + assert(!variant_prog->is_separable); + zms[1] = update_or_queue_gfx_shader_module_optimal(ctx, prog, MESA_SHADER_FRAGMENT, async); + async_done &= !!zms[1]; if (unlikely(shadow_needs_shader_swizzle)) { struct zink_shader_module **pzm = prog->shader_cache[MESA_SHADER_FRAGMENT][0][0].data; ctx->gfx_pipeline_state.shadow = (struct zink_zs_swizzle_key*)pzm[0]->key + sizeof(uint16_t); @@ -737,11 +796,77 @@ update_gfx_program_optimal(struct zink_context *ctx, struct zink_gfx_program *pr } if (prog->shaders[MESA_SHADER_TESS_CTRL] && prog->shaders[MESA_SHADER_TESS_CTRL]->non_fs.is_generated && key->tcs_bits != last_prog_key->tcs_bits) { - assert(!prog->is_separable); - bool changed = update_gfx_shader_module_optimal(ctx, prog, MESA_SHADER_TESS_CTRL); - ctx->gfx_pipeline_state.modules_changed |= changed; + assert(!variant_prog->is_separable); + zms[2] = update_or_queue_gfx_shader_module_optimal(ctx, prog, MESA_SHADER_TESS_CTRL, async); + async_done &= !!zms[2]; } - prog->last_variant_hash = ctx->gfx_pipeline_state.optimal_key; + mesa_shader_stage stages[] = {ctx->last_vertex_stage->info.stage, MESA_SHADER_FRAGMENT, MESA_SHADER_TESS_CTRL}; + if (async_done) { + for (int i = 0;i < 3; i++) { + if (!zms[i]) + continue; + variant_prog->objs[stages[i]] = zms[i]->obj; + variant_prog->objects[stages[i]] = zms[i]->obj.obj; + } + variant_prog->last_variant_hash = prog->last_variant_hash = ctx->gfx_pipeline_state.optimal_key; + variant_prog->st_key = prog->st_key = ctx->gfx_pipeline_state.st_key.small_key.val; + } + return async_done; +} + +static bool +update_gfx_program_missing_shaders(struct zink_context *ctx, struct zink_gfx_program *prog, + struct zink_gfx_program *variant_prog, bool async) +{ + bool async_done = true; + for (int rstage = 0; rstage < MESA_SHADER_COMPUTE; 
rstage++) { + assert(!!variant_prog->shaders[rstage] == !!prog->shaders[rstage]); + if (variant_prog->shaders[rstage] && !variant_prog->objs[rstage].mod) { + assert(!variant_prog->is_separable); + struct zink_shader_module *mod = update_or_queue_gfx_shader_module_optimal(ctx, prog, rstage, async); + async_done &= !!mod; + if (mod) { + bool changed = variant_prog->objs[rstage].mod != mod->obj.mod; + variant_prog->objs[rstage] = mod->obj; + variant_prog->objects[rstage] = mod->obj.obj; + ctx->gfx_pipeline_state.modules_changed |= changed; + } + } + } + return async_done; +} + +static void +copy_gfx_program_missing_shaders(struct zink_context *ctx, struct zink_gfx_program *base_prog, + struct zink_gfx_program *variant_prog) +{ + for (int rstage = 0; rstage < MESA_SHADER_COMPUTE; rstage++) { + assert(!!variant_prog->shaders[rstage] == !!base_prog->shaders[rstage]); + if (variant_prog->shaders[rstage] && !variant_prog->objs[rstage].mod) { + bool changed = variant_prog->objs[rstage].mod != base_prog->objs[rstage].mod; + variant_prog->objs[rstage] = base_prog->objs[rstage]; + variant_prog->objects[rstage] = base_prog->objects[rstage]; + ctx->gfx_pipeline_state.modules_changed |= changed; + } + } +} + +ALWAYS_INLINE static bool +update_gfx_shader_module_mesh(struct zink_context *ctx, struct zink_gfx_program *prog, mesa_shader_stage pstage) +{ + struct zink_screen *screen = zink_screen(ctx->base.screen); + if (screen->info.have_EXT_graphics_pipeline_library) + util_queue_fence_wait(&prog->base.cache_fence); + struct zink_shader_module *zm = get_shader_module_for_stage_optimal(ctx, screen, prog->shaders[pstage], prog, pstage, &ctx->gfx_pipeline_state); + if (!zm) { + zm = create_shader_module_for_stage_optimal(ctx, screen, prog->shaders[pstage], prog, pstage, &ctx->gfx_pipeline_state, false, false); + perf_debug(ctx, "zink[gfx_compile]: %s shader variant required\n", _mesa_shader_stage_to_string(pstage)); + } + + bool changed = prog->objs[pstage].mod != zm->obj.mod; + 
prog->objs[pstage] = zm->obj; + prog->objects[pstage] = zm->obj.obj; + return changed; } static void @@ -754,7 +879,7 @@ update_mesh_program_optimal(struct zink_context *ctx, struct zink_gfx_program *p /* always recheck shadow swizzles since they aren't directly part of the key */ unlikely(shadow_needs_shader_swizzle)) { assert(!prog->is_separable); - bool changed = update_gfx_shader_module_optimal(ctx, prog, MESA_SHADER_FRAGMENT); + bool changed = update_gfx_shader_module_mesh(ctx, prog, MESA_SHADER_FRAGMENT); ctx->gfx_pipeline_state.modules_changed |= changed; if (unlikely(shadow_needs_shader_swizzle)) { struct zink_shader_module **pzm = prog->shader_cache[MESA_SHADER_FRAGMENT][0][0].data; @@ -771,7 +896,7 @@ replace_separable_prog(struct zink_context *ctx, struct hash_entry *entry, struc struct zink_gfx_program *real = prog->full_prog ? prog->full_prog : /* this will be NULL with ZINK_DEBUG_NOOPT */ - zink_create_gfx_program(ctx, ctx->gfx_stages, ctx->gfx_pipeline_state.dyn_state2.vertices_per_patch, ctx->gfx_hash, false); + zink_create_gfx_program(ctx, ctx->gfx_stages, ctx->gfx_pipeline_state.dyn_state2.vertices_per_patch, ctx->gfx_hash, false, false); entry->data = real; entry->key = real->shaders; real->base.removed = false; @@ -780,12 +905,116 @@ replace_separable_prog(struct zink_context *ctx, struct hash_entry *entry, struc return real; } +static uint32_t +hash_gfx_program(const void *key) +{ + const uint32_t *k = key; + + return XXH32(k, sizeof(uint32_t[2]), 0); +} + +static bool +equals_program_variant(const void *a, const void *b) +{ + const struct program_variant_key *ak = a; + const struct program_variant_key *bk = b; + uint32_t val_a = ak->key; + uint32_t val_b = bk->key; + uint32_t val_a_st = ak->st_key; + uint32_t val_b_st = bk->st_key; + return val_a == val_b && val_a_st == val_b_st; +} + +#define CURR_KEY_PROGRAM(ctx) (ctx->gfx_pipeline_state.uber_required ? 
ctx->curr_program_uber: ctx->curr_program) + +static void +async_variant_program_update(struct zink_context *ctx, bool can_use_uber, bool needs_emulation) +{ + struct zink_screen *screen = zink_screen(ctx->base.screen); + bool needs_uber = false; + if (!ctx->curr_program_uber->is_separable && (!ctx->curr_program_uber->base_variant || needs_emulation)) { + struct program_variant_key prog_variant_key = {0}; + prog_variant_key.key = ctx->gfx_pipeline_state.shader_keys_optimal.key.val;//ctx->gfx_pipeline_state.optimal_key; + prog_variant_key.st_key = ctx->gfx_pipeline_state.st_key.small_key.val; + struct set_entry * variant_entry = _mesa_set_search(&ctx->curr_program_uber->variants, &prog_variant_key); + struct zink_gfx_program *variant; + if (!variant_entry) { + variant = zink_create_gfx_program(ctx, ctx->gfx_stages, 0, ctx->curr_program_uber->gfx_hash, false, true); + variant->base.uses_shobj = ctx->curr_program_uber->base.uses_shobj; + util_queue_fence_init(&variant->base.cache_fence); + struct program_variant_key *prog_variant_key_p = MALLOC(sizeof(struct program_variant_key)); + memcpy(prog_variant_key_p, &prog_variant_key, sizeof(struct program_variant_key)); + prog_variant_key_p->prog = variant; + variant->uber_variant = ctx->curr_program_uber; + _mesa_set_add(&ctx->curr_program_uber->variants, prog_variant_key_p); + needs_uber = true; + } else + variant = ((struct program_variant_key *)variant_entry->key)->prog; + /* fetches shader modules from cache and starts async compilation on a miss */ + bool async_done = update_gfx_program_optimal(ctx, ctx->curr_program_uber, variant, can_use_uber); + assert(can_use_uber || async_done); + if (async_done) { + if (ctx->curr_program_uber->base_variant) + copy_gfx_program_missing_shaders(ctx, ctx->curr_program_uber->base_variant, variant); + else + async_done = update_gfx_program_missing_shaders(ctx, ctx->curr_program_uber, variant, can_use_uber); + } + assert(can_use_uber || async_done); + needs_uber &= !async_done; + + if 
(async_done && !variant->started_compiling) { + /* Modules are ready but the program isn't. Start a job for it. */ + struct precompile_variant_data *data = CALLOC_STRUCT(precompile_variant_data); + data->prog = variant; + data->state = ctx->gfx_pipeline_state; + if (can_use_uber && !(zink_debug & ZINK_DEBUG_NOBGC)) + util_queue_add_job(&screen->cache_get_thread, data, &variant->base.cache_fence, precompile_variant_job, NULL, 0); + else + precompile_variant_job(data, screen, 0); + variant->started_compiling = true; + } + if (!can_use_uber) + util_queue_fence_wait(&variant->base.cache_fence); + bool variant_prog_ready = variant->started_compiling && + (!can_use_uber || util_queue_fence_is_signalled(&variant->base.cache_fence)); + assert(can_use_uber || variant_prog_ready); + if(variant_prog_ready) { + /* variant prog is ready, use it */ + if (ctx->curr_program != variant) { + ctx->gfx_pipeline_state.modules_changed = true; + ctx->curr_program = variant; + } + assert(async_done); + if (!needs_emulation) + ctx->curr_program_uber->base_variant = variant; + } + needs_uber |= !async_done || !variant_prog_ready; + } else if (ctx->curr_program_uber->base_variant && !needs_emulation) { + ctx->curr_program = ctx->curr_program_uber->base_variant; + ctx->curr_program_uber->last_variant_hash = ctx->curr_program->last_variant_hash; + ctx->curr_program_uber->st_key = ctx->curr_program->st_key; + needs_uber = false; + } else if (ctx->curr_program_uber->is_separable) { + assert(can_use_uber); + ctx->curr_program = ctx->curr_program_uber; + needs_uber = true; + } + if (ctx->gfx_pipeline_state.uber_required != needs_uber) { + ctx->gfx_pipeline_state.modules_changed = true; + ctx->gfx_pipeline_state.uber_required = needs_uber; + } + + if (needs_uber || !ctx->curr_program_uber) + ctx->curr_program = ctx->curr_program_uber; +} + void zink_gfx_program_update_optimal(struct zink_context *ctx) { MESA_TRACE_FUNC(); struct zink_screen *screen = zink_screen(ctx->base.screen); 
assert(!ctx->gfx_stages[MESA_SHADER_TESS_CTRL] || !ctx->gfx_stages[MESA_SHADER_TESS_CTRL]->non_fs.is_generated); + struct zink_gfx_program *old_prog = ctx->curr_program_uber; if (ctx->gfx_dirty) { struct zink_gfx_program *prog = NULL; ctx->gfx_pipeline_state.optimal_key = zink_sanitize_optimal_key(ctx->gfx_stages, ctx->gfx_pipeline_state.shader_keys_optimal.key.val); @@ -794,72 +1023,93 @@ zink_gfx_program_update_optimal(struct zink_context *ctx) simple_mtx_lock(&ctx->program_lock[zink_program_cache_stages(ctx->shader_stages)]); struct hash_entry *entry = _mesa_hash_table_search_pre_hashed(ht, hash, ctx->gfx_stages); - if (ctx->curr_program) - ctx->gfx_pipeline_state.final_hash ^= ctx->curr_program->last_variant_hash; + if (CURR_KEY_PROGRAM(ctx)) { + ctx->gfx_pipeline_state.final_hash ^= CURR_KEY_PROGRAM(ctx)->last_variant_hash; + ctx->gfx_pipeline_state.final_hash ^= CURR_KEY_PROGRAM(ctx)->st_key; + } + bool needs_emulation = needs_st_emulation(ctx) || (ctx->gfx_pipeline_state.optimal_key != ZINK_SHADER_KEY_OPTIMAL_DEFAULT); + bool can_use_uber = zink_can_use_uber(ctx); if (entry) { prog = (struct zink_gfx_program*)entry->data; - bool must_replace = prog->base.uses_shobj ? !zink_can_use_shader_objects(ctx) : (prog->is_separable && !zink_can_use_pipeline_libs(ctx)); - if (prog->is_separable) { - /* shader variants can't be handled by separable programs: sync and compile */ - if (!ZINK_SHADER_KEY_OPTIMAL_IS_DEFAULT(ctx->gfx_pipeline_state.optimal_key) || must_replace) + if (prog->is_separable && !(zink_debug & ZINK_DEBUG_NOOPT)) { + /* if uber cannot be used we need to compile the variant synchrously, + * so we need the full prog: sync and compile */ + if (!can_use_uber) util_queue_fence_wait(&prog->base.cache_fence); /* If the optimized linked pipeline is done compiling, swap it into place. 
*/ - if (util_queue_fence_is_signalled(&prog->base.cache_fence) && - /* but only if needed for ZINK_DEBUG=noopt */ - (!(zink_debug & ZINK_DEBUG_NOOPT) || !ZINK_SHADER_KEY_OPTIMAL_IS_DEFAULT(ctx->gfx_pipeline_state.optimal_key) || must_replace)) { + if (util_queue_fence_is_signalled(&prog->base.cache_fence)) { prog = replace_separable_prog(ctx, entry, prog); } - } else if (must_replace) { - /* this is a non-separable, incompatible prog which needs replacement */ - struct zink_gfx_program *real = zink_create_gfx_program(ctx, ctx->gfx_stages, ctx->gfx_pipeline_state.dyn_state2.vertices_per_patch, ctx->gfx_hash, false); - generate_gfx_program_modules_optimal(ctx, screen, real, &ctx->gfx_pipeline_state); - entry->data = real; - entry->key = real->shaders; - real->base.removed = false; - prog->base.removed = true; - prog = real; - } else if (!prog->base.precompile_done) { - util_queue_fence_wait(&prog->base.cache_fence); } - update_gfx_program_optimal(ctx, prog); + ctx->curr_program_uber = prog; + async_variant_program_update(ctx, can_use_uber, needs_emulation); } else { ctx->dirty_gfx_stages |= ctx->shader_stages; prog = create_gfx_program_separable(ctx, ctx->gfx_stages, ctx->gfx_pipeline_state.dyn_state2.vertices_per_patch, false); + ctx->gfx_pipeline_state.uber_required = true; prog->base.removed = false; _mesa_hash_table_insert_pre_hashed(ht, hash, prog->shaders, prog); if (!prog->is_separable) { - zink_screen_get_pipeline_cache(screen, &prog->base, false); perf_debug(ctx, "zink[gfx_compile]: new program created (probably legacy GL features in use)\n"); - generate_gfx_program_modules_optimal(ctx, screen, prog, &ctx->gfx_pipeline_state); + prog->is_uber_program = true; + { + struct zink_gfx_pipeline_state state = {0}; + state.shader_keys_optimal.key.vs_base.last_vertex_stage = true; + state.shader_keys_optimal.key.tcs.patch_vertices = 3; //random guess, generated tcs precompile is hard + state.optimal_key = state.shader_keys_optimal.key.val; + 
generate_gfx_program_modules_optimal(NULL, screen, prog, &state, prog->is_uber_program); + zink_screen_get_pipeline_cache(screen, &prog->base, true); + if (!prog->base.uses_shobj) { + simple_mtx_lock(&prog->libs->lock); + zink_create_pipeline_lib(screen, prog, &state, prog->is_uber_program); + simple_mtx_unlock(&prog->libs->lock); + } + zink_screen_update_pipeline_cache(screen, &prog->base, true); + } + if (needs_emulation && !can_use_uber) { + ctx->curr_program_uber = prog; + async_variant_program_update(ctx, can_use_uber, needs_emulation); + } } } simple_mtx_unlock(&ctx->program_lock[zink_program_cache_stages(ctx->shader_stages)]); - if (prog && prog != ctx->curr_program) - zink_batch_reference_program(ctx, &prog->base); - ctx->curr_program = prog; - ctx->gfx_pipeline_state.final_hash ^= ctx->curr_program->last_variant_hash; + ctx->curr_program_uber = prog; + if (ctx->gfx_pipeline_state.uber_required) + ctx->curr_program = prog; + if (ctx->curr_program_uber && ctx->curr_program_uber != old_prog) + { + assert(!ctx->curr_program_uber->is_variant_program); + zink_batch_reference_program(ctx, &ctx->curr_program_uber->base); + } + ctx->gfx_pipeline_state.final_hash ^= CURR_KEY_PROGRAM(ctx)->last_variant_hash; + ctx->gfx_pipeline_state.final_hash ^= CURR_KEY_PROGRAM(ctx)->st_key; } else if (ctx->dirty_gfx_stages) { /* remove old hash */ ctx->gfx_pipeline_state.optimal_key = zink_sanitize_optimal_key(ctx->gfx_stages, ctx->gfx_pipeline_state.shader_keys_optimal.key.val); - ctx->gfx_pipeline_state.final_hash ^= ctx->curr_program->last_variant_hash; - - bool must_replace = ctx->curr_program->base.uses_shobj ? 
!zink_can_use_shader_objects(ctx) : (ctx->curr_program->is_separable && !zink_can_use_pipeline_libs(ctx)); - if (must_replace || (ctx->curr_program->is_separable && !ZINK_SHADER_KEY_OPTIMAL_IS_DEFAULT(ctx->gfx_pipeline_state.optimal_key))) { - struct zink_gfx_program *prog = ctx->curr_program; - - util_queue_fence_wait(&prog->base.cache_fence); - /* shader variants can't be handled by separable programs: sync and compile */ - perf_debug(ctx, "zink[gfx_compile]: non-default shader variant required with separate shader object program\n"); - struct hash_table *ht = &ctx->program_cache[zink_program_cache_stages(ctx->shader_stages)]; - const uint32_t hash = ctx->gfx_hash; - simple_mtx_lock(&ctx->program_lock[zink_program_cache_stages(ctx->shader_stages)]); - struct hash_entry *entry = _mesa_hash_table_search_pre_hashed(ht, hash, ctx->gfx_stages); - ctx->curr_program = replace_separable_prog(ctx, entry, prog); - simple_mtx_unlock(&ctx->program_lock[zink_program_cache_stages(ctx->shader_stages)]); + ctx->gfx_pipeline_state.final_hash ^= CURR_KEY_PROGRAM(ctx)->last_variant_hash; + ctx->gfx_pipeline_state.final_hash ^= CURR_KEY_PROGRAM(ctx)->st_key; + bool needs_emulation = needs_st_emulation(ctx) || (ctx->gfx_pipeline_state.optimal_key != ZINK_SHADER_KEY_OPTIMAL_DEFAULT); + bool can_use_uber = zink_can_use_uber(ctx); + if (ctx->curr_program->is_separable && !(zink_debug & ZINK_DEBUG_NOOPT)) { + struct zink_gfx_program *prog = ctx->curr_program_uber; + if (needs_emulation || ctx->curr_program_uber->is_separable) { + if (!can_use_uber) + util_queue_fence_wait(&prog->base.cache_fence); + perf_debug(ctx, "zink[gfx_compile]: non-default shader variant required with separate shader object program\n"); + if (util_queue_fence_is_signalled(&prog->base.cache_fence)) { + struct hash_table *ht = &ctx->program_cache[zink_program_cache_stages(ctx->shader_stages)]; + const uint32_t hash = ctx->gfx_hash; + simple_mtx_lock(&ctx->program_lock[zink_program_cache_stages(ctx->shader_stages)]); 
+ struct hash_entry *entry = _mesa_hash_table_search_pre_hashed(ht, hash, ctx->gfx_stages); + ctx->curr_program_uber = replace_separable_prog(ctx, entry, prog); + simple_mtx_unlock(&ctx->program_lock[zink_program_cache_stages(ctx->shader_stages)]); + } + } } - update_gfx_program_optimal(ctx, ctx->curr_program); + async_variant_program_update(ctx, can_use_uber, needs_emulation); /* apply new hash */ - ctx->gfx_pipeline_state.final_hash ^= ctx->curr_program->last_variant_hash; + ctx->gfx_pipeline_state.final_hash ^= CURR_KEY_PROGRAM(ctx)->last_variant_hash; + ctx->gfx_pipeline_state.final_hash ^= CURR_KEY_PROGRAM(ctx)->st_key; } ctx->dirty_gfx_stages = 0; ctx->gfx_dirty = false; @@ -898,8 +1148,8 @@ zink_mesh_program_update_optimal(struct zink_context *ctx) } } else if (must_replace) { /* this is a non-separable, incompatible prog which needs replacement */ - struct zink_gfx_program *real = zink_create_gfx_program(ctx, ctx->gfx_stages, 0, ctx->mesh_hash, true); - generate_gfx_program_modules_optimal(ctx, screen, real, &ctx->gfx_pipeline_state); + struct zink_gfx_program *real = zink_create_gfx_program(ctx, ctx->gfx_stages, 0, ctx->mesh_hash, true, false); + generate_gfx_program_modules_optimal(ctx, screen, real, &ctx->gfx_pipeline_state, false); entry->data = real; entry->key = real->shaders; real->base.removed = false; @@ -917,7 +1167,7 @@ zink_mesh_program_update_optimal(struct zink_context *ctx) if (!prog->is_separable) { zink_screen_get_pipeline_cache(screen, &prog->base, false); perf_debug(ctx, "zink[gfx_compile]: new program created (probably legacy GL features in use)\n"); - generate_gfx_program_modules_optimal(ctx, screen, prog, &ctx->gfx_pipeline_state); + generate_gfx_program_modules_optimal(ctx, screen, prog, &ctx->gfx_pipeline_state, false); } } simple_mtx_unlock(lock); @@ -960,8 +1210,10 @@ optimized_compile_job(void *data, void *gdata, int thread_index) VkPrimitiveTopology vkmode = is_mesh ? 
VK_PRIMITIVE_TOPOLOGY_MAX_ENUM : zink_primitive_topology(pc_entry->state.gfx_prim_mode); if (pc_entry->gpl.gkey) pipeline = zink_create_gfx_pipeline_combined(screen, pc_entry->prog, pc_entry->gpl.ikey ? pc_entry->gpl.ikey->pipeline : VK_NULL_HANDLE, &pc_entry->gpl.gkey->pipeline, 1, pc_entry->gpl.okey->pipeline, true, false); - else - pipeline = zink_create_gfx_pipeline(screen, pc_entry->prog, pc_entry->prog->objs, &pc_entry->state, pc_entry->state.element_state->binding_map, vkmode, true); + else { + struct zink_shader_object *objs = pc_entry->prog->objs; + pipeline = zink_create_gfx_pipeline(screen, pc_entry->prog, objs, &pc_entry->state, pc_entry->state.element_state->binding_map, vkmode, true); + } if (pipeline) { pc_entry->gpl.unoptimized_pipeline = pc_entry->pipeline; pc_entry->pipeline = pipeline; @@ -1009,10 +1261,12 @@ zink_program_finish(struct zink_context *ctx, struct zink_program *pg) if (pg->is_compute) return; struct zink_gfx_program *prog = (struct zink_gfx_program*)pg; - for (int i = 0; i < ARRAY_SIZE(prog->pipelines); ++i) { - hash_table_foreach(&prog->pipelines[i], entry) { - struct zink_gfx_pipeline_cache_entry *pc_entry = entry->data; - util_queue_fence_wait(&pc_entry->fence); + for (int r = 0; r < ARRAY_SIZE(prog->pipelines); ++r) { + for (int i = 0; i < ARRAY_SIZE(prog->pipelines[0]); ++i) { + hash_table_foreach(&prog->pipelines[r][i], entry) { + struct zink_gfx_pipeline_cache_entry *pc_entry = entry->data; + util_queue_fence_wait(&pc_entry->fence); + } } } } @@ -1073,7 +1327,7 @@ update_cs_shader_module(struct zink_context *ctx, struct zink_compute_program *c return; } zm->shobj = false; - zm->obj = zink_shader_compile(screen, false, zs, zink_shader_blob_deserialize(screen, &comp->shader->blob), key, zs_swizzle_size ? &ctx->di.zs_swizzle[MESA_SHADER_COMPUTE] : NULL, &comp->base); + zm->obj = zink_shader_compile(screen, false, zs, zink_shader_blob_deserialize(screen, &comp->shader->blob), key, NULL, false, zs_swizzle_size ? 
&ctx->di.zs_swizzle[MESA_SHADER_COMPUTE] : NULL, &comp->base); if (!zm->obj.spirv) { FREE(zm); return; @@ -1198,7 +1452,11 @@ zink_gfx_lib_cache_unref(struct zink_screen *screen, struct zink_gfx_lib_cache * { if (!p_atomic_dec_zero(&libs->refcount)) return; - + if (libs->lib) { + struct zink_gfx_library_key *gkey = libs->lib; + VKSCR(DestroyPipeline)(screen->dev, gkey->pipeline, NULL); + FREE(gkey); + } simple_mtx_destroy(&libs->lock); set_foreach_remove(&libs->libs, he) { struct zink_gfx_library_key *gkey = (void*)he->key; @@ -1217,10 +1475,6 @@ create_lib_cache(struct zink_gfx_program *prog, bool generated_tcs) if (generated_tcs) libs->stages_present &= ~BITFIELD_BIT(MESA_SHADER_TESS_CTRL); simple_mtx_init(&libs->lock, mtx_plain); - if (generated_tcs) - _mesa_set_init(&libs->libs, NULL, hash_pipeline_lib_generated_tcs, equals_pipeline_lib_generated_tcs); - else - _mesa_set_init(&libs->libs, NULL, hash_pipeline_lib, equals_pipeline_lib); return libs; } @@ -1229,6 +1483,8 @@ find_or_create_lib_cache(struct zink_screen *screen, struct zink_gfx_program *pr { unsigned stages_present = prog->stages_present; bool generated_tcs = prog->shaders[MESA_SHADER_TESS_CTRL] && prog->shaders[MESA_SHADER_TESS_CTRL]->non_fs.is_generated; + if (prog->is_variant_program) + return create_lib_cache(prog, generated_tcs); if (generated_tcs) stages_present &= ~BITFIELD_BIT(MESA_SHADER_TESS_CTRL); unsigned idx = zink_program_cache_stages(stages_present); @@ -1307,7 +1563,7 @@ gfx_program_create(struct zink_context *ctx, struct zink_shader **stages, unsigned vertices_per_patch, uint32_t gfx_hash, - bool is_mesh) + bool is_mesh, bool variant) { struct zink_screen *screen = zink_screen(ctx->base.screen); struct zink_gfx_program *prog = create_program(ctx, false); @@ -1317,6 +1573,7 @@ gfx_program_create(struct zink_context *ctx, prog->gfx_hash = gfx_hash; prog->base.removed = true; prog->optimal_keys = screen->optimal_keys; + prog->is_variant_program = variant; for (int i = is_mesh ? 
MESA_SHADER_FRAGMENT : 0; i < (is_mesh ? MESA_SHADER_MESH_STAGES : MESA_SHADER_STAGES); ++i) { util_dynarray_init(&prog->shader_cache[i][0][0], prog->base.ralloc_ctx); @@ -1331,6 +1588,7 @@ gfx_program_create(struct zink_context *ctx, prog->needs_inlining |= prog->shaders[i]->needs_inlining; } } + util_dynarray_init(&prog->uber_modules, prog->base.ralloc_ctx); if (stages[MESA_SHADER_TESS_EVAL] && !stages[MESA_SHADER_TESS_CTRL]) { util_queue_fence_wait(&stages[MESA_SHADER_TESS_EVAL]->precompile.fence); if (!prog->shaders[MESA_SHADER_TESS_EVAL]->non_fs.generated_tcs) @@ -1340,13 +1598,17 @@ gfx_program_create(struct zink_context *ctx, } prog->stages_remaining = prog->stages_present; for (int i = 0; i < MESA_SHADER_MESH_STAGES; ++i) { - if (prog->shaders[i]) { + if (prog->shaders[i] && !variant) { simple_mtx_lock(&prog->shaders[i]->lock); _mesa_set_add(prog->shaders[i]->programs, prog); simple_mtx_unlock(&prog->shaders[i]->lock); zink_gfx_program_reference(screen, NULL, prog); } } + + if (variant) + zink_gfx_program_reference(screen, NULL, prog); + p_atomic_dec(&prog->base.reference.count); if (is_mesh) @@ -1360,8 +1622,12 @@ gfx_program_create(struct zink_context *ctx, prog->has_edgeflags = prog->shaders[MESA_SHADER_VERTEX] && prog->shaders[MESA_SHADER_VERTEX]->has_edgeflags; - for (int i = 0; i < ARRAY_SIZE(prog->pipelines); ++i) { - _mesa_hash_table_init(&prog->pipelines[i], prog->base.ralloc_ctx, NULL, zink_get_gfx_pipeline_eq_func(screen, prog)); + _mesa_set_init(&prog->variants, prog->base.ralloc_ctx, hash_gfx_program, equals_program_variant); + + for (int r = 0; r < ARRAY_SIZE(prog->pipelines); ++r) { + for (int i = 0; i < ARRAY_SIZE(prog->pipelines[0]); ++i) { + _mesa_hash_table_init(&prog->pipelines[r][i], prog->base.ralloc_ctx, NULL, zink_get_gfx_pipeline_eq_func(screen, prog)); + } } return prog; @@ -1436,9 +1702,9 @@ zink_create_gfx_program(struct zink_context *ctx, struct zink_shader **stages, unsigned vertices_per_patch, uint32_t gfx_hash, - bool 
is_mesh) + bool is_mesh, bool variant) { - struct zink_gfx_program *prog = gfx_program_create(ctx, stages, vertices_per_patch, gfx_hash, is_mesh); + struct zink_gfx_program *prog = gfx_program_create(ctx, stages, vertices_per_patch, gfx_hash, is_mesh, variant); if (prog) prog = gfx_program_init(ctx, prog); return prog; @@ -1454,7 +1720,8 @@ create_linked_separable_job(void *data, void *gdata, int thread_index) /* this is a dead program */ if (prog->base.removed) return; - prog->full_prog = gfx_program_create(prog->base.ctx, prog->shaders, 0, prog->gfx_hash, !!prog->shaders[MESA_SHADER_MESH]); + prog->full_prog = gfx_program_create(prog->base.ctx, prog->shaders, 0, prog->gfx_hash, !!prog->shaders[MESA_SHADER_MESH], false); + prog->full_prog->is_uber_program = prog->is_uber_program; /* block gfx_shader_prune in the main thread */ util_queue_fence_reset(&prog->full_prog->base.cache_fence); /* add an ownership ref */ @@ -1479,15 +1746,16 @@ create_gfx_program_separable(struct zink_context *ctx, struct zink_shader **stag uint32_t hash = is_mesh ? ctx->mesh_hash : ctx->gfx_hash; if (!is_separate || /* TODO: maybe try variants? 
grimace */ + /* TODO allow if uber is usable */ !is_default || !can_gpl) - return zink_create_gfx_program(ctx, stages, vertices_per_patch, hash, is_mesh); + return zink_create_gfx_program(ctx, stages, vertices_per_patch, hash, is_mesh, false); for (unsigned i = 0; i < MESA_SHADER_MESH_STAGES; i++) { /* ensure async shader creation is done */ if (stages[i]) { util_queue_fence_wait(&stages[i]->precompile.fence); - if (!stages[i]->precompile.obj.mod) - return zink_create_gfx_program(ctx, stages, vertices_per_patch, hash, is_mesh); + if (!stages[i]->precompile.obj.mod && !stages[i]->precompile.obj.mod) + return zink_create_gfx_program(ctx, stages, vertices_per_patch, hash, is_mesh, false); } } @@ -1496,6 +1764,7 @@ create_gfx_program_separable(struct zink_context *ctx, struct zink_shader **stag goto fail; prog->is_separable = true; + prog->is_uber_program = true; prog->gfx_hash = hash; prog->base.uses_shobj = screen->info.have_EXT_shader_object && ((stages[MESA_SHADER_VERTEX] && !stages[MESA_SHADER_VERTEX]->info.view_mask) || @@ -1535,8 +1804,12 @@ create_gfx_program_separable(struct zink_context *ctx, struct zink_shader **stag */ p_atomic_add(&prog->base.reference.count, refs - 1); - for (int i = 0; i < ARRAY_SIZE(prog->pipelines); ++i) { - _mesa_hash_table_init(&prog->pipelines[i], prog->base.ralloc_ctx, NULL, zink_get_gfx_pipeline_eq_func(screen, prog)); + _mesa_set_init(&prog->variants, prog->base.ralloc_ctx, hash_gfx_program, equals_program_variant); + + for (int r = 0; r < ARRAY_SIZE(prog->pipelines); ++r) { + for (int i = 0; i < ARRAY_SIZE(prog->pipelines[0]); ++i) { + _mesa_hash_table_init(&prog->pipelines[r][i], prog->base.ralloc_ctx, NULL, zink_get_gfx_pipeline_eq_func(screen, prog)); + } } for (int i = 0; i < MESA_SHADER_MESH_STAGES; ++i) { @@ -1557,18 +1830,25 @@ create_gfx_program_separable(struct zink_context *ctx, struct zink_shader **stag prog->base.layout = zink_pipeline_layout_create(screen, prog->base.dsl, prog->base.num_dsl, false, 
VK_PIPELINE_LAYOUT_CREATE_INDEPENDENT_SETS_BIT_EXT); prog->last_variant_hash = is_mesh ? ctx->gfx_pipeline_state.mesh_optimal_key : ctx->gfx_pipeline_state.optimal_key; + prog->st_key = ctx->gfx_pipeline_state.st_key.small_key.val; if (!prog->base.uses_shobj) { - VkPipeline libs[] = {stages[MESA_SHADER_VERTEX]->precompile.gpl, stages[MESA_SHADER_FRAGMENT]->precompile.gpl}; - struct zink_gfx_library_key *gkey = CALLOC_STRUCT(zink_gfx_library_key); - if (!gkey) { - mesa_loge("ZINK: failed to allocate gkey!"); - goto fail; + if (!is_mesh) { + VkPipeline uber_libs[] = {stages[MESA_SHADER_VERTEX]->precompile.gpl, stages[MESA_SHADER_FRAGMENT]->precompile.gpl}; + prog->libs->lib = CALLOC_STRUCT(zink_gfx_library_key); + prog->libs->lib->pipeline = zink_create_gfx_pipeline_combined(screen, prog, VK_NULL_HANDLE, uber_libs, 2, VK_NULL_HANDLE, false, false); + } else { + VkPipeline libs[] = {stages[MESA_SHADER_VERTEX]->precompile.gpl, stages[MESA_SHADER_FRAGMENT]->precompile.gpl}; + struct zink_gfx_library_key *gkey = CALLOC_STRUCT(zink_gfx_library_key); + if (!gkey) { + mesa_loge("ZINK: failed to allocate gkey!"); + goto fail; + } + gkey->optimal_key = prog->last_variant_hash; + assert(gkey->optimal_key); + gkey->pipeline = zink_create_gfx_pipeline_combined(screen, prog, VK_NULL_HANDLE, libs, 2, VK_NULL_HANDLE, false, false); + _mesa_set_add(&prog->libs->libs, gkey); } - gkey->optimal_key = prog->last_variant_hash; - assert(gkey->optimal_key); - gkey->pipeline = zink_create_gfx_pipeline_combined(screen, prog, VK_NULL_HANDLE, libs, 2, VK_NULL_HANDLE, false, false); - _mesa_set_add(&prog->libs->libs, gkey); } if (!(zink_debug & ZINK_DEBUG_NOOPT)) @@ -1722,7 +2002,7 @@ precompile_compute_job(void *data, void *gdata, int thread_index) comp->curr = comp->module = CALLOC_STRUCT(zink_shader_module); assert(comp->module); comp->module->shobj = false; - comp->module->obj = zink_shader_compile(screen, false, comp->shader, comp->nir, NULL, NULL, &comp->base); + comp->module->obj = 
zink_shader_compile(screen, false, comp->shader, comp->nir, NULL, NULL, false, NULL, &comp->base); /* comp->nir will be freed by zink_shader_compile */ comp->nir = NULL; assert(comp->module->obj.spirv); @@ -1869,21 +2149,41 @@ zink_destroy_gfx_program(struct zink_screen *screen, { if (prog->is_separable) zink_gfx_program_reference(screen, &prog->full_prog, NULL); - for (int i = 0; i < ARRAY_SIZE(prog->pipelines); ++i) { - hash_table_foreach(&prog->pipelines[i], entry) { - struct zink_gfx_pipeline_cache_entry *pc_entry = entry->data; + for (int r = 0; r < ARRAY_SIZE(prog->pipelines); ++r) { + for (int i = 0; i < ARRAY_SIZE(prog->pipelines[0]); ++i) { + hash_table_foreach(&prog->pipelines[r][i], entry) { + struct zink_gfx_pipeline_cache_entry *pc_entry = entry->data; - util_queue_fence_wait(&pc_entry->fence); - VKSCR(DestroyPipeline)(screen->dev, pc_entry->pipeline, NULL); - VKSCR(DestroyPipeline)(screen->dev, pc_entry->gpl.unoptimized_pipeline, NULL); - free(pc_entry); + util_queue_fence_wait(&pc_entry->fence); + VKSCR(DestroyPipeline)(screen->dev, pc_entry->pipeline, NULL); + VKSCR(DestroyPipeline)(screen->dev, pc_entry->gpl.unoptimized_pipeline, NULL); + free(pc_entry); + } } } + /* wait for all async compilation jobs */ + for (unsigned stage = 0; stage < ZINK_GFX_SHADER_COUNT; stage++) { + struct util_dynarray *shader_cache = &prog->shader_cache[stage][0][0]; + unsigned count = util_dynarray_num_elements(shader_cache, struct zink_shader_module *); + struct zink_shader_module **pzm = shader_cache->data; + for (unsigned i = 0; i < count; i++) { + struct zink_shader_module *iter = pzm[i]; + util_queue_fence_wait(&iter->fence); + } + } + + set_foreach(&prog->variants, entry) { + struct program_variant_key *prog_variant_key = (void*)entry->key; + assert(prog_variant_key->prog->is_variant_program); + zink_destroy_gfx_program(screen, prog_variant_key->prog); + FREE(prog_variant_key); + } + deinit_program(screen, &prog->base); for (int i = 0; i < MESA_SHADER_MESH_STAGES; 
++i) { - if (prog->shaders[i]) { + if (prog->shaders[i] && !prog->is_variant_program) { _mesa_set_remove_key(prog->shaders[i]->programs, prog); prog->shaders[i] = NULL; } @@ -1895,6 +2195,10 @@ zink_destroy_gfx_program(struct zink_screen *screen, blob_finish(&prog->blobs[i]); } } + while (util_dynarray_contains(&prog->uber_modules, void*)) { + struct zink_shader_module *zm = util_dynarray_pop(&prog->uber_modules, struct zink_shader_module*); + zink_destroy_shader_module(screen, zm); + } if (prog->libs) zink_gfx_lib_cache_unref(screen, prog->libs); @@ -2046,8 +2350,11 @@ bind_gfx_stage(struct zink_context *ctx, mesa_shader_stage stage, struct zink_sh zink_descriptors_init_bindless(ctx); } else { if (stage < MESA_SHADER_COMPUTE) { - if (ctx->curr_program) - ctx->gfx_pipeline_state.final_hash ^= ctx->curr_program->last_variant_hash; + if (ctx->curr_program_uber) { + ctx->gfx_pipeline_state.final_hash ^= CURR_KEY_PROGRAM(ctx)->last_variant_hash; + ctx->gfx_pipeline_state.final_hash ^= ctx->curr_program_uber->st_key; + } + ctx->curr_program_uber = NULL; ctx->curr_program = NULL; } if (stage == MESA_SHADER_FRAGMENT || stage > MESA_SHADER_COMPUTE) { @@ -2391,7 +2698,7 @@ zink_delete_cs_shader_state(struct pipe_context *pctx, void *cso) /* caller must lock prog->libs->lock */ struct zink_gfx_library_key * -zink_create_pipeline_lib(struct zink_screen *screen, struct zink_gfx_program *prog, struct zink_gfx_pipeline_state *state) +zink_create_pipeline_lib(struct zink_screen *screen, struct zink_gfx_program *prog, struct zink_gfx_pipeline_state *state, bool compile_uber) { struct zink_gfx_library_key *gkey = CALLOC_STRUCT(zink_gfx_library_key); bool is_mesh = !prog->shaders[MESA_SHADER_VERTEX]; @@ -2401,11 +2708,15 @@ zink_create_pipeline_lib(struct zink_screen *screen, struct zink_gfx_program *pr } gkey->optimal_key = !is_mesh ? 
state->optimal_key : state->mesh_optimal_key; + gkey->st_key = state->st_key.small_key.val; assert(is_mesh || gkey->optimal_key); for (unsigned i = 0; i < MESA_SHADER_MESH_STAGES; i++) gkey->modules[i] = prog->objs[i].mod; - gkey->pipeline = zink_create_gfx_pipeline_library(screen, prog); - _mesa_set_add(&prog->libs->libs, gkey); + gkey->pipeline = zink_create_gfx_pipeline_library(screen, prog->objs, prog); + if (is_mesh) + _mesa_set_add(&prog->libs->libs, gkey); + else + prog->libs->lib = gkey; return gkey; } @@ -2433,6 +2744,26 @@ print_exe_stages(VkShaderStageFlags stages) UNREACHABLE("unhandled combination of stages!"); } +static void +precompile_variant_job(void *data, void *gdata, int thread_index) +{ + struct zink_screen *screen = gdata; + struct precompile_variant_data *precompile_data = data; + struct zink_gfx_program *prog = precompile_data->prog; + struct zink_gfx_pipeline_state *state = &precompile_data->state; + + //generate_gfx_program_modules_optimal(NULL, screen, prog, state); + zink_screen_get_pipeline_cache(screen, &prog->base, true); + if (!screen->info.have_EXT_shader_object) { + simple_mtx_lock(&prog->libs->lock); + zink_create_pipeline_lib(screen, prog, state, false); + simple_mtx_unlock(&prog->libs->lock); + } + zink_screen_update_pipeline_cache(screen, &prog->base, true); + + FREE(data); +} + static void gfx_program_precompile_job(void *data, void *gdata, int thread_index) { @@ -2446,11 +2777,11 @@ gfx_program_precompile_job(void *data, void *gdata, int thread_index) state.shader_keys_optimal.key.vs_base.last_vertex_stage = true; state.shader_keys_optimal.key.tcs.patch_vertices = 3; //random guess, generated tcs precompile is hard state.optimal_key = state.shader_keys_optimal.key.val; - generate_gfx_program_modules_optimal(NULL, screen, prog, &state); + generate_gfx_program_modules_optimal(NULL, screen, prog, &state, prog->is_uber_program); zink_screen_get_pipeline_cache(screen, &prog->base, true); if (!prog->base.uses_shobj) { 
simple_mtx_lock(&prog->libs->lock); - zink_create_pipeline_lib(screen, prog, &state); + zink_create_pipeline_lib(screen, prog, &state, prog->is_uber_program); simple_mtx_unlock(&prog->libs->lock); } prog->base.precompile_done = true; @@ -2494,17 +2825,18 @@ zink_link_gfx_shader(struct pipe_context *pctx, void **shaders) simple_mtx_unlock(lock); return; } - struct zink_gfx_program *prog = gfx_program_create(ctx, zshaders, 3, hash, is_mesh); + struct zink_gfx_program *prog = gfx_program_create(ctx, zshaders, 3, hash, is_mesh, false); u_foreach_bit(i, shader_stages) assert(prog->shaders[i]); _mesa_hash_table_insert_pre_hashed(ht, hash, prog->shaders, prog); prog->base.removed = false; + prog->is_uber_program = true; simple_mtx_unlock(lock); if (zink_debug & ZINK_DEBUG_SHADERDB) { struct zink_screen *screen = zink_screen(pctx->screen); gfx_program_init(ctx, prog); if (screen->optimal_keys) - generate_gfx_program_modules_optimal(ctx, screen, prog, &ctx->gfx_pipeline_state); + generate_gfx_program_modules_optimal(ctx, screen, prog, &ctx->gfx_pipeline_state, false); else generate_gfx_program_modules(ctx, screen, prog, &ctx->gfx_pipeline_state); VkPipeline pipeline = zink_create_gfx_pipeline(screen, prog, prog->objs, &ctx->gfx_pipeline_state, @@ -2535,7 +2867,7 @@ zink_delete_shader_state(struct pipe_context *pctx, void *cso) static void precompile_separate_shader(struct zink_shader *zs, struct zink_screen *screen) { - zs->precompile.obj = zink_shader_compile_separate(screen, zs); + zs->precompile.obj = zink_shader_compile_separate(screen, zs, zs->is_uber); if (!screen->info.have_EXT_shader_object) { struct zink_shader_object objs[MESA_SHADER_MESH_STAGES] = {0}; objs[zs->info.stage].mod = zs->precompile.obj.mod; @@ -2543,6 +2875,20 @@ precompile_separate_shader(struct zink_shader *zs, struct zink_screen *screen) } } +static void +precompile_variant_separate_shader_job(void *data, void *gdata, int thread_index) +{ + struct zink_screen *screen = gdata; + struct 
precompile_separate_variant_data *precompile_data = data; + + nir_shader *nir = zink_shader_blob_deserialize(screen, precompile_data->blob); + precompile_data->zm->obj = zink_shader_compile(screen, precompile_data->uses_shobj, precompile_data->zs, nir, + precompile_data->has_key ? &precompile_data->key: NULL, + &precompile_data->st_key, + false, NULL, precompile_data->prog); + FREE(data); +} + static void gfx_shader_init_job(void *data, void *gdata, int thread_index) { @@ -2581,6 +2927,7 @@ zink_create_gfx_shader_state(struct pipe_context *pctx, const struct pipe_shader zink_descriptor_util_init_fbfetch(zink_context(pctx)); struct zink_shader *zs = zink_shader_create(zink_screen(pctx->screen), nir); + zs->is_uber = true; if (zink_debug & ZINK_DEBUG_NOBGC) gfx_shader_init_job(zs, screen, 0); else @@ -2593,6 +2940,8 @@ static void zink_delete_cached_shader_state(struct pipe_context *pctx, void *cso) { struct zink_screen *screen = zink_screen(pctx->screen); + // HACK this is oversyncing but we have no way of knowing which jobs use this zink_shader + util_queue_finish(&screen->cache_get_thread); util_shader_reference(pctx, &screen->shaders, &cso, NULL); } diff --git a/src/gallium/drivers/zink/zink_program.h b/src/gallium/drivers/zink/zink_program.h index 3d5ffc170c9..6e2b0a202a6 100644 --- a/src/gallium/drivers/zink/zink_program.h +++ b/src/gallium/drivers/zink/zink_program.h @@ -128,7 +128,7 @@ zink_mesh_program_update_optimal(struct zink_context *ctx); struct zink_gfx_library_key * -zink_create_pipeline_lib(struct zink_screen *screen, struct zink_gfx_program *prog, struct zink_gfx_pipeline_state *state); +zink_create_pipeline_lib(struct zink_screen *screen, struct zink_gfx_program *prog, struct zink_gfx_pipeline_state *state, bool compile_uber); uint32_t hash_gfx_output(const void *key); uint32_t hash_gfx_output_ds3(const void *key); uint32_t hash_gfx_input(const void *key); @@ -159,7 +159,7 @@ zink_create_gfx_program(struct zink_context *ctx, struct zink_shader
**stages, unsigned vertices_per_patch, uint32_t gfx_hash, - bool is_mesh); + bool is_mesh, bool variant); void zink_destroy_gfx_program(struct zink_screen *screen, @@ -405,6 +405,27 @@ zink_set_zs_needs_shader_swizzle_key(struct zink_context *ctx, mesa_shader_stage zink_set_shader_key_base(ctx, pstage)->needs_zs_shader_swizzle = enable; } +static inline const union zink_st_small_key * +zink_get_st_small_key(struct zink_context *ctx) +{ + assert(zink_screen(ctx->base.screen)->optimal_keys); + return &ctx->gfx_pipeline_state.st_key.small_key; +} + +static inline union zink_st_small_key * +zink_set_st_small_key(struct zink_context *ctx) +{ + ctx->dirty_gfx_stages |= ctx->shader_stages & (MESA_SHADER_VERTEX | MESA_SHADER_GEOMETRY | MESA_SHADER_FRAGMENT); + assert(zink_screen(ctx->base.screen)->optimal_keys); + return &ctx->gfx_pipeline_state.st_key.small_key; +} + +static inline bool +needs_st_emulation(struct zink_context *ctx) +{ + return ctx->gfx_pipeline_state.st_key.small_key.val != 0; +} + ALWAYS_INLINE static bool zink_can_use_pipeline_libs(const struct zink_context *ctx) { @@ -464,6 +485,14 @@ zink_can_use_shader_objects_mesh(const struct zink_context *ctx) !ctx->fb_state.viewmask; } +ALWAYS_INLINE static bool +zink_can_use_uber(struct zink_context *ctx) +{ + bool generated_tcs = ctx->gfx_stages[MESA_SHADER_TESS_EVAL] && !ctx->gfx_stages[MESA_SHADER_TESS_CTRL]; + return zink_shader_key_optimal_no_tcs(ctx->gfx_pipeline_state.optimal_key) == ZINK_SHADER_KEY_OPTIMAL_DEFAULT && + zink_can_use_pipeline_libs(ctx) && (!generated_tcs || ctx->gfx_pipeline_state.dyn_state2.vertices_per_patch == 3); +} + bool zink_set_rasterizer_discard(struct zink_context *ctx, bool disable); void diff --git a/src/gallium/drivers/zink/zink_program_state.hpp b/src/gallium/drivers/zink/zink_program_state.hpp index a175072e45b..3f38271dab2 100644 --- a/src/gallium/drivers/zink/zink_program_state.hpp +++ b/src/gallium/drivers/zink/zink_program_state.hpp @@ -102,6 +102,7 @@ template 
VkPipeline zink_get_gfx_pipeline(struct zink_context *ctx, struct zink_gfx_program *prog, + struct zink_gfx_program *variant_prog, struct zink_gfx_pipeline_state *state, enum mesa_prim mode) { @@ -113,7 +114,7 @@ zink_get_gfx_pipeline(struct zink_context *ctx, const unsigned idx = IS_MESH || screen->info.dynamic_state3_props.dynamicPrimitiveTopologyUnrestricted ? 0 : get_pipeline_idx= ZINK_DYNAMIC_STATE>(mode, vkmode); - assert(idx <= ARRAY_SIZE(prog->pipelines)); + assert(idx <= ARRAY_SIZE(prog->pipelines[0])); if (IS_MESH) { if (!state->mesh_dirty && !state->mesh_modules_changed) return state->mesh_pipeline; @@ -144,27 +145,28 @@ zink_get_gfx_pipeline(struct zink_context *ctx, } } /* extra safety asserts for optimal path to catch refactoring bugs */ - if (prog->optimal_keys) { + if (variant_prog->optimal_keys) { ASSERTED const union zink_shader_key_optimal *opt = (union zink_shader_key_optimal*)&prog->last_variant_hash; ASSERTED union zink_shader_key_optimal sanitized = {}; if (IS_MESH) { sanitized.val = zink_sanitize_optimal_key_mesh(ctx->gfx_stages, ctx->gfx_pipeline_state.shader_keys_optimal.key.val); + assert(opt->val == sanitized.val); assert(state->mesh_optimal_key == sanitized.val); - } else { + } else if (!state->uber_required) { sanitized.val = zink_sanitize_optimal_key(ctx->gfx_stages, ctx->gfx_pipeline_state.shader_keys_optimal.key.val); + assert(opt->val == sanitized.val); assert(state->optimal_key == sanitized.val); } - assert(opt->val == sanitized.val); } if (IS_MESH) { state->mesh_modules_changed = false; - if (prog->last_finalized_hash[idx] == state->mesh_final_hash && - !prog->inline_variants && likely(prog->last_pipeline[idx]) && + if (prog->last_finalized_hash[0][idx] == state->mesh_final_hash && + !prog->inline_variants && likely(prog->last_pipeline[0][idx]) && /* this data is too big to compare in the fast-path */ likely(!prog->shaders[MESA_SHADER_FRAGMENT]->fs.legacy_shadow_mask)) { - state->mesh_pipeline = 
prog->last_pipeline[idx]->pipeline; + state->mesh_pipeline = prog->last_pipeline[0][idx]->pipeline; return state->mesh_pipeline; } } else { @@ -202,23 +204,22 @@ zink_get_gfx_pipeline(struct zink_context *ctx, /* shortcut for reusing previous pipeline across program changes */ if (DYNAMIC_STATE == ZINK_DYNAMIC_VERTEX_INPUT || DYNAMIC_STATE == ZINK_DYNAMIC_VERTEX_INPUT2) { - if (prog->last_finalized_hash[idx] == state->final_hash && - !prog->inline_variants && likely(prog->last_pipeline[idx]) && - /* this data is too big to compare in the fast-path */ - likely(!prog->shaders[MESA_SHADER_FRAGMENT]->fs.legacy_shadow_mask)) { - state->pipeline = prog->last_pipeline[idx]->pipeline; + if (variant_prog->last_finalized_hash[state->uber_required][idx] == state->final_hash && + !variant_prog->inline_variants && likely(variant_prog->last_pipeline[state->uber_required][idx]) && + /* this data is too big to compare in the fast-path */ + likely(!variant_prog->shaders[MESA_SHADER_FRAGMENT]->fs.legacy_shadow_mask)) { + state->pipeline = variant_prog->last_pipeline[state->uber_required][idx]->pipeline; return state->pipeline; } } } - unsigned final_hash = IS_MESH ? state->mesh_final_hash : state->final_hash; - entry = _mesa_hash_table_search_pre_hashed(&prog->pipelines[idx], final_hash, state); + entry = _mesa_hash_table_search_pre_hashed(&variant_prog->pipelines[state->uber_required][idx], final_hash, state); if (!entry) { bool can_gpl = IS_MESH ? 
zink_can_use_pipeline_libs_mesh(ctx) : zink_can_use_pipeline_libs(ctx); /* always wait on async precompile/cache fence */ - util_queue_fence_wait(&prog->base.cache_fence); + util_queue_fence_wait(&variant_prog->base.cache_fence); struct zink_gfx_pipeline_cache_entry *pc_entry = CALLOC_STRUCT(zink_gfx_pipeline_cache_entry); if (!pc_entry) return VK_NULL_HANDLE; @@ -227,28 +228,47 @@ zink_get_gfx_pipeline(struct zink_context *ctx, */ memcpy(&pc_entry->state, state, sizeof(*state)); pc_entry->state.rendering_info.pColorAttachmentFormats = pc_entry->state.rendering_formats; - pc_entry->prog = prog; + pc_entry->prog = state->uber_required ? prog : variant_prog; /* init the optimized background compile fence */ util_queue_fence_init(&pc_entry->fence); - entry = _mesa_hash_table_insert_pre_hashed(&prog->pipelines[idx], final_hash, pc_entry, pc_entry); - if (prog->base.uses_shobj && !prog->is_separable) { - memcpy(pc_entry->shobjs, prog->objs, sizeof(prog->objs)); + entry = _mesa_hash_table_insert_pre_hashed(&variant_prog->pipelines[state->uber_required][idx], final_hash, pc_entry, pc_entry); + if (variant_prog->base.uses_shobj && !variant_prog->is_separable) { + memcpy(pc_entry->shobjs, variant_prog->objs, sizeof(variant_prog->objs)); zink_gfx_program_compile_queue(ctx, pc_entry); } else if (HAVE_LIB && can_gpl) { uint32_t optimal_key = IS_MESH ? 
ctx->gfx_pipeline_state.mesh_optimal_key : ctx->gfx_pipeline_state.optimal_key; /* this is the graphics pipeline library path: find/construct all partial pipelines */ - simple_mtx_lock(&prog->libs->lock); - struct set_entry *he = _mesa_set_search(&prog->libs->libs, &optimal_key); struct zink_gfx_library_key *gkey; - if (he) { - gkey = (struct zink_gfx_library_key *)he->key; + if (IS_MESH) { + simple_mtx_lock(&prog->libs->lock); + struct set_entry *he = _mesa_set_search(&prog->libs->libs, &optimal_key); + if (he) { + gkey = (struct zink_gfx_library_key *)he->key; + } else { + assert(!prog->is_separable); + gkey = zink_create_pipeline_lib(screen, prog, &ctx->gfx_pipeline_state, false); + } + simple_mtx_unlock(&prog->libs->lock); } else { - assert(!prog->is_separable); - gkey = zink_create_pipeline_lib(screen, prog, &ctx->gfx_pipeline_state); + if (state->uber_required) { + simple_mtx_lock(&prog->libs->lock); + assert(prog->libs->lib); + gkey = prog->libs->lib; + simple_mtx_unlock(&prog->libs->lock); + } else { + simple_mtx_lock(&variant_prog->libs->lock); + if (variant_prog->libs->lib) { + gkey = variant_prog->libs->lib; + assert(gkey->optimal_key == optimal_key); + assert(gkey->st_key == state->st_key.small_key.val); + } else { + assert(!variant_prog->is_separable); + gkey = zink_create_pipeline_lib(screen, variant_prog, &ctx->gfx_pipeline_state, false); + } + simple_mtx_unlock(&variant_prog->libs->lock); + } } - simple_mtx_unlock(&prog->libs->lock); - struct zink_gfx_input_key *ikey = IS_MESH ? NULL : - DYNAMIC_STATE == ZINK_DYNAMIC_VERTEX_INPUT ? + struct zink_gfx_input_key *ikey = DYNAMIC_STATE == ZINK_DYNAMIC_VERTEX_INPUT ? zink_find_or_create_input_dynamic(ctx, vkmode) : zink_find_or_create_input(ctx, vkmode); struct zink_gfx_output_key *okey = DYNAMIC_STATE >= ZINK_DYNAMIC_STATE3 && screen->have_full_ds3 ? 
@@ -259,29 +279,30 @@ zink_get_gfx_pipeline(struct zink_context *ctx, pc_entry->gpl.gkey = gkey; pc_entry->gpl.okey = okey; /* try to hit optimized compile cache first if possible */ - if (!prog->is_separable) - pc_entry->pipeline = zink_create_gfx_pipeline_combined(screen, prog, ikey ? ikey->pipeline : VK_NULL_HANDLE, &gkey->pipeline, 1, okey->pipeline, true, true); + if (!variant_prog->is_separable) + pc_entry->pipeline = zink_create_gfx_pipeline_combined(screen, variant_prog, ikey ? ikey->pipeline : VK_NULL_HANDLE, &gkey->pipeline, 1, okey->pipeline, true, true); if (!pc_entry->pipeline) { /* create the non-optimized pipeline first using fast-linking to avoid stuttering */ - pc_entry->pipeline = zink_create_gfx_pipeline_combined(screen, prog, ikey ? ikey->pipeline : VK_NULL_HANDLE, &gkey->pipeline, 1, okey->pipeline, false, false); - if (!prog->is_separable) + pc_entry->pipeline = zink_create_gfx_pipeline_combined(screen, variant_prog, ikey ? ikey->pipeline : VK_NULL_HANDLE, &gkey->pipeline, 1, okey->pipeline, false, false); + if (!variant_prog->is_separable) /* trigger async optimized pipeline compile if this was the fast-linked unoptimized pipeline */ zink_gfx_program_compile_queue(ctx, pc_entry); } } else { + struct zink_shader_object *objs = state->uber_required ? 
prog->objs : variant_prog->objs; /* optimize by default only when expecting precompiles in order to reduce stuttering */ if (DYNAMIC_STATE != ZINK_DYNAMIC_VERTEX_INPUT2 && DYNAMIC_STATE != ZINK_DYNAMIC_VERTEX_INPUT && !IS_MESH) - pc_entry->pipeline = zink_create_gfx_pipeline(screen, prog, prog->objs, state, state->element_state->binding_map, vkmode, !HAVE_LIB); + pc_entry->pipeline = zink_create_gfx_pipeline(screen, variant_prog, objs, state, state->element_state->binding_map, vkmode, !HAVE_LIB); else - pc_entry->pipeline = zink_create_gfx_pipeline(screen, prog, prog->objs, state, NULL, vkmode, !HAVE_LIB); - if (HAVE_LIB && !prog->is_separable) + pc_entry->pipeline = zink_create_gfx_pipeline(screen, variant_prog, objs, state, NULL, vkmode, !HAVE_LIB); + if (HAVE_LIB && !variant_prog->is_separable) /* trigger async optimized pipeline compile if this was an unoptimized pipeline */ zink_gfx_program_compile_queue(ctx, pc_entry); } if (pc_entry->pipeline == VK_NULL_HANDLE) return VK_NULL_HANDLE; - zink_screen_update_pipeline_cache(screen, &prog->base, false); + zink_screen_update_pipeline_cache(screen, &variant_prog->base, false); } struct zink_gfx_pipeline_cache_entry *cache_entry = (struct zink_gfx_pipeline_cache_entry *)entry->data; @@ -291,8 +312,8 @@ zink_get_gfx_pipeline(struct zink_context *ctx, state->pipeline = cache_entry->pipeline; /* update states for fastpath */ if (DYNAMIC_STATE >= ZINK_DYNAMIC_VERTEX_INPUT) { - prog->last_finalized_hash[idx] = final_hash; - prog->last_pipeline[idx] = cache_entry; + variant_prog->last_finalized_hash[state->uber_required][idx] = final_hash; + variant_prog->last_pipeline[state->uber_required][idx] = cache_entry; } return IS_MESH ? 
state->mesh_pipeline : state->pipeline; } @@ -355,6 +376,8 @@ equals_gfx_pipeline_state(const void *a, const void *b) if (STAGE_MASK & STAGE_MASK_OPTIMAL) { if (sa->optimal_key != sb->optimal_key) return false; + if (sa->st_key.small_key.val != sb->st_key.small_key.val) + return false; if (STAGE_MASK & STAGE_MASK_OPTIMAL_SHADOW) { if (sa->shadow != sb->shadow) return false; diff --git a/src/gallium/drivers/zink/zink_shader_keys.h b/src/gallium/drivers/zink/zink_shader_keys.h index ea883007387..a76a6556cc4 100644 --- a/src/gallium/drivers/zink/zink_shader_keys.h +++ b/src/gallium/drivers/zink/zink_shader_keys.h @@ -28,6 +28,41 @@ #include "compiler/shader_info.h" +union zink_st_small_key { + struct { + /** for ARB_color_buffer_float */ + uint8_t clamp_color:1; + /* for user-defined clip-planes */ + uint8_t lower_ucp:1; + /* Whether st_variant::driver_shader is for the draw module, + * not for the driver. + */ + uint8_t is_draw_shader:1; + uint8_t lower_flatshade:1; + uint8_t lower_alpha_test:1; + uint16_t pad: 11; // from here not key + }; + uint16_t val; +}; + +struct zink_st_variant_key +{ + union zink_st_small_key small_key; + + uint8_t ucp_enables: 8; + + unsigned lower_alpha_func:3; + + + uint32_t pad2: 5; //next array aligned to uint32 for easy access + + /* bitmask of sampler units; PIPE_CAP_GL_CLAMP */ + uint32_t gl_clamp[3]; + + /* needs more than 128 bytes */ + struct pipe_clip_state ucp_state; +}; + struct zink_vs_key_base { bool last_vertex_stage : 1; bool clip_halfz : 1; diff --git a/src/gallium/drivers/zink/zink_state.c b/src/gallium/drivers/zink/zink_state.c index 24175aceeed..2415a65351d 100644 --- a/src/gallium/drivers/zink/zink_state.c +++ b/src/gallium/drivers/zink/zink_state.c @@ -722,6 +722,21 @@ zink_bind_rasterizer_state(struct pipe_context *pctx, void *cso) if (!screen->optimal_keys) zink_update_gs_key_rectangular_line(ctx); + + if (screen->optimal_keys) { + struct zink_st_variant_key *key = &ctx->gfx_pipeline_state.st_key; + + if 
((zink_get_st_small_key(ctx)->clamp_color) != ctx->rast_state->base.clamp_fragment_color) + zink_set_st_small_key(ctx)->clamp_color = ctx->rast_state->base.clamp_fragment_color; + + if ((zink_get_st_small_key(ctx)->lower_flatshade) != ctx->rast_state->base.flatshade) + zink_set_st_small_key(ctx)->lower_flatshade = ctx->rast_state->base.flatshade; + + key->ucp_enables = ctx->rast_state->base.clip_plane_enable; + + if ((zink_get_st_small_key(ctx)->lower_ucp) != !!key->ucp_enables) + zink_set_st_small_key(ctx)->lower_ucp = !!key->ucp_enables; + } } } diff --git a/src/gallium/drivers/zink/zink_types.h b/src/gallium/drivers/zink/zink_types.h index c2305ece7f7..bddbb9e87c4 100644 --- a/src/gallium/drivers/zink/zink_types.h +++ b/src/gallium/drivers/zink/zink_types.h @@ -795,6 +795,7 @@ struct zink_shader_object { VkShaderModule mod; }; struct spirv_shader *spirv; + VkPipeline gpl; }; struct zink_shader { @@ -824,6 +825,7 @@ struct zink_shader { bool has_uniforms; bool has_edgeflags; bool needs_inlining; + bool is_uber; struct spirv_shader *spirv; struct { @@ -932,6 +934,7 @@ struct zink_gfx_pipeline_state { uint32_t vertex_strides[PIPE_MAX_ATTRIBS]; struct zink_vertex_elements_hw_state *element_state; struct zink_zs_swizzle_key *shadow; + bool uber_required; // emulation needed && !async compilation done enum mesa_prim shader_rast_prim, rast_prim; /* reduced type or max for unknown */ union { struct { @@ -942,6 +945,7 @@ struct zink_gfx_pipeline_state { union zink_shader_key_optimal key; } shader_keys_optimal; }; + struct zink_st_variant_key st_key; struct zink_blend_state *blend_state; VkFormat rendering_formats[PIPE_MAX_COLOR_BUFS]; VkPipelineRenderingCreateInfo rendering_info; @@ -1008,6 +1012,7 @@ enum zink_gfx_push_constant_member { */ struct zink_shader_module { struct zink_shader_object obj; + struct util_queue_fence fence; uint32_t hash; bool shobj; bool default_variant; @@ -1049,6 +1054,7 @@ typedef bool (*equals_gfx_pipeline_state_func)(const void *a, const 
void *b); struct zink_gfx_library_key { uint32_t optimal_key; //equals_pipeline_lib_optimal + uint32_t st_key; VkShaderModule modules[MESA_SHADER_MESH_STAGES]; VkPipeline pipeline; }; @@ -1115,6 +1121,13 @@ struct zink_gfx_lib_cache { simple_mtx_t lock; struct set libs; //zink_gfx_library_key -> VkPipeline + struct zink_gfx_library_key *lib; //zink_gfx_library_key -> VkPipeline +}; + +struct zink_gfx_program_variant_key { + uint32_t optimal_key; //equals_pipeline_lib_optimal + uint32_t st_key; + struct zink_gfx_program *prog; }; struct zink_gfx_program { @@ -1135,21 +1148,29 @@ struct zink_gfx_program { uint32_t module_hash[MESA_SHADER_MESH_STAGES]; struct blob blobs[MESA_SHADER_MESH_STAGES]; struct util_dynarray shader_cache[MESA_SHADER_MESH_STAGES][2][2]; //normal, nonseamless cubes, inline uniforms + struct util_dynarray uber_modules; + struct set variants; + struct zink_gfx_program *base_variant; //quick access to base variant (only !NULL when done compiling) + struct zink_gfx_program *uber_variant; unsigned inlined_variant_count[MESA_SHADER_MESH_STAGES]; uint32_t default_variant_hash; uint8_t inline_variants; //which stages are using inlined uniforms bool needs_inlining; // whether this program requires some uniforms to be inlined bool has_edgeflags; bool optimal_keys; + bool started_compiling; + bool is_uber_program; + bool is_variant_program; /* separable */ struct zink_gfx_program *full_prog; - struct hash_table pipelines[11]; // [number of draw modes we support] + struct hash_table pipelines[2][11]; // [uber_emulation][number of draw modes we support] uint32_t last_variant_hash; + uint32_t st_key; - uint32_t last_finalized_hash[4]; //[primtype idx] - struct zink_gfx_pipeline_cache_entry *last_pipeline[4]; //[primtype idx] + uint32_t last_finalized_hash[2][4]; //[uber_emulation][primtype idx] + struct zink_gfx_pipeline_cache_entry *last_pipeline[2][4]; //[uber_emulation][primtype idx] struct zink_gfx_lib_cache *libs; }; @@ -1787,6 +1808,7 @@ struct
zink_context { simple_mtx_t program_lock[8]; uint32_t gfx_hash; struct zink_gfx_program *curr_program; + struct zink_gfx_program *curr_program_uber; struct set gfx_inputs; struct set gfx_outputs;