zink: use GPL to handle (simple) separate shader objects

apps/games using separate shader objects end up passing the separable
shaders to the link_shader hook individually, which is still not ideal
for zink since the more optimal path is to have all the shaders at once
and create a RAST+FS GPL stage that can run all the inter-stage io
handlers
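
for reference, this is the app-side pattern being handled: a hedged
sketch of ARB_separate_shader_objects usage, where vs_source, fs_source,
and other_fs are assumed to exist:

   /* each stage is its own GL program; any VS can later be paired with
    * any FS without a full GL link, so the driver never sees the final
    * combination ahead of time
    */
   GLuint vs = glCreateShaderProgramv(GL_VERTEX_SHADER, 1, &vs_source);
   GLuint fs = glCreateShaderProgramv(GL_FRAGMENT_SHADER, 1, &fs_source);
   GLuint pipe;
   glGenProgramPipelines(1, &pipe);
   glUseProgramStages(pipe, GL_VERTEX_SHADER_BIT, vs);
   glUseProgramStages(pipe, GL_FRAGMENT_SHADER_BIT, fs);
   glBindProgramPipeline(pipe);
   /* later: swap in a different fs without relinking vs */
   glUseProgramStages(pipe, GL_FRAGMENT_SHADER_BIT, other_fs);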

it IS technically possible to handle this for simple VS+FS pipelines
using GPL, but it's kinda gross. such shaders now use descriptor buffer
to create their own pipelines/layouts/descriptors async, and then a "separable"
variant of the gfx program can be created by fast-linking these together
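
"fast-linking" here means combining the precompiled per-stage libraries
without VK_PIPELINE_CREATE_RETAIN_LINK_TIME_OPTIMIZATION_INFO_BIT_EXT /
link-time optimization, roughly like this minimal sketch (illustrative
only; the real call is zink_create_gfx_pipeline_combined(), which also
links the vertex input and fragment output libraries):

   static VkPipeline
   fast_link_separable(VkDevice dev, VkPipelineLayout layout,
                       VkPipeline vs_lib, VkPipeline fs_lib)
   {
      VkPipeline libs[] = { vs_lib, fs_lib };
      VkPipelineLibraryCreateInfoKHR link = {
         .sType = VK_STRUCTURE_TYPE_PIPELINE_LIBRARY_CREATE_INFO_KHR,
         .libraryCount = 2,
         .pLibraries = libs,
      };
      VkGraphicsPipelineCreateInfo pci = {
         .sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO,
         .pNext = &link,
         /* no LINK_TIME_OPTIMIZATION flag: near-instant link, slower shaders */
         .layout = layout,
      };
      VkPipeline pipeline = VK_NULL_HANDLE;
      vkCreateGraphicsPipelines(dev, VK_NULL_HANDLE, 1, &pci, NULL, &pipeline);
      return pipeline;
   }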

the "separable" gfx program can't handle shader variants, but it can do basic
pipeline caching for PSO state changes, which makes it flexible enough to sorta
kinda maybe handle the most basic cases of separate shader objects
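
the caching in question is just a per-program hash table keyed on the
PSO state, conceptually like this hypothetical sketch (the real lookup
lives in zink_get_gfx_pipeline(); assumes 64-bit non-dispatchable
handles so a VkPipeline fits in the table's value pointer):

   static VkPipeline
   separable_get_pipeline(struct hash_table *cache, void *state_key,
                          uint32_t state_hash, VkDevice dev,
                          VkPipelineLayout layout,
                          VkPipeline vs_lib, VkPipeline fs_lib)
   {
      struct hash_entry *he = _mesa_hash_table_search_pre_hashed(cache, state_hash, state_key);
      if (he)
         return (VkPipeline)he->data;
      /* miss: fast-link a new pipeline for this state and cache it */
      VkPipeline pipeline = fast_link_separable(dev, layout, vs_lib, fs_lib);
      _mesa_hash_table_insert_pre_hashed(cache, state_hash, state_key, pipeline);
      return pipeline;
   }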

descriptor buffer is used because having to create and manage a separate architecture
for sets/pools/templates is too nightmarish even for me
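
with EXT_descriptor_buffer, a descriptor is just bytes written into a
host-mapped buffer, so each shader can own its layout/offsets without
any pool machinery. in outline (illustrative; zink reaches these
entrypoints through the VKSCR()/VKCTX() dispatch macros seen in the
diff below):

   static void
   write_one_descriptor(VkDevice dev, VkDescriptorSetLayout dsl,
                        const VkDescriptorGetInfoEXT *info, size_t descriptor_size,
                        uint8_t *mapped_db, VkDeviceSize db_offset,
                        PFN_vkGetDescriptorSetLayoutBindingOffsetEXT get_binding_offset,
                        PFN_vkGetDescriptorEXT get_descriptor)
   {
      VkDeviceSize binding_offset;
      /* where binding 0 of this layout lives inside the descriptor buffer */
      get_binding_offset(dev, dsl, 0, &binding_offset);
      /* write the descriptor's bytes straight into the mapped buffer */
      get_descriptor(dev, info, descriptor_size, mapped_db + db_offset + binding_offset);
      /* binding at draw time is then just a buffer index + offset via
       * vkCmdSetDescriptorBufferOffsetsEXT()
       */
   }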

this is, at best, a partial solution, but it's the best the vulkan api can
currently do

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/21197>
Mike Blumenkrantz 2023-02-07 13:32:21 -05:00 committed by Marge Bot
parent 60b26a6b1f
commit e3b746e3a3
10 changed files with 447 additions and 25 deletions


@@ -24,6 +24,7 @@
 #include "nir_opcodes.h"
 #include "zink_context.h"
 #include "zink_compiler.h"
+#include "zink_descriptors.h"
 #include "zink_program.h"
 #include "zink_screen.h"
 #include "nir_to_spirv/nir_to_spirv.h"
@@ -3205,6 +3206,39 @@ zink_shader_compile(struct zink_screen *screen, struct zink_shader *zs,
    return mod;
 }
 
+VkShaderModule
+zink_shader_compile_separate(struct zink_screen *screen, struct zink_shader *zs, nir_shader **ret_nir)
+{
+   nir_shader *nir = nir_shader_clone(NULL, zs->nir);
+   int set = nir->info.stage == MESA_SHADER_FRAGMENT;
+   unsigned offsets[4];
+   zink_descriptor_shader_get_binding_offsets(zs, offsets);
+   nir_foreach_variable_with_modes(var, nir, nir_var_mem_ubo | nir_var_mem_ssbo | nir_var_uniform | nir_var_image) {
+      if (var->data.bindless)
+         continue;
+      var->data.descriptor_set = set;
+      switch (var->data.mode) {
+      case nir_var_mem_ubo:
+         var->data.binding = !!var->data.driver_location;
+         break;
+      case nir_var_uniform:
+         if (glsl_type_is_sampler(glsl_without_array(var->type)))
+            var->data.binding += offsets[1];
+         break;
+      case nir_var_mem_ssbo:
+         var->data.binding += offsets[2];
+         break;
+      case nir_var_image:
+         var->data.binding += offsets[3];
+         break;
+      default: break;
+      }
+   }
+   optimize_nir(nir, zs);
+   *ret_nir = nir;
+   return compile_module(screen, zs, nir);
+}
+
 static bool
 lower_baseinstance_instr(nir_builder *b, nir_instr *instr, void *data)
 {
@@ -4196,6 +4230,7 @@ zink_shader_create(struct zink_screen *screen, struct nir_shader *nir,
    ret->sinfo.have_vulkan_memory_model = screen->info.have_KHR_vulkan_memory_model;
 
+   util_queue_fence_init(&ret->precompile.fence);
    ret->hash = _mesa_hash_pointer(ret);
    ret->programs = _mesa_pointer_set_create(NULL);
@@ -4490,8 +4525,16 @@ zink_shader_free(struct zink_screen *screen, struct zink_shader *shader)
       shader->non_fs.generated_gs = NULL;
    }
    _mesa_set_destroy(shader->programs, NULL);
+   util_queue_fence_wait(&shader->precompile.fence);
+   util_queue_fence_destroy(&shader->precompile.fence);
+   zink_descriptor_shader_deinit(screen, shader);
+   if (shader->precompile.mod)
+      VKSCR(DestroyShaderModule)(screen->dev, shader->precompile.mod, NULL);
+   if (shader->precompile.gpl)
+      VKSCR(DestroyPipeline)(screen->dev, shader->precompile.gpl, NULL);
    ralloc_free(shader->nir);
    ralloc_free(shader->spirv);
+   free(shader->precompile.bindings);
    ralloc_free(shader);
 }
@@ -4530,6 +4573,7 @@ struct zink_shader *
 zink_shader_tcs_create(struct zink_screen *screen, struct zink_shader *vs, unsigned vertices_per_patch)
 {
    struct zink_shader *ret = rzalloc(NULL, struct zink_shader);
+   util_queue_fence_init(&ret->precompile.fence);
    ret->hash = _mesa_hash_pointer(ret);
    ret->programs = _mesa_pointer_set_create(NULL);
    simple_mtx_init(&ret->lock, mtx_plain);


@@ -63,6 +63,8 @@ zink_compiler_assign_io(struct zink_screen *screen, nir_shader *producer, nir_sh
 VkShaderModule
 zink_shader_compile(struct zink_screen *screen, struct zink_shader *zs, nir_shader *nir, const struct zink_shader_key *key, const void *extra_data);
 VkShaderModule
+zink_shader_compile_separate(struct zink_screen *screen, struct zink_shader *zs, nir_shader **ret_nir);
+VkShaderModule
 zink_shader_spirv_compile(struct zink_screen *screen, struct zink_shader *zs, struct spirv_shader *spirv);
 struct zink_shader *
 zink_shader_create(struct zink_screen *screen, struct nir_shader *nir,


@@ -670,6 +670,96 @@ zink_descriptor_program_init(struct zink_context *ctx, struct zink_program *pg)
    return true;
 }
 
+void
+zink_descriptor_shader_get_binding_offsets(const struct zink_shader *shader, unsigned *offsets)
+{
+   offsets[ZINK_DESCRIPTOR_TYPE_UBO] = 0;
+   offsets[ZINK_DESCRIPTOR_TYPE_SAMPLER_VIEW] = shader->bindings[ZINK_DESCRIPTOR_TYPE_UBO][shader->num_bindings[ZINK_DESCRIPTOR_TYPE_UBO] - 1].binding + 1;
+   offsets[ZINK_DESCRIPTOR_TYPE_SSBO] = offsets[ZINK_DESCRIPTOR_TYPE_SAMPLER_VIEW] + shader->bindings[ZINK_DESCRIPTOR_TYPE_SAMPLER_VIEW][shader->num_bindings[ZINK_DESCRIPTOR_TYPE_SAMPLER_VIEW] - 1].binding + 1;
+   offsets[ZINK_DESCRIPTOR_TYPE_IMAGE] = offsets[ZINK_DESCRIPTOR_TYPE_SSBO] + shader->bindings[ZINK_DESCRIPTOR_TYPE_SSBO][shader->num_bindings[ZINK_DESCRIPTOR_TYPE_SSBO] - 1].binding + 1;
+}
+
+void
+zink_descriptor_shader_init(struct zink_screen *screen, struct zink_shader *shader)
+{
+   VkDescriptorSetLayoutBinding bindings[ZINK_DESCRIPTOR_BASE_TYPES * ZINK_MAX_DESCRIPTORS_PER_TYPE];
+   unsigned num_bindings = 0;
+   VkShaderStageFlagBits stage_flags = mesa_to_vk_shader_stage(shader->nir->info.stage);
+
+   unsigned desc_set_size = shader->has_uniforms;
+   for (unsigned i = 0; i < ZINK_DESCRIPTOR_BASE_TYPES; i++)
+      desc_set_size += shader->num_bindings[i];
+   if (desc_set_size)
+      shader->precompile.db_template = rzalloc_array(shader, struct zink_descriptor_template, desc_set_size);
+
+   if (shader->has_uniforms) {
+      VkDescriptorSetLayoutBinding *binding = &bindings[num_bindings];
+      binding->binding = 0;
+      binding->descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER;
+      binding->descriptorCount = 1;
+      binding->stageFlags = stage_flags;
+      binding->pImmutableSamplers = NULL;
+      struct zink_descriptor_template *entry = &shader->precompile.db_template[num_bindings];
+      entry->count = 1;
+      entry->offset = offsetof(struct zink_context, di.db.ubos[shader->nir->info.stage][0]);
+      entry->stride = sizeof(VkDescriptorAddressInfoEXT);
+      entry->db_size = screen->info.db_props.robustUniformBufferDescriptorSize;
+      num_bindings++;
+   }
+
+   /* sync with zink_shader_compile_separate() */
+   unsigned offsets[4];
+   zink_descriptor_shader_get_binding_offsets(shader, offsets);
+
+   for (int j = 0; j < ZINK_DESCRIPTOR_BASE_TYPES; j++) {
+      for (int k = 0; k < shader->num_bindings[j]; k++) {
+         VkDescriptorSetLayoutBinding *binding = &bindings[num_bindings];
+         if (j == ZINK_DESCRIPTOR_TYPE_UBO)
+            binding->binding = 1;
+         else
+            binding->binding = shader->bindings[j][k].binding + offsets[j];
+         binding->descriptorType = shader->bindings[j][k].type;
+         binding->descriptorCount = shader->bindings[j][k].size;
+         binding->stageFlags = stage_flags;
+         binding->pImmutableSamplers = NULL;
+         unsigned temp = 0;
+         init_db_template_entry(screen, shader, j, k, &shader->precompile.db_template[num_bindings], &temp);
+         num_bindings++;
+      }
+   }
+
+   if (num_bindings) {
+      shader->precompile.dsl = descriptor_layout_create(screen, 0, bindings, num_bindings);
+      shader->precompile.bindings = mem_dup(bindings, num_bindings * sizeof(VkDescriptorSetLayoutBinding));
+      shader->precompile.num_bindings = num_bindings;
+      VkDeviceSize val;
+      VKSCR(GetDescriptorSetLayoutSizeEXT)(screen->dev, shader->precompile.dsl, &val);
+      shader->precompile.db_size = val;
+      shader->precompile.db_offset = rzalloc_array(shader, uint32_t, num_bindings);
+      for (unsigned i = 0; i < num_bindings; i++) {
+         VKSCR(GetDescriptorSetLayoutBindingOffsetEXT)(screen->dev, shader->precompile.dsl, bindings[i].binding, &val);
+         shader->precompile.db_offset[i] = val;
+      }
+   }
+
+   VkDescriptorSetLayout dsl[ZINK_DESCRIPTOR_ALL_TYPES] = {0};
+   unsigned num_dsl = num_bindings ? 2 : 0;
+   if (shader->bindless)
+      num_dsl = screen->compact_descriptors ? ZINK_DESCRIPTOR_ALL_TYPES - ZINK_DESCRIPTOR_COMPACT : ZINK_DESCRIPTOR_ALL_TYPES;
+   if (num_bindings || shader->bindless) {
+      dsl[shader->nir->info.stage == MESA_SHADER_FRAGMENT] = shader->precompile.dsl;
+      if (shader->bindless)
+         dsl[screen->desc_set_id[ZINK_DESCRIPTOR_BINDLESS]] = screen->bindless_layout;
+   }
+
+   shader->precompile.layout = zink_pipeline_layout_create(screen, dsl, num_dsl, false, VK_PIPELINE_LAYOUT_CREATE_INDEPENDENT_SETS_BIT_EXT);
+}
+
+void
+zink_descriptor_shader_deinit(struct zink_screen *screen, struct zink_shader *shader)
+{
+   if (shader->precompile.dsl)
+      VKSCR(DestroyDescriptorSetLayout)(screen->dev, shader->precompile.dsl, NULL);
+   if (shader->precompile.layout)
+      VKSCR(DestroyPipelineLayout)(screen->dev, shader->precompile.layout, NULL);
+}
+
 /* called during program destroy */
 void
 zink_descriptor_program_deinit(struct zink_screen *screen, struct zink_program *pg)
@@ -946,6 +1036,71 @@ populate_sets(struct zink_context *ctx, struct zink_batch_state *bs,
    return true;
 }
 
+static void
+update_separable(struct zink_context *ctx, struct zink_program *pg)
+{
+   struct zink_screen *screen = zink_screen(ctx->base.screen);
+   struct zink_batch_state *bs = ctx->batch.state;
+   unsigned use_buffer = 0;
+   /* find the least-written buffer to use for this */
+   for (unsigned i = 0; i < ARRAY_SIZE(bs->dd.db_offset); i++) {
+      if (bs->dd.db_offset[i] < bs->dd.db_offset[use_buffer])
+         use_buffer = i;
+   }
+   VkDescriptorGetInfoEXT info;
+   info.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_GET_INFO_EXT;
+   info.pNext = NULL;
+   struct zink_gfx_program *prog = (struct zink_gfx_program *)pg;
+   struct zink_shader *shaders[] = {
+      prog->shaders[MESA_SHADER_VERTEX]->precompile.num_bindings ? prog->shaders[MESA_SHADER_VERTEX] : prog->shaders[MESA_SHADER_FRAGMENT],
+      prog->shaders[MESA_SHADER_FRAGMENT],
+   };
+   for (unsigned j = 0; j < pg->num_dsl; j++) {
+      if (!(pg->dd.binding_usage & BITFIELD_BIT(j)))
+         continue;
+      uint64_t offset = bs->dd.db_offset[use_buffer];
+      assert(bs->dd.db[use_buffer]->obj->size > bs->dd.db_offset[use_buffer] + pg->dd.db_size[j]);
+      for (unsigned i = 0; i < shaders[j]->precompile.num_bindings; i++) {
+         info.type = shaders[j]->precompile.bindings[i].descriptorType;
+         uint64_t desc_offset = offset + pg->dd.db_offset[j][i];
+         if (screen->info.db_props.combinedImageSamplerDescriptorSingleArray ||
+             shaders[j]->precompile.bindings[i].descriptorType != VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER ||
+             shaders[j]->precompile.bindings[i].descriptorCount == 1) {
+            for (unsigned k = 0; k < shaders[j]->precompile.bindings[i].descriptorCount; k++) {
+               /* VkDescriptorDataEXT is a union of pointers; the member doesn't matter */
+               info.data.pSampler = (void*)(((uint8_t*)ctx) + pg->dd.db_template[j][i].offset + k * pg->dd.db_template[j][i].stride);
+               VKSCR(GetDescriptorEXT)(screen->dev, &info, pg->dd.db_template[j][i].db_size, bs->dd.db_map[use_buffer] + desc_offset + k * pg->dd.db_template[j][i].db_size);
+            }
+         } else {
+            assert(shaders[j]->precompile.bindings[i].descriptorType == VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER);
+            char buf[1024];
+            uint8_t *db = bs->dd.db_map[use_buffer] + desc_offset;
+            uint8_t *samplers = db + shaders[j]->precompile.bindings[i].descriptorCount * screen->info.db_props.sampledImageDescriptorSize;
+            for (unsigned k = 0; k < shaders[j]->precompile.bindings[i].descriptorCount; k++) {
+               /* VkDescriptorDataEXT is a union of pointers; the member doesn't matter */
+               info.data.pSampler = (void*)(((uint8_t*)ctx) + pg->dd.db_template[ZINK_DESCRIPTOR_TYPE_SAMPLER_VIEW][i].offset +
+                                            k * pg->dd.db_template[ZINK_DESCRIPTOR_TYPE_SAMPLER_VIEW][i].stride);
+               VKSCR(GetDescriptorEXT)(screen->dev, &info, pg->dd.db_template[j][ZINK_DESCRIPTOR_TYPE_SAMPLER_VIEW].db_size, buf);
+               /* drivers that don't support combinedImageSamplerDescriptorSingleArray must have sampler arrays written in memory as
+                *
+                *   | array_of_samplers[] | array_of_sampled_images[] |
+                *
+                * which means each descriptor's data must be split
+                */
+               memcpy(db, buf, screen->info.db_props.samplerDescriptorSize);
+               memcpy(samplers, &buf[screen->info.db_props.samplerDescriptorSize], screen->info.db_props.sampledImageDescriptorSize);
+               db += screen->info.db_props.sampledImageDescriptorSize;
+               samplers += screen->info.db_props.samplerDescriptorSize;
+            }
+         }
+      }
+      bs->dd.cur_db_offset[use_buffer] = bs->dd.db_offset[use_buffer];
+      bs->dd.db_offset[use_buffer] += pg->dd.db_size[j];
+      VKCTX(CmdSetDescriptorBufferOffsetsEXT)(bs->cmdbuf, VK_PIPELINE_BIND_POINT_GRAPHICS, pg->layout, j, 1, &use_buffer, &offset);
+   }
+}
+
 /* updates the mask of changed_sets and binds the mask of bind_sets */
 static void
 zink_descriptors_update_masked_buffer(struct zink_context *ctx, bool is_compute, uint8_t changed_sets, uint8_t bind_sets)
@@ -1092,6 +1247,17 @@ zink_descriptors_update(struct zink_context *ctx, bool is_compute)
       ctx->dd.push_state_changed[is_compute] = !!pg->dd.push_usage || ctx->dd.has_fbfetch != bs->dd.has_fbfetch;
    }
 
+   if (!is_compute) {
+      struct zink_gfx_program *prog = (struct zink_gfx_program*)pg;
+      if (prog->is_separable) {
+         /* force all descriptors update on next pass: separables use different layouts */
+         ctx->dd.state_changed[is_compute] = BITFIELD_MASK(ZINK_DESCRIPTOR_TYPE_UNIFORMS);
+         ctx->dd.push_state_changed[is_compute] = true;
+         update_separable(ctx, pg);
+         return;
+      }
+   }
+
    if (pg != bs->dd.pg[is_compute]) {
       /* if we don't already know that we have to update all sets,
        * check to see if any dsls changed


@@ -154,8 +154,12 @@ zink_descriptors_deinit_bindless(struct zink_context *ctx);
 void
 zink_descriptors_update_bindless(struct zink_context *ctx);
+void
+zink_descriptor_shader_get_binding_offsets(const struct zink_shader *shader, unsigned *offsets);
+void
+zink_descriptor_shader_init(struct zink_screen *screen, struct zink_shader *shader);
+void
+zink_descriptor_shader_deinit(struct zink_screen *screen, struct zink_shader *shader);
 bool
 zink_descriptor_program_init(struct zink_context *ctx, struct zink_program *pg);


@@ -751,7 +751,10 @@ create_gfx_pipeline_library(struct zink_screen *screen, VkShaderModule *modules,
    pci.pStages = shader_stages;
    pci.stageCount = num_stages;
-   /* only add LTO for full pipeline libs */
+   /* Only keep LTO information for full pipeline libs. For separable shaders, they will only
+    * ever be used with fast linking, and to optimize them a new pipeline lib will be created with full
+    * link time information for the full set of shader stages (rather than linking in these single-stage libs).
+    */
    if (num_stages > 1)
       pci.flags |= VK_PIPELINE_CREATE_RETAIN_LINK_TIME_OPTIMIZATION_INFO_BIT_EXT;
@@ -770,6 +773,12 @@ zink_create_gfx_pipeline_library(struct zink_screen *screen, struct zink_gfx_pro
    return create_gfx_pipeline_library(screen, prog->modules, prog->base.layout, prog->base.pipeline_cache);
 }
 
+VkPipeline
+zink_create_gfx_pipeline_separate(struct zink_screen *screen, VkShaderModule *modules, VkPipelineLayout layout)
+{
+   return create_gfx_pipeline_library(screen, modules, layout, VK_NULL_HANDLE);
+}
+
 VkPipeline
 zink_create_gfx_pipeline_combined(struct zink_screen *screen, struct zink_gfx_program *prog, VkPipeline input, VkPipeline *library, unsigned libcount, VkPipeline output, bool optimized)
 {


@@ -62,6 +62,8 @@ VkPipeline
 zink_create_gfx_pipeline_output(struct zink_screen *screen, struct zink_gfx_pipeline_state *state);
 VkPipeline
 zink_create_gfx_pipeline_combined(struct zink_screen *screen, struct zink_gfx_program *prog, VkPipeline input, VkPipeline *library, unsigned libcount, VkPipeline output, bool optimized);
+VkPipeline
+zink_create_gfx_pipeline_separate(struct zink_screen *screen, VkShaderModule *modules, VkPipelineLayout layout);
 #ifdef __cplusplus
 }
 #endif


@@ -44,6 +44,11 @@
 #define XXH_INLINE_ALL
 #include "util/xxhash.h"
 
+static void
+precompile_job(void *data, void *gdata, int thread_index);
+struct zink_gfx_program *
+create_gfx_program_separable(struct zink_context *ctx, struct zink_shader **stages, unsigned vertices_per_patch);
+
 void
 debug_describe_zink_gfx_program(char *buf, const struct zink_gfx_program *ptr)
 {
@@ -645,6 +650,7 @@ update_gfx_program_optimal(struct zink_context *ctx, struct zink_gfx_program *pr
 {
    const union zink_shader_key_optimal *optimal_key = (union zink_shader_key_optimal*)&prog->last_variant_hash;
    if (ctx->gfx_pipeline_state.shader_keys_optimal.key.vs_bits != optimal_key->vs_bits) {
+      assert(!prog->is_separable);
       bool changed = update_gfx_shader_module_optimal(ctx, prog, ctx->last_vertex_stage->nir->info.stage);
       ctx->gfx_pipeline_state.modules_changed |= changed;
    }
@@ -652,6 +658,7 @@ update_gfx_program_optimal(struct zink_context *ctx, struct zink_gfx_program *pr
    if (ctx->gfx_pipeline_state.shader_keys_optimal.key.fs_bits != optimal_key->fs_bits ||
        /* always recheck shadow swizzles since they aren't directly part of the key */
        unlikely(shadow_needs_shader_swizzle)) {
+      assert(!prog->is_separable);
       bool changed = update_gfx_shader_module_optimal(ctx, prog, MESA_SHADER_FRAGMENT);
       ctx->gfx_pipeline_state.modules_changed |= changed;
       if (unlikely(shadow_needs_shader_swizzle)) {
@@ -661,6 +668,7 @@ update_gfx_program_optimal(struct zink_context *ctx, struct zink_gfx_program *pr
    }
    if (prog->shaders[MESA_SHADER_TESS_CTRL] && prog->shaders[MESA_SHADER_TESS_CTRL]->non_fs.is_generated &&
        ctx->gfx_pipeline_state.shader_keys_optimal.key.tcs_bits != optimal_key->tcs_bits) {
+      assert(!prog->is_separable);
       bool changed = update_gfx_shader_module_optimal(ctx, prog, MESA_SHADER_TESS_CTRL);
       ctx->gfx_pipeline_state.modules_changed |= changed;
    }
@@ -682,13 +690,28 @@ zink_gfx_program_update_optimal(struct zink_context *ctx)
       ctx->gfx_pipeline_state.final_hash ^= ctx->curr_program->last_variant_hash;
       if (entry) {
          prog = (struct zink_gfx_program*)entry->data;
+         if (prog->is_separable) {
+            /* shader variants can't be handled by separable programs: sync and compile */
+            if (!ZINK_SHADER_KEY_OPTIMAL_IS_DEFAULT(ctx->gfx_pipeline_state.optimal_key))
+               util_queue_fence_wait(&prog->base.cache_fence);
+            /* If the optimized linked pipeline is done compiling, swap it into place. */
+            if (util_queue_fence_is_signalled(&prog->base.cache_fence)) {
+               struct zink_gfx_program *real = prog->full_prog;
+               entry->data = real;
+               prog->full_prog = NULL;
+               prog->base.removed = true;
+               zink_gfx_program_reference(zink_screen(ctx->base.screen), &prog, NULL);
+               prog = real;
+            }
+         }
          update_gfx_program_optimal(ctx, prog);
       } else {
          ctx->dirty_gfx_stages |= ctx->shader_stages;
-         prog = zink_create_gfx_program(ctx, ctx->gfx_stages, ctx->gfx_pipeline_state.dyn_state2.vertices_per_patch);
+         prog = create_gfx_program_separable(ctx, ctx->gfx_stages, ctx->gfx_pipeline_state.dyn_state2.vertices_per_patch);
         zink_screen_get_pipeline_cache(zink_screen(ctx->base.screen), &prog->base, false);
         _mesa_hash_table_insert_pre_hashed(ht, hash, prog->shaders, prog);
-         generate_gfx_program_modules_optimal(ctx, zink_screen(ctx->base.screen), prog, &ctx->gfx_pipeline_state);
+         if (!prog->is_separable)
+            generate_gfx_program_modules_optimal(ctx, zink_screen(ctx->base.screen), prog, &ctx->gfx_pipeline_state);
       }
       simple_mtx_unlock(&ctx->program_lock[zink_program_cache_stages(ctx->shader_stages)]);
       if (prog && prog != ctx->curr_program)
@@ -699,6 +722,24 @@ zink_gfx_program_update_optimal(struct zink_context *ctx)
       /* remove old hash */
       ctx->gfx_pipeline_state.optimal_key = ctx->gfx_pipeline_state.shader_keys_optimal.key.val;
       ctx->gfx_pipeline_state.final_hash ^= ctx->curr_program->last_variant_hash;
+      if (ctx->curr_program->is_separable) {
+         struct zink_gfx_program *prog = ctx->curr_program;
+         if (prog->is_separable && !ZINK_SHADER_KEY_OPTIMAL_IS_DEFAULT(ctx->gfx_pipeline_state.optimal_key)) {
+            util_queue_fence_wait(&prog->base.cache_fence);
+            /* shader variants can't be handled by separable programs: sync and compile */
+            struct hash_table *ht = &ctx->program_cache[zink_program_cache_stages(ctx->shader_stages)];
+            const uint32_t hash = ctx->gfx_hash;
+            simple_mtx_lock(&ctx->program_lock[zink_program_cache_stages(ctx->shader_stages)]);
+            struct hash_entry *entry = _mesa_hash_table_search_pre_hashed(ht, hash, ctx->gfx_stages);
+            struct zink_gfx_program *real = prog->full_prog;
+            entry->data = real;
+            prog->full_prog = NULL;
+            prog->base.removed = true;
+            zink_gfx_program_reference(zink_screen(ctx->base.screen), &prog, NULL);
+            ctx->curr_program = real;
+            simple_mtx_unlock(&ctx->program_lock[zink_program_cache_stages(ctx->shader_stages)]);
+         }
+      }
       update_gfx_program_optimal(ctx, ctx->curr_program);
       /* apply new hash */
       ctx->gfx_pipeline_state.final_hash ^= ctx->curr_program->last_variant_hash;
@@ -969,6 +1010,112 @@ fail:
    return NULL;
 }
 
+/* Creates a replacement, optimized zink_gfx_program for this set of separate shaders, which will
+ * be swapped in in place of the fast-linked separable program once it's done compiling.
+ */
+static void
+create_linked_separable_job(void *data, void *gdata, int thread_index)
+{
+   struct zink_gfx_program *prog = data;
+   prog->full_prog = zink_create_gfx_program(prog->ctx, prog->shaders, 0);
+   precompile_job(prog->full_prog, gdata, thread_index);
+}
+
+struct zink_gfx_program *
+create_gfx_program_separable(struct zink_context *ctx, struct zink_shader **stages, unsigned vertices_per_patch)
+{
+   struct zink_screen *screen = zink_screen(ctx->base.screen);
+   unsigned shader_stages = BITFIELD_BIT(MESA_SHADER_VERTEX) | BITFIELD_BIT(MESA_SHADER_FRAGMENT);
+   /* filter cases that need real pipelines */
+   if (ctx->shader_stages != shader_stages ||
+       !stages[MESA_SHADER_VERTEX]->precompile.mod || !stages[MESA_SHADER_FRAGMENT]->precompile.mod ||
+       /* TODO: maybe try variants? grimace */
+       !ZINK_SHADER_KEY_OPTIMAL_IS_DEFAULT(ctx->gfx_pipeline_state.optimal_key) ||
+       !zink_can_use_pipeline_libs(ctx))
+      return zink_create_gfx_program(ctx, stages, vertices_per_patch);
+   /* ensure async gpl creation is done */
+   util_queue_fence_wait(&stages[MESA_SHADER_VERTEX]->precompile.fence);
+   util_queue_fence_wait(&stages[MESA_SHADER_FRAGMENT]->precompile.fence);
+
+   struct zink_gfx_program *prog = create_program(ctx, false);
+   if (!prog)
+      goto fail;
+
+   prog->ctx = ctx;
+   prog->is_separable = true;
+   prog->shaders[MESA_SHADER_VERTEX] = stages[MESA_SHADER_VERTEX];
+   prog->stages_remaining = prog->stages_present = shader_stages;
+   prog->shaders[MESA_SHADER_FRAGMENT] = stages[MESA_SHADER_FRAGMENT];
+   prog->last_vertex_stage = stages[MESA_SHADER_VERTEX];
+
+   _mesa_set_init(&prog->libs, prog, hash_pipeline_lib, equals_pipeline_lib);
+
+   unsigned refs = 0;
+   for (int i = 0; i < ZINK_GFX_SHADER_COUNT; ++i) {
+      if (prog->shaders[i]) {
+         simple_mtx_lock(&prog->shaders[i]->lock);
+         _mesa_set_add(prog->shaders[i]->programs, prog);
+         simple_mtx_unlock(&prog->shaders[i]->lock);
+         refs++;
+      }
+   }
+   /* We can do this add after the _mesa_set_adds above because we know the prog->shaders[] are
+    * referenced by the draw state and zink_shader_free() can't be called on them while we're in here.
+    */
+   p_atomic_add(&prog->base.reference.count, refs);
+
+   for (int r = 0; r < ARRAY_SIZE(prog->pipelines); ++r) {
+      for (int i = 0; i < ARRAY_SIZE(prog->pipelines[0]); ++i) {
+         _mesa_hash_table_init(&prog->pipelines[r][i], prog, NULL, zink_get_gfx_pipeline_eq_func(screen, prog));
+         /* only need first 3/4 for point/line/tri/patch */
+         if (screen->info.have_EXT_extended_dynamic_state &&
+             i == (prog->last_vertex_stage->nir->info.stage == MESA_SHADER_TESS_EVAL ? 4 : 3))
+            break;
+      }
+   }
+
+   if (prog->shaders[MESA_SHADER_VERTEX]->precompile.dsl) {
+      prog->base.dd.binding_usage |= BITFIELD_BIT(0);
+      prog->base.dd.db_template[prog->base.num_dsl] = prog->shaders[MESA_SHADER_VERTEX]->precompile.db_template;
+      prog->base.dd.db_size[prog->base.num_dsl] = prog->shaders[MESA_SHADER_VERTEX]->precompile.db_size;
+      prog->base.dd.db_offset[prog->base.num_dsl] = prog->shaders[MESA_SHADER_VERTEX]->precompile.db_offset;
+      prog->base.dsl[prog->base.num_dsl] = prog->shaders[MESA_SHADER_VERTEX]->precompile.dsl;
+      prog->base.num_dsl++;
+   }
+   if (prog->shaders[MESA_SHADER_FRAGMENT]->precompile.dsl) {
+      prog->base.dd.binding_usage |= BITFIELD_BIT(1);
+      prog->base.dd.db_template[prog->base.num_dsl] = prog->shaders[MESA_SHADER_FRAGMENT]->precompile.db_template;
+      prog->base.dd.db_size[prog->base.num_dsl] = prog->shaders[MESA_SHADER_FRAGMENT]->precompile.db_size;
+      prog->base.dd.db_offset[prog->base.num_dsl] = prog->shaders[MESA_SHADER_FRAGMENT]->precompile.db_offset;
+      prog->base.dsl[prog->base.num_dsl] = prog->shaders[MESA_SHADER_FRAGMENT]->precompile.dsl;
+      /* guarantee a null dsl if vs doesn't have descriptors */
+      prog->base.num_dsl = 2;
+   }
+   prog->base.dd.bindless = prog->shaders[MESA_SHADER_VERTEX]->bindless | prog->shaders[MESA_SHADER_FRAGMENT]->bindless;
+   if (prog->base.dd.bindless) {
+      prog->base.num_dsl = screen->compact_descriptors ? ZINK_DESCRIPTOR_ALL_TYPES - ZINK_DESCRIPTOR_COMPACT : ZINK_DESCRIPTOR_ALL_TYPES;
+      prog->base.dsl[screen->desc_set_id[ZINK_DESCRIPTOR_BINDLESS]] = screen->bindless_layout;
+   }
+   prog->base.layout = zink_pipeline_layout_create(screen, prog->base.dsl, prog->base.num_dsl, false, VK_PIPELINE_LAYOUT_CREATE_INDEPENDENT_SETS_BIT_EXT);
+
+   VkPipeline libs[] = {stages[MESA_SHADER_VERTEX]->precompile.gpl, stages[MESA_SHADER_FRAGMENT]->precompile.gpl};
+   prog->last_variant_hash = ctx->gfx_pipeline_state.optimal_key;
+
+   struct zink_gfx_library_key *gkey = rzalloc(prog, struct zink_gfx_library_key);
+   gkey->optimal_key = prog->last_variant_hash;
+   assert(gkey->optimal_key);
+   gkey->pipeline = zink_create_gfx_pipeline_combined(screen, prog, VK_NULL_HANDLE, libs, 2, VK_NULL_HANDLE, false);
+   _mesa_set_add(&prog->libs, gkey);
+
+   util_queue_add_job(&screen->cache_get_thread, prog, &prog->base.cache_fence, create_linked_separable_job, NULL, 0);
+
+   return prog;
+
+fail:
+   if (prog)
+      zink_destroy_gfx_program(screen, prog);
+   return NULL;
+}
+
 static uint32_t
 hash_compute_pipeline_state_local_size(const void *key)
 {
@@ -1203,6 +1350,8 @@ zink_destroy_gfx_program(struct zink_screen *screen,
          max_idx++;
    }
 
+   if (prog->is_separable)
+      zink_gfx_program_reference(screen, &prog->full_prog, NULL);
    for (unsigned r = 0; r < ARRAY_SIZE(prog->pipelines); r++) {
      for (int i = 0; i < max_idx; ++i) {
         hash_table_foreach(&prog->pipelines[r][i], entry) {
@@ -1223,11 +1372,13 @@ zink_destroy_gfx_program(struct zink_screen *screen,
          _mesa_set_remove_key(prog->shaders[i]->programs, prog);
          prog->shaders[i] = NULL;
       }
-      destroy_shader_cache(screen, &prog->shader_cache[i][0][0]);
-      destroy_shader_cache(screen, &prog->shader_cache[i][0][1]);
-      destroy_shader_cache(screen, &prog->shader_cache[i][1][0]);
-      destroy_shader_cache(screen, &prog->shader_cache[i][1][1]);
-      ralloc_free(prog->nir[i]);
+      if (!prog->is_separable) {
+         destroy_shader_cache(screen, &prog->shader_cache[i][0][0]);
+         destroy_shader_cache(screen, &prog->shader_cache[i][0][1]);
+         destroy_shader_cache(screen, &prog->shader_cache[i][1][0]);
+         destroy_shader_cache(screen, &prog->shader_cache[i][1][1]);
+         ralloc_free(prog->nir[i]);
+      }
    }
 
    set_foreach_remove(&prog->libs, he) {
@@ -1761,6 +1912,20 @@ precompile_job(void *data, void *gdata, int thread_index)
    zink_screen_update_pipeline_cache(screen, &prog->base, true);
 }
 
+static void
+precompile_separate_shader_job(void *data, void *gdata, int thread_index)
+{
+   struct zink_screen *screen = gdata;
+   struct zink_shader *zs = data;
+
+   nir_shader *nir;
+   zs->precompile.mod = zink_shader_compile_separate(screen, zs, &nir);
+   zink_descriptor_shader_init(screen, zs);
+   VkShaderModule mods[ZINK_GFX_SHADER_COUNT] = {0};
+   mods[nir->info.stage] = zs->precompile.mod;
+   zs->precompile.gpl = zink_create_gfx_pipeline_separate(screen, mods, zs->precompile.layout);
+}
+
 static void
 zink_link_gfx_shader(struct pipe_context *pctx, void **shaders)
 {
@@ -1769,8 +1934,17 @@ zink_link_gfx_shader(struct pipe_context *pctx, void **shaders)
    if (shaders[MESA_SHADER_COMPUTE])
       return;
    /* can't precompile fixedfunc */
-   if (!shaders[MESA_SHADER_VERTEX] || !shaders[MESA_SHADER_FRAGMENT])
+   if (!shaders[MESA_SHADER_VERTEX] || !shaders[MESA_SHADER_FRAGMENT]) {
+      if (shaders[MESA_SHADER_VERTEX] || shaders[MESA_SHADER_FRAGMENT]) {
+         struct zink_shader *zs = shaders[MESA_SHADER_VERTEX] ? shaders[MESA_SHADER_VERTEX] : shaders[MESA_SHADER_FRAGMENT];
+         if (zs->nir->info.separate_shader && !zs->precompile.mod && util_queue_fence_is_signalled(&zs->precompile.fence) &&
+             zink_descriptor_mode == ZINK_DESCRIPTOR_MODE_DB &&
+             /* sample shading can't precompile */
+             (!shaders[MESA_SHADER_FRAGMENT] || !zs->nir->info.fs.uses_sample_shading))
+            util_queue_add_job(&zink_screen(pctx->screen)->cache_get_thread, zs, &zs->precompile.fence, precompile_separate_shader_job, NULL, 0);
+      }
       return;
+   }
    unsigned hash = 0;
    unsigned shader_stages = 0;
    for (unsigned i = 0; i < ZINK_GFX_SHADER_COUNT; i++) {


@@ -190,10 +190,12 @@ zink_get_gfx_pipeline(struct zink_context *ctx,
       /* this is the graphics pipeline library path: find/construct all partial pipelines */
       struct set_entry *he = _mesa_set_search(&prog->libs, &ctx->gfx_pipeline_state.optimal_key);
       struct zink_gfx_library_key *gkey;
-      if (he)
+      if (he) {
          gkey = (struct zink_gfx_library_key *)he->key;
-      else
+      } else {
+         assert(!prog->is_separable);
         gkey = zink_create_pipeline_lib(screen, prog, &ctx->gfx_pipeline_state);
+      }
       struct zink_gfx_input_key *ikey = DYNAMIC_STATE == ZINK_DYNAMIC_VERTEX_INPUT ?
                                         zink_find_or_create_input_dynamic(ctx, vkmode) :
                                         zink_find_or_create_input(ctx, vkmode);
@@ -215,7 +217,7 @@ zink_get_gfx_pipeline(struct zink_context *ctx,
       zink_screen_update_pipeline_cache(screen, &prog->base, false);
       pc_entry->pipeline = pipeline;
-      if (HAVE_LIB)
+      if (HAVE_LIB && !prog->is_separable)
         /* trigger async optimized pipeline compile if this was the fast-linked unoptimized pipeline */
         zink_gfx_program_compile_queue(ctx, pc_entry);
    }


@@ -188,6 +188,8 @@ zink_is_parallel_shader_compilation_finished(struct pipe_screen *screen, void *s
    }
 
    struct zink_shader *zs = shader;
+   if (!util_queue_fence_is_signalled(&zs->precompile.fence))
+      return false;
    bool finished = true;
    set_foreach(zs->programs, entry) {
       struct zink_gfx_program *prog = (void*)entry->key;


@@ -732,6 +732,19 @@ struct zink_shader {
    bool has_uniforms;
    struct spirv_shader *spirv;
 
+   struct {
+      struct util_queue_fence fence;
+      VkShaderModule mod;
+      VkDescriptorSetLayout dsl;
+      VkPipelineLayout layout;
+      VkPipeline gpl;
+      VkDescriptorSetLayoutBinding *bindings;
+      unsigned num_bindings;
+      struct zink_descriptor_template *db_template;
+      unsigned db_size;
+      unsigned *db_offset;
+   } precompile;
+
    simple_mtx_t lock;
    struct set *programs;
@@ -973,26 +986,30 @@ struct zink_gfx_pipeline_cache_entry {
 struct zink_gfx_program {
    struct zink_program base;
 
+   bool is_separable; //not a full program
+
    struct zink_context *ctx; //the owner context
    uint32_t stages_present; //mask of stages present in this program
    uint32_t stages_remaining; //mask of zink_shader remaining in this program
-   struct nir_shader *nir[ZINK_GFX_SHADER_COUNT];
-   VkShaderModule modules[ZINK_GFX_SHADER_COUNT]; // compute stage doesn't belong here
-   uint32_t module_hash[ZINK_GFX_SHADER_COUNT];
-   struct zink_shader *last_vertex_stage;
-   struct util_dynarray shader_cache[ZINK_GFX_SHADER_COUNT][2][2]; //normal, nonseamless cubes, inline uniforms
-   unsigned inlined_variant_count[ZINK_GFX_SHADER_COUNT];
 
    struct zink_shader *shaders[ZINK_GFX_SHADER_COUNT];
-   struct hash_table pipelines[2][11]; // [dynamic, renderpass][number of draw modes we support]
+   struct zink_shader *last_vertex_stage;
+
+   /* full */
+   VkShaderModule modules[ZINK_GFX_SHADER_COUNT]; // compute stage doesn't belong here
+   uint32_t module_hash[ZINK_GFX_SHADER_COUNT];
+   struct nir_shader *nir[ZINK_GFX_SHADER_COUNT];
+   struct util_dynarray shader_cache[ZINK_GFX_SHADER_COUNT][2][2]; //normal, nonseamless cubes, inline uniforms
+   unsigned inlined_variant_count[ZINK_GFX_SHADER_COUNT];
    uint32_t default_variant_hash;
-   uint32_t last_variant_hash;
    uint8_t inline_variants; //which stages are using inlined uniforms
+
+   /* separable */
+   struct zink_gfx_program *full_prog;
+
+   struct hash_table pipelines[2][11]; // [dynamic, renderpass][number of draw modes we support]
+   uint32_t last_variant_hash;
 
    uint32_t last_finalized_hash[2][4]; //[dynamic, renderpass][primtype idx]
    VkPipeline last_pipeline[2][4]; //[dynamic, renderpass][primtype idx]