zink: implement async gfx precompile

The pipe_context::link_shader hook is called when shaders are linked into a program by the application. By leveraging this, it becomes possible to utilize the existing graphics pipeline library to implement precompilation by creating a partial pipeline containing only the shader stages and then adding in the vertex input and fragment output stages dynamically using the fast-link feature. If all goes well, and if the vulkan driver's fast-linking is truly fast, the full pipeline should be dynamically combined in time to avoid stuttering, and an optimized variant will be queued for async compile to be used the next time the pipeline triggers a draw. Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/18961>
2025-12-23 15:30:14 +01:00 · 2022-09-22 16:42:19 -04:00 · 2022-09-22 16:42:19 -04:00 · 41ffb15de5
commit 41ffb15de5
parent aed4e716d0
6 changed files with 106 additions and 6 deletions
--- a/src/gallium/drivers/zink/zink_compiler.c
+++ b/src/gallium/drivers/zink/zink_compiler.c
@ -3387,6 +3387,7 @@ zink_shader_free(struct zink_screen *screen, struct zink_shader *shader)
         _mesa_hash_table_remove(ht, he);
         prog->base.removed = true;
         simple_mtx_unlock(&prog->ctx->program_lock[idx]);
         util_queue_fence_wait(&prog->base.cache_fence);
      }
      if (stage != MESA_SHADER_TESS_CTRL || !shader->is_generated) {
         prog->shaders[stage] = NULL;
--- a/src/gallium/drivers/zink/zink_program.c
+++ b/src/gallium/drivers/zink/zink_program.c
@ -607,6 +607,8 @@ ALWAYS_INLINE static void
 update_gfx_shader_module_optimal(struct zink_context *ctx, struct zink_gfx_program *prog, gl_shader_stage pstage)
 {
   struct zink_screen *screen = zink_screen(ctx->base.screen);
   if (screen->info.have_EXT_graphics_pipeline_library)
      util_queue_fence_wait(&prog->base.cache_fence);
   struct zink_shader_module *zm = get_shader_module_for_stage_optimal(ctx, screen, prog->shaders[pstage], prog, pstage, &ctx->gfx_pipeline_state);
   if (!zm)
      zm = create_shader_module_for_stage_optimal(ctx, screen, prog->shaders[pstage], prog, pstage, &ctx->gfx_pipeline_state);
@ -674,6 +676,28 @@ zink_gfx_program_update_optimal(struct zink_context *ctx)
   ctx->last_vertex_stage_dirty = false;
 }
 static void
 optimized_compile_job(void *data, void *gdata, int thread_index)
 {
   struct zink_gfx_pipeline_cache_entry *pc_entry = data;
   struct zink_screen *screen = gdata;
   VkPipeline pipeline;
   if (pc_entry->gkey)
      pipeline = zink_create_gfx_pipeline_combined(screen, pc_entry->prog, pc_entry->ikey->pipeline, pc_entry->gkey->pipeline, pc_entry->okey->pipeline, false);
   else
      pipeline = zink_create_gfx_pipeline(screen, pc_entry->prog, &pc_entry->state, pc_entry->state.element_state->binding_map, zink_primitive_topology(pc_entry->state.gfx_prim_mode), true);
   if (pipeline) {
      pc_entry->unoptimized_pipeline = pc_entry->pipeline;
      pc_entry->pipeline = pipeline;
   }
 }
 void
 zink_gfx_program_compile_queue(struct zink_context *ctx, struct zink_gfx_pipeline_cache_entry *pc_entry)
 {
   util_queue_add_job(&zink_screen(ctx->base.screen)->cache_get_thread, pc_entry, &pc_entry->fence, optimized_compile_job, NULL, 0);
 }
 static void
 update_cs_shader_module(struct zink_context *ctx, struct zink_compute_program *comp)
 {
@ -1149,7 +1173,9 @@ zink_destroy_gfx_program(struct zink_screen *screen,
         hash_table_foreach(&prog->pipelines[r][i], entry) {
            struct zink_gfx_pipeline_cache_entry *pc_entry = entry->data;
            util_queue_fence_wait(&pc_entry->fence);
            VKSCR(DestroyPipeline)(screen->dev, pc_entry->pipeline, NULL);
            VKSCR(DestroyPipeline)(screen->dev, pc_entry->unoptimized_pipeline, NULL);
            free(pc_entry);
         }
      }
@ -1540,6 +1566,59 @@ zink_create_pipeline_lib(struct zink_screen *screen, struct zink_gfx_program *pr
   return gkey;
 }
 static void
 precompile_job(void *data, void *gdata, int thread_index)
 {
   struct zink_screen *screen = gdata;
   struct zink_gfx_program *prog = data;
   struct zink_gfx_pipeline_state state = {0};
   state.shader_keys_optimal.key.vs_base.last_vertex_stage = true;
   generate_gfx_program_modules_optimal(NULL, screen, prog, &state);
   zink_screen_get_pipeline_cache(screen, &prog->base, true);
   zink_create_pipeline_lib(screen, prog, &state);
   zink_screen_update_pipeline_cache(screen, &prog->base, true);
 }
 /* pipe_context::link_shader hook: create a gfx program for the linked shader
  * set and queue its precompile on the screen's cache_get_thread.
  *
  * shaders is indexed by shader stage. Nothing is done when:
  *  - a compute shader is present,
  *  - the vertex or fragment shader is missing,
  *  - a tessellation stage is present without a tess eval shader,
  *  - a program for the same shader set is already in the program cache,
  *  - program creation fails (precompile is skipped in that case).
  */
 static void
 zink_link_gfx_shader(struct pipe_context *pctx, void **shaders)
 {
    struct zink_context *ctx = zink_context(pctx);
    struct zink_shader **zshaders = (struct zink_shader **)shaders;
    if (shaders[MESA_SHADER_COMPUTE])
       return;
    /* can't precompile fixedfunc */
    if (!shaders[MESA_SHADER_VERTEX] || !shaders[MESA_SHADER_FRAGMENT])
       return;
    /* the cache hash is the xor of every present shader's hash */
    unsigned hash = 0;
    unsigned shader_stages = 0;
    for (unsigned i = 0; i < ZINK_GFX_SHADER_COUNT; i++) {
       if (zshaders[i]) {
          hash ^= zshaders[i]->hash;
          shader_stages |= BITFIELD_BIT(i);
       }
    }
    unsigned tess_stages = BITFIELD_BIT(MESA_SHADER_TESS_CTRL) | BITFIELD_BIT(MESA_SHADER_TESS_EVAL);
    unsigned tess = shader_stages & tess_stages;
    /* can't do fixedfunc tes either */
    if (tess && !shaders[MESA_SHADER_TESS_EVAL])
       return;
    /* the same index selects both the cache table and the lock guarding it */
    const unsigned cache_idx = zink_program_cache_stages(shader_stages);
    struct hash_table *ht = &ctx->program_cache[cache_idx];
    simple_mtx_lock(&ctx->program_lock[cache_idx]);
    /* link can be called repeatedly with the same shaders: ignore */
    if (_mesa_hash_table_search_pre_hashed(ht, hash, shaders)) {
       simple_mtx_unlock(&ctx->program_lock[cache_idx]);
       return;
    }
    /* NOTE(review): the literal 3 is presumably the patch vertex count - confirm */
    struct zink_gfx_program *prog = zink_create_gfx_program(ctx, zshaders, 3);
    if (!prog) {
       /* nothing to insert or precompile: release the lock instead of dereferencing NULL */
       simple_mtx_unlock(&ctx->program_lock[cache_idx]);
       return;
    }
    u_foreach_bit(i, shader_stages)
       assert(prog->shaders[i]);
    _mesa_hash_table_insert_pre_hashed(ht, hash, prog->shaders, prog);
    simple_mtx_unlock(&ctx->program_lock[cache_idx]);
    util_queue_add_job(&zink_screen(pctx->screen)->cache_get_thread, prog, &prog->base.cache_fence, precompile_job, NULL, 0);
 }
 void
 zink_program_init(struct zink_context *ctx)
 {
@ -1585,6 +1664,9 @@ zink_program_init(struct zink_context *ctx)
                 offsetof(struct zink_gfx_input_key, element_state) - offsetof(struct zink_gfx_input_key, input));
   STATIC_ASSERT(sizeof(union zink_shader_key_optimal) == sizeof(uint32_t));
   if (zink_screen(ctx->base.screen)->info.have_EXT_graphics_pipeline_library)
      ctx->base.link_shader = zink_link_gfx_shader;
 }
 bool
--- a/src/gallium/drivers/zink/zink_program.h
+++ b/src/gallium/drivers/zink/zink_program.h
@ -126,6 +126,8 @@ uint32_t hash_gfx_output_ds3(const void *key);
 uint32_t hash_gfx_input(const void *key);
 uint32_t hash_gfx_input_dynamic(const void *key);
 /* queue an asynchronous compile job for pc_entry on the screen's cache_get_thread;
  * completion of the job is tracked by pc_entry->fence
  */
 void
 zink_gfx_program_compile_queue(struct zink_context *ctx, struct zink_gfx_pipeline_cache_entry *pc_entry);
 static inline unsigned
 get_primtype_idx(enum pipe_prim_type mode)
--- a/src/gallium/drivers/zink/zink_program_state.hpp
+++ b/src/gallium/drivers/zink/zink_program_state.hpp
@ -221,6 +221,8 @@ zink_get_gfx_pipeline(struct zink_context *ctx,
      if (!pc_entry)
         return VK_NULL_HANDLE;
      memcpy(&pc_entry->state, state, sizeof(*state));
      pc_entry->prog = prog;
      util_queue_fence_init(&pc_entry->fence);
      entry = _mesa_hash_table_insert_pre_hashed(&prog->pipelines[rp_idx][idx], state->final_hash, pc_entry, pc_entry);
      if (HAVE_LIB &&
          /* TODO: if there's ever a dynamic render extension with input attachments */
@ -247,13 +249,16 @@ zink_get_gfx_pipeline(struct zink_context *ctx,
         pc_entry->okey = okey;
         pipeline = zink_create_gfx_pipeline_combined(screen, prog, ikey->pipeline, gkey->pipeline, okey->pipeline, true);
      } else {
-         pipeline = zink_create_gfx_pipeline(screen, prog, state, state->element_state->binding_map, vkmode, true);
+         /* optimize by default only when expecting precompiles in order to reduce stuttering */
         pipeline = zink_create_gfx_pipeline(screen, prog, state, state->element_state->binding_map, vkmode, !HAVE_LIB);
      }
      if (pipeline == VK_NULL_HANDLE)
         return VK_NULL_HANDLE;
      zink_screen_update_pipeline_cache(screen, &prog->base, false);
      pc_entry->pipeline = pipeline;
      if (HAVE_LIB)
         zink_gfx_program_compile_queue(ctx, pc_entry);
   }
   struct zink_gfx_pipeline_cache_entry *cache_entry = (struct zink_gfx_pipeline_cache_entry *)entry->data;
--- a/src/gallium/drivers/zink/zink_screen.c
+++ b/src/gallium/drivers/zink/zink_screen.c
@ -176,11 +176,18 @@ zink_set_max_shader_compiler_threads(struct pipe_screen *pscreen, unsigned max_t
 /* Report whether background compilation for `shader` has finished.
  *
  * For MESA_SHADER_COMPUTE, `shader` is read as a zink_program: the result is
  * true when can_precompile is unset or its cache_fence is signalled.
  * For every other stage, `shader` is read as a zink_shader: the result is
  * true only if the cache_fence of every program in zs->programs is signalled
  * (and therefore true when the set is empty).
  */
 static bool
 zink_is_parallel_shader_compilation_finished(struct pipe_screen *screen, void *shader, enum pipe_shader_type shader_type)
 {
-   /* not supported yet */
+   if (shader_type == MESA_SHADER_COMPUTE) {
-   if (shader_type != MESA_SHADER_COMPUTE)
+      struct zink_program *pg = shader;
-      return true;
+      return !pg->can_precompile || util_queue_fence_is_signalled(&pg->cache_fence);
-   struct zink_program *pg = shader;
+   }
-   return !pg->can_precompile || util_queue_fence_is_signalled(&pg->cache_fence);
+
    /* gfx: a shader may belong to several programs; all of them must be done */
    struct zink_shader *zs = shader;
    bool finished = true;
    set_foreach(zs->programs, entry) {
       struct zink_gfx_program *prog = (void*)entry->key;
       finished &= util_queue_fence_is_signalled(&prog->base.cache_fence);
    }
    return finished;
 }
 static VkDeviceSize
--- a/src/gallium/drivers/zink/zink_types.h
+++ b/src/gallium/drivers/zink/zink_types.h
@ -830,9 +830,12 @@ struct zink_gfx_pipeline_cache_entry {
   struct zink_gfx_pipeline_state state;
   VkPipeline pipeline;
   /* GPL only */
   struct util_queue_fence fence;
   struct zink_gfx_input_key *ikey;
   struct zink_gfx_library_key *gkey;
   struct zink_gfx_output_key *okey;
   struct zink_gfx_program *prog;
   VkPipeline unoptimized_pipeline;
 };
 struct zink_gfx_program {