From f2ecf95fa7665819c95fbd3121a7a90d1e2ec602 Mon Sep 17 00:00:00 2001 From: antonino Date: Fri, 28 Jul 2023 11:11:15 +0200 Subject: [PATCH] zink: uber shaders logic Introduce the logic to implement uber shaders. The way variants is handled changes significantly: a uber program is expected to be compiling asynchronously and is used whenever possible. Specialized variant shaders are compiled asynchronously, though they might be compiled synchronously if the uber program can't be used. Each variant is a separate program as that simplifies gpl/obj caching. A new key is introduced, the st_key, that keeps track of the state of the features emulated by the uber shader. This is split in a dynamic part, always sent through push constants, and a more compact part that is used as a key for caching optimized variants. --- src/gallium/drivers/zink/zink_compiler.c | 54 +- src/gallium/drivers/zink/zink_compiler.h | 6 +- src/gallium/drivers/zink/zink_context.c | 4 +- src/gallium/drivers/zink/zink_draw.cpp | 13 +- src/gallium/drivers/zink/zink_pipeline.c | 4 +- src/gallium/drivers/zink/zink_pipeline.h | 2 +- src/gallium/drivers/zink/zink_program.c | 727 +++++++++++++----- src/gallium/drivers/zink/zink_program.h | 33 +- .../drivers/zink/zink_program_state.hpp | 99 ++- src/gallium/drivers/zink/zink_shader_keys.h | 35 + src/gallium/drivers/zink/zink_state.c | 15 + src/gallium/drivers/zink/zink_types.h | 28 +- 12 files changed, 766 insertions(+), 254 deletions(-) diff --git a/src/gallium/drivers/zink/zink_compiler.c b/src/gallium/drivers/zink/zink_compiler.c index 50414a51037..75a16f41545 100644 --- a/src/gallium/drivers/zink/zink_compiler.c +++ b/src/gallium/drivers/zink/zink_compiler.c @@ -3879,9 +3879,25 @@ remove_interpolate_at_sample(struct nir_builder *b, nir_intrinsic_instr *interp, return true; } +static void +zink_optimized_st_emulation_passes(nir_shader *nir, struct zink_shader *zs, + const struct zink_st_variant_key *key) +{ + if (!nir->info.io_lowered) + return; +} + 
+static void +zink_emulation_passes(nir_shader *nir, struct zink_shader *zs) +{ + if (!nir->info.io_lowered) + return; +} + struct zink_shader_object -zink_shader_compile(struct zink_screen *screen, bool can_shobj, struct zink_shader *zs, - nir_shader *nir, const struct zink_shader_key *key, const void *extra_data, struct zink_program *pg) +zink_shader_compile(struct zink_screen *screen, bool can_shobj, struct zink_shader *zs, nir_shader *nir, + const struct zink_shader_key *key, const struct zink_st_variant_key *st_key, + bool compile_uber, const void *extra_data, struct zink_program *pg) { bool need_optimize = true; bool inlined_uniforms = false; @@ -3891,8 +3907,18 @@ zink_shader_compile(struct zink_screen *screen, bool can_shobj, struct zink_shad NIR_PASS(_, nir, nir_lower_sample_shading); } + if (compile_uber) { + zink_emulation_passes(nir, zs); + need_optimize = true; + } + else if (st_key) { + zink_optimized_st_emulation_passes(nir, zs, st_key); + need_optimize = true; + } + NIR_PASS(_, nir, add_derefs); NIR_PASS(_, nir, nir_lower_fragcolor, nir->info.fs.color_is_dual_source ? 1 : 8); + if (key) { if (key->inline_uniforms) { NIR_PASS(_, nir, nir_inline_uniforms, @@ -4077,7 +4103,7 @@ zink_shader_compile(struct zink_screen *screen, bool can_shobj, struct zink_shad } struct zink_shader_object -zink_shader_compile_separate(struct zink_screen *screen, struct zink_shader *zs) +zink_shader_compile_separate(struct zink_screen *screen, struct zink_shader *zs, bool compile_uber) { nir_shader *nir = zs->nir; /* TODO: maybe compile multiple variants for different set counts for compact mode? */ @@ -4107,6 +4133,10 @@ zink_shader_compile_separate(struct zink_screen *screen, struct zink_shader *zs) default: break; } } + + if (compile_uber) + zink_emulation_passes(nir, zs); + NIR_PASS(_, nir, add_derefs); NIR_PASS(_, nir, nir_lower_fragcolor, nir->info.fs.color_is_dual_source ? 
1 : 8); if (screen->driconf.inline_uniforms) { @@ -4114,6 +4144,7 @@ zink_shader_compile_separate(struct zink_screen *screen, struct zink_shader *zs) NIR_PASS(_, nir, rewrite_bo_access, screen); NIR_PASS(_, nir, remove_bo_access, zs); } + optimize_nir(nir, zs, true); zink_descriptor_shader_init(screen, zs); nir_shader *nir_clone = NULL; @@ -4128,7 +4159,7 @@ zink_shader_compile_separate(struct zink_screen *screen, struct zink_shader *zs) zs->non_fs.generated_tcs = zink_shader_tcs_create(screen, 32); zink_shader_tcs_init(screen, zs->non_fs.generated_tcs, nir_clone, &nir_tcs); nir_tcs->info.separate_shader = true; - zs->non_fs.generated_tcs->precompile.obj = zink_shader_compile_separate(screen, zs->non_fs.generated_tcs); + zs->non_fs.generated_tcs->precompile.obj = zink_shader_compile_separate(screen, zs->non_fs.generated_tcs, compile_uber); ralloc_free(nir_tcs); zs->non_fs.generated_tcs->nir = NULL; } @@ -6448,11 +6479,13 @@ gfx_shader_prune(struct zink_screen *screen, struct zink_shader *shader) prog->base.removed = true; simple_mtx_unlock(lock); - for (int i = 0; i < ARRAY_SIZE(prog->pipelines); ++i) { - hash_table_foreach(&prog->pipelines[i], table_entry) { - struct zink_gfx_pipeline_cache_entry *pc_entry = table_entry->data; + for (int r = 0; r < ARRAY_SIZE(prog->pipelines); ++r) { + for (int i = 0; i < ARRAY_SIZE(prog->pipelines[0]); ++i) { + hash_table_foreach(&prog->pipelines[r][i], table_entry) { + struct zink_gfx_pipeline_cache_entry *pc_entry = table_entry->data; - util_queue_fence_wait(&pc_entry->fence); + util_queue_fence_wait(&pc_entry->fence); + } } } } @@ -6468,7 +6501,10 @@ gfx_shader_prune(struct zink_screen *screen, struct zink_shader *shader) prog->shaders[MESA_SHADER_GEOMETRY]->non_fs.parent == shader) { prog->shaders[MESA_SHADER_GEOMETRY] = NULL; } - zink_gfx_program_reference(screen, &prog, NULL); + + /* variant programs are owned and destroyed by their parent */ + if (!prog->is_variant_program) + zink_gfx_program_reference(screen, &prog, 
NULL); return true; } diff --git a/src/gallium/drivers/zink/zink_compiler.h b/src/gallium/drivers/zink/zink_compiler.h index bec8fae913e..0c33a493b38 100644 --- a/src/gallium/drivers/zink/zink_compiler.h +++ b/src/gallium/drivers/zink/zink_compiler.h @@ -63,9 +63,11 @@ void zink_compiler_assign_io(struct zink_screen *screen, nir_shader *producer, nir_shader *consumer); /* pass very large shader key data with extra_data */ struct zink_shader_object -zink_shader_compile(struct zink_screen *screen, bool can_shobj, struct zink_shader *zs, nir_shader *nir, const struct zink_shader_key *key, const void *extra_data, struct zink_program *pg); +zink_shader_compile(struct zink_screen *screen, bool can_shobj, struct zink_shader *zs, nir_shader *nir, + const struct zink_shader_key *key, const struct zink_st_variant_key *st_key, + bool compile_uber, const void *extra_data, struct zink_program *pg); struct zink_shader_object -zink_shader_compile_separate(struct zink_screen *screen, struct zink_shader *zs); +zink_shader_compile_separate(struct zink_screen *screen, struct zink_shader *zs, bool compile_uber); struct zink_shader * zink_shader_create(struct zink_screen *screen, struct nir_shader *nir); void diff --git a/src/gallium/drivers/zink/zink_context.c b/src/gallium/drivers/zink/zink_context.c index 889699d80f2..413d46de30a 100644 --- a/src/gallium/drivers/zink/zink_context.c +++ b/src/gallium/drivers/zink/zink_context.c @@ -3721,8 +3721,8 @@ zink_update_descriptor_refs(struct zink_context *ctx, bool compute) res->obj->unordered_read = false; } } - if (ctx->curr_program) - zink_batch_reference_program(ctx, &ctx->curr_program->base); + if (ctx->curr_program_uber || ctx->curr_program) + zink_batch_reference_program(ctx, &ctx->curr_program_uber->base); } if (ctx->di.bindless_refs_dirty) { ctx->di.bindless_refs_dirty = false; diff --git a/src/gallium/drivers/zink/zink_draw.cpp b/src/gallium/drivers/zink/zink_draw.cpp index 7476d19edbb..a28024f4d67 100644 --- 
a/src/gallium/drivers/zink/zink_draw.cpp +++ b/src/gallium/drivers/zink/zink_draw.cpp @@ -265,11 +265,11 @@ update_gfx_pipeline(struct zink_context *ctx, struct zink_batch_state *bs, enum zink_gfx_program_update(ctx); bool pipeline_changed = false; VkPipeline pipeline = VK_NULL_HANDLE; - if (!ctx->curr_program->base.uses_shobj) { + if (!(ctx->gfx_pipeline_state.uber_required ? ctx->curr_program_uber : ctx->curr_program)->base.uses_shobj) { if (screen->info.have_EXT_graphics_pipeline_library) - pipeline = zink_get_gfx_pipeline(ctx, ctx->curr_program, &ctx->gfx_pipeline_state, mode); + pipeline = zink_get_gfx_pipeline(ctx, ctx->curr_program_uber, ctx->curr_program, &ctx->gfx_pipeline_state, mode); else - pipeline = zink_get_gfx_pipeline(ctx, ctx->curr_program, &ctx->gfx_pipeline_state, mode); + pipeline = zink_get_gfx_pipeline(ctx, ctx->curr_program, ctx->curr_program, &ctx->gfx_pipeline_state, mode); assert(pipeline); pipeline_changed = prev_pipeline != pipeline || ctx->shobj_draw; if (BATCH_CHANGED || pipeline_changed) @@ -285,7 +285,8 @@ update_gfx_pipeline(struct zink_context *ctx, struct zink_batch_state *bs, enum VK_SHADER_STAGE_FRAGMENT_BIT, }; /* always rebind all stages */ - VKCTX(CmdBindShadersEXT)(bs->cmdbuf, ZINK_GFX_SHADER_COUNT, stages, ctx->curr_program->objects); + VKCTX(CmdBindShadersEXT)(bs->cmdbuf, ZINK_GFX_SHADER_COUNT, stages, + ctx->gfx_pipeline_state.uber_required ? 
ctx->curr_program_uber->objects : ctx->curr_program->objects); if (screen->info.have_EXT_mesh_shader) { /* must always unbind mesh stages */ VkShaderStageFlagBits mesh_stages[] = { @@ -994,9 +995,9 @@ update_mesh_pipeline(struct zink_context *ctx, struct zink_batch_state *bs) VkPipeline pipeline = VK_NULL_HANDLE; if (!ctx->mesh_program->base.uses_shobj) { if (screen->info.have_EXT_graphics_pipeline_library) - pipeline = zink_get_gfx_pipeline(ctx, ctx->mesh_program, &ctx->gfx_pipeline_state, MESA_PRIM_COUNT); + pipeline = zink_get_gfx_pipeline(ctx, ctx->mesh_program, ctx->mesh_program, &ctx->gfx_pipeline_state, MESA_PRIM_COUNT); else - pipeline = zink_get_gfx_pipeline(ctx, ctx->mesh_program, &ctx->gfx_pipeline_state, MESA_PRIM_COUNT); + pipeline = zink_get_gfx_pipeline(ctx, ctx->mesh_program, ctx->mesh_program, &ctx->gfx_pipeline_state, MESA_PRIM_COUNT); assert(pipeline); pipeline_changed = prev_pipeline != pipeline || ctx->shobj_draw; if (BATCH_CHANGED || pipeline_changed) diff --git a/src/gallium/drivers/zink/zink_pipeline.c b/src/gallium/drivers/zink/zink_pipeline.c index a5363a98481..1114bf1186e 100644 --- a/src/gallium/drivers/zink/zink_pipeline.c +++ b/src/gallium/drivers/zink/zink_pipeline.c @@ -884,10 +884,10 @@ create_gfx_pipeline_library(struct zink_screen *screen, struct zink_shader_objec } VkPipeline -zink_create_gfx_pipeline_library(struct zink_screen *screen, struct zink_gfx_program *prog) +zink_create_gfx_pipeline_library(struct zink_screen *screen, struct zink_shader_object *objs, struct zink_gfx_program *prog) { u_rwlock_wrlock(&prog->base.pipeline_cache_lock); - VkPipeline pipeline = create_gfx_pipeline_library(screen, prog->objs, prog->stages_present, prog->base.layout, prog->base.pipeline_cache); + VkPipeline pipeline = create_gfx_pipeline_library(screen, objs, prog->stages_present, prog->base.layout, prog->base.pipeline_cache); u_rwlock_wrunlock(&prog->base.pipeline_cache_lock); return pipeline; } diff --git 
a/src/gallium/drivers/zink/zink_pipeline.h b/src/gallium/drivers/zink/zink_pipeline.h index 7b050f15efb..aa66cf0dc21 100644 --- a/src/gallium/drivers/zink/zink_pipeline.h +++ b/src/gallium/drivers/zink/zink_pipeline.h @@ -58,7 +58,7 @@ zink_create_gfx_pipeline_input(struct zink_screen *screen, const uint8_t *binding_map, VkPrimitiveTopology primitive_topology); VkPipeline -zink_create_gfx_pipeline_library(struct zink_screen *screen, struct zink_gfx_program *prog); +zink_create_gfx_pipeline_library(struct zink_screen *screen, struct zink_shader_object *objs, struct zink_gfx_program *prog); VkPipeline zink_create_gfx_pipeline_output(struct zink_screen *screen, struct zink_gfx_pipeline_state *state); VkPipeline diff --git a/src/gallium/drivers/zink/zink_program.c b/src/gallium/drivers/zink/zink_program.c index 00b1ee702a5..e63f4735a73 100644 --- a/src/gallium/drivers/zink/zink_program.c +++ b/src/gallium/drivers/zink/zink_program.c @@ -41,6 +41,7 @@ #include "nir_serialize.h" #include "nir.h" #include "nir/nir_draw_helpers.h" +#include "util/u_queue.h" /* for pipeline cache */ #define XXH_INLINE_ALL @@ -48,9 +49,34 @@ static void gfx_program_precompile_job(void *data, void *gdata, int thread_index); +static void +precompile_variant_job(void *data, void *gdata, int thread_index); +static void +precompile_variant_separate_shader_job(void *data, void *gdata, int thread_index); struct zink_gfx_program * create_gfx_program_separable(struct zink_context *ctx, struct zink_shader **stages, unsigned vertices_per_patch, bool is_mesh); +struct precompile_variant_data { + struct zink_gfx_program *prog; + struct zink_gfx_pipeline_state state; +}; + +struct precompile_separate_variant_data { + struct zink_program *prog; + struct zink_shader_module *zm; + struct zink_shader *zs; + struct blob *blob; + bool uses_shobj; + struct zink_shader_key key; + struct zink_st_variant_key st_key; + bool has_key; +}; + +struct program_variant_key { + uint32_t key, st_key; + struct 
zink_gfx_program *prog; +}; + void debug_describe_zink_gfx_program(char *buf, const struct zink_gfx_program *ptr) { @@ -69,9 +95,9 @@ shader_key_matches_tcs_nongenerated(const struct zink_shader_module *zm, const s if (zm->num_uniforms != num_uniforms || zm->has_nonseamless != !!key->base.nonseamless_cube_mask || zm->needs_zs_shader_swizzle != key->base.needs_zs_shader_swizzle) return false; - const uint32_t nonseamless_size = zm->has_nonseamless ? sizeof(uint32_t) : 0; - return (!nonseamless_size || !memcmp(zm->key + zm->key_size, &key->base.nonseamless_cube_mask, nonseamless_size)) && - (!num_uniforms || !memcmp(zm->key + zm->key_size + nonseamless_size, + const uint32_t nonseamless_size = zm->has_nonseamless ? sizeof(union zink_st_small_key) : 0; + return (!nonseamless_size || !memcmp(zm->key + zm->key_size + sizeof(union zink_st_small_key), &key->base.nonseamless_cube_mask, nonseamless_size)) && + (!num_uniforms || !memcmp(zm->key + zm->key_size + sizeof(union zink_st_small_key) + nonseamless_size, key->base.inlined_uniform_values, zm->num_uniforms * sizeof(uint32_t))); } @@ -84,13 +110,13 @@ shader_key_matches(const struct zink_shader_module *zm, if (has_inline) { if (zm->num_uniforms != num_uniforms || (num_uniforms && - memcmp(zm->key + zm->key_size + nonseamless_size, + memcmp(zm->key + zm->key_size + sizeof(union zink_st_small_key) + nonseamless_size, key->base.inlined_uniform_values, zm->num_uniforms * sizeof(uint32_t)))) return false; } if (!has_nonseamless) { if (zm->has_nonseamless != !!key->base.nonseamless_cube_mask || - (nonseamless_size && memcmp(zm->key + zm->key_size, &key->base.nonseamless_cube_mask, nonseamless_size))) + (nonseamless_size && memcmp(zm->key + zm->key_size + sizeof(union zink_st_small_key), &key->base.nonseamless_cube_mask, nonseamless_size))) return false; } if (zm->needs_zs_shader_swizzle != key->base.needs_zs_shader_swizzle) @@ -142,18 +168,19 @@ create_shader_module_for_stage(struct zink_context *ctx, struct zink_screen *scr 
const bool is_nongenerated_tcs = stage == MESA_SHADER_TESS_CTRL && !zs->non_fs.is_generated; const bool shadow_needs_shader_swizzle = key->base.needs_zs_shader_swizzle || (stage == MESA_SHADER_FRAGMENT && key->key.fs.base.shadow_needs_shader_swizzle); - zm = malloc(sizeof(struct zink_shader_module) + key->size + + zm = malloc(sizeof(struct zink_shader_module) + sizeof(union zink_st_small_key) + key->size + (!has_nonseamless ? nonseamless_size : 0) + inline_size * sizeof(uint32_t) + (shadow_needs_shader_swizzle ? sizeof(struct zink_zs_swizzle_key) : 0)); if (!zm) { return NULL; } + util_queue_fence_init(&zm->fence); unsigned patch_vertices = state->shader_keys.key[MESA_SHADER_TESS_CTRL].key.tcs.patch_vertices; if (stage == MESA_SHADER_TESS_CTRL && zs->non_fs.is_generated && zs->spirv) { assert(ctx); //TODO async zm->obj = zink_shader_tcs_compile(screen, zs, patch_vertices, prog->base.uses_shobj, &prog->base); } else { - zm->obj = zink_shader_compile(screen, prog->base.uses_shobj, zs, zink_shader_blob_deserialize(screen, &prog->blobs[stage]), key, &ctx->di.zs_swizzle[stage], &prog->base); + zm->obj = zink_shader_compile(screen, prog->base.uses_shobj, zs, zink_shader_blob_deserialize(screen, &prog->blobs[stage]), key, &state->st_key, false, &ctx->di.zs_swizzle[stage], &prog->base); } if (!zm->obj.mod) { FREE(zm); @@ -168,20 +195,22 @@ create_shader_module_for_stage(struct zink_context *ctx, struct zink_screen *scr zm->key_size = 0; memset(zm->key, 0, key->size); } + uint16_t st_val = state->st_key.small_key.val; + memcpy(zm->key + key->size, &st_val, sizeof(st_val)); if (!has_nonseamless && nonseamless_size) { /* nonseamless mask gets added to base key if it exists */ - memcpy(zm->key + key->size, &key->base.nonseamless_cube_mask, nonseamless_size); + memcpy(zm->key + key->size + sizeof(st_val), &key->base.nonseamless_cube_mask, nonseamless_size); } zm->needs_zs_shader_swizzle = shadow_needs_shader_swizzle; zm->has_nonseamless = has_nonseamless ? 
0 : !!nonseamless_size; if (inline_size) - memcpy(zm->key + key->size + nonseamless_size, key->base.inlined_uniform_values, inline_size * sizeof(uint32_t)); + memcpy(zm->key + key->size + sizeof(st_val) + nonseamless_size, key->base.inlined_uniform_values, inline_size * sizeof(uint32_t)); if (stage == MESA_SHADER_TESS_CTRL && zs->non_fs.is_generated) zm->hash = patch_vertices; else zm->hash = shader_module_hash(zm); if (unlikely(shadow_needs_shader_swizzle)) { - memcpy(zm->key + key->size + nonseamless_size + inline_size * sizeof(uint32_t), &ctx->di.zs_swizzle[stage], sizeof(struct zink_zs_swizzle_key)); + memcpy(zm->key + key->size + sizeof(st_val) + nonseamless_size + inline_size * sizeof(uint32_t), &ctx->di.zs_swizzle[stage], sizeof(struct zink_zs_swizzle_key)); zm->hash ^= _mesa_hash_data(&ctx->di.zs_swizzle[stage], sizeof(struct zink_zs_swizzle_key)); } zm->default_variant = !shadow_needs_shader_swizzle && !inline_size && !util_dynarray_contains(&prog->shader_cache[stage][0][0], void*); @@ -219,9 +248,12 @@ get_shader_module_for_stage(struct zink_context *ctx, struct zink_screen *screen continue; if (!shader_key_matches(iter, key, inline_size, has_inline, has_nonseamless)) continue; + uint16_t st_val = state->st_key.small_key.val; + if (memcmp(iter->key + iter->key_size, &st_val, sizeof(st_val))) + continue; if (unlikely(shadow_needs_shader_swizzle)) { /* shadow swizzle data needs a manual compare since it's so fat */ - if (memcmp(iter->key + iter->key_size + nonseamless_size + iter->num_uniforms * sizeof(uint32_t), + if (memcmp(iter->key + iter->key_size + sizeof(st_val) + nonseamless_size + iter->num_uniforms * sizeof(uint32_t), &ctx->di.zs_swizzle[stage], sizeof(struct zink_zs_swizzle_key))) continue; } @@ -241,7 +273,8 @@ ALWAYS_INLINE static struct zink_shader_module * create_shader_module_for_stage_optimal(struct zink_context *ctx, struct zink_screen *screen, struct zink_shader *zs, struct zink_gfx_program *prog, mesa_shader_stage stage, - struct 
zink_gfx_pipeline_state *state) + struct zink_gfx_pipeline_state *state, + bool unpopulated, bool compile_uber) { struct zink_shader_module *zm; uint16_t *key; @@ -258,23 +291,30 @@ create_shader_module_for_stage_optimal(struct zink_context *ctx, struct zink_scr key = NULL; } size_t key_size = sizeof(uint16_t); - zm = calloc(1, sizeof(struct zink_shader_module) + (key ? key_size : 0) + (unlikely(shadow_needs_shader_swizzle) ? sizeof(struct zink_zs_swizzle_key) : 0)); + zm = calloc(1, sizeof(struct zink_shader_module) + + sizeof(union zink_st_small_key) + + (key ? key_size : 0) + + (unlikely(shadow_needs_shader_swizzle) ? sizeof(struct zink_zs_swizzle_key) : 0)); if (!zm) { return NULL; } - if (stage == MESA_SHADER_TESS_CTRL && zs->non_fs.is_generated && zs->spirv) { - assert(ctx || screen->info.dynamic_state2_feats.extendedDynamicState2PatchControlPoints); - unsigned patch_vertices = 3; - if (ctx) { - struct zink_tcs_key *tcs = (struct zink_tcs_key*)key; - patch_vertices = tcs->patch_vertices; - } - zm->obj = zink_shader_tcs_compile(screen, zs, patch_vertices, prog->base.uses_shobj, &prog->base); - } else { - zm->obj = zink_shader_compile(screen, prog->base.uses_shobj, zs, zink_shader_blob_deserialize(screen, &prog->blobs[stage]), - (struct zink_shader_key*)key, shadow_needs_shader_swizzle ? 
&ctx->di.zs_swizzle[stage] : NULL, &prog->base); + util_queue_fence_init(&zm->fence); + if (!unpopulated) { + if (stage == MESA_SHADER_TESS_CTRL && zs->non_fs.is_generated && zs->spirv) { + assert(ctx || screen->info.dynamic_state2_feats.extendedDynamicState2PatchControlPoints); + unsigned patch_vertices = 3; + if (ctx) { + struct zink_tcs_key *tcs = (struct zink_tcs_key*)key; + patch_vertices = tcs->patch_vertices; + } + zm->obj = zink_shader_tcs_compile(screen, zs, patch_vertices, prog->base.uses_shobj, &prog->base); + } else { + zm->obj = zink_shader_compile(screen, prog->base.uses_shobj, zs, zink_shader_blob_deserialize(screen, &prog->blobs[stage]), + (struct zink_shader_key*)key, &state->st_key, compile_uber, + shadow_needs_shader_swizzle ? &ctx->di.zs_swizzle[stage] : NULL, &prog->base); + } } - if (!zm->obj.mod) { + if (!zm->obj.mod && !unpopulated) { FREE(zm); return NULL; } @@ -288,9 +328,17 @@ create_shader_module_for_stage_optimal(struct zink_context *ctx, struct zink_scr *data = (*key) & mask; if (unlikely(shadow_needs_shader_swizzle)) memcpy(&data[1], &ctx->di.zs_swizzle[stage], sizeof(struct zink_zs_swizzle_key)); + uint16_t st_val = state->st_key.small_key.val; + uint8_t *p = (uint8_t*)&data[1]; + if (unlikely(shadow_needs_shader_swizzle)) + p += sizeof(struct zink_zs_swizzle_key); + memcpy(p, &st_val, sizeof(st_val)); } zm->default_variant = !util_dynarray_contains(&prog->shader_cache[stage][0][0], void*); - util_dynarray_append(&prog->shader_cache[stage][0][0], zm); + if (!compile_uber) + util_dynarray_append(&prog->shader_cache[stage][0][0], zm); + else + util_dynarray_append(&prog->uber_modules, zm); return zm; } @@ -298,7 +346,7 @@ ALWAYS_INLINE static struct zink_shader_module * get_shader_module_for_stage_optimal_key(struct zink_context *ctx, struct zink_screen *screen, struct zink_shader *zs, struct zink_gfx_program *prog, mesa_shader_stage stage, - struct zink_gfx_pipeline_state *state, uint16_t *key) + struct zink_gfx_pipeline_state *state, 
uint16_t *key, uint16_t *st_key) { /* non-generated tcs won't use the shader key */ const bool is_nongenerated_tcs = stage == MESA_SHADER_TESS_CTRL && !zs->non_fs.is_generated; @@ -324,6 +372,12 @@ get_shader_module_for_stage_optimal_key(struct zink_context *ctx, struct zink_sc if (memcmp(iter->key + sizeof(uint16_t), &ctx->di.zs_swizzle[stage], sizeof(struct zink_zs_swizzle_key))) continue; } + uint16_t st_val = *st_key; + uint8_t *p = iter->key + sizeof(union zink_st_small_key); + if (unlikely(shadow_needs_shader_swizzle)) + p += sizeof(struct zink_zs_swizzle_key); + if (memcmp(p, &st_val, sizeof(st_val))) + continue; } if (i > 0) { struct zink_shader_module *zero = pzm[0]; @@ -360,16 +414,18 @@ get_shader_module_for_stage_optimal(struct zink_context *ctx, struct zink_screen mesa_shader_stage stage, struct zink_gfx_pipeline_state *state) { - uint16_t *key; + uint16_t *key, st_key; key = get_shader_module_optimal_key(ctx, prog, zs, stage); + st_key = state->st_key.small_key.val; - return get_shader_module_for_stage_optimal_key(ctx, screen, zs, prog, stage, state, key); + return get_shader_module_for_stage_optimal_key(ctx, screen, zs, prog, stage, state, key, &st_key); } static void zink_destroy_shader_module(struct zink_screen *screen, struct zink_shader_module *zm) { + util_queue_fence_wait(&zm->fence); if (zm->shobj) VKSCR(DestroyShaderEXT)(screen->dev, zm->obj.obj, NULL); else @@ -480,7 +536,7 @@ generate_gfx_program_modules(struct zink_context *ctx, struct zink_screen *scree } static void -generate_gfx_program_modules_optimal(struct zink_context *ctx, struct zink_screen *screen, struct zink_gfx_program *prog, struct zink_gfx_pipeline_state *state) +generate_gfx_program_modules_optimal(struct zink_context *ctx, struct zink_screen *screen, struct zink_gfx_program *prog, struct zink_gfx_pipeline_state *state, bool compile_uber) { assert(!prog->objs[MESA_SHADER_VERTEX].mod && !prog->objs[MESA_SHADER_MESH].mod); for (unsigned i = 0; i < MESA_SHADER_MESH_STAGES; 
i++) { @@ -489,7 +545,7 @@ generate_gfx_program_modules_optimal(struct zink_context *ctx, struct zink_scree assert(prog->shaders[i]); - struct zink_shader_module *zm = create_shader_module_for_stage_optimal(ctx, screen, prog->shaders[i], prog, i, state); + struct zink_shader_module *zm = create_shader_module_for_stage_optimal(ctx, screen, prog->shaders[i], prog, i, state, false, compile_uber); prog->objs[i] = zm->obj; prog->objects[i] = zm->obj.obj; } @@ -498,21 +554,11 @@ generate_gfx_program_modules_optimal(struct zink_context *ctx, struct zink_scree state->modules_changed = true; else state->mesh_modules_changed = true; - prog->last_variant_hash = prog->shaders[MESA_SHADER_MESH] ? state->mesh_optimal_key : state->optimal_key; -} -static uint32_t -hash_pipeline_lib_generated_tcs(const void *key) -{ - const struct zink_gfx_library_key *gkey = key; - return gkey->optimal_key; -} - - -static bool -equals_pipeline_lib_generated_tcs(const void *a, const void *b) -{ - return !memcmp(a, b, sizeof(uint32_t)); + if (!compile_uber) { + prog->last_variant_hash = prog->shaders[MESA_SHADER_MESH] ? 
state->mesh_optimal_key : state->optimal_key; + prog->st_key = state->st_key.small_key.val; + } } static uint32_t @@ -530,25 +576,6 @@ equals_pipeline_lib_mesh(const void *a, const void *b) return ak->optimal_key == bk->optimal_key; } -static uint32_t -hash_pipeline_lib(const void *key) -{ - const struct zink_gfx_library_key *gkey = key; - /* remove generated tcs bits */ - return zink_shader_key_optimal_no_tcs(gkey->optimal_key); -} - -static bool -equals_pipeline_lib(const void *a, const void *b) -{ - const struct zink_gfx_library_key *ak = a; - const struct zink_gfx_library_key *bk = b; - /* remove generated tcs bits */ - uint32_t val_a = zink_shader_key_optimal_no_tcs(ak->optimal_key); - uint32_t val_b = zink_shader_key_optimal_no_tcs(bk->optimal_key); - return val_a == val_b; -} - uint32_t hash_gfx_input_dynamic(const void *key) { @@ -673,7 +700,7 @@ zink_gfx_program_update(struct zink_context *ctx) update_gfx_program(ctx, prog); } else { ctx->dirty_gfx_stages |= ctx->shader_stages; - prog = zink_create_gfx_program(ctx, ctx->gfx_stages, ctx->gfx_pipeline_state.dyn_state2.vertices_per_patch, hash, false); + prog = zink_create_gfx_program(ctx, ctx->gfx_stages, ctx->gfx_pipeline_state.dyn_state2.vertices_per_patch, hash, false, false); zink_screen_get_pipeline_cache(zink_screen(ctx->base.screen), &prog->base, false); _mesa_hash_table_insert_pre_hashed(ht, hash, prog->shaders, prog); prog->base.removed = false; @@ -682,7 +709,8 @@ zink_gfx_program_update(struct zink_context *ctx) simple_mtx_unlock(&ctx->program_lock[zink_program_cache_stages(ctx->shader_stages)]); if (prog && prog != ctx->curr_program) zink_batch_reference_program(ctx, &prog->base); - ctx->curr_program = prog; + ctx->curr_program_uber = ctx->curr_program = prog; + ctx->gfx_pipeline_state.uber_required = false; ctx->gfx_pipeline_state.final_hash ^= ctx->curr_program->last_variant_hash; ctx->gfx_dirty = false; } else if (ctx->dirty_gfx_stages) { @@ -695,41 +723,72 @@ zink_gfx_program_update(struct 
zink_context *ctx) ctx->dirty_gfx_stages = 0; } -ALWAYS_INLINE static bool -update_gfx_shader_module_optimal(struct zink_context *ctx, struct zink_gfx_program *prog, mesa_shader_stage pstage) +ALWAYS_INLINE static void +gfx_program_cache_populate_queue(struct zink_context *ctx, struct zink_gfx_program *prog, mesa_shader_stage pstage, struct zink_shader_module *zm) +{ + struct zink_screen *screen = zink_screen(ctx->base.screen); + if (screen->info.have_EXT_graphics_pipeline_library) + util_queue_fence_wait(&prog->base.cache_fence); + struct precompile_separate_variant_data *data = CALLOC_STRUCT(precompile_separate_variant_data); + data->prog = &prog->base; + data->zs = prog->shaders[pstage]; + data->blob = &prog->blobs[pstage]; + data->uses_shobj = prog->base.uses_shobj; + data->zm = zm; + struct zink_shader_key* keyp = (struct zink_shader_key*)get_shader_module_optimal_key(ctx, prog, data->zs, pstage); + if (keyp) + data->key = *keyp; + data->has_key = !!keyp; + data->st_key = ctx->gfx_pipeline_state.st_key; + if (zink_debug & ZINK_DEBUG_NOBGC) { + precompile_variant_separate_shader_job(data, screen, 0); + } else { + util_queue_add_job(&screen->cache_get_thread, data, &zm->fence, precompile_variant_separate_shader_job, NULL, 0); + } +} + +ALWAYS_INLINE static struct zink_shader_module * +update_or_queue_gfx_shader_module_optimal(struct zink_context *ctx, struct zink_gfx_program *prog, mesa_shader_stage pstage, bool async) { struct zink_screen *screen = zink_screen(ctx->base.screen); if (screen->info.have_EXT_graphics_pipeline_library) util_queue_fence_wait(&prog->base.cache_fence); struct zink_shader_module *zm = get_shader_module_for_stage_optimal(ctx, screen, prog->shaders[pstage], prog, pstage, &ctx->gfx_pipeline_state); + bool entry_found = !!zm; + bool async_done = zm && util_queue_fence_is_signalled(&zm->fence); if (!zm) { - zm = create_shader_module_for_stage_optimal(ctx, screen, prog->shaders[pstage], prog, pstage, &ctx->gfx_pipeline_state); + zm = 
create_shader_module_for_stage_optimal(ctx, screen, prog->shaders[pstage], prog, pstage, &ctx->gfx_pipeline_state, async, false); perf_debug(ctx, "zink[gfx_compile]: %s shader variant required\n", _mesa_shader_stage_to_string(pstage)); } - - bool changed = prog->objs[pstage].mod != zm->obj.mod; - prog->objs[pstage] = zm->obj; - prog->objects[pstage] = zm->obj.obj; - return changed; + if (!async || async_done) { + return zm; + } else { + if (!entry_found) + gfx_program_cache_populate_queue(ctx, prog, pstage, zm); + } + return NULL; } -static void -update_gfx_program_optimal(struct zink_context *ctx, struct zink_gfx_program *prog) +static bool +update_gfx_program_optimal(struct zink_context *ctx, struct zink_gfx_program *prog, struct zink_gfx_program *variant_prog, bool async) { + bool async_done = true; + struct zink_shader_module *zms[3] = {0}; const union zink_shader_key_optimal *key = (union zink_shader_key_optimal*)&ctx->gfx_pipeline_state.optimal_key; const union zink_shader_key_optimal *last_prog_key = (union zink_shader_key_optimal*)&prog->last_variant_hash; - if (key->vs_bits != last_prog_key->vs_bits) { - assert(!prog->is_separable); - bool changed = update_gfx_shader_module_optimal(ctx, prog, ctx->last_vertex_stage->info.stage); - ctx->gfx_pipeline_state.modules_changed |= changed; + bool st_key_diff = ctx->gfx_pipeline_state.st_key.small_key.val != prog->st_key; + if (st_key_diff || key->vs_bits != last_prog_key->vs_bits) { + assert(!variant_prog->is_separable); + zms[0] = update_or_queue_gfx_shader_module_optimal(ctx, prog, ctx->last_vertex_stage->info.stage, async); + async_done &= !!zms[0]; } const bool shadow_needs_shader_swizzle = last_prog_key->fs.shadow_needs_shader_swizzle && (ctx->dirty_gfx_stages & BITFIELD_BIT(MESA_SHADER_FRAGMENT)); - if (key->fs_bits != last_prog_key->fs_bits || + if (st_key_diff || key->fs_bits != last_prog_key->fs_bits || /* always recheck shadow swizzles since they aren't directly part of the key */ 
unlikely(shadow_needs_shader_swizzle)) { - assert(!prog->is_separable); - bool changed = update_gfx_shader_module_optimal(ctx, prog, MESA_SHADER_FRAGMENT); - ctx->gfx_pipeline_state.modules_changed |= changed; + assert(!variant_prog->is_separable); + zms[1] = update_or_queue_gfx_shader_module_optimal(ctx, prog, MESA_SHADER_FRAGMENT, async); + async_done &= !!zms[1]; if (unlikely(shadow_needs_shader_swizzle)) { struct zink_shader_module **pzm = prog->shader_cache[MESA_SHADER_FRAGMENT][0][0].data; ctx->gfx_pipeline_state.shadow = (struct zink_zs_swizzle_key*)pzm[0]->key + sizeof(uint16_t); @@ -737,11 +796,77 @@ update_gfx_program_optimal(struct zink_context *ctx, struct zink_gfx_program *pr } if (prog->shaders[MESA_SHADER_TESS_CTRL] && prog->shaders[MESA_SHADER_TESS_CTRL]->non_fs.is_generated && key->tcs_bits != last_prog_key->tcs_bits) { - assert(!prog->is_separable); - bool changed = update_gfx_shader_module_optimal(ctx, prog, MESA_SHADER_TESS_CTRL); - ctx->gfx_pipeline_state.modules_changed |= changed; + assert(!variant_prog->is_separable); + zms[2] = update_or_queue_gfx_shader_module_optimal(ctx, prog, MESA_SHADER_TESS_CTRL, async); + async_done &= !!zms[2]; } - prog->last_variant_hash = ctx->gfx_pipeline_state.optimal_key; + mesa_shader_stage stages[] = {ctx->last_vertex_stage->info.stage, MESA_SHADER_FRAGMENT, MESA_SHADER_TESS_CTRL}; + if (async_done) { + for (int i = 0;i < 3; i++) { + if (!zms[i]) + continue; + variant_prog->objs[stages[i]] = zms[i]->obj; + variant_prog->objects[stages[i]] = zms[i]->obj.obj; + } + variant_prog->last_variant_hash = prog->last_variant_hash = ctx->gfx_pipeline_state.optimal_key; + variant_prog->st_key = prog->st_key = ctx->gfx_pipeline_state.st_key.small_key.val; + } + return async_done; +} + +static bool +update_gfx_program_missing_shaders(struct zink_context *ctx, struct zink_gfx_program *prog, + struct zink_gfx_program *variant_prog, bool async) +{ + bool async_done = true; + for (int rstage = 0; rstage < MESA_SHADER_COMPUTE; 
rstage++) { + assert(!!variant_prog->shaders[rstage] == !!prog->shaders[rstage]); + if (variant_prog->shaders[rstage] && !variant_prog->objs[rstage].mod) { + assert(!variant_prog->is_separable); + struct zink_shader_module *mod = update_or_queue_gfx_shader_module_optimal(ctx, prog, rstage, async); + async_done &= !!mod; + if (mod) { + bool changed = variant_prog->objs[rstage].mod != mod->obj.mod; + variant_prog->objs[rstage] = mod->obj; + variant_prog->objects[rstage] = mod->obj.obj; + ctx->gfx_pipeline_state.modules_changed |= changed; + } + } + } + return async_done; +} + +static void +copy_gfx_program_missing_shaders(struct zink_context *ctx, struct zink_gfx_program *base_prog, + struct zink_gfx_program *variant_prog) +{ + for (int rstage = 0; rstage < MESA_SHADER_COMPUTE; rstage++) { + assert(!!variant_prog->shaders[rstage] == !!base_prog->shaders[rstage]); + if (variant_prog->shaders[rstage] && !variant_prog->objs[rstage].mod) { + bool changed = variant_prog->objs[rstage].mod != base_prog->objs[rstage].mod; + variant_prog->objs[rstage] = base_prog->objs[rstage]; + variant_prog->objects[rstage] = base_prog->objects[rstage]; + ctx->gfx_pipeline_state.modules_changed |= changed; + } + } +} + +ALWAYS_INLINE static bool +update_gfx_shader_module_mesh(struct zink_context *ctx, struct zink_gfx_program *prog, mesa_shader_stage pstage) +{ + struct zink_screen *screen = zink_screen(ctx->base.screen); + if (screen->info.have_EXT_graphics_pipeline_library) + util_queue_fence_wait(&prog->base.cache_fence); + struct zink_shader_module *zm = get_shader_module_for_stage_optimal(ctx, screen, prog->shaders[pstage], prog, pstage, &ctx->gfx_pipeline_state); + if (!zm) { + zm = create_shader_module_for_stage_optimal(ctx, screen, prog->shaders[pstage], prog, pstage, &ctx->gfx_pipeline_state, false, false); + perf_debug(ctx, "zink[gfx_compile]: %s shader variant required\n", _mesa_shader_stage_to_string(pstage)); + } + + bool changed = prog->objs[pstage].mod != zm->obj.mod; + 
prog->objs[pstage] = zm->obj; + prog->objects[pstage] = zm->obj.obj; + return changed; } static void @@ -754,7 +879,7 @@ update_mesh_program_optimal(struct zink_context *ctx, struct zink_gfx_program *p /* always recheck shadow swizzles since they aren't directly part of the key */ unlikely(shadow_needs_shader_swizzle)) { assert(!prog->is_separable); - bool changed = update_gfx_shader_module_optimal(ctx, prog, MESA_SHADER_FRAGMENT); + bool changed = update_gfx_shader_module_mesh(ctx, prog, MESA_SHADER_FRAGMENT); ctx->gfx_pipeline_state.modules_changed |= changed; if (unlikely(shadow_needs_shader_swizzle)) { struct zink_shader_module **pzm = prog->shader_cache[MESA_SHADER_FRAGMENT][0][0].data; @@ -771,7 +896,7 @@ replace_separable_prog(struct zink_context *ctx, struct hash_entry *entry, struc struct zink_gfx_program *real = prog->full_prog ? prog->full_prog : /* this will be NULL with ZINK_DEBUG_NOOPT */ - zink_create_gfx_program(ctx, ctx->gfx_stages, ctx->gfx_pipeline_state.dyn_state2.vertices_per_patch, ctx->gfx_hash, false); + zink_create_gfx_program(ctx, ctx->gfx_stages, ctx->gfx_pipeline_state.dyn_state2.vertices_per_patch, ctx->gfx_hash, false, false); entry->data = real; entry->key = real->shaders; real->base.removed = false; @@ -780,12 +905,116 @@ replace_separable_prog(struct zink_context *ctx, struct hash_entry *entry, struc return real; } +static uint32_t +hash_gfx_program(const void *key) +{ + const uint32_t *k = key; + + return XXH32(k, sizeof(uint32_t[2]), 0); +} + +static bool +equals_program_variant(const void *a, const void *b) +{ + const struct program_variant_key *ak = a; + const struct program_variant_key *bk = b; + uint32_t val_a = ak->key; + uint32_t val_b = bk->key; + uint32_t val_a_st = ak->st_key; + uint32_t val_b_st = bk->st_key; + return val_a == val_b && val_a_st == val_b_st; +} + +#define CURR_KEY_PROGRAM(ctx) (ctx->gfx_pipeline_state.uber_required ? 
ctx->curr_program_uber: ctx->curr_program) + +static void +async_variant_program_update(struct zink_context *ctx, bool can_use_uber, bool needs_emulation) +{ + struct zink_screen *screen = zink_screen(ctx->base.screen); + bool needs_uber = false; + if (!ctx->curr_program_uber->is_separable && (!ctx->curr_program_uber->base_variant || needs_emulation)) { + struct program_variant_key prog_variant_key = {0}; + prog_variant_key.key = ctx->gfx_pipeline_state.shader_keys_optimal.key.val;//ctx->gfx_pipeline_state.optimal_key; + prog_variant_key.st_key = ctx->gfx_pipeline_state.st_key.small_key.val; + struct set_entry * variant_entry = _mesa_set_search(&ctx->curr_program_uber->variants, &prog_variant_key); + struct zink_gfx_program *variant; + if (!variant_entry) { + variant = zink_create_gfx_program(ctx, ctx->gfx_stages, 0, ctx->curr_program_uber->gfx_hash, false, true); + variant->base.uses_shobj = ctx->curr_program_uber->base.uses_shobj; + util_queue_fence_init(&variant->base.cache_fence); + struct program_variant_key *prog_variant_key_p = MALLOC(sizeof(struct program_variant_key)); + memcpy(prog_variant_key_p, &prog_variant_key, sizeof(struct program_variant_key)); + prog_variant_key_p->prog = variant; + variant->uber_variant = ctx->curr_program_uber; + _mesa_set_add(&ctx->curr_program_uber->variants, prog_variant_key_p); + needs_uber = true; + } else + variant = ((struct program_variant_key *)variant_entry->key)->prog; + /* fetches shader modules from cache and starts async compilation on a miss */ + bool async_done = update_gfx_program_optimal(ctx, ctx->curr_program_uber, variant, can_use_uber); + assert(can_use_uber || async_done); + if (async_done) { + if (ctx->curr_program_uber->base_variant) + copy_gfx_program_missing_shaders(ctx, ctx->curr_program_uber->base_variant, variant); + else + async_done = update_gfx_program_missing_shaders(ctx, ctx->curr_program_uber, variant, can_use_uber); + } + assert(can_use_uber || async_done); + needs_uber &= !async_done; + + if 
(async_done && !variant->started_compiling) { + /* Modules are ready but the program isn't. Start a job for it. */ + struct precompile_variant_data *data = CALLOC_STRUCT(precompile_variant_data); + data->prog = variant; + data->state = ctx->gfx_pipeline_state; + if (can_use_uber && !(zink_debug & ZINK_DEBUG_NOBGC)) + util_queue_add_job(&screen->cache_get_thread, data, &variant->base.cache_fence, precompile_variant_job, NULL, 0); + else + precompile_variant_job(data, screen, 0); + variant->started_compiling = true; + } + if (!can_use_uber) + util_queue_fence_wait(&variant->base.cache_fence); + bool variant_prog_ready = variant->started_compiling && + (!can_use_uber || util_queue_fence_is_signalled(&variant->base.cache_fence)); + assert(can_use_uber || variant_prog_ready); + if(variant_prog_ready) { + /* variant prog is ready, use it */ + if (ctx->curr_program != variant) { + ctx->gfx_pipeline_state.modules_changed = true; + ctx->curr_program = variant; + } + assert(async_done); + if (!needs_emulation) + ctx->curr_program_uber->base_variant = variant; + } + needs_uber |= !async_done || !variant_prog_ready; + } else if (ctx->curr_program_uber->base_variant && !needs_emulation) { + ctx->curr_program = ctx->curr_program_uber->base_variant; + ctx->curr_program_uber->last_variant_hash = ctx->curr_program->last_variant_hash; + ctx->curr_program_uber->st_key = ctx->curr_program->st_key; + needs_uber = false; + } else if (ctx->curr_program_uber->is_separable) { + assert(can_use_uber); + ctx->curr_program = ctx->curr_program_uber; + needs_uber = true; + } + if (ctx->gfx_pipeline_state.uber_required != needs_uber) { + ctx->gfx_pipeline_state.modules_changed = true; + ctx->gfx_pipeline_state.uber_required = needs_uber; + } + + if (needs_uber || !ctx->curr_program_uber) + ctx->curr_program = ctx->curr_program_uber; +} + void zink_gfx_program_update_optimal(struct zink_context *ctx) { MESA_TRACE_FUNC(); struct zink_screen *screen = zink_screen(ctx->base.screen); 
assert(!ctx->gfx_stages[MESA_SHADER_TESS_CTRL] || !ctx->gfx_stages[MESA_SHADER_TESS_CTRL]->non_fs.is_generated); + struct zink_gfx_program *old_prog = ctx->curr_program_uber; if (ctx->gfx_dirty) { struct zink_gfx_program *prog = NULL; ctx->gfx_pipeline_state.optimal_key = zink_sanitize_optimal_key(ctx->gfx_stages, ctx->gfx_pipeline_state.shader_keys_optimal.key.val); @@ -794,72 +1023,93 @@ zink_gfx_program_update_optimal(struct zink_context *ctx) simple_mtx_lock(&ctx->program_lock[zink_program_cache_stages(ctx->shader_stages)]); struct hash_entry *entry = _mesa_hash_table_search_pre_hashed(ht, hash, ctx->gfx_stages); - if (ctx->curr_program) - ctx->gfx_pipeline_state.final_hash ^= ctx->curr_program->last_variant_hash; + if (CURR_KEY_PROGRAM(ctx)) { + ctx->gfx_pipeline_state.final_hash ^= CURR_KEY_PROGRAM(ctx)->last_variant_hash; + ctx->gfx_pipeline_state.final_hash ^= CURR_KEY_PROGRAM(ctx)->st_key; + } + bool needs_emulation = needs_st_emulation(ctx) || (ctx->gfx_pipeline_state.optimal_key != ZINK_SHADER_KEY_OPTIMAL_DEFAULT); + bool can_use_uber = zink_can_use_uber(ctx); if (entry) { prog = (struct zink_gfx_program*)entry->data; - bool must_replace = prog->base.uses_shobj ? !zink_can_use_shader_objects(ctx) : (prog->is_separable && !zink_can_use_pipeline_libs(ctx)); - if (prog->is_separable) { - /* shader variants can't be handled by separable programs: sync and compile */ - if (!ZINK_SHADER_KEY_OPTIMAL_IS_DEFAULT(ctx->gfx_pipeline_state.optimal_key) || must_replace) + if (prog->is_separable && !(zink_debug & ZINK_DEBUG_NOOPT)) { + /* if uber cannot be used we need to compile the variant synchrously, + * so we need the full prog: sync and compile */ + if (!can_use_uber) util_queue_fence_wait(&prog->base.cache_fence); /* If the optimized linked pipeline is done compiling, swap it into place. 
*/ - if (util_queue_fence_is_signalled(&prog->base.cache_fence) && - /* but only if needed for ZINK_DEBUG=noopt */ - (!(zink_debug & ZINK_DEBUG_NOOPT) || !ZINK_SHADER_KEY_OPTIMAL_IS_DEFAULT(ctx->gfx_pipeline_state.optimal_key) || must_replace)) { + if (util_queue_fence_is_signalled(&prog->base.cache_fence)) { prog = replace_separable_prog(ctx, entry, prog); } - } else if (must_replace) { - /* this is a non-separable, incompatible prog which needs replacement */ - struct zink_gfx_program *real = zink_create_gfx_program(ctx, ctx->gfx_stages, ctx->gfx_pipeline_state.dyn_state2.vertices_per_patch, ctx->gfx_hash, false); - generate_gfx_program_modules_optimal(ctx, screen, real, &ctx->gfx_pipeline_state); - entry->data = real; - entry->key = real->shaders; - real->base.removed = false; - prog->base.removed = true; - prog = real; - } else if (!prog->base.precompile_done) { - util_queue_fence_wait(&prog->base.cache_fence); } - update_gfx_program_optimal(ctx, prog); + ctx->curr_program_uber = prog; + async_variant_program_update(ctx, can_use_uber, needs_emulation); } else { ctx->dirty_gfx_stages |= ctx->shader_stages; prog = create_gfx_program_separable(ctx, ctx->gfx_stages, ctx->gfx_pipeline_state.dyn_state2.vertices_per_patch, false); + ctx->gfx_pipeline_state.uber_required = true; prog->base.removed = false; _mesa_hash_table_insert_pre_hashed(ht, hash, prog->shaders, prog); if (!prog->is_separable) { - zink_screen_get_pipeline_cache(screen, &prog->base, false); perf_debug(ctx, "zink[gfx_compile]: new program created (probably legacy GL features in use)\n"); - generate_gfx_program_modules_optimal(ctx, screen, prog, &ctx->gfx_pipeline_state); + prog->is_uber_program = true; + { + struct zink_gfx_pipeline_state state = {0}; + state.shader_keys_optimal.key.vs_base.last_vertex_stage = true; + state.shader_keys_optimal.key.tcs.patch_vertices = 3; //random guess, generated tcs precompile is hard + state.optimal_key = state.shader_keys_optimal.key.val; + 
generate_gfx_program_modules_optimal(NULL, screen, prog, &state, prog->is_uber_program); + zink_screen_get_pipeline_cache(screen, &prog->base, true); + if (!prog->base.uses_shobj) { + simple_mtx_lock(&prog->libs->lock); + zink_create_pipeline_lib(screen, prog, &state, prog->is_uber_program); + simple_mtx_unlock(&prog->libs->lock); + } + zink_screen_update_pipeline_cache(screen, &prog->base, true); + } + if (needs_emulation && !can_use_uber) { + ctx->curr_program_uber = prog; + async_variant_program_update(ctx, can_use_uber, needs_emulation); + } } } simple_mtx_unlock(&ctx->program_lock[zink_program_cache_stages(ctx->shader_stages)]); - if (prog && prog != ctx->curr_program) - zink_batch_reference_program(ctx, &prog->base); - ctx->curr_program = prog; - ctx->gfx_pipeline_state.final_hash ^= ctx->curr_program->last_variant_hash; + ctx->curr_program_uber = prog; + if (ctx->gfx_pipeline_state.uber_required) + ctx->curr_program = prog; + if (ctx->curr_program_uber && ctx->curr_program_uber != old_prog) + { + assert(!ctx->curr_program_uber->is_variant_program); + zink_batch_reference_program(ctx, &ctx->curr_program_uber->base); + } + ctx->gfx_pipeline_state.final_hash ^= CURR_KEY_PROGRAM(ctx)->last_variant_hash; + ctx->gfx_pipeline_state.final_hash ^= CURR_KEY_PROGRAM(ctx)->st_key; } else if (ctx->dirty_gfx_stages) { /* remove old hash */ ctx->gfx_pipeline_state.optimal_key = zink_sanitize_optimal_key(ctx->gfx_stages, ctx->gfx_pipeline_state.shader_keys_optimal.key.val); - ctx->gfx_pipeline_state.final_hash ^= ctx->curr_program->last_variant_hash; - - bool must_replace = ctx->curr_program->base.uses_shobj ? 
!zink_can_use_shader_objects(ctx) : (ctx->curr_program->is_separable && !zink_can_use_pipeline_libs(ctx)); - if (must_replace || (ctx->curr_program->is_separable && !ZINK_SHADER_KEY_OPTIMAL_IS_DEFAULT(ctx->gfx_pipeline_state.optimal_key))) { - struct zink_gfx_program *prog = ctx->curr_program; - - util_queue_fence_wait(&prog->base.cache_fence); - /* shader variants can't be handled by separable programs: sync and compile */ - perf_debug(ctx, "zink[gfx_compile]: non-default shader variant required with separate shader object program\n"); - struct hash_table *ht = &ctx->program_cache[zink_program_cache_stages(ctx->shader_stages)]; - const uint32_t hash = ctx->gfx_hash; - simple_mtx_lock(&ctx->program_lock[zink_program_cache_stages(ctx->shader_stages)]); - struct hash_entry *entry = _mesa_hash_table_search_pre_hashed(ht, hash, ctx->gfx_stages); - ctx->curr_program = replace_separable_prog(ctx, entry, prog); - simple_mtx_unlock(&ctx->program_lock[zink_program_cache_stages(ctx->shader_stages)]); + ctx->gfx_pipeline_state.final_hash ^= CURR_KEY_PROGRAM(ctx)->last_variant_hash; + ctx->gfx_pipeline_state.final_hash ^= CURR_KEY_PROGRAM(ctx)->st_key; + bool needs_emulation = needs_st_emulation(ctx) || (ctx->gfx_pipeline_state.optimal_key != ZINK_SHADER_KEY_OPTIMAL_DEFAULT); + bool can_use_uber = zink_can_use_uber(ctx); + if (ctx->curr_program->is_separable && !(zink_debug & ZINK_DEBUG_NOOPT)) { + struct zink_gfx_program *prog = ctx->curr_program_uber; + if (needs_emulation || ctx->curr_program_uber->is_separable) { + if (!can_use_uber) + util_queue_fence_wait(&prog->base.cache_fence); + perf_debug(ctx, "zink[gfx_compile]: non-default shader variant required with separate shader object program\n"); + if (util_queue_fence_is_signalled(&prog->base.cache_fence)) { + struct hash_table *ht = &ctx->program_cache[zink_program_cache_stages(ctx->shader_stages)]; + const uint32_t hash = ctx->gfx_hash; + simple_mtx_lock(&ctx->program_lock[zink_program_cache_stages(ctx->shader_stages)]); 
+ struct hash_entry *entry = _mesa_hash_table_search_pre_hashed(ht, hash, ctx->gfx_stages); + ctx->curr_program_uber = replace_separable_prog(ctx, entry, prog); + simple_mtx_unlock(&ctx->program_lock[zink_program_cache_stages(ctx->shader_stages)]); + } + } } - update_gfx_program_optimal(ctx, ctx->curr_program); + async_variant_program_update(ctx, can_use_uber, needs_emulation); /* apply new hash */ - ctx->gfx_pipeline_state.final_hash ^= ctx->curr_program->last_variant_hash; + ctx->gfx_pipeline_state.final_hash ^= CURR_KEY_PROGRAM(ctx)->last_variant_hash; + ctx->gfx_pipeline_state.final_hash ^= CURR_KEY_PROGRAM(ctx)->st_key; } ctx->dirty_gfx_stages = 0; ctx->gfx_dirty = false; @@ -898,8 +1148,8 @@ zink_mesh_program_update_optimal(struct zink_context *ctx) } } else if (must_replace) { /* this is a non-separable, incompatible prog which needs replacement */ - struct zink_gfx_program *real = zink_create_gfx_program(ctx, ctx->gfx_stages, 0, ctx->mesh_hash, true); - generate_gfx_program_modules_optimal(ctx, screen, real, &ctx->gfx_pipeline_state); + struct zink_gfx_program *real = zink_create_gfx_program(ctx, ctx->gfx_stages, 0, ctx->mesh_hash, true, false); + generate_gfx_program_modules_optimal(ctx, screen, real, &ctx->gfx_pipeline_state, false); entry->data = real; entry->key = real->shaders; real->base.removed = false; @@ -917,7 +1167,7 @@ zink_mesh_program_update_optimal(struct zink_context *ctx) if (!prog->is_separable) { zink_screen_get_pipeline_cache(screen, &prog->base, false); perf_debug(ctx, "zink[gfx_compile]: new program created (probably legacy GL features in use)\n"); - generate_gfx_program_modules_optimal(ctx, screen, prog, &ctx->gfx_pipeline_state); + generate_gfx_program_modules_optimal(ctx, screen, prog, &ctx->gfx_pipeline_state, false); } } simple_mtx_unlock(lock); @@ -960,8 +1210,10 @@ optimized_compile_job(void *data, void *gdata, int thread_index) VkPrimitiveTopology vkmode = is_mesh ? 
VK_PRIMITIVE_TOPOLOGY_MAX_ENUM : zink_primitive_topology(pc_entry->state.gfx_prim_mode); if (pc_entry->gpl.gkey) pipeline = zink_create_gfx_pipeline_combined(screen, pc_entry->prog, pc_entry->gpl.ikey ? pc_entry->gpl.ikey->pipeline : VK_NULL_HANDLE, &pc_entry->gpl.gkey->pipeline, 1, pc_entry->gpl.okey->pipeline, true, false); - else - pipeline = zink_create_gfx_pipeline(screen, pc_entry->prog, pc_entry->prog->objs, &pc_entry->state, pc_entry->state.element_state->binding_map, vkmode, true); + else { + struct zink_shader_object *objs = pc_entry->prog->objs; + pipeline = zink_create_gfx_pipeline(screen, pc_entry->prog, objs, &pc_entry->state, pc_entry->state.element_state->binding_map, vkmode, true); + } if (pipeline) { pc_entry->gpl.unoptimized_pipeline = pc_entry->pipeline; pc_entry->pipeline = pipeline; @@ -1009,10 +1261,12 @@ zink_program_finish(struct zink_context *ctx, struct zink_program *pg) if (pg->is_compute) return; struct zink_gfx_program *prog = (struct zink_gfx_program*)pg; - for (int i = 0; i < ARRAY_SIZE(prog->pipelines); ++i) { - hash_table_foreach(&prog->pipelines[i], entry) { - struct zink_gfx_pipeline_cache_entry *pc_entry = entry->data; - util_queue_fence_wait(&pc_entry->fence); + for (int r = 0; r < ARRAY_SIZE(prog->pipelines); ++r) { + for (int i = 0; i < ARRAY_SIZE(prog->pipelines[0]); ++i) { + hash_table_foreach(&prog->pipelines[r][i], entry) { + struct zink_gfx_pipeline_cache_entry *pc_entry = entry->data; + util_queue_fence_wait(&pc_entry->fence); + } } } } @@ -1073,7 +1327,7 @@ update_cs_shader_module(struct zink_context *ctx, struct zink_compute_program *c return; } zm->shobj = false; - zm->obj = zink_shader_compile(screen, false, zs, zink_shader_blob_deserialize(screen, &comp->shader->blob), key, zs_swizzle_size ? &ctx->di.zs_swizzle[MESA_SHADER_COMPUTE] : NULL, &comp->base); + zm->obj = zink_shader_compile(screen, false, zs, zink_shader_blob_deserialize(screen, &comp->shader->blob), key, NULL, false, zs_swizzle_size ? 
&ctx->di.zs_swizzle[MESA_SHADER_COMPUTE] : NULL, &comp->base); if (!zm->obj.spirv) { FREE(zm); return; @@ -1198,7 +1452,11 @@ zink_gfx_lib_cache_unref(struct zink_screen *screen, struct zink_gfx_lib_cache * { if (!p_atomic_dec_zero(&libs->refcount)) return; - + if (libs->lib) { + struct zink_gfx_library_key *gkey = libs->lib; + VKSCR(DestroyPipeline)(screen->dev, gkey->pipeline, NULL); + FREE(gkey); + } simple_mtx_destroy(&libs->lock); set_foreach_remove(&libs->libs, he) { struct zink_gfx_library_key *gkey = (void*)he->key; @@ -1217,10 +1475,6 @@ create_lib_cache(struct zink_gfx_program *prog, bool generated_tcs) if (generated_tcs) libs->stages_present &= ~BITFIELD_BIT(MESA_SHADER_TESS_CTRL); simple_mtx_init(&libs->lock, mtx_plain); - if (generated_tcs) - _mesa_set_init(&libs->libs, NULL, hash_pipeline_lib_generated_tcs, equals_pipeline_lib_generated_tcs); - else - _mesa_set_init(&libs->libs, NULL, hash_pipeline_lib, equals_pipeline_lib); return libs; } @@ -1229,6 +1483,8 @@ find_or_create_lib_cache(struct zink_screen *screen, struct zink_gfx_program *pr { unsigned stages_present = prog->stages_present; bool generated_tcs = prog->shaders[MESA_SHADER_TESS_CTRL] && prog->shaders[MESA_SHADER_TESS_CTRL]->non_fs.is_generated; + if (prog->is_variant_program) + return create_lib_cache(prog, generated_tcs); if (generated_tcs) stages_present &= ~BITFIELD_BIT(MESA_SHADER_TESS_CTRL); unsigned idx = zink_program_cache_stages(stages_present); @@ -1307,7 +1563,7 @@ gfx_program_create(struct zink_context *ctx, struct zink_shader **stages, unsigned vertices_per_patch, uint32_t gfx_hash, - bool is_mesh) + bool is_mesh, bool variant) { struct zink_screen *screen = zink_screen(ctx->base.screen); struct zink_gfx_program *prog = create_program(ctx, false); @@ -1317,6 +1573,7 @@ gfx_program_create(struct zink_context *ctx, prog->gfx_hash = gfx_hash; prog->base.removed = true; prog->optimal_keys = screen->optimal_keys; + prog->is_variant_program = variant; for (int i = is_mesh ? 
MESA_SHADER_FRAGMENT : 0; i < (is_mesh ? MESA_SHADER_MESH_STAGES : MESA_SHADER_STAGES); ++i) { util_dynarray_init(&prog->shader_cache[i][0][0], prog->base.ralloc_ctx); @@ -1331,6 +1588,7 @@ gfx_program_create(struct zink_context *ctx, prog->needs_inlining |= prog->shaders[i]->needs_inlining; } } + util_dynarray_init(&prog->uber_modules, prog->base.ralloc_ctx); if (stages[MESA_SHADER_TESS_EVAL] && !stages[MESA_SHADER_TESS_CTRL]) { util_queue_fence_wait(&stages[MESA_SHADER_TESS_EVAL]->precompile.fence); if (!prog->shaders[MESA_SHADER_TESS_EVAL]->non_fs.generated_tcs) @@ -1340,13 +1598,17 @@ gfx_program_create(struct zink_context *ctx, } prog->stages_remaining = prog->stages_present; for (int i = 0; i < MESA_SHADER_MESH_STAGES; ++i) { - if (prog->shaders[i]) { + if (prog->shaders[i] && !variant) { simple_mtx_lock(&prog->shaders[i]->lock); _mesa_set_add(prog->shaders[i]->programs, prog); simple_mtx_unlock(&prog->shaders[i]->lock); zink_gfx_program_reference(screen, NULL, prog); } } + + if (variant) + zink_gfx_program_reference(screen, NULL, prog); + p_atomic_dec(&prog->base.reference.count); if (is_mesh) @@ -1360,8 +1622,12 @@ gfx_program_create(struct zink_context *ctx, prog->has_edgeflags = prog->shaders[MESA_SHADER_VERTEX] && prog->shaders[MESA_SHADER_VERTEX]->has_edgeflags; - for (int i = 0; i < ARRAY_SIZE(prog->pipelines); ++i) { - _mesa_hash_table_init(&prog->pipelines[i], prog->base.ralloc_ctx, NULL, zink_get_gfx_pipeline_eq_func(screen, prog)); + _mesa_set_init(&prog->variants, prog->base.ralloc_ctx, hash_gfx_program, equals_program_variant); + + for (int r = 0; r < ARRAY_SIZE(prog->pipelines); ++r) { + for (int i = 0; i < ARRAY_SIZE(prog->pipelines[0]); ++i) { + _mesa_hash_table_init(&prog->pipelines[r][i], prog->base.ralloc_ctx, NULL, zink_get_gfx_pipeline_eq_func(screen, prog)); + } } return prog; @@ -1436,9 +1702,9 @@ zink_create_gfx_program(struct zink_context *ctx, struct zink_shader **stages, unsigned vertices_per_patch, uint32_t gfx_hash, - bool 
is_mesh) + bool is_mesh, bool variant) { - struct zink_gfx_program *prog = gfx_program_create(ctx, stages, vertices_per_patch, gfx_hash, is_mesh); + struct zink_gfx_program *prog = gfx_program_create(ctx, stages, vertices_per_patch, gfx_hash, is_mesh, variant); if (prog) prog = gfx_program_init(ctx, prog); return prog; @@ -1454,7 +1720,8 @@ create_linked_separable_job(void *data, void *gdata, int thread_index) /* this is a dead program */ if (prog->base.removed) return; - prog->full_prog = gfx_program_create(prog->base.ctx, prog->shaders, 0, prog->gfx_hash, !!prog->shaders[MESA_SHADER_MESH]); + prog->full_prog = gfx_program_create(prog->base.ctx, prog->shaders, 0, prog->gfx_hash, !!prog->shaders[MESA_SHADER_MESH], false); + prog->full_prog->is_uber_program = prog->is_uber_program; /* block gfx_shader_prune in the main thread */ util_queue_fence_reset(&prog->full_prog->base.cache_fence); /* add an ownership ref */ @@ -1479,15 +1746,16 @@ create_gfx_program_separable(struct zink_context *ctx, struct zink_shader **stag uint32_t hash = is_mesh ? ctx->mesh_hash : ctx->gfx_hash; if (!is_separate || /* TODO: maybe try variants? 
grimace */ + /* TODO allow if uber is usable */ !is_default || !can_gpl) - return zink_create_gfx_program(ctx, stages, vertices_per_patch, hash, is_mesh); + return zink_create_gfx_program(ctx, stages, vertices_per_patch, hash, is_mesh, false); for (unsigned i = 0; i < MESA_SHADER_MESH_STAGES; i++) { /* ensure async shader creation is done */ if (stages[i]) { util_queue_fence_wait(&stages[i]->precompile.fence); - if (!stages[i]->precompile.obj.mod) - return zink_create_gfx_program(ctx, stages, vertices_per_patch, hash, is_mesh); + if (!stages[i]->precompile.obj.mod && !stages[i]->precompile.obj.mod) + return zink_create_gfx_program(ctx, stages, vertices_per_patch, hash, is_mesh, false); } } @@ -1496,6 +1764,7 @@ create_gfx_program_separable(struct zink_context *ctx, struct zink_shader **stag goto fail; prog->is_separable = true; + prog->is_uber_program = true; prog->gfx_hash = hash; prog->base.uses_shobj = screen->info.have_EXT_shader_object && ((stages[MESA_SHADER_VERTEX] && !stages[MESA_SHADER_VERTEX]->info.view_mask) || @@ -1535,8 +1804,12 @@ create_gfx_program_separable(struct zink_context *ctx, struct zink_shader **stag */ p_atomic_add(&prog->base.reference.count, refs - 1); - for (int i = 0; i < ARRAY_SIZE(prog->pipelines); ++i) { - _mesa_hash_table_init(&prog->pipelines[i], prog->base.ralloc_ctx, NULL, zink_get_gfx_pipeline_eq_func(screen, prog)); + _mesa_set_init(&prog->variants, prog->base.ralloc_ctx, hash_gfx_program, equals_program_variant); + + for (int r = 0; r < ARRAY_SIZE(prog->pipelines); ++r) { + for (int i = 0; i < ARRAY_SIZE(prog->pipelines[0]); ++i) { + _mesa_hash_table_init(&prog->pipelines[r][i], prog->base.ralloc_ctx, NULL, zink_get_gfx_pipeline_eq_func(screen, prog)); + } } for (int i = 0; i < MESA_SHADER_MESH_STAGES; ++i) { @@ -1557,18 +1830,25 @@ create_gfx_program_separable(struct zink_context *ctx, struct zink_shader **stag prog->base.layout = zink_pipeline_layout_create(screen, prog->base.dsl, prog->base.num_dsl, false, 
VK_PIPELINE_LAYOUT_CREATE_INDEPENDENT_SETS_BIT_EXT); prog->last_variant_hash = is_mesh ? ctx->gfx_pipeline_state.mesh_optimal_key : ctx->gfx_pipeline_state.optimal_key; + prog->st_key = ctx->gfx_pipeline_state.st_key.small_key.val; if (!prog->base.uses_shobj) { - VkPipeline libs[] = {stages[MESA_SHADER_VERTEX]->precompile.gpl, stages[MESA_SHADER_FRAGMENT]->precompile.gpl}; - struct zink_gfx_library_key *gkey = CALLOC_STRUCT(zink_gfx_library_key); - if (!gkey) { - mesa_loge("ZINK: failed to allocate gkey!"); - goto fail; + if (!is_mesh) { + VkPipeline uber_libs[] = {stages[MESA_SHADER_VERTEX]->precompile.gpl, stages[MESA_SHADER_FRAGMENT]->precompile.gpl}; + prog->libs->lib = CALLOC_STRUCT(zink_gfx_library_key); + prog->libs->lib->pipeline = zink_create_gfx_pipeline_combined(screen, prog, VK_NULL_HANDLE, uber_libs, 2, VK_NULL_HANDLE, false, false); + } else { + VkPipeline libs[] = {stages[MESA_SHADER_VERTEX]->precompile.gpl, stages[MESA_SHADER_FRAGMENT]->precompile.gpl}; + struct zink_gfx_library_key *gkey = CALLOC_STRUCT(zink_gfx_library_key); + if (!gkey) { + mesa_loge("ZINK: failed to allocate gkey!"); + goto fail; + } + gkey->optimal_key = prog->last_variant_hash; + assert(gkey->optimal_key); + gkey->pipeline = zink_create_gfx_pipeline_combined(screen, prog, VK_NULL_HANDLE, libs, 2, VK_NULL_HANDLE, false, false); + _mesa_set_add(&prog->libs->libs, gkey); } - gkey->optimal_key = prog->last_variant_hash; - assert(gkey->optimal_key); - gkey->pipeline = zink_create_gfx_pipeline_combined(screen, prog, VK_NULL_HANDLE, libs, 2, VK_NULL_HANDLE, false, false); - _mesa_set_add(&prog->libs->libs, gkey); } if (!(zink_debug & ZINK_DEBUG_NOOPT)) @@ -1722,7 +2002,7 @@ precompile_compute_job(void *data, void *gdata, int thread_index) comp->curr = comp->module = CALLOC_STRUCT(zink_shader_module); assert(comp->module); comp->module->shobj = false; - comp->module->obj = zink_shader_compile(screen, false, comp->shader, comp->nir, NULL, NULL, &comp->base); + comp->module->obj = 
zink_shader_compile(screen, false, comp->shader, comp->nir, NULL, NULL, false, NULL, &comp->base); /* comp->nir will be freed by zink_shader_compile */ comp->nir = NULL; assert(comp->module->obj.spirv); @@ -1869,21 +2149,41 @@ zink_destroy_gfx_program(struct zink_screen *screen, { if (prog->is_separable) zink_gfx_program_reference(screen, &prog->full_prog, NULL); - for (int i = 0; i < ARRAY_SIZE(prog->pipelines); ++i) { - hash_table_foreach(&prog->pipelines[i], entry) { - struct zink_gfx_pipeline_cache_entry *pc_entry = entry->data; + for (int r = 0; r < ARRAY_SIZE(prog->pipelines); ++r) { + for (int i = 0; i < ARRAY_SIZE(prog->pipelines[0]); ++i) { + hash_table_foreach(&prog->pipelines[r][i], entry) { + struct zink_gfx_pipeline_cache_entry *pc_entry = entry->data; - util_queue_fence_wait(&pc_entry->fence); - VKSCR(DestroyPipeline)(screen->dev, pc_entry->pipeline, NULL); - VKSCR(DestroyPipeline)(screen->dev, pc_entry->gpl.unoptimized_pipeline, NULL); - free(pc_entry); + util_queue_fence_wait(&pc_entry->fence); + VKSCR(DestroyPipeline)(screen->dev, pc_entry->pipeline, NULL); + VKSCR(DestroyPipeline)(screen->dev, pc_entry->gpl.unoptimized_pipeline, NULL); + free(pc_entry); + } } } + /* wait for all async compilation jobs */ + for (unsigned stage = 0; stage < ZINK_GFX_SHADER_COUNT; stage++) { + struct util_dynarray *shader_cache = &prog->shader_cache[stage][0][0]; + unsigned count = util_dynarray_num_elements(shader_cache, struct zink_shader_module *); + struct zink_shader_module **pzm = shader_cache->data; + for (unsigned i = 0; i < count; i++) { + struct zink_shader_module *iter = pzm[i]; + util_queue_fence_wait(&iter->fence); + } + } + + set_foreach(&prog->variants, entry) { + struct program_variant_key *prog_variant_key = (void*)entry->key; + assert(prog_variant_key->prog->is_variant_program); + zink_destroy_gfx_program(screen, prog_variant_key->prog); + FREE(prog_variant_key); + } + deinit_program(screen, &prog->base); for (int i = 0; i < MESA_SHADER_MESH_STAGES; 
++i) { - if (prog->shaders[i]) { + if (prog->shaders[i] && !prog->is_variant_program) { _mesa_set_remove_key(prog->shaders[i]->programs, prog); prog->shaders[i] = NULL; } @@ -1895,6 +2195,10 @@ zink_destroy_gfx_program(struct zink_screen *screen, blob_finish(&prog->blobs[i]); } } + while (util_dynarray_contains(&prog->uber_modules, void*)) { + struct zink_shader_module *zm = util_dynarray_pop(&prog->uber_modules, struct zink_shader_module*); + zink_destroy_shader_module(screen, zm); + } if (prog->libs) zink_gfx_lib_cache_unref(screen, prog->libs); @@ -2046,8 +2350,11 @@ bind_gfx_stage(struct zink_context *ctx, mesa_shader_stage stage, struct zink_sh zink_descriptors_init_bindless(ctx); } else { if (stage < MESA_SHADER_COMPUTE) { - if (ctx->curr_program) - ctx->gfx_pipeline_state.final_hash ^= ctx->curr_program->last_variant_hash; + if (ctx->curr_program_uber) { + ctx->gfx_pipeline_state.final_hash ^= CURR_KEY_PROGRAM(ctx)->last_variant_hash; + ctx->gfx_pipeline_state.final_hash ^= ctx->curr_program_uber->st_key; + } + ctx->curr_program_uber = NULL; ctx->curr_program = NULL; } if (stage == MESA_SHADER_FRAGMENT || stage > MESA_SHADER_COMPUTE) { @@ -2391,7 +2698,7 @@ zink_delete_cs_shader_state(struct pipe_context *pctx, void *cso) /* caller must lock prog->libs->lock */ struct zink_gfx_library_key * -zink_create_pipeline_lib(struct zink_screen *screen, struct zink_gfx_program *prog, struct zink_gfx_pipeline_state *state) +zink_create_pipeline_lib(struct zink_screen *screen, struct zink_gfx_program *prog, struct zink_gfx_pipeline_state *state, bool compile_uber) { struct zink_gfx_library_key *gkey = CALLOC_STRUCT(zink_gfx_library_key); bool is_mesh = !prog->shaders[MESA_SHADER_VERTEX]; @@ -2401,11 +2708,15 @@ zink_create_pipeline_lib(struct zink_screen *screen, struct zink_gfx_program *pr } gkey->optimal_key = !is_mesh ? 
state->optimal_key : state->mesh_optimal_key; + gkey->st_key = state->st_key.small_key.val; assert(is_mesh || gkey->optimal_key); for (unsigned i = 0; i < MESA_SHADER_MESH_STAGES; i++) gkey->modules[i] = prog->objs[i].mod; - gkey->pipeline = zink_create_gfx_pipeline_library(screen, prog); - _mesa_set_add(&prog->libs->libs, gkey); + gkey->pipeline = zink_create_gfx_pipeline_library(screen, prog->objs, prog); + if (is_mesh) + _mesa_set_add(&prog->libs->libs, gkey); + else + prog->libs->lib = gkey; return gkey; } @@ -2433,6 +2744,26 @@ print_exe_stages(VkShaderStageFlags stages) UNREACHABLE("unhandled combination of stages!"); } +static void +precompile_variant_job(void *data, void *gdata, int thread_index) +{ + struct zink_screen *screen = gdata; + struct precompile_variant_data *precompile_data = data; + struct zink_gfx_program *prog = precompile_data->prog; + struct zink_gfx_pipeline_state *state = &precompile_data->state; + + //generate_gfx_program_modules_optimal(NULL, screen, prog, state); + zink_screen_get_pipeline_cache(screen, &prog->base, true); + if (!screen->info.have_EXT_shader_object) { + simple_mtx_lock(&prog->libs->lock); + zink_create_pipeline_lib(screen, prog, state, false); + simple_mtx_unlock(&prog->libs->lock); + } + zink_screen_update_pipeline_cache(screen, &prog->base, true); + + FREE(data); +} + static void gfx_program_precompile_job(void *data, void *gdata, int thread_index) { @@ -2446,11 +2777,11 @@ gfx_program_precompile_job(void *data, void *gdata, int thread_index) state.shader_keys_optimal.key.vs_base.last_vertex_stage = true; state.shader_keys_optimal.key.tcs.patch_vertices = 3; //random guess, generated tcs precompile is hard state.optimal_key = state.shader_keys_optimal.key.val; - generate_gfx_program_modules_optimal(NULL, screen, prog, &state); + generate_gfx_program_modules_optimal(NULL, screen, prog, &state, prog->is_uber_program); zink_screen_get_pipeline_cache(screen, &prog->base, true); if (!prog->base.uses_shobj) { 
simple_mtx_lock(&prog->libs->lock); - zink_create_pipeline_lib(screen, prog, &state); + zink_create_pipeline_lib(screen, prog, &state, prog->is_uber_program); simple_mtx_unlock(&prog->libs->lock); } prog->base.precompile_done = true; @@ -2494,17 +2825,18 @@ zink_link_gfx_shader(struct pipe_context *pctx, void **shaders) simple_mtx_unlock(lock); return; } - struct zink_gfx_program *prog = gfx_program_create(ctx, zshaders, 3, hash, is_mesh); + struct zink_gfx_program *prog = gfx_program_create(ctx, zshaders, 3, hash, is_mesh, false); u_foreach_bit(i, shader_stages) assert(prog->shaders[i]); _mesa_hash_table_insert_pre_hashed(ht, hash, prog->shaders, prog); prog->base.removed = false; + prog->is_uber_program = true; simple_mtx_unlock(lock); if (zink_debug & ZINK_DEBUG_SHADERDB) { struct zink_screen *screen = zink_screen(pctx->screen); gfx_program_init(ctx, prog); if (screen->optimal_keys) - generate_gfx_program_modules_optimal(ctx, screen, prog, &ctx->gfx_pipeline_state); + generate_gfx_program_modules_optimal(ctx, screen, prog, &ctx->gfx_pipeline_state, false); else generate_gfx_program_modules(ctx, screen, prog, &ctx->gfx_pipeline_state); VkPipeline pipeline = zink_create_gfx_pipeline(screen, prog, prog->objs, &ctx->gfx_pipeline_state, @@ -2535,7 +2867,7 @@ zink_delete_shader_state(struct pipe_context *pctx, void *cso) static void precompile_separate_shader(struct zink_shader *zs, struct zink_screen *screen) { - zs->precompile.obj = zink_shader_compile_separate(screen, zs); + zs->precompile.obj = zink_shader_compile_separate(screen, zs, zs->is_uber); if (!screen->info.have_EXT_shader_object) { struct zink_shader_object objs[MESA_SHADER_MESH_STAGES] = {0}; objs[zs->info.stage].mod = zs->precompile.obj.mod; @@ -2543,6 +2875,20 @@ precompile_separate_shader(struct zink_shader *zs, struct zink_screen *screen) } } +static void +precompile_variant_separate_shader_job(void *data, void *gdata, int thread_index) +{ + struct zink_screen *screen = gdata; + struct 
precompile_separate_variant_data *precompile_data = data; + + nir_shader *nir = zink_shader_blob_deserialize(screen, precompile_data->blob); + precompile_data->zm->obj = zink_shader_compile(screen, precompile_data->uses_shobj, precompile_data->zs, nir, + precompile_data->has_key ? &precompile_data->key: NULL, + &precompile_data->st_key, + false, NULL, precompile_data->prog); + FREE(data); +} + static void gfx_shader_init_job(void *data, void *gdata, int thread_index) { @@ -2581,6 +2927,7 @@ zink_create_gfx_shader_state(struct pipe_context *pctx, const struct pipe_shader zink_descriptor_util_init_fbfetch(zink_context(pctx)); struct zink_shader *zs = zink_shader_create(zink_screen(pctx->screen), nir); + zs->is_uber = true; if (zink_debug & ZINK_DEBUG_NOBGC) gfx_shader_init_job(zs, screen, 0); else @@ -2593,6 +2940,8 @@ static void zink_delete_cached_shader_state(struct pipe_context *pctx, void *cso) { struct zink_screen *screen = zink_screen(pctx->screen); + // HACK this is oversyncing but we have no way of knowing which jobs use this zink_shader + util_queue_finish(&screen->cache_get_thread); util_shader_reference(pctx, &screen->shaders, &cso, NULL); } diff --git a/src/gallium/drivers/zink/zink_program.h b/src/gallium/drivers/zink/zink_program.h index 3d5ffc170c9..6e2b0a202a6 100644 --- a/src/gallium/drivers/zink/zink_program.h +++ b/src/gallium/drivers/zink/zink_program.h @@ -128,7 +128,7 @@ zink_mesh_program_update_optimal(struct zink_context *ctx); struct zink_gfx_library_key * -zink_create_pipeline_lib(struct zink_screen *screen, struct zink_gfx_program *prog, struct zink_gfx_pipeline_state *state); +zink_create_pipeline_lib(struct zink_screen *screen, struct zink_gfx_program *prog, struct zink_gfx_pipeline_state *state, bool compile_uber); uint32_t hash_gfx_output(const void *key); uint32_t hash_gfx_output_ds3(const void *key); uint32_t hash_gfx_input(const void *key); @@ -159,7 +159,7 @@ zink_create_gfx_program(struct zink_context *ctx, struct zink_shader
**stages, unsigned vertices_per_patch, uint32_t gfx_hash, - bool is_mesh); + bool is_mesh, bool variant); void zink_destroy_gfx_program(struct zink_screen *screen, @@ -405,6 +405,27 @@ zink_set_zs_needs_shader_swizzle_key(struct zink_context *ctx, mesa_shader_stage zink_set_shader_key_base(ctx, pstage)->needs_zs_shader_swizzle = enable; } +static inline const union zink_st_small_key * +zink_get_st_small_key(struct zink_context *ctx) +{ + assert(zink_screen(ctx->base.screen)->optimal_keys); + return &ctx->gfx_pipeline_state.st_key.small_key; +} + +static inline union zink_st_small_key * +zink_set_st_small_key(struct zink_context *ctx) +{ + ctx->dirty_gfx_stages |= ctx->shader_stages & (MESA_SHADER_VERTEX | MESA_SHADER_GEOMETRY | MESA_SHADER_FRAGMENT); + assert(zink_screen(ctx->base.screen)->optimal_keys); + return &ctx->gfx_pipeline_state.st_key.small_key; +} + +static inline bool +needs_st_emulation(struct zink_context *ctx) +{ + return ctx->gfx_pipeline_state.st_key.small_key.val != 0; +} + ALWAYS_INLINE static bool zink_can_use_pipeline_libs(const struct zink_context *ctx) { @@ -464,6 +485,14 @@ zink_can_use_shader_objects_mesh(const struct zink_context *ctx) !ctx->fb_state.viewmask; } +ALWAYS_INLINE static bool +zink_can_use_uber(struct zink_context *ctx) +{ + bool generated_tcs = ctx->gfx_stages[MESA_SHADER_TESS_EVAL] && !ctx->gfx_stages[MESA_SHADER_TESS_CTRL]; + return zink_shader_key_optimal_no_tcs(ctx->gfx_pipeline_state.optimal_key) == ZINK_SHADER_KEY_OPTIMAL_DEFAULT && + zink_can_use_pipeline_libs(ctx) && (!generated_tcs || ctx->gfx_pipeline_state.dyn_state2.vertices_per_patch == 3); +} + bool zink_set_rasterizer_discard(struct zink_context *ctx, bool disable); void diff --git a/src/gallium/drivers/zink/zink_program_state.hpp b/src/gallium/drivers/zink/zink_program_state.hpp index a175072e45b..3f38271dab2 100644 --- a/src/gallium/drivers/zink/zink_program_state.hpp +++ b/src/gallium/drivers/zink/zink_program_state.hpp @@ -102,6 +102,7 @@ template 
VkPipeline zink_get_gfx_pipeline(struct zink_context *ctx, struct zink_gfx_program *prog, + struct zink_gfx_program *variant_prog, struct zink_gfx_pipeline_state *state, enum mesa_prim mode) { @@ -113,7 +114,7 @@ zink_get_gfx_pipeline(struct zink_context *ctx, const unsigned idx = IS_MESH || screen->info.dynamic_state3_props.dynamicPrimitiveTopologyUnrestricted ? 0 : get_pipeline_idx= ZINK_DYNAMIC_STATE>(mode, vkmode); - assert(idx <= ARRAY_SIZE(prog->pipelines)); + assert(idx <= ARRAY_SIZE(prog->pipelines[0])); if (IS_MESH) { if (!state->mesh_dirty && !state->mesh_modules_changed) return state->mesh_pipeline; @@ -144,27 +145,28 @@ zink_get_gfx_pipeline(struct zink_context *ctx, } } /* extra safety asserts for optimal path to catch refactoring bugs */ - if (prog->optimal_keys) { + if (variant_prog->optimal_keys) { ASSERTED const union zink_shader_key_optimal *opt = (union zink_shader_key_optimal*)&prog->last_variant_hash; ASSERTED union zink_shader_key_optimal sanitized = {}; if (IS_MESH) { sanitized.val = zink_sanitize_optimal_key_mesh(ctx->gfx_stages, ctx->gfx_pipeline_state.shader_keys_optimal.key.val); + assert(opt->val == sanitized.val); assert(state->mesh_optimal_key == sanitized.val); - } else { + } else if (!state->uber_required) { sanitized.val = zink_sanitize_optimal_key(ctx->gfx_stages, ctx->gfx_pipeline_state.shader_keys_optimal.key.val); + assert(opt->val == sanitized.val); assert(state->optimal_key == sanitized.val); } - assert(opt->val == sanitized.val); } if (IS_MESH) { state->mesh_modules_changed = false; - if (prog->last_finalized_hash[idx] == state->mesh_final_hash && - !prog->inline_variants && likely(prog->last_pipeline[idx]) && + if (prog->last_finalized_hash[0][idx] == state->mesh_final_hash && + !prog->inline_variants && likely(prog->last_pipeline[0][idx]) && /* this data is too big to compare in the fast-path */ likely(!prog->shaders[MESA_SHADER_FRAGMENT]->fs.legacy_shadow_mask)) { - state->mesh_pipeline = 
prog->last_pipeline[idx]->pipeline; + state->mesh_pipeline = prog->last_pipeline[0][idx]->pipeline; return state->mesh_pipeline; } } else { @@ -202,23 +204,22 @@ zink_get_gfx_pipeline(struct zink_context *ctx, /* shortcut for reusing previous pipeline across program changes */ if (DYNAMIC_STATE == ZINK_DYNAMIC_VERTEX_INPUT || DYNAMIC_STATE == ZINK_DYNAMIC_VERTEX_INPUT2) { - if (prog->last_finalized_hash[idx] == state->final_hash && - !prog->inline_variants && likely(prog->last_pipeline[idx]) && - /* this data is too big to compare in the fast-path */ - likely(!prog->shaders[MESA_SHADER_FRAGMENT]->fs.legacy_shadow_mask)) { - state->pipeline = prog->last_pipeline[idx]->pipeline; + if (variant_prog->last_finalized_hash[state->uber_required][idx] == state->final_hash && + !variant_prog->inline_variants && likely(variant_prog->last_pipeline[state->uber_required][idx]) && + /* this data is too big to compare in the fast-path */ + likely(!variant_prog->shaders[MESA_SHADER_FRAGMENT]->fs.legacy_shadow_mask)) { + state->pipeline = variant_prog->last_pipeline[state->uber_required][idx]->pipeline; return state->pipeline; } } } - unsigned final_hash = IS_MESH ? state->mesh_final_hash : state->final_hash; - entry = _mesa_hash_table_search_pre_hashed(&prog->pipelines[idx], final_hash, state); + entry = _mesa_hash_table_search_pre_hashed(&variant_prog->pipelines[state->uber_required][idx], final_hash, state); if (!entry) { bool can_gpl = IS_MESH ? 
zink_can_use_pipeline_libs_mesh(ctx) : zink_can_use_pipeline_libs(ctx); /* always wait on async precompile/cache fence */ - util_queue_fence_wait(&prog->base.cache_fence); + util_queue_fence_wait(&variant_prog->base.cache_fence); struct zink_gfx_pipeline_cache_entry *pc_entry = CALLOC_STRUCT(zink_gfx_pipeline_cache_entry); if (!pc_entry) return VK_NULL_HANDLE; @@ -227,28 +228,47 @@ zink_get_gfx_pipeline(struct zink_context *ctx, */ memcpy(&pc_entry->state, state, sizeof(*state)); pc_entry->state.rendering_info.pColorAttachmentFormats = pc_entry->state.rendering_formats; - pc_entry->prog = prog; + pc_entry->prog = state->uber_required ? prog : variant_prog; /* init the optimized background compile fence */ util_queue_fence_init(&pc_entry->fence); - entry = _mesa_hash_table_insert_pre_hashed(&prog->pipelines[idx], final_hash, pc_entry, pc_entry); - if (prog->base.uses_shobj && !prog->is_separable) { - memcpy(pc_entry->shobjs, prog->objs, sizeof(prog->objs)); + entry = _mesa_hash_table_insert_pre_hashed(&variant_prog->pipelines[state->uber_required][idx], final_hash, pc_entry, pc_entry); + if (variant_prog->base.uses_shobj && !variant_prog->is_separable) { + memcpy(pc_entry->shobjs, variant_prog->objs, sizeof(variant_prog->objs)); zink_gfx_program_compile_queue(ctx, pc_entry); } else if (HAVE_LIB && can_gpl) { uint32_t optimal_key = IS_MESH ? 
ctx->gfx_pipeline_state.mesh_optimal_key : ctx->gfx_pipeline_state.optimal_key; /* this is the graphics pipeline library path: find/construct all partial pipelines */ - simple_mtx_lock(&prog->libs->lock); - struct set_entry *he = _mesa_set_search(&prog->libs->libs, &optimal_key); struct zink_gfx_library_key *gkey; - if (he) { - gkey = (struct zink_gfx_library_key *)he->key; + if (IS_MESH) { + simple_mtx_lock(&prog->libs->lock); + struct set_entry *he = _mesa_set_search(&prog->libs->libs, &optimal_key); + if (he) { + gkey = (struct zink_gfx_library_key *)he->key; + } else { + assert(!prog->is_separable); + gkey = zink_create_pipeline_lib(screen, prog, &ctx->gfx_pipeline_state, false); + } + simple_mtx_unlock(&prog->libs->lock); } else { - assert(!prog->is_separable); - gkey = zink_create_pipeline_lib(screen, prog, &ctx->gfx_pipeline_state); + if (state->uber_required) { + simple_mtx_lock(&prog->libs->lock); + assert(prog->libs->lib); + gkey = prog->libs->lib; + simple_mtx_unlock(&prog->libs->lock); + } else { + simple_mtx_lock(&variant_prog->libs->lock); + if (variant_prog->libs->lib) { + gkey = variant_prog->libs->lib; + assert(gkey->optimal_key == optimal_key); + assert(gkey->st_key == state->st_key.small_key.val); + } else { + assert(!variant_prog->is_separable); + gkey = zink_create_pipeline_lib(screen, variant_prog, &ctx->gfx_pipeline_state, false); + } + simple_mtx_unlock(&variant_prog->libs->lock); + } } - simple_mtx_unlock(&prog->libs->lock); - struct zink_gfx_input_key *ikey = IS_MESH ? NULL : - DYNAMIC_STATE == ZINK_DYNAMIC_VERTEX_INPUT ? + struct zink_gfx_input_key *ikey = DYNAMIC_STATE == ZINK_DYNAMIC_VERTEX_INPUT ? zink_find_or_create_input_dynamic(ctx, vkmode) : zink_find_or_create_input(ctx, vkmode); struct zink_gfx_output_key *okey = DYNAMIC_STATE >= ZINK_DYNAMIC_STATE3 && screen->have_full_ds3 ? 
@@ -259,29 +279,30 @@ zink_get_gfx_pipeline(struct zink_context *ctx, pc_entry->gpl.gkey = gkey; pc_entry->gpl.okey = okey; /* try to hit optimized compile cache first if possible */ - if (!prog->is_separable) - pc_entry->pipeline = zink_create_gfx_pipeline_combined(screen, prog, ikey ? ikey->pipeline : VK_NULL_HANDLE, &gkey->pipeline, 1, okey->pipeline, true, true); + if (!variant_prog->is_separable) + pc_entry->pipeline = zink_create_gfx_pipeline_combined(screen, variant_prog, ikey ? ikey->pipeline : VK_NULL_HANDLE, &gkey->pipeline, 1, okey->pipeline, true, true); if (!pc_entry->pipeline) { /* create the non-optimized pipeline first using fast-linking to avoid stuttering */ - pc_entry->pipeline = zink_create_gfx_pipeline_combined(screen, prog, ikey ? ikey->pipeline : VK_NULL_HANDLE, &gkey->pipeline, 1, okey->pipeline, false, false); - if (!prog->is_separable) + pc_entry->pipeline = zink_create_gfx_pipeline_combined(screen, variant_prog, ikey ? ikey->pipeline : VK_NULL_HANDLE, &gkey->pipeline, 1, okey->pipeline, false, false); + if (!variant_prog->is_separable) /* trigger async optimized pipeline compile if this was the fast-linked unoptimized pipeline */ zink_gfx_program_compile_queue(ctx, pc_entry); } } else { + struct zink_shader_object *objs = state->uber_required ? 
prog->objs : variant_prog->objs; /* optimize by default only when expecting precompiles in order to reduce stuttering */ if (DYNAMIC_STATE != ZINK_DYNAMIC_VERTEX_INPUT2 && DYNAMIC_STATE != ZINK_DYNAMIC_VERTEX_INPUT && !IS_MESH) - pc_entry->pipeline = zink_create_gfx_pipeline(screen, prog, prog->objs, state, state->element_state->binding_map, vkmode, !HAVE_LIB); + pc_entry->pipeline = zink_create_gfx_pipeline(screen, variant_prog, objs, state, state->element_state->binding_map, vkmode, !HAVE_LIB); else - pc_entry->pipeline = zink_create_gfx_pipeline(screen, prog, prog->objs, state, NULL, vkmode, !HAVE_LIB); - if (HAVE_LIB && !prog->is_separable) + pc_entry->pipeline = zink_create_gfx_pipeline(screen, variant_prog, objs, state, NULL, vkmode, !HAVE_LIB); + if (HAVE_LIB && !variant_prog->is_separable) /* trigger async optimized pipeline compile if this was an unoptimized pipeline */ zink_gfx_program_compile_queue(ctx, pc_entry); } if (pc_entry->pipeline == VK_NULL_HANDLE) return VK_NULL_HANDLE; - zink_screen_update_pipeline_cache(screen, &prog->base, false); + zink_screen_update_pipeline_cache(screen, &variant_prog->base, false); } struct zink_gfx_pipeline_cache_entry *cache_entry = (struct zink_gfx_pipeline_cache_entry *)entry->data; @@ -291,8 +312,8 @@ zink_get_gfx_pipeline(struct zink_context *ctx, state->pipeline = cache_entry->pipeline; /* update states for fastpath */ if (DYNAMIC_STATE >= ZINK_DYNAMIC_VERTEX_INPUT) { - prog->last_finalized_hash[idx] = final_hash; - prog->last_pipeline[idx] = cache_entry; + variant_prog->last_finalized_hash[state->uber_required][idx] = final_hash; + variant_prog->last_pipeline[state->uber_required][idx] = cache_entry; } return IS_MESH ? 
state->mesh_pipeline : state->pipeline; } @@ -355,6 +376,8 @@ equals_gfx_pipeline_state(const void *a, const void *b) if (STAGE_MASK & STAGE_MASK_OPTIMAL) { if (sa->optimal_key != sb->optimal_key) return false; + if (sa->st_key.small_key.val != sb->st_key.small_key.val) + return false; if (STAGE_MASK & STAGE_MASK_OPTIMAL_SHADOW) { if (sa->shadow != sb->shadow) return false; diff --git a/src/gallium/drivers/zink/zink_shader_keys.h b/src/gallium/drivers/zink/zink_shader_keys.h index ea883007387..a76a6556cc4 100644 --- a/src/gallium/drivers/zink/zink_shader_keys.h +++ b/src/gallium/drivers/zink/zink_shader_keys.h @@ -28,6 +28,41 @@ #include "compiler/shader_info.h" +union zink_st_small_key { + struct { + /** for ARB_color_buffer_float */ + uint8_t clamp_color:1; + /* for user-defined clip-planes */ + uint8_t lower_ucp:1; + /* Whether st_variant::driver_shader is for the draw module, + * not for the driver. + */ + uint8_t is_draw_shader:1; + uint8_t lower_flatshade:1; + uint8_t lower_alpha_test:1; + uint16_t pad: 11; // from here not key + }; + uint16_t val; +}; + +struct zink_st_variant_key +{ + union zink_st_small_key small_key; + + uint8_t ucp_enables: 8; + + unsigned lower_alpha_func:3; + + + uint32_t pad2: 5; //next array aligned to uint32 for easy access + + /* bitmask of sampler units; PIPE_CAP_GL_CLAMP */ + uint32_t gl_clamp[3]; + + /* needs more than 128 bytes */ + struct pipe_clip_state ucp_state; +}; + struct zink_vs_key_base { bool last_vertex_stage : 1; bool clip_halfz : 1; diff --git a/src/gallium/drivers/zink/zink_state.c b/src/gallium/drivers/zink/zink_state.c index 24175aceeed..2415a65351d 100644 --- a/src/gallium/drivers/zink/zink_state.c +++ b/src/gallium/drivers/zink/zink_state.c @@ -722,6 +722,21 @@ zink_bind_rasterizer_state(struct pipe_context *pctx, void *cso) if (!screen->optimal_keys) zink_update_gs_key_rectangular_line(ctx); + + if (screen->optimal_keys) { + struct zink_st_variant_key *key = &ctx->gfx_pipeline_state.st_key; + + if 
((zink_get_st_small_key(ctx)->clamp_color) != ctx->rast_state->base.clamp_fragment_color) + zink_set_st_small_key(ctx)->clamp_color = ctx->rast_state->base.clamp_fragment_color; + + if ((zink_get_st_small_key(ctx)->lower_flatshade) != ctx->rast_state->base.flatshade) + zink_set_st_small_key(ctx)->lower_flatshade = ctx->rast_state->base.flatshade; + + key->ucp_enables = ctx->rast_state->base.clip_plane_enable; + + if ((zink_get_st_small_key(ctx)->lower_ucp) != !!key->ucp_enables) + zink_set_st_small_key(ctx)->lower_ucp = !!key->ucp_enables; + } } } diff --git a/src/gallium/drivers/zink/zink_types.h b/src/gallium/drivers/zink/zink_types.h index c2305ece7f7..bddbb9e87c4 100644 --- a/src/gallium/drivers/zink/zink_types.h +++ b/src/gallium/drivers/zink/zink_types.h @@ -795,6 +795,7 @@ struct zink_shader_object { VkShaderModule mod; }; struct spirv_shader *spirv; + VkPipeline gpl; }; struct zink_shader { @@ -824,6 +825,7 @@ struct zink_shader { bool has_uniforms; bool has_edgeflags; bool needs_inlining; + bool is_uber; struct spirv_shader *spirv; struct { @@ -932,6 +934,7 @@ struct zink_gfx_pipeline_state { uint32_t vertex_strides[PIPE_MAX_ATTRIBS]; struct zink_vertex_elements_hw_state *element_state; struct zink_zs_swizzle_key *shadow; + bool uber_required; // emulation needed && !async compilation done enum mesa_prim shader_rast_prim, rast_prim; /* reduced type or max for unknown */ union { struct { @@ -942,6 +945,7 @@ struct zink_gfx_pipeline_state { union zink_shader_key_optimal key; } shader_keys_optimal; }; + struct zink_st_variant_key st_key; struct zink_blend_state *blend_state; VkFormat rendering_formats[PIPE_MAX_COLOR_BUFS]; VkPipelineRenderingCreateInfo rendering_info; @@ -1008,6 +1012,7 @@ enum zink_gfx_push_constant_member { */ struct zink_shader_module { struct zink_shader_object obj; + struct util_queue_fence fence; uint32_t hash; bool shobj; bool default_variant; @@ -1049,6 +1054,7 @@ typedef bool (*equals_gfx_pipeline_state_func)(const void *a, const 
void *b); struct zink_gfx_library_key { uint32_t optimal_key; //equals_pipeline_lib_optimal + uint32_t st_key; VkShaderModule modules[MESA_SHADER_MESH_STAGES]; VkPipeline pipeline; }; @@ -1115,6 +1121,13 @@ struct zink_gfx_lib_cache { simple_mtx_t lock; struct set libs; //zink_gfx_library_key -> VkPipeline + struct zink_gfx_library_key *lib; //zink_gfx_library_key -> VkPipeline +}; + +struct zink_gfx_program_variant_key { + uint32_t optimal_key; //equals_pipeline_lib_optimal + uint32_t st_key; + struct zink_gfx_program *prog; }; struct zink_gfx_program { @@ -1135,21 +1148,29 @@ struct zink_gfx_program { uint32_t module_hash[MESA_SHADER_MESH_STAGES]; struct blob blobs[MESA_SHADER_MESH_STAGES]; struct util_dynarray shader_cache[MESA_SHADER_MESH_STAGES][2][2]; //normal, nonseamless cubes, inline uniforms + struct util_dynarray uber_modules; + struct set variants; + struct zink_gfx_program *base_variant; //quick access to base variant (only !NULL when done compiling) + struct zink_gfx_program *uber_variant; unsigned inlined_variant_count[MESA_SHADER_MESH_STAGES]; uint32_t default_variant_hash; uint8_t inline_variants; //which stages are using inlined uniforms bool needs_inlining; // whether this program requires some uniforms to be inlined bool has_edgeflags; bool optimal_keys; + bool started_compiling; + bool is_uber_program; + bool is_variant_program; /* separable */ struct zink_gfx_program *full_prog; - struct hash_table pipelines[11]; // [number of draw modes we support] + struct hash_table pipelines[2][11]; // [uber_emulation][number of draw modes we support] uint32_t last_variant_hash; + uint32_t st_key; - uint32_t last_finalized_hash[4]; //[primtype idx] - struct zink_gfx_pipeline_cache_entry *last_pipeline[4]; //[primtype idx] + uint32_t last_finalized_hash[2][4]; //[uber_emulation][primtype idx] + struct zink_gfx_pipeline_cache_entry *last_pipeline[2][4]; //[uber_emulation][primtype idx] struct zink_gfx_lib_cache *libs; }; @@ -1787,6 +1808,7 @@ struct
zink_context { simple_mtx_t program_lock[8]; uint32_t gfx_hash; struct zink_gfx_program *curr_program; + struct zink_gfx_program *curr_program_uber; struct set gfx_inputs; struct set gfx_outputs;