diff --git a/src/gallium/drivers/zink/zink_compiler.c b/src/gallium/drivers/zink/zink_compiler.c index c8d22f1f076..2352fe17aa4 100644 --- a/src/gallium/drivers/zink/zink_compiler.c +++ b/src/gallium/drivers/zink/zink_compiler.c @@ -24,6 +24,7 @@ #include "nir_opcodes.h" #include "zink_context.h" #include "zink_compiler.h" +#include "zink_descriptors.h" #include "zink_program.h" #include "zink_screen.h" #include "nir_to_spirv/nir_to_spirv.h" @@ -3205,6 +3206,39 @@ zink_shader_compile(struct zink_screen *screen, struct zink_shader *zs, return mod; } +VkShaderModule +zink_shader_compile_separate(struct zink_screen *screen, struct zink_shader *zs, nir_shader **ret_nir) +{ + nir_shader *nir = nir_shader_clone(NULL, zs->nir); + int set = nir->info.stage == MESA_SHADER_FRAGMENT; + unsigned offsets[4]; + zink_descriptor_shader_get_binding_offsets(zs, offsets); + nir_foreach_variable_with_modes(var, nir, nir_var_mem_ubo | nir_var_mem_ssbo | nir_var_uniform | nir_var_image) { + if (var->data.bindless) + continue; + var->data.descriptor_set = set; + switch (var->data.mode) { + case nir_var_mem_ubo: + var->data.binding = !!var->data.driver_location; + break; + case nir_var_uniform: + if (glsl_type_is_sampler(glsl_without_array(var->type))) + var->data.binding += offsets[1]; + break; + case nir_var_mem_ssbo: + var->data.binding += offsets[2]; + break; + case nir_var_image: + var->data.binding += offsets[3]; + break; + default: break; + } + } + optimize_nir(nir, zs); + *ret_nir = nir; + return compile_module(screen, zs, nir); +} + static bool lower_baseinstance_instr(nir_builder *b, nir_instr *instr, void *data) { @@ -4196,6 +4230,7 @@ zink_shader_create(struct zink_screen *screen, struct nir_shader *nir, ret->sinfo.have_vulkan_memory_model = screen->info.have_KHR_vulkan_memory_model; + util_queue_fence_init(&ret->precompile.fence); ret->hash = _mesa_hash_pointer(ret); ret->programs = _mesa_pointer_set_create(NULL); @@ -4490,8 +4525,16 @@ zink_shader_free(struct zink_screen *screen, struct zink_shader *shader) shader->non_fs.generated_gs = NULL; } _mesa_set_destroy(shader->programs, NULL); + util_queue_fence_wait(&shader->precompile.fence); + util_queue_fence_destroy(&shader->precompile.fence); + zink_descriptor_shader_deinit(screen, shader); + if (shader->precompile.mod) + VKSCR(DestroyShaderModule)(screen->dev, shader->precompile.mod, NULL); + if (shader->precompile.gpl) + VKSCR(DestroyPipeline)(screen->dev, shader->precompile.gpl, NULL); ralloc_free(shader->nir); ralloc_free(shader->spirv); + free(shader->precompile.bindings); ralloc_free(shader); } @@ -4530,6 +4573,7 @@ struct zink_shader * zink_shader_tcs_create(struct zink_screen *screen, struct zink_shader *vs, unsigned vertices_per_patch) { struct zink_shader *ret = rzalloc(NULL, struct zink_shader); + util_queue_fence_init(&ret->precompile.fence); ret->hash = _mesa_hash_pointer(ret); ret->programs = _mesa_pointer_set_create(NULL); simple_mtx_init(&ret->lock, mtx_plain); diff --git a/src/gallium/drivers/zink/zink_compiler.h b/src/gallium/drivers/zink/zink_compiler.h index a1c894d3853..30a3111e68f 100644 --- a/src/gallium/drivers/zink/zink_compiler.h +++ b/src/gallium/drivers/zink/zink_compiler.h @@ -63,6 +63,8 @@ zink_compiler_assign_io(struct zink_screen *screen, nir_shader *producer, nir_sh VkShaderModule zink_shader_compile(struct zink_screen *screen, struct zink_shader *zs, nir_shader *nir, const struct zink_shader_key *key, const void *extra_data); VkShaderModule +zink_shader_compile_separate(struct zink_screen *screen, struct zink_shader *zs, nir_shader **ret_nir); +VkShaderModule zink_shader_spirv_compile(struct zink_screen *screen, struct zink_shader *zs, struct spirv_shader *spirv); struct zink_shader * zink_shader_create(struct zink_screen *screen, struct nir_shader *nir, diff --git a/src/gallium/drivers/zink/zink_descriptors.c b/src/gallium/drivers/zink/zink_descriptors.c index cc3519d65e1..68f0ef07fe0 100644 --- a/src/gallium/drivers/zink/zink_descriptors.c +++ b/src/gallium/drivers/zink/zink_descriptors.c @@ -670,6 +670,96 @@ zink_descriptor_program_init(struct zink_context *ctx, struct zink_program *pg) return true; } +void +zink_descriptor_shader_get_binding_offsets(const struct zink_shader *shader, unsigned *offsets) +{ + offsets[ZINK_DESCRIPTOR_TYPE_UBO] = 0; + offsets[ZINK_DESCRIPTOR_TYPE_SAMPLER_VIEW] = shader->bindings[ZINK_DESCRIPTOR_TYPE_UBO][shader->num_bindings[ZINK_DESCRIPTOR_TYPE_UBO] - 1].binding + 1; + offsets[ZINK_DESCRIPTOR_TYPE_SSBO] = offsets[ZINK_DESCRIPTOR_TYPE_SAMPLER_VIEW] + shader->bindings[ZINK_DESCRIPTOR_TYPE_SAMPLER_VIEW][shader->num_bindings[ZINK_DESCRIPTOR_TYPE_SAMPLER_VIEW] - 1].binding + 1; + offsets[ZINK_DESCRIPTOR_TYPE_IMAGE] = offsets[ZINK_DESCRIPTOR_TYPE_SSBO] + shader->bindings[ZINK_DESCRIPTOR_TYPE_SSBO][shader->num_bindings[ZINK_DESCRIPTOR_TYPE_SSBO] - 1].binding + 1; +} + +void +zink_descriptor_shader_init(struct zink_screen *screen, struct zink_shader *shader) +{ + VkDescriptorSetLayoutBinding bindings[ZINK_DESCRIPTOR_BASE_TYPES * ZINK_MAX_DESCRIPTORS_PER_TYPE]; + unsigned num_bindings = 0; + VkShaderStageFlagBits stage_flags = mesa_to_vk_shader_stage(shader->nir->info.stage); + + unsigned desc_set_size = shader->has_uniforms; + for (unsigned i = 0; i < ZINK_DESCRIPTOR_BASE_TYPES; i++) + desc_set_size += shader->num_bindings[i]; + if (desc_set_size) + shader->precompile.db_template = rzalloc_array(shader, struct zink_descriptor_template, desc_set_size); + + if (shader->has_uniforms) { + VkDescriptorSetLayoutBinding *binding = &bindings[num_bindings]; + binding->binding = 0; + binding->descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER; + binding->descriptorCount = 1; + binding->stageFlags = stage_flags; + binding->pImmutableSamplers = NULL; + struct zink_descriptor_template *entry = &shader->precompile.db_template[num_bindings]; + entry->count = 1; + entry->offset = offsetof(struct zink_context, di.db.ubos[shader->nir->info.stage][0]); + entry->stride = sizeof(VkDescriptorAddressInfoEXT); + entry->db_size = screen->info.db_props.robustUniformBufferDescriptorSize; + num_bindings++; + } + /* sync with zink_shader_compile_separate() */ + unsigned offsets[4]; + zink_descriptor_shader_get_binding_offsets(shader, offsets); + for (int j = 0; j < ZINK_DESCRIPTOR_BASE_TYPES; j++) { + for (int k = 0; k < shader->num_bindings[j]; k++) { + VkDescriptorSetLayoutBinding *binding = &bindings[num_bindings]; + if (j == ZINK_DESCRIPTOR_TYPE_UBO) + binding->binding = 1; + else + binding->binding = shader->bindings[j][k].binding + offsets[j]; + binding->descriptorType = shader->bindings[j][k].type; + binding->descriptorCount = shader->bindings[j][k].size; + binding->stageFlags = stage_flags; + binding->pImmutableSamplers = NULL; + + unsigned temp = 0; + init_db_template_entry(screen, shader, j, k, &shader->precompile.db_template[num_bindings], &temp); + num_bindings++; + } + } + if (num_bindings) { + shader->precompile.dsl = descriptor_layout_create(screen, 0, bindings, num_bindings); + shader->precompile.bindings = mem_dup(bindings, num_bindings * sizeof(VkDescriptorSetLayoutBinding)); + shader->precompile.num_bindings = num_bindings; + VkDeviceSize val; + VKSCR(GetDescriptorSetLayoutSizeEXT)(screen->dev, shader->precompile.dsl, &val); + shader->precompile.db_size = val; + shader->precompile.db_offset = rzalloc_array(shader, uint32_t, num_bindings); + for (unsigned i = 0; i < num_bindings; i++) { + VKSCR(GetDescriptorSetLayoutBindingOffsetEXT)(screen->dev, shader->precompile.dsl, bindings[i].binding, &val); + shader->precompile.db_offset[i] = val; + } + } + VkDescriptorSetLayout dsl[ZINK_DESCRIPTOR_ALL_TYPES] = {0}; + unsigned num_dsl = num_bindings ? 2 : 0; + if (shader->bindless) + num_dsl = screen->compact_descriptors ? ZINK_DESCRIPTOR_ALL_TYPES - ZINK_DESCRIPTOR_COMPACT : ZINK_DESCRIPTOR_ALL_TYPES; + if (num_bindings || shader->bindless) { + dsl[shader->nir->info.stage == MESA_SHADER_FRAGMENT] = shader->precompile.dsl; + if (shader->bindless) + dsl[screen->desc_set_id[ZINK_DESCRIPTOR_BINDLESS]] = screen->bindless_layout; + } + shader->precompile.layout = zink_pipeline_layout_create(screen, dsl, num_dsl, false, VK_PIPELINE_LAYOUT_CREATE_INDEPENDENT_SETS_BIT_EXT); +} + +void +zink_descriptor_shader_deinit(struct zink_screen *screen, struct zink_shader *shader) +{ + if (shader->precompile.dsl) + VKSCR(DestroyDescriptorSetLayout)(screen->dev, shader->precompile.dsl, NULL); + if (shader->precompile.layout) + VKSCR(DestroyPipelineLayout)(screen->dev, shader->precompile.layout, NULL); +} + /* called during program destroy */ void zink_descriptor_program_deinit(struct zink_screen *screen, struct zink_program *pg) @@ -946,6 +1036,71 @@ populate_sets(struct zink_context *ctx, struct zink_batch_state *bs, return true; } +static void +update_separable(struct zink_context *ctx, struct zink_program *pg) +{ + struct zink_screen *screen = zink_screen(ctx->base.screen); + struct zink_batch_state *bs = ctx->batch.state; + + unsigned use_buffer = 0; + /* find the least-written buffer to use for this */ + for (unsigned i = 0; i < ARRAY_SIZE(bs->dd.db_offset); i++) { + if (bs->dd.db_offset[i] < bs->dd.db_offset[use_buffer]) + use_buffer = i; + } + VkDescriptorGetInfoEXT info; + info.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_GET_INFO_EXT; + info.pNext = NULL; + struct zink_gfx_program *prog = (struct zink_gfx_program *)pg; + struct zink_shader *shaders[] = { + prog->shaders[MESA_SHADER_VERTEX]->precompile.num_bindings ? prog->shaders[MESA_SHADER_VERTEX] : prog->shaders[MESA_SHADER_FRAGMENT], + prog->shaders[MESA_SHADER_FRAGMENT], + }; + for (unsigned j = 0; j < pg->num_dsl; j++) { + if (!(pg->dd.binding_usage & BITFIELD_BIT(j))) + continue; + uint64_t offset = bs->dd.db_offset[use_buffer]; + assert(bs->dd.db[use_buffer]->obj->size > bs->dd.db_offset[use_buffer] + pg->dd.db_size[j]); + for (unsigned i = 0; i < shaders[j]->precompile.num_bindings; i++) { + info.type = shaders[j]->precompile.bindings[i].descriptorType; + uint64_t desc_offset = offset + pg->dd.db_offset[j][i]; + if (screen->info.db_props.combinedImageSamplerDescriptorSingleArray || + shaders[j]->precompile.bindings[i].descriptorType != VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER || + shaders[j]->precompile.bindings[i].descriptorCount == 1) { + for (unsigned k = 0; k < shaders[j]->precompile.bindings[i].descriptorCount; k++) { + /* VkDescriptorDataEXT is a union of pointers; the member doesn't matter */ + info.data.pSampler = (void*)(((uint8_t*)ctx) + pg->dd.db_template[j][i].offset + k * pg->dd.db_template[j][i].stride); + VKSCR(GetDescriptorEXT)(screen->dev, &info, pg->dd.db_template[j][i].db_size, bs->dd.db_map[use_buffer] + desc_offset + k * pg->dd.db_template[j][i].db_size); + } + } else { + assert(shaders[j]->precompile.bindings[i].descriptorType == VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER); + char buf[1024]; + uint8_t *db = bs->dd.db_map[use_buffer] + desc_offset; + uint8_t *samplers = db + shaders[j]->precompile.bindings[i].descriptorCount * screen->info.db_props.sampledImageDescriptorSize; + for (unsigned k = 0; k < shaders[j]->precompile.bindings[i].descriptorCount; k++) { + /* VkDescriptorDataEXT is a union of pointers; the member doesn't matter */ + info.data.pSampler = (void*)(((uint8_t*)ctx) + pg->dd.db_template[ZINK_DESCRIPTOR_TYPE_SAMPLER_VIEW][i].offset + + k * pg->dd.db_template[ZINK_DESCRIPTOR_TYPE_SAMPLER_VIEW][i].stride); + VKSCR(GetDescriptorEXT)(screen->dev, &info, pg->dd.db_template[j][ZINK_DESCRIPTOR_TYPE_SAMPLER_VIEW].db_size, buf); + /* drivers that don't support combinedImageSamplerDescriptorSingleArray must have sampler arrays written in memory as + * + * | array_of_samplers[] | array_of_sampled_images[] | + * + * which means each descriptor's data must be split + */ + memcpy(db, buf, screen->info.db_props.samplerDescriptorSize); + memcpy(samplers, &buf[screen->info.db_props.samplerDescriptorSize], screen->info.db_props.sampledImageDescriptorSize); + db += screen->info.db_props.sampledImageDescriptorSize; + samplers += screen->info.db_props.samplerDescriptorSize; + } + } + } + bs->dd.cur_db_offset[use_buffer] = bs->dd.db_offset[use_buffer]; + bs->dd.db_offset[use_buffer] += pg->dd.db_size[j]; + VKCTX(CmdSetDescriptorBufferOffsetsEXT)(bs->cmdbuf, VK_PIPELINE_BIND_POINT_GRAPHICS, pg->layout, j, 1, &use_buffer, &offset); + } +} + /* updates the mask of changed_sets and binds the mask of bind_sets */ static void zink_descriptors_update_masked_buffer(struct zink_context *ctx, bool is_compute, uint8_t changed_sets, uint8_t bind_sets) @@ -1092,6 +1247,17 @@ zink_descriptors_update(struct zink_context *ctx, bool is_compute) ctx->dd.push_state_changed[is_compute] = !!pg->dd.push_usage || ctx->dd.has_fbfetch != bs->dd.has_fbfetch; } + if (!is_compute) { + struct zink_gfx_program *prog = (struct zink_gfx_program*)pg; + if (prog->is_separable) { + /* force all descriptors update on next pass: separables use different layouts */ + ctx->dd.state_changed[is_compute] = BITFIELD_MASK(ZINK_DESCRIPTOR_TYPE_UNIFORMS); + ctx->dd.push_state_changed[is_compute] = true; + update_separable(ctx, pg); + return; + } + } + if (pg != bs->dd.pg[is_compute]) { /* if we don't already know that we have to update all sets, * check to see if any dsls changed diff --git a/src/gallium/drivers/zink/zink_descriptors.h b/src/gallium/drivers/zink/zink_descriptors.h index c3705b693a7..e61d24c75d8 100644 --- a/src/gallium/drivers/zink/zink_descriptors.h +++ b/src/gallium/drivers/zink/zink_descriptors.h @@ -154,8 +154,12 @@ zink_descriptors_deinit_bindless(struct zink_context *ctx); void zink_descriptors_update_bindless(struct zink_context *ctx); - - +void +zink_descriptor_shader_get_binding_offsets(const struct zink_shader *shader, unsigned *offsets); +void +zink_descriptor_shader_init(struct zink_screen *screen, struct zink_shader *shader); +void +zink_descriptor_shader_deinit(struct zink_screen *screen, struct zink_shader *shader); bool zink_descriptor_program_init(struct zink_context *ctx, struct zink_program *pg); diff --git a/src/gallium/drivers/zink/zink_pipeline.c b/src/gallium/drivers/zink/zink_pipeline.c index bea5c155fc4..6d7f716e28b 100644 --- a/src/gallium/drivers/zink/zink_pipeline.c +++ b/src/gallium/drivers/zink/zink_pipeline.c @@ -751,7 +751,10 @@ create_gfx_pipeline_library(struct zink_screen *screen, VkShaderModule *modules, pci.pStages = shader_stages; pci.stageCount = num_stages; - /* only add LTO for full pipeline libs */ + /* Only keep LTO information for full pipeline libs. For separable shaders, they will only + * ever be used with fast linking, and to optimize them a new pipeline lib will be created with full + * link time information for the full set of shader stages (rather than linking in these single-stage libs). + */ if (num_stages > 1) pci.flags |= VK_PIPELINE_CREATE_RETAIN_LINK_TIME_OPTIMIZATION_INFO_BIT_EXT; @@ -770,6 +773,12 @@ zink_create_gfx_pipeline_library(struct zink_screen *screen, struct zink_gfx_pro return create_gfx_pipeline_library(screen, prog->modules, prog->base.layout, prog->base.pipeline_cache); } +VkPipeline +zink_create_gfx_pipeline_separate(struct zink_screen *screen, VkShaderModule *modules, VkPipelineLayout layout) +{ + return create_gfx_pipeline_library(screen, modules, layout, VK_NULL_HANDLE); +} + VkPipeline zink_create_gfx_pipeline_combined(struct zink_screen *screen, struct zink_gfx_program *prog, VkPipeline input, VkPipeline *library, unsigned libcount, VkPipeline output, bool optimized) { diff --git a/src/gallium/drivers/zink/zink_pipeline.h b/src/gallium/drivers/zink/zink_pipeline.h index 11f86ac9fae..c6d5001c074 100644 --- a/src/gallium/drivers/zink/zink_pipeline.h +++ b/src/gallium/drivers/zink/zink_pipeline.h @@ -62,6 +62,8 @@ VkPipeline zink_create_gfx_pipeline_output(struct zink_screen *screen, struct zink_gfx_pipeline_state *state); VkPipeline zink_create_gfx_pipeline_combined(struct zink_screen *screen, struct zink_gfx_program *prog, VkPipeline input, VkPipeline *library, unsigned libcount, VkPipeline output, bool optimized); +VkPipeline +zink_create_gfx_pipeline_separate(struct zink_screen *screen, VkShaderModule *modules, VkPipelineLayout layout); #ifdef __cplusplus } #endif diff --git a/src/gallium/drivers/zink/zink_program.c b/src/gallium/drivers/zink/zink_program.c index 02ec650145b..9a255255ec5 100644 --- a/src/gallium/drivers/zink/zink_program.c +++ b/src/gallium/drivers/zink/zink_program.c @@ -44,6 +44,11 @@ #define XXH_INLINE_ALL #include "util/xxhash.h" +static void +precompile_job(void *data, void *gdata, int thread_index); +struct zink_gfx_program * +create_gfx_program_separable(struct zink_context *ctx, struct zink_shader **stages, unsigned vertices_per_patch); + void debug_describe_zink_gfx_program(char *buf, const struct zink_gfx_program *ptr) { @@ -645,6 +650,7 @@ update_gfx_program_optimal(struct zink_context *ctx, struct zink_gfx_program *pr { const union zink_shader_key_optimal *optimal_key = (union zink_shader_key_optimal*)&prog->last_variant_hash; if (ctx->gfx_pipeline_state.shader_keys_optimal.key.vs_bits != optimal_key->vs_bits) { + assert(!prog->is_separable); bool changed = update_gfx_shader_module_optimal(ctx, prog, ctx->last_vertex_stage->nir->info.stage); ctx->gfx_pipeline_state.modules_changed |= changed; } @@ -652,6 +658,7 @@ update_gfx_program_optimal(struct zink_context *ctx, struct zink_gfx_program *pr if (ctx->gfx_pipeline_state.shader_keys_optimal.key.fs_bits != optimal_key->fs_bits || /* always recheck shadow swizzles since they aren't directly part of the key */ unlikely(shadow_needs_shader_swizzle)) { + assert(!prog->is_separable); bool changed = update_gfx_shader_module_optimal(ctx, prog, MESA_SHADER_FRAGMENT); ctx->gfx_pipeline_state.modules_changed |= changed; if (unlikely(shadow_needs_shader_swizzle)) { @@ -661,6 +668,7 @@ update_gfx_program_optimal(struct zink_context *ctx, struct zink_gfx_program *pr } if (prog->shaders[MESA_SHADER_TESS_CTRL] && prog->shaders[MESA_SHADER_TESS_CTRL]->non_fs.is_generated && ctx->gfx_pipeline_state.shader_keys_optimal.key.tcs_bits != optimal_key->tcs_bits) { + assert(!prog->is_separable); bool changed = update_gfx_shader_module_optimal(ctx, prog, MESA_SHADER_TESS_CTRL); ctx->gfx_pipeline_state.modules_changed |= changed; } @@ -682,13 +690,28 @@ zink_gfx_program_update_optimal(struct zink_context *ctx) ctx->gfx_pipeline_state.final_hash ^= ctx->curr_program->last_variant_hash; if (entry) { prog = (struct zink_gfx_program*)entry->data; + if (prog->is_separable) { + /* shader variants can't be handled by separable programs: sync and compile */ + if (!ZINK_SHADER_KEY_OPTIMAL_IS_DEFAULT(ctx->gfx_pipeline_state.optimal_key)) + util_queue_fence_wait(&prog->base.cache_fence); + /* If the optimized linked pipeline is done compiling, swap it into place. */ + if (util_queue_fence_is_signalled(&prog->base.cache_fence)) { + struct zink_gfx_program *real = prog->full_prog; + entry->data = real; + prog->full_prog = NULL; + prog->base.removed = true; + zink_gfx_program_reference(zink_screen(ctx->base.screen), &prog, NULL); + prog = real; + } + } update_gfx_program_optimal(ctx, prog); } else { ctx->dirty_gfx_stages |= ctx->shader_stages; - prog = zink_create_gfx_program(ctx, ctx->gfx_stages, ctx->gfx_pipeline_state.dyn_state2.vertices_per_patch); + prog = create_gfx_program_separable(ctx, ctx->gfx_stages, ctx->gfx_pipeline_state.dyn_state2.vertices_per_patch); zink_screen_get_pipeline_cache(zink_screen(ctx->base.screen), &prog->base, false); _mesa_hash_table_insert_pre_hashed(ht, hash, prog->shaders, prog); - generate_gfx_program_modules_optimal(ctx, zink_screen(ctx->base.screen), prog, &ctx->gfx_pipeline_state); + if (!prog->is_separable) + generate_gfx_program_modules_optimal(ctx, zink_screen(ctx->base.screen), prog, &ctx->gfx_pipeline_state); } simple_mtx_unlock(&ctx->program_lock[zink_program_cache_stages(ctx->shader_stages)]); if (prog && prog != ctx->curr_program) @@ -699,6 +722,24 @@ zink_gfx_program_update_optimal(struct zink_context *ctx) /* remove old hash */ ctx->gfx_pipeline_state.optimal_key = ctx->gfx_pipeline_state.shader_keys_optimal.key.val; ctx->gfx_pipeline_state.final_hash ^= ctx->curr_program->last_variant_hash; + if (ctx->curr_program->is_separable) { + struct zink_gfx_program *prog = ctx->curr_program; + if (prog->is_separable && !ZINK_SHADER_KEY_OPTIMAL_IS_DEFAULT(ctx->gfx_pipeline_state.optimal_key)) { + util_queue_fence_wait(&prog->base.cache_fence); + /* shader variants can't be handled by separable programs: sync and compile */ + struct hash_table *ht = &ctx->program_cache[zink_program_cache_stages(ctx->shader_stages)]; + const uint32_t hash = ctx->gfx_hash; + simple_mtx_lock(&ctx->program_lock[zink_program_cache_stages(ctx->shader_stages)]); + struct hash_entry *entry = _mesa_hash_table_search_pre_hashed(ht, hash, ctx->gfx_stages); + struct zink_gfx_program *real = prog->full_prog; + entry->data = real; + prog->full_prog = NULL; + prog->base.removed = true; + zink_gfx_program_reference(zink_screen(ctx->base.screen), &prog, NULL); + ctx->curr_program = real; + simple_mtx_unlock(&ctx->program_lock[zink_program_cache_stages(ctx->shader_stages)]); + } + } update_gfx_program_optimal(ctx, ctx->curr_program); /* apply new hash */ ctx->gfx_pipeline_state.final_hash ^= ctx->curr_program->last_variant_hash; @@ -969,6 +1010,112 @@ fail: return NULL; } +/* Creates a replacement, optimized zink_gfx_program for this set of separate shaders, which will + * be swapped in in place of the fast-linked separable program once it's done compiling. + */ +static void +create_linked_separable_job(void *data, void *gdata, int thread_index) +{ + struct zink_gfx_program *prog = data; + prog->full_prog = zink_create_gfx_program(prog->ctx, prog->shaders, 0); + precompile_job(prog->full_prog, gdata, thread_index); +} + +struct zink_gfx_program * +create_gfx_program_separable(struct zink_context *ctx, struct zink_shader **stages, unsigned vertices_per_patch) +{ + struct zink_screen *screen = zink_screen(ctx->base.screen); + unsigned shader_stages = BITFIELD_BIT(MESA_SHADER_VERTEX) | BITFIELD_BIT(MESA_SHADER_FRAGMENT); + /* filter cases that need real pipelines */ + if (ctx->shader_stages != shader_stages || + !stages[MESA_SHADER_VERTEX]->precompile.mod || !stages[MESA_SHADER_FRAGMENT]->precompile.mod || + /* TODO: maybe try variants? grimace */ + !ZINK_SHADER_KEY_OPTIMAL_IS_DEFAULT(ctx->gfx_pipeline_state.optimal_key) || + !zink_can_use_pipeline_libs(ctx)) + return zink_create_gfx_program(ctx, stages, vertices_per_patch); + /* ensure async gpl creation is done */ + util_queue_fence_wait(&stages[MESA_SHADER_VERTEX]->precompile.fence); + util_queue_fence_wait(&stages[MESA_SHADER_FRAGMENT]->precompile.fence); + + struct zink_gfx_program *prog = create_program(ctx, false); + if (!prog) + goto fail; + + prog->ctx = ctx; + prog->is_separable = true; + + prog->shaders[MESA_SHADER_VERTEX] = stages[MESA_SHADER_VERTEX]; + prog->stages_remaining = prog->stages_present = shader_stages; + prog->shaders[MESA_SHADER_FRAGMENT] = stages[MESA_SHADER_FRAGMENT]; + prog->last_vertex_stage = stages[MESA_SHADER_VERTEX]; + _mesa_set_init(&prog->libs, prog, hash_pipeline_lib, equals_pipeline_lib); + + unsigned refs = 0; + for (int i = 0; i < ZINK_GFX_SHADER_COUNT; ++i) { + if (prog->shaders[i]) { + simple_mtx_lock(&prog->shaders[i]->lock); + _mesa_set_add(prog->shaders[i]->programs, prog); + simple_mtx_unlock(&prog->shaders[i]->lock); + refs++; + } + } + /* We can do this add after the _mesa_set_adds above because we know the prog->shaders[] are + * referenced by the draw state and zink_shader_free() can't be called on them while we're in here. + */ + p_atomic_add(&prog->base.reference.count, refs); + + for (int r = 0; r < ARRAY_SIZE(prog->pipelines); ++r) { + for (int i = 0; i < ARRAY_SIZE(prog->pipelines[0]); ++i) { + _mesa_hash_table_init(&prog->pipelines[r][i], prog, NULL, zink_get_gfx_pipeline_eq_func(screen, prog)); + /* only need first 3/4 for point/line/tri/patch */ + if (screen->info.have_EXT_extended_dynamic_state && + i == (prog->last_vertex_stage->nir->info.stage == MESA_SHADER_TESS_EVAL ? 4 : 3)) + break; + } + } + + if (prog->shaders[MESA_SHADER_VERTEX]->precompile.dsl) { + prog->base.dd.binding_usage |= BITFIELD_BIT(0); + prog->base.dd.db_template[prog->base.num_dsl] = prog->shaders[MESA_SHADER_VERTEX]->precompile.db_template; + prog->base.dd.db_size[prog->base.num_dsl] = prog->shaders[MESA_SHADER_VERTEX]->precompile.db_size; + prog->base.dd.db_offset[prog->base.num_dsl] = prog->shaders[MESA_SHADER_VERTEX]->precompile.db_offset; + prog->base.dsl[prog->base.num_dsl] = prog->shaders[MESA_SHADER_VERTEX]->precompile.dsl; + prog->base.num_dsl++; + } + if (prog->shaders[MESA_SHADER_FRAGMENT]->precompile.dsl) { + prog->base.dd.binding_usage |= BITFIELD_BIT(1); + prog->base.dd.db_template[prog->base.num_dsl] = prog->shaders[MESA_SHADER_FRAGMENT]->precompile.db_template; + prog->base.dd.db_size[prog->base.num_dsl] = prog->shaders[MESA_SHADER_FRAGMENT]->precompile.db_size; + prog->base.dd.db_offset[prog->base.num_dsl] = prog->shaders[MESA_SHADER_FRAGMENT]->precompile.db_offset; + prog->base.dsl[prog->base.num_dsl] = prog->shaders[MESA_SHADER_FRAGMENT]->precompile.dsl; + /* guarantee a null dsl if vs doesn't have descriptors */ + prog->base.num_dsl = 2; + } + prog->base.dd.bindless = prog->shaders[MESA_SHADER_VERTEX]->bindless | prog->shaders[MESA_SHADER_FRAGMENT]->bindless; + if (prog->base.dd.bindless) { + prog->base.num_dsl = screen->compact_descriptors ? ZINK_DESCRIPTOR_ALL_TYPES - ZINK_DESCRIPTOR_COMPACT : ZINK_DESCRIPTOR_ALL_TYPES; + prog->base.dsl[screen->desc_set_id[ZINK_DESCRIPTOR_BINDLESS]] = screen->bindless_layout; + } + prog->base.layout = zink_pipeline_layout_create(screen, prog->base.dsl, prog->base.num_dsl, false, VK_PIPELINE_LAYOUT_CREATE_INDEPENDENT_SETS_BIT_EXT); + + VkPipeline libs[] = {stages[MESA_SHADER_VERTEX]->precompile.gpl, stages[MESA_SHADER_FRAGMENT]->precompile.gpl}; + prog->last_variant_hash = ctx->gfx_pipeline_state.optimal_key; + + struct zink_gfx_library_key *gkey = rzalloc(prog, struct zink_gfx_library_key); + gkey->optimal_key = prog->last_variant_hash; + assert(gkey->optimal_key); + gkey->pipeline = zink_create_gfx_pipeline_combined(screen, prog, VK_NULL_HANDLE, libs, 2, VK_NULL_HANDLE, false); + _mesa_set_add(&prog->libs, gkey); + + util_queue_add_job(&screen->cache_get_thread, prog, &prog->base.cache_fence, create_linked_separable_job, NULL, 0); + + return prog; +fail: + if (prog) + zink_destroy_gfx_program(screen, prog); + return NULL; +} + static uint32_t hash_compute_pipeline_state_local_size(const void *key) { @@ -1203,6 +1350,8 @@ zink_destroy_gfx_program(struct zink_screen *screen, max_idx++; } + if (prog->is_separable) + zink_gfx_program_reference(screen, &prog->full_prog, NULL); for (unsigned r = 0; r < ARRAY_SIZE(prog->pipelines); r++) { for (int i = 0; i < max_idx; ++i) { hash_table_foreach(&prog->pipelines[r][i], entry) { @@ -1223,11 +1372,13 @@ zink_destroy_gfx_program(struct zink_screen *screen, _mesa_set_remove_key(prog->shaders[i]->programs, prog); prog->shaders[i] = NULL; } - destroy_shader_cache(screen, &prog->shader_cache[i][0][0]); - destroy_shader_cache(screen, &prog->shader_cache[i][0][1]); - destroy_shader_cache(screen, &prog->shader_cache[i][1][0]); - destroy_shader_cache(screen, &prog->shader_cache[i][1][1]); - ralloc_free(prog->nir[i]); + if (!prog->is_separable) { + destroy_shader_cache(screen, &prog->shader_cache[i][0][0]); + destroy_shader_cache(screen, &prog->shader_cache[i][0][1]); + destroy_shader_cache(screen, &prog->shader_cache[i][1][0]); + destroy_shader_cache(screen, &prog->shader_cache[i][1][1]); + ralloc_free(prog->nir[i]); + } } set_foreach_remove(&prog->libs, he) { @@ -1761,6 +1912,20 @@ precompile_job(void *data, void *gdata, int thread_index) zink_screen_update_pipeline_cache(screen, &prog->base, true); } +static void +precompile_separate_shader_job(void *data, void *gdata, int thread_index) +{ + struct zink_screen *screen = gdata; + struct zink_shader *zs = data; + + nir_shader *nir; + zs->precompile.mod = zink_shader_compile_separate(screen, zs, &nir); + zink_descriptor_shader_init(screen, zs); + VkShaderModule mods[ZINK_GFX_SHADER_COUNT] = {0}; + mods[nir->info.stage] = zs->precompile.mod; + zs->precompile.gpl = zink_create_gfx_pipeline_separate(screen, mods, zs->precompile.layout); +} + static void zink_link_gfx_shader(struct pipe_context *pctx, void **shaders) { @@ -1769,8 +1934,17 @@ zink_link_gfx_shader(struct pipe_context *pctx, void **shaders) if (shaders[MESA_SHADER_COMPUTE]) return; /* can't precompile fixedfunc */ - if (!shaders[MESA_SHADER_VERTEX] || !shaders[MESA_SHADER_FRAGMENT]) + if (!shaders[MESA_SHADER_VERTEX] || !shaders[MESA_SHADER_FRAGMENT]) { + if (shaders[MESA_SHADER_VERTEX] || shaders[MESA_SHADER_FRAGMENT]) { + struct zink_shader *zs = shaders[MESA_SHADER_VERTEX] ? shaders[MESA_SHADER_VERTEX] : shaders[MESA_SHADER_FRAGMENT]; + if (zs->nir->info.separate_shader && !zs->precompile.mod && util_queue_fence_is_signalled(&zs->precompile.fence) && + zink_descriptor_mode == ZINK_DESCRIPTOR_MODE_DB && + /* sample shading can't precompile */ + (!shaders[MESA_SHADER_FRAGMENT] || !zs->nir->info.fs.uses_sample_shading)) + util_queue_add_job(&zink_screen(pctx->screen)->cache_get_thread, zs, &zs->precompile.fence, precompile_separate_shader_job, NULL, 0); + } return; + } unsigned hash = 0; unsigned shader_stages = 0; for (unsigned i = 0; i < ZINK_GFX_SHADER_COUNT; i++) { diff --git a/src/gallium/drivers/zink/zink_program_state.hpp b/src/gallium/drivers/zink/zink_program_state.hpp index dba466455cc..45550cee83a 100644 --- a/src/gallium/drivers/zink/zink_program_state.hpp +++ b/src/gallium/drivers/zink/zink_program_state.hpp @@ -190,10 +190,12 @@ zink_get_gfx_pipeline(struct zink_context *ctx, /* this is the graphics pipeline library path: find/construct all partial pipelines */ struct set_entry *he = _mesa_set_search(&prog->libs, &ctx->gfx_pipeline_state.optimal_key); struct zink_gfx_library_key *gkey; - if (he) + if (he) { gkey = (struct zink_gfx_library_key *)he->key; - else + } else { + assert(!prog->is_separable); gkey = zink_create_pipeline_lib(screen, prog, &ctx->gfx_pipeline_state); + } struct zink_gfx_input_key *ikey = DYNAMIC_STATE == ZINK_DYNAMIC_VERTEX_INPUT ? zink_find_or_create_input_dynamic(ctx, vkmode) : zink_find_or_create_input(ctx, vkmode); @@ -215,7 +217,7 @@ zink_get_gfx_pipeline(struct zink_context *ctx, zink_screen_update_pipeline_cache(screen, &prog->base, false); pc_entry->pipeline = pipeline; - if (HAVE_LIB) + if (HAVE_LIB && !prog->is_separable) /* trigger async optimized pipeline compile if this was the fast-linked unoptimized pipeline */ zink_gfx_program_compile_queue(ctx, pc_entry); } diff --git a/src/gallium/drivers/zink/zink_screen.c b/src/gallium/drivers/zink/zink_screen.c index 144fe4617fd..c5fd4377bed 100644 --- a/src/gallium/drivers/zink/zink_screen.c +++ b/src/gallium/drivers/zink/zink_screen.c @@ -188,6 +188,8 @@ zink_is_parallel_shader_compilation_finished(struct pipe_screen *screen, void *s } struct zink_shader *zs = shader; + if (!util_queue_fence_is_signalled(&zs->precompile.fence)) + return false; bool finished = true; set_foreach(zs->programs, entry) { struct zink_gfx_program *prog = (void*)entry->key; diff --git a/src/gallium/drivers/zink/zink_types.h b/src/gallium/drivers/zink/zink_types.h index b81a1d0e2d0..806429d28c2 100644 --- a/src/gallium/drivers/zink/zink_types.h +++ b/src/gallium/drivers/zink/zink_types.h @@ -732,6 +732,19 @@ struct zink_shader { bool has_uniforms; struct spirv_shader *spirv; + struct { + struct util_queue_fence fence; + VkShaderModule mod; + VkDescriptorSetLayout dsl; + VkPipelineLayout layout; + VkPipeline gpl; + VkDescriptorSetLayoutBinding *bindings; + unsigned num_bindings; + struct zink_descriptor_template *db_template; + unsigned db_size; + unsigned *db_offset; + } precompile; + simple_mtx_t lock; struct set *programs; @@ -973,26 +986,30 @@ struct zink_gfx_pipeline_cache_entry { struct zink_gfx_program { struct zink_program base; + bool is_separable; //not a full program struct zink_context *ctx; //the owner context uint32_t stages_present; //mask of stages present in this program uint32_t stages_remaining; //mask of zink_shader remaining in this program - struct nir_shader *nir[ZINK_GFX_SHADER_COUNT]; - - VkShaderModule modules[ZINK_GFX_SHADER_COUNT]; // compute stage doesn't belong here - uint32_t module_hash[ZINK_GFX_SHADER_COUNT]; - - struct zink_shader *last_vertex_stage; - - struct util_dynarray shader_cache[ZINK_GFX_SHADER_COUNT][2][2]; //normal, nonseamless cubes, inline uniforms - unsigned inlined_variant_count[ZINK_GFX_SHADER_COUNT]; struct zink_shader *shaders[ZINK_GFX_SHADER_COUNT]; - struct hash_table pipelines[2][11]; // [dynamic, renderpass][number of draw modes we support] + struct zink_shader *last_vertex_stage; + + /* full */ + VkShaderModule modules[ZINK_GFX_SHADER_COUNT]; // compute stage doesn't belong here + uint32_t module_hash[ZINK_GFX_SHADER_COUNT]; + struct nir_shader *nir[ZINK_GFX_SHADER_COUNT]; + struct util_dynarray shader_cache[ZINK_GFX_SHADER_COUNT][2][2]; //normal, nonseamless cubes, inline uniforms + unsigned inlined_variant_count[ZINK_GFX_SHADER_COUNT]; uint32_t default_variant_hash; - uint32_t last_variant_hash; uint8_t inline_variants; //which stages are using inlined uniforms + /* separable */ + struct zink_gfx_program *full_prog; + + struct hash_table pipelines[2][11]; // [dynamic, renderpass][number of draw modes we support] + uint32_t last_variant_hash; + uint32_t last_finalized_hash[2][4]; //[dynamic, renderpass][primtype idx] VkPipeline last_pipeline[2][4]; //[dynamic, renderpass][primtype idx]