zink: use GPL to handle (simple) separate shader objects

apps/games using separate shader objects end up passing the separable
shaders to the link_shader hook individually, which is still not ideal
for zink since the more optimal path is to have all the shaders at once
and create a RAST+FS GPL stage that can run all the inter-stage io
handlers
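
for reference, this is the app-side pattern being handled: a hedged
sketch of ARB_separate_shader_objects usage, where vs_source, fs_source,
and other_fs are assumed to exist:

   /* each stage is its own GL program; any VS can later be paired with
    * any FS without a full GL link, so the driver never sees the final
    * combination ahead of time
    */
   GLuint vs = glCreateShaderProgramv(GL_VERTEX_SHADER, 1, &vs_source);
   GLuint fs = glCreateShaderProgramv(GL_FRAGMENT_SHADER, 1, &fs_source);
   GLuint pipe;
   glGenProgramPipelines(1, &pipe);
   glUseProgramStages(pipe, GL_VERTEX_SHADER_BIT, vs);
   glUseProgramStages(pipe, GL_FRAGMENT_SHADER_BIT, fs);
   glBindProgramPipeline(pipe);
   /* later: swap in a different fs without relinking vs */
   glUseProgramStages(pipe, GL_FRAGMENT_SHADER_BIT, other_fs);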

it IS technically possible to handle this for simple VS+FS pipelines
using GPL, but it's kinda gross. such shaders now use descriptor buffer
to create their own pipelines/layouts/descriptors async, and then a "separable"
variant of the gfx program can be created by fast-linking these together
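
"fast-linking" here means combining the precompiled per-stage libraries
without VK_PIPELINE_CREATE_RETAIN_LINK_TIME_OPTIMIZATION_INFO_BIT_EXT /
link-time optimization, roughly like this minimal sketch (illustrative
only; the real call is zink_create_gfx_pipeline_combined(), which also
links the vertex input and fragment output libraries):

   static VkPipeline
   fast_link_separable(VkDevice dev, VkPipelineLayout layout,
                       VkPipeline vs_lib, VkPipeline fs_lib)
   {
      VkPipeline libs[] = { vs_lib, fs_lib };
      VkPipelineLibraryCreateInfoKHR link = {
         .sType = VK_STRUCTURE_TYPE_PIPELINE_LIBRARY_CREATE_INFO_KHR,
         .libraryCount = 2,
         .pLibraries = libs,
      };
      VkGraphicsPipelineCreateInfo pci = {
         .sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO,
         .pNext = &link,
         /* no LINK_TIME_OPTIMIZATION flag: near-instant link, slower shaders */
         .layout = layout,
      };
      VkPipeline pipeline = VK_NULL_HANDLE;
      vkCreateGraphicsPipelines(dev, VK_NULL_HANDLE, 1, &pci, NULL, &pipeline);
      return pipeline;
   }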

the "separable" gfx program can't handle shader variants, but it can do basic
pipeline caching for PSO state changes, which makes it flexible enough to sorta
kinda maybe handle the most basic cases of separate shader objects
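
the caching in question is just a per-program hash table keyed on the
PSO state, conceptually like this hypothetical sketch (the real lookup
lives in zink_get_gfx_pipeline(); assumes 64-bit non-dispatchable
handles so a VkPipeline fits in the table's value pointer):

   static VkPipeline
   separable_get_pipeline(struct hash_table *cache, void *state_key,
                          uint32_t state_hash, VkDevice dev,
                          VkPipelineLayout layout,
                          VkPipeline vs_lib, VkPipeline fs_lib)
   {
      struct hash_entry *he = _mesa_hash_table_search_pre_hashed(cache, state_hash, state_key);
      if (he)
         return (VkPipeline)he->data;
      /* miss: fast-link a new pipeline for this state and cache it */
      VkPipeline pipeline = fast_link_separable(dev, layout, vs_lib, fs_lib);
      _mesa_hash_table_insert_pre_hashed(cache, state_hash, state_key, pipeline);
      return pipeline;
   }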

descriptor buffer is used because having to create and manage a separate architecture
for sets/pools/templates is too nightmarish even for me
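
with EXT_descriptor_buffer, a descriptor is just bytes written into a
host-mapped buffer, so each shader can own its layout/offsets without
any pool machinery. in outline (illustrative; zink reaches these
entrypoints through the VKSCR()/VKCTX() dispatch macros seen in the
diff below):

   static void
   write_one_descriptor(VkDevice dev, VkDescriptorSetLayout dsl,
                        const VkDescriptorGetInfoEXT *info, size_t descriptor_size,
                        uint8_t *mapped_db, VkDeviceSize db_offset,
                        PFN_vkGetDescriptorSetLayoutBindingOffsetEXT get_binding_offset,
                        PFN_vkGetDescriptorEXT get_descriptor)
   {
      VkDeviceSize binding_offset;
      /* where binding 0 of this layout lives inside the descriptor buffer */
      get_binding_offset(dev, dsl, 0, &binding_offset);
      /* write the descriptor's bytes straight into the mapped buffer */
      get_descriptor(dev, info, descriptor_size, mapped_db + db_offset + binding_offset);
      /* binding at draw time is then just a buffer index + offset via
       * vkCmdSetDescriptorBufferOffsetsEXT()
       */
   }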

this is, at best, a partial solution, but it's the best the vulkan api can
currently do

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/21197>
Mike Blumenkrantz 2023-02-07 13:32:21 -05:00 committed by Marge Bot
parent 60b26a6b1f
commit e3b746e3a3
10 changed files with 447 additions and 25 deletions


@@ -24,6 +24,7 @@
 #include "nir_opcodes.h"
 #include "zink_context.h"
 #include "zink_compiler.h"
+#include "zink_descriptors.h"
 #include "zink_program.h"
 #include "zink_screen.h"
 #include "nir_to_spirv/nir_to_spirv.h"
@@ -3205,6 +3206,39 @@ zink_shader_compile(struct zink_screen *screen, struct zink_shader *zs,
    return mod;
 }
 
+VkShaderModule
+zink_shader_compile_separate(struct zink_screen *screen, struct zink_shader *zs, nir_shader **ret_nir)
+{
+   nir_shader *nir = nir_shader_clone(NULL, zs->nir);
+   int set = nir->info.stage == MESA_SHADER_FRAGMENT;
+   unsigned offsets[4];
+   zink_descriptor_shader_get_binding_offsets(zs, offsets);
+   nir_foreach_variable_with_modes(var, nir, nir_var_mem_ubo | nir_var_mem_ssbo | nir_var_uniform | nir_var_image) {
+      if (var->data.bindless)
+         continue;
+      var->data.descriptor_set = set;
+      switch (var->data.mode) {
+      case nir_var_mem_ubo:
+         var->data.binding = !!var->data.driver_location;
+         break;
+      case nir_var_uniform:
+         if (glsl_type_is_sampler(glsl_without_array(var->type)))
+            var->data.binding += offsets[1];
+         break;
+      case nir_var_mem_ssbo:
+         var->data.binding += offsets[2];
+         break;
+      case nir_var_image:
+         var->data.binding += offsets[3];
+         break;
+      default: break;
+      }
+   }
+   optimize_nir(nir, zs);
+   *ret_nir = nir;
+   return compile_module(screen, zs, nir);
+}
+
 static bool
 lower_baseinstance_instr(nir_builder *b, nir_instr *instr, void *data)
 {
@@ -4196,6 +4230,7 @@ zink_shader_create(struct zink_screen *screen, struct nir_shader *nir,
    ret->sinfo.have_vulkan_memory_model = screen->info.have_KHR_vulkan_memory_model;
 
+   util_queue_fence_init(&ret->precompile.fence);
    ret->hash = _mesa_hash_pointer(ret);
    ret->programs = _mesa_pointer_set_create(NULL);
@@ -4490,8 +4525,16 @@ zink_shader_free(struct zink_screen *screen, struct zink_shader *shader)
       shader->non_fs.generated_gs = NULL;
    }
    _mesa_set_destroy(shader->programs, NULL);
+   util_queue_fence_wait(&shader->precompile.fence);
+   util_queue_fence_destroy(&shader->precompile.fence);
+   zink_descriptor_shader_deinit(screen, shader);
+   if (shader->precompile.mod)
+      VKSCR(DestroyShaderModule)(screen->dev, shader->precompile.mod, NULL);
+   if (shader->precompile.gpl)
+      VKSCR(DestroyPipeline)(screen->dev, shader->precompile.gpl, NULL);
    ralloc_free(shader->nir);
    ralloc_free(shader->spirv);
+   free(shader->precompile.bindings);
    ralloc_free(shader);
 }
@@ -4530,6 +4573,7 @@ struct zink_shader *
 zink_shader_tcs_create(struct zink_screen *screen, struct zink_shader *vs, unsigned vertices_per_patch)
 {
    struct zink_shader *ret = rzalloc(NULL, struct zink_shader);
+   util_queue_fence_init(&ret->precompile.fence);
    ret->hash = _mesa_hash_pointer(ret);
    ret->programs = _mesa_pointer_set_create(NULL);
    simple_mtx_init(&ret->lock, mtx_plain);


@@ -63,6 +63,8 @@ zink_compiler_assign_io(struct zink_screen *screen, nir_shader *producer, nir_sh
 VkShaderModule
 zink_shader_compile(struct zink_screen *screen, struct zink_shader *zs, nir_shader *nir, const struct zink_shader_key *key, const void *extra_data);
 VkShaderModule
+zink_shader_compile_separate(struct zink_screen *screen, struct zink_shader *zs, nir_shader **ret_nir);
+VkShaderModule
 zink_shader_spirv_compile(struct zink_screen *screen, struct zink_shader *zs, struct spirv_shader *spirv);
 struct zink_shader *
 zink_shader_create(struct zink_screen *screen, struct nir_shader *nir,


@@ -670,6 +670,96 @@ zink_descriptor_program_init(struct zink_context *ctx, struct zink_program *pg)
    return true;
 }
 
+void
+zink_descriptor_shader_get_binding_offsets(const struct zink_shader *shader, unsigned *offsets)
+{
+   offsets[ZINK_DESCRIPTOR_TYPE_UBO] = 0;
+   offsets[ZINK_DESCRIPTOR_TYPE_SAMPLER_VIEW] = shader->bindings[ZINK_DESCRIPTOR_TYPE_UBO][shader->num_bindings[ZINK_DESCRIPTOR_TYPE_UBO] - 1].binding + 1;
+   offsets[ZINK_DESCRIPTOR_TYPE_SSBO] = offsets[ZINK_DESCRIPTOR_TYPE_SAMPLER_VIEW] + shader->bindings[ZINK_DESCRIPTOR_TYPE_SAMPLER_VIEW][shader->num_bindings[ZINK_DESCRIPTOR_TYPE_SAMPLER_VIEW] - 1].binding + 1;
+   offsets[ZINK_DESCRIPTOR_TYPE_IMAGE] = offsets[ZINK_DESCRIPTOR_TYPE_SSBO] + shader->bindings[ZINK_DESCRIPTOR_TYPE_SSBO][shader->num_bindings[ZINK_DESCRIPTOR_TYPE_SSBO] - 1].binding + 1;
+}
+
+void
+zink_descriptor_shader_init(struct zink_screen *screen, struct zink_shader *shader)
+{
+   VkDescriptorSetLayoutBinding bindings[ZINK_DESCRIPTOR_BASE_TYPES * ZINK_MAX_DESCRIPTORS_PER_TYPE];
+   unsigned num_bindings = 0;
+   VkShaderStageFlagBits stage_flags = mesa_to_vk_shader_stage(shader->nir->info.stage);
+
+   unsigned desc_set_size = shader->has_uniforms;
+   for (unsigned i = 0; i < ZINK_DESCRIPTOR_BASE_TYPES; i++)
+      desc_set_size += shader->num_bindings[i];
+   if (desc_set_size)
+      shader->precompile.db_template = rzalloc_array(shader, struct zink_descriptor_template, desc_set_size);
+
+   if (shader->has_uniforms) {
+      VkDescriptorSetLayoutBinding *binding = &bindings[num_bindings];
+      binding->binding = 0;
+      binding->descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER;
+      binding->descriptorCount = 1;
+      binding->stageFlags = stage_flags;
+      binding->pImmutableSamplers = NULL;
+      struct zink_descriptor_template *entry = &shader->precompile.db_template[num_bindings];
+      entry->count = 1;
+      entry->offset = offsetof(struct zink_context, di.db.ubos[shader->nir->info.stage][0]);
+      entry->stride = sizeof(VkDescriptorAddressInfoEXT);
+      entry->db_size = screen->info.db_props.robustUniformBufferDescriptorSize;
+      num_bindings++;
+   }
+
+   /* sync with zink_shader_compile_separate() */
+   unsigned offsets[4];
+   zink_descriptor_shader_get_binding_offsets(shader, offsets);
+
+   for (int j = 0; j < ZINK_DESCRIPTOR_BASE_TYPES; j++) {
+      for (int k = 0; k < shader->num_bindings[j]; k++) {
+         VkDescriptorSetLayoutBinding *binding = &bindings[num_bindings];
+         if (j == ZINK_DESCRIPTOR_TYPE_UBO)
+            binding->binding = 1;
+         else
+            binding->binding = shader->bindings[j][k].binding + offsets[j];
+         binding->descriptorType = shader->bindings[j][k].type;
+         binding->descriptorCount = shader->bindings[j][k].size;
+         binding->stageFlags = stage_flags;
+         binding->pImmutableSamplers = NULL;
+         unsigned temp = 0;
+         init_db_template_entry(screen, shader, j, k, &shader->precompile.db_template[num_bindings], &temp);
+         num_bindings++;
+      }
+   }
+
+   if (num_bindings) {
+      shader->precompile.dsl = descriptor_layout_create(screen, 0, bindings, num_bindings);
+      shader->precompile.bindings = mem_dup(bindings, num_bindings * sizeof(VkDescriptorSetLayoutBinding));
+      shader->precompile.num_bindings = num_bindings;
+      VkDeviceSize val;
+      VKSCR(GetDescriptorSetLayoutSizeEXT)(screen->dev, shader->precompile.dsl, &val);
+      shader->precompile.db_size = val;
+      shader->precompile.db_offset = rzalloc_array(shader, uint32_t, num_bindings);
+      for (unsigned i = 0; i < num_bindings; i++) {
+         VKSCR(GetDescriptorSetLayoutBindingOffsetEXT)(screen->dev, shader->precompile.dsl, bindings[i].binding, &val);
+         shader->precompile.db_offset[i] = val;
+      }
+   }
+
+   VkDescriptorSetLayout dsl[ZINK_DESCRIPTOR_ALL_TYPES] = {0};
+   unsigned num_dsl = num_bindings ? 2 : 0;
+   if (shader->bindless)
+      num_dsl = screen->compact_descriptors ? ZINK_DESCRIPTOR_ALL_TYPES - ZINK_DESCRIPTOR_COMPACT : ZINK_DESCRIPTOR_ALL_TYPES;
+   if (num_bindings || shader->bindless) {
+      dsl[shader->nir->info.stage == MESA_SHADER_FRAGMENT] = shader->precompile.dsl;
+      if (shader->bindless)
+         dsl[screen->desc_set_id[ZINK_DESCRIPTOR_BINDLESS]] = screen->bindless_layout;
+   }
+
+   shader->precompile.layout = zink_pipeline_layout_create(screen, dsl, num_dsl, false, VK_PIPELINE_LAYOUT_CREATE_INDEPENDENT_SETS_BIT_EXT);
+}
+
+void
+zink_descriptor_shader_deinit(struct zink_screen *screen, struct zink_shader *shader)
+{
+   if (shader->precompile.dsl)
+      VKSCR(DestroyDescriptorSetLayout)(screen->dev, shader->precompile.dsl, NULL);
+   if (shader->precompile.layout)
+      VKSCR(DestroyPipelineLayout)(screen->dev, shader->precompile.layout, NULL);
+}
+
 /* called during program destroy */
 void
 zink_descriptor_program_deinit(struct zink_screen *screen, struct zink_program *pg)
@@ -946,6 +1036,71 @@ populate_sets(struct zink_context *ctx, struct zink_batch_state *bs,
    return true;
 }
 
+static void
+update_separable(struct zink_context *ctx, struct zink_program *pg)
+{
+   struct zink_screen *screen = zink_screen(ctx->base.screen);
+   struct zink_batch_state *bs = ctx->batch.state;
+   unsigned use_buffer = 0;
+   /* find the least-written buffer to use for this */
+   for (unsigned i = 0; i < ARRAY_SIZE(bs->dd.db_offset); i++) {
+      if (bs->dd.db_offset[i] < bs->dd.db_offset[use_buffer])
+         use_buffer = i;
+   }
+   VkDescriptorGetInfoEXT info;
+   info.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_GET_INFO_EXT;
+   info.pNext = NULL;
+   struct zink_gfx_program *prog = (struct zink_gfx_program *)pg;
+   struct zink_shader *shaders[] = {
+      prog->shaders[MESA_SHADER_VERTEX]->precompile.num_bindings ? prog->shaders[MESA_SHADER_VERTEX] : prog->shaders[MESA_SHADER_FRAGMENT],
+      prog->shaders[MESA_SHADER_FRAGMENT],
+   };
+   for (unsigned j = 0; j < pg->num_dsl; j++) {
+      if (!(pg->dd.binding_usage & BITFIELD_BIT(j)))
+         continue;
+      uint64_t offset = bs->dd.db_offset[use_buffer];
+      assert(bs->dd.db[use_buffer]->obj->size > bs->dd.db_offset[use_buffer] + pg->dd.db_size[j]);
+      for (unsigned i = 0; i < shaders[j]->precompile.num_bindings; i++) {
+         info.type = shaders[j]->precompile.bindings[i].descriptorType;
+         uint64_t desc_offset = offset + pg->dd.db_offset[j][i];
+         if (screen->info.db_props.combinedImageSamplerDescriptorSingleArray ||
+             shaders[j]->precompile.bindings[i].descriptorType != VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER ||
+             shaders[j]->precompile.bindings[i].descriptorCount == 1) {
+            for (unsigned k = 0; k < shaders[j]->precompile.bindings[i].descriptorCount; k++) {
+               /* VkDescriptorDataEXT is a union of pointers; the member doesn't matter */
+               info.data.pSampler = (void*)(((uint8_t*)ctx) + pg->dd.db_template[j][i].offset + k * pg->dd.db_template[j][i].stride);
+               VKSCR(GetDescriptorEXT)(screen->dev, &info, pg->dd.db_template[j][i].db_size, bs->dd.db_map[use_buffer] + desc_offset + k * pg->dd.db_template[j][i].db_size);
+            }
+         } else {
+            assert(shaders[j]->precompile.bindings[i].descriptorType == VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER);
+            char buf[1024];
+            uint8_t *db = bs->dd.db_map[use_buffer] + desc_offset;
+            uint8_t *samplers = db + shaders[j]->precompile.bindings[i].descriptorCount * screen->info.db_props.sampledImageDescriptorSize;
+            for (unsigned k = 0; k < shaders[j]->precompile.bindings[i].descriptorCount; k++) {
+               /* VkDescriptorDataEXT is a union of pointers; the member doesn't matter */
+               info.data.pSampler = (void*)(((uint8_t*)ctx) + pg->dd.db_template[ZINK_DESCRIPTOR_TYPE_SAMPLER_VIEW][i].offset +
+                                            k * pg->dd.db_template[ZINK_DESCRIPTOR_TYPE_SAMPLER_VIEW][i].stride);
+               VKSCR(GetDescriptorEXT)(screen->dev, &info, pg->dd.db_template[j][ZINK_DESCRIPTOR_TYPE_SAMPLER_VIEW].db_size, buf);
+               /* drivers that don't support combinedImageSamplerDescriptorSingleArray must have sampler arrays written in memory as
+                *
+                *   | array_of_samplers[] | array_of_sampled_images[] |
+                *
+                * which means each descriptor's data must be split
+                */
+               memcpy(db, buf, screen->info.db_props.samplerDescriptorSize);
+               memcpy(samplers, &buf[screen->info.db_props.samplerDescriptorSize], screen->info.db_props.sampledImageDescriptorSize);
+               db += screen->info.db_props.sampledImageDescriptorSize;
+               samplers += screen->info.db_props.samplerDescriptorSize;
+            }
+         }
+      }
+      bs->dd.cur_db_offset[use_buffer] = bs->dd.db_offset[use_buffer];
+      bs->dd.db_offset[use_buffer] += pg->dd.db_size[j];
+      VKCTX(CmdSetDescriptorBufferOffsetsEXT)(bs->cmdbuf, VK_PIPELINE_BIND_POINT_GRAPHICS, pg->layout, j, 1, &use_buffer, &offset);
+   }
+}
+
 /* updates the mask of changed_sets and binds the mask of bind_sets */
 static void
 zink_descriptors_update_masked_buffer(struct zink_context *ctx, bool is_compute, uint8_t changed_sets, uint8_t bind_sets)
@@ -1092,6 +1247,17 @@ zink_descriptors_update(struct zink_context *ctx, bool is_compute)
       ctx->dd.push_state_changed[is_compute] = !!pg->dd.push_usage || ctx->dd.has_fbfetch != bs->dd.has_fbfetch;
    }
 
+   if (!is_compute) {
+      struct zink_gfx_program *prog = (struct zink_gfx_program*)pg;
+      if (prog->is_separable) {
+         /* force all descriptors update on next pass: separables use different layouts */
+         ctx->dd.state_changed[is_compute] = BITFIELD_MASK(ZINK_DESCRIPTOR_TYPE_UNIFORMS);
+         ctx->dd.push_state_changed[is_compute] = true;
+         update_separable(ctx, pg);
+         return;
+      }
+   }
+
    if (pg != bs->dd.pg[is_compute]) {
       /* if we don't already know that we have to update all sets,
        * check to see if any dsls changed


@@ -154,8 +154,12 @@ zink_descriptors_deinit_bindless(struct zink_context *ctx);
 void
 zink_descriptors_update_bindless(struct zink_context *ctx);
+void
+zink_descriptor_shader_get_binding_offsets(const struct zink_shader *shader, unsigned *offsets);
+void
+zink_descriptor_shader_init(struct zink_screen *screen, struct zink_shader *shader);
+void
+zink_descriptor_shader_deinit(struct zink_screen *screen, struct zink_shader *shader);
 bool
 zink_descriptor_program_init(struct zink_context *ctx, struct zink_program *pg);


@@ -751,7 +751,10 @@ create_gfx_pipeline_library(struct zink_screen *screen, VkShaderModule *modules,
    pci.pStages = shader_stages;
    pci.stageCount = num_stages;
-   /* only add LTO for full pipeline libs */
+   /* Only keep LTO information for full pipeline libs. For separable shaders, they will only
+    * ever be used with fast linking, and to optimize them a new pipeline lib will be created with full
+    * link time information for the full set of shader stages (rather than linking in these single-stage libs).
+    */
    if (num_stages > 1)
       pci.flags |= VK_PIPELINE_CREATE_RETAIN_LINK_TIME_OPTIMIZATION_INFO_BIT_EXT;
@@ -770,6 +773,12 @@ zink_create_gfx_pipeline_library(struct zink_screen *screen, struct zink_gfx_pro
    return create_gfx_pipeline_library(screen, prog->modules, prog->base.layout, prog->base.pipeline_cache);
 }
 
+VkPipeline
+zink_create_gfx_pipeline_separate(struct zink_screen *screen, VkShaderModule *modules, VkPipelineLayout layout)
+{
+   return create_gfx_pipeline_library(screen, modules, layout, VK_NULL_HANDLE);
+}
+
 VkPipeline
 zink_create_gfx_pipeline_combined(struct zink_screen *screen, struct zink_gfx_program *prog, VkPipeline input, VkPipeline *library, unsigned libcount, VkPipeline output, bool optimized)
 {


@@ -62,6 +62,8 @@ VkPipeline
 zink_create_gfx_pipeline_output(struct zink_screen *screen, struct zink_gfx_pipeline_state *state);
 VkPipeline
 zink_create_gfx_pipeline_combined(struct zink_screen *screen, struct zink_gfx_program *prog, VkPipeline input, VkPipeline *library, unsigned libcount, VkPipeline output, bool optimized);
+VkPipeline
+zink_create_gfx_pipeline_separate(struct zink_screen *screen, VkShaderModule *modules, VkPipelineLayout layout);
 #ifdef __cplusplus
 }
 #endif


@@ -44,6 +44,11 @@
 #define XXH_INLINE_ALL
 #include "util/xxhash.h"
 
+static void
+precompile_job(void *data, void *gdata, int thread_index);
+struct zink_gfx_program *
+create_gfx_program_separable(struct zink_context *ctx, struct zink_shader **stages, unsigned vertices_per_patch);
+
 void
 debug_describe_zink_gfx_program(char *buf, const struct zink_gfx_program *ptr)
 {
@@ -645,6 +650,7 @@ update_gfx_program_optimal(struct zink_context *ctx, struct zink_gfx_program *pr
 {
    const union zink_shader_key_optimal *optimal_key = (union zink_shader_key_optimal*)&prog->last_variant_hash;
    if (ctx->gfx_pipeline_state.shader_keys_optimal.key.vs_bits != optimal_key->vs_bits) {
+      assert(!prog->is_separable);
       bool changed = update_gfx_shader_module_optimal(ctx, prog, ctx->last_vertex_stage->nir->info.stage);
       ctx->gfx_pipeline_state.modules_changed |= changed;
    }
@@ -652,6 +658,7 @@ update_gfx_program_optimal(struct zink_context *ctx, struct zink_gfx_program *pr
    if (ctx->gfx_pipeline_state.shader_keys_optimal.key.fs_bits != optimal_key->fs_bits ||
        /* always recheck shadow swizzles since they aren't directly part of the key */
        unlikely(shadow_needs_shader_swizzle)) {
+      assert(!prog->is_separable);
       bool changed = update_gfx_shader_module_optimal(ctx, prog, MESA_SHADER_FRAGMENT);
       ctx->gfx_pipeline_state.modules_changed |= changed;
       if (unlikely(shadow_needs_shader_swizzle)) {
@@ -661,6 +668,7 @@ update_gfx_program_optimal(struct zink_context *ctx, struct zink_gfx_program *pr
    }
    if (prog->shaders[MESA_SHADER_TESS_CTRL] && prog->shaders[MESA_SHADER_TESS_CTRL]->non_fs.is_generated &&
        ctx->gfx_pipeline_state.shader_keys_optimal.key.tcs_bits != optimal_key->tcs_bits) {
+      assert(!prog->is_separable);
       bool changed = update_gfx_shader_module_optimal(ctx, prog, MESA_SHADER_TESS_CTRL);
       ctx->gfx_pipeline_state.modules_changed |= changed;
    }
@@ -682,13 +690,28 @@ zink_gfx_program_update_optimal(struct zink_context *ctx)
       ctx->gfx_pipeline_state.final_hash ^= ctx->curr_program->last_variant_hash;
       if (entry) {
          prog = (struct zink_gfx_program*)entry->data;
+         if (prog->is_separable) {
+            /* shader variants can't be handled by separable programs: sync and compile */
+            if (!ZINK_SHADER_KEY_OPTIMAL_IS_DEFAULT(ctx->gfx_pipeline_state.optimal_key))
+               util_queue_fence_wait(&prog->base.cache_fence);
+            /* If the optimized linked pipeline is done compiling, swap it into place. */
+            if (util_queue_fence_is_signalled(&prog->base.cache_fence)) {
+               struct zink_gfx_program *real = prog->full_prog;
+               entry->data = real;
+               prog->full_prog = NULL;
+               prog->base.removed = true;
+               zink_gfx_program_reference(zink_screen(ctx->base.screen), &prog, NULL);
+               prog = real;
+            }
+         }
          update_gfx_program_optimal(ctx, prog);
       } else {
          ctx->dirty_gfx_stages |= ctx->shader_stages;
-         prog = zink_create_gfx_program(ctx, ctx->gfx_stages, ctx->gfx_pipeline_state.dyn_state2.vertices_per_patch);
+         prog = create_gfx_program_separable(ctx, ctx->gfx_stages, ctx->gfx_pipeline_state.dyn_state2.vertices_per_patch);
         zink_screen_get_pipeline_cache(zink_screen(ctx->base.screen), &prog->base, false);
         _mesa_hash_table_insert_pre_hashed(ht, hash, prog->shaders, prog);
-         generate_gfx_program_modules_optimal(ctx, zink_screen(ctx->base.screen), prog, &ctx->gfx_pipeline_state);
+         if (!prog->is_separable)
+            generate_gfx_program_modules_optimal(ctx, zink_screen(ctx->base.screen), prog, &ctx->gfx_pipeline_state);
       }
       simple_mtx_unlock(&ctx->program_lock[zink_program_cache_stages(ctx->shader_stages)]);
       if (prog && prog != ctx->curr_program)
@@ -699,6 +722,24 @@ zink_gfx_program_update_optimal(struct zink_context *ctx)
       /* remove old hash */
       ctx->gfx_pipeline_state.optimal_key = ctx->gfx_pipeline_state.shader_keys_optimal.key.val;
       ctx->gfx_pipeline_state.final_hash ^= ctx->curr_program->last_variant_hash;
+      if (ctx->curr_program->is_separable) {
+         struct zink_gfx_program *prog = ctx->curr_program;
+         if (prog->is_separable && !ZINK_SHADER_KEY_OPTIMAL_IS_DEFAULT(ctx->gfx_pipeline_state.optimal_key)) {
+            util_queue_fence_wait(&prog->base.cache_fence);
+            /* shader variants can't be handled by separable programs: sync and compile */
+            struct hash_table *ht = &ctx->program_cache[zink_program_cache_stages(ctx->shader_stages)];
+            const uint32_t hash = ctx->gfx_hash;
+            simple_mtx_lock(&ctx->program_lock[zink_program_cache_stages(ctx->shader_stages)]);
+            struct hash_entry *entry = _mesa_hash_table_search_pre_hashed(ht, hash, ctx->gfx_stages);
+            struct zink_gfx_program *real = prog->full_prog;
+            entry->data = real;
+            prog->full_prog = NULL;
+            prog->base.removed = true;
+            zink_gfx_program_reference(zink_screen(ctx->base.screen), &prog, NULL);
+            ctx->curr_program = real;
+            simple_mtx_unlock(&ctx->program_lock[zink_program_cache_stages(ctx->shader_stages)]);
+         }
+      }
       update_gfx_program_optimal(ctx, ctx->curr_program);
       /* apply new hash */
       ctx->gfx_pipeline_state.final_hash ^= ctx->curr_program->last_variant_hash;
@@ -969,6 +1010,112 @@ fail:
    return NULL;
 }
 
+/* Creates a replacement, optimized zink_gfx_program for this set of separate shaders, which will
+ * be swapped in in place of the fast-linked separable program once it's done compiling.
+ */
+static void
+create_linked_separable_job(void *data, void *gdata, int thread_index)
+{
+   struct zink_gfx_program *prog = data;
+   prog->full_prog = zink_create_gfx_program(prog->ctx, prog->shaders, 0);
+   precompile_job(prog->full_prog, gdata, thread_index);
+}
+
+struct zink_gfx_program *
+create_gfx_program_separable(struct zink_context *ctx, struct zink_shader **stages, unsigned vertices_per_patch)
+{
+   struct zink_screen *screen = zink_screen(ctx->base.screen);
+   unsigned shader_stages = BITFIELD_BIT(MESA_SHADER_VERTEX) | BITFIELD_BIT(MESA_SHADER_FRAGMENT);
+   /* filter cases that need real pipelines */
+   if (ctx->shader_stages != shader_stages ||
+       !stages[MESA_SHADER_VERTEX]->precompile.mod || !stages[MESA_SHADER_FRAGMENT]->precompile.mod ||
+       /* TODO: maybe try variants? grimace */
+       !ZINK_SHADER_KEY_OPTIMAL_IS_DEFAULT(ctx->gfx_pipeline_state.optimal_key) ||
+       !zink_can_use_pipeline_libs(ctx))
+      return zink_create_gfx_program(ctx, stages, vertices_per_patch);
+   /* ensure async gpl creation is done */
+   util_queue_fence_wait(&stages[MESA_SHADER_VERTEX]->precompile.fence);
+   util_queue_fence_wait(&stages[MESA_SHADER_FRAGMENT]->precompile.fence);
+
+   struct zink_gfx_program *prog = create_program(ctx, false);
+   if (!prog)
+      goto fail;
+
+   prog->ctx = ctx;
+   prog->is_separable = true;
+   prog->shaders[MESA_SHADER_VERTEX] = stages[MESA_SHADER_VERTEX];
+   prog->stages_remaining = prog->stages_present = shader_stages;
+   prog->shaders[MESA_SHADER_FRAGMENT] = stages[MESA_SHADER_FRAGMENT];
+   prog->last_vertex_stage = stages[MESA_SHADER_VERTEX];
+
+   _mesa_set_init(&prog->libs, prog, hash_pipeline_lib, equals_pipeline_lib);
+
+   unsigned refs = 0;
+   for (int i = 0; i < ZINK_GFX_SHADER_COUNT; ++i) {
+      if (prog->shaders[i]) {
+         simple_mtx_lock(&prog->shaders[i]->lock);
+         _mesa_set_add(prog->shaders[i]->programs, prog);
+         simple_mtx_unlock(&prog->shaders[i]->lock);
+         refs++;
+      }
+   }
+   /* We can do this add after the _mesa_set_adds above because we know the prog->shaders[] are
+    * referenced by the draw state and zink_shader_free() can't be called on them while we're in here.
+    */
+   p_atomic_add(&prog->base.reference.count, refs);
+
+   for (int r = 0; r < ARRAY_SIZE(prog->pipelines); ++r) {
+      for (int i = 0; i < ARRAY_SIZE(prog->pipelines[0]); ++i) {
+         _mesa_hash_table_init(&prog->pipelines[r][i], prog, NULL, zink_get_gfx_pipeline_eq_func(screen, prog));
+         /* only need first 3/4 for point/line/tri/patch */
+         if (screen->info.have_EXT_extended_dynamic_state &&
+             i == (prog->last_vertex_stage->nir->info.stage == MESA_SHADER_TESS_EVAL ? 4 : 3))
+            break;
+      }
+   }
+
+   if (prog->shaders[MESA_SHADER_VERTEX]->precompile.dsl) {
+      prog->base.dd.binding_usage |= BITFIELD_BIT(0);
+      prog->base.dd.db_template[prog->base.num_dsl] = prog->shaders[MESA_SHADER_VERTEX]->precompile.db_template;
+      prog->base.dd.db_size[prog->base.num_dsl] = prog->shaders[MESA_SHADER_VERTEX]->precompile.db_size;
+      prog->base.dd.db_offset[prog->base.num_dsl] = prog->shaders[MESA_SHADER_VERTEX]->precompile.db_offset;
+      prog->base.dsl[prog->base.num_dsl] = prog->shaders[MESA_SHADER_VERTEX]->precompile.dsl;
+      prog->base.num_dsl++;
+   }
+   if (prog->shaders[MESA_SHADER_FRAGMENT]->precompile.dsl) {
+      prog->base.dd.binding_usage |= BITFIELD_BIT(1);
+      prog->base.dd.db_template[prog->base.num_dsl] = prog->shaders[MESA_SHADER_FRAGMENT]->precompile.db_template;
+      prog->base.dd.db_size[prog->base.num_dsl] = prog->shaders[MESA_SHADER_FRAGMENT]->precompile.db_size;
+      prog->base.dd.db_offset[prog->base.num_dsl] = prog->shaders[MESA_SHADER_FRAGMENT]->precompile.db_offset;
+      prog->base.dsl[prog->base.num_dsl] = prog->shaders[MESA_SHADER_FRAGMENT]->precompile.dsl;
+      /* guarantee a null dsl if vs doesn't have descriptors */
+      prog->base.num_dsl = 2;
+   }
+   prog->base.dd.bindless = prog->shaders[MESA_SHADER_VERTEX]->bindless | prog->shaders[MESA_SHADER_FRAGMENT]->bindless;
+   if (prog->base.dd.bindless) {
+      prog->base.num_dsl = screen->compact_descriptors ? ZINK_DESCRIPTOR_ALL_TYPES - ZINK_DESCRIPTOR_COMPACT : ZINK_DESCRIPTOR_ALL_TYPES;
+      prog->base.dsl[screen->desc_set_id[ZINK_DESCRIPTOR_BINDLESS]] = screen->bindless_layout;
+   }
+   prog->base.layout = zink_pipeline_layout_create(screen, prog->base.dsl, prog->base.num_dsl, false, VK_PIPELINE_LAYOUT_CREATE_INDEPENDENT_SETS_BIT_EXT);
+
+   VkPipeline libs[] = {stages[MESA_SHADER_VERTEX]->precompile.gpl, stages[MESA_SHADER_FRAGMENT]->precompile.gpl};
+   prog->last_variant_hash = ctx->gfx_pipeline_state.optimal_key;
+
+   struct zink_gfx_library_key *gkey = rzalloc(prog, struct zink_gfx_library_key);
+   gkey->optimal_key = prog->last_variant_hash;
+   assert(gkey->optimal_key);
+   gkey->pipeline = zink_create_gfx_pipeline_combined(screen, prog, VK_NULL_HANDLE, libs, 2, VK_NULL_HANDLE, false);
+   _mesa_set_add(&prog->libs, gkey);
+
+   util_queue_add_job(&screen->cache_get_thread, prog, &prog->base.cache_fence, create_linked_separable_job, NULL, 0);
+
+   return prog;
+
+fail:
+   if (prog)
+      zink_destroy_gfx_program(screen, prog);
+   return NULL;
+}
+
 static uint32_t
 hash_compute_pipeline_state_local_size(const void *key)
 {
@@ -1203,6 +1350,8 @@ zink_destroy_gfx_program(struct zink_screen *screen,
          max_idx++;
    }
 
+   if (prog->is_separable)
+      zink_gfx_program_reference(screen, &prog->full_prog, NULL);
    for (unsigned r = 0; r < ARRAY_SIZE(prog->pipelines); r++) {
      for (int i = 0; i < max_idx; ++i) {
         hash_table_foreach(&prog->pipelines[r][i], entry) {
@@ -1223,11 +1372,13 @@ zink_destroy_gfx_program(struct zink_screen *screen,
          _mesa_set_remove_key(prog->shaders[i]->programs, prog);
          prog->shaders[i] = NULL;
       }
-      destroy_shader_cache(screen, &prog->shader_cache[i][0][0]);
-      destroy_shader_cache(screen, &prog->shader_cache[i][0][1]);
-      destroy_shader_cache(screen, &prog->shader_cache[i][1][0]);
-      destroy_shader_cache(screen, &prog->shader_cache[i][1][1]);
-      ralloc_free(prog->nir[i]);
+      if (!prog->is_separable) {
+         destroy_shader_cache(screen, &prog->shader_cache[i][0][0]);
+         destroy_shader_cache(screen, &prog->shader_cache[i][0][1]);
+         destroy_shader_cache(screen, &prog->shader_cache[i][1][0]);
+         destroy_shader_cache(screen, &prog->shader_cache[i][1][1]);
+         ralloc_free(prog->nir[i]);
+      }
    }
 
    set_foreach_remove(&prog->libs, he) {
@@ -1761,6 +1912,20 @@ precompile_job(void *data, void *gdata, int thread_index)
    zink_screen_update_pipeline_cache(screen, &prog->base, true);
 }
 
+static void
+precompile_separate_shader_job(void *data, void *gdata, int thread_index)
+{
+   struct zink_screen *screen = gdata;
+   struct zink_shader *zs = data;
+
+   nir_shader *nir;
+   zs->precompile.mod = zink_shader_compile_separate(screen, zs, &nir);
+   zink_descriptor_shader_init(screen, zs);
+   VkShaderModule mods[ZINK_GFX_SHADER_COUNT] = {0};
+   mods[nir->info.stage] = zs->precompile.mod;
+   zs->precompile.gpl = zink_create_gfx_pipeline_separate(screen, mods, zs->precompile.layout);
+}
+
 static void
 zink_link_gfx_shader(struct pipe_context *pctx, void **shaders)
 {
@@ -1769,8 +1934,17 @@ zink_link_gfx_shader(struct pipe_context *pctx, void **shaders)
    if (shaders[MESA_SHADER_COMPUTE])
       return;
    /* can't precompile fixedfunc */
-   if (!shaders[MESA_SHADER_VERTEX] || !shaders[MESA_SHADER_FRAGMENT])
+   if (!shaders[MESA_SHADER_VERTEX] || !shaders[MESA_SHADER_FRAGMENT]) {
+      if (shaders[MESA_SHADER_VERTEX] || shaders[MESA_SHADER_FRAGMENT]) {
+         struct zink_shader *zs = shaders[MESA_SHADER_VERTEX] ? shaders[MESA_SHADER_VERTEX] : shaders[MESA_SHADER_FRAGMENT];
+         if (zs->nir->info.separate_shader && !zs->precompile.mod && util_queue_fence_is_signalled(&zs->precompile.fence) &&
+             zink_descriptor_mode == ZINK_DESCRIPTOR_MODE_DB &&
+             /* sample shading can't precompile */
+             (!shaders[MESA_SHADER_FRAGMENT] || !zs->nir->info.fs.uses_sample_shading))
+            util_queue_add_job(&zink_screen(pctx->screen)->cache_get_thread, zs, &zs->precompile.fence, precompile_separate_shader_job, NULL, 0);
+      }
       return;
+   }
    unsigned hash = 0;
    unsigned shader_stages = 0;
    for (unsigned i = 0; i < ZINK_GFX_SHADER_COUNT; i++) {


@@ -190,10 +190,12 @@ zink_get_gfx_pipeline(struct zink_context *ctx,
       /* this is the graphics pipeline library path: find/construct all partial pipelines */
       struct set_entry *he = _mesa_set_search(&prog->libs, &ctx->gfx_pipeline_state.optimal_key);
       struct zink_gfx_library_key *gkey;
-      if (he)
+      if (he) {
          gkey = (struct zink_gfx_library_key *)he->key;
-      else
+      } else {
+         assert(!prog->is_separable);
         gkey = zink_create_pipeline_lib(screen, prog, &ctx->gfx_pipeline_state);
+      }
       struct zink_gfx_input_key *ikey = DYNAMIC_STATE == ZINK_DYNAMIC_VERTEX_INPUT ?
                                         zink_find_or_create_input_dynamic(ctx, vkmode) :
                                         zink_find_or_create_input(ctx, vkmode);
@@ -215,7 +217,7 @@ zink_get_gfx_pipeline(struct zink_context *ctx,
       zink_screen_update_pipeline_cache(screen, &prog->base, false);
       pc_entry->pipeline = pipeline;
-      if (HAVE_LIB)
+      if (HAVE_LIB && !prog->is_separable)
         /* trigger async optimized pipeline compile if this was the fast-linked unoptimized pipeline */
         zink_gfx_program_compile_queue(ctx, pc_entry);
    }


@@ -188,6 +188,8 @@ zink_is_parallel_shader_compilation_finished(struct pipe_screen *screen, void *s
    }
 
    struct zink_shader *zs = shader;
+   if (!util_queue_fence_is_signalled(&zs->precompile.fence))
+      return false;
    bool finished = true;
    set_foreach(zs->programs, entry) {
       struct zink_gfx_program *prog = (void*)entry->key;


@@ -732,6 +732,19 @@ struct zink_shader {
    bool has_uniforms;
    struct spirv_shader *spirv;
 
+   struct {
+      struct util_queue_fence fence;
+      VkShaderModule mod;
+      VkDescriptorSetLayout dsl;
+      VkPipelineLayout layout;
+      VkPipeline gpl;
+      VkDescriptorSetLayoutBinding *bindings;
+      unsigned num_bindings;
+      struct zink_descriptor_template *db_template;
+      unsigned db_size;
+      unsigned *db_offset;
+   } precompile;
+
    simple_mtx_t lock;
    struct set *programs;
@@ -973,26 +986,30 @@ struct zink_gfx_pipeline_cache_entry {
 struct zink_gfx_program {
    struct zink_program base;
 
+   bool is_separable; //not a full program
+
    struct zink_context *ctx; //the owner context
    uint32_t stages_present; //mask of stages present in this program
    uint32_t stages_remaining; //mask of zink_shader remaining in this program
-   struct nir_shader *nir[ZINK_GFX_SHADER_COUNT];
-   VkShaderModule modules[ZINK_GFX_SHADER_COUNT]; // compute stage doesn't belong here
-   uint32_t module_hash[ZINK_GFX_SHADER_COUNT];
-   struct zink_shader *last_vertex_stage;
-   struct util_dynarray shader_cache[ZINK_GFX_SHADER_COUNT][2][2]; //normal, nonseamless cubes, inline uniforms
-   unsigned inlined_variant_count[ZINK_GFX_SHADER_COUNT];
 
    struct zink_shader *shaders[ZINK_GFX_SHADER_COUNT];
-   struct hash_table pipelines[2][11]; // [dynamic, renderpass][number of draw modes we support]
+   struct zink_shader *last_vertex_stage;
+
+   /* full */
+   VkShaderModule modules[ZINK_GFX_SHADER_COUNT]; // compute stage doesn't belong here
+   uint32_t module_hash[ZINK_GFX_SHADER_COUNT];
+   struct nir_shader *nir[ZINK_GFX_SHADER_COUNT];
+   struct util_dynarray shader_cache[ZINK_GFX_SHADER_COUNT][2][2]; //normal, nonseamless cubes, inline uniforms
+   unsigned inlined_variant_count[ZINK_GFX_SHADER_COUNT];
    uint32_t default_variant_hash;
-   uint32_t last_variant_hash;
    uint8_t inline_variants; //which stages are using inlined uniforms
+
+   /* separable */
+   struct zink_gfx_program *full_prog;
+
+   struct hash_table pipelines[2][11]; // [dynamic, renderpass][number of draw modes we support]
+   uint32_t last_variant_hash;
 
    uint32_t last_finalized_hash[2][4]; //[dynamic, renderpass][primtype idx]
    VkPipeline last_pipeline[2][4]; //[dynamic, renderpass][primtype idx]