From cc02c5deb42f1f89b227ac3e4816a32a573b6bcf Mon Sep 17 00:00:00 2001 From: Mary Guillemard Date: Mon, 9 Dec 2024 10:15:43 +0100 Subject: [PATCH] panvk: Implement precomp dispatch Implement dispatching of precompilled OpenCL C shaders in panvk. Signed-off-by: Mary Guillemard Acked-by: Alyssa Rosenzweig Reviewed-by: Boris Brezillon Part-of: --- src/panfrost/libpan/libpan_dgc.h | 32 +++ .../vulkan/csf/panvk_vX_cmd_precomp.c | 192 ++++++++++++++++++ src/panfrost/vulkan/jm/panvk_vX_cmd_precomp.c | 81 ++++++++ src/panfrost/vulkan/meson.build | 3 + src/panfrost/vulkan/panvk_cmd_precomp.h | 40 ++++ src/panfrost/vulkan/panvk_device.h | 4 + src/panfrost/vulkan/panvk_precomp_cache.h | 43 ++++ src/panfrost/vulkan/panvk_vX_device.c | 28 ++- src/panfrost/vulkan/panvk_vX_precomp_cache.c | 101 +++++++++ 9 files changed, 523 insertions(+), 1 deletion(-) create mode 100644 src/panfrost/libpan/libpan_dgc.h create mode 100644 src/panfrost/vulkan/csf/panvk_vX_cmd_precomp.c create mode 100644 src/panfrost/vulkan/jm/panvk_vX_cmd_precomp.c create mode 100644 src/panfrost/vulkan/panvk_cmd_precomp.h create mode 100644 src/panfrost/vulkan/panvk_precomp_cache.h create mode 100644 src/panfrost/vulkan/panvk_vX_precomp_cache.c diff --git a/src/panfrost/libpan/libpan_dgc.h b/src/panfrost/libpan/libpan_dgc.h new file mode 100644 index 00000000000..520271da2e8 --- /dev/null +++ b/src/panfrost/libpan/libpan_dgc.h @@ -0,0 +1,32 @@ +/* + * Copyright 2024 Collabora Ltd. + * SPDX-License-Identifier: MIT + */ + +#ifndef LIBPAN_DGC_H +#define LIBPAN_DGC_H +#include "libpan.h" + +enum panlib_barrier { + PANLIB_BARRIER_NONE = 0, + PANLIB_BARRIER_JM_BARRIER = (1 << 0), + PANLIB_BARRIER_JM_SUPPRESS_PREFETCH = (1 << 1), +}; + +struct panlib_precomp_grid { + uint32_t count[3]; +}; + +static struct panlib_precomp_grid +panlib_3d(uint32_t x, uint32_t y, uint32_t z) +{ + return (struct panlib_precomp_grid){.count = {x, y, z}}; +} + +static struct panlib_precomp_grid +panlib_1d(uint32_t x) +{ + return panlib_3d(x, 1, 1); +} + +#endif \ No newline at end of file diff --git a/src/panfrost/vulkan/csf/panvk_vX_cmd_precomp.c b/src/panfrost/vulkan/csf/panvk_vX_cmd_precomp.c new file mode 100644 index 00000000000..a776854a961 --- /dev/null +++ b/src/panfrost/vulkan/csf/panvk_vX_cmd_precomp.c @@ -0,0 +1,192 @@ +/* + * Copyright © 2024 Collabora Ltd. + * + * SPDX-License-Identifier: MIT + */ + +#include "bifrost_compile.h" +#include "pan_desc.h" +#include "pan_encoder.h" +#include "panvk_cmd_alloc.h" +#include "panvk_cmd_buffer.h" +#include "panvk_cmd_precomp.h" +#include "panvk_macros.h" +#include "panvk_mempool.h" +#include "panvk_precomp_cache.h" +#include "panvk_queue.h" + +void +panvk_per_arch(dispatch_precomp)(struct panvk_precomp_ctx *ctx, + struct panlib_precomp_grid grid, + enum panlib_barrier barrier, + enum libpan_shaders_program idx, void *data, + size_t data_size) +{ + assert(barrier == PANLIB_BARRIER_NONE && "Unsupported barrier flags"); + + struct panvk_cmd_buffer *cmdbuf = ctx->cmdbuf; + struct panvk_device *dev = to_panvk_device(cmdbuf->vk.base.device); + struct panvk_physical_device *phys_dev = + to_panvk_physical_device(dev->vk.physical); + const struct panvk_shader *shader = + panvk_per_arch(precomp_cache_get)(dev->precomp_cache, idx); + assert(shader); + + struct panfrost_ptr push_uniforms = panvk_cmd_alloc_dev_mem( + cmdbuf, desc, BIFROST_PRECOMPILED_KERNEL_SYSVALS_SIZE + data_size, 16); + + assert(push_uniforms.gpu); + + struct bifrost_precompiled_kernel_sysvals sysvals; + sysvals.num_workgroups.x = grid.count[0]; + sysvals.num_workgroups.y = grid.count[1]; + sysvals.num_workgroups.z = grid.count[2]; + + bifrost_precompiled_kernel_prepare_push_uniforms(push_uniforms.cpu, data, + data_size, &sysvals); + + struct pan_tls_info tlsinfo = {.tls = {.size = shader->info.tls_size}, + .wls = {.size = shader->info.wls_size}}; + + if (tlsinfo.tls.size) { + unsigned thread_tls_alloc = + panfrost_query_thread_tls_alloc(&phys_dev->kmod.props); + unsigned core_id_range; + panfrost_query_core_count(&phys_dev->kmod.props, &core_id_range); + + unsigned size = panfrost_get_total_stack_size( + tlsinfo.tls.size, thread_tls_alloc, core_id_range); + tlsinfo.tls.ptr = panvk_cmd_alloc_dev_mem(cmdbuf, tls, size, 4096).gpu; + assert(tlsinfo.tls.ptr); + } + + if (tlsinfo.wls.size) { + unsigned core_id_range; + panfrost_query_core_count(&phys_dev->kmod.props, &core_id_range); + + struct pan_compute_dim wg_count = {.x = grid.count[0], + .y = grid.count[1], + .z = grid.count[2]}; + tlsinfo.wls.instances = pan_wls_instances(&wg_count); + + unsigned wls_total_size = pan_wls_adjust_size(tlsinfo.wls.size) * + tlsinfo.wls.instances * core_id_range; + + tlsinfo.wls.ptr = + panvk_cmd_alloc_dev_mem(cmdbuf, tls, wls_total_size, 4096).gpu; + + assert(tlsinfo.wls.ptr); + } + + struct pan_compute_dim dim = {.x = grid.count[0], + .y = grid.count[1], + .z = grid.count[2]}; + + uint64_t tsd = + panvk_per_arch(cmd_dispatch_prepare_tls)(cmdbuf, shader, &dim, false); + assert(tsd); + + struct cs_builder *b = panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_COMPUTE); + const struct cs_tracing_ctx *tracing_ctx = + &cmdbuf->state.cs[PANVK_SUBQUEUE_COMPUTE].tracing; + + /* Copy the global TLS pointer to the per-job TSD. */ + if (shader->info.tls_size) { + cs_move64_to(b, cs_scratch_reg64(b, 0), cmdbuf->state.tls.desc.gpu); + cs_load64_to(b, cs_scratch_reg64(b, 2), cs_scratch_reg64(b, 0), 8); + cs_wait_slot(b, SB_ID(LS), false); + cs_move64_to(b, cs_scratch_reg64(b, 0), tsd); + cs_store64(b, cs_scratch_reg64(b, 2), cs_scratch_reg64(b, 0), 8); + cs_wait_slot(b, SB_ID(LS), false); + } + + cs_update_compute_ctx(b) { + /* No resource table */ + cs_move64_to(b, cs_sr_reg64(b, 0), 0); + + uint64_t fau_count = + DIV_ROUND_UP(BIFROST_PRECOMPILED_KERNEL_SYSVALS_SIZE + data_size, 8); + uint64_t fau_ptr = push_uniforms.gpu | (fau_count << 56); + cs_move64_to(b, cs_sr_reg64(b, 8), fau_ptr); + + cs_move64_to(b, cs_sr_reg64(b, 16), panvk_priv_mem_dev_addr(shader->spd)); + + cs_move64_to(b, cs_sr_reg64(b, 24), tsd); + + /* Global attribute offset */ + cs_move32_to(b, cs_sr_reg32(b, 32), 0); + + struct mali_compute_size_workgroup_packed wg_size; + pan_pack(&wg_size, COMPUTE_SIZE_WORKGROUP, cfg) { + cfg.workgroup_size_x = shader->local_size.x; + cfg.workgroup_size_y = shader->local_size.y; + cfg.workgroup_size_z = shader->local_size.z; + cfg.allow_merging_workgroups = false; + } + cs_move32_to(b, cs_sr_reg32(b, 33), wg_size.opaque[0]); + + /* Job offset */ + cs_move32_to(b, cs_sr_reg32(b, 34), 0); + cs_move32_to(b, cs_sr_reg32(b, 35), 0); + cs_move32_to(b, cs_sr_reg32(b, 36), 0); + + /* Job size */ + cs_move32_to(b, cs_sr_reg32(b, 37), grid.count[0]); + cs_move32_to(b, cs_sr_reg32(b, 38), grid.count[1]); + cs_move32_to(b, cs_sr_reg32(b, 39), grid.count[2]); + } + + panvk_per_arch(cs_pick_iter_sb)(cmdbuf, PANVK_SUBQUEUE_COMPUTE); + + cs_req_res(b, CS_COMPUTE_RES); + unsigned task_axis = MALI_TASK_AXIS_X; + unsigned task_increment = 0; + panvk_per_arch(calculate_task_axis_and_increment)( + shader, phys_dev, &task_axis, &task_increment); + cs_trace_run_compute(b, tracing_ctx, cs_scratch_reg_tuple(b, 0, 4), + task_increment, task_axis, false, + cs_shader_res_sel(0, 0, 0, 0)); + cs_req_res(b, 0); + + struct cs_index sync_addr = cs_scratch_reg64(b, 0); + struct cs_index iter_sb = cs_scratch_reg32(b, 2); + struct cs_index cmp_scratch = cs_scratch_reg32(b, 3); + struct cs_index add_val = cs_scratch_reg64(b, 4); + + cs_load_to(b, cs_scratch_reg_tuple(b, 0, 3), cs_subqueue_ctx_reg(b), + BITFIELD_MASK(3), + offsetof(struct panvk_cs_subqueue_context, syncobjs)); + cs_wait_slot(b, SB_ID(LS), false); + + cs_add64(b, sync_addr, sync_addr, + PANVK_SUBQUEUE_COMPUTE * sizeof(struct panvk_cs_sync64)); + cs_move64_to(b, add_val, 1); + + cs_match(b, iter_sb, cmp_scratch) { +#define CASE(x) \ + cs_case(b, x) { \ + cs_sync64_add(b, true, MALI_CS_SYNC_SCOPE_CSG, add_val, sync_addr, \ + cs_defer(SB_WAIT_ITER(x), SB_ID(DEFERRED_SYNC))); \ + cs_move32_to(b, iter_sb, next_iter_sb(x)); \ + } + + CASE(0) + CASE(1) + CASE(2) + CASE(3) + CASE(4) +#undef CASE + } + + cs_store32(b, iter_sb, cs_subqueue_ctx_reg(b), + offsetof(struct panvk_cs_subqueue_context, iter_sb)); + cs_wait_slot(b, SB_ID(LS), false); + + ++cmdbuf->state.cs[PANVK_SUBQUEUE_COMPUTE].relative_sync_point; + + /* XXX: clobbers the registers instead to avoid recreating them when calling + * a dispatch after? */ + compute_state_set_dirty(cmdbuf, CS); + compute_state_set_dirty(cmdbuf, DESC_STATE); + compute_state_set_dirty(cmdbuf, PUSH_UNIFORMS); +} diff --git a/src/panfrost/vulkan/jm/panvk_vX_cmd_precomp.c b/src/panfrost/vulkan/jm/panvk_vX_cmd_precomp.c new file mode 100644 index 00000000000..012be7133ff --- /dev/null +++ b/src/panfrost/vulkan/jm/panvk_vX_cmd_precomp.c @@ -0,0 +1,81 @@ +/* + * Copyright © 2024 Collabora Ltd. + * + * SPDX-License-Identifier: MIT + */ + +#include "bifrost_compile.h" +#include "pan_desc.h" +#include "pan_encoder.h" +#include "panvk_cmd_alloc.h" +#include "panvk_cmd_buffer.h" +#include "panvk_cmd_precomp.h" +#include "panvk_macros.h" +#include "panvk_mempool.h" +#include "panvk_precomp_cache.h" + +void +panvk_per_arch(dispatch_precomp)(struct panvk_precomp_ctx *ctx, + struct panlib_precomp_grid grid, + enum panlib_barrier barrier, + enum libpan_shaders_program idx, void *data, + size_t data_size) +{ + struct panvk_cmd_buffer *cmdbuf = ctx->cmdbuf; + struct panvk_batch *batch = cmdbuf->cur_batch; + struct panvk_device *dev = to_panvk_device(cmdbuf->vk.base.device); + const struct panvk_shader *shader = + panvk_per_arch(precomp_cache_get)(dev->precomp_cache, idx); + + assert(shader); + assert(batch && "Need current batch to be present!"); + + struct panfrost_ptr push_uniforms = panvk_cmd_alloc_dev_mem( + cmdbuf, desc, BIFROST_PRECOMPILED_KERNEL_SYSVALS_SIZE + data_size, 16); + + assert(push_uniforms.gpu); + + struct bifrost_precompiled_kernel_sysvals sysvals; + sysvals.num_workgroups.x = grid.count[0]; + sysvals.num_workgroups.y = grid.count[1]; + sysvals.num_workgroups.z = grid.count[2]; + + bifrost_precompiled_kernel_prepare_push_uniforms(push_uniforms.cpu, data, + data_size, &sysvals); + + struct panfrost_ptr job = panvk_cmd_alloc_desc(cmdbuf, COMPUTE_JOB); + assert(job.gpu); + + panfrost_pack_work_groups_compute( + pan_section_ptr(job.cpu, COMPUTE_JOB, INVOCATION), grid.count[0], + grid.count[1], grid.count[2], shader->local_size.x, shader->local_size.y, + shader->local_size.z, false, false); + + pan_section_pack(job.cpu, COMPUTE_JOB, PARAMETERS, cfg) { + cfg.job_task_split = util_logbase2_ceil(shader->local_size.x + 1) + + util_logbase2_ceil(shader->local_size.y + 1) + + util_logbase2_ceil(shader->local_size.z + 1); + } + + struct pan_compute_dim dim = {.x = grid.count[0], + .y = grid.count[1], + .z = grid.count[2]}; + uint64_t tld = + panvk_per_arch(cmd_dispatch_prepare_tls)(cmdbuf, shader, &dim, false); + assert(tld); + + pan_section_pack(job.cpu, COMPUTE_JOB, DRAW, cfg) { + cfg.state = panvk_priv_mem_dev_addr(shader->rsd), + cfg.push_uniforms = push_uniforms.gpu; + cfg.thread_storage = tld; + } + + util_dynarray_append(&batch->jobs, void *, job.cpu); + + bool job_barrier = (barrier & PANLIB_BARRIER_JM_BARRIER) != 0; + bool suppress_prefetch = + (barrier & PANLIB_BARRIER_JM_SUPPRESS_PREFETCH) != 0; + + pan_jc_add_job(&batch->vtc_jc, MALI_JOB_TYPE_COMPUTE, job_barrier, + suppress_prefetch, 0, 0, &job, false); +} diff --git a/src/panfrost/vulkan/meson.build b/src/panfrost/vulkan/meson.build index 209b6501a0c..d74a1e77b41 100644 --- a/src/panfrost/vulkan/meson.build +++ b/src/panfrost/vulkan/meson.build @@ -70,6 +70,7 @@ jm_files = [ 'jm/panvk_vX_cmd_draw.c', 'jm/panvk_vX_cmd_event.c', 'jm/panvk_vX_cmd_query.c', + 'jm/panvk_vX_cmd_precomp.c', 'jm/panvk_vX_event.c', 'jm/panvk_vX_queue.c', ] @@ -82,6 +83,7 @@ csf_files = [ 'csf/panvk_vX_cmd_draw.c', 'csf/panvk_vX_cmd_event.c', 'csf/panvk_vX_cmd_query.c', + 'csf/panvk_vX_cmd_precomp.c', 'csf/panvk_vX_device.c', 'csf/panvk_vX_event.c', 'csf/panvk_vX_exception_handler.c', @@ -103,6 +105,7 @@ common_per_arch_files = [ 'panvk_vX_descriptor_set.c', 'panvk_vX_descriptor_set_layout.c', 'panvk_vX_device.c', + 'panvk_vX_precomp_cache.c', 'panvk_vX_query_pool.c', 'panvk_vX_image_view.c', 'panvk_vX_nir_lower_descriptors.c', diff --git a/src/panfrost/vulkan/panvk_cmd_precomp.h b/src/panfrost/vulkan/panvk_cmd_precomp.h new file mode 100644 index 00000000000..f0512d9a65d --- /dev/null +++ b/src/panfrost/vulkan/panvk_cmd_precomp.h @@ -0,0 +1,40 @@ +/* + * Copyright © 2024 Collabora Ltd. + * SPDX-License-Identifier: MIT + */ + +#ifndef PANVK_CMD_PRECOMP_H +#define PANVK_CMD_PRECOMP_H + +#ifndef PAN_ARCH +#error "PAN_ARCH must be defined" +#endif + +#include "genxml/gen_macros.h" +#include "util/simple_mtx.h" +#include "libpan_dgc.h" +#include "libpan_shaders.h" +#include "panvk_macros.h" + +struct panvk_cmd_buffer; + +struct panvk_precomp_ctx { + struct panvk_cmd_buffer *cmdbuf; +}; + +static inline struct panvk_precomp_ctx +panvk_per_arch(precomp_cs)(struct panvk_cmd_buffer *cmdbuf) +{ + return (struct panvk_precomp_ctx){.cmdbuf = cmdbuf}; +} + +enum libpan_shaders_program; +void panvk_per_arch(dispatch_precomp)(struct panvk_precomp_ctx *ctx, + struct panlib_precomp_grid grid, + enum panlib_barrier barrier, + enum libpan_shaders_program idx, + void *data, size_t data_size); + +#define MESA_DISPATCH_PRECOMP panvk_per_arch(dispatch_precomp) + +#endif diff --git a/src/panfrost/vulkan/panvk_device.h b/src/panfrost/vulkan/panvk_device.h index ddca52b1638..30d56730922 100644 --- a/src/panfrost/vulkan/panvk_device.h +++ b/src/panfrost/vulkan/panvk_device.h @@ -27,6 +27,8 @@ #define PANVK_MAX_QUEUE_FAMILIES 1 +struct panvk_precomp_cache; + struct panvk_device { struct vk_device vk; @@ -63,6 +65,8 @@ struct panvk_device { struct panvk_queue *queues[PANVK_MAX_QUEUE_FAMILIES]; int queue_count[PANVK_MAX_QUEUE_FAMILIES]; + struct panvk_precomp_cache *precomp_cache; + struct { struct u_trace_context utctx; #ifdef HAVE_PERFETTO diff --git a/src/panfrost/vulkan/panvk_precomp_cache.h b/src/panfrost/vulkan/panvk_precomp_cache.h new file mode 100644 index 00000000000..e16885457fd --- /dev/null +++ b/src/panfrost/vulkan/panvk_precomp_cache.h @@ -0,0 +1,43 @@ +/* + * Copyright © 2024 Collabora Ltd. + * SPDX-License-Identifier: MIT + */ + +#ifndef PANVK_PRECOMP_CACHE_H +#define PANVK_PRECOMP_CACHE_H + +#include "panvk_device.h" +#ifndef PAN_ARCH +#error "PAN_ARCH must be defined" +#endif + +#include "genxml/gen_macros.h" +#include "util/simple_mtx.h" +#include "libpan_dgc.h" +#include "libpan_shaders.h" +#include "pan_shader.h" +#include "panvk_macros.h" +#include "panvk_mempool.h" +#include "panvk_shader.h" + +struct panvk_device; + +struct panvk_precomp_cache { + simple_mtx_t lock; + struct panvk_device *dev; + + /* Precompiled binary table */ + const uint32_t **programs; + + struct panvk_shader *precomp[LIBPAN_SHADERS_NUM_PROGRAMS]; +}; + +struct panvk_precomp_cache * + panvk_per_arch(precomp_cache_init)(struct panvk_device *dev); +void panvk_per_arch(precomp_cache_cleanup)(struct panvk_precomp_cache *cache); + +struct panvk_shader * + panvk_per_arch(precomp_cache_get)(struct panvk_precomp_cache *cache, + unsigned program); + +#endif diff --git a/src/panfrost/vulkan/panvk_vX_device.c b/src/panfrost/vulkan/panvk_vX_device.c index d747b25074a..cfa5f24b192 100644 --- a/src/panfrost/vulkan/panvk_vX_device.c +++ b/src/panfrost/vulkan/panvk_vX_device.c @@ -21,6 +21,7 @@ #include "panvk_instance.h" #include "panvk_macros.h" #include "panvk_physical_device.h" +#include "panvk_precomp_cache.h" #include "panvk_priv_bo.h" #include "panvk_queue.h" #include "panvk_utrace.h" @@ -154,6 +155,23 @@ panvk_meta_cleanup(struct panvk_device *device) vk_meta_device_finish(&device->vk, &device->meta); } +static VkResult +panvk_precomp_init(struct panvk_device *device) +{ + device->precomp_cache = panvk_per_arch(precomp_cache_init)(device); + + if (device->precomp_cache == NULL) + return VK_ERROR_OUT_OF_HOST_MEMORY; + + return VK_SUCCESS; +} + +static void +panvk_precomp_cleanup(struct panvk_device *device) +{ + panvk_per_arch(precomp_cache_cleanup)(device->precomp_cache); +} + /* Always reserve the lower 32MB. */ #define PANVK_VA_RESERVE_BOTTOM 0x2000000ull @@ -326,10 +344,15 @@ panvk_per_arch(create_device)(struct panvk_physical_device *physical_device, vk_device_set_drm_fd(&device->vk, device->kmod.dev->fd); - result = panvk_meta_init(device); + + result = panvk_precomp_init(device); if (result != VK_SUCCESS) goto err_free_priv_bos; + result = panvk_meta_init(device); + if (result != VK_SUCCESS) + goto err_free_precomp; + for (unsigned i = 0; i < pCreateInfo->queueCreateInfoCount; i++) { const VkDeviceQueueCreateInfo *queue_create = &pCreateInfo->pQueueCreateInfos[i]; @@ -378,6 +401,8 @@ err_finish_queues: panvk_meta_cleanup(device); +err_free_precomp: + panvk_precomp_cleanup(device); err_free_priv_bos: panvk_priv_bo_unref(device->tiler_oom.handlers_bo); panvk_priv_bo_unref(device->sample_positions); @@ -414,6 +439,7 @@ panvk_per_arch(destroy_device)(struct panvk_device *device, vk_free(&device->vk.alloc, device->queues[i]); } + panvk_precomp_cleanup(device); panvk_meta_cleanup(device); panvk_priv_bo_unref(device->tiler_oom.handlers_bo); panvk_priv_bo_unref(device->tiler_heap); diff --git a/src/panfrost/vulkan/panvk_vX_precomp_cache.c b/src/panfrost/vulkan/panvk_vX_precomp_cache.c new file mode 100644 index 00000000000..0e7a2d547d0 --- /dev/null +++ b/src/panfrost/vulkan/panvk_vX_precomp_cache.c @@ -0,0 +1,101 @@ +/* + * Copyright © 2024 Collabora Ltd. + * Copyright 2022 Alyssa Rosenzweig + * SPDX-License-Identifier: MIT + */ + +#include "genxml/gen_macros.h" +#include "util/macros.h" +#include "bifrost_compile.h" +#include "libpan_shaders.h" +#include "panvk_device.h" +#include "panvk_precomp_cache.h" +#include "panvk_shader.h" +#include "vk_alloc.h" +#include "vk_shader.h" + +struct panvk_precomp_cache * +panvk_per_arch(precomp_cache_init)(struct panvk_device *dev) +{ + struct panvk_precomp_cache *res = + vk_zalloc(&dev->vk.alloc, sizeof(struct panvk_precomp_cache), 8, + VK_SYSTEM_ALLOCATION_SCOPE_DEVICE); + + if (res == NULL) + return NULL; + + simple_mtx_init(&res->lock, mtx_plain); + res->dev = dev; + res->programs = GENX(libpan_shaders_default); + + return res; +} + +void +panvk_per_arch(precomp_cache_cleanup)(struct panvk_precomp_cache *cache) +{ + for (unsigned i = 0; i < ARRAY_SIZE(cache->precomp); i++) { + if (cache->precomp[i]) + vk_shader_destroy(&cache->dev->vk, &cache->precomp[i]->vk, + &cache->dev->vk.alloc); + } + + simple_mtx_destroy(&cache->lock); + vk_free(&cache->dev->vk.alloc, cache); +} + +static struct panvk_shader * +panvk_get_precompiled_locked(struct panvk_precomp_cache *cache, + unsigned program) +{ + simple_mtx_assert_locked(&cache->lock); + + /* It is possible that, while waiting for the lock, another thread uploaded + * the shader. Check for that so we don't double-upload. + */ + if (cache->precomp[program]) + return cache->precomp[program]; + + const uint32_t *bin = cache->programs[program]; + const struct bifrost_precompiled_kernel_info *info = (void *)bin; + const void *binary = (const uint8_t *)bin + sizeof(*info); + struct pan_compute_dim local_dim = { + .x = info->local_size_x, + .y = info->local_size_y, + .z = info->local_size_z, + }; + + struct panvk_shader *shader; + VkResult result = panvk_per_arch(create_shader_from_binary)( + cache->dev, &info->info, local_dim, binary, info->binary_size, &shader); + + if (result != VK_SUCCESS) + return NULL; + + /* We must only write to the cache once we are done compiling, since other + * threads may be reading the cache concurrently. Do this last. + */ + p_atomic_set(&cache->precomp[program], shader); + + return shader; +} + +struct panvk_shader * +panvk_per_arch(precomp_cache_get)(struct panvk_precomp_cache *cache, + unsigned program) +{ + /* Shaders are immutable once written, so if we atomically read a non-NULL + * shader, then we have a valid cached shader and are done. + */ + struct panvk_shader *ret = p_atomic_read(cache->precomp + program); + + if (ret != NULL) + return ret; + + /* Otherwise, take the lock and upload. */ + simple_mtx_lock(&cache->lock); + ret = panvk_get_precompiled_locked(cache, program); + simple_mtx_unlock(&cache->lock); + + return ret; +}