mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-02-03 04:20:26 +01:00
panvk: Implement precomp dispatch
Implement dispatching of precompilled OpenCL C shaders in panvk. Signed-off-by: Mary Guillemard <mary.guillemard@collabora.com> Acked-by: Alyssa Rosenzweig <alyssa@rosenzweig.io> Reviewed-by: Boris Brezillon <boris.brezillon@collabora.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/32720>
This commit is contained in:
parent
ab03752c4f
commit
cc02c5deb4
9 changed files with 523 additions and 1 deletions
32
src/panfrost/libpan/libpan_dgc.h
Normal file
32
src/panfrost/libpan/libpan_dgc.h
Normal file
|
|
@ -0,0 +1,32 @@
|
||||||
|
/*
|
||||||
|
* Copyright 2024 Collabora Ltd.
|
||||||
|
* SPDX-License-Identifier: MIT
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef LIBPAN_DGC_H
|
||||||
|
#define LIBPAN_DGC_H
|
||||||
|
#include "libpan.h"
|
||||||
|
|
||||||
|
enum panlib_barrier {
|
||||||
|
PANLIB_BARRIER_NONE = 0,
|
||||||
|
PANLIB_BARRIER_JM_BARRIER = (1 << 0),
|
||||||
|
PANLIB_BARRIER_JM_SUPPRESS_PREFETCH = (1 << 1),
|
||||||
|
};
|
||||||
|
|
||||||
|
struct panlib_precomp_grid {
|
||||||
|
uint32_t count[3];
|
||||||
|
};
|
||||||
|
|
||||||
|
static struct panlib_precomp_grid
|
||||||
|
panlib_3d(uint32_t x, uint32_t y, uint32_t z)
|
||||||
|
{
|
||||||
|
return (struct panlib_precomp_grid){.count = {x, y, z}};
|
||||||
|
}
|
||||||
|
|
||||||
|
static struct panlib_precomp_grid
|
||||||
|
panlib_1d(uint32_t x)
|
||||||
|
{
|
||||||
|
return panlib_3d(x, 1, 1);
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif
|
||||||
192
src/panfrost/vulkan/csf/panvk_vX_cmd_precomp.c
Normal file
192
src/panfrost/vulkan/csf/panvk_vX_cmd_precomp.c
Normal file
|
|
@ -0,0 +1,192 @@
|
||||||
|
/*
|
||||||
|
* Copyright © 2024 Collabora Ltd.
|
||||||
|
*
|
||||||
|
* SPDX-License-Identifier: MIT
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "bifrost_compile.h"
|
||||||
|
#include "pan_desc.h"
|
||||||
|
#include "pan_encoder.h"
|
||||||
|
#include "panvk_cmd_alloc.h"
|
||||||
|
#include "panvk_cmd_buffer.h"
|
||||||
|
#include "panvk_cmd_precomp.h"
|
||||||
|
#include "panvk_macros.h"
|
||||||
|
#include "panvk_mempool.h"
|
||||||
|
#include "panvk_precomp_cache.h"
|
||||||
|
#include "panvk_queue.h"
|
||||||
|
|
||||||
|
void
|
||||||
|
panvk_per_arch(dispatch_precomp)(struct panvk_precomp_ctx *ctx,
|
||||||
|
struct panlib_precomp_grid grid,
|
||||||
|
enum panlib_barrier barrier,
|
||||||
|
enum libpan_shaders_program idx, void *data,
|
||||||
|
size_t data_size)
|
||||||
|
{
|
||||||
|
assert(barrier == PANLIB_BARRIER_NONE && "Unsupported barrier flags");
|
||||||
|
|
||||||
|
struct panvk_cmd_buffer *cmdbuf = ctx->cmdbuf;
|
||||||
|
struct panvk_device *dev = to_panvk_device(cmdbuf->vk.base.device);
|
||||||
|
struct panvk_physical_device *phys_dev =
|
||||||
|
to_panvk_physical_device(dev->vk.physical);
|
||||||
|
const struct panvk_shader *shader =
|
||||||
|
panvk_per_arch(precomp_cache_get)(dev->precomp_cache, idx);
|
||||||
|
assert(shader);
|
||||||
|
|
||||||
|
struct panfrost_ptr push_uniforms = panvk_cmd_alloc_dev_mem(
|
||||||
|
cmdbuf, desc, BIFROST_PRECOMPILED_KERNEL_SYSVALS_SIZE + data_size, 16);
|
||||||
|
|
||||||
|
assert(push_uniforms.gpu);
|
||||||
|
|
||||||
|
struct bifrost_precompiled_kernel_sysvals sysvals;
|
||||||
|
sysvals.num_workgroups.x = grid.count[0];
|
||||||
|
sysvals.num_workgroups.y = grid.count[1];
|
||||||
|
sysvals.num_workgroups.z = grid.count[2];
|
||||||
|
|
||||||
|
bifrost_precompiled_kernel_prepare_push_uniforms(push_uniforms.cpu, data,
|
||||||
|
data_size, &sysvals);
|
||||||
|
|
||||||
|
struct pan_tls_info tlsinfo = {.tls = {.size = shader->info.tls_size},
|
||||||
|
.wls = {.size = shader->info.wls_size}};
|
||||||
|
|
||||||
|
if (tlsinfo.tls.size) {
|
||||||
|
unsigned thread_tls_alloc =
|
||||||
|
panfrost_query_thread_tls_alloc(&phys_dev->kmod.props);
|
||||||
|
unsigned core_id_range;
|
||||||
|
panfrost_query_core_count(&phys_dev->kmod.props, &core_id_range);
|
||||||
|
|
||||||
|
unsigned size = panfrost_get_total_stack_size(
|
||||||
|
tlsinfo.tls.size, thread_tls_alloc, core_id_range);
|
||||||
|
tlsinfo.tls.ptr = panvk_cmd_alloc_dev_mem(cmdbuf, tls, size, 4096).gpu;
|
||||||
|
assert(tlsinfo.tls.ptr);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (tlsinfo.wls.size) {
|
||||||
|
unsigned core_id_range;
|
||||||
|
panfrost_query_core_count(&phys_dev->kmod.props, &core_id_range);
|
||||||
|
|
||||||
|
struct pan_compute_dim wg_count = {.x = grid.count[0],
|
||||||
|
.y = grid.count[1],
|
||||||
|
.z = grid.count[2]};
|
||||||
|
tlsinfo.wls.instances = pan_wls_instances(&wg_count);
|
||||||
|
|
||||||
|
unsigned wls_total_size = pan_wls_adjust_size(tlsinfo.wls.size) *
|
||||||
|
tlsinfo.wls.instances * core_id_range;
|
||||||
|
|
||||||
|
tlsinfo.wls.ptr =
|
||||||
|
panvk_cmd_alloc_dev_mem(cmdbuf, tls, wls_total_size, 4096).gpu;
|
||||||
|
|
||||||
|
assert(tlsinfo.wls.ptr);
|
||||||
|
}
|
||||||
|
|
||||||
|
struct pan_compute_dim dim = {.x = grid.count[0],
|
||||||
|
.y = grid.count[1],
|
||||||
|
.z = grid.count[2]};
|
||||||
|
|
||||||
|
uint64_t tsd =
|
||||||
|
panvk_per_arch(cmd_dispatch_prepare_tls)(cmdbuf, shader, &dim, false);
|
||||||
|
assert(tsd);
|
||||||
|
|
||||||
|
struct cs_builder *b = panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_COMPUTE);
|
||||||
|
const struct cs_tracing_ctx *tracing_ctx =
|
||||||
|
&cmdbuf->state.cs[PANVK_SUBQUEUE_COMPUTE].tracing;
|
||||||
|
|
||||||
|
/* Copy the global TLS pointer to the per-job TSD. */
|
||||||
|
if (shader->info.tls_size) {
|
||||||
|
cs_move64_to(b, cs_scratch_reg64(b, 0), cmdbuf->state.tls.desc.gpu);
|
||||||
|
cs_load64_to(b, cs_scratch_reg64(b, 2), cs_scratch_reg64(b, 0), 8);
|
||||||
|
cs_wait_slot(b, SB_ID(LS), false);
|
||||||
|
cs_move64_to(b, cs_scratch_reg64(b, 0), tsd);
|
||||||
|
cs_store64(b, cs_scratch_reg64(b, 2), cs_scratch_reg64(b, 0), 8);
|
||||||
|
cs_wait_slot(b, SB_ID(LS), false);
|
||||||
|
}
|
||||||
|
|
||||||
|
cs_update_compute_ctx(b) {
|
||||||
|
/* No resource table */
|
||||||
|
cs_move64_to(b, cs_sr_reg64(b, 0), 0);
|
||||||
|
|
||||||
|
uint64_t fau_count =
|
||||||
|
DIV_ROUND_UP(BIFROST_PRECOMPILED_KERNEL_SYSVALS_SIZE + data_size, 8);
|
||||||
|
uint64_t fau_ptr = push_uniforms.gpu | (fau_count << 56);
|
||||||
|
cs_move64_to(b, cs_sr_reg64(b, 8), fau_ptr);
|
||||||
|
|
||||||
|
cs_move64_to(b, cs_sr_reg64(b, 16), panvk_priv_mem_dev_addr(shader->spd));
|
||||||
|
|
||||||
|
cs_move64_to(b, cs_sr_reg64(b, 24), tsd);
|
||||||
|
|
||||||
|
/* Global attribute offset */
|
||||||
|
cs_move32_to(b, cs_sr_reg32(b, 32), 0);
|
||||||
|
|
||||||
|
struct mali_compute_size_workgroup_packed wg_size;
|
||||||
|
pan_pack(&wg_size, COMPUTE_SIZE_WORKGROUP, cfg) {
|
||||||
|
cfg.workgroup_size_x = shader->local_size.x;
|
||||||
|
cfg.workgroup_size_y = shader->local_size.y;
|
||||||
|
cfg.workgroup_size_z = shader->local_size.z;
|
||||||
|
cfg.allow_merging_workgroups = false;
|
||||||
|
}
|
||||||
|
cs_move32_to(b, cs_sr_reg32(b, 33), wg_size.opaque[0]);
|
||||||
|
|
||||||
|
/* Job offset */
|
||||||
|
cs_move32_to(b, cs_sr_reg32(b, 34), 0);
|
||||||
|
cs_move32_to(b, cs_sr_reg32(b, 35), 0);
|
||||||
|
cs_move32_to(b, cs_sr_reg32(b, 36), 0);
|
||||||
|
|
||||||
|
/* Job size */
|
||||||
|
cs_move32_to(b, cs_sr_reg32(b, 37), grid.count[0]);
|
||||||
|
cs_move32_to(b, cs_sr_reg32(b, 38), grid.count[1]);
|
||||||
|
cs_move32_to(b, cs_sr_reg32(b, 39), grid.count[2]);
|
||||||
|
}
|
||||||
|
|
||||||
|
panvk_per_arch(cs_pick_iter_sb)(cmdbuf, PANVK_SUBQUEUE_COMPUTE);
|
||||||
|
|
||||||
|
cs_req_res(b, CS_COMPUTE_RES);
|
||||||
|
unsigned task_axis = MALI_TASK_AXIS_X;
|
||||||
|
unsigned task_increment = 0;
|
||||||
|
panvk_per_arch(calculate_task_axis_and_increment)(
|
||||||
|
shader, phys_dev, &task_axis, &task_increment);
|
||||||
|
cs_trace_run_compute(b, tracing_ctx, cs_scratch_reg_tuple(b, 0, 4),
|
||||||
|
task_increment, task_axis, false,
|
||||||
|
cs_shader_res_sel(0, 0, 0, 0));
|
||||||
|
cs_req_res(b, 0);
|
||||||
|
|
||||||
|
struct cs_index sync_addr = cs_scratch_reg64(b, 0);
|
||||||
|
struct cs_index iter_sb = cs_scratch_reg32(b, 2);
|
||||||
|
struct cs_index cmp_scratch = cs_scratch_reg32(b, 3);
|
||||||
|
struct cs_index add_val = cs_scratch_reg64(b, 4);
|
||||||
|
|
||||||
|
cs_load_to(b, cs_scratch_reg_tuple(b, 0, 3), cs_subqueue_ctx_reg(b),
|
||||||
|
BITFIELD_MASK(3),
|
||||||
|
offsetof(struct panvk_cs_subqueue_context, syncobjs));
|
||||||
|
cs_wait_slot(b, SB_ID(LS), false);
|
||||||
|
|
||||||
|
cs_add64(b, sync_addr, sync_addr,
|
||||||
|
PANVK_SUBQUEUE_COMPUTE * sizeof(struct panvk_cs_sync64));
|
||||||
|
cs_move64_to(b, add_val, 1);
|
||||||
|
|
||||||
|
cs_match(b, iter_sb, cmp_scratch) {
|
||||||
|
#define CASE(x) \
|
||||||
|
cs_case(b, x) { \
|
||||||
|
cs_sync64_add(b, true, MALI_CS_SYNC_SCOPE_CSG, add_val, sync_addr, \
|
||||||
|
cs_defer(SB_WAIT_ITER(x), SB_ID(DEFERRED_SYNC))); \
|
||||||
|
cs_move32_to(b, iter_sb, next_iter_sb(x)); \
|
||||||
|
}
|
||||||
|
|
||||||
|
CASE(0)
|
||||||
|
CASE(1)
|
||||||
|
CASE(2)
|
||||||
|
CASE(3)
|
||||||
|
CASE(4)
|
||||||
|
#undef CASE
|
||||||
|
}
|
||||||
|
|
||||||
|
cs_store32(b, iter_sb, cs_subqueue_ctx_reg(b),
|
||||||
|
offsetof(struct panvk_cs_subqueue_context, iter_sb));
|
||||||
|
cs_wait_slot(b, SB_ID(LS), false);
|
||||||
|
|
||||||
|
++cmdbuf->state.cs[PANVK_SUBQUEUE_COMPUTE].relative_sync_point;
|
||||||
|
|
||||||
|
/* XXX: clobbers the registers instead to avoid recreating them when calling
|
||||||
|
* a dispatch after? */
|
||||||
|
compute_state_set_dirty(cmdbuf, CS);
|
||||||
|
compute_state_set_dirty(cmdbuf, DESC_STATE);
|
||||||
|
compute_state_set_dirty(cmdbuf, PUSH_UNIFORMS);
|
||||||
|
}
|
||||||
81
src/panfrost/vulkan/jm/panvk_vX_cmd_precomp.c
Normal file
81
src/panfrost/vulkan/jm/panvk_vX_cmd_precomp.c
Normal file
|
|
@ -0,0 +1,81 @@
|
||||||
|
/*
|
||||||
|
* Copyright © 2024 Collabora Ltd.
|
||||||
|
*
|
||||||
|
* SPDX-License-Identifier: MIT
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "bifrost_compile.h"
|
||||||
|
#include "pan_desc.h"
|
||||||
|
#include "pan_encoder.h"
|
||||||
|
#include "panvk_cmd_alloc.h"
|
||||||
|
#include "panvk_cmd_buffer.h"
|
||||||
|
#include "panvk_cmd_precomp.h"
|
||||||
|
#include "panvk_macros.h"
|
||||||
|
#include "panvk_mempool.h"
|
||||||
|
#include "panvk_precomp_cache.h"
|
||||||
|
|
||||||
|
void
|
||||||
|
panvk_per_arch(dispatch_precomp)(struct panvk_precomp_ctx *ctx,
|
||||||
|
struct panlib_precomp_grid grid,
|
||||||
|
enum panlib_barrier barrier,
|
||||||
|
enum libpan_shaders_program idx, void *data,
|
||||||
|
size_t data_size)
|
||||||
|
{
|
||||||
|
struct panvk_cmd_buffer *cmdbuf = ctx->cmdbuf;
|
||||||
|
struct panvk_batch *batch = cmdbuf->cur_batch;
|
||||||
|
struct panvk_device *dev = to_panvk_device(cmdbuf->vk.base.device);
|
||||||
|
const struct panvk_shader *shader =
|
||||||
|
panvk_per_arch(precomp_cache_get)(dev->precomp_cache, idx);
|
||||||
|
|
||||||
|
assert(shader);
|
||||||
|
assert(batch && "Need current batch to be present!");
|
||||||
|
|
||||||
|
struct panfrost_ptr push_uniforms = panvk_cmd_alloc_dev_mem(
|
||||||
|
cmdbuf, desc, BIFROST_PRECOMPILED_KERNEL_SYSVALS_SIZE + data_size, 16);
|
||||||
|
|
||||||
|
assert(push_uniforms.gpu);
|
||||||
|
|
||||||
|
struct bifrost_precompiled_kernel_sysvals sysvals;
|
||||||
|
sysvals.num_workgroups.x = grid.count[0];
|
||||||
|
sysvals.num_workgroups.y = grid.count[1];
|
||||||
|
sysvals.num_workgroups.z = grid.count[2];
|
||||||
|
|
||||||
|
bifrost_precompiled_kernel_prepare_push_uniforms(push_uniforms.cpu, data,
|
||||||
|
data_size, &sysvals);
|
||||||
|
|
||||||
|
struct panfrost_ptr job = panvk_cmd_alloc_desc(cmdbuf, COMPUTE_JOB);
|
||||||
|
assert(job.gpu);
|
||||||
|
|
||||||
|
panfrost_pack_work_groups_compute(
|
||||||
|
pan_section_ptr(job.cpu, COMPUTE_JOB, INVOCATION), grid.count[0],
|
||||||
|
grid.count[1], grid.count[2], shader->local_size.x, shader->local_size.y,
|
||||||
|
shader->local_size.z, false, false);
|
||||||
|
|
||||||
|
pan_section_pack(job.cpu, COMPUTE_JOB, PARAMETERS, cfg) {
|
||||||
|
cfg.job_task_split = util_logbase2_ceil(shader->local_size.x + 1) +
|
||||||
|
util_logbase2_ceil(shader->local_size.y + 1) +
|
||||||
|
util_logbase2_ceil(shader->local_size.z + 1);
|
||||||
|
}
|
||||||
|
|
||||||
|
struct pan_compute_dim dim = {.x = grid.count[0],
|
||||||
|
.y = grid.count[1],
|
||||||
|
.z = grid.count[2]};
|
||||||
|
uint64_t tld =
|
||||||
|
panvk_per_arch(cmd_dispatch_prepare_tls)(cmdbuf, shader, &dim, false);
|
||||||
|
assert(tld);
|
||||||
|
|
||||||
|
pan_section_pack(job.cpu, COMPUTE_JOB, DRAW, cfg) {
|
||||||
|
cfg.state = panvk_priv_mem_dev_addr(shader->rsd),
|
||||||
|
cfg.push_uniforms = push_uniforms.gpu;
|
||||||
|
cfg.thread_storage = tld;
|
||||||
|
}
|
||||||
|
|
||||||
|
util_dynarray_append(&batch->jobs, void *, job.cpu);
|
||||||
|
|
||||||
|
bool job_barrier = (barrier & PANLIB_BARRIER_JM_BARRIER) != 0;
|
||||||
|
bool suppress_prefetch =
|
||||||
|
(barrier & PANLIB_BARRIER_JM_SUPPRESS_PREFETCH) != 0;
|
||||||
|
|
||||||
|
pan_jc_add_job(&batch->vtc_jc, MALI_JOB_TYPE_COMPUTE, job_barrier,
|
||||||
|
suppress_prefetch, 0, 0, &job, false);
|
||||||
|
}
|
||||||
|
|
@ -70,6 +70,7 @@ jm_files = [
|
||||||
'jm/panvk_vX_cmd_draw.c',
|
'jm/panvk_vX_cmd_draw.c',
|
||||||
'jm/panvk_vX_cmd_event.c',
|
'jm/panvk_vX_cmd_event.c',
|
||||||
'jm/panvk_vX_cmd_query.c',
|
'jm/panvk_vX_cmd_query.c',
|
||||||
|
'jm/panvk_vX_cmd_precomp.c',
|
||||||
'jm/panvk_vX_event.c',
|
'jm/panvk_vX_event.c',
|
||||||
'jm/panvk_vX_queue.c',
|
'jm/panvk_vX_queue.c',
|
||||||
]
|
]
|
||||||
|
|
@ -82,6 +83,7 @@ csf_files = [
|
||||||
'csf/panvk_vX_cmd_draw.c',
|
'csf/panvk_vX_cmd_draw.c',
|
||||||
'csf/panvk_vX_cmd_event.c',
|
'csf/panvk_vX_cmd_event.c',
|
||||||
'csf/panvk_vX_cmd_query.c',
|
'csf/panvk_vX_cmd_query.c',
|
||||||
|
'csf/panvk_vX_cmd_precomp.c',
|
||||||
'csf/panvk_vX_device.c',
|
'csf/panvk_vX_device.c',
|
||||||
'csf/panvk_vX_event.c',
|
'csf/panvk_vX_event.c',
|
||||||
'csf/panvk_vX_exception_handler.c',
|
'csf/panvk_vX_exception_handler.c',
|
||||||
|
|
@ -103,6 +105,7 @@ common_per_arch_files = [
|
||||||
'panvk_vX_descriptor_set.c',
|
'panvk_vX_descriptor_set.c',
|
||||||
'panvk_vX_descriptor_set_layout.c',
|
'panvk_vX_descriptor_set_layout.c',
|
||||||
'panvk_vX_device.c',
|
'panvk_vX_device.c',
|
||||||
|
'panvk_vX_precomp_cache.c',
|
||||||
'panvk_vX_query_pool.c',
|
'panvk_vX_query_pool.c',
|
||||||
'panvk_vX_image_view.c',
|
'panvk_vX_image_view.c',
|
||||||
'panvk_vX_nir_lower_descriptors.c',
|
'panvk_vX_nir_lower_descriptors.c',
|
||||||
|
|
|
||||||
40
src/panfrost/vulkan/panvk_cmd_precomp.h
Normal file
40
src/panfrost/vulkan/panvk_cmd_precomp.h
Normal file
|
|
@ -0,0 +1,40 @@
|
||||||
|
/*
|
||||||
|
* Copyright © 2024 Collabora Ltd.
|
||||||
|
* SPDX-License-Identifier: MIT
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef PANVK_CMD_PRECOMP_H
|
||||||
|
#define PANVK_CMD_PRECOMP_H
|
||||||
|
|
||||||
|
#ifndef PAN_ARCH
|
||||||
|
#error "PAN_ARCH must be defined"
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#include "genxml/gen_macros.h"
|
||||||
|
#include "util/simple_mtx.h"
|
||||||
|
#include "libpan_dgc.h"
|
||||||
|
#include "libpan_shaders.h"
|
||||||
|
#include "panvk_macros.h"
|
||||||
|
|
||||||
|
struct panvk_cmd_buffer;
|
||||||
|
|
||||||
|
struct panvk_precomp_ctx {
|
||||||
|
struct panvk_cmd_buffer *cmdbuf;
|
||||||
|
};
|
||||||
|
|
||||||
|
static inline struct panvk_precomp_ctx
|
||||||
|
panvk_per_arch(precomp_cs)(struct panvk_cmd_buffer *cmdbuf)
|
||||||
|
{
|
||||||
|
return (struct panvk_precomp_ctx){.cmdbuf = cmdbuf};
|
||||||
|
}
|
||||||
|
|
||||||
|
enum libpan_shaders_program;
|
||||||
|
void panvk_per_arch(dispatch_precomp)(struct panvk_precomp_ctx *ctx,
|
||||||
|
struct panlib_precomp_grid grid,
|
||||||
|
enum panlib_barrier barrier,
|
||||||
|
enum libpan_shaders_program idx,
|
||||||
|
void *data, size_t data_size);
|
||||||
|
|
||||||
|
#define MESA_DISPATCH_PRECOMP panvk_per_arch(dispatch_precomp)
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
@ -27,6 +27,8 @@
|
||||||
|
|
||||||
#define PANVK_MAX_QUEUE_FAMILIES 1
|
#define PANVK_MAX_QUEUE_FAMILIES 1
|
||||||
|
|
||||||
|
struct panvk_precomp_cache;
|
||||||
|
|
||||||
struct panvk_device {
|
struct panvk_device {
|
||||||
struct vk_device vk;
|
struct vk_device vk;
|
||||||
|
|
||||||
|
|
@ -63,6 +65,8 @@ struct panvk_device {
|
||||||
struct panvk_queue *queues[PANVK_MAX_QUEUE_FAMILIES];
|
struct panvk_queue *queues[PANVK_MAX_QUEUE_FAMILIES];
|
||||||
int queue_count[PANVK_MAX_QUEUE_FAMILIES];
|
int queue_count[PANVK_MAX_QUEUE_FAMILIES];
|
||||||
|
|
||||||
|
struct panvk_precomp_cache *precomp_cache;
|
||||||
|
|
||||||
struct {
|
struct {
|
||||||
struct u_trace_context utctx;
|
struct u_trace_context utctx;
|
||||||
#ifdef HAVE_PERFETTO
|
#ifdef HAVE_PERFETTO
|
||||||
|
|
|
||||||
43
src/panfrost/vulkan/panvk_precomp_cache.h
Normal file
43
src/panfrost/vulkan/panvk_precomp_cache.h
Normal file
|
|
@ -0,0 +1,43 @@
|
||||||
|
/*
|
||||||
|
* Copyright © 2024 Collabora Ltd.
|
||||||
|
* SPDX-License-Identifier: MIT
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef PANVK_PRECOMP_CACHE_H
|
||||||
|
#define PANVK_PRECOMP_CACHE_H
|
||||||
|
|
||||||
|
#include "panvk_device.h"
|
||||||
|
#ifndef PAN_ARCH
|
||||||
|
#error "PAN_ARCH must be defined"
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#include "genxml/gen_macros.h"
|
||||||
|
#include "util/simple_mtx.h"
|
||||||
|
#include "libpan_dgc.h"
|
||||||
|
#include "libpan_shaders.h"
|
||||||
|
#include "pan_shader.h"
|
||||||
|
#include "panvk_macros.h"
|
||||||
|
#include "panvk_mempool.h"
|
||||||
|
#include "panvk_shader.h"
|
||||||
|
|
||||||
|
struct panvk_device;
|
||||||
|
|
||||||
|
struct panvk_precomp_cache {
|
||||||
|
simple_mtx_t lock;
|
||||||
|
struct panvk_device *dev;
|
||||||
|
|
||||||
|
/* Precompiled binary table */
|
||||||
|
const uint32_t **programs;
|
||||||
|
|
||||||
|
struct panvk_shader *precomp[LIBPAN_SHADERS_NUM_PROGRAMS];
|
||||||
|
};
|
||||||
|
|
||||||
|
struct panvk_precomp_cache *
|
||||||
|
panvk_per_arch(precomp_cache_init)(struct panvk_device *dev);
|
||||||
|
void panvk_per_arch(precomp_cache_cleanup)(struct panvk_precomp_cache *cache);
|
||||||
|
|
||||||
|
struct panvk_shader *
|
||||||
|
panvk_per_arch(precomp_cache_get)(struct panvk_precomp_cache *cache,
|
||||||
|
unsigned program);
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
@ -21,6 +21,7 @@
|
||||||
#include "panvk_instance.h"
|
#include "panvk_instance.h"
|
||||||
#include "panvk_macros.h"
|
#include "panvk_macros.h"
|
||||||
#include "panvk_physical_device.h"
|
#include "panvk_physical_device.h"
|
||||||
|
#include "panvk_precomp_cache.h"
|
||||||
#include "panvk_priv_bo.h"
|
#include "panvk_priv_bo.h"
|
||||||
#include "panvk_queue.h"
|
#include "panvk_queue.h"
|
||||||
#include "panvk_utrace.h"
|
#include "panvk_utrace.h"
|
||||||
|
|
@ -154,6 +155,23 @@ panvk_meta_cleanup(struct panvk_device *device)
|
||||||
vk_meta_device_finish(&device->vk, &device->meta);
|
vk_meta_device_finish(&device->vk, &device->meta);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static VkResult
|
||||||
|
panvk_precomp_init(struct panvk_device *device)
|
||||||
|
{
|
||||||
|
device->precomp_cache = panvk_per_arch(precomp_cache_init)(device);
|
||||||
|
|
||||||
|
if (device->precomp_cache == NULL)
|
||||||
|
return VK_ERROR_OUT_OF_HOST_MEMORY;
|
||||||
|
|
||||||
|
return VK_SUCCESS;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void
|
||||||
|
panvk_precomp_cleanup(struct panvk_device *device)
|
||||||
|
{
|
||||||
|
panvk_per_arch(precomp_cache_cleanup)(device->precomp_cache);
|
||||||
|
}
|
||||||
|
|
||||||
/* Always reserve the lower 32MB. */
|
/* Always reserve the lower 32MB. */
|
||||||
#define PANVK_VA_RESERVE_BOTTOM 0x2000000ull
|
#define PANVK_VA_RESERVE_BOTTOM 0x2000000ull
|
||||||
|
|
||||||
|
|
@ -326,10 +344,15 @@ panvk_per_arch(create_device)(struct panvk_physical_device *physical_device,
|
||||||
|
|
||||||
vk_device_set_drm_fd(&device->vk, device->kmod.dev->fd);
|
vk_device_set_drm_fd(&device->vk, device->kmod.dev->fd);
|
||||||
|
|
||||||
result = panvk_meta_init(device);
|
|
||||||
|
result = panvk_precomp_init(device);
|
||||||
if (result != VK_SUCCESS)
|
if (result != VK_SUCCESS)
|
||||||
goto err_free_priv_bos;
|
goto err_free_priv_bos;
|
||||||
|
|
||||||
|
result = panvk_meta_init(device);
|
||||||
|
if (result != VK_SUCCESS)
|
||||||
|
goto err_free_precomp;
|
||||||
|
|
||||||
for (unsigned i = 0; i < pCreateInfo->queueCreateInfoCount; i++) {
|
for (unsigned i = 0; i < pCreateInfo->queueCreateInfoCount; i++) {
|
||||||
const VkDeviceQueueCreateInfo *queue_create =
|
const VkDeviceQueueCreateInfo *queue_create =
|
||||||
&pCreateInfo->pQueueCreateInfos[i];
|
&pCreateInfo->pQueueCreateInfos[i];
|
||||||
|
|
@ -378,6 +401,8 @@ err_finish_queues:
|
||||||
|
|
||||||
panvk_meta_cleanup(device);
|
panvk_meta_cleanup(device);
|
||||||
|
|
||||||
|
err_free_precomp:
|
||||||
|
panvk_precomp_cleanup(device);
|
||||||
err_free_priv_bos:
|
err_free_priv_bos:
|
||||||
panvk_priv_bo_unref(device->tiler_oom.handlers_bo);
|
panvk_priv_bo_unref(device->tiler_oom.handlers_bo);
|
||||||
panvk_priv_bo_unref(device->sample_positions);
|
panvk_priv_bo_unref(device->sample_positions);
|
||||||
|
|
@ -414,6 +439,7 @@ panvk_per_arch(destroy_device)(struct panvk_device *device,
|
||||||
vk_free(&device->vk.alloc, device->queues[i]);
|
vk_free(&device->vk.alloc, device->queues[i]);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
panvk_precomp_cleanup(device);
|
||||||
panvk_meta_cleanup(device);
|
panvk_meta_cleanup(device);
|
||||||
panvk_priv_bo_unref(device->tiler_oom.handlers_bo);
|
panvk_priv_bo_unref(device->tiler_oom.handlers_bo);
|
||||||
panvk_priv_bo_unref(device->tiler_heap);
|
panvk_priv_bo_unref(device->tiler_heap);
|
||||||
|
|
|
||||||
101
src/panfrost/vulkan/panvk_vX_precomp_cache.c
Normal file
101
src/panfrost/vulkan/panvk_vX_precomp_cache.c
Normal file
|
|
@ -0,0 +1,101 @@
|
||||||
|
/*
|
||||||
|
* Copyright © 2024 Collabora Ltd.
|
||||||
|
* Copyright 2022 Alyssa Rosenzweig
|
||||||
|
* SPDX-License-Identifier: MIT
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "genxml/gen_macros.h"
|
||||||
|
#include "util/macros.h"
|
||||||
|
#include "bifrost_compile.h"
|
||||||
|
#include "libpan_shaders.h"
|
||||||
|
#include "panvk_device.h"
|
||||||
|
#include "panvk_precomp_cache.h"
|
||||||
|
#include "panvk_shader.h"
|
||||||
|
#include "vk_alloc.h"
|
||||||
|
#include "vk_shader.h"
|
||||||
|
|
||||||
|
struct panvk_precomp_cache *
|
||||||
|
panvk_per_arch(precomp_cache_init)(struct panvk_device *dev)
|
||||||
|
{
|
||||||
|
struct panvk_precomp_cache *res =
|
||||||
|
vk_zalloc(&dev->vk.alloc, sizeof(struct panvk_precomp_cache), 8,
|
||||||
|
VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
|
||||||
|
|
||||||
|
if (res == NULL)
|
||||||
|
return NULL;
|
||||||
|
|
||||||
|
simple_mtx_init(&res->lock, mtx_plain);
|
||||||
|
res->dev = dev;
|
||||||
|
res->programs = GENX(libpan_shaders_default);
|
||||||
|
|
||||||
|
return res;
|
||||||
|
}
|
||||||
|
|
||||||
|
void
|
||||||
|
panvk_per_arch(precomp_cache_cleanup)(struct panvk_precomp_cache *cache)
|
||||||
|
{
|
||||||
|
for (unsigned i = 0; i < ARRAY_SIZE(cache->precomp); i++) {
|
||||||
|
if (cache->precomp[i])
|
||||||
|
vk_shader_destroy(&cache->dev->vk, &cache->precomp[i]->vk,
|
||||||
|
&cache->dev->vk.alloc);
|
||||||
|
}
|
||||||
|
|
||||||
|
simple_mtx_destroy(&cache->lock);
|
||||||
|
vk_free(&cache->dev->vk.alloc, cache);
|
||||||
|
}
|
||||||
|
|
||||||
|
static struct panvk_shader *
|
||||||
|
panvk_get_precompiled_locked(struct panvk_precomp_cache *cache,
|
||||||
|
unsigned program)
|
||||||
|
{
|
||||||
|
simple_mtx_assert_locked(&cache->lock);
|
||||||
|
|
||||||
|
/* It is possible that, while waiting for the lock, another thread uploaded
|
||||||
|
* the shader. Check for that so we don't double-upload.
|
||||||
|
*/
|
||||||
|
if (cache->precomp[program])
|
||||||
|
return cache->precomp[program];
|
||||||
|
|
||||||
|
const uint32_t *bin = cache->programs[program];
|
||||||
|
const struct bifrost_precompiled_kernel_info *info = (void *)bin;
|
||||||
|
const void *binary = (const uint8_t *)bin + sizeof(*info);
|
||||||
|
struct pan_compute_dim local_dim = {
|
||||||
|
.x = info->local_size_x,
|
||||||
|
.y = info->local_size_y,
|
||||||
|
.z = info->local_size_z,
|
||||||
|
};
|
||||||
|
|
||||||
|
struct panvk_shader *shader;
|
||||||
|
VkResult result = panvk_per_arch(create_shader_from_binary)(
|
||||||
|
cache->dev, &info->info, local_dim, binary, info->binary_size, &shader);
|
||||||
|
|
||||||
|
if (result != VK_SUCCESS)
|
||||||
|
return NULL;
|
||||||
|
|
||||||
|
/* We must only write to the cache once we are done compiling, since other
|
||||||
|
* threads may be reading the cache concurrently. Do this last.
|
||||||
|
*/
|
||||||
|
p_atomic_set(&cache->precomp[program], shader);
|
||||||
|
|
||||||
|
return shader;
|
||||||
|
}
|
||||||
|
|
||||||
|
struct panvk_shader *
|
||||||
|
panvk_per_arch(precomp_cache_get)(struct panvk_precomp_cache *cache,
|
||||||
|
unsigned program)
|
||||||
|
{
|
||||||
|
/* Shaders are immutable once written, so if we atomically read a non-NULL
|
||||||
|
* shader, then we have a valid cached shader and are done.
|
||||||
|
*/
|
||||||
|
struct panvk_shader *ret = p_atomic_read(cache->precomp + program);
|
||||||
|
|
||||||
|
if (ret != NULL)
|
||||||
|
return ret;
|
||||||
|
|
||||||
|
/* Otherwise, take the lock and upload. */
|
||||||
|
simple_mtx_lock(&cache->lock);
|
||||||
|
ret = panvk_get_precompiled_locked(cache, program);
|
||||||
|
simple_mtx_unlock(&cache->lock);
|
||||||
|
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
Loading…
Add table
Reference in a new issue