From 8adede1d44e8039d6c82d452e6959d8e2fcba1da Mon Sep 17 00:00:00 2001
From: Mary Guillemard
Date: Thu, 2 Jan 2025 16:34:02 +0100
Subject: [PATCH] panfrost: Implement precomp dispatch on Gallium

This handles JM and CSF but excludes Midgard.

Signed-off-by: Mary Guillemard
Acked-by: Alyssa Rosenzweig
Reviewed-by: Boris Brezillon
Part-of:
---
 src/gallium/drivers/panfrost/meson.build     |   1 +
 src/gallium/drivers/panfrost/pan_cmdstream.c |   3 +
 src/gallium/drivers/panfrost/pan_device.h    |   4 +
 src/gallium/drivers/panfrost/pan_precomp.c   | 391 +++++++++++++++++++
 src/gallium/drivers/panfrost/pan_precomp.h   |  58 +++
 5 files changed, 457 insertions(+)
 create mode 100644 src/gallium/drivers/panfrost/pan_precomp.c
 create mode 100644 src/gallium/drivers/panfrost/pan_precomp.h

diff --git a/src/gallium/drivers/panfrost/meson.build b/src/gallium/drivers/panfrost/meson.build
index bb9b9ef1883..61db5780f58 100644
--- a/src/gallium/drivers/panfrost/meson.build
+++ b/src/gallium/drivers/panfrost/meson.build
@@ -48,6 +48,7 @@ foreach ver : panfrost_versions
 files_panfrost_vx = [
   'pan_cmdstream.c',
   'pan_fb_preload.c',
+  'pan_precomp.c',
   pan_packers,
 ]
 if ver in ['4', '5', '6', '7', '9']
diff --git a/src/gallium/drivers/panfrost/pan_cmdstream.c b/src/gallium/drivers/panfrost/pan_cmdstream.c
index 0b1110d5759..22fc36453fd 100644
--- a/src/gallium/drivers/panfrost/pan_cmdstream.c
+++ b/src/gallium/drivers/panfrost/pan_cmdstream.c
@@ -52,6 +52,7 @@
 #include "pan_jm.h"
 #include "pan_job.h"
 #include "pan_pool.h"
+#include "pan_precomp.h"
 #include "pan_resource.h"
 #include "pan_samples.h"
 #include "pan_shader.h"
@@ -4219,6 +4220,8 @@ GENX(panfrost_cmdstream_screen_init)(struct panfrost_screen *screen)
    (&dev->fb_preload_cache, panfrost_device_gpu_id(dev), &dev->blend_shaders,
     &screen->mempools.bin.base, &screen->mempools.desc.base);
 
+   dev->precomp_cache = GENX(panfrost_precomp_cache_init)(screen);
+
 #if PAN_GPU_SUPPORTS_DISPATCH_INDIRECT
    pan_indirect_dispatch_meta_init(
       &dev->indirect_dispatch, panfrost_device_gpu_id(dev),
diff --git a/src/gallium/drivers/panfrost/pan_device.h b/src/gallium/drivers/panfrost/pan_device.h
index ad629f15f06..b7a16ee93b5 100644
--- a/src/gallium/drivers/panfrost/pan_device.h
+++ b/src/gallium/drivers/panfrost/pan_device.h
@@ -79,6 +79,8 @@ extern "C" {
 /* Fencepost problem, hence the off-by-one */
 #define NR_BO_CACHE_BUCKETS (MAX_BO_CACHE_BUCKET - MIN_BO_CACHE_BUCKET + 1)
 
+struct panfrost_precomp_cache;
+
 struct panfrost_device {
    /* For ralloc */
    void *memctx;
@@ -175,6 +177,8 @@ struct panfrost_device {
    * unconditionally on Bifrost, and useful for sharing with Midgard */
    struct panfrost_bo *sample_positions;
+
+   struct panfrost_precomp_cache *precomp_cache;
 };
 
 static inline int
diff --git a/src/gallium/drivers/panfrost/pan_precomp.c b/src/gallium/drivers/panfrost/pan_precomp.c
new file mode 100644
index 00000000000..2e4b154bd6b
--- /dev/null
+++ b/src/gallium/drivers/panfrost/pan_precomp.c
@@ -0,0 +1,391 @@
+/*
+ * Copyright © 2024 Collabora, Ltd.
+ * SPDX-License-Identifier: MIT
+ */
+#include "pan_precomp.h"
+#include "util/u_memory.h"
+#include "bifrost_compile.h"
+#include "pan_context.h"
+#include "pan_desc.h"
+#include "pan_pool.h"
+#include "pan_screen.h"
+#include "pan_shader.h"
+
+#if PAN_ARCH >= 10
+#include "genxml/cs_builder.h"
+#include "pan_csf.h"
+#endif
+
+struct panfrost_precomp_cache *
+GENX(panfrost_precomp_cache_init)(struct panfrost_screen *screen)
+{
+   struct panfrost_precomp_cache *cache = CALLOC_STRUCT(panfrost_precomp_cache);
+
+   if (cache == NULL)
+      return NULL;
+
+   simple_mtx_init(&cache->lock, mtx_plain);
+   cache->programs = GENX(libpan_shaders_default);
+   cache->bin_pool = &screen->mempools.bin.base;
+   cache->desc_pool = &screen->mempools.desc.base;
+
+   return cache;
+}
+
+#if PAN_ARCH >= 9
+static enum mali_flush_to_zero_mode
+panfrost_ftz_mode(struct pan_shader_info *info)
+{
+   if (info->ftz_fp32) {
+      if (info->ftz_fp16)
+         return MALI_FLUSH_TO_ZERO_MODE_ALWAYS;
+      else
+         return MALI_FLUSH_TO_ZERO_MODE_DX11;
+   } else {
+      /* We don't have a "flush FP16, preserve FP32" mode, but APIs
+       * should not be able to generate that.
+       */
+      assert(!info->ftz_fp16 && !info->ftz_fp32);
+      return MALI_FLUSH_TO_ZERO_MODE_PRESERVE_SUBNORMALS;
+   }
+}
+#endif
+
+static struct panfrost_precomp_shader *
+panfrost_precomp_shader_create(
+   struct panfrost_precomp_cache *cache,
+   const struct bifrost_precompiled_kernel_info *info, const void *binary)
+{
+   struct panfrost_precomp_shader *res = CALLOC_STRUCT(panfrost_precomp_shader);
+
+   if (res == NULL)
+      return NULL;
+
+   res->info = info->info;
+
+   struct pan_compute_dim local_dim = {
+      .x = info->local_size_x,
+      .y = info->local_size_y,
+      .z = info->local_size_z,
+   };
+   res->local_size = local_dim;
+
+   struct panfrost_ptr bin =
+      pan_pool_alloc_aligned(cache->bin_pool, info->binary_size, 64);
+
+   if (!bin.gpu)
+      goto err;
+
+   memcpy(bin.cpu, binary, info->binary_size);
+   res->code_ptr = bin.gpu;
+
+#if PAN_ARCH <= 7
+   struct panfrost_ptr rsd =
+      pan_pool_alloc_desc(cache->desc_pool, RENDERER_STATE);
+
+   if (!rsd.gpu)
+      goto err;
+
+   pan_cast_and_pack(rsd.cpu, RENDERER_STATE, cfg) {
+      pan_shader_prepare_rsd(&res->info, bin.gpu, &cfg);
+   }
+
+   res->state_ptr = rsd.gpu;
+#else
+   struct panfrost_ptr spd =
+      pan_pool_alloc_desc(cache->desc_pool, SHADER_PROGRAM);
+
+   if (!spd.gpu)
+      goto err;
+
+   pan_cast_and_pack(spd.cpu, SHADER_PROGRAM, cfg) {
+      cfg.stage = pan_shader_stage(&res->info);
+      cfg.register_allocation =
+         pan_register_allocation(res->info.work_reg_count);
+      cfg.binary = res->code_ptr;
+      cfg.preload.r48_r63 = (res->info.preload >> 48);
+      cfg.flush_to_zero_mode = panfrost_ftz_mode(&res->info);
+   }
+
+   res->state_ptr = spd.gpu;
+#endif
+
+   return res;
+
+err:
+   FREE(res);
+   return NULL;
+}
+
+static void
+panfrost_precomp_shader_destroy(struct panfrost_precomp_cache *cache,
+                                struct panfrost_precomp_shader *shader)
+{
+   /* XXX: Do we have anything to do here? */
+}
+
+void
+GENX(panfrost_precomp_cache_cleanup)(struct panfrost_precomp_cache *cache)
+{
+   for (unsigned i = 0; i < ARRAY_SIZE(cache->precomp); i++) {
+      if (cache->precomp[i])
+         panfrost_precomp_shader_destroy(cache, cache->precomp[i]);
+   }
+
+   simple_mtx_destroy(&cache->lock);
+   FREE(cache);
+}
+
+static struct panfrost_precomp_shader *
+panfrost_precomp_cache_get_locked(struct panfrost_precomp_cache *cache,
+                                  unsigned program)
+{
+   simple_mtx_assert_locked(&cache->lock);
+
+   /* It is possible that, while waiting for the lock, another thread uploaded
+    * the shader. Check for that so we don't double-upload.
+    */
+   if (cache->precomp[program])
+      return cache->precomp[program];
+
+   const uint32_t *bin = cache->programs[program];
+   const struct bifrost_precompiled_kernel_info *info = (void *)bin;
+   const void *binary = (const uint8_t *)bin + sizeof(*info);
+
+   struct panfrost_precomp_shader *shader =
+      panfrost_precomp_shader_create(cache, info, binary);
+
+   if (shader == NULL)
+      return NULL;
+
+   /* We must only write to the cache once we are done compiling, since other
+    * threads may be reading the cache concurrently. Do this last.
+    */
+   p_atomic_set(&cache->precomp[program], shader);
+
+   return shader;
+}
+
+static struct panfrost_precomp_shader *
+panfrost_precomp_cache_get(struct panfrost_precomp_cache *cache,
+                           unsigned program)
+{
+   /* Shaders are immutable once written, so if we atomically read a non-NULL
+    * shader, then we have a valid cached shader and are done.
+    */
+   struct panfrost_precomp_shader *ret =
+      p_atomic_read(cache->precomp + program);
+
+   if (ret != NULL)
+      return ret;
+
+   /* Otherwise, take the lock and upload. */
+   simple_mtx_lock(&cache->lock);
+   ret = panfrost_precomp_cache_get_locked(cache, program);
+   simple_mtx_unlock(&cache->lock);
+
+   return ret;
+}
+
+static uint64_t
+emit_tls(struct panfrost_batch *batch,
+         const struct panfrost_precomp_shader *shader,
+         const struct pan_compute_dim *dim)
+{
+   struct panfrost_context *ctx = batch->ctx;
+   struct panfrost_device *dev = pan_device(ctx->base.screen);
+   struct panfrost_ptr t =
+      pan_pool_alloc_desc(&batch->pool.base, LOCAL_STORAGE);
+
+   struct pan_tls_info info = {
+      .tls.size = shader->info.tls_size,
+      .wls.size = shader->info.wls_size,
+      .wls.instances = pan_wls_instances(dim),
+   };
+
+   if (info.tls.size) {
+      struct panfrost_bo *bo = panfrost_batch_get_scratchpad(
+         batch, info.tls.size, dev->thread_tls_alloc, dev->core_id_range);
+      info.tls.ptr = bo->ptr.gpu;
+   }
+
+   if (info.wls.size) {
+      unsigned size = pan_wls_adjust_size(info.wls.size) * info.wls.instances *
+                      dev->core_id_range;
+
+      struct panfrost_bo *bo = panfrost_batch_get_shared_memory(batch, size, 1);
+
+      info.wls.ptr = bo->ptr.gpu;
+   }
+
+   GENX(pan_emit_tls)(&info, t.cpu);
+
+   return t.gpu;
+}
+
+void
+GENX(panfrost_launch_precomp)(struct panfrost_batch *batch,
+                              struct panlib_precomp_grid grid,
+                              enum panlib_barrier barrier,
+                              enum libpan_shaders_program idx, void *data,
+                              size_t data_size)
+{
+   assert(PAN_ARCH >= 6 && "Midgard isn't supported on launch_precomp");
+
+   struct panfrost_context *ctx = batch->ctx;
+   struct pipe_context *gallium = (struct pipe_context *)ctx;
+   struct panfrost_device *dev = pan_device(gallium->screen);
+
+   struct panfrost_precomp_shader *shader =
+      panfrost_precomp_cache_get(dev->precomp_cache, idx);
+   assert(shader);
+
+   struct panfrost_ptr push_uniforms = pan_pool_alloc_aligned(
+      &batch->pool.base, BIFROST_PRECOMPILED_KERNEL_SYSVALS_SIZE + data_size,
+      16);
+   assert(push_uniforms.gpu);
+
+   struct pan_compute_dim dim = {.x = grid.count[0],
+                                 .y = grid.count[1],
+                                 .z = grid.count[2]};
+   uint64_t tsd = emit_tls(batch, shader, &dim);
+   assert(tsd);
+
+   struct bifrost_precompiled_kernel_sysvals sysvals;
+   sysvals.num_workgroups.x = grid.count[0];
+   sysvals.num_workgroups.y = grid.count[1];
+   sysvals.num_workgroups.z = grid.count[2];
+
+   bifrost_precompiled_kernel_prepare_push_uniforms(push_uniforms.cpu, data,
+                                                    data_size, &sysvals);
+
+#if PAN_ARCH <= 9
+   struct panfrost_ptr job =
+      pan_pool_alloc_desc(&batch->pool.base, COMPUTE_JOB);
+   assert(job.gpu);
+
+#if PAN_ARCH <= 7
+   panfrost_pack_work_groups_compute(
+      pan_section_ptr(job.cpu, COMPUTE_JOB, INVOCATION), grid.count[0],
+      grid.count[1], grid.count[2], shader->local_size.x, shader->local_size.y,
+      shader->local_size.z, false, false);
+
+   pan_section_pack(job.cpu, COMPUTE_JOB, PARAMETERS, cfg) {
+      cfg.job_task_split = util_logbase2_ceil(shader->local_size.x + 1) +
+                           util_logbase2_ceil(shader->local_size.y + 1) +
+                           util_logbase2_ceil(shader->local_size.z + 1);
+   }
+
+   pan_section_pack(job.cpu, COMPUTE_JOB, DRAW, cfg) {
+      cfg.state = shader->state_ptr;
+      cfg.push_uniforms = push_uniforms.gpu;
+      cfg.thread_storage = tsd;
+   }
+#else
+   pan_section_pack(job.cpu, COMPUTE_JOB, PAYLOAD, cfg) {
+      cfg.workgroup_size_x = shader->local_size.x;
+      cfg.workgroup_size_y = shader->local_size.y;
+      cfg.workgroup_size_z = shader->local_size.z;
+
+      cfg.workgroup_count_x = grid.count[0];
+      cfg.workgroup_count_y = grid.count[1];
+      cfg.workgroup_count_z = grid.count[2];
+
+      cfg.compute.shader = shader->state_ptr;
+
+      uint64_t fau_count =
+         DIV_ROUND_UP(BIFROST_PRECOMPILED_KERNEL_SYSVALS_SIZE + data_size, 8);
+
+      cfg.compute.fau = push_uniforms.gpu;
+      cfg.compute.fau_count = fau_count;
+
+      cfg.compute.thread_storage = tsd;
+
+      cfg.compute.resources = 0;
+      cfg.allow_merging_workgroups = false;
+
+      cfg.task_increment = 1;
+      cfg.task_axis = MALI_TASK_AXIS_Z;
+   }
+#endif
+
+   bool job_barrier = (barrier & PANLIB_BARRIER_JM_BARRIER) != 0;
+   bool suppress_prefetch =
+      (barrier & PANLIB_BARRIER_JM_SUPPRESS_PREFETCH) != 0;
+
+   pan_jc_add_job(&batch->jm.jobs.vtc_jc, MALI_JOB_TYPE_COMPUTE, job_barrier,
+                  suppress_prefetch, 0, 0, &job, false);
+#else
+   struct cs_builder *b = batch->csf.cs.builder;
+
+   /* No resource table */
+   cs_move64_to(b, cs_reg64(b, 0), 0);
+
+   uint64_t fau_count =
+      DIV_ROUND_UP(BIFROST_PRECOMPILED_KERNEL_SYSVALS_SIZE + data_size, 8);
+   uint64_t fau_ptr = push_uniforms.gpu | (fau_count << 56);
+   cs_move64_to(b, cs_reg64(b, 8), fau_ptr);
+
+   cs_move64_to(b, cs_reg64(b, 16), shader->state_ptr);
+   cs_move64_to(b, cs_reg64(b, 24), tsd);
+
+   /* Global attribute offset */
+   cs_move32_to(b, cs_reg32(b, 32), 0);
+
+   /* Compute workgroup size */
+   struct mali_compute_size_workgroup_packed wg_size;
+   pan_pack(&wg_size, COMPUTE_SIZE_WORKGROUP, cfg) {
+      cfg.workgroup_size_x = shader->local_size.x;
+      cfg.workgroup_size_y = shader->local_size.y;
+      cfg.workgroup_size_z = shader->local_size.z;
+      cfg.allow_merging_workgroups = false;
+   }
+   cs_move32_to(b, cs_reg32(b, 33), wg_size.opaque[0]);
+
+   /* Job offset */
+   cs_move32_to(b, cs_reg32(b, 34), 0);
+   cs_move32_to(b, cs_reg32(b, 35), 0);
+   cs_move32_to(b, cs_reg32(b, 36), 0);
+
+   /* Job size */
+   cs_move32_to(b, cs_reg32(b, 37), grid.count[0]);
+   cs_move32_to(b, cs_reg32(b, 38), grid.count[1]);
+   cs_move32_to(b, cs_reg32(b, 39), grid.count[2]);
+
+   unsigned threads_per_wg =
+      shader->local_size.x * shader->local_size.y * shader->local_size.z;
+   unsigned max_thread_cnt = panfrost_compute_max_thread_count(
+      &dev->kmod.props, shader->info.work_reg_count);
+
+   /* Pick the task_axis and task_increment to maximize thread utilization. */
+   unsigned task_axis = MALI_TASK_AXIS_X;
+   unsigned threads_per_task = threads_per_wg;
+   unsigned task_increment = 0;
+
+   for (unsigned i = 0; i < 3; i++) {
+      if (threads_per_task * grid.count[i] >= max_thread_cnt) {
+         /* We reached our thread limit, stop at the current axis and
+          * calculate the increment so it doesn't exceed the per-core
+          * thread capacity.
+          */
+         task_increment = max_thread_cnt / threads_per_task;
+         break;
+      } else if (task_axis == MALI_TASK_AXIS_Z) {
+         /* We reached the Z axis, and there's still room to stuff more
+          * threads. Pick the current axis grid size as our increment
+          * as there's no point using something bigger.
+          */
+         task_increment = grid.count[i];
+         break;
+      }
+
+      threads_per_task *= grid.count[i];
+      task_axis++;
+   }
+
+   assert(task_axis <= MALI_TASK_AXIS_Z);
+   assert(task_increment > 0);
+   cs_run_compute(b, task_increment, task_axis, false,
+                  cs_shader_res_sel(0, 0, 0, 0));
+#endif
+}
diff --git a/src/gallium/drivers/panfrost/pan_precomp.h b/src/gallium/drivers/panfrost/pan_precomp.h
new file mode 100644
index 00000000000..34a800d9a5a
--- /dev/null
+++ b/src/gallium/drivers/panfrost/pan_precomp.h
@@ -0,0 +1,58 @@
+/*
+ * Copyright © 2024 Collabora, Ltd.
+ * SPDX-License-Identifier: MIT
+ */
+#ifndef PAN_PRECOMP_H
+#define PAN_PRECOMP_H
+
+#include "pan_screen.h"
+#ifndef PAN_ARCH
+#error "PAN_ARCH must be defined"
+#endif
+
+#include "genxml/gen_macros.h"
+#include "util/simple_mtx.h"
+#include "libpan_dgc.h"
+#include "libpan_shaders.h"
+#include "pan_job.h"
+#include "pan_pool.h"
+
+struct panfrost_precomp_shader {
+   struct pan_shader_info info;
+   struct pan_compute_dim local_size;
+   uint64_t code_ptr;
+   uint64_t state_ptr;
+};
+
+struct panfrost_screen;
+
+struct panfrost_precomp_cache {
+   simple_mtx_t lock;
+
+   /* Shader binary pool. */
+   struct pan_pool *bin_pool;
+
+   /* Shader desc pool for any descriptor that can be re-used across
+    * indirect dispatch calls. Job descriptors are allocated from the batch pool.
+    */
+   struct pan_pool *desc_pool;
+
+   /* Precompiled binary table */
+   const uint32_t **programs;
+
+   struct panfrost_precomp_shader *precomp[LIBPAN_SHADERS_NUM_PROGRAMS];
+};
+
+struct panfrost_precomp_cache *
+   GENX(panfrost_precomp_cache_init)(struct panfrost_screen *screen);
+void GENX(panfrost_precomp_cache_cleanup)(struct panfrost_precomp_cache *cache);
+
+void GENX(panfrost_launch_precomp)(struct panfrost_batch *batch,
+                                   struct panlib_precomp_grid grid,
+                                   enum panlib_barrier barrier,
+                                   enum libpan_shaders_program idx, void *data,
+                                   size_t data_size);
+
+#define MESA_DISPATCH_PRECOMP GENX(panfrost_launch_precomp)
+
+#endif
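
Usage note (illustrative sketch, not part of the diff): a minimal example of how
a driver-internal caller could dispatch a precompiled kernel through the new
helper. LIBPAN_SHADERS_PROG_EXAMPLE and struct example_push are hypothetical
placeholders; the batch, grid, barrier and entry-point names are the ones this
patch introduces, and the barrier flags are only consumed by the JM path.

/* Hypothetical caller: one workgroup per 64 elements. The payload is copied
 * into FAU/push-uniform space by panfrost_launch_precomp, right after the
 * bifrost_precompiled_kernel_sysvals block. */
struct example_push {
   uint64_t buf_address;
   uint32_t element_count;
};

static void
example_dispatch(struct panfrost_batch *batch, uint64_t buf, uint32_t count)
{
   struct example_push data = {
      .buf_address = buf,
      .element_count = count,
   };

   /* Workgroup counts only; the workgroup size itself comes from the
    * precompiled kernel's local_size. */
   struct panlib_precomp_grid grid = {
      .count = {DIV_ROUND_UP(count, 64), 1, 1},
   };

   MESA_DISPATCH_PRECOMP(batch, grid, PANLIB_BARRIER_JM_BARRIER,
                         LIBPAN_SHADERS_PROG_EXAMPLE, &data, sizeof(data));
}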
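
FAU sizing, worked through (illustrative numbers): fau_count counts 64-bit
words covering the sysvals block plus the caller's payload. Assuming, for
illustration only, a 16-byte bifrost_precompiled_kernel_sysvals block and
data_size = 24, fau_count = DIV_ROUND_UP(16 + 24, 8) = 5. On CSF the count is
packed into the top byte of the FAU pointer (push_uniforms.gpu | 5ull << 56),
so the single cs_reg64(b, 8) move carries both the table address and its
length, while the JM v9 path writes the count to the separate
cfg.compute.fau_count descriptor field instead.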
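
Task split, worked through (illustrative numbers): with max_thread_cnt = 1024,
a kernel local size of 4x4x4 (threads_per_wg = 64) and grid.count = {32, 8, 4},
the first iteration finds 64 * 32 = 2048 >= 1024, so the loop stops on the X
axis with task_increment = 1024 / 64 = 16: the hardware iterator walks X in
chunks of 16 workgroups, each chunk filling but never exceeding the per-core
thread capacity. With grid.count = {4, 2, 2} and max_thread_cnt = 2048, the
limit is never hit, so the loop falls through to the Z axis and uses
task_increment = grid.count[2] = 2, since a larger increment could not be
exploited anyway.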