From 6d1e51de0426bec2334c3a107c9e215905e223b0 Mon Sep 17 00:00:00 2001
From: Lars-Ivar Hesselberg Simonsen
Date: Wed, 14 May 2025 13:21:25 +0200
Subject: [PATCH] panvk/v10+: Limit direct dispatch WLS allocation

During direct dispatch, we calculate the size of the WLS allocation
from the number of WLS instances, a calculation that is unbounded in
the number of workgroups. This leads to extreme allocation sizes, and
potentially VK_ERROR_OUT_OF_DEVICE_MEMORY, for direct dispatches with
a large number of workgroups.

This change adds an upper bound to the number of WLS instances, using
the same value we assume for indirect dispatches.

Additionally, this commit fixes the WLS max instance calculation
(which should be per core).

Fixes: 5544d39f442 ("panvk: Add a CSF backend for panvk_queue/cmd_buffer")
Reviewed-by: Eric R. Smith
Reviewed-by: John Anthony
Tested-by: Heiko Stuebner
Part-of:
(cherry picked from commit 0a47a1cb6d71f39515960f051fc3f96dd3fb01e6)
---
 .pick_status.json                             |  2 +-
 src/panfrost/lib/pan_desc.h                   | 56 ++++++++++++++++++
 .../vulkan/csf/panvk_vX_cmd_dispatch.c        | 57 +++----------------
 3 files changed, 66 insertions(+), 49 deletions(-)

diff --git a/.pick_status.json b/.pick_status.json
index 13f216e2c36..1a5b8c7fbf4 100644
--- a/.pick_status.json
+++ b/.pick_status.json
@@ -5154,7 +5154,7 @@
         "description": "panvk/v10+: Limit direct dispatch WLS allocation",
         "nominated": true,
         "nomination_type": 2,
-        "resolution": 0,
+        "resolution": 1,
         "main_sha": null,
         "because_sha": "5544d39f4420da88c53aaf8dd48d86ac92bd0eaa",
         "notes": null
diff --git a/src/panfrost/lib/pan_desc.h b/src/panfrost/lib/pan_desc.h
index 0072a6d0bf2..412d509a6dd 100644
--- a/src/panfrost/lib/pan_desc.h
+++ b/src/panfrost/lib/pan_desc.h
@@ -30,6 +30,7 @@
 
 #include "genxml/gen_macros.h"
 
+#include "kmod/pan_kmod.h"
 #include "pan_texture.h"
 
 struct pan_compute_dim {
@@ -165,6 +166,61 @@ pan_wls_adjust_size(unsigned wls_size)
    return util_next_power_of_two(MAX2(wls_size, 128));
 }
 
+static inline unsigned
+pan_calc_workgroups_per_task(const struct pan_compute_dim *shader_local_size,
+                             const struct pan_kmod_dev_props *props)
+{
+   /* Each shader core can run N tasks and a total of M threads at any single
+    * time, thus each task should ideally have no more than M/N threads. */
+   unsigned max_threads_per_task =
+      props->max_threads_per_core / props->max_tasks_per_core;
+
+   /* To achieve the best utilization, we should aim for as many workgroups
+    * per tasks as we can fit without exceeding the above thread limit */
+   unsigned threads_per_wg =
+      shader_local_size->x * shader_local_size->y * shader_local_size->z;
+   assert(threads_per_wg > 0 && threads_per_wg <= props->max_threads_per_wg);
+   unsigned wg_per_task = DIV_ROUND_UP(max_threads_per_task, threads_per_wg);
+   assert(wg_per_task > 0 && wg_per_task <= max_threads_per_task);
+
+   return wg_per_task;
+}
+
+static inline unsigned
+pan_calc_wls_instances(const struct pan_compute_dim *shader_local_size,
+                       const struct pan_kmod_dev_props *props,
+                       const struct pan_compute_dim *dim)
+{
+   /* NOTE: If the instance count is lower than the number of workgroups
+    * being dispatched, the HW will hold back workgroups until instances
+    * can be reused. */
+   unsigned instances;
+   unsigned wg_per_task =
+      pan_calc_workgroups_per_task(shader_local_size, props);
+   unsigned max_instances_per_core =
+      util_next_power_of_two(wg_per_task * props->max_tasks_per_core);
+
+   /* Not passing workgroup dimensions implies indirect compute. */
+   if (!dim) {
+      /* Assume we utilize all shader cores to the max */
+      instances = max_instances_per_core;
+   } else {
+      /* NOTE: There is no benefit from allocating more instances than what
+       * can concurrently be used by the HW */
+      instances = MIN2(pan_wls_instances(dim), max_instances_per_core);
+   }
+   return instances;
+}
+
+static inline unsigned
+pan_calc_total_wls_size(unsigned wls_size, unsigned wls_instances,
+                        unsigned max_core_id_plus_one)
+{
+   unsigned size = pan_wls_adjust_size(wls_size);
+
+   return size * wls_instances * max_core_id_plus_one;
+}
+
 #ifdef PAN_ARCH
 
 #if PAN_ARCH >= 5
diff --git a/src/panfrost/vulkan/csf/panvk_vX_cmd_dispatch.c b/src/panfrost/vulkan/csf/panvk_vX_cmd_dispatch.c
index 2934bd999a1..eabde1ffbac 100644
--- a/src/panfrost/vulkan/csf/panvk_vX_cmd_dispatch.c
+++ b/src/panfrost/vulkan/csf/panvk_vX_cmd_dispatch.c
@@ -65,27 +65,6 @@ prepare_driver_set(struct panvk_cmd_buffer *cmdbuf)
    return VK_SUCCESS;
 }
 
-static unsigned
-calculate_workgroups_per_task(const struct panvk_shader *shader,
-                              struct panvk_physical_device *phys_dev)
-{
-   /* Each shader core can run N tasks and a total of M threads at any single
-    * time, thus each task should ideally have no more than M/N threads. */
-   unsigned max_threads_per_task = phys_dev->kmod.props.max_threads_per_core /
-                                   phys_dev->kmod.props.max_tasks_per_core;
-
-   /* To achieve the best utilization, we should aim for as many workgroups
-    * per tasks as we can fit without exceeding the above thread limit */
-   unsigned threads_per_wg = shader->cs.local_size.x * shader->cs.local_size.y *
-                             shader->cs.local_size.z;
-   assert(threads_per_wg > 0 &&
-          threads_per_wg <= phys_dev->kmod.props.max_threads_per_wg);
-   unsigned wg_per_task = DIV_ROUND_UP(max_threads_per_task, threads_per_wg);
-   assert(wg_per_task > 0 && wg_per_task <= max_threads_per_task);
-
-   return wg_per_task;
-}
-
 uint64_t
 panvk_per_arch(cmd_dispatch_prepare_tls)(struct panvk_cmd_buffer *cmdbuf,
                                          const struct panvk_shader *shader,
@@ -103,35 +82,16 @@ panvk_per_arch(cmd_dispatch_prepare_tls)(struct panvk_cmd_buffer *cmdbuf,
       .tls.size = shader->info.tls_size,
       .wls.size = shader->info.wls_size,
    };
-   unsigned core_id_range;
-   unsigned core_count =
-      panfrost_query_core_count(&phys_dev->kmod.props, &core_id_range);
-
-   /* Only used for indirect dispatch */
-   unsigned wg_per_task = 0;
-   if (indirect)
-      wg_per_task = calculate_workgroups_per_task(shader, phys_dev);
 
    if (tlsinfo.wls.size) {
-      /* NOTE: If the instance count is lower than the number of workgroups
-       * being dispatched, the HW will hold back workgroups until instances
-       * can be reused. */
-      /* NOTE: There is no benefit from allocating more instances than what
-       * can concurrently be used by the HW */
-      if (indirect) {
-         /* Assume we utilize all shader cores to the max */
-         tlsinfo.wls.instances = util_next_power_of_two(
-            wg_per_task * phys_dev->kmod.props.max_tasks_per_core * core_count);
-      } else {
-         /* TODO: Similar to what we are doing for indirect this should change
-          * to calculate the maximum number of workgroups we can execute
-          * concurrently. */
-         tlsinfo.wls.instances = pan_wls_instances(dim);
-      }
+      unsigned core_id_range;
+      panfrost_query_core_count(&phys_dev->kmod.props, &core_id_range);
 
-      /* TODO: Clamp WLS instance to some maximum WLS budget. */
-      unsigned wls_total_size = pan_wls_adjust_size(tlsinfo.wls.size) *
-                                tlsinfo.wls.instances * core_id_range;
+      tlsinfo.wls.instances = pan_calc_wls_instances(
+         &shader->cs.local_size, &phys_dev->kmod.props, indirect ? NULL : dim);
+
+      unsigned wls_total_size = pan_calc_total_wls_size(
+         tlsinfo.wls.size, tlsinfo.wls.instances, core_id_range);
 
       /* TODO: Reuse WLS allocation for all dispatch commands in the command
        * buffer, similar to what we do for TLS in draw. As WLS size (and
@@ -192,7 +152,8 @@ cmd_dispatch(struct panvk_cmd_buffer *cmdbuf, struct panvk_dispatch_info *info)
    /* Only used for indirect dispatch */
    unsigned wg_per_task = 0;
    if (indirect)
-      wg_per_task = calculate_workgroups_per_task(shader, phys_dev);
+      wg_per_task = pan_calc_workgroups_per_task(&shader->cs.local_size,
+                                                 &phys_dev->kmod.props);
 
    if (compute_state_dirty(cmdbuf, DESC_STATE) ||
        compute_state_dirty(cmdbuf, CS)) {
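
As a sanity check on the new bound, the following standalone sketch replays
the sizing math from pan_calc_workgroups_per_task(), pan_calc_wls_instances()
and pan_calc_total_wls_size() for a direct dispatch. All device constants
(2048 threads per core, 8 tasks per core, a core_id_range of 10) and the
dispatch shape are invented for illustration; next_pow2() stands in for
util_next_power_of_two() and the "unbounded" count stands in for
pan_wls_instances(dim). Real values come from pan_kmod_dev_props and the
dispatch dimensions.

/* Minimal sketch of the bounded WLS sizing above. All constants are
 * illustrative assumptions, not values queried from real hardware. */
#include <stdio.h>

/* Stand-in for util_next_power_of_two() */
static unsigned next_pow2(unsigned x)
{
   unsigned p = 1;
   while (p < x)
      p <<= 1;
   return p;
}

int main(void)
{
   /* Hypothetical device properties (pan_kmod_dev_props stand-ins) */
   const unsigned max_threads_per_core = 2048;
   const unsigned max_tasks_per_core = 8;
   const unsigned core_id_range = 10;

   /* Hypothetical direct dispatch: 64-thread workgroups, 2^20 workgroups,
    * 512 bytes of WLS per instance (already a power of two >= 128) */
   const unsigned threads_per_wg = 64;
   const unsigned num_wg = 1u << 20;
   const unsigned wls_size = 512;

   /* pan_calc_workgroups_per_task(): threads per task, then DIV_ROUND_UP */
   unsigned max_threads_per_task = max_threads_per_core / max_tasks_per_core;
   unsigned wg_per_task =
      (max_threads_per_task + threads_per_wg - 1) / threads_per_wg;

   /* pan_calc_wls_instances() on the direct (dim != NULL) path: clamp the
    * per-dispatch instance count to what the cores can run concurrently */
   unsigned max_instances_per_core =
      next_pow2(wg_per_task * max_tasks_per_core);
   unsigned unbounded = next_pow2(num_wg); /* pan_wls_instances(dim) stand-in */
   unsigned instances =
      unbounded < max_instances_per_core ? unbounded : max_instances_per_core;

   /* pan_calc_total_wls_size(): adjusted size * instances * core_id_range */
   unsigned long long bounded =
      (unsigned long long)wls_size * instances * core_id_range;
   unsigned long long old_behavior =
      (unsigned long long)wls_size * unbounded * core_id_range;

   printf("wg_per_task=%u instances=%u (was %u)\n",
          wg_per_task, instances, unbounded);
   printf("WLS allocation: %llu bytes (was %llu bytes)\n",
          bounded, old_behavior);
   return 0;
}

With these made-up numbers, wg_per_task is 4, the instance count drops from
1048576 to 32, and the allocation shrinks from roughly 5 GiB to 160 KiB,
which is exactly the out-of-memory failure mode the patch removes.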