From 1f1282801170170bd225fb4794071c29ec7f11ca Mon Sep 17 00:00:00 2001 From: Lars-Ivar Hesselberg Simonsen Date: Mon, 23 Mar 2026 14:05:55 +0100 Subject: [PATCH] pan: Add handling for v15+ uapi thread_max_wg_size thread_max_workgroup_size has been replaced with thread_num_active_granularity in v15, which requires updated handling for calculating the max number of threads in a workgroup --- src/gallium/drivers/panfrost/pan_cmdstream.c | 3 ++- src/gallium/drivers/panfrost/pan_precomp.c | 5 +++-- src/panfrost/lib/kmod/pan_kmod.h | 3 +++ src/panfrost/lib/kmod/panthor_kmod.c | 8 +++++-- src/panfrost/lib/pan_desc.h | 13 +++++++---- src/panfrost/lib/pan_props.c | 9 ++++++++ .../vulkan/csf/panvk_vX_cmd_dispatch.c | 8 ++++--- .../vulkan/csf/panvk_vX_cmd_precomp.c | 3 ++- .../vulkan/jm/panvk_vX_cmd_dispatch.c | 5 +++-- .../vulkan/panvk_vX_physical_device.c | 22 ++++++++++++------- 10 files changed, 56 insertions(+), 23 deletions(-) diff --git a/src/gallium/drivers/panfrost/pan_cmdstream.c b/src/gallium/drivers/panfrost/pan_cmdstream.c index aa32944195f..5294b5831da 100644 --- a/src/gallium/drivers/panfrost/pan_cmdstream.c +++ b/src/gallium/drivers/panfrost/pan_cmdstream.c @@ -1661,7 +1661,8 @@ panfrost_emit_shared_memory(struct panfrost_batch *batch, .tls.size = ss->info.tls_size, .wls.size = ss->info.wls_size + grid->variable_shared_mem, .wls.instances = pan_calc_wls_instances( - &local_size, &dev->kmod.dev->props, grid->indirect ? NULL : &dim), + &local_size, &dev->kmod.dev->props, grid->indirect ? 
NULL : &dim, + ss->info.work_reg_count), }; if (ss->info.tls_size) { diff --git a/src/gallium/drivers/panfrost/pan_precomp.c b/src/gallium/drivers/panfrost/pan_precomp.c index c9b0c9b62fe..da2d3f51f7d 100644 --- a/src/gallium/drivers/panfrost/pan_precomp.c +++ b/src/gallium/drivers/panfrost/pan_precomp.c @@ -197,8 +197,9 @@ emit_tls(struct panfrost_batch *batch, struct pan_tls_info info = { .tls.size = shader->info.tls_size, .wls.size = shader->info.wls_size, - .wls.instances = pan_calc_wls_instances(&shader->local_size, - &dev->kmod.dev->props, dim), + .wls.instances = + pan_calc_wls_instances(&shader->local_size, &dev->kmod.dev->props, dim, + shader->info.work_reg_count), }; if (info.tls.size) { diff --git a/src/panfrost/lib/kmod/pan_kmod.h b/src/panfrost/lib/kmod/pan_kmod.h index e7356330e7d..a876afeafe3 100644 --- a/src/panfrost/lib/kmod/pan_kmod.h +++ b/src/panfrost/lib/kmod/pan_kmod.h @@ -206,6 +206,9 @@ struct pan_kmod_dev_props { /* Maximum number of threads per workgroup. */ uint32_t max_threads_per_wg; + /* Granularity of number of active threads. */ + uint32_t num_threads_active_granularity; + /* Number of registers per core. Can be used to determine the maximum * number of threads that can be allocated for a specific shader based on * the number of registers assigned to this shader. diff --git a/src/panfrost/lib/kmod/panthor_kmod.c b/src/panfrost/lib/kmod/panthor_kmod.c index 8900ea5ac5e..2c723f7d506 100644 --- a/src/panfrost/lib/kmod/panthor_kmod.c +++ b/src/panfrost/lib/kmod/panthor_kmod.c @@ -133,13 +133,17 @@ panthor_dev_query_thread_props(struct panthor_kmod_dev *panthor_dev) props->max_tasks_per_core = panthor_dev->props.gpu.thread_features >> 24; props->num_registers_per_core = panthor_dev->props.gpu.thread_features & 0x3fffff; + props->num_threads_active_granularity = + panthor_dev->props.gpu.thread_num_active_granularity; /* We assume that all thread properties are populated. 
If we ever have a GPU * that have one of the THREAD_xxx register that's zero, we can always add a * quirk here. */ - assert(props->max_threads_per_wg && props->max_threads_per_core && - props->max_tasks_per_core && props->num_registers_per_core); + assert( + (props->max_threads_per_wg || props->num_threads_active_granularity) && + props->max_threads_per_core && props->max_tasks_per_core && + props->num_registers_per_core); /* There is no THREAD_TLS_ALLOC register on v10+, and the maximum number * of TLS instance per core is assumed to be the maximum number of threads diff --git a/src/panfrost/lib/pan_desc.h b/src/panfrost/lib/pan_desc.h index 7cc7639c897..bdb19976977 100644 --- a/src/panfrost/lib/pan_desc.h +++ b/src/panfrost/lib/pan_desc.h @@ -196,18 +196,22 @@ pan_wls_adjust_size(unsigned wls_size) static inline unsigned pan_calc_workgroups_per_task(const struct pan_compute_dim *shader_local_size, - const struct pan_kmod_dev_props *props) + const struct pan_kmod_dev_props *props, + unsigned work_reg_count) { /* Each shader core can run N tasks and a total of M threads at any single * time, thus each task should ideally have no more than M/N threads. 
*/ unsigned max_threads_per_task = props->max_threads_per_core / props->max_tasks_per_core; + ASSERTED unsigned max_threads_per_wg = + pan_compute_max_thread_count(props, work_reg_count); + /* To achieve the best utilization, we should aim for as many workgroups * per tasks as we can fit without exceeding the above thread limit */ unsigned threads_per_wg = shader_local_size->x * shader_local_size->y * shader_local_size->z; - assert(threads_per_wg > 0 && threads_per_wg <= props->max_threads_per_wg); + assert(threads_per_wg > 0 && threads_per_wg <= max_threads_per_wg); unsigned wg_per_task = DIV_ROUND_UP(max_threads_per_task, threads_per_wg); assert(wg_per_task > 0 && wg_per_task <= max_threads_per_task); @@ -217,14 +221,15 @@ pan_calc_workgroups_per_task(const struct pan_compute_dim *shader_local_size, static inline unsigned pan_calc_wls_instances(const struct pan_compute_dim *shader_local_size, const struct pan_kmod_dev_props *props, - const struct pan_compute_dim *dim) + const struct pan_compute_dim *dim, + unsigned work_reg_count) { /* NOTE: If the instance count is lower than the number of workgroups * being dispatched, the HW will hold back workgroups until instances * can be reused. */ unsigned instances; unsigned wg_per_task = - pan_calc_workgroups_per_task(shader_local_size, props); + pan_calc_workgroups_per_task(shader_local_size, props, work_reg_count); unsigned max_instances_per_core = util_next_power_of_two(wg_per_task * props->max_tasks_per_core); diff --git a/src/panfrost/lib/pan_props.c b/src/panfrost/lib/pan_props.c index 056bd48d4a2..b125dad4947 100644 --- a/src/panfrost/lib/pan_props.c +++ b/src/panfrost/lib/pan_props.c @@ -70,6 +70,15 @@ pan_compute_max_thread_count(const struct pan_kmod_dev_props *props, aligned_reg_count = work_reg_count <= 32 ? 
32 : 64; } + if (pan_arch(props->gpu_id) >= 15) { + assert(props->num_threads_active_granularity); + unsigned max_threads_per_wg = + ROUND_DOWN_TO(props->num_registers_per_core / aligned_reg_count, + props->num_threads_active_granularity); + return MIN2(max_threads_per_wg, props->max_threads_per_core); + } + + assert(props->max_threads_per_wg); return MIN3(props->max_threads_per_wg, props->max_threads_per_core, props->num_registers_per_core / aligned_reg_count); } diff --git a/src/panfrost/vulkan/csf/panvk_vX_cmd_dispatch.c b/src/panfrost/vulkan/csf/panvk_vX_cmd_dispatch.c index b1bf45483ee..1fd8e437d49 100644 --- a/src/panfrost/vulkan/csf/panvk_vX_cmd_dispatch.c +++ b/src/panfrost/vulkan/csf/panvk_vX_cmd_dispatch.c @@ -89,8 +89,9 @@ panvk_per_arch(cmd_dispatch_prepare_tls)( unsigned core_id_range; pan_query_core_count(&phys_dev->kmod.dev->props, &core_id_range); - tlsinfo.wls.instances = pan_calc_wls_instances( - &cs->cs.local_size, &phys_dev->kmod.dev->props, indirect ? NULL : dim); + tlsinfo.wls.instances = + pan_calc_wls_instances(&cs->cs.local_size, &phys_dev->kmod.dev->props, + indirect ? 
NULL : dim, cs->info.work_reg_count); unsigned wls_total_size = pan_calc_total_wls_size( tlsinfo.wls.size, tlsinfo.wls.instances, core_id_range); @@ -156,7 +157,8 @@ cmd_dispatch(struct panvk_cmd_buffer *cmdbuf, struct panvk_dispatch_info *info) unsigned wg_per_task = 0; if (indirect) wg_per_task = pan_calc_workgroups_per_task(&cs->cs.local_size, - &phys_dev->kmod.dev->props); + &phys_dev->kmod.dev->props, + cs->info.work_reg_count); if (compute_state_dirty(cmdbuf, DESC_STATE) || compute_state_dirty(cmdbuf, CS)) { diff --git a/src/panfrost/vulkan/csf/panvk_vX_cmd_precomp.c b/src/panfrost/vulkan/csf/panvk_vX_cmd_precomp.c index bd302847aec..56f6c546217 100644 --- a/src/panfrost/vulkan/csf/panvk_vX_cmd_precomp.c +++ b/src/panfrost/vulkan/csf/panvk_vX_cmd_precomp.c @@ -155,7 +155,8 @@ panvk_per_arch(dispatch_precomp)(struct panvk_precomp_ctx *ctx, * increment/axis parameters requires knowledge of job dimensions, but * this is somewhat offset by run_compute being a native instruction. */ task_increment = pan_calc_workgroups_per_task( - &shader->cs.local_size, &phys_dev->kmod.dev->props); + &shader->cs.local_size, &phys_dev->kmod.dev->props, + shader->info.work_reg_count); } else { panvk_per_arch(calculate_task_axis_and_increment)( shader, phys_dev, &dim, &task_axis, &task_increment); diff --git a/src/panfrost/vulkan/jm/panvk_vX_cmd_dispatch.c b/src/panfrost/vulkan/jm/panvk_vX_cmd_dispatch.c index 57b48e69f15..fb5782a141f 100644 --- a/src/panfrost/vulkan/jm/panvk_vX_cmd_dispatch.c +++ b/src/panfrost/vulkan/jm/panvk_vX_cmd_dispatch.c @@ -51,8 +51,9 @@ panvk_per_arch(cmd_dispatch_prepare_tls)( unsigned core_id_range; pan_query_core_count(&phys_dev->kmod.dev->props, &core_id_range); - batch->tlsinfo.wls.instances = pan_calc_wls_instances( - &cs->cs.local_size, &phys_dev->kmod.dev->props, indirect ? NULL : dim); + batch->tlsinfo.wls.instances = + pan_calc_wls_instances(&cs->cs.local_size, &phys_dev->kmod.dev->props, + indirect ? 
NULL : dim, cs->info.work_reg_count); batch->wls_total_size = pan_calc_total_wls_size( batch->tlsinfo.wls.size, batch->tlsinfo.wls.instances, core_id_range); } diff --git a/src/panfrost/vulkan/panvk_vX_physical_device.c b/src/panfrost/vulkan/panvk_vX_physical_device.c index 0db46da50f6..333a5abdaa0 100644 --- a/src/panfrost/vulkan/panvk_vX_physical_device.c +++ b/src/panfrost/vulkan/panvk_vX_physical_device.c @@ -728,8 +728,17 @@ panvk_per_arch(get_physical_device_properties)( const bool has_disk_cache = device->vk.disk_cache != NULL; + /* Calculate the value using register count on v15+. + * TODO: As this requires register allocation changes ensuring we don't + * violate the limits based on the workgroup size, clamp the value to half of + * the max threads value (always safe and matches previous GPUs) for now. */ + unsigned max_threads_per_wg = + (PAN_ARCH >= 15) + ? MIN2(pan_compute_max_thread_count(&device->kmod.dev->props, 32), + device->kmod.dev->props.max_threads_per_core / 2) + : device->kmod.dev->props.max_threads_per_wg; /* Ensure that the max threads count per workgroup is valid for Bifrost */ - assert(PAN_ARCH > 8 || device->kmod.dev->props.max_threads_per_wg <= 1024); + assert(PAN_ARCH > 8 || max_threads_per_wg <= 1024); float pointSizeRangeMin; float pointSizeRangeMax; @@ -858,11 +867,9 @@ panvk_per_arch(get_physical_device_properties)( /* We could also split into serveral jobs but this has many limitations. * As such we limit to the max threads per workgroup supported by the GPU. */ - .maxComputeWorkGroupInvocations = - device->kmod.dev->props.max_threads_per_wg, - .maxComputeWorkGroupSize = {device->kmod.dev->props.max_threads_per_wg, - device->kmod.dev->props.max_threads_per_wg, - device->kmod.dev->props.max_threads_per_wg}, + .maxComputeWorkGroupInvocations = max_threads_per_wg, + .maxComputeWorkGroupSize = {max_threads_per_wg, max_threads_per_wg, + max_threads_per_wg}, /* 8-bit subpixel precision. 
*/ .subPixelPrecisionBits = 8, .subTexelPrecisionBits = 8, @@ -1053,8 +1060,7 @@ panvk_per_arch(get_physical_device_properties)( .minSubgroupSize = pan_subgroup_size(PAN_ARCH), .maxSubgroupSize = pan_subgroup_size(PAN_ARCH), .maxComputeWorkgroupSubgroups = - device->kmod.dev->props.max_threads_per_wg / - pan_subgroup_size(PAN_ARCH), + max_threads_per_wg / pan_subgroup_size(PAN_ARCH), .requiredSubgroupSizeStages = VK_SHADER_STAGE_COMPUTE_BIT, .maxInlineUniformBlockSize = MAX_INLINE_UNIFORM_BLOCK_SIZE, .maxPerStageDescriptorInlineUniformBlocks =