From 1f1282801170170bd225fb4794071c29ec7f11ca Mon Sep 17 00:00:00 2001 From: Lars-Ivar Hesselberg Simonsen Date: Mon, 23 Mar 2026 14:05:55 +0100 Subject: [PATCH] pan: Add handling for v15+ uapi thread_max_wg_size thread_max_workgroup_size has been replaced with thread_num_active_granularity in v15, which requires updated handling for calculating the max number of threads in a workgroup --- src/gallium/drivers/panfrost/pan_cmdstream.c | 3 ++- src/gallium/drivers/panfrost/pan_precomp.c | 5 +++-- src/panfrost/lib/kmod/pan_kmod.h | 3 +++ src/panfrost/lib/kmod/panthor_kmod.c | 8 +++++-- src/panfrost/lib/pan_desc.h | 13 +++++++---- src/panfrost/lib/pan_props.c | 9 ++++++++ .../vulkan/csf/panvk_vX_cmd_dispatch.c | 8 ++++--- .../vulkan/csf/panvk_vX_cmd_precomp.c | 3 ++- .../vulkan/jm/panvk_vX_cmd_dispatch.c | 5 +++-- .../vulkan/panvk_vX_physical_device.c | 22 ++++++++++++------- 10 files changed, 56 insertions(+), 23 deletions(-) diff --git a/src/gallium/drivers/panfrost/pan_cmdstream.c b/src/gallium/drivers/panfrost/pan_cmdstream.c index aa32944195f..5294b5831da 100644 --- a/src/gallium/drivers/panfrost/pan_cmdstream.c +++ b/src/gallium/drivers/panfrost/pan_cmdstream.c @@ -1661,7 +1661,8 @@ panfrost_emit_shared_memory(struct panfrost_batch *batch, .tls.size = ss->info.tls_size, .wls.size = ss->info.wls_size + grid->variable_shared_mem, .wls.instances = pan_calc_wls_instances( - &local_size, &dev->kmod.dev->props, grid->indirect ? NULL : &dim), + &local_size, &dev->kmod.dev->props, grid->indirect ? 
NULL : &dim, + ss->info.work_reg_count), }; if (ss->info.tls_size) { diff --git a/src/gallium/drivers/panfrost/pan_precomp.c b/src/gallium/drivers/panfrost/pan_precomp.c index c9b0c9b62fe..da2d3f51f7d 100644 --- a/src/gallium/drivers/panfrost/pan_precomp.c +++ b/src/gallium/drivers/panfrost/pan_precomp.c @@ -197,8 +197,9 @@ emit_tls(struct panfrost_batch *batch, struct pan_tls_info info = { .tls.size = shader->info.tls_size, .wls.size = shader->info.wls_size, - .wls.instances = pan_calc_wls_instances(&shader->local_size, - &dev->kmod.dev->props, dim), + .wls.instances = + pan_calc_wls_instances(&shader->local_size, &dev->kmod.dev->props, dim, + shader->info.work_reg_count), }; if (info.tls.size) { diff --git a/src/panfrost/lib/kmod/pan_kmod.h b/src/panfrost/lib/kmod/pan_kmod.h index e7356330e7d..a876afeafe3 100644 --- a/src/panfrost/lib/kmod/pan_kmod.h +++ b/src/panfrost/lib/kmod/pan_kmod.h @@ -206,6 +206,9 @@ struct pan_kmod_dev_props { /* Maximum number of threads per workgroup. */ uint32_t max_threads_per_wg; + /* Granularity of number of active threads. */ + uint32_t num_threads_active_granularity; + /* Number of registers per core. Can be used to determine the maximum * number of threads that can be allocated for a specific shader based on * the number of registers assigned to this shader. diff --git a/src/panfrost/lib/kmod/panthor_kmod.c b/src/panfrost/lib/kmod/panthor_kmod.c index 8900ea5ac5e..2c723f7d506 100644 --- a/src/panfrost/lib/kmod/panthor_kmod.c +++ b/src/panfrost/lib/kmod/panthor_kmod.c @@ -133,13 +133,17 @@ panthor_dev_query_thread_props(struct panthor_kmod_dev *panthor_dev) props->max_tasks_per_core = panthor_dev->props.gpu.thread_features >> 24; props->num_registers_per_core = panthor_dev->props.gpu.thread_features & 0x3fffff; + props->num_threads_active_granularity = + panthor_dev->props.gpu.thread_num_active_granularity; /* We assume that all thread properties are populated. 
If we ever have a GPU * that have one of the THREAD_xxx register that's zero, we can always add a * quirk here. */ - assert(props->max_threads_per_wg && props->max_threads_per_core && - props->max_tasks_per_core && props->num_registers_per_core); + assert( + (props->max_threads_per_wg || props->num_threads_active_granularity) && + props->max_threads_per_core && props->max_tasks_per_core && + props->num_registers_per_core); /* There is no THREAD_TLS_ALLOC register on v10+, and the maximum number * of TLS instance per core is assumed to be the maximum number of threads diff --git a/src/panfrost/lib/pan_desc.h b/src/panfrost/lib/pan_desc.h index 7cc7639c897..bdb19976977 100644 --- a/src/panfrost/lib/pan_desc.h +++ b/src/panfrost/lib/pan_desc.h @@ -196,18 +196,22 @@ pan_wls_adjust_size(unsigned wls_size) static inline unsigned pan_calc_workgroups_per_task(const struct pan_compute_dim *shader_local_size, - const struct pan_kmod_dev_props *props) + const struct pan_kmod_dev_props *props, + unsigned work_reg_count) { /* Each shader core can run N tasks and a total of M threads at any single * time, thus each task should ideally have no more than M/N threads. 
*/ unsigned max_threads_per_task = props->max_threads_per_core / props->max_tasks_per_core; + ASSERTED unsigned max_threads_per_wg = + pan_compute_max_thread_count(props, work_reg_count); + /* To achieve the best utilization, we should aim for as many workgroups * per tasks as we can fit without exceeding the above thread limit */ unsigned threads_per_wg = shader_local_size->x * shader_local_size->y * shader_local_size->z; - assert(threads_per_wg > 0 && threads_per_wg <= props->max_threads_per_wg); + assert(threads_per_wg > 0 && threads_per_wg <= max_threads_per_wg); unsigned wg_per_task = DIV_ROUND_UP(max_threads_per_task, threads_per_wg); assert(wg_per_task > 0 && wg_per_task <= max_threads_per_task); @@ -217,14 +221,15 @@ pan_calc_workgroups_per_task(const struct pan_compute_dim *shader_local_size, static inline unsigned pan_calc_wls_instances(const struct pan_compute_dim *shader_local_size, const struct pan_kmod_dev_props *props, - const struct pan_compute_dim *dim) + const struct pan_compute_dim *dim, + unsigned work_reg_count) { /* NOTE: If the instance count is lower than the number of workgroups * being dispatched, the HW will hold back workgroups until instances * can be reused. */ unsigned instances; unsigned wg_per_task = - pan_calc_workgroups_per_task(shader_local_size, props); + pan_calc_workgroups_per_task(shader_local_size, props, work_reg_count); unsigned max_instances_per_core = util_next_power_of_two(wg_per_task * props->max_tasks_per_core); diff --git a/src/panfrost/lib/pan_props.c b/src/panfrost/lib/pan_props.c index 056bd48d4a2..b125dad4947 100644 --- a/src/panfrost/lib/pan_props.c +++ b/src/panfrost/lib/pan_props.c @@ -70,6 +70,15 @@ pan_compute_max_thread_count(const struct pan_kmod_dev_props *props, aligned_reg_count = work_reg_count <= 32 ? 
32 : 64; } + if (pan_arch(props->gpu_id) >= 15) { + assert(props->num_threads_active_granularity); + unsigned max_threads_per_wg = + ROUND_DOWN_TO(props->num_registers_per_core / aligned_reg_count, + props->num_threads_active_granularity); + return MIN2(max_threads_per_wg, props->max_threads_per_core); + } + + assert(props->max_threads_per_wg); return MIN3(props->max_threads_per_wg, props->max_threads_per_core, props->num_registers_per_core / aligned_reg_count); } diff --git a/src/panfrost/vulkan/csf/panvk_vX_cmd_dispatch.c b/src/panfrost/vulkan/csf/panvk_vX_cmd_dispatch.c index b1bf45483ee..1fd8e437d49 100644 --- a/src/panfrost/vulkan/csf/panvk_vX_cmd_dispatch.c +++ b/src/panfrost/vulkan/csf/panvk_vX_cmd_dispatch.c @@ -89,8 +89,9 @@ panvk_per_arch(cmd_dispatch_prepare_tls)( unsigned core_id_range; pan_query_core_count(&phys_dev->kmod.dev->props, &core_id_range); - tlsinfo.wls.instances = pan_calc_wls_instances( - &cs->cs.local_size, &phys_dev->kmod.dev->props, indirect ? NULL : dim); + tlsinfo.wls.instances = + pan_calc_wls_instances(&cs->cs.local_size, &phys_dev->kmod.dev->props, + indirect ? 
NULL : dim, cs->info.work_reg_count); unsigned wls_total_size = pan_calc_total_wls_size( tlsinfo.wls.size, tlsinfo.wls.instances, core_id_range); @@ -156,7 +157,8 @@ cmd_dispatch(struct panvk_cmd_buffer *cmdbuf, struct panvk_dispatch_info *info) unsigned wg_per_task = 0; if (indirect) wg_per_task = pan_calc_workgroups_per_task(&cs->cs.local_size, - &phys_dev->kmod.dev->props); + &phys_dev->kmod.dev->props, + cs->info.work_reg_count); if (compute_state_dirty(cmdbuf, DESC_STATE) || compute_state_dirty(cmdbuf, CS)) { diff --git a/src/panfrost/vulkan/csf/panvk_vX_cmd_precomp.c b/src/panfrost/vulkan/csf/panvk_vX_cmd_precomp.c index bd302847aec..56f6c546217 100644 --- a/src/panfrost/vulkan/csf/panvk_vX_cmd_precomp.c +++ b/src/panfrost/vulkan/csf/panvk_vX_cmd_precomp.c @@ -155,7 +155,8 @@ panvk_per_arch(dispatch_precomp)(struct panvk_precomp_ctx *ctx, * increment/axis parameters requires knowledge of job dimensions, but * this is somewhat offset by run_compute being a native instruction. */ task_increment = pan_calc_workgroups_per_task( - &shader->cs.local_size, &phys_dev->kmod.dev->props); + &shader->cs.local_size, &phys_dev->kmod.dev->props, + shader->info.work_reg_count); } else { panvk_per_arch(calculate_task_axis_and_increment)( shader, phys_dev, &dim, &task_axis, &task_increment); diff --git a/src/panfrost/vulkan/jm/panvk_vX_cmd_dispatch.c b/src/panfrost/vulkan/jm/panvk_vX_cmd_dispatch.c index 57b48e69f15..fb5782a141f 100644 --- a/src/panfrost/vulkan/jm/panvk_vX_cmd_dispatch.c +++ b/src/panfrost/vulkan/jm/panvk_vX_cmd_dispatch.c @@ -51,8 +51,9 @@ panvk_per_arch(cmd_dispatch_prepare_tls)( unsigned core_id_range; pan_query_core_count(&phys_dev->kmod.dev->props, &core_id_range); - batch->tlsinfo.wls.instances = pan_calc_wls_instances( - &cs->cs.local_size, &phys_dev->kmod.dev->props, indirect ? NULL : dim); + batch->tlsinfo.wls.instances = + pan_calc_wls_instances(&cs->cs.local_size, &phys_dev->kmod.dev->props, + indirect ? 
NULL : dim, cs->info.work_reg_count); batch->wls_total_size = pan_calc_total_wls_size( batch->tlsinfo.wls.size, batch->tlsinfo.wls.instances, core_id_range); } diff --git a/src/panfrost/vulkan/panvk_vX_physical_device.c b/src/panfrost/vulkan/panvk_vX_physical_device.c index 0db46da50f6..333a5abdaa0 100644 --- a/src/panfrost/vulkan/panvk_vX_physical_device.c +++ b/src/panfrost/vulkan/panvk_vX_physical_device.c @@ -728,8 +728,17 @@ panvk_per_arch(get_physical_device_properties)( const bool has_disk_cache = device->vk.disk_cache != NULL; + /* Calculate the value using register count on v15+. + * TODO: As this requires register allocation changes ensuring we don't + * violate the limits based on the workgroup size, clamp the value to half of + * the max threads value (always safe and matches previous GPUs) for now. */ + unsigned max_threads_per_wg = + (PAN_ARCH >= 15) + ? MIN2(pan_compute_max_thread_count(&device->kmod.dev->props, 32), + device->kmod.dev->props.max_threads_per_core / 2) + : device->kmod.dev->props.max_threads_per_wg; /* Ensure that the max threads count per workgroup is valid for Bifrost */ - assert(PAN_ARCH > 8 || device->kmod.dev->props.max_threads_per_wg <= 1024); + assert(PAN_ARCH > 8 || max_threads_per_wg <= 1024); float pointSizeRangeMin; float pointSizeRangeMax; @@ -858,11 +867,9 @@ panvk_per_arch(get_physical_device_properties)( /* We could also split into serveral jobs but this has many limitations. * As such we limit to the max threads per workgroup supported by the GPU. */ - .maxComputeWorkGroupInvocations = - device->kmod.dev->props.max_threads_per_wg, - .maxComputeWorkGroupSize = {device->kmod.dev->props.max_threads_per_wg, - device->kmod.dev->props.max_threads_per_wg, - device->kmod.dev->props.max_threads_per_wg}, + .maxComputeWorkGroupInvocations = max_threads_per_wg, + .maxComputeWorkGroupSize = {max_threads_per_wg, max_threads_per_wg, + max_threads_per_wg}, /* 8-bit subpixel precision. 
*/ .subPixelPrecisionBits = 8, .subTexelPrecisionBits = 8, @@ -1053,8 +1060,7 @@ panvk_per_arch(get_physical_device_properties)( .minSubgroupSize = pan_subgroup_size(PAN_ARCH), .maxSubgroupSize = pan_subgroup_size(PAN_ARCH), .maxComputeWorkgroupSubgroups = - device->kmod.dev->props.max_threads_per_wg / - pan_subgroup_size(PAN_ARCH), + max_threads_per_wg / pan_subgroup_size(PAN_ARCH), .requiredSubgroupSizeStages = VK_SHADER_STAGE_COMPUTE_BIT, .maxInlineUniformBlockSize = MAX_INLINE_UNIFORM_BLOCK_SIZE, .maxPerStageDescriptorInlineUniformBlocks =