pan: Add handling for v15+ uapi thread_max_wg_size

thread_max_workgroup_size has been replaced with
thread_num_active_granularity in v15, which requires updated handling
for calculating the maximum number of threads in a workgroup.
This commit is contained in:
Lars-Ivar Hesselberg Simonsen 2026-03-23 14:05:55 +01:00
parent 003becf081
commit 1f12828011
10 changed files with 56 additions and 23 deletions

View file

@ -1661,7 +1661,8 @@ panfrost_emit_shared_memory(struct panfrost_batch *batch,
.tls.size = ss->info.tls_size,
.wls.size = ss->info.wls_size + grid->variable_shared_mem,
.wls.instances = pan_calc_wls_instances(
&local_size, &dev->kmod.dev->props, grid->indirect ? NULL : &dim),
&local_size, &dev->kmod.dev->props, grid->indirect ? NULL : &dim,
ss->info.work_reg_count),
};
if (ss->info.tls_size) {

View file

@ -197,8 +197,9 @@ emit_tls(struct panfrost_batch *batch,
struct pan_tls_info info = {
.tls.size = shader->info.tls_size,
.wls.size = shader->info.wls_size,
.wls.instances = pan_calc_wls_instances(&shader->local_size,
&dev->kmod.dev->props, dim),
.wls.instances =
pan_calc_wls_instances(&shader->local_size, &dev->kmod.dev->props, dim,
shader->info.work_reg_count),
};
if (info.tls.size) {

View file

@ -206,6 +206,9 @@ struct pan_kmod_dev_props {
/* Maximum number of threads per workgroup. */
uint32_t max_threads_per_wg;
/* Granularity of number of active threads. */
uint32_t num_threads_active_granularity;
/* Number of registers per core. Can be used to determine the maximum
* number of threads that can be allocated for a specific shader based on
* the number of registers assigned to this shader.

View file

@ -133,13 +133,17 @@ panthor_dev_query_thread_props(struct panthor_kmod_dev *panthor_dev)
props->max_tasks_per_core = panthor_dev->props.gpu.thread_features >> 24;
props->num_registers_per_core =
panthor_dev->props.gpu.thread_features & 0x3fffff;
props->num_threads_active_granularity =
panthor_dev->props.gpu.thread_num_active_granularity;
/* We assume that all thread properties are populated. If we ever have a GPU
* that has one of the THREAD_xxx registers set to zero, we can always add a
* quirk here.
*/
assert(props->max_threads_per_wg && props->max_threads_per_core &&
props->max_tasks_per_core && props->num_registers_per_core);
assert(
(props->max_threads_per_wg || props->num_threads_active_granularity) &&
props->max_threads_per_core && props->max_tasks_per_core &&
props->num_registers_per_core);
/* There is no THREAD_TLS_ALLOC register on v10+, and the maximum number
* of TLS instances per core is assumed to be the maximum number of threads

View file

@ -196,18 +196,22 @@ pan_wls_adjust_size(unsigned wls_size)
static inline unsigned
pan_calc_workgroups_per_task(const struct pan_compute_dim *shader_local_size,
const struct pan_kmod_dev_props *props)
const struct pan_kmod_dev_props *props,
unsigned work_reg_count)
{
/* Each shader core can run N tasks and a total of M threads at any single
* time, thus each task should ideally have no more than M/N threads. */
unsigned max_threads_per_task =
props->max_threads_per_core / props->max_tasks_per_core;
ASSERTED unsigned max_threads_per_wg =
pan_compute_max_thread_count(props, work_reg_count);
/* To achieve the best utilization, we should aim for as many workgroups
* per tasks as we can fit without exceeding the above thread limit */
unsigned threads_per_wg =
shader_local_size->x * shader_local_size->y * shader_local_size->z;
assert(threads_per_wg > 0 && threads_per_wg <= props->max_threads_per_wg);
assert(threads_per_wg > 0 && threads_per_wg <= max_threads_per_wg);
unsigned wg_per_task = DIV_ROUND_UP(max_threads_per_task, threads_per_wg);
assert(wg_per_task > 0 && wg_per_task <= max_threads_per_task);
@ -217,14 +221,15 @@ pan_calc_workgroups_per_task(const struct pan_compute_dim *shader_local_size,
static inline unsigned
pan_calc_wls_instances(const struct pan_compute_dim *shader_local_size,
const struct pan_kmod_dev_props *props,
const struct pan_compute_dim *dim)
const struct pan_compute_dim *dim,
unsigned work_reg_count)
{
/* NOTE: If the instance count is lower than the number of workgroups
* being dispatched, the HW will hold back workgroups until instances
* can be reused. */
unsigned instances;
unsigned wg_per_task =
pan_calc_workgroups_per_task(shader_local_size, props);
pan_calc_workgroups_per_task(shader_local_size, props, work_reg_count);
unsigned max_instances_per_core =
util_next_power_of_two(wg_per_task * props->max_tasks_per_core);

View file

@ -70,6 +70,15 @@ pan_compute_max_thread_count(const struct pan_kmod_dev_props *props,
aligned_reg_count = work_reg_count <= 32 ? 32 : 64;
}
if (pan_arch(props->gpu_id) >= 15) {
assert(props->num_threads_active_granularity);
unsigned max_treads_per_wg =
ROUND_DOWN_TO(props->num_registers_per_core / aligned_reg_count,
props->num_threads_active_granularity);
return MIN2(max_treads_per_wg, props->max_threads_per_core);
}
assert(props->max_threads_per_wg);
return MIN3(props->max_threads_per_wg, props->max_threads_per_core,
props->num_registers_per_core / aligned_reg_count);
}

View file

@ -89,8 +89,9 @@ panvk_per_arch(cmd_dispatch_prepare_tls)(
unsigned core_id_range;
pan_query_core_count(&phys_dev->kmod.dev->props, &core_id_range);
tlsinfo.wls.instances = pan_calc_wls_instances(
&cs->cs.local_size, &phys_dev->kmod.dev->props, indirect ? NULL : dim);
tlsinfo.wls.instances =
pan_calc_wls_instances(&cs->cs.local_size, &phys_dev->kmod.dev->props,
indirect ? NULL : dim, cs->info.work_reg_count);
unsigned wls_total_size = pan_calc_total_wls_size(
tlsinfo.wls.size, tlsinfo.wls.instances, core_id_range);
@ -156,7 +157,8 @@ cmd_dispatch(struct panvk_cmd_buffer *cmdbuf, struct panvk_dispatch_info *info)
unsigned wg_per_task = 0;
if (indirect)
wg_per_task = pan_calc_workgroups_per_task(&cs->cs.local_size,
&phys_dev->kmod.dev->props);
&phys_dev->kmod.dev->props,
cs->info.work_reg_count);
if (compute_state_dirty(cmdbuf, DESC_STATE) ||
compute_state_dirty(cmdbuf, CS)) {

View file

@ -155,7 +155,8 @@ panvk_per_arch(dispatch_precomp)(struct panvk_precomp_ctx *ctx,
* increment/axis parameters requires knowledge of job dimensions, but
* this is somewhat offset by run_compute being a native instruction. */
task_increment = pan_calc_workgroups_per_task(
&shader->cs.local_size, &phys_dev->kmod.dev->props);
&shader->cs.local_size, &phys_dev->kmod.dev->props,
shader->info.work_reg_count);
} else {
panvk_per_arch(calculate_task_axis_and_increment)(
shader, phys_dev, &dim, &task_axis, &task_increment);

View file

@ -51,8 +51,9 @@ panvk_per_arch(cmd_dispatch_prepare_tls)(
unsigned core_id_range;
pan_query_core_count(&phys_dev->kmod.dev->props, &core_id_range);
batch->tlsinfo.wls.instances = pan_calc_wls_instances(
&cs->cs.local_size, &phys_dev->kmod.dev->props, indirect ? NULL : dim);
batch->tlsinfo.wls.instances =
pan_calc_wls_instances(&cs->cs.local_size, &phys_dev->kmod.dev->props,
indirect ? NULL : dim, cs->info.work_reg_count);
batch->wls_total_size = pan_calc_total_wls_size(
batch->tlsinfo.wls.size, batch->tlsinfo.wls.instances, core_id_range);
}

View file

@ -728,8 +728,17 @@ panvk_per_arch(get_physical_device_properties)(
const bool has_disk_cache = device->vk.disk_cache != NULL;
/* Calculate the value using the register count on v15+.
* TODO: This requires register allocation changes to ensure we don't violate
* the limits based on the workgroup size, so for now clamp the value to half
* of the max threads value (always safe and matches previous GPUs). */
unsigned max_threads_per_wg =
(PAN_ARCH >= 15)
? MIN2(pan_compute_max_thread_count(&device->kmod.dev->props, 32),
device->kmod.dev->props.max_threads_per_core / 2)
: device->kmod.dev->props.max_threads_per_wg;
/* Ensure that the max threads count per workgroup is valid for Bifrost */
assert(PAN_ARCH > 8 || device->kmod.dev->props.max_threads_per_wg <= 1024);
assert(PAN_ARCH > 8 || max_threads_per_wg <= 1024);
float pointSizeRangeMin;
float pointSizeRangeMax;
@ -858,11 +867,9 @@ panvk_per_arch(get_physical_device_properties)(
/* We could also split into several jobs but this has many limitations.
* As such we limit to the max threads per workgroup supported by the GPU.
*/
.maxComputeWorkGroupInvocations =
device->kmod.dev->props.max_threads_per_wg,
.maxComputeWorkGroupSize = {device->kmod.dev->props.max_threads_per_wg,
device->kmod.dev->props.max_threads_per_wg,
device->kmod.dev->props.max_threads_per_wg},
.maxComputeWorkGroupInvocations = max_threads_per_wg,
.maxComputeWorkGroupSize = {max_threads_per_wg, max_threads_per_wg,
max_threads_per_wg},
/* 8-bit subpixel precision. */
.subPixelPrecisionBits = 8,
.subTexelPrecisionBits = 8,
@ -1053,8 +1060,7 @@ panvk_per_arch(get_physical_device_properties)(
.minSubgroupSize = pan_subgroup_size(PAN_ARCH),
.maxSubgroupSize = pan_subgroup_size(PAN_ARCH),
.maxComputeWorkgroupSubgroups =
device->kmod.dev->props.max_threads_per_wg /
pan_subgroup_size(PAN_ARCH),
max_threads_per_wg / pan_subgroup_size(PAN_ARCH),
.requiredSubgroupSizeStages = VK_SHADER_STAGE_COMPUTE_BIT,
.maxInlineUniformBlockSize = MAX_INLINE_UNIFORM_BLOCK_SIZE,
.maxPerStageDescriptorInlineUniformBlocks =