mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-05-05 03:08:05 +02:00
panvk/v10+: Limit direct dispatch WLS allocation
During direct dispatch, we calculate the size of the WLS allocation
based on the number of WLS instances, a calculation that is unbounded
in the number of workgroups.
This leads to extreme allocation sizes and potentially
VK_ERROR_OUT_OF_DEVICE_MEMORY for direct dispatches with a high number
of workgroups.
This change adds an upper bound to the number of WLS instances, using
the same value we assume for indirect dispatches.
Additionally, this commit fixes the WLS max instance calculation (which
should be per core).
Fixes: 5544d39f44 ("panvk: Add a CSF backend for panvk_queue/cmd_buffer")
Reviewed-by: Eric R. Smith <eric.smith@collabora.com>
Reviewed-by: John Anthony <john.anthony@arm.com>
Tested-by: Heiko Stuebner <heiko@sntech.de>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/34979>
This commit is contained in:
parent
a6c7a774ab
commit
0a47a1cb6d
2 changed files with 65 additions and 48 deletions
|
|
@ -30,6 +30,7 @@
|
|||
|
||||
#include "genxml/gen_macros.h"
|
||||
|
||||
#include "kmod/pan_kmod.h"
|
||||
#include "pan_image.h"
|
||||
#include "pan_pool.h"
|
||||
|
||||
|
|
@ -175,6 +176,61 @@ pan_wls_adjust_size(unsigned wls_size)
|
|||
return util_next_power_of_two(MAX2(wls_size, 128));
|
||||
}
|
||||
|
||||
static inline unsigned
|
||||
pan_calc_workgroups_per_task(const struct pan_compute_dim *shader_local_size,
|
||||
const struct pan_kmod_dev_props *props)
|
||||
{
|
||||
/* Each shader core can run N tasks and a total of M threads at any single
|
||||
* time, thus each task should ideally have no more than M/N threads. */
|
||||
unsigned max_threads_per_task =
|
||||
props->max_threads_per_core / props->max_tasks_per_core;
|
||||
|
||||
/* To achieve the best utilization, we should aim for as many workgroups
|
||||
* per tasks as we can fit without exceeding the above thread limit */
|
||||
unsigned threads_per_wg =
|
||||
shader_local_size->x * shader_local_size->y * shader_local_size->z;
|
||||
assert(threads_per_wg > 0 && threads_per_wg <= props->max_threads_per_wg);
|
||||
unsigned wg_per_task = DIV_ROUND_UP(max_threads_per_task, threads_per_wg);
|
||||
assert(wg_per_task > 0 && wg_per_task <= max_threads_per_task);
|
||||
|
||||
return wg_per_task;
|
||||
}
|
||||
|
||||
static inline unsigned
|
||||
pan_calc_wls_instances(const struct pan_compute_dim *shader_local_size,
|
||||
const struct pan_kmod_dev_props *props,
|
||||
const struct pan_compute_dim *dim)
|
||||
{
|
||||
/* NOTE: If the instance count is lower than the number of workgroups
|
||||
* being dispatched, the HW will hold back workgroups until instances
|
||||
* can be reused. */
|
||||
unsigned instances;
|
||||
unsigned wg_per_task =
|
||||
pan_calc_workgroups_per_task(shader_local_size, props);
|
||||
unsigned max_instances_per_core =
|
||||
util_next_power_of_two(wg_per_task * props->max_tasks_per_core);
|
||||
|
||||
/* Not passing workgroup dimensions implies indirect compute. */
|
||||
if (!dim) {
|
||||
/* Assume we utilize all shader cores to the max */
|
||||
instances = max_instances_per_core;
|
||||
} else {
|
||||
/* NOTE: There is no benefit from allocating more instances than what
|
||||
* can concurrently be used by the HW */
|
||||
instances = MIN2(pan_wls_instances(dim), max_instances_per_core);
|
||||
}
|
||||
return instances;
|
||||
}
|
||||
|
||||
/* Total WLS allocation size in bytes: the per-instance size (rounded up to
 * the granularity the HW requires) replicated for every instance on every
 * addressable core. */
static inline unsigned
pan_calc_total_wls_size(unsigned wls_size, unsigned wls_instances,
                        unsigned max_core_id_plus_one)
{
   const unsigned per_instance_size = pan_wls_adjust_size(wls_size);

   return per_instance_size * wls_instances * max_core_id_plus_one;
}
|
||||
|
||||
#ifdef PAN_ARCH
|
||||
|
||||
static inline enum mali_sample_pattern
|
||||
|
|
|
|||
|
|
@ -65,27 +65,6 @@ prepare_driver_set(struct panvk_cmd_buffer *cmdbuf)
|
|||
return VK_SUCCESS;
|
||||
}
|
||||
|
||||
static unsigned
|
||||
calculate_workgroups_per_task(const struct panvk_shader *shader,
|
||||
struct panvk_physical_device *phys_dev)
|
||||
{
|
||||
/* Each shader core can run N tasks and a total of M threads at any single
|
||||
* time, thus each task should ideally have no more than M/N threads. */
|
||||
unsigned max_threads_per_task = phys_dev->kmod.props.max_threads_per_core /
|
||||
phys_dev->kmod.props.max_tasks_per_core;
|
||||
|
||||
/* To achieve the best utilization, we should aim for as many workgroups
|
||||
* per tasks as we can fit without exceeding the above thread limit */
|
||||
unsigned threads_per_wg = shader->cs.local_size.x * shader->cs.local_size.y *
|
||||
shader->cs.local_size.z;
|
||||
assert(threads_per_wg > 0 &&
|
||||
threads_per_wg <= phys_dev->kmod.props.max_threads_per_wg);
|
||||
unsigned wg_per_task = DIV_ROUND_UP(max_threads_per_task, threads_per_wg);
|
||||
assert(wg_per_task > 0 && wg_per_task <= max_threads_per_task);
|
||||
|
||||
return wg_per_task;
|
||||
}
|
||||
|
||||
uint64_t
|
||||
panvk_per_arch(cmd_dispatch_prepare_tls)(struct panvk_cmd_buffer *cmdbuf,
|
||||
const struct panvk_shader *shader,
|
||||
|
|
@ -103,35 +82,16 @@ panvk_per_arch(cmd_dispatch_prepare_tls)(struct panvk_cmd_buffer *cmdbuf,
|
|||
.tls.size = shader->info.tls_size,
|
||||
.wls.size = shader->info.wls_size,
|
||||
};
|
||||
unsigned core_id_range;
|
||||
unsigned core_count =
|
||||
pan_query_core_count(&phys_dev->kmod.props, &core_id_range);
|
||||
|
||||
/* Only used for indirect dispatch */
|
||||
unsigned wg_per_task = 0;
|
||||
if (indirect)
|
||||
wg_per_task = calculate_workgroups_per_task(shader, phys_dev);
|
||||
|
||||
if (tlsinfo.wls.size) {
|
||||
/* NOTE: If the instance count is lower than the number of workgroups
|
||||
* being dispatched, the HW will hold back workgroups until instances
|
||||
* can be reused. */
|
||||
/* NOTE: There is no benefit from allocating more instances than what
|
||||
* can concurrently be used by the HW */
|
||||
if (indirect) {
|
||||
/* Assume we utilize all shader cores to the max */
|
||||
tlsinfo.wls.instances = util_next_power_of_two(
|
||||
wg_per_task * phys_dev->kmod.props.max_tasks_per_core * core_count);
|
||||
} else {
|
||||
/* TODO: Similar to what we are doing for indirect this should change
|
||||
* to calculate the maximum number of workgroups we can execute
|
||||
* concurrently. */
|
||||
tlsinfo.wls.instances = pan_wls_instances(dim);
|
||||
}
|
||||
unsigned core_id_range;
|
||||
pan_query_core_count(&phys_dev->kmod.props, &core_id_range);
|
||||
|
||||
/* TODO: Clamp WLS instance to some maximum WLS budget. */
|
||||
unsigned wls_total_size = pan_wls_adjust_size(tlsinfo.wls.size) *
|
||||
tlsinfo.wls.instances * core_id_range;
|
||||
tlsinfo.wls.instances = pan_calc_wls_instances(
|
||||
&shader->cs.local_size, &phys_dev->kmod.props, indirect ? NULL : dim);
|
||||
|
||||
unsigned wls_total_size = pan_calc_total_wls_size(
|
||||
tlsinfo.wls.size, tlsinfo.wls.instances, core_id_range);
|
||||
|
||||
/* TODO: Reuse WLS allocation for all dispatch commands in the command
|
||||
* buffer, similar to what we do for TLS in draw. As WLS size (and
|
||||
|
|
@ -192,7 +152,8 @@ cmd_dispatch(struct panvk_cmd_buffer *cmdbuf, struct panvk_dispatch_info *info)
|
|||
/* Only used for indirect dispatch */
|
||||
unsigned wg_per_task = 0;
|
||||
if (indirect)
|
||||
wg_per_task = calculate_workgroups_per_task(shader, phys_dev);
|
||||
wg_per_task = pan_calc_workgroups_per_task(&shader->cs.local_size,
|
||||
&phys_dev->kmod.props);
|
||||
|
||||
if (compute_state_dirty(cmdbuf, DESC_STATE) ||
|
||||
compute_state_dirty(cmdbuf, CS)) {
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue