diff --git a/.pick_status.json b/.pick_status.json
index 13f216e2c36..1a5b8c7fbf4 100644
--- a/.pick_status.json
+++ b/.pick_status.json
@@ -5154,7 +5154,7 @@
         "description": "panvk/v10+: Limit direct dispatch WLS allocation",
         "nominated": true,
         "nomination_type": 2,
-        "resolution": 0,
+        "resolution": 1,
         "main_sha": null,
         "because_sha": "5544d39f4420da88c53aaf8dd48d86ac92bd0eaa",
         "notes": null
diff --git a/src/panfrost/lib/pan_desc.h b/src/panfrost/lib/pan_desc.h
index 0072a6d0bf2..412d509a6dd 100644
--- a/src/panfrost/lib/pan_desc.h
+++ b/src/panfrost/lib/pan_desc.h
@@ -30,6 +30,7 @@
 
 #include "genxml/gen_macros.h"
 
+#include "kmod/pan_kmod.h"
 #include "pan_texture.h"
 
 struct pan_compute_dim {
@@ -165,6 +166,61 @@ pan_wls_adjust_size(unsigned wls_size)
    return util_next_power_of_two(MAX2(wls_size, 128));
 }
 
+static inline unsigned
+pan_calc_workgroups_per_task(const struct pan_compute_dim *shader_local_size,
+                             const struct pan_kmod_dev_props *props)
+{
+   /* Each shader core can run N tasks and a total of M threads at any single
+    * time, thus each task should ideally have no more than M/N threads. */
+   unsigned max_threads_per_task =
+      props->max_threads_per_core / props->max_tasks_per_core;
+
+   /* To achieve the best utilization, we should aim for as many workgroups
+    * per task as we can fit without exceeding the above thread limit. */
+   unsigned threads_per_wg =
+      shader_local_size->x * shader_local_size->y * shader_local_size->z;
+   assert(threads_per_wg > 0 && threads_per_wg <= props->max_threads_per_wg);
+   unsigned wg_per_task = DIV_ROUND_UP(max_threads_per_task, threads_per_wg);
+   assert(wg_per_task > 0 && wg_per_task <= max_threads_per_task);
+
+   return wg_per_task;
+}
+
+static inline unsigned
+pan_calc_wls_instances(const struct pan_compute_dim *shader_local_size,
+                       const struct pan_kmod_dev_props *props,
+                       const struct pan_compute_dim *dim)
+{
+   /* NOTE: If the instance count is lower than the number of workgroups
+    * being dispatched, the HW will hold back workgroups until instances
+    * can be reused. */
+   unsigned instances;
+   unsigned wg_per_task =
+      pan_calc_workgroups_per_task(shader_local_size, props);
+   unsigned max_instances_per_core =
+      util_next_power_of_two(wg_per_task * props->max_tasks_per_core);
+
+   /* Not passing workgroup dimensions implies an indirect dispatch. */
+   if (!dim) {
+      /* Assume we utilize all shader cores to the max. */
+      instances = max_instances_per_core;
+   } else {
+      /* NOTE: There is no benefit to allocating more instances than the
+       * HW can use concurrently. */
+      instances = MIN2(pan_wls_instances(dim), max_instances_per_core);
+   }
+   return instances;
+}
+
+static inline unsigned
+pan_calc_total_wls_size(unsigned wls_size, unsigned wls_instances,
+                        unsigned max_core_id_plus_one)
+{
+   unsigned size = pan_wls_adjust_size(wls_size);
+
+   return size * wls_instances * max_core_id_plus_one;
+}
+
 #ifdef PAN_ARCH
 
 #if PAN_ARCH >= 5
diff --git a/src/panfrost/vulkan/csf/panvk_vX_cmd_dispatch.c b/src/panfrost/vulkan/csf/panvk_vX_cmd_dispatch.c
index 2934bd999a1..eabde1ffbac 100644
--- a/src/panfrost/vulkan/csf/panvk_vX_cmd_dispatch.c
+++ b/src/panfrost/vulkan/csf/panvk_vX_cmd_dispatch.c
@@ -65,27 +65,6 @@ prepare_driver_set(struct panvk_cmd_buffer *cmdbuf)
    return VK_SUCCESS;
 }
 
-static unsigned
-calculate_workgroups_per_task(const struct panvk_shader *shader,
-                              struct panvk_physical_device *phys_dev)
-{
-   /* Each shader core can run N tasks and a total of M threads at any single
-    * time, thus each task should ideally have no more than M/N threads. */
-   unsigned max_threads_per_task = phys_dev->kmod.props.max_threads_per_core /
-                                   phys_dev->kmod.props.max_tasks_per_core;
-
-   /* To achieve the best utilization, we should aim for as many workgroups
-    * per tasks as we can fit without exceeding the above thread limit */
-   unsigned threads_per_wg = shader->cs.local_size.x * shader->cs.local_size.y *
-                             shader->cs.local_size.z;
-   assert(threads_per_wg > 0 &&
-          threads_per_wg <= phys_dev->kmod.props.max_threads_per_wg);
-   unsigned wg_per_task = DIV_ROUND_UP(max_threads_per_task, threads_per_wg);
-   assert(wg_per_task > 0 && wg_per_task <= max_threads_per_task);
-
-   return wg_per_task;
-}
-
 uint64_t
 panvk_per_arch(cmd_dispatch_prepare_tls)(struct panvk_cmd_buffer *cmdbuf,
                                          const struct panvk_shader *shader,
@@ -103,35 +82,16 @@ panvk_per_arch(cmd_dispatch_prepare_tls)(struct panvk_cmd_buffer *cmdbuf,
       .tls.size = shader->info.tls_size,
       .wls.size = shader->info.wls_size,
    };
-   unsigned core_id_range;
-   unsigned core_count =
-      panfrost_query_core_count(&phys_dev->kmod.props, &core_id_range);
-
-   /* Only used for indirect dispatch */
-   unsigned wg_per_task = 0;
-   if (indirect)
-      wg_per_task = calculate_workgroups_per_task(shader, phys_dev);
 
    if (tlsinfo.wls.size) {
-      /* NOTE: If the instance count is lower than the number of workgroups
-       * being dispatched, the HW will hold back workgroups until instances
-       * can be reused. */
-      /* NOTE: There is no benefit from allocating more instances than what
-       * can concurrently be used by the HW */
-      if (indirect) {
-         /* Assume we utilize all shader cores to the max */
-         tlsinfo.wls.instances = util_next_power_of_two(
-            wg_per_task * phys_dev->kmod.props.max_tasks_per_core * core_count);
-      } else {
-         /* TODO: Similar to what we are doing for indirect this should change
-          * to calculate the maximum number of workgroups we can execute
-          * concurrently. */
-         tlsinfo.wls.instances = pan_wls_instances(dim);
-      }
+      unsigned core_id_range;
+      panfrost_query_core_count(&phys_dev->kmod.props, &core_id_range);
 
-      /* TODO: Clamp WLS instance to some maximum WLS budget. */
-      unsigned wls_total_size = pan_wls_adjust_size(tlsinfo.wls.size) *
-                                tlsinfo.wls.instances * core_id_range;
+      tlsinfo.wls.instances = pan_calc_wls_instances(
+         &shader->cs.local_size, &phys_dev->kmod.props, indirect ? NULL : dim);
+
+      unsigned wls_total_size = pan_calc_total_wls_size(
+         tlsinfo.wls.size, tlsinfo.wls.instances, core_id_range);
 
       /* TODO: Reuse WLS allocation for all dispatch commands in the command
        * buffer, similar to what we do for TLS in draw. As WLS size (and
@@ -192,7 +152,8 @@ cmd_dispatch(struct panvk_cmd_buffer *cmdbuf, struct panvk_dispatch_info *info)
    /* Only used for indirect dispatch */
    unsigned wg_per_task = 0;
    if (indirect)
-      wg_per_task = calculate_workgroups_per_task(shader, phys_dev);
+      wg_per_task = pan_calc_workgroups_per_task(&shader->cs.local_size,
+                                                 &phys_dev->kmod.props);
 
    if (compute_state_dirty(cmdbuf, DESC_STATE) ||
        compute_state_dirty(cmdbuf, CS)) {
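
For reference, the following standalone sketch (not part of the patch) walks through the arithmetic the new pan_desc.h helpers perform. The device properties (2048 threads per core, 8 tasks per core, a core ID range of 10) and the 8x8x1 workgroup are hypothetical, and the Mesa utility macros are re-implemented locally so the example compiles on its own:

#include <stdio.h>

/* Local stand-ins for the Mesa utility macros the helpers rely on. */
#define DIV_ROUND_UP(a, b) (((a) + (b) - 1) / (b))
#define MAX2(a, b)         ((a) > (b) ? (a) : (b))
#define MIN2(a, b)         ((a) < (b) ? (a) : (b))

static unsigned
next_power_of_two(unsigned x)
{
   unsigned p = 1;
   while (p < x)
      p <<= 1;
   return p;
}

int
main(void)
{
   /* Hypothetical device properties; the real values come from
    * pan_kmod_dev_props. */
   const unsigned max_threads_per_core = 2048;
   const unsigned max_tasks_per_core = 8;
   const unsigned core_id_range = 10;

   /* An 8x8x1 workgroup using 1 KiB of WLS per instance. */
   const unsigned threads_per_wg = 8 * 8 * 1; /* 64 */
   const unsigned wls_size = 1024;

   /* pan_calc_workgroups_per_task: each task should use at most
    * 2048 / 8 = 256 threads, so DIV_ROUND_UP(256, 64) = 4 workgroups. */
   unsigned max_threads_per_task = max_threads_per_core / max_tasks_per_core;
   unsigned wg_per_task = DIV_ROUND_UP(max_threads_per_task, threads_per_wg);

   /* pan_calc_wls_instances for an indirect dispatch (dim == NULL):
    * next_power_of_two(4 * 8) = 32 instances. A direct dispatch would
    * instead take MIN2(pan_wls_instances(dim), 32). */
   unsigned instances = next_power_of_two(wg_per_task * max_tasks_per_core);

   /* pan_calc_total_wls_size: round the per-instance size up to a power of
    * two no smaller than 128 bytes, then scale by the instance count and
    * the core ID range: 1024 * 32 * 10 = 327680 bytes (320 KiB). */
   unsigned total = next_power_of_two(MAX2(wls_size, 128)) *
                    instances * core_id_range;

   printf("wg_per_task=%u instances=%u total_wls=%u bytes\n",
          wg_per_task, instances, total);
   return 0;
}

With these numbers an indirect dispatch reserves 32 WLS instances per core, i.e. 320 KiB total, rather than sizing the allocation for the full (unknown) workgroup count; this is the bound the direct path is now clamped to as well.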