panfrost: Apply direct dispatch WLS instance limit

Apply the direct dispatch WLS instance limit to panfrost as well to keep compute jobs with large workgroup counts from running out of memory.

Fixes: 1304f4578d ("panfrost: Adapt emit_shared_memory for indirect dispatch")
Reviewed-by: Eric R. Smith <eric.smith@collabora.com>
Reviewed-by: John Anthony <john.anthony@arm.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/34979>
(cherry picked from commit 64ce37b2d9)
2026-01-29 09:30:20 +01:00 · 2025-05-16 13:50:57 +02:00 · 2025-05-16 13:50:57 +02:00 · c01db1fb7f
commit c01db1fb7f
parent cc2e341a14
3 changed files with 13 additions and 30 deletions
--- a/.pick_status.json
+++ b/.pick_status.json
@@ -5134,7 +5134,7 @@
        "description": "panfrost: Apply direct dispatch WLS instance limit",
        "nominated": true,
        "nomination_type": 2,
-        "resolution": 0,
+        "resolution": 1,
        "main_sha": null,
        "because_sha": "1304f4578d2ee206be20bab8f9aa9a55ae4563b5",
        "notes": null
--- a/src/gallium/drivers/panfrost/pan_cmdstream.c
+++ b/src/gallium/drivers/panfrost/pan_cmdstream.c
@@ -1587,29 +1587,6 @@ panfrost_emit_const_buf(struct panfrost_batch *batch,
   return ubos.gpu;
 }

-/*
- * Choose the number of WLS instances to allocate. This must be a power-of-two.
- * The number of WLS instances limits the number of concurrent tasks on a given
- * shader core, setting to the (rounded) total number of tasks avoids any
- * throttling. Smaller values save memory at the expense of possible throttling.
- *
- * With indirect dispatch, we don't know at launch-time how many tasks will be
- * needed, so we use a conservative value that's unlikely to cause slowdown in
- * practice without wasting too much memory.
- */
-static unsigned
-panfrost_choose_wls_instance_count(const struct pipe_grid_info *grid)
-{
-   if (grid->indirect) {
-      /* May need tuning in the future, conservative guess */
-      return 128;
-   } else {
-      return util_next_power_of_two(grid->grid[0]) *
-             util_next_power_of_two(grid->grid[1]) *
-             util_next_power_of_two(grid->grid[2]);
-   }
-}
-
 static uint64_t
 panfrost_emit_shared_memory(struct panfrost_batch *batch,
                            const struct pipe_grid_info *grid)
@@ -1620,10 +1597,15 @@ panfrost_emit_shared_memory(struct panfrost_batch *batch,
   struct panfrost_ptr t =
      pan_pool_alloc_desc(&batch->pool.base, LOCAL_STORAGE);

+   struct pan_compute_dim local_size = {grid->block[0], grid->block[1],
+                                        grid->block[2]};
+   struct pan_compute_dim dim = {grid->grid[0], grid->grid[1], grid->grid[2]};
+
   struct pan_tls_info info = {
      .tls.size = ss->info.tls_size,
      .wls.size = ss->info.wls_size + grid->variable_shared_mem,
-      .wls.instances = panfrost_choose_wls_instance_count(grid),
+      .wls.instances = pan_calc_wls_instances(&local_size, &dev->kmod.props,
+                                              grid->indirect ? NULL : &dim),
   };

   if (ss->info.tls_size) {
@@ -1637,8 +1619,8 @@ panfrost_emit_shared_memory(struct panfrost_batch *batch,
   }

   if (info.wls.size) {
-      unsigned size = pan_wls_adjust_size(info.wls.size) * info.wls.instances *
-                      dev->core_id_range;
+      unsigned size = pan_calc_total_wls_size(info.wls.size, info.wls.instances,
+                                              dev->core_id_range);

      struct panfrost_bo *bo = panfrost_batch_get_shared_memory(batch, size, 1);

--- a/src/gallium/drivers/panfrost/pan_precomp.c
+++ b/src/gallium/drivers/panfrost/pan_precomp.c
@@ -200,7 +200,8 @@ emit_tls(struct panfrost_batch *batch,
   struct pan_tls_info info = {
      .tls.size = shader->info.tls_size,
      .wls.size = shader->info.wls_size,
-      .wls.instances = pan_wls_instances(dim),
+      .wls.instances =
+         pan_calc_wls_instances(&shader->local_size, &dev->kmod.props, dim),
   };

   if (info.tls.size) {
@@ -210,8 +211,8 @@ emit_tls(struct panfrost_batch *batch,
   }

   if (info.wls.size) {
-      unsigned size = pan_wls_adjust_size(info.wls.size) * info.wls.instances *
-                      dev->core_id_range;
+      unsigned size = pan_calc_total_wls_size(info.wls.size, info.wls.instances,
+                                              dev->core_id_range);

      struct panfrost_bo *bo = panfrost_batch_get_shared_memory(batch, size, 1);