From c01db1fb7f3bae4dfe798fbd7d76b7103b492cae Mon Sep 17 00:00:00 2001
From: Lars-Ivar Hesselberg Simonsen <lars-ivar.simonsen@arm.com>
Date: Fri, 16 May 2025 13:50:57 +0200
Subject: [PATCH] panfrost: Apply direct dispatch WLS instance limit

Apply the direct dispatch WLS instance limit to panfrost as well to keep
compute jobs with large workgroup counts from running out of memory.

Fixes: 1304f4578d2 ("panfrost: Adapt emit_shared_memory for indirect dispatch")
Reviewed-by: Eric R. Smith <eric.smith@collabora.com>
Reviewed-by: John Anthony <john.anthony@arm.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/34979>
(cherry picked from commit 64ce37b2d981a00b46c1c38d7823f7fcd8ce862d)
---
 .pick_status.json                            |  2 +-
 src/gallium/drivers/panfrost/pan_cmdstream.c | 34 +++++---------------
 src/gallium/drivers/panfrost/pan_precomp.c   |  7 ++--
 3 files changed, 13 insertions(+), 30 deletions(-)

diff --git a/.pick_status.json b/.pick_status.json
index 24ec042edbc..1a2763041bd 100644
--- a/.pick_status.json
+++ b/.pick_status.json
@@ -5134,7 +5134,7 @@
         "description": "panfrost: Apply direct dispatch WLS instance limit",
         "nominated": true,
         "nomination_type": 2,
-        "resolution": 0,
+        "resolution": 1,
         "main_sha": null,
         "because_sha": "1304f4578d2ee206be20bab8f9aa9a55ae4563b5",
         "notes": null
diff --git a/src/gallium/drivers/panfrost/pan_cmdstream.c b/src/gallium/drivers/panfrost/pan_cmdstream.c
index a9b89f1e99c..fa2d5b34da9 100644
--- a/src/gallium/drivers/panfrost/pan_cmdstream.c
+++ b/src/gallium/drivers/panfrost/pan_cmdstream.c
@@ -1587,29 +1587,6 @@ panfrost_emit_const_buf(struct panfrost_batch *batch,
    return ubos.gpu;
 }
 
-/*
- * Choose the number of WLS instances to allocate. This must be a power-of-two.
- * The number of WLS instances limits the number of concurrent tasks on a given
- * shader core, setting to the (rounded) total number of tasks avoids any
- * throttling. Smaller values save memory at the expense of possible throttling.
- *
- * With indirect dispatch, we don't know at launch-time how many tasks will be
- * needed, so we use a conservative value that's unlikely to cause slowdown in
- * practice without wasting too much memory.
- */
-static unsigned
-panfrost_choose_wls_instance_count(const struct pipe_grid_info *grid)
-{
-   if (grid->indirect) {
-      /* May need tuning in the future, conservative guess */
-      return 128;
-   } else {
-      return util_next_power_of_two(grid->grid[0]) *
-             util_next_power_of_two(grid->grid[1]) *
-             util_next_power_of_two(grid->grid[2]);
-   }
-}
-
 static uint64_t
 panfrost_emit_shared_memory(struct panfrost_batch *batch,
                             const struct pipe_grid_info *grid)
@@ -1620,10 +1597,15 @@ panfrost_emit_shared_memory(struct panfrost_batch *batch,
    struct panfrost_ptr t =
       pan_pool_alloc_desc(&batch->pool.base, LOCAL_STORAGE);
 
+   struct pan_compute_dim local_size = {grid->block[0], grid->block[1],
+                                        grid->block[2]};
+   struct pan_compute_dim dim = {grid->grid[0], grid->grid[1], grid->grid[2]};
+
    struct pan_tls_info info = {
       .tls.size = ss->info.tls_size,
       .wls.size = ss->info.wls_size + grid->variable_shared_mem,
-      .wls.instances = panfrost_choose_wls_instance_count(grid),
+      .wls.instances = pan_calc_wls_instances(&local_size, &dev->kmod.props,
+                                              grid->indirect ? NULL : &dim),
    };
 
    if (ss->info.tls_size) {
@@ -1637,8 +1619,8 @@ panfrost_emit_shared_memory(struct panfrost_batch *batch,
    }
 
    if (info.wls.size) {
-      unsigned size = pan_wls_adjust_size(info.wls.size) * info.wls.instances *
-                      dev->core_id_range;
+      unsigned size = pan_calc_total_wls_size(info.wls.size, info.wls.instances,
+                                              dev->core_id_range);
 
       struct panfrost_bo *bo = panfrost_batch_get_shared_memory(batch, size, 1);
 
diff --git a/src/gallium/drivers/panfrost/pan_precomp.c b/src/gallium/drivers/panfrost/pan_precomp.c
index 22133415ec1..cc104d953b8 100644
--- a/src/gallium/drivers/panfrost/pan_precomp.c
+++ b/src/gallium/drivers/panfrost/pan_precomp.c
@@ -200,7 +200,8 @@ emit_tls(struct panfrost_batch *batch,
    struct pan_tls_info info = {
       .tls.size = shader->info.tls_size,
       .wls.size = shader->info.wls_size,
-      .wls.instances = pan_wls_instances(dim),
+      .wls.instances =
+         pan_calc_wls_instances(&shader->local_size, &dev->kmod.props, dim),
    };
 
    if (info.tls.size) {
@@ -210,8 +211,8 @@ emit_tls(struct panfrost_batch *batch,
    }
 
    if (info.wls.size) {
-      unsigned size = pan_wls_adjust_size(info.wls.size) * info.wls.instances *
-                      dev->core_id_range;
+      unsigned size = pan_calc_total_wls_size(info.wls.size, info.wls.instances,
+                                              dev->core_id_range);
 
       struct panfrost_bo *bo = panfrost_batch_get_shared_memory(batch, size, 1);