From ddd0b0c3a8a7e312edcbb8a310840f8fa05d56e8 Mon Sep 17 00:00:00 2001
From: Chia-I Wu
Date: Mon, 20 Oct 2025 15:06:36 -0700
Subject: [PATCH] panvk: rework calculate_task_axis_and_increment

We used to maximize threads_per_task, but that is ideal when the
system has a single gpu client. When there are multiple gpu clients,
we want smaller threads_per_task such that cores can be more fairly
shared among the clients.

Signed-off-by: Chia-I Wu
Tested-by: Yiwei Zhang
Reviewed-by: Christoph Pillmayer
Part-of:
---
 src/panfrost/ci/panfrost-g925-skips.txt    |  3 ++
 src/panfrost/vulkan/csf/panvk_cmd_buffer.h | 49 ++++++++++++----------
 2 files changed, 30 insertions(+), 22 deletions(-)
 create mode 100644 src/panfrost/ci/panfrost-g925-skips.txt

diff --git a/src/panfrost/ci/panfrost-g925-skips.txt b/src/panfrost/ci/panfrost-g925-skips.txt
new file mode 100644
index 00000000000..3b96b79f4c6
--- /dev/null
+++ b/src/panfrost/ci/panfrost-g925-skips.txt
@@ -0,0 +1,3 @@
+# Slow tests (>= 30s)
+dEQP-VK.api.external.fence.sync_fd.export_multiple_times_temporary
+dEQP-VK.api.external.semaphore.sync_fd.export_multiple_times_temporary
diff --git a/src/panfrost/vulkan/csf/panvk_cmd_buffer.h b/src/panfrost/vulkan/csf/panvk_cmd_buffer.h
index 1dc447375be..525130531ec 100644
--- a/src/panfrost/vulkan/csf/panvk_cmd_buffer.h
+++ b/src/panfrost/vulkan/csf/panvk_cmd_buffer.h
@@ -677,13 +677,19 @@ panvk_per_arch(calculate_task_axis_and_increment)(
 {
    /* Pick the task_axis and task_increment to maximize thread
     * utilization. */
-   unsigned threads_per_wg = shader->cs.local_size.x * shader->cs.local_size.y *
-                             shader->cs.local_size.z;
-   unsigned max_thread_cnt = pan_compute_max_thread_count(
-      &phys_dev->kmod.props, shader->info.work_reg_count);
-   unsigned threads_per_task = threads_per_wg;
+   const struct pan_kmod_dev_props *props = &phys_dev->kmod.props;
+   const unsigned max_thread_cnt =
+      pan_compute_max_thread_count(props, shader->info.work_reg_count);
+   const unsigned threads_per_wg = shader->cs.local_size.x *
+                                   shader->cs.local_size.y *
+                                   shader->cs.local_size.z;
    const unsigned wg_count[3] = {wg_dim->x, wg_dim->y, wg_dim->z};
    const unsigned total_wgs = wg_dim->x * wg_dim->y * wg_dim->z;
+   const unsigned total_cores = util_bitcount64(phys_dev->compute_core_mask);
+   /* Split workgroups among cores evenly. */
+   const unsigned wgs_per_core = DIV_ROUND_UP(total_wgs, total_cores);
+   unsigned threads_per_task;
+   unsigned wgs_per_task;
 
    if (!total_wgs) {
       *task_axis = MALI_TASK_AXIS_X;
@@ -691,25 +697,24 @@ panvk_per_arch(calculate_task_axis_and_increment)(
       return;
    }
 
-   for (unsigned i = 0; i < 3; i++) {
-      if (threads_per_task * wg_count[i] >= max_thread_cnt) {
-         /* We reached out thread limit, stop at the current axis and
-          * calculate the increment so it doesn't exceed the per-core
-          * thread capacity.
-          */
-         *task_increment = max_thread_cnt / threads_per_task;
-         break;
-      } else if (*task_axis == MALI_TASK_AXIS_Z) {
-         /* We reached the Z axis, and there's still room to stuff more
-          * threads. Pick the current axis grid size as our increment
-          * as there's no point using something bigger.
-          */
-         *task_increment = wg_count[i];
-         break;
-      }
+   /* We used to maximize threads_per_task, but that is ideal when the system
+    * has a single gpu client. When there are multiple gpu clients, we want
+    * smaller threads_per_task such that cores can be more fairly shared among
+    * the clients.
+    */
+   threads_per_task = DIV_ROUND_UP(max_thread_cnt, props->max_tasks_per_core);
+
+   wgs_per_task = threads_per_task / threads_per_wg;
+   wgs_per_task = CLAMP(wgs_per_task, 1, wgs_per_core);
+
+   *task_axis = MALI_TASK_AXIS_X;
+   *task_increment = wgs_per_task;
+   for (unsigned i = 0; i < 2; i++) {
+      if (*task_increment <= wg_count[i])
+         break;
 
-      threads_per_task *= wg_count[i];
       (*task_axis)++;
+      *task_increment /= wg_count[i];
    }
 
    assert(*task_axis <= MALI_TASK_AXIS_Z);