From 8855f31962c6dd1961ddf22c91d6df8ca6f7420b Mon Sep 17 00:00:00 2001
From: John Anthony <john.anthony@arm.com>
Date: Tue, 24 Sep 2024 17:15:14 +0200
Subject: [PATCH] panvk: Pull out task axis and increment calculation for
 dispatch

Moves some code specific to CmdDispatchBase out of cmd_dispatch and
into a new function.

Reviewed-by: Boris Brezillon <boris.brezillon@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/31370>
---
 .../vulkan/csf/panvk_vX_cmd_dispatch.c        | 85 ++++++++++---------
 1 file changed, 47 insertions(+), 38 deletions(-)

diff --git a/src/panfrost/vulkan/csf/panvk_vX_cmd_dispatch.c b/src/panfrost/vulkan/csf/panvk_vX_cmd_dispatch.c
index 51f5840c61d..3873dbe9315 100644
--- a/src/panfrost/vulkan/csf/panvk_vX_cmd_dispatch.c
+++ b/src/panfrost/vulkan/csf/panvk_vX_cmd_dispatch.c
@@ -80,6 +80,49 @@ struct panvk_dispatch_info {
    } direct;
 };
 
+static void
+calculate_task_axis_and_increment(const struct panvk_shader *shader,
+                                  struct panvk_physical_device *phys_dev,
+                                  unsigned *task_axis, unsigned *task_increment)
+{
+   /* Pick the task_axis and task_increment to maximize thread utilization.
+    * *task_axis is read and advanced below, so the caller must seed it. */
+   unsigned threads_per_wg =
+      shader->local_size.x * shader->local_size.y * shader->local_size.z;
+   unsigned max_thread_cnt = panfrost_compute_max_thread_count(
+      &phys_dev->kmod.props, shader->info.work_reg_count);
+   unsigned threads_per_task = threads_per_wg;
+   unsigned local_size[3] = {
+      shader->local_size.x,
+      shader->local_size.y,
+      shader->local_size.z,
+   };
+
+   for (unsigned i = 0; i < 3; i++) {
+      if (threads_per_task * local_size[i] >= max_thread_cnt) {
+         /* We reached our thread limit, stop at the current axis and
+          * calculate the increment so it doesn't exceed the per-core
+          * thread capacity.
+          */
+         *task_increment = max_thread_cnt / threads_per_task;
+         break;
+      } else if (*task_axis == MALI_TASK_AXIS_Z) {
+         /* We reached the Z axis, and there's still room to stuff more
+          * threads. Use local_size[i] as the increment. NOTE(review): the
+          * original wording said "grid size" - confirm which is intended.
+          */
+         *task_increment = local_size[i];
+         break;
+      }
+
+      threads_per_task *= local_size[i];
+      (*task_axis)++;
+   }
+
+   assert(*task_axis <= MALI_TASK_AXIS_Z);
+   assert(*task_increment > 0);
+}
+
 static void
 cmd_dispatch(struct panvk_cmd_buffer *cmdbuf, struct panvk_dispatch_info *info)
 {
@@ -167,8 +210,6 @@ cmd_dispatch(struct panvk_cmd_buffer *cmdbuf, struct panvk_dispatch_info *info)
       return;
 
    struct cs_builder *b = panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_COMPUTE);
-   unsigned task_axis = MALI_TASK_AXIS_X;
-   unsigned task_increment = 0;
 
    /* Copy the global TLS pointer to the per-job TSD. */
    if (tlsinfo.tls.size) {
@@ -210,47 +251,15 @@ cmd_dispatch(struct panvk_cmd_buffer *cmdbuf, struct panvk_dispatch_info *info)
       cs_move32_to(b, cs_sr_reg32(b, 37), info->direct.groupCountX);
       cs_move32_to(b, cs_sr_reg32(b, 38), info->direct.groupCountY);
       cs_move32_to(b, cs_sr_reg32(b, 39), info->direct.groupCountZ);
-
-      /* Pick the task_axis and task_increment to maximize thread utilization. */
-      unsigned threads_per_wg =
-         shader->local_size.x * shader->local_size.y * shader->local_size.z;
-      unsigned max_thread_cnt = panfrost_compute_max_thread_count(
-         &phys_dev->kmod.props, shader->info.work_reg_count);
-      unsigned threads_per_task = threads_per_wg;
-      unsigned local_size[3] = {
-         shader->local_size.x,
-         shader->local_size.y,
-         shader->local_size.z,
-      };
-
-      for (unsigned i = 0; i < 3; i++) {
-         if (threads_per_task * local_size[i] >= max_thread_cnt) {
-            /* We reached out thread limit, stop at the current axis and
-             * calculate the increment so it doesn't exceed the per-core
-             * thread capacity.
-             */
-            task_increment = max_thread_cnt / threads_per_task;
-            break;
-         } else if (task_axis == MALI_TASK_AXIS_Z) {
-            /* We reached the Z axis, and there's still room to stuff more
-             * threads. Pick the current axis grid size as our increment
-             * as there's no point using something bigger.
-             */
-            task_increment = local_size[i];
-            break;
-         }
-
-         threads_per_task *= local_size[i];
-         task_axis++;
-      }
    }
 
-   assert(task_axis <= MALI_TASK_AXIS_Z);
-   assert(task_increment > 0);
-
    panvk_per_arch(cs_pick_iter_sb)(cmdbuf, PANVK_SUBQUEUE_COMPUTE);
 
    cs_req_res(b, CS_COMPUTE_RES);
+   unsigned task_axis = MALI_TASK_AXIS_X;
+   unsigned task_increment = 0;
+   calculate_task_axis_and_increment(shader, phys_dev, &task_axis,
+                                     &task_increment);
    cs_run_compute(b, task_increment, task_axis, false,
                   cs_shader_res_sel(0, 0, 0, 0));
    cs_req_res(b, 0);