panfrost: Rework the way we compute thread info

Rework the way we compute thread info to make it mostly GPU-agnostic outside of the kmod backend. The new logic is based on the following information extracted from GPU registers: - maximum number of threads per core - maximum number of threads per workgroup - number of registers per core If the GPU doesn't provide this information (registers are zero), we pick the per-arch defaults we had in panfrost_max_thread_count(). Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com> Reviewed-by: Antonino Maniscalco <antonino.maniscalco@collabora.com> Reviewed-by: Erik Faye-Lund <erik.faye-lund@collabora.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26358>
2026-05-08 22:08:26 +02:00 · 2023-11-14 13:11:49 +01:00 · 2023-11-14 13:11:49 +01:00 · 4477daf957
commit 4477daf957
parent 73da66706e
7 changed files with 130 additions and 42 deletions
--- a/src/gallium/drivers/panfrost/pan_shader.c
+++ b/src/gallium/drivers/panfrost/pan_shader.c
@ -533,8 +533,8 @@ panfrost_get_compute_state_info(struct pipe_context *pipe, void *cso,
   struct panfrost_compiled_shader *cs =
      util_dynarray_begin(&uncompiled->variants);

-   info->max_threads =
-      panfrost_max_thread_count(dev->arch, cs->info.work_reg_count);
+   info->max_threads = panfrost_compute_max_thread_count(
+      &dev->kmod.props, cs->info.work_reg_count);
   info->private_memory = cs->info.tls_size;
   info->simd_sizes = pan_subgroup_size(dev->arch);
   info->preferred_simd_size = info->simd_sizes;
--- a/src/panfrost/lib/kmod/meson.build
+++ b/src/panfrost/lib/kmod/meson.build
@ -29,7 +29,7 @@ libpankmod_lib = static_library(
  include_directories : [inc_include, inc_src, inc_panfrost],
  c_args : [no_override_init_args],
  gnu_symbol_visibility : 'hidden',
-  dependencies: [dep_libdrm, idep_mesautil],
+  dependencies: [dep_libdrm, idep_mesautil, idep_pan_packers],
  build_by_default : false,
 )

--- a/src/panfrost/lib/kmod/pan_kmod.h
+++ b/src/panfrost/lib/kmod/pan_kmod.h
@ -154,7 +154,23 @@ struct pan_kmod_dev_props {
   uint32_t texture_features[4];

   /* Maximum number of threads per core. */
-   uint32_t thread_tls_alloc;
+   uint32_t max_threads_per_core;
+
+   /* Maximum number of threads per workgroup. */
+   uint32_t max_threads_per_wg;
+
+   /* Number of registers per core. Can be used to determine the maximum
+    * number of threads that can be allocated for a specific shader based on
+    * the number of registers assigned to this shader.
+    */
+   uint32_t num_registers_per_core;
+
+   /* Maximum number of thread-local storage instances per core.
+    * If the GPU doesn't have a THREAD_TLS_ALLOC register or the register
+    * value is zero, the backend should assign the value of max_threads_per_core
+    * here.
+    */
+   uint32_t max_tls_instance_per_core;

   /* AFBC feature bits. */
   uint32_t afbc_features;
--- a/src/panfrost/lib/kmod/panfrost_kmod.c
+++ b/src/panfrost/lib/kmod/panfrost_kmod.c
@ -17,6 +17,9 @@

 #include "pan_kmod_backend.h"

+/* Only needed for pan_arch(), don't add per-arch stuff here. */
+#include "genxml/gen_macros.h"
+
 const struct pan_kmod_ops panfrost_kmod_ops;

 struct panfrost_kmod_vm {
@ -91,6 +94,85 @@ panfrost_query_raw(int fd, enum drm_panfrost_param param, bool required,
   return get_param.value;
 }

+static void
+panfrost_dev_query_thread_props(const struct pan_kmod_dev *dev,
+                                struct pan_kmod_dev_props *props)
+{
+   int fd = dev->fd;
+
+   props->max_threads_per_core =
+      panfrost_query_raw(fd, DRM_PANFROST_PARAM_MAX_THREADS, true, 0);
+   if (!props->max_threads_per_core) {
+      switch (pan_arch(props->gpu_prod_id)) {
+      case 4:
+      case 5:
+         props->max_threads_per_core = 256;
+         break;
+
+      case 6:
+         /* Bifrost, first generation */
+         props->max_threads_per_core = 384;
+         break;
+
+      case 7:
+         /* Bifrost, second generation (G31 is 512 but it doesn't matter) */
+         props->max_threads_per_core = 768;
+         break;
+
+      case 9:
+         /* Valhall, first generation. */
+         props->max_threads_per_core = 512;
+         break;
+
+      default:
+         assert(!"Unsupported arch");
+      }
+   }
+
+   props->max_threads_per_wg = panfrost_query_raw(
+      fd, DRM_PANFROST_PARAM_THREAD_MAX_WORKGROUP_SZ, true, 0);
+   if (!props->max_threads_per_wg)
+      props->max_threads_per_wg = props->max_threads_per_core;
+
+   uint32_t thread_features =
+      panfrost_query_raw(fd, DRM_PANFROST_PARAM_THREAD_FEATURES, true, 0);
+   props->num_registers_per_core = thread_features & 0xffff;
+   if (!props->num_registers_per_core) {
+      switch (pan_arch(props->gpu_prod_id)) {
+      case 4:
+      case 5:
+         /* Assume we can always schedule max_threads_per_core when using 4
+          * registers per-shader or less.
+          */
+         props->num_registers_per_core = props->max_threads_per_core * 4;
+         break;
+
+      case 6:
+         /* Assume we can always schedule max_threads_per_core for shaders
+          * using the full per-shader register file (64 regs).
+          */
+         props->num_registers_per_core = props->max_threads_per_core * 64;
+         break;
+
+      case 7:
+      case 9:
+         /* Assume we can always schedule max_threads_per_core for shaders
+          * using half the per-shader register file (32 regs).
+          */
+         props->num_registers_per_core = props->max_threads_per_core * 32;
+         break;
+
+      default:
+         assert(!"Unsupported arch");
+      }
+   }
+
+   props->max_tls_instance_per_core =
+      panfrost_query_raw(fd, DRM_PANFROST_PARAM_THREAD_TLS_ALLOC, true, 0);
+   if (!props->max_tls_instance_per_core)
+      props->max_tls_instance_per_core = props->max_threads_per_core;
+}
+
 static void
 panfrost_dev_query_props(const struct pan_kmod_dev *dev,
                         struct pan_kmod_dev_props *props)
@ -116,10 +198,10 @@ panfrost_dev_query_props(const struct pan_kmod_dev *dev,
         fd, DRM_PANFROST_PARAM_TEXTURE_FEATURES0 + i, true, 0);
   }

-   props->thread_tls_alloc =
-      panfrost_query_raw(fd, DRM_PANFROST_PARAM_THREAD_TLS_ALLOC, true, 0);
   props->afbc_features =
      panfrost_query_raw(fd, DRM_PANFROST_PARAM_AFBC_FEATURES, true, 0);
+
+   panfrost_dev_query_thread_props(dev, props);
 }

 static uint32_t
--- a/src/panfrost/lib/pan_props.c
+++ b/src/panfrost/lib/pan_props.c
@ -24,6 +24,8 @@
 *   Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
 */

+#include "util/macros.h"
+
 #include "kmod/pan_kmod.h"
 #include "panfrost/util/pan_ir.h"
 #include "pan_props.h"
@ -126,11 +128,27 @@ panfrost_query_core_count(const struct pan_kmod_dev_props *props,
 unsigned
 panfrost_query_thread_tls_alloc(const struct pan_kmod_dev_props *props)
 {
-   unsigned tls = props->thread_tls_alloc;
+   return props->max_tls_instance_per_core ?: props->max_threads_per_core;
+}

-   return (tls > 0)
-             ? tls
-             : panfrost_max_thread_count(pan_arch(props->gpu_prod_id), 0);
+unsigned
+panfrost_compute_max_thread_count(const struct pan_kmod_dev_props *props,
+                                  unsigned work_reg_count)
+{
+   unsigned aligned_reg_count;
+
+   /* 4, 8 or 16 registers per shader on Midgard
+    * 32 or 64 registers per shader on Bifrost
+    */
+   if (pan_arch(props->gpu_prod_id) <= 5) {
+      aligned_reg_count = util_next_power_of_two(MAX2(work_reg_count, 4));
+      assert(aligned_reg_count <= 16);
+   } else {
+      aligned_reg_count = work_reg_count <= 32 ? 32 : 64;
+   }
+
+   return MIN3(props->max_threads_per_wg, props->max_threads_per_core,
+               props->num_registers_per_core / aligned_reg_count);
 }

 uint32_t
--- a/src/panfrost/lib/pan_props.h
+++ b/src/panfrost/lib/pan_props.h
@ -95,4 +95,8 @@ unsigned panfrost_query_optimal_tib_size(const struct panfrost_model *model);
 uint64_t panfrost_clamp_to_usable_va_range(const struct pan_kmod_dev *dev,
                                           uint64_t va);

+unsigned
+panfrost_compute_max_thread_count(const struct pan_kmod_dev_props *props,
+                                  unsigned work_reg_count);
+
 #endif
--- a/src/panfrost/util/pan_ir.h
+++ b/src/panfrost/util/pan_ir.h
@ -426,36 +426,4 @@ pan_subgroup_size(unsigned arch)
      return 1;
 }

-/* Architectural maximums, since this register may be not implemented
- * by a given chip. G31 is actually 512 instead of 768 but it doesn't
- * really matter. */
-
-static inline unsigned
-panfrost_max_thread_count(unsigned arch, unsigned work_reg_count)
-{
-   switch (arch) {
-   /* Midgard */
-   case 4:
-   case 5:
-      if (work_reg_count > 8)
-         return 64;
-      else if (work_reg_count > 4)
-         return 128;
-      else
-         return 256;
-
-   /* Bifrost, first generation */
-   case 6:
-      return 384;
-
-   /* Bifrost, second generation (G31 is 512 but it doesn't matter) */
-   case 7:
-      return work_reg_count > 32 ? 384 : 768;
-
-   /* Valhall (for completeness) */
-   default:
-      return work_reg_count > 32 ? 512 : 1024;
-   }
-}
-
 #endif