From 4477daf957cafcca1e4873b2d5c5b244c590f56c Mon Sep 17 00:00:00 2001 From: Boris Brezillon Date: Tue, 14 Nov 2023 13:11:49 +0100 Subject: [PATCH] panfrost: Rework the way we compute thread info Rework the way we compute thread info to make it mostly GPU-agnostic outside of the kmod backend. The new logic is based on the following information extracted from GPU registers: - maximum number of threads per core - maximum number of threads per workgroup - number of registers per core If the GPU doesn't provide this information (registers are zero), we pick the per-arch defaults we had in panfrost_max_thread_count(). Signed-off-by: Boris Brezillon Reviewed-by: Antonino Maniscalco Reviewed-by: Erik Faye-Lund Part-of: --- src/gallium/drivers/panfrost/pan_shader.c | 4 +- src/panfrost/lib/kmod/meson.build | 2 +- src/panfrost/lib/kmod/pan_kmod.h | 18 ++++- src/panfrost/lib/kmod/panfrost_kmod.c | 86 ++++++++++++++++++++++- src/panfrost/lib/pan_props.c | 26 +++++-- src/panfrost/lib/pan_props.h | 4 ++ src/panfrost/util/pan_ir.h | 32 --------- 7 files changed, 130 insertions(+), 42 deletions(-) diff --git a/src/gallium/drivers/panfrost/pan_shader.c b/src/gallium/drivers/panfrost/pan_shader.c index 87104f19cb7..61a4eb12ded 100644 --- a/src/gallium/drivers/panfrost/pan_shader.c +++ b/src/gallium/drivers/panfrost/pan_shader.c @@ -533,8 +533,8 @@ panfrost_get_compute_state_info(struct pipe_context *pipe, void *cso, struct panfrost_compiled_shader *cs = util_dynarray_begin(&uncompiled->variants); - info->max_threads = - panfrost_max_thread_count(dev->arch, cs->info.work_reg_count); + info->max_threads = panfrost_compute_max_thread_count( + &dev->kmod.props, cs->info.work_reg_count); info->private_memory = cs->info.tls_size; info->simd_sizes = pan_subgroup_size(dev->arch); info->preferred_simd_size = info->simd_sizes; diff --git a/src/panfrost/lib/kmod/meson.build b/src/panfrost/lib/kmod/meson.build index 1278dc6f394..398b572ba7e 100644 --- a/src/panfrost/lib/kmod/meson.build +++ 
b/src/panfrost/lib/kmod/meson.build @@ -29,7 +29,7 @@ libpankmod_lib = static_library( include_directories : [inc_include, inc_src, inc_panfrost], c_args : [no_override_init_args], gnu_symbol_visibility : 'hidden', - dependencies: [dep_libdrm, idep_mesautil], + dependencies: [dep_libdrm, idep_mesautil, idep_pan_packers], build_by_default : false, ) diff --git a/src/panfrost/lib/kmod/pan_kmod.h b/src/panfrost/lib/kmod/pan_kmod.h index fb0a935a059..e16461ebc5c 100644 --- a/src/panfrost/lib/kmod/pan_kmod.h +++ b/src/panfrost/lib/kmod/pan_kmod.h @@ -154,7 +154,23 @@ struct pan_kmod_dev_props { uint32_t texture_features[4]; /* Maximum number of threads per core. */ - uint32_t thread_tls_alloc; + uint32_t max_threads_per_core; + + /* Maximum number of threads per workgroup. */ + uint32_t max_threads_per_wg; + + /* Number of registers per core. Can be used to determine the maximum + * number of threads that can be allocated for a specific shader based on + * the number of registers assigned to this shader. + */ + uint32_t num_registers_per_core; + + /* Maximum number of thread-local storage instances per core. + * If the GPU doesn't have a THREAD_TLS_ALLOC register or the register + * value is zero, the backend should assign the value of max_threads_per_core + * here. + */ + uint32_t max_tls_instance_per_core; /* AFBC feature bits. */ uint32_t afbc_features; diff --git a/src/panfrost/lib/kmod/panfrost_kmod.c b/src/panfrost/lib/kmod/panfrost_kmod.c index cf5267db745..81f494a7a9d 100644 --- a/src/panfrost/lib/kmod/panfrost_kmod.c +++ b/src/panfrost/lib/kmod/panfrost_kmod.c @@ -17,6 +17,9 @@ #include "pan_kmod_backend.h" +/* Only needed for pan_arch(), don't add per-arch stuff here. 
*/ +#include "genxml/gen_macros.h" + const struct pan_kmod_ops panfrost_kmod_ops; struct panfrost_kmod_vm { @@ -91,6 +94,85 @@ panfrost_query_raw(int fd, enum drm_panfrost_param param, bool required, return get_param.value; } +static void +panfrost_dev_query_thread_props(const struct pan_kmod_dev *dev, + struct pan_kmod_dev_props *props) +{ + int fd = dev->fd; + + props->max_threads_per_core = + panfrost_query_raw(fd, DRM_PANFROST_PARAM_MAX_THREADS, true, 0); + if (!props->max_threads_per_core) { + switch (pan_arch(props->gpu_prod_id)) { + case 4: + case 5: + props->max_threads_per_core = 256; + break; + + case 6: + /* Bifrost, first generation */ + props->max_threads_per_core = 384; + break; + + case 7: + /* Bifrost, second generation (G31 is 512 but it doesn't matter) */ + props->max_threads_per_core = 768; + break; + + case 9: + /* Valhall, first generation. */ + props->max_threads_per_core = 512; + break; + + default: + assert(!"Unsupported arch"); + } + } + + props->max_threads_per_wg = panfrost_query_raw( + fd, DRM_PANFROST_PARAM_THREAD_MAX_WORKGROUP_SZ, true, 0); + if (!props->max_threads_per_wg) + props->max_threads_per_wg = props->max_threads_per_core; + + uint32_t thread_features = + panfrost_query_raw(fd, DRM_PANFROST_PARAM_THREAD_FEATURES, true, 0); + props->num_registers_per_core = thread_features & 0xffff; + if (!props->num_registers_per_core) { + switch (pan_arch(props->gpu_prod_id)) { + case 4: + case 5: + /* Assume we can always schedule max_threads_per_core when using 4 + * registers per-shader or less. + */ + props->num_registers_per_core = props->max_threads_per_core * 4; + break; + + case 6: + /* Assume we can always schedule max_threads_per_core for shader + * using the full per-shader register file (64 regs). + */ + props->num_registers_per_core = props->max_threads_per_core * 64; + break; + + case 7: + case 9: + /* Assume we can always schedule max_threads_per_core for shaders + * using half the per-shader register file (32 regs). 
+ */ + props->num_registers_per_core = props->max_threads_per_core * 32; + break; + + default: + assert(!"Unsupported arch"); + } + } + + props->max_tls_instance_per_core = + panfrost_query_raw(fd, DRM_PANFROST_PARAM_THREAD_TLS_ALLOC, true, 0); + if (!props->max_tls_instance_per_core) + props->max_tls_instance_per_core = props->max_threads_per_core; +} + static void panfrost_dev_query_props(const struct pan_kmod_dev *dev, struct pan_kmod_dev_props *props) @@ -116,10 +198,10 @@ panfrost_dev_query_props(const struct pan_kmod_dev *dev, fd, DRM_PANFROST_PARAM_TEXTURE_FEATURES0 + i, true, 0); } - props->thread_tls_alloc = - panfrost_query_raw(fd, DRM_PANFROST_PARAM_THREAD_TLS_ALLOC, true, 0); props->afbc_features = panfrost_query_raw(fd, DRM_PANFROST_PARAM_AFBC_FEATURES, true, 0); + + panfrost_dev_query_thread_props(dev, props); } static uint32_t diff --git a/src/panfrost/lib/pan_props.c b/src/panfrost/lib/pan_props.c index 55871169638..f4f0f99ec1d 100644 --- a/src/panfrost/lib/pan_props.c +++ b/src/panfrost/lib/pan_props.c @@ -24,6 +24,8 @@ * Alyssa Rosenzweig */ +#include "util/macros.h" + #include "kmod/pan_kmod.h" #include "panfrost/util/pan_ir.h" #include "pan_props.h" @@ -126,11 +128,27 @@ panfrost_query_core_count(const struct pan_kmod_dev_props *props, unsigned panfrost_query_thread_tls_alloc(const struct pan_kmod_dev_props *props) { - unsigned tls = props->thread_tls_alloc; + return props->max_tls_instance_per_core ?: props->max_threads_per_core; +} - return (tls > 0) - ? 
tls - : panfrost_max_thread_count(pan_arch(props->gpu_prod_id), 0); +unsigned +panfrost_compute_max_thread_count(const struct pan_kmod_dev_props *props, + unsigned work_reg_count) +{ + unsigned aligned_reg_count; + + /* 4, 8 or 16 registers per shader on Midgard + * 32 or 64 registers per shader on Bifrost + */ + if (pan_arch(props->gpu_prod_id) <= 5) { + aligned_reg_count = util_next_power_of_two(MAX2(work_reg_count, 4)); + assert(aligned_reg_count <= 16); + } else { + aligned_reg_count = work_reg_count <= 32 ? 32 : 64; + } + + return MIN3(props->max_threads_per_wg, props->max_threads_per_core, + props->num_registers_per_core / aligned_reg_count); } uint32_t diff --git a/src/panfrost/lib/pan_props.h b/src/panfrost/lib/pan_props.h index 3d15492b57d..927abede47b 100644 --- a/src/panfrost/lib/pan_props.h +++ b/src/panfrost/lib/pan_props.h @@ -95,4 +95,8 @@ unsigned panfrost_query_optimal_tib_size(const struct panfrost_model *model); uint64_t panfrost_clamp_to_usable_va_range(const struct pan_kmod_dev *dev, uint64_t va); +unsigned +panfrost_compute_max_thread_count(const struct pan_kmod_dev_props *props, + unsigned work_reg_count); + #endif diff --git a/src/panfrost/util/pan_ir.h b/src/panfrost/util/pan_ir.h index d2b5331edeb..94b53ddac56 100644 --- a/src/panfrost/util/pan_ir.h +++ b/src/panfrost/util/pan_ir.h @@ -426,36 +426,4 @@ pan_subgroup_size(unsigned arch) return 1; } -/* Architectural maximums, since this register may be not implemented - * by a given chip. G31 is actually 512 instead of 768 but it doesn't - * really matter. */ - -static inline unsigned -panfrost_max_thread_count(unsigned arch, unsigned work_reg_count) -{ - switch (arch) { - /* Midgard */ - case 4: - case 5: - if (work_reg_count > 8) - return 64; - else if (work_reg_count > 4) - return 128; - else - return 256; - - /* Bifrost, first generation */ - case 6: - return 384; - - /* Bifrost, second generation (G31 is 512 but it doesn't matter) */ - case 7: - return work_reg_count > 32 ? 
384 : 768; - - /* Valhall (for completeness) */ - default: - return work_reg_count > 32 ? 512 : 1024; - } -} - #endif