panfrost: Rework the way we compute thread info

Rework the way we compute thread info to make it mostly GPU-agnostic
outside of the kmod backend.

The new logic is based on the following information extracted from
GPU registers:

- maximum number of threads per core
- maximum number of threads per workgroup
- number of registers per core

If the GPU doesn't provide this information (registers are zero), we
pick the per-arch defaults we had in panfrost_max_thread_count().

Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com>
Reviewed-by: Antonino Maniscalco <antonino.maniscalco@collabora.com>
Reviewed-by: Erik Faye-Lund <erik.faye-lund@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26358>
This commit is contained in:
Boris Brezillon 2023-11-14 13:11:49 +01:00 committed by Marge Bot
parent 73da66706e
commit 4477daf957
7 changed files with 130 additions and 42 deletions

View file

@ -533,8 +533,8 @@ panfrost_get_compute_state_info(struct pipe_context *pipe, void *cso,
struct panfrost_compiled_shader *cs =
util_dynarray_begin(&uncompiled->variants);
info->max_threads =
panfrost_max_thread_count(dev->arch, cs->info.work_reg_count);
info->max_threads = panfrost_compute_max_thread_count(
&dev->kmod.props, cs->info.work_reg_count);
info->private_memory = cs->info.tls_size;
info->simd_sizes = pan_subgroup_size(dev->arch);
info->preferred_simd_size = info->simd_sizes;

View file

@ -29,7 +29,7 @@ libpankmod_lib = static_library(
include_directories : [inc_include, inc_src, inc_panfrost],
c_args : [no_override_init_args],
gnu_symbol_visibility : 'hidden',
dependencies: [dep_libdrm, idep_mesautil],
dependencies: [dep_libdrm, idep_mesautil, idep_pan_packers],
build_by_default : false,
)

View file

@ -154,7 +154,23 @@ struct pan_kmod_dev_props {
uint32_t texture_features[4];
/* Maximum number of threads per core. */
uint32_t thread_tls_alloc;
uint32_t max_threads_per_core;
/* Maximum number of threads per workgroup. */
uint32_t max_threads_per_wg;
/* Number of registers per core. Can be used to determine the maximum
* number of threads that can be allocated for a specific shader based on
* the number of registers assigned to this shader.
*/
uint32_t num_registers_per_core;
/* Maximum number of thread-local storage instances per core.
* If the GPU doesn't have a THREAD_TLS_ALLOC register or the register
* value is zero, the backend should assign the value of max_threads_per_core
* here.
*/
uint32_t max_tls_instance_per_core;
/* AFBC feature bits. */
uint32_t afbc_features;

View file

@ -17,6 +17,9 @@
#include "pan_kmod_backend.h"
/* Only needed for pan_arch(), don't add per-arch stuff here. */
#include "genxml/gen_macros.h"
const struct pan_kmod_ops panfrost_kmod_ops;
struct panfrost_kmod_vm {
@ -91,6 +94,85 @@ panfrost_query_raw(int fd, enum drm_panfrost_param param, bool required,
return get_param.value;
}
/* Fill the thread-related fields of props (max_threads_per_core,
 * max_threads_per_wg, num_registers_per_core, max_tls_instance_per_core)
 * from the values returned by panfrost_query_raw(). Whenever a queried value
 * reads back as zero, a fallback is substituted: either a per-arch constant
 * or a value derived from max_threads_per_core.
 */
static void
panfrost_dev_query_thread_props(const struct pan_kmod_dev *dev,
                                struct pan_kmod_dev_props *props)
{
   const int drm_fd = dev->fd;

   /* Threads per core, falling back to a per-arch constant on zero. */
   props->max_threads_per_core =
      panfrost_query_raw(drm_fd, DRM_PANFROST_PARAM_MAX_THREADS, true, 0);

   if (props->max_threads_per_core == 0) {
      const unsigned arch = pan_arch(props->gpu_prod_id);

      if (arch == 4 || arch == 5) {
         props->max_threads_per_core = 256;
      } else if (arch == 6) {
         /* First-generation Bifrost. */
         props->max_threads_per_core = 384;
      } else if (arch == 7) {
         /* Second-generation Bifrost. G31 is really 512, but the
          * difference doesn't matter here.
          */
         props->max_threads_per_core = 768;
      } else if (arch == 9) {
         /* First-generation Valhall. */
         props->max_threads_per_core = 512;
      } else {
         assert(!"Unsupported arch");
      }
   }

   /* Threads per workgroup: a zero answer means "same as the per-core
    * limit".
    */
   props->max_threads_per_wg = panfrost_query_raw(
      drm_fd, DRM_PANFROST_PARAM_THREAD_MAX_WORKGROUP_SZ, true, 0);

   if (props->max_threads_per_wg == 0)
      props->max_threads_per_wg = props->max_threads_per_core;

   /* The register file size is taken from the low 16 bits of
    * THREAD_FEATURES.
    */
   const uint32_t thread_features = panfrost_query_raw(
      drm_fd, DRM_PANFROST_PARAM_THREAD_FEATURES, true, 0);

   props->num_registers_per_core = thread_features & 0xffff;

   if (props->num_registers_per_core == 0) {
      const unsigned arch = pan_arch(props->gpu_prod_id);

      if (arch == 4 || arch == 5) {
         /* Assume max_threads_per_core threads can always be scheduled
          * when a shader uses 4 registers or fewer.
          */
         props->num_registers_per_core = props->max_threads_per_core * 4;
      } else if (arch == 6) {
         /* Assume max_threads_per_core threads can always be scheduled,
          * even for shaders using the full per-shader register file
          * (64 regs).
          */
         props->num_registers_per_core = props->max_threads_per_core * 64;
      } else if (arch == 7 || arch == 9) {
         /* Assume max_threads_per_core threads can always be scheduled
          * for shaders using half the per-shader register file (32 regs).
          */
         props->num_registers_per_core = props->max_threads_per_core * 32;
      } else {
         assert(!"Unsupported arch");
      }
   }

   /* TLS instances per core: a zero answer means "same as the per-core
    * thread limit".
    */
   props->max_tls_instance_per_core = panfrost_query_raw(
      drm_fd, DRM_PANFROST_PARAM_THREAD_TLS_ALLOC, true, 0);

   if (props->max_tls_instance_per_core == 0)
      props->max_tls_instance_per_core = props->max_threads_per_core;
}
static void
panfrost_dev_query_props(const struct pan_kmod_dev *dev,
struct pan_kmod_dev_props *props)
@ -116,10 +198,10 @@ panfrost_dev_query_props(const struct pan_kmod_dev *dev,
fd, DRM_PANFROST_PARAM_TEXTURE_FEATURES0 + i, true, 0);
}
props->thread_tls_alloc =
panfrost_query_raw(fd, DRM_PANFROST_PARAM_THREAD_TLS_ALLOC, true, 0);
props->afbc_features =
panfrost_query_raw(fd, DRM_PANFROST_PARAM_AFBC_FEATURES, true, 0);
panfrost_dev_query_thread_props(dev, props);
}
static uint32_t

View file

@ -24,6 +24,8 @@
* Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
*/
#include "util/macros.h"
#include "kmod/pan_kmod.h"
#include "panfrost/util/pan_ir.h"
#include "pan_props.h"
@ -126,11 +128,27 @@ panfrost_query_core_count(const struct pan_kmod_dev_props *props,
unsigned
panfrost_query_thread_tls_alloc(const struct pan_kmod_dev_props *props)
{
unsigned tls = props->thread_tls_alloc;
return props->max_tls_instance_per_core ?: props->max_threads_per_core;
}
return (tls > 0)
? tls
: panfrost_max_thread_count(pan_arch(props->gpu_prod_id), 0);
/* Return the maximum thread count usable by a shader that needs
 * work_reg_count work registers.
 *
 * The shader's register count is first rounded up to an allocation size:
 *  - arch <= 5 (Midgard): next power of two, with a minimum of 4
 *    (4, 8 or 16 registers per shader);
 *  - arch > 5: 32 registers when work_reg_count <= 32, 64 otherwise.
 *
 * The result is the smallest of the per-workgroup thread limit, the per-core
 * thread limit, and the number of such allocations that fit in the per-core
 * register file.
 */
unsigned
panfrost_compute_max_thread_count(const struct pan_kmod_dev_props *props,
                                  unsigned work_reg_count)
{
   unsigned regs_per_thread;

   if (pan_arch(props->gpu_prod_id) > 5) {
      regs_per_thread = work_reg_count > 32 ? 64 : 32;
   } else {
      regs_per_thread = util_next_power_of_two(MAX2(work_reg_count, 4));
      assert(regs_per_thread <= 16);
   }

   /* Number of threads the register file can hold at this allocation size. */
   const unsigned reg_file_limit =
      props->num_registers_per_core / regs_per_thread;

   return MIN3(props->max_threads_per_wg, props->max_threads_per_core,
               reg_file_limit);
}
uint32_t

View file

@ -95,4 +95,8 @@ unsigned panfrost_query_optimal_tib_size(const struct panfrost_model *model);
uint64_t panfrost_clamp_to_usable_va_range(const struct pan_kmod_dev *dev,
uint64_t va);
/* Compute the maximum number of threads available to a shader that uses
 * work_reg_count work registers on the device described by props.
 *
 * The returned value is the minimum of props->max_threads_per_wg,
 * props->max_threads_per_core, and props->num_registers_per_core divided by
 * the shader's register count rounded up to the per-arch allocation size.
 */
unsigned
panfrost_compute_max_thread_count(const struct pan_kmod_dev_props *props,
unsigned work_reg_count);
#endif

View file

@ -426,36 +426,4 @@ pan_subgroup_size(unsigned arch)
return 1;
}
/* Per-architecture thread-count ceilings, used because the corresponding
 * register may not be implemented by a given chip. The ceiling shrinks as
 * the shader's work register count grows:
 *
 *  - arch 4/5 (Midgard): 256 for <= 4 registers, 128 for <= 8, 64 above;
 *  - arch 6 (Bifrost, first generation): always 384;
 *  - arch 7 (Bifrost, second generation): 768, halved above 32 registers
 *    (G31 is actually 512 rather than 768, but it doesn't really matter);
 *  - anything else (Valhall, for completeness): 1024, halved above 32
 *    registers.
 */
static inline unsigned
panfrost_max_thread_count(unsigned arch, unsigned work_reg_count)
{
   if (arch == 4 || arch == 5) {
      if (work_reg_count <= 4)
         return 256;

      return work_reg_count <= 8 ? 128 : 64;
   }

   if (arch == 6)
      return 384;

   /* Remaining archs share one rule: full count up to 32 registers, half
    * of it beyond that.
    */
   const unsigned full_count = (arch == 7) ? 768 : 1024;

   return work_reg_count > 32 ? full_count / 2 : full_count;
}
#endif