mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-05-08 22:08:26 +02:00
panfrost: Rework the way we compute thread info
Rework the way we compute thread info to make it mostly GPU-agnostic outside of the kmod backend. The new logic is based on the following information extracted from GPU registers: - maximum number of threads per core - maximum number of threads per workgroup - number of registers per core If the GPU doesn't provide this information (registers are zero), we pick the per-arch defaults we had in panfrost_max_thread_count(). Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com> Reviewed-by: Antonino Maniscalco <antonino.maniscalco@collabora.com> Reviewed-by: Erik Faye-Lund <erik.faye-lund@collabora.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26358>
This commit is contained in:
parent
73da66706e
commit
4477daf957
7 changed files with 130 additions and 42 deletions
|
|
@ -533,8 +533,8 @@ panfrost_get_compute_state_info(struct pipe_context *pipe, void *cso,
|
|||
struct panfrost_compiled_shader *cs =
|
||||
util_dynarray_begin(&uncompiled->variants);
|
||||
|
||||
info->max_threads =
|
||||
panfrost_max_thread_count(dev->arch, cs->info.work_reg_count);
|
||||
info->max_threads = panfrost_compute_max_thread_count(
|
||||
&dev->kmod.props, cs->info.work_reg_count);
|
||||
info->private_memory = cs->info.tls_size;
|
||||
info->simd_sizes = pan_subgroup_size(dev->arch);
|
||||
info->preferred_simd_size = info->simd_sizes;
|
||||
|
|
|
|||
|
|
@ -29,7 +29,7 @@ libpankmod_lib = static_library(
|
|||
include_directories : [inc_include, inc_src, inc_panfrost],
|
||||
c_args : [no_override_init_args],
|
||||
gnu_symbol_visibility : 'hidden',
|
||||
dependencies: [dep_libdrm, idep_mesautil],
|
||||
dependencies: [dep_libdrm, idep_mesautil, idep_pan_packers],
|
||||
build_by_default : false,
|
||||
)
|
||||
|
||||
|
|
|
|||
|
|
@ -154,7 +154,23 @@ struct pan_kmod_dev_props {
|
|||
uint32_t texture_features[4];
|
||||
|
||||
/* Maximum number of threads per core. */
|
||||
uint32_t thread_tls_alloc;
|
||||
uint32_t max_threads_per_core;
|
||||
|
||||
/* Maximum number of threads per workgroup. */
|
||||
uint32_t max_threads_per_wg;
|
||||
|
||||
/* Number of registers per core. Can be used to determine the maximum
|
||||
* number of threads that can be allocated for a specific shader based on
|
||||
* the number of registers assigned to this shader.
|
||||
*/
|
||||
uint32_t num_registers_per_core;
|
||||
|
||||
/* Maximum number of thread-local storage instances per core.
|
||||
* If the GPU doesn't have a THREAD_TLS_ALLOC register or the register
|
||||
* value is zero, the backend should assign the value of max_threads_per_core
|
||||
* here.
|
||||
*/
|
||||
uint32_t max_tls_instance_per_core;
|
||||
|
||||
/* AFBC feature bits. */
|
||||
uint32_t afbc_features;
|
||||
|
|
|
|||
|
|
@ -17,6 +17,9 @@
|
|||
|
||||
#include "pan_kmod_backend.h"
|
||||
|
||||
/* Only needed for pan_arch(), don't add per-arch stuff here. */
|
||||
#include "genxml/gen_macros.h"
|
||||
|
||||
const struct pan_kmod_ops panfrost_kmod_ops;
|
||||
|
||||
struct panfrost_kmod_vm {
|
||||
|
|
@ -91,6 +94,85 @@ panfrost_query_raw(int fd, enum drm_panfrost_param param, bool required,
|
|||
return get_param.value;
|
||||
}
|
||||
|
||||
static void
|
||||
panfrost_dev_query_thread_props(const struct pan_kmod_dev *dev,
|
||||
struct pan_kmod_dev_props *props)
|
||||
{
|
||||
int fd = dev->fd;
|
||||
|
||||
props->max_threads_per_core =
|
||||
panfrost_query_raw(fd, DRM_PANFROST_PARAM_MAX_THREADS, true, 0);
|
||||
if (!props->max_threads_per_core) {
|
||||
switch (pan_arch(props->gpu_prod_id)) {
|
||||
case 4:
|
||||
case 5:
|
||||
props->max_threads_per_core = 256;
|
||||
break;
|
||||
|
||||
case 6:
|
||||
/* Bifrost, first generation */
|
||||
props->max_threads_per_core = 384;
|
||||
break;
|
||||
|
||||
case 7:
|
||||
/* Bifrost, second generation (G31 is 512 but it doesn't matter) */
|
||||
props->max_threads_per_core = 768;
|
||||
break;
|
||||
|
||||
case 9:
|
||||
/* Valhall, first generation. */
|
||||
props->max_threads_per_core = 512;
|
||||
break;
|
||||
|
||||
default:
|
||||
assert(!"Unsupported arch");
|
||||
}
|
||||
}
|
||||
|
||||
props->max_threads_per_wg = panfrost_query_raw(
|
||||
fd, DRM_PANFROST_PARAM_THREAD_MAX_WORKGROUP_SZ, true, 0);
|
||||
if (!props->max_threads_per_wg)
|
||||
props->max_threads_per_wg = props->max_threads_per_core;
|
||||
|
||||
uint32_t thread_features =
|
||||
panfrost_query_raw(fd, DRM_PANFROST_PARAM_THREAD_FEATURES, true, 0);
|
||||
props->num_registers_per_core = thread_features & 0xffff;
|
||||
if (!props->num_registers_per_core) {
|
||||
switch (pan_arch(props->gpu_prod_id)) {
|
||||
case 4:
|
||||
case 5:
|
||||
/* Assume we can always schedule max_threads_per_core when using 4
|
||||
* registers per-shader or less.
|
||||
*/
|
||||
props->num_registers_per_core = props->max_threads_per_core * 4;
|
||||
break;
|
||||
|
||||
case 6:
|
||||
/* Assume we can always schedule max_threads_per_core for shaders
|
||||
* using the full per-shader register file (64 regs).
|
||||
*/
|
||||
props->num_registers_per_core = props->max_threads_per_core * 64;
|
||||
break;
|
||||
|
||||
case 7:
|
||||
case 9:
|
||||
/* Assume we can always schedule max_threads_per_core for shaders
|
||||
* using half the per-shader register file (32 regs).
|
||||
*/
|
||||
props->num_registers_per_core = props->max_threads_per_core * 32;
|
||||
break;
|
||||
|
||||
default:
|
||||
assert(!"Unsupported arch");
|
||||
}
|
||||
}
|
||||
|
||||
props->max_tls_instance_per_core =
|
||||
panfrost_query_raw(fd, DRM_PANFROST_PARAM_THREAD_TLS_ALLOC, true, 0);
|
||||
if (!props->max_tls_instance_per_core)
|
||||
props->max_tls_instance_per_core = props->max_threads_per_core;
|
||||
}
|
||||
|
||||
static void
|
||||
panfrost_dev_query_props(const struct pan_kmod_dev *dev,
|
||||
struct pan_kmod_dev_props *props)
|
||||
|
|
@ -116,10 +198,10 @@ panfrost_dev_query_props(const struct pan_kmod_dev *dev,
|
|||
fd, DRM_PANFROST_PARAM_TEXTURE_FEATURES0 + i, true, 0);
|
||||
}
|
||||
|
||||
props->thread_tls_alloc =
|
||||
panfrost_query_raw(fd, DRM_PANFROST_PARAM_THREAD_TLS_ALLOC, true, 0);
|
||||
props->afbc_features =
|
||||
panfrost_query_raw(fd, DRM_PANFROST_PARAM_AFBC_FEATURES, true, 0);
|
||||
|
||||
panfrost_dev_query_thread_props(dev, props);
|
||||
}
|
||||
|
||||
static uint32_t
|
||||
|
|
|
|||
|
|
@ -24,6 +24,8 @@
|
|||
* Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
|
||||
*/
|
||||
|
||||
#include "util/macros.h"
|
||||
|
||||
#include "kmod/pan_kmod.h"
|
||||
#include "panfrost/util/pan_ir.h"
|
||||
#include "pan_props.h"
|
||||
|
|
@ -126,11 +128,27 @@ panfrost_query_core_count(const struct pan_kmod_dev_props *props,
|
|||
unsigned
|
||||
panfrost_query_thread_tls_alloc(const struct pan_kmod_dev_props *props)
|
||||
{
|
||||
unsigned tls = props->thread_tls_alloc;
|
||||
return props->max_tls_instance_per_core ?: props->max_threads_per_core;
|
||||
}
|
||||
|
||||
return (tls > 0)
|
||||
? tls
|
||||
: panfrost_max_thread_count(pan_arch(props->gpu_prod_id), 0);
|
||||
unsigned
|
||||
panfrost_compute_max_thread_count(const struct pan_kmod_dev_props *props,
|
||||
unsigned work_reg_count)
|
||||
{
|
||||
unsigned aligned_reg_count;
|
||||
|
||||
/* 4, 8 or 16 registers per shader on Midgard
|
||||
* 32 or 64 registers per shader on Bifrost
|
||||
*/
|
||||
if (pan_arch(props->gpu_prod_id) <= 5) {
|
||||
aligned_reg_count = util_next_power_of_two(MAX2(work_reg_count, 4));
|
||||
assert(aligned_reg_count <= 16);
|
||||
} else {
|
||||
aligned_reg_count = work_reg_count <= 32 ? 32 : 64;
|
||||
}
|
||||
|
||||
return MIN3(props->max_threads_per_wg, props->max_threads_per_core,
|
||||
props->num_registers_per_core / aligned_reg_count);
|
||||
}
|
||||
|
||||
uint32_t
|
||||
|
|
|
|||
|
|
@ -95,4 +95,8 @@ unsigned panfrost_query_optimal_tib_size(const struct panfrost_model *model);
|
|||
uint64_t panfrost_clamp_to_usable_va_range(const struct pan_kmod_dev *dev,
|
||||
uint64_t va);
|
||||
|
||||
unsigned
|
||||
panfrost_compute_max_thread_count(const struct pan_kmod_dev_props *props,
|
||||
unsigned work_reg_count);
|
||||
|
||||
#endif
|
||||
|
|
|
|||
|
|
@ -426,36 +426,4 @@ pan_subgroup_size(unsigned arch)
|
|||
return 1;
|
||||
}
|
||||
|
||||
/* Architectural maximums, since this register may be not implemented
|
||||
* by a given chip. G31 is actually 512 instead of 768 but it doesn't
|
||||
* really matter. */
|
||||
|
||||
static inline unsigned
|
||||
panfrost_max_thread_count(unsigned arch, unsigned work_reg_count)
|
||||
{
|
||||
switch (arch) {
|
||||
/* Midgard */
|
||||
case 4:
|
||||
case 5:
|
||||
if (work_reg_count > 8)
|
||||
return 64;
|
||||
else if (work_reg_count > 4)
|
||||
return 128;
|
||||
else
|
||||
return 256;
|
||||
|
||||
/* Bifrost, first generation */
|
||||
case 6:
|
||||
return 384;
|
||||
|
||||
/* Bifrost, second generation (G31 is 512 but it doesn't matter) */
|
||||
case 7:
|
||||
return work_reg_count > 32 ? 384 : 768;
|
||||
|
||||
/* Valhall (for completeness) */
|
||||
default:
|
||||
return work_reg_count > 32 ? 512 : 1024;
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue