pan: Add handling for v15+ uapi thread_max_wg_size

thread_max_workgroup_size has been replaced with
thread_num_active_granularity in v15, which requires updated handling
for calculating the maximum number of threads in a workgroup.
This commit is contained in:
Lars-Ivar Hesselberg Simonsen 2026-03-23 14:05:55 +01:00
parent 003becf081
commit 1f12828011
10 changed files with 56 additions and 23 deletions

View file

@ -1661,7 +1661,8 @@ panfrost_emit_shared_memory(struct panfrost_batch *batch,
.tls.size = ss->info.tls_size,
.wls.size = ss->info.wls_size + grid->variable_shared_mem,
.wls.instances = pan_calc_wls_instances(
&local_size, &dev->kmod.dev->props, grid->indirect ? NULL : &dim),
&local_size, &dev->kmod.dev->props, grid->indirect ? NULL : &dim,
ss->info.work_reg_count),
};
if (ss->info.tls_size) {

View file

@ -197,8 +197,9 @@ emit_tls(struct panfrost_batch *batch,
struct pan_tls_info info = {
.tls.size = shader->info.tls_size,
.wls.size = shader->info.wls_size,
.wls.instances = pan_calc_wls_instances(&shader->local_size,
&dev->kmod.dev->props, dim),
.wls.instances =
pan_calc_wls_instances(&shader->local_size, &dev->kmod.dev->props, dim,
shader->info.work_reg_count),
};
if (info.tls.size) {

View file

@ -206,6 +206,9 @@ struct pan_kmod_dev_props {
/* Maximum number of threads per workgroup. */
uint32_t max_threads_per_wg;
/* Granularity of number of active threads. */
uint32_t num_threads_active_granularity;
/* Number of registers per core. Can be used to determine the maximum
* number of threads that can be allocated for a specific shader based on
* the number of registers assigned to this shader.

View file

@ -133,13 +133,17 @@ panthor_dev_query_thread_props(struct panthor_kmod_dev *panthor_dev)
props->max_tasks_per_core = panthor_dev->props.gpu.thread_features >> 24;
props->num_registers_per_core =
panthor_dev->props.gpu.thread_features & 0x3fffff;
props->num_threads_active_granularity =
panthor_dev->props.gpu.thread_num_active_granularity;
/* We assume that all thread properties are populated. If we ever have a GPU
* that has one of the THREAD_xxx registers set to zero, we can always add a
* quirk here.
*/
assert(props->max_threads_per_wg && props->max_threads_per_core &&
props->max_tasks_per_core && props->num_registers_per_core);
assert(
(props->max_threads_per_wg || props->num_threads_active_granularity) &&
props->max_threads_per_core && props->max_tasks_per_core &&
props->num_registers_per_core);
/* There is no THREAD_TLS_ALLOC register on v10+, and the maximum number
* of TLS instances per core is assumed to be the maximum number of threads

View file

@ -196,18 +196,22 @@ pan_wls_adjust_size(unsigned wls_size)
static inline unsigned
pan_calc_workgroups_per_task(const struct pan_compute_dim *shader_local_size,
const struct pan_kmod_dev_props *props)
const struct pan_kmod_dev_props *props,
unsigned work_reg_count)
{
/* Each shader core can run N tasks and a total of M threads at any single
* time, thus each task should ideally have no more than M/N threads. */
unsigned max_threads_per_task =
props->max_threads_per_core / props->max_tasks_per_core;
ASSERTED unsigned max_threads_per_wg =
pan_compute_max_thread_count(props, work_reg_count);
/* To achieve the best utilization, we should aim for as many workgroups
* per tasks as we can fit without exceeding the above thread limit */
unsigned threads_per_wg =
shader_local_size->x * shader_local_size->y * shader_local_size->z;
assert(threads_per_wg > 0 && threads_per_wg <= props->max_threads_per_wg);
assert(threads_per_wg > 0 && threads_per_wg <= max_threads_per_wg);
unsigned wg_per_task = DIV_ROUND_UP(max_threads_per_task, threads_per_wg);
assert(wg_per_task > 0 && wg_per_task <= max_threads_per_task);
@ -217,14 +221,15 @@ pan_calc_workgroups_per_task(const struct pan_compute_dim *shader_local_size,
static inline unsigned
pan_calc_wls_instances(const struct pan_compute_dim *shader_local_size,
const struct pan_kmod_dev_props *props,
const struct pan_compute_dim *dim)
const struct pan_compute_dim *dim,
unsigned work_reg_count)
{
/* NOTE: If the instance count is lower than the number of workgroups
* being dispatched, the HW will hold back workgroups until instances
* can be reused. */
unsigned instances;
unsigned wg_per_task =
pan_calc_workgroups_per_task(shader_local_size, props);
pan_calc_workgroups_per_task(shader_local_size, props, work_reg_count);
unsigned max_instances_per_core =
util_next_power_of_two(wg_per_task * props->max_tasks_per_core);

View file

@ -70,6 +70,15 @@ pan_compute_max_thread_count(const struct pan_kmod_dev_props *props,
aligned_reg_count = work_reg_count <= 32 ? 32 : 64;
}
if (pan_arch(props->gpu_id) >= 15) {
assert(props->num_threads_active_granularity);
unsigned max_treads_per_wg =
ROUND_DOWN_TO(props->num_registers_per_core / aligned_reg_count,
props->num_threads_active_granularity);
return MIN2(max_treads_per_wg, props->max_threads_per_core);
}
assert(props->max_threads_per_wg);
return MIN3(props->max_threads_per_wg, props->max_threads_per_core,
props->num_registers_per_core / aligned_reg_count);
}

View file

@ -89,8 +89,9 @@ panvk_per_arch(cmd_dispatch_prepare_tls)(
unsigned core_id_range;
pan_query_core_count(&phys_dev->kmod.dev->props, &core_id_range);
tlsinfo.wls.instances = pan_calc_wls_instances(
&cs->cs.local_size, &phys_dev->kmod.dev->props, indirect ? NULL : dim);
tlsinfo.wls.instances =
pan_calc_wls_instances(&cs->cs.local_size, &phys_dev->kmod.dev->props,
indirect ? NULL : dim, cs->info.work_reg_count);
unsigned wls_total_size = pan_calc_total_wls_size(
tlsinfo.wls.size, tlsinfo.wls.instances, core_id_range);
@ -156,7 +157,8 @@ cmd_dispatch(struct panvk_cmd_buffer *cmdbuf, struct panvk_dispatch_info *info)
unsigned wg_per_task = 0;
if (indirect)
wg_per_task = pan_calc_workgroups_per_task(&cs->cs.local_size,
&phys_dev->kmod.dev->props);
&phys_dev->kmod.dev->props,
cs->info.work_reg_count);
if (compute_state_dirty(cmdbuf, DESC_STATE) ||
compute_state_dirty(cmdbuf, CS)) {

View file

@ -155,7 +155,8 @@ panvk_per_arch(dispatch_precomp)(struct panvk_precomp_ctx *ctx,
* increment/axis parameters requires knowledge of job dimensions, but
* this is somewhat offset by run_compute being a native instruction. */
task_increment = pan_calc_workgroups_per_task(
&shader->cs.local_size, &phys_dev->kmod.dev->props);
&shader->cs.local_size, &phys_dev->kmod.dev->props,
shader->info.work_reg_count);
} else {
panvk_per_arch(calculate_task_axis_and_increment)(
shader, phys_dev, &dim, &task_axis, &task_increment);

View file

@ -51,8 +51,9 @@ panvk_per_arch(cmd_dispatch_prepare_tls)(
unsigned core_id_range;
pan_query_core_count(&phys_dev->kmod.dev->props, &core_id_range);
batch->tlsinfo.wls.instances = pan_calc_wls_instances(
&cs->cs.local_size, &phys_dev->kmod.dev->props, indirect ? NULL : dim);
batch->tlsinfo.wls.instances =
pan_calc_wls_instances(&cs->cs.local_size, &phys_dev->kmod.dev->props,
indirect ? NULL : dim, cs->info.work_reg_count);
batch->wls_total_size = pan_calc_total_wls_size(
batch->tlsinfo.wls.size, batch->tlsinfo.wls.instances, core_id_range);
}

View file

@ -728,8 +728,17 @@ panvk_per_arch(get_physical_device_properties)(
const bool has_disk_cache = device->vk.disk_cache != NULL;
/* Calculate the value using the register count on v15+.
* TODO: This requires register allocation changes to ensure we don't violate
* the limits based on the workgroup size, so for now clamp the value to half
* of the max threads value (always safe and matches previous GPUs). */
unsigned max_threads_per_wg =
(PAN_ARCH >= 15)
? MIN2(pan_compute_max_thread_count(&device->kmod.dev->props, 32),
device->kmod.dev->props.max_threads_per_core / 2)
: device->kmod.dev->props.max_threads_per_wg;
/* Ensure that the max threads count per workgroup is valid for Bifrost */
assert(PAN_ARCH > 8 || device->kmod.dev->props.max_threads_per_wg <= 1024);
assert(PAN_ARCH > 8 || max_threads_per_wg <= 1024);
float pointSizeRangeMin;
float pointSizeRangeMax;
@ -858,11 +867,9 @@ panvk_per_arch(get_physical_device_properties)(
/* We could also split into several jobs but this has many limitations.
* As such we limit to the max threads per workgroup supported by the GPU.
*/
.maxComputeWorkGroupInvocations =
device->kmod.dev->props.max_threads_per_wg,
.maxComputeWorkGroupSize = {device->kmod.dev->props.max_threads_per_wg,
device->kmod.dev->props.max_threads_per_wg,
device->kmod.dev->props.max_threads_per_wg},
.maxComputeWorkGroupInvocations = max_threads_per_wg,
.maxComputeWorkGroupSize = {max_threads_per_wg, max_threads_per_wg,
max_threads_per_wg},
/* 8-bit subpixel precision. */
.subPixelPrecisionBits = 8,
.subTexelPrecisionBits = 8,
@ -1053,8 +1060,7 @@ panvk_per_arch(get_physical_device_properties)(
.minSubgroupSize = pan_subgroup_size(PAN_ARCH),
.maxSubgroupSize = pan_subgroup_size(PAN_ARCH),
.maxComputeWorkgroupSubgroups =
device->kmod.dev->props.max_threads_per_wg /
pan_subgroup_size(PAN_ARCH),
max_threads_per_wg / pan_subgroup_size(PAN_ARCH),
.requiredSubgroupSizeStages = VK_SHADER_STAGE_COMPUTE_BIT,
.maxInlineUniformBlockSize = MAX_INLINE_UNIFORM_BLOCK_SIZE,
.maxPerStageDescriptorInlineUniformBlocks =