mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-05-08 09:08:10 +02:00
pan: Add handling for v15+ uapi thread_max_wg_size
thread_max_workgroup_size has been replaced with thread_num_active_granularity in v15; this requires updated handling when calculating the maximum number of threads in a workgroup.
This commit is contained in:
parent
003becf081
commit
1f12828011
10 changed files with 56 additions and 23 deletions
|
|
@ -1661,7 +1661,8 @@ panfrost_emit_shared_memory(struct panfrost_batch *batch,
|
|||
.tls.size = ss->info.tls_size,
|
||||
.wls.size = ss->info.wls_size + grid->variable_shared_mem,
|
||||
.wls.instances = pan_calc_wls_instances(
|
||||
&local_size, &dev->kmod.dev->props, grid->indirect ? NULL : &dim),
|
||||
&local_size, &dev->kmod.dev->props, grid->indirect ? NULL : &dim,
|
||||
ss->info.work_reg_count),
|
||||
};
|
||||
|
||||
if (ss->info.tls_size) {
|
||||
|
|
|
|||
|
|
@ -197,8 +197,9 @@ emit_tls(struct panfrost_batch *batch,
|
|||
struct pan_tls_info info = {
|
||||
.tls.size = shader->info.tls_size,
|
||||
.wls.size = shader->info.wls_size,
|
||||
.wls.instances = pan_calc_wls_instances(&shader->local_size,
|
||||
&dev->kmod.dev->props, dim),
|
||||
.wls.instances =
|
||||
pan_calc_wls_instances(&shader->local_size, &dev->kmod.dev->props, dim,
|
||||
shader->info.work_reg_count),
|
||||
};
|
||||
|
||||
if (info.tls.size) {
|
||||
|
|
|
|||
|
|
@ -206,6 +206,9 @@ struct pan_kmod_dev_props {
|
|||
/* Maximum number of threads per workgroup. */
|
||||
uint32_t max_threads_per_wg;
|
||||
|
||||
/* Granularity of number of active threads. */
|
||||
uint32_t num_threads_active_granularity;
|
||||
|
||||
/* Number of registers per core. Can be used to determine the maximum
|
||||
* number of threads that can be allocated for a specific shader based on
|
||||
* the number of registers assigned to this shader.
|
||||
|
|
|
|||
|
|
@ -133,13 +133,17 @@ panthor_dev_query_thread_props(struct panthor_kmod_dev *panthor_dev)
|
|||
props->max_tasks_per_core = panthor_dev->props.gpu.thread_features >> 24;
|
||||
props->num_registers_per_core =
|
||||
panthor_dev->props.gpu.thread_features & 0x3fffff;
|
||||
props->num_threads_active_granularity =
|
||||
panthor_dev->props.gpu.thread_num_active_granularity;
|
||||
|
||||
/* We assume that all thread properties are populated. If we ever have a GPU
|
||||
* that has one of the THREAD_xxx registers set to zero, we can always add a
|
||||
* quirk here.
|
||||
*/
|
||||
assert(props->max_threads_per_wg && props->max_threads_per_core &&
|
||||
props->max_tasks_per_core && props->num_registers_per_core);
|
||||
assert(
|
||||
(props->max_threads_per_wg || props->num_threads_active_granularity) &&
|
||||
props->max_threads_per_core && props->max_tasks_per_core &&
|
||||
props->num_registers_per_core);
|
||||
|
||||
/* There is no THREAD_TLS_ALLOC register on v10+, and the maximum number
|
||||
* of TLS instance per core is assumed to be the maximum number of threads
|
||||
|
|
|
|||
|
|
@ -196,18 +196,22 @@ pan_wls_adjust_size(unsigned wls_size)
|
|||
|
||||
static inline unsigned
|
||||
pan_calc_workgroups_per_task(const struct pan_compute_dim *shader_local_size,
|
||||
const struct pan_kmod_dev_props *props)
|
||||
const struct pan_kmod_dev_props *props,
|
||||
unsigned work_reg_count)
|
||||
{
|
||||
/* Each shader core can run N tasks and a total of M threads at any single
|
||||
* time, thus each task should ideally have no more than M/N threads. */
|
||||
unsigned max_threads_per_task =
|
||||
props->max_threads_per_core / props->max_tasks_per_core;
|
||||
|
||||
ASSERTED unsigned max_threads_per_wg =
|
||||
pan_compute_max_thread_count(props, work_reg_count);
|
||||
|
||||
/* To achieve the best utilization, we should aim for as many workgroups
|
||||
* per tasks as we can fit without exceeding the above thread limit */
|
||||
unsigned threads_per_wg =
|
||||
shader_local_size->x * shader_local_size->y * shader_local_size->z;
|
||||
assert(threads_per_wg > 0 && threads_per_wg <= props->max_threads_per_wg);
|
||||
assert(threads_per_wg > 0 && threads_per_wg <= max_threads_per_wg);
|
||||
unsigned wg_per_task = DIV_ROUND_UP(max_threads_per_task, threads_per_wg);
|
||||
assert(wg_per_task > 0 && wg_per_task <= max_threads_per_task);
|
||||
|
||||
|
|
@ -217,14 +221,15 @@ pan_calc_workgroups_per_task(const struct pan_compute_dim *shader_local_size,
|
|||
static inline unsigned
|
||||
pan_calc_wls_instances(const struct pan_compute_dim *shader_local_size,
|
||||
const struct pan_kmod_dev_props *props,
|
||||
const struct pan_compute_dim *dim)
|
||||
const struct pan_compute_dim *dim,
|
||||
unsigned work_reg_count)
|
||||
{
|
||||
/* NOTE: If the instance count is lower than the number of workgroups
|
||||
* being dispatched, the HW will hold back workgroups until instances
|
||||
* can be reused. */
|
||||
unsigned instances;
|
||||
unsigned wg_per_task =
|
||||
pan_calc_workgroups_per_task(shader_local_size, props);
|
||||
pan_calc_workgroups_per_task(shader_local_size, props, work_reg_count);
|
||||
unsigned max_instances_per_core =
|
||||
util_next_power_of_two(wg_per_task * props->max_tasks_per_core);
|
||||
|
||||
|
|
|
|||
|
|
@ -70,6 +70,15 @@ pan_compute_max_thread_count(const struct pan_kmod_dev_props *props,
|
|||
aligned_reg_count = work_reg_count <= 32 ? 32 : 64;
|
||||
}
|
||||
|
||||
if (pan_arch(props->gpu_id) >= 15) {
|
||||
assert(props->num_threads_active_granularity);
|
||||
unsigned max_treads_per_wg =
|
||||
ROUND_DOWN_TO(props->num_registers_per_core / aligned_reg_count,
|
||||
props->num_threads_active_granularity);
|
||||
return MIN2(max_treads_per_wg, props->max_threads_per_core);
|
||||
}
|
||||
|
||||
assert(props->max_threads_per_wg);
|
||||
return MIN3(props->max_threads_per_wg, props->max_threads_per_core,
|
||||
props->num_registers_per_core / aligned_reg_count);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -89,8 +89,9 @@ panvk_per_arch(cmd_dispatch_prepare_tls)(
|
|||
unsigned core_id_range;
|
||||
pan_query_core_count(&phys_dev->kmod.dev->props, &core_id_range);
|
||||
|
||||
tlsinfo.wls.instances = pan_calc_wls_instances(
|
||||
&cs->cs.local_size, &phys_dev->kmod.dev->props, indirect ? NULL : dim);
|
||||
tlsinfo.wls.instances =
|
||||
pan_calc_wls_instances(&cs->cs.local_size, &phys_dev->kmod.dev->props,
|
||||
indirect ? NULL : dim, cs->info.work_reg_count);
|
||||
|
||||
unsigned wls_total_size = pan_calc_total_wls_size(
|
||||
tlsinfo.wls.size, tlsinfo.wls.instances, core_id_range);
|
||||
|
|
@ -156,7 +157,8 @@ cmd_dispatch(struct panvk_cmd_buffer *cmdbuf, struct panvk_dispatch_info *info)
|
|||
unsigned wg_per_task = 0;
|
||||
if (indirect)
|
||||
wg_per_task = pan_calc_workgroups_per_task(&cs->cs.local_size,
|
||||
&phys_dev->kmod.dev->props);
|
||||
&phys_dev->kmod.dev->props,
|
||||
cs->info.work_reg_count);
|
||||
|
||||
if (compute_state_dirty(cmdbuf, DESC_STATE) ||
|
||||
compute_state_dirty(cmdbuf, CS)) {
|
||||
|
|
|
|||
|
|
@ -155,7 +155,8 @@ panvk_per_arch(dispatch_precomp)(struct panvk_precomp_ctx *ctx,
|
|||
* increment/axis parameters requires knowledge of job dimensions, but
|
||||
* this is somewhat offset by run_compute being a native instruction. */
|
||||
task_increment = pan_calc_workgroups_per_task(
|
||||
&shader->cs.local_size, &phys_dev->kmod.dev->props);
|
||||
&shader->cs.local_size, &phys_dev->kmod.dev->props,
|
||||
shader->info.work_reg_count);
|
||||
} else {
|
||||
panvk_per_arch(calculate_task_axis_and_increment)(
|
||||
shader, phys_dev, &dim, &task_axis, &task_increment);
|
||||
|
|
|
|||
|
|
@ -51,8 +51,9 @@ panvk_per_arch(cmd_dispatch_prepare_tls)(
|
|||
unsigned core_id_range;
|
||||
|
||||
pan_query_core_count(&phys_dev->kmod.dev->props, &core_id_range);
|
||||
batch->tlsinfo.wls.instances = pan_calc_wls_instances(
|
||||
&cs->cs.local_size, &phys_dev->kmod.dev->props, indirect ? NULL : dim);
|
||||
batch->tlsinfo.wls.instances =
|
||||
pan_calc_wls_instances(&cs->cs.local_size, &phys_dev->kmod.dev->props,
|
||||
indirect ? NULL : dim, cs->info.work_reg_count);
|
||||
batch->wls_total_size = pan_calc_total_wls_size(
|
||||
batch->tlsinfo.wls.size, batch->tlsinfo.wls.instances, core_id_range);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -728,8 +728,17 @@ panvk_per_arch(get_physical_device_properties)(
|
|||
|
||||
const bool has_disk_cache = device->vk.disk_cache != NULL;
|
||||
|
||||
/* Calculate the value using register count on v15+.
|
||||
* TODO: As this requires register allocation changes ensuring we don't
|
||||
* violate the limits based on the workgroup size, clamp the value to half of
|
||||
* the max threads value (always safe and matches previous GPUs) for now. */
|
||||
unsigned max_threads_per_wg =
|
||||
(PAN_ARCH >= 15)
|
||||
? MIN2(pan_compute_max_thread_count(&device->kmod.dev->props, 32),
|
||||
device->kmod.dev->props.max_threads_per_core / 2)
|
||||
: device->kmod.dev->props.max_threads_per_wg;
|
||||
/* Ensure that the max threads count per workgroup is valid for Bifrost */
|
||||
assert(PAN_ARCH > 8 || device->kmod.dev->props.max_threads_per_wg <= 1024);
|
||||
assert(PAN_ARCH > 8 || max_threads_per_wg <= 1024);
|
||||
|
||||
float pointSizeRangeMin;
|
||||
float pointSizeRangeMax;
|
||||
|
|
@ -858,11 +867,9 @@ panvk_per_arch(get_physical_device_properties)(
|
|||
/* We could also split into several jobs but this has many limitations.
|
||||
* As such we limit to the max threads per workgroup supported by the GPU.
|
||||
*/
|
||||
.maxComputeWorkGroupInvocations =
|
||||
device->kmod.dev->props.max_threads_per_wg,
|
||||
.maxComputeWorkGroupSize = {device->kmod.dev->props.max_threads_per_wg,
|
||||
device->kmod.dev->props.max_threads_per_wg,
|
||||
device->kmod.dev->props.max_threads_per_wg},
|
||||
.maxComputeWorkGroupInvocations = max_threads_per_wg,
|
||||
.maxComputeWorkGroupSize = {max_threads_per_wg, max_threads_per_wg,
|
||||
max_threads_per_wg},
|
||||
/* 8-bit subpixel precision. */
|
||||
.subPixelPrecisionBits = 8,
|
||||
.subTexelPrecisionBits = 8,
|
||||
|
|
@ -1053,8 +1060,7 @@ panvk_per_arch(get_physical_device_properties)(
|
|||
.minSubgroupSize = pan_subgroup_size(PAN_ARCH),
|
||||
.maxSubgroupSize = pan_subgroup_size(PAN_ARCH),
|
||||
.maxComputeWorkgroupSubgroups =
|
||||
device->kmod.dev->props.max_threads_per_wg /
|
||||
pan_subgroup_size(PAN_ARCH),
|
||||
max_threads_per_wg / pan_subgroup_size(PAN_ARCH),
|
||||
.requiredSubgroupSizeStages = VK_SHADER_STAGE_COMPUTE_BIT,
|
||||
.maxInlineUniformBlockSize = MAX_INLINE_UNIFORM_BLOCK_SIZE,
|
||||
.maxPerStageDescriptorInlineUniformBlocks =
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue