panvk: rework calculate_task_axis_and_increment
We used to maximize threads_per_task, but that is only ideal when the
system has a single GPU client. When there are multiple GPU clients, we
want a smaller threads_per_task so that cores can be shared more fairly
among the clients.

Signed-off-by: Chia-I Wu <olvaffe@gmail.com>
Tested-by: Yiwei Zhang <zzyiwei@chromium.org>
Reviewed-by: Christoph Pillmayer <christoph.pillmayer@arm.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/37988>
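To make the trade-off concrete with made-up numbers (they are not taken from the commit or any particular GPU): assume a 1024-thread per-core limit, 8 tasks per core, 64-thread workgroups, and a dispatch wide enough on X. The old policy sizes a task at 1024 / 64 = 16 workgroups, so a single task, and therefore a single client, fills a whole core. The new cap of max_thread_cnt / max_tasks_per_core = 128 threads sizes a task at 2 workgroups, leaving the core's remaining task slots free for other clients.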
parent 5fd32d79ee
commit ddd0b0c3a8
2 changed files with 30 additions and 22 deletions
src/panfrost/ci/panfrost-g925-skips.txt (new file)
@@ -0,0 +1,3 @@
+# Slow tests (>= 30s)
+dEQP-VK.api.external.fence.sync_fd.export_multiple_times_temporary
+dEQP-VK.api.external.semaphore.sync_fd.export_multiple_times_temporary
@@ -677,13 +677,19 @@ panvk_per_arch(calculate_task_axis_and_increment)(
 {
-   /* Pick the task_axis and task_increment to maximize thread
-    * utilization. */
-   unsigned threads_per_wg = shader->cs.local_size.x * shader->cs.local_size.y *
-                             shader->cs.local_size.z;
-   unsigned max_thread_cnt = pan_compute_max_thread_count(
-      &phys_dev->kmod.props, shader->info.work_reg_count);
-   unsigned threads_per_task = threads_per_wg;
+   const struct pan_kmod_dev_props *props = &phys_dev->kmod.props;
+   const unsigned max_thread_cnt =
+      pan_compute_max_thread_count(props, shader->info.work_reg_count);
+   const unsigned threads_per_wg = shader->cs.local_size.x *
+                                   shader->cs.local_size.y *
+                                   shader->cs.local_size.z;
+   const unsigned wg_count[3] = {wg_dim->x, wg_dim->y, wg_dim->z};
+   const unsigned total_wgs = wg_dim->x * wg_dim->y * wg_dim->z;
+   const unsigned total_cores = util_bitcount64(phys_dev->compute_core_mask);
+   /* Split workgroups among cores evenly. */
+   const unsigned wgs_per_core = DIV_ROUND_UP(total_wgs, total_cores);
+   unsigned threads_per_task;
+   unsigned wgs_per_task;

    if (!total_wgs) {
       *task_axis = MALI_TASK_AXIS_X;
@@ -691,25 +697,24 @@ panvk_per_arch(calculate_task_axis_and_increment)(
       return;
    }

-   for (unsigned i = 0; i < 3; i++) {
-      if (threads_per_task * wg_count[i] >= max_thread_cnt) {
-         /* We reached out thread limit, stop at the current axis and
-          * calculate the increment so it doesn't exceed the per-core
-          * thread capacity.
-          */
-         *task_increment = max_thread_cnt / threads_per_task;
-         break;
-      } else if (*task_axis == MALI_TASK_AXIS_Z) {
-         /* We reached the Z axis, and there's still room to stuff more
-          * threads. Pick the current axis grid size as our increment
-          * as there's no point using something bigger.
-          */
-         *task_increment = wg_count[i];
-         break;
-      }
+   /* We used to maximize threads_per_task, but that is ideal when the system
+    * has a single gpu client. When there are multiple gpu clients, we want
+    * smaller threads_per_task such that cores can be more fairly shared among
+    * the clients.
+    */
+   threads_per_task = DIV_ROUND_UP(max_thread_cnt, props->max_tasks_per_core);
+
+   wgs_per_task = threads_per_task / threads_per_wg;
+   wgs_per_task = CLAMP(wgs_per_task, 1, wgs_per_core);
+
+   *task_axis = MALI_TASK_AXIS_X;
+   *task_increment = wgs_per_task;
+   for (unsigned i = 0; i < 2; i++) {
+      if (*task_increment <= wg_count[i])
+         break;
+
+      threads_per_task *= wg_count[i];
+      (*task_axis)++;
+      *task_increment /= wg_count[i];
+   }
+
+   assert(*task_axis <= MALI_TASK_AXIS_Z);
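For readers skimming the diff, here is a small self-contained C sketch of the new selection policy. It is an illustration only, not the driver code: the helper name pick_task_axis_and_increment, the axis enum, and all numeric limits are stand-ins, and the real implementation reads max_thread_cnt and max_tasks_per_core from pan_kmod_dev_props and the compiled shader rather than taking them as parameters.

```c
/* Simplified, standalone model of the new task split heuristic. */
#include <assert.h>
#include <stdio.h>

#define DIV_ROUND_UP(a, b) (((a) + (b) - 1) / (b))
#define CLAMP(x, lo, hi)   ((x) < (lo) ? (lo) : (x) > (hi) ? (hi) : (x))

enum task_axis { AXIS_X, AXIS_Y, AXIS_Z };

static void
pick_task_axis_and_increment(const unsigned wg_count[3], unsigned threads_per_wg,
                             unsigned max_thread_cnt, unsigned max_tasks_per_core,
                             unsigned total_cores, enum task_axis *task_axis,
                             unsigned *task_increment)
{
   const unsigned total_wgs = wg_count[0] * wg_count[1] * wg_count[2];

   if (!total_wgs) {
      *task_axis = AXIS_X;
      *task_increment = 0;
      return;
   }

   /* Split workgroups among cores evenly. */
   const unsigned wgs_per_core = DIV_ROUND_UP(total_wgs, total_cores);

   /* Cap threads_per_task so a core can interleave several tasks instead
    * of letting one task (and one client) fill it. */
   const unsigned threads_per_task =
      DIV_ROUND_UP(max_thread_cnt, max_tasks_per_core);

   unsigned wgs_per_task = threads_per_task / threads_per_wg;
   wgs_per_task = CLAMP(wgs_per_task, 1, wgs_per_core);

   /* Walk X -> Y -> Z until the increment fits the axis grid size. */
   *task_axis = AXIS_X;
   *task_increment = wgs_per_task;
   for (unsigned i = 0; i < 2; i++) {
      if (*task_increment <= wg_count[i])
         break;

      *task_axis = (enum task_axis)(*task_axis + 1);
      *task_increment /= wg_count[i];
   }

   assert(*task_axis <= AXIS_Z);
}

int
main(void)
{
   /* Made-up device limits and dispatch: 1024 threads per core, up to 8
    * tasks per core, 8 cores, 64-thread workgroups, a 32x4x1 grid. */
   const unsigned wg_count[3] = {32, 4, 1};
   enum task_axis axis;
   unsigned increment;

   pick_task_axis_and_increment(wg_count, 64, 1024, 8, 8, &axis, &increment);
   printf("axis=%d increment=%u\n", axis, increment);
   return 0;
}
```

With the made-up numbers in main(), the previous maximize-utilization policy would have chosen an increment of 16 workgroups on X (a full core's worth of threads per task), while the capped policy yields an increment of 2, so the core's task slots can be time-shared across GPU clients.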