From ddd0b0c3a8a7e312edcbb8a310840f8fa05d56e8 Mon Sep 17 00:00:00 2001
From: Chia-I Wu
Date: Mon, 20 Oct 2025 15:06:36 -0700
Subject: [PATCH] panvk: rework calculate_task_axis_and_increment

We used to maximize threads_per_task, but that is ideal when the
system has a single gpu client. When there are multiple gpu clients,
we want smaller threads_per_task such that cores can be more fairly
shared among the clients.

Signed-off-by: Chia-I Wu
Tested-by: Yiwei Zhang
Reviewed-by: Christoph Pillmayer
Part-of:
---
 src/panfrost/ci/panfrost-g925-skips.txt    |  3 ++
 src/panfrost/vulkan/csf/panvk_cmd_buffer.h | 49 ++++++++++++----------
 2 files changed, 30 insertions(+), 22 deletions(-)
 create mode 100644 src/panfrost/ci/panfrost-g925-skips.txt

diff --git a/src/panfrost/ci/panfrost-g925-skips.txt b/src/panfrost/ci/panfrost-g925-skips.txt
new file mode 100644
index 00000000000..3b96b79f4c6
--- /dev/null
+++ b/src/panfrost/ci/panfrost-g925-skips.txt
@@ -0,0 +1,3 @@
+# Slow tests (>= 30s)
+dEQP-VK.api.external.fence.sync_fd.export_multiple_times_temporary
+dEQP-VK.api.external.semaphore.sync_fd.export_multiple_times_temporary
diff --git a/src/panfrost/vulkan/csf/panvk_cmd_buffer.h b/src/panfrost/vulkan/csf/panvk_cmd_buffer.h
index 1dc447375be..525130531ec 100644
--- a/src/panfrost/vulkan/csf/panvk_cmd_buffer.h
+++ b/src/panfrost/vulkan/csf/panvk_cmd_buffer.h
@@ -677,13 +677,19 @@ panvk_per_arch(calculate_task_axis_and_increment)(
 {
    /* Pick the task_axis and task_increment to maximize thread
     * utilization. */
-   unsigned threads_per_wg = shader->cs.local_size.x * shader->cs.local_size.y *
-                             shader->cs.local_size.z;
-   unsigned max_thread_cnt = pan_compute_max_thread_count(
-      &phys_dev->kmod.props, shader->info.work_reg_count);
-   unsigned threads_per_task = threads_per_wg;
+   const struct pan_kmod_dev_props *props = &phys_dev->kmod.props;
+   const unsigned max_thread_cnt =
+      pan_compute_max_thread_count(props, shader->info.work_reg_count);
+   const unsigned threads_per_wg = shader->cs.local_size.x *
+                                   shader->cs.local_size.y *
+                                   shader->cs.local_size.z;
    const unsigned wg_count[3] = {wg_dim->x, wg_dim->y, wg_dim->z};
    const unsigned total_wgs = wg_dim->x * wg_dim->y * wg_dim->z;
+   const unsigned total_cores = util_bitcount64(phys_dev->compute_core_mask);
+   /* Split workgroups among cores evenly. */
+   const unsigned wgs_per_core = DIV_ROUND_UP(total_wgs, total_cores);
+   unsigned threads_per_task;
+   unsigned wgs_per_task;
 
    if (!total_wgs) {
       *task_axis = MALI_TASK_AXIS_X;
@@ -691,25 +697,24 @@ panvk_per_arch(calculate_task_axis_and_increment)(
       return;
    }
 
-   for (unsigned i = 0; i < 3; i++) {
-      if (threads_per_task * wg_count[i] >= max_thread_cnt) {
-         /* We reached out thread limit, stop at the current axis and
-          * calculate the increment so it doesn't exceed the per-core
-          * thread capacity.
-          */
-         *task_increment = max_thread_cnt / threads_per_task;
-         break;
-      } else if (*task_axis == MALI_TASK_AXIS_Z) {
-         /* We reached the Z axis, and there's still room to stuff more
-          * threads. Pick the current axis grid size as our increment
-          * as there's no point using something bigger.
-          */
-         *task_increment = wg_count[i];
-         break;
-      }
+   /* We used to maximize threads_per_task, but that is ideal when the system
+    * has a single gpu client. When there are multiple gpu clients, we want
+    * smaller threads_per_task such that cores can be more fairly shared among
+    * the clients.
+    */
+   threads_per_task = DIV_ROUND_UP(max_thread_cnt, props->max_tasks_per_core);
+
+   wgs_per_task = threads_per_task / threads_per_wg;
+   wgs_per_task = CLAMP(wgs_per_task, 1, wgs_per_core);
+
+   *task_axis = MALI_TASK_AXIS_X;
+   *task_increment = wgs_per_task;
+   for (unsigned i = 0; i < 2; i++) {
+      if (*task_increment <= wg_count[i])
+         break;
 
-      threads_per_task *= wg_count[i];
       (*task_axis)++;
+      *task_increment /= wg_count[i];
    }
 
    assert(*task_axis <= MALI_TASK_AXIS_Z);