From 1326d52d2366d06617175b97d8134b4e7600a56d Mon Sep 17 00:00:00 2001 From: Daivik Bhatia Date: Fri, 5 Sep 2025 19:35:36 +0530 Subject: [PATCH] broadcom/common: Optimize CSD super-group packing Return one work group per super group when the work group size is a multiple of 16 (elements per batch) and recalculate max_wgs_per_sg only when TSY barriers cut the available QPU threads. Reviewed-by: Iago Toral Quiroga Part-of: --- src/broadcom/common/v3d_util.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/broadcom/common/v3d_util.c b/src/broadcom/common/v3d_util.c index 460a218667f..e69c7ba2354 100644 --- a/src/broadcom/common/v3d_util.c +++ b/src/broadcom/common/v3d_util.c @@ -41,6 +41,12 @@ v3d_csd_choose_workgroups_per_supergroup(struct v3d_device_info *devinfo, if (has_subgroups) return 1; + /* If the workgroup size is a multiple of 16 (elements per batch), + * the lane occupancy is already maximized. + */ + if (wg_size % 16 == 0) + return 1; + /* Compute maximum number of batches in a supergroup for this workgroup size. * Each batch is 16 elements, and we can have up to 16 work groups in a * supergroup: @@ -56,11 +62,13 @@ v3d_csd_choose_workgroups_per_supergroup(struct v3d_device_info *devinfo, * available, so we can have at least 2 supergroups executing in parallel * and we don't stall all our QPU threads when a supergroup hits a barrier. */ + uint32_t max_wgs_per_sg = 16; + if (has_tsy_barrier) { uint32_t max_qpu_threads = devinfo->qpu_count * threads; max_batches_per_sg = MIN2(max_batches_per_sg, max_qpu_threads / 2); + max_wgs_per_sg = max_batches_per_sg * 16 / wg_size; } - uint32_t max_wgs_per_sg = max_batches_per_sg * 16 / wg_size; uint32_t best_wgs_per_sg = 1; uint32_t best_unused_lanes = 16;