diff --git a/src/broadcom/common/v3d_util.c b/src/broadcom/common/v3d_util.c index 460a218667f..e69c7ba2354 100644 --- a/src/broadcom/common/v3d_util.c +++ b/src/broadcom/common/v3d_util.c @@ -41,6 +41,12 @@ v3d_csd_choose_workgroups_per_supergroup(struct v3d_device_info *devinfo, if (has_subgroups) return 1; + /* If the workgroup size is a multiple of 16 (elements per batch), + * the lane occupancy is already maximized. + */ + if (wg_size % 16 == 0) + return 1; + /* Compute maximum number of batches in a supergroup for this workgroup size. * Each batch is 16 elements, and we can have up to 16 work groups in a * supergroup: @@ -56,11 +62,13 @@ v3d_csd_choose_workgroups_per_supergroup(struct v3d_device_info *devinfo, * available, so we can have at least 2 supergroups executing in parallel * and we don't stall all our QPU threads when a supergroup hits a barrier. */ + uint32_t max_wgs_per_sg = 16; + if (has_tsy_barrier) { uint32_t max_qpu_threads = devinfo->qpu_count * threads; max_batches_per_sg = MIN2(max_batches_per_sg, max_qpu_threads / 2); + max_wgs_per_sg = max_batches_per_sg * 16 / wg_size; } - uint32_t max_wgs_per_sg = max_batches_per_sg * 16 / wg_size; uint32_t best_wgs_per_sg = 1; uint32_t best_unused_lanes = 16;