From 6d9e7086425553d83364099826e3e35385e555a2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?=
Date: Tue, 15 Apr 2025 23:11:36 -0400
Subject: [PATCH] ac/gpu_info: reduce the tess offchip ring size and compute it proportionately
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

.. to the CU count. We allocated too much.

This reduces the tess offchip ring size as follows (examples):
- GFX11-12:
  - Navi31, Navi33, and Navi48 get 75% decrease.
  - Navi32 gets 68.75% decrease.
  - Phoenix gets 81.25% decrease.
  - Phoenix2 gets 93.75% decrease.
- GFX10.3:
  - Navi21 and Navi22 get 37.5% decrease.
  - Navi23 and Navi24 get 50% decrease.
  - Rembrandt gets 62.5% decrease.
  - VanGogh gets 75% decrease.
  - Raphael gets 93.75% decrease.
- GFX8-9:
  - Vega10 gets 0% decrease.
  - Vega20 gets 49.6% decrease.
  - Raven gets 65.3% decrease.
  - Raven2 gets 93.7% decrease.
  - Stoney gets 81% decrease.

No difference in performance was measured.

Reviewed-by: Timur Kristóf
Part-of:
---
 src/amd/common/ac_gpu_info.c | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/src/amd/common/ac_gpu_info.c b/src/amd/common/ac_gpu_info.c
index f25f649b61c..49d0b48ee32 100644
--- a/src/amd/common/ac_gpu_info.c
+++ b/src/amd/common/ac_gpu_info.c
@@ -1741,7 +1741,7 @@ bool ac_query_gpu_info(int fd, void *dev_p, struct radeon_info *info,
     */
    unsigned wg_size_in_dwords = info->family == CHIP_HAWAII ? 4096 : 8192;
    unsigned wg_size_enum;
-   unsigned num_workgroups_per_se;
+   unsigned max_workgroups_per_se;
 
    switch (wg_size_in_dwords) {
    case 8192:
@@ -1765,15 +1765,21 @@ bool ac_query_gpu_info(int fd, void *dev_p, struct radeon_info *info,
     * Gfx6 should limit num_workgroups to 126 (63 per SE)
     */
    if (info->gfx_level >= GFX11) {
-      num_workgroups_per_se = 256;
+      max_workgroups_per_se = 256;
    } else if (info->gfx_level >= GFX10) {
-      num_workgroups_per_se = 128;
+      max_workgroups_per_se = 128;
    } else if (info->family == CHIP_VEGA12 || info->family == CHIP_VEGA20) {
-      num_workgroups_per_se = double_offchip_wg ? 128 : 64;
+      max_workgroups_per_se = double_offchip_wg ? 128 : 64;
    } else {
-      num_workgroups_per_se = double_offchip_wg ? 127 : 63;
+      max_workgroups_per_se = double_offchip_wg ? 127 : 63;
    }
 
+   /* Limit to 4 workgroups per CU for TCS, which exhausts LDS if each workgroup occupies 16KB.
+    * Note that the offchip allocation isn't deallocated until the corresponding TES waves finish.
+    */
+   unsigned num_offchip_wg_per_cu = 4;
+   unsigned num_workgroups_per_se = MIN2(num_offchip_wg_per_cu * info->max_good_cu_per_sa *
+                                         info->max_sa_per_se, max_workgroups_per_se);
    unsigned num_workgroups = num_workgroups_per_se * info->max_se;
 
    if (info->gfx_level >= GFX11) {