From 8c6a18364a4fe450d237435374d25d2c6883119b Mon Sep 17 00:00:00 2001
From: Pierre-Eric Pelloux-Prayer
Date: Thu, 26 Mar 2026 10:57:29 +0100
Subject: [PATCH] ac/info: add ac_fill_tess_info
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reviewed-by: Samuel Pitoiset
Reviewed-by: Marek Olšák
Part-of:
---
 src/amd/common/ac_gpu_info.c | 149 ++++++++++++++++++-----------------
 src/amd/common/ac_gpu_info.h |   1 +
 2 files changed, 78 insertions(+), 72 deletions(-)

diff --git a/src/amd/common/ac_gpu_info.c b/src/amd/common/ac_gpu_info.c
index 145bf91ff35..a5f25bc614f 100644
--- a/src/amd/common/ac_gpu_info.c
+++ b/src/amd/common/ac_gpu_info.c
@@ -1292,6 +1292,82 @@ void ac_fill_hw_info(struct radeon_info *info, const struct drm_amdgpu_info_devi
    }
 }
 
+void ac_fill_tess_info(struct radeon_info *info)
+{
+   /* This is the size of all TCS outputs in memory per workgroup.
+    * Hawaii can't handle num_workgroups > 256 with 8K per workgroup, so use 4K.
+    */
+   unsigned max_hs_out_vram_dwords_per_wg = info->family == CHIP_HAWAII ? 4096 : 8192;
+   unsigned max_hs_out_vram_dwords_enum; /* V_03093C_X_*_DWORDS value for the size above */
+   unsigned max_workgroups_per_se; /* upper bound applied to num_workgroups_per_se below */
+
+   switch (max_hs_out_vram_dwords_per_wg) {
+   case 8192:
+      max_hs_out_vram_dwords_enum = V_03093C_X_8K_DWORDS;
+      break;
+   case 4096:
+      max_hs_out_vram_dwords_enum = V_03093C_X_4K_DWORDS;
+      break;
+   case 2048:
+      max_hs_out_vram_dwords_enum = V_03093C_X_2K_DWORDS;
+      break;
+   case 1024:
+      max_hs_out_vram_dwords_enum = V_03093C_X_1K_DWORDS;
+      break;
+   default:
+      UNREACHABLE("invalid TCS workgroup size");
+   }
+
+   /* Vega10 should limit num_workgroups to 508 (127 per SE)
+    * Gfx7 should limit num_workgroups to 508 (127 per SE)
+    * Gfx6 should limit num_workgroups to 126 (63 per SE)
+    */
+   if (info->gfx_level >= GFX11) {
+      max_workgroups_per_se = 256;
+   } else if (info->gfx_level >= GFX10 ||
+              info->family == CHIP_VEGA12 || info->family == CHIP_VEGA20) {
+      max_workgroups_per_se = 128;
+   } else if (info->gfx_level >= GFX7 && info->family != CHIP_CARRIZO && info->family != CHIP_STONEY) {
+      max_workgroups_per_se = 127;
+   } else {
+      max_workgroups_per_se = 63;
+   }
+
+   /* Limit to 4 workgroups per CU for TCS, which exhausts LDS if each workgroup occupies 16KB.
+    * Note that the offchip allocation isn't deallocated until the corresponding TES waves finish.
+    */
+   unsigned num_offchip_wg_per_cu = 4;
+   unsigned num_workgroups_per_se = MIN2(num_offchip_wg_per_cu * info->max_good_cu_per_sa *
+                                         info->max_sa_per_se, max_workgroups_per_se);
+   unsigned num_workgroups = num_workgroups_per_se * info->max_se; /* total over max_se SEs */
+
+   if (info->gfx_level >= GFX11) {
+      /* OFFCHIP_BUFFERING is per SE. */
+      info->hs_offchip_param = S_03093C_OFFCHIP_BUFFERING_GFX103(num_workgroups_per_se - 1) |
+                               S_03093C_OFFCHIP_GRANULARITY_GFX103(max_hs_out_vram_dwords_enum);
+   } else if (info->gfx_level >= GFX10_3) { /* total count minus one, not per SE */
+      info->hs_offchip_param = S_03093C_OFFCHIP_BUFFERING_GFX103(num_workgroups - 1) |
+                               S_03093C_OFFCHIP_GRANULARITY_GFX103(max_hs_out_vram_dwords_enum);
+   } else if (info->gfx_level >= GFX7) { /* minus one only on GFX8 and newer */
+      info->hs_offchip_param = S_03093C_OFFCHIP_BUFFERING_GFX7(num_workgroups -
+                                                               (info->gfx_level >= GFX8 ? 1 : 0)) |
+                               S_03093C_OFFCHIP_GRANULARITY_GFX7(max_hs_out_vram_dwords_enum);
+   } else { /* below GFX7: S_0089B0_* field macros, raw total count */
+      info->hs_offchip_param = S_0089B0_OFFCHIP_BUFFERING(num_workgroups) |
+                               S_0089B0_OFFCHIP_GRANULARITY(max_hs_out_vram_dwords_enum);
+   }
+
+   /* The typical size of tess factors of 1 TCS workgroup if all patches are triangles. */
+   unsigned typical_tess_factor_size_per_wg = (192 / 3) * 16;
+   unsigned num_tess_factor_wg_per_cu = 3; /* workgroups per CU budgeted in the factor ring */
+
+   info->hs_offchip_workgroup_dw_size = max_hs_out_vram_dwords_per_wg;
+   info->tess_offchip_ring_size = num_workgroups * max_hs_out_vram_dwords_per_wg * 4;
+   info->tess_factor_ring_size = typical_tess_factor_size_per_wg * num_tess_factor_wg_per_cu *
+                                 info->max_good_cu_per_sa * info->max_sa_per_se * info->max_se;
+   info->total_tess_ring_size = info->tess_offchip_ring_size + info->tess_factor_ring_size;
+}
+
 enum ac_query_gpu_info_result
 ac_query_gpu_info(int fd, void *dev_p, struct radeon_info *info,
                   bool require_pci_bus_info)
@@ -1467,78 +1543,7 @@ ac_query_gpu_info(int fd, void *dev_p, struct radeon_info *info,
 
    ac_fill_bug_info(info);
 
-   /* This is the size of all TCS outputs in memory per workgroup.
-    * Hawaii can't handle num_workgroups > 256 with 8K per workgroup, so use 4K.
-    */
-   unsigned max_hs_out_vram_dwords_per_wg = info->family == CHIP_HAWAII ? 4096 : 8192;
-   unsigned max_hs_out_vram_dwords_enum;
-   unsigned max_workgroups_per_se;
-
-   switch (max_hs_out_vram_dwords_per_wg) {
-   case 8192:
-      max_hs_out_vram_dwords_enum = V_03093C_X_8K_DWORDS;
-      break;
-   case 4096:
-      max_hs_out_vram_dwords_enum = V_03093C_X_4K_DWORDS;
-      break;
-   case 2048:
-      max_hs_out_vram_dwords_enum = V_03093C_X_2K_DWORDS;
-      break;
-   case 1024:
-      max_hs_out_vram_dwords_enum = V_03093C_X_1K_DWORDS;
-      break;
-   default:
-      UNREACHABLE("invalid TCS workgroup size");
-   }
-
-   /* Vega10 should limit num_workgroups to 508 (127 per SE)
-    * Gfx7 should limit num_workgroups to 508 (127 per SE)
-    * Gfx6 should limit num_workgroups to 126 (63 per SE)
-    */
-   if (info->gfx_level >= GFX11) {
-      max_workgroups_per_se = 256;
-   } else if (info->gfx_level >= GFX10 ||
-              info->family == CHIP_VEGA12 || info->family == CHIP_VEGA20) {
-      max_workgroups_per_se = 128;
-   } else if (info->gfx_level >= GFX7 && info->family != CHIP_CARRIZO && info->family != CHIP_STONEY) {
-      max_workgroups_per_se = 127;
-   } else {
-      max_workgroups_per_se = 63;
-   }
-
-   /* Limit to 4 workgroups per CU for TCS, which exhausts LDS if each workgroup occupies 16KB.
-    * Note that the offchip allocation isn't deallocated until the corresponding TES waves finish.
-    */
-   unsigned num_offchip_wg_per_cu = 4;
-   unsigned num_workgroups_per_se = MIN2(num_offchip_wg_per_cu * info->max_good_cu_per_sa *
-                                         info->max_sa_per_se, max_workgroups_per_se);
-   unsigned num_workgroups = num_workgroups_per_se * info->max_se;
-
-   if (info->gfx_level >= GFX11) {
-      /* OFFCHIP_BUFFERING is per SE. */
-      info->hs_offchip_param = S_03093C_OFFCHIP_BUFFERING_GFX103(num_workgroups_per_se - 1) |
-                               S_03093C_OFFCHIP_GRANULARITY_GFX103(max_hs_out_vram_dwords_enum);
-   } else if (info->gfx_level >= GFX10_3) {
-      info->hs_offchip_param = S_03093C_OFFCHIP_BUFFERING_GFX103(num_workgroups - 1) |
-                               S_03093C_OFFCHIP_GRANULARITY_GFX103(max_hs_out_vram_dwords_enum);
-   } else if (info->gfx_level >= GFX7) {
-      info->hs_offchip_param = S_03093C_OFFCHIP_BUFFERING_GFX7(num_workgroups -
-                                                               (info->gfx_level >= GFX8 ? 1 : 0)) |
-                               S_03093C_OFFCHIP_GRANULARITY_GFX7(max_hs_out_vram_dwords_enum);
-   } else {
-      info->hs_offchip_param = S_0089B0_OFFCHIP_BUFFERING(num_workgroups) |
-                               S_0089B0_OFFCHIP_GRANULARITY(max_hs_out_vram_dwords_enum);
-   }
-
-   /* The typical size of tess factors of 1 TCS workgroup if all patches are triangles. */
-   unsigned typical_tess_factor_size_per_wg = (192 / 3) * 16;
-   unsigned num_tess_factor_wg_per_cu = 3;
-
-   info->hs_offchip_workgroup_dw_size = max_hs_out_vram_dwords_per_wg;
-   info->tess_offchip_ring_size = num_workgroups * max_hs_out_vram_dwords_per_wg * 4;
-   info->tess_factor_ring_size = typical_tess_factor_size_per_wg * num_tess_factor_wg_per_cu *
-                                 info->max_good_cu_per_sa * info->max_sa_per_se * info->max_se;
-   info->total_tess_ring_size = info->tess_offchip_ring_size + info->tess_factor_ring_size;
+   ac_fill_tess_info(info);
 
    ac_fill_compiler_info(info, &device_info);
 
diff --git a/src/amd/common/ac_gpu_info.h b/src/amd/common/ac_gpu_info.h
index 55060ebe12c..19be8920445 100644
--- a/src/amd/common/ac_gpu_info.h
+++ b/src/amd/common/ac_gpu_info.h
@@ -499,6 +499,7 @@ ac_identify_chip(struct radeon_info *info, const struct drm_amdgpu_info_device *
 void ac_fill_bug_info(struct radeon_info *info);
 void ac_fill_feature_info(struct radeon_info *info, const struct drm_amdgpu_info_device *device_info);
 void ac_fill_hw_info(struct radeon_info *info, const struct drm_amdgpu_info_device *device_info);
+void ac_fill_tess_info(struct radeon_info *info); /* fills hs_offchip_* and tess ring sizes */
 
 void ac_compute_driver_uuid(char *uuid, size_t size);
 