diff --git a/src/amd/common/ac_gpu_info.c b/src/amd/common/ac_gpu_info.c index 356fd6ca242..2c46f547e9f 100644 --- a/src/amd/common/ac_gpu_info.c +++ b/src/amd/common/ac_gpu_info.c @@ -236,7 +236,7 @@ static bool handle_env_var_force_family(struct radeon_info *info) void ac_fill_cu_info(struct radeon_info *info, struct drm_amdgpu_info_device *device_info) { - STATIC_ASSERT(sizeof(struct ac_cu_info) == 48); + STATIC_ASSERT(sizeof(struct ac_cu_info) == 52); struct ac_cu_info *cu_info = &info->cu_info; @@ -304,6 +304,8 @@ ac_fill_cu_info(struct radeon_info *info, struct drm_amdgpu_info_device *device_ cu_info->num_simd_per_compute_unit = info->gfx_level >= GFX10 ? 2 : 4; + cu_info->hs_offchip_workgroup_dw_size = info->hs_offchip_workgroup_dw_size; + /* Flags */ cu_info->has_lds_bank_count_16 = info->family == CHIP_KABINI || info->family == CHIP_STONEY; cu_info->has_sram_ecc_enabled = info->family == CHIP_VEGA20 || info->family == CHIP_MI100 || @@ -343,6 +345,11 @@ ac_fill_cu_info(struct radeon_info *info, struct drm_amdgpu_info_device *device_ cu_info->mesh_fast_launch_2 = info->mesh_fast_launch_2; + /* When distributed tessellation is unsupported, switch between SEs + * at a higher frequency to manually balance the workload between SEs. + */ + cu_info->smaller_tcs_workgroups = !info->has_distributed_tess && info->max_se > 1; + cu_info->has_gfx6_mrt_export_bug = info->family == CHIP_TAHITI || info->family == CHIP_PITCAIRN || info->family == CHIP_VERDE; cu_info->has_vtx_format_alpha_adjust_bug = info->gfx_level <= GFX8 && info->family != CHIP_STONEY; @@ -389,6 +396,8 @@ ac_fill_cu_info(struct radeon_info *info, struct drm_amdgpu_info_device *device_ * The workaround is to issue and wait for attribute stores before the last export. */ cu_info->has_attr_ring_wait_bug = info->gfx_level == GFX11 || info->gfx_level == GFX11_5; + + cu_info->has_primid_instancing_bug = info->gfx_level == GFX6 && info->max_se == 1; } enum ac_query_gpu_info_result @@ -1385,6 +1394,79 @@ ac_query_gpu_info(int fd, void *dev_p, struct radeon_info *info, info->mesh_fast_launch_2 = info->gfx_level >= GFX11; + /* This is the size of all TCS outputs in memory per workgroup. + * Hawaii can't handle num_workgroups > 256 with 8K per workgroup, so use 4K. + */ + unsigned max_hs_out_vram_dwords_per_wg = info->family == CHIP_HAWAII ? 4096 : 8192; + unsigned max_hs_out_vram_dwords_enum; + unsigned max_workgroups_per_se; + + switch (max_hs_out_vram_dwords_per_wg) { + case 8192: + max_hs_out_vram_dwords_enum = V_03093C_X_8K_DWORDS; + break; + case 4096: + max_hs_out_vram_dwords_enum = V_03093C_X_4K_DWORDS; + break; + case 2048: + max_hs_out_vram_dwords_enum = V_03093C_X_2K_DWORDS; + break; + case 1024: + max_hs_out_vram_dwords_enum = V_03093C_X_1K_DWORDS; + break; + default: + UNREACHABLE("invalid TCS workgroup size"); + } + + /* Vega10 should limit num_workgroups to 508 (127 per SE) + * Gfx7 should limit num_workgroups to 508 (127 per SE) + * Gfx6 should limit num_workgroups to 126 (63 per SE) + */ + if (info->gfx_level >= GFX11) { + max_workgroups_per_se = 256; + } else if (info->gfx_level >= GFX10 || + info->family == CHIP_VEGA12 || info->family == CHIP_VEGA20) { + max_workgroups_per_se = 128; + } else if (info->gfx_level >= GFX7 && info->family != CHIP_CARRIZO && info->family != CHIP_STONEY) { + max_workgroups_per_se = 127; + } else { + max_workgroups_per_se = 63; + } + + /* Limit to 4 workgroups per CU for TCS, which exhausts LDS if each workgroup occupies 16KB. + * Note that the offchip allocation isn't deallocated until the corresponding TES waves finish. + */ + unsigned num_offchip_wg_per_cu = 4; + unsigned num_workgroups_per_se = MIN2(num_offchip_wg_per_cu * info->max_good_cu_per_sa * + info->max_sa_per_se, max_workgroups_per_se); + unsigned num_workgroups = num_workgroups_per_se * info->max_se; + + if (info->gfx_level >= GFX11) { + /* OFFCHIP_BUFFERING is per SE. */ + info->hs_offchip_param = S_03093C_OFFCHIP_BUFFERING_GFX103(num_workgroups_per_se - 1) | + S_03093C_OFFCHIP_GRANULARITY_GFX103(max_hs_out_vram_dwords_enum); + } else if (info->gfx_level >= GFX10_3) { + info->hs_offchip_param = S_03093C_OFFCHIP_BUFFERING_GFX103(num_workgroups - 1) | + S_03093C_OFFCHIP_GRANULARITY_GFX103(max_hs_out_vram_dwords_enum); + } else if (info->gfx_level >= GFX7) { + info->hs_offchip_param = S_03093C_OFFCHIP_BUFFERING_GFX7(num_workgroups - + (info->gfx_level >= GFX8 ? 1 : 0)) | + S_03093C_OFFCHIP_GRANULARITY_GFX7(max_hs_out_vram_dwords_enum); + } else { + info->hs_offchip_param = S_0089B0_OFFCHIP_BUFFERING(num_workgroups) | + S_0089B0_OFFCHIP_GRANULARITY(max_hs_out_vram_dwords_enum); + } + + /* The typical size of tess factors of 1 TCS workgroup if all patches are triangles. */ + unsigned typical_tess_factor_size_per_wg = (192 / 3) * 16; + unsigned num_tess_factor_wg_per_cu = 3; + + info->hs_offchip_workgroup_dw_size = max_hs_out_vram_dwords_per_wg; + info->tess_offchip_ring_size = num_workgroups * max_hs_out_vram_dwords_per_wg * 4; + info->tess_factor_ring_size = typical_tess_factor_size_per_wg * num_tess_factor_wg_per_cu * + info->max_good_cu_per_sa * info->max_sa_per_se * info->max_se; + info->total_tess_ring_size = info->tess_offchip_ring_size + info->tess_factor_ring_size; + ac_fill_cu_info(info, &device_info); /* BIG_PAGE is supported since gfx10.3 and requires VRAM. VRAM is only guaranteed @@ -1571,79 +1653,6 @@ ac_query_gpu_info(int fd, void *dev_p, struct radeon_info *info, info->has_set_sh_pairs_packed = info->has_kernelq_reg_shadowing; } - /* This is the size of all TCS outputs in memory per workgroup. - * Hawaii can't handle num_workgroups > 256 with 8K per workgroup, so use 4K. - */ - unsigned max_hs_out_vram_dwords_per_wg = info->family == CHIP_HAWAII ? 4096 : 8192; - unsigned max_hs_out_vram_dwords_enum; - unsigned max_workgroups_per_se; - - switch (max_hs_out_vram_dwords_per_wg) { - case 8192: - max_hs_out_vram_dwords_enum = V_03093C_X_8K_DWORDS; - break; - case 4096: - max_hs_out_vram_dwords_enum = V_03093C_X_4K_DWORDS; - break; - case 2048: - max_hs_out_vram_dwords_enum = V_03093C_X_2K_DWORDS; - break; - case 1024: - max_hs_out_vram_dwords_enum = V_03093C_X_1K_DWORDS; - break; - default: - UNREACHABLE("invalid TCS workgroup size"); - } - - /* Vega10 should limit num_workgroups to 508 (127 per SE) - * Gfx7 should limit num_workgroups to 508 (127 per SE) - * Gfx6 should limit num_workgroups to 126 (63 per SE) - */ - if (info->gfx_level >= GFX11) { - max_workgroups_per_se = 256; - } else if (info->gfx_level >= GFX10 || - info->family == CHIP_VEGA12 || info->family == CHIP_VEGA20) { - max_workgroups_per_se = 128; - } else if (info->gfx_level >= GFX7 && info->family != CHIP_CARRIZO && info->family != CHIP_STONEY) { - max_workgroups_per_se = 127; - } else { - max_workgroups_per_se = 63; - } - - /* Limit to 4 workgroups per CU for TCS, which exhausts LDS if each workgroup occupies 16KB. - * Note that the offchip allocation isn't deallocated until the corresponding TES waves finish. - */ - unsigned num_offchip_wg_per_cu = 4; - unsigned num_workgroups_per_se = MIN2(num_offchip_wg_per_cu * info->max_good_cu_per_sa * - info->max_sa_per_se, max_workgroups_per_se); - unsigned num_workgroups = num_workgroups_per_se * info->max_se; - - if (info->gfx_level >= GFX11) { - /* OFFCHIP_BUFFERING is per SE. */ - info->hs_offchip_param = S_03093C_OFFCHIP_BUFFERING_GFX103(num_workgroups_per_se - 1) | - S_03093C_OFFCHIP_GRANULARITY_GFX103(max_hs_out_vram_dwords_enum); - } else if (info->gfx_level >= GFX10_3) { - info->hs_offchip_param = S_03093C_OFFCHIP_BUFFERING_GFX103(num_workgroups - 1) | - S_03093C_OFFCHIP_GRANULARITY_GFX103(max_hs_out_vram_dwords_enum); - } else if (info->gfx_level >= GFX7) { - info->hs_offchip_param = S_03093C_OFFCHIP_BUFFERING_GFX7(num_workgroups - - (info->gfx_level >= GFX8 ? 1 : 0)) | - S_03093C_OFFCHIP_GRANULARITY_GFX7(max_hs_out_vram_dwords_enum); - } else { - info->hs_offchip_param = S_0089B0_OFFCHIP_BUFFERING(num_workgroups) | - S_0089B0_OFFCHIP_GRANULARITY(max_hs_out_vram_dwords_enum); - } - - /* The typical size of tess factors of 1 TCS workgroup if all patches are triangles. */ - unsigned typical_tess_factor_size_per_wg = (192 / 3) * 16; - unsigned num_tess_factor_wg_per_cu = 3; - - info->hs_offchip_workgroup_dw_size = max_hs_out_vram_dwords_per_wg; - info->tess_offchip_ring_size = num_workgroups * max_hs_out_vram_dwords_per_wg * 4; - info->tess_factor_ring_size = typical_tess_factor_size_per_wg * num_tess_factor_wg_per_cu * - info->max_good_cu_per_sa * info->max_sa_per_se * info->max_se; - info->total_tess_ring_size = info->tess_offchip_ring_size + info->tess_factor_ring_size; - if (info->gfx_level >= GFX12) info->rt_ip_version = RT_3_1; else if (info->gfx_level >= GFX11) diff --git a/src/amd/common/ac_gpu_info.h b/src/amd/common/ac_gpu_info.h index ff587c68f1e..9947470adc9 100644 --- a/src/amd/common/ac_gpu_info.h +++ b/src/amd/common/ac_gpu_info.h @@ -113,6 +113,8 @@ struct ac_cu_info { uint32_t max_vgpr_alloc; uint32_t wave64_vgpr_alloc_granularity; + uint32_t hs_offchip_workgroup_dw_size; + /* Flags */ uint32_t has_lds_bank_count_16 : 1; uint32_t has_sram_ecc_enabled : 1; @@ -167,6 +169,9 @@ struct ac_cu_info { uint32_t has_attr_ring : 1; uint32_t mesh_fast_launch_2 : 1; + /* GFX6-7: limit TCS workgroup to 16 patches for better performance. */ + uint32_t smaller_tcs_workgroups : 1; + /* Some GFX6 GPUs have a bug where it only looks at the x writemask component. */ uint32_t has_gfx6_mrt_export_bug : 1; /* Pre-GFX9: A bug where the alpha component of 10_10_10_2 formats is always unsigned.*/ @@ -185,8 +190,10 @@ struct ac_cu_info { uint32_t has_ngg_fully_culled_bug : 1; /* GFX11-11.5: require wait between attribute stores and the final export. */ uint32_t has_attr_ring_wait_bug : 1; + /* GFX6: limit TCS workgroup to one patch if primitive ID is used. */ + uint32_t has_primid_instancing_bug : 1; - uint32_t reserved : 8; + uint32_t reserved : 6; }; struct radeon_info { diff --git a/src/amd/common/ac_shader_util.c b/src/amd/common/ac_shader_util.c index 7ac97337f82..0d53d16e273 100644 --- a/src/amd/common/ac_shader_util.c +++ b/src/amd/common/ac_shader_util.c @@ -926,7 +926,7 @@ static unsigned get_tcs_wg_output_mem_size(uint32_t num_tcs_output_cp, uint32_t mem_one_perpatch_output * num_mem_tcs_patch_outputs; } -uint32_t ac_compute_num_tess_patches(const struct radeon_info *info, uint32_t num_tcs_input_cp, +uint32_t ac_compute_num_tess_patches(const struct ac_cu_info *info, uint32_t num_tcs_input_cp, uint32_t num_tcs_output_cp, uint32_t num_mem_tcs_outputs, uint32_t num_mem_tcs_patch_outputs, uint32_t lds_per_patch, uint32_t wave_size, bool tess_uses_primid) @@ -938,8 +938,7 @@ uint32_t ac_compute_num_tess_patches(const struct radeon_info *info, uint32_t nu * SWITCH_ON_EOI, which should cause IA to split instances up. However, this doesn't work * correctly on GFX6 when there is no other SE to switch to. */ - const bool has_primid_instancing_bug = info->gfx_level == GFX6 && info->max_se == 1; - if (has_primid_instancing_bug && tess_uses_primid) + if (info->has_primid_instancing_bug && tess_uses_primid) return 1; /* 256 threads per workgroup is the hw limit, but 192 performs better. */ @@ -952,7 +951,7 @@ uint32_t ac_compute_num_tess_patches(const struct radeon_info *info, uint32_t nu /* When distributed tessellation is unsupported, switch between SEs * at a higher frequency to manually balance the workload between SEs. */ - if (!info->has_distributed_tess && info->max_se > 1) + if (info->smaller_tcs_workgroups) num_patches = MIN2(num_patches, 16); /* recommended */ /* Make sure the output data fits in the offchip buffer */ diff --git a/src/amd/common/ac_shader_util.h b/src/amd/common/ac_shader_util.h index 0a4da7518f5..552dfe2c6e4 100644 --- a/src/amd/common/ac_shader_util.h +++ b/src/amd/common/ac_shader_util.h @@ -237,6 +237,8 @@ enum ac_descriptor_type AC_DESC_PLANE_2, }; +struct ac_cu_info; + unsigned ac_get_spi_shader_z_format(bool writes_z, bool writes_stencil, bool writes_samplemask, bool writes_mrt0_alpha); @@ -284,7 +286,7 @@ unsigned ac_compute_lshs_workgroup_size(enum amd_gfx_level gfx_level, mesa_shade unsigned ac_compute_ngg_workgroup_size(unsigned es_verts, unsigned gs_inst_prims, unsigned max_vtx_out, unsigned prim_amp_factor); -uint32_t ac_compute_num_tess_patches(const struct radeon_info *info, uint32_t num_tcs_input_cp, +uint32_t ac_compute_num_tess_patches(const struct ac_cu_info *info, uint32_t num_tcs_input_cp, uint32_t num_tcs_output_cp, uint32_t num_mem_tcs_outputs, uint32_t num_mem_tcs_patch_outputs, uint32_t lds_per_patch, uint32_t wave_size, bool tess_uses_primid); diff --git a/src/amd/common/nir/ac_nir.h b/src/amd/common/nir/ac_nir.h index 5bd8b9705c9..cf83765abd8 100644 --- a/src/amd/common/nir/ac_nir.h +++ b/src/amd/common/nir/ac_nir.h @@ -158,7 +158,7 @@ ac_nir_lower_tes_inputs_to_mem(nir_shader *shader, ac_nir_map_io_driver_location map); void -ac_nir_compute_tess_wg_info(const struct radeon_info *info, const ac_nir_tess_io_info *io_info, +ac_nir_compute_tess_wg_info(const struct ac_cu_info *info, const ac_nir_tess_io_info *io_info, unsigned tcs_vertices_out, unsigned wave_size, bool tess_uses_primid, unsigned num_tcs_input_cp, unsigned lds_input_vertex_size, unsigned num_remapped_tess_level_outputs, unsigned *num_patches_per_wg, diff --git a/src/amd/common/nir/ac_nir_lower_tess_io_to_mem.c b/src/amd/common/nir/ac_nir_lower_tess_io_to_mem.c index 17360ccf644..b571170d3ba 100644 --- a/src/amd/common/nir/ac_nir_lower_tess_io_to_mem.c +++ b/src/amd/common/nir/ac_nir_lower_tess_io_to_mem.c @@ -1651,7 +1651,7 @@ ac_nir_lower_tes_inputs_to_mem(nir_shader *shader, } void -ac_nir_compute_tess_wg_info(const struct radeon_info *info, const ac_nir_tess_io_info *io_info, +ac_nir_compute_tess_wg_info(const struct ac_cu_info *info, const ac_nir_tess_io_info *io_info, unsigned tcs_vertices_out, unsigned wave_size, bool tess_uses_primid, unsigned num_tcs_input_cp, unsigned lds_input_vertex_size, unsigned num_remapped_tess_level_outputs, unsigned *num_patches_per_wg, @@ -1668,7 +1668,7 @@ ac_nir_compute_tess_wg_info(const struct radeon_info *info, const ac_nir_tess_io /* SPI_SHADER_PGM_RSRC2_HS.LDS_SIZE specifies the allocation size only for LDS. The HS offchip * ring buffer always uses a fixed allocation size per workgroup determined by - * info->hs_offchip_workgroup_dw_size. + * ac_cpu_info::hs_offchip_workgroup_dw_size. * * LDS is only used for TCS inputs (with cross-invocation or indirect access only or if TCS in/out * vertex counts are different) and for TCS outputs that are read (including tess level outputs diff --git a/src/amd/vulkan/radv_shader.c b/src/amd/vulkan/radv_shader.c index 4e30903b6db..d4ac01d63a7 100644 --- a/src/amd/vulkan/radv_shader.c +++ b/src/amd/vulkan/radv_shader.c @@ -3820,7 +3820,7 @@ radv_get_tess_wg_info(const struct radv_physical_device *pdev, const ac_nir_tess { const uint32_t lds_input_vertex_size = get_tcs_input_vertex_stride(tcs_num_lds_inputs); - ac_nir_compute_tess_wg_info(&pdev->info, io_info, tcs_vertices_out, pdev->ge_wave_size, false, + ac_nir_compute_tess_wg_info(&pdev->info.cu_info, io_info, tcs_vertices_out, pdev->ge_wave_size, false, tcs_num_input_vertices, lds_input_vertex_size, 0, num_patches_per_wg, lds_size); } diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.cpp b/src/gallium/drivers/radeonsi/si_state_shaders.cpp index 067d69e283a..0376f68e653 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.cpp +++ b/src/gallium/drivers/radeonsi/si_state_shaders.cpp @@ -4709,7 +4709,7 @@ void si_update_tess_io_layout_state(struct si_context *sctx) unsigned num_patches, lds_size; /* Compute NUM_PATCHES and LDS_SIZE. */ - ac_nir_compute_tess_wg_info(&sctx->screen->info, &tcs->info.tess_io_info, + ac_nir_compute_tess_wg_info(&sctx->screen->info.cu_info, &tcs->info.tess_io_info, tcs->info.base.tess.tcs_vertices_out, ls_current->wave_size, tess_uses_primid, num_tcs_input_cp, lds_input_vertex_size, num_remapped_tess_level_outputs, &num_patches, &lds_size);