mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-03-11 17:50:32 +01:00
ac/nir: pass ac_cu_info to ac_nir_compute_tess_wg_info
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com> Reviewed-by: Samuel Pitoiset <samuel.pitoiset@gmail.com> Reviewed-by: Marek Olšák <marek.olsak@amd.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/40042>
This commit is contained in:
parent
8801ca188d
commit
a65089dfce
8 changed files with 102 additions and 85 deletions
|
|
@ -236,7 +236,7 @@ static bool handle_env_var_force_family(struct radeon_info *info)
|
|||
void
|
||||
ac_fill_cu_info(struct radeon_info *info, struct drm_amdgpu_info_device *device_info)
|
||||
{
|
||||
STATIC_ASSERT(sizeof(struct ac_cu_info) == 48);
|
||||
STATIC_ASSERT(sizeof(struct ac_cu_info) == 52);
|
||||
|
||||
struct ac_cu_info *cu_info = &info->cu_info;
|
||||
|
||||
|
|
@ -304,6 +304,8 @@ ac_fill_cu_info(struct radeon_info *info, struct drm_amdgpu_info_device *device_
|
|||
|
||||
cu_info->num_simd_per_compute_unit = info->gfx_level >= GFX10 ? 2 : 4;
|
||||
|
||||
cu_info->hs_offchip_workgroup_dw_size = info->hs_offchip_workgroup_dw_size;
|
||||
|
||||
/* Flags */
|
||||
cu_info->has_lds_bank_count_16 = info->family == CHIP_KABINI || info->family == CHIP_STONEY;
|
||||
cu_info->has_sram_ecc_enabled = info->family == CHIP_VEGA20 || info->family == CHIP_MI100 ||
|
||||
|
|
@ -343,6 +345,11 @@ ac_fill_cu_info(struct radeon_info *info, struct drm_amdgpu_info_device *device_
|
|||
|
||||
cu_info->mesh_fast_launch_2 = info->mesh_fast_launch_2;
|
||||
|
||||
/* When distributed tessellation is unsupported, switch between SEs
|
||||
* at a higher frequency to manually balance the workload between SEs.
|
||||
*/
|
||||
cu_info->smaller_tcs_workgroups = !info->has_distributed_tess && info->max_se > 1;
|
||||
|
||||
cu_info->has_gfx6_mrt_export_bug =
|
||||
info->family == CHIP_TAHITI || info->family == CHIP_PITCAIRN || info->family == CHIP_VERDE;
|
||||
cu_info->has_vtx_format_alpha_adjust_bug = info->gfx_level <= GFX8 && info->family != CHIP_STONEY;
|
||||
|
|
@ -389,6 +396,8 @@ ac_fill_cu_info(struct radeon_info *info, struct drm_amdgpu_info_device *device_
|
|||
* The workaround is to issue and wait for attribute stores before the last export.
|
||||
*/
|
||||
cu_info->has_attr_ring_wait_bug = info->gfx_level == GFX11 || info->gfx_level == GFX11_5;
|
||||
|
||||
cu_info->has_primid_instancing_bug = info->gfx_level == GFX6 && info->max_se == 1;
|
||||
}
|
||||
|
||||
enum ac_query_gpu_info_result
|
||||
|
|
@ -1385,6 +1394,79 @@ ac_query_gpu_info(int fd, void *dev_p, struct radeon_info *info,
|
|||
|
||||
info->mesh_fast_launch_2 = info->gfx_level >= GFX11;
|
||||
|
||||
/* This is the size of all TCS outputs in memory per workgroup.
|
||||
* Hawaii can't handle num_workgroups > 256 with 8K per workgroup, so use 4K.
|
||||
*/
|
||||
unsigned max_hs_out_vram_dwords_per_wg = info->family == CHIP_HAWAII ? 4096 : 8192;
|
||||
unsigned max_hs_out_vram_dwords_enum;
|
||||
unsigned max_workgroups_per_se;
|
||||
|
||||
switch (max_hs_out_vram_dwords_per_wg) {
|
||||
case 8192:
|
||||
max_hs_out_vram_dwords_enum = V_03093C_X_8K_DWORDS;
|
||||
break;
|
||||
case 4096:
|
||||
max_hs_out_vram_dwords_enum = V_03093C_X_4K_DWORDS;
|
||||
break;
|
||||
case 2048:
|
||||
max_hs_out_vram_dwords_enum = V_03093C_X_2K_DWORDS;
|
||||
break;
|
||||
case 1024:
|
||||
max_hs_out_vram_dwords_enum = V_03093C_X_1K_DWORDS;
|
||||
break;
|
||||
default:
|
||||
UNREACHABLE("invalid TCS workgroup size");
|
||||
}
|
||||
|
||||
/* Vega10 should limit num_workgroups to 508 (127 per SE)
|
||||
* Gfx7 should limit num_workgroups to 508 (127 per SE)
|
||||
* Gfx6 should limit num_workgroups to 126 (63 per SE)
|
||||
*/
|
||||
if (info->gfx_level >= GFX11) {
|
||||
max_workgroups_per_se = 256;
|
||||
} else if (info->gfx_level >= GFX10 ||
|
||||
info->family == CHIP_VEGA12 || info->family == CHIP_VEGA20) {
|
||||
max_workgroups_per_se = 128;
|
||||
} else if (info->gfx_level >= GFX7 && info->family != CHIP_CARRIZO && info->family != CHIP_STONEY) {
|
||||
max_workgroups_per_se = 127;
|
||||
} else {
|
||||
max_workgroups_per_se = 63;
|
||||
}
|
||||
|
||||
/* Limit to 4 workgroups per CU for TCS, which exhausts LDS if each workgroup occupies 16KB.
|
||||
* Note that the offchip allocation isn't deallocated until the corresponding TES waves finish.
|
||||
*/
|
||||
unsigned num_offchip_wg_per_cu = 4;
|
||||
unsigned num_workgroups_per_se = MIN2(num_offchip_wg_per_cu * info->max_good_cu_per_sa *
|
||||
info->max_sa_per_se, max_workgroups_per_se);
|
||||
unsigned num_workgroups = num_workgroups_per_se * info->max_se;
|
||||
|
||||
if (info->gfx_level >= GFX11) {
|
||||
/* OFFCHIP_BUFFERING is per SE. */
|
||||
info->hs_offchip_param = S_03093C_OFFCHIP_BUFFERING_GFX103(num_workgroups_per_se - 1) |
|
||||
S_03093C_OFFCHIP_GRANULARITY_GFX103(max_hs_out_vram_dwords_enum);
|
||||
} else if (info->gfx_level >= GFX10_3) {
|
||||
info->hs_offchip_param = S_03093C_OFFCHIP_BUFFERING_GFX103(num_workgroups - 1) |
|
||||
S_03093C_OFFCHIP_GRANULARITY_GFX103(max_hs_out_vram_dwords_enum);
|
||||
} else if (info->gfx_level >= GFX7) {
|
||||
info->hs_offchip_param = S_03093C_OFFCHIP_BUFFERING_GFX7(num_workgroups -
|
||||
(info->gfx_level >= GFX8 ? 1 : 0)) |
|
||||
S_03093C_OFFCHIP_GRANULARITY_GFX7(max_hs_out_vram_dwords_enum);
|
||||
} else {
|
||||
info->hs_offchip_param = S_0089B0_OFFCHIP_BUFFERING(num_workgroups) |
|
||||
S_0089B0_OFFCHIP_GRANULARITY(max_hs_out_vram_dwords_enum);
|
||||
}
|
||||
|
||||
/* The typical size of tess factors of 1 TCS workgroup if all patches are triangles. */
|
||||
unsigned typical_tess_factor_size_per_wg = (192 / 3) * 16;
|
||||
unsigned num_tess_factor_wg_per_cu = 3;
|
||||
|
||||
info->hs_offchip_workgroup_dw_size = max_hs_out_vram_dwords_per_wg;
|
||||
info->tess_offchip_ring_size = num_workgroups * max_hs_out_vram_dwords_per_wg * 4;
|
||||
info->tess_factor_ring_size = typical_tess_factor_size_per_wg * num_tess_factor_wg_per_cu *
|
||||
info->max_good_cu_per_sa * info->max_sa_per_se * info->max_se;
|
||||
info->total_tess_ring_size = info->tess_offchip_ring_size + info->tess_factor_ring_size;
|
||||
|
||||
ac_fill_cu_info(info, &device_info);
|
||||
|
||||
/* BIG_PAGE is supported since gfx10.3 and requires VRAM. VRAM is only guaranteed
|
||||
|
|
@ -1571,79 +1653,6 @@ ac_query_gpu_info(int fd, void *dev_p, struct radeon_info *info,
|
|||
info->has_set_sh_pairs_packed = info->has_kernelq_reg_shadowing;
|
||||
}
|
||||
|
||||
/* This is the size of all TCS outputs in memory per workgroup.
|
||||
* Hawaii can't handle num_workgroups > 256 with 8K per workgroup, so use 4K.
|
||||
*/
|
||||
unsigned max_hs_out_vram_dwords_per_wg = info->family == CHIP_HAWAII ? 4096 : 8192;
|
||||
unsigned max_hs_out_vram_dwords_enum;
|
||||
unsigned max_workgroups_per_se;
|
||||
|
||||
switch (max_hs_out_vram_dwords_per_wg) {
|
||||
case 8192:
|
||||
max_hs_out_vram_dwords_enum = V_03093C_X_8K_DWORDS;
|
||||
break;
|
||||
case 4096:
|
||||
max_hs_out_vram_dwords_enum = V_03093C_X_4K_DWORDS;
|
||||
break;
|
||||
case 2048:
|
||||
max_hs_out_vram_dwords_enum = V_03093C_X_2K_DWORDS;
|
||||
break;
|
||||
case 1024:
|
||||
max_hs_out_vram_dwords_enum = V_03093C_X_1K_DWORDS;
|
||||
break;
|
||||
default:
|
||||
UNREACHABLE("invalid TCS workgroup size");
|
||||
}
|
||||
|
||||
/* Vega10 should limit num_workgroups to 508 (127 per SE)
|
||||
* Gfx7 should limit num_workgroups to 508 (127 per SE)
|
||||
* Gfx6 should limit num_workgroups to 126 (63 per SE)
|
||||
*/
|
||||
if (info->gfx_level >= GFX11) {
|
||||
max_workgroups_per_se = 256;
|
||||
} else if (info->gfx_level >= GFX10 ||
|
||||
info->family == CHIP_VEGA12 || info->family == CHIP_VEGA20) {
|
||||
max_workgroups_per_se = 128;
|
||||
} else if (info->gfx_level >= GFX7 && info->family != CHIP_CARRIZO && info->family != CHIP_STONEY) {
|
||||
max_workgroups_per_se = 127;
|
||||
} else {
|
||||
max_workgroups_per_se = 63;
|
||||
}
|
||||
|
||||
/* Limit to 4 workgroups per CU for TCS, which exhausts LDS if each workgroup occupies 16KB.
|
||||
* Note that the offchip allocation isn't deallocated until the corresponding TES waves finish.
|
||||
*/
|
||||
unsigned num_offchip_wg_per_cu = 4;
|
||||
unsigned num_workgroups_per_se = MIN2(num_offchip_wg_per_cu * info->max_good_cu_per_sa *
|
||||
info->max_sa_per_se, max_workgroups_per_se);
|
||||
unsigned num_workgroups = num_workgroups_per_se * info->max_se;
|
||||
|
||||
if (info->gfx_level >= GFX11) {
|
||||
/* OFFCHIP_BUFFERING is per SE. */
|
||||
info->hs_offchip_param = S_03093C_OFFCHIP_BUFFERING_GFX103(num_workgroups_per_se - 1) |
|
||||
S_03093C_OFFCHIP_GRANULARITY_GFX103(max_hs_out_vram_dwords_enum);
|
||||
} else if (info->gfx_level >= GFX10_3) {
|
||||
info->hs_offchip_param = S_03093C_OFFCHIP_BUFFERING_GFX103(num_workgroups - 1) |
|
||||
S_03093C_OFFCHIP_GRANULARITY_GFX103(max_hs_out_vram_dwords_enum);
|
||||
} else if (info->gfx_level >= GFX7) {
|
||||
info->hs_offchip_param = S_03093C_OFFCHIP_BUFFERING_GFX7(num_workgroups -
|
||||
(info->gfx_level >= GFX8 ? 1 : 0)) |
|
||||
S_03093C_OFFCHIP_GRANULARITY_GFX7(max_hs_out_vram_dwords_enum);
|
||||
} else {
|
||||
info->hs_offchip_param = S_0089B0_OFFCHIP_BUFFERING(num_workgroups) |
|
||||
S_0089B0_OFFCHIP_GRANULARITY(max_hs_out_vram_dwords_enum);
|
||||
}
|
||||
|
||||
/* The typical size of tess factors of 1 TCS workgroup if all patches are triangles. */
|
||||
unsigned typical_tess_factor_size_per_wg = (192 / 3) * 16;
|
||||
unsigned num_tess_factor_wg_per_cu = 3;
|
||||
|
||||
info->hs_offchip_workgroup_dw_size = max_hs_out_vram_dwords_per_wg;
|
||||
info->tess_offchip_ring_size = num_workgroups * max_hs_out_vram_dwords_per_wg * 4;
|
||||
info->tess_factor_ring_size = typical_tess_factor_size_per_wg * num_tess_factor_wg_per_cu *
|
||||
info->max_good_cu_per_sa * info->max_sa_per_se * info->max_se;
|
||||
info->total_tess_ring_size = info->tess_offchip_ring_size + info->tess_factor_ring_size;
|
||||
|
||||
if (info->gfx_level >= GFX12)
|
||||
info->rt_ip_version = RT_3_1;
|
||||
else if (info->gfx_level >= GFX11)
|
||||
|
|
|
|||
|
|
@ -113,6 +113,8 @@ struct ac_cu_info {
|
|||
uint32_t max_vgpr_alloc;
|
||||
uint32_t wave64_vgpr_alloc_granularity;
|
||||
|
||||
uint32_t hs_offchip_workgroup_dw_size;
|
||||
|
||||
/* Flags */
|
||||
uint32_t has_lds_bank_count_16 : 1;
|
||||
uint32_t has_sram_ecc_enabled : 1;
|
||||
|
|
@ -167,6 +169,9 @@ struct ac_cu_info {
|
|||
uint32_t has_attr_ring : 1;
|
||||
uint32_t mesh_fast_launch_2 : 1;
|
||||
|
||||
/* GFX6-7: limit TCS workgroup to 16 patches for better performance. */
|
||||
uint32_t smaller_tcs_workgroups : 1;
|
||||
|
||||
/* Some GFX6 GPUs have a bug where it only looks at the x writemask component. */
|
||||
uint32_t has_gfx6_mrt_export_bug : 1;
|
||||
/* Pre-GFX9: A bug where the alpha component of 10_10_10_2 formats is always unsigned.*/
|
||||
|
|
@ -185,8 +190,10 @@ struct ac_cu_info {
|
|||
uint32_t has_ngg_fully_culled_bug : 1;
|
||||
/* GFX11-11.5: require wait between attribute stores and the final export. */
|
||||
uint32_t has_attr_ring_wait_bug : 1;
|
||||
/* GFX6: limit TCS workgroup to one patch if primitive ID is used. */
|
||||
uint32_t has_primid_instancing_bug : 1;
|
||||
|
||||
uint32_t reserved : 8;
|
||||
uint32_t reserved : 6;
|
||||
};
|
||||
|
||||
struct radeon_info {
|
||||
|
|
|
|||
|
|
@ -926,7 +926,7 @@ static unsigned get_tcs_wg_output_mem_size(uint32_t num_tcs_output_cp, uint32_t
|
|||
mem_one_perpatch_output * num_mem_tcs_patch_outputs;
|
||||
}
|
||||
|
||||
uint32_t ac_compute_num_tess_patches(const struct radeon_info *info, uint32_t num_tcs_input_cp,
|
||||
uint32_t ac_compute_num_tess_patches(const struct ac_cu_info *info, uint32_t num_tcs_input_cp,
|
||||
uint32_t num_tcs_output_cp, uint32_t num_mem_tcs_outputs,
|
||||
uint32_t num_mem_tcs_patch_outputs, uint32_t lds_per_patch,
|
||||
uint32_t wave_size, bool tess_uses_primid)
|
||||
|
|
@ -938,8 +938,7 @@ uint32_t ac_compute_num_tess_patches(const struct radeon_info *info, uint32_t nu
|
|||
* SWITCH_ON_EOI, which should cause IA to split instances up. However, this doesn't work
|
||||
* correctly on GFX6 when there is no other SE to switch to.
|
||||
*/
|
||||
const bool has_primid_instancing_bug = info->gfx_level == GFX6 && info->max_se == 1;
|
||||
if (has_primid_instancing_bug && tess_uses_primid)
|
||||
if (info->has_primid_instancing_bug && tess_uses_primid)
|
||||
return 1;
|
||||
|
||||
/* 256 threads per workgroup is the hw limit, but 192 performs better. */
|
||||
|
|
@ -952,7 +951,7 @@ uint32_t ac_compute_num_tess_patches(const struct radeon_info *info, uint32_t nu
|
|||
/* When distributed tessellation is unsupported, switch between SEs
|
||||
* at a higher frequency to manually balance the workload between SEs.
|
||||
*/
|
||||
if (!info->has_distributed_tess && info->max_se > 1)
|
||||
if (info->smaller_tcs_workgroups)
|
||||
num_patches = MIN2(num_patches, 16); /* recommended */
|
||||
|
||||
/* Make sure the output data fits in the offchip buffer */
|
||||
|
|
|
|||
|
|
@ -237,6 +237,8 @@ enum ac_descriptor_type
|
|||
AC_DESC_PLANE_2,
|
||||
};
|
||||
|
||||
struct ac_cu_info;
|
||||
|
||||
unsigned ac_get_spi_shader_z_format(bool writes_z, bool writes_stencil, bool writes_samplemask,
|
||||
bool writes_mrt0_alpha);
|
||||
|
||||
|
|
@ -284,7 +286,7 @@ unsigned ac_compute_lshs_workgroup_size(enum amd_gfx_level gfx_level, mesa_shade
|
|||
unsigned ac_compute_ngg_workgroup_size(unsigned es_verts, unsigned gs_inst_prims,
|
||||
unsigned max_vtx_out, unsigned prim_amp_factor);
|
||||
|
||||
uint32_t ac_compute_num_tess_patches(const struct radeon_info *info, uint32_t num_tcs_input_cp,
|
||||
uint32_t ac_compute_num_tess_patches(const struct ac_cu_info *info, uint32_t num_tcs_input_cp,
|
||||
uint32_t num_tcs_output_cp, uint32_t num_mem_tcs_outputs,
|
||||
uint32_t num_mem_tcs_patch_outputs, uint32_t lds_per_patch,
|
||||
uint32_t wave_size, bool tess_uses_primid);
|
||||
|
|
|
|||
|
|
@ -158,7 +158,7 @@ ac_nir_lower_tes_inputs_to_mem(nir_shader *shader,
|
|||
ac_nir_map_io_driver_location map);
|
||||
|
||||
void
|
||||
ac_nir_compute_tess_wg_info(const struct radeon_info *info, const ac_nir_tess_io_info *io_info,
|
||||
ac_nir_compute_tess_wg_info(const struct ac_cu_info *info, const ac_nir_tess_io_info *io_info,
|
||||
unsigned tcs_vertices_out, unsigned wave_size, bool tess_uses_primid,
|
||||
unsigned num_tcs_input_cp, unsigned lds_input_vertex_size,
|
||||
unsigned num_remapped_tess_level_outputs, unsigned *num_patches_per_wg,
|
||||
|
|
|
|||
|
|
@ -1651,7 +1651,7 @@ ac_nir_lower_tes_inputs_to_mem(nir_shader *shader,
|
|||
}
|
||||
|
||||
void
|
||||
ac_nir_compute_tess_wg_info(const struct radeon_info *info, const ac_nir_tess_io_info *io_info,
|
||||
ac_nir_compute_tess_wg_info(const struct ac_cu_info *info, const ac_nir_tess_io_info *io_info,
|
||||
unsigned tcs_vertices_out, unsigned wave_size, bool tess_uses_primid,
|
||||
unsigned num_tcs_input_cp, unsigned lds_input_vertex_size,
|
||||
unsigned num_remapped_tess_level_outputs, unsigned *num_patches_per_wg,
|
||||
|
|
@ -1668,7 +1668,7 @@ ac_nir_compute_tess_wg_info(const struct radeon_info *info, const ac_nir_tess_io
|
|||
|
||||
/* SPI_SHADER_PGM_RSRC2_HS.LDS_SIZE specifies the allocation size only for LDS. The HS offchip
|
||||
* ring buffer always uses a fixed allocation size per workgroup determined by
|
||||
* info->hs_offchip_workgroup_dw_size.
|
||||
* ac_cpu_info::hs_offchip_workgroup_dw_size.
|
||||
*
|
||||
* LDS is only used for TCS inputs (with cross-invocation or indirect access only or if TCS in/out
|
||||
* vertex counts are different) and for TCS outputs that are read (including tess level outputs
|
||||
|
|
|
|||
|
|
@ -3820,7 +3820,7 @@ radv_get_tess_wg_info(const struct radv_physical_device *pdev, const ac_nir_tess
|
|||
{
|
||||
const uint32_t lds_input_vertex_size = get_tcs_input_vertex_stride(tcs_num_lds_inputs);
|
||||
|
||||
ac_nir_compute_tess_wg_info(&pdev->info, io_info, tcs_vertices_out, pdev->ge_wave_size, false,
|
||||
ac_nir_compute_tess_wg_info(&pdev->info.cu_info, io_info, tcs_vertices_out, pdev->ge_wave_size, false,
|
||||
tcs_num_input_vertices, lds_input_vertex_size, 0, num_patches_per_wg, lds_size);
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -4709,7 +4709,7 @@ void si_update_tess_io_layout_state(struct si_context *sctx)
|
|||
unsigned num_patches, lds_size;
|
||||
|
||||
/* Compute NUM_PATCHES and LDS_SIZE. */
|
||||
ac_nir_compute_tess_wg_info(&sctx->screen->info, &tcs->info.tess_io_info,
|
||||
ac_nir_compute_tess_wg_info(&sctx->screen->info.cu_info, &tcs->info.tess_io_info,
|
||||
tcs->info.base.tess.tcs_vertices_out, ls_current->wave_size,
|
||||
tess_uses_primid, num_tcs_input_cp, lds_input_vertex_size,
|
||||
num_remapped_tess_level_outputs, &num_patches, &lds_size);
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue