diff --git a/src/amd/common/ac_shader_util.c b/src/amd/common/ac_shader_util.c index 6837083a51b..e80da1c692c 100644 --- a/src/amd/common/ac_shader_util.c +++ b/src/amd/common/ac_shader_util.c @@ -1031,6 +1031,96 @@ unsigned ac_compute_ngg_workgroup_size(unsigned es_verts, unsigned gs_inst_prims return CLAMP(workgroup_size, 1, 256); } +uint32_t ac_compute_num_tess_patches(const struct radeon_info *info, uint32_t num_tcs_input_cp, + uint32_t num_tcs_output_cp, uint32_t vram_per_patch, + uint32_t lds_per_patch, uint32_t wave_size, + bool tess_uses_primid) +{ + /* The VGT HS block increments the patch ID unconditionally + * within a single threadgroup. This results in incorrect + * patch IDs when instanced draws are used. + * + * The intended solution is to restrict threadgroups to + * a single instance by setting SWITCH_ON_EOI, which + * should cause IA to split instances up. However, this + * doesn't work correctly on GFX6 when there is no other + * SE to switch to. + */ + const bool has_primid_instancing_bug = info->gfx_level == GFX6 && info->max_se == 1; + if (has_primid_instancing_bug && tess_uses_primid) + return 1; + + /* Ensure that we only need 4 waves per CU, so that we don't need to check + * resource usage (such as whether we have enough VGPRs to fit the whole + * threadgroup into the CU). It also ensures that the number of tcs in and out + * vertices per threadgroup are at most 256, which is the hw limit. + */ + const unsigned max_verts_per_patch = MAX2(num_tcs_input_cp, num_tcs_output_cp); + unsigned num_patches = 256 / max_verts_per_patch; + + /* Not necessary for correctness, but higher numbers are slower. + * The hardware can do more, but we prefer fully occupied waves. + * eg. 64 triangle patches means 3 fully occupied Wave64 waves. + */ + num_patches = MIN2(num_patches, 64); + + /* When distributed tessellation is unsupported, switch between SEs + * at a higher frequency to manually balance the workload between SEs. + */ + if (!info->has_distributed_tess && info->max_se > 1) + num_patches = MIN2(num_patches, 16); /* recommended */ + + /* Make sure the output data fits in the offchip buffer */ + if (vram_per_patch) { + const uint32_t tess_offchip_block_dw_size = info->family == CHIP_HAWAII ? 4096 : 8192; + num_patches = + MIN2(num_patches, (tess_offchip_block_dw_size * 4) / vram_per_patch); + } + + /* Make sure that the data fits in LDS. This assumes the shaders only + * use LDS for the inputs and outputs. + * + * The maximum allowed LDS size is 32K. Higher numbers can hang. + * Use 16K as the maximum, so that we can fit 2 workgroups on the same CU. + */ + if (lds_per_patch) { + ASSERTED const unsigned max_lds_size = 32 * 1024; /* hw limit */ + const unsigned target_lds_size = 16 * 1024; /* target at least 2 workgroups per CU, 16K each */ + num_patches = MIN2(num_patches, target_lds_size / lds_per_patch); + assert(num_patches * lds_per_patch <= max_lds_size); + } + num_patches = MAX2(num_patches, 1); + + /* Make sure that vector lanes are fully occupied by cutting off the last wave + * if it's only partially filled. + */ + const unsigned temp_verts_per_tg = num_patches * max_verts_per_patch; + + if (temp_verts_per_tg > wave_size && + (wave_size - temp_verts_per_tg % wave_size >= MAX2(max_verts_per_patch, 8))) + num_patches = (temp_verts_per_tg & ~(wave_size - 1)) / max_verts_per_patch; + + if (info->gfx_level == GFX6) { + /* GFX6 bug workaround, related to power management. Limit LS-HS + * threadgroups to only one wave. + */ + const unsigned one_wave = wave_size / max_verts_per_patch; + num_patches = MIN2(num_patches, one_wave); + } + + return num_patches; +} + +uint32_t +ac_compute_tess_lds_size(const struct radeon_info *info, uint32_t lds_per_patch, uint32_t num_patches) +{ + const unsigned lds_size = lds_per_patch * num_patches; + + assert(lds_size <= (info->gfx_level >= GFX7 ? 65536 : 32768)); + + return align(lds_size, info->lds_encode_granularity) / info->lds_encode_granularity; +} + uint32_t ac_apply_cu_en(uint32_t value, uint32_t clear_mask, unsigned value_shift, const struct radeon_info *info) { diff --git a/src/amd/common/ac_shader_util.h b/src/amd/common/ac_shader_util.h index f36bffdfb77..e364b6b41db 100644 --- a/src/amd/common/ac_shader_util.h +++ b/src/amd/common/ac_shader_util.h @@ -197,6 +197,14 @@ unsigned ac_compute_esgs_workgroup_size(enum amd_gfx_level gfx_level, unsigned w unsigned ac_compute_ngg_workgroup_size(unsigned es_verts, unsigned gs_inst_prims, unsigned max_vtx_out, unsigned prim_amp_factor); +uint32_t ac_compute_num_tess_patches(const struct radeon_info *info, uint32_t num_tcs_input_cp, + uint32_t num_tcs_output_cp, uint32_t vram_per_patch, + uint32_t lds_per_patch, uint32_t wave_size, + bool tess_uses_primid); + +uint32_t ac_compute_tess_lds_size(const struct radeon_info *info, + uint32_t lds_per_patch, uint32_t num_patches); + uint32_t ac_apply_cu_en(uint32_t value, uint32_t clear_mask, unsigned value_shift, const struct radeon_info *info); diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.cpp b/src/gallium/drivers/radeonsi/si_state_shaders.cpp index 38bc0b3f444..497439d290d 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.cpp +++ b/src/gallium/drivers/radeonsi/si_state_shaders.cpp @@ -4515,72 +4515,11 @@ void si_update_tess_io_layout_state(struct si_context *sctx) lds_per_patch = MAX2(input_patch_size, output_patch_size); } - /* Ensure that we only need 4 waves per CU, so that we don't need to check - * resource usage (such as whether we have enough VGPRs to fit the whole - * threadgroup into the CU). It also ensures that the number of tcs in and out - * vertices per threadgroup are at most 256, which is the hw limit. - */ - unsigned max_verts_per_patch = MAX2(num_tcs_input_cp, num_tcs_output_cp); - unsigned num_patches = 256 / max_verts_per_patch; - - /* Not necessary for correctness, but higher numbers are slower. - * The hardware can do more, but the radeonsi shader constant is - * limited to 6 bits. - */ - num_patches = MIN2(num_patches, 64); /* e.g. 64 triangles in exactly 3 waves */ - - /* When distributed tessellation is unsupported, switch between SEs - * at a higher frequency to manually balance the workload between SEs. - */ - if (!sctx->screen->info.has_distributed_tess && sctx->screen->info.max_se > 1) - num_patches = MIN2(num_patches, 16); /* recommended */ - - /* Make sure the output data fits in the offchip buffer */ - num_patches = - MIN2(num_patches, (sctx->screen->hs.tess_offchip_block_dw_size * 4) / output_patch_size); - - /* Make sure that the data fits in LDS. This assumes the shaders only - * use LDS for the inputs and outputs. - * - * The maximum allowed LDS size is 32K. Higher numbers can hang. - * Use 16K as the maximum, so that we can fit 2 workgroups on the same CU. - */ - ASSERTED unsigned max_lds_size = 32 * 1024; /* hw limit */ - unsigned target_lds_size = 16 * 1024; /* target at least 2 workgroups per CU, 16K each */ - num_patches = MIN2(num_patches, target_lds_size / lds_per_patch); - num_patches = MAX2(num_patches, 1); - assert(num_patches * lds_per_patch <= max_lds_size); - - /* Make sure that vector lanes are fully occupied by cutting off the last wave - * if it's only partially filled. - */ - unsigned temp_verts_per_tg = num_patches * max_verts_per_patch; - unsigned wave_size = ls_current->wave_size; - - if (temp_verts_per_tg > wave_size && - (wave_size - temp_verts_per_tg % wave_size >= MAX2(max_verts_per_patch, 8))) - num_patches = (temp_verts_per_tg & ~(wave_size - 1)) / max_verts_per_patch; - - if (sctx->gfx_level == GFX6) { - /* GFX6 bug workaround, related to power management. Limit LS-HS - * threadgroups to only one wave. - */ - unsigned one_wave = wave_size / max_verts_per_patch; - num_patches = MIN2(num_patches, one_wave); - } - - /* The VGT HS block increments the patch ID unconditionally - * within a single threadgroup. This results in incorrect - * patch IDs when instanced draws are used. - * - * The intended solution is to restrict threadgroups to - * a single instance by setting SWITCH_ON_EOI, which - * should cause IA to split instances up. However, this - * doesn't work correctly on GFX6 when there is no other - * SE to switch to. - */ - if (has_primid_instancing_bug && tess_uses_primid) - num_patches = 1; + unsigned num_patches = + ac_compute_num_tess_patches(&sctx->screen->info, num_tcs_input_cp, + num_tcs_output_cp, output_patch_size, + lds_per_patch, ls_current->wave_size, + tess_uses_primid); if (sctx->num_patches_per_workgroup != num_patches) { sctx->num_patches_per_workgroup = num_patches; @@ -4607,15 +4546,7 @@ void si_update_tess_io_layout_state(struct si_context *sctx) (num_vs_outputs << 17) | (num_tcs_outputs << 23); /* Compute the LDS size. */ - unsigned lds_size = lds_per_patch * num_patches; - - if (sctx->gfx_level >= GFX7) { - assert(lds_size <= 65536); - lds_size = align(lds_size, 512) / 512; - } else { - assert(lds_size <= 32768); - lds_size = align(lds_size, 256) / 256; - } + unsigned lds_size = ac_compute_tess_lds_size(&sctx->screen->info, lds_per_patch, num_patches); /* We should be able to support in-shader LDS use with LLVM >= 9 * by just adding the lds_sizes together, but it has never