mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2025-12-24 19:40:10 +01:00
ac,radeonsi: add helpers to compute the number of tess patches/lds size
Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/28015>
This commit is contained in:
parent
8b8d194bfb
commit
758e6d9005
3 changed files with 104 additions and 75 deletions
|
|
@ -1031,6 +1031,96 @@ unsigned ac_compute_ngg_workgroup_size(unsigned es_verts, unsigned gs_inst_prims
|
|||
return CLAMP(workgroup_size, 1, 256);
|
||||
}
|
||||
|
||||
uint32_t ac_compute_num_tess_patches(const struct radeon_info *info, uint32_t num_tcs_input_cp,
|
||||
uint32_t num_tcs_output_cp, uint32_t vram_per_patch,
|
||||
uint32_t lds_per_patch, uint32_t wave_size,
|
||||
bool tess_uses_primid)
|
||||
{
|
||||
/* The VGT HS block increments the patch ID unconditionally
|
||||
* within a single threadgroup. This results in incorrect
|
||||
* patch IDs when instanced draws are used.
|
||||
*
|
||||
* The intended solution is to restrict threadgroups to
|
||||
* a single instance by setting SWITCH_ON_EOI, which
|
||||
* should cause IA to split instances up. However, this
|
||||
* doesn't work correctly on GFX6 when there is no other
|
||||
* SE to switch to.
|
||||
*/
|
||||
const bool has_primid_instancing_bug = info->gfx_level == GFX6 && info->max_se == 1;
|
||||
if (has_primid_instancing_bug && tess_uses_primid)
|
||||
return 1;
|
||||
|
||||
/* Ensure that we only need 4 waves per CU, so that we don't need to check
|
||||
* resource usage (such as whether we have enough VGPRs to fit the whole
|
||||
* threadgroup into the CU). It also ensures that the number of tcs in and out
|
||||
* vertices per threadgroup are at most 256, which is the hw limit.
|
||||
*/
|
||||
const unsigned max_verts_per_patch = MAX2(num_tcs_input_cp, num_tcs_output_cp);
|
||||
unsigned num_patches = 256 / max_verts_per_patch;
|
||||
|
||||
/* Not necessary for correctness, but higher numbers are slower.
|
||||
* The hardware can do more, but we prefer fully occupied waves.
|
||||
* eg. 64 triangle patches means 3 fully occupied Wave64 waves.
|
||||
*/
|
||||
num_patches = MIN2(num_patches, 64);
|
||||
|
||||
/* When distributed tessellation is unsupported, switch between SEs
|
||||
* at a higher frequency to manually balance the workload between SEs.
|
||||
*/
|
||||
if (!info->has_distributed_tess && info->max_se > 1)
|
||||
num_patches = MIN2(num_patches, 16); /* recommended */
|
||||
|
||||
/* Make sure the output data fits in the offchip buffer */
|
||||
if (vram_per_patch) {
|
||||
const uint32_t tess_offchip_block_dw_size = info->family == CHIP_HAWAII ? 4096 : 8192;
|
||||
num_patches =
|
||||
MIN2(num_patches, (tess_offchip_block_dw_size * 4) / vram_per_patch);
|
||||
}
|
||||
|
||||
/* Make sure that the data fits in LDS. This assumes the shaders only
|
||||
* use LDS for the inputs and outputs.
|
||||
*
|
||||
* The maximum allowed LDS size is 32K. Higher numbers can hang.
|
||||
* Use 16K as the maximum, so that we can fit 2 workgroups on the same CU.
|
||||
*/
|
||||
if (lds_per_patch) {
|
||||
ASSERTED const unsigned max_lds_size = 32 * 1024; /* hw limit */
|
||||
const unsigned target_lds_size = 16 * 1024; /* target at least 2 workgroups per CU, 16K each */
|
||||
num_patches = MIN2(num_patches, target_lds_size / lds_per_patch);
|
||||
assert(num_patches * lds_per_patch <= max_lds_size);
|
||||
}
|
||||
num_patches = MAX2(num_patches, 1);
|
||||
|
||||
/* Make sure that vector lanes are fully occupied by cutting off the last wave
|
||||
* if it's only partially filled.
|
||||
*/
|
||||
const unsigned temp_verts_per_tg = num_patches * max_verts_per_patch;
|
||||
|
||||
if (temp_verts_per_tg > wave_size &&
|
||||
(wave_size - temp_verts_per_tg % wave_size >= MAX2(max_verts_per_patch, 8)))
|
||||
num_patches = (temp_verts_per_tg & ~(wave_size - 1)) / max_verts_per_patch;
|
||||
|
||||
if (info->gfx_level == GFX6) {
|
||||
/* GFX6 bug workaround, related to power management. Limit LS-HS
|
||||
* threadgroups to only one wave.
|
||||
*/
|
||||
const unsigned one_wave = wave_size / max_verts_per_patch;
|
||||
num_patches = MIN2(num_patches, one_wave);
|
||||
}
|
||||
|
||||
return num_patches;
|
||||
}
|
||||
|
||||
uint32_t
|
||||
ac_compute_tess_lds_size(const struct radeon_info *info, uint32_t lds_per_patch, uint32_t num_patches)
|
||||
{
|
||||
const unsigned lds_size = lds_per_patch * num_patches;
|
||||
|
||||
assert(lds_size <= (info->gfx_level >= GFX7 ? 65536 : 32768));
|
||||
|
||||
return align(lds_size, info->lds_encode_granularity) / info->lds_encode_granularity;
|
||||
}
|
||||
|
||||
uint32_t ac_apply_cu_en(uint32_t value, uint32_t clear_mask, unsigned value_shift,
|
||||
const struct radeon_info *info)
|
||||
{
|
||||
|
|
|
|||
|
|
@ -197,6 +197,14 @@ unsigned ac_compute_esgs_workgroup_size(enum amd_gfx_level gfx_level, unsigned w
|
|||
unsigned ac_compute_ngg_workgroup_size(unsigned es_verts, unsigned gs_inst_prims,
|
||||
unsigned max_vtx_out, unsigned prim_amp_factor);
|
||||
|
||||
uint32_t ac_compute_num_tess_patches(const struct radeon_info *info, uint32_t num_tcs_input_cp,
|
||||
uint32_t num_tcs_output_cp, uint32_t vram_per_patch,
|
||||
uint32_t lds_per_patch, uint32_t wave_size,
|
||||
bool tess_uses_primid);
|
||||
|
||||
uint32_t ac_compute_tess_lds_size(const struct radeon_info *info,
|
||||
uint32_t lds_per_patch, uint32_t num_patches);
|
||||
|
||||
uint32_t ac_apply_cu_en(uint32_t value, uint32_t clear_mask, unsigned value_shift,
|
||||
const struct radeon_info *info);
|
||||
|
||||
|
|
|
|||
|
|
@ -4515,72 +4515,11 @@ void si_update_tess_io_layout_state(struct si_context *sctx)
|
|||
lds_per_patch = MAX2(input_patch_size, output_patch_size);
|
||||
}
|
||||
|
||||
/* Ensure that we only need 4 waves per CU, so that we don't need to check
|
||||
* resource usage (such as whether we have enough VGPRs to fit the whole
|
||||
* threadgroup into the CU). It also ensures that the number of tcs in and out
|
||||
* vertices per threadgroup are at most 256, which is the hw limit.
|
||||
*/
|
||||
unsigned max_verts_per_patch = MAX2(num_tcs_input_cp, num_tcs_output_cp);
|
||||
unsigned num_patches = 256 / max_verts_per_patch;
|
||||
|
||||
/* Not necessary for correctness, but higher numbers are slower.
|
||||
* The hardware can do more, but the radeonsi shader constant is
|
||||
* limited to 6 bits.
|
||||
*/
|
||||
num_patches = MIN2(num_patches, 64); /* e.g. 64 triangles in exactly 3 waves */
|
||||
|
||||
/* When distributed tessellation is unsupported, switch between SEs
|
||||
* at a higher frequency to manually balance the workload between SEs.
|
||||
*/
|
||||
if (!sctx->screen->info.has_distributed_tess && sctx->screen->info.max_se > 1)
|
||||
num_patches = MIN2(num_patches, 16); /* recommended */
|
||||
|
||||
/* Make sure the output data fits in the offchip buffer */
|
||||
num_patches =
|
||||
MIN2(num_patches, (sctx->screen->hs.tess_offchip_block_dw_size * 4) / output_patch_size);
|
||||
|
||||
/* Make sure that the data fits in LDS. This assumes the shaders only
|
||||
* use LDS for the inputs and outputs.
|
||||
*
|
||||
* The maximum allowed LDS size is 32K. Higher numbers can hang.
|
||||
* Use 16K as the maximum, so that we can fit 2 workgroups on the same CU.
|
||||
*/
|
||||
ASSERTED unsigned max_lds_size = 32 * 1024; /* hw limit */
|
||||
unsigned target_lds_size = 16 * 1024; /* target at least 2 workgroups per CU, 16K each */
|
||||
num_patches = MIN2(num_patches, target_lds_size / lds_per_patch);
|
||||
num_patches = MAX2(num_patches, 1);
|
||||
assert(num_patches * lds_per_patch <= max_lds_size);
|
||||
|
||||
/* Make sure that vector lanes are fully occupied by cutting off the last wave
|
||||
* if it's only partially filled.
|
||||
*/
|
||||
unsigned temp_verts_per_tg = num_patches * max_verts_per_patch;
|
||||
unsigned wave_size = ls_current->wave_size;
|
||||
|
||||
if (temp_verts_per_tg > wave_size &&
|
||||
(wave_size - temp_verts_per_tg % wave_size >= MAX2(max_verts_per_patch, 8)))
|
||||
num_patches = (temp_verts_per_tg & ~(wave_size - 1)) / max_verts_per_patch;
|
||||
|
||||
if (sctx->gfx_level == GFX6) {
|
||||
/* GFX6 bug workaround, related to power management. Limit LS-HS
|
||||
* threadgroups to only one wave.
|
||||
*/
|
||||
unsigned one_wave = wave_size / max_verts_per_patch;
|
||||
num_patches = MIN2(num_patches, one_wave);
|
||||
}
|
||||
|
||||
/* The VGT HS block increments the patch ID unconditionally
|
||||
* within a single threadgroup. This results in incorrect
|
||||
* patch IDs when instanced draws are used.
|
||||
*
|
||||
* The intended solution is to restrict threadgroups to
|
||||
* a single instance by setting SWITCH_ON_EOI, which
|
||||
* should cause IA to split instances up. However, this
|
||||
* doesn't work correctly on GFX6 when there is no other
|
||||
* SE to switch to.
|
||||
*/
|
||||
if (has_primid_instancing_bug && tess_uses_primid)
|
||||
num_patches = 1;
|
||||
unsigned num_patches =
|
||||
ac_compute_num_tess_patches(&sctx->screen->info, num_tcs_input_cp,
|
||||
num_tcs_output_cp, output_patch_size,
|
||||
lds_per_patch, ls_current->wave_size,
|
||||
tess_uses_primid);
|
||||
|
||||
if (sctx->num_patches_per_workgroup != num_patches) {
|
||||
sctx->num_patches_per_workgroup = num_patches;
|
||||
|
|
@ -4607,15 +4546,7 @@ void si_update_tess_io_layout_state(struct si_context *sctx)
|
|||
(num_vs_outputs << 17) | (num_tcs_outputs << 23);
|
||||
|
||||
/* Compute the LDS size. */
|
||||
unsigned lds_size = lds_per_patch * num_patches;
|
||||
|
||||
if (sctx->gfx_level >= GFX7) {
|
||||
assert(lds_size <= 65536);
|
||||
lds_size = align(lds_size, 512) / 512;
|
||||
} else {
|
||||
assert(lds_size <= 32768);
|
||||
lds_size = align(lds_size, 256) / 256;
|
||||
}
|
||||
unsigned lds_size = ac_compute_tess_lds_size(&sctx->screen->info, lds_per_patch, num_patches);
|
||||
|
||||
/* We should be able to support in-shader LDS use with LLVM >= 9
|
||||
* by just adding the lds_sizes together, but it has never
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue