radeonsi: fix the fast launch vert/prim thread counts if they are trimmed

This fixes the case where the vertex and primitive counts got out of sync
because only one of them was decreased.

Acked-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/10813>
This commit is contained in:
Marek Olšák 2021-05-10 06:53:46 -04:00 committed by Marge Bot
parent f6e19fd831
commit c8e8979d6b
2 changed files with 28 additions and 6 deletions

View file

@ -2103,12 +2103,36 @@ retry_select_mode:
prim_amp_factor = gs_sel->info.base.gs.vertices_out;
}
/* Fix up the thread counts for fast launch. */
if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST) {
/* The vertex count must be a multiple of 3. */
max_esverts -= max_esverts % 3;
/* We can only decrease the size, not increase it. */
if (max_gsprims * 3 < max_esverts) {
max_esverts = max_gsprims * 3;
} else {
max_gsprims = max_esverts / 3;
}
} else if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP) {
/* The primitive count must be even to get correct winding for triangle strips. */
max_gsprims &= ~1;
if (max_gsprims - 2 < max_esverts) {
max_esverts = max_gsprims + 2;
} else {
max_gsprims = max_esverts - 2;
max_gsprims &= ~1;
max_esverts = max_gsprims + 2;
}
}
/* On gfx10, the GE only checks against the maximum number of ES verts after
* allocating a full GS primitive. So we need to ensure that whenever
* this check passes, there is enough space for a full primitive without
* vertex reuse.
*/
if (gs_sel->screen->info.chip_class == GFX10)
if (gs_sel->screen->info.chip_class == GFX10 &&
!(shader->key.opt.ngg_culling & (SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST |
SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP)))
shader->ngg.hw_max_esverts = max_esverts - max_verts_per_prim + 1;
else
shader->ngg.hw_max_esverts = max_esverts;

View file

@ -1305,12 +1305,10 @@ static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader
shader->ctx_reg.ngg.ge_pc_alloc = S_030980_OVERSUB_EN(sscreen->info.use_late_alloc) |
S_030980_NUM_PC_LINES(oversub_pc_lines - 1);
if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST) {
if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST ||
shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP) {
shader->ge_cntl = S_03096C_PRIM_GRP_SIZE(shader->ngg.max_gsprims) |
S_03096C_VERT_GRP_SIZE(shader->ngg.max_gsprims * 3);
} else if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP) {
shader->ge_cntl = S_03096C_PRIM_GRP_SIZE(shader->ngg.max_gsprims) |
S_03096C_VERT_GRP_SIZE(shader->ngg.max_gsprims + 2);
S_03096C_VERT_GRP_SIZE(shader->ngg.hw_max_esverts);
} else {
shader->ge_cntl = S_03096C_PRIM_GRP_SIZE(shader->ngg.max_gsprims) |
S_03096C_VERT_GRP_SIZE(shader->ngg.hw_max_esverts) |