radeonsi: remove a twice duplicated workaround for VERT_GRP_SIZE

This enables better lane occupancy.

Acked-by: Timur Kristóf <timur.kristof@gmail.com>
Acked-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/10813>
This commit is contained in:
Marek Olšák 2021-05-10 20:31:19 -04:00 committed by Marge Bot
parent c8e8979d6b
commit a0fcd37731
2 changed files with 13 additions and 30 deletions

View file

@ -1964,16 +1964,6 @@ bool gfx10_ngg_calculate_subgroup_info(struct si_shader *shader)
max_esverts_base = 128;
}
/* Hardware has the following non-natural restrictions on the value
* of GE_CNTL.VERT_GRP_SIZE based on the primitive type of
* the draw:
* - at most 252 for any line input primitive type
* - at most 251 for any quad input primitive type
* - at most 251 for triangle strips with adjacency (this happens to
* be the natural limit for triangle *lists* with adjacency)
*/
max_esverts_base = MIN2(max_esverts_base, 251 + max_verts_per_prim - 1);
if (gs_stage == MESA_SHADER_GEOMETRY) {
bool force_multi_cycling = false;
unsigned max_out_verts_per_gsprim = gs_sel->info.base.gs.vertices_out * gs_num_invocations;
@ -2125,18 +2115,7 @@ retry_select_mode:
}
}
/* On gfx10, the GE only checks against the maximum number of ES verts after
* allocating a full GS primitive. So we need to ensure that whenever
* this check passes, there is enough space for a full primitive without
* vertex reuse.
*/
if (gs_sel->screen->info.chip_class == GFX10 &&
!(shader->key.opt.ngg_culling & (SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST |
SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP)))
shader->ngg.hw_max_esverts = max_esverts - max_verts_per_prim + 1;
else
shader->ngg.hw_max_esverts = max_esverts;
shader->ngg.hw_max_esverts = max_esverts;
shader->ngg.max_gsprims = max_gsprims;
shader->ngg.max_out_verts = max_out_vertices;
shader->ngg.prim_amp_factor = prim_amp_factor;

View file

@ -1314,19 +1314,23 @@ static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader
S_03096C_VERT_GRP_SIZE(shader->ngg.hw_max_esverts) |
S_03096C_BREAK_WAVE_AT_EOI(break_wave_at_eoi);
/* Bug workaround for a possible hang with non-tessellation cases.
* Tessellation always sets GE_CNTL.VERT_GRP_SIZE = 0
/* On gfx10, the GE only checks against the maximum number of ES verts after
* allocating a full GS primitive. So we need to ensure that whenever
* this check passes, there is enough space for a full primitive without
* vertex reuse. VERT_GRP_SIZE=256 doesn't need this. We should always get 256
* if we have enough LDS.
*
* Requirement: GE_CNTL.VERT_GRP_SIZE = VGT_GS_ONCHIP_CNTL.ES_VERTS_PER_SUBGRP - 5
* Tessellation is unaffected because it always sets GE_CNTL.VERT_GRP_SIZE = 0.
*/
if ((sscreen->info.chip_class == GFX10) &&
(es_stage == MESA_SHADER_VERTEX || gs_stage == MESA_SHADER_VERTEX) && /* = no tess */
shader->ngg.hw_max_esverts != 256) {
shader->ngg.hw_max_esverts != 256 &&
shader->ngg.hw_max_esverts > 5) {
/* This could be based on the input primitive type. 5 is the worst case
* for primitive types with adjacency.
*/
shader->ge_cntl &= C_03096C_VERT_GRP_SIZE;
if (shader->ngg.hw_max_esverts > 5) {
shader->ge_cntl |= S_03096C_VERT_GRP_SIZE(shader->ngg.hw_max_esverts - 5);
}
shader->ge_cntl |= S_03096C_VERT_GRP_SIZE(shader->ngg.hw_max_esverts - 5);
}
}