radeonsi: disallow NGG fast launch on Navi1x because VGT_FLUSH makes it slower

This improves viewperf performance on Navi1x.

All Navi1x fast launch workarounds are removed, and all fast launch
codepaths are disabled on Navi1x (they remain enabled on gfx10.3 and newer).

Acked-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/13048>
This commit is contained in:
Marek Olšák 2021-09-26 08:45:19 -04:00 committed by Marge Bot
parent b1cf504f78
commit ccbd551192
2 changed files with 15 additions and 18 deletions

View file

@@ -358,12 +358,10 @@ void si_begin_new_gfx_cs(struct si_context *ctx, bool first_cs)
SI_CONTEXT_INV_L2 | SI_CONTEXT_START_PIPELINE_STATS;
ctx->pipeline_stats_enabled = -1;
/* We don't know if the last draw used NGG or NGG fast launch because it can be a different
* process. When switching NGG->legacy or NGG->FAST_LAUNCH, we need to flush VGT for certain
* hw generations.
/* We don't know if the last draw used NGG because it can be a different process.
* When switching NGG->legacy, we need to flush VGT for certain hw generations.
*/
if ((ctx->screen->info.has_vgt_flush_ngg_legacy_bug && !ctx->ngg) ||
(ctx->chip_class == GFX10 && ctx->ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL))
if (ctx->screen->info.has_vgt_flush_ngg_legacy_bug && !ctx->ngg)
ctx->flags |= SI_CONTEXT_VGT_FLUSH;
if (ctx->border_color_buffer) {

View file

@@ -1601,7 +1601,8 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw
}
} else {
/* Set the index buffer for fast launch. The VS prolog will load the indices. */
if (NGG && sctx->ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_INDEX_SIZE_PACKED(~0)) {
if (GFX_VERSION >= GFX10_3 && NGG &&
sctx->ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_INDEX_SIZE_PACKED(~0)) {
index_max_size = (indexbuf->width0 - index_offset) >> util_logbase2(original_index_size);
radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, si_resource(indexbuf),
@@ -2173,11 +2174,14 @@ static void si_draw_vbo(struct pipe_context *ctx,
* A draw must have at least 1 full primitive.
* The fast launch doesn't work with tessellation.
*
* Fast launch is disabled on Navi1x because enabling it requires VGT_FLUSH,
* which decreases performance by up to 10%. Only use fast launch on gfx10.3 and newer.
*
* Since NGG fast launch is enabled by VGT_SHADER_STAGES_EN, which causes a context roll,
* which decreases performance, decrease the frequency of switching it on/off using
* a high vertex count threshold.
*/
if (!HAS_TESS && total_direct_count >= 8000 &&
if (GFX_VERSION >= GFX10_3 && !HAS_TESS && total_direct_count >= 8000 &&
!(sctx->screen->debug_flags & DBG(NO_FAST_LAUNCH))) {
if (prim == PIPE_PRIM_TRIANGLES && !index_size) {
ngg_culling |= SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST;
@@ -2208,19 +2212,14 @@ static void si_draw_vbo(struct pipe_context *ctx,
return;
}
/* Insert a VGT_FLUSH when enabling fast launch changes to prevent hangs.
* See issues #2418, #2426, #2434
*
* This is the setting that is used by the draw.
/* si_update_shaders can clear the ngg_culling settings if the shader compilation hasn't
* finished.
*/
if (GFX_VERSION >= GFX10) {
if (GFX_VERSION >= GFX10 && NGG) {
uint8_t ngg_culling = si_get_vs_inline(sctx, HAS_TESS, HAS_GS)->current->key.opt.ngg_culling;
if (GFX_VERSION == GFX10 &&
!(old_ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL) &&
ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL)
sctx->flags |= SI_CONTEXT_VGT_FLUSH;
if (old_ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_INDEX_SIZE_PACKED(~0) &&
if (GFX_VERSION >= GFX10_3 &&
old_ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_INDEX_SIZE_PACKED(~0) &&
!(ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_INDEX_SIZE_PACKED(~0))) {
/* Need to re-set these, because we have bound an index buffer there. */
sctx->shader_pointers_dirty |=
@@ -2235,7 +2234,7 @@ static void si_draw_vbo(struct pipe_context *ctx,
}
/* ngg_culling can be changed after si_update_shaders above, so determine index_size here. */
if (GFX_VERSION >= GFX10 && NGG &&
if (GFX_VERSION >= GFX10_3 && NGG &&
sctx->ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_INDEX_SIZE_PACKED(~0))
index_size = 0; /* The index buffer will be emulated. */