radeonsi: prefetch VB descriptors right after uploading

This skips the logic that sets and checks prefetch_L2_mask.

Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/8794>
Marek Olšák 2021-01-11 15:14:40 -05:00
parent d5b529ad14
commit 47587758f2
3 changed files with 16 additions and 36 deletions
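In short, the vertex-buffer descriptor prefetch moves from the deferred, mask-driven path (set SI_PREFETCH_VBO_DESCRIPTORS at upload time, check it later in the L2-prefetch emit) to an immediate si_cp_dma_prefetch call issued right after the descriptor list is uploaded. A minimal sketch of the two patterns, using simplified stand-in types (ctx_t, prefetch_l2) rather than the real si_context and si_cp_dma_prefetch, only to show the change in control flow:

   /* Simplified illustration only: ctx_t and prefetch_l2() are hypothetical stand-ins. */
   #define PREFETCH_VBO_DESCRIPTORS (1u << 0)

   struct ctx_t {
      unsigned prefetch_mask;   /* analogous to sctx->prefetch_L2_mask */
      void *vb_desc_buffer;     /* analogous to sctx->vb_descriptors_buffer */
      unsigned vb_desc_offset;
      unsigned vb_desc_size;
   };

   static void prefetch_l2(struct ctx_t *ctx, void *buf, unsigned offset, unsigned size)
   {
      (void)ctx; (void)buf; (void)offset; (void)size; /* stand-in: real code issues a CP DMA */
   }

   /* Before: uploading only set a mask bit; the prefetch happened later, when the
    * emit path checked the mask (once per shader-stage block). */
   static void upload_vb_descriptors_old(struct ctx_t *ctx)
   {
      /* ... upload the descriptor list ... */
      ctx->prefetch_mask |= PREFETCH_VBO_DESCRIPTORS;
   }

   static void emit_prefetch_old(struct ctx_t *ctx)
   {
      if (ctx->prefetch_mask & PREFETCH_VBO_DESCRIPTORS)
         prefetch_l2(ctx, ctx->vb_desc_buffer, ctx->vb_desc_offset, ctx->vb_desc_size);
   }

   /* After: the prefetch is issued immediately after the upload, so the mask bit,
    * the helper, and the per-stage checks can all be deleted. */
   static void upload_vb_descriptors_new(struct ctx_t *ctx)
   {
      /* ... upload the descriptor list ... */
      prefetch_l2(ctx, ctx->vb_desc_buffer, ctx->vb_desc_offset, ctx->vb_desc_size);
   }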

@@ -445,8 +445,6 @@ void si_begin_new_gfx_cs(struct si_context *ctx, bool first_cs)
       ctx->prefetch_L2_mask |= SI_PREFETCH_VS;
    if (ctx->queued.named.ps)
       ctx->prefetch_L2_mask |= SI_PREFETCH_PS;
-   if (ctx->vb_descriptors_buffer)
-      ctx->prefetch_L2_mask |= SI_PREFETCH_VBO_DESCRIPTORS;
 
    /* CLEAR_STATE disables all colorbuffers, so only enable bound ones. */
    bool has_clear_state = ctx->screen->info.has_clear_state;

@@ -99,7 +99,6 @@ extern "C" {
 #define SI_CONTEXT_VGT_FLUSH (1 << 15)
 #define SI_CONTEXT_VGT_STREAMOUT_SYNC (1 << 16)
 
-#define SI_PREFETCH_VBO_DESCRIPTORS (1 << 0)
 #define SI_PREFETCH_LS (1 << 1)
 #define SI_PREFETCH_HS (1 << 2)
 #define SI_PREFETCH_ES (1 << 3)

@@ -66,15 +66,6 @@ static void si_prefetch_shader_async(struct si_context *sctx, struct si_pm4_stat
    si_cp_dma_prefetch(sctx, bo, 0, bo->width0);
 }
 
-static void si_prefetch_VBO_descriptors(struct si_context *sctx)
-{
-   if (!sctx->vertex_elements->vb_desc_list_alloc_size)
-      return;
-
-   si_cp_dma_prefetch(sctx, &sctx->vb_descriptors_buffer->b.b, sctx->vb_descriptors_offset,
-                      sctx->vertex_elements->vb_desc_list_alloc_size);
-}
-
 enum si_L2_prefetch_mode {
    PREFETCH_BEFORE_DRAW = 1,
    PREFETCH_AFTER_DRAW,
@@ -82,11 +73,11 @@ enum si_L2_prefetch_mode {
 };
 
 /**
- * Prefetch shaders and VBO descriptors.
+ * Prefetch shaders.
  */
 template<chip_class GFX_VERSION, si_has_tess HAS_TESS, si_has_gs HAS_GS, si_has_ngg NGG,
          si_L2_prefetch_mode mode>
-static void si_emit_prefetch_L2(struct si_context *sctx)
+static void si_prefetch_shaders(struct si_context *sctx)
 {
    unsigned mask = sctx->prefetch_L2_mask;
@@ -101,8 +92,6 @@ static void si_emit_prefetch_L2(struct si_context *sctx)
    if (mode != PREFETCH_AFTER_DRAW) {
       if (mask & SI_PREFETCH_HS)
          si_prefetch_shader_async(sctx, sctx->queued.named.hs);
-      if (mask & SI_PREFETCH_VBO_DESCRIPTORS)
-         si_prefetch_VBO_descriptors(sctx);
 
       if (mode == PREFETCH_BEFORE_DRAW)
          return;
@@ -116,8 +105,6 @@ static void si_emit_prefetch_L2(struct si_context *sctx)
    if (mode != PREFETCH_AFTER_DRAW) {
       if (mask & SI_PREFETCH_GS)
          si_prefetch_shader_async(sctx, sctx->queued.named.gs);
-      if (mask & SI_PREFETCH_VBO_DESCRIPTORS)
-         si_prefetch_VBO_descriptors(sctx);
 
       if (mode == PREFETCH_BEFORE_DRAW)
         return;
@@ -129,8 +116,6 @@ static void si_emit_prefetch_L2(struct si_context *sctx)
    if (mode != PREFETCH_AFTER_DRAW) {
       if (mask & SI_PREFETCH_VS)
         si_prefetch_shader_async(sctx, sctx->queued.named.vs);
-      if (mask & SI_PREFETCH_VBO_DESCRIPTORS)
-         si_prefetch_VBO_descriptors(sctx);
 
       if (mode == PREFETCH_BEFORE_DRAW)
          return;
@@ -143,8 +128,6 @@ static void si_emit_prefetch_L2(struct si_context *sctx)
    if (mode != PREFETCH_AFTER_DRAW) {
       if (mask & SI_PREFETCH_LS)
          si_prefetch_shader_async(sctx, sctx->queued.named.ls);
-      if (mask & SI_PREFETCH_VBO_DESCRIPTORS)
-         si_prefetch_VBO_descriptors(sctx);
 
       if (mode == PREFETCH_BEFORE_DRAW)
          return;
@@ -162,8 +145,6 @@ static void si_emit_prefetch_L2(struct si_context *sctx)
    if (mode != PREFETCH_AFTER_DRAW) {
       if (mask & SI_PREFETCH_ES)
          si_prefetch_shader_async(sctx, sctx->queued.named.es);
-      if (mask & SI_PREFETCH_VBO_DESCRIPTORS)
-         si_prefetch_VBO_descriptors(sctx);
 
       if (mode == PREFETCH_BEFORE_DRAW)
          return;
@@ -177,8 +158,6 @@ static void si_emit_prefetch_L2(struct si_context *sctx)
    if (mode != PREFETCH_AFTER_DRAW) {
       if (mask & SI_PREFETCH_VS)
          si_prefetch_shader_async(sctx, sctx->queued.named.vs);
-      if (mask & SI_PREFETCH_VBO_DESCRIPTORS)
-         si_prefetch_VBO_descriptors(sctx);
 
       if (mode == PREFETCH_BEFORE_DRAW)
          return;
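With the descriptor bit gone, every stage block in si_prefetch_shaders reduces to the same shape: test the stage's mask bit, start the CP DMA prefetch for that shader, and return early in the BEFORE_DRAW mode. A condensed sketch of one such block (names taken from the diff above; this is not the literal template code, which repeats the block per LS/HS/ES/GS/VS depending on GFX version and enabled stages, and "stage_bit"/"shader" are placeholders for e.g. SI_PREFETCH_HS and sctx->queued.named.hs):

   /* One stage block of si_prefetch_shaders() after this change, condensed. */
   if (mode != PREFETCH_AFTER_DRAW) {
      if (mask & stage_bit)
         si_prefetch_shader_async(sctx, shader);

      if (mode == PREFETCH_BEFORE_DRAW)
         return;
   }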
@@ -1331,7 +1310,7 @@ void si_prim_discard_signal_next_compute_ib_start(struct si_context *sctx)
 }
 
 template <chip_class GFX_VERSION, si_has_tess HAS_TESS, si_has_gs HAS_GS, si_has_ngg NGG> ALWAYS_INLINE
-static bool si_upload_vertex_buffer_descriptors(struct si_context *sctx)
+static bool si_upload_and_prefetch_VB_descriptors(struct si_context *sctx)
 {
    unsigned count = sctx->num_vertex_elements;
    bool pointer_dirty, user_sgprs_dirty;
@@ -1362,10 +1341,10 @@ static bool si_upload_vertex_buffer_descriptors(struct si_context *sctx)
       sctx->vb_descriptors_gpu_list = ptr;
       radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, sctx->vb_descriptors_buffer,
                                 RADEON_USAGE_READ, RADEON_PRIO_DESCRIPTORS);
-      sctx->prefetch_L2_mask |= SI_PREFETCH_VBO_DESCRIPTORS;
+      si_cp_dma_prefetch(sctx, &sctx->vb_descriptors_buffer->b.b, sctx->vb_descriptors_offset,
+                         alloc_size);
    } else {
       si_resource_reference(&sctx->vb_descriptors_buffer, NULL);
-      sctx->prefetch_L2_mask &= ~SI_PREFETCH_VBO_DESCRIPTORS;
    }
 
    unsigned first_vb_use_mask = velems->first_vb_use_mask;
@@ -2142,8 +2121,10 @@ static void si_draw_vbo(struct pipe_context *ctx,
       sctx->emit_cache_flush(sctx, &sctx->gfx_cs);
       /* <-- CUs are idle here. */
 
-      /* This uploads VBO descriptors and sets user SGPRs. */
-      if (unlikely((!si_upload_vertex_buffer_descriptors<GFX_VERSION, HAS_TESS, HAS_GS, NGG>(sctx)))) {
+      /* This uploads VBO descriptors, sets user SGPRs, and executes the L2 prefetch.
+       * It should be done after cache flushing.
+       */
+      if (unlikely((!si_upload_and_prefetch_VB_descriptors<GFX_VERSION, HAS_TESS, HAS_GS, NGG>(sctx)))) {
          DRAW_CLEANUP;
          return;
       }
@@ -2170,7 +2151,7 @@ static void si_draw_vbo(struct pipe_context *ctx,
       /* Start prefetches after the draw has been started. Both will run
        * in parallel, but starting the draw first is more important.
        */
-      si_emit_prefetch_L2<GFX_VERSION, HAS_TESS, HAS_GS, NGG, PREFETCH_ALL>(sctx);
+      si_prefetch_shaders<GFX_VERSION, HAS_TESS, HAS_GS, NGG, PREFETCH_ALL>(sctx);
    } else {
       /* If we don't wait for idle, start prefetches first, then set
        * states, and draw at the end.
@@ -2179,10 +2160,12 @@ static void si_draw_vbo(struct pipe_context *ctx,
       sctx->emit_cache_flush(sctx, &sctx->gfx_cs);
 
       /* Only prefetch the API VS and VBO descriptors. */
-      si_emit_prefetch_L2<GFX_VERSION, HAS_TESS, HAS_GS, NGG, PREFETCH_BEFORE_DRAW>(sctx);
+      si_prefetch_shaders<GFX_VERSION, HAS_TESS, HAS_GS, NGG, PREFETCH_BEFORE_DRAW>(sctx);
 
-      /* This uploads VBO descriptors and sets user SGPRs. */
-      if (unlikely((!si_upload_vertex_buffer_descriptors<GFX_VERSION, HAS_TESS, HAS_GS, NGG>(sctx)))) {
+      /* This uploads VBO descriptors, sets user SGPRs, and executes the L2 prefetch.
+       * It should be done after cache flushing and after the VS prefetch.
+       */
+      if (unlikely((!si_upload_and_prefetch_VB_descriptors<GFX_VERSION, HAS_TESS, HAS_GS, NGG>(sctx)))) {
          DRAW_CLEANUP;
          return;
       }
@@ -2206,7 +2189,7 @@ static void si_draw_vbo(struct pipe_context *ctx,
       /* Prefetch the remaining shaders after the draw has been
        * started. */
-      si_emit_prefetch_L2<GFX_VERSION, HAS_TESS, HAS_GS, NGG, PREFETCH_AFTER_DRAW>(sctx);
+      si_prefetch_shaders<GFX_VERSION, HAS_TESS, HAS_GS, NGG, PREFETCH_AFTER_DRAW>(sctx);
    }
 
    /* Clear the context roll flag after the draw call.
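Taken together, the si_draw_vbo hunks above leave two orderings, depending on whether the draw has to wait for idle first. A rough outline, condensed from the diff; "wait_for_idle" is a stand-in for the driver's actual condition, and the real control flow is considerably more involved:

   if (wait_for_idle) {
      /* flush caches; CUs are idle here */
      /* upload VBO descriptors + immediate L2 prefetch (si_upload_and_prefetch_VB_descriptors) */
      /* emit states and the draw */
      /* prefetch all shaders after the draw has started (PREFETCH_ALL) */
   } else {
      /* flush caches */
      /* prefetch the API VS first (PREFETCH_BEFORE_DRAW) */
      /* upload VBO descriptors + immediate L2 prefetch */
      /* emit states and the draw */
      /* prefetch the remaining shaders (PREFETCH_AFTER_DRAW) */
   }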