radeonsi: add SI_CONTEXT_PFP_SYNC_ME to skip syncing PFP for image operations

DCC/CMASK/HTILE clears don't set this, and neither do image copies.
We could do a better job of not setting it in other cases too.

Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/9795>
Author: Marek Olšák, 2021-03-21 15:26:17 -04:00 (committed by Marge Bot)
Parent: 4fb1b7b2d8
Commit: c53261645d
8 changed files with 35 additions and 29 deletions
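
For orientation (this note and the sketch are not part of the commit): the pattern throughout is that call sites which can make the CP's prefetch parser (PFP) consume data written by shaders or by the ME now OR SI_CONTEXT_PFP_SYNC_ME into sctx->flags next to their partial-flush flags, and the cache-flush emitters translate that flag into an explicit PFP/ME synchronization. A minimal standalone sketch of the idea, using the real SI_CONTEXT_* bit values but otherwise simplified stand-in types and functions:

   #include <stdbool.h>
   #include <stdio.h>

   /* Stand-in for struct si_context; only the flags word matters here. */
   struct ctx { unsigned flags; };

   #define SI_CONTEXT_CS_PARTIAL_FLUSH (1 << 14) /* real value, see si_pipe.h below */
   #define SI_CONTEXT_PFP_SYNC_ME      (1 << 17) /* real value, added by this commit */

   /* Caller side: request the compute flush, and ask for a PFP stall only
    * when the CP might later fetch the written data (buffers, e.g. index
    * or indirect arguments). Image operations skip the stall, per this
    * commit. */
   static void sync_cs_before(struct ctx *c, bool is_image_op)
   {
      c->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH;
      if (!is_image_op)
         c->flags |= SI_CONTEXT_PFP_SYNC_ME;
   }

   /* Emitter side: emit the PFP_SYNC_ME packet only when it was requested,
    * instead of inferring the need from cache-flush flags. */
   static void emit_cache_flush(struct ctx *c)
   {
      if (c->flags & SI_CONTEXT_PFP_SYNC_ME)
         puts("PKT3_PFP_SYNC_ME"); /* stands in for the radeon_emit() calls */
      c->flags = 0;
   }

   int main(void)
   {
      struct ctx c = {0};
      sync_cs_before(&c, true);   /* image op: no PFP stall */
      emit_cache_flush(&c);
      sync_cs_before(&c, false);  /* buffer op: PFP stall requested */
      emit_cache_flush(&c);
      return 0;
   }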

@@ -70,6 +70,9 @@ void si_launch_grid_internal(struct si_context *sctx, struct pipe_grid_info *inf
    if (flags & SI_OP_SYNC_CS_BEFORE)
       sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH;
 
+   if (!(flags & SI_OP_CS_IMAGE))
+      sctx->flags |= SI_CONTEXT_PFP_SYNC_ME;
+
    /* Invalidate L0-L1 caches. */
    /* sL0 is never invalidated, because src resources don't use it. */
    if (!(flags & SI_OP_SKIP_CACHE_INV_BEFORE))
@@ -107,7 +110,7 @@ void si_launch_grid_internal(struct si_context *sctx, struct pipe_grid_info *inf
          sctx->flags |= SI_CONTEXT_INV_VCACHE;
       } else {
          /* Make sure buffer stores are visible to all CUs. */
-         sctx->flags |= SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE;
+         sctx->flags |= SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE | SI_CONTEXT_PFP_SYNC_ME;
       }
    }
 }

@@ -197,10 +197,10 @@ void si_cp_dma_clear_buffer(struct si_context *sctx, struct radeon_cmdbuf *cs,
    assert(size && size % 4 == 0);
 
    if (user_flags & SI_OP_SYNC_CS_BEFORE)
-      sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH;
+      sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH | SI_CONTEXT_PFP_SYNC_ME;
 
    if (user_flags & SI_OP_SYNC_PS_BEFORE)
-      sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH;
+      sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_PFP_SYNC_ME;
 
    /* Mark the buffer range of destination as valid (initialized),
     * so that transfer_map knows it should wait for the GPU when mapping
@@ -340,10 +340,10 @@ void si_cp_dma_copy_buffer(struct si_context *sctx, struct pipe_resource *dst,
    }
 
    if (user_flags & SI_OP_SYNC_CS_BEFORE)
-      sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH;
+      sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH | SI_CONTEXT_PFP_SYNC_ME;
 
    if (user_flags & SI_OP_SYNC_PS_BEFORE)
-      sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH;
+      sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_PFP_SYNC_ME;
 
    if ((dst || src) && !(user_flags & SI_OP_SKIP_CACHE_INV_BEFORE))
       sctx->flags |= si_get_flush_flags(sctx, coher, cache_policy);

@@ -569,6 +569,8 @@ void si_emit_surface_sync(struct si_context *sctx, struct radeon_cmdbuf *cs, uns
    assert(sctx->chip_class <= GFX9);
 
+   cp_coher_cntl |= 1u << 31; /* don't sync PFP, i.e. execute the sync in ME */
+
    radeon_begin(cs);
 
    if (sctx->chip_class == GFX9 || compute_ib) {
@@ -749,21 +751,22 @@ void gfx10_emit_cache_flush(struct si_context *ctx, struct radeon_cmdbuf *cs)
 
    /* Ignore fields that only modify the behavior of other fields. */
    if (gcr_cntl & C_586_GL1_RANGE & C_586_GL2_RANGE & C_586_SEQ) {
+      unsigned dont_sync_pfp = (!(flags & SI_CONTEXT_PFP_SYNC_ME)) << 31;
+
       /* Flush caches and wait for the caches to assert idle.
        * The cache flush is executed in the ME, but the PFP waits
        * for completion.
        */
       radeon_emit(cs, PKT3(PKT3_ACQUIRE_MEM, 6, 0));
-      radeon_emit(cs, 0);             /* CP_COHER_CNTL */
+      radeon_emit(cs, dont_sync_pfp); /* CP_COHER_CNTL */
       radeon_emit(cs, 0xffffffff);    /* CP_COHER_SIZE */
       radeon_emit(cs, 0xffffff);      /* CP_COHER_SIZE_HI */
       radeon_emit(cs, 0);             /* CP_COHER_BASE */
       radeon_emit(cs, 0);             /* CP_COHER_BASE_HI */
       radeon_emit(cs, 0x0000000A);    /* POLL_INTERVAL */
       radeon_emit(cs, gcr_cntl);      /* GCR_CNTL */
-   } else if (cb_db_event || (flags & (SI_CONTEXT_VS_PARTIAL_FLUSH | SI_CONTEXT_PS_PARTIAL_FLUSH |
-                                       SI_CONTEXT_CS_PARTIAL_FLUSH))) {
-      /* We need to ensure that PFP waits as well. */
+   } else if (flags & SI_CONTEXT_PFP_SYNC_ME) {
+      /* Synchronize PFP with ME. (this stalls PFP) */
       radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
       radeon_emit(cs, 0);
    }
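
One detail worth calling out (my summary, not the author's text): on GFX10 the PFP wait is folded into ACQUIRE_MEM itself. Bit 31 of the CP_COHER_CNTL dword means "execute the sync entirely in the ME, don't stall the PFP", so the hunk sets it exactly when SI_CONTEXT_PFP_SYNC_ME is absent. A condensed restatement of the two paths; needs_acquire_mem and the emit_* helpers are hypothetical shorthands for the conditions and radeon_emit() sequences above:

   if (needs_acquire_mem) {
      /* Bit 31 set:   ACQUIRE_MEM runs in ME only, the PFP keeps going.
       * Bit 31 clear: the PFP also waits for the sync to complete. */
      unsigned cp_coher_cntl = (!(flags & SI_CONTEXT_PFP_SYNC_ME)) << 31;
      emit_acquire_mem(cs, cp_coher_cntl, gcr_cntl); /* hypothetical helper */
   } else if (flags & SI_CONTEXT_PFP_SYNC_ME) {
      /* No cache maintenance needed, but the PFP must still wait for ME. */
      emit_pfp_sync_me(cs);                          /* hypothetical helper */
   }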
@@ -953,23 +956,11 @@ void si_emit_cache_flush(struct si_context *sctx, struct radeon_cmdbuf *cs)
       si_cp_wait_mem(sctx, cs, va, sctx->wait_mem_number, 0xffffffff, WAIT_REG_MEM_EQUAL);
    }
 
-   /* Make sure ME is idle (it executes most packets) before continuing.
-    * This prevents read-after-write hazards between PFP and ME.
-    */
-   if (sctx->has_graphics &&
-       (cp_coher_cntl || (flags & (SI_CONTEXT_CS_PARTIAL_FLUSH | SI_CONTEXT_INV_VCACHE |
-                                   SI_CONTEXT_INV_L2 | SI_CONTEXT_WB_L2)))) {
-      radeon_begin(cs);
-      radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
-      radeon_emit(cs, 0);
-      radeon_end();
-   }
-
    /* GFX6-GFX8 only:
    * When one of the CP_COHER_CNTL.DEST_BASE flags is set, SURFACE_SYNC
    * waits for idle, so it should be last. SURFACE_SYNC is done in PFP.
    *
-   * cp_coher_cntl should contain all necessary flags except TC flags
+   * cp_coher_cntl should contain all necessary flags except TC and PFP flags
    * at this point.
    *
    * GFX6-GFX7 don't support L2 write-back.
@@ -1011,6 +1002,13 @@ void si_emit_cache_flush(struct si_context *sctx, struct radeon_cmdbuf *cs)
    if (cp_coher_cntl)
       si_emit_surface_sync(sctx, cs, cp_coher_cntl);
 
+   if (flags & SI_CONTEXT_PFP_SYNC_ME) {
+      radeon_begin(cs);
+      radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
+      radeon_emit(cs, 0);
+      radeon_end();
+   }
+
    if (is_barrier)
       si_prim_discard_signal_next_compute_ib_start(sctx);
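
Taken together, the two hunks above change the pre-GFX10 policy from guessing to an explicit request. A side-by-side condensation (conditions copied from the hunks; emit_pfp_sync_me is a hypothetical shorthand for the four radeon_* calls):

   /* Before: the PFP stall was inferred from unrelated state. */
   if (sctx->has_graphics &&
       (cp_coher_cntl || (flags & (SI_CONTEXT_CS_PARTIAL_FLUSH | SI_CONTEXT_INV_VCACHE |
                                   SI_CONTEXT_INV_L2 | SI_CONTEXT_WB_L2))))
      emit_pfp_sync_me(cs);

   /* After: the PFP stall happens only when a caller explicitly asked for it. */
   if (flags & SI_CONTEXT_PFP_SYNC_ME)
      emit_pfp_sync_me(cs);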

@@ -98,6 +98,9 @@ extern "C" {
 #define SI_CONTEXT_CS_PARTIAL_FLUSH   (1 << 14)
 #define SI_CONTEXT_VGT_FLUSH          (1 << 15)
 #define SI_CONTEXT_VGT_STREAMOUT_SYNC (1 << 16)
+/* PFP waits for ME to finish. Used to sync for index and indirect buffers and render
+ * condition. It's typically set when doing a VS/PS/CS partial flush for buffers. */
+#define SI_CONTEXT_PFP_SYNC_ME        (1 << 17)
 
 #define SI_PREFETCH_LS (1 << 1)
 #define SI_PREFETCH_HS (1 << 2)
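
The comment above is the contract for the new bit. As a concrete illustration (a hypothetical caller, not code from this commit): when a compute shader writes an indirect-draw argument buffer, the PFP itself fetches those arguments, so the CS flush must be paired with a PFP stall:

   /* Hypothetical caller: a compute shader filled an indirect-argument
    * buffer that the PFP will fetch for the next draw. */
   sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH | /* wait for the CS stores   */
                  SI_CONTEXT_PFP_SYNC_ME;       /* ...and stall PFP on them */
   sctx->emit_cache_flush(sctx, cs);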

@@ -1582,7 +1582,7 @@ static void si_query_hw_get_result_resource(struct si_context *sctx, struct si_q
       }
 
       sctx->b.launch_grid(&sctx->b, &grid);
-      sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH;
+      sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH | SI_CONTEXT_PFP_SYNC_ME;
    }
 
    si_restore_qbo_state(sctx, &saved_state);

@@ -383,7 +383,7 @@ si_thread_trace_start(struct si_context *sctx, int family, struct radeon_cmdbuf
    sctx->flags |=
       SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH |
       SI_CONTEXT_INV_ICACHE | SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE |
-      SI_CONTEXT_INV_L2;
+      SI_CONTEXT_INV_L2 | SI_CONTEXT_PFP_SYNC_ME;
    sctx->emit_cache_flush(sctx, cs);
 
    si_inhibit_clockgating(sctx, cs, true);
@@ -426,7 +426,7 @@ si_thread_trace_stop(struct si_context *sctx, int family, struct radeon_cmdbuf *
    sctx->flags |=
       SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH |
       SI_CONTEXT_INV_ICACHE | SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE |
-      SI_CONTEXT_INV_L2;
+      SI_CONTEXT_INV_L2 | SI_CONTEXT_PFP_SYNC_ME;
    sctx->emit_cache_flush(sctx, cs);
 
    si_emit_thread_trace_stop(sctx, cs, family);

@@ -5001,7 +5001,8 @@ static void si_memory_barrier(struct pipe_context *ctx, unsigned flags)
 
    /* Subsequent commands must wait for all shader invocations to
     * complete. */
-   sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH;
+   sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH |
+                  SI_CONTEXT_PFP_SYNC_ME;
 
    if (flags & PIPE_BARRIER_CONSTANT_BUFFER)
       sctx->flags |= SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE;

@@ -112,7 +112,7 @@ static void si_set_streamout_targets(struct pipe_context *ctx, unsigned num_targ
 
       /* The BUFFER_FILLED_SIZE is written using a PS_DONE event. */
       if (sctx->screen->use_ngg_streamout) {
-         sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH;
+         sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_PFP_SYNC_ME;
 
          /* Wait now. This is needed to make sure that GDS is not
           * busy at the end of IBs.
@@ -122,7 +122,7 @@ static void si_set_streamout_targets(struct pipe_context *ctx, unsigned num_targ
          */
         wait_now = true;
      } else {
-         sctx->flags |= SI_CONTEXT_VS_PARTIAL_FLUSH;
+         sctx->flags |= SI_CONTEXT_VS_PARTIAL_FLUSH | SI_CONTEXT_PFP_SYNC_ME;
      }
   }
}
@@ -133,7 +133,8 @@ static void si_set_streamout_targets(struct pipe_context *ctx, unsigned num_targ
       if (sctx->screen->use_ngg_streamout)
          si_allocate_gds(sctx);
 
-      sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH;
+      sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH |
+                     SI_CONTEXT_PFP_SYNC_ME;
    }
 
    /* Streamout buffers must be bound in 2 places: