mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-05-08 17:48:10 +02:00
radeonsi: add SI_CONTEXT_PFP_SYNC_ME to skip syncing PFP for image operations
DCC/CMASK/HTILE clears will not set this. We could do a better job at not setting this in other cases too. Image copies also don't set this. Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/9795>
This commit is contained in:
parent
4fb1b7b2d8
commit
c53261645d
8 changed files with 35 additions and 29 deletions
|
|
@ -70,6 +70,9 @@ void si_launch_grid_internal(struct si_context *sctx, struct pipe_grid_info *inf
|
||||||
if (flags & SI_OP_SYNC_CS_BEFORE)
|
if (flags & SI_OP_SYNC_CS_BEFORE)
|
||||||
sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH;
|
sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH;
|
||||||
|
|
||||||
|
if (!(flags & SI_OP_CS_IMAGE))
|
||||||
|
sctx->flags |= SI_CONTEXT_PFP_SYNC_ME;
|
||||||
|
|
||||||
/* Invalidate L0-L1 caches. */
|
/* Invalidate L0-L1 caches. */
|
||||||
/* sL0 is never invalidated, because src resources don't use it. */
|
/* sL0 is never invalidated, because src resources don't use it. */
|
||||||
if (!(flags & SI_OP_SKIP_CACHE_INV_BEFORE))
|
if (!(flags & SI_OP_SKIP_CACHE_INV_BEFORE))
|
||||||
|
|
@ -107,7 +110,7 @@ void si_launch_grid_internal(struct si_context *sctx, struct pipe_grid_info *inf
|
||||||
sctx->flags |= SI_CONTEXT_INV_VCACHE;
|
sctx->flags |= SI_CONTEXT_INV_VCACHE;
|
||||||
} else {
|
} else {
|
||||||
/* Make sure buffer stores are visible to all CUs. */
|
/* Make sure buffer stores are visible to all CUs. */
|
||||||
sctx->flags |= SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE;
|
sctx->flags |= SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE | SI_CONTEXT_PFP_SYNC_ME;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -197,10 +197,10 @@ void si_cp_dma_clear_buffer(struct si_context *sctx, struct radeon_cmdbuf *cs,
|
||||||
assert(size && size % 4 == 0);
|
assert(size && size % 4 == 0);
|
||||||
|
|
||||||
if (user_flags & SI_OP_SYNC_CS_BEFORE)
|
if (user_flags & SI_OP_SYNC_CS_BEFORE)
|
||||||
sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH;
|
sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH | SI_CONTEXT_PFP_SYNC_ME;
|
||||||
|
|
||||||
if (user_flags & SI_OP_SYNC_PS_BEFORE)
|
if (user_flags & SI_OP_SYNC_PS_BEFORE)
|
||||||
sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH;
|
sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_PFP_SYNC_ME;
|
||||||
|
|
||||||
/* Mark the buffer range of destination as valid (initialized),
|
/* Mark the buffer range of destination as valid (initialized),
|
||||||
* so that transfer_map knows it should wait for the GPU when mapping
|
* so that transfer_map knows it should wait for the GPU when mapping
|
||||||
|
|
@ -340,10 +340,10 @@ void si_cp_dma_copy_buffer(struct si_context *sctx, struct pipe_resource *dst,
|
||||||
}
|
}
|
||||||
|
|
||||||
if (user_flags & SI_OP_SYNC_CS_BEFORE)
|
if (user_flags & SI_OP_SYNC_CS_BEFORE)
|
||||||
sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH;
|
sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH | SI_CONTEXT_PFP_SYNC_ME;
|
||||||
|
|
||||||
if (user_flags & SI_OP_SYNC_PS_BEFORE)
|
if (user_flags & SI_OP_SYNC_PS_BEFORE)
|
||||||
sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH;
|
sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_PFP_SYNC_ME;
|
||||||
|
|
||||||
if ((dst || src) && !(user_flags & SI_OP_SKIP_CACHE_INV_BEFORE))
|
if ((dst || src) && !(user_flags & SI_OP_SKIP_CACHE_INV_BEFORE))
|
||||||
sctx->flags |= si_get_flush_flags(sctx, coher, cache_policy);
|
sctx->flags |= si_get_flush_flags(sctx, coher, cache_policy);
|
||||||
|
|
|
||||||
|
|
@ -569,6 +569,8 @@ void si_emit_surface_sync(struct si_context *sctx, struct radeon_cmdbuf *cs, uns
|
||||||
|
|
||||||
assert(sctx->chip_class <= GFX9);
|
assert(sctx->chip_class <= GFX9);
|
||||||
|
|
||||||
|
cp_coher_cntl |= 1u << 31; /* don't sync PFP, i.e. execute the sync in ME */
|
||||||
|
|
||||||
radeon_begin(cs);
|
radeon_begin(cs);
|
||||||
|
|
||||||
if (sctx->chip_class == GFX9 || compute_ib) {
|
if (sctx->chip_class == GFX9 || compute_ib) {
|
||||||
|
|
@ -749,21 +751,22 @@ void gfx10_emit_cache_flush(struct si_context *ctx, struct radeon_cmdbuf *cs)
|
||||||
|
|
||||||
/* Ignore fields that only modify the behavior of other fields. */
|
/* Ignore fields that only modify the behavior of other fields. */
|
||||||
if (gcr_cntl & C_586_GL1_RANGE & C_586_GL2_RANGE & C_586_SEQ) {
|
if (gcr_cntl & C_586_GL1_RANGE & C_586_GL2_RANGE & C_586_SEQ) {
|
||||||
|
unsigned dont_sync_pfp = (!(flags & SI_CONTEXT_PFP_SYNC_ME)) << 31;
|
||||||
|
|
||||||
/* Flush caches and wait for the caches to assert idle.
|
/* Flush caches and wait for the caches to assert idle.
|
||||||
* The cache flush is executed in the ME, but the PFP waits
|
* The cache flush is executed in the ME, but the PFP waits
|
||||||
* for completion.
|
* for completion.
|
||||||
*/
|
*/
|
||||||
radeon_emit(cs, PKT3(PKT3_ACQUIRE_MEM, 6, 0));
|
radeon_emit(cs, PKT3(PKT3_ACQUIRE_MEM, 6, 0));
|
||||||
radeon_emit(cs, 0); /* CP_COHER_CNTL */
|
radeon_emit(cs, dont_sync_pfp); /* CP_COHER_CNTL */
|
||||||
radeon_emit(cs, 0xffffffff); /* CP_COHER_SIZE */
|
radeon_emit(cs, 0xffffffff); /* CP_COHER_SIZE */
|
||||||
radeon_emit(cs, 0xffffff); /* CP_COHER_SIZE_HI */
|
radeon_emit(cs, 0xffffff); /* CP_COHER_SIZE_HI */
|
||||||
radeon_emit(cs, 0); /* CP_COHER_BASE */
|
radeon_emit(cs, 0); /* CP_COHER_BASE */
|
||||||
radeon_emit(cs, 0); /* CP_COHER_BASE_HI */
|
radeon_emit(cs, 0); /* CP_COHER_BASE_HI */
|
||||||
radeon_emit(cs, 0x0000000A); /* POLL_INTERVAL */
|
radeon_emit(cs, 0x0000000A); /* POLL_INTERVAL */
|
||||||
radeon_emit(cs, gcr_cntl); /* GCR_CNTL */
|
radeon_emit(cs, gcr_cntl); /* GCR_CNTL */
|
||||||
} else if (cb_db_event || (flags & (SI_CONTEXT_VS_PARTIAL_FLUSH | SI_CONTEXT_PS_PARTIAL_FLUSH |
|
} else if (flags & SI_CONTEXT_PFP_SYNC_ME) {
|
||||||
SI_CONTEXT_CS_PARTIAL_FLUSH))) {
|
/* Synchronize PFP with ME. (this stalls PFP) */
|
||||||
/* We need to ensure that PFP waits as well. */
|
|
||||||
radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
|
radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
|
||||||
radeon_emit(cs, 0);
|
radeon_emit(cs, 0);
|
||||||
}
|
}
|
||||||
|
|
@ -953,23 +956,11 @@ void si_emit_cache_flush(struct si_context *sctx, struct radeon_cmdbuf *cs)
|
||||||
si_cp_wait_mem(sctx, cs, va, sctx->wait_mem_number, 0xffffffff, WAIT_REG_MEM_EQUAL);
|
si_cp_wait_mem(sctx, cs, va, sctx->wait_mem_number, 0xffffffff, WAIT_REG_MEM_EQUAL);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Make sure ME is idle (it executes most packets) before continuing.
|
|
||||||
* This prevents read-after-write hazards between PFP and ME.
|
|
||||||
*/
|
|
||||||
if (sctx->has_graphics &&
|
|
||||||
(cp_coher_cntl || (flags & (SI_CONTEXT_CS_PARTIAL_FLUSH | SI_CONTEXT_INV_VCACHE |
|
|
||||||
SI_CONTEXT_INV_L2 | SI_CONTEXT_WB_L2)))) {
|
|
||||||
radeon_begin(cs);
|
|
||||||
radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
|
|
||||||
radeon_emit(cs, 0);
|
|
||||||
radeon_end();
|
|
||||||
}
|
|
||||||
|
|
||||||
/* GFX6-GFX8 only:
|
/* GFX6-GFX8 only:
|
||||||
* When one of the CP_COHER_CNTL.DEST_BASE flags is set, SURFACE_SYNC
|
* When one of the CP_COHER_CNTL.DEST_BASE flags is set, SURFACE_SYNC
|
||||||
* waits for idle, so it should be last. SURFACE_SYNC is done in PFP.
|
* waits for idle, so it should be last. SURFACE_SYNC is done in PFP.
|
||||||
*
|
*
|
||||||
* cp_coher_cntl should contain all necessary flags except TC flags
|
* cp_coher_cntl should contain all necessary flags except TC and PFP flags
|
||||||
* at this point.
|
* at this point.
|
||||||
*
|
*
|
||||||
* GFX6-GFX7 don't support L2 write-back.
|
* GFX6-GFX7 don't support L2 write-back.
|
||||||
|
|
@ -1011,6 +1002,13 @@ void si_emit_cache_flush(struct si_context *sctx, struct radeon_cmdbuf *cs)
|
||||||
if (cp_coher_cntl)
|
if (cp_coher_cntl)
|
||||||
si_emit_surface_sync(sctx, cs, cp_coher_cntl);
|
si_emit_surface_sync(sctx, cs, cp_coher_cntl);
|
||||||
|
|
||||||
|
if (flags & SI_CONTEXT_PFP_SYNC_ME) {
|
||||||
|
radeon_begin(cs);
|
||||||
|
radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
|
||||||
|
radeon_emit(cs, 0);
|
||||||
|
radeon_end();
|
||||||
|
}
|
||||||
|
|
||||||
if (is_barrier)
|
if (is_barrier)
|
||||||
si_prim_discard_signal_next_compute_ib_start(sctx);
|
si_prim_discard_signal_next_compute_ib_start(sctx);
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -98,6 +98,9 @@ extern "C" {
|
||||||
#define SI_CONTEXT_CS_PARTIAL_FLUSH (1 << 14)
|
#define SI_CONTEXT_CS_PARTIAL_FLUSH (1 << 14)
|
||||||
#define SI_CONTEXT_VGT_FLUSH (1 << 15)
|
#define SI_CONTEXT_VGT_FLUSH (1 << 15)
|
||||||
#define SI_CONTEXT_VGT_STREAMOUT_SYNC (1 << 16)
|
#define SI_CONTEXT_VGT_STREAMOUT_SYNC (1 << 16)
|
||||||
|
/* PFP waits for ME to finish. Used to sync for index and indirect buffers and render
|
||||||
|
* condition. It's typically set when doing a VS/PS/CS partial flush for buffers. */
|
||||||
|
#define SI_CONTEXT_PFP_SYNC_ME (1 << 17)
|
||||||
|
|
||||||
#define SI_PREFETCH_LS (1 << 1)
|
#define SI_PREFETCH_LS (1 << 1)
|
||||||
#define SI_PREFETCH_HS (1 << 2)
|
#define SI_PREFETCH_HS (1 << 2)
|
||||||
|
|
|
||||||
|
|
@ -1582,7 +1582,7 @@ static void si_query_hw_get_result_resource(struct si_context *sctx, struct si_q
|
||||||
}
|
}
|
||||||
|
|
||||||
sctx->b.launch_grid(&sctx->b, &grid);
|
sctx->b.launch_grid(&sctx->b, &grid);
|
||||||
sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH;
|
sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH | SI_CONTEXT_PFP_SYNC_ME;
|
||||||
}
|
}
|
||||||
|
|
||||||
si_restore_qbo_state(sctx, &saved_state);
|
si_restore_qbo_state(sctx, &saved_state);
|
||||||
|
|
|
||||||
|
|
@ -383,7 +383,7 @@ si_thread_trace_start(struct si_context *sctx, int family, struct radeon_cmdbuf
|
||||||
sctx->flags |=
|
sctx->flags |=
|
||||||
SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH |
|
SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH |
|
||||||
SI_CONTEXT_INV_ICACHE | SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE |
|
SI_CONTEXT_INV_ICACHE | SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE |
|
||||||
SI_CONTEXT_INV_L2;
|
SI_CONTEXT_INV_L2 | SI_CONTEXT_PFP_SYNC_ME;
|
||||||
sctx->emit_cache_flush(sctx, cs);
|
sctx->emit_cache_flush(sctx, cs);
|
||||||
|
|
||||||
si_inhibit_clockgating(sctx, cs, true);
|
si_inhibit_clockgating(sctx, cs, true);
|
||||||
|
|
@ -426,7 +426,7 @@ si_thread_trace_stop(struct si_context *sctx, int family, struct radeon_cmdbuf *
|
||||||
sctx->flags |=
|
sctx->flags |=
|
||||||
SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH |
|
SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH |
|
||||||
SI_CONTEXT_INV_ICACHE | SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE |
|
SI_CONTEXT_INV_ICACHE | SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE |
|
||||||
SI_CONTEXT_INV_L2;
|
SI_CONTEXT_INV_L2 | SI_CONTEXT_PFP_SYNC_ME;
|
||||||
sctx->emit_cache_flush(sctx, cs);
|
sctx->emit_cache_flush(sctx, cs);
|
||||||
|
|
||||||
si_emit_thread_trace_stop(sctx, cs, family);
|
si_emit_thread_trace_stop(sctx, cs, family);
|
||||||
|
|
|
||||||
|
|
@ -5001,7 +5001,8 @@ static void si_memory_barrier(struct pipe_context *ctx, unsigned flags)
|
||||||
|
|
||||||
/* Subsequent commands must wait for all shader invocations to
|
/* Subsequent commands must wait for all shader invocations to
|
||||||
* complete. */
|
* complete. */
|
||||||
sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH;
|
sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH |
|
||||||
|
SI_CONTEXT_PFP_SYNC_ME;
|
||||||
|
|
||||||
if (flags & PIPE_BARRIER_CONSTANT_BUFFER)
|
if (flags & PIPE_BARRIER_CONSTANT_BUFFER)
|
||||||
sctx->flags |= SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE;
|
sctx->flags |= SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE;
|
||||||
|
|
|
||||||
|
|
@ -112,7 +112,7 @@ static void si_set_streamout_targets(struct pipe_context *ctx, unsigned num_targ
|
||||||
|
|
||||||
/* The BUFFER_FILLED_SIZE is written using a PS_DONE event. */
|
/* The BUFFER_FILLED_SIZE is written using a PS_DONE event. */
|
||||||
if (sctx->screen->use_ngg_streamout) {
|
if (sctx->screen->use_ngg_streamout) {
|
||||||
sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH;
|
sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_PFP_SYNC_ME;
|
||||||
|
|
||||||
/* Wait now. This is needed to make sure that GDS is not
|
/* Wait now. This is needed to make sure that GDS is not
|
||||||
* busy at the end of IBs.
|
* busy at the end of IBs.
|
||||||
|
|
@ -122,7 +122,7 @@ static void si_set_streamout_targets(struct pipe_context *ctx, unsigned num_targ
|
||||||
*/
|
*/
|
||||||
wait_now = true;
|
wait_now = true;
|
||||||
} else {
|
} else {
|
||||||
sctx->flags |= SI_CONTEXT_VS_PARTIAL_FLUSH;
|
sctx->flags |= SI_CONTEXT_VS_PARTIAL_FLUSH | SI_CONTEXT_PFP_SYNC_ME;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -133,7 +133,8 @@ static void si_set_streamout_targets(struct pipe_context *ctx, unsigned num_targ
|
||||||
if (sctx->screen->use_ngg_streamout)
|
if (sctx->screen->use_ngg_streamout)
|
||||||
si_allocate_gds(sctx);
|
si_allocate_gds(sctx);
|
||||||
|
|
||||||
sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH;
|
sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH |
|
||||||
|
SI_CONTEXT_PFP_SYNC_ME;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Streamout buffers must be bound in 2 places:
|
/* Streamout buffers must be bound in 2 places:
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue