mirror of https://gitlab.freedesktop.org/mesa/mesa.git
radeonsi: add SI_CONTEXT_PFP_SYNC_ME to skip syncing PFP for image operations
DCC/CMASK/HTILE clears will not set this. We could do a better job at not setting this in other cases too. Image copies also don't set this.

Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/9795>
parent 4fb1b7b2d8
commit c53261645d
8 changed files with 35 additions and 29 deletions
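
For orientation before the diff: the new SI_CONTEXT_PFP_SYNC_ME flag makes PFP synchronization opt-in, and compute ops that only touch images opt out, since the PFP only ever fetches buffers (index/indirect/render condition). Below is a minimal standalone C sketch of that gating, not driver code: the SI_CONTEXT_* values match the si_pipe.h hunk in this diff, while the SI_OP_* values are illustrative stand-ins.

#include <assert.h>

/* SI_CONTEXT_* values match the si_pipe.h hunk in this diff; the SI_OP_*
 * values and the helper are illustrative stand-ins, not driver code. */
#define SI_CONTEXT_CS_PARTIAL_FLUSH (1 << 14)
#define SI_CONTEXT_PFP_SYNC_ME      (1 << 17)
#define SI_OP_SYNC_CS_BEFORE        (1 << 0) /* hypothetical value */
#define SI_OP_CS_IMAGE              (1 << 1) /* hypothetical value */

static unsigned sync_flags_for_compute_op(unsigned op_flags)
{
   unsigned ctx_flags = 0;

   if (op_flags & SI_OP_SYNC_CS_BEFORE)
      ctx_flags |= SI_CONTEXT_CS_PARTIAL_FLUSH;

   /* The PFP only fetches buffers (index/indirect/render condition),
    * so an image-only compute op never needs the PFP to wait for ME. */
   if (!(op_flags & SI_OP_CS_IMAGE))
      ctx_flags |= SI_CONTEXT_PFP_SYNC_ME;

   return ctx_flags;
}

int main(void)
{
   /* Buffer op: PFP sync requested. Image op: PFP sync skipped. */
   assert(sync_flags_for_compute_op(SI_OP_SYNC_CS_BEFORE) ==
          (SI_CONTEXT_CS_PARTIAL_FLUSH | SI_CONTEXT_PFP_SYNC_ME));
   assert(sync_flags_for_compute_op(SI_OP_SYNC_CS_BEFORE | SI_OP_CS_IMAGE) ==
          SI_CONTEXT_CS_PARTIAL_FLUSH);
   return 0;
}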
@@ -70,6 +70,9 @@ void si_launch_grid_internal(struct si_context *sctx, struct pipe_grid_info *inf
    if (flags & SI_OP_SYNC_CS_BEFORE)
       sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH;
 
+   if (!(flags & SI_OP_CS_IMAGE))
+      sctx->flags |= SI_CONTEXT_PFP_SYNC_ME;
+
    /* Invalidate L0-L1 caches. */
    /* sL0 is never invalidated, because src resources don't use it. */
    if (!(flags & SI_OP_SKIP_CACHE_INV_BEFORE))
@@ -107,7 +110,7 @@ void si_launch_grid_internal(struct si_context *sctx, struct pipe_grid_info *inf
          sctx->flags |= SI_CONTEXT_INV_VCACHE;
       } else {
          /* Make sure buffer stores are visible to all CUs. */
-         sctx->flags |= SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE;
+         sctx->flags |= SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE | SI_CONTEXT_PFP_SYNC_ME;
       }
    }
 }
@@ -197,10 +197,10 @@ void si_cp_dma_clear_buffer(struct si_context *sctx, struct radeon_cmdbuf *cs,
    assert(size && size % 4 == 0);
 
    if (user_flags & SI_OP_SYNC_CS_BEFORE)
-      sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH;
+      sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH | SI_CONTEXT_PFP_SYNC_ME;
 
    if (user_flags & SI_OP_SYNC_PS_BEFORE)
-      sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH;
+      sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_PFP_SYNC_ME;
 
    /* Mark the buffer range of destination as valid (initialized),
     * so that transfer_map knows it should wait for the GPU when mapping
@@ -340,10 +340,10 @@ void si_cp_dma_copy_buffer(struct si_context *sctx, struct pipe_resource *dst,
    }
 
    if (user_flags & SI_OP_SYNC_CS_BEFORE)
-      sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH;
+      sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH | SI_CONTEXT_PFP_SYNC_ME;
 
    if (user_flags & SI_OP_SYNC_PS_BEFORE)
-      sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH;
+      sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_PFP_SYNC_ME;
 
    if ((dst || src) && !(user_flags & SI_OP_SKIP_CACHE_INV_BEFORE))
       sctx->flags |= si_get_flush_flags(sctx, coher, cache_policy);
@@ -569,6 +569,8 @@ void si_emit_surface_sync(struct si_context *sctx, struct radeon_cmdbuf *cs, uns
 
    assert(sctx->chip_class <= GFX9);
 
+   cp_coher_cntl |= 1u << 31; /* don't sync PFP, i.e. execute the sync in ME */
+
    radeon_begin(cs);
 
    if (sctx->chip_class == GFX9 || compute_ib) {
@@ -749,21 +751,22 @@ void gfx10_emit_cache_flush(struct si_context *ctx, struct radeon_cmdbuf *cs)
 
    /* Ignore fields that only modify the behavior of other fields. */
    if (gcr_cntl & C_586_GL1_RANGE & C_586_GL2_RANGE & C_586_SEQ) {
+      unsigned dont_sync_pfp = (!(flags & SI_CONTEXT_PFP_SYNC_ME)) << 31;
+
       /* Flush caches and wait for the caches to assert idle.
        * The cache flush is executed in the ME, but the PFP waits
        * for completion.
        */
       radeon_emit(cs, PKT3(PKT3_ACQUIRE_MEM, 6, 0));
-      radeon_emit(cs, 0);             /* CP_COHER_CNTL */
+      radeon_emit(cs, dont_sync_pfp); /* CP_COHER_CNTL */
       radeon_emit(cs, 0xffffffff);    /* CP_COHER_SIZE */
       radeon_emit(cs, 0xffffff);      /* CP_COHER_SIZE_HI */
       radeon_emit(cs, 0);             /* CP_COHER_BASE */
       radeon_emit(cs, 0);             /* CP_COHER_BASE_HI */
       radeon_emit(cs, 0x0000000A);    /* POLL_INTERVAL */
       radeon_emit(cs, gcr_cntl);      /* GCR_CNTL */
-   } else if (cb_db_event || (flags & (SI_CONTEXT_VS_PARTIAL_FLUSH | SI_CONTEXT_PS_PARTIAL_FLUSH |
-                                       SI_CONTEXT_CS_PARTIAL_FLUSH))) {
-      /* We need to ensure that PFP waits as well. */
+   } else if (flags & SI_CONTEXT_PFP_SYNC_ME) {
+      /* Synchronize PFP with ME. (this stalls PFP) */
       radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
       radeon_emit(cs, 0);
    }
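
The dont_sync_pfp value above is plain bit arithmetic: bit 31 of ACQUIRE_MEM's CP_COHER_CNTL means "don't sync the PFP", so it must be set exactly when SI_CONTEXT_PFP_SYNC_ME was not requested. A standalone sketch checking both cases, reusing the flag value from this commit:

#include <assert.h>
#include <stdint.h>

#define SI_CONTEXT_PFP_SYNC_ME (1 << 17) /* value from the si_pipe.h hunk */

int main(void)
{
   /* !(flags & ...) evaluates to 0 or 1; shifting it by 31 lands it in
    * the "don't sync PFP" bit of CP_COHER_CNTL. */
   uint32_t flags = SI_CONTEXT_PFP_SYNC_ME;
   assert(((uint32_t)!(flags & SI_CONTEXT_PFP_SYNC_ME) << 31) == 0);

   flags = 0;
   assert(((uint32_t)!(flags & SI_CONTEXT_PFP_SYNC_ME) << 31) == 0x80000000u);
   return 0;
}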
@@ -953,23 +956,11 @@ void si_emit_cache_flush(struct si_context *sctx, struct radeon_cmdbuf *cs)
       si_cp_wait_mem(sctx, cs, va, sctx->wait_mem_number, 0xffffffff, WAIT_REG_MEM_EQUAL);
    }
 
-   /* Make sure ME is idle (it executes most packets) before continuing.
-    * This prevents read-after-write hazards between PFP and ME.
-    */
-   if (sctx->has_graphics &&
-       (cp_coher_cntl || (flags & (SI_CONTEXT_CS_PARTIAL_FLUSH | SI_CONTEXT_INV_VCACHE |
-                                   SI_CONTEXT_INV_L2 | SI_CONTEXT_WB_L2)))) {
-      radeon_begin(cs);
-      radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
-      radeon_emit(cs, 0);
-      radeon_end();
-   }
-
    /* GFX6-GFX8 only:
     * When one of the CP_COHER_CNTL.DEST_BASE flags is set, SURFACE_SYNC
     * waits for idle, so it should be last. SURFACE_SYNC is done in PFP.
     *
-    * cp_coher_cntl should contain all necessary flags except TC flags
+    * cp_coher_cntl should contain all necessary flags except TC and PFP flags
     * at this point.
     *
     * GFX6-GFX7 don't support L2 write-back.
@@ -1011,6 +1002,13 @@ void si_emit_cache_flush(struct si_context *sctx, struct radeon_cmdbuf *cs)
    if (cp_coher_cntl)
       si_emit_surface_sync(sctx, cs, cp_coher_cntl);
 
+   if (flags & SI_CONTEXT_PFP_SYNC_ME) {
+      radeon_begin(cs);
+      radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
+      radeon_emit(cs, 0);
+      radeon_end();
+   }
+
    if (is_barrier)
       si_prim_discard_signal_next_compute_ib_start(sctx);
 
@@ -98,6 +98,9 @@ extern "C" {
 #define SI_CONTEXT_CS_PARTIAL_FLUSH (1 << 14)
 #define SI_CONTEXT_VGT_FLUSH (1 << 15)
 #define SI_CONTEXT_VGT_STREAMOUT_SYNC (1 << 16)
+/* PFP waits for ME to finish. Used to sync for index and indirect buffers and render
+ * condition. It's typically set when doing a VS/PS/CS partial flush for buffers. */
+#define SI_CONTEXT_PFP_SYNC_ME (1 << 17)
 
 #define SI_PREFETCH_LS (1 << 1)
 #define SI_PREFETCH_HS (1 << 2)
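
The new flag plugs into the existing accumulate-then-flush pattern: call sites OR SI_CONTEXT_* bits into sctx->flags, and the next cache-flush emit turns them into packets (see the -1011 hunk above). A minimal sketch of that pattern; emit_cache_flush_sketch is a hypothetical stand-in for sctx->emit_cache_flush, not the driver function:

#include <stdio.h>

#define SI_CONTEXT_CS_PARTIAL_FLUSH (1 << 14)
#define SI_CONTEXT_PFP_SYNC_ME      (1 << 17)

struct si_context_sketch { unsigned flags; };

/* Hypothetical stand-in for sctx->emit_cache_flush: consumes the
 * accumulated sync flags and emits the matching packets. */
static void emit_cache_flush_sketch(struct si_context_sketch *sctx)
{
   if (sctx->flags & SI_CONTEXT_CS_PARTIAL_FLUSH)
      printf("EVENT_WRITE: CS_PARTIAL_FLUSH\n");
   if (sctx->flags & SI_CONTEXT_PFP_SYNC_ME)
      printf("PKT3: PFP_SYNC_ME\n");
   sctx->flags = 0; /* flags are one-shot: cleared once emitted */
}

int main(void)
{
   struct si_context_sketch sctx = {0};
   /* A buffer op requests both a CS flush and a PFP sync... */
   sctx.flags |= SI_CONTEXT_CS_PARTIAL_FLUSH | SI_CONTEXT_PFP_SYNC_ME;
   /* ...which the next flush emit turns into packets. */
   emit_cache_flush_sketch(&sctx);
   return 0;
}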
@@ -1582,7 +1582,7 @@ static void si_query_hw_get_result_resource(struct si_context *sctx, struct si_q
       }
 
       sctx->b.launch_grid(&sctx->b, &grid);
-      sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH;
+      sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH | SI_CONTEXT_PFP_SYNC_ME;
    }
 
    si_restore_qbo_state(sctx, &saved_state);
@@ -383,7 +383,7 @@ si_thread_trace_start(struct si_context *sctx, int family, struct radeon_cmdbuf
    sctx->flags |=
       SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH |
       SI_CONTEXT_INV_ICACHE | SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE |
-      SI_CONTEXT_INV_L2;
+      SI_CONTEXT_INV_L2 | SI_CONTEXT_PFP_SYNC_ME;
    sctx->emit_cache_flush(sctx, cs);
 
    si_inhibit_clockgating(sctx, cs, true);
@@ -426,7 +426,7 @@ si_thread_trace_stop(struct si_context *sctx, int family, struct radeon_cmdbuf *
    sctx->flags |=
       SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH |
       SI_CONTEXT_INV_ICACHE | SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE |
-      SI_CONTEXT_INV_L2;
+      SI_CONTEXT_INV_L2 | SI_CONTEXT_PFP_SYNC_ME;
    sctx->emit_cache_flush(sctx, cs);
 
    si_emit_thread_trace_stop(sctx, cs, family);
@@ -5001,7 +5001,8 @@ static void si_memory_barrier(struct pipe_context *ctx, unsigned flags)
 
    /* Subsequent commands must wait for all shader invocations to
     * complete. */
-   sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH;
+   sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH |
+                  SI_CONTEXT_PFP_SYNC_ME;
 
    if (flags & PIPE_BARRIER_CONSTANT_BUFFER)
       sctx->flags |= SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE;
@@ -112,7 +112,7 @@ static void si_set_streamout_targets(struct pipe_context *ctx, unsigned num_targ
 
    /* The BUFFER_FILLED_SIZE is written using a PS_DONE event. */
    if (sctx->screen->use_ngg_streamout) {
-      sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH;
+      sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_PFP_SYNC_ME;
 
       /* Wait now. This is needed to make sure that GDS is not
        * busy at the end of IBs.
@@ -122,7 +122,7 @@ static void si_set_streamout_targets(struct pipe_context *ctx, unsigned num_targ
        */
       wait_now = true;
    } else {
-      sctx->flags |= SI_CONTEXT_VS_PARTIAL_FLUSH;
+      sctx->flags |= SI_CONTEXT_VS_PARTIAL_FLUSH | SI_CONTEXT_PFP_SYNC_ME;
    }
 }
 
@@ -133,7 +133,8 @@ static void si_set_streamout_targets(struct pipe_context *ctx, unsigned num_targ
    if (sctx->screen->use_ngg_streamout)
       si_allocate_gds(sctx);
 
-   sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH;
+   sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH |
+                  SI_CONTEXT_PFP_SYNC_ME;
 }
 
 /* Streamout buffers must be bound in 2 places: