radeonsi: add si_cp_acquire_mem helper and clean up its usage for gfx6-9

Acked-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/31168>
This commit is contained in:
Marek Olšák 2024-08-06 23:41:17 -04:00
parent 1d5ffb13d6
commit a42d9db1b6
3 changed files with 98 additions and 71 deletions

View file

@ -131,3 +131,56 @@ void si_cp_release_acquire_mem_pws(struct si_context *sctx, struct radeon_cmdbuf
si_cp_release_mem_pws(sctx, cs, event_type, gcr_cntl);
si_cp_acquire_mem_pws(sctx, cs, event_type, stage_sel, 0, 0, sqtt_flush_flags);
}
/* Emit a plain cache-flushing ACQUIRE_MEM (or SURFACE_SYNC on the GFX6-8 graphics ring).
 * On older chips this may also wait for idle.
 *
 * \param sctx      context; gfx_level and has_graphics select the packet, and context_roll is
 *                  set when the packet is emitted on a graphics-capable context
 * \param cs        command buffer that receives the packets
 * \param gcr_cntl  cache flush flags written to CP_COHER_CNTL; must be non-zero
 * \param engine    V_580_CP_PFP or V_580_CP_ME: whether to sync in PFP or ME
 */
void si_cp_acquire_mem(struct si_context *sctx, struct radeon_cmdbuf *cs, unsigned gcr_cntl,
                       unsigned engine)
{
   assert(engine == V_580_CP_PFP || engine == V_580_CP_ME);
   assert(gcr_cntl);

   if (sctx->gfx_level >= GFX10) {
      /* TODO */
      return;
   }

   const bool is_compute_only = !sctx->has_graphics;
   const bool use_acquire_mem = sctx->gfx_level == GFX9 || is_compute_only;

   /* Bit 31 means "don't sync PFP", i.e. execute the sync in ME.
    * Setting it seems problematic with GFX7 (see #4764), so leave it clear there.
    */
   if (sctx->gfx_level != GFX7)
      gcr_cntl |= 1u << 31;

   if (!use_acquire_mem) {
      /* Graphics ring before GFX9: ACQUIRE_MEM is only required on the compute ring. */
      radeon_begin(cs);
      radeon_emit(PKT3(PKT3_SURFACE_SYNC, 3, 0));
      radeon_emit(gcr_cntl);   /* CP_COHER_CNTL */
      radeon_emit(0xffffffff); /* CP_COHER_SIZE */
      radeon_emit(0);          /* CP_COHER_BASE */
      radeon_emit(0x0000000A); /* POLL_INTERVAL */
      radeon_end();
   } else {
      /* Flush caches and wait for the caches to assert idle. */
      radeon_begin(cs);
      radeon_emit(PKT3(PKT3_ACQUIRE_MEM, 5, 0));
      radeon_emit(gcr_cntl);   /* CP_COHER_CNTL */
      radeon_emit(0xffffffff); /* CP_COHER_SIZE */
      radeon_emit(0xffffff);   /* CP_COHER_SIZE_HI */
      radeon_emit(0);          /* CP_COHER_BASE */
      radeon_emit(0);          /* CP_COHER_BASE_HI */
      radeon_emit(0x0000000A); /* POLL_INTERVAL */
      radeon_end();
   }

   /* ACQUIRE_MEM & SURFACE_SYNC roll the context if the current context is busy. */
   if (!is_compute_only)
      sctx->context_roll = true;

   /* The caller asked for the sync to be visible to PFP: make PFP wait for ME. */
   if (engine == V_580_CP_PFP) {
      radeon_begin(cs);
      radeon_emit(PKT3(PKT3_PFP_SYNC_ME, 0, 0));
      radeon_emit(0);
      radeon_end();
   }
}

View file

@ -684,43 +684,6 @@ void si_emit_ts(struct si_context *sctx, struct si_resource* buffer, unsigned in
EOP_DATA_SEL_TIMESTAMP, buffer, va, 0, PIPE_QUERY_TIMESTAMP);
}
/* Emit a cache flush packet with the given CP_COHER_CNTL flags (GFX6-GFX9 only).
 *
 * GFX9 and compute-only contexts use ACQUIRE_MEM; the graphics ring on older chips uses
 * SURFACE_SYNC. On a graphics-capable context, context_roll is set afterwards.
 */
void si_emit_surface_sync(struct si_context *sctx, struct radeon_cmdbuf *cs, unsigned cp_coher_cntl)
{
   const bool is_compute_only = !sctx->has_graphics;

   assert(sctx->gfx_level <= GFX9);

   /* Bit 31 means "don't sync PFP", i.e. execute the sync in ME.
    * This seems problematic with GFX7 (see #4764), so it is not set there.
    */
   if (sctx->gfx_level != GFX7)
      cp_coher_cntl |= 1u << 31;

   if (sctx->gfx_level != GFX9 && !is_compute_only) {
      /* ACQUIRE_MEM is only required on a compute ring. */
      radeon_begin(cs);
      radeon_emit(PKT3(PKT3_SURFACE_SYNC, 3, 0));
      radeon_emit(cp_coher_cntl); /* CP_COHER_CNTL */
      radeon_emit(0xffffffff);    /* CP_COHER_SIZE */
      radeon_emit(0);             /* CP_COHER_BASE */
      radeon_emit(0x0000000A);    /* POLL_INTERVAL */
      radeon_end();
   } else {
      /* Flush caches and wait for the caches to assert idle. */
      radeon_begin(cs);
      radeon_emit(PKT3(PKT3_ACQUIRE_MEM, 5, 0));
      radeon_emit(cp_coher_cntl); /* CP_COHER_CNTL */
      radeon_emit(0xffffffff);    /* CP_COHER_SIZE */
      radeon_emit(0xffffff);      /* CP_COHER_SIZE_HI */
      radeon_emit(0);             /* CP_COHER_BASE */
      radeon_emit(0);             /* CP_COHER_BASE_HI */
      radeon_emit(0x0000000A);    /* POLL_INTERVAL */
      radeon_end();
   }

   /* ACQUIRE_MEM has an implicit context roll if the current context is busy. */
   if (!is_compute_only)
      sctx->context_roll = true;
}
static struct si_resource *si_get_wait_mem_scratch_bo(struct si_context *ctx,
struct radeon_cmdbuf *cs, bool is_secure)
{
@ -1135,27 +1098,24 @@ void gfx6_emit_cache_flush(struct si_context *sctx, struct radeon_cmdbuf *cs)
}
}
/* GFX6-GFX8 only:
* When one of the CP_COHER_CNTL.DEST_BASE flags is set, SURFACE_SYNC
* waits for idle, so it should be last. SURFACE_SYNC is done in PFP.
/* GFX6-GFX8 only: When one of the CP_COHER_CNTL.DEST_BASE flags is set, SURFACE_SYNC waits
* for idle, so it should be last.
*
* cp_coher_cntl should contain all necessary flags except TC and PFP flags
* at this point.
* cp_coher_cntl should contain everything except TC flags at this point.
*
* GFX6-GFX7 don't support L2 write-back.
*/
if (flags & SI_CONTEXT_INV_L2 || (sctx->gfx_level <= GFX7 && (flags & SI_CONTEXT_WB_L2))) {
/* Invalidate L1 & L2. (L1 is always invalidated on GFX6)
* WB must be set on GFX8+ when TC_ACTION is set.
*/
si_emit_surface_sync(sctx, cs,
cp_coher_cntl | S_0085F0_TC_ACTION_ENA(1) | S_0085F0_TCL1_ACTION_ENA(1) |
S_0301F0_TC_WB_ACTION_ENA(sctx->gfx_level >= GFX8));
cp_coher_cntl = 0;
unsigned engine = flags & SI_CONTEXT_PFP_SYNC_ME ? V_580_CP_PFP : V_580_CP_ME;
if (flags & SI_CONTEXT_INV_L2 || (sctx->gfx_level <= GFX7 && flags & SI_CONTEXT_WB_L2)) {
/* Invalidate L1 & L2. WB must be set on GFX8+ when TC_ACTION is set. */
si_cp_acquire_mem(sctx, cs,
cp_coher_cntl | S_0085F0_TC_ACTION_ENA(1) | S_0085F0_TCL1_ACTION_ENA(1) |
S_0301F0_TC_WB_ACTION_ENA(sctx->gfx_level >= GFX8), engine);
sctx->num_L2_invalidates++;
} else {
/* L1 invalidation and L2 writeback must be done separately,
* because both operations can't be done together.
/* L1 invalidation and L2 writeback must be done separately, because both operations can't
* be done together.
*/
if (flags & SI_CONTEXT_WB_L2) {
/* WB = write-back
@ -1163,29 +1123,43 @@ void gfx6_emit_cache_flush(struct si_context *sctx, struct radeon_cmdbuf *cs)
* (i.e. MTYPE <= 1, which is what we use everywhere)
*
* WB doesn't work without NC.
*
* If we get here, the only flag that can't be executed together with WB_L2 is VMEM cache
* invalidation.
*/
si_emit_surface_sync(
sctx, cs,
cp_coher_cntl | S_0301F0_TC_WB_ACTION_ENA(1) | S_0301F0_TC_NC_ACTION_ENA(1));
bool last_acquire_mem = !(flags & SI_CONTEXT_INV_VCACHE);
si_cp_acquire_mem(sctx, cs,
cp_coher_cntl | S_0301F0_TC_WB_ACTION_ENA(1) |
S_0301F0_TC_NC_ACTION_ENA(1),
/* If this is not the last ACQUIRE_MEM, flush in ME.
* We only want to synchronize with PFP in the last ACQUIRE_MEM. */
last_acquire_mem ? engine : V_580_CP_ME);
if (last_acquire_mem)
flags &= ~SI_CONTEXT_PFP_SYNC_ME;
cp_coher_cntl = 0;
sctx->num_L2_writebacks++;
}
if (flags & SI_CONTEXT_INV_VCACHE) {
/* Invalidate per-CU VMEM L1. */
si_emit_surface_sync(sctx, cs, cp_coher_cntl | S_0085F0_TCL1_ACTION_ENA(1));
cp_coher_cntl = 0;
if (flags & SI_CONTEXT_INV_VCACHE)
cp_coher_cntl |= S_0085F0_TCL1_ACTION_ENA(1);
/* If there are still some cache flags left... */
if (cp_coher_cntl) {
si_cp_acquire_mem(sctx, cs, cp_coher_cntl, engine);
flags &= ~SI_CONTEXT_PFP_SYNC_ME;
}
}
/* If TC flushes haven't cleared this... */
if (cp_coher_cntl)
si_emit_surface_sync(sctx, cs, cp_coher_cntl);
if (flags & SI_CONTEXT_PFP_SYNC_ME) {
radeon_begin(cs);
radeon_emit(PKT3(PKT3_PFP_SYNC_ME, 0, 0));
radeon_emit(0);
radeon_end();
/* This might be needed even without any cache flags, such as when doing buffer stores
* to an index buffer.
*/
if (flags & SI_CONTEXT_PFP_SYNC_ME) {
radeon_begin(cs);
radeon_emit(PKT3(PKT3_PFP_SYNC_ME, 0, 0));
radeon_emit(0);
radeon_end();
}
}
if (flags & SI_CONTEXT_START_PIPELINE_STATS && sctx->pipeline_stats_enabled != 1) {

View file

@ -1568,6 +1568,8 @@ void si_cp_acquire_mem_pws(struct si_context *sctx, struct radeon_cmdbuf *cs,
void si_cp_release_acquire_mem_pws(struct si_context *sctx, struct radeon_cmdbuf *cs,
unsigned event_type, unsigned gcr_cntl, unsigned stage_sel,
unsigned sqtt_flush_flags);
/* Execute a plain ACQUIRE_MEM that just flushes the caches selected by gcr_cntl (must be
 * non-zero). "engine" is V_580_CP_PFP or V_580_CP_ME and determines whether to sync in PFP or ME.
 */
void si_cp_acquire_mem(struct si_context *sctx, struct radeon_cmdbuf *cs, unsigned gcr_cntl,
unsigned engine);
/* si_debug.c */
void si_gather_context_rolls(struct si_context *sctx);
@ -1610,8 +1612,6 @@ void si_set_tracked_regs_to_clear_state(struct si_context *ctx);
void si_begin_new_gfx_cs(struct si_context *ctx, bool first_cs);
void si_trace_emit(struct si_context *sctx);
void si_emit_ts(struct si_context *sctx, struct si_resource* buffer, unsigned int offset);
void si_emit_surface_sync(struct si_context *sctx, struct radeon_cmdbuf *cs,
unsigned cp_coher_cntl);
void gfx10_emit_cache_flush(struct si_context *sctx, struct radeon_cmdbuf *cs);
void gfx6_emit_cache_flush(struct si_context *sctx, struct radeon_cmdbuf *cs);
/* Replace the sctx->b.draw_vbo function with a wrapper. This can be used to implement