radeonsi: move emit_cache_flush functions into si_gfx_cs.c
This is a better place for them. They are not inlined anyway.

Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/8548>
parent 1ceec51b12
commit 4056e953fe
4 changed files with 449 additions and 451 deletions
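The commit message points out that these functions are not inlined, which is why their definitions can move between translation units without touching any caller. A minimal sketch of that pattern, assuming a generic shared header (the actual header file names are not shown in this diff), not the commit's own code:

/* Illustrative sketch only: prototype stays visible to all callers,
 * while the out-of-line definition can live in any one .c file. */
struct si_context;    /* opaque here; real definitions live in the driver headers */
struct radeon_cmdbuf;

/* Shared header: declaration only. */
void si_emit_surface_sync(struct si_context *sctx, struct radeon_cmdbuf *cs,
                          unsigned cp_coher_cntl);

/* si_gfx_cs.c: the definition now lives here instead of si_state_draw.cpp. */
void si_emit_surface_sync(struct si_context *sctx, struct radeon_cmdbuf *cs,
                          unsigned cp_coher_cntl)
{
   /* ...emit SURFACE_SYNC / ACQUIRE_MEM packets, as in the hunks below... */
   (void)sctx; (void)cs; (void)cp_coher_cntl; /* placeholder body for the sketch */
}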
@@ -552,3 +552,448 @@ void si_begin_new_gfx_cs(struct si_context *ctx, bool first_cs)
   ctx->index_ring_offset = 0;
}

void si_emit_surface_sync(struct si_context *sctx, struct radeon_cmdbuf *cs, unsigned cp_coher_cntl)
{
   bool compute_ib = !sctx->has_graphics || cs == &sctx->prim_discard_compute_cs;

   assert(sctx->chip_class <= GFX9);

   if (sctx->chip_class == GFX9 || compute_ib) {
      /* Flush caches and wait for the caches to assert idle. */
      radeon_emit(cs, PKT3(PKT3_ACQUIRE_MEM, 5, 0));
      radeon_emit(cs, cp_coher_cntl); /* CP_COHER_CNTL */
      radeon_emit(cs, 0xffffffff);    /* CP_COHER_SIZE */
      radeon_emit(cs, 0xffffff);      /* CP_COHER_SIZE_HI */
      radeon_emit(cs, 0);             /* CP_COHER_BASE */
      radeon_emit(cs, 0);             /* CP_COHER_BASE_HI */
      radeon_emit(cs, 0x0000000A);    /* POLL_INTERVAL */
   } else {
      /* ACQUIRE_MEM is only required on a compute ring. */
      radeon_emit(cs, PKT3(PKT3_SURFACE_SYNC, 3, 0));
      radeon_emit(cs, cp_coher_cntl); /* CP_COHER_CNTL */
      radeon_emit(cs, 0xffffffff);    /* CP_COHER_SIZE */
      radeon_emit(cs, 0);             /* CP_COHER_BASE */
      radeon_emit(cs, 0x0000000A);    /* POLL_INTERVAL */
   }

   /* ACQUIRE_MEM has an implicit context roll if the current context
    * is busy. */
   if (!compute_ib)
      sctx->context_roll = true;
}

void gfx10_emit_cache_flush(struct si_context *ctx, struct radeon_cmdbuf *cs)
{
   uint32_t gcr_cntl = 0;
   unsigned cb_db_event = 0;
   unsigned flags = ctx->flags;

   if (!ctx->has_graphics) {
      /* Only process compute flags. */
      flags &= SI_CONTEXT_INV_ICACHE | SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE |
               SI_CONTEXT_INV_L2 | SI_CONTEXT_WB_L2 | SI_CONTEXT_INV_L2_METADATA |
               SI_CONTEXT_CS_PARTIAL_FLUSH;
   }

   /* We don't need these. */
   assert(!(flags & (SI_CONTEXT_VGT_STREAMOUT_SYNC | SI_CONTEXT_FLUSH_AND_INV_DB_META)));

   if (flags & SI_CONTEXT_VGT_FLUSH) {
      radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
      radeon_emit(cs, EVENT_TYPE(V_028A90_VGT_FLUSH) | EVENT_INDEX(0));
   }

   if (flags & SI_CONTEXT_FLUSH_AND_INV_CB)
      ctx->num_cb_cache_flushes++;
   if (flags & SI_CONTEXT_FLUSH_AND_INV_DB)
      ctx->num_db_cache_flushes++;

   if (flags & SI_CONTEXT_INV_ICACHE)
      gcr_cntl |= S_586_GLI_INV(V_586_GLI_ALL);
   if (flags & SI_CONTEXT_INV_SCACHE) {
      /* TODO: When writing to the SMEM L1 cache, we need to set SEQ
       * to FORWARD when both L1 and L2 are written out (WB or INV).
       */
      gcr_cntl |= S_586_GL1_INV(1) | S_586_GLK_INV(1);
   }
   if (flags & SI_CONTEXT_INV_VCACHE)
      gcr_cntl |= S_586_GL1_INV(1) | S_586_GLV_INV(1);

   /* The L2 cache ops are:
    * - INV: - invalidate lines that reflect memory (were loaded from memory)
    *        - don't touch lines that were overwritten (were stored by gfx clients)
    * - WB: - don't touch lines that reflect memory
    *       - write back lines that were overwritten
    * - WB | INV: - invalidate lines that reflect memory
    *             - write back lines that were overwritten
    *
    * GLM doesn't support WB alone. If WB is set, INV must be set too.
    */
   if (flags & SI_CONTEXT_INV_L2) {
      /* Writeback and invalidate everything in L2. */
      gcr_cntl |= S_586_GL2_INV(1) | S_586_GL2_WB(1) | S_586_GLM_INV(1) | S_586_GLM_WB(1);
      ctx->num_L2_invalidates++;
   } else if (flags & SI_CONTEXT_WB_L2) {
      gcr_cntl |= S_586_GL2_WB(1) | S_586_GLM_WB(1) | S_586_GLM_INV(1);
   } else if (flags & SI_CONTEXT_INV_L2_METADATA) {
      gcr_cntl |= S_586_GLM_INV(1) | S_586_GLM_WB(1);
   }

   if (flags & (SI_CONTEXT_FLUSH_AND_INV_CB | SI_CONTEXT_FLUSH_AND_INV_DB)) {
      if (flags & SI_CONTEXT_FLUSH_AND_INV_CB) {
         /* Flush CMASK/FMASK/DCC. Will wait for idle later. */
         radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
         radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_AND_INV_CB_META) | EVENT_INDEX(0));
      }
      if (flags & SI_CONTEXT_FLUSH_AND_INV_DB) {
         /* Flush HTILE. Will wait for idle later. */
         radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
         radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_AND_INV_DB_META) | EVENT_INDEX(0));
      }

      /* First flush CB/DB, then L1/L2. */
      gcr_cntl |= S_586_SEQ(V_586_SEQ_FORWARD);

      if ((flags & (SI_CONTEXT_FLUSH_AND_INV_CB | SI_CONTEXT_FLUSH_AND_INV_DB)) ==
          (SI_CONTEXT_FLUSH_AND_INV_CB | SI_CONTEXT_FLUSH_AND_INV_DB)) {
         cb_db_event = V_028A90_CACHE_FLUSH_AND_INV_TS_EVENT;
      } else if (flags & SI_CONTEXT_FLUSH_AND_INV_CB) {
         cb_db_event = V_028A90_FLUSH_AND_INV_CB_DATA_TS;
      } else if (flags & SI_CONTEXT_FLUSH_AND_INV_DB) {
         cb_db_event = V_028A90_FLUSH_AND_INV_DB_DATA_TS;
      } else {
         assert(0);
      }
   } else {
      /* Wait for graphics shaders to go idle if requested. */
      if (flags & SI_CONTEXT_PS_PARTIAL_FLUSH) {
         radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
         radeon_emit(cs, EVENT_TYPE(V_028A90_PS_PARTIAL_FLUSH) | EVENT_INDEX(4));
         /* Only count explicit shader flushes, not implicit ones. */
         ctx->num_vs_flushes++;
         ctx->num_ps_flushes++;
      } else if (flags & SI_CONTEXT_VS_PARTIAL_FLUSH) {
         radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
         radeon_emit(cs, EVENT_TYPE(V_028A90_VS_PARTIAL_FLUSH) | EVENT_INDEX(4));
         ctx->num_vs_flushes++;
      }
   }

   if (flags & SI_CONTEXT_CS_PARTIAL_FLUSH && ctx->compute_is_busy) {
      radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
      radeon_emit(cs, EVENT_TYPE(V_028A90_CS_PARTIAL_FLUSH | EVENT_INDEX(4)));
      ctx->num_cs_flushes++;
      ctx->compute_is_busy = false;
   }

   if (cb_db_event) {
      struct si_resource* wait_mem_scratch = unlikely(ctx->ws->cs_is_secure(cs)) ?
         ctx->wait_mem_scratch_tmz : ctx->wait_mem_scratch;
      /* CB/DB flush and invalidate (or possibly just a wait for a
       * meta flush) via RELEASE_MEM.
       *
       * Combine this with other cache flushes when possible; this
       * requires affected shaders to be idle, so do it after the
       * CS_PARTIAL_FLUSH before (VS/PS partial flushes are always
       * implied).
       */
      uint64_t va;

      /* Do the flush (enqueue the event and wait for it). */
      va = wait_mem_scratch->gpu_address;
      ctx->wait_mem_number++;

      /* Get GCR_CNTL fields, because the encoding is different in RELEASE_MEM. */
      unsigned glm_wb = G_586_GLM_WB(gcr_cntl);
      unsigned glm_inv = G_586_GLM_INV(gcr_cntl);
      unsigned glv_inv = G_586_GLV_INV(gcr_cntl);
      unsigned gl1_inv = G_586_GL1_INV(gcr_cntl);
      assert(G_586_GL2_US(gcr_cntl) == 0);
      assert(G_586_GL2_RANGE(gcr_cntl) == 0);
      assert(G_586_GL2_DISCARD(gcr_cntl) == 0);
      unsigned gl2_inv = G_586_GL2_INV(gcr_cntl);
      unsigned gl2_wb = G_586_GL2_WB(gcr_cntl);
      unsigned gcr_seq = G_586_SEQ(gcr_cntl);

      gcr_cntl &= C_586_GLM_WB & C_586_GLM_INV & C_586_GLV_INV & C_586_GL1_INV & C_586_GL2_INV &
                  C_586_GL2_WB; /* keep SEQ */

      si_cp_release_mem(ctx, cs, cb_db_event,
                        S_490_GLM_WB(glm_wb) | S_490_GLM_INV(glm_inv) | S_490_GLV_INV(glv_inv) |
                           S_490_GL1_INV(gl1_inv) | S_490_GL2_INV(gl2_inv) | S_490_GL2_WB(gl2_wb) |
                           S_490_SEQ(gcr_seq),
                        EOP_DST_SEL_MEM, EOP_INT_SEL_SEND_DATA_AFTER_WR_CONFIRM,
                        EOP_DATA_SEL_VALUE_32BIT, wait_mem_scratch, va, ctx->wait_mem_number,
                        SI_NOT_QUERY);
      si_cp_wait_mem(ctx, cs, va, ctx->wait_mem_number, 0xffffffff, WAIT_REG_MEM_EQUAL);
   }

   /* Ignore fields that only modify the behavior of other fields. */
   if (gcr_cntl & C_586_GL1_RANGE & C_586_GL2_RANGE & C_586_SEQ) {
      /* Flush caches and wait for the caches to assert idle.
       * The cache flush is executed in the ME, but the PFP waits
       * for completion.
       */
      radeon_emit(cs, PKT3(PKT3_ACQUIRE_MEM, 6, 0));
      radeon_emit(cs, 0);          /* CP_COHER_CNTL */
      radeon_emit(cs, 0xffffffff); /* CP_COHER_SIZE */
      radeon_emit(cs, 0xffffff);   /* CP_COHER_SIZE_HI */
      radeon_emit(cs, 0);          /* CP_COHER_BASE */
      radeon_emit(cs, 0);          /* CP_COHER_BASE_HI */
      radeon_emit(cs, 0x0000000A); /* POLL_INTERVAL */
      radeon_emit(cs, gcr_cntl);   /* GCR_CNTL */
   } else if (cb_db_event || (flags & (SI_CONTEXT_VS_PARTIAL_FLUSH | SI_CONTEXT_PS_PARTIAL_FLUSH |
                                       SI_CONTEXT_CS_PARTIAL_FLUSH))) {
      /* We need to ensure that PFP waits as well. */
      radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
      radeon_emit(cs, 0);
   }

   if (flags & SI_CONTEXT_START_PIPELINE_STATS) {
      radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
      radeon_emit(cs, EVENT_TYPE(V_028A90_PIPELINESTAT_START) | EVENT_INDEX(0));
   } else if (flags & SI_CONTEXT_STOP_PIPELINE_STATS) {
      radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
      radeon_emit(cs, EVENT_TYPE(V_028A90_PIPELINESTAT_STOP) | EVENT_INDEX(0));
   }

   ctx->flags = 0;
}

void si_emit_cache_flush(struct si_context *sctx, struct radeon_cmdbuf *cs)
{
   uint32_t flags = sctx->flags;

   if (!sctx->has_graphics) {
      /* Only process compute flags. */
      flags &= SI_CONTEXT_INV_ICACHE | SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE |
               SI_CONTEXT_INV_L2 | SI_CONTEXT_WB_L2 | SI_CONTEXT_INV_L2_METADATA |
               SI_CONTEXT_CS_PARTIAL_FLUSH;
   }

   uint32_t cp_coher_cntl = 0;
   const uint32_t flush_cb_db = flags & (SI_CONTEXT_FLUSH_AND_INV_CB | SI_CONTEXT_FLUSH_AND_INV_DB);
   const bool is_barrier =
      flush_cb_db ||
      /* INV_ICACHE == beginning of gfx IB. Checking
       * INV_ICACHE fixes corruption for DeusExMD with
       * compute-based culling, but I don't know why.
       */
      flags & (SI_CONTEXT_INV_ICACHE | SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_VS_PARTIAL_FLUSH) ||
      (flags & SI_CONTEXT_CS_PARTIAL_FLUSH && sctx->compute_is_busy);

   assert(sctx->chip_class <= GFX9);

   if (flags & SI_CONTEXT_FLUSH_AND_INV_CB)
      sctx->num_cb_cache_flushes++;
   if (flags & SI_CONTEXT_FLUSH_AND_INV_DB)
      sctx->num_db_cache_flushes++;

   /* GFX6 has a bug that it always flushes ICACHE and KCACHE if either
    * bit is set. An alternative way is to write SQC_CACHES, but that
    * doesn't seem to work reliably. Since the bug doesn't affect
    * correctness (it only does more work than necessary) and
    * the performance impact is likely negligible, there is no plan
    * to add a workaround for it.
    */

   if (flags & SI_CONTEXT_INV_ICACHE)
      cp_coher_cntl |= S_0085F0_SH_ICACHE_ACTION_ENA(1);
   if (flags & SI_CONTEXT_INV_SCACHE)
      cp_coher_cntl |= S_0085F0_SH_KCACHE_ACTION_ENA(1);

   if (sctx->chip_class <= GFX8) {
      if (flags & SI_CONTEXT_FLUSH_AND_INV_CB) {
         cp_coher_cntl |= S_0085F0_CB_ACTION_ENA(1) | S_0085F0_CB0_DEST_BASE_ENA(1) |
                          S_0085F0_CB1_DEST_BASE_ENA(1) | S_0085F0_CB2_DEST_BASE_ENA(1) |
                          S_0085F0_CB3_DEST_BASE_ENA(1) | S_0085F0_CB4_DEST_BASE_ENA(1) |
                          S_0085F0_CB5_DEST_BASE_ENA(1) | S_0085F0_CB6_DEST_BASE_ENA(1) |
                          S_0085F0_CB7_DEST_BASE_ENA(1);

         /* Necessary for DCC */
         if (sctx->chip_class == GFX8)
            si_cp_release_mem(sctx, cs, V_028A90_FLUSH_AND_INV_CB_DATA_TS, 0, EOP_DST_SEL_MEM,
                              EOP_INT_SEL_NONE, EOP_DATA_SEL_DISCARD, NULL, 0, 0, SI_NOT_QUERY);
      }
      if (flags & SI_CONTEXT_FLUSH_AND_INV_DB)
         cp_coher_cntl |= S_0085F0_DB_ACTION_ENA(1) | S_0085F0_DB_DEST_BASE_ENA(1);
   }

   if (flags & SI_CONTEXT_FLUSH_AND_INV_CB) {
      /* Flush CMASK/FMASK/DCC. SURFACE_SYNC will wait for idle. */
      radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
      radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_AND_INV_CB_META) | EVENT_INDEX(0));
   }
   if (flags & (SI_CONTEXT_FLUSH_AND_INV_DB | SI_CONTEXT_FLUSH_AND_INV_DB_META)) {
      /* Flush HTILE. SURFACE_SYNC will wait for idle. */
      radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
      radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_AND_INV_DB_META) | EVENT_INDEX(0));
   }

   /* Wait for shader engines to go idle.
    * VS and PS waits are unnecessary if SURFACE_SYNC is going to wait
    * for everything including CB/DB cache flushes.
    */
   if (!flush_cb_db) {
      if (flags & SI_CONTEXT_PS_PARTIAL_FLUSH) {
         radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
         radeon_emit(cs, EVENT_TYPE(V_028A90_PS_PARTIAL_FLUSH) | EVENT_INDEX(4));
         /* Only count explicit shader flushes, not implicit ones
          * done by SURFACE_SYNC.
          */
         sctx->num_vs_flushes++;
         sctx->num_ps_flushes++;
      } else if (flags & SI_CONTEXT_VS_PARTIAL_FLUSH) {
         radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
         radeon_emit(cs, EVENT_TYPE(V_028A90_VS_PARTIAL_FLUSH) | EVENT_INDEX(4));
         sctx->num_vs_flushes++;
      }
   }

   if (flags & SI_CONTEXT_CS_PARTIAL_FLUSH && sctx->compute_is_busy) {
      radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
      radeon_emit(cs, EVENT_TYPE(V_028A90_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));
      sctx->num_cs_flushes++;
      sctx->compute_is_busy = false;
   }

   /* VGT state synchronization. */
   if (flags & SI_CONTEXT_VGT_FLUSH) {
      radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
      radeon_emit(cs, EVENT_TYPE(V_028A90_VGT_FLUSH) | EVENT_INDEX(0));
   }
   if (flags & SI_CONTEXT_VGT_STREAMOUT_SYNC) {
      radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
      radeon_emit(cs, EVENT_TYPE(V_028A90_VGT_STREAMOUT_SYNC) | EVENT_INDEX(0));
   }

   /* GFX9: Wait for idle if we're flushing CB or DB. ACQUIRE_MEM doesn't
    * wait for idle on GFX9. We have to use a TS event.
    */
   if (sctx->chip_class == GFX9 && flush_cb_db) {
      uint64_t va;
      unsigned tc_flags, cb_db_event;

      /* Set the CB/DB flush event. */
      switch (flush_cb_db) {
      case SI_CONTEXT_FLUSH_AND_INV_CB:
         cb_db_event = V_028A90_FLUSH_AND_INV_CB_DATA_TS;
         break;
      case SI_CONTEXT_FLUSH_AND_INV_DB:
         cb_db_event = V_028A90_FLUSH_AND_INV_DB_DATA_TS;
         break;
      default:
         /* both CB & DB */
         cb_db_event = V_028A90_CACHE_FLUSH_AND_INV_TS_EVENT;
      }

      /* These are the only allowed combinations. If you need to
       * do multiple operations at once, do them separately.
       * All operations that invalidate L2 also seem to invalidate
       * metadata. Volatile (VOL) and WC flushes are not listed here.
       *
       * TC | TC_WB         = writeback & invalidate L2 & L1
       * TC | TC_WB | TC_NC = writeback & invalidate L2 for MTYPE == NC
       *      TC_WB | TC_NC = writeback L2 for MTYPE == NC
       * TC         | TC_NC = invalidate L2 for MTYPE == NC
       * TC | TC_MD         = writeback & invalidate L2 metadata (DCC, etc.)
       * TCL1               = invalidate L1
       */
      tc_flags = 0;

      if (flags & SI_CONTEXT_INV_L2_METADATA) {
         tc_flags = EVENT_TC_ACTION_ENA | EVENT_TC_MD_ACTION_ENA;
      }

      /* Ideally flush TC together with CB/DB. */
      if (flags & SI_CONTEXT_INV_L2) {
         /* Writeback and invalidate everything in L2 & L1. */
         tc_flags = EVENT_TC_ACTION_ENA | EVENT_TC_WB_ACTION_ENA;

         /* Clear the flags. */
         flags &= ~(SI_CONTEXT_INV_L2 | SI_CONTEXT_WB_L2 | SI_CONTEXT_INV_VCACHE);
         sctx->num_L2_invalidates++;
      }

      /* Do the flush (enqueue the event and wait for it). */
      struct si_resource* wait_mem_scratch = unlikely(sctx->ws->cs_is_secure(cs)) ?
         sctx->wait_mem_scratch_tmz : sctx->wait_mem_scratch;
      va = wait_mem_scratch->gpu_address;
      sctx->wait_mem_number++;

      si_cp_release_mem(sctx, cs, cb_db_event, tc_flags, EOP_DST_SEL_MEM,
                        EOP_INT_SEL_SEND_DATA_AFTER_WR_CONFIRM, EOP_DATA_SEL_VALUE_32BIT,
                        wait_mem_scratch, va, sctx->wait_mem_number, SI_NOT_QUERY);
      si_cp_wait_mem(sctx, cs, va, sctx->wait_mem_number, 0xffffffff, WAIT_REG_MEM_EQUAL);
   }

   /* Make sure ME is idle (it executes most packets) before continuing.
    * This prevents read-after-write hazards between PFP and ME.
    */
   if (sctx->has_graphics &&
       (cp_coher_cntl || (flags & (SI_CONTEXT_CS_PARTIAL_FLUSH | SI_CONTEXT_INV_VCACHE |
                                   SI_CONTEXT_INV_L2 | SI_CONTEXT_WB_L2)))) {
      radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
      radeon_emit(cs, 0);
   }

   /* GFX6-GFX8 only:
    * When one of the CP_COHER_CNTL.DEST_BASE flags is set, SURFACE_SYNC
    * waits for idle, so it should be last. SURFACE_SYNC is done in PFP.
    *
    * cp_coher_cntl should contain all necessary flags except TC flags
    * at this point.
    *
    * GFX6-GFX7 don't support L2 write-back.
    */
   if (flags & SI_CONTEXT_INV_L2 || (sctx->chip_class <= GFX7 && (flags & SI_CONTEXT_WB_L2))) {
      /* Invalidate L1 & L2. (L1 is always invalidated on GFX6)
       * WB must be set on GFX8+ when TC_ACTION is set.
       */
      si_emit_surface_sync(sctx, cs,
                           cp_coher_cntl | S_0085F0_TC_ACTION_ENA(1) | S_0085F0_TCL1_ACTION_ENA(1) |
                              S_0301F0_TC_WB_ACTION_ENA(sctx->chip_class >= GFX8));
      cp_coher_cntl = 0;
      sctx->num_L2_invalidates++;
   } else {
      /* L1 invalidation and L2 writeback must be done separately,
       * because both operations can't be done together.
       */
      if (flags & SI_CONTEXT_WB_L2) {
         /* WB = write-back
          * NC = apply to non-coherent MTYPEs
          *      (i.e. MTYPE <= 1, which is what we use everywhere)
          *
          * WB doesn't work without NC.
          */
         si_emit_surface_sync(
            sctx, cs,
            cp_coher_cntl | S_0301F0_TC_WB_ACTION_ENA(1) | S_0301F0_TC_NC_ACTION_ENA(1));
         cp_coher_cntl = 0;
         sctx->num_L2_writebacks++;
      }
      if (flags & SI_CONTEXT_INV_VCACHE) {
         /* Invalidate per-CU VMEM L1. */
         si_emit_surface_sync(sctx, cs, cp_coher_cntl | S_0085F0_TCL1_ACTION_ENA(1));
         cp_coher_cntl = 0;
      }
   }

   /* If TC flushes haven't cleared this... */
   if (cp_coher_cntl)
      si_emit_surface_sync(sctx, cs, cp_coher_cntl);

   if (is_barrier)
      si_prim_discard_signal_next_compute_ib_start(sctx);

   if (flags & SI_CONTEXT_START_PIPELINE_STATS) {
      radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
      radeon_emit(cs, EVENT_TYPE(V_028A90_PIPELINESTAT_START) | EVENT_INDEX(0));
   } else if (flags & SI_CONTEXT_STOP_PIPELINE_STATS) {
      radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
      radeon_emit(cs, EVENT_TYPE(V_028A90_PIPELINESTAT_STOP) | EVENT_INDEX(0));
   }

   sctx->flags = 0;
}
@@ -1450,6 +1450,10 @@ void si_allocate_gds(struct si_context *ctx);
void si_set_tracked_regs_to_clear_state(struct si_context *ctx);
void si_begin_new_gfx_cs(struct si_context *ctx, bool first_cs);
void si_need_gfx_cs_space(struct si_context *ctx, unsigned num_draws);
void si_emit_surface_sync(struct si_context *sctx, struct radeon_cmdbuf *cs,
                          unsigned cp_coher_cntl);
void gfx10_emit_cache_flush(struct si_context *sctx, struct radeon_cmdbuf *cs);
void si_emit_cache_flush(struct si_context *sctx, struct radeon_cmdbuf *cs);

/* si_gpu_load.c */
void si_gpu_load_kill_thread(struct si_screen *sscreen);
@@ -587,11 +587,7 @@ unsigned si_get_input_prim(const struct si_shader_selector *gs);
bool si_update_ngg(struct si_context *sctx);

/* si_state_draw.c */
void si_emit_surface_sync(struct si_context *sctx, struct radeon_cmdbuf *cs,
                          unsigned cp_coher_cntl);
void si_prim_discard_signal_next_compute_ib_start(struct si_context *sctx);
void gfx10_emit_cache_flush(struct si_context *sctx, struct radeon_cmdbuf *cs);
void si_emit_cache_flush(struct si_context *sctx, struct radeon_cmdbuf *cs);
void si_trace_emit(struct si_context *sctx);
void si_init_draw_functions(struct si_context *sctx);
@@ -1135,37 +1135,6 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw
   EMIT_SQTT_END_DRAW;
}

extern "C"
void si_emit_surface_sync(struct si_context *sctx, struct radeon_cmdbuf *cs, unsigned cp_coher_cntl)
{
   bool compute_ib = !sctx->has_graphics || cs == &sctx->prim_discard_compute_cs;

   assert(sctx->chip_class <= GFX9);

   if (sctx->chip_class == GFX9 || compute_ib) {
      /* Flush caches and wait for the caches to assert idle. */
      radeon_emit(cs, PKT3(PKT3_ACQUIRE_MEM, 5, 0));
      radeon_emit(cs, cp_coher_cntl); /* CP_COHER_CNTL */
      radeon_emit(cs, 0xffffffff);    /* CP_COHER_SIZE */
      radeon_emit(cs, 0xffffff);      /* CP_COHER_SIZE_HI */
      radeon_emit(cs, 0);             /* CP_COHER_BASE */
      radeon_emit(cs, 0);             /* CP_COHER_BASE_HI */
      radeon_emit(cs, 0x0000000A);    /* POLL_INTERVAL */
   } else {
      /* ACQUIRE_MEM is only required on a compute ring. */
      radeon_emit(cs, PKT3(PKT3_SURFACE_SYNC, 3, 0));
      radeon_emit(cs, cp_coher_cntl); /* CP_COHER_CNTL */
      radeon_emit(cs, 0xffffffff);    /* CP_COHER_SIZE */
      radeon_emit(cs, 0);             /* CP_COHER_BASE */
      radeon_emit(cs, 0x0000000A);    /* POLL_INTERVAL */
   }

   /* ACQUIRE_MEM has an implicit context roll if the current context
    * is busy. */
   if (!compute_ib)
      sctx->context_roll = true;
}

extern "C"
void si_prim_discard_signal_next_compute_ib_start(struct si_context *sctx)
{
@@ -1192,422 +1161,6 @@ void si_prim_discard_signal_next_compute_ib_start(struct si_context *sctx)
   *sctx->last_pkt3_write_data = PKT3(PKT3_NOP, 3, 0);
}

void gfx10_emit_cache_flush(struct si_context *ctx, struct radeon_cmdbuf *cs)
{
   uint32_t gcr_cntl = 0;
   unsigned cb_db_event = 0;
   unsigned flags = ctx->flags;

   if (!ctx->has_graphics) {
      /* Only process compute flags. */
      flags &= SI_CONTEXT_INV_ICACHE | SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE |
               SI_CONTEXT_INV_L2 | SI_CONTEXT_WB_L2 | SI_CONTEXT_INV_L2_METADATA |
               SI_CONTEXT_CS_PARTIAL_FLUSH;
   }

   /* We don't need these. */
   assert(!(flags & (SI_CONTEXT_VGT_STREAMOUT_SYNC | SI_CONTEXT_FLUSH_AND_INV_DB_META)));

   if (flags & SI_CONTEXT_VGT_FLUSH) {
      radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
      radeon_emit(cs, EVENT_TYPE(V_028A90_VGT_FLUSH) | EVENT_INDEX(0));
   }

   if (flags & SI_CONTEXT_FLUSH_AND_INV_CB)
      ctx->num_cb_cache_flushes++;
   if (flags & SI_CONTEXT_FLUSH_AND_INV_DB)
      ctx->num_db_cache_flushes++;

   if (flags & SI_CONTEXT_INV_ICACHE)
      gcr_cntl |= S_586_GLI_INV(V_586_GLI_ALL);
   if (flags & SI_CONTEXT_INV_SCACHE) {
      /* TODO: When writing to the SMEM L1 cache, we need to set SEQ
       * to FORWARD when both L1 and L2 are written out (WB or INV).
       */
      gcr_cntl |= S_586_GL1_INV(1) | S_586_GLK_INV(1);
   }
   if (flags & SI_CONTEXT_INV_VCACHE)
      gcr_cntl |= S_586_GL1_INV(1) | S_586_GLV_INV(1);

   /* The L2 cache ops are:
    * - INV: - invalidate lines that reflect memory (were loaded from memory)
    *        - don't touch lines that were overwritten (were stored by gfx clients)
    * - WB: - don't touch lines that reflect memory
    *       - write back lines that were overwritten
    * - WB | INV: - invalidate lines that reflect memory
    *             - write back lines that were overwritten
    *
    * GLM doesn't support WB alone. If WB is set, INV must be set too.
    */
   if (flags & SI_CONTEXT_INV_L2) {
      /* Writeback and invalidate everything in L2. */
      gcr_cntl |= S_586_GL2_INV(1) | S_586_GL2_WB(1) | S_586_GLM_INV(1) | S_586_GLM_WB(1);
      ctx->num_L2_invalidates++;
   } else if (flags & SI_CONTEXT_WB_L2) {
      gcr_cntl |= S_586_GL2_WB(1) | S_586_GLM_WB(1) | S_586_GLM_INV(1);
   } else if (flags & SI_CONTEXT_INV_L2_METADATA) {
      gcr_cntl |= S_586_GLM_INV(1) | S_586_GLM_WB(1);
   }

   if (flags & (SI_CONTEXT_FLUSH_AND_INV_CB | SI_CONTEXT_FLUSH_AND_INV_DB)) {
      if (flags & SI_CONTEXT_FLUSH_AND_INV_CB) {
         /* Flush CMASK/FMASK/DCC. Will wait for idle later. */
         radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
         radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_AND_INV_CB_META) | EVENT_INDEX(0));
      }
      if (flags & SI_CONTEXT_FLUSH_AND_INV_DB) {
         /* Flush HTILE. Will wait for idle later. */
         radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
         radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_AND_INV_DB_META) | EVENT_INDEX(0));
      }

      /* First flush CB/DB, then L1/L2. */
      gcr_cntl |= S_586_SEQ(V_586_SEQ_FORWARD);

      if ((flags & (SI_CONTEXT_FLUSH_AND_INV_CB | SI_CONTEXT_FLUSH_AND_INV_DB)) ==
          (SI_CONTEXT_FLUSH_AND_INV_CB | SI_CONTEXT_FLUSH_AND_INV_DB)) {
         cb_db_event = V_028A90_CACHE_FLUSH_AND_INV_TS_EVENT;
      } else if (flags & SI_CONTEXT_FLUSH_AND_INV_CB) {
         cb_db_event = V_028A90_FLUSH_AND_INV_CB_DATA_TS;
      } else if (flags & SI_CONTEXT_FLUSH_AND_INV_DB) {
         cb_db_event = V_028A90_FLUSH_AND_INV_DB_DATA_TS;
      } else {
         assert(0);
      }
   } else {
      /* Wait for graphics shaders to go idle if requested. */
      if (flags & SI_CONTEXT_PS_PARTIAL_FLUSH) {
         radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
         radeon_emit(cs, EVENT_TYPE(V_028A90_PS_PARTIAL_FLUSH) | EVENT_INDEX(4));
         /* Only count explicit shader flushes, not implicit ones. */
         ctx->num_vs_flushes++;
         ctx->num_ps_flushes++;
      } else if (flags & SI_CONTEXT_VS_PARTIAL_FLUSH) {
         radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
         radeon_emit(cs, EVENT_TYPE(V_028A90_VS_PARTIAL_FLUSH) | EVENT_INDEX(4));
         ctx->num_vs_flushes++;
      }
   }

   if (flags & SI_CONTEXT_CS_PARTIAL_FLUSH && ctx->compute_is_busy) {
      radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
      radeon_emit(cs, EVENT_TYPE(V_028A90_CS_PARTIAL_FLUSH | EVENT_INDEX(4)));
      ctx->num_cs_flushes++;
      ctx->compute_is_busy = false;
   }

   if (cb_db_event) {
      struct si_resource* wait_mem_scratch = unlikely(ctx->ws->cs_is_secure(cs)) ?
         ctx->wait_mem_scratch_tmz : ctx->wait_mem_scratch;
      /* CB/DB flush and invalidate (or possibly just a wait for a
       * meta flush) via RELEASE_MEM.
       *
       * Combine this with other cache flushes when possible; this
       * requires affected shaders to be idle, so do it after the
       * CS_PARTIAL_FLUSH before (VS/PS partial flushes are always
       * implied).
       */
      uint64_t va;

      /* Do the flush (enqueue the event and wait for it). */
      va = wait_mem_scratch->gpu_address;
      ctx->wait_mem_number++;

      /* Get GCR_CNTL fields, because the encoding is different in RELEASE_MEM. */
      unsigned glm_wb = G_586_GLM_WB(gcr_cntl);
      unsigned glm_inv = G_586_GLM_INV(gcr_cntl);
      unsigned glv_inv = G_586_GLV_INV(gcr_cntl);
      unsigned gl1_inv = G_586_GL1_INV(gcr_cntl);
      assert(G_586_GL2_US(gcr_cntl) == 0);
      assert(G_586_GL2_RANGE(gcr_cntl) == 0);
      assert(G_586_GL2_DISCARD(gcr_cntl) == 0);
      unsigned gl2_inv = G_586_GL2_INV(gcr_cntl);
      unsigned gl2_wb = G_586_GL2_WB(gcr_cntl);
      unsigned gcr_seq = G_586_SEQ(gcr_cntl);

      gcr_cntl &= C_586_GLM_WB & C_586_GLM_INV & C_586_GLV_INV & C_586_GL1_INV & C_586_GL2_INV &
                  C_586_GL2_WB; /* keep SEQ */

      si_cp_release_mem(ctx, cs, cb_db_event,
                        S_490_GLM_WB(glm_wb) | S_490_GLM_INV(glm_inv) | S_490_GLV_INV(glv_inv) |
                           S_490_GL1_INV(gl1_inv) | S_490_GL2_INV(gl2_inv) | S_490_GL2_WB(gl2_wb) |
                           S_490_SEQ(gcr_seq),
                        EOP_DST_SEL_MEM, EOP_INT_SEL_SEND_DATA_AFTER_WR_CONFIRM,
                        EOP_DATA_SEL_VALUE_32BIT, wait_mem_scratch, va, ctx->wait_mem_number,
                        SI_NOT_QUERY);
      si_cp_wait_mem(ctx, cs, va, ctx->wait_mem_number, 0xffffffff, WAIT_REG_MEM_EQUAL);
   }

   /* Ignore fields that only modify the behavior of other fields. */
   if (gcr_cntl & C_586_GL1_RANGE & C_586_GL2_RANGE & C_586_SEQ) {
      /* Flush caches and wait for the caches to assert idle.
       * The cache flush is executed in the ME, but the PFP waits
       * for completion.
       */
      radeon_emit(cs, PKT3(PKT3_ACQUIRE_MEM, 6, 0));
      radeon_emit(cs, 0);          /* CP_COHER_CNTL */
      radeon_emit(cs, 0xffffffff); /* CP_COHER_SIZE */
      radeon_emit(cs, 0xffffff);   /* CP_COHER_SIZE_HI */
      radeon_emit(cs, 0);          /* CP_COHER_BASE */
      radeon_emit(cs, 0);          /* CP_COHER_BASE_HI */
      radeon_emit(cs, 0x0000000A); /* POLL_INTERVAL */
      radeon_emit(cs, gcr_cntl);   /* GCR_CNTL */
   } else if (cb_db_event || (flags & (SI_CONTEXT_VS_PARTIAL_FLUSH | SI_CONTEXT_PS_PARTIAL_FLUSH |
                                       SI_CONTEXT_CS_PARTIAL_FLUSH))) {
      /* We need to ensure that PFP waits as well. */
      radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
      radeon_emit(cs, 0);
   }

   if (flags & SI_CONTEXT_START_PIPELINE_STATS) {
      radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
      radeon_emit(cs, EVENT_TYPE(V_028A90_PIPELINESTAT_START) | EVENT_INDEX(0));
   } else if (flags & SI_CONTEXT_STOP_PIPELINE_STATS) {
      radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
      radeon_emit(cs, EVENT_TYPE(V_028A90_PIPELINESTAT_STOP) | EVENT_INDEX(0));
   }

   ctx->flags = 0;
}

extern "C"
void si_emit_cache_flush(struct si_context *sctx, struct radeon_cmdbuf *cs)
{
   uint32_t flags = sctx->flags;

   if (!sctx->has_graphics) {
      /* Only process compute flags. */
      flags &= SI_CONTEXT_INV_ICACHE | SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE |
               SI_CONTEXT_INV_L2 | SI_CONTEXT_WB_L2 | SI_CONTEXT_INV_L2_METADATA |
               SI_CONTEXT_CS_PARTIAL_FLUSH;
   }

   uint32_t cp_coher_cntl = 0;
   const uint32_t flush_cb_db = flags & (SI_CONTEXT_FLUSH_AND_INV_CB | SI_CONTEXT_FLUSH_AND_INV_DB);
   const bool is_barrier =
      flush_cb_db ||
      /* INV_ICACHE == beginning of gfx IB. Checking
       * INV_ICACHE fixes corruption for DeusExMD with
       * compute-based culling, but I don't know why.
       */
      flags & (SI_CONTEXT_INV_ICACHE | SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_VS_PARTIAL_FLUSH) ||
      (flags & SI_CONTEXT_CS_PARTIAL_FLUSH && sctx->compute_is_busy);

   assert(sctx->chip_class <= GFX9);

   if (flags & SI_CONTEXT_FLUSH_AND_INV_CB)
      sctx->num_cb_cache_flushes++;
   if (flags & SI_CONTEXT_FLUSH_AND_INV_DB)
      sctx->num_db_cache_flushes++;

   /* GFX6 has a bug that it always flushes ICACHE and KCACHE if either
    * bit is set. An alternative way is to write SQC_CACHES, but that
    * doesn't seem to work reliably. Since the bug doesn't affect
    * correctness (it only does more work than necessary) and
    * the performance impact is likely negligible, there is no plan
    * to add a workaround for it.
    */

   if (flags & SI_CONTEXT_INV_ICACHE)
      cp_coher_cntl |= S_0085F0_SH_ICACHE_ACTION_ENA(1);
   if (flags & SI_CONTEXT_INV_SCACHE)
      cp_coher_cntl |= S_0085F0_SH_KCACHE_ACTION_ENA(1);

   if (sctx->chip_class <= GFX8) {
      if (flags & SI_CONTEXT_FLUSH_AND_INV_CB) {
         cp_coher_cntl |= S_0085F0_CB_ACTION_ENA(1) | S_0085F0_CB0_DEST_BASE_ENA(1) |
                          S_0085F0_CB1_DEST_BASE_ENA(1) | S_0085F0_CB2_DEST_BASE_ENA(1) |
                          S_0085F0_CB3_DEST_BASE_ENA(1) | S_0085F0_CB4_DEST_BASE_ENA(1) |
                          S_0085F0_CB5_DEST_BASE_ENA(1) | S_0085F0_CB6_DEST_BASE_ENA(1) |
                          S_0085F0_CB7_DEST_BASE_ENA(1);

         /* Necessary for DCC */
         if (sctx->chip_class == GFX8)
            si_cp_release_mem(sctx, cs, V_028A90_FLUSH_AND_INV_CB_DATA_TS, 0, EOP_DST_SEL_MEM,
                              EOP_INT_SEL_NONE, EOP_DATA_SEL_DISCARD, NULL, 0, 0, SI_NOT_QUERY);
      }
      if (flags & SI_CONTEXT_FLUSH_AND_INV_DB)
         cp_coher_cntl |= S_0085F0_DB_ACTION_ENA(1) | S_0085F0_DB_DEST_BASE_ENA(1);
   }

   if (flags & SI_CONTEXT_FLUSH_AND_INV_CB) {
      /* Flush CMASK/FMASK/DCC. SURFACE_SYNC will wait for idle. */
      radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
      radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_AND_INV_CB_META) | EVENT_INDEX(0));
   }
   if (flags & (SI_CONTEXT_FLUSH_AND_INV_DB | SI_CONTEXT_FLUSH_AND_INV_DB_META)) {
      /* Flush HTILE. SURFACE_SYNC will wait for idle. */
      radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
      radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_AND_INV_DB_META) | EVENT_INDEX(0));
   }

   /* Wait for shader engines to go idle.
    * VS and PS waits are unnecessary if SURFACE_SYNC is going to wait
    * for everything including CB/DB cache flushes.
    */
   if (!flush_cb_db) {
      if (flags & SI_CONTEXT_PS_PARTIAL_FLUSH) {
         radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
         radeon_emit(cs, EVENT_TYPE(V_028A90_PS_PARTIAL_FLUSH) | EVENT_INDEX(4));
         /* Only count explicit shader flushes, not implicit ones
          * done by SURFACE_SYNC.
          */
         sctx->num_vs_flushes++;
         sctx->num_ps_flushes++;
      } else if (flags & SI_CONTEXT_VS_PARTIAL_FLUSH) {
         radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
         radeon_emit(cs, EVENT_TYPE(V_028A90_VS_PARTIAL_FLUSH) | EVENT_INDEX(4));
         sctx->num_vs_flushes++;
      }
   }

   if (flags & SI_CONTEXT_CS_PARTIAL_FLUSH && sctx->compute_is_busy) {
      radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
      radeon_emit(cs, EVENT_TYPE(V_028A90_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));
      sctx->num_cs_flushes++;
      sctx->compute_is_busy = false;
   }

   /* VGT state synchronization. */
   if (flags & SI_CONTEXT_VGT_FLUSH) {
      radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
      radeon_emit(cs, EVENT_TYPE(V_028A90_VGT_FLUSH) | EVENT_INDEX(0));
   }
   if (flags & SI_CONTEXT_VGT_STREAMOUT_SYNC) {
      radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
      radeon_emit(cs, EVENT_TYPE(V_028A90_VGT_STREAMOUT_SYNC) | EVENT_INDEX(0));
   }

   /* GFX9: Wait for idle if we're flushing CB or DB. ACQUIRE_MEM doesn't
    * wait for idle on GFX9. We have to use a TS event.
    */
   if (sctx->chip_class == GFX9 && flush_cb_db) {
      uint64_t va;
      unsigned tc_flags, cb_db_event;

      /* Set the CB/DB flush event. */
      switch (flush_cb_db) {
      case SI_CONTEXT_FLUSH_AND_INV_CB:
         cb_db_event = V_028A90_FLUSH_AND_INV_CB_DATA_TS;
         break;
      case SI_CONTEXT_FLUSH_AND_INV_DB:
         cb_db_event = V_028A90_FLUSH_AND_INV_DB_DATA_TS;
         break;
      default:
         /* both CB & DB */
         cb_db_event = V_028A90_CACHE_FLUSH_AND_INV_TS_EVENT;
      }

      /* These are the only allowed combinations. If you need to
       * do multiple operations at once, do them separately.
       * All operations that invalidate L2 also seem to invalidate
       * metadata. Volatile (VOL) and WC flushes are not listed here.
       *
       * TC | TC_WB         = writeback & invalidate L2 & L1
       * TC | TC_WB | TC_NC = writeback & invalidate L2 for MTYPE == NC
       *      TC_WB | TC_NC = writeback L2 for MTYPE == NC
       * TC         | TC_NC = invalidate L2 for MTYPE == NC
       * TC | TC_MD         = writeback & invalidate L2 metadata (DCC, etc.)
       * TCL1               = invalidate L1
       */
      tc_flags = 0;

      if (flags & SI_CONTEXT_INV_L2_METADATA) {
         tc_flags = EVENT_TC_ACTION_ENA | EVENT_TC_MD_ACTION_ENA;
      }

      /* Ideally flush TC together with CB/DB. */
      if (flags & SI_CONTEXT_INV_L2) {
         /* Writeback and invalidate everything in L2 & L1. */
         tc_flags = EVENT_TC_ACTION_ENA | EVENT_TC_WB_ACTION_ENA;

         /* Clear the flags. */
         flags &= ~(SI_CONTEXT_INV_L2 | SI_CONTEXT_WB_L2 | SI_CONTEXT_INV_VCACHE);
         sctx->num_L2_invalidates++;
      }

      /* Do the flush (enqueue the event and wait for it). */
      struct si_resource* wait_mem_scratch = unlikely(sctx->ws->cs_is_secure(cs)) ?
         sctx->wait_mem_scratch_tmz : sctx->wait_mem_scratch;
      va = wait_mem_scratch->gpu_address;
      sctx->wait_mem_number++;

      si_cp_release_mem(sctx, cs, cb_db_event, tc_flags, EOP_DST_SEL_MEM,
                        EOP_INT_SEL_SEND_DATA_AFTER_WR_CONFIRM, EOP_DATA_SEL_VALUE_32BIT,
                        wait_mem_scratch, va, sctx->wait_mem_number, SI_NOT_QUERY);
      si_cp_wait_mem(sctx, cs, va, sctx->wait_mem_number, 0xffffffff, WAIT_REG_MEM_EQUAL);
   }

   /* Make sure ME is idle (it executes most packets) before continuing.
    * This prevents read-after-write hazards between PFP and ME.
    */
   if (sctx->has_graphics &&
       (cp_coher_cntl || (flags & (SI_CONTEXT_CS_PARTIAL_FLUSH | SI_CONTEXT_INV_VCACHE |
                                   SI_CONTEXT_INV_L2 | SI_CONTEXT_WB_L2)))) {
      radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
      radeon_emit(cs, 0);
   }

   /* GFX6-GFX8 only:
    * When one of the CP_COHER_CNTL.DEST_BASE flags is set, SURFACE_SYNC
    * waits for idle, so it should be last. SURFACE_SYNC is done in PFP.
    *
    * cp_coher_cntl should contain all necessary flags except TC flags
    * at this point.
    *
    * GFX6-GFX7 don't support L2 write-back.
    */
   if (flags & SI_CONTEXT_INV_L2 || (sctx->chip_class <= GFX7 && (flags & SI_CONTEXT_WB_L2))) {
      /* Invalidate L1 & L2. (L1 is always invalidated on GFX6)
       * WB must be set on GFX8+ when TC_ACTION is set.
       */
      si_emit_surface_sync(sctx, cs,
                           cp_coher_cntl | S_0085F0_TC_ACTION_ENA(1) | S_0085F0_TCL1_ACTION_ENA(1) |
                              S_0301F0_TC_WB_ACTION_ENA(sctx->chip_class >= GFX8));
      cp_coher_cntl = 0;
      sctx->num_L2_invalidates++;
   } else {
      /* L1 invalidation and L2 writeback must be done separately,
       * because both operations can't be done together.
       */
      if (flags & SI_CONTEXT_WB_L2) {
         /* WB = write-back
          * NC = apply to non-coherent MTYPEs
          *      (i.e. MTYPE <= 1, which is what we use everywhere)
          *
          * WB doesn't work without NC.
          */
         si_emit_surface_sync(
            sctx, cs,
            cp_coher_cntl | S_0301F0_TC_WB_ACTION_ENA(1) | S_0301F0_TC_NC_ACTION_ENA(1));
         cp_coher_cntl = 0;
         sctx->num_L2_writebacks++;
      }
      if (flags & SI_CONTEXT_INV_VCACHE) {
         /* Invalidate per-CU VMEM L1. */
         si_emit_surface_sync(sctx, cs, cp_coher_cntl | S_0085F0_TCL1_ACTION_ENA(1));
         cp_coher_cntl = 0;
      }
   }

   /* If TC flushes haven't cleared this... */
   if (cp_coher_cntl)
      si_emit_surface_sync(sctx, cs, cp_coher_cntl);

   if (is_barrier)
      si_prim_discard_signal_next_compute_ib_start(sctx);

   if (flags & SI_CONTEXT_START_PIPELINE_STATS) {
      radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
      radeon_emit(cs, EVENT_TYPE(V_028A90_PIPELINESTAT_START) | EVENT_INDEX(0));
   } else if (flags & SI_CONTEXT_STOP_PIPELINE_STATS) {
      radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
      radeon_emit(cs, EVENT_TYPE(V_028A90_PIPELINESTAT_STOP) | EVENT_INDEX(0));
   }

   sctx->flags = 0;
}

template <chip_class GFX_VERSION> ALWAYS_INLINE
static bool si_upload_vertex_buffer_descriptors(struct si_context *sctx)
{