amd,radv,radeonsi: add ac_emit_cp_acquire_mem()

Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/37870>
Samuel Pitoiset 2025-10-14 14:57:48 +02:00 committed by Marge Bot
parent 9ad7fb8569
commit 679332f9a9
4 changed files with 69 additions and 87 deletions

View file

@@ -1192,3 +1192,54 @@ ac_emit_cp_gfx_scratch(struct ac_cmdbuf *cs, enum amd_gfx_level gfx_level,
   ac_cmdbuf_end();
}

/* Execute plain ACQUIRE_MEM that just flushes caches. This optionally waits
 * for idle on older chips. "engine" determines whether to sync in PFP or ME.
 */
void
ac_emit_cp_acquire_mem(struct ac_cmdbuf *cs, enum amd_gfx_level gfx_level,
                       enum amd_ip_type ip_type, uint32_t engine,
                       uint32_t gcr_cntl)
{
   assert(engine == V_580_CP_PFP || engine == V_580_CP_ME);
   assert(gcr_cntl);

   ac_cmdbuf_begin(cs);

   if (gfx_level >= GFX10) {
      /* ACQUIRE_MEM in PFP is implemented as ACQUIRE_MEM in ME + PFP_SYNC_ME. */
      const uint32_t engine_flag = engine == V_580_CP_ME ? BITFIELD_BIT(31) : 0;

      /* Flush caches. This doesn't wait for idle. */
      ac_cmdbuf_emit(PKT3(PKT3_ACQUIRE_MEM, 6, 0));
      ac_cmdbuf_emit(engine_flag); /* which engine to use */
      ac_cmdbuf_emit(0xffffffff);  /* CP_COHER_SIZE */
      ac_cmdbuf_emit(0x01ffffff);  /* CP_COHER_SIZE_HI */
      ac_cmdbuf_emit(0);           /* CP_COHER_BASE */
      ac_cmdbuf_emit(0);           /* CP_COHER_BASE_HI */
      ac_cmdbuf_emit(0x0000000A);  /* POLL_INTERVAL */
      ac_cmdbuf_emit(gcr_cntl);    /* GCR_CNTL */
   } else {
      const bool is_mec = gfx_level >= GFX7 && ip_type == AMD_IP_COMPUTE;

      if (gfx_level == GFX9 || is_mec) {
         /* Flush caches and wait for the caches to assert idle. */
         ac_cmdbuf_emit(PKT3(PKT3_ACQUIRE_MEM, 5, 0) | PKT3_SHADER_TYPE_S(is_mec));
         ac_cmdbuf_emit(gcr_cntl);   /* CP_COHER_CNTL */
         ac_cmdbuf_emit(0xffffffff); /* CP_COHER_SIZE */
         ac_cmdbuf_emit(0xffffff);   /* CP_COHER_SIZE_HI */
         ac_cmdbuf_emit(0);          /* CP_COHER_BASE */
         ac_cmdbuf_emit(0);          /* CP_COHER_BASE_HI */
         ac_cmdbuf_emit(0x0000000A); /* POLL_INTERVAL */
      } else {
         /* ACQUIRE_MEM is only required on the compute ring. */
         ac_cmdbuf_emit(PKT3(PKT3_SURFACE_SYNC, 3, 0));
         ac_cmdbuf_emit(gcr_cntl);   /* CP_COHER_CNTL */
         ac_cmdbuf_emit(0xffffffff); /* CP_COHER_SIZE */
         ac_cmdbuf_emit(0);          /* CP_COHER_BASE */
         ac_cmdbuf_emit(0x0000000A); /* POLL_INTERVAL */
      }
   }

   ac_cmdbuf_end();
}
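
For context, a minimal usage sketch of the new helper. This is not part of the commit; the GFX level, IP type, and the GCR bit chosen are illustrative assumptions.

/* Hypothetical caller (not from this commit): invalidate the GL2 cache
 * before subsequent commands read buffer data that bypassed the GPU caches.
 * GFX11, AMD_IP_GFX and S_586_GL2_INV are illustrative choices. */
static void flush_gl2_before_draw(struct ac_cmdbuf *cs)
{
   /* Sync in PFP so that commands fetched afterwards also observe the flush. */
   ac_emit_cp_acquire_mem(cs, GFX11, AMD_IP_GFX, V_580_CP_PFP,
                          S_586_GL2_INV(1));
}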

View file

@@ -160,6 +160,11 @@ void
ac_emit_cp_gfx_scratch(struct ac_cmdbuf *cs, enum amd_gfx_level gfx_level,
                       uint64_t va, uint32_t size);

void
ac_emit_cp_acquire_mem(struct ac_cmdbuf *cs, enum amd_gfx_level gfx_level,
                       enum amd_ip_type ip_type, uint32_t engine,
                       uint32_t gcr_cntl);

#ifdef __cplusplus
}
#endif
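
A note on the engine argument, inferred from the comments in the implementation above rather than stated by this commit: syncing in PFP matters when data consumed by the prefetch parser itself (e.g. indirect command arguments) must be coherent; otherwise syncing the ME is enough.

/* Hedged sketch of choosing the engine argument. The helper and heuristic
 * are hypothetical; only V_580_CP_PFP/V_580_CP_ME come from the code above. */
static uint32_t pick_acquire_mem_engine(bool pfp_reads_flushed_data)
{
   return pfp_reads_flushed_data ? V_580_CP_PFP : V_580_CP_ME;
}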

View file

@@ -103,32 +103,6 @@ radv_cs_emit_write_event_eop(struct radv_cmd_stream *cs, enum amd_gfx_level gfx_
   radeon_end();
}

static void
radv_emit_acquire_mem(struct radv_cmd_stream *cs, bool is_mec, bool is_gfx9, unsigned cp_coher_cntl)
{
   radeon_begin(cs);

   if (is_mec || is_gfx9) {
      uint32_t hi_val = is_gfx9 ? 0xffffff : 0xff;

      radeon_emit(PKT3(PKT3_ACQUIRE_MEM, 5, false) | PKT3_SHADER_TYPE_S(is_mec));
      radeon_emit(cp_coher_cntl); /* CP_COHER_CNTL */
      radeon_emit(0xffffffff);    /* CP_COHER_SIZE */
      radeon_emit(hi_val);        /* CP_COHER_SIZE_HI */
      radeon_emit(0);             /* CP_COHER_BASE */
      radeon_emit(0);             /* CP_COHER_BASE_HI */
      radeon_emit(0x0000000A);    /* POLL_INTERVAL */
   } else {
      /* ACQUIRE_MEM is only required on a compute ring. */
      radeon_emit(PKT3(PKT3_SURFACE_SYNC, 3, false));
      radeon_emit(cp_coher_cntl); /* CP_COHER_CNTL */
      radeon_emit(0xffffffff);    /* CP_COHER_SIZE */
      radeon_emit(0);             /* CP_COHER_BASE */
      radeon_emit(0x0000000A);    /* POLL_INTERVAL */
   }

   radeon_end();
}

static void
gfx10_cs_emit_cache_flush(struct radv_cmd_stream *cs, enum amd_gfx_level gfx_level, uint32_t *flush_cnt,
                          uint64_t flush_va, enum radv_cmd_flush_bits flush_bits, enum rgp_flush_bits *sqtt_flush_bits,
@@ -298,20 +272,7 @@ gfx10_cs_emit_cache_flush(struct radv_cmd_stream *cs, enum amd_gfx_level gfx_lev
   /* Ignore fields that only modify the behavior of other fields. */
   if (gcr_cntl & C_586_GL1_RANGE & C_586_GL2_RANGE & C_586_SEQ) {
      /* Flush caches and wait for the caches to assert idle.
       * The cache flush is executed in the ME, but the PFP waits
       * for completion.
       */
      radeon_begin(cs);
      radeon_emit(PKT3(PKT3_ACQUIRE_MEM, 6, 0));
      radeon_emit(0);          /* CP_COHER_CNTL */
      radeon_emit(0xffffffff); /* CP_COHER_SIZE */
      radeon_emit(0xffffff);   /* CP_COHER_SIZE_HI */
      radeon_emit(0);          /* CP_COHER_BASE */
      radeon_emit(0);          /* CP_COHER_BASE_HI */
      radeon_emit(0x0000000A); /* POLL_INTERVAL */
      radeon_emit(gcr_cntl);   /* GCR_CNTL */
      radeon_end();
      ac_emit_cp_acquire_mem(cs->b, gfx_level, cs->hw_ip, V_580_CP_PFP, gcr_cntl);
   } else if ((cb_db_event || (flush_bits & (RADV_CMD_FLAG_VS_PARTIAL_FLUSH | RADV_CMD_FLAG_PS_PARTIAL_FLUSH |
                                             RADV_CMD_FLAG_CS_PARTIAL_FLUSH))) &&
              !is_mec) {
@@ -494,9 +455,9 @@ radv_cs_emit_cache_flush(struct radeon_winsys *ws, struct radv_cmd_stream *cs, e
   }

   if ((flush_bits & RADV_CMD_FLAG_INV_L2) || (gfx_level <= GFX7 && (flush_bits & RADV_CMD_FLAG_WB_L2))) {
      radv_emit_acquire_mem(cs, is_mec, gfx_level == GFX9,
                            cp_coher_cntl | S_0085F0_TC_ACTION_ENA(1) | S_0085F0_TCL1_ACTION_ENA(1) |
                               S_0301F0_TC_WB_ACTION_ENA(gfx_level >= GFX8));
      ac_emit_cp_acquire_mem(cs->b, gfx_level, cs->hw_ip, V_580_CP_PFP,
                             cp_coher_cntl | S_0085F0_TC_ACTION_ENA(1) | S_0085F0_TCL1_ACTION_ENA(1) |
                                S_0301F0_TC_WB_ACTION_ENA(gfx_level >= GFX8));
      cp_coher_cntl = 0;

      *sqtt_flush_bits |= RGP_FLUSH_INVAL_L2 | RGP_FLUSH_INVAL_VMEM_L0;
@@ -508,14 +469,14 @@ radv_cs_emit_cache_flush(struct radeon_winsys *ws, struct radv_cmd_stream *cs, e
       *
       * WB doesn't work without NC.
       */
      radv_emit_acquire_mem(cs, is_mec, gfx_level == GFX9,
                            cp_coher_cntl | S_0301F0_TC_WB_ACTION_ENA(1) | S_0301F0_TC_NC_ACTION_ENA(1));
      ac_emit_cp_acquire_mem(cs->b, gfx_level, cs->hw_ip, V_580_CP_PFP,
                             cp_coher_cntl | S_0301F0_TC_WB_ACTION_ENA(1) | S_0301F0_TC_NC_ACTION_ENA(1));
      cp_coher_cntl = 0;

      *sqtt_flush_bits |= RGP_FLUSH_FLUSH_L2 | RGP_FLUSH_INVAL_VMEM_L0;
   }

   if (flush_bits & RADV_CMD_FLAG_INV_VCACHE) {
      radv_emit_acquire_mem(cs, is_mec, gfx_level == GFX9, cp_coher_cntl | S_0085F0_TCL1_ACTION_ENA(1));
      ac_emit_cp_acquire_mem(cs->b, gfx_level, cs->hw_ip, V_580_CP_PFP, cp_coher_cntl | S_0085F0_TCL1_ACTION_ENA(1));
      cp_coher_cntl = 0;

      *sqtt_flush_bits |= RGP_FLUSH_INVAL_VMEM_L0;
@@ -526,7 +487,7 @@ radv_cs_emit_cache_flush(struct radeon_winsys *ws, struct radv_cmd_stream *cs, e
    * Therefore, it should be last. Done in PFP.
    */
   if (cp_coher_cntl)
      radv_emit_acquire_mem(cs, is_mec, gfx_level == GFX9, cp_coher_cntl);
      ac_emit_cp_acquire_mem(cs->b, gfx_level, cs->hw_ip, V_580_CP_PFP, cp_coher_cntl);

   radeon_begin(cs);
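
To make the pre-GFX10 call sites above easier to follow, here is a hedged sketch of how the legacy CP_COHER_CNTL bits are composed and handed to the shared helper. The wrapper function is hypothetical; the bit names and values are taken from the diff.

/* Hypothetical wrapper around the GFX6-GFX9 L2 invalidation shown above. */
static void radv_invalidate_l2_legacy(struct radv_cmd_stream *cs, enum amd_gfx_level gfx_level)
{
   uint32_t cp_coher_cntl = S_0085F0_TC_ACTION_ENA(1) |  /* invalidate L2 */
                            S_0085F0_TCL1_ACTION_ENA(1); /* invalidate vector L1 */

   /* Per the call site above, GFX8+ also writes back L2 in the same packet. */
   if (gfx_level >= GFX8)
      cp_coher_cntl |= S_0301F0_TC_WB_ACTION_ENA(1);

   ac_emit_cp_acquire_mem(cs->b, gfx_level, cs->hw_ip, V_580_CP_PFP, cp_coher_cntl);
}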

View file

@@ -37,30 +37,14 @@ void si_cp_release_acquire_mem_pws(struct si_context *sctx, struct radeon_cmdbuf
   si_cp_acquire_mem_pws(sctx, cs, event_type, stage_sel, 0, 0, sqtt_flush_flags);
}

/* Execute plain ACQUIRE_MEM that just flushes caches. This optionally waits for idle on older
 * chips. "engine" determines whether to sync in PFP or ME.
 */
void si_cp_acquire_mem(struct si_context *sctx, struct radeon_cmdbuf *cs, unsigned gcr_cntl,
                       unsigned engine)
{
   assert(engine == V_580_CP_PFP || engine == V_580_CP_ME);
   assert(gcr_cntl);

   const enum amd_ip_type ip_type = sctx->is_gfx_queue ? AMD_IP_GFX : AMD_IP_COMPUTE;

   if (sctx->gfx_level >= GFX10) {
      /* ACQUIRE_MEM in PFP is implemented as ACQUIRE_MEM in ME + PFP_SYNC_ME. */
      unsigned engine_flag = engine == V_580_CP_ME ? BITFIELD_BIT(31) : 0;

      /* Flush caches. This doesn't wait for idle. */
      radeon_begin(cs);
      radeon_emit(PKT3(PKT3_ACQUIRE_MEM, 6, 0));
      radeon_emit(engine_flag); /* which engine to use */
      radeon_emit(0xffffffff);  /* CP_COHER_SIZE */
      radeon_emit(0x01ffffff);  /* CP_COHER_SIZE_HI */
      radeon_emit(0);           /* CP_COHER_BASE */
      radeon_emit(0);           /* CP_COHER_BASE_HI */
      radeon_emit(0x0000000A);  /* POLL_INTERVAL */
      radeon_emit(gcr_cntl);    /* GCR_CNTL */
      radeon_end();
      ac_emit_cp_acquire_mem(&cs->current, sctx->gfx_level, ip_type, engine,
                             gcr_cntl);
   } else {
      bool compute_ib = !sctx->is_gfx_queue;
@@ -68,27 +52,8 @@ void si_cp_acquire_mem(struct si_context *sctx, struct radeon_cmdbuf *cs, unsign
      if (sctx->gfx_level != GFX7)
         gcr_cntl |= 1u << 31; /* don't sync PFP, i.e. execute the sync in ME */

      if (sctx->gfx_level == GFX9 || compute_ib) {
         /* Flush caches and wait for the caches to assert idle. */
         radeon_begin(cs);
         radeon_emit(PKT3(PKT3_ACQUIRE_MEM, 5, 0));
         radeon_emit(gcr_cntl);   /* CP_COHER_CNTL */
         radeon_emit(0xffffffff); /* CP_COHER_SIZE */
         radeon_emit(0xffffff);   /* CP_COHER_SIZE_HI */
         radeon_emit(0);          /* CP_COHER_BASE */
         radeon_emit(0);          /* CP_COHER_BASE_HI */
         radeon_emit(0x0000000A); /* POLL_INTERVAL */
         radeon_end();
      } else {
         /* ACQUIRE_MEM is only required on the compute ring. */
         radeon_begin(cs);
         radeon_emit(PKT3(PKT3_SURFACE_SYNC, 3, 0));
         radeon_emit(gcr_cntl);   /* CP_COHER_CNTL */
         radeon_emit(0xffffffff); /* CP_COHER_SIZE */
         radeon_emit(0);          /* CP_COHER_BASE */
         radeon_emit(0x0000000A); /* POLL_INTERVAL */
         radeon_end();
      }
      ac_emit_cp_acquire_mem(&cs->current, sctx->gfx_level, ip_type, engine,
                             gcr_cntl);

      /* ACQUIRE_MEM & SURFACE_SYNC roll the context if the current context is busy. */
      if (!compute_ib)
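
Finally, a sketch of what a radeonsi caller looks like after this change. The wrapper function and the particular GCR bits are illustrative assumptions; si_cp_acquire_mem and V_580_CP_ME come from the diff.

/* Hypothetical radeonsi caller on GFX10+: write back and invalidate GL2,
 * syncing only the ME. S_586_GL2_INV/S_586_GL2_WB are illustrative choices. */
static void si_flush_gl2(struct si_context *sctx, struct radeon_cmdbuf *cs)
{
   unsigned gcr_cntl = S_586_GL2_INV(1) | S_586_GL2_WB(1);

   si_cp_acquire_mem(sctx, cs, gcr_cntl, V_580_CP_ME);
}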