From 679332f9a98f0443b23ad3781b6a0ea948324424 Mon Sep 17 00:00:00 2001 From: Samuel Pitoiset Date: Tue, 14 Oct 2025 14:57:48 +0200 Subject: [PATCH] amd,radv,radeonsi: add ac_emit_cp_acquire_mem() Signed-off-by: Samuel Pitoiset Part-of: --- src/amd/common/ac_cmdbuf.c | 51 ++++++++++++++++++++ src/amd/common/ac_cmdbuf.h | 5 ++ src/amd/vulkan/radv_cs.c | 55 ++++------------------ src/gallium/drivers/radeonsi/si_cp_utils.c | 45 ++---------------- 4 files changed, 69 insertions(+), 87 deletions(-) diff --git a/src/amd/common/ac_cmdbuf.c b/src/amd/common/ac_cmdbuf.c index 9029cc89561..a327fd26909 100644 --- a/src/amd/common/ac_cmdbuf.c +++ b/src/amd/common/ac_cmdbuf.c @@ -1192,3 +1192,54 @@ ac_emit_cp_gfx_scratch(struct ac_cmdbuf *cs, enum amd_gfx_level gfx_level, ac_cmdbuf_end(); } + +/* Execute plain ACQUIRE_MEM that just flushes caches. This optionally waits for idle on older + * chips. On GFX10+, "engine" determines whether to sync in PFP or ME; on older chips it is only + * validated and gcr_cntl is emitted as CP_COHER_CNTL. */ +void +ac_emit_cp_acquire_mem(struct ac_cmdbuf *cs, enum amd_gfx_level gfx_level, + enum amd_ip_type ip_type, uint32_t engine, + uint32_t gcr_cntl) +{ + assert(engine == V_580_CP_PFP || engine == V_580_CP_ME); + assert(gcr_cntl); + + ac_cmdbuf_begin(cs); + + if (gfx_level >= GFX10) { + /* ACQUIRE_MEM in PFP is implemented as ACQUIRE_MEM in ME + PFP_SYNC_ME. */ + const uint32_t engine_flag = engine == V_580_CP_ME ? BITFIELD_BIT(31) : 0; + + /* Flush caches. This doesn't wait for idle. 
*/ + ac_cmdbuf_emit(PKT3(PKT3_ACQUIRE_MEM, 6, 0)); + ac_cmdbuf_emit(engine_flag); /* which engine to use */ + ac_cmdbuf_emit(0xffffffff); /* CP_COHER_SIZE */ + ac_cmdbuf_emit(0x01ffffff); /* CP_COHER_SIZE_HI */ + ac_cmdbuf_emit(0); /* CP_COHER_BASE */ + ac_cmdbuf_emit(0); /* CP_COHER_BASE_HI */ + ac_cmdbuf_emit(0x0000000A); /* POLL_INTERVAL */ + ac_cmdbuf_emit(gcr_cntl); /* GCR_CNTL */ + } else { + const bool is_mec = gfx_level >= GFX7 && ip_type == AMD_IP_COMPUTE; + + if (gfx_level == GFX9 || is_mec) { + /* Flush caches and wait for the caches to assert idle. */ + ac_cmdbuf_emit(PKT3(PKT3_ACQUIRE_MEM, 5, 0) | PKT3_SHADER_TYPE_S(is_mec)); + ac_cmdbuf_emit(gcr_cntl); /* CP_COHER_CNTL */ + ac_cmdbuf_emit(0xffffffff); /* CP_COHER_SIZE */ + ac_cmdbuf_emit(0xffffff); /* CP_COHER_SIZE_HI */ + ac_cmdbuf_emit(0); /* CP_COHER_BASE */ + ac_cmdbuf_emit(0); /* CP_COHER_BASE_HI */ + ac_cmdbuf_emit(0x0000000A); /* POLL_INTERVAL */ + } else { + /* ACQUIRE_MEM is only required on the compute ring; use SURFACE_SYNC otherwise. */ + ac_cmdbuf_emit(PKT3(PKT3_SURFACE_SYNC, 3, 0)); + ac_cmdbuf_emit(gcr_cntl); /* CP_COHER_CNTL */ + ac_cmdbuf_emit(0xffffffff); /* CP_COHER_SIZE */ + ac_cmdbuf_emit(0); /* CP_COHER_BASE */ + ac_cmdbuf_emit(0x0000000A); /* POLL_INTERVAL */ + } + } + + ac_cmdbuf_end(); +} diff --git a/src/amd/common/ac_cmdbuf.h b/src/amd/common/ac_cmdbuf.h index 795ee567e1d..74d34db86c8 100644 --- a/src/amd/common/ac_cmdbuf.h +++ b/src/amd/common/ac_cmdbuf.h @@ -160,6 +160,11 @@ void ac_emit_cp_gfx_scratch(struct ac_cmdbuf *cs, enum amd_gfx_level gfx_level, uint64_t va, uint32_t size); +void +ac_emit_cp_acquire_mem(struct ac_cmdbuf *cs, enum amd_gfx_level gfx_level, + enum amd_ip_type ip_type, uint32_t engine, + uint32_t gcr_cntl); + #ifdef __cplusplus } #endif diff --git a/src/amd/vulkan/radv_cs.c b/src/amd/vulkan/radv_cs.c index 3a42a00f494..00c1515d9d1 100644 --- a/src/amd/vulkan/radv_cs.c +++ b/src/amd/vulkan/radv_cs.c @@ -103,32 +103,6 @@ radv_cs_emit_write_event_eop(struct radv_cmd_stream *cs, enum 
amd_gfx_level gfx_ radeon_end(); } -static void -radv_emit_acquire_mem(struct radv_cmd_stream *cs, bool is_mec, bool is_gfx9, unsigned cp_coher_cntl) -{ - radeon_begin(cs); - - if (is_mec || is_gfx9) { - uint32_t hi_val = is_gfx9 ? 0xffffff : 0xff; - radeon_emit(PKT3(PKT3_ACQUIRE_MEM, 5, false) | PKT3_SHADER_TYPE_S(is_mec)); - radeon_emit(cp_coher_cntl); /* CP_COHER_CNTL */ - radeon_emit(0xffffffff); /* CP_COHER_SIZE */ - radeon_emit(hi_val); /* CP_COHER_SIZE_HI */ - radeon_emit(0); /* CP_COHER_BASE */ - radeon_emit(0); /* CP_COHER_BASE_HI */ - radeon_emit(0x0000000A); /* POLL_INTERVAL */ - } else { - /* ACQUIRE_MEM is only required on a compute ring. */ - radeon_emit(PKT3(PKT3_SURFACE_SYNC, 3, false)); - radeon_emit(cp_coher_cntl); /* CP_COHER_CNTL */ - radeon_emit(0xffffffff); /* CP_COHER_SIZE */ - radeon_emit(0); /* CP_COHER_BASE */ - radeon_emit(0x0000000A); /* POLL_INTERVAL */ - } - - radeon_end(); -} - static void gfx10_cs_emit_cache_flush(struct radv_cmd_stream *cs, enum amd_gfx_level gfx_level, uint32_t *flush_cnt, uint64_t flush_va, enum radv_cmd_flush_bits flush_bits, enum rgp_flush_bits *sqtt_flush_bits, @@ -298,20 +272,7 @@ gfx10_cs_emit_cache_flush(struct radv_cmd_stream *cs, enum amd_gfx_level gfx_lev /* Ignore fields that only modify the behavior of other fields. */ if (gcr_cntl & C_586_GL1_RANGE & C_586_GL2_RANGE & C_586_SEQ) { - /* Flush caches and wait for the caches to assert idle. - * The cache flush is executed in the ME, but the PFP waits - * for completion. 
- */ - radeon_begin(cs); - radeon_emit(PKT3(PKT3_ACQUIRE_MEM, 6, 0)); - radeon_emit(0); /* CP_COHER_CNTL */ - radeon_emit(0xffffffff); /* CP_COHER_SIZE */ - radeon_emit(0xffffff); /* CP_COHER_SIZE_HI */ - radeon_emit(0); /* CP_COHER_BASE */ - radeon_emit(0); /* CP_COHER_BASE_HI */ - radeon_emit(0x0000000A); /* POLL_INTERVAL */ - radeon_emit(gcr_cntl); /* GCR_CNTL */ - radeon_end(); + ac_emit_cp_acquire_mem(cs->b, gfx_level, cs->hw_ip, V_580_CP_PFP, gcr_cntl); } else if ((cb_db_event || (flush_bits & (RADV_CMD_FLAG_VS_PARTIAL_FLUSH | RADV_CMD_FLAG_PS_PARTIAL_FLUSH | RADV_CMD_FLAG_CS_PARTIAL_FLUSH))) && !is_mec) { @@ -494,9 +455,9 @@ radv_cs_emit_cache_flush(struct radeon_winsys *ws, struct radv_cmd_stream *cs, e } if ((flush_bits & RADV_CMD_FLAG_INV_L2) || (gfx_level <= GFX7 && (flush_bits & RADV_CMD_FLAG_WB_L2))) { - radv_emit_acquire_mem(cs, is_mec, gfx_level == GFX9, - cp_coher_cntl | S_0085F0_TC_ACTION_ENA(1) | S_0085F0_TCL1_ACTION_ENA(1) | - S_0301F0_TC_WB_ACTION_ENA(gfx_level >= GFX8)); + ac_emit_cp_acquire_mem(cs->b, gfx_level, cs->hw_ip, V_580_CP_PFP, + cp_coher_cntl | S_0085F0_TC_ACTION_ENA(1) | S_0085F0_TCL1_ACTION_ENA(1) | + S_0301F0_TC_WB_ACTION_ENA(gfx_level >= GFX8)); cp_coher_cntl = 0; *sqtt_flush_bits |= RGP_FLUSH_INVAL_L2 | RGP_FLUSH_INVAL_VMEM_L0; @@ -508,14 +469,14 @@ radv_cs_emit_cache_flush(struct radeon_winsys *ws, struct radv_cmd_stream *cs, e * * WB doesn't work without NC. 
*/ - radv_emit_acquire_mem(cs, is_mec, gfx_level == GFX9, - cp_coher_cntl | S_0301F0_TC_WB_ACTION_ENA(1) | S_0301F0_TC_NC_ACTION_ENA(1)); + ac_emit_cp_acquire_mem(cs->b, gfx_level, cs->hw_ip, V_580_CP_PFP, + cp_coher_cntl | S_0301F0_TC_WB_ACTION_ENA(1) | S_0301F0_TC_NC_ACTION_ENA(1)); cp_coher_cntl = 0; *sqtt_flush_bits |= RGP_FLUSH_FLUSH_L2 | RGP_FLUSH_INVAL_VMEM_L0; } if (flush_bits & RADV_CMD_FLAG_INV_VCACHE) { - radv_emit_acquire_mem(cs, is_mec, gfx_level == GFX9, cp_coher_cntl | S_0085F0_TCL1_ACTION_ENA(1)); + ac_emit_cp_acquire_mem(cs->b, gfx_level, cs->hw_ip, V_580_CP_PFP, cp_coher_cntl | S_0085F0_TCL1_ACTION_ENA(1)); cp_coher_cntl = 0; *sqtt_flush_bits |= RGP_FLUSH_INVAL_VMEM_L0; @@ -526,7 +487,7 @@ radv_cs_emit_cache_flush(struct radeon_winsys *ws, struct radv_cmd_stream *cs, e * Therefore, it should be last. Done in PFP. */ if (cp_coher_cntl) - radv_emit_acquire_mem(cs, is_mec, gfx_level == GFX9, cp_coher_cntl); + ac_emit_cp_acquire_mem(cs->b, gfx_level, cs->hw_ip, V_580_CP_PFP, cp_coher_cntl); radeon_begin(cs); diff --git a/src/gallium/drivers/radeonsi/si_cp_utils.c b/src/gallium/drivers/radeonsi/si_cp_utils.c index 9599c13ecd7..6036e764e3a 100644 --- a/src/gallium/drivers/radeonsi/si_cp_utils.c +++ b/src/gallium/drivers/radeonsi/si_cp_utils.c @@ -37,30 +37,14 @@ void si_cp_release_acquire_mem_pws(struct si_context *sctx, struct radeon_cmdbuf si_cp_acquire_mem_pws(sctx, cs, event_type, stage_sel, 0, 0, sqtt_flush_flags); } -/* Execute plain ACQUIRE_MEM that just flushes caches. This optionally waits for idle on older - * chips. "engine" determines whether to sync in PFP or ME. - */ void si_cp_acquire_mem(struct si_context *sctx, struct radeon_cmdbuf *cs, unsigned gcr_cntl, unsigned engine) { - assert(engine == V_580_CP_PFP || engine == V_580_CP_ME); - assert(gcr_cntl); + const enum amd_ip_type ip_type = sctx->is_gfx_queue ? 
AMD_IP_GFX : AMD_IP_COMPUTE; if (sctx->gfx_level >= GFX10) { - /* ACQUIRE_MEM in PFP is implemented as ACQUIRE_MEM in ME + PFP_SYNC_ME. */ - unsigned engine_flag = engine == V_580_CP_ME ? BITFIELD_BIT(31) : 0; - - /* Flush caches. This doesn't wait for idle. */ - radeon_begin(cs); - radeon_emit(PKT3(PKT3_ACQUIRE_MEM, 6, 0)); - radeon_emit(engine_flag); /* which engine to use */ - radeon_emit(0xffffffff); /* CP_COHER_SIZE */ - radeon_emit(0x01ffffff); /* CP_COHER_SIZE_HI */ - radeon_emit(0); /* CP_COHER_BASE */ - radeon_emit(0); /* CP_COHER_BASE_HI */ - radeon_emit(0x0000000A); /* POLL_INTERVAL */ - radeon_emit(gcr_cntl); /* GCR_CNTL */ - radeon_end(); + ac_emit_cp_acquire_mem(&cs->current, sctx->gfx_level, ip_type, engine, + gcr_cntl); } else { bool compute_ib = !sctx->is_gfx_queue; @@ -68,27 +52,8 @@ void si_cp_acquire_mem(struct si_context *sctx, struct radeon_cmdbuf *cs, unsign if (sctx->gfx_level != GFX7) gcr_cntl |= 1u << 31; /* don't sync PFP, i.e. execute the sync in ME */ - if (sctx->gfx_level == GFX9 || compute_ib) { - /* Flush caches and wait for the caches to assert idle. */ - radeon_begin(cs); - radeon_emit(PKT3(PKT3_ACQUIRE_MEM, 5, 0)); - radeon_emit(gcr_cntl); /* CP_COHER_CNTL */ - radeon_emit(0xffffffff); /* CP_COHER_SIZE */ - radeon_emit(0xffffff); /* CP_COHER_SIZE_HI */ - radeon_emit(0); /* CP_COHER_BASE */ - radeon_emit(0); /* CP_COHER_BASE_HI */ - radeon_emit(0x0000000A); /* POLL_INTERVAL */ - radeon_end(); - } else { - /* ACQUIRE_MEM is only required on the compute ring. */ - radeon_begin(cs); - radeon_emit(PKT3(PKT3_SURFACE_SYNC, 3, 0)); - radeon_emit(gcr_cntl); /* CP_COHER_CNTL */ - radeon_emit(0xffffffff); /* CP_COHER_SIZE */ - radeon_emit(0); /* CP_COHER_BASE */ - radeon_emit(0x0000000A); /* POLL_INTERVAL */ - radeon_end(); - } + ac_emit_cp_acquire_mem(&cs->current, sctx->gfx_level, ip_type, engine, + gcr_cntl); /* ACQUIRE_MEM & SURFACE_SYNC roll the context if the current context is busy. */ if (!compute_ib)