From 1d5ffb13d69b109401215f6747d1765faeb76b54 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Tue, 6 Aug 2024 02:16:28 -0400 Subject: [PATCH] radeonsi: add ACQUIRE_MEM, RELEASE_MEM PWS packet helpers Reviewed-by: Pierre-Eric Pelloux-Prayer Part-of: --- src/gallium/drivers/radeonsi/meson.build | 1 + src/gallium/drivers/radeonsi/si_cp_utils.c | 133 ++++++++++++++++++ src/gallium/drivers/radeonsi/si_gfx_cs.c | 68 ++------- src/gallium/drivers/radeonsi/si_pipe.h | 10 ++ .../drivers/radeonsi/si_state_shaders.cpp | 30 +--- 5 files changed, 158 insertions(+), 84 deletions(-) create mode 100644 src/gallium/drivers/radeonsi/si_cp_utils.c diff --git a/src/gallium/drivers/radeonsi/meson.build b/src/gallium/drivers/radeonsi/meson.build index cb34f36b342..661b0365063 100644 --- a/src/gallium/drivers/radeonsi/meson.build +++ b/src/gallium/drivers/radeonsi/meson.build @@ -31,6 +31,7 @@ files_libradeonsi = files( 'si_compute_blit.c', 'si_cp_dma.c', 'si_cp_reg_shadowing.c', + 'si_cp_utils.c', 'si_debug.c', 'si_descriptors.c', 'si_fence.c', diff --git a/src/gallium/drivers/radeonsi/si_cp_utils.c b/src/gallium/drivers/radeonsi/si_cp_utils.c new file mode 100644 index 00000000000..e0bef5a4ff6 --- /dev/null +++ b/src/gallium/drivers/radeonsi/si_cp_utils.c @@ -0,0 +1,133 @@ +/* + * Copyright 2024 Advanced Micro Devices, Inc. + * + * SPDX-License-Identifier: MIT + */ + +#include "si_build_pm4.h" + +static bool is_ts_event(unsigned event_type) +{ + return event_type == V_028A90_CACHE_FLUSH_TS || + event_type == V_028A90_CACHE_FLUSH_AND_INV_TS_EVENT || + event_type == V_028A90_BOTTOM_OF_PIPE_TS || + event_type == V_028A90_FLUSH_AND_INV_DB_DATA_TS || + event_type == V_028A90_FLUSH_AND_INV_CB_DATA_TS; +} + +/* Insert CS_DONE, PS_DONE, or a *_TS event into the pipeline, which will signal after the work + * indicated by the event is complete, which optionally includes flushing caches using "gcr_cntl" + * after the completion of the work. 
*_TS events are always signaled at the end of the pipeline, + * while CS_DONE and PS_DONE are signaled when those shaders finish. This call only inserts + * the event into the pipeline. It doesn't wait for anything and it doesn't execute anything + * immediately. The only way to wait for the event completion is to call si_cp_acquire_mem_pws + * with the same "event_type". + */ +void si_cp_release_mem_pws(struct si_context *sctx, struct radeon_cmdbuf *cs, + unsigned event_type, unsigned gcr_cntl) +{ + assert(sctx->gfx_level >= GFX11 && sctx->has_graphics); + bool ts = is_ts_event(event_type); + /* Extract GCR_CNTL fields because the encoding is different in RELEASE_MEM. */ + assert(G_586_GLI_INV(gcr_cntl) == 0); + assert(G_586_GL1_RANGE(gcr_cntl) == 0); + unsigned glm_wb = G_586_GLM_WB(gcr_cntl); + unsigned glm_inv = G_586_GLM_INV(gcr_cntl); + unsigned glk_wb = G_586_GLK_WB(gcr_cntl); + unsigned glk_inv = G_586_GLK_INV(gcr_cntl); + unsigned glv_inv = G_586_GLV_INV(gcr_cntl); + unsigned gl1_inv = G_586_GL1_INV(gcr_cntl); + assert(G_586_GL2_US(gcr_cntl) == 0); + assert(G_586_GL2_RANGE(gcr_cntl) == 0); + assert(G_586_GL2_DISCARD(gcr_cntl) == 0); + unsigned gl2_inv = G_586_GL2_INV(gcr_cntl); + unsigned gl2_wb = G_586_GL2_WB(gcr_cntl); + unsigned gcr_seq = G_586_SEQ(gcr_cntl); + + radeon_begin(cs); + radeon_emit(PKT3(PKT3_RELEASE_MEM, 6, 0)); + radeon_emit(S_490_EVENT_TYPE(event_type) | + S_490_EVENT_INDEX(ts ? 
5 : 6) | + S_490_GLM_WB(glm_wb) | S_490_GLM_INV(glm_inv) | S_490_GLV_INV(glv_inv) | + S_490_GL1_INV(gl1_inv) | S_490_GL2_INV(gl2_inv) | S_490_GL2_WB(gl2_wb) | + S_490_SEQ(gcr_seq) | S_490_GLK_WB(glk_wb) | S_490_GLK_INV(glk_inv) | + S_490_PWS_ENABLE(1)); + radeon_emit(0); /* DST_SEL, INT_SEL, DATA_SEL */ + radeon_emit(0); /* ADDRESS_LO */ + radeon_emit(0); /* ADDRESS_HI */ + radeon_emit(0); /* DATA_LO */ + radeon_emit(0); /* DATA_HI */ + radeon_emit(0); /* INT_CTXID */ + radeon_end(); +} + +/* This will wait or insert into the pipeline a wait for a previous RELEASE_MEM PWS event. + * + * "event_type" must be the same as the RELEASE_MEM PWS event. + * + * "stage_sel" determines when the waiting happens. It can be CP_PFP, CP_ME, PRE_SHADER, + * PRE_DEPTH, or PRE_PIX_SHADER, allowing the wait to happen later in the pipeline instead of completely + * idling the hw at the frontend. + * + * "gcr_cntl" must be 0 if not waiting in PFP or ME. When waiting later in the pipeline, any + * cache flushes must be part of RELEASE_MEM, not ACQUIRE_MEM. + * + * "distance" determines how many RELEASE_MEM PWS events ago it should wait for, minus one + * (starting from 0). There are 3 event types: PS_DONE, CS_DONE, and TS events. The distance + * counter increments separately for each type, so 0 with PS_DONE means wait for the last PS_DONE + * event, while 0 with *_TS means wait for the last TS event (even if it's a different TS event + * because all TS events share the same counter). + * + * PRE_SHADER waits before the first shader that has IMAGE_OP=1, while PRE_PIX_SHADER waits before + * PS if it has IMAGE_OP=1 (IMAGE_OP should really be called SYNC_ENABLE). PRE_DEPTH waits before + * depth/stencil tests. + * + * PRE_COLOR also exists but shouldn't be used because it can hang. It's recommended to use + * PRE_PIX_SHADER instead, which means all PS that have color exports with enabled color buffers, + * non-zero colormask, and non-zero sample mask must have IMAGE_OP=1 to enable the sync before PS. 
+ * + * Waiting for a PWS fence that was generated by a previous IB is valid, but if there is an IB + * from another process in between and that IB also inserted a PWS fence, the hw will wait for + * the newer fence instead because the PWS counter was incremented. + */ +void si_cp_acquire_mem_pws(struct si_context *sctx, struct radeon_cmdbuf *cs, + unsigned event_type, unsigned stage_sel, unsigned gcr_cntl, + unsigned distance, unsigned sqtt_flush_flags) +{ + assert(sctx->gfx_level >= GFX11 && sctx->has_graphics); + bool ts = is_ts_event(event_type); + bool cs_done = event_type == V_028A90_CS_DONE; + bool ps = event_type == V_028A90_PS_DONE; + + assert((int)ts + (int)cs_done + (int)ps == 1); + assert(!gcr_cntl || stage_sel == V_580_CP_PFP || stage_sel == V_580_CP_ME); + assert(stage_sel != V_580_PRE_COLOR); + + if (unlikely(sctx->sqtt_enabled)) + si_sqtt_describe_barrier_start(sctx, cs); + + radeon_begin(cs); + radeon_emit(PKT3(PKT3_ACQUIRE_MEM, 6, 0)); + radeon_emit(S_580_PWS_STAGE_SEL(stage_sel) | + S_580_PWS_COUNTER_SEL(ts ? V_580_TS_SELECT : ps ? 
V_580_PS_SELECT : V_580_CS_SELECT) | + S_580_PWS_ENA2(1) | + S_580_PWS_COUNT(distance)); + radeon_emit(0xffffffff); /* GCR_SIZE */ + radeon_emit(0x01ffffff); /* GCR_SIZE_HI */ + radeon_emit(0); /* GCR_BASE_LO */ + radeon_emit(0); /* GCR_BASE_HI */ + radeon_emit(S_585_PWS_ENA(1)); + radeon_emit(gcr_cntl); /* GCR_CNTL (this has no effect if PWS_STAGE_SEL isn't PFP or ME) */ + radeon_end(); + + if (unlikely(sctx->sqtt_enabled)) + si_sqtt_describe_barrier_end(sctx, cs, sqtt_flush_flags); +} + +void si_cp_release_acquire_mem_pws(struct si_context *sctx, struct radeon_cmdbuf *cs, + unsigned event_type, unsigned gcr_cntl, unsigned stage_sel, + unsigned sqtt_flush_flags) +{ + si_cp_release_mem_pws(sctx, cs, event_type, gcr_cntl); + si_cp_acquire_mem_pws(sctx, cs, event_type, stage_sel, 0, 0, sqtt_flush_flags); +} diff --git a/src/gallium/drivers/radeonsi/si_gfx_cs.c b/src/gallium/drivers/radeonsi/si_gfx_cs.c index 438e87033d8..0cb9a814641 100644 --- a/src/gallium/drivers/radeonsi/si_gfx_cs.c +++ b/src/gallium/drivers/radeonsi/si_gfx_cs.c @@ -879,72 +879,22 @@ void gfx10_emit_cache_flush(struct si_context *ctx, struct radeon_cmdbuf *cs) } if (cb_db_event) { + radeon_end(); + if (ctx->gfx_level >= GFX11) { - /* Get GCR_CNTL fields, because the encoding is different in RELEASE_MEM. 
*/ - unsigned glm_wb = G_586_GLM_WB(gcr_cntl); - unsigned glm_inv = G_586_GLM_INV(gcr_cntl); - unsigned glk_wb = G_586_GLK_WB(gcr_cntl); - unsigned glk_inv = G_586_GLK_INV(gcr_cntl); - unsigned glv_inv = G_586_GLV_INV(gcr_cntl); - unsigned gl1_inv = G_586_GL1_INV(gcr_cntl); - assert(G_586_GL2_US(gcr_cntl) == 0); - assert(G_586_GL2_RANGE(gcr_cntl) == 0); - assert(G_586_GL2_DISCARD(gcr_cntl) == 0); - unsigned gl2_inv = G_586_GL2_INV(gcr_cntl); - unsigned gl2_wb = G_586_GL2_WB(gcr_cntl); - unsigned gcr_seq = G_586_SEQ(gcr_cntl); - - gcr_cntl &= C_586_GLM_WB & C_586_GLM_INV & C_586_GLK_WB & C_586_GLK_INV & - C_586_GLV_INV & C_586_GL1_INV & C_586_GL2_INV & C_586_GL2_WB; /* keep SEQ */ - - /* Send an event that flushes caches. */ - radeon_emit(PKT3(PKT3_RELEASE_MEM, 6, 0)); - radeon_emit(S_490_EVENT_TYPE(cb_db_event) | - S_490_EVENT_INDEX(5) | - S_490_GLM_WB(glm_wb) | S_490_GLM_INV(glm_inv) | S_490_GLV_INV(glv_inv) | - S_490_GL1_INV(gl1_inv) | S_490_GL2_INV(gl2_inv) | S_490_GL2_WB(gl2_wb) | - S_490_SEQ(gcr_seq) | S_490_GLK_WB(glk_wb) | S_490_GLK_INV(glk_inv) | - S_490_PWS_ENABLE(1)); - radeon_emit(0); /* DST_SEL, INT_SEL, DATA_SEL */ - radeon_emit(0); /* ADDRESS_LO */ - radeon_emit(0); /* ADDRESS_HI */ - radeon_emit(0); /* DATA_LO */ - radeon_emit(0); /* DATA_HI */ - radeon_emit(0); /* INT_CTXID */ - - if (unlikely(ctx->sqtt_enabled)) { - radeon_end(); - si_sqtt_describe_barrier_start(ctx, &ctx->gfx_cs); - radeon_begin_again(cs); - } + si_cp_release_mem_pws(ctx, cs, cb_db_event, gcr_cntl & C_586_GLI_INV); /* Wait for the event and invalidate remaining caches if needed. */ - radeon_emit(PKT3(PKT3_ACQUIRE_MEM, 6, 0)); - radeon_emit(S_580_PWS_STAGE_SEL(flags & SI_CONTEXT_PFP_SYNC_ME ? 
V_580_CP_PFP : - V_580_CP_ME) | - S_580_PWS_COUNTER_SEL(V_580_TS_SELECT) | - S_580_PWS_ENA2(1) | - S_580_PWS_COUNT(0)); - radeon_emit(0xffffffff); /* GCR_SIZE */ - radeon_emit(0x01ffffff); /* GCR_SIZE_HI */ - radeon_emit(0); /* GCR_BASE_LO */ - radeon_emit(0); /* GCR_BASE_HI */ - radeon_emit(S_585_PWS_ENA(1)); - radeon_emit(gcr_cntl); /* GCR_CNTL */ - - if (unlikely(ctx->sqtt_enabled)) { - radeon_end(); - si_sqtt_describe_barrier_end(ctx, &ctx->gfx_cs, flags); - radeon_begin_again(cs); - } + si_cp_acquire_mem_pws(ctx, cs, cb_db_event, + flags & SI_CONTEXT_PFP_SYNC_ME ? V_580_CP_PFP : V_580_CP_ME, + gcr_cntl & ~C_586_GLI_INV, /* keep only GLI_INV */ + 0, flags); gcr_cntl = 0; /* all done */ /* ACQUIRE_MEM in PFP is implemented as ACQUIRE_MEM in ME + PFP_SYNC_ME. */ flags &= ~SI_CONTEXT_PFP_SYNC_ME; } else { /* GFX10 */ - radeon_end(); - struct si_resource *wait_mem_scratch = si_get_wait_mem_scratch_bo(ctx, cs, ctx->ws->cs_is_secure(cs)); @@ -986,9 +936,9 @@ void gfx10_emit_cache_flush(struct si_context *ctx, struct radeon_cmdbuf *cs) if (unlikely(ctx->sqtt_enabled)) { si_sqtt_describe_barrier_end(ctx, &ctx->gfx_cs, flags); } - - radeon_begin_again(cs); } + + radeon_begin_again(cs); } /* Ignore fields that only modify the behavior of other fields. 
*/ diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h index 67d567a72fc..39a2a2ced6d 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.h +++ b/src/gallium/drivers/radeonsi/si_pipe.h @@ -1559,6 +1559,16 @@ void si_cp_copy_data(struct si_context *sctx, struct radeon_cmdbuf *cs, unsigned /* si_cp_reg_shadowing.c */ void si_init_cp_reg_shadowing(struct si_context *sctx); +/* si_cp_utils.c */ +void si_cp_release_mem_pws(struct si_context *sctx, struct radeon_cmdbuf *cs, + unsigned event_type, unsigned gcr_cntl); +void si_cp_acquire_mem_pws(struct si_context *sctx, struct radeon_cmdbuf *cs, + unsigned event_type, unsigned stage_sel, unsigned gcr_cntl, + unsigned distance, unsigned sqtt_flush_flags); +void si_cp_release_acquire_mem_pws(struct si_context *sctx, struct radeon_cmdbuf *cs, + unsigned event_type, unsigned gcr_cntl, unsigned stage_sel, + unsigned sqtt_flush_flags); + /* si_debug.c */ void si_gather_context_rolls(struct si_context *sctx); void si_save_cs(struct radeon_winsys *ws, struct radeon_cmdbuf *cs, struct radeon_saved_cs *saved, diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.cpp b/src/gallium/drivers/radeonsi/si_state_shaders.cpp index 6a1c5bae3eb..69ca9f39910 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.cpp +++ b/src/gallium/drivers/radeonsi/si_state_shaders.cpp @@ -5031,37 +5031,17 @@ static void si_emit_spi_ge_ring_state(struct si_context *sctx, unsigned index) } if (sctx->gfx_level >= GFX11) { - radeon_begin(&sctx->gfx_cs); /* We must wait for idle using an EOP event before changing the attribute ring registers. - * Use the bottom-of-pipe EOP event, but increment the PWS counter instead of writing memory. + * Use the bottom-of-pipe EOP event, but use the PWS TS counter instead of the counter + * in memory. 
*/ - radeon_emit(PKT3(PKT3_RELEASE_MEM, 6, 0)); - radeon_emit(S_490_EVENT_TYPE(V_028A90_BOTTOM_OF_PIPE_TS) | - S_490_EVENT_INDEX(5) | - S_490_PWS_ENABLE(1)); - radeon_emit(0); /* DST_SEL, INT_SEL, DATA_SEL */ - radeon_emit(0); /* ADDRESS_LO */ - radeon_emit(0); /* ADDRESS_HI */ - radeon_emit(0); /* DATA_LO */ - radeon_emit(0); /* DATA_HI */ - radeon_emit(0); /* INT_CTXID */ - - /* Wait for the PWS counter. */ - radeon_emit(PKT3(PKT3_ACQUIRE_MEM, 6, 0)); - radeon_emit(S_580_PWS_STAGE_SEL(V_580_CP_ME) | - S_580_PWS_COUNTER_SEL(V_580_TS_SELECT) | - S_580_PWS_ENA2(1) | - S_580_PWS_COUNT(0)); - radeon_emit(0xffffffff); /* GCR_SIZE */ - radeon_emit(0x01ffffff); /* GCR_SIZE_HI */ - radeon_emit(0); /* GCR_BASE_LO */ - radeon_emit(0); /* GCR_BASE_HI */ - radeon_emit(S_585_PWS_ENA(1)); - radeon_emit(0); /* GCR_CNTL */ + si_cp_release_acquire_mem_pws(sctx, &sctx->gfx_cs, V_028A90_BOTTOM_OF_PIPE_TS, 0, + V_580_CP_ME, 0); uint64_t attr_address = sscreen->attribute_pos_prim_ring->gpu_address; assert((attr_address >> 32) == sscreen->info.address32_hi); + radeon_begin(&sctx->gfx_cs); radeon_set_uconfig_reg_seq(R_031110_SPI_GS_THROTTLE_CNTL1, 4); radeon_emit(0x12355123); /* SPI_GS_THROTTLE_CNTL1 */ radeon_emit(0x1544D); /* SPI_GS_THROTTLE_CNTL2 */