radeonsi: add ACQUIRE_MEM, RELEASE_MEM PWS packet helpers
Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/31168>
parent 9690481535
commit 1d5ffb13d6

5 changed files with 158 additions and 84 deletions
@@ -31,6 +31,7 @@ files_libradeonsi = files(
   'si_compute_blit.c',
   'si_cp_dma.c',
   'si_cp_reg_shadowing.c',
+  'si_cp_utils.c',
   'si_debug.c',
   'si_descriptors.c',
   'si_fence.c',
src/gallium/drivers/radeonsi/si_cp_utils.c (new file, 133 lines)
@@ -0,0 +1,133 @@
+/*
+ * Copyright 2024 Advanced Micro Devices, Inc.
+ *
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "si_build_pm4.h"
+
+static bool is_ts_event(unsigned event_type)
+{
+   return event_type == V_028A90_CACHE_FLUSH_TS ||
+          event_type == V_028A90_CACHE_FLUSH_AND_INV_TS_EVENT ||
+          event_type == V_028A90_BOTTOM_OF_PIPE_TS ||
+          event_type == V_028A90_FLUSH_AND_INV_DB_DATA_TS ||
+          event_type == V_028A90_FLUSH_AND_INV_CB_DATA_TS;
+}
+
+/* Insert a CS_DONE, PS_DONE, or *_TS event into the pipeline. The event signals after the work
+ * indicated by the event type is complete, and can optionally flush caches ("gcr_cntl") after
+ * that work finishes. *_TS events are always signaled at the end of the pipeline, while CS_DONE
+ * and PS_DONE are signaled when those shaders finish. This call only inserts the event into the
+ * pipeline. It doesn't wait for anything and it doesn't execute anything immediately. The only
+ * way to wait for the event completion is to call si_cp_acquire_mem_pws with the same
+ * "event_type".
+ */
+void si_cp_release_mem_pws(struct si_context *sctx, struct radeon_cmdbuf *cs,
+                           unsigned event_type, unsigned gcr_cntl)
+{
+   assert(sctx->gfx_level >= GFX11 && sctx->has_graphics);
+   bool ts = is_ts_event(event_type);
+   /* Extract GCR_CNTL fields because the encoding is different in RELEASE_MEM. */
+   assert(G_586_GLI_INV(gcr_cntl) == 0);
+   assert(G_586_GL1_RANGE(gcr_cntl) == 0);
+   unsigned glm_wb = G_586_GLM_WB(gcr_cntl);
+   unsigned glm_inv = G_586_GLM_INV(gcr_cntl);
+   unsigned glk_wb = G_586_GLK_WB(gcr_cntl);
+   unsigned glk_inv = G_586_GLK_INV(gcr_cntl);
+   unsigned glv_inv = G_586_GLV_INV(gcr_cntl);
+   unsigned gl1_inv = G_586_GL1_INV(gcr_cntl);
+   assert(G_586_GL2_US(gcr_cntl) == 0);
+   assert(G_586_GL2_RANGE(gcr_cntl) == 0);
+   assert(G_586_GL2_DISCARD(gcr_cntl) == 0);
+   unsigned gl2_inv = G_586_GL2_INV(gcr_cntl);
+   unsigned gl2_wb = G_586_GL2_WB(gcr_cntl);
+   unsigned gcr_seq = G_586_SEQ(gcr_cntl);
+
+   radeon_begin(cs);
+   radeon_emit(PKT3(PKT3_RELEASE_MEM, 6, 0));
+   radeon_emit(S_490_EVENT_TYPE(event_type) |
+               S_490_EVENT_INDEX(ts ? 5 : 6) |
+               S_490_GLM_WB(glm_wb) | S_490_GLM_INV(glm_inv) | S_490_GLV_INV(glv_inv) |
+               S_490_GL1_INV(gl1_inv) | S_490_GL2_INV(gl2_inv) | S_490_GL2_WB(gl2_wb) |
+               S_490_SEQ(gcr_seq) | S_490_GLK_WB(glk_wb) | S_490_GLK_INV(glk_inv) |
+               S_490_PWS_ENABLE(1));
+   radeon_emit(0); /* DST_SEL, INT_SEL, DATA_SEL */
+   radeon_emit(0); /* ADDRESS_LO */
+   radeon_emit(0); /* ADDRESS_HI */
+   radeon_emit(0); /* DATA_LO */
+   radeon_emit(0); /* DATA_HI */
+   radeon_emit(0); /* INT_CTXID */
+   radeon_end();
+}
+
+/* Insert into the pipeline a wait for a previous RELEASE_MEM PWS event.
+ *
+ * "event_type" must be the same as the RELEASE_MEM PWS event.
+ *
+ * "stage_sel" determines where in the pipeline the waiting happens. It can be CP_PFP, CP_ME,
+ * PRE_SHADER, PRE_DEPTH, or PRE_PIX_SHADER, which allows waiting later in the pipeline instead
+ * of completely idling the hw at the frontend.
+ *
+ * "gcr_cntl" must be 0 if not waiting in PFP or ME. When waiting later in the pipeline, any
+ * cache flushes must be part of RELEASE_MEM, not ACQUIRE_MEM.
+ *
+ * "distance" selects which previous RELEASE_MEM PWS event to wait for, counting back from the
+ * most recent one (starting from 0). There are 3 event types: PS_DONE, CS_DONE, and TS events.
+ * The distance counter increments separately for each type, so 0 with PS_DONE means wait for
+ * the last PS_DONE event, while 0 with *_TS means wait for the last TS event (even if it's a
+ * different TS event, because all TS events share the same counter).
+ *
+ * PRE_SHADER waits before the first shader that has IMAGE_OP=1, while PRE_PIX_SHADER waits
+ * before PS if it has IMAGE_OP=1 (IMAGE_OP should really be called SYNC_ENABLE). PRE_DEPTH
+ * waits before depth/stencil tests.
+ *
+ * PRE_COLOR also exists but shouldn't be used because it can hang. It's recommended to use
+ * PRE_PIX_SHADER instead, which means all PS that have color exports with enabled color
+ * buffers, non-zero colormask, and non-zero sample mask must have IMAGE_OP=1 to enable the
+ * sync before PS.
+ *
+ * Waiting for a PWS fence that was generated by a previous IB is valid, but if there is an IB
+ * from another process in between and that IB also inserted a PWS fence, the hw will wait for
+ * the newer fence instead because the PWS counter was incremented.
+ */
+void si_cp_acquire_mem_pws(struct si_context *sctx, struct radeon_cmdbuf *cs,
+                           unsigned event_type, unsigned stage_sel, unsigned gcr_cntl,
+                           unsigned distance, unsigned sqtt_flush_flags)
+{
+   assert(sctx->gfx_level >= GFX11 && sctx->has_graphics);
+   bool ts = is_ts_event(event_type);
+   bool cs_done = event_type == V_028A90_CS_DONE;
+   bool ps = event_type == V_028A90_PS_DONE;
+
+   assert((int)ts + (int)cs_done + (int)ps == 1);
+   assert(!gcr_cntl || stage_sel == V_580_CP_PFP || stage_sel == V_580_CP_ME);
+   assert(stage_sel != V_580_PRE_COLOR);
+
+   if (unlikely(sctx->sqtt_enabled))
+      si_sqtt_describe_barrier_start(sctx, cs);
+
+   radeon_begin(cs);
+   radeon_emit(PKT3(PKT3_ACQUIRE_MEM, 6, 0));
+   radeon_emit(S_580_PWS_STAGE_SEL(stage_sel) |
+               S_580_PWS_COUNTER_SEL(ts ? V_580_TS_SELECT : ps ? V_580_PS_SELECT : V_580_CS_SELECT) |
+               S_580_PWS_ENA2(1) |
+               S_580_PWS_COUNT(distance));
+   radeon_emit(0xffffffff); /* GCR_SIZE */
+   radeon_emit(0x01ffffff); /* GCR_SIZE_HI */
+   radeon_emit(0); /* GCR_BASE_LO */
+   radeon_emit(0); /* GCR_BASE_HI */
+   radeon_emit(S_585_PWS_ENA(1));
+   radeon_emit(gcr_cntl); /* GCR_CNTL (this has no effect if PWS_STAGE_SEL isn't PFP or ME) */
+   radeon_end();
+
+   if (unlikely(sctx->sqtt_enabled))
+      si_sqtt_describe_barrier_end(sctx, cs, sqtt_flush_flags);
+}
+
+void si_cp_release_acquire_mem_pws(struct si_context *sctx, struct radeon_cmdbuf *cs,
+                                   unsigned event_type, unsigned gcr_cntl, unsigned stage_sel,
+                                   unsigned sqtt_flush_flags)
+{
+   si_cp_release_mem_pws(sctx, cs, event_type, gcr_cntl);
+   si_cp_acquire_mem_pws(sctx, cs, event_type, stage_sel, 0, 0, sqtt_flush_flags);
+}
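
Note: a minimal usage sketch of the two helpers (hypothetical call sites; sctx/cs are assumed to be a live context and command buffer), illustrating the "distance" parameter and the shared TS counter described in the comment above si_cp_acquire_mem_pws:

   /* Two TS events in flight; all *_TS events bump one shared counter. */
   si_cp_release_mem_pws(sctx, cs, V_028A90_BOTTOM_OF_PIPE_TS, 0);            /* event A */
   si_cp_release_mem_pws(sctx, cs, V_028A90_CACHE_FLUSH_AND_INV_TS_EVENT, 0); /* event B */

   /* "distance" counts back from the most recent event of the selected counter:
    * 0 would wait for event B, 1 waits for event A.
    */
   si_cp_acquire_mem_pws(sctx, cs, V_028A90_BOTTOM_OF_PIPE_TS, V_580_CP_ME,
                         0 /* gcr_cntl */, 1 /* distance */, 0 /* sqtt_flush_flags */);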
@@ -879,72 +879,22 @@ void gfx10_emit_cache_flush(struct si_context *ctx, struct radeon_cmdbuf *cs)
    }
 
    if (cb_db_event) {
+      radeon_end();
+
       if (ctx->gfx_level >= GFX11) {
-         /* Get GCR_CNTL fields, because the encoding is different in RELEASE_MEM. */
-         unsigned glm_wb = G_586_GLM_WB(gcr_cntl);
-         unsigned glm_inv = G_586_GLM_INV(gcr_cntl);
-         unsigned glk_wb = G_586_GLK_WB(gcr_cntl);
-         unsigned glk_inv = G_586_GLK_INV(gcr_cntl);
-         unsigned glv_inv = G_586_GLV_INV(gcr_cntl);
-         unsigned gl1_inv = G_586_GL1_INV(gcr_cntl);
-         assert(G_586_GL2_US(gcr_cntl) == 0);
-         assert(G_586_GL2_RANGE(gcr_cntl) == 0);
-         assert(G_586_GL2_DISCARD(gcr_cntl) == 0);
-         unsigned gl2_inv = G_586_GL2_INV(gcr_cntl);
-         unsigned gl2_wb = G_586_GL2_WB(gcr_cntl);
-         unsigned gcr_seq = G_586_SEQ(gcr_cntl);
-
-         gcr_cntl &= C_586_GLM_WB & C_586_GLM_INV & C_586_GLK_WB & C_586_GLK_INV &
-                     C_586_GLV_INV & C_586_GL1_INV & C_586_GL2_INV & C_586_GL2_WB; /* keep SEQ */
-
-         /* Send an event that flushes caches. */
-         radeon_emit(PKT3(PKT3_RELEASE_MEM, 6, 0));
-         radeon_emit(S_490_EVENT_TYPE(cb_db_event) |
-                     S_490_EVENT_INDEX(5) |
-                     S_490_GLM_WB(glm_wb) | S_490_GLM_INV(glm_inv) | S_490_GLV_INV(glv_inv) |
-                     S_490_GL1_INV(gl1_inv) | S_490_GL2_INV(gl2_inv) | S_490_GL2_WB(gl2_wb) |
-                     S_490_SEQ(gcr_seq) | S_490_GLK_WB(glk_wb) | S_490_GLK_INV(glk_inv) |
-                     S_490_PWS_ENABLE(1));
-         radeon_emit(0); /* DST_SEL, INT_SEL, DATA_SEL */
-         radeon_emit(0); /* ADDRESS_LO */
-         radeon_emit(0); /* ADDRESS_HI */
-         radeon_emit(0); /* DATA_LO */
-         radeon_emit(0); /* DATA_HI */
-         radeon_emit(0); /* INT_CTXID */
-
-         if (unlikely(ctx->sqtt_enabled)) {
-            radeon_end();
-            si_sqtt_describe_barrier_start(ctx, &ctx->gfx_cs);
-            radeon_begin_again(cs);
-         }
+         si_cp_release_mem_pws(ctx, cs, cb_db_event, gcr_cntl & C_586_GLI_INV);
 
          /* Wait for the event and invalidate remaining caches if needed. */
-         radeon_emit(PKT3(PKT3_ACQUIRE_MEM, 6, 0));
-         radeon_emit(S_580_PWS_STAGE_SEL(flags & SI_CONTEXT_PFP_SYNC_ME ? V_580_CP_PFP :
-                                                                          V_580_CP_ME) |
-                     S_580_PWS_COUNTER_SEL(V_580_TS_SELECT) |
-                     S_580_PWS_ENA2(1) |
-                     S_580_PWS_COUNT(0));
-         radeon_emit(0xffffffff); /* GCR_SIZE */
-         radeon_emit(0x01ffffff); /* GCR_SIZE_HI */
-         radeon_emit(0); /* GCR_BASE_LO */
-         radeon_emit(0); /* GCR_BASE_HI */
-         radeon_emit(S_585_PWS_ENA(1));
-         radeon_emit(gcr_cntl); /* GCR_CNTL */
-
-         if (unlikely(ctx->sqtt_enabled)) {
-            radeon_end();
-            si_sqtt_describe_barrier_end(ctx, &ctx->gfx_cs, flags);
-            radeon_begin_again(cs);
-         }
+         si_cp_acquire_mem_pws(ctx, cs, cb_db_event,
+                               flags & SI_CONTEXT_PFP_SYNC_ME ? V_580_CP_PFP : V_580_CP_ME,
+                               gcr_cntl & ~C_586_GLI_INV, /* keep only GLI_INV */
+                               0, flags);
 
          gcr_cntl = 0; /* all done */
         /* ACQUIRE_MEM in PFP is implemented as ACQUIRE_MEM in ME + PFP_SYNC_ME. */
         flags &= ~SI_CONTEXT_PFP_SYNC_ME;
      } else {
         /* GFX10 */
-        radeon_end();
-
         struct si_resource *wait_mem_scratch =
            si_get_wait_mem_scratch_bo(ctx, cs, ctx->ws->cs_is_secure(cs));
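
Note on the masking in this hunk: in Mesa's generated register headers, C_586_* constants are clear masks (the field's bits inverted), so gcr_cntl & C_586_GLI_INV keeps everything except GLI_INV for RELEASE_MEM, while gcr_cntl & ~C_586_GLI_INV keeps only GLI_INV for ACQUIRE_MEM (si_cp_release_mem_pws asserts that GLI_INV is zero, since RELEASE_MEM encodes GCR_CNTL differently). A self-contained sketch of the idiom, with an invented field layout for illustration:

   #include <assert.h>
   #include <stdint.h>

   /* Invented example field; mesa's headers define C_* as the complement of the field mask. */
   #define S_FOO_GLI_INV_MASK 0x00000300u
   #define C_FOO_GLI_INV      (~S_FOO_GLI_INV_MASK)

   int main(void)
   {
      uint32_t gcr_cntl = 0xdeadbeef;
      uint32_t release_part = gcr_cntl & C_FOO_GLI_INV;  /* everything except GLI_INV */
      uint32_t acquire_part = gcr_cntl & ~C_FOO_GLI_INV; /* only GLI_INV */

      /* The split is lossless and the two halves never overlap. */
      assert((release_part | acquire_part) == gcr_cntl);
      assert((release_part & acquire_part) == 0);
      return 0;
   }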
@@ -986,9 +936,9 @@ void gfx10_emit_cache_flush(struct si_context *ctx, struct radeon_cmdbuf *cs)
          if (unlikely(ctx->sqtt_enabled)) {
             si_sqtt_describe_barrier_end(ctx, &ctx->gfx_cs, flags);
          }
 
-      radeon_begin_again(cs);
-   }
+         radeon_begin_again(cs);
+      }
 
    /* Ignore fields that only modify the behavior of other fields. */
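
Note: the radeon_end()/radeon_begin_again() movement in the two hunks above follows from the new helpers opening and closing their own radeon_begin/radeon_end bracket internally, so a call site must not have a bracket open across the calls. A sketch of the resulting pattern (hypothetical surroundings):

   radeon_begin(cs);
   /* ... emit raw packets ... */
   radeon_end();                  /* close the bracket before calling the helpers */

   si_cp_release_mem_pws(sctx, cs, V_028A90_BOTTOM_OF_PIPE_TS, 0); /* does its own begin/end */

   radeon_begin_again(cs);        /* reopen it to keep emitting raw packets */
   /* ... emit more raw packets ... */
   radeon_end();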
@@ -1559,6 +1559,16 @@ void si_cp_copy_data(struct si_context *sctx, struct radeon_cmdbuf *cs, unsigned
 /* si_cp_reg_shadowing.c */
 void si_init_cp_reg_shadowing(struct si_context *sctx);
 
+/* si_cp_utils.c */
+void si_cp_release_mem_pws(struct si_context *sctx, struct radeon_cmdbuf *cs,
+                           unsigned event_type, unsigned gcr_cntl);
+void si_cp_acquire_mem_pws(struct si_context *sctx, struct radeon_cmdbuf *cs,
+                           unsigned event_type, unsigned stage_sel, unsigned gcr_cntl,
+                           unsigned distance, unsigned sqtt_flush_flags);
+void si_cp_release_acquire_mem_pws(struct si_context *sctx, struct radeon_cmdbuf *cs,
+                                   unsigned event_type, unsigned gcr_cntl, unsigned stage_sel,
+                                   unsigned sqtt_flush_flags);
+
 /* si_debug.c */
 void si_gather_context_rolls(struct si_context *sctx);
 void si_save_cs(struct radeon_winsys *ws, struct radeon_cmdbuf *cs, struct radeon_saved_cs *saved,
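
Note: given these prototypes, a later-stage wait as described in the si_cp_utils.c comments might look like the following (hypothetical call site; V_580_PRE_SHADER is the assumed enum name for the PRE_SHADER stage):

   /* Wait for the last CS_DONE event right before the first shader that has
    * IMAGE_OP=1, instead of idling the CP frontend. gcr_cntl must be 0 here
    * because the wait doesn't happen in PFP or ME; any cache flush must ride
    * on the RELEASE_MEM side.
    */
   si_cp_release_mem_pws(sctx, cs, V_028A90_CS_DONE, gcr_cntl);
   si_cp_acquire_mem_pws(sctx, cs, V_028A90_CS_DONE, V_580_PRE_SHADER,
                         0 /* gcr_cntl */, 0 /* distance */, 0 /* sqtt_flush_flags */);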
@@ -5031,37 +5031,17 @@ static void si_emit_spi_ge_ring_state(struct si_context *sctx, unsigned index)
    }
 
    if (sctx->gfx_level >= GFX11) {
-      radeon_begin(&sctx->gfx_cs);
       /* We must wait for idle using an EOP event before changing the attribute ring registers.
-       * Use the bottom-of-pipe EOP event, but increment the PWS counter instead of writing memory.
+       * Use the bottom-of-pipe EOP event, but use the PWS TS counter instead of the counter
+       * in memory.
        */
-      radeon_emit(PKT3(PKT3_RELEASE_MEM, 6, 0));
-      radeon_emit(S_490_EVENT_TYPE(V_028A90_BOTTOM_OF_PIPE_TS) |
-                  S_490_EVENT_INDEX(5) |
-                  S_490_PWS_ENABLE(1));
-      radeon_emit(0); /* DST_SEL, INT_SEL, DATA_SEL */
-      radeon_emit(0); /* ADDRESS_LO */
-      radeon_emit(0); /* ADDRESS_HI */
-      radeon_emit(0); /* DATA_LO */
-      radeon_emit(0); /* DATA_HI */
-      radeon_emit(0); /* INT_CTXID */
-
-      /* Wait for the PWS counter. */
-      radeon_emit(PKT3(PKT3_ACQUIRE_MEM, 6, 0));
-      radeon_emit(S_580_PWS_STAGE_SEL(V_580_CP_ME) |
-                  S_580_PWS_COUNTER_SEL(V_580_TS_SELECT) |
-                  S_580_PWS_ENA2(1) |
-                  S_580_PWS_COUNT(0));
-      radeon_emit(0xffffffff); /* GCR_SIZE */
-      radeon_emit(0x01ffffff); /* GCR_SIZE_HI */
-      radeon_emit(0); /* GCR_BASE_LO */
-      radeon_emit(0); /* GCR_BASE_HI */
-      radeon_emit(S_585_PWS_ENA(1));
-      radeon_emit(0); /* GCR_CNTL */
+      si_cp_release_acquire_mem_pws(sctx, &sctx->gfx_cs, V_028A90_BOTTOM_OF_PIPE_TS, 0,
+                                    V_580_CP_ME, 0);
 
       uint64_t attr_address = sscreen->attribute_pos_prim_ring->gpu_address;
       assert((attr_address >> 32) == sscreen->info.address32_hi);
 
+      radeon_begin(&sctx->gfx_cs);
       radeon_set_uconfig_reg_seq(R_031110_SPI_GS_THROTTLE_CNTL1, 4);
       radeon_emit(0x12355123); /* SPI_GS_THROTTLE_CNTL1 */
       radeon_emit(0x1544D);    /* SPI_GS_THROTTLE_CNTL2 */