/*
 * Copyright 2012 Advanced Micro Devices, Inc.
 * Copyright 2024 Valve Corporation
 *
 * SPDX-License-Identifier: MIT
 */

#include "ac_cmdbuf.h"
#include "ac_cmdbuf_cp.h"
#include "ac_gpu_info.h"
#include "ac_shader_util.h"

#include "amd_family.h"
#include "sid.h"

void
ac_emit_cp_indirect_buffer(struct ac_cmdbuf *cs, uint64_t va, uint32_t cdw,
                           enum ac_cp_indirect_buffer_flags flags,
                           bool predicate)
{
   uint32_t dword2_flags = 0;

   if (flags & AC_CP_INDIRECT_BUFFER_CHAIN)
      dword2_flags |= S_3F3_CHAIN(1);
   if (flags & AC_CP_INDIRECT_BUFFER_VALID)
      dword2_flags |= S_3F3_VALID(1);

   ac_cmdbuf_begin(cs);
   ac_cmdbuf_emit(PKT3(PKT3_INDIRECT_BUFFER, 2, predicate));
   ac_cmdbuf_emit(va);
   ac_cmdbuf_emit(va >> 32);
   ac_cmdbuf_emit(cdw | dword2_flags);
   ac_cmdbuf_end();
}

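/* Usage sketch (hypothetical caller): chain the current command buffer to a
 * follow-up IB. "next_ib_va" and "next_ib_cdw" are assumed to describe an
 * already-built buffer; the flags are the ones defined above.
 *
 *    ac_emit_cp_indirect_buffer(cs, next_ib_va, next_ib_cdw,
 *                               AC_CP_INDIRECT_BUFFER_CHAIN |
 *                               AC_CP_INDIRECT_BUFFER_VALID, false);
 */
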
void
ac_emit_cp_cond_exec(struct ac_cmdbuf *cs, enum amd_gfx_level gfx_level,
                     uint64_t va, uint32_t count)
{
   ac_cmdbuf_begin(cs);
   if (gfx_level >= GFX7) {
      ac_cmdbuf_emit(PKT3(PKT3_COND_EXEC, 3, 0));
      ac_cmdbuf_emit(va);
      ac_cmdbuf_emit(va >> 32);
      ac_cmdbuf_emit(0);
      ac_cmdbuf_emit(count);
   } else {
      ac_cmdbuf_emit(PKT3(PKT3_COND_EXEC, 2, 0));
      ac_cmdbuf_emit(va);
      ac_cmdbuf_emit(va >> 32);
      ac_cmdbuf_emit(count);
   }
   ac_cmdbuf_end();
}

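/* Usage sketch (hypothetical caller): COND_EXEC makes the CP skip the
 * following "count" dwords when the dword at "cond_va" is 0 (e.g. for
 * conditional rendering). "packet_cdw" is assumed to be the exact size, in
 * dwords, of the packets emitted right after it.
 *
 *    ac_emit_cp_cond_exec(cs, gfx_level, cond_va, packet_cdw);
 *    ...emit exactly packet_cdw dwords of packets...
 */
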
void
ac_emit_cp_write_data_head(struct ac_cmdbuf *cs, uint32_t engine_sel,
                           uint32_t dst_sel, uint64_t va, uint32_t size,
                           bool predicate)
{
   ac_cmdbuf_begin(cs);
   ac_cmdbuf_emit(PKT3(PKT3_WRITE_DATA, 2 + size, predicate));
   ac_cmdbuf_emit(S_371_DST_SEL(dst_sel) |
                  S_371_WR_CONFIRM(1) |
                  S_371_ENGINE_SEL(engine_sel));
   ac_cmdbuf_emit(va);
   ac_cmdbuf_emit(va >> 32);
   ac_cmdbuf_end();
}

void
ac_emit_cp_write_data(struct ac_cmdbuf *cs, uint32_t engine_sel,
                      uint32_t dst_sel, uint64_t va, uint32_t size,
                      const uint32_t *data, bool predicate)
{
   ac_emit_cp_write_data_head(cs, engine_sel, dst_sel, va, size, predicate);
   ac_cmdbuf_begin(cs);
   ac_cmdbuf_emit_array(data, size);
   ac_cmdbuf_end();
}

void
ac_emit_cp_write_data_imm(struct ac_cmdbuf *cs, unsigned engine_sel,
                          uint64_t va, uint32_t value)
{
   ac_emit_cp_write_data(cs, engine_sel, V_371_MEMORY, va, 1, &value, false);
}

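/* Usage sketch (hypothetical caller): write two dwords to memory. V_371_MEMORY
 * is the dst_sel used by ac_emit_cp_write_data_imm above; "engine_sel" is
 * assumed to be a valid ENGINE_SEL value from sid.h (e.g. the ME engine).
 *
 *    const uint32_t data[] = {0, 1};
 *    ac_emit_cp_write_data(cs, engine_sel, V_371_MEMORY, dst_va, 2, data,
 *                          false);
 */
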
void
ac_emit_cp_wait_mem(struct ac_cmdbuf *cs, uint64_t va, uint32_t ref,
                    uint32_t mask, unsigned flags)
{
   ac_cmdbuf_begin(cs);
   ac_cmdbuf_emit(PKT3(PKT3_WAIT_REG_MEM, 5, 0));
   ac_cmdbuf_emit(WAIT_REG_MEM_MEM_SPACE(1) | flags);
   ac_cmdbuf_emit(va);
   ac_cmdbuf_emit(va >> 32);
   ac_cmdbuf_emit(ref);  /* reference value */
   ac_cmdbuf_emit(mask); /* mask */
   ac_cmdbuf_emit(4);    /* poll interval */
   ac_cmdbuf_end();
}

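/* Usage sketch (hypothetical caller): busy-wait in the CP until a fence dword
 * written earlier reaches the expected value. WAIT_REG_MEM_EQUAL is assumed
 * to be the sid.h name of the "equal" compare function; "engine_sel" is an
 * assumed ENGINE_SEL value as above.
 *
 *    ac_emit_cp_write_data_imm(cs, engine_sel, fence_va, 1);
 *    ac_emit_cp_wait_mem(cs, fence_va, 1, 0xffffffff, WAIT_REG_MEM_EQUAL);
 */
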
static bool
is_ts_event(unsigned event_type)
{
   return event_type == V_028A90_CACHE_FLUSH_TS ||
          event_type == V_028A90_CACHE_FLUSH_AND_INV_TS_EVENT ||
          event_type == V_028A90_BOTTOM_OF_PIPE_TS ||
          event_type == V_028A90_FLUSH_AND_INV_DB_DATA_TS ||
          event_type == V_028A90_FLUSH_AND_INV_CB_DATA_TS;
}

/* Wait, or insert into the pipeline a wait, for a previous RELEASE_MEM PWS
 * event.
 *
 * "event_type" must be the same as the RELEASE_MEM PWS event.
 *
 * "stage_sel" determines when the waiting happens. It can be CP_PFP, CP_ME,
 * PRE_SHADER, PRE_DEPTH, or PRE_PIX_SHADER, allowing the wait to happen later
 * in the pipeline instead of completely idling the hw at the frontend.
 *
 * "gcr_cntl" must be 0 if not waiting in PFP or ME. When waiting later in the
 * pipeline, any cache flushes must be part of RELEASE_MEM, not ACQUIRE_MEM.
 *
 * "count" determines how many RELEASE_MEM PWS events ago it should wait for,
 * minus one (starting from 0). There are 3 event types: PS_DONE, CS_DONE, and
 * TS events. The counter increments separately for each type, so 0 with
 * PS_DONE means wait for the last PS_DONE event, while 0 with *_TS means wait
 * for the last TS event (even if it's a different TS event because all TS
 * events share the same counter).
 *
 * PRE_SHADER waits before the first shader that has IMAGE_OP=1, while
 * PRE_PIX_SHADER waits before PS if it has IMAGE_OP=1 (IMAGE_OP should really
 * be called SYNC_ENABLE). PRE_DEPTH waits before depth/stencil tests.
 *
 * PRE_COLOR also exists but shouldn't be used because it can hang. It's
 * recommended to use PRE_PIX_SHADER instead, which means all PS that have
 * color exports with enabled color buffers, non-zero colormask, and non-zero
 * sample mask must have IMAGE_OP=1 to enable the sync before PS.
 *
 * Waiting for a PWS fence that was generated by a previous IB is valid, but
 * if there is an IB from another process in between and that IB also inserted
 * a PWS fence, the hw will wait for the newer fence instead because the PWS
 * counter was incremented.
 */
void
ac_emit_cp_acquire_mem_pws(struct ac_cmdbuf *cs, ASSERTED enum amd_gfx_level gfx_level,
                           ASSERTED enum amd_ip_type ip_type, uint32_t event_type,
                           uint32_t stage_sel, uint32_t count,
                           uint32_t gcr_cntl)
{
   assert(gfx_level >= GFX11 && ip_type == AMD_IP_GFX);

   const bool ts = is_ts_event(event_type);
   const bool ps_done = event_type == V_028A90_PS_DONE;
   const bool cs_done = event_type == V_028A90_CS_DONE;
   const uint32_t counter_sel = ts ? V_581B_TS_SELECT : ps_done ? V_581B_PS_SELECT : V_581B_CS_SELECT;

   assert((int)ts + (int)cs_done + (int)ps_done == 1);
   assert(!gcr_cntl || stage_sel == V_581B_CP_PFP || stage_sel == V_581B_CP_ME);
   assert(stage_sel != V_581B_PRE_COLOR);

   ac_cmdbuf_begin(cs);
   ac_cmdbuf_emit(PKT3(PKT3_ACQUIRE_MEM, 6, 0));
   ac_cmdbuf_emit(S_581B_PWS_STAGE_SEL(stage_sel) |
                  S_581B_PWS_COUNTER_SEL(counter_sel) |
                  S_581B_PWS_ENA2(1) |
                  S_581B_PWS_COUNT(count));
   ac_cmdbuf_emit(0xffffffff); /* GCR_SIZE */
   ac_cmdbuf_emit(0x01ffffff); /* GCR_SIZE_HI */
   ac_cmdbuf_emit(0);          /* GCR_BASE_LO */
   ac_cmdbuf_emit(0);          /* GCR_BASE_HI */
   ac_cmdbuf_emit(S_586B_PWS_ENA(1));
   ac_cmdbuf_emit(gcr_cntl);   /* GCR_CNTL (this has no effect if PWS_STAGE_SEL isn't PFP or ME) */
   ac_cmdbuf_end();
}

/* Insert CS_DONE, PS_DONE, or a *_TS event into the pipeline, which will
 * signal after the work indicated by the event is complete, which optionally
 * includes flushing caches using "gcr_cntl" after the completion of the work.
 * *_TS events are always signaled at the end of the pipeline, while CS_DONE
 * and PS_DONE are signaled when those shaders finish. This call only inserts
 * the event into the pipeline. It doesn't wait for anything and it doesn't
 * execute anything immediately. The only way to wait for the event completion
 * is to call ac_emit_cp_acquire_mem_pws with the same "event_type".
 */
void
ac_emit_cp_release_mem_pws(struct ac_cmdbuf *cs, ASSERTED enum amd_gfx_level gfx_level,
                           ASSERTED enum amd_ip_type ip_type, uint32_t event_type,
                           uint32_t gcr_cntl)
{
   assert(gfx_level >= GFX11 && ip_type == AMD_IP_GFX);
   /* Only GFX12+ supports GCR ops with PS_DONE & CS_DONE in RELEASE_MEM. */
   assert(gfx_level >= GFX12 || !gcr_cntl || (event_type != V_028A90_PS_DONE &&
                                              event_type != V_028A90_CS_DONE));

   /* Extract GCR_CNTL fields because the encoding is different in RELEASE_MEM. */
   assert(G_587_GLI_INV(gcr_cntl) == 0);
   assert(gfx_level >= GFX12 || G_587_GL1_RANGE(gcr_cntl) == 0);
   const uint32_t glm_wb = G_587_GLM_WB(gcr_cntl);
   const uint32_t glm_inv = G_587_GLM_INV(gcr_cntl);
   const uint32_t glk_wb = G_587_GLK_WB(gcr_cntl);
   const uint32_t glk_inv = G_587_GLK_INV(gcr_cntl);
   const uint32_t glv_inv = G_587_GLV_INV(gcr_cntl);
   const uint32_t gl1_inv = G_587_GL1_INV(gcr_cntl);
   assert(G_587_GL2_US(gcr_cntl) == 0);
   assert(G_587_GL2_RANGE(gcr_cntl) == 0);
   assert(G_587_GL2_DISCARD(gcr_cntl) == 0);
   const uint32_t gl2_inv = G_587_GL2_INV(gcr_cntl);
   const uint32_t gl2_wb = G_587_GL2_WB(gcr_cntl);
   const uint32_t gcr_seq = G_587_SEQ(gcr_cntl);
   const bool ts = is_ts_event(event_type);

   ac_cmdbuf_begin(cs);
   ac_cmdbuf_emit(PKT3(PKT3_RELEASE_MEM, 6, 0));
   ac_cmdbuf_emit(S_491_EVENT_TYPE(event_type) |
                  S_491_EVENT_INDEX(ts ? 5 : 6) |
                  (gfx_level >= GFX12 ? 0 : S_491_GLM_WB(glm_wb) | S_491_GLM_INV(glm_inv) | S_491_GL1_INV(gl1_inv)) |
                  S_491_GLV_INV(glv_inv) |
                  S_491_GL2_INV(gl2_inv) |
                  S_491_GL2_WB(gl2_wb) |
                  S_491_SEQ(gcr_seq) |
                  S_491_GLK_WB(glk_wb) |
                  S_491_GLK_INV(glk_inv) |
                  S_491_PWS_ENABLE(1));
   ac_cmdbuf_emit(0); /* DST_SEL, INT_SEL, DATA_SEL */
   ac_cmdbuf_emit(0); /* ADDRESS_LO */
   ac_cmdbuf_emit(0); /* ADDRESS_HI */
   ac_cmdbuf_emit(0); /* DATA_LO */
   ac_cmdbuf_emit(0); /* DATA_HI */
   ac_cmdbuf_emit(0); /* INT_CTXID */
   ac_cmdbuf_end();
}

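/* Usage sketch (hypothetical driver code): make depth/stencil tests wait for
 * a prior bottom-of-pipe event without idling the frontend. All names are the
 * ones used above; count = 0 waits for the most recent TS event, and gcr_cntl
 * must be 0 because the wait doesn't happen in PFP or ME.
 *
 *    ac_emit_cp_release_mem_pws(cs, gfx_level, AMD_IP_GFX,
 *                               V_028A90_BOTTOM_OF_PIPE_TS, 0);
 *    ...other packets...
 *    ac_emit_cp_acquire_mem_pws(cs, gfx_level, AMD_IP_GFX,
 *                               V_028A90_BOTTOM_OF_PIPE_TS,
 *                               V_581B_PRE_DEPTH, 0, 0);
 */
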
void
ac_emit_cp_copy_data(struct ac_cmdbuf *cs, uint32_t src_sel, uint32_t dst_sel,
                     uint64_t src_va, uint64_t dst_va,
                     enum ac_cp_copy_data_flags flags, bool predicate)
{
   uint32_t dword0 = COPY_DATA_SRC_SEL(src_sel) |
                     COPY_DATA_DST_SEL(dst_sel);

   if (flags & AC_CP_COPY_DATA_WR_CONFIRM)
      dword0 |= COPY_DATA_WR_CONFIRM;
   if (flags & AC_CP_COPY_DATA_COUNT_SEL)
      dword0 |= COPY_DATA_COUNT_SEL;
   if (flags & AC_CP_COPY_DATA_ENGINE_PFP) {
      /* COPY_DATA shouldn't set registers in PFP because that would execute
       * out-of-order with SET register packets that are executed by ME.
       */
      assert(src_sel != COPY_DATA_REG && dst_sel != COPY_DATA_REG);
      dword0 |= COPY_DATA_ENGINE_PFP;
   }

   ac_cmdbuf_begin(cs);
   ac_cmdbuf_emit(PKT3(PKT3_COPY_DATA, 4, predicate));
   ac_cmdbuf_emit(dword0);
   ac_cmdbuf_emit(src_va);
   ac_cmdbuf_emit(src_va >> 32);
   ac_cmdbuf_emit(dst_va);
   ac_cmdbuf_emit(dst_va >> 32);
   ac_cmdbuf_end();
}

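/* Usage sketch (hypothetical caller): store a 64-bit immediate to memory with
 * write confirmation. COPY_DATA_IMM and COPY_DATA_DST_MEM are assumed to be
 * the sid.h selector names; with an immediate source, "src_va" carries the
 * value, and COUNT_SEL selects a 64-bit copy.
 *
 *    ac_emit_cp_copy_data(cs, COPY_DATA_IMM, COPY_DATA_DST_MEM,
 *                         value, dst_va,
 *                         AC_CP_COPY_DATA_COUNT_SEL |
 *                         AC_CP_COPY_DATA_WR_CONFIRM, false);
 */
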
void
ac_emit_cp_pfp_sync_me(struct ac_cmdbuf *cs, bool predicate)
{
   ac_cmdbuf_begin(cs);
   ac_cmdbuf_emit(PKT3(PKT3_PFP_SYNC_ME, 0, predicate));
   ac_cmdbuf_emit(0);
   ac_cmdbuf_end();
}

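/* Usage sketch (hypothetical caller): stall the PFP until the ME catches up,
 * e.g. after the ME has written an indirect draw argument buffer that the
 * PFP will fetch next. "engine_sel" is an assumed ENGINE_SEL value as above.
 *
 *    ac_emit_cp_write_data_imm(cs, engine_sel, indirect_args_va, draw_count);
 *    ac_emit_cp_pfp_sync_me(cs, false);
 */
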
void
ac_emit_cp_set_predication(struct ac_cmdbuf *cs, enum amd_gfx_level gfx_level,
                           uint64_t va, uint32_t op)
{
   ac_cmdbuf_begin(cs);
   if (gfx_level >= GFX9) {
      ac_cmdbuf_emit(PKT3(PKT3_SET_PREDICATION, 2, 0));
      ac_cmdbuf_emit(op);
      ac_cmdbuf_emit(va);
      ac_cmdbuf_emit(va >> 32);
   } else {
      ac_cmdbuf_emit(PKT3(PKT3_SET_PREDICATION, 1, 0));
      ac_cmdbuf_emit(va);
      ac_cmdbuf_emit(op | ((va >> 32) & 0xFF));
   }
   ac_cmdbuf_end();
}

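/* Usage sketch (hypothetical caller): predicate the following draws on a
 * 64-bit boolean in memory, then disable predication. PRED_OP() and the
 * PREDICATION_* values are assumed to be the sid.h names.
 *
 *    ac_emit_cp_set_predication(cs, gfx_level, pred_va,
 *                               PRED_OP(PREDICATION_OP_BOOL64) |
 *                               PREDICATION_DRAW_VISIBLE);
 *    ...predicated packets...
 *    ac_emit_cp_set_predication(cs, gfx_level, 0, 0);
 */
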
void
ac_emit_cp_gfx11_ge_rings(struct ac_cmdbuf *cs, const struct radeon_info *info,
                          uint64_t attr_ring_va, bool enable_gfx12_partial_hiz_wa)
{
   assert(info->gfx_level >= GFX11);
   assert((attr_ring_va >> 32) == info->address32_hi);

   ac_cmdbuf_begin(cs);

   ac_cmdbuf_set_ucfg_reg_seq(R_031110_SPI_GS_THROTTLE_CNTL1, 4);
   ac_cmdbuf_emit(0x12355123);
   ac_cmdbuf_emit(0x1544D);
   ac_cmdbuf_emit(attr_ring_va >> 16);
   ac_cmdbuf_emit(S_03111C_MEM_SIZE((info->attribute_ring_size_per_se >> 16) - 1) |
                  S_03111C_BIG_PAGE(info->discardable_allows_big_page) |
                  S_03111C_L1_POLICY(1));

   if (info->gfx_level >= GFX12) {
      const uint64_t pos_va = attr_ring_va + info->pos_ring_offset;
      const uint64_t prim_va = attr_ring_va + info->prim_ring_offset;

      /* When one of these 4 registers is updated, all 4 must be updated. */
      ac_cmdbuf_set_ucfg_reg_seq(R_0309A0_GE_POS_RING_BASE, 4);
      ac_cmdbuf_emit(pos_va >> 16);
      ac_cmdbuf_emit(S_0309A4_MEM_SIZE(info->pos_ring_size_per_se >> 5));
      ac_cmdbuf_emit(prim_va >> 16);
      ac_cmdbuf_emit(S_0309AC_MEM_SIZE(info->prim_ring_size_per_se >> 5) |
                     S_0309AC_SCOPE(gfx12_scope_device) |
                     S_0309AC_PAF_TEMPORAL(gfx12_store_high_temporal_stay_dirty) |
                     S_0309AC_PAB_TEMPORAL(gfx12_load_last_use_discard) |
                     S_0309AC_SPEC_DATA_READ(gfx12_spec_read_auto) |
                     S_0309AC_FORCE_SE_SCOPE(1) |
                     S_0309AC_PAB_NOFILL(1));

      if (info->gfx_level == GFX12 && info->pfp_fw_version >= 2680) {
         /* Mitigate the HiZ GPU hang by increasing a timeout when
          * BOTTOM_OF_PIPE_TS is used as the workaround. This must be emitted
          * when the gfx queue is idle.
          */
         const uint32_t timeout = enable_gfx12_partial_hiz_wa ? 0xfff : 0;

         ac_cmdbuf_emit(PKT3(PKT3_UPDATE_DB_SUMMARIZER_TIMEOUT, 0, 0));
         ac_cmdbuf_emit(S_EF1_SUMM_CNTL_EVICT_TIMEOUT(timeout));
      }
   }

   ac_cmdbuf_end();
}

void
ac_emit_cp_tess_rings(struct ac_cmdbuf *cs, const struct radeon_info *info,
                      uint64_t attr_ring_va)
{
   const uint64_t va = attr_ring_va + info->tess_offchip_ring_size;
   uint32_t tf_ring_size = info->tess_factor_ring_size / 4;

   if (info->gfx_level >= GFX11) {
      /* TF_RING_SIZE is per SE on GFX11+. */
      tf_ring_size /= info->max_se;
   }

   assert((tf_ring_size & C_030938_SIZE) == 0);

   ac_cmdbuf_begin(cs);

   if (info->gfx_level >= GFX7) {
      ac_cmdbuf_set_ucfg_reg_seq(R_030938_VGT_TF_RING_SIZE, 3);
      ac_cmdbuf_emit(S_030938_SIZE(tf_ring_size));
      ac_cmdbuf_emit(info->hs_offchip_param);
      ac_cmdbuf_emit(va >> 8);

      if (info->gfx_level >= GFX12) {
         ac_cmdbuf_set_ucfg_reg(R_03099C_VGT_TF_MEMORY_BASE_HI, S_03099C_BASE_HI(va >> 40));
      } else if (info->gfx_level >= GFX10) {
         ac_cmdbuf_set_ucfg_reg(R_030984_VGT_TF_MEMORY_BASE_HI, S_030984_BASE_HI(va >> 40));
      } else if (info->gfx_level == GFX9) {
         ac_cmdbuf_set_ucfg_reg(R_030944_VGT_TF_MEMORY_BASE_HI, S_030944_BASE_HI(va >> 40));
      }
   } else {
      ac_cmdbuf_set_cfg_reg(R_008988_VGT_TF_RING_SIZE, S_008988_SIZE(tf_ring_size));
      ac_cmdbuf_set_cfg_reg(R_0089B8_VGT_TF_MEMORY_BASE, va >> 8);
      ac_cmdbuf_set_cfg_reg(R_0089B0_VGT_HS_OFFCHIP_PARAM, info->hs_offchip_param);
   }

   ac_cmdbuf_end();
}

void
ac_emit_cp_gfx_scratch(struct ac_cmdbuf *cs, enum amd_gfx_level gfx_level,
                       uint64_t va, uint32_t size)
{
   ac_cmdbuf_begin(cs);

   if (gfx_level >= GFX11) {
      ac_cmdbuf_set_ctx_reg_seq(R_0286E8_SPI_TMPRING_SIZE, 3);
      ac_cmdbuf_emit(size);
      ac_cmdbuf_emit(va >> 8);
      ac_cmdbuf_emit(va >> 40);
   } else {
      ac_cmdbuf_set_ctx_reg(R_0286E8_SPI_TMPRING_SIZE, size);
   }

   ac_cmdbuf_end();
}

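/* Usage sketch (hypothetical caller): "size" is the raw SPI_TMPRING_SIZE
 * value, so the caller composes it from the register's fields. The S_0286E8_*
 * setter names are assumed to exist in the register headers; the units of the
 * fields are defined by the hw generation.
 *
 *    ac_emit_cp_gfx_scratch(cs, gfx_level, scratch_va,
 *                           S_0286E8_WAVES(num_waves) |
 *                           S_0286E8_WAVESIZE(wavesize));
 */
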
/* Execute plain ACQUIRE_MEM that just flushes caches. This optionally waits
 * for idle on older chips. "engine" determines whether to sync in PFP or ME.
 */
void
ac_emit_cp_acquire_mem(struct ac_cmdbuf *cs, enum amd_gfx_level gfx_level,
                       enum amd_ip_type ip_type, uint32_t engine,
                       uint32_t gcr_cntl)
{
   assert(engine == V_581B_CP_PFP || engine == V_581B_CP_ME);
   assert(gcr_cntl);

   ac_cmdbuf_begin(cs);

   if (gfx_level >= GFX10) {
      /* ACQUIRE_MEM in PFP is implemented as ACQUIRE_MEM in ME + PFP_SYNC_ME. */
      const uint32_t engine_flag = engine == V_581B_CP_ME ? BITFIELD_BIT(31) : 0;

      /* Flush caches. This doesn't wait for idle. */
      ac_cmdbuf_emit(PKT3(PKT3_ACQUIRE_MEM, 6, 0));
      ac_cmdbuf_emit(engine_flag); /* which engine to use */
      ac_cmdbuf_emit(0xffffffff);  /* CP_COHER_SIZE */
      ac_cmdbuf_emit(0x01ffffff);  /* CP_COHER_SIZE_HI */
      ac_cmdbuf_emit(0);           /* CP_COHER_BASE */
      ac_cmdbuf_emit(0);           /* CP_COHER_BASE_HI */
      ac_cmdbuf_emit(0x0000000A);  /* POLL_INTERVAL */
      ac_cmdbuf_emit(gcr_cntl);    /* GCR_CNTL */
   } else {
      const bool is_mec = gfx_level >= GFX7 && ip_type == AMD_IP_COMPUTE;

      if (gfx_level == GFX9 || is_mec) {
         /* Flush caches and wait for the caches to assert idle. */
         ac_cmdbuf_emit(PKT3(PKT3_ACQUIRE_MEM, 5, 0) | PKT3_SHADER_TYPE_S(is_mec));
         ac_cmdbuf_emit(gcr_cntl);   /* CP_COHER_CNTL */
         ac_cmdbuf_emit(0xffffffff); /* CP_COHER_SIZE */
         ac_cmdbuf_emit(0xffffff);   /* CP_COHER_SIZE_HI */
         ac_cmdbuf_emit(0);          /* CP_COHER_BASE */
         ac_cmdbuf_emit(0);          /* CP_COHER_BASE_HI */
         ac_cmdbuf_emit(0x0000000A); /* POLL_INTERVAL */
      } else {
         /* ACQUIRE_MEM is only required on the compute ring, so use
          * SURFACE_SYNC on the gfx ring.
          */
         ac_cmdbuf_emit(PKT3(PKT3_SURFACE_SYNC, 3, 0));
         ac_cmdbuf_emit(gcr_cntl);   /* CP_COHER_CNTL */
         ac_cmdbuf_emit(0xffffffff); /* CP_COHER_SIZE */
         ac_cmdbuf_emit(0);          /* CP_COHER_BASE */
         ac_cmdbuf_emit(0x0000000A); /* POLL_INTERVAL */
      }
   }

   ac_cmdbuf_end();
}

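/* Usage sketch (hypothetical caller, GFX10+ where gcr_cntl uses the GCR
 * encoding): write back and invalidate GL2 and invalidate the vector L0
 * caches before the ME processes further packets. The S_587_* setters are
 * assumed to mirror the G_587_* getters used earlier in this file.
 *
 *    ac_emit_cp_acquire_mem(cs, gfx_level, AMD_IP_GFX, V_581B_CP_ME,
 *                           S_587_GL2_WB(1) | S_587_GL2_INV(1) |
 *                           S_587_GLV_INV(1));
 */
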
void
ac_emit_cp_release_mem(struct ac_cmdbuf *cs, enum amd_gfx_level gfx_level,
                       enum amd_ip_type ip_type, uint32_t event,
                       uint32_t event_flags, uint32_t dst_sel,
                       uint32_t int_sel, uint32_t data_sel, uint64_t va,
                       uint32_t new_fence, uint64_t eop_bug_va)
{
   /* Only GFX12+ supports GCR ops with PS_DONE & CS_DONE in RELEASE_MEM. */
   assert(gfx_level >= GFX12 || !event_flags || (event != V_028A90_PS_DONE &&
                                                 event != V_028A90_CS_DONE));

   const bool is_mec = gfx_level >= GFX7 && ip_type == AMD_IP_COMPUTE;

   /* GFX7 CP DMA: any use of CP_DMA.DST_SEL=TC must be avoided when EOS packets are used.
    * Use DST_SEL=MC instead. For prefetch, use SRC_SEL=TC and DST_SEL=MC.
    * Maybe related to waCpDmaHangMcTcAckDrop in PAL. To keep TC usable with
    * CP DMA, avoid EOS events entirely by emitting an EOP event instead.
    */
   if (gfx_level == GFX7 && (event == V_028A90_CS_DONE || event == V_028A90_PS_DONE))
      event = V_028A90_BOTTOM_OF_PIPE_TS;

   const uint32_t op = EVENT_TYPE(event) |
                       EVENT_INDEX(event == V_028A90_CS_DONE || event == V_028A90_PS_DONE ? 6 : 5) |
                       event_flags;
   const uint32_t sel = EOP_DST_SEL(dst_sel) |
                        EOP_INT_SEL(int_sel) |
                        EOP_DATA_SEL(data_sel);

   ac_cmdbuf_begin(cs);

   if (gfx_level >= GFX9 || is_mec) {
      /* A ZPASS_DONE or PIXEL_STAT_DUMP_EVENT (of the DB occlusion counters)
       * must immediately precede every timestamp event to prevent a GPU hang
       * on GFX9.
       */
      if (gfx_level == GFX9 && !is_mec && eop_bug_va) {
         ac_cmdbuf_emit(PKT3(PKT3_EVENT_WRITE, 2, 0));
         ac_cmdbuf_emit(EVENT_TYPE(V_028A90_ZPASS_DONE) | EVENT_INDEX(1));
         ac_cmdbuf_emit(eop_bug_va);
         ac_cmdbuf_emit(eop_bug_va >> 32);
      }

      ac_cmdbuf_emit(PKT3(PKT3_RELEASE_MEM, gfx_level >= GFX9 ? 6 : 5, false));
      ac_cmdbuf_emit(op);
      ac_cmdbuf_emit(sel);
      ac_cmdbuf_emit(va);        /* address lo */
      ac_cmdbuf_emit(va >> 32);  /* address hi */
      ac_cmdbuf_emit(new_fence); /* immediate data lo */
      ac_cmdbuf_emit(0);         /* immediate data hi */
      if (gfx_level >= GFX9)
         ac_cmdbuf_emit(0); /* unused */
   } else {
      /* On GFX6, EOS events are always emitted with EVENT_WRITE_EOS.
       * On GFX7+, EOS events are emitted with EVENT_WRITE_EOS on the graphics
       * queue, and with RELEASE_MEM on the compute queue.
       */
      if (event == V_028A90_CS_DONE || event == V_028A90_PS_DONE) {
         assert(event_flags == 0 && dst_sel == EOP_DST_SEL_MEM && data_sel == EOP_DATA_SEL_VALUE_32BIT);

         ac_cmdbuf_emit(PKT3(PKT3_EVENT_WRITE_EOS, 3, false));
         ac_cmdbuf_emit(op);
         ac_cmdbuf_emit(va);
         ac_cmdbuf_emit(((va >> 32) & 0xffff) |
                        EOS_DATA_SEL(EOS_DATA_SEL_VALUE_32BIT));
         ac_cmdbuf_emit(new_fence);
      } else {
         if (gfx_level == GFX7 || gfx_level == GFX8) {
            /* Two EOP events are required to make all engines go idle (and
             * to execute optional cache flushes) before the timestamp is
             * written.
             */
            ac_cmdbuf_emit(PKT3(PKT3_EVENT_WRITE_EOP, 4, false));
            ac_cmdbuf_emit(op);
            ac_cmdbuf_emit(eop_bug_va);
            ac_cmdbuf_emit(((eop_bug_va >> 32) & 0xffff) | sel);
            ac_cmdbuf_emit(0); /* immediate data */
            ac_cmdbuf_emit(0); /* unused */
         }

         ac_cmdbuf_emit(PKT3(PKT3_EVENT_WRITE_EOP, 4, false));
         ac_cmdbuf_emit(op);
         ac_cmdbuf_emit(va);
         ac_cmdbuf_emit(((va >> 32) & 0xffff) | sel);
         ac_cmdbuf_emit(new_fence); /* immediate data */
         ac_cmdbuf_emit(0); /* unused */
      }
   }

   ac_cmdbuf_end();
}

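/* Usage sketch (hypothetical caller): write a 32-bit fence value at the
 * bottom of the pipe and wait for it in the CP. EOP_INT_SEL_NONE and
 * WAIT_REG_MEM_EQUAL are assumed sid.h names; the other selectors appear
 * above.
 *
 *    ac_emit_cp_release_mem(cs, gfx_level, AMD_IP_GFX,
 *                           V_028A90_BOTTOM_OF_PIPE_TS, 0,
 *                           EOP_DST_SEL_MEM, EOP_INT_SEL_NONE,
 *                           EOP_DATA_SEL_VALUE_32BIT, fence_va, fence_value,
 *                           0);
 *    ac_emit_cp_wait_mem(cs, fence_va, fence_value, 0xffffffff,
 *                        WAIT_REG_MEM_EQUAL);
 */
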
void
ac_emit_cp_atomic_mem(struct ac_cmdbuf *cs, uint32_t atomic_op,
                      uint32_t atomic_cmd, uint64_t va, uint64_t data,
                      uint64_t compare_data)
{
   ac_cmdbuf_begin(cs);
   ac_cmdbuf_emit(PKT3(PKT3_ATOMIC_MEM, 7, 0));
   ac_cmdbuf_emit(S_1E1_ATOMIC(atomic_op) |
                  S_1E1_COMMAND(atomic_cmd));
   ac_cmdbuf_emit(va);                 /* addr lo */
   ac_cmdbuf_emit(va >> 32);           /* addr hi */
   ac_cmdbuf_emit(data);               /* data lo */
   ac_cmdbuf_emit(data >> 32);         /* data hi */
   ac_cmdbuf_emit(compare_data);       /* compare data lo */
   ac_cmdbuf_emit(compare_data >> 32); /* compare data hi */
   ac_cmdbuf_emit(10);                 /* loop interval */
   ac_cmdbuf_end();
}

void
ac_emit_cp_nop(struct ac_cmdbuf *cs, uint32_t value)
{
   ac_cmdbuf_begin(cs);
   ac_cmdbuf_emit(PKT3(PKT3_NOP, 0, 0));
   ac_cmdbuf_emit(value);
   ac_cmdbuf_end();
}

void
ac_emit_cp_load_context_reg_index(struct ac_cmdbuf *cs, uint32_t reg,
                                  uint32_t reg_count, uint64_t va,
                                  bool predicate)
{
   assert(reg_count);

   ac_cmdbuf_begin(cs);
   ac_cmdbuf_emit(PKT3(PKT3_LOAD_CONTEXT_REG_INDEX, 3, predicate));
   ac_cmdbuf_emit(va);
   ac_cmdbuf_emit(va >> 32);
   ac_cmdbuf_emit((reg - SI_CONTEXT_REG_OFFSET) >> 2);
   ac_cmdbuf_emit(reg_count); /* in DWORDS */
   ac_cmdbuf_end();
}

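/* Usage sketch (hypothetical caller): reload a single saved context register
 * value (here SPI_TMPRING_SIZE, which is also set above) from memory, e.g.
 * when restoring context register state.
 *
 *    ac_emit_cp_load_context_reg_index(cs, R_0286E8_SPI_TMPRING_SIZE, 1,
 *                                      saved_regs_va, false);
 */
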
void
ac_emit_cp_inhibit_clockgating(struct ac_cmdbuf *cs, enum amd_gfx_level gfx_level,
                               bool inhibit)
{
   if (gfx_level >= GFX11)
      return; /* not needed */

   ac_cmdbuf_begin(cs);
   if (gfx_level >= GFX10) {
      ac_cmdbuf_set_ucfg_reg(R_037390_RLC_PERFMON_CLK_CNTL,
                             S_037390_PERFMON_CLOCK_STATE(inhibit));
   } else if (gfx_level >= GFX8) {
      ac_cmdbuf_set_ucfg_reg(R_0372FC_RLC_PERFMON_CLK_CNTL,
                             S_0372FC_PERFMON_CLOCK_STATE(inhibit));
   }
   ac_cmdbuf_end();
}

void
ac_emit_cp_spi_config_cntl(struct ac_cmdbuf *cs, enum amd_gfx_level gfx_level,
                           bool enable)
{
   ac_cmdbuf_begin(cs);
   if (gfx_level >= GFX12) {
      ac_cmdbuf_set_ucfg_reg(R_031120_SPI_SQG_EVENT_CTL,
                             S_031120_ENABLE_SQG_TOP_EVENTS(enable) |
                             S_031120_ENABLE_SQG_BOP_EVENTS(enable));
   } else if (gfx_level >= GFX9) {
      uint32_t spi_config_cntl = S_031100_GPR_WRITE_PRIORITY(0x2c688) |
                                 S_031100_EXP_PRIORITY_ORDER(3) |
                                 S_031100_ENABLE_SQG_TOP_EVENTS(enable) |
                                 S_031100_ENABLE_SQG_BOP_EVENTS(enable);

      if (gfx_level >= GFX10)
         spi_config_cntl |= S_031100_PS_PKR_PRIORITY_CNTL(3);

      ac_cmdbuf_set_ucfg_reg(R_031100_SPI_CONFIG_CNTL, spi_config_cntl);
   } else {
      /* SPI_CONFIG_CNTL is a protected register on GFX6-GFX8. */
      ac_cmdbuf_set_privileged_cfg_reg(R_009100_SPI_CONFIG_CNTL,
                                       S_009100_ENABLE_SQG_TOP_EVENTS(enable) |
                                       S_009100_ENABLE_SQG_BOP_EVENTS(enable));
   }
   ac_cmdbuf_end();
}

void
ac_emit_cp_update_windowed_counters(struct ac_cmdbuf *cs, const struct radeon_info *info,
                                    enum amd_ip_type ip_type, bool enable)
{
   ac_cmdbuf_begin(cs);
   if (ip_type == AMD_IP_GFX) {
      if (enable) {
         ac_cmdbuf_event_write(V_028A90_PERFCOUNTER_START);
      } else if (!info->never_send_perfcounter_stop) {
         ac_cmdbuf_event_write(V_028A90_PERFCOUNTER_STOP);
      }
   }
   ac_cmdbuf_set_sh_reg(R_00B82C_COMPUTE_PERFCOUNT_ENABLE,
                        S_00B82C_PERFCOUNT_ENABLE(enable));
   ac_cmdbuf_end();
}