mirror of https://gitlab.freedesktop.org/mesa/mesa.git
amd: move CP emit helpers to ac_cmdbuf_cp.c/h
Seems more organized this way.

Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/37881>
This commit is contained in:
parent e0ffc41d9a
commit 7cd12e5c6a
7 changed files with 579 additions and 532 deletions
src/amd/common/ac_cmdbuf.c
@@ -858,457 +858,6 @@ ac_init_graphics_preamble_state(const struct ac_preamble_state *state,
    }
 }

[451 lines removed: the CP emit helpers ac_emit_cp_cond_exec, ac_emit_cp_write_data_head, ac_emit_cp_write_data, ac_emit_cp_write_data_imm, ac_emit_cp_wait_mem, the static is_ts_event(), ac_emit_cp_acquire_mem_pws, ac_emit_cp_release_mem_pws, ac_emit_cp_copy_data, ac_emit_cp_pfp_sync_me, ac_emit_cp_set_predication, ac_emit_cp_gfx11_ge_rings, ac_emit_cp_tess_rings, ac_emit_cp_gfx_scratch, ac_emit_cp_acquire_mem, ac_emit_cp_atomic_mem, ac_emit_cp_nop, and ac_emit_cp_load_context_reg_index — moved verbatim to the new file src/amd/common/ac_cmdbuf_cp.c, reproduced in full below.]

 void
 ac_cmdbuf_flush_vgt_streamout(struct ac_cmdbuf *cs, enum amd_gfx_level gfx_level)
 {
src/amd/common/ac_cmdbuf.h
@@ -120,87 +120,6 @@ void
 ac_init_graphics_preamble_state(const struct ac_preamble_state *state,
                                 struct ac_pm4_state *pm4);

[81 lines removed: the prototypes for the helpers listed above plus the ac_cp_copy_data_flags enum — moved verbatim to the new file src/amd/common/ac_cmdbuf_cp.h, reproduced in full below.]

 void
 ac_cmdbuf_flush_vgt_streamout(struct ac_cmdbuf *cs, enum amd_gfx_level gfx_level);
src/amd/common/ac_cmdbuf_cp.c (new file, 465 lines)
@@ -0,0 +1,465 @@
/*
 * Copyright 2012 Advanced Micro Devices, Inc.
 * Copyright 2024 Valve Corporation
 *
 * SPDX-License-Identifier: MIT
 */

#include "ac_cmdbuf.h"
#include "ac_cmdbuf_cp.h"
#include "ac_gpu_info.h"
#include "ac_shader_util.h"

#include "amd_family.h"
#include "sid.h"

void
ac_emit_cp_cond_exec(struct ac_cmdbuf *cs, enum amd_gfx_level gfx_level,
                     uint64_t va, uint32_t count)
{
   ac_cmdbuf_begin(cs);
   if (gfx_level >= GFX7) {
      ac_cmdbuf_emit(PKT3(PKT3_COND_EXEC, 3, 0));
      ac_cmdbuf_emit(va);
      ac_cmdbuf_emit(va >> 32);
      ac_cmdbuf_emit(0);
      ac_cmdbuf_emit(count);
   } else {
      ac_cmdbuf_emit(PKT3(PKT3_COND_EXEC, 2, 0));
      ac_cmdbuf_emit(va);
      ac_cmdbuf_emit(va >> 32);
      ac_cmdbuf_emit(count);
   }
   ac_cmdbuf_end();
}

void
ac_emit_cp_write_data_head(struct ac_cmdbuf *cs, uint32_t engine_sel,
                           uint32_t dst_sel, uint64_t va, uint32_t size,
                           bool predicate)
{
   ac_cmdbuf_begin(cs);
   ac_cmdbuf_emit(PKT3(PKT3_WRITE_DATA, 2 + size, predicate));
   ac_cmdbuf_emit(S_370_DST_SEL(dst_sel) |
                  S_370_WR_CONFIRM(1) |
                  S_370_ENGINE_SEL(engine_sel));
   ac_cmdbuf_emit(va);
   ac_cmdbuf_emit(va >> 32);
   ac_cmdbuf_end();
}

void
ac_emit_cp_write_data(struct ac_cmdbuf *cs, uint32_t engine_sel,
                      uint32_t dst_sel, uint64_t va, uint32_t size,
                      const uint32_t *data, bool predicate)
{
   ac_emit_cp_write_data_head(cs, engine_sel, dst_sel, va, size, predicate);
   ac_cmdbuf_begin(cs);
   ac_cmdbuf_emit_array(data, size);
   ac_cmdbuf_end();
}

void
ac_emit_cp_write_data_imm(struct ac_cmdbuf *cs, unsigned engine_sel,
                          uint64_t va, uint32_t value)
{
   ac_emit_cp_write_data(cs, engine_sel, V_370_MEM, va, 1, &value, false);
}
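A usage sketch, not part of the commit: pairing the write helpers above with ac_emit_cp_wait_mem (defined just below) to write a fence value and stall the CP until it is visible. V_370_ME (engine select) and WAIT_REG_MEM_EQUAL (compare function) are assumed to come from sid.h; "fence_va" is a hypothetical GPU-visible address.

/* Sketch: write a 32-bit fence through the ME, then poll the memory
 * location until it equals the reference value. */
static void
emit_fence_then_wait(struct ac_cmdbuf *cs, uint64_t fence_va)
{
   ac_emit_cp_write_data_imm(cs, V_370_ME, fence_va, 1);
   ac_emit_cp_wait_mem(cs, fence_va, 1, 0xffffffff, WAIT_REG_MEM_EQUAL);
}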
void
ac_emit_cp_wait_mem(struct ac_cmdbuf *cs, uint64_t va, uint32_t ref,
                    uint32_t mask, unsigned flags)
{
   ac_cmdbuf_begin(cs);
   ac_cmdbuf_emit(PKT3(PKT3_WAIT_REG_MEM, 5, 0));
   ac_cmdbuf_emit(WAIT_REG_MEM_MEM_SPACE(1) | flags);
   ac_cmdbuf_emit(va);
   ac_cmdbuf_emit(va >> 32);
   ac_cmdbuf_emit(ref);  /* reference value */
   ac_cmdbuf_emit(mask); /* mask */
   ac_cmdbuf_emit(4);    /* poll interval */
   ac_cmdbuf_end();
}

static bool
is_ts_event(unsigned event_type)
{
   return event_type == V_028A90_CACHE_FLUSH_TS ||
          event_type == V_028A90_CACHE_FLUSH_AND_INV_TS_EVENT ||
          event_type == V_028A90_BOTTOM_OF_PIPE_TS ||
          event_type == V_028A90_FLUSH_AND_INV_DB_DATA_TS ||
          event_type == V_028A90_FLUSH_AND_INV_CB_DATA_TS;
}

/* This waits, or inserts into the pipeline a wait, for a previous
 * RELEASE_MEM PWS event.
 *
 * "event_type" must be the same as the RELEASE_MEM PWS event.
 *
 * "stage_sel" determines when the waiting happens. It can be CP_PFP, CP_ME,
 * PRE_SHADER, PRE_DEPTH, or PRE_PIX_SHADER, allowing the wait to happen
 * later in the pipeline instead of completely idling the hw at the frontend.
 *
 * "gcr_cntl" must be 0 if not waiting in PFP or ME. When waiting later in the
 * pipeline, any cache flushes must be part of RELEASE_MEM, not ACQUIRE_MEM.
 *
 * "count" determines how many RELEASE_MEM PWS events ago it should wait
 * for, minus one (starting from 0). There are 3 event types: PS_DONE,
 * CS_DONE, and TS events. The counter increments separately for each
 * type, so 0 with PS_DONE means wait for the last PS_DONE event, while 0 with
 * *_TS means wait for the last TS event (even if it's a different TS event,
 * because all TS events share the same counter).
 *
 * PRE_SHADER waits before the first shader that has IMAGE_OP=1, while
 * PRE_PIX_SHADER waits before PS if it has IMAGE_OP=1 (IMAGE_OP should really
 * be called SYNC_ENABLE). PRE_DEPTH waits before depth/stencil tests.
 *
 * PRE_COLOR also exists but shouldn't be used because it can hang. It's
 * recommended to use PRE_PIX_SHADER instead, which means all PS that have
 * color exports with enabled color buffers, non-zero colormask, and non-zero
 * sample mask must have IMAGE_OP=1 to enable the sync before PS.
 *
 * Waiting for a PWS fence that was generated by a previous IB is valid, but
 * if there is an IB from another process in between and that IB also inserted
 * a PWS fence, the hw will wait for the newer fence instead because the PWS
 * counter was incremented.
 */
void
ac_emit_cp_acquire_mem_pws(struct ac_cmdbuf *cs, ASSERTED enum amd_gfx_level gfx_level,
                           ASSERTED enum amd_ip_type ip_type, uint32_t event_type,
                           uint32_t stage_sel, uint32_t count,
                           uint32_t gcr_cntl)
{
   assert(gfx_level >= GFX11 && ip_type == AMD_IP_GFX);

   const bool ts = is_ts_event(event_type);
   const bool ps_done = event_type == V_028A90_PS_DONE;
   const bool cs_done = event_type == V_028A90_CS_DONE;
   const uint32_t counter_sel = ts ? V_580_TS_SELECT : ps_done ? V_580_PS_SELECT : V_580_CS_SELECT;

   assert((int)ts + (int)cs_done + (int)ps_done == 1);
   assert(!gcr_cntl || stage_sel == V_580_CP_PFP || stage_sel == V_580_CP_ME);
   assert(stage_sel != V_580_PRE_COLOR);

   ac_cmdbuf_begin(cs);
   ac_cmdbuf_emit(PKT3(PKT3_ACQUIRE_MEM, 6, 0));
   ac_cmdbuf_emit(S_580_PWS_STAGE_SEL(stage_sel) |
                  S_580_PWS_COUNTER_SEL(counter_sel) |
                  S_580_PWS_ENA2(1) |
                  S_580_PWS_COUNT(count));
   ac_cmdbuf_emit(0xffffffff); /* GCR_SIZE */
   ac_cmdbuf_emit(0x01ffffff); /* GCR_SIZE_HI */
   ac_cmdbuf_emit(0);          /* GCR_BASE_LO */
   ac_cmdbuf_emit(0);          /* GCR_BASE_HI */
   ac_cmdbuf_emit(S_585_PWS_ENA(1));
   ac_cmdbuf_emit(gcr_cntl);   /* GCR_CNTL (this has no effect if PWS_STAGE_SEL isn't PFP or ME) */
   ac_cmdbuf_end();
}

/* Insert CS_DONE, PS_DONE, or a *_TS event into the pipeline, which will
 * signal after the work indicated by the event is complete, which optionally
 * includes flushing caches using "gcr_cntl" after the completion of the work.
 * *_TS events are always signaled at the end of the pipeline, while CS_DONE
 * and PS_DONE are signaled when those shaders finish. This call only inserts
 * the event into the pipeline. It doesn't wait for anything and it doesn't
 * execute anything immediately. The only way to wait for the event completion
 * is to call ac_emit_cp_acquire_mem_pws with the same "event_type".
 */
void
ac_emit_cp_release_mem_pws(struct ac_cmdbuf *cs, ASSERTED enum amd_gfx_level gfx_level,
                           ASSERTED enum amd_ip_type ip_type, uint32_t event_type,
                           uint32_t gcr_cntl)
{
   assert(gfx_level >= GFX11 && ip_type == AMD_IP_GFX);

   /* Extract GCR_CNTL fields because the encoding is different in RELEASE_MEM. */
   assert(G_586_GLI_INV(gcr_cntl) == 0);
   assert(G_586_GL1_RANGE(gcr_cntl) == 0);
   const uint32_t glm_wb = G_586_GLM_WB(gcr_cntl);
   const uint32_t glm_inv = G_586_GLM_INV(gcr_cntl);
   const uint32_t glk_wb = G_586_GLK_WB(gcr_cntl);
   const uint32_t glk_inv = G_586_GLK_INV(gcr_cntl);
   const uint32_t glv_inv = G_586_GLV_INV(gcr_cntl);
   const uint32_t gl1_inv = G_586_GL1_INV(gcr_cntl);
   assert(G_586_GL2_US(gcr_cntl) == 0);
   assert(G_586_GL2_RANGE(gcr_cntl) == 0);
   assert(G_586_GL2_DISCARD(gcr_cntl) == 0);
   const uint32_t gl2_inv = G_586_GL2_INV(gcr_cntl);
   const uint32_t gl2_wb = G_586_GL2_WB(gcr_cntl);
   const uint32_t gcr_seq = G_586_SEQ(gcr_cntl);
   const bool ts = is_ts_event(event_type);

   ac_cmdbuf_begin(cs);
   ac_cmdbuf_emit(PKT3(PKT3_RELEASE_MEM, 6, 0));
   ac_cmdbuf_emit(S_490_EVENT_TYPE(event_type) |
                  S_490_EVENT_INDEX(ts ? 5 : 6) |
                  S_490_GLM_WB(glm_wb) |
                  S_490_GLM_INV(glm_inv) |
                  S_490_GLV_INV(glv_inv) |
                  S_490_GL1_INV(gl1_inv) |
                  S_490_GL2_INV(gl2_inv) |
                  S_490_GL2_WB(gl2_wb) |
                  S_490_SEQ(gcr_seq) |
                  S_490_GLK_WB(glk_wb) |
                  S_490_GLK_INV(glk_inv) |
                  S_490_PWS_ENABLE(1));
   ac_cmdbuf_emit(0); /* DST_SEL, INT_SEL, DATA_SEL */
   ac_cmdbuf_emit(0); /* ADDRESS_LO */
   ac_cmdbuf_emit(0); /* ADDRESS_HI */
   ac_cmdbuf_emit(0); /* DATA_LO */
   ac_cmdbuf_emit(0); /* DATA_HI */
   ac_cmdbuf_emit(0); /* INT_CTXID */
   ac_cmdbuf_end();
}
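A sketch of the intended pairing on a GFX11+ gfx queue, not part of the commit; the event and stage values are the sid.h enums already used above, and cache flushes (gcr_cntl) are left out for brevity.

/* Sketch: a full pipeline barrier via PWS. RELEASE_MEM inserts a
 * bottom-of-pipe TS event; ACQUIRE_MEM then makes the CP ME wait for
 * it. count=0 means "the most recent TS event", i.e. the one just
 * released above. */
static void
emit_pws_bop_barrier(struct ac_cmdbuf *cs, enum amd_gfx_level gfx_level)
{
   ac_emit_cp_release_mem_pws(cs, gfx_level, AMD_IP_GFX,
                              V_028A90_BOTTOM_OF_PIPE_TS, 0);
   ac_emit_cp_acquire_mem_pws(cs, gfx_level, AMD_IP_GFX,
                              V_028A90_BOTTOM_OF_PIPE_TS,
                              V_580_CP_ME, 0, 0);
}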
void
ac_emit_cp_copy_data(struct ac_cmdbuf *cs, uint32_t src_sel, uint32_t dst_sel,
                     uint64_t src_va, uint64_t dst_va,
                     enum ac_cp_copy_data_flags flags, bool predicate)
{
   uint32_t dword0 = COPY_DATA_SRC_SEL(src_sel) |
                     COPY_DATA_DST_SEL(dst_sel);

   if (flags & AC_CP_COPY_DATA_WR_CONFIRM)
      dword0 |= COPY_DATA_WR_CONFIRM;
   if (flags & AC_CP_COPY_DATA_COUNT_SEL)
      dword0 |= COPY_DATA_COUNT_SEL;
   if (flags & AC_CP_COPY_DATA_ENGINE_PFP)
      dword0 |= COPY_DATA_ENGINE_PFP;

   ac_cmdbuf_begin(cs);
   ac_cmdbuf_emit(PKT3(PKT3_COPY_DATA, 4, predicate));
   ac_cmdbuf_emit(dword0);
   ac_cmdbuf_emit(src_va);
   ac_cmdbuf_emit(src_va >> 32);
   ac_cmdbuf_emit(dst_va);
   ac_cmdbuf_emit(dst_va >> 32);
   ac_cmdbuf_end();
}
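For illustration, not part of the commit: a confirmed 64-bit copy of the GPU clock into memory. COPY_DATA_TIMESTAMP and COPY_DATA_DST_MEM are assumed to be the sid.h selector values; "ts_va" is a hypothetical destination address.

/* Sketch: COUNT_SEL selects a 64-bit copy and WR_CONFIRM makes the
 * packet wait for the write to land; the source VA is ignored when
 * the source is the timestamp counter. */
static void
emit_read_gpu_timestamp(struct ac_cmdbuf *cs, uint64_t ts_va)
{
   ac_emit_cp_copy_data(cs, COPY_DATA_TIMESTAMP, COPY_DATA_DST_MEM,
                        0, ts_va,
                        AC_CP_COPY_DATA_COUNT_SEL | AC_CP_COPY_DATA_WR_CONFIRM,
                        false);
}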
void
ac_emit_cp_pfp_sync_me(struct ac_cmdbuf *cs, bool predicate)
{
   ac_cmdbuf_begin(cs);
   ac_cmdbuf_emit(PKT3(PKT3_PFP_SYNC_ME, 0, predicate));
   ac_cmdbuf_emit(0);
   ac_cmdbuf_end();
}

void
ac_emit_cp_set_predication(struct ac_cmdbuf *cs, enum amd_gfx_level gfx_level,
                           uint64_t va, uint32_t op)
{
   ac_cmdbuf_begin(cs);
   if (gfx_level >= GFX9) {
      ac_cmdbuf_emit(PKT3(PKT3_SET_PREDICATION, 2, 0));
      ac_cmdbuf_emit(op);
      ac_cmdbuf_emit(va);
      ac_cmdbuf_emit(va >> 32);
   } else {
      ac_cmdbuf_emit(PKT3(PKT3_SET_PREDICATION, 1, 0));
      ac_cmdbuf_emit(va);
      ac_cmdbuf_emit(op | ((va >> 32) & 0xFF));
   }
   ac_cmdbuf_end();
}

void
ac_emit_cp_gfx11_ge_rings(struct ac_cmdbuf *cs, const struct radeon_info *info,
                          uint64_t attr_ring_va, bool enable_gfx12_partial_hiz_wa)
{
   assert(info->gfx_level >= GFX11);
   assert((attr_ring_va >> 32) == info->address32_hi);

   ac_cmdbuf_begin(cs);

   ac_cmdbuf_set_uconfig_reg_seq(R_031110_SPI_GS_THROTTLE_CNTL1, 4);
   ac_cmdbuf_emit(0x12355123);
   ac_cmdbuf_emit(0x1544D);
   ac_cmdbuf_emit(attr_ring_va >> 16);
   ac_cmdbuf_emit(S_03111C_MEM_SIZE((info->attribute_ring_size_per_se >> 16) - 1) |
                  S_03111C_BIG_PAGE(info->discardable_allows_big_page) |
                  S_03111C_L1_POLICY(1));

   if (info->gfx_level >= GFX12) {
      const uint64_t pos_va = attr_ring_va + info->pos_ring_offset;
      const uint64_t prim_va = attr_ring_va + info->prim_ring_offset;

      /* When one of these 4 registers is updated, all 4 must be updated. */
      ac_cmdbuf_set_uconfig_reg_seq(R_0309A0_GE_POS_RING_BASE, 4);
      ac_cmdbuf_emit(pos_va >> 16);
      ac_cmdbuf_emit(S_0309A4_MEM_SIZE(info->pos_ring_size_per_se >> 5));
      ac_cmdbuf_emit(prim_va >> 16);
      ac_cmdbuf_emit(S_0309AC_MEM_SIZE(info->prim_ring_size_per_se >> 5) |
                     S_0309AC_SCOPE(gfx12_scope_device) |
                     S_0309AC_PAF_TEMPORAL(gfx12_store_high_temporal_stay_dirty) |
                     S_0309AC_PAB_TEMPORAL(gfx12_load_last_use_discard) |
                     S_0309AC_SPEC_DATA_READ(gfx12_spec_read_auto) |
                     S_0309AC_FORCE_SE_SCOPE(1) |
                     S_0309AC_PAB_NOFILL(1));

      if (info->gfx_level == GFX12 && info->pfp_fw_version >= 2680) {
         /* Mitigate the HiZ GPU hang by increasing a timeout when
          * BOTTOM_OF_PIPE_TS is used as the workaround. This must be emitted
          * when the gfx queue is idle.
          */
         const uint32_t timeout = enable_gfx12_partial_hiz_wa ? 0xfff : 0;

         ac_cmdbuf_emit(PKT3(PKT3_UPDATE_DB_SUMMARIZER_TIMEOUT, 0, 0));
         ac_cmdbuf_emit(S_EF1_SUMM_CNTL_EVICT_TIMEOUT(timeout));
      }
   }

   ac_cmdbuf_end();
}

void
ac_emit_cp_tess_rings(struct ac_cmdbuf *cs, const struct radeon_info *info,
                      uint64_t attr_ring_va)
{
   const uint64_t va = attr_ring_va + info->tess_offchip_ring_size;
   uint32_t tf_ring_size = info->tess_factor_ring_size / 4;

   if (info->gfx_level >= GFX11) {
      /* TF_RING_SIZE is per SE on GFX11. */
      tf_ring_size /= info->max_se;
   }

   assert((tf_ring_size & C_030938_SIZE) == 0);

   ac_cmdbuf_begin(cs);

   if (info->gfx_level >= GFX7) {
      ac_cmdbuf_set_uconfig_reg_seq(R_030938_VGT_TF_RING_SIZE, 3);
      ac_cmdbuf_emit(S_030938_SIZE(tf_ring_size));
      ac_cmdbuf_emit(info->hs_offchip_param);
      ac_cmdbuf_emit(va >> 8);

      if (info->gfx_level >= GFX12) {
         ac_cmdbuf_set_uconfig_reg(R_03099C_VGT_TF_MEMORY_BASE_HI, S_03099C_BASE_HI(va >> 40));
      } else if (info->gfx_level >= GFX10) {
         ac_cmdbuf_set_uconfig_reg(R_030984_VGT_TF_MEMORY_BASE_HI, S_030984_BASE_HI(va >> 40));
      } else if (info->gfx_level == GFX9) {
         ac_cmdbuf_set_uconfig_reg(R_030944_VGT_TF_MEMORY_BASE_HI, S_030944_BASE_HI(va >> 40));
      }
   } else {
      ac_cmdbuf_set_config_reg(R_008988_VGT_TF_RING_SIZE, S_008988_SIZE(tf_ring_size));
      ac_cmdbuf_set_config_reg(R_0089B8_VGT_TF_MEMORY_BASE, va >> 8);
      ac_cmdbuf_set_config_reg(R_0089B0_VGT_HS_OFFCHIP_PARAM, info->hs_offchip_param);
   }

   ac_cmdbuf_end();
}

void
ac_emit_cp_gfx_scratch(struct ac_cmdbuf *cs, enum amd_gfx_level gfx_level,
                       uint64_t va, uint32_t size)
{
   ac_cmdbuf_begin(cs);

   if (gfx_level >= GFX11) {
      ac_cmdbuf_set_context_reg_seq(R_0286E8_SPI_TMPRING_SIZE, 3);
      ac_cmdbuf_emit(size);
      ac_cmdbuf_emit(va >> 8);
      ac_cmdbuf_emit(va >> 40);
   } else {
      ac_cmdbuf_set_context_reg(R_0286E8_SPI_TMPRING_SIZE, size);
   }

   ac_cmdbuf_end();
}

/* Execute plain ACQUIRE_MEM that just flushes caches. This optionally waits
 * for idle on older chips. "engine" determines whether to sync in PFP or ME.
 */
void
ac_emit_cp_acquire_mem(struct ac_cmdbuf *cs, enum amd_gfx_level gfx_level,
                       enum amd_ip_type ip_type, uint32_t engine,
                       uint32_t gcr_cntl)
{
   assert(engine == V_580_CP_PFP || engine == V_580_CP_ME);
   assert(gcr_cntl);

   ac_cmdbuf_begin(cs);

   if (gfx_level >= GFX10) {
      /* ACQUIRE_MEM in PFP is implemented as ACQUIRE_MEM in ME + PFP_SYNC_ME. */
      const uint32_t engine_flag = engine == V_580_CP_ME ? BITFIELD_BIT(31) : 0;

      /* Flush caches. This doesn't wait for idle. */
      ac_cmdbuf_emit(PKT3(PKT3_ACQUIRE_MEM, 6, 0));
      ac_cmdbuf_emit(engine_flag); /* which engine to use */
      ac_cmdbuf_emit(0xffffffff);  /* CP_COHER_SIZE */
      ac_cmdbuf_emit(0x01ffffff);  /* CP_COHER_SIZE_HI */
      ac_cmdbuf_emit(0);           /* CP_COHER_BASE */
      ac_cmdbuf_emit(0);           /* CP_COHER_BASE_HI */
      ac_cmdbuf_emit(0x0000000A);  /* POLL_INTERVAL */
      ac_cmdbuf_emit(gcr_cntl);    /* GCR_CNTL */
   } else {
      const bool is_mec = gfx_level >= GFX7 && ip_type == AMD_IP_COMPUTE;

      if (gfx_level == GFX9 || is_mec) {
         /* Flush caches and wait for the caches to assert idle. */
         ac_cmdbuf_emit(PKT3(PKT3_ACQUIRE_MEM, 5, 0) | PKT3_SHADER_TYPE_S(is_mec));
         ac_cmdbuf_emit(gcr_cntl);   /* CP_COHER_CNTL */
         ac_cmdbuf_emit(0xffffffff); /* CP_COHER_SIZE */
         ac_cmdbuf_emit(0xffffff);   /* CP_COHER_SIZE_HI */
         ac_cmdbuf_emit(0);          /* CP_COHER_BASE */
         ac_cmdbuf_emit(0);          /* CP_COHER_BASE_HI */
         ac_cmdbuf_emit(0x0000000A); /* POLL_INTERVAL */
      } else {
         /* ACQUIRE_MEM is only required on the compute ring;
          * the gfx ring uses SURFACE_SYNC instead. */
         ac_cmdbuf_emit(PKT3(PKT3_SURFACE_SYNC, 3, 0));
         ac_cmdbuf_emit(gcr_cntl);   /* CP_COHER_CNTL */
         ac_cmdbuf_emit(0xffffffff); /* CP_COHER_SIZE */
         ac_cmdbuf_emit(0);          /* CP_COHER_BASE */
         ac_cmdbuf_emit(0x0000000A); /* POLL_INTERVAL */
      }
   }

   ac_cmdbuf_end();
}
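A sketch of a typical call, not part of the commit. The GCR bits are driver policy, so gcr_cntl is simply passed through here; the engine values are the sid.h enums used above.

/* Sketch: flush/invalidate caches from the ME without idling the whole
 * pipeline (the GFX10+ path above). Use V_580_CP_PFP instead when the
 * prefetch parser must also observe the flush. */
static void
emit_cache_flush(struct ac_cmdbuf *cs, enum amd_gfx_level gfx_level,
                 uint32_t gcr_cntl)
{
   ac_emit_cp_acquire_mem(cs, gfx_level, AMD_IP_GFX, V_580_CP_ME, gcr_cntl);
}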
void
ac_emit_cp_atomic_mem(struct ac_cmdbuf *cs, uint32_t atomic_op,
                      uint32_t atomic_cmd, uint64_t va, uint64_t data,
                      uint64_t compare_data)
{
   ac_cmdbuf_begin(cs);
   ac_cmdbuf_emit(PKT3(PKT3_ATOMIC_MEM, 7, 0));
   ac_cmdbuf_emit(ATOMIC_OP(atomic_op) |
                  ATOMIC_COMMAND(atomic_cmd));
   ac_cmdbuf_emit(va);                 /* addr lo */
   ac_cmdbuf_emit(va >> 32);           /* addr hi */
   ac_cmdbuf_emit(data);               /* data lo */
   ac_cmdbuf_emit(data >> 32);         /* data hi */
   ac_cmdbuf_emit(compare_data);       /* compare data lo */
   ac_cmdbuf_emit(compare_data >> 32); /* compare data hi */
   ac_cmdbuf_emit(10);                 /* loop interval */
   ac_cmdbuf_end();
}

void
ac_emit_cp_nop(struct ac_cmdbuf *cs, uint32_t value)
{
   ac_cmdbuf_begin(cs);
   ac_cmdbuf_emit(PKT3(PKT3_NOP, 0, 0));
   ac_cmdbuf_emit(value);
   ac_cmdbuf_end();
}

void
ac_emit_cp_load_context_reg_index(struct ac_cmdbuf *cs, uint32_t reg,
                                  uint32_t reg_count, uint64_t va,
                                  bool predicate)
{
   assert(reg_count);

   ac_cmdbuf_begin(cs);
   ac_cmdbuf_emit(PKT3(PKT3_LOAD_CONTEXT_REG_INDEX, 3, predicate));
   ac_cmdbuf_emit(va);
   ac_cmdbuf_emit(va >> 32);
   ac_cmdbuf_emit((reg - SI_CONTEXT_REG_OFFSET) >> 2);
   ac_cmdbuf_emit(reg_count); /* in DWORDS */
   ac_cmdbuf_end();
}
src/amd/common/ac_cmdbuf_cp.h (new file, 109 lines)
@@ -0,0 +1,109 @@
/*
 * Copyright 2012 Advanced Micro Devices, Inc.
 *
 * SPDX-License-Identifier: MIT
 */

#ifndef AC_CMDBUF_CP_H
#define AC_CMDBUF_CP_H

#include <inttypes.h>
#include <stdbool.h>

#include "amd_family.h"

#include "util/macros.h"

#ifdef __cplusplus
extern "C" {
#endif

struct ac_cmdbuf;
struct radeon_info;

void
ac_emit_cp_cond_exec(struct ac_cmdbuf *cs, enum amd_gfx_level gfx_level,
                     uint64_t va, uint32_t count);

void
ac_emit_cp_write_data_head(struct ac_cmdbuf *cs, uint32_t engine_sel,
                           uint32_t dst_sel, uint64_t va, uint32_t size,
                           bool predicate);

void
ac_emit_cp_write_data(struct ac_cmdbuf *cs, uint32_t engine_sel,
                      uint32_t dst_sel, uint64_t va, uint32_t size,
                      const uint32_t *data, bool predicate);

void
ac_emit_cp_write_data_imm(struct ac_cmdbuf *cs, unsigned engine_sel,
                          uint64_t va, uint32_t value);

void
ac_emit_cp_wait_mem(struct ac_cmdbuf *cs, uint64_t va, uint32_t ref,
                    uint32_t mask, unsigned flags);

void
ac_emit_cp_acquire_mem_pws(struct ac_cmdbuf *cs, ASSERTED enum amd_gfx_level gfx_level,
                           ASSERTED enum amd_ip_type ip_type, uint32_t event_type,
                           uint32_t stage_sel, uint32_t count,
                           uint32_t gcr_cntl);

void
ac_emit_cp_release_mem_pws(struct ac_cmdbuf *cs, ASSERTED enum amd_gfx_level gfx_level,
                           ASSERTED enum amd_ip_type ip_type, uint32_t event_type,
                           uint32_t gcr_cntl);

enum ac_cp_copy_data_flags {
   AC_CP_COPY_DATA_WR_CONFIRM = 1u << 0,
   AC_CP_COPY_DATA_COUNT_SEL = 1u << 1, /* 64 bits */
   AC_CP_COPY_DATA_ENGINE_PFP = 1u << 2,
};

void
ac_emit_cp_copy_data(struct ac_cmdbuf *cs, uint32_t src_sel, uint32_t dst_sel,
                     uint64_t src_va, uint64_t dst_va,
                     enum ac_cp_copy_data_flags flags, bool predicate);

void
ac_emit_cp_pfp_sync_me(struct ac_cmdbuf *cs, bool predicate);

void
ac_emit_cp_set_predication(struct ac_cmdbuf *cs, enum amd_gfx_level gfx_level,
                           uint64_t va, uint32_t op);

void
ac_emit_cp_gfx11_ge_rings(struct ac_cmdbuf *cs, const struct radeon_info *info,
                          uint64_t attr_ring_va, bool enable_gfx12_partial_hiz_wa);

void
ac_emit_cp_tess_rings(struct ac_cmdbuf *cs, const struct radeon_info *info,
                      uint64_t va);

void
ac_emit_cp_gfx_scratch(struct ac_cmdbuf *cs, enum amd_gfx_level gfx_level,
                       uint64_t va, uint32_t size);

void
ac_emit_cp_acquire_mem(struct ac_cmdbuf *cs, enum amd_gfx_level gfx_level,
                       enum amd_ip_type ip_type, uint32_t engine,
                       uint32_t gcr_cntl);

void
ac_emit_cp_atomic_mem(struct ac_cmdbuf *cs, uint32_t atomic_op,
                      uint32_t atomic_cmd, uint64_t va, uint64_t data,
                      uint64_t compare_data);

void
ac_emit_cp_nop(struct ac_cmdbuf *cs, uint32_t value);

void
ac_emit_cp_load_context_reg_index(struct ac_cmdbuf *cs, uint32_t reg,
                                  uint32_t reg_count, uint64_t va,
                                  bool predicate);

#ifdef __cplusplus
}
#endif

#endif
src/amd/common/meson.build
@@ -56,6 +56,8 @@ amd_common_files = files(
   'ac_binary.h',
   'ac_cmdbuf.c',
   'ac_cmdbuf.h',
+  'ac_cmdbuf_cp.c',
+  'ac_cmdbuf_cp.h',
   'ac_cmdbuf_sdma.c',
   'ac_cmdbuf_sdma.h',
   'ac_shader_args.c',
[radv source file; name not preserved by the mirror]
@@ -17,6 +17,7 @@
 #include "radv_sdma.h"
 #include "sid.h"

+#include "ac_cmdbuf_cp.h"
 #include "ac_cmdbuf_sdma.h"

 static inline unsigned
src/gallium/drivers/radeonsi/si_build_pm4.h
@@ -11,6 +11,8 @@
 #ifndef SI_BUILD_PM4_H
 #define SI_BUILD_PM4_H

+#include "ac_cmdbuf_cp.h"
+
 #include "si_pipe.h"
 #include "sid.h"