amd: move CP emit helpers to ac_cmdbuf_cp.c/h

Seems more organized this way.

Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/37881>
Samuel Pitoiset 2025-10-15 11:42:01 +02:00
parent e0ffc41d9a
commit 7cd12e5c6a
7 changed files with 579 additions and 532 deletions


@@ -858,457 +858,6 @@ ac_init_graphics_preamble_state(const struct ac_preamble_state *state,
}
}
void
ac_emit_cp_cond_exec(struct ac_cmdbuf *cs, enum amd_gfx_level gfx_level,
uint64_t va, uint32_t count)
{
ac_cmdbuf_begin(cs);
if (gfx_level >= GFX7) {
ac_cmdbuf_emit(PKT3(PKT3_COND_EXEC, 3, 0));
ac_cmdbuf_emit(va);
ac_cmdbuf_emit(va >> 32);
ac_cmdbuf_emit(0);
ac_cmdbuf_emit(count);
} else {
ac_cmdbuf_emit(PKT3(PKT3_COND_EXEC, 2, 0));
ac_cmdbuf_emit(va);
ac_cmdbuf_emit(va >> 32);
ac_cmdbuf_emit(count);
}
ac_cmdbuf_end();
}
void
ac_emit_cp_write_data_head(struct ac_cmdbuf *cs, uint32_t engine_sel,
uint32_t dst_sel, uint64_t va, uint32_t size,
bool predicate)
{
ac_cmdbuf_begin(cs);
ac_cmdbuf_emit(PKT3(PKT3_WRITE_DATA, 2 + size, predicate));
ac_cmdbuf_emit(S_370_DST_SEL(dst_sel) |
S_370_WR_CONFIRM(1) |
S_370_ENGINE_SEL(engine_sel));
ac_cmdbuf_emit(va);
ac_cmdbuf_emit(va >> 32);
ac_cmdbuf_end();
}
void
ac_emit_cp_write_data(struct ac_cmdbuf *cs, uint32_t engine_sel,
uint32_t dst_sel, uint64_t va, uint32_t size,
const uint32_t *data, bool predicate)
{
ac_emit_cp_write_data_head(cs, engine_sel, dst_sel, va, size, predicate);
ac_cmdbuf_begin(cs);
ac_cmdbuf_emit_array(data, size);
ac_cmdbuf_end();
}
void
ac_emit_cp_write_data_imm(struct ac_cmdbuf *cs, unsigned engine_sel,
uint64_t va, uint32_t value)
{
ac_emit_cp_write_data(cs, engine_sel, V_370_MEM, va, 1, &value, false);
}
void
ac_emit_cp_wait_mem(struct ac_cmdbuf *cs, uint64_t va, uint32_t ref,
uint32_t mask, unsigned flags)
{
ac_cmdbuf_begin(cs);
ac_cmdbuf_emit(PKT3(PKT3_WAIT_REG_MEM, 5, 0));
ac_cmdbuf_emit(WAIT_REG_MEM_MEM_SPACE(1) | flags);
ac_cmdbuf_emit(va);
ac_cmdbuf_emit(va >> 32);
ac_cmdbuf_emit(ref); /* reference value */
ac_cmdbuf_emit(mask); /* mask */
ac_cmdbuf_emit(4); /* poll interval */
ac_cmdbuf_end();
}
static bool
is_ts_event(unsigned event_type)
{
return event_type == V_028A90_CACHE_FLUSH_TS ||
event_type == V_028A90_CACHE_FLUSH_AND_INV_TS_EVENT ||
event_type == V_028A90_BOTTOM_OF_PIPE_TS ||
event_type == V_028A90_FLUSH_AND_INV_DB_DATA_TS ||
event_type == V_028A90_FLUSH_AND_INV_CB_DATA_TS;
}
/* This will wait or insert into the pipeline a wait for a previous
* RELEASE_MEM PWS event.
*
* "event_type" must be the same as the RELEASE_MEM PWS event.
*
* "stage_sel" determines when the waiting happens. It can be CP_PFP, CP_ME,
* PRE_SHADER, PRE_DEPTH, or PRE_PIX_SHADER, allowing to wait later in the
* pipeline instead of completely idling the hw at the frontend.
*
* "gcr_cntl" must be 0 if not waiting in PFP or ME. When waiting later in the
* pipeline, any cache flushes must be part of RELEASE_MEM, not ACQUIRE_MEM.
*
* "distance" determines how many RELEASE_MEM PWS events ago it should wait
* for, minus one (starting from 0). There are 3 event types: PS_DONE,
* CS_DONE, and TS events. The distance counter increments separately for each
* type, so 0 with PS_DONE means wait for the last PS_DONE event, while 0 with
* *_TS means wait for the last TS event (even if it's a different TS event
* because all TS events share the same counter).
*
* PRE_SHADER waits before the first shader that has IMAGE_OP=1, while
* PRE_PIX_SHADER waits before PS if it has IMAGE_OP=1 (IMAGE_OP should really
* be called SYNC_ENABLE). PRE_DEPTH waits before depth/stencil tests.
*
* PRE_COLOR also exists but shouldn't be used because it can hang. It's
* recommended to use PRE_PIX_SHADER instead, which means all PS that have
* color exports with enabled color buffers, non-zero colormask, and non-zero
* sample mask must have IMAGE_OP=1 to enable the sync before PS.
*
* Waiting for a PWS fence that was generated by a previous IB is valid, but
* if there is an IB from another process in between and that IB also inserted
* a PWS fence, the hw will wait for the newer fence instead because the PWS
* counter was incremented.
*/
void
ac_emit_cp_acquire_mem_pws(struct ac_cmdbuf *cs, ASSERTED enum amd_gfx_level gfx_level,
ASSERTED enum amd_ip_type ip_type, uint32_t event_type,
uint32_t stage_sel, uint32_t count,
uint32_t gcr_cntl)
{
assert(gfx_level >= GFX11 && ip_type == AMD_IP_GFX);
const bool ts = is_ts_event(event_type);
const bool ps_done = event_type == V_028A90_PS_DONE;
const bool cs_done = event_type == V_028A90_CS_DONE;
const uint32_t counter_sel = ts ? V_580_TS_SELECT : ps_done ? V_580_PS_SELECT : V_580_CS_SELECT;
assert((int)ts + (int)cs_done + (int)ps_done == 1);
assert(!gcr_cntl || stage_sel == V_580_CP_PFP || stage_sel == V_580_CP_ME);
assert(stage_sel != V_580_PRE_COLOR);
ac_cmdbuf_begin(cs);
ac_cmdbuf_emit(PKT3(PKT3_ACQUIRE_MEM, 6, 0));
ac_cmdbuf_emit(S_580_PWS_STAGE_SEL(stage_sel) |
S_580_PWS_COUNTER_SEL(counter_sel) |
S_580_PWS_ENA2(1) |
S_580_PWS_COUNT(count));
ac_cmdbuf_emit(0xffffffff); /* GCR_SIZE */
ac_cmdbuf_emit(0x01ffffff); /* GCR_SIZE_HI */
ac_cmdbuf_emit(0); /* GCR_BASE_LO */
ac_cmdbuf_emit(0); /* GCR_BASE_HI */
ac_cmdbuf_emit(S_585_PWS_ENA(1));
ac_cmdbuf_emit(gcr_cntl); /* GCR_CNTL (this has no effect if PWS_STAGE_SEL isn't PFP or ME) */
ac_cmdbuf_end();
}
/* Insert CS_DONE, PS_DONE, or a *_TS event into the pipeline, which will
* signal after the work indicated by the event is complete, which optionally
* includes flushing caches using "gcr_cntl" after the completion of the work.
* *_TS events are always signaled at the end of the pipeline, while CS_DONE
* and PS_DONE are signaled when those shaders finish. This call only inserts
* the event into the pipeline. It doesn't wait for anything and it doesn't
* execute anything immediately. The only way to wait for the event completion
* is to call ac_emit_cp_acquire_mem_pws with the same "event_type".
*/
void
ac_emit_cp_release_mem_pws(struct ac_cmdbuf *cs, ASSERTED enum amd_gfx_level gfx_level,
ASSERTED enum amd_ip_type ip_type, uint32_t event_type,
uint32_t gcr_cntl)
{
assert(gfx_level >= GFX11 && ip_type == AMD_IP_GFX);
/* Extract GCR_CNTL fields because the encoding is different in RELEASE_MEM. */
assert(G_586_GLI_INV(gcr_cntl) == 0);
assert(G_586_GL1_RANGE(gcr_cntl) == 0);
const uint32_t glm_wb = G_586_GLM_WB(gcr_cntl);
const uint32_t glm_inv = G_586_GLM_INV(gcr_cntl);
const uint32_t glk_wb = G_586_GLK_WB(gcr_cntl);
const uint32_t glk_inv = G_586_GLK_INV(gcr_cntl);
const uint32_t glv_inv = G_586_GLV_INV(gcr_cntl);
const uint32_t gl1_inv = G_586_GL1_INV(gcr_cntl);
assert(G_586_GL2_US(gcr_cntl) == 0);
assert(G_586_GL2_RANGE(gcr_cntl) == 0);
assert(G_586_GL2_DISCARD(gcr_cntl) == 0);
const uint32_t gl2_inv = G_586_GL2_INV(gcr_cntl);
const uint32_t gl2_wb = G_586_GL2_WB(gcr_cntl);
const uint32_t gcr_seq = G_586_SEQ(gcr_cntl);
const bool ts = is_ts_event(event_type);
ac_cmdbuf_begin(cs);
ac_cmdbuf_emit(PKT3(PKT3_RELEASE_MEM, 6, 0));
ac_cmdbuf_emit(S_490_EVENT_TYPE(event_type) |
S_490_EVENT_INDEX(ts ? 5 : 6) |
S_490_GLM_WB(glm_wb) |
S_490_GLM_INV(glm_inv) |
S_490_GLV_INV(glv_inv) |
S_490_GL1_INV(gl1_inv) |
S_490_GL2_INV(gl2_inv) |
S_490_GL2_WB(gl2_wb) |
S_490_SEQ(gcr_seq) |
S_490_GLK_WB(glk_wb) |
S_490_GLK_INV(glk_inv) |
S_490_PWS_ENABLE(1));
ac_cmdbuf_emit(0); /* DST_SEL, INT_SEL, DATA_SEL */
ac_cmdbuf_emit(0); /* ADDRESS_LO */
ac_cmdbuf_emit(0); /* ADDRESS_HI */
ac_cmdbuf_emit(0); /* DATA_LO */
ac_cmdbuf_emit(0); /* DATA_HI */
ac_cmdbuf_emit(0); /* INT_CTXID */
ac_cmdbuf_end();
}
void
ac_emit_cp_copy_data(struct ac_cmdbuf *cs, uint32_t src_sel, uint32_t dst_sel,
uint64_t src_va, uint64_t dst_va,
enum ac_cp_copy_data_flags flags, bool predicate)
{
uint32_t dword0 = COPY_DATA_SRC_SEL(src_sel) |
COPY_DATA_DST_SEL(dst_sel);
if (flags & AC_CP_COPY_DATA_WR_CONFIRM)
dword0 |= COPY_DATA_WR_CONFIRM;
if (flags & AC_CP_COPY_DATA_COUNT_SEL)
dword0 |= COPY_DATA_COUNT_SEL;
if (flags & AC_CP_COPY_DATA_ENGINE_PFP)
dword0 |= COPY_DATA_ENGINE_PFP;
ac_cmdbuf_begin(cs);
ac_cmdbuf_emit(PKT3(PKT3_COPY_DATA, 4, predicate));
ac_cmdbuf_emit(dword0);
ac_cmdbuf_emit(src_va);
ac_cmdbuf_emit(src_va >> 32);
ac_cmdbuf_emit(dst_va);
ac_cmdbuf_emit(dst_va >> 32);
ac_cmdbuf_end();
}
void
ac_emit_cp_pfp_sync_me(struct ac_cmdbuf *cs, bool predicate)
{
ac_cmdbuf_begin(cs);
ac_cmdbuf_emit(PKT3(PKT3_PFP_SYNC_ME, 0, predicate));
ac_cmdbuf_emit(0);
ac_cmdbuf_end();
}
void
ac_emit_cp_set_predication(struct ac_cmdbuf *cs, enum amd_gfx_level gfx_level,
uint64_t va, uint32_t op)
{
ac_cmdbuf_begin(cs);
if (gfx_level >= GFX9) {
ac_cmdbuf_emit(PKT3(PKT3_SET_PREDICATION, 2, 0));
ac_cmdbuf_emit(op);
ac_cmdbuf_emit(va);
ac_cmdbuf_emit(va >> 32);
} else {
ac_cmdbuf_emit(PKT3(PKT3_SET_PREDICATION, 1, 0));
ac_cmdbuf_emit(va);
ac_cmdbuf_emit(op | ((va >> 32) & 0xFF));
}
ac_cmdbuf_end();
}
void
ac_emit_cp_gfx11_ge_rings(struct ac_cmdbuf *cs, const struct radeon_info *info,
uint64_t attr_ring_va, bool enable_gfx12_partial_hiz_wa)
{
assert(info->gfx_level >= GFX11);
assert((attr_ring_va >> 32) == info->address32_hi);
ac_cmdbuf_begin(cs);
ac_cmdbuf_set_uconfig_reg_seq(R_031110_SPI_GS_THROTTLE_CNTL1, 4);
ac_cmdbuf_emit(0x12355123);
ac_cmdbuf_emit(0x1544D);
ac_cmdbuf_emit(attr_ring_va >> 16);
ac_cmdbuf_emit(S_03111C_MEM_SIZE((info->attribute_ring_size_per_se >> 16) - 1) |
S_03111C_BIG_PAGE(info->discardable_allows_big_page) |
S_03111C_L1_POLICY(1));
if (info->gfx_level >= GFX12) {
const uint64_t pos_va = attr_ring_va + info->pos_ring_offset;
const uint64_t prim_va = attr_ring_va + info->prim_ring_offset;
/* When one of these 4 registers is updated, all 4 must be updated. */
ac_cmdbuf_set_uconfig_reg_seq(R_0309A0_GE_POS_RING_BASE, 4);
ac_cmdbuf_emit(pos_va >> 16);
ac_cmdbuf_emit(S_0309A4_MEM_SIZE(info->pos_ring_size_per_se >> 5));
ac_cmdbuf_emit(prim_va >> 16);
ac_cmdbuf_emit(S_0309AC_MEM_SIZE(info->prim_ring_size_per_se >> 5) |
S_0309AC_SCOPE(gfx12_scope_device) |
S_0309AC_PAF_TEMPORAL(gfx12_store_high_temporal_stay_dirty) |
S_0309AC_PAB_TEMPORAL(gfx12_load_last_use_discard) |
S_0309AC_SPEC_DATA_READ(gfx12_spec_read_auto) |
S_0309AC_FORCE_SE_SCOPE(1) |
S_0309AC_PAB_NOFILL(1));
if (info->gfx_level == GFX12 && info->pfp_fw_version >= 2680) {
/* Mitigate the HiZ GPU hang by increasing a timeout when
* BOTTOM_OF_PIPE_TS is used as the workaround. This must be emitted
* when the gfx queue is idle.
*/
const uint32_t timeout = enable_gfx12_partial_hiz_wa ? 0xfff : 0;
ac_cmdbuf_emit(PKT3(PKT3_UPDATE_DB_SUMMARIZER_TIMEOUT, 0, 0));
ac_cmdbuf_emit(S_EF1_SUMM_CNTL_EVICT_TIMEOUT(timeout));
}
}
ac_cmdbuf_end();
}
void
ac_emit_cp_tess_rings(struct ac_cmdbuf *cs, const struct radeon_info *info,
uint64_t attr_ring_va)
{
const uint64_t va = attr_ring_va + info->tess_offchip_ring_size;
uint32_t tf_ring_size = info->tess_factor_ring_size / 4;
if (info->gfx_level >= GFX11) {
/* TF_RING_SIZE is per SE on GFX11. */
tf_ring_size /= info->max_se;
}
assert((tf_ring_size & C_030938_SIZE) == 0);
ac_cmdbuf_begin(cs);
if (info->gfx_level >= GFX7) {
ac_cmdbuf_set_uconfig_reg_seq(R_030938_VGT_TF_RING_SIZE, 3);
ac_cmdbuf_emit(S_030938_SIZE(tf_ring_size));
ac_cmdbuf_emit(info->hs_offchip_param);
ac_cmdbuf_emit(va >> 8);
if (info->gfx_level >= GFX12) {
ac_cmdbuf_set_uconfig_reg(R_03099C_VGT_TF_MEMORY_BASE_HI, S_03099C_BASE_HI(va >> 40));
} else if (info->gfx_level >= GFX10) {
ac_cmdbuf_set_uconfig_reg(R_030984_VGT_TF_MEMORY_BASE_HI, S_030984_BASE_HI(va >> 40));
} else if (info->gfx_level == GFX9) {
ac_cmdbuf_set_uconfig_reg(R_030944_VGT_TF_MEMORY_BASE_HI, S_030944_BASE_HI(va >> 40));
}
} else {
ac_cmdbuf_set_config_reg(R_008988_VGT_TF_RING_SIZE, S_008988_SIZE(tf_ring_size));
ac_cmdbuf_set_config_reg(R_0089B8_VGT_TF_MEMORY_BASE, va >> 8);
ac_cmdbuf_set_config_reg(R_0089B0_VGT_HS_OFFCHIP_PARAM, info->hs_offchip_param);
}
ac_cmdbuf_end();
}
void
ac_emit_cp_gfx_scratch(struct ac_cmdbuf *cs, enum amd_gfx_level gfx_level,
uint64_t va, uint32_t size)
{
ac_cmdbuf_begin(cs);
if (gfx_level >= GFX11) {
ac_cmdbuf_set_context_reg_seq(R_0286E8_SPI_TMPRING_SIZE, 3);
ac_cmdbuf_emit(size);
ac_cmdbuf_emit(va >> 8);
ac_cmdbuf_emit(va >> 40);
} else {
ac_cmdbuf_set_context_reg(R_0286E8_SPI_TMPRING_SIZE, size);
}
ac_cmdbuf_end();
}
/* Execute plain ACQUIRE_MEM that just flushes caches. This optionally waits
* for idle on older chips. "engine" determines whether to sync in PFP or ME.
*/
void
ac_emit_cp_acquire_mem(struct ac_cmdbuf *cs, enum amd_gfx_level gfx_level,
enum amd_ip_type ip_type, uint32_t engine,
uint32_t gcr_cntl)
{
assert(engine == V_580_CP_PFP || engine == V_580_CP_ME);
assert(gcr_cntl);
ac_cmdbuf_begin(cs);
if (gfx_level >= GFX10) {
/* ACQUIRE_MEM in PFP is implemented as ACQUIRE_MEM in ME + PFP_SYNC_ME. */
const uint32_t engine_flag = engine == V_580_CP_ME ? BITFIELD_BIT(31) : 0;
/* Flush caches. This doesn't wait for idle. */
ac_cmdbuf_emit(PKT3(PKT3_ACQUIRE_MEM, 6, 0));
ac_cmdbuf_emit(engine_flag); /* which engine to use */
ac_cmdbuf_emit(0xffffffff); /* CP_COHER_SIZE */
ac_cmdbuf_emit(0x01ffffff); /* CP_COHER_SIZE_HI */
ac_cmdbuf_emit(0); /* CP_COHER_BASE */
ac_cmdbuf_emit(0); /* CP_COHER_BASE_HI */
ac_cmdbuf_emit(0x0000000A); /* POLL_INTERVAL */
ac_cmdbuf_emit(gcr_cntl); /* GCR_CNTL */
} else {
const bool is_mec = gfx_level >= GFX7 && ip_type == AMD_IP_COMPUTE;
if (gfx_level == GFX9 || is_mec) {
/* Flush caches and wait for the caches to assert idle. */
ac_cmdbuf_emit(PKT3(PKT3_ACQUIRE_MEM, 5, 0) | PKT3_SHADER_TYPE_S(is_mec));
ac_cmdbuf_emit(gcr_cntl); /* CP_COHER_CNTL */
ac_cmdbuf_emit(0xffffffff); /* CP_COHER_SIZE */
ac_cmdbuf_emit(0xffffff); /* CP_COHER_SIZE_HI */
ac_cmdbuf_emit(0); /* CP_COHER_BASE */
ac_cmdbuf_emit(0); /* CP_COHER_BASE_HI */
ac_cmdbuf_emit(0x0000000A); /* POLL_INTERVAL */
} else {
/* ACQUIRE_MEM is only required on the compute ring. */
ac_cmdbuf_emit(PKT3(PKT3_SURFACE_SYNC, 3, 0));
ac_cmdbuf_emit(gcr_cntl); /* CP_COHER_CNTL */
ac_cmdbuf_emit(0xffffffff); /* CP_COHER_SIZE */
ac_cmdbuf_emit(0); /* CP_COHER_BASE */
ac_cmdbuf_emit(0x0000000A); /* POLL_INTERVAL */
}
}
ac_cmdbuf_end();
}
void
ac_emit_cp_atomic_mem(struct ac_cmdbuf *cs, uint32_t atomic_op,
uint32_t atomic_cmd, uint64_t va, uint64_t data,
uint64_t compare_data)
{
ac_cmdbuf_begin(cs);
ac_cmdbuf_emit(PKT3(PKT3_ATOMIC_MEM, 7, 0));
ac_cmdbuf_emit(ATOMIC_OP(atomic_op) |
ATOMIC_COMMAND(atomic_cmd));
ac_cmdbuf_emit(va); /* addr lo */
ac_cmdbuf_emit(va >> 32); /* addr hi */
ac_cmdbuf_emit(data); /* data lo */
ac_cmdbuf_emit(data >> 32); /* data hi */
ac_cmdbuf_emit(compare_data); /* compare data lo */
ac_cmdbuf_emit(compare_data >> 32); /* compare data hi */
ac_cmdbuf_emit(10); /* loop interval */
ac_cmdbuf_end();
}
void
ac_emit_cp_nop(struct ac_cmdbuf *cs, uint32_t value)
{
ac_cmdbuf_begin(cs);
ac_cmdbuf_emit(PKT3(PKT3_NOP, 0, 0));
ac_cmdbuf_emit(value);
ac_cmdbuf_end();
}
void
ac_emit_cp_load_context_reg_index(struct ac_cmdbuf *cs, uint32_t reg,
uint32_t reg_count, uint64_t va,
bool predicate)
{
assert(reg_count);
ac_cmdbuf_begin(cs);
ac_cmdbuf_emit(PKT3(PKT3_LOAD_CONTEXT_REG_INDEX, 3, predicate));
ac_cmdbuf_emit(va);
ac_cmdbuf_emit(va >> 32);
ac_cmdbuf_emit((reg - SI_CONTEXT_REG_OFFSET) >> 2);
ac_cmdbuf_emit(reg_count); /* in DWORDS */
ac_cmdbuf_end();
}
void
ac_cmdbuf_flush_vgt_streamout(struct ac_cmdbuf *cs, enum amd_gfx_level gfx_level)
{


@@ -120,87 +120,6 @@ void
ac_init_graphics_preamble_state(const struct ac_preamble_state *state,
struct ac_pm4_state *pm4);
void
ac_emit_cp_cond_exec(struct ac_cmdbuf *cs, enum amd_gfx_level gfx_level,
uint64_t va, uint32_t count);
void
ac_emit_cp_write_data_head(struct ac_cmdbuf *cs, uint32_t engine_sel,
uint32_t dst_sel, uint64_t va, uint32_t size,
bool predicate);
void
ac_emit_cp_write_data(struct ac_cmdbuf *cs, uint32_t engine_sel,
uint32_t dst_sel, uint64_t va, uint32_t size,
const uint32_t *data, bool predicate);
void
ac_emit_cp_write_data_imm(struct ac_cmdbuf *cs, unsigned engine_sel,
uint64_t va, uint32_t value);
void
ac_emit_cp_wait_mem(struct ac_cmdbuf *cs, uint64_t va, uint32_t ref,
uint32_t mask, unsigned flags);
void
ac_emit_cp_acquire_mem_pws(struct ac_cmdbuf *cs, ASSERTED enum amd_gfx_level gfx_level,
ASSERTED enum amd_ip_type ip_type, uint32_t event_type,
uint32_t stage_sel, uint32_t count,
uint32_t gcr_cntl);
void
ac_emit_cp_release_mem_pws(struct ac_cmdbuf *cs, ASSERTED enum amd_gfx_level gfx_level,
ASSERTED enum amd_ip_type ip_type, uint32_t event_type,
uint32_t gcr_cntl);
enum ac_cp_copy_data_flags {
AC_CP_COPY_DATA_WR_CONFIRM = 1u << 0,
AC_CP_COPY_DATA_COUNT_SEL = 1u << 1, /* 64 bits */
AC_CP_COPY_DATA_ENGINE_PFP = 1u << 2,
};
void
ac_emit_cp_copy_data(struct ac_cmdbuf *cs, uint32_t src_sel, uint32_t dst_sel,
uint64_t src_va, uint64_t dst_va,
enum ac_cp_copy_data_flags flags, bool predicate);
void
ac_emit_cp_pfp_sync_me(struct ac_cmdbuf *cs, bool predicate);
void
ac_emit_cp_set_predication(struct ac_cmdbuf *cs, enum amd_gfx_level gfx_level,
uint64_t va, uint32_t op);
void
ac_emit_cp_gfx11_ge_rings(struct ac_cmdbuf *cs, const struct radeon_info *info,
uint64_t attr_ring_va, bool enable_gfx12_partial_hiz_wa);
void
ac_emit_cp_tess_rings(struct ac_cmdbuf *cs, const struct radeon_info *info,
uint64_t va);
void
ac_emit_cp_gfx_scratch(struct ac_cmdbuf *cs, enum amd_gfx_level gfx_level,
uint64_t va, uint32_t size);
void
ac_emit_cp_acquire_mem(struct ac_cmdbuf *cs, enum amd_gfx_level gfx_level,
enum amd_ip_type ip_type, uint32_t engine,
uint32_t gcr_cntl);
void
ac_emit_cp_atomic_mem(struct ac_cmdbuf *cs, uint32_t atomic_op,
uint32_t atomic_cmd, uint64_t va, uint64_t data,
uint64_t compare_data);
void
ac_emit_cp_nop(struct ac_cmdbuf *cs, uint32_t value);
void
ac_emit_cp_load_context_reg_index(struct ac_cmdbuf *cs, uint32_t reg,
uint32_t reg_count, uint64_t va,
bool predicate);
void
ac_cmdbuf_flush_vgt_streamout(struct ac_cmdbuf *cs, enum amd_gfx_level gfx_level);


@@ -0,0 +1,465 @@
/*
* Copyright 2012 Advanced Micro Devices, Inc.
* Copyright 2024 Valve Corporation
*
* SPDX-License-Identifier: MIT
*/
#include "ac_cmdbuf.h"
#include "ac_cmdbuf_cp.h"
#include "ac_gpu_info.h"
#include "ac_shader_util.h"
#include "amd_family.h"
#include "sid.h"
void
ac_emit_cp_cond_exec(struct ac_cmdbuf *cs, enum amd_gfx_level gfx_level,
uint64_t va, uint32_t count)
{
ac_cmdbuf_begin(cs);
if (gfx_level >= GFX7) {
ac_cmdbuf_emit(PKT3(PKT3_COND_EXEC, 3, 0));
ac_cmdbuf_emit(va);
ac_cmdbuf_emit(va >> 32);
ac_cmdbuf_emit(0);
ac_cmdbuf_emit(count);
} else {
ac_cmdbuf_emit(PKT3(PKT3_COND_EXEC, 2, 0));
ac_cmdbuf_emit(va);
ac_cmdbuf_emit(va >> 32);
ac_cmdbuf_emit(count);
}
ac_cmdbuf_end();
}
void
ac_emit_cp_write_data_head(struct ac_cmdbuf *cs, uint32_t engine_sel,
uint32_t dst_sel, uint64_t va, uint32_t size,
bool predicate)
{
ac_cmdbuf_begin(cs);
ac_cmdbuf_emit(PKT3(PKT3_WRITE_DATA, 2 + size, predicate));
ac_cmdbuf_emit(S_370_DST_SEL(dst_sel) |
S_370_WR_CONFIRM(1) |
S_370_ENGINE_SEL(engine_sel));
ac_cmdbuf_emit(va);
ac_cmdbuf_emit(va >> 32);
ac_cmdbuf_end();
}
void
ac_emit_cp_write_data(struct ac_cmdbuf *cs, uint32_t engine_sel,
uint32_t dst_sel, uint64_t va, uint32_t size,
const uint32_t *data, bool predicate)
{
ac_emit_cp_write_data_head(cs, engine_sel, dst_sel, va, size, predicate);
ac_cmdbuf_begin(cs);
ac_cmdbuf_emit_array(data, size);
ac_cmdbuf_end();
}
void
ac_emit_cp_write_data_imm(struct ac_cmdbuf *cs, unsigned engine_sel,
uint64_t va, uint32_t value)
{
ac_emit_cp_write_data(cs, engine_sel, V_370_MEM, va, 1, &value, false);
}
void
ac_emit_cp_wait_mem(struct ac_cmdbuf *cs, uint64_t va, uint32_t ref,
uint32_t mask, unsigned flags)
{
ac_cmdbuf_begin(cs);
ac_cmdbuf_emit(PKT3(PKT3_WAIT_REG_MEM, 5, 0));
ac_cmdbuf_emit(WAIT_REG_MEM_MEM_SPACE(1) | flags);
ac_cmdbuf_emit(va);
ac_cmdbuf_emit(va >> 32);
ac_cmdbuf_emit(ref); /* reference value */
ac_cmdbuf_emit(mask); /* mask */
ac_cmdbuf_emit(4); /* poll interval */
ac_cmdbuf_end();
}
static bool
is_ts_event(unsigned event_type)
{
return event_type == V_028A90_CACHE_FLUSH_TS ||
event_type == V_028A90_CACHE_FLUSH_AND_INV_TS_EVENT ||
event_type == V_028A90_BOTTOM_OF_PIPE_TS ||
event_type == V_028A90_FLUSH_AND_INV_DB_DATA_TS ||
event_type == V_028A90_FLUSH_AND_INV_CB_DATA_TS;
}
/* This will wait or insert into the pipeline a wait for a previous
* RELEASE_MEM PWS event.
*
* "event_type" must be the same as the RELEASE_MEM PWS event.
*
* "stage_sel" determines when the waiting happens. It can be CP_PFP, CP_ME,
* PRE_SHADER, PRE_DEPTH, or PRE_PIX_SHADER, allowing to wait later in the
* pipeline instead of completely idling the hw at the frontend.
*
* "gcr_cntl" must be 0 if not waiting in PFP or ME. When waiting later in the
* pipeline, any cache flushes must be part of RELEASE_MEM, not ACQUIRE_MEM.
*
* "distance" determines how many RELEASE_MEM PWS events ago it should wait
* for, minus one (starting from 0). There are 3 event types: PS_DONE,
* CS_DONE, and TS events. The distance counter increments separately for each
* type, so 0 with PS_DONE means wait for the last PS_DONE event, while 0 with
* *_TS means wait for the last TS event (even if it's a different TS event
* because all TS events share the same counter).
*
* PRE_SHADER waits before the first shader that has IMAGE_OP=1, while
* PRE_PIX_SHADER waits before PS if it has IMAGE_OP=1 (IMAGE_OP should really
* be called SYNC_ENABLE). PRE_DEPTH waits before depth/stencil tests.
*
* PRE_COLOR also exists but shouldn't be used because it can hang. It's
* recommended to use PRE_PIX_SHADER instead, which means all PS that have
* color exports with enabled color buffers, non-zero colormask, and non-zero
* sample mask must have IMAGE_OP=1 to enable the sync before PS.
*
* Waiting for a PWS fence that was generated by a previous IB is valid, but
* if there is an IB from another process in between and that IB also inserted
* a PWS fence, the hw will wait for the newer fence instead because the PWS
* counter was incremented.
*/
void
ac_emit_cp_acquire_mem_pws(struct ac_cmdbuf *cs, ASSERTED enum amd_gfx_level gfx_level,
ASSERTED enum amd_ip_type ip_type, uint32_t event_type,
uint32_t stage_sel, uint32_t count,
uint32_t gcr_cntl)
{
assert(gfx_level >= GFX11 && ip_type == AMD_IP_GFX);
const bool ts = is_ts_event(event_type);
const bool ps_done = event_type == V_028A90_PS_DONE;
const bool cs_done = event_type == V_028A90_CS_DONE;
const uint32_t counter_sel = ts ? V_580_TS_SELECT : ps_done ? V_580_PS_SELECT : V_580_CS_SELECT;
assert((int)ts + (int)cs_done + (int)ps_done == 1);
assert(!gcr_cntl || stage_sel == V_580_CP_PFP || stage_sel == V_580_CP_ME);
assert(stage_sel != V_580_PRE_COLOR);
ac_cmdbuf_begin(cs);
ac_cmdbuf_emit(PKT3(PKT3_ACQUIRE_MEM, 6, 0));
ac_cmdbuf_emit(S_580_PWS_STAGE_SEL(stage_sel) |
S_580_PWS_COUNTER_SEL(counter_sel) |
S_580_PWS_ENA2(1) |
S_580_PWS_COUNT(count));
ac_cmdbuf_emit(0xffffffff); /* GCR_SIZE */
ac_cmdbuf_emit(0x01ffffff); /* GCR_SIZE_HI */
ac_cmdbuf_emit(0); /* GCR_BASE_LO */
ac_cmdbuf_emit(0); /* GCR_BASE_HI */
ac_cmdbuf_emit(S_585_PWS_ENA(1));
ac_cmdbuf_emit(gcr_cntl); /* GCR_CNTL (this has no effect if PWS_STAGE_SEL isn't PFP or ME) */
ac_cmdbuf_end();
}
/* Insert CS_DONE, PS_DONE, or a *_TS event into the pipeline, which will
* signal after the work indicated by the event is complete, which optionally
* includes flushing caches using "gcr_cntl" after the completion of the work.
* *_TS events are always signaled at the end of the pipeline, while CS_DONE
* and PS_DONE are signaled when those shaders finish. This call only inserts
* the event into the pipeline. It doesn't wait for anything and it doesn't
* execute anything immediately. The only way to wait for the event completion
* is to call ac_emit_cp_acquire_mem_pws with the same "event_type".
*/
void
ac_emit_cp_release_mem_pws(struct ac_cmdbuf *cs, ASSERTED enum amd_gfx_level gfx_level,
ASSERTED enum amd_ip_type ip_type, uint32_t event_type,
uint32_t gcr_cntl)
{
assert(gfx_level >= GFX11 && ip_type == AMD_IP_GFX);
/* Extract GCR_CNTL fields because the encoding is different in RELEASE_MEM. */
assert(G_586_GLI_INV(gcr_cntl) == 0);
assert(G_586_GL1_RANGE(gcr_cntl) == 0);
const uint32_t glm_wb = G_586_GLM_WB(gcr_cntl);
const uint32_t glm_inv = G_586_GLM_INV(gcr_cntl);
const uint32_t glk_wb = G_586_GLK_WB(gcr_cntl);
const uint32_t glk_inv = G_586_GLK_INV(gcr_cntl);
const uint32_t glv_inv = G_586_GLV_INV(gcr_cntl);
const uint32_t gl1_inv = G_586_GL1_INV(gcr_cntl);
assert(G_586_GL2_US(gcr_cntl) == 0);
assert(G_586_GL2_RANGE(gcr_cntl) == 0);
assert(G_586_GL2_DISCARD(gcr_cntl) == 0);
const uint32_t gl2_inv = G_586_GL2_INV(gcr_cntl);
const uint32_t gl2_wb = G_586_GL2_WB(gcr_cntl);
const uint32_t gcr_seq = G_586_SEQ(gcr_cntl);
const bool ts = is_ts_event(event_type);
ac_cmdbuf_begin(cs);
ac_cmdbuf_emit(PKT3(PKT3_RELEASE_MEM, 6, 0));
ac_cmdbuf_emit(S_490_EVENT_TYPE(event_type) |
S_490_EVENT_INDEX(ts ? 5 : 6) |
S_490_GLM_WB(glm_wb) |
S_490_GLM_INV(glm_inv) |
S_490_GLV_INV(glv_inv) |
S_490_GL1_INV(gl1_inv) |
S_490_GL2_INV(gl2_inv) |
S_490_GL2_WB(gl2_wb) |
S_490_SEQ(gcr_seq) |
S_490_GLK_WB(glk_wb) |
S_490_GLK_INV(glk_inv) |
S_490_PWS_ENABLE(1));
ac_cmdbuf_emit(0); /* DST_SEL, INT_SEL, DATA_SEL */
ac_cmdbuf_emit(0); /* ADDRESS_LO */
ac_cmdbuf_emit(0); /* ADDRESS_HI */
ac_cmdbuf_emit(0); /* DATA_LO */
ac_cmdbuf_emit(0); /* DATA_HI */
ac_cmdbuf_emit(0); /* INT_CTXID */
ac_cmdbuf_end();
}
void
ac_emit_cp_copy_data(struct ac_cmdbuf *cs, uint32_t src_sel, uint32_t dst_sel,
uint64_t src_va, uint64_t dst_va,
enum ac_cp_copy_data_flags flags, bool predicate)
{
uint32_t dword0 = COPY_DATA_SRC_SEL(src_sel) |
COPY_DATA_DST_SEL(dst_sel);
if (flags & AC_CP_COPY_DATA_WR_CONFIRM)
dword0 |= COPY_DATA_WR_CONFIRM;
if (flags & AC_CP_COPY_DATA_COUNT_SEL)
dword0 |= COPY_DATA_COUNT_SEL;
if (flags & AC_CP_COPY_DATA_ENGINE_PFP)
dword0 |= COPY_DATA_ENGINE_PFP;
ac_cmdbuf_begin(cs);
ac_cmdbuf_emit(PKT3(PKT3_COPY_DATA, 4, predicate));
ac_cmdbuf_emit(dword0);
ac_cmdbuf_emit(src_va);
ac_cmdbuf_emit(src_va >> 32);
ac_cmdbuf_emit(dst_va);
ac_cmdbuf_emit(dst_va >> 32);
ac_cmdbuf_end();
}
void
ac_emit_cp_pfp_sync_me(struct ac_cmdbuf *cs, bool predicate)
{
ac_cmdbuf_begin(cs);
ac_cmdbuf_emit(PKT3(PKT3_PFP_SYNC_ME, 0, predicate));
ac_cmdbuf_emit(0);
ac_cmdbuf_end();
}
void
ac_emit_cp_set_predication(struct ac_cmdbuf *cs, enum amd_gfx_level gfx_level,
uint64_t va, uint32_t op)
{
ac_cmdbuf_begin(cs);
if (gfx_level >= GFX9) {
ac_cmdbuf_emit(PKT3(PKT3_SET_PREDICATION, 2, 0));
ac_cmdbuf_emit(op);
ac_cmdbuf_emit(va);
ac_cmdbuf_emit(va >> 32);
} else {
ac_cmdbuf_emit(PKT3(PKT3_SET_PREDICATION, 1, 0));
ac_cmdbuf_emit(va);
ac_cmdbuf_emit(op | ((va >> 32) & 0xFF));
}
ac_cmdbuf_end();
}
void
ac_emit_cp_gfx11_ge_rings(struct ac_cmdbuf *cs, const struct radeon_info *info,
uint64_t attr_ring_va, bool enable_gfx12_partial_hiz_wa)
{
assert(info->gfx_level >= GFX11);
assert((attr_ring_va >> 32) == info->address32_hi);
ac_cmdbuf_begin(cs);
ac_cmdbuf_set_uconfig_reg_seq(R_031110_SPI_GS_THROTTLE_CNTL1, 4);
ac_cmdbuf_emit(0x12355123);
ac_cmdbuf_emit(0x1544D);
ac_cmdbuf_emit(attr_ring_va >> 16);
ac_cmdbuf_emit(S_03111C_MEM_SIZE((info->attribute_ring_size_per_se >> 16) - 1) |
S_03111C_BIG_PAGE(info->discardable_allows_big_page) |
S_03111C_L1_POLICY(1));
if (info->gfx_level >= GFX12) {
const uint64_t pos_va = attr_ring_va + info->pos_ring_offset;
const uint64_t prim_va = attr_ring_va + info->prim_ring_offset;
/* When one of these 4 registers is updated, all 4 must be updated. */
ac_cmdbuf_set_uconfig_reg_seq(R_0309A0_GE_POS_RING_BASE, 4);
ac_cmdbuf_emit(pos_va >> 16);
ac_cmdbuf_emit(S_0309A4_MEM_SIZE(info->pos_ring_size_per_se >> 5));
ac_cmdbuf_emit(prim_va >> 16);
ac_cmdbuf_emit(S_0309AC_MEM_SIZE(info->prim_ring_size_per_se >> 5) |
S_0309AC_SCOPE(gfx12_scope_device) |
S_0309AC_PAF_TEMPORAL(gfx12_store_high_temporal_stay_dirty) |
S_0309AC_PAB_TEMPORAL(gfx12_load_last_use_discard) |
S_0309AC_SPEC_DATA_READ(gfx12_spec_read_auto) |
S_0309AC_FORCE_SE_SCOPE(1) |
S_0309AC_PAB_NOFILL(1));
if (info->gfx_level == GFX12 && info->pfp_fw_version >= 2680) {
/* Mitigate the HiZ GPU hang by increasing a timeout when
* BOTTOM_OF_PIPE_TS is used as the workaround. This must be emitted
* when the gfx queue is idle.
*/
const uint32_t timeout = enable_gfx12_partial_hiz_wa ? 0xfff : 0;
ac_cmdbuf_emit(PKT3(PKT3_UPDATE_DB_SUMMARIZER_TIMEOUT, 0, 0));
ac_cmdbuf_emit(S_EF1_SUMM_CNTL_EVICT_TIMEOUT(timeout));
}
}
ac_cmdbuf_end();
}
void
ac_emit_cp_tess_rings(struct ac_cmdbuf *cs, const struct radeon_info *info,
uint64_t attr_ring_va)
{
const uint64_t va = attr_ring_va + info->tess_offchip_ring_size;
uint32_t tf_ring_size = info->tess_factor_ring_size / 4;
if (info->gfx_level >= GFX11) {
/* TF_RING_SIZE is per SE on GFX11. */
tf_ring_size /= info->max_se;
}
assert((tf_ring_size & C_030938_SIZE) == 0);
ac_cmdbuf_begin(cs);
if (info->gfx_level >= GFX7) {
ac_cmdbuf_set_uconfig_reg_seq(R_030938_VGT_TF_RING_SIZE, 3);
ac_cmdbuf_emit(S_030938_SIZE(tf_ring_size));
ac_cmdbuf_emit(info->hs_offchip_param);
ac_cmdbuf_emit(va >> 8);
if (info->gfx_level >= GFX12) {
ac_cmdbuf_set_uconfig_reg(R_03099C_VGT_TF_MEMORY_BASE_HI, S_03099C_BASE_HI(va >> 40));
} else if (info->gfx_level >= GFX10) {
ac_cmdbuf_set_uconfig_reg(R_030984_VGT_TF_MEMORY_BASE_HI, S_030984_BASE_HI(va >> 40));
} else if (info->gfx_level == GFX9) {
ac_cmdbuf_set_uconfig_reg(R_030944_VGT_TF_MEMORY_BASE_HI, S_030944_BASE_HI(va >> 40));
}
} else {
ac_cmdbuf_set_config_reg(R_008988_VGT_TF_RING_SIZE, S_008988_SIZE(tf_ring_size));
ac_cmdbuf_set_config_reg(R_0089B8_VGT_TF_MEMORY_BASE, va >> 8);
ac_cmdbuf_set_config_reg(R_0089B0_VGT_HS_OFFCHIP_PARAM, info->hs_offchip_param);
}
ac_cmdbuf_end();
}
void
ac_emit_cp_gfx_scratch(struct ac_cmdbuf *cs, enum amd_gfx_level gfx_level,
uint64_t va, uint32_t size)
{
ac_cmdbuf_begin(cs);
if (gfx_level >= GFX11) {
ac_cmdbuf_set_context_reg_seq(R_0286E8_SPI_TMPRING_SIZE, 3);
ac_cmdbuf_emit(size);
ac_cmdbuf_emit(va >> 8);
ac_cmdbuf_emit(va >> 40);
} else {
ac_cmdbuf_set_context_reg(R_0286E8_SPI_TMPRING_SIZE, size);
}
ac_cmdbuf_end();
}
/* Execute plain ACQUIRE_MEM that just flushes caches. This optionally waits
* for idle on older chips. "engine" determines whether to sync in PFP or ME.
*/
void
ac_emit_cp_acquire_mem(struct ac_cmdbuf *cs, enum amd_gfx_level gfx_level,
enum amd_ip_type ip_type, uint32_t engine,
uint32_t gcr_cntl)
{
assert(engine == V_580_CP_PFP || engine == V_580_CP_ME);
assert(gcr_cntl);
ac_cmdbuf_begin(cs);
if (gfx_level >= GFX10) {
/* ACQUIRE_MEM in PFP is implemented as ACQUIRE_MEM in ME + PFP_SYNC_ME. */
const uint32_t engine_flag = engine == V_580_CP_ME ? BITFIELD_BIT(31) : 0;
/* Flush caches. This doesn't wait for idle. */
ac_cmdbuf_emit(PKT3(PKT3_ACQUIRE_MEM, 6, 0));
ac_cmdbuf_emit(engine_flag); /* which engine to use */
ac_cmdbuf_emit(0xffffffff); /* CP_COHER_SIZE */
ac_cmdbuf_emit(0x01ffffff); /* CP_COHER_SIZE_HI */
ac_cmdbuf_emit(0); /* CP_COHER_BASE */
ac_cmdbuf_emit(0); /* CP_COHER_BASE_HI */
ac_cmdbuf_emit(0x0000000A); /* POLL_INTERVAL */
ac_cmdbuf_emit(gcr_cntl); /* GCR_CNTL */
} else {
const bool is_mec = gfx_level >= GFX7 && ip_type == AMD_IP_COMPUTE;
if (gfx_level == GFX9 || is_mec) {
/* Flush caches and wait for the caches to assert idle. */
ac_cmdbuf_emit(PKT3(PKT3_ACQUIRE_MEM, 5, 0) | PKT3_SHADER_TYPE_S(is_mec));
ac_cmdbuf_emit(gcr_cntl); /* CP_COHER_CNTL */
ac_cmdbuf_emit(0xffffffff); /* CP_COHER_SIZE */
ac_cmdbuf_emit(0xffffff); /* CP_COHER_SIZE_HI */
ac_cmdbuf_emit(0); /* CP_COHER_BASE */
ac_cmdbuf_emit(0); /* CP_COHER_BASE_HI */
ac_cmdbuf_emit(0x0000000A); /* POLL_INTERVAL */
} else {
/* ACQUIRE_MEM is only required on the compute ring. */
ac_cmdbuf_emit(PKT3(PKT3_SURFACE_SYNC, 3, 0));
ac_cmdbuf_emit(gcr_cntl); /* CP_COHER_CNTL */
ac_cmdbuf_emit(0xffffffff); /* CP_COHER_SIZE */
ac_cmdbuf_emit(0); /* CP_COHER_BASE */
ac_cmdbuf_emit(0x0000000A); /* POLL_INTERVAL */
}
}
ac_cmdbuf_end();
}
void
ac_emit_cp_atomic_mem(struct ac_cmdbuf *cs, uint32_t atomic_op,
uint32_t atomic_cmd, uint64_t va, uint64_t data,
uint64_t compare_data)
{
ac_cmdbuf_begin(cs);
ac_cmdbuf_emit(PKT3(PKT3_ATOMIC_MEM, 7, 0));
ac_cmdbuf_emit(ATOMIC_OP(atomic_op) |
ATOMIC_COMMAND(atomic_cmd));
ac_cmdbuf_emit(va); /* addr lo */
ac_cmdbuf_emit(va >> 32); /* addr hi */
ac_cmdbuf_emit(data); /* data lo */
ac_cmdbuf_emit(data >> 32); /* data hi */
ac_cmdbuf_emit(compare_data); /* compare data lo */
ac_cmdbuf_emit(compare_data >> 32); /* compare data hi */
ac_cmdbuf_emit(10); /* loop interval */
ac_cmdbuf_end();
}
void
ac_emit_cp_nop(struct ac_cmdbuf *cs, uint32_t value)
{
ac_cmdbuf_begin(cs);
ac_cmdbuf_emit(PKT3(PKT3_NOP, 0, 0));
ac_cmdbuf_emit(value);
ac_cmdbuf_end();
}
void
ac_emit_cp_load_context_reg_index(struct ac_cmdbuf *cs, uint32_t reg,
uint32_t reg_count, uint64_t va,
bool predicate)
{
assert(reg_count);
ac_cmdbuf_begin(cs);
ac_cmdbuf_emit(PKT3(PKT3_LOAD_CONTEXT_REG_INDEX, 3, predicate));
ac_cmdbuf_emit(va);
ac_cmdbuf_emit(va >> 32);
ac_cmdbuf_emit((reg - SI_CONTEXT_REG_OFFSET) >> 2);
ac_cmdbuf_emit(reg_count); /* in DWORDS */
ac_cmdbuf_end();
}
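
A minimal usage sketch of the PWS pairing described in the comments above: a RELEASE_MEM PWS event followed by an ACQUIRE_MEM PWS wait for it. This is illustrative only and not part of this commit; the choice of BOTTOM_OF_PIPE_TS, the S_586_* GCR field macros from sid.h, and the caller-provided command buffer are assumptions.

/* Usage sketch (hypothetical): signal a bottom-of-pipe TS event that writes
 * back and invalidates GL2, then make the CP ME wait for the most recent TS
 * event (count = 0).  Event and GCR values are illustrative. */
static void
example_pws_flush_and_wait(struct ac_cmdbuf *cs, enum amd_gfx_level gfx_level)
{
   const uint32_t gcr_cntl = S_586_GL2_WB(1) | S_586_GL2_INV(1);

   /* Insert the event; it signals once all prior work has drained. */
   ac_emit_cp_release_mem_pws(cs, gfx_level, AMD_IP_GFX,
                              V_028A90_BOTTOM_OF_PIPE_TS, gcr_cntl);
   /* Wait in the ME; gcr_cntl is 0 here because the flush rode on RELEASE_MEM. */
   ac_emit_cp_acquire_mem_pws(cs, gfx_level, AMD_IP_GFX,
                              V_028A90_BOTTOM_OF_PIPE_TS,
                              V_580_CP_ME, 0, 0);
}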


@@ -0,0 +1,109 @@
/*
* Copyright 2012 Advanced Micro Devices, Inc.
*
* SPDX-License-Identifier: MIT
*/
#ifndef AC_CMDBUF_CP_H
#define AC_CMDBUF_CP_H
#include <inttypes.h>
#include <stdbool.h>
#include "amd_family.h"
#include "util/macros.h"
#ifdef __cplusplus
extern "C" {
#endif
struct ac_cmdbuf;
struct radeon_info;
void
ac_emit_cp_cond_exec(struct ac_cmdbuf *cs, enum amd_gfx_level gfx_level,
uint64_t va, uint32_t count);
void
ac_emit_cp_write_data_head(struct ac_cmdbuf *cs, uint32_t engine_sel,
uint32_t dst_sel, uint64_t va, uint32_t size,
bool predicate);
void
ac_emit_cp_write_data(struct ac_cmdbuf *cs, uint32_t engine_sel,
uint32_t dst_sel, uint64_t va, uint32_t size,
const uint32_t *data, bool predicate);
void
ac_emit_cp_write_data_imm(struct ac_cmdbuf *cs, unsigned engine_sel,
uint64_t va, uint32_t value);
void
ac_emit_cp_wait_mem(struct ac_cmdbuf *cs, uint64_t va, uint32_t ref,
uint32_t mask, unsigned flags);
void
ac_emit_cp_acquire_mem_pws(struct ac_cmdbuf *cs, ASSERTED enum amd_gfx_level gfx_level,
ASSERTED enum amd_ip_type ip_type, uint32_t event_type,
uint32_t stage_sel, uint32_t count,
uint32_t gcr_cntl);
void
ac_emit_cp_release_mem_pws(struct ac_cmdbuf *cs, ASSERTED enum amd_gfx_level gfx_level,
ASSERTED enum amd_ip_type ip_type, uint32_t event_type,
uint32_t gcr_cntl);
enum ac_cp_copy_data_flags {
AC_CP_COPY_DATA_WR_CONFIRM = 1u << 0,
AC_CP_COPY_DATA_COUNT_SEL = 1u << 1, /* 64 bits */
AC_CP_COPY_DATA_ENGINE_PFP = 1u << 2,
};
void
ac_emit_cp_copy_data(struct ac_cmdbuf *cs, uint32_t src_sel, uint32_t dst_sel,
uint64_t src_va, uint64_t dst_va,
enum ac_cp_copy_data_flags flags, bool predicate);
void
ac_emit_cp_pfp_sync_me(struct ac_cmdbuf *cs, bool predicate);
void
ac_emit_cp_set_predication(struct ac_cmdbuf *cs, enum amd_gfx_level gfx_level,
uint64_t va, uint32_t op);
void
ac_emit_cp_gfx11_ge_rings(struct ac_cmdbuf *cs, const struct radeon_info *info,
uint64_t attr_ring_va, bool enable_gfx12_partial_hiz_wa);
void
ac_emit_cp_tess_rings(struct ac_cmdbuf *cs, const struct radeon_info *info,
uint64_t va);
void
ac_emit_cp_gfx_scratch(struct ac_cmdbuf *cs, enum amd_gfx_level gfx_level,
uint64_t va, uint32_t size);
void
ac_emit_cp_acquire_mem(struct ac_cmdbuf *cs, enum amd_gfx_level gfx_level,
enum amd_ip_type ip_type, uint32_t engine,
uint32_t gcr_cntl);
void
ac_emit_cp_atomic_mem(struct ac_cmdbuf *cs, uint32_t atomic_op,
uint32_t atomic_cmd, uint64_t va, uint64_t data,
uint64_t compare_data);
void
ac_emit_cp_nop(struct ac_cmdbuf *cs, uint32_t value);
void
ac_emit_cp_load_context_reg_index(struct ac_cmdbuf *cs, uint32_t reg,
uint32_t reg_count, uint64_t va,
bool predicate);
#ifdef __cplusplus
}
#endif
#endif
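
As a small illustration of the write/wait helpers declared above, the following hedged sketch writes a 32-bit fence value through the ME and then stalls the CP until memory reflects it; the fence address and value are made up, and the V_370_ME and WAIT_REG_MEM_EQUAL selectors are assumed to come from sid.h.

/* Hypothetical fence write + wait, not part of this commit. */
static void
example_write_and_wait_fence(struct ac_cmdbuf *cs, uint64_t fence_va,
                             uint32_t fence_value)
{
   /* WRITE_DATA through the ME (the helper sets WR_CONFIRM). */
   ac_emit_cp_write_data_imm(cs, V_370_ME, fence_va, fence_value);
   /* WAIT_REG_MEM until *fence_va == fence_value. */
   ac_emit_cp_wait_mem(cs, fence_va, fence_value, 0xffffffff,
                       WAIT_REG_MEM_EQUAL);
}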


@@ -56,6 +56,8 @@ amd_common_files = files(
'ac_binary.h',
'ac_cmdbuf.c',
'ac_cmdbuf.h',
'ac_cmdbuf_cp.c',
'ac_cmdbuf_cp.h',
'ac_cmdbuf_sdma.c',
'ac_cmdbuf_sdma.h',
'ac_shader_args.c',


@@ -17,6 +17,7 @@
#include "radv_sdma.h"
#include "sid.h"
#include "ac_cmdbuf_cp.h"
#include "ac_cmdbuf_sdma.h"
static inline unsigned


@@ -11,6 +11,8 @@
#ifndef SI_BUILD_PM4_H
#define SI_BUILD_PM4_H
#include "ac_cmdbuf_cp.h"
#include "si_pipe.h"
#include "sid.h"