diff --git a/src/amd/common/ac_cmdbuf.c b/src/amd/common/ac_cmdbuf.c index 062d0165a25..c34b88c4d9e 100644 --- a/src/amd/common/ac_cmdbuf.c +++ b/src/amd/common/ac_cmdbuf.c @@ -858,457 +858,6 @@ ac_init_graphics_preamble_state(const struct ac_preamble_state *state, } } -void -ac_emit_cp_cond_exec(struct ac_cmdbuf *cs, enum amd_gfx_level gfx_level, - uint64_t va, uint32_t count) -{ - ac_cmdbuf_begin(cs); - if (gfx_level >= GFX7) { - ac_cmdbuf_emit(PKT3(PKT3_COND_EXEC, 3, 0)); - ac_cmdbuf_emit(va); - ac_cmdbuf_emit(va >> 32); - ac_cmdbuf_emit(0); - ac_cmdbuf_emit(count); - } else { - ac_cmdbuf_emit(PKT3(PKT3_COND_EXEC, 2, 0)); - ac_cmdbuf_emit(va); - ac_cmdbuf_emit(va >> 32); - ac_cmdbuf_emit(count); - } - ac_cmdbuf_end(); -} - -void -ac_emit_cp_write_data_head(struct ac_cmdbuf *cs, uint32_t engine_sel, - uint32_t dst_sel, uint64_t va, uint32_t size, - bool predicate) -{ - ac_cmdbuf_begin(cs); - ac_cmdbuf_emit(PKT3(PKT3_WRITE_DATA, 2 + size, predicate)); - ac_cmdbuf_emit(S_370_DST_SEL(dst_sel) | - S_370_WR_CONFIRM(1) | - S_370_ENGINE_SEL(engine_sel)); - ac_cmdbuf_emit(va); - ac_cmdbuf_emit(va >> 32); - ac_cmdbuf_end(); -} - -void -ac_emit_cp_write_data(struct ac_cmdbuf *cs, uint32_t engine_sel, - uint32_t dst_sel, uint64_t va, uint32_t size, - const uint32_t *data, bool predicate) -{ - ac_emit_cp_write_data_head(cs, engine_sel, dst_sel, va, size, predicate); - ac_cmdbuf_begin(cs); - ac_cmdbuf_emit_array(data, size); - ac_cmdbuf_end(); -} - -void -ac_emit_cp_write_data_imm(struct ac_cmdbuf *cs, unsigned engine_sel, - uint64_t va, uint32_t value) -{ - ac_emit_cp_write_data(cs, engine_sel, V_370_MEM, va, 1, &value, false); -} - -void -ac_emit_cp_wait_mem(struct ac_cmdbuf *cs, uint64_t va, uint32_t ref, - uint32_t mask, unsigned flags) -{ - ac_cmdbuf_begin(cs); - ac_cmdbuf_emit(PKT3(PKT3_WAIT_REG_MEM, 5, 0)); - ac_cmdbuf_emit(WAIT_REG_MEM_MEM_SPACE(1) | flags); - ac_cmdbuf_emit(va); - ac_cmdbuf_emit(va >> 32); - ac_cmdbuf_emit(ref); /* reference value */ - 
ac_cmdbuf_emit(mask); /* mask */ - ac_cmdbuf_emit(4); /* poll interval */ - ac_cmdbuf_end(); -} - -static bool -is_ts_event(unsigned event_type) -{ - return event_type == V_028A90_CACHE_FLUSH_TS || - event_type == V_028A90_CACHE_FLUSH_AND_INV_TS_EVENT || - event_type == V_028A90_BOTTOM_OF_PIPE_TS || - event_type == V_028A90_FLUSH_AND_INV_DB_DATA_TS || - event_type == V_028A90_FLUSH_AND_INV_CB_DATA_TS; -} - -/* This will wait or insert into the pipeline a wait for a previous - * RELEASE_MEM PWS event. - * - * "event_type" must be the same as the RELEASE_MEM PWS event. - * - * "stage_sel" determines when the waiting happens. It can be CP_PFP, CP_ME, - * PRE_SHADER, PRE_DEPTH, or PRE_PIX_SHADER, allowing to wait later in the - * pipeline instead of completely idling the hw at the frontend. - * - * "gcr_cntl" must be 0 if not waiting in PFP or ME. When waiting later in the - * pipeline, any cache flushes must be part of RELEASE_MEM, not ACQUIRE_MEM. - * - * "distance" determines how many RELEASE_MEM PWS events ago it should wait - * for, minus one (starting from 0). There are 3 event types: PS_DONE, - * CS_DONE, and TS events. The distance counter increments separately for each - * type, so 0 with PS_DONE means wait for the last PS_DONE event, while 0 with - * *_TS means wait for the last TS event (even if it's a different TS event - * because all TS events share the same counter). - * - * PRE_SHADER waits before the first shader that has IMAGE_OP=1, while - * PRE_PIX_SHADER waits before PS if it has IMAGE_OP=1 (IMAGE_OP should really - * be called SYNC_ENABLE) PRE_DEPTH waits before depth/stencil tests. - * - * PRE_COLOR also exists but shouldn't be used because it can hang. It's - * recommended to use PRE_PIX_SHADER instead, which means all PS that have - * color exports with enabled color buffers, non-zero colormask, and non-zero - * sample mask must have IMAGE_OP=1 to enable the sync before PS. 
- * - * Waiting for a PWS fence that was generated by a previous IB is valid, but - * if there is an IB from another process in between and that IB also inserted - * a PWS fence, the hw will wait for the newer fence instead because the PWS - * counter was incremented. - */ -void -ac_emit_cp_acquire_mem_pws(struct ac_cmdbuf *cs, ASSERTED enum amd_gfx_level gfx_level, - ASSERTED enum amd_ip_type ip_type, uint32_t event_type, - uint32_t stage_sel, uint32_t count, - uint32_t gcr_cntl) -{ - assert(gfx_level >= GFX11 && ip_type == AMD_IP_GFX); - - const bool ts = is_ts_event(event_type); - const bool ps_done = event_type == V_028A90_PS_DONE; - const bool cs_done = event_type == V_028A90_CS_DONE; - const uint32_t counter_sel = ts ? V_580_TS_SELECT : ps_done ? V_580_PS_SELECT : V_580_CS_SELECT; - - assert((int)ts + (int)cs_done + (int)ps_done == 1); - assert(!gcr_cntl || stage_sel == V_580_CP_PFP || stage_sel == V_580_CP_ME); - assert(stage_sel != V_580_PRE_COLOR); - - ac_cmdbuf_begin(cs); - ac_cmdbuf_emit(PKT3(PKT3_ACQUIRE_MEM, 6, 0)); - ac_cmdbuf_emit(S_580_PWS_STAGE_SEL(stage_sel) | - S_580_PWS_COUNTER_SEL(counter_sel) | - S_580_PWS_ENA2(1) | - S_580_PWS_COUNT(count)); - ac_cmdbuf_emit(0xffffffff); /* GCR_SIZE */ - ac_cmdbuf_emit(0x01ffffff); /* GCR_SIZE_HI */ - ac_cmdbuf_emit(0); /* GCR_BASE_LO */ - ac_cmdbuf_emit(0); /* GCR_BASE_HI */ - ac_cmdbuf_emit(S_585_PWS_ENA(1)); - ac_cmdbuf_emit(gcr_cntl); /* GCR_CNTL (this has no effect if PWS_STAGE_SEL isn't PFP or ME) */ - ac_cmdbuf_end(); -} - -/* Insert CS_DONE, PS_DONE, or a *_TS event into the pipeline, which will - * signal after the work indicated by the event is complete, which optionally - * includes flushing caches using "gcr_cntl" after the completion of the work. - * *_TS events are always signaled at the end of the pipeline, while CS_DONE - * and PS_DONE are signaled when those shaders finish. This call only inserts - * the event into the pipeline. 
It doesn't wait for anything and it doesn't - * execute anything immediately. The only way to wait for the event completion - * is to call si_cp_acquire_mem_pws with the same "event_type". - */ -void -ac_emit_cp_release_mem_pws(struct ac_cmdbuf *cs, ASSERTED enum amd_gfx_level gfx_level, - ASSERTED enum amd_ip_type ip_type, uint32_t event_type, - uint32_t gcr_cntl) -{ - assert(gfx_level >= GFX11 && ip_type == AMD_IP_GFX); - - /* Extract GCR_CNTL fields because the encoding is different in RELEASE_MEM. */ - assert(G_586_GLI_INV(gcr_cntl) == 0); - assert(G_586_GL1_RANGE(gcr_cntl) == 0); - const uint32_t glm_wb = G_586_GLM_WB(gcr_cntl); - const uint32_t glm_inv = G_586_GLM_INV(gcr_cntl); - const uint32_t glk_wb = G_586_GLK_WB(gcr_cntl); - const uint32_t glk_inv = G_586_GLK_INV(gcr_cntl); - const uint32_t glv_inv = G_586_GLV_INV(gcr_cntl); - const uint32_t gl1_inv = G_586_GL1_INV(gcr_cntl); - assert(G_586_GL2_US(gcr_cntl) == 0); - assert(G_586_GL2_RANGE(gcr_cntl) == 0); - assert(G_586_GL2_DISCARD(gcr_cntl) == 0); - const uint32_t gl2_inv = G_586_GL2_INV(gcr_cntl); - const uint32_t gl2_wb = G_586_GL2_WB(gcr_cntl); - const uint32_t gcr_seq = G_586_SEQ(gcr_cntl); - const bool ts = is_ts_event(event_type); - - ac_cmdbuf_begin(cs); - ac_cmdbuf_emit(PKT3(PKT3_RELEASE_MEM, 6, 0)); - ac_cmdbuf_emit(S_490_EVENT_TYPE(event_type) | - S_490_EVENT_INDEX(ts ? 
5 : 6) | - S_490_GLM_WB(glm_wb) | - S_490_GLM_INV(glm_inv) | - S_490_GLV_INV(glv_inv) | - S_490_GL1_INV(gl1_inv) | - S_490_GL2_INV(gl2_inv) | - S_490_GL2_WB(gl2_wb) | - S_490_SEQ(gcr_seq) | - S_490_GLK_WB(glk_wb) | - S_490_GLK_INV(glk_inv) | - S_490_PWS_ENABLE(1)); - ac_cmdbuf_emit(0); /* DST_SEL, INT_SEL, DATA_SEL */ - ac_cmdbuf_emit(0); /* ADDRESS_LO */ - ac_cmdbuf_emit(0); /* ADDRESS_HI */ - ac_cmdbuf_emit(0); /* DATA_LO */ - ac_cmdbuf_emit(0); /* DATA_HI */ - ac_cmdbuf_emit(0); /* INT_CTXID */ - ac_cmdbuf_end(); -} - -void -ac_emit_cp_copy_data(struct ac_cmdbuf *cs, uint32_t src_sel, uint32_t dst_sel, - uint64_t src_va, uint64_t dst_va, - enum ac_cp_copy_data_flags flags, bool predicate) -{ - uint32_t dword0 = COPY_DATA_SRC_SEL(src_sel) | - COPY_DATA_DST_SEL(dst_sel); - - if (flags & AC_CP_COPY_DATA_WR_CONFIRM) - dword0 |= COPY_DATA_WR_CONFIRM; - if (flags & AC_CP_COPY_DATA_COUNT_SEL) - dword0 |= COPY_DATA_COUNT_SEL; - if (flags & AC_CP_COPY_DATA_ENGINE_PFP) - dword0 |= COPY_DATA_ENGINE_PFP; - - ac_cmdbuf_begin(cs); - ac_cmdbuf_emit(PKT3(PKT3_COPY_DATA, 4, predicate)); - ac_cmdbuf_emit(dword0); - ac_cmdbuf_emit(src_va); - ac_cmdbuf_emit(src_va >> 32); - ac_cmdbuf_emit(dst_va); - ac_cmdbuf_emit(dst_va >> 32); - ac_cmdbuf_end(); -} - -void -ac_emit_cp_pfp_sync_me(struct ac_cmdbuf *cs, bool predicate) -{ - ac_cmdbuf_begin(cs); - ac_cmdbuf_emit(PKT3(PKT3_PFP_SYNC_ME, 0, predicate)); - ac_cmdbuf_emit(0); - ac_cmdbuf_end(); -} - -void -ac_emit_cp_set_predication(struct ac_cmdbuf *cs, enum amd_gfx_level gfx_level, - uint64_t va, uint32_t op) -{ - ac_cmdbuf_begin(cs); - if (gfx_level >= GFX9) { - ac_cmdbuf_emit(PKT3(PKT3_SET_PREDICATION, 2, 0)); - ac_cmdbuf_emit(op); - ac_cmdbuf_emit(va); - ac_cmdbuf_emit(va >> 32); - } else { - ac_cmdbuf_emit(PKT3(PKT3_SET_PREDICATION, 1, 0)); - ac_cmdbuf_emit(va); - ac_cmdbuf_emit(op | ((va >> 32) & 0xFF)); - } - ac_cmdbuf_end(); -} - -void -ac_emit_cp_gfx11_ge_rings(struct ac_cmdbuf *cs, const struct radeon_info *info, - uint64_t 
attr_ring_va, bool enable_gfx12_partial_hiz_wa) -{ - assert(info->gfx_level >= GFX11); - assert((attr_ring_va >> 32) == info->address32_hi); - - ac_cmdbuf_begin(cs); - - ac_cmdbuf_set_uconfig_reg_seq(R_031110_SPI_GS_THROTTLE_CNTL1, 4); - ac_cmdbuf_emit(0x12355123); - ac_cmdbuf_emit(0x1544D); - ac_cmdbuf_emit(attr_ring_va >> 16); - ac_cmdbuf_emit(S_03111C_MEM_SIZE((info->attribute_ring_size_per_se >> 16) - 1) | - S_03111C_BIG_PAGE(info->discardable_allows_big_page) | - S_03111C_L1_POLICY(1)); - - if (info->gfx_level >= GFX12) { - const uint64_t pos_va = attr_ring_va + info->pos_ring_offset; - const uint64_t prim_va = attr_ring_va + info->prim_ring_offset; - - /* When one of these 4 registers is updated, all 4 must be updated. */ - ac_cmdbuf_set_uconfig_reg_seq(R_0309A0_GE_POS_RING_BASE, 4); - ac_cmdbuf_emit(pos_va >> 16); - ac_cmdbuf_emit(S_0309A4_MEM_SIZE(info->pos_ring_size_per_se >> 5)); - ac_cmdbuf_emit(prim_va >> 16); - ac_cmdbuf_emit(S_0309AC_MEM_SIZE(info->prim_ring_size_per_se >> 5) | - S_0309AC_SCOPE(gfx12_scope_device) | - S_0309AC_PAF_TEMPORAL(gfx12_store_high_temporal_stay_dirty) | - S_0309AC_PAB_TEMPORAL(gfx12_load_last_use_discard) | - S_0309AC_SPEC_DATA_READ(gfx12_spec_read_auto) | - S_0309AC_FORCE_SE_SCOPE(1) | - S_0309AC_PAB_NOFILL(1)); - - if (info->gfx_level == GFX12 && info->pfp_fw_version >= 2680) { - /* Mitigate the HiZ GPU hang by increasing a timeout when - * BOTTOM_OF_PIPE_TS is used as the workaround. This must be emitted - * when the gfx queue is idle. - */ - const uint32_t timeout = enable_gfx12_partial_hiz_wa ? 
0xfff : 0; - - ac_cmdbuf_emit(PKT3(PKT3_UPDATE_DB_SUMMARIZER_TIMEOUT, 0, 0)); - ac_cmdbuf_emit(S_EF1_SUMM_CNTL_EVICT_TIMEOUT(timeout)); - } - } - - ac_cmdbuf_end(); -} - -void -ac_emit_cp_tess_rings(struct ac_cmdbuf *cs, const struct radeon_info *info, - uint64_t attr_ring_va) -{ - const uint64_t va = attr_ring_va + info->tess_offchip_ring_size; - uint32_t tf_ring_size = info->tess_factor_ring_size / 4; - - if (info->gfx_level >= GFX11) { - /* TF_RING_SIZE is per SE on GFX11. */ - tf_ring_size /= info->max_se; - } - - assert((tf_ring_size & C_030938_SIZE) == 0); - - ac_cmdbuf_begin(cs); - - if (info->gfx_level >= GFX7) { - ac_cmdbuf_set_uconfig_reg_seq(R_030938_VGT_TF_RING_SIZE, 3); - ac_cmdbuf_emit(S_030938_SIZE(tf_ring_size)); - ac_cmdbuf_emit(info->hs_offchip_param); - ac_cmdbuf_emit(va >> 8); - - if (info->gfx_level >= GFX12) { - ac_cmdbuf_set_uconfig_reg(R_03099C_VGT_TF_MEMORY_BASE_HI, S_03099C_BASE_HI(va >> 40)); - } else if (info->gfx_level >= GFX10) { - ac_cmdbuf_set_uconfig_reg(R_030984_VGT_TF_MEMORY_BASE_HI, S_030984_BASE_HI(va >> 40)); - } else if (info->gfx_level == GFX9) { - ac_cmdbuf_set_uconfig_reg(R_030944_VGT_TF_MEMORY_BASE_HI, S_030944_BASE_HI(va >> 40)); - } - } else { - ac_cmdbuf_set_config_reg(R_008988_VGT_TF_RING_SIZE, S_008988_SIZE(tf_ring_size)); - ac_cmdbuf_set_config_reg(R_0089B8_VGT_TF_MEMORY_BASE, va >> 8); - ac_cmdbuf_set_config_reg(R_0089B0_VGT_HS_OFFCHIP_PARAM, info->hs_offchip_param); - } - - ac_cmdbuf_end(); -} - -void -ac_emit_cp_gfx_scratch(struct ac_cmdbuf *cs, enum amd_gfx_level gfx_level, - uint64_t va, uint32_t size) -{ - ac_cmdbuf_begin(cs); - - if (gfx_level >= GFX11) { - ac_cmdbuf_set_context_reg_seq(R_0286E8_SPI_TMPRING_SIZE, 3); - ac_cmdbuf_emit(size); - ac_cmdbuf_emit(va >> 8); - ac_cmdbuf_emit(va >> 40); - } else { - ac_cmdbuf_set_context_reg(R_0286E8_SPI_TMPRING_SIZE, size); - } - - ac_cmdbuf_end(); -} - -/* Execute plain ACQUIRE_MEM that just flushes caches. This optionally waits - * for idle on older chips. 
"engine" determines whether to sync in PFP or ME. - */ -void -ac_emit_cp_acquire_mem(struct ac_cmdbuf *cs, enum amd_gfx_level gfx_level, - enum amd_ip_type ip_type, uint32_t engine, - uint32_t gcr_cntl) -{ - assert(engine == V_580_CP_PFP || engine == V_580_CP_ME); - assert(gcr_cntl); - - ac_cmdbuf_begin(cs); - - if (gfx_level >= GFX10) { - /* ACQUIRE_MEM in PFP is implemented as ACQUIRE_MEM in ME + PFP_SYNC_ME. */ - const uint32_t engine_flag = engine == V_580_CP_ME ? BITFIELD_BIT(31) : 0; - - /* Flush caches. This doesn't wait for idle. */ - ac_cmdbuf_emit(PKT3(PKT3_ACQUIRE_MEM, 6, 0)); - ac_cmdbuf_emit(engine_flag); /* which engine to use */ - ac_cmdbuf_emit(0xffffffff); /* CP_COHER_SIZE */ - ac_cmdbuf_emit(0x01ffffff); /* CP_COHER_SIZE_HI */ - ac_cmdbuf_emit(0); /* CP_COHER_BASE */ - ac_cmdbuf_emit(0); /* CP_COHER_BASE_HI */ - ac_cmdbuf_emit(0x0000000A); /* POLL_INTERVAL */ - ac_cmdbuf_emit(gcr_cntl); /* GCR_CNTL */ - } else { - const bool is_mec = gfx_level >= GFX7 && ip_type == AMD_IP_COMPUTE; - - if (gfx_level == GFX9 || is_mec) { - /* Flush caches and wait for the caches to assert idle. */ - ac_cmdbuf_emit(PKT3(PKT3_ACQUIRE_MEM, 5, 0) | PKT3_SHADER_TYPE_S(is_mec)); - ac_cmdbuf_emit(gcr_cntl); /* CP_COHER_CNTL */ - ac_cmdbuf_emit(0xffffffff); /* CP_COHER_SIZE */ - ac_cmdbuf_emit(0xffffff); /* CP_COHER_SIZE_HI */ - ac_cmdbuf_emit(0); /* CP_COHER_BASE */ - ac_cmdbuf_emit(0); /* CP_COHER_BASE_HI */ - ac_cmdbuf_emit(0x0000000A); /* POLL_INTERVAL */ - } else { - /* ACQUIRE_MEM is only required on the compute ring. 
*/ - ac_cmdbuf_emit(PKT3(PKT3_SURFACE_SYNC, 3, 0)); - ac_cmdbuf_emit(gcr_cntl); /* CP_COHER_CNTL */ - ac_cmdbuf_emit(0xffffffff); /* CP_COHER_SIZE */ - ac_cmdbuf_emit(0); /* CP_COHER_BASE */ - ac_cmdbuf_emit(0x0000000A); /* POLL_INTERVAL */ - } - } - - ac_cmdbuf_end(); -} - -void -ac_emit_cp_atomic_mem(struct ac_cmdbuf *cs, uint32_t atomic_op, - uint32_t atomic_cmd, uint64_t va, uint64_t data, - uint64_t compare_data) -{ - ac_cmdbuf_begin(cs); - ac_cmdbuf_emit(PKT3(PKT3_ATOMIC_MEM, 7, 0)); - ac_cmdbuf_emit(ATOMIC_OP(atomic_op) | - ATOMIC_COMMAND(atomic_cmd)); - ac_cmdbuf_emit(va); /* addr lo */ - ac_cmdbuf_emit(va >> 32); /* addr hi */ - ac_cmdbuf_emit(data); /* data lo */ - ac_cmdbuf_emit(data >> 32); /* data hi */ - ac_cmdbuf_emit(compare_data); /* compare data lo */ - ac_cmdbuf_emit(compare_data >> 32); /* compare data hi */ - ac_cmdbuf_emit(10); /* loop interval */ - ac_cmdbuf_end(); -} - -void -ac_emit_cp_nop(struct ac_cmdbuf *cs, uint32_t value) -{ - ac_cmdbuf_begin(cs); - ac_cmdbuf_emit(PKT3(PKT3_NOP, 0, 0)); - ac_cmdbuf_emit(value); - ac_cmdbuf_end(); -} - -void -ac_emit_cp_load_context_reg_index(struct ac_cmdbuf *cs, uint32_t reg, - uint32_t reg_count, uint64_t va, - bool predicate) -{ - assert(reg_count); - - ac_cmdbuf_begin(cs); - ac_cmdbuf_emit(PKT3(PKT3_LOAD_CONTEXT_REG_INDEX, 3, predicate)); - ac_cmdbuf_emit(va); - ac_cmdbuf_emit(va >> 32); - ac_cmdbuf_emit((reg - SI_CONTEXT_REG_OFFSET) >> 2); - ac_cmdbuf_emit(reg_count); /* in DWORDS */ - ac_cmdbuf_end(); -} - void ac_cmdbuf_flush_vgt_streamout(struct ac_cmdbuf *cs, enum amd_gfx_level gfx_level) { diff --git a/src/amd/common/ac_cmdbuf.h b/src/amd/common/ac_cmdbuf.h index 97e68c78e7d..2f1855f5450 100644 --- a/src/amd/common/ac_cmdbuf.h +++ b/src/amd/common/ac_cmdbuf.h @@ -120,87 +120,6 @@ void ac_init_graphics_preamble_state(const struct ac_preamble_state *state, struct ac_pm4_state *pm4); -void -ac_emit_cp_cond_exec(struct ac_cmdbuf *cs, enum amd_gfx_level gfx_level, - uint64_t va, uint32_t count); - 
-void -ac_emit_cp_write_data_head(struct ac_cmdbuf *cs, uint32_t engine_sel, - uint32_t dst_sel, uint64_t va, uint32_t size, - bool predicate); - -void -ac_emit_cp_write_data(struct ac_cmdbuf *cs, uint32_t engine_sel, - uint32_t dst_sel, uint64_t va, uint32_t size, - const uint32_t *data, bool predicate); - -void -ac_emit_cp_write_data_imm(struct ac_cmdbuf *cs, unsigned engine_sel, - uint64_t va, uint32_t value); - -void -ac_emit_cp_wait_mem(struct ac_cmdbuf *cs, uint64_t va, uint32_t ref, - uint32_t mask, unsigned flags); - -void -ac_emit_cp_acquire_mem_pws(struct ac_cmdbuf *cs, ASSERTED enum amd_gfx_level gfx_level, - ASSERTED enum amd_ip_type ip_type, uint32_t event_type, - uint32_t stage_sel, uint32_t count, - uint32_t gcr_cntl); - -void -ac_emit_cp_release_mem_pws(struct ac_cmdbuf *cs, ASSERTED enum amd_gfx_level gfx_level, - ASSERTED enum amd_ip_type ip_type, uint32_t event_type, - uint32_t gcr_cntl); - -enum ac_cp_copy_data_flags { - AC_CP_COPY_DATA_WR_CONFIRM = 1u << 0, - AC_CP_COPY_DATA_COUNT_SEL = 1u << 1, /* 64 bits */ - AC_CP_COPY_DATA_ENGINE_PFP = 1u << 2, -}; - -void -ac_emit_cp_copy_data(struct ac_cmdbuf *cs, uint32_t src_sel, uint32_t dst_sel, - uint64_t src_va, uint64_t dst_va, - enum ac_cp_copy_data_flags flags, bool predicate); - -void -ac_emit_cp_pfp_sync_me(struct ac_cmdbuf *cs, bool predicate); - -void -ac_emit_cp_set_predication(struct ac_cmdbuf *cs, enum amd_gfx_level gfx_level, - uint64_t va, uint32_t op); - -void -ac_emit_cp_gfx11_ge_rings(struct ac_cmdbuf *cs, const struct radeon_info *info, - uint64_t attr_ring_va, bool enable_gfx12_partial_hiz_wa); - -void -ac_emit_cp_tess_rings(struct ac_cmdbuf *cs, const struct radeon_info *info, - uint64_t va); - -void -ac_emit_cp_gfx_scratch(struct ac_cmdbuf *cs, enum amd_gfx_level gfx_level, - uint64_t va, uint32_t size); - -void -ac_emit_cp_acquire_mem(struct ac_cmdbuf *cs, enum amd_gfx_level gfx_level, - enum amd_ip_type ip_type, uint32_t engine, - uint32_t gcr_cntl); - -void 
-ac_emit_cp_atomic_mem(struct ac_cmdbuf *cs, uint32_t atomic_op, - uint32_t atomic_cmd, uint64_t va, uint64_t data, - uint64_t compare_data); - -void -ac_emit_cp_nop(struct ac_cmdbuf *cs, uint32_t value); - -void -ac_emit_cp_load_context_reg_index(struct ac_cmdbuf *cs, uint32_t reg, - uint32_t reg_count, uint64_t va, - bool predicate); - void ac_cmdbuf_flush_vgt_streamout(struct ac_cmdbuf *cs, enum amd_gfx_level gfx_level); diff --git a/src/amd/common/ac_cmdbuf_cp.c b/src/amd/common/ac_cmdbuf_cp.c new file mode 100644 index 00000000000..fd53fd2cd9f --- /dev/null +++ b/src/amd/common/ac_cmdbuf_cp.c @@ -0,0 +1,465 @@ +/* + * Copyright 2012 Advanced Micro Devices, Inc. + * Copyright 2024 Valve Corporation + * + * SPDX-License-Identifier: MIT + */ + +#include "ac_cmdbuf.h" +#include "ac_cmdbuf_cp.h" +#include "ac_gpu_info.h" +#include "ac_shader_util.h" + +#include "amd_family.h" +#include "sid.h" + +void +ac_emit_cp_cond_exec(struct ac_cmdbuf *cs, enum amd_gfx_level gfx_level, + uint64_t va, uint32_t count) +{ + ac_cmdbuf_begin(cs); + if (gfx_level >= GFX7) { + ac_cmdbuf_emit(PKT3(PKT3_COND_EXEC, 3, 0)); + ac_cmdbuf_emit(va); + ac_cmdbuf_emit(va >> 32); + ac_cmdbuf_emit(0); + ac_cmdbuf_emit(count); + } else { + ac_cmdbuf_emit(PKT3(PKT3_COND_EXEC, 2, 0)); + ac_cmdbuf_emit(va); + ac_cmdbuf_emit(va >> 32); + ac_cmdbuf_emit(count); + } + ac_cmdbuf_end(); +} + +void +ac_emit_cp_write_data_head(struct ac_cmdbuf *cs, uint32_t engine_sel, + uint32_t dst_sel, uint64_t va, uint32_t size, + bool predicate) +{ + ac_cmdbuf_begin(cs); + ac_cmdbuf_emit(PKT3(PKT3_WRITE_DATA, 2 + size, predicate)); + ac_cmdbuf_emit(S_370_DST_SEL(dst_sel) | + S_370_WR_CONFIRM(1) | + S_370_ENGINE_SEL(engine_sel)); + ac_cmdbuf_emit(va); + ac_cmdbuf_emit(va >> 32); + ac_cmdbuf_end(); +} + +void +ac_emit_cp_write_data(struct ac_cmdbuf *cs, uint32_t engine_sel, + uint32_t dst_sel, uint64_t va, uint32_t size, + const uint32_t *data, bool predicate) +{ + ac_emit_cp_write_data_head(cs, engine_sel, dst_sel, va, 
size, predicate); + ac_cmdbuf_begin(cs); + ac_cmdbuf_emit_array(data, size); + ac_cmdbuf_end(); +} + +void +ac_emit_cp_write_data_imm(struct ac_cmdbuf *cs, unsigned engine_sel, + uint64_t va, uint32_t value) +{ + ac_emit_cp_write_data(cs, engine_sel, V_370_MEM, va, 1, &value, false); +} + +void +ac_emit_cp_wait_mem(struct ac_cmdbuf *cs, uint64_t va, uint32_t ref, + uint32_t mask, unsigned flags) +{ + ac_cmdbuf_begin(cs); + ac_cmdbuf_emit(PKT3(PKT3_WAIT_REG_MEM, 5, 0)); + ac_cmdbuf_emit(WAIT_REG_MEM_MEM_SPACE(1) | flags); + ac_cmdbuf_emit(va); + ac_cmdbuf_emit(va >> 32); + ac_cmdbuf_emit(ref); /* reference value */ + ac_cmdbuf_emit(mask); /* mask */ + ac_cmdbuf_emit(4); /* poll interval */ + ac_cmdbuf_end(); +} + +static bool +is_ts_event(unsigned event_type) +{ + return event_type == V_028A90_CACHE_FLUSH_TS || + event_type == V_028A90_CACHE_FLUSH_AND_INV_TS_EVENT || + event_type == V_028A90_BOTTOM_OF_PIPE_TS || + event_type == V_028A90_FLUSH_AND_INV_DB_DATA_TS || + event_type == V_028A90_FLUSH_AND_INV_CB_DATA_TS; +} + +/* This will wait or insert into the pipeline a wait for a previous + * RELEASE_MEM PWS event. + * + * "event_type" must be the same as the RELEASE_MEM PWS event. + * + * "stage_sel" determines when the waiting happens. It can be CP_PFP, CP_ME, + * PRE_SHADER, PRE_DEPTH, or PRE_PIX_SHADER, allowing to wait later in the + * pipeline instead of completely idling the hw at the frontend. + * + * "gcr_cntl" must be 0 if not waiting in PFP or ME. When waiting later in the + * pipeline, any cache flushes must be part of RELEASE_MEM, not ACQUIRE_MEM. + * + * "distance" determines how many RELEASE_MEM PWS events ago it should wait + * for, minus one (starting from 0). There are 3 event types: PS_DONE, + * CS_DONE, and TS events. 
The distance counter increments separately for each + * type, so 0 with PS_DONE means wait for the last PS_DONE event, while 0 with + * *_TS means wait for the last TS event (even if it's a different TS event + * because all TS events share the same counter). + * + * PRE_SHADER waits before the first shader that has IMAGE_OP=1, while + * PRE_PIX_SHADER waits before PS if it has IMAGE_OP=1 (IMAGE_OP should really + * be called SYNC_ENABLE) PRE_DEPTH waits before depth/stencil tests. + * + * PRE_COLOR also exists but shouldn't be used because it can hang. It's + * recommended to use PRE_PIX_SHADER instead, which means all PS that have + * color exports with enabled color buffers, non-zero colormask, and non-zero + * sample mask must have IMAGE_OP=1 to enable the sync before PS. + * + * Waiting for a PWS fence that was generated by a previous IB is valid, but + * if there is an IB from another process in between and that IB also inserted + * a PWS fence, the hw will wait for the newer fence instead because the PWS + * counter was incremented. + */ +void +ac_emit_cp_acquire_mem_pws(struct ac_cmdbuf *cs, ASSERTED enum amd_gfx_level gfx_level, + ASSERTED enum amd_ip_type ip_type, uint32_t event_type, + uint32_t stage_sel, uint32_t count, + uint32_t gcr_cntl) +{ + assert(gfx_level >= GFX11 && ip_type == AMD_IP_GFX); + + const bool ts = is_ts_event(event_type); + const bool ps_done = event_type == V_028A90_PS_DONE; + const bool cs_done = event_type == V_028A90_CS_DONE; + const uint32_t counter_sel = ts ? V_580_TS_SELECT : ps_done ? 
V_580_PS_SELECT : V_580_CS_SELECT; + + assert((int)ts + (int)cs_done + (int)ps_done == 1); + assert(!gcr_cntl || stage_sel == V_580_CP_PFP || stage_sel == V_580_CP_ME); + assert(stage_sel != V_580_PRE_COLOR); + + ac_cmdbuf_begin(cs); + ac_cmdbuf_emit(PKT3(PKT3_ACQUIRE_MEM, 6, 0)); + ac_cmdbuf_emit(S_580_PWS_STAGE_SEL(stage_sel) | + S_580_PWS_COUNTER_SEL(counter_sel) | + S_580_PWS_ENA2(1) | + S_580_PWS_COUNT(count)); + ac_cmdbuf_emit(0xffffffff); /* GCR_SIZE */ + ac_cmdbuf_emit(0x01ffffff); /* GCR_SIZE_HI */ + ac_cmdbuf_emit(0); /* GCR_BASE_LO */ + ac_cmdbuf_emit(0); /* GCR_BASE_HI */ + ac_cmdbuf_emit(S_585_PWS_ENA(1)); + ac_cmdbuf_emit(gcr_cntl); /* GCR_CNTL (this has no effect if PWS_STAGE_SEL isn't PFP or ME) */ + ac_cmdbuf_end(); +} + +/* Insert CS_DONE, PS_DONE, or a *_TS event into the pipeline, which will + * signal after the work indicated by the event is complete, which optionally + * includes flushing caches using "gcr_cntl" after the completion of the work. + * *_TS events are always signaled at the end of the pipeline, while CS_DONE + * and PS_DONE are signaled when those shaders finish. This call only inserts + * the event into the pipeline. It doesn't wait for anything and it doesn't + * execute anything immediately. The only way to wait for the event completion + * is to call si_cp_acquire_mem_pws with the same "event_type". + */ +void +ac_emit_cp_release_mem_pws(struct ac_cmdbuf *cs, ASSERTED enum amd_gfx_level gfx_level, + ASSERTED enum amd_ip_type ip_type, uint32_t event_type, + uint32_t gcr_cntl) +{ + assert(gfx_level >= GFX11 && ip_type == AMD_IP_GFX); + + /* Extract GCR_CNTL fields because the encoding is different in RELEASE_MEM. 
*/ + assert(G_586_GLI_INV(gcr_cntl) == 0); + assert(G_586_GL1_RANGE(gcr_cntl) == 0); + const uint32_t glm_wb = G_586_GLM_WB(gcr_cntl); + const uint32_t glm_inv = G_586_GLM_INV(gcr_cntl); + const uint32_t glk_wb = G_586_GLK_WB(gcr_cntl); + const uint32_t glk_inv = G_586_GLK_INV(gcr_cntl); + const uint32_t glv_inv = G_586_GLV_INV(gcr_cntl); + const uint32_t gl1_inv = G_586_GL1_INV(gcr_cntl); + assert(G_586_GL2_US(gcr_cntl) == 0); + assert(G_586_GL2_RANGE(gcr_cntl) == 0); + assert(G_586_GL2_DISCARD(gcr_cntl) == 0); + const uint32_t gl2_inv = G_586_GL2_INV(gcr_cntl); + const uint32_t gl2_wb = G_586_GL2_WB(gcr_cntl); + const uint32_t gcr_seq = G_586_SEQ(gcr_cntl); + const bool ts = is_ts_event(event_type); + + ac_cmdbuf_begin(cs); + ac_cmdbuf_emit(PKT3(PKT3_RELEASE_MEM, 6, 0)); + ac_cmdbuf_emit(S_490_EVENT_TYPE(event_type) | + S_490_EVENT_INDEX(ts ? 5 : 6) | + S_490_GLM_WB(glm_wb) | + S_490_GLM_INV(glm_inv) | + S_490_GLV_INV(glv_inv) | + S_490_GL1_INV(gl1_inv) | + S_490_GL2_INV(gl2_inv) | + S_490_GL2_WB(gl2_wb) | + S_490_SEQ(gcr_seq) | + S_490_GLK_WB(glk_wb) | + S_490_GLK_INV(glk_inv) | + S_490_PWS_ENABLE(1)); + ac_cmdbuf_emit(0); /* DST_SEL, INT_SEL, DATA_SEL */ + ac_cmdbuf_emit(0); /* ADDRESS_LO */ + ac_cmdbuf_emit(0); /* ADDRESS_HI */ + ac_cmdbuf_emit(0); /* DATA_LO */ + ac_cmdbuf_emit(0); /* DATA_HI */ + ac_cmdbuf_emit(0); /* INT_CTXID */ + ac_cmdbuf_end(); +} + +void +ac_emit_cp_copy_data(struct ac_cmdbuf *cs, uint32_t src_sel, uint32_t dst_sel, + uint64_t src_va, uint64_t dst_va, + enum ac_cp_copy_data_flags flags, bool predicate) +{ + uint32_t dword0 = COPY_DATA_SRC_SEL(src_sel) | + COPY_DATA_DST_SEL(dst_sel); + + if (flags & AC_CP_COPY_DATA_WR_CONFIRM) + dword0 |= COPY_DATA_WR_CONFIRM; + if (flags & AC_CP_COPY_DATA_COUNT_SEL) + dword0 |= COPY_DATA_COUNT_SEL; + if (flags & AC_CP_COPY_DATA_ENGINE_PFP) + dword0 |= COPY_DATA_ENGINE_PFP; + + ac_cmdbuf_begin(cs); + ac_cmdbuf_emit(PKT3(PKT3_COPY_DATA, 4, predicate)); + ac_cmdbuf_emit(dword0); + ac_cmdbuf_emit(src_va); 
+ ac_cmdbuf_emit(src_va >> 32); + ac_cmdbuf_emit(dst_va); + ac_cmdbuf_emit(dst_va >> 32); + ac_cmdbuf_end(); +} + +void +ac_emit_cp_pfp_sync_me(struct ac_cmdbuf *cs, bool predicate) +{ + ac_cmdbuf_begin(cs); + ac_cmdbuf_emit(PKT3(PKT3_PFP_SYNC_ME, 0, predicate)); + ac_cmdbuf_emit(0); + ac_cmdbuf_end(); +} + +void +ac_emit_cp_set_predication(struct ac_cmdbuf *cs, enum amd_gfx_level gfx_level, + uint64_t va, uint32_t op) +{ + ac_cmdbuf_begin(cs); + if (gfx_level >= GFX9) { + ac_cmdbuf_emit(PKT3(PKT3_SET_PREDICATION, 2, 0)); + ac_cmdbuf_emit(op); + ac_cmdbuf_emit(va); + ac_cmdbuf_emit(va >> 32); + } else { + ac_cmdbuf_emit(PKT3(PKT3_SET_PREDICATION, 1, 0)); + ac_cmdbuf_emit(va); + ac_cmdbuf_emit(op | ((va >> 32) & 0xFF)); + } + ac_cmdbuf_end(); +} + +void +ac_emit_cp_gfx11_ge_rings(struct ac_cmdbuf *cs, const struct radeon_info *info, + uint64_t attr_ring_va, bool enable_gfx12_partial_hiz_wa) +{ + assert(info->gfx_level >= GFX11); + assert((attr_ring_va >> 32) == info->address32_hi); + + ac_cmdbuf_begin(cs); + + ac_cmdbuf_set_uconfig_reg_seq(R_031110_SPI_GS_THROTTLE_CNTL1, 4); + ac_cmdbuf_emit(0x12355123); + ac_cmdbuf_emit(0x1544D); + ac_cmdbuf_emit(attr_ring_va >> 16); + ac_cmdbuf_emit(S_03111C_MEM_SIZE((info->attribute_ring_size_per_se >> 16) - 1) | + S_03111C_BIG_PAGE(info->discardable_allows_big_page) | + S_03111C_L1_POLICY(1)); + + if (info->gfx_level >= GFX12) { + const uint64_t pos_va = attr_ring_va + info->pos_ring_offset; + const uint64_t prim_va = attr_ring_va + info->prim_ring_offset; + + /* When one of these 4 registers is updated, all 4 must be updated. 
*/ + ac_cmdbuf_set_uconfig_reg_seq(R_0309A0_GE_POS_RING_BASE, 4); + ac_cmdbuf_emit(pos_va >> 16); + ac_cmdbuf_emit(S_0309A4_MEM_SIZE(info->pos_ring_size_per_se >> 5)); + ac_cmdbuf_emit(prim_va >> 16); + ac_cmdbuf_emit(S_0309AC_MEM_SIZE(info->prim_ring_size_per_se >> 5) | + S_0309AC_SCOPE(gfx12_scope_device) | + S_0309AC_PAF_TEMPORAL(gfx12_store_high_temporal_stay_dirty) | + S_0309AC_PAB_TEMPORAL(gfx12_load_last_use_discard) | + S_0309AC_SPEC_DATA_READ(gfx12_spec_read_auto) | + S_0309AC_FORCE_SE_SCOPE(1) | + S_0309AC_PAB_NOFILL(1)); + + if (info->gfx_level == GFX12 && info->pfp_fw_version >= 2680) { + /* Mitigate the HiZ GPU hang by increasing a timeout when + * BOTTOM_OF_PIPE_TS is used as the workaround. This must be emitted + * when the gfx queue is idle. + */ + const uint32_t timeout = enable_gfx12_partial_hiz_wa ? 0xfff : 0; + + ac_cmdbuf_emit(PKT3(PKT3_UPDATE_DB_SUMMARIZER_TIMEOUT, 0, 0)); + ac_cmdbuf_emit(S_EF1_SUMM_CNTL_EVICT_TIMEOUT(timeout)); + } + } + + ac_cmdbuf_end(); +} + +void +ac_emit_cp_tess_rings(struct ac_cmdbuf *cs, const struct radeon_info *info, + uint64_t attr_ring_va) +{ + const uint64_t va = attr_ring_va + info->tess_offchip_ring_size; + uint32_t tf_ring_size = info->tess_factor_ring_size / 4; + + if (info->gfx_level >= GFX11) { + /* TF_RING_SIZE is per SE on GFX11. 
*/ + tf_ring_size /= info->max_se; + } + + assert((tf_ring_size & C_030938_SIZE) == 0); + + ac_cmdbuf_begin(cs); + + if (info->gfx_level >= GFX7) { + ac_cmdbuf_set_uconfig_reg_seq(R_030938_VGT_TF_RING_SIZE, 3); + ac_cmdbuf_emit(S_030938_SIZE(tf_ring_size)); + ac_cmdbuf_emit(info->hs_offchip_param); + ac_cmdbuf_emit(va >> 8); + + if (info->gfx_level >= GFX12) { + ac_cmdbuf_set_uconfig_reg(R_03099C_VGT_TF_MEMORY_BASE_HI, S_03099C_BASE_HI(va >> 40)); + } else if (info->gfx_level >= GFX10) { + ac_cmdbuf_set_uconfig_reg(R_030984_VGT_TF_MEMORY_BASE_HI, S_030984_BASE_HI(va >> 40)); + } else if (info->gfx_level == GFX9) { + ac_cmdbuf_set_uconfig_reg(R_030944_VGT_TF_MEMORY_BASE_HI, S_030944_BASE_HI(va >> 40)); + } + } else { + ac_cmdbuf_set_config_reg(R_008988_VGT_TF_RING_SIZE, S_008988_SIZE(tf_ring_size)); + ac_cmdbuf_set_config_reg(R_0089B8_VGT_TF_MEMORY_BASE, va >> 8); + ac_cmdbuf_set_config_reg(R_0089B0_VGT_HS_OFFCHIP_PARAM, info->hs_offchip_param); + } + + ac_cmdbuf_end(); +} + +void +ac_emit_cp_gfx_scratch(struct ac_cmdbuf *cs, enum amd_gfx_level gfx_level, + uint64_t va, uint32_t size) +{ + ac_cmdbuf_begin(cs); + + if (gfx_level >= GFX11) { + ac_cmdbuf_set_context_reg_seq(R_0286E8_SPI_TMPRING_SIZE, 3); + ac_cmdbuf_emit(size); + ac_cmdbuf_emit(va >> 8); + ac_cmdbuf_emit(va >> 40); + } else { + ac_cmdbuf_set_context_reg(R_0286E8_SPI_TMPRING_SIZE, size); + } + + ac_cmdbuf_end(); +} + +/* Execute plain ACQUIRE_MEM that just flushes caches. This optionally waits + * for idle on older chips. "engine" determines whether to sync in PFP or ME. + */ +void +ac_emit_cp_acquire_mem(struct ac_cmdbuf *cs, enum amd_gfx_level gfx_level, + enum amd_ip_type ip_type, uint32_t engine, + uint32_t gcr_cntl) +{ + assert(engine == V_580_CP_PFP || engine == V_580_CP_ME); + assert(gcr_cntl); + + ac_cmdbuf_begin(cs); + + if (gfx_level >= GFX10) { + /* ACQUIRE_MEM in PFP is implemented as ACQUIRE_MEM in ME + PFP_SYNC_ME. */ + const uint32_t engine_flag = engine == V_580_CP_ME ? 
BITFIELD_BIT(31) : 0; + + /* Flush caches. This doesn't wait for idle. */ + ac_cmdbuf_emit(PKT3(PKT3_ACQUIRE_MEM, 6, 0)); + ac_cmdbuf_emit(engine_flag); /* which engine to use */ + ac_cmdbuf_emit(0xffffffff); /* CP_COHER_SIZE */ + ac_cmdbuf_emit(0x01ffffff); /* CP_COHER_SIZE_HI */ + ac_cmdbuf_emit(0); /* CP_COHER_BASE */ + ac_cmdbuf_emit(0); /* CP_COHER_BASE_HI */ + ac_cmdbuf_emit(0x0000000A); /* POLL_INTERVAL */ + ac_cmdbuf_emit(gcr_cntl); /* GCR_CNTL */ + } else { + const bool is_mec = gfx_level >= GFX7 && ip_type == AMD_IP_COMPUTE; + + if (gfx_level == GFX9 || is_mec) { + /* Flush caches and wait for the caches to assert idle. */ + ac_cmdbuf_emit(PKT3(PKT3_ACQUIRE_MEM, 5, 0) | PKT3_SHADER_TYPE_S(is_mec)); + ac_cmdbuf_emit(gcr_cntl); /* CP_COHER_CNTL */ + ac_cmdbuf_emit(0xffffffff); /* CP_COHER_SIZE */ + ac_cmdbuf_emit(0xffffff); /* CP_COHER_SIZE_HI */ + ac_cmdbuf_emit(0); /* CP_COHER_BASE */ + ac_cmdbuf_emit(0); /* CP_COHER_BASE_HI */ + ac_cmdbuf_emit(0x0000000A); /* POLL_INTERVAL */ + } else { + /* ACQUIRE_MEM is only required on the compute ring. 
*/ + ac_cmdbuf_emit(PKT3(PKT3_SURFACE_SYNC, 3, 0)); + ac_cmdbuf_emit(gcr_cntl); /* CP_COHER_CNTL */ + ac_cmdbuf_emit(0xffffffff); /* CP_COHER_SIZE */ + ac_cmdbuf_emit(0); /* CP_COHER_BASE */ + ac_cmdbuf_emit(0x0000000A); /* POLL_INTERVAL */ + } + } + + ac_cmdbuf_end(); +} + +void +ac_emit_cp_atomic_mem(struct ac_cmdbuf *cs, uint32_t atomic_op, + uint32_t atomic_cmd, uint64_t va, uint64_t data, + uint64_t compare_data) +{ + ac_cmdbuf_begin(cs); + ac_cmdbuf_emit(PKT3(PKT3_ATOMIC_MEM, 7, 0)); + ac_cmdbuf_emit(ATOMIC_OP(atomic_op) | + ATOMIC_COMMAND(atomic_cmd)); + ac_cmdbuf_emit(va); /* addr lo */ + ac_cmdbuf_emit(va >> 32); /* addr hi */ + ac_cmdbuf_emit(data); /* data lo */ + ac_cmdbuf_emit(data >> 32); /* data hi */ + ac_cmdbuf_emit(compare_data); /* compare data lo */ + ac_cmdbuf_emit(compare_data >> 32); /* compare data hi */ + ac_cmdbuf_emit(10); /* loop interval */ + ac_cmdbuf_end(); +} + +void +ac_emit_cp_nop(struct ac_cmdbuf *cs, uint32_t value) +{ + ac_cmdbuf_begin(cs); + ac_cmdbuf_emit(PKT3(PKT3_NOP, 0, 0)); + ac_cmdbuf_emit(value); + ac_cmdbuf_end(); +} + +void +ac_emit_cp_load_context_reg_index(struct ac_cmdbuf *cs, uint32_t reg, + uint32_t reg_count, uint64_t va, + bool predicate) +{ + assert(reg_count); + + ac_cmdbuf_begin(cs); + ac_cmdbuf_emit(PKT3(PKT3_LOAD_CONTEXT_REG_INDEX, 3, predicate)); + ac_cmdbuf_emit(va); + ac_cmdbuf_emit(va >> 32); + ac_cmdbuf_emit((reg - SI_CONTEXT_REG_OFFSET) >> 2); + ac_cmdbuf_emit(reg_count); /* in DWORDS */ + ac_cmdbuf_end(); +} diff --git a/src/amd/common/ac_cmdbuf_cp.h b/src/amd/common/ac_cmdbuf_cp.h new file mode 100644 index 00000000000..7bc373a92ef --- /dev/null +++ b/src/amd/common/ac_cmdbuf_cp.h @@ -0,0 +1,109 @@ +/* + * Copyright 2012 Advanced Micro Devices, Inc. 
#include <stdbool.h> +#include <stdint.h>
+ac_emit_cp_tess_rings(struct ac_cmdbuf *cs, const struct radeon_info *info, + uint64_t va); + +void +ac_emit_cp_gfx_scratch(struct ac_cmdbuf *cs, enum amd_gfx_level gfx_level, + uint64_t va, uint32_t size); + +void +ac_emit_cp_acquire_mem(struct ac_cmdbuf *cs, enum amd_gfx_level gfx_level, + enum amd_ip_type ip_type, uint32_t engine, + uint32_t gcr_cntl); + +void +ac_emit_cp_atomic_mem(struct ac_cmdbuf *cs, uint32_t atomic_op, + uint32_t atomic_cmd, uint64_t va, uint64_t data, + uint64_t compare_data); + +void +ac_emit_cp_nop(struct ac_cmdbuf *cs, uint32_t value); + +void +ac_emit_cp_load_context_reg_index(struct ac_cmdbuf *cs, uint32_t reg, + uint32_t reg_count, uint64_t va, + bool predicate); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/amd/common/meson.build b/src/amd/common/meson.build index fb28b475f0e..de5e8ba7af7 100644 --- a/src/amd/common/meson.build +++ b/src/amd/common/meson.build @@ -56,6 +56,8 @@ amd_common_files = files( 'ac_binary.h', 'ac_cmdbuf.c', 'ac_cmdbuf.h', + 'ac_cmdbuf_cp.c', + 'ac_cmdbuf_cp.h', 'ac_cmdbuf_sdma.c', 'ac_cmdbuf_sdma.h', 'ac_shader_args.c', diff --git a/src/amd/vulkan/radv_cs.h b/src/amd/vulkan/radv_cs.h index e50718ab531..d91b0fca84f 100644 --- a/src/amd/vulkan/radv_cs.h +++ b/src/amd/vulkan/radv_cs.h @@ -17,6 +17,7 @@ #include "radv_sdma.h" #include "sid.h" +#include "ac_cmdbuf_cp.h" #include "ac_cmdbuf_sdma.h" static inline unsigned diff --git a/src/gallium/drivers/radeonsi/si_build_pm4.h b/src/gallium/drivers/radeonsi/si_build_pm4.h index 806547fad5a..ed279429f66 100644 --- a/src/gallium/drivers/radeonsi/si_build_pm4.h +++ b/src/gallium/drivers/radeonsi/si_build_pm4.h @@ -11,6 +11,8 @@ #ifndef SI_BUILD_PM4_H #define SI_BUILD_PM4_H +#include "ac_cmdbuf_cp.h" + #include "si_pipe.h" #include "sid.h"