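amd: add a predicate parameter to ac_emit_cp_copy_data()

The new parameter is forwarded to the PKT3_COPY_DATA packet header in
place of the previously hard-coded 0. Every caller updated in this commit
passes false, so the emitted packets are unchanged.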

Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/37881>
This commit is contained in:
Samuel Pitoiset 2025-10-15 10:24:07 +02:00
parent 29c2d02d64
commit ed7f9df864
8 changed files with 30 additions and 26 deletions

View file

@ -1040,7 +1040,7 @@ ac_emit_cp_release_mem_pws(struct ac_cmdbuf *cs, ASSERTED enum amd_gfx_level gfx
void
ac_emit_cp_copy_data(struct ac_cmdbuf *cs, uint32_t src_sel, uint32_t dst_sel,
uint64_t src_va, uint64_t dst_va,
enum ac_cp_copy_data_flags flags)
enum ac_cp_copy_data_flags flags, bool predicate)
{
uint32_t dword0 = COPY_DATA_SRC_SEL(src_sel) |
COPY_DATA_DST_SEL(dst_sel);
@ -1053,7 +1053,7 @@ ac_emit_cp_copy_data(struct ac_cmdbuf *cs, uint32_t src_sel, uint32_t dst_sel,
dword0 |= COPY_DATA_ENGINE_PFP;
ac_cmdbuf_begin(cs);
ac_cmdbuf_emit(PKT3(PKT3_COPY_DATA, 4, 0));
ac_cmdbuf_emit(PKT3(PKT3_COPY_DATA, 4, predicate));
ac_cmdbuf_emit(dword0);
ac_cmdbuf_emit(src_va);
ac_cmdbuf_emit(src_va >> 32);

View file

@ -152,7 +152,7 @@ enum ac_cp_copy_data_flags {
void
ac_emit_cp_copy_data(struct ac_cmdbuf *cs, uint32_t src_sel, uint32_t dst_sel,
uint64_t src_va, uint64_t dst_va,
enum ac_cp_copy_data_flags flags);
enum ac_cp_copy_data_flags flags, bool predicate);
void
ac_emit_cp_pfp_sync_me(struct ac_cmdbuf *cs, bool predicate);

View file

@ -5065,7 +5065,7 @@ radv_load_ds_clear_metadata(struct radv_cmd_buffer *cmd_buffer, const struct rad
ac_emit_cp_load_context_reg_index(cs->b, reg, reg_count, va, false);
} else {
ac_emit_cp_copy_data(cs->b, COPY_DATA_SRC_MEM, COPY_DATA_REG, va, reg >> 2,
(reg_count == 2 ? AC_CP_COPY_DATA_COUNT_SEL : 0));
(reg_count == 2 ? AC_CP_COPY_DATA_COUNT_SEL : 0), false);
ac_emit_cp_pfp_sync_me(cs->b, false);
}
@ -10063,7 +10063,8 @@ radv_emit_copy_data_imm(const struct radv_physical_device *pdev, struct radv_cmd
uint64_t dst_va)
{
ac_emit_cp_copy_data(cs->b, COPY_DATA_IMM, COPY_DATA_DST_MEM, src_imm, dst_va,
AC_CP_COPY_DATA_WR_CONFIRM | (pdev->info.gfx_level == GFX6 ? AC_CP_COPY_DATA_ENGINE_PFP : 0));
AC_CP_COPY_DATA_WR_CONFIRM | (pdev->info.gfx_level == GFX6 ? AC_CP_COPY_DATA_ENGINE_PFP : 0),
false);
}
/**
@ -10778,7 +10779,7 @@ radv_emit_indirect_taskmesh_draw_packets(const struct radv_device *device, struc
* - When workaround BO != 0 (count was 0), execute an empty direct dispatch
*/
ac_emit_cp_copy_data(ace_cs->b, COPY_DATA_IMM, COPY_DATA_DST_MEM, 1, workaround_cond_va,
AC_CP_COPY_DATA_WR_CONFIRM);
AC_CP_COPY_DATA_WR_CONFIRM, false);
/* 2x COND_EXEC + 1x COPY_DATA + Nx DISPATCH_TASKMESH_DIRECT_ACE */
ace_predication_size += 2 * 5 + 6 + 6 * num_views;
@ -10792,7 +10793,7 @@ radv_emit_indirect_taskmesh_draw_packets(const struct radv_device *device, struc
6 + 11 * num_views /* 1x COPY_DATA + Nx DISPATCH_TASKMESH_INDIRECT_MULTI_ACE */);
ac_emit_cp_copy_data(ace_cs->b, COPY_DATA_IMM, COPY_DATA_DST_MEM, 0, workaround_cond_va,
AC_CP_COPY_DATA_WR_CONFIRM);
AC_CP_COPY_DATA_WR_CONFIRM, false);
}
if (!view_mask) {
@ -13130,7 +13131,8 @@ radv_save_dispatch_size(struct radv_cmd_buffer *cmd_buffer, uint64_t indirect_va
uint64_t va = radv_buffer_get_va(device->trace_bo) + offsetof(struct radv_trace_data, indirect_dispatch);
for (uint32_t i = 0; i < 3; i++) {
ac_emit_cp_copy_data(cs->b, COPY_DATA_SRC_MEM, COPY_DATA_DST_MEM, indirect_va, va, AC_CP_COPY_DATA_WR_CONFIRM);
ac_emit_cp_copy_data(cs->b, COPY_DATA_SRC_MEM, COPY_DATA_DST_MEM, indirect_va, va, AC_CP_COPY_DATA_WR_CONFIRM,
false);
indirect_va += 4;
va += 4;
@ -13229,7 +13231,7 @@ radv_emit_dispatch_packets(struct radv_cmd_buffer *cmd_buffer, const struct radv
const uint64_t dst_va = indirect_va + i * 4;
ac_emit_cp_copy_data(cs->b, COPY_DATA_SRC_MEM, COPY_DATA_DST_MEM, src_va, dst_va,
AC_CP_COPY_DATA_WR_CONFIRM);
AC_CP_COPY_DATA_WR_CONFIRM, false);
}
}
@ -14650,7 +14652,8 @@ radv_begin_conditional_rendering(struct radv_cmd_buffer *cmd_buffer, uint64_t va
radeon_check_space(device->ws, cs->b, 8);
ac_emit_cp_copy_data(cs->b, COPY_DATA_SRC_MEM, COPY_DATA_DST_MEM, va, emulated_va, AC_CP_COPY_DATA_WR_CONFIRM);
ac_emit_cp_copy_data(cs->b, COPY_DATA_SRC_MEM, COPY_DATA_DST_MEM, va, emulated_va, AC_CP_COPY_DATA_WR_CONFIRM,
false);
ac_emit_cp_pfp_sync_me(cs->b, false);
@ -14887,12 +14890,12 @@ radv_CmdBeginTransformFeedbackEXT(VkCommandBuffer commandBuffer, uint32_t firstC
if (pdev->info.gfx_level >= GFX12) {
if (append) {
ac_emit_cp_copy_data(cs->b, COPY_DATA_SRC_MEM, COPY_DATA_DST_MEM, va, so->state_va + i * 8 + 4,
AC_CP_COPY_DATA_WR_CONFIRM);
AC_CP_COPY_DATA_WR_CONFIRM, false);
}
} else if (pdev->use_ngg_streamout) {
if (append) {
ac_emit_cp_copy_data(cs->b, COPY_DATA_SRC_MEM, COPY_DATA_REG, va,
(R_031088_GDS_STRMOUT_DWORDS_WRITTEN_0 >> 2) + i, AC_CP_COPY_DATA_WR_CONFIRM);
(R_031088_GDS_STRMOUT_DWORDS_WRITTEN_0 >> 2) + i, AC_CP_COPY_DATA_WR_CONFIRM, false);
} else {
/* The PKT3 CAM bit workaround seems needed for initializing this GDS register to zero. */
radeon_begin(cs);
@ -14992,12 +14995,13 @@ radv_CmdEndTransformFeedbackEXT(VkCommandBuffer commandBuffer, uint32_t firstCou
if (pdev->info.gfx_level >= GFX12) {
if (append) {
ac_emit_cp_copy_data(cs->b, COPY_DATA_SRC_MEM, COPY_DATA_DST_MEM, so->state_va + i * 8 + 4, va,
AC_CP_COPY_DATA_WR_CONFIRM);
AC_CP_COPY_DATA_WR_CONFIRM, false);
}
} else if (pdev->use_ngg_streamout) {
if (append) {
ac_emit_cp_copy_data(cs->b, COPY_DATA_REG, COPY_DATA_DST_MEM,
(R_031088_GDS_STRMOUT_DWORDS_WRITTEN_0 >> 2) + i, va, AC_CP_COPY_DATA_WR_CONFIRM);
(R_031088_GDS_STRMOUT_DWORDS_WRITTEN_0 >> 2) + i, va, AC_CP_COPY_DATA_WR_CONFIRM,
false);
}
} else {
radeon_begin(cs);
@ -15063,7 +15067,7 @@ radv_emit_strmout_buffer(struct radv_cmd_buffer *cmd_buffer, const struct radv_d
draw_info->strmout_va, false);
} else {
ac_emit_cp_copy_data(cs->b, COPY_DATA_SRC_MEM, COPY_DATA_REG, draw_info->strmout_va,
R_028B2C_VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE >> 2, AC_CP_COPY_DATA_WR_CONFIRM);
R_028B2C_VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE >> 2, AC_CP_COPY_DATA_WR_CONFIRM, false);
}
}
@ -15132,7 +15136,7 @@ radv_CmdWriteBufferMarker2AMD(VkCommandBuffer commandBuffer, VkPipelineStageFlag
ASSERTED unsigned cdw_max = radeon_check_space(device->ws, cs->b, 12);
if (!(stage & ~VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT)) {
ac_emit_cp_copy_data(cs->b, COPY_DATA_IMM, COPY_DATA_DST_MEM, marker, va, AC_CP_COPY_DATA_WR_CONFIRM);
ac_emit_cp_copy_data(cs->b, COPY_DATA_IMM, COPY_DATA_DST_MEM, marker, va, AC_CP_COPY_DATA_WR_CONFIRM, false);
} else {
radv_cs_emit_write_event_eop(cs, pdev->info.gfx_level, V_028A90_BOTTOM_OF_PIPE_TS, 0, EOP_DST_SEL_MEM,
EOP_INT_SEL_SEND_DATA_AFTER_WR_CONFIRM, EOP_DATA_SEL_VALUE_32BIT, va, marker,

View file

@ -561,7 +561,7 @@ radv_pc_emit_block_instance_read(struct radv_cmd_buffer *cmd_buffer, struct ac_p
reg = regs->counters[idx];
ac_emit_cp_copy_data(cs->b, COPY_DATA_PERF, COPY_DATA_TC_L2, reg >> 2, va,
AC_CP_COPY_DATA_WR_CONFIRM | AC_CP_COPY_DATA_COUNT_SEL);
AC_CP_COPY_DATA_WR_CONFIRM | AC_CP_COPY_DATA_COUNT_SEL, false);
va += sizeof(uint64_t) * 2 * radv_pc_get_num_instances(pdev, block);
reg += reg_delta;

View file

@ -37,7 +37,7 @@ static void radv_query_shader(struct radv_cmd_buffer *cmd_buffer, VkQueryType qu
static void
gfx10_copy_shader_query(struct radv_cmd_stream *cs, uint32_t src_sel, uint64_t src_va, uint64_t dst_va)
{
ac_emit_cp_copy_data(cs->b, src_sel, COPY_DATA_DST_MEM, src_va, dst_va, AC_CP_COPY_DATA_WR_CONFIRM);
ac_emit_cp_copy_data(cs->b, src_sel, COPY_DATA_DST_MEM, src_va, dst_va, AC_CP_COPY_DATA_WR_CONFIRM, false);
}
static void
@ -2714,7 +2714,7 @@ radv_write_timestamp(struct radv_cmd_buffer *cmd_buffer, uint64_t va, VkPipeline
if (stage == VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT) {
ac_emit_cp_copy_data(cs->b, COPY_DATA_TIMESTAMP, COPY_DATA_DST_MEM, 0, va,
AC_CP_COPY_DATA_WR_CONFIRM | AC_CP_COPY_DATA_COUNT_SEL);
AC_CP_COPY_DATA_WR_CONFIRM | AC_CP_COPY_DATA_COUNT_SEL, false);
} else {
radv_cs_emit_write_event_eop(cs, pdev->info.gfx_level, V_028A90_BOTTOM_OF_PIPE_TS, 0, EOP_DST_SEL_MEM,
EOP_INT_SEL_SEND_DATA_AFTER_WR_CONFIRM, EOP_DATA_SEL_TIMESTAMP, va, 0,
@ -2815,7 +2815,7 @@ radv_CmdWriteAccelerationStructuresPropertiesKHR(VkCommandBuffer commandBuffer,
}
ac_emit_cp_copy_data(cs->b, COPY_DATA_SRC_MEM, COPY_DATA_DST_MEM, va, query_va,
AC_CP_COPY_DATA_WR_CONFIRM | AC_CP_COPY_DATA_COUNT_SEL);
AC_CP_COPY_DATA_WR_CONFIRM | AC_CP_COPY_DATA_COUNT_SEL, false);
query_va += pool->stride;
}

View file

@ -1522,16 +1522,16 @@ radv_create_perf_counter_lock_cs(struct radv_device *device, unsigned pass, bool
uint64_t set_va = va + (unlock ? 0 : 8 * pass);
ac_emit_cp_copy_data(cs->b, COPY_DATA_IMM, COPY_DATA_DST_MEM, 0, unset_va,
AC_CP_COPY_DATA_COUNT_SEL | AC_CP_COPY_DATA_WR_CONFIRM);
AC_CP_COPY_DATA_COUNT_SEL | AC_CP_COPY_DATA_WR_CONFIRM, false);
ac_emit_cp_copy_data(cs->b, COPY_DATA_IMM, COPY_DATA_DST_MEM, 1, set_va,
AC_CP_COPY_DATA_COUNT_SEL | AC_CP_COPY_DATA_WR_CONFIRM);
AC_CP_COPY_DATA_COUNT_SEL | AC_CP_COPY_DATA_WR_CONFIRM, false);
if (unlock) {
uint64_t mutex_va = radv_buffer_get_va(device->perf_counter_bo) + PERF_CTR_BO_LOCK_OFFSET;
ac_emit_cp_copy_data(cs->b, COPY_DATA_IMM, COPY_DATA_DST_MEM, 0, mutex_va,
AC_CP_COPY_DATA_COUNT_SEL | AC_CP_COPY_DATA_WR_CONFIRM);
AC_CP_COPY_DATA_COUNT_SEL | AC_CP_COPY_DATA_WR_CONFIRM, false);
}
assert(cs->b->cdw <= cdw);

View file

@ -380,5 +380,5 @@ void si_cp_copy_data(struct si_context *sctx, struct radeon_cmdbuf *cs, unsigned
uint64_t src_va = (src ? src->gpu_address : 0ull) + src_offset;
ac_emit_cp_copy_data(&cs->current, src_sel, dst_sel, src_va, dst_va,
AC_CP_COPY_DATA_WR_CONFIRM);
AC_CP_COPY_DATA_WR_CONFIRM, false);
}

View file

@ -235,7 +235,7 @@ static void si_pc_emit_read(struct si_context *sctx, struct ac_pc_block *block,
reg = regs->counters[idx];
ac_emit_cp_copy_data(&cs->current, COPY_DATA_PERF, COPY_DATA_DST_MEM,
reg >> 2, va, AC_CP_COPY_DATA_COUNT_SEL);
reg >> 2, va, AC_CP_COPY_DATA_COUNT_SEL, false);
va += sizeof(uint64_t);
reg += reg_delta;
}
@ -243,7 +243,7 @@ static void si_pc_emit_read(struct si_context *sctx, struct ac_pc_block *block,
/* Fake counters. */
for (idx = 0; idx < count; ++idx) {
ac_emit_cp_copy_data(&cs->current, COPY_DATA_IMM, COPY_DATA_DST_MEM,
0, va, AC_CP_COPY_DATA_COUNT_SEL);
0, va, AC_CP_COPY_DATA_COUNT_SEL, false);
va += sizeof(uint64_t);
}
}