From ed7f9df864ba6fb9338ad4dccc2f0eb96fa431d4 Mon Sep 17 00:00:00 2001 From: Samuel Pitoiset Date: Wed, 15 Oct 2025 10:24:07 +0200 Subject: [PATCH] amd: add a predicate parameter to ac_emit_cp_copy_data() Signed-off-by: Samuel Pitoiset Part-of: --- src/amd/common/ac_cmdbuf.c | 4 +-- src/amd/common/ac_cmdbuf.h | 2 +- src/amd/vulkan/radv_cmd_buffer.c | 30 +++++++++++-------- src/amd/vulkan/radv_perfcounter.c | 2 +- src/amd/vulkan/radv_query.c | 6 ++-- src/amd/vulkan/radv_queue.c | 6 ++-- src/gallium/drivers/radeonsi/si_cp_dma.c | 2 +- src/gallium/drivers/radeonsi/si_perfcounter.c | 4 +-- 8 files changed, 30 insertions(+), 26 deletions(-) diff --git a/src/amd/common/ac_cmdbuf.c b/src/amd/common/ac_cmdbuf.c index d272f910647..633eb7a6bcf 100644 --- a/src/amd/common/ac_cmdbuf.c +++ b/src/amd/common/ac_cmdbuf.c @@ -1040,7 +1040,7 @@ ac_emit_cp_release_mem_pws(struct ac_cmdbuf *cs, ASSERTED enum amd_gfx_level gfx void ac_emit_cp_copy_data(struct ac_cmdbuf *cs, uint32_t src_sel, uint32_t dst_sel, uint64_t src_va, uint64_t dst_va, - enum ac_cp_copy_data_flags flags) + enum ac_cp_copy_data_flags flags, bool predicate) { uint32_t dword0 = COPY_DATA_SRC_SEL(src_sel) | COPY_DATA_DST_SEL(dst_sel); @@ -1053,7 +1053,7 @@ ac_emit_cp_copy_data(struct ac_cmdbuf *cs, uint32_t src_sel, uint32_t dst_sel, dword0 |= COPY_DATA_ENGINE_PFP; ac_cmdbuf_begin(cs); - ac_cmdbuf_emit(PKT3(PKT3_COPY_DATA, 4, 0)); + ac_cmdbuf_emit(PKT3(PKT3_COPY_DATA, 4, predicate)); ac_cmdbuf_emit(dword0); ac_cmdbuf_emit(src_va); ac_cmdbuf_emit(src_va >> 32); diff --git a/src/amd/common/ac_cmdbuf.h b/src/amd/common/ac_cmdbuf.h index 019558a40ec..9b072a9500f 100644 --- a/src/amd/common/ac_cmdbuf.h +++ b/src/amd/common/ac_cmdbuf.h @@ -152,7 +152,7 @@ enum ac_cp_copy_data_flags { void ac_emit_cp_copy_data(struct ac_cmdbuf *cs, uint32_t src_sel, uint32_t dst_sel, uint64_t src_va, uint64_t dst_va, - enum ac_cp_copy_data_flags flags); + enum ac_cp_copy_data_flags flags, bool predicate); void ac_emit_cp_pfp_sync_me(struct ac_cmdbuf *cs, bool predicate); diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c index a7750bd903f..359889a497d 100644 --- a/src/amd/vulkan/radv_cmd_buffer.c +++ b/src/amd/vulkan/radv_cmd_buffer.c @@ -5065,7 +5065,7 @@ radv_load_ds_clear_metadata(struct radv_cmd_buffer *cmd_buffer, const struct rad ac_emit_cp_load_context_reg_index(cs->b, reg, reg_count, va, false); } else { ac_emit_cp_copy_data(cs->b, COPY_DATA_SRC_MEM, COPY_DATA_REG, va, reg >> 2, - (reg_count == 2 ? AC_CP_COPY_DATA_COUNT_SEL : 0)); + (reg_count == 2 ? AC_CP_COPY_DATA_COUNT_SEL : 0), false); ac_emit_cp_pfp_sync_me(cs->b, false); } @@ -10063,7 +10063,8 @@ radv_emit_copy_data_imm(const struct radv_physical_device *pdev, struct radv_cmd uint64_t dst_va) { ac_emit_cp_copy_data(cs->b, COPY_DATA_IMM, COPY_DATA_DST_MEM, src_imm, dst_va, - AC_CP_COPY_DATA_WR_CONFIRM | (pdev->info.gfx_level == GFX6 ? AC_CP_COPY_DATA_ENGINE_PFP : 0)); + AC_CP_COPY_DATA_WR_CONFIRM | (pdev->info.gfx_level == GFX6 ? AC_CP_COPY_DATA_ENGINE_PFP : 0), + false); } /** @@ -10778,7 +10779,7 @@ radv_emit_indirect_taskmesh_draw_packets(const struct radv_device *device, struc * - When workaround BO != 0 (count was 0), execute an empty direct dispatch */ ac_emit_cp_copy_data(ace_cs->b, COPY_DATA_IMM, COPY_DATA_DST_MEM, 1, workaround_cond_va, - AC_CP_COPY_DATA_WR_CONFIRM); + AC_CP_COPY_DATA_WR_CONFIRM, false); /* 2x COND_EXEC + 1x COPY_DATA + Nx DISPATCH_TASKMESH_DIRECT_ACE */ ace_predication_size += 2 * 5 + 6 + 6 * num_views; @@ -10792,7 +10793,7 @@ radv_emit_indirect_taskmesh_draw_packets(const struct radv_device *device, struc 6 + 11 * num_views /* 1x COPY_DATA + Nx DISPATCH_TASKMESH_INDIRECT_MULTI_ACE */); ac_emit_cp_copy_data(ace_cs->b, COPY_DATA_IMM, COPY_DATA_DST_MEM, 0, workaround_cond_va, - AC_CP_COPY_DATA_WR_CONFIRM); + AC_CP_COPY_DATA_WR_CONFIRM, false); } if (!view_mask) { @@ -13130,7 +13131,8 @@ radv_save_dispatch_size(struct radv_cmd_buffer *cmd_buffer, uint64_t indirect_va uint64_t va = radv_buffer_get_va(device->trace_bo) + offsetof(struct radv_trace_data, indirect_dispatch); for (uint32_t i = 0; i < 3; i++) { - ac_emit_cp_copy_data(cs->b, COPY_DATA_SRC_MEM, COPY_DATA_DST_MEM, indirect_va, va, AC_CP_COPY_DATA_WR_CONFIRM); + ac_emit_cp_copy_data(cs->b, COPY_DATA_SRC_MEM, COPY_DATA_DST_MEM, indirect_va, va, AC_CP_COPY_DATA_WR_CONFIRM, + false); indirect_va += 4; va += 4; @@ -13229,7 +13231,7 @@ radv_emit_dispatch_packets(struct radv_cmd_buffer *cmd_buffer, const struct radv const uint64_t dst_va = indirect_va + i * 4; ac_emit_cp_copy_data(cs->b, COPY_DATA_SRC_MEM, COPY_DATA_DST_MEM, src_va, dst_va, - AC_CP_COPY_DATA_WR_CONFIRM); + AC_CP_COPY_DATA_WR_CONFIRM, false); } } @@ -14650,7 +14652,8 @@ radv_begin_conditional_rendering(struct radv_cmd_buffer *cmd_buffer, uint64_t va radeon_check_space(device->ws, cs->b, 8); - ac_emit_cp_copy_data(cs->b, COPY_DATA_SRC_MEM, COPY_DATA_DST_MEM, va, emulated_va, AC_CP_COPY_DATA_WR_CONFIRM); + ac_emit_cp_copy_data(cs->b, COPY_DATA_SRC_MEM, COPY_DATA_DST_MEM, va, emulated_va, AC_CP_COPY_DATA_WR_CONFIRM, + false); ac_emit_cp_pfp_sync_me(cs->b, false); @@ -14887,12 +14890,12 @@ radv_CmdBeginTransformFeedbackEXT(VkCommandBuffer commandBuffer, uint32_t firstC if (pdev->info.gfx_level >= GFX12) { if (append) { ac_emit_cp_copy_data(cs->b, COPY_DATA_SRC_MEM, COPY_DATA_DST_MEM, va, so->state_va + i * 8 + 4, - AC_CP_COPY_DATA_WR_CONFIRM); + AC_CP_COPY_DATA_WR_CONFIRM, false); } } else if (pdev->use_ngg_streamout) { if (append) { ac_emit_cp_copy_data(cs->b, COPY_DATA_SRC_MEM, COPY_DATA_REG, va, - (R_031088_GDS_STRMOUT_DWORDS_WRITTEN_0 >> 2) + i, AC_CP_COPY_DATA_WR_CONFIRM); + (R_031088_GDS_STRMOUT_DWORDS_WRITTEN_0 >> 2) + i, AC_CP_COPY_DATA_WR_CONFIRM, false); } else { /* The PKT3 CAM bit workaround seems needed for initializing this GDS register to zero. */ radeon_begin(cs); @@ -14992,12 +14995,13 @@ radv_CmdEndTransformFeedbackEXT(VkCommandBuffer commandBuffer, uint32_t firstCou if (pdev->info.gfx_level >= GFX12) { if (append) { ac_emit_cp_copy_data(cs->b, COPY_DATA_SRC_MEM, COPY_DATA_DST_MEM, so->state_va + i * 8 + 4, va, - AC_CP_COPY_DATA_WR_CONFIRM); + AC_CP_COPY_DATA_WR_CONFIRM, false); } } else if (pdev->use_ngg_streamout) { if (append) { ac_emit_cp_copy_data(cs->b, COPY_DATA_REG, COPY_DATA_DST_MEM, - (R_031088_GDS_STRMOUT_DWORDS_WRITTEN_0 >> 2) + i, va, AC_CP_COPY_DATA_WR_CONFIRM); + (R_031088_GDS_STRMOUT_DWORDS_WRITTEN_0 >> 2) + i, va, AC_CP_COPY_DATA_WR_CONFIRM, + false); } } else { radeon_begin(cs); @@ -15063,7 +15067,7 @@ radv_emit_strmout_buffer(struct radv_cmd_buffer *cmd_buffer, const struct radv_d draw_info->strmout_va, false); } else { ac_emit_cp_copy_data(cs->b, COPY_DATA_SRC_MEM, COPY_DATA_REG, draw_info->strmout_va, - R_028B2C_VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE >> 2, AC_CP_COPY_DATA_WR_CONFIRM); + R_028B2C_VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE >> 2, AC_CP_COPY_DATA_WR_CONFIRM, false); } } @@ -15132,7 +15136,7 @@ radv_CmdWriteBufferMarker2AMD(VkCommandBuffer commandBuffer, VkPipelineStageFlag ASSERTED unsigned cdw_max = radeon_check_space(device->ws, cs->b, 12); if (!(stage & ~VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT)) { - ac_emit_cp_copy_data(cs->b, COPY_DATA_IMM, COPY_DATA_DST_MEM, marker, va, AC_CP_COPY_DATA_WR_CONFIRM); + ac_emit_cp_copy_data(cs->b, COPY_DATA_IMM, COPY_DATA_DST_MEM, marker, va, AC_CP_COPY_DATA_WR_CONFIRM, false); } else { radv_cs_emit_write_event_eop(cs, pdev->info.gfx_level, V_028A90_BOTTOM_OF_PIPE_TS, 0, EOP_DST_SEL_MEM, EOP_INT_SEL_SEND_DATA_AFTER_WR_CONFIRM, EOP_DATA_SEL_VALUE_32BIT, va, marker, diff --git a/src/amd/vulkan/radv_perfcounter.c b/src/amd/vulkan/radv_perfcounter.c index b27aeff43d5..2fd7710145c 100644 --- a/src/amd/vulkan/radv_perfcounter.c +++ b/src/amd/vulkan/radv_perfcounter.c @@ -561,7 +561,7 @@ radv_pc_emit_block_instance_read(struct radv_cmd_buffer *cmd_buffer, struct ac_p reg = regs->counters[idx]; ac_emit_cp_copy_data(cs->b, COPY_DATA_PERF, COPY_DATA_TC_L2, reg >> 2, va, - AC_CP_COPY_DATA_WR_CONFIRM | AC_CP_COPY_DATA_COUNT_SEL); + AC_CP_COPY_DATA_WR_CONFIRM | AC_CP_COPY_DATA_COUNT_SEL, false); va += sizeof(uint64_t) * 2 * radv_pc_get_num_instances(pdev, block); reg += reg_delta; diff --git a/src/amd/vulkan/radv_query.c b/src/amd/vulkan/radv_query.c index a5cd0968e44..d20cc8aaa09 100644 --- a/src/amd/vulkan/radv_query.c +++ b/src/amd/vulkan/radv_query.c @@ -37,7 +37,7 @@ static void radv_query_shader(struct radv_cmd_buffer *cmd_buffer, VkQueryType qu static void gfx10_copy_shader_query(struct radv_cmd_stream *cs, uint32_t src_sel, uint64_t src_va, uint64_t dst_va) { - ac_emit_cp_copy_data(cs->b, src_sel, COPY_DATA_DST_MEM, src_va, dst_va, AC_CP_COPY_DATA_WR_CONFIRM); + ac_emit_cp_copy_data(cs->b, src_sel, COPY_DATA_DST_MEM, src_va, dst_va, AC_CP_COPY_DATA_WR_CONFIRM, false); } static void @@ -2714,7 +2714,7 @@ radv_write_timestamp(struct radv_cmd_buffer *cmd_buffer, uint64_t va, VkPipeline if (stage == VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT) { ac_emit_cp_copy_data(cs->b, COPY_DATA_TIMESTAMP, COPY_DATA_DST_MEM, 0, va, - AC_CP_COPY_DATA_WR_CONFIRM | AC_CP_COPY_DATA_COUNT_SEL); + AC_CP_COPY_DATA_WR_CONFIRM | AC_CP_COPY_DATA_COUNT_SEL, false); } else { radv_cs_emit_write_event_eop(cs, pdev->info.gfx_level, V_028A90_BOTTOM_OF_PIPE_TS, 0, EOP_DST_SEL_MEM, EOP_INT_SEL_SEND_DATA_AFTER_WR_CONFIRM, EOP_DATA_SEL_TIMESTAMP, va, 0, @@ -2815,7 +2815,7 @@ radv_CmdWriteAccelerationStructuresPropertiesKHR(VkCommandBuffer commandBuffer, } ac_emit_cp_copy_data(cs->b, COPY_DATA_SRC_MEM, COPY_DATA_DST_MEM, va, query_va, - AC_CP_COPY_DATA_WR_CONFIRM | AC_CP_COPY_DATA_COUNT_SEL); + AC_CP_COPY_DATA_WR_CONFIRM | AC_CP_COPY_DATA_COUNT_SEL, false); query_va += pool->stride; } diff --git a/src/amd/vulkan/radv_queue.c b/src/amd/vulkan/radv_queue.c index 29850aca98c..a14a31ca4d9 100644 --- a/src/amd/vulkan/radv_queue.c +++ b/src/amd/vulkan/radv_queue.c @@ -1522,16 +1522,16 @@ radv_create_perf_counter_lock_cs(struct radv_device *device, unsigned pass, bool uint64_t set_va = va + (unlock ? 0 : 8 * pass); ac_emit_cp_copy_data(cs->b, COPY_DATA_IMM, COPY_DATA_DST_MEM, 0, unset_va, - AC_CP_COPY_DATA_COUNT_SEL | AC_CP_COPY_DATA_WR_CONFIRM); + AC_CP_COPY_DATA_COUNT_SEL | AC_CP_COPY_DATA_WR_CONFIRM, false); ac_emit_cp_copy_data(cs->b, COPY_DATA_IMM, COPY_DATA_DST_MEM, 1, set_va, - AC_CP_COPY_DATA_COUNT_SEL | AC_CP_COPY_DATA_WR_CONFIRM); + AC_CP_COPY_DATA_COUNT_SEL | AC_CP_COPY_DATA_WR_CONFIRM, false); if (unlock) { uint64_t mutex_va = radv_buffer_get_va(device->perf_counter_bo) + PERF_CTR_BO_LOCK_OFFSET; ac_emit_cp_copy_data(cs->b, COPY_DATA_IMM, COPY_DATA_DST_MEM, 0, mutex_va, - AC_CP_COPY_DATA_COUNT_SEL | AC_CP_COPY_DATA_WR_CONFIRM); + AC_CP_COPY_DATA_COUNT_SEL | AC_CP_COPY_DATA_WR_CONFIRM, false); } assert(cs->b->cdw <= cdw); diff --git a/src/gallium/drivers/radeonsi/si_cp_dma.c b/src/gallium/drivers/radeonsi/si_cp_dma.c index 48e10d610e8..c8884597e26 100644 --- a/src/gallium/drivers/radeonsi/si_cp_dma.c +++ b/src/gallium/drivers/radeonsi/si_cp_dma.c @@ -380,5 +380,5 @@ void si_cp_copy_data(struct si_context *sctx, struct radeon_cmdbuf *cs, unsigned uint64_t src_va = (src ? src->gpu_address : 0ull) + src_offset; ac_emit_cp_copy_data(&cs->current, src_sel, dst_sel, src_va, dst_va, - AC_CP_COPY_DATA_WR_CONFIRM); + AC_CP_COPY_DATA_WR_CONFIRM, false); } diff --git a/src/gallium/drivers/radeonsi/si_perfcounter.c b/src/gallium/drivers/radeonsi/si_perfcounter.c index 326fc22adee..e2361ef14db 100644 --- a/src/gallium/drivers/radeonsi/si_perfcounter.c +++ b/src/gallium/drivers/radeonsi/si_perfcounter.c @@ -235,7 +235,7 @@ static void si_pc_emit_read(struct si_context *sctx, struct ac_pc_block *block, reg = regs->counters[idx]; ac_emit_cp_copy_data(&cs->current, COPY_DATA_PERF, COPY_DATA_DST_MEM, - reg >> 2, va, AC_CP_COPY_DATA_COUNT_SEL); + reg >> 2, va, AC_CP_COPY_DATA_COUNT_SEL, false); va += sizeof(uint64_t); reg += reg_delta; } @@ -243,7 +243,7 @@ static void si_pc_emit_read(struct si_context *sctx, struct ac_pc_block *block, /* Fake counters. */ for (idx = 0; idx < count; ++idx) { ac_emit_cp_copy_data(&cs->current, COPY_DATA_IMM, COPY_DATA_DST_MEM, - 0, va, AC_CP_COPY_DATA_COUNT_SEL); + 0, va, AC_CP_COPY_DATA_COUNT_SEL, false); va += sizeof(uint64_t); } }