From 6413651bcf6481d095537aa3c188df7685440d60 Mon Sep 17 00:00:00 2001
From: Samuel Pitoiset
Date: Thu, 13 Nov 2025 18:26:04 +0100
Subject: [PATCH] ac,radv,radeonsi: add ac_emit_sdma_copy_linear()

RadeonSI wasn't considering the undocumented HW limitation apparently.

Signed-off-by: Samuel Pitoiset
Part-of:
---
 src/amd/common/ac_cmdbuf_sdma.c               | 51 ++++++++++++++++++++++
 src/amd/common/ac_cmdbuf_sdma.h               |  5 +++
 src/amd/vulkan/radv_sdma.c                    | 45 +++----------------
 .../drivers/radeonsi/si_sdma_copy_image.c     | 28 ++++--------
 4 files changed, 71 insertions(+), 58 deletions(-)

diff --git a/src/amd/common/ac_cmdbuf_sdma.c b/src/amd/common/ac_cmdbuf_sdma.c
index 0ac1379c3bd..d5b90f5cd94 100644
--- a/src/amd/common/ac_cmdbuf_sdma.c
+++ b/src/amd/common/ac_cmdbuf_sdma.c
@@ -86,3 +86,54 @@ ac_emit_sdma_constant_fill(struct ac_cmdbuf *cs, enum sdma_version sdma_ip_versi
 
    return bytes_written;
 }
+
+/**
+ * Emit a single SDMA linear copy packet (7 dwords).
+ *
+ * One packet covers at most SDMA_V5_2_COPY_MAX_BYTES on SDMA 5.2+ and
+ * SDMA_V2_0_COPY_MAX_BYTES otherwise, so callers loop: advance both addresses
+ * and shrink size by the returned byte count until nothing is left.
+ *
+ * If tmz is true, 4 is passed as the last SDMA_PACKET() argument instead of 0.
+ *
+ * Returns the number of bytes covered by the emitted packet.
+ */
+uint64_t
+ac_emit_sdma_copy_linear(struct ac_cmdbuf *cs, enum sdma_version sdma_ip_version,
+                         uint64_t src_va, uint64_t dst_va, uint64_t size,
+                         bool tmz)
+{
+   const unsigned max_size_per_packet =
+      sdma_ip_version >= SDMA_5_2 ? SDMA_V5_2_COPY_MAX_BYTES : SDMA_V2_0_COPY_MAX_BYTES;
+   /* The mask must be as wide as size: a 32-bit mask is zero-extended by the
+    * AND below and would clear the upper 32 bits of a 64-bit size.
+    */
+   uint64_t align = ~(uint64_t)0;
+
+   assert(sdma_ip_version >= SDMA_2_0);
+
+   /* SDMA FW automatically enables a faster dword copy mode when
+    * source, destination and size are all dword-aligned.
+    *
+    * When source and destination are dword-aligned, round down the size to
+    * take advantage of faster dword copy, and copy the remaining few bytes
+    * with the last copy packet.
+    */
+   if ((src_va & 0x3) == 0 && (dst_va & 0x3) == 0 && size > 4 && (size & 0x3) != 0) {
+      align = ~(uint64_t)0x3;
+   }
+
+   const uint64_t bytes_written = size >= 4 ? MIN2(size & align, max_size_per_packet) : size;
+
+   ac_cmdbuf_begin(cs);
+   ac_cmdbuf_emit(SDMA_PACKET(SDMA_OPCODE_COPY, SDMA_COPY_SUB_OPCODE_LINEAR, (tmz ? 4 : 0)));
+   ac_cmdbuf_emit(sdma_ip_version >= SDMA_4_0 ? bytes_written - 1 : bytes_written);
+   ac_cmdbuf_emit(0);
+   ac_cmdbuf_emit(src_va);
+   ac_cmdbuf_emit(src_va >> 32);
+   ac_cmdbuf_emit(dst_va);
+   ac_cmdbuf_emit(dst_va >> 32);
+   ac_cmdbuf_end();
+
+   return bytes_written;
+}
diff --git a/src/amd/common/ac_cmdbuf_sdma.h b/src/amd/common/ac_cmdbuf_sdma.h
index a8381bed566..67c771bf4ee 100644
--- a/src/amd/common/ac_cmdbuf_sdma.h
+++ b/src/amd/common/ac_cmdbuf_sdma.h
@@ -28,6 +28,11 @@ uint64_t
 ac_emit_sdma_constant_fill(struct ac_cmdbuf *cs, enum sdma_version sdma_ip_version,
                            uint64_t va, uint64_t size, uint32_t value);
 
+uint64_t
+ac_emit_sdma_copy_linear(struct ac_cmdbuf *cs, enum sdma_version sdma_ip_version,
+                         uint64_t src_va, uint64_t dst_va, uint64_t size,
+                         bool tmz);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/src/amd/vulkan/radv_sdma.c b/src/amd/vulkan/radv_sdma.c
index dc64a5b4a69..00c859361f3 100644
--- a/src/amd/vulkan/radv_sdma.c
+++ b/src/amd/vulkan/radv_sdma.c
@@ -358,49 +358,16 @@ void
 radv_sdma_copy_memory(const struct radv_device *device, struct radv_cmd_stream *cs, uint64_t src_va, uint64_t dst_va,
                       uint64_t size)
 {
-   if (size == 0)
-      return;
-
    const struct radv_physical_device *pdev = radv_device_physical(device);
 
-   const enum sdma_version ver = pdev->info.sdma_ip_version;
-   const unsigned max_size_per_packet = ver >= SDMA_5_2 ? SDMA_V5_2_COPY_MAX_BYTES : SDMA_V2_0_COPY_MAX_BYTES;
-   unsigned align = ~0u;
-   unsigned ncopy = DIV_ROUND_UP(size, max_size_per_packet);
+   while (size > 0) {
+      radeon_check_space(device->ws, cs->b, 7);
+      uint64_t bytes_written = ac_emit_sdma_copy_linear(cs->b, pdev->info.sdma_ip_version, src_va, dst_va, size, false);
 
-   assert(ver >= SDMA_2_0);
-
-   /* SDMA FW automatically enables a faster dword copy mode when
-    * source, destination and size are all dword-aligned.
-    *
-    * When source and destination are dword-aligned, round down the size to
-    * take advantage of faster dword copy, and copy the remaining few bytes
-    * with the last copy packet.
-    */
-   if ((src_va & 0x3) == 0 && (dst_va & 0x3) == 0 && size > 4 && (size & 0x3) != 0) {
-      align = ~0x3u;
-      ncopy++;
+      size -= bytes_written;
+      src_va += bytes_written;
+      dst_va += bytes_written;
    }
-
-   radeon_check_space(device->ws, cs->b, ncopy * 7);
-
-   radeon_begin(cs);
-
-   for (unsigned i = 0; i < ncopy; i++) {
-      unsigned csize = size >= 4 ? MIN2(size & align, max_size_per_packet) : size;
-      radeon_emit(SDMA_PACKET(SDMA_OPCODE_COPY, SDMA_COPY_SUB_OPCODE_LINEAR, 0));
-      radeon_emit(ver >= SDMA_4_0 ? csize - 1 : csize);
-      radeon_emit(0); /* src/dst endian swap */
-      radeon_emit(src_va);
-      radeon_emit(src_va >> 32);
-      radeon_emit(dst_va);
-      radeon_emit(dst_va >> 32);
-      dst_va += csize;
-      src_va += csize;
-      size -= csize;
-   }
-
-   radeon_end();
 }
 
 void
diff --git a/src/gallium/drivers/radeonsi/si_sdma_copy_image.c b/src/gallium/drivers/radeonsi/si_sdma_copy_image.c
index ef1f5c25709..8733b2008e5 100644
--- a/src/gallium/drivers/radeonsi/si_sdma_copy_image.c
+++ b/src/gallium/drivers/radeonsi/si_sdma_copy_image.c
@@ -9,6 +9,7 @@
 #include "sid.h"
 #include "util/u_memory.h"
 #include "ac_formats.h"
+#include "ac_cmdbuf_sdma.h"
 
 static
 bool si_prepare_for_sdma_copy(struct si_context *sctx, struct si_texture *dst,struct si_texture *src)
@@ -56,7 +57,6 @@ static bool si_sdma_v4_v5_copy_texture(struct si_context *sctx, struct si_textur
                                        struct si_texture *ssrc)
 {
    bool is_v5 = sctx->gfx_level >= GFX10;
-   bool is_v5_2 = sctx->gfx_level >= GFX10_3;
    bool is_v7 = sctx->gfx_level >= GFX12;
    unsigned bpp = sdst->surface.bpe;
    uint64_t dst_address = sdst->buffer.gpu_address + sdst->surface.u.gfx9.surf_offset;
@@ -74,30 +74,20 @@ static bool si_sdma_v4_v5_copy_texture(struct si_context *sctx, struct si_textur
       struct radeon_cmdbuf *cs = sctx->sdma_cs;
 
       uint64_t bytes = (uint64_t)src_pitch * copy_height * bpp;
-      uint32_t chunk_size = 1u << (is_v5_2 ? 30 : 22);
-      uint32_t chunk_count = DIV_ROUND_UP(bytes, chunk_size);
 
       src_address += ssrc->surface.u.gfx9.offset[0];
       dst_address += sdst->surface.u.gfx9.offset[0];
 
-      radeon_begin(cs);
-      for (int i = 0; i < chunk_count; i++) {
-         uint32_t size = MIN2(chunk_size, bytes);
-         radeon_emit(SDMA_PACKET(SDMA_OPCODE_COPY,
-                                         SDMA_COPY_SUB_OPCODE_LINEAR,
-                                         (tmz ? 4 : 0)));
-         radeon_emit(size - 1);
-         radeon_emit(0);
-         radeon_emit(src_address);
-         radeon_emit(src_address >> 32);
-         radeon_emit(dst_address);
-         radeon_emit(dst_address >> 32);
+      while (bytes > 0) {
+         uint64_t bytes_written =
+            ac_emit_sdma_copy_linear(&cs->current, sctx->screen->info.sdma_ip_version,
+                                     src_address, dst_address, bytes, tmz);
 
-         src_address += size;
-         dst_address += size;
-         bytes -= size;
+         bytes -= bytes_written;
+         src_address += bytes_written;
+         dst_address += bytes_written;
       }
-      radeon_end();
+
       return true;
    }
 