ac,radv,radeonsi: add ac_emit_sdma_copy_linear()

RadeonSI apparently wasn't taking the undocumented HW copy-size limitation into account.

Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/38430>
This commit is contained in:
Samuel Pitoiset 2025-11-13 18:26:04 +01:00
parent 191bf7aba6
commit 6413651bcf
4 changed files with 57 additions and 58 deletions

View file

@ -86,3 +86,40 @@ ac_emit_sdma_constant_fill(struct ac_cmdbuf *cs, enum sdma_version sdma_ip_versi
return bytes_written;
}
/**
 * Emit one SDMA linear-copy packet.
 *
 * Copies up to the per-packet HW limit from src_va to dst_va and returns
 * the number of bytes actually covered by this packet; the caller loops
 * until the whole range has been emitted.
 *
 * \param cs              command stream to emit into
 * \param sdma_ip_version SDMA IP version (must be >= SDMA_2_0)
 * \param src_va          source GPU virtual address
 * \param dst_va          destination GPU virtual address
 * \param size            remaining bytes the caller wants copied
 * \param tmz             whether the copy targets TMZ (encrypted) memory
 * \return number of bytes this packet copies
 */
uint64_t
ac_emit_sdma_copy_linear(struct ac_cmdbuf *cs, enum sdma_version sdma_ip_version,
                         uint64_t src_va, uint64_t dst_va, uint64_t size,
                         bool tmz)
{
   assert(sdma_ip_version >= SDMA_2_0);

   /* The per-packet byte limit was raised with SDMA 5.2. */
   const unsigned packet_limit =
      sdma_ip_version >= SDMA_5_2 ? SDMA_V5_2_COPY_MAX_BYTES : SDMA_V2_0_COPY_MAX_BYTES;

   /* SDMA FW automatically enables a faster dword copy mode when
    * source, destination and size are all dword-aligned.
    *
    * When source and destination are dword-aligned but the size isn't,
    * round the size down so this packet takes the fast path and leave the
    * few trailing bytes for the final packet.
    */
   const bool round_down_to_dword =
      !(src_va & 0x3) && !(dst_va & 0x3) && size > 4 && (size & 0x3) != 0;
   const uint32_t size_mask = round_down_to_dword ? ~0x3u : ~0u;

   const uint64_t bytes_written =
      size < 4 ? size : MIN2(size & size_mask, packet_limit);

   ac_cmdbuf_begin(cs);
   ac_cmdbuf_emit(SDMA_PACKET(SDMA_OPCODE_COPY, SDMA_COPY_SUB_OPCODE_LINEAR, (tmz ? 4 : 0)));
   /* SDMA >= 4.0 encodes the byte count as count - 1. */
   ac_cmdbuf_emit(sdma_ip_version >= SDMA_4_0 ? bytes_written - 1 : bytes_written);
   ac_cmdbuf_emit(0); /* src/dst endian swap */
   ac_cmdbuf_emit(src_va);
   ac_cmdbuf_emit(src_va >> 32);
   ac_cmdbuf_emit(dst_va);
   ac_cmdbuf_emit(dst_va >> 32);
   ac_cmdbuf_end();

   return bytes_written;
}

View file

@ -28,6 +28,11 @@ uint64_t
ac_emit_sdma_constant_fill(struct ac_cmdbuf *cs, enum sdma_version sdma_ip_version,
uint64_t va, uint64_t size, uint32_t value);
uint64_t
ac_emit_sdma_copy_linear(struct ac_cmdbuf *cs, enum sdma_version sdma_ip_version,
uint64_t src_va, uint64_t dst_va, uint64_t size,
bool tmz);
#ifdef __cplusplus
}
#endif

View file

@ -358,49 +358,16 @@ void
/* Copy `size` bytes from src_va to dst_va on the SDMA queue.
 *
 * NOTE(review): this span is a unified diff rendered without +/- markers —
 * it interleaves the pre-refactor open-coded packet loop (ncopy / align /
 * csize) with the new ac_emit_sdma_copy_linear()-based loop (bytes_written).
 * The lines are preserved verbatim; do not read them as one coherent body.
 */
radv_sdma_copy_memory(const struct radv_device *device, struct radv_cmd_stream *cs, uint64_t src_va, uint64_t dst_va,
uint64_t size)
{
/* Nothing to do for an empty copy. */
if (size == 0)
return;
const struct radv_physical_device *pdev = radv_device_physical(device);
const enum sdma_version ver = pdev->info.sdma_ip_version;
/* OLD: per-packet limit and dword-align mask, now computed inside the helper. */
const unsigned max_size_per_packet = ver >= SDMA_5_2 ? SDMA_V5_2_COPY_MAX_BYTES : SDMA_V2_0_COPY_MAX_BYTES;
unsigned align = ~0u;
unsigned ncopy = DIV_ROUND_UP(size, max_size_per_packet);
/* NEW: loop, emitting one helper packet per iteration (7 dwords each). */
while (size > 0) {
radeon_check_space(device->ws, cs->b, 7);
uint64_t bytes_written = ac_emit_sdma_copy_linear(cs->b, pdev->info.sdma_ip_version, src_va, dst_va, size, false);
assert(ver >= SDMA_2_0);
/* SDMA FW automatically enables a faster dword copy mode when
* source, destination and size are all dword-aligned.
*
* When source and destination are dword-aligned, round down the size to
* take advantage of faster dword copy, and copy the remaining few bytes
* with the last copy packet.
*/
if ((src_va & 0x3) == 0 && (dst_va & 0x3) == 0 && size > 4 && (size & 0x3) != 0) {
align = ~0x3u;
ncopy++;
/* NEW: advance by what the helper actually emitted. */
size -= bytes_written;
src_va += bytes_written;
dst_va += bytes_written;
}
/* OLD: open-coded packet loop replaced by the helper above. */
radeon_check_space(device->ws, cs->b, ncopy * 7);
radeon_begin(cs);
for (unsigned i = 0; i < ncopy; i++) {
unsigned csize = size >= 4 ? MIN2(size & align, max_size_per_packet) : size;
radeon_emit(SDMA_PACKET(SDMA_OPCODE_COPY, SDMA_COPY_SUB_OPCODE_LINEAR, 0));
radeon_emit(ver >= SDMA_4_0 ? csize - 1 : csize);
radeon_emit(0); /* src/dst endian swap */
radeon_emit(src_va);
radeon_emit(src_va >> 32);
radeon_emit(dst_va);
radeon_emit(dst_va >> 32);
dst_va += csize;
src_va += csize;
size -= csize;
}
radeon_end();
}
void

View file

@ -9,6 +9,7 @@
#include "sid.h"
#include "util/u_memory.h"
#include "ac_formats.h"
#include "ac_cmdbuf_sdma.h"
static
bool si_prepare_for_sdma_copy(struct si_context *sctx, struct si_texture *dst,struct si_texture *src)
@ -56,7 +57,6 @@ static bool si_sdma_v4_v5_copy_texture(struct si_context *sctx, struct si_textur
/* NOTE(review): continuation of si_sdma_v4_v5_copy_texture() whose first
 * line sits in the hunk header above; like the radv hunk, this is a diff
 * rendered without +/- markers, interleaving the old chunked for-loop
 * (chunk_count / size) with the new ac_emit_sdma_copy_linear() while-loop.
 * Lines preserved verbatim — not a coherent single body.
 */
struct si_texture *ssrc)
{
bool is_v5 = sctx->gfx_level >= GFX10;
/* OLD: is_v5_2 was only needed for the open-coded chunk_size below. */
bool is_v5_2 = sctx->gfx_level >= GFX10_3;
bool is_v7 = sctx->gfx_level >= GFX12;
unsigned bpp = sdst->surface.bpe;
uint64_t dst_address = sdst->buffer.gpu_address + sdst->surface.u.gfx9.surf_offset;
@ -74,30 +74,20 @@ static bool si_sdma_v4_v5_copy_texture(struct si_context *sctx, struct si_textur
struct radeon_cmdbuf *cs = sctx->sdma_cs;
/* Total linear byte count for the surface being copied. */
uint64_t bytes = (uint64_t)src_pitch * copy_height * bpp;
/* OLD: manual per-chunk limit (1 GiB on 5.2+, 4 MiB before). */
uint32_t chunk_size = 1u << (is_v5_2 ? 30 : 22);
uint32_t chunk_count = DIV_ROUND_UP(bytes, chunk_size);
src_address += ssrc->surface.u.gfx9.offset[0];
dst_address += sdst->surface.u.gfx9.offset[0];
/* OLD: open-coded chunked packet loop. */
radeon_begin(cs);
for (int i = 0; i < chunk_count; i++) {
uint32_t size = MIN2(chunk_size, bytes);
radeon_emit(SDMA_PACKET(SDMA_OPCODE_COPY,
SDMA_COPY_SUB_OPCODE_LINEAR,
(tmz ? 4 : 0)));
radeon_emit(size - 1);
radeon_emit(0);
radeon_emit(src_address);
radeon_emit(src_address >> 32);
radeon_emit(dst_address);
radeon_emit(dst_address >> 32);
/* NEW: loop on the shared helper instead. */
while (bytes > 0) {
uint64_t bytes_written =
ac_emit_sdma_copy_linear(&cs->current, sctx->screen->info.sdma_ip_version,
src_address, dst_address, bytes, tmz);
src_address += size;
dst_address += size;
bytes -= size;
bytes -= bytes_written;
src_address += bytes_written;
dst_address += bytes_written;
}
radeon_end();
return true;
}