ac,radv,radeonsi: add ac_emit_sdma_copy_linear()

RadeonSI apparently wasn't taking the undocumented HW copy-size limitation into account.

Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/38430>
This commit is contained in:
Samuel Pitoiset 2025-11-13 18:26:04 +01:00
parent 191bf7aba6
commit 6413651bcf
4 changed files with 57 additions and 58 deletions

View file

@ -86,3 +86,40 @@ ac_emit_sdma_constant_fill(struct ac_cmdbuf *cs, enum sdma_version sdma_ip_versi
return bytes_written;
}
/**
 * Emit one SDMA linear-copy packet.
 *
 * Copies up to the per-packet HW limit from src_va to dst_va and returns
 * the number of bytes actually covered by this packet; the caller loops
 * until the whole range has been emitted.
 *
 * \param cs              command stream to emit into
 * \param sdma_ip_version SDMA IP version (must be >= SDMA_2_0)
 * \param src_va          source GPU virtual address
 * \param dst_va          destination GPU virtual address
 * \param size            remaining bytes the caller wants copied
 * \param tmz             whether the copy targets TMZ (encrypted) memory
 * \return number of bytes this packet copies
 */
uint64_t
ac_emit_sdma_copy_linear(struct ac_cmdbuf *cs, enum sdma_version sdma_ip_version,
                         uint64_t src_va, uint64_t dst_va, uint64_t size,
                         bool tmz)
{
   assert(sdma_ip_version >= SDMA_2_0);

   /* The per-packet byte limit was raised with SDMA 5.2. */
   const unsigned packet_limit =
      sdma_ip_version >= SDMA_5_2 ? SDMA_V5_2_COPY_MAX_BYTES : SDMA_V2_0_COPY_MAX_BYTES;

   /* SDMA FW automatically enables a faster dword copy mode when
    * source, destination and size are all dword-aligned.
    *
    * When source and destination are dword-aligned but the size isn't,
    * round the size down so this packet takes the fast path and leave the
    * few trailing bytes for the final packet.
    */
   const bool round_down_to_dword =
      !(src_va & 0x3) && !(dst_va & 0x3) && size > 4 && (size & 0x3) != 0;
   const uint32_t size_mask = round_down_to_dword ? ~0x3u : ~0u;

   const uint64_t bytes_written =
      size < 4 ? size : MIN2(size & size_mask, packet_limit);

   ac_cmdbuf_begin(cs);
   ac_cmdbuf_emit(SDMA_PACKET(SDMA_OPCODE_COPY, SDMA_COPY_SUB_OPCODE_LINEAR, (tmz ? 4 : 0)));
   /* SDMA >= 4.0 encodes the byte count as count - 1. */
   ac_cmdbuf_emit(sdma_ip_version >= SDMA_4_0 ? bytes_written - 1 : bytes_written);
   ac_cmdbuf_emit(0); /* src/dst endian swap */
   ac_cmdbuf_emit(src_va);
   ac_cmdbuf_emit(src_va >> 32);
   ac_cmdbuf_emit(dst_va);
   ac_cmdbuf_emit(dst_va >> 32);
   ac_cmdbuf_end();

   return bytes_written;
}

View file

@ -28,6 +28,11 @@ uint64_t
ac_emit_sdma_constant_fill(struct ac_cmdbuf *cs, enum sdma_version sdma_ip_version,
uint64_t va, uint64_t size, uint32_t value);
uint64_t
ac_emit_sdma_copy_linear(struct ac_cmdbuf *cs, enum sdma_version sdma_ip_version,
uint64_t src_va, uint64_t dst_va, uint64_t size,
bool tmz);
#ifdef __cplusplus
}
#endif

View file

@ -358,49 +358,16 @@ void
/* Copy `size` bytes from src_va to dst_va on the SDMA queue.
 *
 * NOTE(review): this span is a unified diff rendered without +/- markers —
 * it interleaves the pre-refactor open-coded packet loop (ncopy / align /
 * csize) with the new ac_emit_sdma_copy_linear()-based loop (bytes_written).
 * The lines are preserved verbatim; do not read them as one coherent body.
 */
radv_sdma_copy_memory(const struct radv_device *device, struct radv_cmd_stream *cs, uint64_t src_va, uint64_t dst_va,
uint64_t size)
{
/* Nothing to do for an empty copy. */
if (size == 0)
return;
const struct radv_physical_device *pdev = radv_device_physical(device);
const enum sdma_version ver = pdev->info.sdma_ip_version;
/* OLD: per-packet limit and dword-align mask, now computed inside the helper. */
const unsigned max_size_per_packet = ver >= SDMA_5_2 ? SDMA_V5_2_COPY_MAX_BYTES : SDMA_V2_0_COPY_MAX_BYTES;
unsigned align = ~0u;
unsigned ncopy = DIV_ROUND_UP(size, max_size_per_packet);
/* NEW: loop, emitting one helper packet per iteration (7 dwords each). */
while (size > 0) {
radeon_check_space(device->ws, cs->b, 7);
uint64_t bytes_written = ac_emit_sdma_copy_linear(cs->b, pdev->info.sdma_ip_version, src_va, dst_va, size, false);
assert(ver >= SDMA_2_0);
/* SDMA FW automatically enables a faster dword copy mode when
* source, destination and size are all dword-aligned.
*
* When source and destination are dword-aligned, round down the size to
* take advantage of faster dword copy, and copy the remaining few bytes
* with the last copy packet.
*/
if ((src_va & 0x3) == 0 && (dst_va & 0x3) == 0 && size > 4 && (size & 0x3) != 0) {
align = ~0x3u;
ncopy++;
/* NEW: advance by what the helper actually emitted. */
size -= bytes_written;
src_va += bytes_written;
dst_va += bytes_written;
}
/* OLD: open-coded packet loop replaced by the helper above. */
radeon_check_space(device->ws, cs->b, ncopy * 7);
radeon_begin(cs);
for (unsigned i = 0; i < ncopy; i++) {
unsigned csize = size >= 4 ? MIN2(size & align, max_size_per_packet) : size;
radeon_emit(SDMA_PACKET(SDMA_OPCODE_COPY, SDMA_COPY_SUB_OPCODE_LINEAR, 0));
radeon_emit(ver >= SDMA_4_0 ? csize - 1 : csize);
radeon_emit(0); /* src/dst endian swap */
radeon_emit(src_va);
radeon_emit(src_va >> 32);
radeon_emit(dst_va);
radeon_emit(dst_va >> 32);
dst_va += csize;
src_va += csize;
size -= csize;
}
radeon_end();
}
void

View file

@ -9,6 +9,7 @@
#include "sid.h"
#include "util/u_memory.h"
#include "ac_formats.h"
#include "ac_cmdbuf_sdma.h"
static
bool si_prepare_for_sdma_copy(struct si_context *sctx, struct si_texture *dst,struct si_texture *src)
@ -56,7 +57,6 @@ static bool si_sdma_v4_v5_copy_texture(struct si_context *sctx, struct si_textur
/* NOTE(review): continuation of si_sdma_v4_v5_copy_texture() whose first
 * line sits in the hunk header above; like the radv hunk, this is a diff
 * rendered without +/- markers, interleaving the old chunked for-loop
 * (chunk_count / size) with the new ac_emit_sdma_copy_linear() while-loop.
 * Lines preserved verbatim — not a coherent single body.
 */
struct si_texture *ssrc)
{
bool is_v5 = sctx->gfx_level >= GFX10;
/* OLD: is_v5_2 was only needed for the open-coded chunk_size below. */
bool is_v5_2 = sctx->gfx_level >= GFX10_3;
bool is_v7 = sctx->gfx_level >= GFX12;
unsigned bpp = sdst->surface.bpe;
uint64_t dst_address = sdst->buffer.gpu_address + sdst->surface.u.gfx9.surf_offset;
@ -74,30 +74,20 @@ static bool si_sdma_v4_v5_copy_texture(struct si_context *sctx, struct si_textur
struct radeon_cmdbuf *cs = sctx->sdma_cs;
/* Total linear byte count for the surface being copied. */
uint64_t bytes = (uint64_t)src_pitch * copy_height * bpp;
/* OLD: manual per-chunk limit (1 GiB on 5.2+, 4 MiB before). */
uint32_t chunk_size = 1u << (is_v5_2 ? 30 : 22);
uint32_t chunk_count = DIV_ROUND_UP(bytes, chunk_size);
src_address += ssrc->surface.u.gfx9.offset[0];
dst_address += sdst->surface.u.gfx9.offset[0];
/* OLD: open-coded chunked packet loop. */
radeon_begin(cs);
for (int i = 0; i < chunk_count; i++) {
uint32_t size = MIN2(chunk_size, bytes);
radeon_emit(SDMA_PACKET(SDMA_OPCODE_COPY,
SDMA_COPY_SUB_OPCODE_LINEAR,
(tmz ? 4 : 0)));
radeon_emit(size - 1);
radeon_emit(0);
radeon_emit(src_address);
radeon_emit(src_address >> 32);
radeon_emit(dst_address);
radeon_emit(dst_address >> 32);
/* NEW: loop on the shared helper instead. */
while (bytes > 0) {
uint64_t bytes_written =
ac_emit_sdma_copy_linear(&cs->current, sctx->screen->info.sdma_ip_version,
src_address, dst_address, bytes, tmz);
src_address += size;
dst_address += size;
bytes -= size;
bytes -= bytes_written;
src_address += bytes_written;
dst_address += bytes_written;
}
radeon_end();
return true;
}