From 37a269e3031088d736ec436ee810d8d1f67cde50 Mon Sep 17 00:00:00 2001 From: Janne Grunau Date: Fri, 16 Jan 2026 11:53:16 +0100 Subject: [PATCH] asahi: Use GPU for buffer copies in resource_copy_region() Use a compute shader to copy PIPE_BUFFERs. Based on hk's hk_cmd_copy(). For large copy sizes (>= 128MB) it achieves 3/4 of the available memory bandwidth on an M1 Ultra (G13D). `gpu-ratemeter gl.bufbw` reports ~625 GB/s for a 256MB buffer size. Apple specifies the memory bandwidth of the M1 Ultra as 819.2 GB/s. Signed-off-by: Janne Grunau (cherry picked from commit 3f5497ded8055e0c241f4a87e2f10907be87e02c) Part-of: --- .pick_status.json | 2 +- src/gallium/drivers/asahi/agx_blit.c | 33 ++++++++++++++++++++++++++++ 2 files changed, 34 insertions(+), 1 deletion(-) diff --git a/.pick_status.json b/.pick_status.json index f16e76023df..5e0f6166fc8 100644 --- a/.pick_status.json +++ b/.pick_status.json @@ -2924,7 +2924,7 @@ "description": "asahi: Use GPU for buffer copies in resource_copy_region()", "nominated": false, "nomination_type": 0, - "resolution": 4, + "resolution": 1, "main_sha": null, "because_sha": null, "notes": null diff --git a/src/gallium/drivers/asahi/agx_blit.c b/src/gallium/drivers/asahi/agx_blit.c index bb0aee154bf..12901f71241 100644 --- a/src/gallium/drivers/asahi/agx_blit.c +++ b/src/gallium/drivers/asahi/agx_blit.c @@ -613,6 +613,39 @@ agx_resource_copy_region(struct pipe_context *pctx, struct pipe_resource *dst, unsigned dstz, struct pipe_resource *src, unsigned src_level, const struct pipe_box *src_box) { + if (dst->target == PIPE_BUFFER && src->target == PIPE_BUFFER) { + struct agx_batch *batch = agx_get_compute_batch(agx_context(pctx)); + agx_batch_init_state(batch); + assert(dst->format == src->format); + unsigned bs = util_format_get_blocksize(dst->format); + unsigned size = bs * src_box->width; + uint64_t dst_addr = agx_map_gpu(agx_resource(dst)) + dstx * bs; + uint64_t src_addr = agx_map_gpu(agx_resource(src)) + src_box->x * bs; + 
agx_batch_reads(batch, agx_resource(src)); + agx_batch_writes_range(batch, agx_resource(dst), dst_addr, size); + /* Use vectorized copies for as much of the buffer as possible. This requires + * that dst, src, and size are all properly aligned. Failing to check for + * alignment on the buffers causes subtle and hard-to-debug issues! + */ + if (size >= 16 && (dst_addr & 0xf) == 0 && (src_addr & 0xf) == 0) { + unsigned uint4s = size / 16; + unsigned bytes = uint4s * 16; + + libagx_copy_uint4(batch, agx_1d(uint4s), AGX_BARRIER_ALL, dst_addr, src_addr); + + dst_addr += bytes; + src_addr += bytes; + size -= bytes; + } + + if (size) { + libagx_copy_uchar(batch, agx_1d(size), AGX_BARRIER_ALL, dst_addr, src_addr); + } + + return; + } + if (try_copy_via_blit(pctx, dst, dst_level, dstx, dsty, dstz, src, src_level, src_box)) return;