asahi: Use GPU for buffer copies in resource_copy_region()

Use a compute shader to copy PIPE_BUFFERs. Based on hk's hk_cmd_copy().
For large copy sizes (>= 128MB) it achieves 3/4 of the available memory
bandwidth on an M1 Ultra (G13D). `gpu-ratemeter gl.bufbw` reports
~625 GB/s for a 256MB buffer size. Apple specifies the memory bandwidth of
the M1 Ultra as 819.2 GB/s.

Signed-off-by: Janne Grunau <j@jannau.net>
(cherry picked from commit 3f5497ded8)

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/40092>
This commit is contained in:
Janne Grunau 2026-01-16 11:53:16 +01:00 committed by Eric Engestrom
parent 0f21dc1bd4
commit 37a269e303
2 changed files with 34 additions and 1 deletions

View file

@ -2924,7 +2924,7 @@
"description": "asahi: Use GPU for buffer copies in resource_copy_region()",
"nominated": false,
"nomination_type": 0,
"resolution": 4,
"resolution": 1,
"main_sha": null,
"because_sha": null,
"notes": null

View file

@ -613,6 +613,39 @@ agx_resource_copy_region(struct pipe_context *pctx, struct pipe_resource *dst,
unsigned dstz, struct pipe_resource *src,
unsigned src_level, const struct pipe_box *src_box)
{
if (dst->target == PIPE_BUFFER && src->target == PIPE_BUFFER) {
struct agx_batch *batch = agx_get_compute_batch(agx_context(pctx));
agx_batch_init_state(batch);
assert(dst->format == src->format);
unsigned bs = util_format_get_blocksize(dst->format);
unsigned size = bs * src_box->width;
uint64_t dst_addr = agx_map_gpu(agx_resource(dst)) + dstx * bs;
uint64_t src_addr = agx_map_gpu(agx_resource(src)) + src_box->x * bs;
agx_batch_reads(batch, agx_resource(src));
agx_batch_writes_range(batch, agx_resource(dst), dst_addr, size);
/* Use vectorized copies for as much of the buffer as possible. This requires
* that dst, src, and size are all properly aligned. Failing to check for
* alignment on the buffers causes subtle and hard-to-debug issues!
*/
if (size >= 16 && (dst_addr & 0xf) == 0 && (src_addr & 0xf) == 0) {
unsigned uint4s = size / 16;
unsigned bytes = uint4s * 16;
libagx_copy_uint4(batch, agx_1d(uint4s), AGX_BARRIER_ALL, dst_addr, src_addr);
dst_addr += bytes;
src_addr += bytes;
size -= bytes;
}
if (size) {
libagx_copy_uchar(batch, agx_1d(size), AGX_BARRIER_ALL, dst_addr, src_addr);
}
return;
}
if (try_copy_via_blit(pctx, dst, dst_level, dstx, dsty, dstz, src, src_level,
src_box))
return;