From 0f6a06bbbabdb846f86fa892e2c13b864252fb01 Mon Sep 17 00:00:00 2001 From: Olivia Lee Date: Sat, 28 Jun 2025 18:55:34 -0700 Subject: [PATCH] pan/shared: add function to copy between two tiled images This is needed for VK_EXT_host_image_copy. Most other mesa drivers use a similar approach to implement tiled->tiled copy, with a few differences. They use a temp buffer sized for only one tile, don't attempt to tile-align the copies in either the src or dest, and they don't have the memcpy fast path. I measured performance of a variety of implementations on a rock5b, and found: - The fast path for when the copy region is tile-aligned is a 167% improvement. - Aligning the temp buffer chunks to src tiles is a 20% improvement. - Using a 64k buffer instead of a tile-sized buffer is a 14% improvement. This buffer size appears optimal in my benchmark, smaller and larger buffers are both slower. Skipping the chunk approach and just (de)tiling to a temp buffer that fits the whole image (what NVK does) is also slower. - I had no luck with attempts at a direct tiled->tiled copy algorithm that didn't need a temp buffer. The fastest I got was ~1/4 the speed of the temp buffer implementation. Signed-off-by: Olivia Lee Reviewed-by: Erik Faye-Lund Part-of: --- src/panfrost/shared/pan_tiling.c | 95 ++++++++++++++++++++++++++++++++ src/panfrost/shared/pan_tiling.h | 20 +++++++ 2 files changed, 115 insertions(+) diff --git a/src/panfrost/shared/pan_tiling.c b/src/panfrost/shared/pan_tiling.c index f64f858bfb8..d96d2892484 100644 --- a/src/panfrost/shared/pan_tiling.c +++ b/src/panfrost/shared/pan_tiling.c @@ -26,9 +26,11 @@ */ #include "pan_tiling.h" +#include <math.h> #include #include "util/bitscan.h" #include "util/macros.h" +#include "util/ralloc.h" /* * This file implements software encode/decode of u-interleaved textures.
@@ -400,3 +402,111 @@ pan_load_tiled_image(void *dst, const void *src, unsigned x, unsigned y,
    pan_access_tiled_image((void *)src, dst, x, y, w, h, src_stride, dst_stride,
                           format, false);
 }
+
+/*
+ * Copy a w x h pixel rectangle between two tiled images of the same format.
+ * Coordinates and sizes are in pixels; strides are the number of bytes
+ * between adjacent rows of tiles. Fully tile-aligned copies memcpy whole
+ * tiles; anything else is detiled into a linear scratch buffer one chunk at
+ * a time and tiled back into dst.
+ */
+void
+pan_copy_tiled_image(void *dst, const void *src, unsigned dst_x, unsigned dst_y,
+                     unsigned src_x, unsigned src_y, unsigned w, unsigned h,
+                     uint32_t dst_stride, uint32_t src_stride,
+                     enum pipe_format format)
+{
+   const struct util_format_description *desc = util_format_description(format);
+   unsigned block_size_B = desc->block.bits / 8;
+
+   /* An empty region needs neither a copy nor a scratch allocation */
+   if (w == 0 || h == 0)
+      return;
+
+   /* If both the src and dst region are tile-aligned, we can just memcpy
+    * whole tiles without any (de)tiling */
+   if (src_x % TILE_WIDTH == 0 && src_y % TILE_HEIGHT == 0 &&
+       dst_x % TILE_WIDTH == 0 && dst_y % TILE_HEIGHT == 0 &&
+       w % TILE_WIDTH == 0 && h % TILE_HEIGHT == 0) {
+
+      /* Use byte pointers (void * arithmetic is a GNU extension) and size_t
+       * offsets so that large images cannot wrap 32-bit arithmetic */
+      uint8_t *dst_B = dst;
+      const uint8_t *src_B = src;
+      size_t tile_size_B = (size_t)block_size_B * PIXELS_PER_TILE;
+
+      unsigned w_t = w / TILE_WIDTH;
+      unsigned h_t = h / TILE_HEIGHT;
+      unsigned src_x_t = src_x / TILE_WIDTH;
+      unsigned src_y_t = src_y / TILE_HEIGHT;
+      unsigned dst_x_t = dst_x / TILE_WIDTH;
+      unsigned dst_y_t = dst_y / TILE_HEIGHT;
+
+      for (unsigned y_t = 0; y_t < h_t; y_t++) {
+         uint8_t *dst_tile_row = dst_B +
+            (size_t)(y_t + dst_y_t) * dst_stride +
+            dst_x_t * tile_size_B;
+         const uint8_t *src_tile_row = src_B +
+            (size_t)(y_t + src_y_t) * src_stride +
+            src_x_t * tile_size_B;
+         memcpy(dst_tile_row, src_tile_row, tile_size_B * w_t);
+      }
+
+      return;
+   }
+
+   /* Otherwise, we copy by working across the copy region in 64KiB chunks.
+    * For each chunk, we detile part of the src into a linear temporary
+    * buffer, then tile to the dst */
+
+   /* This could fit on the stack easily on glibc, but it's dicier on musl,
+    * which has a 128KiB stack size */
+   const size_t chunk_size_B = 65536;
+   void *chunk = ralloc_size(NULL, chunk_size_B);
+
+   /* Choose pixel dimensions of the chunk. These should be tile aligned,
+    * maximize used space in the buffer, and be close to a square. */
+   unsigned chunk_size_bl = chunk_size_B / block_size_B;
+   unsigned chunk_width_bl = (unsigned) sqrtf((float) (chunk_size_bl));
+   chunk_width_bl = (chunk_width_bl / TILE_WIDTH) * TILE_WIDTH;
+   unsigned chunk_height_bl = chunk_size_bl / chunk_width_bl;
+   chunk_height_bl = (chunk_height_bl / TILE_HEIGHT) * TILE_HEIGHT;
+
+   unsigned chunk_width_px = chunk_width_bl * desc->block.width;
+   unsigned chunk_height_px = chunk_height_bl * desc->block.height;
+
+   unsigned chunk_row_stride_B = chunk_width_bl * block_size_B;
+
+   /* Align chunk copy regions to src tiles, to optimize detiling. We can't
+    * get tile alignment on both src and dst, but one is better than nothing. */
+   unsigned src_first_tile_x = (src_x / TILE_WIDTH) * TILE_WIDTH;
+   unsigned src_first_tile_y = (src_y / TILE_HEIGHT) * TILE_HEIGHT;
+
+   for (unsigned x = src_first_tile_x; x < src_x + w; x += chunk_width_px) {
+      for (unsigned y = src_first_tile_y; y < src_y + h; y += chunk_height_px) {
+         /* x/y are tile-aligned, but because the actual copy region is not,
+          * we may need to start at an offset position on the left/top edges */
+         unsigned src_chunk_x = MAX2(src_x, x);
+         unsigned src_chunk_y = MAX2(src_y, y);
+         unsigned dst_chunk_x = dst_x + (src_chunk_x - src_x);
+         unsigned dst_chunk_y = dst_y + (src_chunk_y - src_y);
+
+         /* Each chunk ends where the next aligned x/y begins (not one chunk
+          * past the clamped start), so chunks never overlap. Right/bottom
+          * edges of the copy region may not need a whole chunk */
+         unsigned src_chunk_right = MIN2(x + chunk_width_px, src_x + w);
+         unsigned src_chunk_bottom = MIN2(y + chunk_height_px, src_y + h);
+         unsigned width = src_chunk_right - src_chunk_x;
+         unsigned height = src_chunk_bottom - src_chunk_y;
+
+         pan_load_tiled_image(
+            chunk, src, src_chunk_x, src_chunk_y, width, height,
+            chunk_row_stride_B, src_stride, format);
+         pan_store_tiled_image(
+            dst, chunk, dst_chunk_x, dst_chunk_y, width, height, dst_stride,
+            chunk_row_stride_B, format);
+      }
+   }
+
+   ralloc_free(chunk);
+}
diff --git a/src/panfrost/shared/pan_tiling.h b/src/panfrost/shared/pan_tiling.h
index
2432cfedd6d..d7f66019b19 100644 --- a/src/panfrost/shared/pan_tiling.h +++ b/src/panfrost/shared/pan_tiling.h @@ -68,6 +68,26 @@ void pan_store_tiled_image(void *dst, const void *src, unsigned x, unsigned y, unsigned w, unsigned h, uint32_t dst_stride, uint32_t src_stride, enum pipe_format format); +/** + * Copy a rectangular region from one tiled image to another. + * + * @dst Tiled destination + * @src Tiled source + * @dst_x Region of interest of destination in pixels, aligned to block size + * @dst_y Region of interest of destination in pixels, aligned to block size + * @src_x Region of interest of source in pixels, aligned to block size + * @src_y Region of interest of source in pixels, aligned to block size + * @w Size of region of interest in pixels, aligned to block size + * @h Size of region of interest in pixels, aligned to block size + * @dst_stride Number of bytes between adjacent rows of tiles in destination. + * @src_stride Number of bytes between adjacent rows of tiles in source. + * @format Format of the source and destination image + */ +void pan_copy_tiled_image(void *dst, const void *src, unsigned dst_x, + unsigned dst_y, unsigned src_x, unsigned src_y, + unsigned w, unsigned h, uint32_t dst_stride, + uint32_t src_stride, enum pipe_format format); + #ifdef __cplusplus } /* extern C */ #endif