From 0f6a06bbbabdb846f86fa892e2c13b864252fb01 Mon Sep 17 00:00:00 2001
From: Olivia Lee <olivia.lee@collabora.com>
Date: Sat, 28 Jun 2025 18:55:34 -0700
Subject: [PATCH] pan/shared: add function to copy between two tiled images

This is needed for VK_EXT_host_image_copy.

Most other mesa drivers use a similar approach to implement tiled->tiled
copy, with a few differences. They use a temp buffer sized for only one
tile, don't attempt to tile-align the copies in either the src or dest,
and they don't have the memcpy fast path. I measured performance of a
variety of implementations on a rock5b, and found:

 - The fast path for when the copy region is tile-aligned is a 167%
   improvement.
 - Aligning the temp buffer chunks to src tiles is a 20% improvement.
 - Using a 64k buffer instead of a tile-sized buffer is a 14%
   improvement. This buffer size appears optimal in my benchmark;
   smaller and larger buffers are both slower. Skipping the chunk
   approach and just (de)tiling to a temp buffer that fits the whole
   image (what NVK does) is also slower.
 - I had no luck with attempts at a direct tiled->tiled copy algorithm
   that didn't need a temp buffer. The fastest I got was ~1/4 the speed
   of the temp buffer implementation.

Signed-off-by: Olivia Lee <olivia.lee@collabora.com>
Reviewed-by: Erik Faye-Lund <erik.faye-lund@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/35910>
---
 src/panfrost/shared/pan_tiling.c | 95 ++++++++++++++++++++++++++++++++
 src/panfrost/shared/pan_tiling.h | 20 +++++++
 2 files changed, 115 insertions(+)

diff --git a/src/panfrost/shared/pan_tiling.c b/src/panfrost/shared/pan_tiling.c
index f64f858bfb8..d96d2892484 100644
--- a/src/panfrost/shared/pan_tiling.c
+++ b/src/panfrost/shared/pan_tiling.c
@@ -26,9 +26,11 @@
  */
 
 #include "pan_tiling.h"
+#include <math.h>
 #include <stdbool.h>
 #include "util/bitscan.h"
 #include "util/macros.h"
+#include "util/ralloc.h"
 
 /*
  * This file implements software encode/decode of u-interleaved textures.
@@ -400,3 +402,96 @@ pan_load_tiled_image(void *dst, const void *src, unsigned x, unsigned y,
    pan_access_tiled_image((void *)src, dst, x, y, w, h, src_stride, dst_stride,
                           format, false);
 }
+
+void
+pan_copy_tiled_image(void *dst, const void *src, unsigned dst_x, unsigned dst_y,
+                     unsigned src_x, unsigned src_y, unsigned w, unsigned h,
+                     uint32_t dst_stride, uint32_t src_stride,
+                     enum pipe_format format)
+{
+   const struct util_format_description *desc = util_format_description(format);
+   unsigned block_size_B = desc->block.bits / 8;
+
+   /* An empty region needs no work, and no staging allocation either */
+   if (w == 0 || h == 0)
+      return;
+
+   /* If both the src and dst region are tile-aligned, we can just memcpy
+    * whole tiles without any (de)tiling */
+   if (src_x % TILE_WIDTH == 0 && src_y % TILE_HEIGHT == 0 &&
+       dst_x % TILE_WIDTH == 0 && dst_y % TILE_HEIGHT == 0 &&
+       w % TILE_WIDTH == 0 && h % TILE_HEIGHT == 0) {
+      /* NOTE(review): this assumes PIXELS_PER_TILE blocks in every tile;
+       * confirm that also holds for block-compressed formats */
+      size_t tile_size_B = (size_t)block_size_B * PIXELS_PER_TILE;
+      size_t row_size_B = tile_size_B * (w / TILE_WIDTH);
+      unsigned h_t = h / TILE_HEIGHT;
+      unsigned src_y_t = src_y / TILE_HEIGHT;
+      unsigned dst_y_t = dst_y / TILE_HEIGHT;
+
+      /* Strides are per row of tiles. Offsets are computed in size_t and
+       * applied to byte pointers: void * arithmetic is a GNU extension */
+      for (unsigned y_t = 0; y_t < h_t; y_t++) {
+         size_t dst_offset_B = (size_t)(y_t + dst_y_t) * dst_stride +
+                               (dst_x / TILE_WIDTH) * tile_size_B;
+         size_t src_offset_B = (size_t)(y_t + src_y_t) * src_stride +
+                               (src_x / TILE_WIDTH) * tile_size_B;
+         memcpy((uint8_t *)dst + dst_offset_B,
+                (const uint8_t *)src + src_offset_B, row_size_B);
+      }
+
+      return;
+   }
+
+   /* Otherwise, we copy by working across the copy region in 64KiB chunks.
+    * For each chunk, we detile part of the src into a linear temporary
+    * buffer, then tile to the dst. The buffer could fit on the stack easily
+    * on glibc, but it's dicier on musl, which has a 128KiB stack size, so
+    * it lives on the heap. */
+   const size_t chunk_size_B = 65536;
+   void *chunk = ralloc_size(NULL, chunk_size_B);
+
+   /* Choose pixel dimensions of the chunk. These should be tile aligned,
+    * maximize used space in the buffer, and be close to a square. */
+   unsigned chunk_size_bl = chunk_size_B / block_size_B;
+   unsigned chunk_width_bl = (unsigned) sqrtf((float) (chunk_size_bl));
+   chunk_width_bl = (chunk_width_bl / TILE_WIDTH) * TILE_WIDTH;
+   unsigned chunk_height_bl = chunk_size_bl / chunk_width_bl;
+   chunk_height_bl = (chunk_height_bl / TILE_HEIGHT) * TILE_HEIGHT;
+
+   unsigned chunk_width_px = chunk_width_bl * desc->block.width;
+   unsigned chunk_height_px = chunk_height_bl * desc->block.height;
+   unsigned chunk_row_stride_B = chunk_width_bl * block_size_B;
+
+   /* Align chunk copy regions to src tiles, to optimize detiling. We can't
+    * get tile alignment on both src and dst, but one is better than nothing. */
+   unsigned src_first_tile_x = (src_x / TILE_WIDTH) * TILE_WIDTH;
+   unsigned src_first_tile_y = (src_y / TILE_HEIGHT) * TILE_HEIGHT;
+
+   for (unsigned x = src_first_tile_x; x < src_x + w; x += chunk_width_px) {
+      for (unsigned y = src_first_tile_y; y < src_y + h; y += chunk_height_px) {
+         /* x/y are tile-aligned, but because the actual copy region is not,
+          * we may need to start at an offset position on the left/top edges */
+         unsigned src_chunk_x = MAX2(src_x, x);
+         unsigned src_chunk_y = MAX2(src_y, y);
+         unsigned dst_chunk_x = dst_x + (src_chunk_x - src_x);
+         unsigned dst_chunk_y = dst_y + (src_chunk_y - src_y);
+
+         /* Right/bottom edges are measured from the tile-aligned x/y, so
+          * chunks never overlap and interior edges stay on src tiles */
+         unsigned src_chunk_right = MIN2(x + chunk_width_px, src_x + w);
+         unsigned src_chunk_bottom = MIN2(y + chunk_height_px, src_y + h);
+         unsigned width = src_chunk_right - src_chunk_x;
+         unsigned height = src_chunk_bottom - src_chunk_y;
+
+         pan_load_tiled_image(
+            chunk, src, src_chunk_x, src_chunk_y, width, height,
+            chunk_row_stride_B, src_stride, format);
+         pan_store_tiled_image(
+            dst, chunk, dst_chunk_x, dst_chunk_y, width, height, dst_stride,
+            chunk_row_stride_B, format);
+      }
+   }
+
+   ralloc_free(chunk);
+}
diff --git a/src/panfrost/shared/pan_tiling.h b/src/panfrost/shared/pan_tiling.h
index 2432cfedd6d..d7f66019b19 100644
--- a/src/panfrost/shared/pan_tiling.h
+++ b/src/panfrost/shared/pan_tiling.h
@@ -68,6 +68,26 @@ void pan_store_tiled_image(void *dst, const void *src, unsigned x, unsigned y,
                            unsigned w, unsigned h, uint32_t dst_stride,
                            uint32_t src_stride, enum pipe_format format);
 
+/**
+ * Copy a rectangular region between two tiled images of the same format.
+ *
+ * @dst Tiled destination, must not overlap the source region (memcpy is used)
+ * @src Tiled source
+ * @dst_x X offset of the destination region in pixels, aligned to block size
+ * @dst_y Y offset of the destination region in pixels, aligned to block size
+ * @src_x X offset of the source region in pixels, aligned to block size
+ * @src_y Y offset of the source region in pixels, aligned to block size
+ * @w Width of the copied region in pixels, aligned to block size
+ * @h Height of the copied region in pixels, aligned to block size
+ * @dst_stride Number of bytes between adjacent rows of tiles in destination.
+ * @src_stride Number of bytes between adjacent rows of tiles in source.
+ * @format Format shared by the source and destination images
+ */
+void pan_copy_tiled_image(void *dst, const void *src, unsigned dst_x,
+                          unsigned dst_y, unsigned src_x, unsigned src_y,
+                          unsigned w, unsigned h, uint32_t dst_stride,
+                          uint32_t src_stride, enum pipe_format format);
+
 #ifdef __cplusplus
 } /* extern C */
 #endif