pan/shared: add function to copy between two tiled images

This is needed for VK_EXT_host_image_copy.

Most other mesa drivers use a similar approach to implement tiled->tiled
copy, with a few differences. They use a temp buffer sized for only one
tile, don't attempt to tile-align the copies in either the src or dest,
and they don't have the memcpy fast path. I measured performance of a
variety of implementations on a rock5b, and found:

 - The fast path for when the copy region is tile-aligned is a 167%
   improvement.
 - Aligning the temp buffer chunks to src tiles is a 20% improvement.
 - Using a 64KiB buffer instead of a tile-sized buffer is a 14%
   improvement. This buffer size appears optimal in my benchmark;
   smaller and larger buffers are both slower. Skipping the chunk
   approach and just (de)tiling to a temp buffer that fits the whole
   image (what NVK does) is also slower.
 - I had no luck with attempts at a direct tiled->tiled copy algorithm
   that didn't need a temp buffer. The fastest I got was ~1/4 the speed
   of the temp buffer implementation.

Signed-off-by: Olivia Lee <olivia.lee@collabora.com>
Reviewed-by: Erik Faye-Lund <erik.faye-lund@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/35910>
This commit is contained in:
Olivia Lee 2025-06-28 18:55:34 -07:00 committed by Marge Bot
parent d3150006be
commit 0f6a06bbba
2 changed files with 115 additions and 0 deletions

View file

@ -26,9 +26,11 @@
*/
#include "pan_tiling.h"
#include <math.h>
#include <stdbool.h>
#include "util/bitscan.h"
#include "util/macros.h"
#include "util/ralloc.h"
/*
* This file implements software encode/decode of u-interleaved textures.
@ -400,3 +402,96 @@ pan_load_tiled_image(void *dst, const void *src, unsigned x, unsigned y,
pan_access_tiled_image((void *)src, dst, x, y, w, h, src_stride, dst_stride,
format, false);
}
/*
 * Copy a w x h rectangle from one tiled image to another image of the same
 * format.
 *
 * dst/src point at the tiled images, (dst_x, dst_y) and (src_x, src_y) are
 * the origins of the rectangle in each image, and dst_stride/src_stride are
 * the byte distances between adjacent rows of tiles.
 *
 * Two strategies are used:
 *  - If both origins and the size are multiples of the tile dimensions, each
 *    row of tiles is copied verbatim with memcpy.
 *  - Otherwise the rectangle is processed in chunks: each chunk is detiled
 *    from src into a linear scratch buffer with pan_load_tiled_image() and
 *    then tiled into dst with pan_store_tiled_image().
 */
void
pan_copy_tiled_image(void *dst, const void *src, unsigned dst_x, unsigned dst_y,
                     unsigned src_x, unsigned src_y, unsigned w, unsigned h,
                     uint32_t dst_stride, uint32_t src_stride,
                     enum pipe_format format)
{
   const struct util_format_description *desc = util_format_description(format);
   unsigned block_size_B = desc->block.bits / 8;

   /* If both the src and dst region are tile-aligned, we can just memcpy
    * whole tiles without any (de)tiling */
   if (src_x % TILE_WIDTH == 0 && src_y % TILE_HEIGHT == 0 &&
       dst_x % TILE_WIDTH == 0 && dst_y % TILE_HEIGHT == 0 &&
       w % TILE_WIDTH == 0 && h % TILE_HEIGHT == 0) {
      /* Byte offsets are computed as size_t so they cannot wrap at 32 bits
       * on 64-bit hosts, and through uint8_t pointers because arithmetic on
       * void pointers is a GNU extension rather than standard C. */
      uint8_t *dst_bytes = dst;
      const uint8_t *src_bytes = src;

      /* NOTE(review): this treats a tile as PIXELS_PER_TILE blocks; confirm
       * that holds for block-compressed formats. */
      size_t tile_size_B = (size_t)block_size_B * PIXELS_PER_TILE;

      unsigned w_t = w / TILE_WIDTH;
      unsigned h_t = h / TILE_HEIGHT;
      unsigned src_x_t = src_x / TILE_WIDTH;
      unsigned src_y_t = src_y / TILE_HEIGHT;
      unsigned dst_x_t = dst_x / TILE_WIDTH;
      unsigned dst_y_t = dst_y / TILE_HEIGHT;

      for (unsigned y_t = 0; y_t < h_t; y_t++) {
         uint8_t *dst_tile_row = dst_bytes +
                                 (size_t)(y_t + dst_y_t) * dst_stride +
                                 dst_x_t * tile_size_B;
         const uint8_t *src_tile_row = src_bytes +
                                       (size_t)(y_t + src_y_t) * src_stride +
                                       src_x_t * tile_size_B;

         memcpy(dst_tile_row, src_tile_row, tile_size_B * w_t);
      }

      return;
   }

   /* Otherwise, we copy by working across the copy region in 64KiB chunks.
    * For each chunk, we detile part of the src into a linear temporary
    * buffer, then tile to the dst */

   /* This could fit on the stack easily on glibc, but it's dicier on musl,
    * which has a 128KiB stack size */
   const size_t chunk_size_B = 65536;
   /* NOTE(review): the allocation result is not checked; this function
    * returns void and has no way to report failure to its caller. */
   void *chunk = ralloc_size(NULL, chunk_size_B);

   /* Choose pixel dimensions of the chunk. These should be tile aligned,
    * maximize used space in the buffer, and be close to a square. */
   unsigned chunk_size_bl = chunk_size_B / block_size_B;
   unsigned chunk_width_bl = (unsigned)sqrtf((float)(chunk_size_bl));
   chunk_width_bl = (chunk_width_bl / TILE_WIDTH) * TILE_WIDTH;
   unsigned chunk_height_bl = chunk_size_bl / chunk_width_bl;
   chunk_height_bl = (chunk_height_bl / TILE_HEIGHT) * TILE_HEIGHT;

   unsigned chunk_width_px = chunk_width_bl * desc->block.width;
   unsigned chunk_height_px = chunk_height_bl * desc->block.height;
   unsigned chunk_row_stride_B = chunk_width_bl * block_size_B;

   /* Align chunk copy regions to src tiles, to optimize detiling. We can't
    * get tile alignment on both src and dst, but one is better than nothing. */
   unsigned src_first_tile_x = (src_x / TILE_WIDTH) * TILE_WIDTH;
   unsigned src_first_tile_y = (src_y / TILE_HEIGHT) * TILE_HEIGHT;

   for (unsigned x = src_first_tile_x; x < src_x + w; x += chunk_width_px) {
      for (unsigned y = src_first_tile_y; y < src_y + h; y += chunk_height_px) {
         /* x/y are tile-aligned, but because the actual copy region is not,
          * we may need to start at an offset position on the left/top edges */
         unsigned src_chunk_x = MAX2(src_x, x);
         unsigned src_chunk_y = MAX2(src_y, y);
         unsigned dst_chunk_x = dst_x + (src_chunk_x - src_x);
         unsigned dst_chunk_y = dst_y + (src_chunk_y - src_y);

         /* Similarly, right/bottom edges may not need a whole chunk. The
          * far edges are measured from the tile-aligned chunk origin (x, y)
          * rather than from the clamped start, so that consecutive chunks
          * abut exactly: the first chunk stops where the next one begins
          * instead of overlapping it and copying the shared strip twice. */
         unsigned src_chunk_right = MIN2(x + chunk_width_px, src_x + w);
         unsigned src_chunk_bottom = MIN2(y + chunk_height_px, src_y + h);
         unsigned width = src_chunk_right - src_chunk_x;
         unsigned height = src_chunk_bottom - src_chunk_y;

         pan_load_tiled_image(
            chunk, src, src_chunk_x, src_chunk_y, width, height,
            chunk_row_stride_B, src_stride, format);
         pan_store_tiled_image(
            dst, chunk, dst_chunk_x, dst_chunk_y, width, height, dst_stride,
            chunk_row_stride_B, format);
      }
   }

   ralloc_free(chunk);
}

View file

@ -68,6 +68,26 @@ void pan_store_tiled_image(void *dst, const void *src, unsigned x, unsigned y,
unsigned w, unsigned h, uint32_t dst_stride,
uint32_t src_stride, enum pipe_format format);
/**
 * Copy a rectangular region from one tiled image to another.
 *
 * The source and destination share a single format. When both origins and
 * the region size are multiples of the tile dimensions, whole rows of tiles
 * are copied directly with memcpy. Otherwise the region is detiled into a
 * temporary linear buffer and re-tiled into the destination, one chunk at a
 * time.
 *
 * NOTE(review): the tile-aligned path uses memcpy, so the source and
 * destination regions presumably must not overlap in memory — confirm that
 * callers guarantee this.
 *
 * @dst Tiled destination
 * @src Tiled source
 * @dst_x Region of interest of destination in pixels, aligned to block size
 * @dst_y Region of interest of destination in pixels, aligned to block size
 * @src_x Region of interest of source in pixels, aligned to block size
 * @src_y Region of interest of source in pixels, aligned to block size
 * @w Size of region of interest in pixels, aligned to block size
 * @h Size of region of interest in pixels, aligned to block size
 * @dst_stride Number of bytes between adjacent rows of tiles in destination.
 * @src_stride Number of bytes between adjacent rows of tiles in source.
 * @format Format of the source and destination image
 */
void pan_copy_tiled_image(void *dst, const void *src, unsigned dst_x,
unsigned dst_y, unsigned src_x, unsigned src_y,
unsigned w, unsigned h, uint32_t dst_stride,
uint32_t src_stride, enum pipe_format format);
#ifdef __cplusplus
} /* extern C */
#endif