mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-03-21 07:30:33 +01:00
pan/shared: add function to copy between two tiled images
This is needed for VK_EXT_host_image_copy. Most other mesa drivers use a similar approach to implement tiled->tiled copy, with a few differences. They use a temp buffer sized for only one tile, don't attempt to tile-align the copies in either the src or dest, and they don't have the memcpy fast path. I measured performance of a variety of implementations on a rock5b, and found: - The fast path for when the copy region is tile-aligned is a 167% improvement. - Aligning the temp buffer chunks to src tiles is a 20% improvement. - Using a 64k buffer instead of a tile-sized buffer is a 14% improvement. This buffer size appears optimal in my benchmark, smaller and larger buffers are both slower. Skipping the chunk approach and just (de)tiling to a temp buffer that fits the whole image (what NVK does) is also slower. - I had no luck with attempts at a direct tiled->tiled copy algorithm that didn't need a temp buffer. The fastest I got was ~1/4 the speed of the temp buffer implementation. Signed-off-by: Olivia Lee <olivia.lee@collabora.com> Reviewed-by: Erik Faye-Lund <erik.faye-lund@collabora.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/35910>
This commit is contained in:
parent
d3150006be
commit
0f6a06bbba
2 changed files with 115 additions and 0 deletions
|
|
@ -26,9 +26,11 @@
|
|||
*/
|
||||
|
||||
#include "pan_tiling.h"
|
||||
#include <math.h>
|
||||
#include <stdbool.h>
|
||||
#include "util/bitscan.h"
|
||||
#include "util/macros.h"
|
||||
#include "util/ralloc.h"
|
||||
|
||||
/*
|
||||
* This file implements software encode/decode of u-interleaved textures.
|
||||
|
|
@ -400,3 +402,96 @@ pan_load_tiled_image(void *dst, const void *src, unsigned x, unsigned y,
|
|||
pan_access_tiled_image((void *)src, dst, x, y, w, h, src_stride, dst_stride,
|
||||
format, false);
|
||||
}
|
||||
|
||||
void
|
||||
pan_copy_tiled_image(void *dst, const void *src, unsigned dst_x, unsigned dst_y,
|
||||
unsigned src_x, unsigned src_y, unsigned w, unsigned h,
|
||||
uint32_t dst_stride, uint32_t src_stride,
|
||||
enum pipe_format format)
|
||||
{
|
||||
const struct util_format_description *desc = util_format_description(format);
|
||||
unsigned block_size_B = desc->block.bits / 8;
|
||||
|
||||
/* If both the src and dst region are tile-aligned, we can just memcpy
|
||||
* whole tiles without any (de)tiling */
|
||||
if (src_x % TILE_WIDTH == 0 && src_y % TILE_HEIGHT == 0 &&
|
||||
dst_x % TILE_WIDTH == 0 && dst_y % TILE_HEIGHT == 0 &&
|
||||
w % TILE_WIDTH == 0 && h % TILE_HEIGHT == 0) {
|
||||
|
||||
unsigned tile_size_B = block_size_B * PIXELS_PER_TILE;
|
||||
|
||||
unsigned w_t = w / TILE_WIDTH;
|
||||
unsigned h_t = h / TILE_HEIGHT;
|
||||
unsigned src_x_t = src_x / TILE_WIDTH;
|
||||
unsigned src_y_t = src_y / TILE_HEIGHT;
|
||||
unsigned dst_x_t = dst_x / TILE_WIDTH;
|
||||
unsigned dst_y_t = dst_y / TILE_HEIGHT;
|
||||
|
||||
for (unsigned y_t = 0; y_t < h_t; y_t++) {
|
||||
void *dst_tile_row = dst +
|
||||
(y_t + dst_y_t) * dst_stride +
|
||||
dst_x_t * tile_size_B;
|
||||
const void *src_tile_row = src +
|
||||
(y_t + src_y_t) * src_stride +
|
||||
src_x_t * tile_size_B;
|
||||
memcpy(dst_tile_row, src_tile_row, tile_size_B * w_t);
|
||||
}
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
/* Otherwise, we copy by working across the copy region in 64KiB chunks.
|
||||
* For each chunk, we detile part of the src into a linear tempoaray
|
||||
* buffer, then tile to the dst */
|
||||
|
||||
/* This could fit on the stack easily on glibc, but it's dicier on musl,
|
||||
* which has a 128KiB stack size */
|
||||
const size_t chunk_size_B = 65536;
|
||||
void *chunk = ralloc_size(NULL, chunk_size_B);
|
||||
|
||||
/* Choose pixel dimensions of the chunk. These should be tile aligned,
|
||||
* maximize used space in the buffer, and be close to a square. */
|
||||
unsigned chunk_size_bl = chunk_size_B / block_size_B;
|
||||
unsigned chunk_width_bl = (unsigned) sqrtf((float) (chunk_size_bl));
|
||||
chunk_width_bl = (chunk_width_bl / TILE_WIDTH) * TILE_WIDTH;
|
||||
unsigned chunk_height_bl = chunk_size_bl / chunk_width_bl;
|
||||
chunk_height_bl = (chunk_height_bl / TILE_HEIGHT) * TILE_HEIGHT;
|
||||
|
||||
unsigned chunk_width_px = chunk_width_bl * desc->block.width;
|
||||
unsigned chunk_height_px = chunk_height_bl * desc->block.height;
|
||||
|
||||
unsigned chunk_row_stride_B = chunk_width_bl * block_size_B;
|
||||
|
||||
/* Align chunk copy regions to src tiles, to optimize detiling. We can't
|
||||
* get tile alignment on both src and dst, but one is better than nothing. */
|
||||
unsigned src_first_tile_x = (src_x / TILE_WIDTH) * TILE_WIDTH;
|
||||
unsigned src_first_tile_y = (src_y / TILE_HEIGHT) * TILE_HEIGHT;
|
||||
|
||||
for (unsigned x = src_first_tile_x; x < src_x + w; x += chunk_width_px) {
|
||||
for (unsigned y = src_first_tile_y; y < src_y + h; y += chunk_height_px) {
|
||||
/* x/y are tile-aligned, but because the actual copy region is not,
|
||||
* we may need to start at an offset position on the left/top edges */
|
||||
unsigned src_chunk_x = MAX2(src_x, x);
|
||||
unsigned src_chunk_y = MAX2(src_y, y);
|
||||
unsigned dst_chunk_x = dst_x + (src_chunk_x - src_x);
|
||||
unsigned dst_chunk_y = dst_y + (src_chunk_y - src_y);
|
||||
|
||||
/* Similarly, right/bottom edges may not need a whole chunk */
|
||||
unsigned src_chunk_right = MIN2(src_chunk_x + chunk_width_px,
|
||||
src_x + w);
|
||||
unsigned src_chunk_bottom = MIN2(src_chunk_y + chunk_height_px,
|
||||
src_y + h);
|
||||
unsigned width = src_chunk_right - src_chunk_x;
|
||||
unsigned height = src_chunk_bottom - src_chunk_y;
|
||||
|
||||
pan_load_tiled_image(
|
||||
chunk, src, src_chunk_x, src_chunk_y, width, height,
|
||||
chunk_row_stride_B, src_stride, format);
|
||||
pan_store_tiled_image(
|
||||
dst, chunk, dst_chunk_x, dst_chunk_y, width, height, dst_stride,
|
||||
chunk_row_stride_B, format);
|
||||
}
|
||||
}
|
||||
|
||||
ralloc_free(chunk);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -68,6 +68,26 @@ void pan_store_tiled_image(void *dst, const void *src, unsigned x, unsigned y,
|
|||
unsigned w, unsigned h, uint32_t dst_stride,
|
||||
uint32_t src_stride, enum pipe_format format);
|
||||
|
||||
/**
 * Copy a rectangular region from one tiled image to another.
 *
 * The source and destination regions must not overlap in memory: when the
 * region is tile-aligned in both images, whole tiles are copied with memcpy.
 *
 * @dst Tiled destination
 * @src Tiled source
 * @dst_x Region of interest of destination in pixels, aligned to block size
 * @dst_y Region of interest of destination in pixels, aligned to block size
 * @src_x Region of interest of source in pixels, aligned to block size
 * @src_y Region of interest of source in pixels, aligned to block size
 * @w Size of region of interest in pixels, aligned to block size
 * @h Size of region of interest in pixels, aligned to block size
 * @dst_stride Number of bytes between adjacent rows of tiles in destination.
 * @src_stride Number of bytes between adjacent rows of tiles in source.
 * @format Format of the source and destination image
 */
|
||||
void pan_copy_tiled_image(void *dst, const void *src, unsigned dst_x,
|
||||
unsigned dst_y, unsigned src_x, unsigned src_y,
|
||||
unsigned w, unsigned h, uint32_t dst_stride,
|
||||
uint32_t src_stride, enum pipe_format format);
|
||||
|
||||
#ifdef __cplusplus
|
||||
} /* extern C */
|
||||
#endif
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue