mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-05-08 04:48:08 +02:00
vc4: Handle partial loads/stores of tiled textures.
Previously, we would load out the tile-aligned area, update the raster copy,
and store it back. This was a huge cost for XPutImage calls to the screen
under glamor.

Instead, implement a general load/store path that walks over the source x/y
writing into the corresponding pixel of the destination (using clever math
from https://fgiesen.wordpress.com/2011/01/17/texture-tiling-and-swizzling/).
If things are aligned, we go through the previous utile-at-a-time loop.

Improves x11perf -putimage10 performance by 139.777% +/- 2.83464% (n=5)
Improves x11perf -putimage100 performance by 383.908% +/- 22.6297% (n=11)
Improves x11perf -getimage10 performance by 2.75731% +/- 0.585054% (n=145)
This commit is contained in:
parent
3e06b918aa
commit
25bee5ef9e
3 changed files with 155 additions and 60 deletions
|
|
@ -181,9 +181,6 @@ vc4_resource_transfer_map(struct pipe_context *pctx,
|
||||||
|
|
||||||
struct vc4_resource_slice *slice = &rsc->slices[level];
|
struct vc4_resource_slice *slice = &rsc->slices[level];
|
||||||
if (rsc->tiled) {
|
if (rsc->tiled) {
|
||||||
uint32_t utile_w = vc4_utile_width(rsc->cpp);
|
|
||||||
uint32_t utile_h = vc4_utile_height(rsc->cpp);
|
|
||||||
|
|
||||||
/* No direct mappings of tiled, since we need to manually
|
/* No direct mappings of tiled, since we need to manually
|
||||||
* tile/untile.
|
* tile/untile.
|
||||||
*/
|
*/
|
||||||
|
|
@ -204,49 +201,12 @@ vc4_resource_transfer_map(struct pipe_context *pctx,
|
||||||
ptrans->box.height = (ptrans->box.height + 3) >> 2;
|
ptrans->box.height = (ptrans->box.height + 3) >> 2;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* We need to align the box to utile boundaries, since that's
|
|
||||||
* what load/store operates on. This may cause us to need to
|
|
||||||
* read out the original contents in that border area. Right
|
|
||||||
* now we just read out the entire contents, including the
|
|
||||||
* middle area that will just get overwritten.
|
|
||||||
*/
|
|
||||||
uint32_t box_start_x = ptrans->box.x & (utile_w - 1);
|
|
||||||
uint32_t box_start_y = ptrans->box.y & (utile_h - 1);
|
|
||||||
bool needs_load = (usage & PIPE_TRANSFER_READ) != 0;
|
|
||||||
|
|
||||||
if (box_start_x) {
|
|
||||||
ptrans->box.width += box_start_x;
|
|
||||||
ptrans->box.x -= box_start_x;
|
|
||||||
needs_load = true;
|
|
||||||
}
|
|
||||||
if (box_start_y) {
|
|
||||||
ptrans->box.height += box_start_y;
|
|
||||||
ptrans->box.y -= box_start_y;
|
|
||||||
needs_load = true;
|
|
||||||
}
|
|
||||||
if (ptrans->box.width & (utile_w - 1)) {
|
|
||||||
/* We only need to force a load if our border region
|
|
||||||
* we're extending into is actually part of the
|
|
||||||
* texture.
|
|
||||||
*/
|
|
||||||
uint32_t slice_width = u_minify(prsc->width0, level);
|
|
||||||
if (ptrans->box.x + ptrans->box.width != slice_width)
|
|
||||||
needs_load = true;
|
|
||||||
ptrans->box.width = align(ptrans->box.width, utile_w);
|
|
||||||
}
|
|
||||||
if (ptrans->box.height & (utile_h - 1)) {
|
|
||||||
uint32_t slice_height = u_minify(prsc->height0, level);
|
|
||||||
if (ptrans->box.y + ptrans->box.height != slice_height)
|
|
||||||
needs_load = true;
|
|
||||||
ptrans->box.height = align(ptrans->box.height, utile_h);
|
|
||||||
}
|
|
||||||
|
|
||||||
ptrans->stride = ptrans->box.width * rsc->cpp;
|
ptrans->stride = ptrans->box.width * rsc->cpp;
|
||||||
ptrans->layer_stride = ptrans->stride * ptrans->box.height;
|
ptrans->layer_stride = ptrans->stride * ptrans->box.height;
|
||||||
|
|
||||||
trans->map = malloc(ptrans->layer_stride * ptrans->box.depth);
|
trans->map = malloc(ptrans->layer_stride * ptrans->box.depth);
|
||||||
|
|
||||||
if (needs_load) {
|
if (usage & PIPE_TRANSFER_READ) {
|
||||||
vc4_load_tiled_image(trans->map, ptrans->stride,
|
vc4_load_tiled_image(trans->map, ptrans->stride,
|
||||||
buf + slice->offset +
|
buf + slice->offset +
|
||||||
ptrans->box.z * rsc->cube_map_stride,
|
ptrans->box.z * rsc->cube_map_stride,
|
||||||
|
|
@ -254,9 +214,7 @@ vc4_resource_transfer_map(struct pipe_context *pctx,
|
||||||
slice->tiling, rsc->cpp,
|
slice->tiling, rsc->cpp,
|
||||||
&ptrans->box);
|
&ptrans->box);
|
||||||
}
|
}
|
||||||
return (trans->map +
|
return trans->map;
|
||||||
box_start_x * rsc->cpp +
|
|
||||||
box_start_y * ptrans->stride);
|
|
||||||
} else {
|
} else {
|
||||||
ptrans->stride = slice->stride;
|
ptrans->stride = slice->stride;
|
||||||
ptrans->layer_stride = ptrans->stride;
|
ptrans->layer_stride = ptrans->stride;
|
||||||
|
|
|
||||||
|
|
@ -63,15 +63,6 @@ vc4_size_is_lt(uint32_t width, uint32_t height, int cpp)
|
||||||
height <= 4 * vc4_utile_height(cpp));
|
height <= 4 * vc4_utile_height(cpp));
|
||||||
}
|
}
|
||||||
|
|
||||||
static void
|
|
||||||
check_box_utile_alignment(const struct pipe_box *box, int cpp)
|
|
||||||
{
|
|
||||||
assert(!(box->x & (vc4_utile_width(cpp) - 1)));
|
|
||||||
assert(!(box->y & (vc4_utile_height(cpp) - 1)));
|
|
||||||
assert(!(box->width & (vc4_utile_width(cpp) - 1)));
|
|
||||||
assert(!(box->height & (vc4_utile_height(cpp) - 1)));
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Takes a utile x and y (and the number of utiles of width of the image) and
|
* Takes a utile x and y (and the number of utiles of width of the image) and
|
||||||
* returns the offset to the utile within a VC4_TILING_FORMAT_TF image.
|
* returns the offset to the utile within a VC4_TILING_FORMAT_TF image.
|
||||||
|
|
@ -216,8 +207,6 @@ vc4_load_tiled_image(void *dst, uint32_t dst_stride,
|
||||||
uint8_t tiling_format, int cpp,
|
uint8_t tiling_format, int cpp,
|
||||||
const struct pipe_box *box)
|
const struct pipe_box *box)
|
||||||
{
|
{
|
||||||
check_box_utile_alignment(box, cpp);
|
|
||||||
|
|
||||||
if (tiling_format == VC4_TILING_FORMAT_LT) {
|
if (tiling_format == VC4_TILING_FORMAT_LT) {
|
||||||
vc4_load_lt_image(dst, dst_stride,
|
vc4_load_lt_image(dst, dst_stride,
|
||||||
src, src_stride,
|
src, src_stride,
|
||||||
|
|
@ -240,8 +229,6 @@ vc4_store_tiled_image(void *dst, uint32_t dst_stride,
|
||||||
uint8_t tiling_format, int cpp,
|
uint8_t tiling_format, int cpp,
|
||||||
const struct pipe_box *box)
|
const struct pipe_box *box)
|
||||||
{
|
{
|
||||||
check_box_utile_alignment(box, cpp);
|
|
||||||
|
|
||||||
if (tiling_format == VC4_TILING_FORMAT_LT) {
|
if (tiling_format == VC4_TILING_FORMAT_LT) {
|
||||||
vc4_store_lt_image(dst, dst_stride,
|
vc4_store_lt_image(dst, dst_stride,
|
||||||
src, src_stride,
|
src, src_stride,
|
||||||
|
|
|
||||||
|
|
@ -41,6 +41,12 @@
|
||||||
#define NEON_TAG(x) x ## _base
|
#define NEON_TAG(x) x ## _base
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
/**
 * Rounds val down by clearing the bits selected by (align - 1).
 *
 * The result is a multiple of align only when align is a power of two, since
 * the rounding is a plain bit mask rather than a division.
 */
static inline uint32_t
align_down(uint32_t val, uint32_t align)
{
        const uint32_t low_bits = align - 1;

        return val & ~low_bits;
}
|
||||||
|
|
||||||
/** Returns the stride in bytes of a 64-byte microtile. */
|
/** Returns the stride in bytes of a 64-byte microtile. */
|
||||||
static uint32_t
|
static uint32_t
|
||||||
vc4_utile_stride(int cpp)
|
vc4_utile_stride(int cpp)
|
||||||
|
|
@ -252,6 +258,66 @@ vc4_store_utile(void *gpu, void *cpu, uint32_t cpu_stride, uint32_t cpp)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
}
|
}
|
||||||
|
/**
 * Returns the X value into the address bits for LT tiling.
 *
 * The LT tile load/stores rely on the X bits not intersecting with the Y
 * bits.  Because of this, we have to choose to put the utile index within the
 * LT tile into one of the two values, and we do so in swizzle_lt_x() to make
 * NPOT handling easier.
 *
 * For each cpp, the low bits of x (the column within a utile) are shifted up
 * by log2(cpp) to form the byte offset within a utile row, and the remaining
 * high bits of x are moved up so that they start at address bit 6.
 *
 * \param x    pixel column; ~0 may be passed to obtain the mask of all
 *             address bits that come from x.
 * \param cpp  bytes per pixel: 1, 2, 4 or 8.
 * \return the address bits contributed by x.
 */
static uint32_t
swizzle_lt_x(int x, int cpp)
{
        /* Do the bit twiddling on an unsigned copy: passing ~0 (i.e. -1)
         * would otherwise left-shift a negative int, which is undefined
         * behavior in C.  The resulting bit pattern is unchanged.
         */
        const uint32_t ux = (uint32_t)x;

        switch (cpp) {
        case 1:
                /* 8x8 inside of 4x4 */
                return ((ux & 0x7u) << (0 - 0) |
                        (ux & ~0x7u) << (6 - 3));
        case 2:
                /* 8x4 inside of 4x4 */
                return ((ux & 0x7u) << (1 - 0) |
                        (ux & ~0x7u) << (6 - 3));
        case 4:
                /* 4x4 inside of 4x4 */
                return ((ux & 0x3u) << (2 - 0) |
                        (ux & ~0x3u) << (6 - 2));
        case 8:
                /* 2x4 inside of 4x4 */
                return ((ux & 0x1u) << (3 - 0) |
                        (ux & ~0x1u) << (6 - 1));
        default:
                unreachable("bad cpp");
        }
}
|
||||||
|
|
||||||
|
/**
 * Returns the Y value into the address bits for LT tiling.
 *
 * The LT tile load/stores rely on the X bits not intersecting with the Y
 * bits.  Only the row within the utile is encoded here: three bits of y
 * placed at address bit 3 for cpp 1, two bits of y placed at address bit 4
 * for every other supported cpp.
 *
 * \param y    pixel row; ~0 may be passed to obtain the mask of all address
 *             bits that come from y.
 * \param cpp  bytes per pixel: 1, 2, 4 or 8.
 */
static uint32_t
swizzle_lt_y(int y, int cpp)
{
        switch (cpp) {
        case 1:
                /* 8x8 inside of 4x4 */
                return (y & 0x7) << 3;
        case 2: /* 8x4 inside of 4x4 */
        case 4: /* 4x4 inside of 4x4 */
        case 8: /* 2x4 inside of 4x4 */
                return (y & 0x3) << 4;
        default:
                unreachable("bad cpp");
        }
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Helper for loading or storing to an LT image, where the box is aligned
|
* Helper for loading or storing to an LT image, where the box is aligned
|
||||||
|
|
@ -261,9 +327,9 @@ vc4_store_utile(void *gpu, void *cpu, uint32_t cpu_stride, uint32_t cpp)
|
||||||
* vc4_load_utile/vc4_store_utile helpers.
|
* vc4_load_utile/vc4_store_utile helpers.
|
||||||
*/
|
*/
|
||||||
static inline void
|
static inline void
|
||||||
vc4_lt_image_helper(void *gpu, uint32_t gpu_stride,
|
vc4_lt_image_aligned(void *gpu, uint32_t gpu_stride,
|
||||||
void *cpu, uint32_t cpu_stride,
|
void *cpu, uint32_t cpu_stride,
|
||||||
int cpp, const struct pipe_box *box, bool to_cpu)
|
int cpp, const struct pipe_box *box, bool to_cpu)
|
||||||
{
|
{
|
||||||
uint32_t utile_w = vc4_utile_width(cpp);
|
uint32_t utile_w = vc4_utile_width(cpp);
|
||||||
uint32_t utile_h = vc4_utile_height(cpp);
|
uint32_t utile_h = vc4_utile_height(cpp);
|
||||||
|
|
@ -289,6 +355,90 @@ vc4_lt_image_helper(void *gpu, uint32_t gpu_stride,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Helper for loading or storing to an LT image, where the box is not aligned
|
||||||
|
* to utiles.
|
||||||
|
*
|
||||||
|
* This walks through the raster-order data, copying to/from the corresponding
|
||||||
|
* tiled pixel. This means we don't get write-combining on stores, but the
|
||||||
|
* loop is very few CPU instructions since the memcpy will be inlined.
|
||||||
|
*/
|
||||||
|
static inline void
|
||||||
|
vc4_lt_image_unaligned(void *gpu, uint32_t gpu_stride,
|
||||||
|
void *cpu, uint32_t cpu_stride,
|
||||||
|
int cpp, const struct pipe_box *box, bool to_cpu)
|
||||||
|
{
|
||||||
|
|
||||||
|
/* These are the address bits for the start of the box, split out into
|
||||||
|
* x/y so that they can be incremented separately in their loops.
|
||||||
|
*/
|
||||||
|
uint32_t offs_x0 = swizzle_lt_x(box->x, cpp);
|
||||||
|
uint32_t offs_y = swizzle_lt_y(box->y, cpp);
|
||||||
|
/* The *_mask values are "what bits of the address are from x or y" */
|
||||||
|
uint32_t x_mask = swizzle_lt_x(~0, cpp);
|
||||||
|
uint32_t y_mask = swizzle_lt_y(~0, cpp);
|
||||||
|
uint32_t incr_y = swizzle_lt_x(gpu_stride / cpp, cpp);
|
||||||
|
|
||||||
|
assert(!(x_mask & y_mask));
|
||||||
|
|
||||||
|
offs_x0 += incr_y * (box->y / vc4_utile_height(cpp));
|
||||||
|
|
||||||
|
for (uint32_t y = 0; y < box->height; y++) {
|
||||||
|
void *gpu_row = gpu + offs_y;
|
||||||
|
|
||||||
|
uint32_t offs_x = offs_x0;
|
||||||
|
|
||||||
|
for (uint32_t x = 0; x < box->width; x++) {
|
||||||
|
/* Use a memcpy here to move a pixel's worth of data.
|
||||||
|
* We're relying on this function to be inlined, so
|
||||||
|
* this will get expanded into the appropriate 1, 2,
|
||||||
|
* or 4-byte move.
|
||||||
|
*/
|
||||||
|
if (to_cpu) {
|
||||||
|
memcpy(cpu + x * cpp, gpu_row + offs_x, cpp);
|
||||||
|
} else {
|
||||||
|
memcpy(gpu_row + offs_x, cpu + x * cpp, cpp);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* This math trick with x_mask increments offs_x by 1
|
||||||
|
* in x.
|
||||||
|
*/
|
||||||
|
offs_x = (offs_x - x_mask) & x_mask;
|
||||||
|
}
|
||||||
|
|
||||||
|
offs_y = (offs_y - y_mask) & y_mask;
|
||||||
|
/* When offs_y wraps (we hit the end of the utile), we
|
||||||
|
* increment offs_x0 by effectively the utile stride.
|
||||||
|
*/
|
||||||
|
if (!offs_y)
|
||||||
|
offs_x0 += incr_y;
|
||||||
|
|
||||||
|
cpu += cpu_stride;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* General LT image load/store helper.
|
||||||
|
*/
|
||||||
|
static inline void
|
||||||
|
vc4_lt_image_helper(void *gpu, uint32_t gpu_stride,
|
||||||
|
void *cpu, uint32_t cpu_stride,
|
||||||
|
int cpp, const struct pipe_box *box, bool to_cpu)
|
||||||
|
{
|
||||||
|
if (box->x & (vc4_utile_width(cpp) - 1) ||
|
||||||
|
box->y & (vc4_utile_height(cpp) - 1) ||
|
||||||
|
box->width & (vc4_utile_width(cpp) - 1) ||
|
||||||
|
box->height & (vc4_utile_height(cpp) - 1)) {
|
||||||
|
vc4_lt_image_unaligned(gpu, gpu_stride,
|
||||||
|
cpu, cpu_stride,
|
||||||
|
cpp, box, to_cpu);
|
||||||
|
} else {
|
||||||
|
vc4_lt_image_aligned(gpu, gpu_stride,
|
||||||
|
cpu, cpu_stride,
|
||||||
|
cpp, box, to_cpu);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
static inline void
|
static inline void
|
||||||
vc4_lt_image_cpp_helper(void *gpu, uint32_t gpu_stride,
|
vc4_lt_image_cpp_helper(void *gpu, uint32_t gpu_stride,
|
||||||
void *cpu, uint32_t cpu_stride,
|
void *cpu, uint32_t cpu_stride,
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue