freedreno: Implement fast clear_buffer for Adreno 6xx and 7xx

Improve the performance of:
 - the OpenGL ARB_clear_buffer_object implementation
 - the OpenCL clEnqueueFillBuffer implementation

Reviewed-by: Rob Clark <robclark@freedesktop.org>
Signed-off-by: David Heidelberg <david@ixit.cz>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30284>
This commit is contained in:
David Heidelberg 2024-07-08 18:20:56 -07:00
parent ddfb8ebf8c
commit dafc4476f7

View file

@ -30,7 +30,9 @@
#include "util/format_srgb.h"
#include "util/half_float.h"
#include "util/u_dump.h"
#include "util/u_helpers.h"
#include "util/u_log.h"
#include "util/u_transfer.h"
#include "util/u_surface.h"
#include "freedreno_blitter.h"
@ -868,6 +870,113 @@ convert_color(enum pipe_format format, union pipe_color_union *pcolor)
return color;
}
/**
 * Fill a byte range of a buffer resource with a repeating pattern by issuing
 * 2D clear blits on a dedicated batch.
 *
 * @param pctx             context the clear is issued on
 * @param prsc             buffer resource to clear
 * @param offset           byte offset of the first byte to clear
 * @param size             number of bytes to clear
 * @param clear_value      pattern to replicate (clear_value_size bytes)
 * @param clear_value_size pattern size in bytes; 1, 2, 4, 8 and 16 are
 *                         handled here
 *
 * Falls back to u_default_clear_buffer() when the pattern size is not one of
 * the handled sizes, or when offset is not a multiple of the pattern size.
 *
 * Only whole elements are cleared: if size is not a multiple of
 * clear_value_size the trailing partial element is left untouched.
 */
template <chip CHIP>
static void
fd6_clear_buffer(struct pipe_context *pctx,
                 struct pipe_resource *prsc,
                 unsigned offset, unsigned size,
                 const void *clear_value, int clear_value_size)
{
   enum pipe_format dst_fmt;
   union pipe_color_union color;

   /* Channels not covered by the pattern are left as zero. */
   memset(&color, 0, sizeof(color));

   switch (clear_value_size) {
   case 16:
      dst_fmt = PIPE_FORMAT_R32G32B32A32_UINT;
      memcpy(color.ui, clear_value, 16);
      break;
   case 8:
      dst_fmt = PIPE_FORMAT_R32G32_UINT;
      memcpy(color.ui, clear_value, 8);
      break;
   case 4:
      dst_fmt = PIPE_FORMAT_R32_UINT;
      memcpy(color.ui, clear_value, 4);
      break;
   case 2: {
      /* clear_value carries no alignment guarantee, so go through memcpy
       * rather than dereferencing it as a 16-bit pointer.
       */
      unsigned short v16;
      memcpy(&v16, clear_value, sizeof(v16));
      dst_fmt = PIPE_FORMAT_R16_UINT;
      color.ui[0] = v16;
      break;
   }
   case 1:
      dst_fmt = PIPE_FORMAT_R8_UINT;
      color.ui[0] = *(const unsigned char *)clear_value;
      break;
   default:
      dst_fmt = PIPE_FORMAT_NONE;
      break;
   }

   /* Fall back to the default path if the pattern size is unsupported or the
    * offset is not aligned to the pattern size.  The format check comes
    * first so the modulo is never evaluated for an unsupported size.
    */
   if ((dst_fmt == PIPE_FORMAT_NONE) || (offset % clear_value_size)) {
      u_default_clear_buffer(pctx, prsc, offset, size, clear_value, clear_value_size);
      return;
   }

   if (DEBUG_BLIT) {
      fprintf(stderr, "buffer clear:\ndst resource: ");
      util_dump_resource(stderr, prsc);
      fprintf(stderr, "\n");
   }

   struct fd_context *ctx = fd_context(pctx);
   struct fd_resource *rsc = fd_resource(prsc);
   struct fd_batch *batch = fd_bc_alloc_batch(ctx, true);
   struct fd_ringbuffer *ring = batch->draw;

   fd_screen_lock(ctx->screen);
   fd_batch_resource_write(batch, rsc);
   fd_screen_unlock(ctx->screen);

   assert(!batch->flushed);

   /* Marking the batch as needing flush must come after the batch
    * dependency tracking (resource_read()/resource_write()), as that
    * can trigger a flush
    */
   fd_batch_needs_flush(batch);

   fd_batch_update_queries(batch);

   emit_setup<CHIP>(batch);

   emit_clear_color(ring, dst_fmt, &color);
   emit_blit_setup<CHIP>(ring, dst_fmt, false, &color, 0, ROTATE_0);

   /* The range is cleared as a sequence of one-row blits.  Each blit's base
    * address is rounded down to 64 bytes and the leftover bytes are turned
    * into an X offset, which the blitter takes in elements of dst_fmt, not
    * bytes.  NOTE(review): 0x4000 is presumably the maximum blit width; the
    * 0x40 of slack keeps (x offset + width) below it -- confirm against the
    * hardware documentation.
    */
   const unsigned elem_size = clear_value_size;
   const unsigned max_elems = 0x4000 - 0x40;
   unsigned remaining = size / elem_size;   /* whole elements left to clear */
   unsigned cur = offset;                   /* byte offset of next element */

   while (remaining > 0) {
      unsigned doff = cur & ~0x3f;
      unsigned dshift = (cur & 0x3f) / elem_size;
      unsigned w = MIN2(remaining, max_elems);

      emit_blit_buffer_dst(ring, rsc, doff, 0, fd6_color_format(dst_fmt, TILE6_LINEAR));

      OUT_PKT4(ring, REG_A6XX_GRAS_2D_DST_TL, 2);
      OUT_RING(ring, A6XX_GRAS_2D_DST_TL_X(dshift) | A6XX_GRAS_2D_DST_TL_Y(0));
      OUT_RING(ring, A6XX_GRAS_2D_DST_BR_X(dshift + w - 1) |
                        A6XX_GRAS_2D_DST_BR_Y(0));

      emit_blit_fini<CHIP>(ctx, ring);

      /* Advance by exactly what this blit covered, in bytes. */
      cur += w * elem_size;
      remaining -= w;
   }

   fd6_emit_flushes<CHIP>(batch->ctx, ring,
                          FD6_FLUSH_CCU_COLOR |
                          FD6_FLUSH_CCU_DEPTH |
                          FD6_FLUSH_CACHE |
                          FD6_WAIT_FOR_IDLE);

   fd_batch_flush(batch);
   fd_batch_reference(&batch, NULL);

   /* Acc query state will have been dirtied by our fd_batch_update_queries, so
    * the ctx->batch may need to turn its queries back on.
    */
   fd_context_dirty(ctx, FD_DIRTY_QUERY);
}
template <chip CHIP>
void
fd6_clear_surface(struct fd_context *ctx, struct fd_ringbuffer *ring,
@ -1369,6 +1478,7 @@ fd6_blitter_init(struct pipe_context *pctx)
if (FD_DBG(NOBLIT))
return;
pctx->clear_buffer = fd6_clear_buffer<CHIP>;
pctx->clear_texture = fd6_clear_texture<CHIP>;
ctx->blit = fd6_blit<CHIP>;
}