freedreno/a6xx: Rework barrier handling

Signed-off-by: Rob Clark <robdclark@chromium.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/20575>
This commit is contained in:
Rob Clark 2023-01-07 08:01:27 -08:00 committed by Marge Bot
parent 63e889516d
commit aac66fe039
11 changed files with 263 additions and 63 deletions

View file

@ -0,0 +1,198 @@
/*
* Copyright © 2023 Google, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#define FD_BO_NO_HARDPIN 1
#include "freedreno_batch.h"
#include "fd6_barrier.h"
#include "fd6_context.h"
/* TODO probably more of the various fd6_event_write() should be
* consolidated here.
*/
/*
 * Emit a CP_EVENT_WRITE for the given event.  Timestamp-producing
 * events ("*_TS") get a fresh per-context seqno written back to the
 * control buffer; that seqno is returned so callers can wait on it.
 * For non-timestamp events, zero is returned.
 */
static uint32_t
event_write(struct fd_context *ctx, struct fd_ringbuffer *ring,
            enum vgt_event_type evt)
{
   bool needs_seqno;

   switch (evt) {
   case CACHE_FLUSH_TS:
   case WT_DONE_TS:
   case RB_DONE_TS:
   case PC_CCU_FLUSH_DEPTH_TS:
   case PC_CCU_FLUSH_COLOR_TS:
   case PC_CCU_RESOLVE_TS:
      needs_seqno = true;
      break;
   default:
      needs_seqno = false;
      break;
   }

   OUT_PKT7(ring, CP_EVENT_WRITE, needs_seqno ? 4 : 1);
   OUT_RING(ring, CP_EVENT_WRITE_0_EVENT(evt));

   if (!needs_seqno)
      return 0;

   struct fd6_context *fd6_ctx = fd6_context(ctx);
   uint32_t seqno = ++fd6_ctx->seqno;

   OUT_RELOC(ring, control_ptr(fd6_ctx, seqno)); /* ADDR_LO/HI */
   OUT_RING(ring, seqno);

   return seqno;
}
/*
 * Emit the cache flush/invalidate events and CP waits encoded in the
 * 'flushes' bitmask (enum fd6_flush) into the given ringbuffer.  The
 * order of emission below is deliberate: CCU flushes before CCU
 * invalidates, cache events before CP waits.
 */
static void
fd6_emit_flushes(struct fd_context *ctx, struct fd_ringbuffer *ring,
                 unsigned flushes)
{
   /* Experiments show that invalidating CCU while it still has data in it
    * doesn't work, so make sure to always flush before invalidating in case
    * any data remains that hasn't yet been made available through a barrier.
    * However it does seem to work for UCHE.
    */
   if (flushes & (FD6_FLUSH_CCU_COLOR | FD6_INVALIDATE_CCU_COLOR))
      event_write(ctx, ring, PC_CCU_FLUSH_COLOR_TS);

   if (flushes & (FD6_FLUSH_CCU_DEPTH | FD6_INVALIDATE_CCU_DEPTH))
      event_write(ctx, ring, PC_CCU_FLUSH_DEPTH_TS);

   if (flushes & FD6_INVALIDATE_CCU_COLOR)
      event_write(ctx, ring, PC_CCU_INVALIDATE_COLOR);

   if (flushes & FD6_INVALIDATE_CCU_DEPTH)
      event_write(ctx, ring, PC_CCU_INVALIDATE_DEPTH);

   if (flushes & FD6_FLUSH_CACHE)
      event_write(ctx, ring, CACHE_FLUSH_TS);

   if (flushes & FD6_INVALIDATE_CACHE)
      event_write(ctx, ring, CACHE_INVALIDATE);

   if (flushes & FD6_WAIT_MEM_WRITES)
      OUT_PKT7(ring, CP_WAIT_MEM_WRITES, 0);

   /* On parts with the CCU-flush hw bug, a CCU flush must additionally
    * be followed by a wait-for-idle even if the caller didn't ask for
    * one:
    */
   if ((flushes & FD6_WAIT_FOR_IDLE) ||
       (ctx->screen->info->a6xx.has_ccu_flush_bug &&
        (flushes & (FD6_FLUSH_CCU_COLOR | FD6_FLUSH_CCU_DEPTH))))
      OUT_PKT7(ring, CP_WAIT_FOR_IDLE, 0);

   if (flushes & FD6_WAIT_FOR_ME)
      OUT_PKT7(ring, CP_WAIT_FOR_ME, 0);
}
/*
 * Emit the barrier/flush flags accumulated on the batch into its draw
 * ringbuffer, and clear the pending flags.
 */
void
fd6_barrier_flush(struct fd_batch *batch)
{
   unsigned pending = batch->barrier;

   batch->barrier = 0;
   fd6_emit_flushes(batch->ctx, batch->draw, pending);
}
/*
 * Accumulate requested flush bits on the context's current batch.  If
 * there is no current batch, nothing is recorded -- a batch flush is
 * already a sufficient barrier.
 */
static void
add_flushes(struct pipe_context *pctx, unsigned flushes)
   assert_dt
{
   struct fd_context *ctx = fd_context(pctx);
   struct fd_batch *batch = NULL;

   fd_batch_reference(&batch, ctx->batch);

   if (batch) {
      batch->barrier |= flushes;
      fd_batch_reference(&batch, NULL);
   }
}
/*
 * pipe_context::texture_barrier implementation for a6xx.
 */
static void
fd6_texture_barrier(struct pipe_context *pctx, unsigned flags)
   in_dt
{
   if (flags & PIPE_TEXTURE_BARRIER_SAMPLER) {
      /* If we are sampling from the fb, we could get away with treating
       * this as a PIPE_TEXTURE_BARRIER_FRAMEBUFFER in sysmem mode, but
       * that won't work out in gmem mode because we don't patch the tex
       * state outside of the case that the frag shader tells us it is
       * an fb-read.  And in particular, the fb-read case guarantees us
       * that the read will be from the same texel, but the fb-bound-as-
       * tex case does not.
       *
       * We could try to be clever here and detect if zsbuf/cbuf[n] is
       * bound as a texture, but that doesn't really help if it is bound
       * as a texture after the barrier without a lot of extra book-
       * keeping.  So hopefully no one calls glTextureBarrierNV() just
       * for lolz.
       */
      pctx->flush(pctx, NULL, 0);
      return;
   }

   unsigned needed = 0;

   if (flags & PIPE_TEXTURE_BARRIER_FRAMEBUFFER) {
      needed |= FD6_WAIT_FOR_IDLE | FD6_WAIT_FOR_ME |
                FD6_FLUSH_CCU_COLOR | FD6_FLUSH_CCU_DEPTH |
                FD6_FLUSH_CACHE | FD6_INVALIDATE_CACHE;
   }

   add_flushes(pctx, needed);
}
/*
 * pipe_context::memory_barrier implementation for a6xx: translate
 * PIPE_BARRIER_* bits into fd6_flush bits accumulated on the batch.
 */
static void
fd6_memory_barrier(struct pipe_context *pctx, unsigned flags)
   in_dt
{
   /* Barriers satisfied by waiting for in-flight work to drain: */
   const unsigned wfi_bits = PIPE_BARRIER_SHADER_BUFFER |
                             PIPE_BARRIER_IMAGE |
                             PIPE_BARRIER_CONSTANT_BUFFER |
                             PIPE_BARRIER_VERTEX_BUFFER |
                             PIPE_BARRIER_INDEX_BUFFER |
                             PIPE_BARRIER_STREAMOUT_BUFFER;
   /* Barriers that additionally need a cache flush: */
   const unsigned cache_bits = PIPE_BARRIER_TEXTURE |
                               PIPE_BARRIER_INDIRECT_BUFFER |
                               PIPE_BARRIER_UPDATE_BUFFER |
                               PIPE_BARRIER_UPDATE_TEXTURE;
   unsigned needed = 0;

   if (flags & wfi_bits)
      needed |= FD6_WAIT_FOR_IDLE;

   if (flags & cache_bits)
      needed |= FD6_FLUSH_CACHE | FD6_WAIT_FOR_IDLE;

   if (flags & PIPE_BARRIER_FRAMEBUFFER)
      fd6_texture_barrier(pctx, PIPE_TEXTURE_BARRIER_FRAMEBUFFER);

   add_flushes(pctx, needed);
}
/*
 * Install the a6xx barrier entrypoints on the pipe context.
 */
void
fd6_barrier_init(struct pipe_context *pctx)
{
   pctx->memory_barrier = fd6_memory_barrier;
   pctx->texture_barrier = fd6_texture_barrier;
}

View file

@ -0,0 +1,48 @@
/*
* Copyright © 2023 Google, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#ifndef FD6_BARRIER_H_
#define FD6_BARRIER_H_

#include "freedreno_context.h"

/**
 * Various flush operations that could be needed.  Each bit maps to the
 * event/packet emitted for it in fd6_emit_flushes():
 */
enum fd6_flush {
   FD6_FLUSH_CCU_COLOR      = BIT(0),   /* PC_CCU_FLUSH_COLOR_TS */
   FD6_FLUSH_CCU_DEPTH      = BIT(1),   /* PC_CCU_FLUSH_DEPTH_TS */
   FD6_INVALIDATE_CCU_COLOR = BIT(2),   /* PC_CCU_INVALIDATE_COLOR (flushes first) */
   FD6_INVALIDATE_CCU_DEPTH = BIT(3),   /* PC_CCU_INVALIDATE_DEPTH (flushes first) */
   FD6_FLUSH_CACHE          = BIT(4),   /* CACHE_FLUSH_TS */
   FD6_INVALIDATE_CACHE     = BIT(5),   /* CACHE_INVALIDATE */
   FD6_WAIT_MEM_WRITES      = BIT(6),   /* CP_WAIT_MEM_WRITES */
   FD6_WAIT_FOR_IDLE        = BIT(7),   /* CP_WAIT_FOR_IDLE */
   FD6_WAIT_FOR_ME          = BIT(8),   /* CP_WAIT_FOR_ME */
};

/* Emit (and clear) the flush flags accumulated on the batch: */
void fd6_barrier_flush(struct fd_batch *batch) assert_dt;

/* Install memory_barrier/texture_barrier hooks on the pipe context: */
void fd6_barrier_init(struct pipe_context *pctx);

#endif /* FD6_BARRIER_H_ */

View file

@ -33,6 +33,7 @@
#include "freedreno_resource.h"
#include "freedreno_tracepoints.h"
#include "fd6_barrier.h"
#include "fd6_compute.h"
#include "fd6_const.h"
#include "fd6_context.h"
@ -181,6 +182,9 @@ fd6_launch_grid(struct fd_context *ctx, const struct pipe_grid_info *info) in_dt
OUT_RING(ring, 1); /* HLSQ_CS_KERNEL_GROUP_Y */
OUT_RING(ring, 1); /* HLSQ_CS_KERNEL_GROUP_Z */
if (ctx->batch->barrier)
fd6_barrier_flush(ctx->batch);
if (info->indirect) {
struct fd_resource *rsc = fd_resource(info->indirect);

View file

@ -30,6 +30,7 @@
#include "freedreno_query_acc.h"
#include "freedreno_state.h"
#include "fd6_barrier.h"
#include "fd6_blend.h"
#include "fd6_blitter.h"
#include "fd6_compute.h"
@ -229,7 +230,6 @@ fd6_context_create(struct pipe_screen *pscreen, void *priv,
fd6_gmem_init(pctx);
fd6_texture_init(pctx);
fd6_prog_init(pctx);
fd6_emit_init(pctx);
fd6_query_context_init(pctx);
setup_state_map(&fd6_ctx->base);
@ -243,6 +243,9 @@ fd6_context_create(struct pipe_screen *pscreen, void *priv,
/* after fd_context_init() to override set_shader_images() */
fd6_image_init(pctx);
/* after fd_context_init() to override memory_barrier/texture_barrier(): */
fd6_barrier_init(pctx);
util_blitter_set_texture_multisample(fd6_ctx->base.blitter, true);
pctx->delete_vertex_elements_state = fd6_vertex_state_delete;

View file

@ -35,6 +35,7 @@
#include "freedreno_resource.h"
#include "freedreno_state.h"
#include "fd6_barrier.h"
#include "fd6_context.h"
#include "fd6_draw.h"
#include "fd6_emit.h"
@ -326,6 +327,9 @@ fd6_draw_vbo(struct fd_context *ctx, const struct pipe_draw_info *info,
if (emit.dirty_groups)
fd6_emit_3d_state(ring, &emit);
if (ctx->batch->barrier)
fd6_barrier_flush(ctx->batch);
/* for debug after a lock up, write a unique counter value
* to scratch7 for each draw, to make it easier to match up
* register dumps to cmdstream. The combination of IB

View file

@ -1131,48 +1131,6 @@ fd6_mem_to_mem(struct fd_ringbuffer *ring, struct pipe_resource *dst,
}
}
/* this is *almost* the same as fd6_cache_flush().. which I guess
 * could be re-worked to be something a bit more generic w/ param
 * indicating what needs to be flushed.. although that would mean
 * figuring out which events trigger what state to flush..
 */
/*
 * Full framebuffer barrier: stall until rendering has landed in memory,
 * flush color/depth CCU and the cache, invalidate the cache, and wait
 * for the final flush timestamp to become visible.
 */
static void
fd6_framebuffer_barrier(struct fd_context *ctx) assert_dt
{
   struct fd6_context *fd6_ctx = fd6_context(ctx);
   struct fd_batch *batch = fd_context_batch_locked(ctx);
   struct fd_ringbuffer *ring = batch->draw;
   unsigned seqno;

   fd_batch_needs_flush(batch);

   /* Emit RB_DONE_TS and have the CP poll memory until that seqno has
    * been written back:
    */
   seqno = fd6_event_write(batch, ring, RB_DONE_TS, true);

   OUT_PKT7(ring, CP_WAIT_REG_MEM, 6);
   OUT_RING(ring, CP_WAIT_REG_MEM_0_FUNCTION(WRITE_EQ) |
                  CP_WAIT_REG_MEM_0_POLL_MEMORY);
   OUT_RELOC(ring, control_ptr(fd6_ctx, seqno));
   OUT_RING(ring, CP_WAIT_REG_MEM_3_REF(seqno));
   OUT_RING(ring, CP_WAIT_REG_MEM_4_MASK(~0));
   OUT_RING(ring, CP_WAIT_REG_MEM_5_DELAY_LOOP_CYCLES(16));

   /* Flush CCU color/depth, then flush + invalidate the cache: */
   fd6_event_write(batch, ring, PC_CCU_FLUSH_COLOR_TS, true);
   fd6_event_write(batch, ring, PC_CCU_FLUSH_DEPTH_TS, true);
   seqno = fd6_event_write(batch, ring, CACHE_FLUSH_TS, true);
   fd_wfi(batch, ring);
   fd6_event_write(batch, ring, CACHE_INVALIDATE, false);

   /* Wait until the CACHE_FLUSH_TS seqno is >= the one just emitted: */
   OUT_PKT7(ring, CP_WAIT_MEM_GTE, 4);
   OUT_RING(ring, CP_WAIT_MEM_GTE_0_RESERVED(0));
   OUT_RELOC(ring, control_ptr(fd6_ctx, seqno));
   OUT_RING(ring, CP_WAIT_MEM_GTE_3_REF(seqno));

   fd_batch_unlock_submit(batch);
   fd_batch_reference(&batch, NULL);
}
void
fd6_emit_init_screen(struct pipe_screen *pscreen)
{
@ -1180,10 +1138,3 @@ fd6_emit_init_screen(struct pipe_screen *pscreen)
screen->emit_ib = fd6_emit_ib;
screen->mem_to_mem = fd6_mem_to_mem;
}
/*
 * Per-context emit init: plug in the framebuffer-barrier hook.
 */
void
fd6_emit_init(struct pipe_context *pctx) disable_thread_safety_analysis
{
   fd_context(pctx)->framebuffer_barrier = fd6_framebuffer_barrier;
}

View file

@ -324,7 +324,6 @@ void fd6_emit_cs_state(struct fd_context *ctx, struct fd_ringbuffer *ring,
void fd6_emit_restore(struct fd_batch *batch, struct fd_ringbuffer *ring);
void fd6_emit_init_screen(struct pipe_screen *pscreen);
void fd6_emit_init(struct pipe_context *pctx);
static inline void
fd6_emit_ib(struct fd_ringbuffer *ring, struct fd_ringbuffer *target)

View file

@ -114,6 +114,9 @@ struct fd_batch {
*/
const struct fd_gmem_stateobj *gmem_state;
/* Driver specific barrier/flush flags: */
unsigned barrier;
/* A calculated "draw cost" value for the batch, which tries to
* estimate the bandwidth-per-sample of all the draws according
* to:

View file

@ -156,15 +156,6 @@ out:
static void
fd_texture_barrier(struct pipe_context *pctx, unsigned flags) in_dt
{
if (flags == PIPE_TEXTURE_BARRIER_FRAMEBUFFER) {
struct fd_context *ctx = fd_context(pctx);
if (ctx->framebuffer_barrier) {
ctx->framebuffer_barrier(ctx);
return;
}
}
/* On devices that could sample from GMEM we could possibly do better.
* Or if we knew that we were doing GMEM bypass we could just emit a
* cache flush, perhaps? But we don't know if future draws would cause

View file

@ -510,9 +510,6 @@ struct fd_context {
void (*validate_format)(struct fd_context *ctx, struct fd_resource *rsc,
enum pipe_format format) dt;
/* handling for barriers: */
void (*framebuffer_barrier)(struct fd_context *ctx) dt;
/* logger: */
void (*record_timestamp)(struct fd_ringbuffer *ring, struct fd_bo *bo,
unsigned offset);

View file

@ -178,6 +178,8 @@ files_libfreedreno = files(
'a5xx/fd5_texture.h',
'a5xx/fd5_zsa.c',
'a5xx/fd5_zsa.h',
'a6xx/fd6_barrier.c',
'a6xx/fd6_barrier.h',
'a6xx/fd6_blend.c',
'a6xx/fd6_blend.h',
'a6xx/fd6_blitter.c',