freedreno/a6xx: Allocate a fixed-size tess factor BO.

Saves per-batch allocations, avoids reallocation for various vertex
counts, and removes the need for the indirect tess addrs constobj, which
existed so that we could emit the relocs to the tess BO after we'd
emitted all the draws.

Also, it apparently fixes one of our CTS failures.

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/13851>
This commit is contained in:
Emma Anholt 2021-11-17 14:10:41 -08:00 committed by Marge Bot
parent 577a0a7352
commit d7226e9a9e
10 changed files with 48 additions and 83 deletions

View file

@ -17,10 +17,6 @@ KHR-GLES31.core.geometry_shader.layered_framebuffer.depth_support,Fail
KHR-GLES31.core.geometry_shader.layered_framebuffer.stencil_support,Fail
# " Pixel data comparison failed; expected: (0.1, 0.2, 0.3, 0.4) rendered: (0, 0, 0, 0) epsilon: 0.00392157
# Pixel data comparison failed at esextcTessellationShaderPoints.cpp:597"
KHR-GLES31.core.tessellation_shader.tessellation_shader_point_mode.point_rendering,Fail
# "Invalid value returned: expected:[1, 1, 1, 1] retrieved: [0, 0, 0, 0
# Invalid rendering result at esextcTessellationShaderBarrier.cpp:504"
KHR-GLES31.core.tessellation_shader.tessellation_shader_tc_barriers.barrier_guarded_read_calls,Fail

View file

@ -108,21 +108,23 @@ emit_const_ptrs(struct fd_ringbuffer *ring, const struct ir3_shader_variant *v,
}
static void
emit_tess_bos(struct fd_ringbuffer *ring, struct fd6_emit *emit,
emit_tess_bos(struct fd_screen *screen, struct fd_ringbuffer *ring,
struct ir3_shader_variant *s) assert_dt
{
struct fd_context *ctx = emit->ctx;
const struct ir3_const_state *const_state = ir3_const_state(s);
const unsigned regid = const_state->offsets.primitive_param * 4 + 4;
uint32_t dwords = 16;
uint32_t dwords = 8;
OUT_PKT7(ring, fd6_stage2opcode(s->type), 3);
OUT_PKT7(ring, fd6_stage2opcode(s->type), 7);
OUT_RING(ring, CP_LOAD_STATE6_0_DST_OFF(regid / 4) |
CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
CP_LOAD_STATE6_0_STATE_BLOCK(fd6_stage2shadersb(s->type)) |
CP_LOAD_STATE6_0_NUM_UNIT(dwords / 4));
OUT_RB(ring, ctx->batch->tess_addrs_constobj);
OUT_RING(ring, 0);
OUT_RING(ring, 0);
OUT_RELOC(ring, screen->tess_bo, FD6_TESS_FACTOR_SIZE, 0, 0);
OUT_RELOC(ring, screen->tess_bo, 0, 0, 0);
}
static void
@ -166,7 +168,7 @@ fd6_build_tess_consts(struct fd6_emit *emit)
emit_stage_tess_consts(constobj, emit->hs, hs_params,
ARRAY_SIZE(hs_params));
emit_tess_bos(constobj, emit, emit->hs);
emit_tess_bos(ctx->screen, constobj, emit->hs);
if (emit->gs)
num_vertices = emit->gs->shader->nir->info.gs.vertices_in;
@ -179,7 +181,7 @@ fd6_build_tess_consts(struct fd6_emit *emit)
emit_stage_tess_consts(constobj, emit->ds, ds_params,
ARRAY_SIZE(ds_params));
emit_tess_bos(constobj, emit, emit->ds);
emit_tess_bos(ctx->screen, constobj, emit->ds);
}
if (emit->gs) {

View file

@ -261,41 +261,16 @@ fd6_draw_vbo(struct fd_context *ctx, const struct pipe_draw_info *info,
draw0.prim_type = DI_PT_PATCHES0 + ctx->patch_vertices;
draw0.tess_enable = true;
const unsigned max_count = 2048;
unsigned count;
/**
* We can cap tessparam/tessfactor buffer sizes at the sub-draw
* limit. But in the indirect-draw case we must assume the worst.
*/
if (indirect && indirect->buffer) {
count = ALIGN_NPOT(max_count, ctx->patch_vertices);
} else {
count = MIN2(max_count, draw->count);
count = ALIGN_NPOT(count, ctx->patch_vertices);
}
/* maximum number of patches that can fit in tess factor/param buffers */
uint32_t subdraw_size = MIN2(FD6_TESS_FACTOR_SIZE / factor_stride,
FD6_TESS_PARAM_SIZE / (emit.hs->output_size * 4));
/* convert from # of patches to draw count */
subdraw_size *= ctx->patch_vertices;
OUT_PKT7(ring, CP_SET_SUBDRAW_SIZE, 1);
OUT_RING(ring, count);
OUT_RING(ring, subdraw_size);
ctx->batch->tessellation = true;
ctx->batch->tessparam_size =
MAX2(ctx->batch->tessparam_size, emit.hs->output_size * 4 * count);
ctx->batch->tessfactor_size =
MAX2(ctx->batch->tessfactor_size, factor_stride * count);
if (!ctx->batch->tess_addrs_constobj) {
/* Reserve space for the bo address - we'll write them later in
* setup_tess_buffers(). We need 2 bo address, but indirect
* constant upload needs at least 4 vec4s.
*/
unsigned size = 4 * 16;
ctx->batch->tess_addrs_constobj = fd_submit_new_ringbuffer(
ctx->batch->submit, size, FD_RINGBUFFER_STREAMING);
ctx->batch->tess_addrs_constobj->cur += size;
}
}
uint32_t index_start = info->index_size ? draw->index_bias : draw->start;

View file

@ -1339,6 +1339,15 @@ fd6_emit_restore(struct fd_batch *batch, struct fd_ringbuffer *ring)
OUT_PKT4(ring, REG_A6XX_RB_LRZ_CNTL, 1);
OUT_RING(ring, 0x00000000);
/* This happens after all drawing has been emitted to the draw CS, so we know
* whether we need the tess BO pointers.
*/
if (batch->tessellation) {
assert(screen->tess_bo);
OUT_PKT4(ring, REG_A6XX_PC_TESSFACTOR_ADDR, 2);
OUT_RELOC(ring, screen->tess_bo, 0, 0, 0);
}
if (!batch->nondraw) {
trace_end_state_restore(&batch->trace, ring);
}

View file

@ -1552,25 +1552,6 @@ emit_sysmem_clears(struct fd_batch *batch, struct fd_ringbuffer *ring) assert_dt
trace_end_clear_restore(&batch->trace, ring);
}
static void
setup_tess_buffers(struct fd_batch *batch, struct fd_ringbuffer *ring)
{
struct fd_context *ctx = batch->ctx;
batch->tessfactor_bo = fd_bo_new(ctx->screen->dev, batch->tessfactor_size,
0, "tessfactor");
batch->tessparam_bo = fd_bo_new(ctx->screen->dev, batch->tessparam_size,
0, "tessparam");
OUT_PKT4(ring, REG_A6XX_PC_TESSFACTOR_ADDR, 2);
OUT_RELOC(ring, batch->tessfactor_bo, 0, 0, 0);
batch->tess_addrs_constobj->cur = batch->tess_addrs_constobj->start;
OUT_RELOC(batch->tess_addrs_constobj, batch->tessparam_bo, 0, 0, 0);
OUT_RELOC(batch->tess_addrs_constobj, batch->tessfactor_bo, 0, 0, 0);
}
static void
fd6_emit_sysmem_prep(struct fd_batch *batch) assert_dt
{
@ -1612,9 +1593,6 @@ fd6_emit_sysmem_prep(struct fd_batch *batch) assert_dt
OUT_RING(ring, A6XX_CP_SET_MARKER_0_MODE(RM6_BYPASS));
emit_marker6(ring, 7);
if (batch->tessellation)
setup_tess_buffers(batch, ring);
OUT_PKT7(ring, CP_SKIP_IB2_ENABLE_GLOBAL, 1);
OUT_RING(ring, 0x0);

View file

@ -1206,6 +1206,7 @@ fd6_program_create(void *data, struct ir3_shader_variant *bs,
const struct ir3_shader_key *key) in_dt
{
struct fd_context *ctx = fd_context(data);
struct fd_screen *screen = ctx->screen;
struct fd6_program_state *state = CALLOC_STRUCT(fd6_program_state);
tc_assert_driver_thread(ctx->tc);
@ -1233,6 +1234,19 @@ fd6_program_create(void *data, struct ir3_shader_variant *bs,
}
#endif
if (hs) {
/* Allocate the fixed-size tess factor BO globally on the screen. This
* lets the program (which ideally we would have shared across contexts,
* though the current ir3_cache impl doesn't do that) bake in the
* addresses.
*/
fd_screen_lock(screen);
if (!screen->tess_bo)
screen->tess_bo =
fd_bo_new(screen->dev, FD6_TESS_BO_SIZE, 0, "tessfactor");
fd_screen_unlock(screen);
}
setup_config_stateobj(ctx, state);
setup_stateobj(state->binning_stateobj, ctx, state, key, true);
setup_stateobj(state->stateobj, ctx, state, key, false);

View file

@ -178,12 +178,6 @@ cleanup_submit(struct fd_batch *batch)
batch->tile_fini = NULL;
}
if (batch->tessellation) {
fd_bo_del(batch->tessfactor_bo);
fd_bo_del(batch->tessparam_bo);
fd_ringbuffer_del(batch->tess_addrs_constobj);
}
fd_submit_del(batch->submit);
batch->submit = NULL;
}

View file

@ -255,18 +255,6 @@ struct fd_batch {
/** set of dependent batches.. holds refs to dependent batches: */
uint32_t dependents_mask;
/* Buffer for tessellation engine input
*/
struct fd_bo *tessfactor_bo;
uint32_t tessfactor_size;
/* Buffer for passing parameters between TCS and TES
*/
struct fd_bo *tessparam_bo;
uint32_t tessparam_size;
struct fd_ringbuffer *tess_addrs_constobj;
};
struct fd_batch *fd_batch_create(struct fd_context *ctx, bool nondraw);

View file

@ -144,6 +144,9 @@ fd_screen_destroy(struct pipe_screen *pscreen)
{
struct fd_screen *screen = fd_screen(pscreen);
if (screen->tess_bo)
fd_bo_del(screen->tess_bo);
if (screen->pipe)
fd_pipe_del(screen->pipe);

View file

@ -149,6 +149,12 @@ struct fd_screen {
struct renderonly *ro;
/* the blob seems to always use 8K factor and 128K param sizes, copy them */
#define FD6_TESS_FACTOR_SIZE (8 * 1024)
#define FD6_TESS_PARAM_SIZE (128 * 1024)
#define FD6_TESS_BO_SIZE (FD6_TESS_FACTOR_SIZE + FD6_TESS_PARAM_SIZE)
struct fd_bo *tess_bo;
/* table with PIPE_PRIM_MAX+1 entries mapping PIPE_PRIM_x to
* DI_PT_x value to use for draw initiator. There are some
* slight differences between generation.