From dec49ec50a2f017dabe72f5ec882054de85540b7 Mon Sep 17 00:00:00 2001 From: Rob Clark Date: Sun, 22 Jan 2023 08:32:43 -0800 Subject: [PATCH] freedreno/a6xx: Move CS state to PROG state group It is pretty easy to just cache the stateobj with the hwcso (since unlike 3d, there is only a single shader state) and re-emit it by pointer when it changes, now that the CS state doesn't depend on the grid info. This also moves immed consts into the PROG state, so they are only updated when the PROG state is dirty. And splits user consts and driver param consts, so they are only re-emitted when needed. Signed-off-by: Rob Clark Part-of: --- .../drivers/freedreno/a6xx/fd6_compute.c | 63 ++++++++++++++----- .../drivers/freedreno/a6xx/fd6_compute.h | 7 +++ .../drivers/freedreno/a6xx/fd6_const.c | 19 ++++-- .../drivers/freedreno/a6xx/fd6_const.h | 10 ++- src/gallium/drivers/freedreno/a6xx/fd6_emit.c | 22 ++++++- src/gallium/drivers/freedreno/a6xx/fd6_emit.h | 3 +- src/gallium/drivers/freedreno/ir3/ir3_const.h | 24 ++++--- 7 files changed, 114 insertions(+), 34 deletions(-) diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_compute.c b/src/gallium/drivers/freedreno/a6xx/fd6_compute.c index 5d751cfe1b0..07139cc5ee3 100644 --- a/src/gallium/drivers/freedreno/a6xx/fd6_compute.c +++ b/src/gallium/drivers/freedreno/a6xx/fd6_compute.c @@ -100,32 +100,38 @@ cs_program_emit(struct fd_context *ctx, struct fd_ringbuffer *ring, } fd6_emit_shader(ctx, ring, v); + fd6_emit_immediates(ctx->screen, v, ring); } static void fd6_launch_grid(struct fd_context *ctx, const struct pipe_grid_info *info) in_dt { - struct ir3_shader_key key = {}; - struct ir3_shader_variant *v; + struct fd6_compute_state *cs = ctx->compute; struct fd_ringbuffer *ring = ctx->batch->draw; unsigned nglobal = 0; + if (unlikely(!cs->v)) { + struct ir3_shader_key key = {}; + + cs->v = ir3_shader_variant(ir3_get_shader(cs->hwcso), key, false, &ctx->debug); + if (!cs->v) + return; + + cs->stateobj = 
fd_ringbuffer_new_object(ctx->pipe, 0x1000); + cs_program_emit(ctx, cs->stateobj, cs->v); + + cs->user_consts_cmdstream_size = fd6_user_consts_cmdstream_size(cs->v); + } + trace_start_compute(&ctx->batch->trace, ring, !!info->indirect, info->work_dim, info->block[0], info->block[1], info->block[2], info->grid[0], info->grid[1], info->grid[2]); - v = ir3_shader_variant(ir3_get_shader(ctx->compute), key, false, &ctx->debug); - if (!v) - return; - if (ctx->batch->barrier) fd6_barrier_flush(ctx->batch); - if (ctx->dirty_shader[PIPE_SHADER_COMPUTE] & FD_DIRTY_SHADER_PROG) - cs_program_emit(ctx, ring, v); - bool emit_instrlen_workaround = - v->instrlen > ctx->screen->info->a6xx.instr_cache_size; + cs->v->instrlen > ctx->screen->info->a6xx.instr_cache_size; /* There appears to be a HW bug where in some rare circumstances it appears * to accidentally use the FS instrlen instead of the CS instrlen, which @@ -143,12 +149,18 @@ fd6_launch_grid(struct fd_context *ctx, const struct pipe_grid_info *info) in_dt * See https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/19023 */ if (emit_instrlen_workaround) { - OUT_REG(ring, A6XX_SP_FS_INSTRLEN(v->instrlen)); + OUT_REG(ring, A6XX_SP_FS_INSTRLEN(cs->v->instrlen)); fd6_event_write(ctx->batch, ring, LABEL, false); } - fd6_emit_cs_state(ctx, ring, v); - fd6_emit_cs_consts(v, ring, ctx, info); + if (ctx->gen_dirty) + fd6_emit_cs_state(ctx, ring, cs); + + if (ctx->gen_dirty & BIT(FD6_GROUP_CONST)) + fd6_emit_cs_user_consts(ctx, ring, cs); + + if (cs->v->need_driver_params || info->input) + fd6_emit_cs_driver_params(ctx, ring, cs, info); u_foreach_bit (i, ctx->global_bindings.enabled_mask) nglobal++; @@ -171,7 +183,7 @@ fd6_launch_grid(struct fd_context *ctx, const struct pipe_grid_info *info) in_dt OUT_RING(ring, A6XX_CP_SET_MARKER_0_MODE(RM6_COMPUTE)); uint32_t shared_size = - MAX2(((int)(v->cs.req_local_mem + info->variable_shared_mem) - 1) / 1024, 1); + MAX2(((int)(cs->v->cs.req_local_mem + info->variable_shared_mem) - 1) / 
1024, 1); OUT_PKT4(ring, REG_A6XX_SP_CS_UNKNOWN_A9B1, 1); OUT_RING(ring, A6XX_SP_CS_UNKNOWN_A9B1_SHARED_SIZE(shared_size) | A6XX_SP_CS_UNKNOWN_A9B1_UNK6); @@ -230,11 +242,30 @@ fd6_launch_grid(struct fd_context *ctx, const struct pipe_grid_info *info) in_dt fd_context_all_clean(ctx); } +static void * +fd6_compute_state_create(struct pipe_context *pctx, + const struct pipe_compute_state *cso) +{ + struct fd6_compute_state *hwcso = calloc(1, sizeof(*hwcso)); + hwcso->hwcso = ir3_shader_compute_state_create(pctx, cso); + return hwcso; +} + +static void +fd6_compute_state_delete(struct pipe_context *pctx, void *_hwcso) +{ + struct fd6_compute_state *hwcso = _hwcso; + ir3_shader_state_delete(pctx, hwcso->hwcso); + if (hwcso->stateobj) + fd_ringbuffer_del(hwcso->stateobj); + free(hwcso); +} + void fd6_compute_init(struct pipe_context *pctx) disable_thread_safety_analysis { struct fd_context *ctx = fd_context(pctx); ctx->launch_grid = fd6_launch_grid; - pctx->create_compute_state = ir3_shader_compute_state_create; - pctx->delete_compute_state = ir3_shader_state_delete; + pctx->create_compute_state = fd6_compute_state_create; + pctx->delete_compute_state = fd6_compute_state_delete; } diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_compute.h b/src/gallium/drivers/freedreno/a6xx/fd6_compute.h index 61354f4b851..a2508d2f26c 100644 --- a/src/gallium/drivers/freedreno/a6xx/fd6_compute.h +++ b/src/gallium/drivers/freedreno/a6xx/fd6_compute.h @@ -29,6 +29,13 @@ #include "pipe/p_context.h" +struct fd6_compute_state { + void *hwcso; /* ir3_shader_state */ + struct ir3_shader_variant *v; + struct fd_ringbuffer *stateobj; + uint32_t user_consts_cmdstream_size; +}; + void fd6_compute_init(struct pipe_context *pctx); #endif /* FD6_COMPUTE_H_ */ diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_const.c b/src/gallium/drivers/freedreno/a6xx/fd6_const.c index c7c37a1e778..e6e2abbeaec 100644 --- a/src/gallium/drivers/freedreno/a6xx/fd6_const.c +++ 
b/src/gallium/drivers/freedreno/a6xx/fd6_const.c @@ -25,6 +25,7 @@ #define FD_BO_NO_HARDPIN 1 #include "fd6_const.h" +#include "fd6_compute.h" #include "fd6_pack.h" #define emit_const_user fd6_emit_const_user @@ -334,12 +335,20 @@ fd6_build_driver_params(struct fd6_emit *emit) } void -fd6_emit_cs_consts(const struct ir3_shader_variant *v, - struct fd_ringbuffer *ring, struct fd_context *ctx, - const struct pipe_grid_info *info) +fd6_emit_cs_driver_params(struct fd_context *ctx, + struct fd_ringbuffer *ring, + struct fd6_compute_state *cs, + const struct pipe_grid_info *info) { - ir3_emit_cs_consts(v, ring, ctx, info); - fd6_emit_ubos(v, ring, &ctx->constbuf[PIPE_SHADER_COMPUTE]); + ir3_emit_cs_driver_params(cs->v, ring, ctx, info); +} + +void +fd6_emit_cs_user_consts(struct fd_context *ctx, + struct fd_ringbuffer *ring, + struct fd6_compute_state *cs) +{ + emit_user_consts(cs->v, ring, &ctx->constbuf[PIPE_SHADER_COMPUTE]); } void diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_const.h b/src/gallium/drivers/freedreno/a6xx/fd6_const.h index 43398246ec7..82c927796a7 100644 --- a/src/gallium/drivers/freedreno/a6xx/fd6_const.h +++ b/src/gallium/drivers/freedreno/a6xx/fd6_const.h @@ -33,9 +33,13 @@ struct fd_ringbuffer *fd6_build_user_consts(struct fd6_emit *emit) assert_dt; struct fd_ringbuffer * fd6_build_driver_params(struct fd6_emit *emit) assert_dt; -void fd6_emit_cs_consts(const struct ir3_shader_variant *v, - struct fd_ringbuffer *ring, struct fd_context *ctx, - const struct pipe_grid_info *info) assert_dt; +void fd6_emit_cs_driver_params(struct fd_context *ctx, + struct fd_ringbuffer *ring, + struct fd6_compute_state *cs, + const struct pipe_grid_info *info) assert_dt; +void fd6_emit_cs_user_consts(struct fd_context *ctx, + struct fd_ringbuffer *ring, + struct fd6_compute_state *cs) assert_dt; void fd6_emit_immediates(struct fd_screen *screen, const struct ir3_shader_variant *v, struct fd_ringbuffer *ring) assert_dt; diff --git 
a/src/gallium/drivers/freedreno/a6xx/fd6_emit.c b/src/gallium/drivers/freedreno/a6xx/fd6_emit.c index c576b58c8cd..d84e78a4b61 100644 --- a/src/gallium/drivers/freedreno/a6xx/fd6_emit.c +++ b/src/gallium/drivers/freedreno/a6xx/fd6_emit.c @@ -42,6 +42,7 @@ #include "fd6_blend.h" #include "fd6_const.h" #include "fd6_context.h" +#include "fd6_compute.h" #include "fd6_emit.h" #include "fd6_image.h" #include "fd6_pack.h" @@ -674,14 +675,31 @@ fd6_emit_3d_state(struct fd_ringbuffer *ring, struct fd6_emit *emit) void fd6_emit_cs_state(struct fd_context *ctx, struct fd_ringbuffer *ring, - struct ir3_shader_variant *cp) + struct fd6_compute_state *cs) { struct fd6_state state = {}; - u_foreach_bit (b, ctx->gen_dirty) { + /* We want CP_SET_DRAW_STATE to execute immediately, otherwise we need to + * emit consts as draw state groups (which otherwise has no benefit outside + * of GMEM 3d using viz stream from binning pass). + * + * In particular, the PROG state group sets up the configuration for the + * const state, so it must execute before we start loading consts, rather + * than be deferred until CP_EXEC_CS. 
+ */ + OUT_PKT7(ring, CP_SET_MODE, 1); + OUT_RING(ring, 1); + + uint32_t gen_dirty = ctx->gen_dirty & + (BIT(FD6_GROUP_PROG) | BIT(FD6_GROUP_CS_TEX) | BIT(FD6_GROUP_CS_BINDLESS)); + + u_foreach_bit (b, gen_dirty) { enum fd6_state_id group = b; switch (group) { + case FD6_GROUP_PROG: + fd6_state_add_group(&state, cs->stateobj, FD6_GROUP_PROG); + break; case FD6_GROUP_CS_TEX: fd6_state_take_group( &state, diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_emit.h b/src/gallium/drivers/freedreno/a6xx/fd6_emit.h index 0dc1f41bef3..d908eee9fb1 100644 --- a/src/gallium/drivers/freedreno/a6xx/fd6_emit.h +++ b/src/gallium/drivers/freedreno/a6xx/fd6_emit.h @@ -325,8 +325,9 @@ fd6_gl2spacing(enum gl_tess_spacing spacing) void fd6_emit_3d_state(struct fd_ringbuffer *ring, struct fd6_emit *emit) assert_dt; +struct fd6_compute_state; void fd6_emit_cs_state(struct fd_context *ctx, struct fd_ringbuffer *ring, - struct ir3_shader_variant *cp) assert_dt; + struct fd6_compute_state *cs) assert_dt; void fd6_emit_restore(struct fd_batch *batch, struct fd_ringbuffer *ring); diff --git a/src/gallium/drivers/freedreno/ir3/ir3_const.h b/src/gallium/drivers/freedreno/ir3/ir3_const.h index c4ab8aa3c9c..a50c54d28cd 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_const.h +++ b/src/gallium/drivers/freedreno/ir3/ir3_const.h @@ -585,15 +585,12 @@ ir3_emit_fs_consts(const struct ir3_shader_variant *v, emit_common_consts(v, ring, ctx, PIPE_SHADER_FRAGMENT); } -/* emit compute-shader consts: */ static inline void -ir3_emit_cs_consts(const struct ir3_shader_variant *v, - struct fd_ringbuffer *ring, struct fd_context *ctx, - const struct pipe_grid_info *info) assert_dt +ir3_emit_cs_driver_params(const struct ir3_shader_variant *v, + struct fd_ringbuffer *ring, struct fd_context *ctx, + const struct pipe_grid_info *info) + assert_dt { - assert(gl_shader_stage_is_compute(v->type)); - - emit_common_consts(v, ring, ctx, PIPE_SHADER_COMPUTE); emit_kernel_params(ctx, v, ring, info); /* a3xx/a4xx can 
inject these directly */ @@ -651,3 +648,16 @@ ir3_emit_cs_consts(const struct ir3_shader_variant *v, } } } + +/* emit compute-shader consts: */ +static inline void +ir3_emit_cs_consts(const struct ir3_shader_variant *v, + struct fd_ringbuffer *ring, struct fd_context *ctx, + const struct pipe_grid_info *info) assert_dt +{ + assert(gl_shader_stage_is_compute(v->type)); + + emit_common_consts(v, ring, ctx, PIPE_SHADER_COMPUTE); + + ir3_emit_cs_driver_params(v, ring, ctx, info); +}