diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_compute.c b/src/gallium/drivers/freedreno/a6xx/fd6_compute.c
index 5d751cfe1b0..07139cc5ee3 100644
--- a/src/gallium/drivers/freedreno/a6xx/fd6_compute.c
+++ b/src/gallium/drivers/freedreno/a6xx/fd6_compute.c
@@ -100,32 +100,38 @@ cs_program_emit(struct fd_context *ctx, struct fd_ringbuffer *ring,
    }
 
    fd6_emit_shader(ctx, ring, v);
+   fd6_emit_immediates(ctx->screen, v, ring);
 }
 
 static void
 fd6_launch_grid(struct fd_context *ctx, const struct pipe_grid_info *info) in_dt
 {
-   struct ir3_shader_key key = {};
-   struct ir3_shader_variant *v;
+   struct fd6_compute_state *cs = ctx->compute;
    struct fd_ringbuffer *ring = ctx->batch->draw;
    unsigned nglobal = 0;
 
+   if (unlikely(!cs->v)) {
+      struct ir3_shader_key key = {};
+
+      cs->v = ir3_shader_variant(ir3_get_shader(cs->hwcso), key, false, &ctx->debug);
+      if (!cs->v)
+         return;
+
+      cs->stateobj = fd_ringbuffer_new_object(ctx->pipe, 0x1000);
+      cs_program_emit(ctx, cs->stateobj, cs->v);
+
+      cs->user_consts_cmdstream_size = fd6_user_consts_cmdstream_size(cs->v);
+   }
+
    trace_start_compute(&ctx->batch->trace, ring, !!info->indirect, info->work_dim,
                        info->block[0], info->block[1], info->block[2],
                        info->grid[0], info->grid[1], info->grid[2]);
 
-   v = ir3_shader_variant(ir3_get_shader(ctx->compute), key, false, &ctx->debug);
-   if (!v)
-      return;
-
    if (ctx->batch->barrier)
       fd6_barrier_flush(ctx->batch);
 
-   if (ctx->dirty_shader[PIPE_SHADER_COMPUTE] & FD_DIRTY_SHADER_PROG)
-      cs_program_emit(ctx, ring, v);
-
    bool emit_instrlen_workaround =
-      v->instrlen > ctx->screen->info->a6xx.instr_cache_size;
+      cs->v->instrlen > ctx->screen->info->a6xx.instr_cache_size;
 
    /* There appears to be a HW bug where in some rare circumstances it appears
     * to accidentally use the FS instrlen instead of the CS instrlen, which
@@ -143,12 +149,18 @@ fd6_launch_grid(struct fd_context *ctx, const struct pipe_grid_info *info) in_dt
     * See https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/19023
     */
    if (emit_instrlen_workaround) {
-      OUT_REG(ring, A6XX_SP_FS_INSTRLEN(v->instrlen));
+      OUT_REG(ring, A6XX_SP_FS_INSTRLEN(cs->v->instrlen));
       fd6_event_write(ctx->batch, ring, LABEL, false);
    }
 
-   fd6_emit_cs_state(ctx, ring, v);
-   fd6_emit_cs_consts(v, ring, ctx, info);
+   if (ctx->gen_dirty)
+      fd6_emit_cs_state(ctx, ring, cs);
+
+   if (ctx->gen_dirty & BIT(FD6_GROUP_CONST))
+      fd6_emit_cs_user_consts(ctx, ring, cs);
+
+   if (cs->v->need_driver_params || info->input)
+      fd6_emit_cs_driver_params(ctx, ring, cs, info);
 
    u_foreach_bit (i, ctx->global_bindings.enabled_mask)
       nglobal++;
@@ -171,7 +183,7 @@ fd6_launch_grid(struct fd_context *ctx, const struct pipe_grid_info *info) in_dt
    OUT_RING(ring, A6XX_CP_SET_MARKER_0_MODE(RM6_COMPUTE));
 
    uint32_t shared_size =
-      MAX2(((int)(v->cs.req_local_mem + info->variable_shared_mem) - 1) / 1024, 1);
+      MAX2(((int)(cs->v->cs.req_local_mem + info->variable_shared_mem) - 1) / 1024, 1);
    OUT_PKT4(ring, REG_A6XX_SP_CS_UNKNOWN_A9B1, 1);
    OUT_RING(ring, A6XX_SP_CS_UNKNOWN_A9B1_SHARED_SIZE(shared_size) |
                      A6XX_SP_CS_UNKNOWN_A9B1_UNK6);
@@ -230,11 +242,53 @@ fd6_launch_grid(struct fd_context *ctx, const struct pipe_grid_info *info) in_dt
    fd_context_all_clean(ctx);
 }
 
+/**
+ * pipe_context::create_compute_state hook.  Wraps the ir3 compute shader
+ * state in a fd6_compute_state; the variant and program stateobj stay NULL
+ * until the first fd6_launch_grid() that uses this state.
+ *
+ * Returns NULL if the wrapper cannot be allocated or the ir3 state object
+ * could not be created.
+ */
+static void *
+fd6_compute_state_create(struct pipe_context *pctx,
+                         const struct pipe_compute_state *cso)
+{
+   struct fd6_compute_state *hwcso = calloc(1, sizeof(*hwcso));
+   if (!hwcso)
+      return NULL;
+
+   hwcso->hwcso = ir3_shader_compute_state_create(pctx, cso);
+   if (!hwcso->hwcso) {
+      /* Report failure to the caller instead of handing back a wrapper
+       * around a NULL shader, as the plain ir3 hook did.
+       */
+      free(hwcso);
+      return NULL;
+   }
+
+   return hwcso;
+}
+
+/**
+ * pipe_context::delete_compute_state hook.  Passes the ir3 shader state to
+ * ir3_shader_state_delete(), drops the program stateobj if one was built,
+ * and frees the wrapper itself.
+ */
+static void
+fd6_compute_state_delete(struct pipe_context *pctx, void *_hwcso)
+{
+   struct fd6_compute_state *hwcso = _hwcso;
+   ir3_shader_state_delete(pctx, hwcso->hwcso);
+   if (hwcso->stateobj)
+      fd_ringbuffer_del(hwcso->stateobj);
+   free(hwcso);
+}
+
 void
 fd6_compute_init(struct pipe_context *pctx) disable_thread_safety_analysis
 {
    struct fd_context *ctx = fd_context(pctx);
    ctx->launch_grid = fd6_launch_grid;
-   pctx->create_compute_state = ir3_shader_compute_state_create;
-   pctx->delete_compute_state = ir3_shader_state_delete;
+   pctx->create_compute_state = fd6_compute_state_create;
+   pctx->delete_compute_state = fd6_compute_state_delete;
 }
diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_compute.h b/src/gallium/drivers/freedreno/a6xx/fd6_compute.h
index 61354f4b851..a2508d2f26c 100644
--- a/src/gallium/drivers/freedreno/a6xx/fd6_compute.h
+++ b/src/gallium/drivers/freedreno/a6xx/fd6_compute.h
@@ -29,6 +29,17 @@
 
 #include "pipe/p_context.h"
 
+/**
+ * Compute CSO wrapper.  v, stateobj and user_consts_cmdstream_size are
+ * filled in by fd6_launch_grid() the first time the state is used.
+ */
+struct fd6_compute_state {
+   void *hwcso; /* ir3_shader_state */
+   struct ir3_shader_variant *v;
+   struct fd_ringbuffer *stateobj;
+   uint32_t user_consts_cmdstream_size;
+};
+
 void fd6_compute_init(struct pipe_context *pctx);
 
 #endif /* FD6_COMPUTE_H_ */
diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_const.c b/src/gallium/drivers/freedreno/a6xx/fd6_const.c
index c7c37a1e778..e6e2abbeaec 100644
--- a/src/gallium/drivers/freedreno/a6xx/fd6_const.c
+++ b/src/gallium/drivers/freedreno/a6xx/fd6_const.c
@@ -25,6 +25,7 @@
 #define FD_BO_NO_HARDPIN 1
 
 #include "fd6_const.h"
+#include "fd6_compute.h"
 #include "fd6_pack.h"
 
 #define emit_const_user fd6_emit_const_user
@@ -334,12 +335,20 @@ fd6_build_driver_params(struct fd6_emit *emit)
 }
 
 void
-fd6_emit_cs_consts(const struct ir3_shader_variant *v,
-                   struct fd_ringbuffer *ring, struct fd_context *ctx,
-                   const struct pipe_grid_info *info)
+fd6_emit_cs_driver_params(struct fd_context *ctx,
+                          struct fd_ringbuffer *ring,
+                          struct fd6_compute_state *cs,
+                          const struct pipe_grid_info *info)
 {
-   ir3_emit_cs_consts(v, ring, ctx, info);
-   fd6_emit_ubos(v, ring, &ctx->constbuf[PIPE_SHADER_COMPUTE]);
+   ir3_emit_cs_driver_params(cs->v, ring, ctx, info);
+}
+
+void
+fd6_emit_cs_user_consts(struct fd_context *ctx,
+                        struct fd_ringbuffer *ring,
+                        struct fd6_compute_state *cs)
+{
+   emit_user_consts(cs->v, ring, &ctx->constbuf[PIPE_SHADER_COMPUTE]);
 }
 
 void
diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_const.h b/src/gallium/drivers/freedreno/a6xx/fd6_const.h
index 43398246ec7..82c927796a7 100644
--- a/src/gallium/drivers/freedreno/a6xx/fd6_const.h
+++ b/src/gallium/drivers/freedreno/a6xx/fd6_const.h
@@ -33,9 +33,13 @@ struct fd_ringbuffer *fd6_build_user_consts(struct fd6_emit *emit) assert_dt;
 struct fd_ringbuffer *
 fd6_build_driver_params(struct fd6_emit *emit) assert_dt;
 
-void fd6_emit_cs_consts(const struct ir3_shader_variant *v,
-                        struct fd_ringbuffer *ring, struct fd_context *ctx,
-                        const struct pipe_grid_info *info) assert_dt;
+void fd6_emit_cs_driver_params(struct fd_context *ctx,
+                               struct fd_ringbuffer *ring,
+                               struct fd6_compute_state *cs,
+                               const struct pipe_grid_info *info) assert_dt;
+void fd6_emit_cs_user_consts(struct fd_context *ctx,
+                             struct fd_ringbuffer *ring,
+                             struct fd6_compute_state *cs) assert_dt;
 void fd6_emit_immediates(struct fd_screen *screen,
                          const struct ir3_shader_variant *v,
                          struct fd_ringbuffer *ring) assert_dt;
diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_emit.c b/src/gallium/drivers/freedreno/a6xx/fd6_emit.c
index c576b58c8cd..d84e78a4b61 100644
--- a/src/gallium/drivers/freedreno/a6xx/fd6_emit.c
+++ b/src/gallium/drivers/freedreno/a6xx/fd6_emit.c
@@ -42,6 +42,7 @@
 #include "fd6_blend.h"
 #include "fd6_const.h"
 #include "fd6_context.h"
+#include "fd6_compute.h"
 #include "fd6_emit.h"
 #include "fd6_image.h"
 #include "fd6_pack.h"
@@ -674,14 +675,31 @@ fd6_emit_3d_state(struct fd_ringbuffer *ring, struct fd6_emit *emit)
 
 void
 fd6_emit_cs_state(struct fd_context *ctx, struct fd_ringbuffer *ring,
-                  struct ir3_shader_variant *cp)
+                  struct fd6_compute_state *cs)
 {
    struct fd6_state state = {};
 
-   u_foreach_bit (b, ctx->gen_dirty) {
+   /* We want CP_SET_DRAW_STATE to execute immediately, otherwise we need to
+    * emit consts as draw state groups (which otherwise has no benefit outside
+    * of GMEM 3d using viz stream from binning pass).
+    *
+    * In particular, the PROG state group sets up the configuration for the
+    * const state, so it must execute before we start loading consts, rather
+    * than be deferred until CP_EXEC_CS.
+    */
+   OUT_PKT7(ring, CP_SET_MODE, 1);
+   OUT_RING(ring, 1);
+
+   uint32_t gen_dirty = ctx->gen_dirty &
+      (BIT(FD6_GROUP_PROG) | BIT(FD6_GROUP_CS_TEX) | BIT(FD6_GROUP_CS_BINDLESS));
+
+   u_foreach_bit (b, gen_dirty) {
       enum fd6_state_id group = b;
 
       switch (group) {
+      case FD6_GROUP_PROG:
+         fd6_state_add_group(&state, cs->stateobj, FD6_GROUP_PROG);
+         break;
       case FD6_GROUP_CS_TEX:
          fd6_state_take_group(
             &state,
diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_emit.h b/src/gallium/drivers/freedreno/a6xx/fd6_emit.h
index 0dc1f41bef3..d908eee9fb1 100644
--- a/src/gallium/drivers/freedreno/a6xx/fd6_emit.h
+++ b/src/gallium/drivers/freedreno/a6xx/fd6_emit.h
@@ -325,8 +325,9 @@ fd6_gl2spacing(enum gl_tess_spacing spacing)
 
 void fd6_emit_3d_state(struct fd_ringbuffer *ring,
                        struct fd6_emit *emit) assert_dt;
+struct fd6_compute_state;
 void fd6_emit_cs_state(struct fd_context *ctx, struct fd_ringbuffer *ring,
-                       struct ir3_shader_variant *cp) assert_dt;
+                       struct fd6_compute_state *cs) assert_dt;
 
 void fd6_emit_restore(struct fd_batch *batch, struct fd_ringbuffer *ring);
 
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_const.h b/src/gallium/drivers/freedreno/ir3/ir3_const.h
index c4ab8aa3c9c..a50c54d28cd 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_const.h
+++ b/src/gallium/drivers/freedreno/ir3/ir3_const.h
@@ -585,15 +585,12 @@ ir3_emit_fs_consts(const struct ir3_shader_variant *v,
    emit_common_consts(v, ring, ctx, PIPE_SHADER_FRAGMENT);
 }
 
-/* emit compute-shader consts: */
 static inline void
-ir3_emit_cs_consts(const struct ir3_shader_variant *v,
-                   struct fd_ringbuffer *ring, struct fd_context *ctx,
-                   const struct pipe_grid_info *info) assert_dt
+ir3_emit_cs_driver_params(const struct ir3_shader_variant *v,
+                          struct fd_ringbuffer *ring, struct fd_context *ctx,
+                          const struct pipe_grid_info *info)
+   assert_dt
 {
-   assert(gl_shader_stage_is_compute(v->type));
-
-   emit_common_consts(v, ring, ctx, PIPE_SHADER_COMPUTE);
    emit_kernel_params(ctx, v, ring, info);
 
    /* a3xx/a4xx can inject these directly */
@@ -651,3 +648,16 @@ ir3_emit_cs_consts(const struct ir3_shader_variant *v,
       }
    }
 }
+
+/* emit compute-shader consts: */
+static inline void
+ir3_emit_cs_consts(const struct ir3_shader_variant *v,
+                   struct fd_ringbuffer *ring, struct fd_context *ctx,
+                   const struct pipe_grid_info *info) assert_dt
+{
+   assert(gl_shader_stage_is_compute(v->type));
+
+   emit_common_consts(v, ring, ctx, PIPE_SHADER_COMPUTE);
+
+   ir3_emit_cs_driver_params(v, ring, ctx, info);
+}