From f3211e243f32fbec2277dce147b6272bb38f2de7 Mon Sep 17 00:00:00 2001 From: Rob Clark Date: Thu, 12 Sep 2024 11:38:52 -0700 Subject: [PATCH] freedreno/a6xx: Support variable wg size If local wg size isn't known at compile time, we need to move some of the state emit out of the state object and into IB2 cmdstream. This still doesn't account for the fact that RA currently must assume the worst case, meaning limiting cl kernels to a minimum number of regs and spilling excessively. Signed-off-by: Rob Clark Part-of: --- .../drivers/freedreno/a6xx/fd6_compute.cc | 73 +++++++++++++------ 1 file changed, 49 insertions(+), 24 deletions(-) diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_compute.cc b/src/gallium/drivers/freedreno/a6xx/fd6_compute.cc index d32cdb30ed4..fc619a96b8d 100644 --- a/src/gallium/drivers/freedreno/a6xx/fd6_compute.cc +++ b/src/gallium/drivers/freedreno/a6xx/fd6_compute.cc @@ -23,7 +23,47 @@ #include "fd6_emit.h" #include "fd6_pack.h" -/* maybe move to fd6_program? */ +template +static void +cs_program_emit_local_size(struct fd_context *ctx, struct fd_ringbuffer *ring, + struct ir3_shader_variant *v, uint16_t local_size[3]) +{ + /* + * Devices that do not support double threadsize take the threadsize from + * A6XX_HLSQ_FS_CNTL_0_THREADSIZE instead of A6XX_HLSQ_CS_CNTL_1_THREADSIZE + * which is always set to THREAD128. + */ + enum a6xx_threadsize thrsz = v->info.double_threadsize ? THREAD128 : THREAD64; + enum a6xx_threadsize thrsz_cs = ctx->screen->info->a6xx + .supports_double_threadsize ? thrsz : THREAD128; + + if (CHIP == A7XX) { + unsigned tile_height = (local_size[1] % 8 == 0) ? 3 + : (local_size[1] % 4 == 0) ? 5 + : (local_size[1] % 2 == 0) ? 
9 + : 17; + + OUT_REG(ring, + HLSQ_CS_CNTL_1( + CHIP, + .linearlocalidregid = INVALID_REG, + .threadsize = thrsz_cs, + .workgrouprastorderzfirsten = true, + .wgtilewidth = 4, + .wgtileheight = tile_height, + ) + ); + + OUT_REG(ring, + A7XX_HLSQ_CS_LOCAL_SIZE( + .localsizex = local_size[0] - 1, + .localsizey = local_size[1] - 1, + .localsizez = local_size[2] - 1, + ) + ); + } +} + template static void cs_program_emit(struct fd_context *ctx, struct fd_ringbuffer *ring, @@ -86,22 +126,6 @@ cs_program_emit(struct fd_context *ctx, struct fd_ringbuffer *ring, A6XX_SP_CS_CNTL_1_THREADSIZE(thrsz)); } } else { - unsigned tile_height = (v->local_size[1] % 8 == 0) ? 3 - : (v->local_size[1] % 4 == 0) ? 5 - : (v->local_size[1] % 2 == 0) ? 9 - : 17; - - OUT_REG(ring, - HLSQ_CS_CNTL_1( - CHIP, - .linearlocalidregid = regid(63, 0), - .threadsize = thrsz_cs, - .workgrouprastorderzfirsten = true, - .wgtilewidth = 4, - .wgtileheight = tile_height, - ) - ); - OUT_REG(ring, HLSQ_FS_CNTL_0(CHIP, .threadsize = THREAD64)); OUT_REG(ring, A6XX_SP_CS_CNTL_0( @@ -121,16 +145,12 @@ cs_program_emit(struct fd_context *ctx, struct fd_ringbuffer *ring, : WORKITEMRASTORDER_TILED, ) ); - OUT_REG(ring, - A7XX_HLSQ_CS_LOCAL_SIZE( - .localsizex = v->local_size[0] - 1, - .localsizey = v->local_size[1] - 1, - .localsizez = v->local_size[2] - 1, - ) - ); OUT_REG(ring, A7XX_SP_CS_UNKNOWN_A9BE(0)); // Sometimes is 0x08000000 } + if (!v->local_size_variable) + cs_program_emit_local_size(ctx, ring, v, v->local_size); + fd6_emit_shader(ctx, ring, v); } @@ -216,6 +236,11 @@ fd6_launch_grid(struct fd_context *ctx, const struct pipe_grid_info *info) in_dt /* for some reason, mesa/st doesn't set info->work_dim, so just assume 3: */ const unsigned work_dim = info->work_dim ? info->work_dim : 3; + if (cs->v->local_size_variable) { + uint16_t wg[] = {local_size[0], local_size[1], local_size[2]}; + cs_program_emit_local_size(ctx, ring, cs->v, wg); + } + OUT_REG(ring, HLSQ_CS_NDRANGE_0( CHIP,