diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_compute.cc b/src/gallium/drivers/freedreno/a6xx/fd6_compute.cc index d32cdb30ed4..fc619a96b8d 100644 --- a/src/gallium/drivers/freedreno/a6xx/fd6_compute.cc +++ b/src/gallium/drivers/freedreno/a6xx/fd6_compute.cc @@ -23,7 +23,47 @@ #include "fd6_emit.h" #include "fd6_pack.h" -/* maybe move to fd6_program? */ +template +static void +cs_program_emit_local_size(struct fd_context *ctx, struct fd_ringbuffer *ring, + struct ir3_shader_variant *v, uint16_t local_size[3]) +{ + /* + * Devices that do not support double threadsize take the threadsize from + * A6XX_HLSQ_FS_CNTL_0_THREADSIZE instead of A6XX_HLSQ_CS_CNTL_1_THREADSIZE + * which is always set to THREAD128. + */ + enum a6xx_threadsize thrsz = v->info.double_threadsize ? THREAD128 : THREAD64; + enum a6xx_threadsize thrsz_cs = ctx->screen->info->a6xx + .supports_double_threadsize ? thrsz : THREAD128; + + if (CHIP == A7XX) { + unsigned tile_height = (local_size[1] % 8 == 0) ? 3 + : (local_size[1] % 4 == 0) ? 5 + : (local_size[1] % 2 == 0) ? 9 + : 17; + + OUT_REG(ring, + HLSQ_CS_CNTL_1( + CHIP, + .linearlocalidregid = INVALID_REG, + .threadsize = thrsz_cs, + .workgrouprastorderzfirsten = true, + .wgtilewidth = 4, + .wgtileheight = tile_height, + ) + ); + + OUT_REG(ring, + A7XX_HLSQ_CS_LOCAL_SIZE( + .localsizex = local_size[0] - 1, + .localsizey = local_size[1] - 1, + .localsizez = local_size[2] - 1, + ) + ); + } +} + template static void cs_program_emit(struct fd_context *ctx, struct fd_ringbuffer *ring, @@ -86,22 +126,6 @@ cs_program_emit(struct fd_context *ctx, struct fd_ringbuffer *ring, A6XX_SP_CS_CNTL_1_THREADSIZE(thrsz)); } } else { - unsigned tile_height = (v->local_size[1] % 8 == 0) ? 3 - : (v->local_size[1] % 4 == 0) ? 5 - : (v->local_size[1] % 2 == 0) ? 9 - : 17; - - OUT_REG(ring, - HLSQ_CS_CNTL_1( - CHIP, - .linearlocalidregid = regid(63, 0), - .threadsize = thrsz_cs, - .workgrouprastorderzfirsten = true, - .wgtilewidth = 4, - .wgtileheight = tile_height, - ) - ); - OUT_REG(ring, HLSQ_FS_CNTL_0(CHIP, .threadsize = THREAD64)); OUT_REG(ring, A6XX_SP_CS_CNTL_0( @@ -121,16 +145,12 @@ cs_program_emit(struct fd_context *ctx, struct fd_ringbuffer *ring, : WORKITEMRASTORDER_TILED, ) ); - OUT_REG(ring, - A7XX_HLSQ_CS_LOCAL_SIZE( - .localsizex = v->local_size[0] - 1, - .localsizey = v->local_size[1] - 1, - .localsizez = v->local_size[2] - 1, - ) - ); OUT_REG(ring, A7XX_SP_CS_UNKNOWN_A9BE(0)); // Sometimes is 0x08000000 } + if (!v->local_size_variable) + cs_program_emit_local_size(ctx, ring, v, v->local_size); + fd6_emit_shader(ctx, ring, v); } @@ -216,6 +236,11 @@ fd6_launch_grid(struct fd_context *ctx, const struct pipe_grid_info *info) in_dt /* for some reason, mesa/st doesn't set info->work_dim, so just assume 3: */ const unsigned work_dim = info->work_dim ? info->work_dim : 3; + if (cs->v->local_size_variable) { + uint16_t wg[] = {local_size[0], local_size[1], local_size[2]}; + cs_program_emit_local_size(ctx, ring, cs->v, wg); + } + OUT_REG(ring, HLSQ_CS_NDRANGE_0( CHIP,