freedreno/a6xx: Support variable wg size

If local wg size isn't known at compile time, we need to move some of
the state emit out of the state object and into IB2 cmdstream.

This still doesn't account for the fact that RA currently must assume
the worst case, meaning limiting cl kernels to a miniumum number of regs
and spilling excessively.

Signed-off-by: Rob Clark <robdclark@chromium.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30835>
This commit is contained in:
Rob Clark 2024-09-12 11:38:52 -07:00 committed by Marge Bot
parent 1569219f51
commit f3211e243f

View file

@ -23,7 +23,47 @@
#include "fd6_emit.h"
#include "fd6_pack.h"
/* maybe move to fd6_program? */
template <chip CHIP>
static void
cs_program_emit_local_size(struct fd_context *ctx, struct fd_ringbuffer *ring,
struct ir3_shader_variant *v, uint16_t local_size[3])
{
/*
* Devices that do not support double threadsize take the threadsize from
* A6XX_HLSQ_FS_CNTL_0_THREADSIZE instead of A6XX_HLSQ_CS_CNTL_1_THREADSIZE
* which is always set to THREAD128.
*/
enum a6xx_threadsize thrsz = v->info.double_threadsize ? THREAD128 : THREAD64;
enum a6xx_threadsize thrsz_cs = ctx->screen->info->a6xx
.supports_double_threadsize ? thrsz : THREAD128;
if (CHIP == A7XX) {
unsigned tile_height = (local_size[1] % 8 == 0) ? 3
: (local_size[1] % 4 == 0) ? 5
: (local_size[1] % 2 == 0) ? 9
: 17;
OUT_REG(ring,
HLSQ_CS_CNTL_1(
CHIP,
.linearlocalidregid = INVALID_REG,
.threadsize = thrsz_cs,
.workgrouprastorderzfirsten = true,
.wgtilewidth = 4,
.wgtileheight = tile_height,
)
);
OUT_REG(ring,
A7XX_HLSQ_CS_LOCAL_SIZE(
.localsizex = local_size[0] - 1,
.localsizey = local_size[1] - 1,
.localsizez = local_size[2] - 1,
)
);
}
}
template <chip CHIP>
static void
cs_program_emit(struct fd_context *ctx, struct fd_ringbuffer *ring,
@ -86,22 +126,6 @@ cs_program_emit(struct fd_context *ctx, struct fd_ringbuffer *ring,
A6XX_SP_CS_CNTL_1_THREADSIZE(thrsz));
}
} else {
unsigned tile_height = (v->local_size[1] % 8 == 0) ? 3
: (v->local_size[1] % 4 == 0) ? 5
: (v->local_size[1] % 2 == 0) ? 9
: 17;
OUT_REG(ring,
HLSQ_CS_CNTL_1(
CHIP,
.linearlocalidregid = regid(63, 0),
.threadsize = thrsz_cs,
.workgrouprastorderzfirsten = true,
.wgtilewidth = 4,
.wgtileheight = tile_height,
)
);
OUT_REG(ring, HLSQ_FS_CNTL_0(CHIP, .threadsize = THREAD64));
OUT_REG(ring,
A6XX_SP_CS_CNTL_0(
@ -121,16 +145,12 @@ cs_program_emit(struct fd_context *ctx, struct fd_ringbuffer *ring,
: WORKITEMRASTORDER_TILED,
)
);
OUT_REG(ring,
A7XX_HLSQ_CS_LOCAL_SIZE(
.localsizex = v->local_size[0] - 1,
.localsizey = v->local_size[1] - 1,
.localsizez = v->local_size[2] - 1,
)
);
OUT_REG(ring, A7XX_SP_CS_UNKNOWN_A9BE(0)); // Sometimes is 0x08000000
}
if (!v->local_size_variable)
cs_program_emit_local_size<CHIP>(ctx, ring, v, v->local_size);
fd6_emit_shader<CHIP>(ctx, ring, v);
}
@ -216,6 +236,11 @@ fd6_launch_grid(struct fd_context *ctx, const struct pipe_grid_info *info) in_dt
/* for some reason, mesa/st doesn't set info->work_dim, so just assume 3: */
const unsigned work_dim = info->work_dim ? info->work_dim : 3;
if (cs->v->local_size_variable) {
uint16_t wg[] = {local_size[0], local_size[1], local_size[2]};
cs_program_emit_local_size<CHIP>(ctx, ring, cs->v, wg);
}
OUT_REG(ring,
HLSQ_CS_NDRANGE_0(
CHIP,