freedreno,turnip: Make number of VSC pipes configurable

a610/a608 have fewer VSC pipes, so the number of pipes needs to be configurable.

In particular we need to program all of the VSC_PIPE_CONFIG_REG[n]
rather than leaving garbage values for the unused pipes.  Pointing
multiple VSC pipes at the same bin makes the hw angry.

Signed-off-by: Danylo Piliaiev <dpiliaiev@igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/20991>
This commit is contained in:
Danylo Piliaiev 2023-01-30 18:09:42 +01:00 committed by Marge Bot
parent 0814c2c7a6
commit 05fffc7b25
4 changed files with 53 additions and 35 deletions

View file

@ -126,14 +126,13 @@ class A6xxGPUInfo(GPUInfo):
into distinct sub-generations. The template parameter avoids
duplication of parameters that are unique to the sub-generation.
"""
def __init__(self, chip, template, num_ccu, tile_align_w, tile_align_h, magic_regs):
def __init__(self, chip, template, num_ccu, tile_align_w, tile_align_h, num_vsc_pipes, magic_regs):
super().__init__(chip, gmem_align_w = 16, gmem_align_h = 4,
tile_align_w = tile_align_w,
tile_align_h = tile_align_h,
tile_max_w = 1024, # max_bitfield_val(5, 0, 5)
tile_max_h = max_bitfield_val(14, 8, 4),
num_vsc_pipes = 32)
num_vsc_pipes = num_vsc_pipes)
# The # of SP cores seems to always match # of CCU
self.num_sp_cores = num_ccu
self.num_ccu = num_ccu
@ -307,6 +306,7 @@ add_gpus([
num_ccu = 1,
tile_align_w = 32,
tile_align_h = 32,
num_vsc_pipes = 32,
magic_regs = dict(
PC_POWER_CNTL = 0,
TPL1_DBG_ECO_CNTL = 0x00108000,
@ -332,6 +332,7 @@ add_gpus([
num_ccu = 1,
tile_align_w = 32,
tile_align_h = 16,
num_vsc_pipes = 32,
magic_regs = dict(
PC_POWER_CNTL = 0,
TPL1_DBG_ECO_CNTL = 0x01008000,
@ -357,6 +358,7 @@ add_gpus([
num_ccu = 2,
tile_align_w = 32,
tile_align_h = 16,
num_vsc_pipes = 32,
magic_regs = dict(
PC_POWER_CNTL = 1,
TPL1_DBG_ECO_CNTL = 0x00108000,
@ -382,6 +384,7 @@ add_gpus([
num_ccu = 2,
tile_align_w = 32,
tile_align_h = 16,
num_vsc_pipes = 32,
magic_regs = dict(
PC_POWER_CNTL = 1,
TPL1_DBG_ECO_CNTL = 0x00008000,
@ -407,6 +410,7 @@ add_gpus([
num_ccu = 4,
tile_align_w = 64,
tile_align_h = 32,
num_vsc_pipes = 32,
magic_regs = dict(
PC_POWER_CNTL = 3,
TPL1_DBG_ECO_CNTL = 0x00108000,
@ -432,6 +436,7 @@ add_gpus([
num_ccu = 3,
tile_align_w = 96,
tile_align_h = 16,
num_vsc_pipes = 32,
magic_regs = dict(
PC_POWER_CNTL = 2,
# this seems to be a chicken bit that fixes cubic filtering:
@ -462,6 +467,7 @@ add_gpus([
num_ccu = 2,
tile_align_w = 32,
tile_align_h = 16,
num_vsc_pipes = 32,
magic_regs = dict(
PC_POWER_CNTL = 1,
TPL1_DBG_ECO_CNTL = 0x05008000,
@ -487,6 +493,7 @@ add_gpus([
num_ccu = 3,
tile_align_w = 96,
tile_align_h = 16,
num_vsc_pipes = 32,
magic_regs = dict(
PC_POWER_CNTL = 2,
TPL1_DBG_ECO_CNTL = 0x05008000,
@ -512,6 +519,7 @@ add_gpus([
num_ccu = 8,
tile_align_w = 64,
tile_align_h = 32,
num_vsc_pipes = 32,
magic_regs = dict(
PC_POWER_CNTL = 7,
TPL1_DBG_ECO_CNTL = 0x01008000,
@ -539,6 +547,7 @@ add_gpus([
num_ccu = 4,
tile_align_w = 64,
tile_align_h = 32,
num_vsc_pipes = 32,
magic_regs = dict()
))

View file

@ -87,6 +87,7 @@ static void
tu6_lazy_emit_vsc(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
struct tu_device *dev = cmd->device;
uint32_t num_vsc_pipes = dev->physical_device->info->num_vsc_pipes;
/* VSC buffers:
* use vsc pitches from the largest values used so far with this device
@ -114,18 +115,19 @@ tu6_lazy_emit_vsc(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
mtx_unlock(&dev->mutex);
struct tu_bo *vsc_bo;
uint32_t size0 = cmd->vsc_prim_strm_pitch * MAX_VSC_PIPES +
cmd->vsc_draw_strm_pitch * MAX_VSC_PIPES;
uint32_t size0 = cmd->vsc_prim_strm_pitch * num_vsc_pipes +
cmd->vsc_draw_strm_pitch * num_vsc_pipes;
tu_get_scratch_bo(dev, size0 + MAX_VSC_PIPES * 4, &vsc_bo);
tu_get_scratch_bo(dev, size0 + num_vsc_pipes * 4, &vsc_bo);
tu_cs_emit_regs(cs,
A6XX_VSC_DRAW_STRM_SIZE_ADDRESS(.bo = vsc_bo, .bo_offset = size0));
tu_cs_emit_regs(cs,
A6XX_VSC_PRIM_STRM_ADDRESS(.bo = vsc_bo));
tu_cs_emit_regs(cs,
A6XX_VSC_DRAW_STRM_ADDRESS(.bo = vsc_bo,
.bo_offset = cmd->vsc_prim_strm_pitch * MAX_VSC_PIPES));
tu_cs_emit_regs(
cs, A6XX_VSC_DRAW_STRM_ADDRESS(.bo = vsc_bo,
.bo_offset = cmd->vsc_prim_strm_pitch *
num_vsc_pipes));
cmd->vsc_initialized = true;
}
@ -1144,7 +1146,9 @@ tu6_init_hw(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
}
static void
update_vsc_pipe(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
update_vsc_pipe(struct tu_cmd_buffer *cmd,
struct tu_cs *cs,
uint32_t num_vsc_pipes)
{
const struct tu_tiling_config *tiling = cmd->state.tiling;
@ -1156,8 +1160,8 @@ update_vsc_pipe(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
A6XX_VSC_BIN_COUNT(.nx = tiling->tile_count.width,
.ny = tiling->tile_count.height));
tu_cs_emit_pkt4(cs, REG_A6XX_VSC_PIPE_CONFIG_REG(0), 32);
tu_cs_emit_array(cs, tiling->pipe_config, 32);
tu_cs_emit_pkt4(cs, REG_A6XX_VSC_PIPE_CONFIG_REG(0), num_vsc_pipes);
tu_cs_emit_array(cs, tiling->pipe_config, num_vsc_pipes);
tu_cs_emit_regs(cs,
A6XX_VSC_PRIM_STRM_PITCH(cmd->vsc_prim_strm_pitch),
@ -1244,7 +1248,7 @@ tu6_emit_binning_pass(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
tu_cs_emit_regs(cs,
A6XX_VFD_MODE_CNTL(.render_mode = BINNING_PASS));
update_vsc_pipe(cmd, cs);
update_vsc_pipe(cmd, cs, phys_dev->info->num_vsc_pipes);
tu_cs_emit_regs(cs,
A6XX_PC_POWER_CNTL(phys_dev->info->a6xx.magic.PC_POWER_CNTL));

View file

@ -222,7 +222,8 @@ static void
tu_tiling_config_update_pipe_layout(struct tu_tiling_config *tiling,
const struct tu_device *dev)
{
const uint32_t max_pipe_count = 32; /* A6xx */
const uint32_t max_pipe_count =
dev->physical_device->info->num_vsc_pipes;
/* start from 1 tile per pipe */
tiling->pipe0 = (VkExtent2D) {
@ -248,7 +249,8 @@ static void
tu_tiling_config_update_pipes(struct tu_tiling_config *tiling,
const struct tu_device *dev)
{
const uint32_t max_pipe_count = 32; /* A6xx */
const uint32_t max_pipe_count =
dev->physical_device->info->num_vsc_pipes;
const uint32_t used_pipe_count =
tiling->pipe_count.width * tiling->pipe_count.height;
const VkExtent2D last_pipe = {

View file

@ -494,10 +494,6 @@ update_render_cntl(struct fd_batch *batch, struct pipe_framebuffer_state *pfb,
}
}
/* extra size to store VSC_DRAW_STRM_SIZE: */
#define VSC_DRAW_STRM_SIZE(pitch) ((pitch)*32 + 0x100)
#define VSC_PRIM_STRM_SIZE(pitch) ((pitch)*32)
static void
update_vsc_pipe(struct fd_batch *batch)
{
@ -505,6 +501,7 @@ update_vsc_pipe(struct fd_batch *batch)
struct fd6_context *fd6_ctx = fd6_context(ctx);
const struct fd_gmem_stateobj *gmem = batch->gmem_state;
struct fd_ringbuffer *ring = batch->gmem;
unsigned max_vsc_pipes = batch->ctx->screen->info->num_vsc_pipes;
int i;
if (batch->draw_strm_bits / 8 > fd6_ctx->vsc_draw_strm_pitch) {
@ -530,27 +527,31 @@ update_vsc_pipe(struct fd_batch *batch)
}
if (!fd6_ctx->vsc_draw_strm) {
fd6_ctx->vsc_draw_strm = fd_bo_new(
ctx->screen->dev, VSC_DRAW_STRM_SIZE(fd6_ctx->vsc_draw_strm_pitch),
FD_BO_NOMAP, "vsc_draw_strm");
/* We also use four bytes per vsc pipe at the end of the draw
* stream buffer for VSC_DRAW_STRM_SIZE written back by hw
* (see VSC_DRAW_STRM_SIZE_ADDRESS)
*/
unsigned sz = (max_vsc_pipes * fd6_ctx->vsc_draw_strm_pitch) +
(max_vsc_pipes * 4);
fd6_ctx->vsc_draw_strm =
fd_bo_new(ctx->screen->dev, sz, FD_BO_NOMAP, "vsc_draw_strm");
}
if (!fd6_ctx->vsc_prim_strm) {
fd6_ctx->vsc_prim_strm = fd_bo_new(
ctx->screen->dev, VSC_PRIM_STRM_SIZE(fd6_ctx->vsc_prim_strm_pitch),
FD_BO_NOMAP, "vsc_prim_strm");
unsigned sz = max_vsc_pipes * fd6_ctx->vsc_prim_strm_pitch;
fd6_ctx->vsc_prim_strm =
fd_bo_new(ctx->screen->dev, sz, FD_BO_NOMAP, "vsc_prim_strm");
}
OUT_REG(
ring, A6XX_VSC_BIN_SIZE(.width = gmem->bin_w, .height = gmem->bin_h),
A6XX_VSC_DRAW_STRM_SIZE_ADDRESS(.bo = fd6_ctx->vsc_draw_strm,
.bo_offset =
32 * fd6_ctx->vsc_draw_strm_pitch));
OUT_REG(ring, A6XX_VSC_BIN_SIZE(.width = gmem->bin_w, .height = gmem->bin_h),
A6XX_VSC_DRAW_STRM_SIZE_ADDRESS(.bo = fd6_ctx->vsc_draw_strm,
.bo_offset = max_vsc_pipes *
fd6_ctx->vsc_draw_strm_pitch));
OUT_REG(ring, A6XX_VSC_BIN_COUNT(.nx = gmem->nbins_x, .ny = gmem->nbins_y));
OUT_PKT4(ring, REG_A6XX_VSC_PIPE_CONFIG_REG(0), 32);
for (i = 0; i < 32; i++) {
OUT_PKT4(ring, REG_A6XX_VSC_PIPE_CONFIG_REG(0), max_vsc_pipes);
for (i = 0; i < max_vsc_pipes; i++) {
const struct fd_vsc_pipe *pipe = &gmem->vsc_pipe[i];
OUT_RING(ring, A6XX_VSC_PIPE_CONFIG_REG_X(pipe->x) |
A6XX_VSC_PIPE_CONFIG_REG_Y(pipe->y) |
@ -1088,6 +1089,7 @@ fd6_emit_tile_prep(struct fd_batch *batch, const struct fd_tile *tile)
if (use_hw_binning(batch)) {
const struct fd_vsc_pipe *pipe = &gmem->vsc_pipe[tile->p];
unsigned num_vsc_pipes = ctx->screen->info->num_vsc_pipes;
OUT_PKT7(ring, CP_WAIT_FOR_ME, 0);
@ -1099,9 +1101,10 @@ fd6_emit_tile_prep(struct fd_batch *batch, const struct fd_tile *tile)
CP_SET_BIN_DATA5_0_VSC_N(tile->n));
OUT_RELOC(ring, fd6_ctx->vsc_draw_strm, /* per-pipe draw-stream address */
(tile->p * fd6_ctx->vsc_draw_strm_pitch), 0, 0);
OUT_RELOC(ring,
fd6_ctx->vsc_draw_strm, /* VSC_DRAW_STRM_ADDRESS + (p * 4) */
(tile->p * 4) + (32 * fd6_ctx->vsc_draw_strm_pitch), 0, 0);
OUT_RELOC(
ring, fd6_ctx->vsc_draw_strm, /* VSC_DRAW_STRM_ADDRESS + (p * 4) */
(tile->p * 4) + (num_vsc_pipes * fd6_ctx->vsc_draw_strm_pitch),
0, 0);
OUT_RELOC(ring, fd6_ctx->vsc_prim_strm,
(tile->p * fd6_ctx->vsc_prim_strm_pitch), 0, 0);