From 05fffc7b25379700548afe64466de1fd7b95de93 Mon Sep 17 00:00:00 2001 From: Danylo Piliaiev Date: Mon, 30 Jan 2023 18:09:42 +0100 Subject: [PATCH] freedreno,turnip: Make number of VSC pipes configurable a610/a608 have fewer pipes, so we need to make the number of VSC pipes configurable. In particular, we need to program all of the VSC_PIPE_CONFIG_REG[n] rather than leaving garbage values for the unused pipes. Pointing multiple VSC pipes at the same bin makes the hw angry. Signed-off-by: Danylo Piliaiev Part-of: --- src/freedreno/common/freedreno_devices.py | 15 +++++-- src/freedreno/vulkan/tu_cmd_buffer.cc | 24 ++++++----- src/freedreno/vulkan/tu_util.cc | 6 ++- .../drivers/freedreno/a6xx/fd6_gmem.cc | 43 ++++++++++--------- 4 files changed, 53 insertions(+), 35 deletions(-) diff --git a/src/freedreno/common/freedreno_devices.py b/src/freedreno/common/freedreno_devices.py index 39ce9215c49..b445c237b93 100644 --- a/src/freedreno/common/freedreno_devices.py +++ b/src/freedreno/common/freedreno_devices.py @@ -126,14 +126,13 @@ class A6xxGPUInfo(GPUInfo): into distinct sub-generations. The template parameter avoids duplication of parameters that are unique to the sub-generation. 
""" - def __init__(self, chip, template, num_ccu, tile_align_w, tile_align_h, magic_regs): + def __init__(self, chip, template, num_ccu, tile_align_w, tile_align_h, num_vsc_pipes, magic_regs): super().__init__(chip, gmem_align_w = 16, gmem_align_h = 4, tile_align_w = tile_align_w, tile_align_h = tile_align_h, tile_max_w = 1024, # max_bitfield_val(5, 0, 5) tile_max_h = max_bitfield_val(14, 8, 4), - num_vsc_pipes = 32) - + num_vsc_pipes = num_vsc_pipes) # The # of SP cores seems to always match # of CCU self.num_sp_cores = num_ccu self.num_ccu = num_ccu @@ -307,6 +306,7 @@ add_gpus([ num_ccu = 1, tile_align_w = 32, tile_align_h = 32, + num_vsc_pipes = 32, magic_regs = dict( PC_POWER_CNTL = 0, TPL1_DBG_ECO_CNTL = 0x00108000, @@ -332,6 +332,7 @@ add_gpus([ num_ccu = 1, tile_align_w = 32, tile_align_h = 16, + num_vsc_pipes = 32, magic_regs = dict( PC_POWER_CNTL = 0, TPL1_DBG_ECO_CNTL = 0x01008000, @@ -357,6 +358,7 @@ add_gpus([ num_ccu = 2, tile_align_w = 32, tile_align_h = 16, + num_vsc_pipes = 32, magic_regs = dict( PC_POWER_CNTL = 1, TPL1_DBG_ECO_CNTL = 0x00108000, @@ -382,6 +384,7 @@ add_gpus([ num_ccu = 2, tile_align_w = 32, tile_align_h = 16, + num_vsc_pipes = 32, magic_regs = dict( PC_POWER_CNTL = 1, TPL1_DBG_ECO_CNTL = 0x00008000, @@ -407,6 +410,7 @@ add_gpus([ num_ccu = 4, tile_align_w = 64, tile_align_h = 32, + num_vsc_pipes = 32, magic_regs = dict( PC_POWER_CNTL = 3, TPL1_DBG_ECO_CNTL = 0x00108000, @@ -432,6 +436,7 @@ add_gpus([ num_ccu = 3, tile_align_w = 96, tile_align_h = 16, + num_vsc_pipes = 32, magic_regs = dict( PC_POWER_CNTL = 2, # this seems to be a chicken bit that fixes cubic filtering: @@ -462,6 +467,7 @@ add_gpus([ num_ccu = 2, tile_align_w = 32, tile_align_h = 16, + num_vsc_pipes = 32, magic_regs = dict( PC_POWER_CNTL = 1, TPL1_DBG_ECO_CNTL = 0x05008000, @@ -487,6 +493,7 @@ add_gpus([ num_ccu = 3, tile_align_w = 96, tile_align_h = 16, + num_vsc_pipes = 32, magic_regs = dict( PC_POWER_CNTL = 2, TPL1_DBG_ECO_CNTL = 0x05008000, @@ -512,6 +519,7 @@ 
add_gpus([ num_ccu = 8, tile_align_w = 64, tile_align_h = 32, + num_vsc_pipes = 32, magic_regs = dict( PC_POWER_CNTL = 7, TPL1_DBG_ECO_CNTL = 0x01008000, @@ -539,6 +547,7 @@ add_gpus([ num_ccu = 4, tile_align_w = 64, tile_align_h = 32, + num_vsc_pipes = 32, magic_regs = dict() )) diff --git a/src/freedreno/vulkan/tu_cmd_buffer.cc b/src/freedreno/vulkan/tu_cmd_buffer.cc index b7accee3ca2..6324a1d9999 100644 --- a/src/freedreno/vulkan/tu_cmd_buffer.cc +++ b/src/freedreno/vulkan/tu_cmd_buffer.cc @@ -87,6 +87,7 @@ static void tu6_lazy_emit_vsc(struct tu_cmd_buffer *cmd, struct tu_cs *cs) { struct tu_device *dev = cmd->device; + uint32_t num_vsc_pipes = dev->physical_device->info->num_vsc_pipes; /* VSC buffers: * use vsc pitches from the largest values used so far with this device @@ -114,18 +115,19 @@ tu6_lazy_emit_vsc(struct tu_cmd_buffer *cmd, struct tu_cs *cs) mtx_unlock(&dev->mutex); struct tu_bo *vsc_bo; - uint32_t size0 = cmd->vsc_prim_strm_pitch * MAX_VSC_PIPES + - cmd->vsc_draw_strm_pitch * MAX_VSC_PIPES; + uint32_t size0 = cmd->vsc_prim_strm_pitch * num_vsc_pipes + + cmd->vsc_draw_strm_pitch * num_vsc_pipes; - tu_get_scratch_bo(dev, size0 + MAX_VSC_PIPES * 4, &vsc_bo); + tu_get_scratch_bo(dev, size0 + num_vsc_pipes * 4, &vsc_bo); tu_cs_emit_regs(cs, A6XX_VSC_DRAW_STRM_SIZE_ADDRESS(.bo = vsc_bo, .bo_offset = size0)); tu_cs_emit_regs(cs, A6XX_VSC_PRIM_STRM_ADDRESS(.bo = vsc_bo)); - tu_cs_emit_regs(cs, - A6XX_VSC_DRAW_STRM_ADDRESS(.bo = vsc_bo, - .bo_offset = cmd->vsc_prim_strm_pitch * MAX_VSC_PIPES)); + tu_cs_emit_regs( + cs, A6XX_VSC_DRAW_STRM_ADDRESS(.bo = vsc_bo, + .bo_offset = cmd->vsc_prim_strm_pitch * + num_vsc_pipes)); cmd->vsc_initialized = true; } @@ -1144,7 +1146,9 @@ tu6_init_hw(struct tu_cmd_buffer *cmd, struct tu_cs *cs) } static void -update_vsc_pipe(struct tu_cmd_buffer *cmd, struct tu_cs *cs) +update_vsc_pipe(struct tu_cmd_buffer *cmd, + struct tu_cs *cs, + uint32_t num_vsc_pipes) { const struct tu_tiling_config *tiling = cmd->state.tiling; @@ 
-1156,8 +1160,8 @@ update_vsc_pipe(struct tu_cmd_buffer *cmd, struct tu_cs *cs) A6XX_VSC_BIN_COUNT(.nx = tiling->tile_count.width, .ny = tiling->tile_count.height)); - tu_cs_emit_pkt4(cs, REG_A6XX_VSC_PIPE_CONFIG_REG(0), 32); - tu_cs_emit_array(cs, tiling->pipe_config, 32); + tu_cs_emit_pkt4(cs, REG_A6XX_VSC_PIPE_CONFIG_REG(0), num_vsc_pipes); + tu_cs_emit_array(cs, tiling->pipe_config, num_vsc_pipes); tu_cs_emit_regs(cs, A6XX_VSC_PRIM_STRM_PITCH(cmd->vsc_prim_strm_pitch), @@ -1244,7 +1248,7 @@ tu6_emit_binning_pass(struct tu_cmd_buffer *cmd, struct tu_cs *cs) tu_cs_emit_regs(cs, A6XX_VFD_MODE_CNTL(.render_mode = BINNING_PASS)); - update_vsc_pipe(cmd, cs); + update_vsc_pipe(cmd, cs, phys_dev->info->num_vsc_pipes); tu_cs_emit_regs(cs, A6XX_PC_POWER_CNTL(phys_dev->info->a6xx.magic.PC_POWER_CNTL)); diff --git a/src/freedreno/vulkan/tu_util.cc b/src/freedreno/vulkan/tu_util.cc index 1f24f4f2d9e..08fefb73d46 100644 --- a/src/freedreno/vulkan/tu_util.cc +++ b/src/freedreno/vulkan/tu_util.cc @@ -222,7 +222,8 @@ static void tu_tiling_config_update_pipe_layout(struct tu_tiling_config *tiling, const struct tu_device *dev) { - const uint32_t max_pipe_count = 32; /* A6xx */ + const uint32_t max_pipe_count = + dev->physical_device->info->num_vsc_pipes; /* start from 1 tile per pipe */ tiling->pipe0 = (VkExtent2D) { @@ -248,7 +249,8 @@ static void tu_tiling_config_update_pipes(struct tu_tiling_config *tiling, const struct tu_device *dev) { - const uint32_t max_pipe_count = 32; /* A6xx */ + const uint32_t max_pipe_count = + dev->physical_device->info->num_vsc_pipes; const uint32_t used_pipe_count = tiling->pipe_count.width * tiling->pipe_count.height; const VkExtent2D last_pipe = { diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_gmem.cc b/src/gallium/drivers/freedreno/a6xx/fd6_gmem.cc index e60958f5eda..55b5d64c898 100644 --- a/src/gallium/drivers/freedreno/a6xx/fd6_gmem.cc +++ b/src/gallium/drivers/freedreno/a6xx/fd6_gmem.cc @@ -494,10 +494,6 @@ update_render_cntl(struct 
fd_batch *batch, struct pipe_framebuffer_state *pfb, } } -/* extra size to store VSC_DRAW_STRM_SIZE: */ -#define VSC_DRAW_STRM_SIZE(pitch) ((pitch)*32 + 0x100) -#define VSC_PRIM_STRM_SIZE(pitch) ((pitch)*32) - static void update_vsc_pipe(struct fd_batch *batch) { @@ -505,6 +501,7 @@ update_vsc_pipe(struct fd_batch *batch) struct fd6_context *fd6_ctx = fd6_context(ctx); const struct fd_gmem_stateobj *gmem = batch->gmem_state; struct fd_ringbuffer *ring = batch->gmem; + unsigned max_vsc_pipes = batch->ctx->screen->info->num_vsc_pipes; int i; if (batch->draw_strm_bits / 8 > fd6_ctx->vsc_draw_strm_pitch) { @@ -530,27 +527,31 @@ update_vsc_pipe(struct fd_batch *batch) } if (!fd6_ctx->vsc_draw_strm) { - fd6_ctx->vsc_draw_strm = fd_bo_new( - ctx->screen->dev, VSC_DRAW_STRM_SIZE(fd6_ctx->vsc_draw_strm_pitch), - FD_BO_NOMAP, "vsc_draw_strm"); + /* We also use four bytes per vsc pipe at the end of the draw + * stream buffer for VSC_DRAW_STRM_SIZE written back by hw + * (see VSC_DRAW_STRM_SIZE_ADDRESS) + */ + unsigned sz = (max_vsc_pipes * fd6_ctx->vsc_draw_strm_pitch) + + (max_vsc_pipes * 4); + fd6_ctx->vsc_draw_strm = + fd_bo_new(ctx->screen->dev, sz, FD_BO_NOMAP, "vsc_draw_strm"); } if (!fd6_ctx->vsc_prim_strm) { - fd6_ctx->vsc_prim_strm = fd_bo_new( - ctx->screen->dev, VSC_PRIM_STRM_SIZE(fd6_ctx->vsc_prim_strm_pitch), - FD_BO_NOMAP, "vsc_prim_strm"); + unsigned sz = max_vsc_pipes * fd6_ctx->vsc_prim_strm_pitch; + fd6_ctx->vsc_prim_strm = + fd_bo_new(ctx->screen->dev, sz, FD_BO_NOMAP, "vsc_prim_strm"); } - OUT_REG( - ring, A6XX_VSC_BIN_SIZE(.width = gmem->bin_w, .height = gmem->bin_h), - A6XX_VSC_DRAW_STRM_SIZE_ADDRESS(.bo = fd6_ctx->vsc_draw_strm, - .bo_offset = - 32 * fd6_ctx->vsc_draw_strm_pitch)); + OUT_REG(ring, A6XX_VSC_BIN_SIZE(.width = gmem->bin_w, .height = gmem->bin_h), + A6XX_VSC_DRAW_STRM_SIZE_ADDRESS(.bo = fd6_ctx->vsc_draw_strm, + .bo_offset = max_vsc_pipes * + fd6_ctx->vsc_draw_strm_pitch)); OUT_REG(ring, A6XX_VSC_BIN_COUNT(.nx = gmem->nbins_x, .ny = 
gmem->nbins_y)); - OUT_PKT4(ring, REG_A6XX_VSC_PIPE_CONFIG_REG(0), 32); - for (i = 0; i < 32; i++) { + OUT_PKT4(ring, REG_A6XX_VSC_PIPE_CONFIG_REG(0), max_vsc_pipes); + for (i = 0; i < max_vsc_pipes; i++) { const struct fd_vsc_pipe *pipe = &gmem->vsc_pipe[i]; OUT_RING(ring, A6XX_VSC_PIPE_CONFIG_REG_X(pipe->x) | A6XX_VSC_PIPE_CONFIG_REG_Y(pipe->y) | @@ -1088,6 +1089,7 @@ fd6_emit_tile_prep(struct fd_batch *batch, const struct fd_tile *tile) if (use_hw_binning(batch)) { const struct fd_vsc_pipe *pipe = &gmem->vsc_pipe[tile->p]; + unsigned num_vsc_pipes = ctx->screen->info->num_vsc_pipes; OUT_PKT7(ring, CP_WAIT_FOR_ME, 0); @@ -1099,9 +1101,10 @@ fd6_emit_tile_prep(struct fd_batch *batch, const struct fd_tile *tile) CP_SET_BIN_DATA5_0_VSC_N(tile->n)); OUT_RELOC(ring, fd6_ctx->vsc_draw_strm, /* per-pipe draw-stream address */ (tile->p * fd6_ctx->vsc_draw_strm_pitch), 0, 0); - OUT_RELOC(ring, - fd6_ctx->vsc_draw_strm, /* VSC_DRAW_STRM_ADDRESS + (p * 4) */ - (tile->p * 4) + (32 * fd6_ctx->vsc_draw_strm_pitch), 0, 0); + OUT_RELOC( + ring, fd6_ctx->vsc_draw_strm, /* VSC_DRAW_STRM_ADDRESS + (p * 4) */ + (tile->p * 4) + (num_vsc_pipes * fd6_ctx->vsc_draw_strm_pitch), + 0, 0); OUT_RELOC(ring, fd6_ctx->vsc_prim_strm, (tile->p * fd6_ctx->vsc_prim_strm_pitch), 0, 0);