From 47fa229605010257858e561e64984fceb4c89230 Mon Sep 17 00:00:00 2001 From: Jose Maria Casanova Crespo Date: Thu, 19 Mar 2026 10:37:40 +0100 Subject: [PATCH] broadcom/common: add tile alloc block size macros and sizing helper MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add V3D_TILE_ALLOC_INITIAL_BLOCK_SIZE = 128 and V3D_TILE_ALLOC_OVERFLOW_BLOCK_SIZE = 64 to v3d_limits.h. Corresponding _ENUM macros provide the 2-bit hardware encoding for the TILE_BINNING_MODE_CFG packets. The previous implicit 64B initial blocks were too small: a single draw call emits ~88 bytes of per-tile BCL state, immediately overflowing into continuation blocks. 128B initial blocks avoid the first continuation allocation for simple single-draw passes. Add v3d_tile_alloc_sizes() to v3d_util with the full tile alloc BO and TSDA sizing logic. This uses the 128B initial blocks, and the tile alloc headroom becomes proportional to the number of draws and the size of the initial block allocation, capped at the previous fixed allocation. So jobs with 0 or 1 draw calls (blits/fills) reduce their headroom dramatically. The draw-proportional formula replaces a flat 512 KB continuation pool: headroom = MIN2((tiles_size * draw_count) / 2, 512 KB) Benchmarked on RPi5 (V3D 7.1) against GfxBench GL tests and apitrace replays at 1080p. Tile-alloc memory reduction versus the flat 512 KB headroom (taking into account the 256 KB kernel allocation per OOM): GfxBench (5 benchmarks): -45% to -70% reduction, OOM at or below baseline Apitrace (19 traces): -4% to -77% reduction on 20/24 traces No FPS regressions observed on any workload. 
Reviewed-by: Maíra Canal Reviewed-by: Iago Toral Quiroga Part-of: --- src/broadcom/common/v3d_limits.h | 16 +++++++++++ src/broadcom/common/v3d_util.c | 46 ++++++++++++++++++++++++++++++++ src/broadcom/common/v3d_util.h | 9 +++++++ 3 files changed, 71 insertions(+) diff --git a/src/broadcom/common/v3d_limits.h b/src/broadcom/common/v3d_limits.h index 357e51bc675..157b73700cc 100644 --- a/src/broadcom/common/v3d_limits.h +++ b/src/broadcom/common/v3d_limits.h @@ -71,4 +71,20 @@ #define V3D_MAX_VERTEX_ATTRIB_DIVISOR 0xffff +/* Tile allocation block sizes for the PTB, as enum values matching + * the TILE_BINNING_MODE_CFG / TILE_LIST_INITIAL_BLOCK_SIZE packets. + * The byte size is 64 << enum_value (0 = 64B, 1 = 128B, 2 = 256B). + * + * Using 128B initial blocks avoids tile overflow for simple draws + * (a single draw emits ~88 bytes of state per tile). 64B continuation + * blocks reduce internal fragmentation in the tile allocation pool. + */ +#define V3D_TILE_ALLOC_INITIAL_BLOCK_SIZE 128 +#define V3D_TILE_ALLOC_OVERFLOW_BLOCK_SIZE 64 + +#define V3D_TILE_ALLOC_INITIAL_BLOCK_SIZE_ENUM \ + (V3D_TILE_ALLOC_INITIAL_BLOCK_SIZE >> 7) +#define V3D_TILE_ALLOC_OVERFLOW_BLOCK_SIZE_ENUM \ + (V3D_TILE_ALLOC_OVERFLOW_BLOCK_SIZE >> 7) + #endif /* V3D_LIMITS_H */ diff --git a/src/broadcom/common/v3d_util.c b/src/broadcom/common/v3d_util.c index 65233dabfc4..c967ede6153 100644 --- a/src/broadcom/common/v3d_util.c +++ b/src/broadcom/common/v3d_util.c @@ -22,7 +22,9 @@ */ #include "v3d_util.h" +#include "v3d_limits.h" #include "util/macros.h" +#include "util/u_math.h" /* Choose a number of workgroups per supergroup that maximizes * lane occupancy. We can pack up to 16 workgroups into a supergroup. 
@@ -266,6 +268,50 @@ v3d_internal_bpp_words(uint32_t internal_bpp) } } +void +v3d_tile_alloc_sizes(uint32_t layers, + uint32_t tiles_x, + uint32_t tiles_y, + uint32_t draws, + uint32_t page_size, + uint32_t *tile_alloc_size, + uint32_t *tile_state_size) +{ + assert(layers > 0); + /* The PTB will request the tile alloc initial size per tile at start + * of tile binning. The size must match the initial block size + * configured in the TILE_BINNING_MODE_CFG packet. + */ + uint32_t tiles_size = + layers * tiles_x * tiles_y * V3D_TILE_ALLOC_INITIAL_BLOCK_SIZE; + + /* The PTB allocates in aligned 4k chunks after the initial setup. */ + uint32_t alloc_size = align(tiles_size, 4096); + + /* Include the first two chunk allocations that the PTB does so that + * we definitely clear the OOM condition before triggering one (the HW + * won't trigger OOM during the first allocations). + */ + alloc_size += 8192; + + /* Pre-allocate a continuation pool so the GPU rarely has to stall + * waiting for the kernel OOM handler. Each draw call writes per-tile + * BCL state (primitives, uniforms, shader records) whose size scales + * with both the number of tiles and the number of draws. Use the + * product (tiles_size * draws) / 2 as an estimate, capped at 512 KB + * to avoid over-allocating on high draw-count scenes. Align the + * total to page_size. + * The formula assumes the initial block size of 128B, so if it is + * changed it needs to be adjusted. 
+ */ + STATIC_ASSERT(V3D_TILE_ALLOC_INITIAL_BLOCK_SIZE == 128); + alloc_size += MIN2((tiles_size * draws) / 2, 512 * 1024); + alloc_size = align(alloc_size, page_size); + + *tile_alloc_size = alloc_size; + *tile_state_size = layers * tiles_x * tiles_y * 256; +} + uint32_t v3d_compute_rt_row_row_stride_128_bits(uint32_t tile_width, uint32_t bpp) diff --git a/src/broadcom/common/v3d_util.h b/src/broadcom/common/v3d_util.h index 32856a2335f..7b60d938faf 100644 --- a/src/broadcom/common/v3d_util.h +++ b/src/broadcom/common/v3d_util.h @@ -98,6 +98,15 @@ log2_tile_size(uint32_t size) } } +void +v3d_tile_alloc_sizes(uint32_t layers, + uint32_t tiles_x, + uint32_t tiles_y, + uint32_t draws, + uint32_t page_size, + uint32_t *tile_alloc_size, + uint32_t *tile_state_size); + uint32_t v3d_compute_rt_row_row_stride_128_bits(uint32_t tile_width, uint32_t bpp);