mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-05-07 04:58:05 +02:00
broadcom/common: add tile alloc block size macros and sizing helper
Add V3D_TILE_ALLOC_INITIAL_BLOCK_SIZE = 128 and V3D_TILE_ALLOC_OVERFLOW_BLOCK_SIZE = 64 to v3d_limits.h. The corresponding _ENUM macros provide the 2-bit hardware encoding for the TILE_BINNING_MODE_CFG packets. The previous implicit 64B initial blocks were too small: a single draw call emits ~88 bytes of per-tile BCL state, immediately overflowing into continuation blocks. 128B initial blocks avoid the first continuation allocation for simple single-draw passes. Add v3d_tile_alloc_sizes() to v3d_util with the full tile alloc BO and TSDA sizing logic. It uses the 128B initial blocks, and the tile alloc headroom becomes proportional to the number of draws and to the size of the initial block allocation, capped at the previous fixed allocation. As a result, jobs with 0 or 1 draw calls (blits/fills) reduce their headroom dramatically. The draw-proportional formula replaces a flat 512 KB continuation pool: headroom = MIN2((tiles_size * draw_count) / 2, 512 KB). Benchmarked on RPi5 (V3D 7.1) against GfxBench GL tests and apitrace replays at 1080p. Tile-alloc memory reduction versus the flat 512 KB headroom (taking into account the 256 KB kernel allocation per OOM): GfxBench (5 benchmarks): -45% to -70% reduction, OOM at or below baseline. Apitrace (19 traces): -4% to -77% reduction on 20/24 traces. No FPS regressions observed on any workload. Reviewed-by: Maíra Canal <mcanal@igalia.com> Reviewed-by: Iago Toral Quiroga <itoral@igalia.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/40554>
This commit is contained in:
parent
4542982062
commit
47fa229605
3 changed files with 71 additions and 0 deletions
|
|
@ -71,4 +71,20 @@
|
|||
|
||||
#define V3D_MAX_VERTEX_ATTRIB_DIVISOR 0xffff
|
||||
|
||||
/* Tile allocation block sizes for the PTB, in bytes, together with the
 * enum values matching the TILE_BINNING_MODE_CFG /
 * TILE_LIST_INITIAL_BLOCK_SIZE packets.
 * The byte size is 64 << enum_value (0 = 64B, 1 = 128B, 2 = 256B).
 *
 * Using 128B initial blocks avoids tile overflow for simple draws
 * (a single draw emits ~88 bytes of state per tile). 64B continuation
 * blocks reduce internal fragmentation in the tile allocation pool.
 */
#define V3D_TILE_ALLOC_INITIAL_BLOCK_SIZE 128
#define V3D_TILE_ALLOC_OVERFLOW_BLOCK_SIZE 64

/* The _ENUM macros below derive the packet encoding with "size >> 7".
 * That shortcut only agrees with "64 << enum_value" for the three sizes
 * listed above (64 >> 7 = 0, 128 >> 7 = 1, 256 >> 7 = 2); any other
 * byte size would silently produce a wrong encoding, so reject it at
 * compile time.
 */
#if V3D_TILE_ALLOC_INITIAL_BLOCK_SIZE != 64 && \
    V3D_TILE_ALLOC_INITIAL_BLOCK_SIZE != 128 && \
    V3D_TILE_ALLOC_INITIAL_BLOCK_SIZE != 256
#error "V3D_TILE_ALLOC_INITIAL_BLOCK_SIZE must be 64, 128 or 256"
#endif

#if V3D_TILE_ALLOC_OVERFLOW_BLOCK_SIZE != 64 && \
    V3D_TILE_ALLOC_OVERFLOW_BLOCK_SIZE != 128 && \
    V3D_TILE_ALLOC_OVERFLOW_BLOCK_SIZE != 256
#error "V3D_TILE_ALLOC_OVERFLOW_BLOCK_SIZE must be 64, 128 or 256"
#endif

#define V3D_TILE_ALLOC_INITIAL_BLOCK_SIZE_ENUM \
        (V3D_TILE_ALLOC_INITIAL_BLOCK_SIZE >> 7)
#define V3D_TILE_ALLOC_OVERFLOW_BLOCK_SIZE_ENUM \
        (V3D_TILE_ALLOC_OVERFLOW_BLOCK_SIZE >> 7)
|
||||
|
||||
#endif /* V3D_LIMITS_H */
|
||||
|
|
|
|||
|
|
@ -22,7 +22,9 @@
|
|||
*/
|
||||
|
||||
#include "v3d_util.h"
|
||||
#include "v3d_limits.h"
|
||||
#include "util/macros.h"
|
||||
#include "util/u_math.h"
|
||||
|
||||
/* Choose a number of workgroups per supergroup that maximizes
|
||||
* lane occupancy. We can pack up to 16 workgroups into a supergroup.
|
||||
|
|
@ -266,6 +268,50 @@ v3d_internal_bpp_words(uint32_t internal_bpp)
|
|||
}
|
||||
}
|
||||
|
||||
void
|
||||
v3d_tile_alloc_sizes(uint32_t layers,
|
||||
uint32_t tiles_x,
|
||||
uint32_t tiles_y,
|
||||
uint32_t draws,
|
||||
uint32_t page_size,
|
||||
uint32_t *tile_alloc_size,
|
||||
uint32_t *tile_state_size)
|
||||
{
|
||||
assert(layers > 0);
|
||||
/* The PTB will request the tile alloc initial size per tile at start
|
||||
* of tile binning. The size must match the initial block size
|
||||
* configured in the TILE_BINNING_MODE_CFG packet.
|
||||
*/
|
||||
uint32_t tiles_size =
|
||||
layers * tiles_x * tiles_y * V3D_TILE_ALLOC_INITIAL_BLOCK_SIZE;
|
||||
|
||||
/* The PTB allocates in aligned 4k chunks after the initial setup. */
|
||||
uint32_t alloc_size = align(tiles_size, 4096);
|
||||
|
||||
/* Include the first two chunk allocations that the PTB does so that
|
||||
* we definitely clear the OOM condition before triggering one (the HW
|
||||
* won't trigger OOM during the first allocations).
|
||||
*/
|
||||
alloc_size += 8192;
|
||||
|
||||
/* Pre-allocate a continuation pool so the GPU rarely has to stall
|
||||
* waiting for the kernel OOM handler. Each draw call writes per-tile
|
||||
* BCL state (primitives, uniforms, shader records) whose size scales
|
||||
* with both the number of tiles and the number of draws. Use the
|
||||
* product (tiles_size * draws) / 2 as an estimate, capped at 512 KB
|
||||
* to avoid over-allocating on high draw-count scenes. Align the
|
||||
* total to page_size.
|
||||
* The formula assumes the initial block size of 128B, so if it is
|
||||
* changed it needs to be adjusted.
|
||||
*/
|
||||
STATIC_ASSERT(V3D_TILE_ALLOC_INITIAL_BLOCK_SIZE == 128);
|
||||
alloc_size += MIN2((tiles_size * draws) / 2, 512 * 1024);
|
||||
alloc_size = align(alloc_size, page_size);
|
||||
|
||||
*tile_alloc_size = alloc_size;
|
||||
*tile_state_size = layers * tiles_x * tiles_y * 256;
|
||||
}
|
||||
|
||||
uint32_t
|
||||
v3d_compute_rt_row_row_stride_128_bits(uint32_t tile_width,
|
||||
uint32_t bpp)
|
||||
|
|
|
|||
|
|
@ -98,6 +98,15 @@ log2_tile_size(uint32_t size)
|
|||
}
|
||||
}
|
||||
|
||||
/* Computes the tile allocation buffer size (*tile_alloc_size) and the
 * tile state buffer size (*tile_state_size), both in bytes, for a job
 * with the given number of layers, tiles and draws. The tile allocation
 * size is aligned to page_size.
 */
void v3d_tile_alloc_sizes(uint32_t layers, uint32_t tiles_x, uint32_t tiles_y,
                          uint32_t draws, uint32_t page_size,
                          uint32_t *tile_alloc_size,
                          uint32_t *tile_state_size);
|
||||
|
||||
/* NOTE(review): judging by the name, this returns a render target row
 * stride expressed in 128-bit units for the given tile width and bpp;
 * confirm against the definition.
 */
uint32_t v3d_compute_rt_row_row_stride_128_bits(uint32_t tile_width,
                                                uint32_t bpp);
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue