broadcom/common: add tile alloc block size macros and sizing helper

Add V3D_TILE_ALLOC_INITIAL_BLOCK_SIZE = 128 and V3D_TILE_ALLOC_OVERFLOW_BLOCK_SIZE = 64 to v3d_limits.h. The corresponding _ENUM macros provide the 2-bit hardware encoding for the TILE_BINNING_MODE_CFG packets. The previous implicit 64B initial blocks were too small: a single draw call emits ~88 bytes of per-tile BCL state, immediately overflowing into continuation blocks. 128B initial blocks avoid the first continuation allocation for simple single-draw passes. Add v3d_tile_alloc_sizes() to v3d_util with the full tile alloc BO and TSDA sizing logic. It uses the 128B initial blocks, and the tile alloc headroom becomes proportional to the number of draws and the size of the initial block allocation, capped at the previous fixed allocation. As a result, jobs with 0 or 1 draw calls (blits/fills) need dramatically less headroom. The draw-proportional formula replaces a flat 512 KB continuation pool: headroom = MIN2((tiles_size * draw_count) / 2, 512 KB). Benchmarked on RPi5 (V3D 7.1) against GfxBench GL tests and apitrace replays at 1080p. Tile-alloc memory reduction versus the flat 512 KB headroom (taking into account the 256 KB kernel allocation per OOM): GfxBench (5 benchmarks): -45% to -70% reduction, OOM at or below baseline. Apitrace (19 traces): -4% to -77% reduction on 20/24 traces. No FPS regressions observed on any workload. Reviewed-by: Maíra Canal <mcanal@igalia.com> Reviewed-by: Iago Toral Quiroga <itoral@igalia.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/40554>
2026-05-07 04:58:05 +02:00 · 2026-03-19 10:37:40 +01:00 · 2026-03-19 10:37:40 +01:00 · 47fa229605
commit 47fa229605
parent 4542982062
3 changed files with 71 additions and 0 deletions
--- a/src/broadcom/common/v3d_limits.h
+++ b/src/broadcom/common/v3d_limits.h
@@ -71,4 +71,20 @@

 #define V3D_MAX_VERTEX_ATTRIB_DIVISOR 0xffff

+/* Tile allocation block sizes for the PTB in bytes; the _ENUM macros
+ * give the encoding used by the TILE_BINNING_MODE_CFG /
+ * TILE_LIST_INITIAL_BLOCK_SIZE packets: bytes = 64 << enum (0 = 64B,
+ * 1 = 128B, 2 = 256B).  ">> 7" is only correct for those three sizes.
+ * Using 128B initial blocks avoids tile overflow for simple draws
+ * (a single draw emits ~88 bytes of state per tile).  64B continuation
+ * blocks reduce internal fragmentation in the tile allocation pool.
+ */
+#define V3D_TILE_ALLOC_INITIAL_BLOCK_SIZE  128
+#define V3D_TILE_ALLOC_OVERFLOW_BLOCK_SIZE  64
+
+#define V3D_TILE_ALLOC_INITIAL_BLOCK_SIZE_ENUM \
+        (V3D_TILE_ALLOC_INITIAL_BLOCK_SIZE >> 7)
+#define V3D_TILE_ALLOC_OVERFLOW_BLOCK_SIZE_ENUM \
+        (V3D_TILE_ALLOC_OVERFLOW_BLOCK_SIZE >> 7)
+
 #endif /* V3D_LIMITS_H */
--- a/src/broadcom/common/v3d_util.c
+++ b/src/broadcom/common/v3d_util.c
@@ -22,7 +22,9 @@
 */

 #include "v3d_util.h"
+#include "v3d_limits.h"
 #include "util/macros.h"
+#include "util/u_math.h"

 /* Choose a number of workgroups per supergroup that maximizes
 * lane occupancy. We can pack up to 16 workgroups into a supergroup.
@@ -266,6 +268,50 @@ v3d_internal_bpp_words(uint32_t internal_bpp)
        }
 }

+/* Compute the tile alloc BO size (*tile_alloc_size) and the tile state
+ * (TSDA) size (*tile_state_size), both in bytes, for a binning job.
+ */
+void
+v3d_tile_alloc_sizes(uint32_t layers,
+                     uint32_t tiles_x,
+                     uint32_t tiles_y,
+                     uint32_t draws,
+                     uint32_t page_size,
+                     uint32_t *tile_alloc_size,
+                     uint32_t *tile_state_size)
+{
+   assert(layers > 0);
+   /* The PTB requests the initial block size per tile at the start of
+    * binning; it must match the TILE_BINNING_MODE_CFG configuration.
+    */
+   uint32_t tiles_size =
+      layers * tiles_x * tiles_y * V3D_TILE_ALLOC_INITIAL_BLOCK_SIZE;
+
+   /* The PTB allocates in aligned 4k chunks after the initial setup. */
+   uint32_t alloc_size = align(tiles_size, 4096);
+
+   /* Include the first two chunk allocations that the PTB does so that
+    * we definitely clear the OOM condition before triggering one (the HW
+    * won't trigger OOM during the first allocations).
+    */
+   alloc_size += 8192;
+
+   /* Pre-allocate a continuation pool so the GPU rarely stalls on the
+    * kernel OOM handler.  Per-tile BCL state scales with tiles and draws,
+    * so estimate (tiles_size * draws) / 2, capped at 512 KB.  The product
+    * is computed in 64 bits: in 32 bits it wraps for large draw counts
+    * and the cap would no longer apply.  The formula assumes 128B initial
+    * blocks and must be adjusted if that size changes.
+    */
+   STATIC_ASSERT(V3D_TILE_ALLOC_INITIAL_BLOCK_SIZE == 128);
+   uint64_t headroom = ((uint64_t)tiles_size * draws) / 2;
+   alloc_size += (uint32_t)MIN2(headroom, (uint64_t)512 * 1024);
+   alloc_size = align(alloc_size, page_size);
+
+   *tile_alloc_size = alloc_size;
+   *tile_state_size = layers * tiles_x * tiles_y * 256;
+}
+
 uint32_t
 v3d_compute_rt_row_row_stride_128_bits(uint32_t tile_width,
                                       uint32_t bpp)
--- a/src/broadcom/common/v3d_util.h
+++ b/src/broadcom/common/v3d_util.h
@@ -98,6 +98,15 @@ log2_tile_size(uint32_t size)
        }
 }

+void
+v3d_tile_alloc_sizes(uint32_t layers, /* asserted > 0 by the definition */
+                     uint32_t tiles_x,
+                     uint32_t tiles_y,
+                     uint32_t draws, /* scales the continuation headroom */
+                     uint32_t page_size, /* final alloc size alignment */
+                     uint32_t *tile_alloc_size, /* out: tile alloc size */
+                     uint32_t *tile_state_size); /* out: tile state size */
+
 uint32_t
 v3d_compute_rt_row_row_stride_128_bits(uint32_t tile_width,
                                       uint32_t bpp);