v3dv: share zero-fill TFU staging BO at device level

The TFU stride-0 fill path allocates a 64 KiB staging BO (V3D_TFU_MAX_DIM * cpp = 16384 * 4), maps it, fills it with the pattern, and caches it on the command buffer. For non-zero patterns the per-cmd-buffer cache works well, but WebGPU/Dawn workloads issue many zero-fills (lazy buffer init) across separate command buffers, so the cache misses almost every time and each fill pays for a fresh alloc + mmap + memcpy. Add a device-wide staging BO held in v3dv_device::meta.tfu_fill_zero, lazily allocated under meta.mtx and used whenever data == 0. The BO is read-only after init, so it can be shared across queues without extra synchronization, and it is freed in destroy_device_meta. Measured on a Dawn/WebGPU zero-fill-heavy workload (RPi5, ~60 meta_fill_buffer calls, ~218 MiB total, all zero-fills): before: TFU branch total 7.328 ms, avg 115.55 us/call; after: TFU branch total 0.296 ms, avg 4.78 us/call (~24x). Non-zero patterns continue to use the per-cmd-buffer cache. Assisted-by: Claude Opus 4.7 Reviewed-by: Iago Toral Quiroga <itoral@igalia.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/41725>
2026-06-03 17:38:25 +02:00 · 2026-05-25 21:45:17 +02:00 · 2026-05-25 21:45:17 +02:00 · ae604b4bdd
commit ae604b4bdd
parent 2a62490fa7
3 changed files with 66 additions and 25 deletions
--- a/src/broadcom/vulkan/v3dv_device.c
+++ b/src/broadcom/vulkan/v3dv_device.c
@@ -1899,6 +1899,10 @@ init_device_meta(struct v3dv_device *device)
 static void
 destroy_device_meta(struct v3dv_device *device)
 {
+   if (device->meta.tfu_fill_zero.src_bo) {
+      v3dv_bo_free(device, device->meta.tfu_fill_zero.src_bo);
+      device->meta.tfu_fill_zero.src_bo = NULL;
+   }
   mtx_destroy(&device->meta.mtx);
   v3dv_meta_clear_finish(device);
   v3dv_meta_blit_finish(device);
--- a/src/broadcom/vulkan/v3dv_device.h
+++ b/src/broadcom/vulkan/v3dv_device.h
@@ -290,6 +290,13 @@ struct v3dv_device {
         VkPipelineLayout p_layout;
         struct hash_table *cache[3]; /* v3dv_meta_texel_buffer_copy_pipeline for 1d, 2d, 3d */
      } texel_buffer_copy;
+      /* Device-wide staging BO pre-filled with zeros, used by TFU stride-0
+       * fill (vkCmdFillBuffer) when data == 0. Lazily allocated under
+       * meta.mtx; freed in destroy_device_meta.
+       */
+      struct {
+         struct v3dv_bo *src_bo;
+      } tfu_fill_zero;
   } meta;

   struct v3dv_bo_cache {
--- a/src/broadcom/vulkan/v3dvx_meta_common.c
+++ b/src/broadcom/vulkan/v3dvx_meta_common.c
@@ -1529,33 +1529,63 @@ meta_fill_buffer_tfu_stride0(struct v3dv_cmd_buffer *cmd_buffer,
      v3dX(get_format)(VK_FORMAT_R8G8B8A8_UINT);
   assert(format && format->plane_count == 1);

-   /* Get or create the cached staging BO for this command buffer.
-    * Always allocate for V3D_TFU_MAX_DIM width so we can reuse the
-    * same staging BO regardless of fill size. The BO is owned by the
-    * command buffer's private_objs list; the meta.tfu_fill.src_bo
-    * pointer is just a cache slot for reuse and does not own the BO.
+   /* Pick the staging BO that backs the TFU stride-0 source row. There
+    * are two sources, in order of preference:
+    *   1. data == 0: device-wide shared BO pre-filled with zeros. Lazily
+    *      allocated under meta.mtx. Zero is the common case (Vulkan
+    *      implementations zero-init a lot of resources, and WebGPU lazy
+    *      buffer init issues many zero-fills across separate command
+    *      buffers), so a device-wide BO removes the alloc+map+memcpy from
+    *      those hot paths.
+    *   2. per-cmd-buffer cached BO (data matches a previous fill in this
+    *      cmd buffer). On miss, a fresh BO is allocated, mapped, filled
+    *      with the pattern, and cached.
    */
-   struct v3dv_bo *src_bo = cmd_buffer->meta.tfu_fill.src_bo;
-   if (!src_bo || cmd_buffer->meta.tfu_fill.data != data) {
-      uint32_t src_size = V3D_TFU_MAX_DIM * cpp;
-      src_bo = v3dv_bo_alloc(device, src_size, "tfu_fill_src", true);
-      if (!src_bo) {
-         v3dv_flag_oom(cmd_buffer, NULL);
-         return;
-      }
-      if (!v3dv_bo_map(device, src_bo, src_size)) {
-         v3dv_bo_free(device, src_bo);
-         v3dv_flag_oom(cmd_buffer, NULL);
-         return;
-      }
-      uint32_t *map = (uint32_t *)src_bo->map;
-      for (uint32_t i = 0; i < V3D_TFU_MAX_DIM; i++)
-         map[i] = data;
+   const uint32_t src_size = V3D_TFU_MAX_DIM * cpp;
+   struct v3dv_bo *src_bo = NULL;

-      v3dv_cmd_buffer_add_private_obj(
-         cmd_buffer, (uint64_t)(uintptr_t)src_bo, v3dv_cmd_buffer_destroy_bo_cb);
-      cmd_buffer->meta.tfu_fill.src_bo = src_bo;
-      cmd_buffer->meta.tfu_fill.data = data;
+   if (data == 0) {
+      mtx_lock(&device->meta.mtx);
+      struct v3dv_bo *zero_bo = device->meta.tfu_fill_zero.src_bo;
+      if (!zero_bo) {
+         zero_bo = v3dv_bo_alloc(device, src_size, "tfu_fill_zero", true);
+         if (zero_bo && !v3dv_bo_map(device, zero_bo, src_size)) {
+            v3dv_bo_free(device, zero_bo);
+            zero_bo = NULL;
+         }
+         if (zero_bo) {
+            memset(zero_bo->map, 0, src_size);
+            device->meta.tfu_fill_zero.src_bo = zero_bo;
+         }
+      }
+      mtx_unlock(&device->meta.mtx);
+      if (!zero_bo) {
+         v3dv_flag_oom(cmd_buffer, NULL);
+         return;
+      }
+      src_bo = zero_bo;
+   } else {
+      src_bo = cmd_buffer->meta.tfu_fill.src_bo;
+      if (!src_bo || cmd_buffer->meta.tfu_fill.data != data) {
+         src_bo = v3dv_bo_alloc(device, src_size, "tfu_fill_src", true);
+         if (!src_bo) {
+            v3dv_flag_oom(cmd_buffer, NULL);
+            return;
+         }
+         if (!v3dv_bo_map(device, src_bo, src_size)) {
+            v3dv_bo_free(device, src_bo);
+            v3dv_flag_oom(cmd_buffer, NULL);
+            return;
+         }
+         uint32_t *map = (uint32_t *)src_bo->map;
+         for (uint32_t i = 0; i < V3D_TFU_MAX_DIM; i++)
+            map[i] = data;
+
+         v3dv_cmd_buffer_add_private_obj(
+            cmd_buffer, (uint64_t)(uintptr_t)src_bo, v3dv_cmd_buffer_destroy_bo_cb);
+         cmd_buffer->meta.tfu_fill.src_bo = src_bo;
+         cmd_buffer->meta.tfu_fill.data = data;
+      }
   }

   uint32_t remaining = num_pixels;