From ae604b4bddae909a419a6c332ac830df3e2c6d5a Mon Sep 17 00:00:00 2001 From: Jose Maria Casanova Crespo Date: Mon, 25 May 2026 21:45:17 +0200 Subject: [PATCH] v3dv: share zero-fill TFU staging BO at device level The TFU stride-0 fill path allocates a 64 KiB staging BO (V3D_TFU_MAX_DIM * cpp = 16384 * 4), maps it, fills it with the pattern, and caches it on the command buffer. For non-zero patterns the per-cmd-buffer cache works well, but WebGPU/Dawn workloads issue many zero-fills (lazy buffer init) across separate command buffers, so the cache misses almost every time and each fill pays for a fresh alloc + mmap + memcpy. Add a device-wide staging BO held in v3dv_device::meta.tfu_fill_zero, lazily allocated under meta.mtx and used whenever data == 0. The BO is read-only after init so it can be shared across queues without extra synchronization, and it is freed in destroy_device_meta. Measured on a Dawn/WebGPU zero-fill-heavy workload (RPi5, ~60 meta_fill_buffer calls, ~218 MiB total, all zero-fills): before: TFU branch total 7.328 ms, avg 115.55 us/call after: TFU branch total 0.296 ms, avg 4.78 us/call (~24x) Non-zero patterns continue to use the per-cmd-buffer cache. 
Assisted-by: Claude Opus 4.7 Reviewed-by: Iago Toral Quiroga Part-of: --- src/broadcom/vulkan/v3dv_device.c | 4 ++ src/broadcom/vulkan/v3dv_device.h | 7 +++ src/broadcom/vulkan/v3dvx_meta_common.c | 80 +++++++++++++++++-------- 3 files changed, 66 insertions(+), 25 deletions(-) diff --git a/src/broadcom/vulkan/v3dv_device.c b/src/broadcom/vulkan/v3dv_device.c index 307c060e509..d153a15d982 100644 --- a/src/broadcom/vulkan/v3dv_device.c +++ b/src/broadcom/vulkan/v3dv_device.c @@ -1899,6 +1899,10 @@ init_device_meta(struct v3dv_device *device) static void destroy_device_meta(struct v3dv_device *device) { + if (device->meta.tfu_fill_zero.src_bo) { + v3dv_bo_free(device, device->meta.tfu_fill_zero.src_bo); + device->meta.tfu_fill_zero.src_bo = NULL; + } mtx_destroy(&device->meta.mtx); v3dv_meta_clear_finish(device); v3dv_meta_blit_finish(device); diff --git a/src/broadcom/vulkan/v3dv_device.h b/src/broadcom/vulkan/v3dv_device.h index 68635f868fc..e9be3f5295f 100644 --- a/src/broadcom/vulkan/v3dv_device.h +++ b/src/broadcom/vulkan/v3dv_device.h @@ -290,6 +290,13 @@ struct v3dv_device { VkPipelineLayout p_layout; struct hash_table *cache[3]; /* v3dv_meta_texel_buffer_copy_pipeline for 1d, 2d, 3d */ } texel_buffer_copy; + /* Device-wide staging BO pre-filled with zeros, used by TFU stride-0 + * fill (vkCmdFillBuffer) when data == 0. Lazily allocated under + * meta.mtx; freed in destroy_device_meta. + */ + struct { + struct v3dv_bo *src_bo; + } tfu_fill_zero; } meta; struct v3dv_bo_cache { diff --git a/src/broadcom/vulkan/v3dvx_meta_common.c b/src/broadcom/vulkan/v3dvx_meta_common.c index 347172f22fd..82cff9b55c6 100644 --- a/src/broadcom/vulkan/v3dvx_meta_common.c +++ b/src/broadcom/vulkan/v3dvx_meta_common.c @@ -1529,33 +1529,63 @@ meta_fill_buffer_tfu_stride0(struct v3dv_cmd_buffer *cmd_buffer, v3dX(get_format)(VK_FORMAT_R8G8B8A8_UINT); assert(format && format->plane_count == 1); - /* Get or create the cached staging BO for this command buffer. 
- * Always allocate for V3D_TFU_MAX_DIM width so we can reuse the - * same staging BO regardless of fill size. The BO is owned by the - * command buffer's private_objs list; the meta.tfu_fill.src_bo - * pointer is just a cache slot for reuse and does not own the BO. + /* Pick the staging BO that backs the TFU stride-0 source row. There + * are two sources, in order of preference: + * 1. data == 0: device-wide shared BO pre-filled with zeros. Lazily + * allocated under meta.mtx. Zero is the common case (Vulkan + * implementations zero-init a lot of resources, and WebGPU lazy + * buffer init issues many zero-fills across separate command + * buffers), so a device-wide BO removes the alloc+map+fill from + * those hot paths. + * 2. per-cmd-buffer cached BO (data matches a previous fill in this + * cmd buffer). On miss, a fresh BO is allocated, mapped, filled + * with the pattern, and cached. */ - struct v3dv_bo *src_bo = cmd_buffer->meta.tfu_fill.src_bo; - if (!src_bo || cmd_buffer->meta.tfu_fill.data != data) { - uint32_t src_size = V3D_TFU_MAX_DIM * cpp; - src_bo = v3dv_bo_alloc(device, src_size, "tfu_fill_src", true); - if (!src_bo) { - v3dv_flag_oom(cmd_buffer, NULL); - return; - } - if (!v3dv_bo_map(device, src_bo, src_size)) { - v3dv_bo_free(device, src_bo); - v3dv_flag_oom(cmd_buffer, NULL); - return; - } - uint32_t *map = (uint32_t *)src_bo->map; - for (uint32_t i = 0; i < V3D_TFU_MAX_DIM; i++) - map[i] = data; + const uint32_t src_size = V3D_TFU_MAX_DIM * cpp; + struct v3dv_bo *src_bo = NULL; - v3dv_cmd_buffer_add_private_obj( - cmd_buffer, (uint64_t)(uintptr_t)src_bo, v3dv_cmd_buffer_destroy_bo_cb); - cmd_buffer->meta.tfu_fill.src_bo = src_bo; - cmd_buffer->meta.tfu_fill.data = data; + if (data == 0) { + mtx_lock(&device->meta.mtx); + struct v3dv_bo *zero_bo = device->meta.tfu_fill_zero.src_bo; + if (!zero_bo) { + zero_bo = v3dv_bo_alloc(device, src_size, "tfu_fill_zero", true); + if (zero_bo && !v3dv_bo_map(device, zero_bo, src_size)) { 
v3dv_bo_free(device, zero_bo); + zero_bo = NULL; + } + if (zero_bo) { + memset(zero_bo->map, 0, src_size); + device->meta.tfu_fill_zero.src_bo = zero_bo; + } + } + mtx_unlock(&device->meta.mtx); + if (!zero_bo) { + v3dv_flag_oom(cmd_buffer, NULL); + return; + } + src_bo = zero_bo; + } else { + src_bo = cmd_buffer->meta.tfu_fill.src_bo; + if (!src_bo || cmd_buffer->meta.tfu_fill.data != data) { + src_bo = v3dv_bo_alloc(device, src_size, "tfu_fill_src", true); + if (!src_bo) { + v3dv_flag_oom(cmd_buffer, NULL); + return; + } + if (!v3dv_bo_map(device, src_bo, src_size)) { + v3dv_bo_free(device, src_bo); + v3dv_flag_oom(cmd_buffer, NULL); + return; + } + uint32_t *map = (uint32_t *)src_bo->map; + for (uint32_t i = 0; i < V3D_TFU_MAX_DIM; i++) + map[i] = data; + + v3dv_cmd_buffer_add_private_obj( + cmd_buffer, (uint64_t)(uintptr_t)src_bo, v3dv_cmd_buffer_destroy_bo_cb); + cmd_buffer->meta.tfu_fill.src_bo = src_bo; + cmd_buffer->meta.tfu_fill.data = data; + } } uint32_t remaining = num_pixels;