mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-06-03 17:38:25 +02:00
v3dv: share zero-fill TFU staging BO at device level
The TFU stride-0 fill path allocates a 64 KiB staging BO (V3D_TFU_MAX_DIM * cpp = 16384 * 4), maps it, fills it with the pattern, and caches it on the command buffer. For non-zero patterns the per-cmd-buffer cache works well, but WebGPU/Dawn workloads issue many zero-fills (lazy buffer init) across separate command buffers, so the cache misses almost every time and each fill pays for a fresh alloc + mmap + memcpy. Add a device-wide staging BO held in v3dv_device::meta.tfu_fill_zero, lazily allocated under meta.mtx and used whenever data == 0. The BO is read-only after init so it can be shared across queues without extra synchronization, and it is freed in destroy_device_meta. Measured on a Dawn/WebGPU zero-fill-heavy workload (RPi5, ~60 meta_fill_buffer calls, ~218 MiB total, all zero-fills): before: TFU branch total 7.328 ms, avg 115.55 us/call after: TFU branch total 0.296 ms, avg 4.78 us/call (~24x) Non-zero patterns continue to use the per-cmd-buffer cache. Assisted-by: Claude Opus 4.7 Reviewed-by: Iago Toral Quiroga <itoral@igalia.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/41725>
This commit is contained in:
parent
2a62490fa7
commit
ae604b4bdd
3 changed files with 66 additions and 25 deletions
|
|
@ -1899,6 +1899,10 @@ init_device_meta(struct v3dv_device *device)
|
|||
static void
|
||||
destroy_device_meta(struct v3dv_device *device)
|
||||
{
|
||||
if (device->meta.tfu_fill_zero.src_bo) {
|
||||
v3dv_bo_free(device, device->meta.tfu_fill_zero.src_bo);
|
||||
device->meta.tfu_fill_zero.src_bo = NULL;
|
||||
}
|
||||
mtx_destroy(&device->meta.mtx);
|
||||
v3dv_meta_clear_finish(device);
|
||||
v3dv_meta_blit_finish(device);
|
||||
|
|
|
|||
|
|
@ -290,6 +290,13 @@ struct v3dv_device {
|
|||
VkPipelineLayout p_layout;
|
||||
struct hash_table *cache[3]; /* v3dv_meta_texel_buffer_copy_pipeline for 1d, 2d, 3d */
|
||||
} texel_buffer_copy;
|
||||
/* Device-wide staging BO pre-filled with zeros, used by TFU stride-0
|
||||
* fill (vkCmdFillBuffer) when data == 0. Lazily allocated under
|
||||
* meta.mtx; freed in destroy_device_meta.
|
||||
*/
|
||||
struct {
|
||||
struct v3dv_bo *src_bo;
|
||||
} tfu_fill_zero;
|
||||
} meta;
|
||||
|
||||
struct v3dv_bo_cache {
|
||||
|
|
|
|||
|
|
@ -1529,33 +1529,63 @@ meta_fill_buffer_tfu_stride0(struct v3dv_cmd_buffer *cmd_buffer,
|
|||
v3dX(get_format)(VK_FORMAT_R8G8B8A8_UINT);
|
||||
assert(format && format->plane_count == 1);
|
||||
|
||||
/* Get or create the cached staging BO for this command buffer.
|
||||
* Always allocate for V3D_TFU_MAX_DIM width so we can reuse the
|
||||
* same staging BO regardless of fill size. The BO is owned by the
|
||||
* command buffer's private_objs list; the meta.tfu_fill.src_bo
|
||||
* pointer is just a cache slot for reuse and does not own the BO.
|
||||
/* To pick the staging BO that backs the TFU stride-0 source row. There
|
||||
* are two sources, in order of preference:
|
||||
* 1. data == 0: device-wide shared BO pre-filled with zeros. Lazily
|
||||
* allocated under meta.mtx. Zero is the common case (Vulkan
|
||||
* implementations zero-init a lot of resources, and WebGPU lazy
|
||||
* buffer init issues many zero-fills across separate command
|
||||
* buffers), so a device-wide BO removes the alloc+map+memcpy from
|
||||
* those hot paths.
|
||||
* 2. per-cmd-buffer cached BO (data matches a previous fill in this
|
||||
* cmd buffer). On miss, a fresh BO is allocated, mapped, filled
|
||||
* with the pattern, and cached.
|
||||
*/
|
||||
struct v3dv_bo *src_bo = cmd_buffer->meta.tfu_fill.src_bo;
|
||||
if (!src_bo || cmd_buffer->meta.tfu_fill.data != data) {
|
||||
uint32_t src_size = V3D_TFU_MAX_DIM * cpp;
|
||||
src_bo = v3dv_bo_alloc(device, src_size, "tfu_fill_src", true);
|
||||
if (!src_bo) {
|
||||
v3dv_flag_oom(cmd_buffer, NULL);
|
||||
return;
|
||||
}
|
||||
if (!v3dv_bo_map(device, src_bo, src_size)) {
|
||||
v3dv_bo_free(device, src_bo);
|
||||
v3dv_flag_oom(cmd_buffer, NULL);
|
||||
return;
|
||||
}
|
||||
uint32_t *map = (uint32_t *)src_bo->map;
|
||||
for (uint32_t i = 0; i < V3D_TFU_MAX_DIM; i++)
|
||||
map[i] = data;
|
||||
const uint32_t src_size = V3D_TFU_MAX_DIM * cpp;
|
||||
struct v3dv_bo *src_bo = NULL;
|
||||
|
||||
v3dv_cmd_buffer_add_private_obj(
|
||||
cmd_buffer, (uint64_t)(uintptr_t)src_bo, v3dv_cmd_buffer_destroy_bo_cb);
|
||||
cmd_buffer->meta.tfu_fill.src_bo = src_bo;
|
||||
cmd_buffer->meta.tfu_fill.data = data;
|
||||
if (data == 0) {
|
||||
mtx_lock(&device->meta.mtx);
|
||||
struct v3dv_bo *zero_bo = device->meta.tfu_fill_zero.src_bo;
|
||||
if (!zero_bo) {
|
||||
zero_bo = v3dv_bo_alloc(device, src_size, "tfu_fill_zero", true);
|
||||
if (zero_bo && !v3dv_bo_map(device, zero_bo, src_size)) {
|
||||
v3dv_bo_free(device, zero_bo);
|
||||
zero_bo = NULL;
|
||||
}
|
||||
if (zero_bo) {
|
||||
memset(zero_bo->map, 0, src_size);
|
||||
device->meta.tfu_fill_zero.src_bo = zero_bo;
|
||||
}
|
||||
}
|
||||
mtx_unlock(&device->meta.mtx);
|
||||
if (!zero_bo) {
|
||||
v3dv_flag_oom(cmd_buffer, NULL);
|
||||
return;
|
||||
}
|
||||
src_bo = zero_bo;
|
||||
} else {
|
||||
src_bo = cmd_buffer->meta.tfu_fill.src_bo;
|
||||
if (!src_bo || cmd_buffer->meta.tfu_fill.data != data) {
|
||||
src_bo = v3dv_bo_alloc(device, src_size, "tfu_fill_src", true);
|
||||
if (!src_bo) {
|
||||
v3dv_flag_oom(cmd_buffer, NULL);
|
||||
return;
|
||||
}
|
||||
if (!v3dv_bo_map(device, src_bo, src_size)) {
|
||||
v3dv_bo_free(device, src_bo);
|
||||
v3dv_flag_oom(cmd_buffer, NULL);
|
||||
return;
|
||||
}
|
||||
uint32_t *map = (uint32_t *)src_bo->map;
|
||||
for (uint32_t i = 0; i < V3D_TFU_MAX_DIM; i++)
|
||||
map[i] = data;
|
||||
|
||||
v3dv_cmd_buffer_add_private_obj(
|
||||
cmd_buffer, (uint64_t)(uintptr_t)src_bo, v3dv_cmd_buffer_destroy_bo_cb);
|
||||
cmd_buffer->meta.tfu_fill.src_bo = src_bo;
|
||||
cmd_buffer->meta.tfu_fill.data = data;
|
||||
}
|
||||
}
|
||||
|
||||
uint32_t remaining = num_pixels;
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue