/*
 * Patch overview (descriptive text ahead of the first "diff --git" header).
 *
 * 1. New file src/panfrost/libpan/copy.cl, compiled only when PAN_ARCH >= 6,
 *    defining four fill kernels that each perform a single store to
 *    address[cl_global_id.x]:
 *      - panlib_fill / panlib_fill_scalar store the uint32_t `value`.
 *      - panlib_fill_uint4 / panlib_fill_uint4_scalar store the vector
 *        (uint4)(a, b, c, d).
 *    The first kernel of each pair is declared with KERNEL(32), the
 *    "_scalar" one with KERNEL(1).
 *    NOTE(review): KERNEL's argument is presumably the workgroup size in
 *    invocations (the macro is defined outside this patch) - confirm.
 *
 * 2. src/panfrost/libpan/meson.build: adds 'copy.cl' to libpan_shader_files.
 *
 * 3. src/panfrost/vulkan/panvk_vX_cmd_meta.c, panvk_per_arch(CmdFillBuffer):
 *    drops the vk_meta_fill_buffer() call and its surrounding
 *    meta_compute_start()/meta_compute_end() pair, and dispatches the
 *    kernels above instead:
 *      - addr comes from panvk_buffer_gpu_ptr(); range comes from
 *        panvk_buffer_range() with its low two bits cleared (& ~3ULL), so
 *        only whole 32-bit words are written. A zero range returns early.
 *      - The uint4 kernels are used only when BOTH addr and range are
 *        16-byte aligned; otherwise the uint32_t kernels are used
 *        (elem_size is 16 or 4 accordingly).
 *      - Bulk loop: while at least wg_bytes (32 * elem_size) remain, launch
 *        MIN2(range / wg_bytes, maxComputeWorkGroupCount[0]) grid units of
 *        the KERNEL(32) variant, then advance addr and shrink range by
 *        wgs * wg_bytes.
 *      - Tail: whatever is left (less than wg_bytes) is written by the
 *        KERNEL(1) variant with a grid of range / elem_size.
 *      - Every dispatch passes PANLIB_BARRIER_NONE.
 *    NOTE(review): the literal 32 in wg_bytes has to stay in sync with the
 *    KERNEL(32) declarations in copy.cl, and the arithmetic assumes
 *    panlib_1d() counts workgroups rather than invocations - verify against
 *    the panlib dispatch helpers.
 */
diff --git a/src/panfrost/libpan/copy.cl b/src/panfrost/libpan/copy.cl new file mode 100644 index 00000000000..c3ad0bd78e5 --- /dev/null +++ b/src/panfrost/libpan/copy.cl @@ -0,0 +1,32 @@ +/* + * Copyright 2026 Google LLC + * Copyright 2024 Valve Corporation + * SPDX-License-Identifier: MIT + */ +#include "compiler/libcl/libcl.h" + +#if PAN_ARCH >= 6 +KERNEL(32) +panlib_fill(global uint32_t *address, uint32_t value) +{ + address[cl_global_id.x] = value; +} + +KERNEL(32) +panlib_fill_uint4(global uint4 *address, uint a, uint b, uint c, uint d) +{ + address[cl_global_id.x] = (uint4)(a, b, c, d); +} + +KERNEL(1) +panlib_fill_scalar(global uint32_t *address, uint32_t value) +{ + address[cl_global_id.x] = value; +} + +KERNEL(1) +panlib_fill_uint4_scalar(global uint4 *address, uint a, uint b, uint c, uint d) +{ + address[cl_global_id.x] = (uint4)(a, b, c, d); +} +#endif diff --git a/src/panfrost/libpan/meson.build b/src/panfrost/libpan/meson.build index 9bf43632d28..734660b5735 100644 --- a/src/panfrost/libpan/meson.build +++ b/src/panfrost/libpan/meson.build @@ -3,6 +3,7 @@ libpan_shader_files = files( + 'copy.cl', 'query_pool.cl', 'draw_helper.cl', 'indirect_dispatch.cl', diff --git a/src/panfrost/vulkan/panvk_vX_cmd_meta.c b/src/panfrost/vulkan/panvk_vX_cmd_meta.c index 5899df728eb..bca7e716177 100644 --- a/src/panfrost/vulkan/panvk_vX_cmd_meta.c +++ b/src/panfrost/vulkan/panvk_vX_cmd_meta.c @@ -4,6 +4,7 @@ * SPDX-License-Identifier: MIT */ +#include "panvk_buffer.h" #include "panvk_cmd_meta.h" #include "panvk_entrypoints.h" #include "panvk_meta.h" @@ -469,13 +470,49 @@ panvk_per_arch(CmdFillBuffer)(VkCommandBuffer commandBuffer, VkBuffer dstBuffer, uint32_t data) { VK_FROM_HANDLE(panvk_cmd_buffer, cmdbuf, commandBuffer); - struct panvk_device *dev = to_panvk_device(cmdbuf->vk.base.device); - struct panvk_cmd_meta_compute_save_ctx save = {0}; + VK_FROM_HANDLE(panvk_buffer, buffer, dstBuffer); + struct panvk_physical_device *phys_dev = 
to_panvk_physical_device(cmdbuf->vk.base.device->physical); - meta_compute_start(cmdbuf, &save); - vk_meta_fill_buffer(&cmdbuf->vk, &dev->meta, dstBuffer, dstOffset, fillSize, - data); - meta_compute_end(cmdbuf, &save); + uint64_t addr = panvk_buffer_gpu_ptr(buffer, dstOffset); + uint64_t range = panvk_buffer_range(buffer, dstOffset, fillSize) & ~3ULL; + if (!range) + return; + + const uint32_t max_wg = phys_dev->vk.properties.maxComputeWorkGroupCount[0]; + struct panvk_precomp_ctx ctx = panvk_per_arch(precomp_cs)(cmdbuf); + + const bool uint4_path = + util_is_aligned(addr, 16) && util_is_aligned(range, 16); + const uint32_t elem_size = uint4_path ? 16 : 4; + const uint32_t wg_bytes = 32 * elem_size; + + while (range >= wg_bytes) { + const uint32_t wgs = MIN2(range / wg_bytes, max_wg); + const uint64_t bulk = (uint64_t)wgs * wg_bytes; + + if (uint4_path) { + panlib_fill_uint4(&ctx, panlib_1d(wgs), PANLIB_BARRIER_NONE, addr, + data, data, data, data); + } else { + panlib_fill(&ctx, panlib_1d(wgs), PANLIB_BARRIER_NONE, addr, data); + } + + addr += bulk; + range -= bulk; + } + + if (range) { + const uint32_t tail = range / elem_size; + + if (uint4_path) { + panlib_fill_uint4_scalar(&ctx, panlib_1d(tail), PANLIB_BARRIER_NONE, + addr, data, data, data, data); + } else { + panlib_fill_scalar(&ctx, panlib_1d(tail), PANLIB_BARRIER_NONE, addr, + data); + } + } } VKAPI_ATTR void VKAPI_CALL