panvk: Implement vkCmdFillBuffer with panlib kernels
Some checks are pending
macOS-CI / macOS-CI (dri) (push) Waiting to run
macOS-CI / macOS-CI (xlib) (push) Waiting to run

Replace the vk_meta_fill_buffer call with direct panlib precomp
dispatches: a KERNEL(32) uint4 bulk path for 16-byte-aligned fills and a
KERNEL(32) uint32 path otherwise, each with a KERNEL(1) scalar tail for
sub-workgroup remainders.

gpu-ratemeter vk.bufbw on Mali-G610 MC4 shows a 1.15-1.18x median
speedup across alignment classes and roughly 5x on fills <= 512 B,
thanks to the removed pipeline bind / descriptor-set setup that
vk_meta_fill_buffer pays per call.

Signed-off-by: Christian Gmeiner <cgmeiner@igalia.com>
Reviewed-by: Lars-Ivar Hesselberg Simonsen <lars-ivar.simonsen@arm.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/41079>
This commit is contained in:
Christian Gmeiner 2026-04-21 10:55:47 +02:00 committed by Marge Bot
parent ac52fb569a
commit 3d7d2115f8
3 changed files with 76 additions and 6 deletions

View file

@ -0,0 +1,32 @@
/*
* Copyright 2026 Google LLC
* Copyright 2024 Valve Corporation
* SPDX-License-Identifier: MIT
*/
#include "compiler/libcl/libcl.h"
#if PAN_ARCH >= 6
KERNEL(32)
panlib_fill(global uint32_t *address, uint32_t value)
{
   /* Bulk fill: each of the 32 invocations in the workgroup writes one
    * 32-bit word of the fill pattern at its global index.
    */
   global uint32_t *slot = address + cl_global_id.x;
   *slot = value;
}
KERNEL(32)
panlib_fill_uint4(global uint4 *address, uint a, uint b, uint c, uint d)
{
   /* Wide bulk fill: build the 16-byte pattern once, then store it as a
    * single uint4 per invocation (requires 16-byte-aligned destination).
    */
   const uint4 pattern = (uint4)(a, b, c, d);
   address[cl_global_id.x] = pattern;
}
KERNEL(1)
panlib_fill_scalar(global uint32_t *address, uint32_t value)
{
   /* Tail fill: single-invocation workgroups cover the remainder that is
    * smaller than one 32-wide bulk workgroup, one word per dispatch slot.
    */
   global uint32_t *slot = address + cl_global_id.x;
   *slot = value;
}
KERNEL(1)
panlib_fill_uint4_scalar(global uint4 *address, uint a, uint b, uint c, uint d)
{
   /* Wide tail fill: same 16-byte pattern as the bulk uint4 kernel, but
    * with single-invocation workgroups for sub-workgroup remainders.
    */
   const uint4 pattern = (uint4)(a, b, c, d);
   address[cl_global_id.x] = pattern;
}
#endif

View file

@ -3,6 +3,7 @@
libpan_shader_files = files(
'copy.cl',
'query_pool.cl',
'draw_helper.cl',
'indirect_dispatch.cl',

View file

@ -4,6 +4,7 @@
* SPDX-License-Identifier: MIT
*/
#include "panvk_buffer.h"
#include "panvk_cmd_meta.h"
#include "panvk_entrypoints.h"
#include "panvk_meta.h"
@ -469,13 +470,49 @@ panvk_per_arch(CmdFillBuffer)(VkCommandBuffer commandBuffer, VkBuffer dstBuffer,
uint32_t data)
{
/* NOTE(review): this span is a rendered diff hunk with the +/- markers
 * stripped — it appears to interleave the REMOVED vk_meta_fill_buffer
 * path with the ADDED panlib path; read it against the original diff
 * before treating it as the final function body. */
VK_FROM_HANDLE(panvk_cmd_buffer, cmdbuf, commandBuffer);
struct panvk_device *dev = to_panvk_device(cmdbuf->vk.base.device);
/* Old (removed) path state: saved compute context for the meta dispatch. */
struct panvk_cmd_meta_compute_save_ctx save = {0};
VK_FROM_HANDLE(panvk_buffer, buffer, dstBuffer);
struct panvk_physical_device *phys_dev =
to_panvk_physical_device(cmdbuf->vk.base.device->physical);
/* Old (removed) path: delegate the fill to the shared vk_meta helper. */
meta_compute_start(cmdbuf, &save);
vk_meta_fill_buffer(&cmdbuf->vk, &dev->meta, dstBuffer, dstOffset, fillSize,
data);
meta_compute_end(cmdbuf, &save);
/* New path: resolve the destination GPU address and clamp the fill size.
 * Masking with ~3 rounds the range down to a whole number of 32-bit
 * words, matching vkCmdFillBuffer's multiple-of-4 semantics. */
uint64_t addr = panvk_buffer_gpu_ptr(buffer, dstOffset);
uint64_t range = panvk_buffer_range(buffer, dstOffset, fillSize) & ~3ULL;
if (!range)
return;
const uint32_t max_wg = phys_dev->vk.properties.maxComputeWorkGroupCount[0];
struct panvk_precomp_ctx ctx = panvk_per_arch(precomp_cs)(cmdbuf);
/* Use the 16-byte uint4 kernels only when both the start address and the
 * total range are 16-byte aligned; otherwise fall back to 4-byte words. */
const bool uint4_path =
util_is_aligned(addr, 16) && util_is_aligned(range, 16);
const uint32_t elem_size = uint4_path ? 16 : 4;
/* Bytes covered by one 32-invocation bulk workgroup. */
const uint32_t wg_bytes = 32 * elem_size;
/* Bulk loop: dispatch full 32-wide workgroups, capped per iteration at
 * maxComputeWorkGroupCount[0]; loop again if the fill exceeds that cap. */
while (range >= wg_bytes) {
const uint32_t wgs = MIN2(range / wg_bytes, max_wg);
const uint64_t bulk = (uint64_t)wgs * wg_bytes;
if (uint4_path) {
panlib_fill_uint4(&ctx, panlib_1d(wgs), PANLIB_BARRIER_NONE, addr,
data, data, data, data);
} else {
panlib_fill(&ctx, panlib_1d(wgs), PANLIB_BARRIER_NONE, addr, data);
}
addr += bulk;
range -= bulk;
}
/* Tail: fewer than 32 elements remain — use the KERNEL(1) scalar variants
 * with one workgroup per element. */
if (range) {
const uint32_t tail = range / elem_size;
if (uint4_path) {
panlib_fill_uint4_scalar(&ctx, panlib_1d(tail), PANLIB_BARRIER_NONE,
addr, data, data, data, data);
} else {
panlib_fill_scalar(&ctx, panlib_1d(tail), PANLIB_BARRIER_NONE, addr,
data);
}
}
}
VKAPI_ATTR void VKAPI_CALL