panvk: Implement vkCmdFillBuffer with panlib kernels
Some checks are pending
macOS-CI / macOS-CI (dri) (push) Waiting to run
macOS-CI / macOS-CI (xlib) (push) Waiting to run

Replace the vk_meta_fill_buffer call with direct panlib precomp
dispatches: a KERNEL(32) uint4 bulk path for 16-byte-aligned fills and a
KERNEL(32) uint32 path otherwise, each with a KERNEL(1) scalar tail for
sub-workgroup remainders.

gpu-ratemeter vk.bufbw on Mali-G610 MC4 shows a 1.15-1.18x median
speedup across alignment classes and roughly 5x on fills <= 512 B,
thanks to the removed pipeline bind / descriptor-set setup that
vk_meta_fill_buffer pays per call.

Signed-off-by: Christian Gmeiner <cgmeiner@igalia.com>
Reviewed-by: Lars-Ivar Hesselberg Simonsen <lars-ivar.simonsen@arm.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/41079>
This commit is contained in:
Christian Gmeiner 2026-04-21 10:55:47 +02:00 committed by Marge Bot
parent ac52fb569a
commit 3d7d2115f8
3 changed files with 76 additions and 6 deletions

View file

@ -0,0 +1,32 @@
/*
* Copyright 2026 Google LLC
* Copyright 2024 Valve Corporation
* SPDX-License-Identifier: MIT
*/
#include "compiler/libcl/libcl.h"
#if PAN_ARCH >= 6
KERNEL(32)
panlib_fill(global uint32_t *address, uint32_t value)
{
   /* Bulk fill: each of the 32 invocations in the workgroup writes one
    * 32-bit word of the fill pattern at its global index.
    */
   global uint32_t *slot = address + cl_global_id.x;
   *slot = value;
}
KERNEL(32)
panlib_fill_uint4(global uint4 *address, uint a, uint b, uint c, uint d)
{
   /* Wide bulk fill: build the 16-byte pattern once, then store it as a
    * single uint4 per invocation (requires 16-byte-aligned destination).
    */
   const uint4 pattern = (uint4)(a, b, c, d);
   address[cl_global_id.x] = pattern;
}
KERNEL(1)
panlib_fill_scalar(global uint32_t *address, uint32_t value)
{
   /* Tail fill: single-invocation workgroups cover the remainder that is
    * smaller than one 32-wide bulk workgroup, one word per dispatch slot.
    */
   global uint32_t *slot = address + cl_global_id.x;
   *slot = value;
}
KERNEL(1)
panlib_fill_uint4_scalar(global uint4 *address, uint a, uint b, uint c, uint d)
{
   /* Wide tail fill: same 16-byte pattern as the bulk uint4 kernel, but
    * with single-invocation workgroups for sub-workgroup remainders.
    */
   const uint4 pattern = (uint4)(a, b, c, d);
   address[cl_global_id.x] = pattern;
}
#endif

View file

@ -3,6 +3,7 @@
libpan_shader_files = files(
'copy.cl',
'query_pool.cl',
'draw_helper.cl',
'indirect_dispatch.cl',

View file

@ -4,6 +4,7 @@
* SPDX-License-Identifier: MIT
*/
#include "panvk_buffer.h"
#include "panvk_cmd_meta.h"
#include "panvk_entrypoints.h"
#include "panvk_meta.h"
@ -469,13 +470,49 @@ panvk_per_arch(CmdFillBuffer)(VkCommandBuffer commandBuffer, VkBuffer dstBuffer,
uint32_t data)
{
/* NOTE(review): this span is a rendered diff hunk with the +/- markers
 * stripped — it appears to interleave the REMOVED vk_meta_fill_buffer
 * path with the ADDED panlib path; read it against the original diff
 * before treating it as the final function body. */
VK_FROM_HANDLE(panvk_cmd_buffer, cmdbuf, commandBuffer);
struct panvk_device *dev = to_panvk_device(cmdbuf->vk.base.device);
/* Old (removed) path state: saved compute context for the meta dispatch. */
struct panvk_cmd_meta_compute_save_ctx save = {0};
VK_FROM_HANDLE(panvk_buffer, buffer, dstBuffer);
struct panvk_physical_device *phys_dev =
to_panvk_physical_device(cmdbuf->vk.base.device->physical);
/* Old (removed) path: delegate the fill to the shared vk_meta helper. */
meta_compute_start(cmdbuf, &save);
vk_meta_fill_buffer(&cmdbuf->vk, &dev->meta, dstBuffer, dstOffset, fillSize,
data);
meta_compute_end(cmdbuf, &save);
/* New path: resolve the destination GPU address and clamp the fill size.
 * Masking with ~3 rounds the range down to a whole number of 32-bit
 * words, matching vkCmdFillBuffer's multiple-of-4 semantics. */
uint64_t addr = panvk_buffer_gpu_ptr(buffer, dstOffset);
uint64_t range = panvk_buffer_range(buffer, dstOffset, fillSize) & ~3ULL;
if (!range)
return;
const uint32_t max_wg = phys_dev->vk.properties.maxComputeWorkGroupCount[0];
struct panvk_precomp_ctx ctx = panvk_per_arch(precomp_cs)(cmdbuf);
/* Use the 16-byte uint4 kernels only when both the start address and the
 * total range are 16-byte aligned; otherwise fall back to 4-byte words. */
const bool uint4_path =
util_is_aligned(addr, 16) && util_is_aligned(range, 16);
const uint32_t elem_size = uint4_path ? 16 : 4;
/* Bytes covered by one 32-invocation bulk workgroup. */
const uint32_t wg_bytes = 32 * elem_size;
/* Bulk loop: dispatch full 32-wide workgroups, capped per iteration at
 * maxComputeWorkGroupCount[0]; loop again if the fill exceeds that cap. */
while (range >= wg_bytes) {
const uint32_t wgs = MIN2(range / wg_bytes, max_wg);
const uint64_t bulk = (uint64_t)wgs * wg_bytes;
if (uint4_path) {
panlib_fill_uint4(&ctx, panlib_1d(wgs), PANLIB_BARRIER_NONE, addr,
data, data, data, data);
} else {
panlib_fill(&ctx, panlib_1d(wgs), PANLIB_BARRIER_NONE, addr, data);
}
addr += bulk;
range -= bulk;
}
/* Tail: fewer than 32 elements remain — use the KERNEL(1) scalar variants
 * with one workgroup per element. */
if (range) {
const uint32_t tail = range / elem_size;
if (uint4_path) {
panlib_fill_uint4_scalar(&ctx, panlib_1d(tail), PANLIB_BARRIER_NONE,
addr, data, data, data, data);
} else {
panlib_fill_scalar(&ctx, panlib_1d(tail), PANLIB_BARRIER_NONE, addr,
data);
}
}
}
VKAPI_ATTR void VKAPI_CALL