mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-05-06 13:48:06 +02:00
panvk: Implement vkCmdFillBuffer with panlib kernels
Replace the vk_meta_fill_buffer call with direct panlib precomp dispatches: a KERNEL(32) uint4 bulk path for 16-byte-aligned fills and a KERNEL(32) uint32 path otherwise, each with a KERNEL(1) scalar tail for sub-workgroup remainders. gpu-ratemeter vk.bufbw on Mali-G610 MC4 shows a 1.15-1.18x median speedup across alignment classes and roughly 5x on fills <= 512 B, thanks to the removed pipeline bind / descriptor-set setup that vk_meta_fill_buffer pays per call. Signed-off-by: Christian Gmeiner <cgmeiner@igalia.com> Reviewed-by: Lars-Ivar Hesselberg Simonsen <lars-ivar.simonsen@arm.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/41079>
This commit is contained in:
parent
ac52fb569a
commit
3d7d2115f8
3 changed files with 76 additions and 6 deletions
32
src/panfrost/libpan/copy.cl
Normal file
32
src/panfrost/libpan/copy.cl
Normal file
|
|
@ -0,0 +1,32 @@
|
|||
/*
|
||||
* Copyright 2026 Google LLC
|
||||
* Copyright 2024 Valve Corporation
|
||||
* SPDX-License-Identifier: MIT
|
||||
*/
|
||||
#include "compiler/libcl/libcl.h"
|
||||
|
||||
#if PAN_ARCH >= 6
|
||||
/*
 * 32-bit fill kernel: each invocation stores `value` into the uint32_t
 * element of `address` selected by its global X id (cl_global_id.x).
 *
 * NOTE(review): KERNEL(32) presumably declares a 32-invocation workgroup;
 * the host-side caller sizes its dispatches as 32 elements per workgroup --
 * confirm against the KERNEL() definition in libcl.
 */
KERNEL(32)
|
||||
panlib_fill(global uint32_t *address, uint32_t value)
|
||||
{
|
||||
address[cl_global_id.x] = value;
|
||||
}
|
||||
|
||||
/*
 * 128-bit fill kernel: each invocation builds the vector (a, b, c, d) and
 * stores it into the uint4 element of `address` selected by its global X id
 * (cl_global_id.x), i.e. 16 bytes per invocation.
 *
 * NOTE(review): the store is a uint4 access, so `address` presumably has to
 * be 16-byte aligned -- confirm that every caller guarantees this.
 */
KERNEL(32)
|
||||
panlib_fill_uint4(global uint4 *address, uint a, uint b, uint c, uint d)
|
||||
{
|
||||
address[cl_global_id.x] = (uint4)(a, b, c, d);
|
||||
}
|
||||
|
||||
/*
 * Same store as the KERNEL(32) 32-bit fill kernel (address[cl_global_id.x] =
 * value), but declared with KERNEL(1).
 *
 * NOTE(review): KERNEL(1) presumably means one invocation per workgroup, so
 * a dispatch of N workgroups writes exactly N elements; this looks intended
 * for remainders smaller than a 32-element workgroup -- confirm.
 */
KERNEL(1)
|
||||
panlib_fill_scalar(global uint32_t *address, uint32_t value)
|
||||
{
|
||||
address[cl_global_id.x] = value;
|
||||
}
|
||||
|
||||
/*
 * Same store as the KERNEL(32) uint4 fill kernel (address[cl_global_id.x] =
 * (a, b, c, d), 16 bytes per invocation), but declared with KERNEL(1).
 *
 * NOTE(review): KERNEL(1) presumably means one invocation per workgroup, so
 * this looks intended for the uint4 remainder that does not fill a whole
 * 32-element workgroup -- confirm. The same 16-byte alignment assumption on
 * `address` applies here as well.
 */
KERNEL(1)
|
||||
panlib_fill_uint4_scalar(global uint4 *address, uint a, uint b, uint c, uint d)
|
||||
{
|
||||
address[cl_global_id.x] = (uint4)(a, b, c, d);
|
||||
}
|
||||
#endif
|
||||
|
|
@ -3,6 +3,7 @@
|
|||
|
||||
|
||||
libpan_shader_files = files(
|
||||
'copy.cl',
|
||||
'query_pool.cl',
|
||||
'draw_helper.cl',
|
||||
'indirect_dispatch.cl',
|
||||
|
|
|
|||
|
|
@ -4,6 +4,7 @@
|
|||
* SPDX-License-Identifier: MIT
|
||||
*/
|
||||
|
||||
#include "panvk_buffer.h"
|
||||
#include "panvk_cmd_meta.h"
|
||||
#include "panvk_entrypoints.h"
|
||||
#include "panvk_meta.h"
|
||||
|
|
@ -469,13 +470,49 @@ panvk_per_arch(CmdFillBuffer)(VkCommandBuffer commandBuffer, VkBuffer dstBuffer,
|
|||
uint32_t data)
|
||||
{
|
||||
VK_FROM_HANDLE(panvk_cmd_buffer, cmdbuf, commandBuffer);
|
||||
struct panvk_device *dev = to_panvk_device(cmdbuf->vk.base.device);
|
||||
struct panvk_cmd_meta_compute_save_ctx save = {0};
|
||||
VK_FROM_HANDLE(panvk_buffer, buffer, dstBuffer);
|
||||
struct panvk_physical_device *phys_dev =
|
||||
to_panvk_physical_device(cmdbuf->vk.base.device->physical);
|
||||
|
||||
meta_compute_start(cmdbuf, &save);
|
||||
vk_meta_fill_buffer(&cmdbuf->vk, &dev->meta, dstBuffer, dstOffset, fillSize,
|
||||
data);
|
||||
meta_compute_end(cmdbuf, &save);
|
||||
uint64_t addr = panvk_buffer_gpu_ptr(buffer, dstOffset);
|
||||
uint64_t range = panvk_buffer_range(buffer, dstOffset, fillSize) & ~3ULL;
|
||||
if (!range)
|
||||
return;
|
||||
|
||||
const uint32_t max_wg = phys_dev->vk.properties.maxComputeWorkGroupCount[0];
|
||||
struct panvk_precomp_ctx ctx = panvk_per_arch(precomp_cs)(cmdbuf);
|
||||
|
||||
const bool uint4_path =
|
||||
util_is_aligned(addr, 16) && util_is_aligned(range, 16);
|
||||
const uint32_t elem_size = uint4_path ? 16 : 4;
|
||||
const uint32_t wg_bytes = 32 * elem_size;
|
||||
|
||||
while (range >= wg_bytes) {
|
||||
const uint32_t wgs = MIN2(range / wg_bytes, max_wg);
|
||||
const uint64_t bulk = (uint64_t)wgs * wg_bytes;
|
||||
|
||||
if (uint4_path) {
|
||||
panlib_fill_uint4(&ctx, panlib_1d(wgs), PANLIB_BARRIER_NONE, addr,
|
||||
data, data, data, data);
|
||||
} else {
|
||||
panlib_fill(&ctx, panlib_1d(wgs), PANLIB_BARRIER_NONE, addr, data);
|
||||
}
|
||||
|
||||
addr += bulk;
|
||||
range -= bulk;
|
||||
}
|
||||
|
||||
if (range) {
|
||||
const uint32_t tail = range / elem_size;
|
||||
|
||||
if (uint4_path) {
|
||||
panlib_fill_uint4_scalar(&ctx, panlib_1d(tail), PANLIB_BARRIER_NONE,
|
||||
addr, data, data, data, data);
|
||||
} else {
|
||||
panlib_fill_scalar(&ctx, panlib_1d(tail), PANLIB_BARRIER_NONE, addr,
|
||||
data);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
VKAPI_ATTR void VKAPI_CALL
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue