mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-05-08 09:08:10 +02:00
panvk: Implement vkCmdFillBuffer with panlib kernels
Replace the vk_meta_fill_buffer call with direct panlib precomp dispatches: a KERNEL(32) uint4 bulk path for 16-byte-aligned fills and a KERNEL(32) uint32 path otherwise, each with a KERNEL(1) scalar tail for sub-workgroup remainders.

gpu-ratemeter vk.bufbw on Mali-G610 MC4 shows a 1.15-1.18x median speedup across alignment classes and roughly 5x on fills <= 512 B, thanks to the removed pipeline bind / descriptor-set setup that vk_meta_fill_buffer pays per call.

Signed-off-by: Christian Gmeiner <cgmeiner@igalia.com>
Reviewed-by: Lars-Ivar Hesselberg Simonsen <lars-ivar.simonsen@arm.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/41079>
This commit is contained in:
parent
ac52fb569a
commit
3d7d2115f8
3 changed files with 76 additions and 6 deletions
32
src/panfrost/libpan/copy.cl
Normal file
32
src/panfrost/libpan/copy.cl
Normal file
|
|
@ -0,0 +1,32 @@
|
||||||
|
/*
|
||||||
|
* Copyright 2026 Google LLC
|
||||||
|
* Copyright 2024 Valve Corporation
|
||||||
|
* SPDX-License-Identifier: MIT
|
||||||
|
*/
|
||||||
|
#include "compiler/libcl/libcl.h"
|
||||||
|
|
||||||
|
#if PAN_ARCH >= 6
|
||||||
|
/*
 * 32-bit fill kernel, declared with KERNEL(32).
 *
 * Each invocation stores `value` into the uint32_t element of `address`
 * selected by its global X id.
 */
KERNEL(32)
panlib_fill(global uint32_t *address, uint32_t value)
{
   global uint32_t *dst = &address[cl_global_id.x];

   *dst = value;
}
|
||||||
|
|
||||||
|
/*
 * 128-bit fill kernel, declared with KERNEL(32).
 *
 * Each invocation builds a uint4 from (a, b, c, d) and stores it into the
 * uint4 element of `address` selected by its global X id.
 */
KERNEL(32)
panlib_fill_uint4(global uint4 *address, uint a, uint b, uint c, uint d)
{
   const uint4 pattern = (uint4)(a, b, c, d);

   address[cl_global_id.x] = pattern;
}
|
||||||
|
|
||||||
|
/*
 * 32-bit fill kernel, declared with KERNEL(1).
 *
 * Same store as panlib_fill: each invocation writes `value` into the
 * uint32_t element of `address` selected by its global X id.
 */
KERNEL(1)
panlib_fill_scalar(global uint32_t *address, uint32_t value)
{
   global uint32_t *dst = &address[cl_global_id.x];

   *dst = value;
}
|
||||||
|
|
||||||
|
/*
 * 128-bit fill kernel, declared with KERNEL(1).
 *
 * Same store as panlib_fill_uint4: each invocation writes the uint4
 * (a, b, c, d) into the element of `address` selected by its global X id.
 */
KERNEL(1)
panlib_fill_uint4_scalar(global uint4 *address, uint a, uint b, uint c, uint d)
{
   const uint4 pattern = (uint4)(a, b, c, d);

   address[cl_global_id.x] = pattern;
}
|
||||||
|
#endif
|
||||||
|
|
@ -3,6 +3,7 @@
|
||||||
|
|
||||||
|
|
||||||
libpan_shader_files = files(
|
libpan_shader_files = files(
|
||||||
|
'copy.cl',
|
||||||
'query_pool.cl',
|
'query_pool.cl',
|
||||||
'draw_helper.cl',
|
'draw_helper.cl',
|
||||||
'indirect_dispatch.cl',
|
'indirect_dispatch.cl',
|
||||||
|
|
|
||||||
|
|
@ -4,6 +4,7 @@
|
||||||
* SPDX-License-Identifier: MIT
|
* SPDX-License-Identifier: MIT
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
#include "panvk_buffer.h"
|
||||||
#include "panvk_cmd_meta.h"
|
#include "panvk_cmd_meta.h"
|
||||||
#include "panvk_entrypoints.h"
|
#include "panvk_entrypoints.h"
|
||||||
#include "panvk_meta.h"
|
#include "panvk_meta.h"
|
||||||
|
|
@ -469,13 +470,49 @@ panvk_per_arch(CmdFillBuffer)(VkCommandBuffer commandBuffer, VkBuffer dstBuffer,
|
||||||
uint32_t data)
|
uint32_t data)
|
||||||
{
|
{
|
||||||
VK_FROM_HANDLE(panvk_cmd_buffer, cmdbuf, commandBuffer);
|
VK_FROM_HANDLE(panvk_cmd_buffer, cmdbuf, commandBuffer);
|
||||||
struct panvk_device *dev = to_panvk_device(cmdbuf->vk.base.device);
|
VK_FROM_HANDLE(panvk_buffer, buffer, dstBuffer);
|
||||||
struct panvk_cmd_meta_compute_save_ctx save = {0};
|
struct panvk_physical_device *phys_dev =
|
||||||
|
to_panvk_physical_device(cmdbuf->vk.base.device->physical);
|
||||||
|
|
||||||
meta_compute_start(cmdbuf, &save);
|
uint64_t addr = panvk_buffer_gpu_ptr(buffer, dstOffset);
|
||||||
vk_meta_fill_buffer(&cmdbuf->vk, &dev->meta, dstBuffer, dstOffset, fillSize,
|
uint64_t range = panvk_buffer_range(buffer, dstOffset, fillSize) & ~3ULL;
|
||||||
data);
|
if (!range)
|
||||||
meta_compute_end(cmdbuf, &save);
|
return;
|
||||||
|
|
||||||
|
const uint32_t max_wg = phys_dev->vk.properties.maxComputeWorkGroupCount[0];
|
||||||
|
struct panvk_precomp_ctx ctx = panvk_per_arch(precomp_cs)(cmdbuf);
|
||||||
|
|
||||||
|
const bool uint4_path =
|
||||||
|
util_is_aligned(addr, 16) && util_is_aligned(range, 16);
|
||||||
|
const uint32_t elem_size = uint4_path ? 16 : 4;
|
||||||
|
const uint32_t wg_bytes = 32 * elem_size;
|
||||||
|
|
||||||
|
while (range >= wg_bytes) {
|
||||||
|
const uint32_t wgs = MIN2(range / wg_bytes, max_wg);
|
||||||
|
const uint64_t bulk = (uint64_t)wgs * wg_bytes;
|
||||||
|
|
||||||
|
if (uint4_path) {
|
||||||
|
panlib_fill_uint4(&ctx, panlib_1d(wgs), PANLIB_BARRIER_NONE, addr,
|
||||||
|
data, data, data, data);
|
||||||
|
} else {
|
||||||
|
panlib_fill(&ctx, panlib_1d(wgs), PANLIB_BARRIER_NONE, addr, data);
|
||||||
|
}
|
||||||
|
|
||||||
|
addr += bulk;
|
||||||
|
range -= bulk;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (range) {
|
||||||
|
const uint32_t tail = range / elem_size;
|
||||||
|
|
||||||
|
if (uint4_path) {
|
||||||
|
panlib_fill_uint4_scalar(&ctx, panlib_1d(tail), PANLIB_BARRIER_NONE,
|
||||||
|
addr, data, data, data, data);
|
||||||
|
} else {
|
||||||
|
panlib_fill_scalar(&ctx, panlib_1d(tail), PANLIB_BARRIER_NONE, addr,
|
||||||
|
data);
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
VKAPI_ATTR void VKAPI_CALL
|
VKAPI_ATTR void VKAPI_CALL
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue