panvk: Add timestamp copy

Reviewed-by: Lars-Ivar Hesselberg Simonsen <lars-ivar.simonsen@arm.com>
Acked-by: Boris Brezillon <boris.brezillon@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/34932>
This commit is contained in:
Christoph Pillmayer 2025-05-16 11:49:15 +00:00
parent 92c4dfe6ea
commit 369b3826fd
3 changed files with 217 additions and 0 deletions

View file

@ -1,11 +1,97 @@
/*
* Copyright 2024 Collabora Ltd.
* Copyright 2025 Arm Ltd.
* SPDX-License-Identifier: MIT
*/
#include "compiler/libcl/libcl.h"
#include "compiler/libcl/libcl_vk.h"
#include "genxml/gen_macros.h"
#include "vulkan/panvk_cmd_ts.h"
#if (PAN_ARCH >= 10)
/*
 * Reduce the per-subqueue timestamp reports of one query to a single value.
 *
 * Only the first report_count - 1 slots of report_addr are examined. A slot
 * contributes when its bit is set in sq_mask and the stored value is
 * non-zero. op selects whether the smallest (PANVK_QUERY_TS_OP_MIN) or the
 * largest (PANVK_QUERY_TS_OP_MAX) contributing value is returned; 0 is
 * returned when no slot contributed.
 *
 * *available is set to true only when every subqueue named in sq_mask
 * contributed a value.
 */
static inline uint64_t
compute_timestamp_query_result(global uint64_t *report_addr,
                               uint32_t report_count, uint32_t op,
                               uint32_t sq_mask, bool *available)
{
   const uint32_t value_count = report_count - 1;
   /* Subqueues we still expect a value from; bits are cleared as we go. */
   uint32_t pending = sq_mask;
   uint64_t result = 0;

   switch (op) {
   case PANVK_QUERY_TS_OP_MIN: {
      /* ULONG_MAX doubles as the "nothing seen yet" marker. */
      uint64_t lowest = ULONG_MAX;

      for (uint32_t sq = 0; sq < value_count; ++sq) {
         const uint32_t bit = 1u << sq;
         const uint64_t ts = report_addr[sq];

         if (ts == 0 || !(pending & bit))
            continue;

         lowest = min(lowest, ts);
         pending &= ~bit;
      }

      if (lowest != ULONG_MAX)
         result = lowest;
      break;
   }
   case PANVK_QUERY_TS_OP_MAX: {
      for (uint32_t sq = 0; sq < value_count; ++sq) {
         const uint32_t bit = 1u << sq;
         const uint64_t ts = report_addr[sq];

         if (ts == 0 || !(pending & bit))
            continue;

         result = max(result, ts);
         pending &= ~bit;
      }
      break;
   }
   default:
      unreachable("Invalid timestamp op");
      break;
   }

   /* Every expected subqueue has reported once no bit is left pending. */
   *available = pending == 0;
   return result;
}
/*
 * Resolve one timestamp query per invocation and write it to the destination.
 *
 * Invocation i handles query first_query + i: its reports are read from
 * pool_addr + query * query_stride and its output goes to
 * dst_addr + i * dst_stride. The last of the report_count slots is decoded
 * into an op and a subqueue mask; the remaining slots are reduced by
 * compute_timestamp_query_result().
 *
 * The value is written at index 0 when it is available or when
 * VK_QUERY_RESULT_PARTIAL_BIT is set; the availability is written at index 1
 * when VK_QUERY_RESULT_WITH_AVAILABILITY_BIT is set.
 *
 * available_addr and query_type are not read by this kernel.
 */
KERNEL(1)
panlib_copy_ts_query_result(uint64_t pool_addr, global uint32_t *available_addr,
                            uint32_t query_stride, uint32_t first_query,
                            uint32_t query_count, uint64_t dst_addr,
                            uint64_t dst_stride, uint32_t query_type,
                            uint32_t flags, uint32_t report_count)
{
   const uint32_t idx = cl_global_id.x;

   /* Invocations past the requested range have nothing to do. */
   if (idx >= query_count)
      return;

   const uint64_t src_offset = (uint64_t)(first_query + idx) * query_stride;
   global uint64_t *reports = (global uint64_t *)(pool_addr + src_offset);
   uintptr_t out = dst_addr + ((uint64_t)idx * dst_stride);

   /* The final slot is metadata rather than a timestamp. */
   const uint64_t ts_info = reports[report_count - 1];
   const uint32_t ts_op = panvk_timestamp_info_get_op(ts_info);
   const uint32_t expected_sqs = panvk_timestamp_info_get_sq_mask(ts_info);

   uint64_t value = 0;
   bool is_available = false;

   /* An empty mask means no subqueue was asked to write: leave the query
    * unavailable with a zero value. */
   if (expected_sqs != 0) {
      value = compute_timestamp_query_result(reports, report_count, ts_op,
                                             expected_sqs, &is_available);
   }

   if (is_available || (flags & VK_QUERY_RESULT_PARTIAL_BIT))
      vk_write_query(out, 0, flags, value);

   if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)
      vk_write_query(out, 1, flags, is_available);
}
#endif
#if (PAN_ARCH >= 6 && PAN_ARCH < 10)
static inline void
write_occlusion_query_result(uintptr_t dst_addr, int32_t idx, uint32_t flags,

View file

@ -18,6 +18,7 @@
#include "panvk_cmd_alloc.h"
#include "panvk_cmd_buffer.h"
#include "panvk_cmd_meta.h"
#include "panvk_cmd_precomp.h"
#include "panvk_cmd_ts.h"
#include "panvk_device.h"
#include "panvk_entrypoints.h"
@ -530,6 +531,88 @@ panvk_cmd_write_timestamp_query(struct panvk_cmd_buffer *cmd,
cmd->state.contains_timestamp_queries = true;
}
/*
 * Record the commands that copy timestamp query results into a buffer.
 *
 * The copy is done in two steps: every subqueue first snapshots its own
 * report slot of each query into a freshly allocated intermediate buffer,
 * then, after a barrier, a precompiled kernel is dispatched to combine the
 * snapshot into the application buffer at dst_buffer_addr (one result every
 * stride bytes, formatted according to flags).
 *
 * Nothing is recorded if the intermediate buffer cannot be allocated.
 */
static void
panvk_copy_timestamp_query_results(struct panvk_cmd_buffer *cmd,
                                   struct panvk_query_pool *pool,
                                   uint32_t first_query, uint32_t query_count,
                                   uint64_t dst_buffer_addr,
                                   VkDeviceSize stride,
                                   VkQueryResultFlags flags)
{
   /*
    * Step 1:
    * The point of this is to have each subqueue "save" its own value
    * into a buffer, such that any following query operations like reset
    * don't have to worry about destroying the result before other
    * subqueues are done with it.
    */
   uint32_t query_stride = pool->query_stride;

   /* Widen before multiplying so the product is not computed in 32 bits. */
   size_t buf_sz = (size_t)query_count * query_stride;

   struct pan_ptr intermediate_buf =
      panvk_cmd_alloc_dev_mem(cmd, desc, buf_sz, 16);

   /* Do not emit copies and a dispatch that would target address 0.
    * NOTE(review): assumes a zero GPU address means the allocation failed
    * and that the allocator already flagged the error on the command
    * buffer -- confirm against panvk_cmd_alloc_dev_mem. */
   if (!intermediate_buf.gpu)
      return;

   for (uint32_t sq = 0; sq < PANVK_SUBQUEUE_COUNT; ++sq) {
      struct cs_builder *b = panvk_get_cs_builder(cmd, sq);

      /* Each subqueue owns one 64-bit slot inside a query's report area. */
      uint32_t sq_offset = sq * sizeof(uint64_t);

      struct cs_index src = cs_scratch_reg64(b, 0);
      struct cs_index dst = cs_scratch_reg64(b, 2);
      struct cs_index tmp = cs_scratch_reg64(b, 4);
      struct cs_index tmp2 = cs_scratch_reg64(b, 6);

      /* Wait for STORE_STATEs to finish. */
      cs_wait_slot(b, SB_ID(LS));

      cs_move64_to(b, src, panvk_query_report_dev_addr(pool, first_query));
      cs_move64_to(b, dst, intermediate_buf.gpu);

      struct cs_index count = cs_scratch_reg32(b, 8);
      cs_move32_to(b, count, query_count);

      cs_while(b, MALI_CS_CONDITION_GREATER, count) {
         cs_load64_to(b, tmp, src, sq_offset);

         /* The info subqueue also carries over the 64-bit slot that follows
          * its own (presumably the per-query metadata slot). */
         if (sq == PANVK_QUERY_TS_INFO_SUBQUEUE) {
            assert(PANVK_QUERY_TS_INFO_SUBQUEUE == PANVK_SUBQUEUE_COUNT - 1);
            cs_load64_to(b, tmp2, src, sq_offset + 8);
         }

         cs_store64(b, tmp, dst, sq_offset);

         if (sq == PANVK_QUERY_TS_INFO_SUBQUEUE)
            cs_store64(b, tmp2, dst, sq_offset + 8);

         /* Advance both cursors to the next query. */
         cs_add64(b, src, src, query_stride);
         cs_add64(b, dst, dst, query_stride);
         cs_add32(b, count, count, -1);
      }
   }

   /* Make sure C waits for all copies to be done. */
   struct panvk_cs_deps deps = {0};

   deps.dst[PANVK_SUBQUEUE_COMPUTE].wait_subqueue_mask =
      BITFIELD_MASK(PANVK_SUBQUEUE_COUNT) & ~BITFIELD_BIT(PANVK_SUBQUEUE_COMPUTE);
   u_foreach_bit(i, deps.dst[PANVK_SUBQUEUE_COMPUTE].wait_subqueue_mask)
      deps.src[i].wait_sb_mask = SB_MASK(LS);
   panvk_per_arch(emit_barrier)(cmd, deps);

   /* Step 2: Copy from the intermediate into the application buffer. */
   const struct panlib_copy_ts_query_result_args push = {
      .pool_addr = intermediate_buf.gpu,
      .available_addr = panvk_query_available_dev_addr(pool, first_query),
      .query_stride = pool->query_stride,
      /* The intermediate buffer starts at first_query. */
      .first_query = 0,
      .query_count = query_count,
      .report_count = pool->reports_per_query,
      .dst_addr = dst_buffer_addr,
      .dst_stride = stride,
      .flags = flags,
   };

   struct panvk_precomp_ctx precomp_ctx = panvk_per_arch(precomp_cs)(cmd);
   panlib_copy_ts_query_result_struct(&precomp_ctx, panlib_1d(query_count),
                                      PANLIB_BARRIER_NONE, push);
}
VKAPI_ATTR void VKAPI_CALL
panvk_per_arch(CmdResetQueryPool)(VkCommandBuffer commandBuffer,
VkQueryPool queryPool, uint32_t firstQuery,
@ -627,6 +710,13 @@ panvk_per_arch(CmdCopyQueryPoolResults)(
dst_buffer_addr, stride, flags);
break;
}
#if PAN_ARCH >= 10
/* Timestamp pools are only handled here when PAN_ARCH >= 10; the whole
 * request is forwarded to the timestamp-specific copy helper. */
case VK_QUERY_TYPE_TIMESTAMP: {
panvk_copy_timestamp_query_results(cmd, pool, firstQuery, queryCount,
dst_buffer_addr, stride, flags);
break;
}
#endif
default:
unreachable("Unsupported query type");
}

View file

@ -177,6 +177,39 @@ cpu_write_occlusion_query_result(void *dst, uint32_t idx,
cpu_write_query_result(dst, idx, flags, result);
}
#if PAN_ARCH >= 10
/*
 * Combine the timestamp reports of one query on the CPU and store the result.
 *
 * src holds input_value_count reports. The last one is decoded into an
 * operation (min or max) and a subqueue mask; the preceding ones are
 * candidate values. A candidate is used only when its bit is set in the mask
 * and its value is non-zero. If no candidate is used the result is 0.
 *
 * The result is written through cpu_write_query_result() at position idx of
 * dst, using flags.
 */
static void
cpu_write_timestamp_query_result(void *dst, uint32_t idx,
                                 VkQueryResultFlags flags,
                                 const struct panvk_query_report *src,
                                 unsigned input_value_count)
{
   /* The final report carries the op and subqueue mask, not a timestamp. */
   const unsigned info_slot = input_value_count - 1;

   enum panvk_query_ts_op op =
      panvk_timestamp_info_get_op(src[info_slot].value);
   uint32_t sq_mask = panvk_timestamp_info_get_sq_mask(src[info_slot].value);

   /* For a min reduction UINT64_MAX marks "no value seen yet". */
   uint64_t result = op == PANVK_QUERY_TS_OP_MIN ? UINT64_MAX : 0;

   /* The loop index gets its own name so it cannot shadow the `idx`
    * parameter, which is still needed for the final write. */
   for (uint32_t sq = 0; sq < info_slot; ++sq) {
      if ((sq_mask & BITFIELD_BIT(sq)) == 0)
         continue;

      if (src[sq].value == 0)
         continue;

      if (op == PANVK_QUERY_TS_OP_MIN)
         result = MIN2(result, src[sq].value);
      else
         result = MAX2(result, src[sq].value);
   }

   /* Nothing contributed to the min: report 0 instead of the marker. */
   if (op == PANVK_QUERY_TS_OP_MIN && result == UINT64_MAX)
      result = 0;

   cpu_write_query_result(dst, idx, flags, result);
}
#endif
VKAPI_ATTR VkResult VKAPI_CALL
panvk_per_arch(GetQueryPoolResults)(VkDevice _device, VkQueryPool queryPool,
uint32_t firstQuery, uint32_t queryCount,
@ -218,6 +251,14 @@ panvk_per_arch(GetQueryPoolResults)(VkDevice _device, VkQueryPool queryPool,
pool->reports_per_query);
break;
}
#if PAN_ARCH >= 10
/* Timestamp pools are only handled here when PAN_ARCH >= 10. When results
 * are to be written, the combined timestamp goes to result index 0. */
case VK_QUERY_TYPE_TIMESTAMP: {
if (write_results)
cpu_write_timestamp_query_result(dst, 0, flags, src,
pool->reports_per_query);
break;
}
#endif
default:
unreachable("Unsupported query type");
}