diff --git a/src/panfrost/libpan/query_pool.cl b/src/panfrost/libpan/query_pool.cl
index 2966dadbd87..180e422bb0f 100644
--- a/src/panfrost/libpan/query_pool.cl
+++ b/src/panfrost/libpan/query_pool.cl
@@ -1,11 +1,113 @@
 /*
  * Copyright 2024 Collabora Ltd.
+ * Copyright 2025 Arm Ltd.
  * SPDX-License-Identifier: MIT
  */
 
 #include "compiler/libcl/libcl.h"
 #include "compiler/libcl/libcl_vk.h"
 #include "genxml/gen_macros.h"
+#include "vulkan/panvk_cmd_ts.h"
+
+#if (PAN_ARCH >= 10)
+/*
+ * Reduce the per-subqueue timestamps of one query to a single value.
+ *
+ * Only the first report_count - 1 reports are scanned. A report contributes
+ * when it is non-zero and its bit is set in sq_mask. OP_MIN returns the
+ * smallest contributing value, OP_MAX the largest, and both return 0 when
+ * nothing contributed. *available is set to true only if every bit of
+ * sq_mask was cleared by a contributing report.
+ */
+static inline uint64_t
+compute_timestamp_query_result(global uint64_t *report_addr,
+                               uint32_t report_count, uint32_t op,
+                               uint32_t sq_mask, bool *available)
+{
+   uint64_t result = 0;
+
+   /* Compute min/max and keep track of which streams had non-zero results. */
+   switch (op) {
+   case PANVK_QUERY_TS_OP_MIN: {
+      result = ULONG_MAX;
+      for (uint32_t i = 0; i < report_count - 1; ++i) {
+         uint64_t r = report_addr[i];
+         if (r != 0 && (sq_mask & (1u << i))) {
+            result = min(result, r);
+            sq_mask &= ~(1u << i);
+         }
+      }
+      if (result == ULONG_MAX)
+         result = 0;
+      break;
+   }
+   case PANVK_QUERY_TS_OP_MAX: {
+      for (uint32_t i = 0; i < report_count - 1; ++i) {
+         uint64_t r = report_addr[i];
+         if (r != 0 && (sq_mask & (1u << i))) {
+            result = max(result, r);
+            sq_mask &= ~(1u << i);
+         }
+      }
+      break;
+   }
+   default:
+      unreachable("Invalid timestamp op");
+      break;
+   }
+
+   /* The result is available if all subqueues have written their value. */
+   *available = sq_mask == 0;
+
+   return result;
+}
+
+/*
+ * One invocation per query: read the reports of query first_query + id from
+ * pool_addr, reduce them, and write the result and/or the availability word
+ * at dst_addr + id * dst_stride as requested by flags.
+ *
+ * available_addr and query_type are not read by this kernel.
+ */
+KERNEL(1)
+panlib_copy_ts_query_result(uint64_t pool_addr, global uint32_t *available_addr,
+                            uint32_t query_stride, uint32_t first_query,
+                            uint32_t query_count, uint64_t dst_addr,
+                            uint64_t dst_stride, uint32_t query_type,
+                            uint32_t flags, uint32_t report_count)
+{
+   uint32_t i = cl_global_id.x;
+
+   if (i >= query_count)
+      return;
+
+   uintptr_t dst = dst_addr + ((uint64_t)i * dst_stride);
+
+   uint32_t query = first_query + i;
+   global uint64_t *report_addr =
+      (global uint64_t *)(pool_addr + ((uint64_t)query * query_stride));
+
+   uint64_t result = 0;
+
+   /* The last report is always metadata. */
+   uint64_t info = report_addr[report_count - 1];
+   uint32_t op = panvk_timestamp_info_get_op(info);
+   uint32_t sq_mask = panvk_timestamp_info_get_sq_mask(info);
+
+   bool available = false;
+   /* If no subqueue should write a result, the query is uninitialized. */
+   if (sq_mask != 0)
+      result = compute_timestamp_query_result(report_addr, report_count, op,
+                                              sq_mask, &available);
+
+   if ((flags & VK_QUERY_RESULT_PARTIAL_BIT) || available)
+      vk_write_query(dst, 0, flags, result);
+
+   if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)
+      vk_write_query(dst, 1, flags, available);
+}
+#endif
+
 #if (PAN_ARCH >= 6 && PAN_ARCH < 10)
 static inline void
 write_occlusion_query_result(uintptr_t dst_addr, int32_t idx, uint32_t flags,
diff --git a/src/panfrost/vulkan/csf/panvk_vX_cmd_query.c b/src/panfrost/vulkan/csf/panvk_vX_cmd_query.c
index 24d8e31c27b..00d1c1c9638 100644
--- a/src/panfrost/vulkan/csf/panvk_vX_cmd_query.c
+++ b/src/panfrost/vulkan/csf/panvk_vX_cmd_query.c
@@ -18,6 +18,7 @@
 #include "panvk_cmd_alloc.h"
 #include "panvk_cmd_buffer.h"
 #include "panvk_cmd_meta.h"
+#include "panvk_cmd_precomp.h"
 #include "panvk_cmd_ts.h"
 #include "panvk_device.h"
 #include "panvk_entrypoints.h"
@@ -530,6 +531,98 @@ panvk_cmd_write_timestamp_query(struct panvk_cmd_buffer *cmd,
    cmd->state.contains_timestamp_queries = true;
 }
 
+/*
+ * Record the copy of timestamp query results into a buffer. Each subqueue
+ * snapshots its own report into a temporary allocation, then the
+ * panlib_copy_ts_query_result kernel reads the snapshots and writes the
+ * results to dst_buffer_addr.
+ */
+static void
+panvk_copy_timestamp_query_results(struct panvk_cmd_buffer *cmd,
+                                   struct panvk_query_pool *pool,
+                                   uint32_t first_query, uint32_t query_count,
+                                   uint64_t dst_buffer_addr,
+                                   VkDeviceSize stride,
+                                   VkQueryResultFlags flags)
+{
+   /*
+    * Step 1:
+    * The point of this is to have each subqueue "save" its own value
+    * into a buffer, such that any following query operations like reset
+    * don't have to worry about destroying the result before other
+    * subqueues are done with it.
+    */
+
+   uint32_t query_stride = pool->query_stride;
+   /* Widen before multiplying so the size cannot wrap in 32 bits. */
+   size_t buf_sz = (size_t)query_count * query_stride;
+   struct pan_ptr intermediate_buf =
+      panvk_cmd_alloc_dev_mem(cmd, desc, buf_sz, 16);
+   /* Without an intermediate buffer there is nothing we can emit. */
+   if (!intermediate_buf.gpu)
+      return;
+
+   for (uint32_t sq = 0; sq < PANVK_SUBQUEUE_COUNT; ++sq) {
+      struct cs_builder *b = panvk_get_cs_builder(cmd, sq);
+      uint32_t sq_offset = sq * sizeof(uint64_t);
+
+      struct cs_index src = cs_scratch_reg64(b, 0);
+      struct cs_index dst = cs_scratch_reg64(b, 2);
+      struct cs_index tmp = cs_scratch_reg64(b, 4);
+      struct cs_index tmp2 = cs_scratch_reg64(b, 6);
+
+      /* Wait for STORE_STATEs to finish. */
+      cs_wait_slot(b, SB_ID(LS));
+
+      cs_move64_to(b, src, panvk_query_report_dev_addr(pool, first_query));
+      cs_move64_to(b, dst, intermediate_buf.gpu);
+
+      struct cs_index count = cs_scratch_reg32(b, 8);
+      cs_move32_to(b, count, query_count);
+      cs_while(b, MALI_CS_CONDITION_GREATER, count) {
+         cs_load64_to(b, tmp, src, sq_offset);
+         if (sq == PANVK_QUERY_TS_INFO_SUBQUEUE) {
+            assert(PANVK_QUERY_TS_INFO_SUBQUEUE == PANVK_SUBQUEUE_COUNT - 1);
+            cs_load64_to(b, tmp2, src, sq_offset + 8);
+         }
+         cs_store64(b, tmp, dst, sq_offset);
+         if (sq == PANVK_QUERY_TS_INFO_SUBQUEUE)
+            cs_store64(b, tmp2, dst, sq_offset + 8);
+
+         cs_add64(b, src, src, query_stride);
+         cs_add64(b, dst, dst, query_stride);
+         cs_add32(b, count, count, -1);
+      }
+   }
+
+   /* Make sure C waits for all copies to be done. */
+   struct panvk_cs_deps deps = {0};
+   deps.dst[PANVK_SUBQUEUE_COMPUTE].wait_subqueue_mask =
+      BITFIELD_MASK(PANVK_SUBQUEUE_COUNT) & ~BITFIELD_BIT(PANVK_SUBQUEUE_COMPUTE);
+   u_foreach_bit(i, deps.dst[PANVK_SUBQUEUE_COMPUTE].wait_subqueue_mask)
+      deps.src[i].wait_sb_mask = SB_MASK(LS);
+   panvk_per_arch(emit_barrier)(cmd, deps);
+
+   /* Step 2: Copy from the intermediate into the application buffer. */
+
+   const struct panlib_copy_ts_query_result_args push = {
+      .pool_addr = intermediate_buf.gpu,
+      .available_addr = panvk_query_available_dev_addr(pool, first_query),
+      .query_stride = pool->query_stride,
+      /* The intermediate buffer starts at first_query. */
+      .first_query = 0,
+      .query_count = query_count,
+      .report_count = pool->reports_per_query,
+      .dst_addr = dst_buffer_addr,
+      .dst_stride = stride,
+      .flags = flags,
+   };
+
+   struct panvk_precomp_ctx precomp_ctx = panvk_per_arch(precomp_cs)(cmd);
+   panlib_copy_ts_query_result_struct(&precomp_ctx, panlib_1d(query_count),
+                                      PANLIB_BARRIER_NONE, push);
+}
+
 VKAPI_ATTR void VKAPI_CALL
 panvk_per_arch(CmdResetQueryPool)(VkCommandBuffer commandBuffer,
                                   VkQueryPool queryPool, uint32_t firstQuery,
@@ -627,6 +720,13 @@ panvk_per_arch(CmdCopyQueryPoolResults)(
                                          dst_buffer_addr, stride, flags);
       break;
    }
+#if PAN_ARCH >= 10
+   case VK_QUERY_TYPE_TIMESTAMP: {
+      panvk_copy_timestamp_query_results(cmd, pool, firstQuery, queryCount,
+                                         dst_buffer_addr, stride, flags);
+      break;
+   }
+#endif
    default:
       unreachable("Unsupported query type");
    }
diff --git a/src/panfrost/vulkan/panvk_vX_query_pool.c b/src/panfrost/vulkan/panvk_vX_query_pool.c
index a73596d7d2f..81bbfa8fc49 100644
--- a/src/panfrost/vulkan/panvk_vX_query_pool.c
+++ b/src/panfrost/vulkan/panvk_vX_query_pool.c
@@ -177,6 +177,45 @@ cpu_write_occlusion_query_result(void *dst, uint32_t idx,
    cpu_write_query_result(dst, idx, flags, result);
 }
 
+#if PAN_ARCH >= 10
+/*
+ * Reduce the reports of one timestamp query to a single value and write it
+ * at index idx of dst. The last report carries the op and subqueue mask;
+ * reports that are zero or not selected by the mask are skipped. Writes 0
+ * when no report contributed.
+ */
+static void
+cpu_write_timestamp_query_result(void *dst, uint32_t idx,
+                                 VkQueryResultFlags flags,
+                                 const struct panvk_query_report *src,
+                                 unsigned input_value_count)
+{
+   enum panvk_query_ts_op op =
+      panvk_timestamp_info_get_op(src[input_value_count - 1].value);
+   uint32_t sq_mask =
+      panvk_timestamp_info_get_sq_mask(src[input_value_count - 1].value);
+
+   uint64_t result = op == PANVK_QUERY_TS_OP_MIN ? UINT64_MAX : 0;
+
+   for (uint32_t i = 0; i < input_value_count - 1; ++i) {
+      if ((sq_mask & BITFIELD_BIT(i)) == 0)
+         continue;
+      if (src[i].value == 0)
+         continue;
+
+      if (op == PANVK_QUERY_TS_OP_MIN)
+         result = MIN2(result, src[i].value);
+      else
+         result = MAX2(result, src[i].value);
+   }
+
+   if (op == PANVK_QUERY_TS_OP_MIN && result == UINT64_MAX)
+      result = 0;
+
+   cpu_write_query_result(dst, idx, flags, result);
+}
+#endif
+
 VKAPI_ATTR VkResult VKAPI_CALL
 panvk_per_arch(GetQueryPoolResults)(VkDevice _device, VkQueryPool queryPool,
                                     uint32_t firstQuery, uint32_t queryCount,
@@ -218,6 +257,14 @@ panvk_per_arch(GetQueryPoolResults)(VkDevice _device, VkQueryPool queryPool,
                                              pool->reports_per_query);
          break;
       }
+#if PAN_ARCH >= 10
+      case VK_QUERY_TYPE_TIMESTAMP: {
+         if (write_results)
+            cpu_write_timestamp_query_result(dst, 0, flags, src,
+                                             pool->reports_per_query);
+         break;
+      }
+#endif
       default:
          unreachable("Unsupported query type");
       }