mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-05-28 12:08:24 +02:00
Most of the time, we can infer the type to append in
util_dynarray_append using __typeof__, which is standardized in C23 and
supported in Jesse's MSMSVCV. This patch drops the type argument most of
the time, making util_dynarray a little more ergonomic to use.
This is done in four steps.
First, rename util_dynarray_append -> util_dynarray_append_typed
bash -c "find . -type f -exec sed -i -e 's/util_dynarray_append(/util_dynarray_append_typed(/g' \{} \;"
Then, add a new append that infers the type. This is much more ergonomic
for what you want most of the time.
Next, use type-inferred append as much as possible, via Coccinelle
patch (plus manual fixup):
@@
expression dynarray, element;
type type;
@@
-util_dynarray_append_typed(dynarray, type, element);
+util_dynarray_append(dynarray, element);
Finally, hand fixup cases that Coccinelle missed or incorrectly
translated, of which there were several because we can't use the
untyped append with a literal (since the sizeof won't do what you want).
All four steps are squashed to produce a single patch changing every
util_dynarray_append call site in tree to either drop a type parameter
(if possible) or insert a _typed suffix (if we can't infer). As such,
the final patch is best reviewed by hand even though it was
tool-assisted.
No Long Linguine Meals were involved in the making of this patch.
Signed-off-by: Alyssa Rosenzweig <alyssa.rosenzweig@intel.com>
Acked-by: Faith Ekstrand <faith.ekstrand@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/38038>
668 lines
22 KiB
C
668 lines
22 KiB
C
/*
|
|
* Copyright 2024 Valve Corporation
|
|
* Copyright 2024 Alyssa Rosenzweig
|
|
* Copyright 2022-2023 Collabora Ltd. and Red Hat Inc.
|
|
* SPDX-License-Identifier: MIT
|
|
*/
|
|
#include "hk_query_pool.h"
|
|
|
|
#include "agx_compile.h"
|
|
#include "agx_pack.h"
|
|
#include "hk_buffer.h"
|
|
#include "hk_cmd_buffer.h"
|
|
#include "hk_device.h"
|
|
#include "hk_entrypoints.h"
|
|
#include "hk_shader.h"
|
|
|
|
#include "libagx_dgc.h"
|
|
#include "libagx_shaders.h"
|
|
#include "vk_common_entrypoints.h"
|
|
|
|
#include "asahi/lib/agx_bo.h"
|
|
#include "asahi/libagx/libagx.h"
|
|
#include "asahi/libagx/query.h"
|
|
#include "compiler/nir/nir.h"
|
|
#include "compiler/nir/nir_builder.h"
|
|
|
|
#include "drm-uapi/asahi_drm.h"
|
|
#include "util/os_time.h"
|
|
#include "util/u_dynarray.h"
|
|
#include "vulkan/vulkan_core.h"
|
|
|
|
/*
 * A single 64-bit result slot. The results of one query are stored as an
 * array of these (see hk_reports_per_query() for the slot count).
 */
struct hk_query_report {
   uint64_t value;
};
|
|
|
|
static inline bool
|
|
hk_has_available(const struct hk_query_pool *pool)
|
|
{
|
|
return pool->vk.query_type != VK_QUERY_TYPE_TIMESTAMP;
|
|
}
|
|
|
|
static uint16_t *
|
|
hk_pool_oq_index_ptr(const struct hk_query_pool *pool)
|
|
{
|
|
return agx_bo_map(pool->bo) + pool->query_start;
|
|
}
|
|
|
|
static uint32_t
|
|
hk_reports_per_query(struct hk_query_pool *pool)
|
|
{
|
|
switch (pool->vk.query_type) {
|
|
case VK_QUERY_TYPE_OCCLUSION:
|
|
case VK_QUERY_TYPE_TIMESTAMP:
|
|
case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
|
|
return 1;
|
|
case VK_QUERY_TYPE_PIPELINE_STATISTICS:
|
|
return util_bitcount(pool->vk.pipeline_statistics);
|
|
case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
|
|
// Primitives succeeded and primitives needed
|
|
return 2;
|
|
default:
|
|
UNREACHABLE("Unsupported query type");
|
|
}
|
|
}
|
|
|
|
static void
|
|
hk_flush_if_timestamp(struct hk_cmd_buffer *cmd, struct hk_query_pool *pool)
|
|
{
|
|
/* There might not be a barrier between the timestamp write and the copy
|
|
* otherwise but we need one to give the CPU a chance to write the timestamp.
|
|
* This could maybe optimized.
|
|
*/
|
|
if (pool->vk.query_type == VK_QUERY_TYPE_TIMESTAMP) {
|
|
perf_debug(cmd, "Flushing for timestamp copy");
|
|
hk_cmd_buffer_end_graphics(cmd);
|
|
hk_cmd_buffer_end_compute(cmd);
|
|
}
|
|
}
|
|
|
|
static uint64_t
|
|
hk_query_available_addr(struct hk_query_pool *pool, uint32_t query)
|
|
{
|
|
assert(hk_has_available(pool));
|
|
assert(query < pool->vk.query_count);
|
|
return pool->bo->va->addr + query * sizeof(uint32_t);
|
|
}
|
|
|
|
static uint32_t *
|
|
hk_query_available_map(struct hk_query_pool *pool, uint32_t query)
|
|
{
|
|
assert(hk_has_available(pool));
|
|
assert(query < pool->vk.query_count);
|
|
return (uint32_t *)agx_bo_map(pool->bo) + query;
|
|
}
|
|
|
|
static uint64_t
|
|
hk_query_offset(struct hk_query_pool *pool, uint32_t query)
|
|
{
|
|
assert(query < pool->vk.query_count);
|
|
return pool->query_start + query * pool->query_stride;
|
|
}
|
|
|
|
static uint64_t
|
|
hk_query_report_addr(struct hk_device *dev, struct hk_query_pool *pool,
|
|
uint32_t query)
|
|
{
|
|
if (pool->oq_queries) {
|
|
uint16_t *oq_index = hk_pool_oq_index_ptr(pool);
|
|
return dev->occlusion_queries.bo->va->addr +
|
|
(oq_index[query] * sizeof(uint64_t));
|
|
} else {
|
|
return pool->bo->va->addr + hk_query_offset(pool, query);
|
|
}
|
|
}
|
|
|
|
static struct hk_query_report *
|
|
hk_query_report_map(struct hk_device *dev, struct hk_query_pool *pool,
|
|
uint32_t query)
|
|
{
|
|
if (pool->oq_queries) {
|
|
uint64_t *queries = (uint64_t *)agx_bo_map(dev->occlusion_queries.bo);
|
|
uint16_t *oq_index = hk_pool_oq_index_ptr(pool);
|
|
|
|
return (struct hk_query_report *)&queries[oq_index[query]];
|
|
} else {
|
|
return (void *)((char *)agx_bo_map(pool->bo) +
|
|
hk_query_offset(pool, query));
|
|
}
|
|
}
|
|
|
|
void
|
|
hk_dispatch_imm_writes(struct hk_cmd_buffer *cmd, struct hk_cs *cs)
|
|
{
|
|
/* As soon as we mark a query available, it needs to be available system
|
|
* wide, otherwise a CPU-side get result can query. As such, we cache flush
|
|
* before and then let coherency works its magic. Without this barrier, we
|
|
* get flakes in
|
|
*
|
|
* dEQP-VK.query_pool.occlusion_query.get_results_conservative_size_64_wait_query_without_availability_draw_triangles_discard
|
|
*/
|
|
struct hk_device *dev = hk_cmd_buffer_device(cmd);
|
|
hk_cdm_cache_flush(dev, cs);
|
|
|
|
perf_debug(cmd, "Queued writes");
|
|
|
|
uint64_t params =
|
|
hk_pool_upload(cmd, cs->imm_writes.data, cs->imm_writes.size, 16);
|
|
|
|
uint32_t count =
|
|
util_dynarray_num_elements(&cs->imm_writes, struct libagx_imm_write);
|
|
assert(count > 0);
|
|
|
|
libagx_write_u32s(cmd, agx_1d(count), AGX_BARRIER_ALL | AGX_POSTGFX, params);
|
|
}
|
|
|
|
void
|
|
hk_queue_write(struct hk_cmd_buffer *cmd, uint64_t address, uint32_t value,
|
|
bool after_gfx)
|
|
{
|
|
struct hk_cs *cs = hk_cmd_buffer_get_cs_general(
|
|
cmd, after_gfx ? &cmd->current_cs.post_gfx : &cmd->current_cs.cs, true);
|
|
if (!cs)
|
|
return;
|
|
|
|
/* TODO: Generalize this mechanism suitably */
|
|
if (after_gfx) {
|
|
struct libagx_imm_write imm = {.address = address, .value = value};
|
|
|
|
if (!cs->imm_writes.data) {
|
|
util_dynarray_init(&cs->imm_writes, NULL);
|
|
}
|
|
|
|
util_dynarray_append(&cs->imm_writes, imm);
|
|
return;
|
|
}
|
|
|
|
/* As soon as we mark a query available, it needs to be available system
|
|
* wide, otherwise a CPU-side get result can query. As such, we cache flush
|
|
* before and then let coherency works its magic. Without this barrier, we
|
|
* get flakes in
|
|
*
|
|
* dEQP-VK.query_pool.occlusion_query.get_results_conservative_size_64_wait_query_without_availability_draw_triangles_discard
|
|
*/
|
|
struct hk_device *dev = hk_cmd_buffer_device(cmd);
|
|
hk_cdm_cache_flush(dev, cs);
|
|
|
|
perf_debug(cmd, "Queued write");
|
|
libagx_write_u32(cmd, agx_1d(1), AGX_BARRIER_ALL, address, value);
|
|
}
|
|
|
|
static void
|
|
emit_zero_queries(struct hk_cmd_buffer *cmd, struct hk_query_pool *pool,
|
|
uint32_t first_index, uint32_t num_queries,
|
|
bool set_available)
|
|
{
|
|
struct hk_device *dev = hk_cmd_buffer_device(cmd);
|
|
perf_debug(cmd, "Query pool zero");
|
|
|
|
struct libagx_reset_query_args info = {
|
|
.availability = hk_has_available(pool) ? pool->bo->va->addr : 0,
|
|
.results = pool->oq_queries ? dev->occlusion_queries.bo->va->addr
|
|
: pool->bo->va->addr + pool->query_start,
|
|
.oq_index = pool->oq_queries ? pool->bo->va->addr + pool->query_start : 0,
|
|
|
|
.first_query = first_index,
|
|
.reports_per_query = hk_reports_per_query(pool),
|
|
.set_available = set_available,
|
|
};
|
|
|
|
libagx_reset_query_struct(cmd, agx_1d(num_queries), AGX_BARRIER_ALL, info);
|
|
}
|
|
|
|
static void
|
|
host_zero_queries(struct hk_device *dev, struct hk_query_pool *pool,
|
|
uint32_t first_index, uint32_t num_queries,
|
|
bool set_available)
|
|
{
|
|
for (uint32_t i = 0; i < num_queries; i++) {
|
|
struct hk_query_report *reports =
|
|
hk_query_report_map(dev, pool, first_index + i);
|
|
|
|
uint64_t value = 0;
|
|
if (hk_has_available(pool)) {
|
|
uint32_t *available = hk_query_available_map(pool, first_index + i);
|
|
*available = set_available;
|
|
} else {
|
|
value = set_available ? 0 : LIBAGX_QUERY_UNAVAILABLE;
|
|
}
|
|
|
|
for (unsigned j = 0; j < hk_reports_per_query(pool); ++j) {
|
|
reports[j].value = value;
|
|
}
|
|
}
|
|
}
|
|
|
|
VKAPI_ATTR VkResult VKAPI_CALL
|
|
hk_CreateQueryPool(VkDevice device, const VkQueryPoolCreateInfo *pCreateInfo,
|
|
const VkAllocationCallbacks *pAllocator,
|
|
VkQueryPool *pQueryPool)
|
|
{
|
|
VK_FROM_HANDLE(hk_device, dev, device);
|
|
struct hk_query_pool *pool;
|
|
|
|
bool occlusion = pCreateInfo->queryType == VK_QUERY_TYPE_OCCLUSION;
|
|
bool timestamp = pCreateInfo->queryType == VK_QUERY_TYPE_TIMESTAMP;
|
|
unsigned occlusion_queries = occlusion ? pCreateInfo->queryCount : 0;
|
|
|
|
pool =
|
|
vk_query_pool_create(&dev->vk, pCreateInfo, pAllocator, sizeof(*pool));
|
|
if (!pool)
|
|
return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY);
|
|
|
|
/* We place the availability first and then data */
|
|
pool->query_start = 0;
|
|
if (hk_has_available(pool)) {
|
|
pool->query_start = align(pool->vk.query_count * sizeof(uint32_t),
|
|
sizeof(struct hk_query_report));
|
|
}
|
|
|
|
uint32_t reports_per_query = hk_reports_per_query(pool);
|
|
pool->query_stride = reports_per_query * sizeof(struct hk_query_report);
|
|
|
|
if (pool->vk.query_count > 0) {
|
|
uint32_t bo_size = pool->query_start;
|
|
|
|
/* For occlusion queries, we stick the query index remapping here */
|
|
if (occlusion_queries)
|
|
bo_size += sizeof(uint16_t) * pool->vk.query_count;
|
|
else
|
|
bo_size += pool->query_stride * pool->vk.query_count;
|
|
|
|
/* The kernel requires that timestamp buffers are SHARED */
|
|
enum agx_bo_flags flags = AGX_BO_WRITEBACK;
|
|
if (timestamp)
|
|
flags |= AGX_BO_SHARED;
|
|
|
|
pool->bo = agx_bo_create(&dev->dev, bo_size, 0, flags, "Query pool");
|
|
if (!pool->bo) {
|
|
hk_DestroyQueryPool(device, hk_query_pool_to_handle(pool), pAllocator);
|
|
return vk_error(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY);
|
|
}
|
|
|
|
/* Timestamp buffers must be explicitly bound as such before we can use
|
|
* them.
|
|
*/
|
|
if (timestamp) {
|
|
int ret = agx_bind_timestamps(&dev->dev, pool->bo, &pool->handle);
|
|
if (ret) {
|
|
hk_DestroyQueryPool(device, hk_query_pool_to_handle(pool),
|
|
pAllocator);
|
|
return vk_error(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY);
|
|
}
|
|
|
|
assert(pool->handle && "handles are nonzero");
|
|
}
|
|
}
|
|
|
|
uint16_t *oq_index = hk_pool_oq_index_ptr(pool);
|
|
|
|
for (unsigned i = 0; i < occlusion_queries; ++i) {
|
|
uint64_t zero = 0;
|
|
unsigned index;
|
|
|
|
VkResult result = hk_descriptor_table_add(
|
|
dev, &dev->occlusion_queries, &zero, sizeof(uint64_t), &index);
|
|
|
|
if (result != VK_SUCCESS) {
|
|
hk_DestroyQueryPool(device, hk_query_pool_to_handle(pool), pAllocator);
|
|
return vk_error(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY);
|
|
}
|
|
|
|
/* We increment as we go so we can clean up properly if we run out */
|
|
assert(pool->oq_queries < occlusion_queries);
|
|
oq_index[pool->oq_queries++] = index;
|
|
}
|
|
|
|
if (pCreateInfo->flags & VK_QUERY_POOL_CREATE_RESET_BIT_KHR)
|
|
host_zero_queries(dev, pool, 0, pool->vk.query_count, false);
|
|
|
|
*pQueryPool = hk_query_pool_to_handle(pool);
|
|
|
|
return VK_SUCCESS;
|
|
}
|
|
|
|
VKAPI_ATTR void VKAPI_CALL
|
|
hk_DestroyQueryPool(VkDevice device, VkQueryPool queryPool,
|
|
const VkAllocationCallbacks *pAllocator)
|
|
{
|
|
VK_FROM_HANDLE(hk_device, dev, device);
|
|
VK_FROM_HANDLE(hk_query_pool, pool, queryPool);
|
|
|
|
if (!pool)
|
|
return;
|
|
|
|
uint16_t *oq_index = hk_pool_oq_index_ptr(pool);
|
|
|
|
for (unsigned i = 0; i < pool->oq_queries; ++i) {
|
|
hk_descriptor_table_remove(dev, &dev->occlusion_queries, oq_index[i]);
|
|
}
|
|
|
|
if (pool->handle)
|
|
dev->dev.ops.bo_unbind_object(&dev->dev, pool->handle);
|
|
|
|
agx_bo_unreference(&dev->dev, pool->bo);
|
|
vk_query_pool_destroy(&dev->vk, pAllocator, &pool->vk);
|
|
}
|
|
|
|
VKAPI_ATTR void VKAPI_CALL
|
|
hk_ResetQueryPool(VkDevice device, VkQueryPool queryPool, uint32_t firstQuery,
|
|
uint32_t queryCount)
|
|
{
|
|
VK_FROM_HANDLE(hk_query_pool, pool, queryPool);
|
|
VK_FROM_HANDLE(hk_device, dev, device);
|
|
|
|
host_zero_queries(dev, pool, firstQuery, queryCount, false);
|
|
}
|
|
|
|
VKAPI_ATTR void VKAPI_CALL
|
|
hk_CmdResetQueryPool(VkCommandBuffer commandBuffer, VkQueryPool queryPool,
|
|
uint32_t firstQuery, uint32_t queryCount)
|
|
{
|
|
VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer);
|
|
VK_FROM_HANDLE(hk_query_pool, pool, queryPool);
|
|
|
|
hk_flush_if_timestamp(cmd, pool);
|
|
|
|
perf_debug(cmd, "Reset query pool");
|
|
emit_zero_queries(cmd, pool, firstQuery, queryCount, false);
|
|
}
|
|
|
|
VKAPI_ATTR void VKAPI_CALL
|
|
hk_CmdWriteTimestamp2(VkCommandBuffer commandBuffer,
|
|
VkPipelineStageFlags2 stage, VkQueryPool queryPool,
|
|
uint32_t query)
|
|
{
|
|
VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer);
|
|
VK_FROM_HANDLE(hk_query_pool, pool, queryPool);
|
|
struct hk_device *dev = hk_cmd_buffer_device(cmd);
|
|
|
|
uint64_t report_addr = hk_query_report_addr(dev, pool, query);
|
|
bool after_gfx = cmd->current_cs.gfx != NULL;
|
|
|
|
/* When writing timestamps for compute, we split the control stream at each
|
|
* write. This ensures we never need to copy compute timestamps, which would
|
|
* require an extra control stream anyway. Unlike graphics, splitting compute
|
|
* control streams is inexpensive so there's not a strong performance reason
|
|
* to do otherwise. Finally, batching multiple timestamp writes (like we do
|
|
* for graphics) would destroy the ability to profile individual compute
|
|
* dispatches. While that's allowed by the Vulkan spec, it's pretty mean to
|
|
* apps. So.. don't do that.
|
|
*/
|
|
if (!after_gfx && cmd->current_cs.cs &&
|
|
cmd->current_cs.cs->timestamp.end.addr) {
|
|
|
|
perf_debug(cmd, "Splitting for compute timestamp");
|
|
hk_cmd_buffer_end_compute(cmd);
|
|
}
|
|
|
|
struct hk_cs *cs = hk_cmd_buffer_get_cs_general(
|
|
cmd, after_gfx ? &cmd->current_cs.gfx : &cmd->current_cs.cs, true);
|
|
if (!cs)
|
|
return;
|
|
|
|
if (cs->timestamp.end.addr) {
|
|
assert(after_gfx && "compute is handled above");
|
|
|
|
libagx_copy_timestamp(cmd, agx_1d(1), AGX_BARRIER_ALL | AGX_POSTGFX,
|
|
report_addr, cs->timestamp.end.addr);
|
|
} else {
|
|
cs->timestamp.end = (struct agx_timestamp_req){
|
|
.addr = report_addr,
|
|
.handle = pool->handle,
|
|
.offset_B = hk_query_offset(pool, query),
|
|
};
|
|
}
|
|
|
|
/* From the Vulkan spec:
|
|
*
|
|
* "If vkCmdWriteTimestamp2 is called while executing a render pass
|
|
* instance that has multiview enabled, the timestamp uses N consecutive
|
|
* query indices in the query pool (starting at query) where N is the
|
|
* number of bits set in the view mask of the subpass the command is
|
|
* executed in. The resulting query values are determined by an
|
|
* implementation-dependent choice of one of the following behaviors:"
|
|
*
|
|
* In our case, only the first query is used, so we emit zeros for the
|
|
* remaining queries, as described in the first behavior listed in the
|
|
* Vulkan spec:
|
|
*
|
|
* "The first query is a timestamp value and (if more than one bit is set
|
|
* in the view mask) zero is written to the remaining queries."
|
|
*/
|
|
if (cmd->state.gfx.render.view_mask != 0) {
|
|
const uint32_t num_queries =
|
|
util_bitcount(cmd->state.gfx.render.view_mask);
|
|
if (num_queries > 1)
|
|
emit_zero_queries(cmd, pool, query + 1, num_queries - 1, true);
|
|
}
|
|
}
|
|
|
|
static void
|
|
hk_cmd_begin_end_query(struct hk_cmd_buffer *cmd, struct hk_query_pool *pool,
|
|
uint32_t query, uint32_t index,
|
|
VkQueryControlFlags flags, bool end)
|
|
{
|
|
struct hk_device *dev = hk_cmd_buffer_device(cmd);
|
|
bool graphics = false;
|
|
|
|
switch (pool->vk.query_type) {
|
|
case VK_QUERY_TYPE_OCCLUSION: {
|
|
assert(query < pool->oq_queries);
|
|
|
|
if (end) {
|
|
cmd->state.gfx.occlusion.mode = AGX_VISIBILITY_MODE_NONE;
|
|
} else {
|
|
cmd->state.gfx.occlusion.mode = flags & VK_QUERY_CONTROL_PRECISE_BIT
|
|
? AGX_VISIBILITY_MODE_COUNTING
|
|
: AGX_VISIBILITY_MODE_BOOLEAN;
|
|
}
|
|
|
|
uint16_t *oq_index = hk_pool_oq_index_ptr(pool);
|
|
cmd->state.gfx.occlusion.index = oq_index[query];
|
|
cmd->state.gfx.dirty |= HK_DIRTY_OCCLUSION;
|
|
graphics = true;
|
|
break;
|
|
}
|
|
|
|
case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: {
|
|
uint64_t addr = hk_query_report_addr(dev, pool, query);
|
|
cmd->state.gfx.xfb_query[index] = end ? 0 : addr;
|
|
break;
|
|
}
|
|
|
|
case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
|
|
struct hk_root_descriptor_table *root = &cmd->state.gfx.descriptors.root;
|
|
cmd->state.gfx.descriptors.root_dirty = true;
|
|
|
|
root->draw.pipeline_stats = hk_query_report_addr(dev, pool, query);
|
|
root->draw.pipeline_stats_flags = pool->vk.pipeline_statistics;
|
|
|
|
/* XXX: I don't think is correct... when does the query become available
|
|
* exactly?
|
|
*/
|
|
graphics = pool->vk.pipeline_statistics &
|
|
~VK_QUERY_PIPELINE_STATISTIC_COMPUTE_SHADER_INVOCATIONS_BIT;
|
|
break;
|
|
}
|
|
|
|
default:
|
|
UNREACHABLE("Unsupported query type");
|
|
}
|
|
|
|
/* We need to set available=1 after the graphics work finishes. */
|
|
if (end) {
|
|
perf_debug(cmd, "Query ending, type %u", pool->vk.query_type);
|
|
hk_queue_write(cmd, hk_query_available_addr(pool, query), 1, graphics);
|
|
}
|
|
}
|
|
|
|
VKAPI_ATTR void VKAPI_CALL
|
|
hk_CmdBeginQueryIndexedEXT(VkCommandBuffer commandBuffer, VkQueryPool queryPool,
|
|
uint32_t query, VkQueryControlFlags flags,
|
|
uint32_t index)
|
|
{
|
|
VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer);
|
|
VK_FROM_HANDLE(hk_query_pool, pool, queryPool);
|
|
|
|
hk_cmd_begin_end_query(cmd, pool, query, index, flags, false);
|
|
}
|
|
|
|
VKAPI_ATTR void VKAPI_CALL
|
|
hk_CmdEndQueryIndexedEXT(VkCommandBuffer commandBuffer, VkQueryPool queryPool,
|
|
uint32_t query, uint32_t index)
|
|
{
|
|
VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer);
|
|
VK_FROM_HANDLE(hk_query_pool, pool, queryPool);
|
|
|
|
hk_cmd_begin_end_query(cmd, pool, query, index, 0, true);
|
|
|
|
/* From the Vulkan spec:
|
|
*
|
|
* "If queries are used while executing a render pass instance that has
|
|
* multiview enabled, the query uses N consecutive query indices in
|
|
* the query pool (starting at query) where N is the number of bits set
|
|
* in the view mask in the subpass the query is used in. How the
|
|
* numerical results of the query are distributed among the queries is
|
|
* implementation-dependent."
|
|
*
|
|
* In our case, only the first query is used, so we emit zeros for the
|
|
* remaining queries.
|
|
*/
|
|
if (cmd->state.gfx.render.view_mask != 0) {
|
|
const uint32_t num_queries =
|
|
util_bitcount(cmd->state.gfx.render.view_mask);
|
|
if (num_queries > 1) {
|
|
perf_debug(cmd, "Multiview query zeroing");
|
|
emit_zero_queries(cmd, pool, query + 1, num_queries - 1, true);
|
|
}
|
|
}
|
|
}
|
|
|
|
static bool
|
|
hk_query_is_available(struct hk_device *dev, struct hk_query_pool *pool,
|
|
uint32_t query)
|
|
{
|
|
if (hk_has_available(pool)) {
|
|
uint32_t *available = hk_query_available_map(pool, query);
|
|
return p_atomic_read(available) != 0;
|
|
} else {
|
|
const struct hk_query_report *report =
|
|
hk_query_report_map(dev, pool, query);
|
|
|
|
return report->value != LIBAGX_QUERY_UNAVAILABLE;
|
|
}
|
|
}
|
|
|
|
#define HK_QUERY_TIMEOUT 2000000000ull
|
|
|
|
static VkResult
|
|
hk_query_wait_for_available(struct hk_device *dev, struct hk_query_pool *pool,
|
|
uint32_t query)
|
|
{
|
|
uint64_t abs_timeout_ns = os_time_get_absolute_timeout(HK_QUERY_TIMEOUT);
|
|
|
|
while (os_time_get_nano() < abs_timeout_ns) {
|
|
if (hk_query_is_available(dev, pool, query))
|
|
return VK_SUCCESS;
|
|
|
|
VkResult status = vk_device_check_status(&dev->vk);
|
|
if (status != VK_SUCCESS)
|
|
return status;
|
|
}
|
|
|
|
return vk_device_set_lost(&dev->vk, "query timeout");
|
|
}
|
|
|
|
static void
|
|
cpu_write_query_result(void *dst, uint32_t idx, VkQueryResultFlags flags,
|
|
uint64_t result)
|
|
{
|
|
if (flags & VK_QUERY_RESULT_64_BIT) {
|
|
uint64_t *dst64 = dst;
|
|
dst64[idx] = result;
|
|
} else {
|
|
uint32_t *dst32 = dst;
|
|
dst32[idx] = result;
|
|
}
|
|
}
|
|
|
|
VKAPI_ATTR VkResult VKAPI_CALL
|
|
hk_GetQueryPoolResults(VkDevice device, VkQueryPool queryPool,
|
|
uint32_t firstQuery, uint32_t queryCount,
|
|
size_t dataSize, void *pData, VkDeviceSize stride,
|
|
VkQueryResultFlags flags)
|
|
{
|
|
VK_FROM_HANDLE(hk_device, dev, device);
|
|
VK_FROM_HANDLE(hk_query_pool, pool, queryPool);
|
|
|
|
if (vk_device_is_lost(&dev->vk))
|
|
return VK_ERROR_DEVICE_LOST;
|
|
|
|
VkResult status = VK_SUCCESS;
|
|
for (uint32_t i = 0; i < queryCount; i++) {
|
|
const uint32_t query = firstQuery + i;
|
|
|
|
bool available = hk_query_is_available(dev, pool, query);
|
|
|
|
if (!available && (flags & VK_QUERY_RESULT_WAIT_BIT)) {
|
|
status = hk_query_wait_for_available(dev, pool, query);
|
|
if (status != VK_SUCCESS)
|
|
return status;
|
|
|
|
available = true;
|
|
}
|
|
|
|
bool write_results = available || (flags & VK_QUERY_RESULT_PARTIAL_BIT);
|
|
|
|
const struct hk_query_report *src = hk_query_report_map(dev, pool, query);
|
|
assert(i * stride < dataSize);
|
|
void *dst = (char *)pData + i * stride;
|
|
|
|
uint32_t reports = hk_reports_per_query(pool);
|
|
if (write_results) {
|
|
for (uint32_t j = 0; j < reports; j++) {
|
|
cpu_write_query_result(dst, j, flags, src[j].value);
|
|
}
|
|
}
|
|
|
|
if (!write_results)
|
|
status = VK_NOT_READY;
|
|
|
|
if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)
|
|
cpu_write_query_result(dst, reports, flags, available);
|
|
}
|
|
|
|
return status;
|
|
}
|
|
|
|
VKAPI_ATTR void VKAPI_CALL
|
|
hk_CmdCopyQueryPoolResults(VkCommandBuffer commandBuffer, VkQueryPool queryPool,
|
|
uint32_t firstQuery, uint32_t queryCount,
|
|
VkBuffer dstBuffer, VkDeviceSize dstOffset,
|
|
VkDeviceSize stride, VkQueryResultFlags flags)
|
|
{
|
|
VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer);
|
|
VK_FROM_HANDLE(hk_query_pool, pool, queryPool);
|
|
VK_FROM_HANDLE(hk_buffer, dst_buffer, dstBuffer);
|
|
|
|
struct hk_device *dev = hk_cmd_buffer_device(cmd);
|
|
hk_flush_if_timestamp(cmd, pool);
|
|
|
|
perf_debug(cmd, "Query pool copy");
|
|
|
|
struct libagx_copy_query_args info = {
|
|
.availability = hk_has_available(pool) ? pool->bo->va->addr : 0,
|
|
.results = pool->oq_queries ? dev->occlusion_queries.bo->va->addr
|
|
: pool->bo->va->addr + pool->query_start,
|
|
.oq_index = pool->oq_queries ? pool->bo->va->addr + pool->query_start : 0,
|
|
|
|
.first_query = firstQuery,
|
|
.dst_addr = hk_buffer_address_rw(dst_buffer, dstOffset),
|
|
.dst_stride = stride,
|
|
.reports_per_query = hk_reports_per_query(pool),
|
|
.flags = flags,
|
|
};
|
|
|
|
libagx_copy_query_struct(cmd, agx_1d(queryCount), AGX_BARRIER_ALL, info);
|
|
}
|