anv: Use vk_query_pool

Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/24409>
This commit is contained in:
Faith Ekstrand 2023-07-31 11:30:37 -05:00 committed by Marge Bot
parent f2930ec5dd
commit e4485bc062
2 changed files with 49 additions and 53 deletions

View file

@ -91,6 +91,7 @@
#include "vk_sync.h"
#include "vk_sync_timeline.h"
#include "vk_util.h"
#include "vk_query_pool.h"
#include "vk_queue.h"
#include "vk_log.h"
#include "vk_ycbcr_conversion.h"
@@ -4546,14 +4547,11 @@ struct anv_sampler {
#define ANV_PIPELINE_STATISTICS_MASK 0x000007ff
struct anv_query_pool {
struct vk_object_base base;
struct vk_query_pool vk;
VkQueryType type;
VkQueryPipelineStatisticFlags pipeline_statistics;
/** Stride between slots, in bytes */
uint32_t stride;
/** Number of slots in this query pool */
uint32_t slots;
struct anv_bo * bo;
/** Location for the KHR_performance_query small batch updating
@@ -4775,7 +4773,7 @@ VK_DEFINE_NONDISP_HANDLE_CASTS(anv_pipeline, base, VkPipeline,
VK_OBJECT_TYPE_PIPELINE)
VK_DEFINE_NONDISP_HANDLE_CASTS(anv_pipeline_layout, base, VkPipelineLayout,
VK_OBJECT_TYPE_PIPELINE_LAYOUT)
VK_DEFINE_NONDISP_HANDLE_CASTS(anv_query_pool, base, VkQueryPool,
VK_DEFINE_NONDISP_HANDLE_CASTS(anv_query_pool, vk.base, VkQueryPool,
VK_OBJECT_TYPE_QUERY_POOL)
VK_DEFINE_NONDISP_HANDLE_CASTS(anv_sampler, vk.base, VkSampler,
VK_OBJECT_TYPE_SAMPLER)

View file

@@ -195,20 +195,18 @@ VkResult genX(CreateQueryPool)(
assert(!"Invalid query type");
}
if (!vk_object_multialloc(&device->vk, &ma, pAllocator,
VK_OBJECT_TYPE_QUERY_POOL))
if (!vk_multialloc_zalloc2(&ma, &device->vk.alloc, pAllocator,
VK_SYSTEM_ALLOCATION_SCOPE_OBJECT))
return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
pool->type = pCreateInfo->queryType;
pool->pipeline_statistics = pipeline_statistics;
vk_query_pool_init(&device->vk, &pool->vk, pCreateInfo);
pool->stride = uint64s_per_slot * sizeof(uint64_t);
pool->slots = pCreateInfo->queryCount;
if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL) {
if (pool->vk.query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL) {
pool->data_offset = data_offset;
pool->snapshot_size = (pool->stride - data_offset) / 2;
}
else if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
else if (pool->vk.query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
pool->pass_size = pool->stride / n_passes;
pool->data_offset = data_offset;
pool->snapshot_size = (pool->pass_size - data_offset) / 2;
@@ -226,12 +224,12 @@ VkResult genX(CreateQueryPool)(
pool->pass_query);
}
uint64_t size = pool->slots * (uint64_t)pool->stride;
uint64_t size = pool->vk.query_count * (uint64_t)pool->stride;
/* For KHR_performance_query we need some space in the buffer for a small
* batch updating ANV_PERF_QUERY_OFFSET_REG.
*/
if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
if (pool->vk.query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
pool->khr_perf_preamble_stride = 32;
pool->khr_perf_preambles_offset = size;
size += (uint64_t)pool->n_passes * pool->khr_perf_preamble_stride;
@@ -245,7 +243,7 @@ VkResult genX(CreateQueryPool)(
if (result != VK_SUCCESS)
goto fail;
if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
if (pool->vk.query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
for (uint32_t p = 0; p < pool->n_passes; p++) {
struct mi_builder b;
struct anv_batch batch = {
@@ -422,7 +420,7 @@ query_slot(struct anv_query_pool *pool, uint32_t query)
static bool
query_is_available(struct anv_query_pool *pool, uint32_t query)
{
if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
if (pool->vk.query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
for (uint32_t p = 0; p < pool->n_passes; p++) {
volatile uint64_t *slot =
pool->bo->map + khr_perf_query_availability_offset(pool, query, p);
@@ -441,7 +439,7 @@ wait_for_available(struct anv_device *device,
{
/* By default we leave a 2s timeout before declaring the device lost. */
uint64_t rel_timeout = 2 * NSEC_PER_SEC;
if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
if (pool->vk.query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
/* With performance queries, there is an additional 500us reconfiguration
* time in i915.
*/
@@ -479,19 +477,19 @@ VkResult genX(GetQueryPoolResults)(
assert(
#if GFX_VERx10 >= 125
pool->type == VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR ||
pool->type == VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR ||
pool->type == VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SIZE_KHR ||
pool->type == VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_BOTTOM_LEVEL_POINTERS_KHR ||
pool->vk.query_type == VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR ||
pool->vk.query_type == VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR ||
pool->vk.query_type == VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SIZE_KHR ||
pool->vk.query_type == VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_BOTTOM_LEVEL_POINTERS_KHR ||
#endif
pool->type == VK_QUERY_TYPE_OCCLUSION ||
pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS ||
pool->type == VK_QUERY_TYPE_TIMESTAMP ||
pool->type == VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT ||
pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR ||
pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL ||
pool->type == VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT ||
pool->type == VK_QUERY_TYPE_RESULT_STATUS_ONLY_KHR);
pool->vk.query_type == VK_QUERY_TYPE_OCCLUSION ||
pool->vk.query_type == VK_QUERY_TYPE_PIPELINE_STATISTICS ||
pool->vk.query_type == VK_QUERY_TYPE_TIMESTAMP ||
pool->vk.query_type == VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT ||
pool->vk.query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR ||
pool->vk.query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL ||
pool->vk.query_type == VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT ||
pool->vk.query_type == VK_QUERY_TYPE_RESULT_STATUS_ONLY_KHR);
if (vk_device_is_lost(&device->vk))
return VK_ERROR_DEVICE_LOST;
@@ -532,7 +530,7 @@ VkResult genX(GetQueryPoolResults)(
bool write_results = available || (flags & VK_QUERY_RESULT_PARTIAL_BIT);
uint32_t idx = 0;
switch (pool->type) {
switch (pool->vk.query_type) {
case VK_QUERY_TYPE_OCCLUSION:
case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT: {
uint64_t *slot = query_slot(pool, firstQuery + i);
@@ -553,7 +551,7 @@ VkResult genX(GetQueryPoolResults)(
case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
uint64_t *slot = query_slot(pool, firstQuery + i);
uint32_t statistics = pool->pipeline_statistics;
uint32_t statistics = pool->vk.pipeline_statistics;
while (statistics) {
UNUSED uint32_t stat = u_bit_scan(&statistics);
if (write_results) {
@@ -562,7 +560,7 @@ VkResult genX(GetQueryPoolResults)(
}
idx++;
}
assert(idx == util_bitcount(pool->pipeline_statistics));
assert(idx == util_bitcount(pool->vk.pipeline_statistics));
break;
}
@@ -710,7 +708,7 @@ emit_zero_queries(struct anv_cmd_buffer *cmd_buffer,
struct mi_builder *b, struct anv_query_pool *pool,
uint32_t first_index, uint32_t num_queries)
{
switch (pool->type) {
switch (pool->vk.query_type) {
case VK_QUERY_TYPE_OCCLUSION:
case VK_QUERY_TYPE_TIMESTAMP:
/* These queries are written with a PIPE_CONTROL so clear them using the
@@ -801,7 +799,7 @@ void genX(CmdResetQueryPool)(
trace_intel_begin_query_clear_cs(&cmd_buffer->trace);
switch (pool->type) {
switch (pool->vk.query_type) {
case VK_QUERY_TYPE_OCCLUSION:
#if GFX_VERx10 >= 125
case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR:
@@ -887,7 +885,7 @@ void genX(ResetQueryPool)(
ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
for (uint32_t i = 0; i < queryCount; i++) {
if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
if (pool->vk.query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
for (uint32_t p = 0; p < pool->n_passes; p++) {
uint64_t *pass_slot = pool->bo->map +
khr_perf_query_availability_offset(pool, firstQuery + i, p);
@@ -1023,7 +1021,7 @@ void genX(CmdBeginQueryIndexedEXT)(
struct mi_builder b;
mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
switch (pool->type) {
switch (pool->vk.query_type) {
case VK_QUERY_TYPE_OCCLUSION:
cmd_buffer->state.gfx.n_occlusion_queries++;
emit_ps_depth_count(cmd_buffer, anv_address_add(query_addr, 8));
@@ -1045,7 +1043,7 @@ void genX(CmdBeginQueryIndexedEXT)(
ANV_PIPE_CS_STALL_BIT |
ANV_PIPE_STALL_AT_SCOREBOARD_BIT);
uint32_t statistics = pool->pipeline_statistics;
uint32_t statistics = pool->vk.pipeline_statistics;
uint32_t offset = 8;
while (statistics) {
uint32_t stat = u_bit_scan(&statistics);
@@ -1213,7 +1211,7 @@ void genX(CmdEndQueryIndexedEXT)(
struct mi_builder b;
mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
switch (pool->type) {
switch (pool->vk.query_type) {
case VK_QUERY_TYPE_OCCLUSION:
emit_ps_depth_count(cmd_buffer, anv_address_add(query_addr, 16));
emit_query_pc_availability(cmd_buffer, query_addr, true);
@@ -1241,7 +1239,7 @@ void genX(CmdEndQueryIndexedEXT)(
ANV_PIPE_CS_STALL_BIT |
ANV_PIPE_STALL_AT_SCOREBOARD_BIT);
uint32_t statistics = pool->pipeline_statistics;
uint32_t statistics = pool->vk.pipeline_statistics;
uint32_t offset = 16;
while (statistics) {
uint32_t stat = u_bit_scan(&statistics);
@@ -1388,7 +1386,7 @@ void genX(CmdWriteTimestamp2)(
ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
struct anv_address query_addr = anv_query_address(pool, query);
assert(pool->type == VK_QUERY_TYPE_TIMESTAMP);
assert(pool->vk.query_type == VK_QUERY_TYPE_TIMESTAMP);
emit_query_clear_flush(cmd_buffer, pool,
"CmdWriteTimestamp flush query clears");
@@ -1541,8 +1539,8 @@ copy_query_results_with_cs(struct anv_cmd_buffer *cmd_buffer,
* previous uses of vkCmdResetQueryPool in the same queue, without any
* additional synchronization."
*/
if (pool->type == VK_QUERY_TYPE_OCCLUSION ||
pool->type == VK_QUERY_TYPE_TIMESTAMP)
if (pool->vk.query_type == VK_QUERY_TYPE_OCCLUSION ||
pool->vk.query_type == VK_QUERY_TYPE_TIMESTAMP)
needed_flushes |= ANV_PIPE_CS_STALL_BIT;
if (needed_flushes) {
@@ -1570,7 +1568,7 @@ copy_query_results_with_cs(struct anv_cmd_buffer *cmd_buffer,
}
uint32_t idx = 0;
switch (pool->type) {
switch (pool->vk.query_type) {
case VK_QUERY_TYPE_OCCLUSION:
case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
result = compute_query_result(&b, anv_address_add(query_addr, 8));
@@ -1589,14 +1587,14 @@ copy_query_results_with_cs(struct anv_cmd_buffer *cmd_buffer,
break;
case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
uint32_t statistics = pool->pipeline_statistics;
uint32_t statistics = pool->vk.pipeline_statistics;
while (statistics) {
UNUSED uint32_t stat = u_bit_scan(&statistics);
result = compute_query_result(&b, anv_address_add(query_addr,
idx * 16 + 8));
gpu_write_query_result(&b, dest_addr, flags, idx++, result);
}
assert(idx == util_bitcount(pool->pipeline_statistics));
assert(idx == util_bitcount(pool->vk.pipeline_statistics));
break;
}
@@ -1680,10 +1678,10 @@ copy_query_results_with_shader(struct anv_cmd_buffer *cmd_buffer,
/* Some queries are done with shaders, so we need to have them flush
* high level caches writes. The L3 should be shared across the GPU.
*/
if (pool->type == VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR ||
pool->type == VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SIZE_KHR ||
pool->type == VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR ||
pool->type == VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_BOTTOM_LEVEL_POINTERS_KHR) {
if (pool->vk.query_type == VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR ||
pool->vk.query_type == VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SIZE_KHR ||
pool->vk.query_type == VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR ||
pool->vk.query_type == VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_BOTTOM_LEVEL_POINTERS_KHR) {
needed_flushes |= ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT;
}
/* And we need to stall for previous CS writes to land or the flushes to
@@ -1703,8 +1701,8 @@ copy_query_results_with_shader(struct anv_cmd_buffer *cmd_buffer,
* previous uses of vkCmdResetQueryPool in the same queue, without any
* additional synchronization."
*/
if (pool->type == VK_QUERY_TYPE_OCCLUSION ||
pool->type == VK_QUERY_TYPE_TIMESTAMP)
if (pool->vk.query_type == VK_QUERY_TYPE_OCCLUSION ||
pool->vk.query_type == VK_QUERY_TYPE_TIMESTAMP)
needed_flushes |= ANV_PIPE_CS_STALL_BIT;
if (needed_flushes) {
@@ -1736,7 +1734,7 @@ copy_query_results_with_shader(struct anv_cmd_buffer *cmd_buffer,
uint32_t num_items = 1;
uint32_t data_offset = 8 /* behind availability */;
switch (pool->type) {
switch (pool->vk.query_type) {
case VK_QUERY_TYPE_OCCLUSION:
case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
copy_flags |= ANV_COPY_QUERY_FLAG_DELTA;
@@ -1749,7 +1747,7 @@ copy_query_results_with_shader(struct anv_cmd_buffer *cmd_buffer,
break;
case VK_QUERY_TYPE_PIPELINE_STATISTICS:
num_items = util_bitcount(pool->pipeline_statistics);
num_items = util_bitcount(pool->vk.pipeline_statistics);
copy_flags |= ANV_COPY_QUERY_FLAG_DELTA;
break;