tu: Rework pipeline stat queries

Drop the legacy register offset macros.  And re-work how we map the vk
query to pipeline stat offset, to account for re-ordering of the
pipeline stat regs in gen8.

Signed-off-by: Rob Clark <rob.clark@oss.qualcomm.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/39029>
This commit is contained in:
Rob Clark 2025-12-17 14:40:23 -08:00 committed by Marge Bot
parent d9f1f0e9b9
commit 76cd06426d

View file

@ -27,8 +27,19 @@
#define NSEC_PER_SEC 1000000000ull #define NSEC_PER_SEC 1000000000ull
#define WAIT_TIMEOUT 5 #define WAIT_TIMEOUT 5
#define STAT_COUNT ((REG_A6XX_RBBM_PIPESTAT_CSINVOCATIONS - REG_A6XX_RBBM_PIPESTAT_IAVERTICES) / 2 + 1) #define __COUNTER_REG(CHIP, name) __RBBM_PIPESTAT_ ## name <CHIP>({}).reg
#define COUNTER_REG(name) __RBBM_PIPESTAT_ ## name <CHIP>({}).reg #define COUNTER_REG(name) __COUNTER_REG(CHIP, name)
/* Note: gen8 changes the order of the pipestat regs, but in either case
* they ones we are interested in are consecutive, so for the purposes of
* knowning how many values to read we can just use A6XX reg addresses.
*
* And in both cases, RBBM_PIPESTAT_IAVERTICES is the first one.
*
* Depending on how/if they shuffle around in the future, we might need
* to shift to reading them individually, like gallium does.
*/
#define STAT_COUNT ((__COUNTER_REG(A6XX, CSINVOCATIONS) - __COUNTER_REG(A6XX, IAVERTICES)) / 2 + 1)
struct PACKED query_slot { struct PACKED query_slot {
uint64_t available; uint64_t available;
@ -464,35 +475,38 @@ get_result_count(struct tu_query_pool *pool)
} }
} }
template <chip CHIP>
static uint32_t static uint32_t
statistics_index(uint32_t *statistics) statistics_index(uint32_t *statistics)
{ {
uint32_t stat; uint32_t stat;
stat = u_bit_scan(statistics); stat = u_bit_scan(statistics);
#define COUNTER_OFFSET(name) ((COUNTER_REG(name) - COUNTER_REG(IAVERTICES)) / 2)
switch (1 << stat) { switch (1 << stat) {
case VK_QUERY_PIPELINE_STATISTIC_INPUT_ASSEMBLY_VERTICES_BIT: case VK_QUERY_PIPELINE_STATISTIC_INPUT_ASSEMBLY_VERTICES_BIT:
return 0; return COUNTER_OFFSET(IAVERTICES);
case VK_QUERY_PIPELINE_STATISTIC_INPUT_ASSEMBLY_PRIMITIVES_BIT: case VK_QUERY_PIPELINE_STATISTIC_INPUT_ASSEMBLY_PRIMITIVES_BIT:
return 1; return COUNTER_OFFSET(IAPRIMITIVES);
case VK_QUERY_PIPELINE_STATISTIC_VERTEX_SHADER_INVOCATIONS_BIT: case VK_QUERY_PIPELINE_STATISTIC_VERTEX_SHADER_INVOCATIONS_BIT:
return 2; return COUNTER_OFFSET(VSINVOCATIONS);
case VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_INVOCATIONS_BIT: case VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_INVOCATIONS_BIT:
return 5; return COUNTER_OFFSET(GSINVOCATIONS);
case VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT: case VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT:
return 6; return COUNTER_OFFSET(GSPRIMITIVES);
case VK_QUERY_PIPELINE_STATISTIC_CLIPPING_INVOCATIONS_BIT: case VK_QUERY_PIPELINE_STATISTIC_CLIPPING_INVOCATIONS_BIT:
return 7; return COUNTER_OFFSET(CINVOCATIONS);
case VK_QUERY_PIPELINE_STATISTIC_CLIPPING_PRIMITIVES_BIT: case VK_QUERY_PIPELINE_STATISTIC_CLIPPING_PRIMITIVES_BIT:
return 8; return COUNTER_OFFSET(CPRIMITIVES);
case VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT: case VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT:
return 9; return COUNTER_OFFSET(PSINVOCATIONS);
case VK_QUERY_PIPELINE_STATISTIC_TESSELLATION_CONTROL_SHADER_PATCHES_BIT: case VK_QUERY_PIPELINE_STATISTIC_TESSELLATION_CONTROL_SHADER_PATCHES_BIT:
return 3; return COUNTER_OFFSET(HSINVOCATIONS);
case VK_QUERY_PIPELINE_STATISTIC_TESSELLATION_EVALUATION_SHADER_INVOCATIONS_BIT: case VK_QUERY_PIPELINE_STATISTIC_TESSELLATION_EVALUATION_SHADER_INVOCATIONS_BIT:
return 4; return COUNTER_OFFSET(DSINVOCATIONS);
case VK_QUERY_PIPELINE_STATISTIC_COMPUTE_SHADER_INVOCATIONS_BIT: case VK_QUERY_PIPELINE_STATISTIC_COMPUTE_SHADER_INVOCATIONS_BIT:
return 10; return COUNTER_OFFSET(CSINVOCATIONS);
default: default:
return 0; return 0;
} }
@ -589,6 +603,7 @@ write_performance_query_value_cpu(char *base,
} }
} }
template <chip CHIP>
static VkResult static VkResult
get_query_pool_results(struct tu_device *device, get_query_pool_results(struct tu_device *device,
struct tu_query_pool *pool, struct tu_query_pool *pool,
@ -635,7 +650,7 @@ get_query_pool_results(struct tu_device *device,
uint64_t *result; uint64_t *result;
if (pool->vk.query_type == VK_QUERY_TYPE_PIPELINE_STATISTICS) { if (pool->vk.query_type == VK_QUERY_TYPE_PIPELINE_STATISTICS) {
uint32_t stat_idx = statistics_index(&statistics); uint32_t stat_idx = statistics_index<CHIP>(&statistics);
result = query_result_addr(pool, query, uint64_t, stat_idx); result = query_result_addr(pool, query, uint64_t, stat_idx);
} else if (is_perf_query_raw(pool)) { } else if (is_perf_query_raw(pool)) {
result = query_result_addr(pool, query, struct perfcntr_query_slot, k); result = query_result_addr(pool, query, struct perfcntr_query_slot, k);
@ -704,6 +719,7 @@ get_query_pool_results(struct tu_device *device,
return result; return result;
} }
template <chip CHIP>
VKAPI_ATTR VkResult VKAPI_CALL VKAPI_ATTR VkResult VKAPI_CALL
tu_GetQueryPoolResults(VkDevice _device, tu_GetQueryPoolResults(VkDevice _device,
VkQueryPool queryPool, VkQueryPool queryPool,
@ -732,13 +748,14 @@ tu_GetQueryPoolResults(VkDevice _device,
case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR: case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR:
case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_BOTTOM_LEVEL_POINTERS_KHR: case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_BOTTOM_LEVEL_POINTERS_KHR:
case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SIZE_KHR: case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SIZE_KHR:
return get_query_pool_results(device, pool, firstQuery, queryCount, return get_query_pool_results<CHIP>(device, pool, firstQuery, queryCount,
dataSize, pData, stride, flags); dataSize, pData, stride, flags);
default: default:
assert(!"Invalid query type"); assert(!"Invalid query type");
} }
return VK_SUCCESS; return VK_SUCCESS;
} }
TU_GENX(tu_GetQueryPoolResults);
/* Copies a query value from one buffer to another from the GPU. */ /* Copies a query value from one buffer to another from the GPU. */
static void static void
@ -809,7 +826,7 @@ emit_copy_query_pool_results(struct tu_cmd_buffer *cmdbuf,
uint64_t result_iova; uint64_t result_iova;
if (pool->vk.query_type == VK_QUERY_TYPE_PIPELINE_STATISTICS) { if (pool->vk.query_type == VK_QUERY_TYPE_PIPELINE_STATISTICS) {
uint32_t stat_idx = statistics_index(&statistics); uint32_t stat_idx = statistics_index<CHIP>(&statistics);
result_iova = query_result_iova(pool, query, uint64_t, stat_idx); result_iova = query_result_iova(pool, query, uint64_t, stat_idx);
} else if (pool->vk.query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) { } else if (pool->vk.query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
result_iova = query_result_iova(pool, query, result_iova = query_result_iova(pool, query,
@ -896,6 +913,7 @@ tu_CmdCopyQueryPoolResults(VkCommandBuffer commandBuffer,
} }
TU_GENX(tu_CmdCopyQueryPoolResults); TU_GENX(tu_CmdCopyQueryPoolResults);
template <chip CHIP>
static void static void
emit_reset_query_pool(struct tu_cmd_buffer *cmdbuf, emit_reset_query_pool(struct tu_cmd_buffer *cmdbuf,
struct tu_query_pool *pool, struct tu_query_pool *pool,
@ -916,7 +934,7 @@ emit_reset_query_pool(struct tu_cmd_buffer *cmdbuf,
uint64_t result_iova; uint64_t result_iova;
if (pool->vk.query_type == VK_QUERY_TYPE_PIPELINE_STATISTICS) { if (pool->vk.query_type == VK_QUERY_TYPE_PIPELINE_STATISTICS) {
uint32_t stat_idx = statistics_index(&statistics); uint32_t stat_idx = statistics_index<CHIP>(&statistics);
result_iova = query_result_iova(pool, query, uint64_t, stat_idx); result_iova = query_result_iova(pool, query, uint64_t, stat_idx);
} else if (is_perf_query_raw(pool)) { } else if (is_perf_query_raw(pool)) {
result_iova = query_result_iova(pool, query, result_iova = query_result_iova(pool, query,
@ -950,6 +968,7 @@ emit_reset_query_pool(struct tu_cmd_buffer *cmdbuf,
} }
template <chip CHIP>
VKAPI_ATTR void VKAPI_CALL VKAPI_ATTR void VKAPI_CALL
tu_CmdResetQueryPool(VkCommandBuffer commandBuffer, tu_CmdResetQueryPool(VkCommandBuffer commandBuffer,
VkQueryPool queryPool, VkQueryPool queryPool,
@ -970,12 +989,13 @@ tu_CmdResetQueryPool(VkCommandBuffer commandBuffer,
case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR: case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR:
case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_BOTTOM_LEVEL_POINTERS_KHR: case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_BOTTOM_LEVEL_POINTERS_KHR:
case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SIZE_KHR: case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SIZE_KHR:
emit_reset_query_pool(cmdbuf, pool, firstQuery, queryCount); emit_reset_query_pool<CHIP>(cmdbuf, pool, firstQuery, queryCount);
break; break;
default: default:
assert(!"Invalid query type"); assert(!"Invalid query type");
} }
} }
TU_GENX(tu_CmdResetQueryPool);
VKAPI_ATTR void VKAPI_CALL VKAPI_ATTR void VKAPI_CALL
tu_ResetQueryPool(VkDevice device, tu_ResetQueryPool(VkDevice device,
@ -1148,7 +1168,7 @@ emit_begin_stat_query(struct tu_cmd_buffer *cmdbuf,
tu_cs_emit_wfi(cs); tu_cs_emit_wfi(cs);
tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3); tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(REG_A6XX_RBBM_PIPESTAT_IAVERTICES) | tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(COUNTER_REG(IAVERTICES)) |
CP_REG_TO_MEM_0_CNT(STAT_COUNT * 2) | CP_REG_TO_MEM_0_CNT(STAT_COUNT * 2) |
CP_REG_TO_MEM_0_64B); CP_REG_TO_MEM_0_64B);
tu_cs_emit_qw(cs, begin_iova); tu_cs_emit_qw(cs, begin_iova);
@ -1634,7 +1654,7 @@ emit_end_stat_query(struct tu_cmd_buffer *cmdbuf,
tu_cs_emit_wfi(cs); tu_cs_emit_wfi(cs);
tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3); tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(REG_A6XX_RBBM_PIPESTAT_IAVERTICES) | tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(COUNTER_REG(IAVERTICES)) |
CP_REG_TO_MEM_0_CNT(STAT_COUNT * 2) | CP_REG_TO_MEM_0_CNT(STAT_COUNT * 2) |
CP_REG_TO_MEM_0_64B); CP_REG_TO_MEM_0_64B);
tu_cs_emit_qw(cs, end_iova); tu_cs_emit_qw(cs, end_iova);