tu: support exposing derived counters through VK_KHR_performance_query

Until now, Turnip's VK_KHR_performance_query implementation only exposed raw
perfcounters. These aren't exactly trivial to evaluate on their own.

That mode can still be used through the new TU_DEBUG_PERFCRAW flag. The
existing TU_DEBUG_PERFC flag now enables a performance-query mode in which
Freedreno's derived counters are exposed instead. These default to command
scope, making them usable with RenderDoc's performance counter capture.

Signed-off-by: Zan Dobersek <zdobersek@igalia.com>
Reviewed-by: Danylo Piliaiev <dpiliaiev@igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/33208>
Zan Dobersek, 2025-03-03 09:08:51 +01:00, committed by Marge Bot
parent 27fd2d1ad1
commit 86456cf0e6
5 changed files with 405 additions and 98 deletions


@@ -198,7 +198,7 @@ get_device_extensions(const struct tu_physical_device *device,
.KHR_maintenance6 = true,
.KHR_map_memory2 = true,
.KHR_multiview = TU_DEBUG(NOCONFORM) ? true : device->info->a6xx.has_hw_multiview,
.KHR_performance_query = TU_DEBUG(PERFC),
.KHR_performance_query = TU_DEBUG(PERFC) || TU_DEBUG(PERFCRAW),
.KHR_pipeline_executable_properties = true,
.KHR_pipeline_library = true,
#ifdef TU_USE_WSI_PLATFORM


@@ -82,7 +82,7 @@ struct PACKED perfcntr_query_slot {
uint64_t end;
};
struct PACKED perf_query_slot {
struct PACKED perf_query_raw_slot {
struct query_slot common;
struct perfcntr_query_slot perfcntr;
};
@@ -127,6 +127,20 @@ struct PACKED accel_struct_slot {
sizeof(struct perfcntr_query_slot) * (i) + \
offsetof(struct perfcntr_query_slot, field)
#define perf_query_derived_perfcntr_iova(pool, query, field, i) \
pool->bo->iova + pool->query_stride * (query) + \
sizeof(struct query_slot) + \
sizeof(uint64_t) * pool->perf_query.derived.counter_index_count + \
sizeof(struct perfcntr_query_slot) * (i) + \
offsetof(struct perfcntr_query_slot, field)
#define perf_query_derived_perfcntr_addr(pool, query, field, i) \
(uint64_t *) ((char *) pool->bo->map + pool->query_stride * (query) + \
sizeof(struct query_slot) + \
sizeof(uint64_t) * pool->perf_query.derived.counter_index_count + \
sizeof(struct perfcntr_query_slot) * (i) + \
offsetof(struct perfcntr_query_slot, field))
#define primitives_generated_query_iova(pool, query, field) \
query_iova(struct primitives_generated_query_slot, pool, query, field)
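
The address arithmetic above implies the following per-query slot layout for
derived-mode pools; this is only a reading of the sizes and offsets in this
change, sketched as a comment:

/* Per-query slot layout in derived mode, as implied by the macros above:
 *
 *   struct query_slot common;                  availability word
 *   uint64_t results[counter_index_count];     one derived value per counter
 *                                              requested by the application
 *   struct perfcntr_query_slot perfcntrs[N];   begin/end/result storage for
 *                                              each of the N enabled raw
 *                                              perfcntrs backing the set
 */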
@@ -192,6 +206,20 @@ slot_address(struct tu_query_pool *pool, uint32_t query)
query * pool->query_stride);
}
static bool
is_perf_query_raw(struct tu_query_pool *pool)
{
return pool->vk.query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR &&
pool->perf_query_type == TU_PERF_QUERY_TYPE_RAW;
}
static bool
is_perf_query_derived(struct tu_query_pool *pool)
{
return pool->vk.query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR &&
pool->perf_query_type == TU_PERF_QUERY_TYPE_DERIVED;
}
static void
perfcntr_index(const struct fd_perfcntr_group *group, uint32_t group_count,
uint32_t index, uint32_t *gid, uint32_t *cid)
@@ -214,8 +242,8 @@ perfcntr_index(const struct fd_perfcntr_group *group, uint32_t group_count,
static int
compare_perfcntr_pass(const void *a, const void *b)
{
return ((struct tu_perf_query_data *)a)->pass -
((struct tu_perf_query_data *)b)->pass;
return ((struct tu_perf_query_raw_data *)a)->pass -
((struct tu_perf_query_raw_data *)b)->pass;
}
VKAPI_ATTR VkResult VKAPI_CALL
@@ -230,6 +258,7 @@ tu_CreateQueryPool(VkDevice _device,
uint32_t pool_size, slot_size;
const VkQueryPoolPerformanceCreateInfoKHR *perf_query_info = NULL;
enum tu_perf_query_type perf_query_type = TU_PERF_QUERY_TYPE_NONE;
pool_size = sizeof(struct tu_query_pool);
@@ -252,21 +281,31 @@ tu_CreateQueryPool(VkDevice _device,
QUERY_POOL_PERFORMANCE_CREATE_INFO_KHR);
assert(perf_query_info);
slot_size = sizeof(struct perf_query_slot) +
sizeof(struct perfcntr_query_slot) *
(perf_query_info->counterIndexCount - 1);
if (TU_DEBUG(PERFCRAW)) {
perf_query_type = TU_PERF_QUERY_TYPE_RAW;
/* Size of the array pool->tu_perf_query_data */
pool_size += sizeof(struct tu_perf_query_data) *
perf_query_info->counterIndexCount;
slot_size = sizeof(struct perf_query_raw_slot) +
sizeof(struct perfcntr_query_slot) *
(perf_query_info->counterIndexCount - 1);
/* Size of the array pool->perf_query.raw.data */
pool_size += sizeof(struct tu_perf_query_raw_data) *
perf_query_info->counterIndexCount;
} else {
perf_query_type = TU_PERF_QUERY_TYPE_DERIVED;
slot_size = sizeof(struct query_slot) +
sizeof(uint64_t) * perf_query_info->counterIndexCount;
pool_size += sizeof(fd_derived_counter_collection);
}
break;
}
case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR:
case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR:
case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_BOTTOM_LEVEL_POINTERS_KHR:
case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SIZE_KHR:
slot_size = sizeof(struct accel_struct_slot);
break;
}
case VK_QUERY_TYPE_PIPELINE_STATISTICS:
slot_size = sizeof(struct pipeline_stat_query_slot);
break;
@@ -280,11 +319,14 @@ tu_CreateQueryPool(VkDevice _device,
if (!pool)
return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
if (pCreateInfo->queryType == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
pool->perf_group = fd_perfcntrs(&device->physical_device->dev_id,
&pool->perf_group_count);
pool->perf_query_type = perf_query_type;
pool->counter_index_count = perf_query_info->counterIndexCount;
if (is_perf_query_raw(pool)) {
struct tu_perf_query_raw *perf_query = &pool->perf_query.raw;
perf_query->perf_group = fd_perfcntrs(&device->physical_device->dev_id,
&perf_query->perf_group_count);
perf_query->counter_index_count = perf_query_info->counterIndexCount;
/* Build all perf counters data that is requested, so we could get
* correct group id, countable id, counter register and pass index with
@@ -294,29 +336,29 @@ tu_CreateQueryPool(VkDevice _device,
* should keep the original indices and store perfcntrs results according
* to them so apps can get correct results with their own indices.
*/
uint32_t regs[pool->perf_group_count], pass[pool->perf_group_count];
memset(regs, 0x00, pool->perf_group_count * sizeof(regs[0]));
memset(pass, 0x00, pool->perf_group_count * sizeof(pass[0]));
uint32_t regs[perf_query->perf_group_count], pass[perf_query->perf_group_count];
memset(regs, 0x00, perf_query->perf_group_count * sizeof(regs[0]));
memset(pass, 0x00, perf_query->perf_group_count * sizeof(pass[0]));
for (uint32_t i = 0; i < pool->counter_index_count; i++) {
for (uint32_t i = 0; i < perf_query->counter_index_count; i++) {
uint32_t gid = 0, cid = 0;
perfcntr_index(pool->perf_group, pool->perf_group_count,
perfcntr_index(perf_query->perf_group, perf_query->perf_group_count,
perf_query_info->pCounterIndices[i], &gid, &cid);
pool->perf_query_data[i].gid = gid;
pool->perf_query_data[i].cid = cid;
pool->perf_query_data[i].app_idx = i;
perf_query->data[i].gid = gid;
perf_query->data[i].cid = cid;
perf_query->data[i].app_idx = i;
/* When a counter register is over the capacity(num_counters),
* reset it for next pass.
*/
if (regs[gid] < pool->perf_group[gid].num_counters) {
pool->perf_query_data[i].cntr_reg = regs[gid]++;
pool->perf_query_data[i].pass = pass[gid];
if (regs[gid] < perf_query->perf_group[gid].num_counters) {
perf_query->data[i].cntr_reg = regs[gid]++;
perf_query->data[i].pass = pass[gid];
} else {
pool->perf_query_data[i].pass = ++pass[gid];
pool->perf_query_data[i].cntr_reg = regs[gid] = 0;
perf_query->data[i].pass = ++pass[gid];
perf_query->data[i].cntr_reg = regs[gid] = 0;
regs[gid]++;
}
}
@@ -324,11 +366,30 @@ tu_CreateQueryPool(VkDevice _device,
/* Sort by pass index so we could easily prepare a command stream
* with the ascending order of pass index.
*/
qsort(pool->perf_query_data, pool->counter_index_count,
sizeof(pool->perf_query_data[0]),
qsort(perf_query->data, perf_query->counter_index_count,
sizeof(perf_query->data[0]),
compare_perfcntr_pass);
}
if (is_perf_query_derived(pool)) {
struct tu_perf_query_derived *perf_query = &pool->perf_query.derived;
struct fd_derived_counter_collection *collection = perf_query->collection;
perf_query->counter_index_count = perf_query_info->counterIndexCount;
perf_query->derived_counters = fd_derived_counters(&device->physical_device->dev_id,
&perf_query->derived_counters_count);
*collection = {
.num_counters = perf_query_info->counterIndexCount,
};
for (unsigned i = 0; i < collection->num_counters; ++i) {
uint32_t counter_index = perf_query_info->pCounterIndices[i];
collection->counters[i] = perf_query->derived_counters[counter_index];
}
fd_generate_derived_counter_collection(&device->physical_device->dev_id, collection);
slot_size += sizeof(struct perfcntr_query_slot) * collection->num_enabled_perfcntrs;
}
VkResult result = tu_bo_init_new(device, &pool->vk.base, &pool->bo,
pCreateInfo->queryCount * slot_size, TU_BO_ALLOC_NO_FLAGS, "query pool");
if (result != VK_SUCCESS) {
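
To make the pass assignment in the raw path above concrete: regs[gid] tracks
the next free counter register in group gid and pass[gid] the group's current
pass. With num_counters == 4 and six requested countables from one group,
countables 0-3 take registers 0-3 in pass 0; countable 4 overflows, bumping
the group to pass 1 and restarting at register 0; countable 5 gets register 1
of pass 1. The qsort then orders the entries by pass index so the command
stream can be emitted pass by pass.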
@@ -392,7 +453,10 @@ get_result_count(struct tu_query_pool *pool)
case VK_QUERY_TYPE_PIPELINE_STATISTICS:
return util_bitcount(pool->vk.pipeline_statistics);
case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
return pool->counter_index_count;
assert(is_perf_query_raw(pool) ^ is_perf_query_derived(pool));
if (is_perf_query_derived(pool))
return pool->perf_query.derived.counter_index_count;
return pool->perf_query.raw.counter_index_count;
default:
assert(!"Invalid query type");
return 0;
@@ -574,7 +638,7 @@ get_query_pool_results(struct tu_device *device,
if (pool->vk.query_type == VK_QUERY_TYPE_PIPELINE_STATISTICS) {
uint32_t stat_idx = statistics_index(&statistics);
result = query_result_addr(pool, query, uint64_t, stat_idx);
} else if (pool->vk.query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
} else if (is_perf_query_raw(pool)) {
result = query_result_addr(pool, query, struct perfcntr_query_slot, k);
} else if (pool->vk.query_type == VK_QUERY_TYPE_OCCLUSION) {
assert(k == 0);
@@ -583,10 +647,25 @@ get_query_pool_results(struct tu_device *device,
result = query_result_addr(pool, query, uint64_t, k);
}
if (pool->vk.query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
struct tu_perf_query_data *data = &pool->perf_query_data[k];
if (is_perf_query_raw(pool)) {
struct tu_perf_query_raw *perf_query = &pool->perf_query.raw;
struct tu_perf_query_raw_data *data = &perf_query->data[k];
VkPerformanceCounterStorageKHR storage =
fd_perfcntr_type_to_vk_storage[pool->perf_group[data->gid].countables[data->cid].query_type];
fd_perfcntr_type_to_vk_storage[perf_query->perf_group[data->gid].countables[data->cid].query_type];
write_performance_query_value_cpu(result_base, k, storage, result);
} else if (is_perf_query_derived(pool)) {
struct tu_perf_query_derived *perf_query = &pool->perf_query.derived;
const struct fd_derived_counter *derived_counter = perf_query->collection->counters[k];
uint64_t perfcntr_values[FD_DERIVED_COUNTER_MAX_PERFCNTRS];
for (unsigned l = 0; l < derived_counter->num_perfcntrs; ++l) {
uint8_t perfcntr_map = perf_query->collection->enabled_perfcntrs_map[derived_counter->perfcntrs[l]];
uint64_t *perfcntr_result = perf_query_derived_perfcntr_addr(pool, query, result, perfcntr_map);
perfcntr_values[l] = *perfcntr_result;
}
VkPerformanceCounterStorageKHR storage = fd_perfcntr_type_to_vk_storage[derived_counter->type];
*result = derived_counter->derive(&perf_query->collection->derivation_context, perfcntr_values);
write_performance_query_value_cpu(result_base, k, storage, result);
} else {
write_query_value_cpu(result_base, k, result, flags);
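
To give a sense of what the derive() call above does: a derived counter
combines the deltas of the raw perfcntrs it depends on into one value. A
hypothetical example in the shape this code consumes follows; the
fd_derivation_context type and everything not referenced in this diff are
assumptions, and the real derivations live on the Freedreno side:

/* Hypothetical derived counter: percentage of time the shader cores were
 * busy, from a busy-cycles perfcntr delta and an always-on cycle-counter
 * delta. Purely illustrative. */
static uint64_t
derive_sp_busy_percent(const struct fd_derivation_context *ctx,
                       const uint64_t *perfcntr_values)
{
   (void) ctx;                           /* unused in this sketch */
   uint64_t busy = perfcntr_values[0];   /* SP busy-cycles delta */
   uint64_t total = perfcntr_values[1];  /* CP_ALWAYS_COUNT cycles delta */
   return total != 0 ? busy * 100 / total : 0;
}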
@@ -840,7 +919,7 @@ emit_reset_query_pool(struct tu_cmd_buffer *cmdbuf,
if (pool->vk.query_type == VK_QUERY_TYPE_PIPELINE_STATISTICS) {
uint32_t stat_idx = statistics_index(&statistics);
result_iova = query_result_iova(pool, query, uint64_t, stat_idx);
} else if (pool->vk.query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
} else if (is_perf_query_raw(pool)) {
result_iova = query_result_iova(pool, query,
struct perfcntr_query_slot, k);
} else if (pool->vk.query_type == VK_QUERY_TYPE_OCCLUSION) {
@@ -854,6 +933,20 @@ emit_reset_query_pool(struct tu_cmd_buffer *cmdbuf,
tu_cs_emit_qw(cs, result_iova);
tu_cs_emit_qw(cs, 0x0);
}
if (is_perf_query_derived(pool)) {
/* For perf queries with derived counters, we also zero out every used
* perfcntr's result field into which counter value deltas are accumulated.
*/
struct tu_perf_query_derived *perf_query = &pool->perf_query.derived;
for (uint32_t j = 0; j < perf_query->collection->num_enabled_perfcntrs; ++j) {
uint64_t perfcntr_result_iova = perf_query_derived_perfcntr_iova(pool, query, result, j);
tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
tu_cs_emit_qw(cs, perfcntr_result_iova);
tu_cs_emit_qw(cs, 0x00);
}
}
}
}
@@ -900,7 +993,7 @@ tu_ResetQueryPool(VkDevice device,
for (uint32_t k = 0; k < get_result_count(pool); k++) {
uint64_t *res;
if (pool->vk.query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
if (is_perf_query_raw(pool)) {
res = query_result_addr(pool, i + firstQuery,
struct perfcntr_query_slot, k);
} else if (pool->vk.query_type == VK_QUERY_TYPE_OCCLUSION) {
@@ -912,6 +1005,18 @@ tu_ResetQueryPool(VkDevice device,
*res = 0;
}
if (is_perf_query_derived(pool)) {
/* For perf queries with derived counters, we also zero out every used
* perfcntr's result field into which counter value deltas are accumulated.
*/
struct tu_perf_query_derived *perf_query = &pool->perf_query.derived;
for (uint32_t j = 0; j < perf_query->collection->num_enabled_perfcntrs; ++j) {
uint64_t *perfcntr_res = perf_query_derived_perfcntr_addr(pool, i + firstQuery, result, j);
*perfcntr_res = 0;
}
}
}
}
@@ -1043,11 +1148,12 @@ emit_perfcntrs_pass_start(struct tu_cs *cs, uint32_t pass)
template <chip CHIP>
static void
emit_begin_perf_query(struct tu_cmd_buffer *cmdbuf,
struct tu_query_pool *pool,
uint32_t query)
emit_begin_perf_query_raw(struct tu_cmd_buffer *cmdbuf,
struct tu_query_pool *pool,
uint32_t query)
{
struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;
struct tu_perf_query_raw *perf_query = &pool->perf_query.raw;
uint32_t last_pass = ~0;
if (cmdbuf->state.pass) {
@@ -1083,8 +1189,8 @@ emit_begin_perf_query(struct tu_cmd_buffer *cmdbuf,
.scope = INTERRUPTS).value);
}
for (uint32_t i = 0; i < pool->counter_index_count; i++) {
struct tu_perf_query_data *data = &pool->perf_query_data[i];
for (uint32_t i = 0; i < perf_query->counter_index_count; i++) {
struct tu_perf_query_raw_data *data = &perf_query->data[i];
if (last_pass != data->pass) {
last_pass = data->pass;
@@ -1095,9 +1201,9 @@ emit_begin_perf_query(struct tu_cmd_buffer *cmdbuf,
}
const struct fd_perfcntr_counter *counter =
&pool->perf_group[data->gid].counters[data->cntr_reg];
&perf_query->perf_group[data->gid].counters[data->cntr_reg];
const struct fd_perfcntr_countable *countable =
&pool->perf_group[data->gid].countables[data->cid];
&perf_query->perf_group[data->gid].countables[data->cid];
tu_cs_emit_pkt4(cs, counter->select_reg, 1);
tu_cs_emit(cs, countable->selector);
@@ -1107,8 +1213,8 @@ emit_begin_perf_query(struct tu_cmd_buffer *cmdbuf,
last_pass = ~0;
tu_cs_emit_wfi(cs);
for (uint32_t i = 0; i < pool->counter_index_count; i++) {
struct tu_perf_query_data *data = &pool->perf_query_data[i];
for (uint32_t i = 0; i < perf_query->counter_index_count; i++) {
struct tu_perf_query_raw_data *data = &perf_query->data[i];
if (last_pass != data->pass) {
last_pass = data->pass;
@@ -1119,7 +1225,7 @@ emit_begin_perf_query(struct tu_cmd_buffer *cmdbuf,
}
const struct fd_perfcntr_counter *counter =
&pool->perf_group[data->gid].counters[data->cntr_reg];
&perf_query->perf_group[data->gid].counters[data->cntr_reg];
uint64_t begin_iova = perf_query_iova(pool, query, begin, data->app_idx);
@@ -1131,6 +1237,59 @@ emit_begin_perf_query(struct tu_cmd_buffer *cmdbuf,
tu_cond_exec_end(cs);
}
template <chip CHIP>
static void
emit_begin_perf_query_derived(struct tu_cmd_buffer *cmdbuf,
struct tu_query_pool *pool,
uint32_t query)
{
struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;
struct tu_perf_query_derived *perf_query = &pool->perf_query.derived;
tu_cs_emit_wfi(cs);
/* Keep preemption disabled for the duration of this query. This way
* changes in perfcounter values should only apply to work done during
* this query.
*/
if (CHIP == A7XX) {
tu_cs_emit_pkt7(cs, CP_SCOPE_CNTL, 1);
tu_cs_emit(cs, CP_SCOPE_CNTL_0(.disable_preemption = true,
.scope = INTERRUPTS).value);
}
for (uint32_t i = 0; i < perf_query->collection->num_enabled_perfcntrs; ++i) {
const struct fd_perfcntr_counter *counter = perf_query->collection->enabled_perfcntrs[i].counter;
unsigned countable = perf_query->collection->enabled_perfcntrs[i].countable;
tu_cs_emit_pkt4(cs, counter->select_reg, 1);
tu_cs_emit(cs, countable);
}
tu_cs_emit_wfi(cs);
/* Collect the enabled perfcntrs. Emit CP_ALWAYS_COUNT collection last, if necessary. */
for (uint32_t i = 1; i < perf_query->collection->num_enabled_perfcntrs; ++i) {
const struct fd_perfcntr_counter *counter = perf_query->collection->enabled_perfcntrs[i].counter;
uint64_t begin_iova = perf_query_derived_perfcntr_iova(pool, query, begin, i);
tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(counter->counter_reg_lo) |
CP_REG_TO_MEM_0_64B);
tu_cs_emit_qw(cs, begin_iova);
}
if (perf_query->collection->cp_always_count_enabled) {
const struct fd_perfcntr_counter *counter = perf_query->collection->enabled_perfcntrs[0].counter;
uint64_t begin_iova = perf_query_derived_perfcntr_iova(pool, query, begin, 0);
tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(counter->counter_reg_lo) |
CP_REG_TO_MEM_0_64B);
tu_cs_emit_qw(cs, begin_iova);
}
}
template <chip CHIP>
static void
emit_begin_xfb_query(struct tu_cmd_buffer *cmdbuf,
@@ -1213,7 +1372,11 @@ tu_CmdBeginQueryIndexedEXT(VkCommandBuffer commandBuffer,
emit_begin_prim_generated_query<CHIP>(cmdbuf, pool, query);
break;
case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
emit_begin_perf_query<CHIP>(cmdbuf, pool, query);
assert(pool->perf_query_type != TU_PERF_QUERY_TYPE_NONE);
if (pool->perf_query_type == TU_PERF_QUERY_TYPE_RAW)
emit_begin_perf_query_raw<CHIP>(cmdbuf, pool, query);
else if (pool->perf_query_type == TU_PERF_QUERY_TYPE_DERIVED)
emit_begin_perf_query_derived<CHIP>(cmdbuf, pool, query);
break;
case VK_QUERY_TYPE_PIPELINE_STATISTICS:
emit_begin_stat_query<CHIP>(cmdbuf, pool, query);
@@ -1457,19 +1620,25 @@ emit_end_stat_query(struct tu_cmd_buffer *cmdbuf,
template <chip CHIP>
static void
emit_end_perf_query(struct tu_cmd_buffer *cmdbuf,
struct tu_query_pool *pool,
uint32_t query)
emit_end_perf_query_raw(struct tu_cmd_buffer *cmdbuf,
struct tu_query_pool *pool,
uint32_t query)
{
struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;
struct tu_perf_query_raw *perf_query = &pool->perf_query.raw;
uint64_t available_iova = query_available_iova(pool, query);
uint64_t end_iova;
uint64_t begin_iova;
uint64_t result_iova;
uint32_t last_pass = ~0;
for (uint32_t i = 0; i < pool->counter_index_count; i++) {
struct tu_perf_query_data *data = &pool->perf_query_data[i];
/* Wait for the profiled work to finish so that collected counter values
* are as accurate as possible.
*/
tu_cs_emit_wfi(cs);
for (uint32_t i = 0; i < perf_query->counter_index_count; i++) {
struct tu_perf_query_raw_data *data = &perf_query->data[i];
if (last_pass != data->pass) {
last_pass = data->pass;
@@ -1480,7 +1649,7 @@ emit_end_perf_query(struct tu_cmd_buffer *cmdbuf,
}
const struct fd_perfcntr_counter *counter =
&pool->perf_group[data->gid].counters[data->cntr_reg];
&perf_query->perf_group[data->gid].counters[data->cntr_reg];
end_iova = perf_query_iova(pool, query, end, data->app_idx);
@@ -1494,8 +1663,8 @@ emit_end_perf_query(struct tu_cmd_buffer *cmdbuf,
last_pass = ~0;
tu_cs_emit_wfi(cs);
for (uint32_t i = 0; i < pool->counter_index_count; i++) {
struct tu_perf_query_data *data = &pool->perf_query_data[i];
for (uint32_t i = 0; i < perf_query->counter_index_count; i++) {
struct tu_perf_query_raw_data *data = &perf_query->data[i];
if (last_pass != data->pass) {
last_pass = data->pass;
@@ -1544,6 +1713,80 @@ emit_end_perf_query(struct tu_cmd_buffer *cmdbuf,
tu_cs_emit_qw(cs, 0x1);
}
template <chip CHIP>
static void
emit_end_perf_query_derived(struct tu_cmd_buffer *cmdbuf,
struct tu_query_pool *pool,
uint32_t query)
{
struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;
struct tu_perf_query_derived *perf_query = &pool->perf_query.derived;
uint64_t available_iova = query_available_iova(pool, query);
/* Wait for the profiled work to finish so that collected counter values
* are as accurate as possible.
*/
tu_cs_emit_wfi(cs);
/* Collect the enabled perfcntrs. Emit CP_ALWAYS_COUNT collection first, if necessary. */
if (perf_query->collection->cp_always_count_enabled) {
const struct fd_perfcntr_counter *counter = perf_query->collection->enabled_perfcntrs[0].counter;
uint64_t end_iova = perf_query_derived_perfcntr_iova(pool, query, end, 0);
tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(counter->counter_reg_lo) |
CP_REG_TO_MEM_0_64B);
tu_cs_emit_qw(cs, end_iova);
}
for (uint32_t i = 1; i < perf_query->collection->num_enabled_perfcntrs; ++i) {
const struct fd_perfcntr_counter *counter = perf_query->collection->enabled_perfcntrs[i].counter;
uint64_t end_iova = perf_query_derived_perfcntr_iova(pool, query, end, i);
tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(counter->counter_reg_lo) |
CP_REG_TO_MEM_0_64B);
tu_cs_emit_qw(cs, end_iova);
}
tu_cs_emit_wfi(cs);
for (uint32_t i = 0; i < perf_query->collection->num_enabled_perfcntrs; ++i) {
uint64_t result_iova = perf_query_derived_perfcntr_iova(pool, query, result, i);
uint64_t begin_iova = perf_query_derived_perfcntr_iova(pool, query, begin, i);
uint64_t end_iova = perf_query_derived_perfcntr_iova(pool, query, end, i);
/* result += end - begin */
tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 9);
tu_cs_emit(cs, CP_MEM_TO_MEM_0_WAIT_FOR_MEM_WRITES |
CP_MEM_TO_MEM_0_DOUBLE |
CP_MEM_TO_MEM_0_NEG_C);
tu_cs_emit_qw(cs, result_iova);
tu_cs_emit_qw(cs, result_iova);
tu_cs_emit_qw(cs, end_iova);
tu_cs_emit_qw(cs, begin_iova);
}
tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);
/* This reverts the preemption disablement done at the start
* of the query.
*/
if (CHIP == A7XX) {
tu_cs_emit_pkt7(cs, CP_SCOPE_CNTL, 1);
tu_cs_emit(cs, CP_SCOPE_CNTL_0(.disable_preemption = false,
.scope = INTERRUPTS).value);
}
if (cmdbuf->state.pass)
cs = &cmdbuf->draw_epilogue_cs;
/* Set the availability to 1 */
tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
tu_cs_emit_qw(cs, available_iova);
tu_cs_emit_qw(cs, 0x1);
}
template <chip CHIP>
static void
emit_end_xfb_query(struct tu_cmd_buffer *cmdbuf,
@@ -1714,7 +1957,11 @@ tu_CmdEndQueryIndexedEXT(VkCommandBuffer commandBuffer,
emit_end_prim_generated_query<CHIP>(cmdbuf, pool, query);
break;
case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
emit_end_perf_query<CHIP>(cmdbuf, pool, query);
assert(pool->perf_query_type != TU_PERF_QUERY_TYPE_NONE);
if (pool->perf_query_type == TU_PERF_QUERY_TYPE_RAW)
emit_end_perf_query_raw<CHIP>(cmdbuf, pool, query);
else if (pool->perf_query_type == TU_PERF_QUERY_TYPE_DERIVED)
emit_end_perf_query_derived<CHIP>(cmdbuf, pool, query);
break;
case VK_QUERY_TYPE_PIPELINE_STATISTICS:
emit_end_stat_query<CHIP>(cmdbuf, pool, query);
@@ -1866,29 +2113,60 @@ tu_EnumeratePhysicalDeviceQueueFamilyPerformanceQueryCountersKHR(
VkPerformanceCounterDescriptionKHR* pCounterDescriptions)
{
VK_FROM_HANDLE(tu_physical_device, phydev, physicalDevice);
uint32_t desc_count = *pCounterCount;
uint32_t group_count;
const struct fd_perfcntr_group *group =
fd_perfcntrs(&phydev->dev_id, &group_count);
VK_OUTARRAY_MAKE_TYPED(VkPerformanceCounterKHR, out, pCounters, pCounterCount);
VK_OUTARRAY_MAKE_TYPED(VkPerformanceCounterDescriptionKHR, out_desc,
pCounterDescriptions, &desc_count);
for (int i = 0; i < group_count; i++) {
for (int j = 0; j < group[i].num_countables; j++) {
if (TU_DEBUG(PERFCRAW)) {
uint32_t group_count;
const struct fd_perfcntr_group *group =
fd_perfcntrs(&phydev->dev_id, &group_count);
for (int i = 0; i < group_count; i++) {
for (int j = 0; j < group[i].num_countables; j++) {
vk_outarray_append_typed(VkPerformanceCounterKHR, &out, counter) {
counter->scope = VK_PERFORMANCE_COUNTER_SCOPE_COMMAND_BUFFER_KHR;
counter->unit =
fd_perfcntr_type_to_vk_unit[group[i].countables[j].query_type];
counter->storage =
fd_perfcntr_type_to_vk_storage[group[i].countables[j].query_type];
unsigned char sha1_result[20];
_mesa_sha1_compute(group[i].countables[j].name,
strlen(group[i].countables[j].name),
sha1_result);
memcpy(counter->uuid, sha1_result, sizeof(counter->uuid));
}
vk_outarray_append_typed(VkPerformanceCounterDescriptionKHR, &out_desc, desc) {
desc->flags = 0;
snprintf(desc->name, sizeof(desc->name),
"%s", group[i].countables[j].name);
snprintf(desc->category, sizeof(desc->category), "%s", group[i].name);
snprintf(desc->description, sizeof(desc->description),
"%s: %s performance counter",
group[i].name, group[i].countables[j].name);
}
}
}
} else {
unsigned derived_counters_count;
const struct fd_derived_counter **derived_counters =
fd_derived_counters(&phydev->dev_id, &derived_counters_count);
for (unsigned i = 0; i < derived_counters_count; ++i) {
const struct fd_derived_counter *derived_counter = derived_counters[i];
vk_outarray_append_typed(VkPerformanceCounterKHR, &out, counter) {
counter->scope = VK_PERFORMANCE_COUNTER_SCOPE_COMMAND_BUFFER_KHR;
counter->unit =
fd_perfcntr_type_to_vk_unit[group[i].countables[j].query_type];
counter->storage =
fd_perfcntr_type_to_vk_storage[group[i].countables[j].query_type];
counter->scope = VK_PERFORMANCE_COUNTER_SCOPE_COMMAND_KHR;
counter->unit = fd_perfcntr_type_to_vk_unit[derived_counter->type];
counter->storage = fd_perfcntr_type_to_vk_storage[derived_counter->type];
unsigned char sha1_result[20];
_mesa_sha1_compute(group[i].countables[j].name,
strlen(group[i].countables[j].name),
_mesa_sha1_compute(derived_counter->name, strlen(derived_counter->name),
sha1_result);
memcpy(counter->uuid, sha1_result, sizeof(counter->uuid));
}
@@ -1896,12 +2174,9 @@ tu_EnumeratePhysicalDeviceQueueFamilyPerformanceQueryCountersKHR(
vk_outarray_append_typed(VkPerformanceCounterDescriptionKHR, &out_desc, desc) {
desc->flags = 0;
snprintf(desc->name, sizeof(desc->name),
"%s", group[i].countables[j].name);
snprintf(desc->category, sizeof(desc->category), "%s", group[i].name);
snprintf(desc->description, sizeof(desc->description),
"%s: %s performance counter",
group[i].name, group[i].countables[j].name);
snprintf(desc->name, sizeof(desc->name), "%s", derived_counter->name);
snprintf(desc->category, sizeof(desc->category), "%s", derived_counter->category);
snprintf(desc->description, sizeof(desc->description), "%s", derived_counter->description);
}
}
}
@@ -1915,27 +2190,35 @@ tu_GetPhysicalDeviceQueueFamilyPerformanceQueryPassesKHR(
const VkQueryPoolPerformanceCreateInfoKHR* pPerformanceQueryCreateInfo,
uint32_t* pNumPasses)
{
VK_FROM_HANDLE(tu_physical_device, phydev, physicalDevice);
uint32_t group_count = 0;
uint32_t gid = 0, cid = 0, n_passes;
const struct fd_perfcntr_group *group =
fd_perfcntrs(&phydev->dev_id, &group_count);
if (TU_DEBUG(PERFCRAW)) {
VK_FROM_HANDLE(tu_physical_device, phydev, physicalDevice);
uint32_t group_count = 0;
uint32_t gid = 0, cid = 0, n_passes;
const struct fd_perfcntr_group *group =
fd_perfcntrs(&phydev->dev_id, &group_count);
uint32_t counters_requested[group_count];
memset(counters_requested, 0x0, sizeof(counters_requested));
*pNumPasses = 1;
uint32_t counters_requested[group_count];
memset(counters_requested, 0x0, sizeof(counters_requested));
*pNumPasses = 1;
for (unsigned i = 0; i < pPerformanceQueryCreateInfo->counterIndexCount; i++) {
perfcntr_index(group, group_count,
pPerformanceQueryCreateInfo->pCounterIndices[i],
&gid, &cid);
for (unsigned i = 0; i < pPerformanceQueryCreateInfo->counterIndexCount; i++) {
perfcntr_index(group, group_count,
pPerformanceQueryCreateInfo->pCounterIndices[i],
&gid, &cid);
counters_requested[gid]++;
}
counters_requested[gid]++;
}
for (uint32_t i = 0; i < group_count; i++) {
n_passes = DIV_ROUND_UP(counters_requested[i], group[i].num_counters);
*pNumPasses = MAX2(*pNumPasses, n_passes);
for (uint32_t i = 0; i < group_count; i++) {
n_passes = DIV_ROUND_UP(counters_requested[i], group[i].num_counters);
*pNumPasses = MAX2(*pNumPasses, n_passes);
}
} else {
/* Derived counters are designed so that the underlying perfcntrs don't go
* beyond the budget of available counter registers. Because of that we
* know we only need one pass for performance queries.
*/
*pNumPasses = 1;
}
}
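
On the application side, a raw-mode query that needs several passes is
replayed once per pass, with the pass selected through
VkPerformanceQuerySubmitInfoKHR; in derived mode the driver reports a single
pass, so the loop below runs once. A sketch using the core extension API,
assuming a pre-recorded cmd_buffer that brackets the workload with
vkCmdBeginQuery/vkCmdEndQuery on the pool, and the perf_info used at pool
creation:

#include <vulkan/vulkan.h>

static void
replay_perf_query_passes(VkPhysicalDevice physical_device, VkDevice device,
                         VkQueue queue, VkCommandBuffer cmd_buffer,
                         const VkQueryPoolPerformanceCreateInfoKHR *perf_info)
{
   /* The profiling lock must be held while submitting work that uses
    * performance queries. */
   VkAcquireProfilingLockInfoKHR lock_info = {
      .sType = VK_STRUCTURE_TYPE_ACQUIRE_PROFILING_LOCK_INFO_KHR,
      .timeout = UINT64_MAX,
   };
   vkAcquireProfilingLockKHR(device, &lock_info);

   uint32_t num_passes = 0;
   vkGetPhysicalDeviceQueueFamilyPerformanceQueryPassesKHR(
      physical_device, perf_info, &num_passes);

   for (uint32_t pass = 0; pass < num_passes; pass++) {
      VkPerformanceQuerySubmitInfoKHR pass_info = {
         .sType = VK_STRUCTURE_TYPE_PERFORMANCE_QUERY_SUBMIT_INFO_KHR,
         .counterPassIndex = pass,
      };
      VkSubmitInfo submit = {
         .sType = VK_STRUCTURE_TYPE_SUBMIT_INFO,
         .pNext = &pass_info,
         .commandBufferCount = 1,
         .pCommandBuffers = &cmd_buffer,
      };
      vkQueueSubmit(queue, 1, &submit, VK_NULL_HANDLE);
      vkQueueWaitIdle(queue);
   }

   vkReleaseProfilingLockKHR(device);
}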


@@ -16,7 +16,13 @@
#define PERF_CNTRS_REG 4
struct tu_perf_query_data
enum tu_perf_query_type {
TU_PERF_QUERY_TYPE_NONE,
TU_PERF_QUERY_TYPE_RAW,
TU_PERF_QUERY_TYPE_DERIVED,
};
struct tu_perf_query_raw_data
{
uint32_t gid; /* group-id */
uint32_t cid; /* countable-id within the group */
@@ -25,6 +31,21 @@ struct tu_perf_query_data
uint32_t app_idx; /* index provided by apps */
};
struct tu_perf_query_raw {
const struct fd_perfcntr_group *perf_group;
uint32_t perf_group_count;
uint32_t counter_index_count;
struct tu_perf_query_raw_data data[0];
};
struct tu_perf_query_derived {
const struct fd_derived_counter **derived_counters;
uint32_t derived_counters_count;
uint32_t counter_index_count;
struct fd_derived_counter_collection collection[0];
};
struct tu_query_pool
{
struct vk_query_pool vk;
@@ -34,10 +55,11 @@ struct tu_query_pool
struct tu_bo *bo;
/* For performance query */
const struct fd_perfcntr_group *perf_group;
uint32_t perf_group_count;
uint32_t counter_index_count;
struct tu_perf_query_data perf_query_data[0];
enum tu_perf_query_type perf_query_type;
union {
struct tu_perf_query_raw raw;
struct tu_perf_query_derived derived;
} perf_query;
};
VK_DEFINE_NONDISP_HANDLE_CASTS(tu_query_pool, vk.base, VkQueryPool,


@@ -48,6 +48,7 @@ static const struct debug_control tu_debug_options[] = {
{ "noconcurrentunresolves", TU_DEBUG_NO_CONCURRENT_UNRESOLVES },
{ "dumpas", TU_DEBUG_DUMPAS },
{ "nobinmerging", TU_DEBUG_NO_BIN_MERGING },
{ "perfcraw", TU_DEBUG_PERFCRAW },
{ NULL, 0 }
};
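
At runtime these options are selected through the TU_DEBUG environment
variable: TU_DEBUG=perfc exposes the derived counters, while TU_DEBUG=perfcraw
restores the raw-counter behaviour described in the commit message.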


@@ -68,6 +68,7 @@ enum tu_debug_flags : uint64_t
TU_DEBUG_NO_CONCURRENT_UNRESOLVES = BITFIELD64_BIT(27),
TU_DEBUG_DUMPAS = BITFIELD64_BIT(28),
TU_DEBUG_NO_BIN_MERGING = BITFIELD64_BIT(29),
TU_DEBUG_PERFCRAW = BITFIELD64_BIT(30),
};
struct tu_env {