tu: Use counter allocation helper

Signed-off-by: Rob Clark <rob.clark@oss.qualcomm.com>
This commit is contained in:
Rob Clark 2026-04-30 10:20:52 -07:00
parent 74cfe319b7
commit cb27d2e1b2
6 changed files with 74 additions and 93 deletions

View file

@@ -1641,15 +1641,12 @@ tu_autotune::tu_autotune(struct tu_device *device, VkResult &result)
auto preemption_latency_countable = fd_perfcntrs_countable(cp_group, "PERF_CP_PREEMPTION_REACTION_DELAY");
auto always_count_countable = fd_perfcntrs_countable(cp_group, "PERF_CP_ALWAYS_COUNT");
if (preemption_latency_countable && always_count_countable) {
if (cp_group->num_counters >= 2) {
preemption_latency_selector_reg = cp_group->counters[0].select_reg;
preemption_latency_selector = preemption_latency_countable->selector;
preemption_latency_counter_reg_lo = cp_group->counters[0].counter_reg_lo;
preemption_latency_counter =
fd_perfcntr_reserve(device->perfcntrs, cp_group, preemption_latency_countable);
always_count_counter =
fd_perfcntr_reserve(device->perfcntrs, cp_group, always_count_countable);
always_count_selector_reg = cp_group->counters[1].select_reg;
always_count_selector = always_count_countable->selector;
always_count_counter_reg_lo = cp_group->counters[1].counter_reg_lo;
} else {
if (!preemption_latency_counter || !always_count_counter) {
fail_reason = "not enough counters in CP group for preemption latency tracking";
}
} else {
@@ -1681,6 +1678,9 @@ tu_autotune::~tu_autotune()
}
tu_bo_suballocator_finish(&suballoc);
fd_perfcntr_release(device->perfcntrs, preemption_latency_counter);
fd_perfcntr_release(device->perfcntrs, always_count_counter);
}
tu_autotune::cmd_buf_ctx::cmd_buf_ctx(struct tu_autotune &autotune): batch(autotune.create_batch())
@@ -1934,22 +1934,22 @@ tu_autotune::write_preempt_counters_to_iova(struct tu_cs *cs,
uint64_t aon_iova) const
{
if (emit_selector) {
tu_cs_emit_pkt4(cs, preemption_latency_selector_reg, 1);
tu_cs_emit(cs, preemption_latency_selector);
tu_cs_emit_pkt4(cs, preemption_latency_counter->select_reg, 1);
tu_cs_emit(cs, preemption_latency_countable->selector);
tu_cs_emit_pkt4(cs, always_count_selector_reg, 1);
tu_cs_emit(cs, always_count_selector);
tu_cs_emit_pkt4(cs, always_count_counter->select_reg, 1);
tu_cs_emit(cs, always_count_countable->selector);
}
if (emit_wfi)
tu_cs_emit_wfi(cs);
tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(preemption_latency_counter_reg_lo) | CP_REG_TO_MEM_0_64B);
tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(preemption_latency_counter->counter_reg_lo) | CP_REG_TO_MEM_0_64B);
tu_cs_emit_qw(cs, latency_iova);
tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(always_count_counter_reg_lo) | CP_REG_TO_MEM_0_64B);
tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(always_count_counter->counter_reg_lo) | CP_REG_TO_MEM_0_64B);
tu_cs_emit_qw(cs, always_count_iova);
tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
@@ -2042,11 +2042,11 @@ tu_autotune::emit_switch_away_amble(struct tu_cs *cs) const
static size_t counter = 0;
if (counter++ % 2 == 0) {
tu_cs_emit_pkt4(cs, preemption_latency_selector_reg, 1);
tu_cs_emit(cs, preemption_latency_selector);
tu_cs_emit_pkt4(cs, preemption_latency_counter->select_reg, 1);
tu_cs_emit(cs, preemption_latency_countable->selector);
tu_cs_emit_pkt4(cs, always_count_selector_reg, 1);
tu_cs_emit(cs, always_count_selector);
tu_cs_emit_pkt4(cs, always_count_counter->select_reg, 1);
tu_cs_emit(cs, always_count_countable->selector);
}
tu_cond_exec_end(cs);
@@ -2213,4 +2213,4 @@ tu_autotune::emit_preempt_latency_tracking_rp_hash(struct tu_cmd_buffer *cmd)
tu_cs_emit_draw_state(&cmd->cs, TU_DRAW_STATE_AT_WRITE_RP_HASH, ds);
return rp_key;
}
}

View file

@@ -242,13 +242,11 @@ struct tu_autotune {
std::mutex rp_latency_mutex; /* Protects rp_latency_tracking */
uint64_t last_latency_cleanup_ts = 0;
uint32_t preemption_latency_selector_reg;
uint32_t preemption_latency_selector;
uint32_t preemption_latency_counter_reg_lo;
const struct fd_perfcntr_counter *preemption_latency_counter;
const struct fd_perfcntr_countable *preemption_latency_countable;
uint32_t always_count_selector_reg;
uint32_t always_count_selector;
uint32_t always_count_counter_reg_lo;
const struct fd_perfcntr_counter *always_count_counter;
const struct fd_perfcntr_countable *always_count_countable;
struct tu_draw_state reset_rp_hash_draw_state;

View file

@@ -11,6 +11,7 @@
#include "drm-uapi/drm_fourcc.h"
#include "git_sha1.h"
#include "perfcntrs/freedreno_perfcntr.h"
#include "common/freedreno_stompable_regs.h"
/* for fd_get_driver/device_uuid() */
@@ -3081,6 +3082,10 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice,
}
}
device->perfcntrs = fd_perfcntr_state_alloc(
&physical_device->dev_id,
is_kgsl(physical_device->instance) ? -1 : device->fd);
device->autotune = new tu_autotune(device, result);
if (result != VK_SUCCESS)
goto fail_autotune;
@@ -3181,6 +3186,7 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice,
fail_timeline_cond:
fail_a725_workaround:
fail_autotune:
fd_perfcntr_state_free(device->perfcntrs);
delete device->autotune;
fail_bin_preamble:
fail_prepare_perfcntrs_pass_cs:
@@ -3287,6 +3293,8 @@ tu_DestroyDevice(VkDevice _device, const VkAllocationCallbacks *pAllocator)
delete device->autotune;
fd_perfcntr_state_free(device->perfcntrs);
tu_bo_suballocator_finish(&device->pipeline_suballoc);
tu_bo_suballocator_finish(&device->kgsl_profiling_suballoc);
tu_bo_suballocator_finish(&device->event_suballoc);

View file

@@ -11,6 +11,7 @@
#define TU_DEVICE_H
#include "tu_common.h"
#include "perfcntrs/freedreno_perfcntr.h"
#include "radix_sort/radix_sort_vk.h"
#include "util/rwlock.h"
@@ -486,6 +487,8 @@ struct tu_device
pthread_cond_t timeline_cond;
pthread_mutex_t submit_mutex;
struct fd_perfcntr_state *perfcntrs;
struct tu_autotune *autotune;
struct breadcrumbs_context *breadcrumbs_ctx;

View file

@@ -7,6 +7,7 @@
*/
#include "tu_query_pool.h"
#include "perfcntrs/freedreno_perfcntr.h"
#include <fcntl.h>
@@ -249,21 +250,6 @@ perfcntr_index(const struct fd_perfcntr_group *group, uint32_t group_count,
assert(i < group_count);
}
static uint32_t
perfcntr_reserved_counters(const struct fd_perfcntr_group *group)
{
/* Keep raw perf queries off the CP slots reserved by autotune latency optimization.
* TODO: We need to do this in a more robust way.
*/
return strcmp(group->name, "CP") == 0 ? 2 : 0;
}
static uint32_t
perfcntr_available_counters(const struct fd_perfcntr_group *group)
{
return group->num_counters - MIN2(group->num_counters, perfcntr_reserved_counters(group));
}
static int
compare_perfcntr_pass(const void *a, const void *b)
{
@@ -271,6 +257,22 @@ compare_perfcntr_pass(const void *a, const void *b)
((struct tu_perf_query_raw_data *)b)->pass;
}
static void
tu_query_pool_destroy(struct tu_device *device, struct tu_query_pool *pool,
const VkAllocationCallbacks *pAllocator)
{
if (is_perf_query_raw(pool)) {
struct tu_perf_query_raw *perf_query = &pool->perf_query.raw;
for (uint32_t i = 0; i < perf_query->counter_index_count; i++)
fd_perfcntr_release(device->perfcntrs, perf_query->data[i].counter);
}
if (pool->bo)
tu_bo_finish(device, pool->bo);
vk_query_pool_destroy(&device->vk, pAllocator, &pool->vk);
}
VKAPI_ATTR VkResult VKAPI_CALL
tu_CreateQueryPool(VkDevice _device,
const VkQueryPoolCreateInfo *pCreateInfo,
@@ -353,50 +355,26 @@ tu_CreateQueryPool(VkDevice _device,
perf_query->counter_index_count = perf_query_info->counterIndexCount;
/* Build all perf counters data that is requested, so we could get
* correct group id, countable id, counter register and pass index with
* only a counter index provided by applications at each command submit.
*
* Also, since this built data will be sorted by pass index later, we
* should keep the original indices and store perfcntrs results according
* to them so apps can get correct results with their own indices.
*/
uint32_t regs[perf_query->perf_group_count], pass[perf_query->perf_group_count];
memset(regs, 0x00, perf_query->perf_group_count * sizeof(regs[0]));
memset(pass, 0x00, perf_query->perf_group_count * sizeof(pass[0]));
for (uint32_t i = 0; i < perf_query->counter_index_count; i++) {
uint32_t gid = 0, cid = 0;
perfcntr_index(perf_query->perf_group, perf_query->perf_group_count,
perf_query_info->pCounterIndices[i], &gid, &cid);
perf_query->data[i].gid = gid;
perf_query->data[i].cid = cid;
perf_query->data[i].app_idx = i;
const struct fd_perfcntr_group *group = &perf_query->perf_group[gid];
uint32_t reserved_counters = perfcntr_reserved_counters(group);
uint32_t available_counters = perfcntr_available_counters(group);
const struct fd_perfcntr_countable *countable = &group->countables[cid];
if (available_counters == 0) {
vk_query_pool_destroy(&device->vk, pAllocator, &pool->vk);
perf_query->data[i].countable = countable;
perf_query->data[i].counter =
fd_perfcntr_reserve(device->perfcntrs, group, countable);
if (!perf_query->data[i].counter) {
tu_query_pool_destroy(device, pool, pAllocator);
return vk_errorf(device, VK_ERROR_FEATURE_NOT_PRESENT, "No raw perf counters available in group %s",
group->name);
}
/* When a counter register is over the capacity(num_counters),
* reset it for next pass.
*/
if (regs[gid] < available_counters) {
perf_query->data[i].cntr_reg = reserved_counters + regs[gid]++;
perf_query->data[i].pass = pass[gid];
} else {
perf_query->data[i].pass = ++pass[gid];
perf_query->data[i].cntr_reg = reserved_counters;
regs[gid] = 0;
regs[gid]++;
}
}
/* Sort by pass index so we could easily prepare a command stream
@@ -429,14 +407,13 @@ tu_CreateQueryPool(VkDevice _device,
VkResult result = tu_bo_init_new_cached(device, &pool->vk.base, &pool->bo,
pCreateInfo->queryCount * slot_size, TU_BO_ALLOC_NO_FLAGS, "query pool");
if (result != VK_SUCCESS) {
vk_query_pool_destroy(&device->vk, pAllocator, &pool->vk);
tu_query_pool_destroy(device, pool, pAllocator);
return result;
}
result = tu_bo_map(device, pool->bo, NULL);
if (result != VK_SUCCESS) {
tu_bo_finish(device, pool->bo);
vk_query_pool_destroy(&device->vk, pAllocator, &pool->vk);
tu_query_pool_destroy(device, pool, pAllocator);
return result;
}
@@ -463,8 +440,7 @@ tu_DestroyQueryPool(VkDevice _device,
TU_RMV(resource_destroy, device, pool);
tu_bo_finish(device, pool->bo);
vk_query_pool_destroy(&device->vk, pAllocator, &pool->vk);
tu_query_pool_destroy(device, pool, pAllocator);
}
static uint32_t
@@ -1276,13 +1252,8 @@ emit_begin_perf_query_raw(struct tu_cmd_buffer *cmdbuf,
emit_perfcntrs_pass_start<CHIP>(has_pred_bit, cs, data->pass);
}
const struct fd_perfcntr_counter *counter =
&perf_query->perf_group[data->gid].counters[data->cntr_reg];
const struct fd_perfcntr_countable *countable =
&perf_query->perf_group[data->gid].countables[data->cid];
tu_cs_emit_pkt4(cs, counter->select_reg, 1);
tu_cs_emit(cs, countable->selector);
tu_cs_emit_pkt4(cs, data->counter->select_reg, 1);
tu_cs_emit(cs, data->countable->selector);
}
tu_cond_exec_end(cs);
@@ -1300,8 +1271,7 @@ emit_begin_perf_query_raw(struct tu_cmd_buffer *cmdbuf,
emit_perfcntrs_pass_start<CHIP>(has_pred_bit, cs, data->pass);
}
const struct fd_perfcntr_counter *counter =
&perf_query->perf_group[data->gid].counters[data->cntr_reg];
const struct fd_perfcntr_counter *counter = data->counter;
uint64_t begin_iova = perf_query_iova(pool, query, begin, data->app_idx);
@@ -1749,8 +1719,7 @@ emit_end_perf_query_raw(struct tu_cmd_buffer *cmdbuf,
emit_perfcntrs_pass_start<CHIP>(has_pred_bit, cs, data->pass);
}
const struct fd_perfcntr_counter *counter =
&perf_query->perf_group[data->gid].counters[data->cntr_reg];
const struct fd_perfcntr_counter *counter = data->counter;
end_iova = perf_query_iova(pool, query, end, data->app_idx);
@@ -2317,9 +2286,12 @@ tu_GetPhysicalDeviceQueueFamilyPerformanceQueryPassesKHR(
}
for (uint32_t i = 0; i < group_count; i++) {
uint32_t available_counters = perfcntr_available_counters(&group[i]);
if (available_counters == 0)
continue;
/* Some counters may be unavailable at the time the query is
* created due to runtime factors (pps/fdperf using some counters,
* autotune or other queries, etc). But we don't know that up
* front.
*/
uint32_t available_counters = group[i].num_counters;
n_passes = DIV_ROUND_UP(counters_requested[i], available_counters);
*pNumPasses = MAX2(*pNumPasses, n_passes);

View file

@@ -11,6 +11,7 @@
#define TU_QUERY_POOL_H
#include "tu_common.h"
#include "perfcntrs/freedreno_perfcntr.h"
#include "vk_query_pool.h"
@@ -24,9 +25,8 @@ enum tu_perf_query_type {
struct tu_perf_query_raw_data
{
uint32_t gid; /* group-id */
uint32_t cid; /* countable-id within the group */
uint32_t cntr_reg; /* counter register within the group */
const struct fd_perfcntr_counter *counter;
const struct fd_perfcntr_countable *countable;
uint32_t pass; /* pass index that countables can be requested */
uint32_t app_idx; /* index provided by apps */
};