From cb27d2e1b26650043a41f3e6650d7b2fa27819c6 Mon Sep 17 00:00:00 2001
From: Rob Clark
Date: Thu, 30 Apr 2026 10:20:52 -0700
Subject: [PATCH] tu: Use counter allocation helper

Replace the hard-coded CP counter slots used by autotune, and the
matching "reserved counters" skip in raw perf queries, with counters
obtained from fd_perfcntr_reserve() and returned with
fd_perfcntr_release().  The reservation state lives in
tu_device::perfcntrs.

Signed-off-by: Rob Clark
---
 src/freedreno/vulkan/tu_autotune.cc   |  42 ++++++-----
 src/freedreno/vulkan/tu_autotune.h    |  10 +--
 src/freedreno/vulkan/tu_device.cc     |   9 ++
 src/freedreno/vulkan/tu_device.h      |   3 +
 src/freedreno/vulkan/tu_query_pool.cc | 102 ++++++++++----------------
 src/freedreno/vulkan/tu_query_pool.h  |   6 +-
 6 files changed, 77 insertions(+), 95 deletions(-)

diff --git a/src/freedreno/vulkan/tu_autotune.cc b/src/freedreno/vulkan/tu_autotune.cc
index 421cd26c3d1..700ba8b7cf1 100644
--- a/src/freedreno/vulkan/tu_autotune.cc
+++ b/src/freedreno/vulkan/tu_autotune.cc
@@ -1641,15 +1641,12 @@ tu_autotune::tu_autotune(struct tu_device *device, VkResult &result)
-      auto preemption_latency_countable = fd_perfcntrs_countable(cp_group, "PERF_CP_PREEMPTION_REACTION_DELAY");
-      auto always_count_countable = fd_perfcntrs_countable(cp_group, "PERF_CP_ALWAYS_COUNT");
+      preemption_latency_countable = fd_perfcntrs_countable(cp_group, "PERF_CP_PREEMPTION_REACTION_DELAY");
+      always_count_countable = fd_perfcntrs_countable(cp_group, "PERF_CP_ALWAYS_COUNT");
       if (preemption_latency_countable && always_count_countable) {
-         if (cp_group->num_counters >= 2) {
-            preemption_latency_selector_reg = cp_group->counters[0].select_reg;
-            preemption_latency_selector = preemption_latency_countable->selector;
-            preemption_latency_counter_reg_lo = cp_group->counters[0].counter_reg_lo;
+         preemption_latency_counter =
+            fd_perfcntr_reserve(device->perfcntrs, cp_group, preemption_latency_countable);
+         always_count_counter =
+            fd_perfcntr_reserve(device->perfcntrs, cp_group, always_count_countable);
 
-            always_count_selector_reg = cp_group->counters[1].select_reg;
-            always_count_selector = always_count_countable->selector;
-            always_count_counter_reg_lo = cp_group->counters[1].counter_reg_lo;
-         } else {
+         if (!preemption_latency_counter || !always_count_counter) {
             fail_reason = "not enough counters in CP group for preemption latency tracking";
          }
       } else {
@@ -1681,6 +1678,9 @@ tu_autotune::~tu_autotune()
    }
 
    tu_bo_suballocator_finish(&suballoc);
+
+   fd_perfcntr_release(device->perfcntrs, preemption_latency_counter);
+   fd_perfcntr_release(device->perfcntrs, always_count_counter);
 }
 
 tu_autotune::cmd_buf_ctx::cmd_buf_ctx(struct tu_autotune &autotune): batch(autotune.create_batch())
@@ -1934,22 +1934,22 @@ tu_autotune::write_preempt_counters_to_iova(struct tu_cs *cs,
                                             uint64_t aon_iova) const
 {
    if (emit_selector) {
-      tu_cs_emit_pkt4(cs, preemption_latency_selector_reg, 1);
-      tu_cs_emit(cs, preemption_latency_selector);
+      tu_cs_emit_pkt4(cs, preemption_latency_counter->select_reg, 1);
+      tu_cs_emit(cs, preemption_latency_countable->selector);
 
-      tu_cs_emit_pkt4(cs, always_count_selector_reg, 1);
-      tu_cs_emit(cs, always_count_selector);
+      tu_cs_emit_pkt4(cs, always_count_counter->select_reg, 1);
+      tu_cs_emit(cs, always_count_countable->selector);
    }
 
    if (emit_wfi)
       tu_cs_emit_wfi(cs);
 
    tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
-   tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(preemption_latency_counter_reg_lo) | CP_REG_TO_MEM_0_64B);
+   tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(preemption_latency_counter->counter_reg_lo) | CP_REG_TO_MEM_0_64B);
    tu_cs_emit_qw(cs, latency_iova);
 
    tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
-   tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(always_count_counter_reg_lo) | CP_REG_TO_MEM_0_64B);
+   tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(always_count_counter->counter_reg_lo) | CP_REG_TO_MEM_0_64B);
    tu_cs_emit_qw(cs, always_count_iova);
 
    tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
@@ -2042,11 +2042,11 @@ tu_autotune::emit_switch_away_amble(struct tu_cs *cs) const
 
    static size_t counter = 0;
    if (counter++ % 2 == 0) {
-      tu_cs_emit_pkt4(cs, preemption_latency_selector_reg, 1);
-      tu_cs_emit(cs, preemption_latency_selector);
+      tu_cs_emit_pkt4(cs, preemption_latency_counter->select_reg, 1);
+      tu_cs_emit(cs, preemption_latency_countable->selector);
 
-      tu_cs_emit_pkt4(cs, always_count_selector_reg, 1);
-      tu_cs_emit(cs, always_count_selector);
+      tu_cs_emit_pkt4(cs, always_count_counter->select_reg, 1);
+      tu_cs_emit(cs, always_count_countable->selector);
    }
 
    tu_cond_exec_end(cs);
@@ -2213,4 +2213,4 @@ tu_autotune::emit_preempt_latency_tracking_rp_hash(struct tu_cmd_buffer *cmd)
    tu_cs_emit_draw_state(&cmd->cs, TU_DRAW_STATE_AT_WRITE_RP_HASH, ds);
 
    return rp_key;
-}
\ No newline at end of file
+}
diff --git a/src/freedreno/vulkan/tu_autotune.h b/src/freedreno/vulkan/tu_autotune.h
index 55e579ae93a..8bf231edd04 100644
--- a/src/freedreno/vulkan/tu_autotune.h
+++ b/src/freedreno/vulkan/tu_autotune.h
@@ -242,13 +242,11 @@ struct tu_autotune {
    std::mutex rp_latency_mutex; /* Protects rp_latency_tracking */
    uint64_t last_latency_cleanup_ts = 0;
 
-   uint32_t preemption_latency_selector_reg;
-   uint32_t preemption_latency_selector;
-   uint32_t preemption_latency_counter_reg_lo;
+   const struct fd_perfcntr_counter *preemption_latency_counter = nullptr;
+   const struct fd_perfcntr_countable *preemption_latency_countable = nullptr;
 
-   uint32_t always_count_selector_reg;
-   uint32_t always_count_selector;
-   uint32_t always_count_counter_reg_lo;
+   const struct fd_perfcntr_counter *always_count_counter = nullptr;
+   const struct fd_perfcntr_countable *always_count_countable = nullptr;
 
    struct tu_draw_state reset_rp_hash_draw_state;
 
diff --git a/src/freedreno/vulkan/tu_device.cc b/src/freedreno/vulkan/tu_device.cc
index 2e755551dca..ec79b9790b6 100644
--- a/src/freedreno/vulkan/tu_device.cc
+++ b/src/freedreno/vulkan/tu_device.cc
@@ -11,6 +11,7 @@
 
 #include "drm-uapi/drm_fourcc.h"
 #include "git_sha1.h"
+#include "perfcntrs/freedreno_perfcntr.h"
 #include "common/freedreno_stompable_regs.h"
 
 /* for fd_get_driver/device_uuid() */
@@ -3081,6 +3082,10 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice,
       }
    }
 
+   device->perfcntrs = fd_perfcntr_state_alloc(
+      &physical_device->dev_id,
+      is_kgsl(physical_device->instance) ? -1 : device->fd);
+
    device->autotune = new tu_autotune(device, result);
    if (result != VK_SUCCESS)
       goto fail_autotune;
@@ -3181,6 +3186,8 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice,
 fail_timeline_cond:
 fail_a725_workaround:
 fail_autotune:
    delete device->autotune;
+   /* ~tu_autotune() releases its counters into perfcntrs, so free it last. */
+   fd_perfcntr_state_free(device->perfcntrs);
 fail_bin_preamble:
 fail_prepare_perfcntrs_pass_cs:
@@ -3287,6 +3294,8 @@ tu_DestroyDevice(VkDevice _device, const VkAllocationCallbacks *pAllocator)
 
    delete device->autotune;
 
+   fd_perfcntr_state_free(device->perfcntrs);
+
    tu_bo_suballocator_finish(&device->pipeline_suballoc);
    tu_bo_suballocator_finish(&device->kgsl_profiling_suballoc);
    tu_bo_suballocator_finish(&device->event_suballoc);
diff --git a/src/freedreno/vulkan/tu_device.h b/src/freedreno/vulkan/tu_device.h
index c9f521fcc15..b65b3d5b4e7 100644
--- a/src/freedreno/vulkan/tu_device.h
+++ b/src/freedreno/vulkan/tu_device.h
@@ -11,6 +11,7 @@
 #define TU_DEVICE_H
 
 #include "tu_common.h"
+#include "perfcntrs/freedreno_perfcntr.h"
 
 #include "radix_sort/radix_sort_vk.h"
 #include "util/rwlock.h"
@@ -486,6 +487,8 @@ struct tu_device
    pthread_cond_t timeline_cond;
    pthread_mutex_t submit_mutex;
 
+   struct fd_perfcntr_state *perfcntrs;
+
    struct tu_autotune *autotune;
 
    struct breadcrumbs_context *breadcrumbs_ctx;
diff --git a/src/freedreno/vulkan/tu_query_pool.cc b/src/freedreno/vulkan/tu_query_pool.cc
index 3d0851de7c4..e3f6b9fae4b 100644
--- a/src/freedreno/vulkan/tu_query_pool.cc
+++ b/src/freedreno/vulkan/tu_query_pool.cc
@@ -7,6 +7,7 @@
  */
 
 #include "tu_query_pool.h"
+#include "perfcntrs/freedreno_perfcntr.h"
 
 #include
 
@@ -249,21 +250,6 @@ perfcntr_index(const struct fd_perfcntr_group *group, uint32_t group_count,
    assert(i < group_count);
 }
 
-static uint32_t
-perfcntr_reserved_counters(const struct fd_perfcntr_group *group)
-{
-   /* Keep raw perf queries off the CP slots reserved by autotune latency optimization.
-    * TODO: We need to do this in a more robust way.
-    */
-   return strcmp(group->name, "CP") == 0 ? 2 : 0;
-}
-
-static uint32_t
-perfcntr_available_counters(const struct fd_perfcntr_group *group)
-{
-   return group->num_counters - MIN2(group->num_counters, perfcntr_reserved_counters(group));
-}
-
 static int
 compare_perfcntr_pass(const void *a, const void *b)
 {
@@ -271,6 +257,22 @@ compare_perfcntr_pass(const void *a, const void *b)
           ((struct tu_perf_query_raw_data *)b)->pass;
 }
 
+static void
+tu_query_pool_destroy(struct tu_device *device, struct tu_query_pool *pool,
+                      const VkAllocationCallbacks *pAllocator)
+{
+   if (is_perf_query_raw(pool)) {
+      struct tu_perf_query_raw *perf_query = &pool->perf_query.raw;
+
+      for (uint32_t i = 0; i < perf_query->counter_index_count; i++)
+         fd_perfcntr_release(device->perfcntrs, perf_query->data[i].counter);
+   }
+
+   if (pool->bo)
+      tu_bo_finish(device, pool->bo);
+   vk_query_pool_destroy(&device->vk, pAllocator, &pool->vk);
+}
+
 VKAPI_ATTR VkResult VKAPI_CALL
 tu_CreateQueryPool(VkDevice _device,
                    const VkQueryPoolCreateInfo *pCreateInfo,
@@ -353,50 +355,26 @@ tu_CreateQueryPool(VkDevice _device,
 
       perf_query->counter_index_count = perf_query_info->counterIndexCount;
 
-      /* Build all perf counters data that is requested, so we could get
-       * correct group id, countable id, counter register and pass index with
-       * only a counter index provided by applications at each command submit.
-       *
-       * Also, since this built data will be sorted by pass index later, we
-       * should keep the original indices and store perfcntrs results according
-       * to them so apps can get correct results with their own indices.
-       */
-      uint32_t regs[perf_query->perf_group_count], pass[perf_query->perf_group_count];
-      memset(regs, 0x00, perf_query->perf_group_count * sizeof(regs[0]));
-      memset(pass, 0x00, perf_query->perf_group_count * sizeof(pass[0]));
-
       for (uint32_t i = 0; i < perf_query->counter_index_count; i++) {
          uint32_t gid = 0, cid = 0;
          perfcntr_index(perf_query->perf_group, perf_query->perf_group_count,
                         perf_query_info->pCounterIndices[i], &gid, &cid);
 
-         perf_query->data[i].gid = gid;
-         perf_query->data[i].cid = cid;
          perf_query->data[i].app_idx = i;
 
          const struct fd_perfcntr_group *group = &perf_query->perf_group[gid];
-         uint32_t reserved_counters = perfcntr_reserved_counters(group);
-         uint32_t available_counters = perfcntr_available_counters(group);
+         const struct fd_perfcntr_countable *countable = &group->countables[cid];
 
-         if (available_counters == 0) {
-            vk_query_pool_destroy(&device->vk, pAllocator, &pool->vk);
+         perf_query->data[i].countable = countable;
+         perf_query->data[i].counter =
+            fd_perfcntr_reserve(device->perfcntrs, group, countable);
+
+         if (!perf_query->data[i].counter) {
+            tu_query_pool_destroy(device, pool, pAllocator);
             return vk_errorf(device, VK_ERROR_FEATURE_NOT_PRESENT,
                              "No raw perf counters available in group %s",
                              group->name);
          }
-
-         /* When a counter register is over the capacity(num_counters),
-          * reset it for next pass.
-          */
-         if (regs[gid] < available_counters) {
-            perf_query->data[i].cntr_reg = reserved_counters + regs[gid]++;
-            perf_query->data[i].pass = pass[gid];
-         } else {
-            perf_query->data[i].pass = ++pass[gid];
-            perf_query->data[i].cntr_reg = reserved_counters;
-            regs[gid] = 0;
-            regs[gid]++;
-         }
       }
 
       /* Sort by pass index so we could easily prepare a command stream
@@ -429,14 +407,13 @@ tu_CreateQueryPool(VkDevice _device,
    VkResult result = tu_bo_init_new_cached(device, &pool->vk.base, &pool->bo,
          pCreateInfo->queryCount * slot_size, TU_BO_ALLOC_NO_FLAGS, "query pool");
    if (result != VK_SUCCESS) {
-      vk_query_pool_destroy(&device->vk, pAllocator, &pool->vk);
+      tu_query_pool_destroy(device, pool, pAllocator);
       return result;
    }
 
    result = tu_bo_map(device, pool->bo, NULL);
    if (result != VK_SUCCESS) {
-      tu_bo_finish(device, pool->bo);
-      vk_query_pool_destroy(&device->vk, pAllocator, &pool->vk);
+      tu_query_pool_destroy(device, pool, pAllocator);
       return result;
    }
 
@@ -463,8 +440,7 @@ tu_DestroyQueryPool(VkDevice _device,
 
    TU_RMV(resource_destroy, device, pool);
 
-   tu_bo_finish(device, pool->bo);
-   vk_query_pool_destroy(&device->vk, pAllocator, &pool->vk);
+   tu_query_pool_destroy(device, pool, pAllocator);
 }
 
 static uint32_t
@@ -1276,13 +1252,8 @@ emit_begin_perf_query_raw(struct tu_cmd_buffer *cmdbuf,
          emit_perfcntrs_pass_start(has_pred_bit, cs, data->pass);
       }
 
-      const struct fd_perfcntr_counter *counter =
-         &perf_query->perf_group[data->gid].counters[data->cntr_reg];
-      const struct fd_perfcntr_countable *countable =
-         &perf_query->perf_group[data->gid].countables[data->cid];
-
-      tu_cs_emit_pkt4(cs, counter->select_reg, 1);
-      tu_cs_emit(cs, countable->selector);
+      tu_cs_emit_pkt4(cs, data->counter->select_reg, 1);
+      tu_cs_emit(cs, data->countable->selector);
    }
    tu_cond_exec_end(cs);
 
@@ -1300,8 +1271,7 @@ emit_begin_perf_query_raw(struct tu_cmd_buffer *cmdbuf,
          emit_perfcntrs_pass_start(has_pred_bit, cs, data->pass);
       }
 
-      const struct fd_perfcntr_counter *counter =
-         &perf_query->perf_group[data->gid].counters[data->cntr_reg];
+      const struct fd_perfcntr_counter *counter = data->counter;
 
       uint64_t begin_iova = perf_query_iova(pool, query, begin, data->app_idx);
 
@@ -1749,8 +1719,7 @@ emit_end_perf_query_raw(struct tu_cmd_buffer *cmdbuf,
          emit_perfcntrs_pass_start(has_pred_bit, cs, data->pass);
       }
 
-      const struct fd_perfcntr_counter *counter =
-         &perf_query->perf_group[data->gid].counters[data->cntr_reg];
+      const struct fd_perfcntr_counter *counter = data->counter;
 
       end_iova = perf_query_iova(pool, query, end, data->app_idx);
 
@@ -2317,9 +2286,12 @@ tu_GetPhysicalDeviceQueueFamilyPerformanceQueryPassesKHR(
    }
 
    for (uint32_t i = 0; i < group_count; i++) {
-      uint32_t available_counters = perfcntr_available_counters(&group[i]);
-      if (available_counters == 0)
-         continue;
+      /* Some counters may be unavailable at the time the query is
+       * created due to runtime factors (pps/fdperf using some counters,
+       * autotune or other queries, etc).  But we don't know that up
+       * front.
+       */
+      uint32_t available_counters = group[i].num_counters;
 
       n_passes = DIV_ROUND_UP(counters_requested[i], available_counters);
       *pNumPasses = MAX2(*pNumPasses, n_passes);
diff --git a/src/freedreno/vulkan/tu_query_pool.h b/src/freedreno/vulkan/tu_query_pool.h
index b642934f130..b1c004fd484 100644
--- a/src/freedreno/vulkan/tu_query_pool.h
+++ b/src/freedreno/vulkan/tu_query_pool.h
@@ -11,6 +11,7 @@
 #define TU_QUERY_POOL_H
 
 #include "tu_common.h"
+#include "perfcntrs/freedreno_perfcntr.h"
 
 #include "vk_query_pool.h"
 
@@ -24,9 +25,8 @@ enum tu_perf_query_type {
 
 struct tu_perf_query_raw_data
 {
-   uint32_t gid;      /* group-id */
-   uint32_t cid;      /* countable-id within the group */
-   uint32_t cntr_reg; /* counter register within the group */
+   const struct fd_perfcntr_counter *counter;
+   const struct fd_perfcntr_countable *countable;
    uint32_t pass;     /* pass index that countables can be requested */
    uint32_t app_idx;  /* index provided by apps */
 };