tu: Use counter allocation helper
Signed-off-by: Rob Clark <rob.clark@oss.qualcomm.com>
parent 74cfe319b7
commit cb27d2e1b2
6 changed files with 74 additions and 93 deletions
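There is no extended commit description; the hunks below switch turnip from hand-assigned CP counter slots to the freedreno perfcntr reserve/release helpers. As a minimal sketch of that lifecycle, assembled only from calls and fields that appear in this diff (fd_perfcntr_state_alloc/free, fd_perfcntrs_countable, fd_perfcntr_reserve/release, and the new tu_autotune/tu_device members) — the standalone helper function and its error strings here are illustrative, not part of the change:

/* Illustrative sketch only: how a counter is reserved and released with the
 * helper API this commit adopts.  Everything referenced here appears in the
 * hunks below; this function itself does not exist in the tree.
 */
static const char *
example_reserve_preemption_counter(struct tu_device *device,
                                   const struct fd_perfcntr_group *cp_group,
                                   struct tu_autotune *at)
{
   /* Pick the countable by name within the group. */
   const struct fd_perfcntr_countable *countable =
      fd_perfcntrs_countable(cp_group, "PERF_CP_PREEMPTION_REACTION_DELAY");
   if (!countable)
      return "countable not found";

   /* Claim a physical counter register; the shared fd_perfcntr_state hanging
    * off the device (allocated in tu_CreateDevice below) tracks which
    * registers are taken, so autotune and raw perf queries no longer rely on
    * hard-coded slot numbers.
    */
   at->preemption_latency_countable = countable;
   at->preemption_latency_counter =
      fd_perfcntr_reserve(device->perfcntrs, cp_group, countable);
   if (!at->preemption_latency_counter)
      return "not enough counters in CP group for preemption latency tracking";

   return NULL;
}

/* Paired teardown, mirroring tu_autotune::~tu_autotune() below. */
static void
example_release_preemption_counter(struct tu_device *device, struct tu_autotune *at)
{
   fd_perfcntr_release(device->perfcntrs, at->preemption_latency_counter);
}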
@@ -1641,15 +1641,12 @@ tu_autotune::tu_autotune(struct tu_device *device, VkResult &result)
auto preemption_latency_countable = fd_perfcntrs_countable(cp_group, "PERF_CP_PREEMPTION_REACTION_DELAY");
auto always_count_countable = fd_perfcntrs_countable(cp_group, "PERF_CP_ALWAYS_COUNT");
if (preemption_latency_countable && always_count_countable) {
if (cp_group->num_counters >= 2) {
preemption_latency_selector_reg = cp_group->counters[0].select_reg;
preemption_latency_selector = preemption_latency_countable->selector;
preemption_latency_counter_reg_lo = cp_group->counters[0].counter_reg_lo;
preemption_latency_counter =
fd_perfcntr_reserve(device->perfcntrs, cp_group, preemption_latency_countable);
always_count_counter =
fd_perfcntr_reserve(device->perfcntrs, cp_group, always_count_countable);

always_count_selector_reg = cp_group->counters[1].select_reg;
always_count_selector = always_count_countable->selector;
always_count_counter_reg_lo = cp_group->counters[1].counter_reg_lo;
} else {
if (!preemption_latency_counter || !always_count_counter) {
fail_reason = "not enough counters in CP group for preemption latency tracking";
}
} else {

@@ -1681,6 +1678,9 @@ tu_autotune::~tu_autotune()
}

tu_bo_suballocator_finish(&suballoc);

fd_perfcntr_release(device->perfcntrs, preemption_latency_counter);
fd_perfcntr_release(device->perfcntrs, always_count_counter);
}

tu_autotune::cmd_buf_ctx::cmd_buf_ctx(struct tu_autotune &autotune): batch(autotune.create_batch())

@@ -1934,22 +1934,22 @@ tu_autotune::write_preempt_counters_to_iova(struct tu_cs *cs,
uint64_t aon_iova) const
{
if (emit_selector) {
tu_cs_emit_pkt4(cs, preemption_latency_selector_reg, 1);
tu_cs_emit(cs, preemption_latency_selector);
tu_cs_emit_pkt4(cs, preemption_latency_counter->select_reg, 1);
tu_cs_emit(cs, preemption_latency_countable->selector);

tu_cs_emit_pkt4(cs, always_count_selector_reg, 1);
tu_cs_emit(cs, always_count_selector);
tu_cs_emit_pkt4(cs, always_count_counter->select_reg, 1);
tu_cs_emit(cs, always_count_countable->selector);
}

if (emit_wfi)
tu_cs_emit_wfi(cs);

tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(preemption_latency_counter_reg_lo) | CP_REG_TO_MEM_0_64B);
tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(preemption_latency_counter->counter_reg_lo) | CP_REG_TO_MEM_0_64B);
tu_cs_emit_qw(cs, latency_iova);

tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(always_count_counter_reg_lo) | CP_REG_TO_MEM_0_64B);
tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(always_count_counter->counter_reg_lo) | CP_REG_TO_MEM_0_64B);
tu_cs_emit_qw(cs, always_count_iova);

tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);

@@ -2042,11 +2042,11 @@ tu_autotune::emit_switch_away_amble(struct tu_cs *cs) const

static size_t counter = 0;
if (counter++ % 2 == 0) {
tu_cs_emit_pkt4(cs, preemption_latency_selector_reg, 1);
tu_cs_emit(cs, preemption_latency_selector);
tu_cs_emit_pkt4(cs, preemption_latency_counter->select_reg, 1);
tu_cs_emit(cs, preemption_latency_countable->selector);

tu_cs_emit_pkt4(cs, always_count_selector_reg, 1);
tu_cs_emit(cs, always_count_selector);
tu_cs_emit_pkt4(cs, always_count_counter->select_reg, 1);
tu_cs_emit(cs, always_count_countable->selector);
}

tu_cond_exec_end(cs);

@@ -2213,4 +2213,4 @@ tu_autotune::emit_preempt_latency_tracking_rp_hash(struct tu_cmd_buffer *cmd)
tu_cs_emit_draw_state(&cmd->cs, TU_DRAW_STATE_AT_WRITE_RP_HASH, ds);

return rp_key;
}
}
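On the command-stream side, the hunks above now program the selector and read back the value through the reserved fd_perfcntr_counter instead of cached *_selector_reg/*_counter_reg_lo fields. A condensed sketch of that emit pattern, using only the packet helpers and struct fields visible in the hunks above (the function name and its iova parameter are illustrative):

/* Illustrative sketch: select a countable on a reserved counter and copy its
 * 64-bit value to memory, following tu_autotune::write_preempt_counters_to_iova()
 * above.
 */
static void
example_sample_counter(struct tu_cs *cs,
                       const struct fd_perfcntr_counter *counter,
                       const struct fd_perfcntr_countable *countable,
                       uint64_t iova)
{
   /* Program which countable the physical counter register accumulates. */
   tu_cs_emit_pkt4(cs, counter->select_reg, 1);
   tu_cs_emit(cs, countable->selector);

   /* Make sure the selector write has landed before sampling. */
   tu_cs_emit_wfi(cs);

   /* Copy the 64-bit counter value from the counter register to memory. */
   tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
   tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(counter->counter_reg_lo) | CP_REG_TO_MEM_0_64B);
   tu_cs_emit_qw(cs, iova);
}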
@@ -242,13 +242,11 @@ struct tu_autotune {
std::mutex rp_latency_mutex; /* Protects rp_latency_tracking */
uint64_t last_latency_cleanup_ts = 0;

uint32_t preemption_latency_selector_reg;
uint32_t preemption_latency_selector;
uint32_t preemption_latency_counter_reg_lo;
const struct fd_perfcntr_counter *preemption_latency_counter;
const struct fd_perfcntr_countable *preemption_latency_countable;

uint32_t always_count_selector_reg;
uint32_t always_count_selector;
uint32_t always_count_counter_reg_lo;
const struct fd_perfcntr_counter *always_count_counter;
const struct fd_perfcntr_countable *always_count_countable;

struct tu_draw_state reset_rp_hash_draw_state;
@@ -11,6 +11,7 @@

#include "drm-uapi/drm_fourcc.h"
#include "git_sha1.h"
#include "perfcntrs/freedreno_perfcntr.h"

#include "common/freedreno_stompable_regs.h"
/* for fd_get_driver/device_uuid() */

@@ -3081,6 +3082,10 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice,
}
}

device->perfcntrs = fd_perfcntr_state_alloc(
&physical_device->dev_id,
is_kgsl(physical_device->instance) ? -1 : device->fd);

device->autotune = new tu_autotune(device, result);
if (result != VK_SUCCESS)
goto fail_autotune;

@@ -3181,6 +3186,7 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice,
fail_timeline_cond:
fail_a725_workaround:
fail_autotune:
fd_perfcntr_state_free(device->perfcntrs);
delete device->autotune;
fail_bin_preamble:
fail_prepare_perfcntrs_pass_cs:

@@ -3287,6 +3293,8 @@ tu_DestroyDevice(VkDevice _device, const VkAllocationCallbacks *pAllocator)

delete device->autotune;

fd_perfcntr_state_free(device->perfcntrs);

tu_bo_suballocator_finish(&device->pipeline_suballoc);
tu_bo_suballocator_finish(&device->kgsl_profiling_suballoc);
tu_bo_suballocator_finish(&device->event_suballoc);
@@ -11,6 +11,7 @@
#define TU_DEVICE_H

#include "tu_common.h"
#include "perfcntrs/freedreno_perfcntr.h"

#include "radix_sort/radix_sort_vk.h"
#include "util/rwlock.h"

@@ -486,6 +487,8 @@ struct tu_device
pthread_cond_t timeline_cond;
pthread_mutex_t submit_mutex;

struct fd_perfcntr_state *perfcntrs;

struct tu_autotune *autotune;

struct breadcrumbs_context *breadcrumbs_ctx;
@@ -7,6 +7,7 @@
*/

#include "tu_query_pool.h"
#include "perfcntrs/freedreno_perfcntr.h"

#include <fcntl.h>

@@ -249,21 +250,6 @@ perfcntr_index(const struct fd_perfcntr_group *group, uint32_t group_count,
assert(i < group_count);
}

static uint32_t
perfcntr_reserved_counters(const struct fd_perfcntr_group *group)
{
/* Keep raw perf queries off the CP slots reserved by autotune latency optimization.
* TODO: We need to do this in a more robust way.
*/
return strcmp(group->name, "CP") == 0 ? 2 : 0;
}

static uint32_t
perfcntr_available_counters(const struct fd_perfcntr_group *group)
{
return group->num_counters - MIN2(group->num_counters, perfcntr_reserved_counters(group));
}

static int
compare_perfcntr_pass(const void *a, const void *b)
{

@@ -271,6 +257,22 @@ compare_perfcntr_pass(const void *a, const void *b)
((struct tu_perf_query_raw_data *)b)->pass;
}

static void
tu_query_pool_destroy(struct tu_device *device, struct tu_query_pool *pool,
const VkAllocationCallbacks *pAllocator)
{
if (is_perf_query_raw(pool)) {
struct tu_perf_query_raw *perf_query = &pool->perf_query.raw;

for (uint32_t i = 0; i < perf_query->counter_index_count; i++)
fd_perfcntr_release(device->perfcntrs, perf_query->data[i].counter);
}

if (pool->bo)
tu_bo_finish(device, pool->bo);
vk_query_pool_destroy(&device->vk, pAllocator, &pool->vk);
}

VKAPI_ATTR VkResult VKAPI_CALL
tu_CreateQueryPool(VkDevice _device,
const VkQueryPoolCreateInfo *pCreateInfo,

@@ -353,50 +355,26 @@ tu_CreateQueryPool(VkDevice _device,

perf_query->counter_index_count = perf_query_info->counterIndexCount;

/* Build all perf counters data that is requested, so we could get
* correct group id, countable id, counter register and pass index with
* only a counter index provided by applications at each command submit.
*
* Also, since this built data will be sorted by pass index later, we
* should keep the original indices and store perfcntrs results according
* to them so apps can get correct results with their own indices.
*/
uint32_t regs[perf_query->perf_group_count], pass[perf_query->perf_group_count];
memset(regs, 0x00, perf_query->perf_group_count * sizeof(regs[0]));
memset(pass, 0x00, perf_query->perf_group_count * sizeof(pass[0]));

for (uint32_t i = 0; i < perf_query->counter_index_count; i++) {
uint32_t gid = 0, cid = 0;

perfcntr_index(perf_query->perf_group, perf_query->perf_group_count,
perf_query_info->pCounterIndices[i], &gid, &cid);

perf_query->data[i].gid = gid;
perf_query->data[i].cid = cid;
perf_query->data[i].app_idx = i;

const struct fd_perfcntr_group *group = &perf_query->perf_group[gid];
uint32_t reserved_counters = perfcntr_reserved_counters(group);
uint32_t available_counters = perfcntr_available_counters(group);
const struct fd_perfcntr_countable *countable = &group->countables[cid];

if (available_counters == 0) {
vk_query_pool_destroy(&device->vk, pAllocator, &pool->vk);
perf_query->data[i].countable = countable;
perf_query->data[i].counter =
fd_perfcntr_reserve(device->perfcntrs, group, countable);

if (!perf_query->data[i].counter) {
tu_query_pool_destroy(device, pool, pAllocator);
return vk_errorf(device, VK_ERROR_FEATURE_NOT_PRESENT, "No raw perf counters available in group %s",
group->name);
}

/* When a counter register is over the capacity(num_counters),
* reset it for next pass.
*/
if (regs[gid] < available_counters) {
perf_query->data[i].cntr_reg = reserved_counters + regs[gid]++;
perf_query->data[i].pass = pass[gid];
} else {
perf_query->data[i].pass = ++pass[gid];
perf_query->data[i].cntr_reg = reserved_counters;
regs[gid] = 0;
regs[gid]++;
}
}

/* Sort by pass index so we could easily prepare a command stream

@@ -429,14 +407,13 @@ tu_CreateQueryPool(VkDevice _device,
VkResult result = tu_bo_init_new_cached(device, &pool->vk.base, &pool->bo,
pCreateInfo->queryCount * slot_size, TU_BO_ALLOC_NO_FLAGS, "query pool");
if (result != VK_SUCCESS) {
vk_query_pool_destroy(&device->vk, pAllocator, &pool->vk);
tu_query_pool_destroy(device, pool, pAllocator);
return result;
}

result = tu_bo_map(device, pool->bo, NULL);
if (result != VK_SUCCESS) {
tu_bo_finish(device, pool->bo);
vk_query_pool_destroy(&device->vk, pAllocator, &pool->vk);
tu_query_pool_destroy(device, pool, pAllocator);
return result;
}

@@ -463,8 +440,7 @@ tu_DestroyQueryPool(VkDevice _device,

TU_RMV(resource_destroy, device, pool);

tu_bo_finish(device, pool->bo);
vk_query_pool_destroy(&device->vk, pAllocator, &pool->vk);
tu_query_pool_destroy(device, pool, pAllocator);
}

static uint32_t

@@ -1276,13 +1252,8 @@ emit_begin_perf_query_raw(struct tu_cmd_buffer *cmdbuf,
emit_perfcntrs_pass_start<CHIP>(has_pred_bit, cs, data->pass);
}

const struct fd_perfcntr_counter *counter =
&perf_query->perf_group[data->gid].counters[data->cntr_reg];
const struct fd_perfcntr_countable *countable =
&perf_query->perf_group[data->gid].countables[data->cid];

tu_cs_emit_pkt4(cs, counter->select_reg, 1);
tu_cs_emit(cs, countable->selector);
tu_cs_emit_pkt4(cs, data->counter->select_reg, 1);
tu_cs_emit(cs, data->countable->selector);
}
tu_cond_exec_end(cs);

@@ -1300,8 +1271,7 @@ emit_begin_perf_query_raw(struct tu_cmd_buffer *cmdbuf,
emit_perfcntrs_pass_start<CHIP>(has_pred_bit, cs, data->pass);
}

const struct fd_perfcntr_counter *counter =
&perf_query->perf_group[data->gid].counters[data->cntr_reg];
const struct fd_perfcntr_counter *counter = data->counter;

uint64_t begin_iova = perf_query_iova(pool, query, begin, data->app_idx);

@@ -1749,8 +1719,7 @@ emit_end_perf_query_raw(struct tu_cmd_buffer *cmdbuf,
emit_perfcntrs_pass_start<CHIP>(has_pred_bit, cs, data->pass);
}

const struct fd_perfcntr_counter *counter =
&perf_query->perf_group[data->gid].counters[data->cntr_reg];
const struct fd_perfcntr_counter *counter = data->counter;

end_iova = perf_query_iova(pool, query, end, data->app_idx);

@@ -2317,9 +2286,12 @@ tu_GetPhysicalDeviceQueueFamilyPerformanceQueryPassesKHR(
}

for (uint32_t i = 0; i < group_count; i++) {
uint32_t available_counters = perfcntr_available_counters(&group[i]);
if (available_counters == 0)
continue;
/* Some counters may be unavailable at the time the query is
* created due to runtime factors (pps/fdperf using some counters,
* autotune or other queries, etc). But we don't know that up
* front.
*/
uint32_t available_counters = group[i].num_counters;

n_passes = DIV_ROUND_UP(counters_requested[i], available_counters);
*pNumPasses = MAX2(*pNumPasses, n_passes);
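A side effect of dropping perfcntr_reserved_counters()/perfcntr_available_counters() shows up in the last hunk above: tu_GetPhysicalDeviceQueueFamilyPerformanceQueryPassesKHR now divides by the group's full num_counters. With made-up sizes, an application requesting 10 countables from a CP group exposing 4 counter registers is told it needs DIV_ROUND_UP(10, 4) = 3 passes, whereas the old code excluded the 2 registers reserved for autotune and would have reported DIV_ROUND_UP(10, 2) = 5. Whether a register can actually be reserved is now only discovered in tu_CreateQueryPool, where a failed fd_perfcntr_reserve() destroys the pool and returns VK_ERROR_FEATURE_NOT_PRESENT.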
@@ -11,6 +11,7 @@
#define TU_QUERY_POOL_H

#include "tu_common.h"
#include "perfcntrs/freedreno_perfcntr.h"

#include "vk_query_pool.h"

@@ -24,9 +25,8 @@ enum tu_perf_query_type {

struct tu_perf_query_raw_data
{
uint32_t gid; /* group-id */
uint32_t cid; /* countable-id within the group */
uint32_t cntr_reg; /* counter register within the group */
const struct fd_perfcntr_counter *counter;
const struct fd_perfcntr_countable *countable;
uint32_t pass; /* pass index that countables can be requested */
uint32_t app_idx; /* index provided by apps */
};