Merge branch 'fd/perfcntr-config' into 'main'

Draft: freedreno: Add PERFCNTR_CONFIG support

See merge request mesa/mesa!41158
This commit is contained in:
Rob Clark 2026-05-08 00:11:59 +00:00
commit 98408367e2
26 changed files with 2333 additions and 328 deletions

View file

@ -117,6 +117,7 @@ struct drm_msm_timespec {
* ioctl will throw -EPIPE.
*/
#define MSM_PARAM_EN_VM_BIND 0x16 /* WO, once */
#define MSM_PARAM_AQE 0x17 /* RO */
/* For backwards compat. The original support for preemption was based on
* a single ring per priority level so # of priority levels equals the #
@ -490,6 +491,52 @@ struct drm_msm_submitqueue_query {
__u32 pad;
};
#define MSM_PERFCNTR_STREAM 0x00000001
#define MSM_PERFCNTR_UPDATE 0x00000002
#define MSM_PERFCNTR_FLAGS ( \
MSM_PERFCNTR_STREAM | \
MSM_PERFCNTR_UPDATE | \
0)
struct drm_msm_perfcntr_group {
char group_name[16];
__u32 nr_countables;
__u32 pad;
__u64 countables; /* pointer to an array of nr_countables u32 */
};
/*
* Note, for MSM_PERFCNTR_STREAM, the ioctl returns an fd to read recorded
* counters. This only works because the ioctl is DRM_IOW(), if we returned
* a out param in the ioctl struct the copy_to_user() (in drm_ioctl())
* could fault, causing us to leak the fd.
*
* If the ioctl returns with error E2BIG, that means more counters/countables
* are requested than are currently available. If MSM_PERFCNTR_UPDATE flag
* is set, drm_msm_perfcntr_group::nr_countables will be updated to return
* the actual # of counters available.
*
* The data read from the fd has the following format for each sampling period:
*
* uint64_t timestamp; // CP_ALWAYS_ON_COUNTER captured at sample time
* uint32_t seqno; // increments by 1 each period, reset to 0 on discontinuity
* uint32_t mbz; // pad out counters to 64b
* struct {
* uint64_t counter[nr_countables];
* } groups[nr_groups];
*
* The ordering of groups and counters matches the order in PERFCNTR_CONFIG
* ioctl.
*/
struct drm_msm_perfcntr_config {
__u32 flags; /* bitmask of MSM_PERFCNTR_x */
__u32 nr_groups; /* # of entries in groups array */
__u64 groups; /* pointer to array of drm_msm_perfcntr_group */
__u64 period; /* sampling period in ns */
__u32 bufsz_shift; /* sample buffer size in bytes is 1<<bufsz_shift */
__u32 group_stride; /* sizeof(struct drm_msm_perfcntr_group) */
};
#define DRM_MSM_GET_PARAM 0x00
#define DRM_MSM_SET_PARAM 0x01
#define DRM_MSM_GEM_NEW 0x02
@ -506,6 +553,7 @@ struct drm_msm_submitqueue_query {
#define DRM_MSM_SUBMITQUEUE_CLOSE 0x0B
#define DRM_MSM_SUBMITQUEUE_QUERY 0x0C
#define DRM_MSM_VM_BIND 0x0D
#define DRM_MSM_PERFCNTR_CONFIG 0x0E
#define DRM_IOCTL_MSM_GET_PARAM DRM_IOWR(DRM_COMMAND_BASE + DRM_MSM_GET_PARAM, struct drm_msm_param)
#define DRM_IOCTL_MSM_SET_PARAM DRM_IOW (DRM_COMMAND_BASE + DRM_MSM_SET_PARAM, struct drm_msm_param)
@ -520,6 +568,7 @@ struct drm_msm_submitqueue_query {
#define DRM_IOCTL_MSM_SUBMITQUEUE_CLOSE DRM_IOW (DRM_COMMAND_BASE + DRM_MSM_SUBMITQUEUE_CLOSE, __u32)
#define DRM_IOCTL_MSM_SUBMITQUEUE_QUERY DRM_IOW (DRM_COMMAND_BASE + DRM_MSM_SUBMITQUEUE_QUERY, struct drm_msm_submitqueue_query)
#define DRM_IOCTL_MSM_VM_BIND DRM_IOWR(DRM_COMMAND_BASE + DRM_MSM_VM_BIND, struct drm_msm_vm_bind)
#define DRM_IOCTL_MSM_PERFCNTR_CONFIG DRM_IOW (DRM_COMMAND_BASE + DRM_MSM_PERFCNTR_CONFIG, struct drm_msm_perfcntr_config)
#if defined(__cplusplus)
}

View file

@ -158,6 +158,9 @@ struct BitmaskEnum {
#define BIT(bit) BITFIELD64_BIT(bit)
#define U642VOID(x) ((void *)(unsigned long)(x))
#define VOID2U64(x) ((uint64_t)(unsigned long)(x))
/**
* Helper for allocating sequence #s where zero is a non-valid seqno
*/

View file

@ -134,13 +134,10 @@ setup_counter(const char *name, struct perfcntr *c)
{
for (int i = 0; i < num_groups; i++) {
const struct fd_perfcntr_group *group = &groups[i];
const struct fd_perfcntr_countable *countable =
fd_perfcntrs_countable(group, name);
for (int j = 0; j < group->num_countables; j++) {
const struct fd_perfcntr_countable *countable = &group->countables[j];
if (strcmp(name, countable->name) != 0)
continue;
if (countable) {
/*
* Allocate a counter to use to monitor the requested countable:
*/

File diff suppressed because it is too large Load diff

View file

@ -7,9 +7,16 @@
#include <cstring>
#include <iostream>
#include <perfetto.h>
#include <err.h>
#include <perfetto.h>
#include <poll.h>
#include <xf86drm.h>
#include "common/freedreno_common.h"
#include "common/freedreno_dev_info.h"
#include "drm-uapi/msm_drm.h"
#include "drm/freedreno_drmif.h"
#include "drm/freedreno_ringbuffer.h"
#include "perfcntrs/freedreno_dt.h"
@ -46,6 +53,8 @@ FreedrenoDriver::configure_counters(bool reset, bool wait)
(enum fd_ringbuffer_flags)(FD_RINGBUFFER_PRIMARY | FD_RINGBUFFER_GROWABLE);
struct fd_ringbuffer *ring = fd_submit_new_ringbuffer(submit, 0x1000, flags);
assert(io); /* This is legacy path only */
for (const auto &countable : countables)
countable.configure(ring, reset);
@ -67,12 +76,85 @@ FreedrenoDriver::configure_counters(bool reset, bool wait)
void
FreedrenoDriver::collect_countables()
{
assert(io); /* This is legacy path only */
last_dump_ts = gpu_timestamp();
for (const auto &countable : countables)
countable.collect();
}
/* (Re)configure the kernel perfcntr sample stream for the currently
 * resolved countables.
 *
 * Returns zero on success, or a negative value if the PERFCNTR_CONFIG
 * ioctl failed (e.g. unsupported kernel), in which case the caller can
 * fall back to the legacy mmio path.
 */
int
FreedrenoDriver::configure_counters_stream()
{
   /* Tear down any previously configured stream, including the sample
    * buffer allocated for it, so reconfiguration does not leak:
    */
   if (perfcntr_stream_fd >= 0) {
      close(perfcntr_stream_fd);
      perfcntr_stream_fd = -1;
      free(sample_buf);
      sample_buf = NULL;
   }

   /* Each sample is a 16B header (timestamp + seqno) plus one u64 per
    * countable:
    */
   unsigned sample_size = sizeof(uint64_t) * (2 + countables.size());
   unsigned bufsz = 2 * sample_size;
   unsigned bufsz_shift = ffs(util_next_power_of_two(bufsz)) - 1;

   struct drm_msm_perfcntr_group groups[num_perfcntrs];
   memset(groups, 0, sizeof(groups));

   struct drm_msm_perfcntr_config req = {
      .flags = MSM_PERFCNTR_STREAM,
      .groups = VOID2U64(groups),
      .period = sampling_period_ns_,
      .bufsz_shift = bufsz_shift,
      .group_stride = sizeof(struct drm_msm_perfcntr_group),
   };

   assert(req.period);

   for (const auto &countable : countables)
      countable.configure_stream(&req);

   /* Now that the groups are fully populated, resolve the sample indices: */
   for (const auto &countable : countables)
      countable.resolve_sample_idx(&req);

   int fd = drmIoctl(fd_device_fd(dev), DRM_IOCTL_MSM_PERFCNTR_CONFIG, &req);

   /* The per-group countable arrays are only needed for the ioctl itself
    * (the kernel handles reconfiguring counters after power collapse), so
    * free them on both the success and error paths -- previously they
    * leaked when the ioctl failed:
    */
   for (unsigned i = 0; i < num_perfcntrs; i++) {
      if (!groups[i].countables)
         break;
      free(U642VOID(groups[i].countables));
   }

   if (fd < 0)
      return fd;

   sample_buf = malloc(sample_size);
   if (!sample_buf) {
      close(fd);
      return -1;
   }

   perfcntr_stream_fd = fd;

   return 0;
}
/* Non-blocking check whether the perfcntr stream fd has a sample ready
 * to be read.
 */
static bool
perfcntr_stream_ready(int perfcntr_stream_fd)
{
   struct pollfd pfd = {
      .fd = perfcntr_stream_fd,
      .events = POLLIN,
   };

   if (poll(&pfd, 1, 0) < 0)
      return false;

   return (pfd.revents & POLLIN) != 0;
}
static uint64_t
ticks_to_ns(uint64_t ticks)
{
@ -82,6 +164,61 @@ ticks_to_ns(uint64_t ticks)
return ticks / GPU_TICKS_PER_NS;
}
/* Drain all pending samples from the perfcntr stream fd.
 *
 * Each sample consists of a 16B header (always-on timestamp + seqno)
 * followed by one u64 per configured countable.  Returns true when at
 * least one sample was consumed and the last sample showed no
 * discontinuity (seqno reset to zero).
 */
bool
FreedrenoDriver::collect_countables_stream()
{
   unsigned nsamples = 0;
   bool discontinuity = false;

   assert(perfcntr_stream_fd >= 0);   /* stream must have been configured */

   while (perfcntr_stream_ready(perfcntr_stream_fd)) {
      unsigned sample_size = sizeof(uint64_t) * (2 + countables.size());
      size_t sz = sample_size;
      void *ptr = sample_buf;

      /* A sample must be consumed in full; short reads are resumed and
       * EINTR/EAGAIN are retried:
       */
      while (sz > 0) {
         ssize_t ret = read(perfcntr_stream_fd, ptr, sz);
         if (ret < 0)
            ret = -errno;
         if (ret == -EINTR || ret == -EAGAIN)
            continue;
         if (ret < 0)
            errx(ret, "read failed");
         sz -= ret;
         ptr = static_cast<char *>(ptr) + ret;
      }

      uint64_t *buf = (uint64_t *)sample_buf;
      uint64_t ts = buf[0];
      /* seqno occupies the low 32b of the second header word: */
      uint32_t seqno = buf[1] & 0xffffffff;

      /* NOTE(review): only the *last* sample's seqno is considered here;
       * a discontinuity in an earlier sample of this batch would be
       * forgotten.  Confirm whether this should be sticky (|=) across
       * the whole batch.
       */
      discontinuity = seqno == 0;

      /* Capture the timestamp from the *start* of the sampling period: */
      last_capture_ts = last_dump_ts;
      last_dump_ts = ts;

      auto elapsed_time_ns = ticks_to_ns(last_dump_ts - last_capture_ts);
      time = (float)elapsed_time_ns / 1000000000.0;

      /* advance past header: */
      buf += 2;

      for (const auto &countable : countables)
         countable.collect_stream(buf);

      nsamples++;
   }

   return (nsamples > 0) && !discontinuity;
}
bool
FreedrenoDriver::init_perfcnt()
{
@ -107,9 +244,7 @@ FreedrenoDriver::init_perfcnt()
has_suspend_count = true;
}
fd_pipe_set_param(pipe, FD_SYSPROF, 1);
perfcntrs = fd_perfcntrs(fd_pipe_dev_id(pipe), &num_perfcntrs);
perfcntrs = fd_perfcntrs(dev_id, &num_perfcntrs);
if (num_perfcntrs == 0) {
PERFETTO_FATAL("No hw counters available");
return false;
@ -127,6 +262,9 @@ FreedrenoDriver::init_perfcnt()
case 7:
setup_a7xx_counters();
break;
case 8:
setup_a8xx_counters();
break;
default:
PERFETTO_FATAL("Unsupported GPU: a%03u", fd_dev_gpu_id(dev_id));
return false;
@ -137,12 +275,20 @@ FreedrenoDriver::init_perfcnt()
for (const auto &countable : countables)
countable.resolve();
if (!configure_counters_stream()) {
close(perfcntr_stream_fd);
perfcntr_stream_fd = -1;
return true;
}
io = fd_dt_find_io();
if (!io) {
PERFETTO_FATAL("Could not map GPU I/O space");
return false;
}
fd_pipe_set_param(pipe, FD_SYSPROF, 1);
configure_counters(true, true);
collect_countables();
@ -165,14 +311,26 @@ FreedrenoDriver::enable_all_counters()
}
void
FreedrenoDriver::enable_perfcnt(const uint64_t /* sampling_period_ns */)
FreedrenoDriver::enable_perfcnt(const uint64_t sampling_period_ns)
{
sampling_period_ns_ = sampling_period_ns;
if (!io) {
/* reconfigure counter stream: */
configure_counters_stream();
collect_countables_stream();
}
}
bool
FreedrenoDriver::dump_perfcnt()
{
if (has_suspend_count) {
/* Note, when using perfcntr stream instead of mmio based counter
* reads, we can skip this (since the seqno in the data read from
* the stream will tell us if there is a discontinuity, and the
* kernel will handle reconfiguring counters on resume)
*/
if (has_suspend_count && io) {
uint64_t val;
fd_pipe_get_param(pipe, FD_SUSPEND_COUNT, &val);
@ -193,6 +351,9 @@ FreedrenoDriver::dump_perfcnt()
}
}
if (!io)
return collect_countables_stream();
auto last_ts = last_dump_ts;
/* Capture the timestamp from the *start* of the sampling period: */
@ -223,11 +384,13 @@ uint64_t FreedrenoDriver::next()
return ret;
}
void FreedrenoDriver::disable_perfcnt()
void
FreedrenoDriver::disable_perfcnt()
{
/* There isn't really any disable, only reconfiguring which countables
* get muxed to which counters
*/
if (perfcntr_stream_fd >= 0) {
close(perfcntr_stream_fd);
perfcntr_stream_fd = -1;
}
}
/*
@ -278,6 +441,80 @@ FreedrenoDriver::Countable::configure(struct fd_ringbuffer *ring, bool reset) co
}
}
/* Add this countable to the PERFCNTR_CONFIG request, appending a new
 * group entry if this is the first countable in its group.
 *
 * Only the index *within* the group is recorded at this point; the
 * absolute sample index is fixed up later by resolve_sample_idx() once
 * all groups are fully populated.
 */
void
FreedrenoDriver::Countable::configure_stream(struct drm_msm_perfcntr_config *req) const
{
   const struct fd_perfcntr_countable *countable = d->state[id].countable;
   struct drm_msm_perfcntr_group *groups =
      (struct drm_msm_perfcntr_group *)U642VOID(req->groups);

   /* Find group: */
   struct drm_msm_perfcntr_group *g = NULL;
   for (unsigned i = 0; i < req->nr_groups; i++) {
      if (!strcmp(groups[i].group_name, group.c_str())) {
         g = &groups[i];
         break;
      }
   }

   /* If not found, append a new group: */
   if (!g) {
      g = &groups[req->nr_groups++];

      /* group_name is a fixed 16-byte field in the uapi struct; use a
       * bounded copy (strcpy would overflow on a too-long name).  Group
       * names are expected to fit -- truncation here would also break the
       * strcmp-based lookup above.
       */
      snprintf(g->group_name, sizeof(g->group_name), "%s", group.c_str());

      /* allocate countables for max # of counters in the group */
      for (unsigned i = 0; i < d->num_perfcntrs; i++) {
         if (!strcmp(d->perfcntrs[i].name, group.c_str())) {
            void *countables = calloc(d->perfcntrs[i].num_counters, sizeof(uint32_t));
            g->countables = VOID2U64(countables);
            break;
         }
      }

      assert(g->countables);
   }

   /* Initially, just store the index within the group, since earlier groups
    * are not yet fully populated (ie. we don't yet know the offset of the
    * first sample in the group)
    */
   d->state[id].idx = g->nr_countables;

   /* And last, append the countable: */
   uint32_t *countables = (uint32_t *)U642VOID(g->countables);
   countables[g->nr_countables++] = countable->selector;
}
static unsigned
find_group_offset(const struct drm_msm_perfcntr_config *req, const char *group)
{
struct drm_msm_perfcntr_group *groups =
(struct drm_msm_perfcntr_group *)U642VOID(req->groups);
unsigned off = 0;
for (unsigned i = 0; i < req->nr_groups; i++) {
if (!strcmp(groups[i].group_name, group))
break;
off += groups[i].nr_countables;
}
return off;
}
void
FreedrenoDriver::Countable::resolve_sample_idx(const struct drm_msm_perfcntr_config *req) const
{
d->state[id].idx += find_group_offset(req, group.c_str());
}
void
FreedrenoDriver::Countable::collect_stream(const uint64_t *buf) const
{
d->state[id].last_value = d->state[id].value;
d->state[id].value = buf[d->state[id].idx];
}
/* Collect current counter value and calculate delta since last sample: */
void
FreedrenoDriver::Countable::collect() const
@ -302,11 +539,10 @@ FreedrenoDriver::Countable::resolve() const
if (group != g->name)
continue;
for (unsigned j = 0; j < g->num_countables; j++) {
const struct fd_perfcntr_countable *c = &g->countables[j];
if (name != c->name)
continue;
const struct fd_perfcntr_countable *c =
fd_perfcntrs_countable(g, name.c_str());
if (c) {
d->state[id].countable = c;
/* Assign counters from high to low to reduce conflicts with UMD-owned

View file

@ -6,6 +6,7 @@
#pragma once
#include "pps/pps_driver.h"
#include "drm-uapi/msm_drm.h"
extern "C" {
struct fd_dev_id;
@ -54,10 +55,26 @@ private:
const struct fd_dev_info *info;
/**
* The memory mapped i/o space for counter readback:
* The memory mapped i/o space for counter readback (legacy):
*/
void *io;
/**
* perfcntr stream fd, if not using memory mapped i/o for counter
* readback.
*/
int perfcntr_stream_fd = -1;
/**
* The configured sampling period
*/
uint64_t sampling_period_ns_ = 1000000000;
/**
* Buffer used to read samples
*/
void *sample_buf;
const struct fd_perfcntr_group *perfcntrs;
unsigned num_perfcntrs;
@ -75,10 +92,14 @@ private:
void setup_a6xx_counters();
void setup_a7xx_counters();
void setup_a8xx_counters();
void configure_counters(bool reset, bool wait);
void collect_countables();
int configure_counters_stream();
bool collect_countables_stream();
/**
* Split out countable mutable state from the class so that copy-
* constructor does something sane when lambda derive function
@ -88,6 +109,9 @@ private:
uint64_t last_value, value;
const struct fd_perfcntr_countable *countable;
const struct fd_perfcntr_counter *counter;
/* index into perfcntr stream sample buf: */
unsigned idx;
};
std::vector<struct CountableState> state;
@ -115,6 +139,11 @@ private:
void collect() const;
void resolve() const;
/* perfcntr stream related APIs */
void configure_stream(struct drm_msm_perfcntr_config *req) const;
void resolve_sample_idx(const struct drm_msm_perfcntr_config *req) const;
void collect_stream(const uint64_t *buf) const;
private:
uint64_t get_value() const;

View file

@ -7,6 +7,7 @@ pps_freedreno_lib = static_library(
sources: [
'fd_pps_a6xx.cc',
'fd_pps_a7xx.cc',
'fd_pps_a8xx.cc',
'fd_pps_driver.cc',
'fd_pps_driver.h',
freedreno_xml_header_files,

View file

@ -0,0 +1,82 @@
/*
* Copyright © 2016 Rob Clark <robclark@freedesktop.org>
* All Rights Reserved.
* SPDX-License-Identifier: MIT
*/
#include <assert.h>
#include <ctype.h>
#include <curses.h>
#include <err.h>
#include <inttypes.h>
#include <libconfig.h>
#include <locale.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <unistd.h>
#include <xf86drm.h>
#include "drm/freedreno_drmif.h"
#include "drm/freedreno_ringbuffer.h"
#include "util/os_file.h"
#include "freedreno_dt.h"
#include "freedreno_perfcntr.h"
/*
* Simple tool to dump perfctr tables (so we can make sure nothing gets
* missed while converting to generated tables)
*/
/* Entry point: dump the perfcntr group/counter/countable tables for the
 * GPU generation named on the command line.
 */
int
main(int argc, char **argv)
{
   struct fd_dev_id dev_id = {};
   unsigned ngroups = 0;
   const struct fd_perfcntr_group *groups;

   if (argc != 2) {
      /* Previously this exited silently; tell the user what is expected: */
      fprintf(stderr, "usage: %s <a2xx|a5xx|a6xx|a7xx>\n", argv[0]);
      return -1;
   }

   /* Map the generation name to a representative device id; an unknown
    * name leaves dev_id zeroed and fd_perfcntrs() will return NULL:
    */
   if (!strcmp(argv[1], "a2xx")) {
      dev_id.gpu_id = 200;
   } else if (!strcmp(argv[1], "a5xx")) {
      dev_id.gpu_id = 530;
   } else if (!strcmp(argv[1], "a6xx")) {
      dev_id.gpu_id = 630;
   } else if (!strcmp(argv[1], "a7xx")) {
      dev_id.chip_id = 0xffff07030001;
   }

   groups = fd_perfcntrs(&dev_id, &ngroups);
   if (!groups) {
      errx(1, "no perfcntr support");
   }

   /* ngroups and the per-group counts are unsigned; use unsigned loop
    * indices to avoid signed/unsigned comparison:
    */
   for (unsigned i = 0; i < ngroups; i++) {
      const struct fd_perfcntr_group *g = &groups[i];

      printf("GROUP[%s]: num_counters=%u, num_countables=%u\n",
             g->name, g->num_counters, g->num_countables);

      for (unsigned j = 0; j < g->num_counters; j++) {
         const struct fd_perfcntr_counter *counter = &g->counters[j];
         printf("COUNTER: %04x, %04x, %04x, %04x, %04x\n",
                counter->select_reg, counter->counter_reg_lo, counter->counter_reg_hi,
                counter->enable, counter->clear);
      }

      for (unsigned j = 0; j < g->num_countables; j++) {
         const struct fd_perfcntr_countable *countable = &g->countables[j];
         printf("COUNTABLE[%s]: %04x\n", countable->name, countable->selector);
      }

      printf("\n");
   }

   return 0;
}

View file

@ -97,107 +97,104 @@ enum {
static_assert(DERIVED_COUNTER_PERFCNTR_MAX_VALUE <= FD_DERIVED_COUNTER_COLLECTION_MAX_ENABLED_PERFCNTRS, "");
#define DERIVED_COUNTER_PERFCNTR(_enum, _counter) \
[DERIVED_COUNTER_PERFCNTR_##_enum] = { .counter = _counter, .countable = A7XX_PERF_##_enum }
#define DERIVED_COUNTER_PERFCNTR_BV(_enum, _counter) \
[DERIVED_COUNTER_PERFCNTR_BV_##_enum] = { .counter = _counter, .countable = A7XX_PERF_##_enum }
#define DERIVED_COUNTER_PERFCNTR(_countable, _group) \
[DERIVED_COUNTER_PERFCNTR_##_countable] = { .countable = "PERF_" #_countable, .group = #_group }
#define DERIVED_COUNTER_PERFCNTR_BV(_countable, _group) \
[DERIVED_COUNTER_PERFCNTR_BV_##_countable] = { .countable = "PERF_" #_countable, .group = "BV_" #_group }
static const struct {
const struct fd_perfcntr_counter *counter;
unsigned countable;
} a7xx_derived_counter_perfcntrs[] = {
const struct fd_derived_counter_perfcntr a7xx_derived_counter_perfcntrs[] = {
/* CP: 3/14 counters */
DERIVED_COUNTER_PERFCNTR(CP_ALWAYS_COUNT, &cp_counters[0]),
DERIVED_COUNTER_PERFCNTR(CP_NUM_PREEMPTIONS, &cp_counters[1]),
DERIVED_COUNTER_PERFCNTR(CP_PREEMPTION_REACTION_DELAY, &cp_counters[2]),
DERIVED_COUNTER_PERFCNTR(CP_ALWAYS_COUNT, CP),
DERIVED_COUNTER_PERFCNTR(CP_NUM_PREEMPTIONS, CP),
DERIVED_COUNTER_PERFCNTR(CP_PREEMPTION_REACTION_DELAY, CP),
/* RBBM: 1/4 counters */
DERIVED_COUNTER_PERFCNTR(RBBM_STATUS_MASKED, &rbbm_counters[0]),
DERIVED_COUNTER_PERFCNTR(RBBM_STATUS_MASKED, RBBM),
/* PC: 3/8 counters */
DERIVED_COUNTER_PERFCNTR(PC_STALL_CYCLES_VFD, &pc_counters[0]),
DERIVED_COUNTER_PERFCNTR(PC_VERTEX_HITS, &pc_counters[1]),
DERIVED_COUNTER_PERFCNTR(PC_VS_INVOCATIONS, &pc_counters[2]),
DERIVED_COUNTER_PERFCNTR(PC_STALL_CYCLES_VFD, PC),
DERIVED_COUNTER_PERFCNTR(PC_VERTEX_HITS, PC),
DERIVED_COUNTER_PERFCNTR(PC_VS_INVOCATIONS, PC),
/* TSE: 4/4 counters */
DERIVED_COUNTER_PERFCNTR(TSE_INPUT_PRIM, &tse_counters[0]),
DERIVED_COUNTER_PERFCNTR(TSE_TRIVAL_REJ_PRIM, &tse_counters[1]),
DERIVED_COUNTER_PERFCNTR(TSE_CLIPPED_PRIM, &tse_counters[2]),
DERIVED_COUNTER_PERFCNTR(TSE_OUTPUT_VISIBLE_PRIM, &tse_counters[3]),
DERIVED_COUNTER_PERFCNTR(TSE_INPUT_PRIM, TSE),
DERIVED_COUNTER_PERFCNTR(TSE_TRIVAL_REJ_PRIM, TSE),
DERIVED_COUNTER_PERFCNTR(TSE_CLIPPED_PRIM, TSE),
DERIVED_COUNTER_PERFCNTR(TSE_OUTPUT_VISIBLE_PRIM, TSE),
/* UCHE: 5/12 counters */
DERIVED_COUNTER_PERFCNTR(UCHE_STALL_CYCLES_ARBITER, &uche_counters[0]),
DERIVED_COUNTER_PERFCNTR(UCHE_VBIF_READ_BEATS_TP, &uche_counters[1]),
DERIVED_COUNTER_PERFCNTR(UCHE_VBIF_READ_BEATS_VFD, &uche_counters[2]),
DERIVED_COUNTER_PERFCNTR(UCHE_VBIF_READ_BEATS_SP, &uche_counters[3]),
DERIVED_COUNTER_PERFCNTR(UCHE_READ_REQUESTS_TP, &uche_counters[4]),
DERIVED_COUNTER_PERFCNTR(UCHE_STALL_CYCLES_ARBITER, UCHE),
DERIVED_COUNTER_PERFCNTR(UCHE_VBIF_READ_BEATS_TP, UCHE),
DERIVED_COUNTER_PERFCNTR(UCHE_VBIF_READ_BEATS_VFD, UCHE),
DERIVED_COUNTER_PERFCNTR(UCHE_VBIF_READ_BEATS_SP, UCHE),
DERIVED_COUNTER_PERFCNTR(UCHE_READ_REQUESTS_TP, UCHE),
/* TP: 7/12 counters */
DERIVED_COUNTER_PERFCNTR(TP_BUSY_CYCLES, &tp_counters[0]),
DERIVED_COUNTER_PERFCNTR(TP_L1_CACHELINE_REQUESTS, &tp_counters[1]),
DERIVED_COUNTER_PERFCNTR(TP_L1_CACHELINE_MISSES, &tp_counters[2]),
DERIVED_COUNTER_PERFCNTR(TP_OUTPUT_PIXELS, &tp_counters[3]),
DERIVED_COUNTER_PERFCNTR(TP_OUTPUT_PIXELS_POINT, &tp_counters[4]),
DERIVED_COUNTER_PERFCNTR(TP_OUTPUT_PIXELS_BILINEAR, &tp_counters[5]),
DERIVED_COUNTER_PERFCNTR(TP_OUTPUT_PIXELS_ANISO, &tp_counters[6]),
DERIVED_COUNTER_PERFCNTR(TP_BUSY_CYCLES, TP),
DERIVED_COUNTER_PERFCNTR(TP_L1_CACHELINE_REQUESTS, TP),
DERIVED_COUNTER_PERFCNTR(TP_L1_CACHELINE_MISSES, TP),
DERIVED_COUNTER_PERFCNTR(TP_OUTPUT_PIXELS, TP),
DERIVED_COUNTER_PERFCNTR(TP_OUTPUT_PIXELS_POINT, TP),
DERIVED_COUNTER_PERFCNTR(TP_OUTPUT_PIXELS_BILINEAR, TP),
DERIVED_COUNTER_PERFCNTR(TP_OUTPUT_PIXELS_ANISO, TP),
/* SP: 24/24 counters */
DERIVED_COUNTER_PERFCNTR(SP_BUSY_CYCLES, &sp_counters[ 0]),
DERIVED_COUNTER_PERFCNTR(SP_ALU_WORKING_CYCLES, &sp_counters[ 1]),
DERIVED_COUNTER_PERFCNTR(SP_EFU_WORKING_CYCLES, &sp_counters[ 2]),
DERIVED_COUNTER_PERFCNTR(SP_STALL_CYCLES_TP, &sp_counters[ 3]),
DERIVED_COUNTER_PERFCNTR(SP_NON_EXECUTION_CYCLES, &sp_counters[ 4]),
DERIVED_COUNTER_PERFCNTR(SP_VS_STAGE_TEX_INSTRUCTIONS, &sp_counters[ 5]),
DERIVED_COUNTER_PERFCNTR(SP_VS_STAGE_EFU_INSTRUCTIONS, &sp_counters[ 6]),
DERIVED_COUNTER_PERFCNTR(SP_VS_STAGE_FULL_ALU_INSTRUCTIONS, &sp_counters[ 7]),
DERIVED_COUNTER_PERFCNTR(SP_FS_STAGE_EFU_INSTRUCTIONS, &sp_counters[ 8]),
DERIVED_COUNTER_PERFCNTR(SP_FS_STAGE_FULL_ALU_INSTRUCTIONS, &sp_counters[ 9]),
DERIVED_COUNTER_PERFCNTR(SP_FS_STAGE_HALF_ALU_INSTRUCTIONS, &sp_counters[10]),
DERIVED_COUNTER_PERFCNTR(SP_ICL1_REQUESTS, &sp_counters[11]),
DERIVED_COUNTER_PERFCNTR(SP_ICL1_MISSES, &sp_counters[12]),
DERIVED_COUNTER_PERFCNTR(SP_ANY_EU_WORKING_FS_STAGE, &sp_counters[13]),
DERIVED_COUNTER_PERFCNTR(SP_ANY_EU_WORKING_VS_STAGE, &sp_counters[14]),
DERIVED_COUNTER_PERFCNTR(SP_ANY_EU_WORKING_CS_STAGE, &sp_counters[15]),
DERIVED_COUNTER_PERFCNTR(SP_PIXELS, &sp_counters[16]),
DERIVED_COUNTER_PERFCNTR(SP_RAY_QUERY_INSTRUCTIONS, &sp_counters[17]),
DERIVED_COUNTER_PERFCNTR(SP_RTU_BUSY_CYCLES, &sp_counters[18]),
DERIVED_COUNTER_PERFCNTR(SP_RTU_BVH_FETCH_LATENCY_CYCLES, &sp_counters[19]),
DERIVED_COUNTER_PERFCNTR(SP_RTU_BVH_FETCH_LATENCY_SAMPLES, &sp_counters[20]),
DERIVED_COUNTER_PERFCNTR(SP_RTU_RAY_BOX_INTERSECTIONS, &sp_counters[21]),
DERIVED_COUNTER_PERFCNTR(SP_RTU_RAY_TRIANGLE_INTERSECTIONS, &sp_counters[22]),
DERIVED_COUNTER_PERFCNTR(SP_SCH_STALL_CYCLES_RTU, &sp_counters[23]),
DERIVED_COUNTER_PERFCNTR(SP_BUSY_CYCLES, SP),
DERIVED_COUNTER_PERFCNTR(SP_ALU_WORKING_CYCLES, SP),
DERIVED_COUNTER_PERFCNTR(SP_EFU_WORKING_CYCLES, SP),
DERIVED_COUNTER_PERFCNTR(SP_STALL_CYCLES_TP, SP),
DERIVED_COUNTER_PERFCNTR(SP_NON_EXECUTION_CYCLES, SP),
DERIVED_COUNTER_PERFCNTR(SP_VS_STAGE_TEX_INSTRUCTIONS, SP),
DERIVED_COUNTER_PERFCNTR(SP_VS_STAGE_EFU_INSTRUCTIONS, SP),
DERIVED_COUNTER_PERFCNTR(SP_VS_STAGE_FULL_ALU_INSTRUCTIONS, SP),
DERIVED_COUNTER_PERFCNTR(SP_FS_STAGE_EFU_INSTRUCTIONS, SP),
DERIVED_COUNTER_PERFCNTR(SP_FS_STAGE_FULL_ALU_INSTRUCTIONS, SP),
DERIVED_COUNTER_PERFCNTR(SP_FS_STAGE_HALF_ALU_INSTRUCTIONS, SP),
DERIVED_COUNTER_PERFCNTR(SP_ICL1_REQUESTS, SP),
DERIVED_COUNTER_PERFCNTR(SP_ICL1_MISSES, SP),
DERIVED_COUNTER_PERFCNTR(SP_ANY_EU_WORKING_FS_STAGE, SP),
DERIVED_COUNTER_PERFCNTR(SP_ANY_EU_WORKING_VS_STAGE, SP),
DERIVED_COUNTER_PERFCNTR(SP_ANY_EU_WORKING_CS_STAGE, SP),
DERIVED_COUNTER_PERFCNTR(SP_PIXELS, SP),
DERIVED_COUNTER_PERFCNTR(SP_RAY_QUERY_INSTRUCTIONS, SP),
DERIVED_COUNTER_PERFCNTR(SP_RTU_BUSY_CYCLES, SP),
DERIVED_COUNTER_PERFCNTR(SP_RTU_BVH_FETCH_LATENCY_CYCLES, SP),
DERIVED_COUNTER_PERFCNTR(SP_RTU_BVH_FETCH_LATENCY_SAMPLES, SP),
DERIVED_COUNTER_PERFCNTR(SP_RTU_RAY_BOX_INTERSECTIONS, SP),
DERIVED_COUNTER_PERFCNTR(SP_RTU_RAY_TRIANGLE_INTERSECTIONS, SP),
DERIVED_COUNTER_PERFCNTR(SP_SCH_STALL_CYCLES_RTU, SP),
/* CMP: 1/4 counters */
DERIVED_COUNTER_PERFCNTR(CMPDECMP_VBIF_READ_DATA, &cmp_counters[0]),
DERIVED_COUNTER_PERFCNTR(CMPDECMP_VBIF_READ_DATA, CMP),
/* BV_PC: 3/8 counters */
DERIVED_COUNTER_PERFCNTR_BV(PC_STALL_CYCLES_VFD, &bv_pc_counters[0]),
DERIVED_COUNTER_PERFCNTR_BV(PC_VERTEX_HITS, &bv_pc_counters[1]),
DERIVED_COUNTER_PERFCNTR_BV(PC_VS_INVOCATIONS, &bv_pc_counters[2]),
DERIVED_COUNTER_PERFCNTR_BV(PC_STALL_CYCLES_VFD, PC),
DERIVED_COUNTER_PERFCNTR_BV(PC_VERTEX_HITS, PC),
DERIVED_COUNTER_PERFCNTR_BV(PC_VS_INVOCATIONS, PC),
/* BV_TP: 6/6 counters */
DERIVED_COUNTER_PERFCNTR_BV(TP_L1_CACHELINE_REQUESTS, &bv_tp_counters[0]),
DERIVED_COUNTER_PERFCNTR_BV(TP_L1_CACHELINE_MISSES, &bv_tp_counters[1]),
DERIVED_COUNTER_PERFCNTR_BV(TP_OUTPUT_PIXELS, &bv_tp_counters[2]),
DERIVED_COUNTER_PERFCNTR_BV(TP_OUTPUT_PIXELS_POINT, &bv_tp_counters[3]),
DERIVED_COUNTER_PERFCNTR_BV(TP_OUTPUT_PIXELS_BILINEAR, &bv_tp_counters[4]),
DERIVED_COUNTER_PERFCNTR_BV(TP_OUTPUT_PIXELS_ANISO, &bv_tp_counters[5]),
DERIVED_COUNTER_PERFCNTR_BV(TP_L1_CACHELINE_REQUESTS, TP),
DERIVED_COUNTER_PERFCNTR_BV(TP_L1_CACHELINE_MISSES, TP),
DERIVED_COUNTER_PERFCNTR_BV(TP_OUTPUT_PIXELS, TP),
DERIVED_COUNTER_PERFCNTR_BV(TP_OUTPUT_PIXELS_POINT, TP),
DERIVED_COUNTER_PERFCNTR_BV(TP_OUTPUT_PIXELS_BILINEAR, TP),
DERIVED_COUNTER_PERFCNTR_BV(TP_OUTPUT_PIXELS_ANISO, TP),
/* GP: 8/12 counters */
DERIVED_COUNTER_PERFCNTR_BV(SP_STALL_CYCLES_TP, &bv_sp_counters[0]),
DERIVED_COUNTER_PERFCNTR_BV(SP_VS_STAGE_TEX_INSTRUCTIONS, &bv_sp_counters[1]),
DERIVED_COUNTER_PERFCNTR_BV(SP_VS_STAGE_EFU_INSTRUCTIONS, &bv_sp_counters[2]),
DERIVED_COUNTER_PERFCNTR_BV(SP_VS_STAGE_FULL_ALU_INSTRUCTIONS, &bv_sp_counters[3]),
DERIVED_COUNTER_PERFCNTR_BV(SP_ICL1_REQUESTS, &bv_sp_counters[4]),
DERIVED_COUNTER_PERFCNTR_BV(SP_ICL1_MISSES, &bv_sp_counters[5]),
DERIVED_COUNTER_PERFCNTR_BV(SP_ANY_EU_WORKING_FS_STAGE, &bv_sp_counters[6]),
DERIVED_COUNTER_PERFCNTR_BV(SP_ANY_EU_WORKING_VS_STAGE, &bv_sp_counters[7]),
DERIVED_COUNTER_PERFCNTR_BV(SP_STALL_CYCLES_TP, SP),
DERIVED_COUNTER_PERFCNTR_BV(SP_VS_STAGE_TEX_INSTRUCTIONS, SP),
DERIVED_COUNTER_PERFCNTR_BV(SP_VS_STAGE_EFU_INSTRUCTIONS, SP),
DERIVED_COUNTER_PERFCNTR_BV(SP_VS_STAGE_FULL_ALU_INSTRUCTIONS, SP),
DERIVED_COUNTER_PERFCNTR_BV(SP_ICL1_REQUESTS, SP),
DERIVED_COUNTER_PERFCNTR_BV(SP_ICL1_MISSES, SP),
DERIVED_COUNTER_PERFCNTR_BV(SP_ANY_EU_WORKING_FS_STAGE, SP),
DERIVED_COUNTER_PERFCNTR_BV(SP_ANY_EU_WORKING_VS_STAGE, SP),
/* LRZ: 4/4 counters */
DERIVED_COUNTER_PERFCNTR(LRZ_TOTAL_PIXEL, &lrz_counters[0]),
DERIVED_COUNTER_PERFCNTR(LRZ_VISIBLE_PIXEL_AFTER_LRZ, &lrz_counters[1]),
DERIVED_COUNTER_PERFCNTR(LRZ_TILE_KILLED, &lrz_counters[2]),
DERIVED_COUNTER_PERFCNTR(LRZ_PRIM_KILLED_BY_LRZ, &lrz_counters[3]),
DERIVED_COUNTER_PERFCNTR(LRZ_TOTAL_PIXEL, LRZ),
DERIVED_COUNTER_PERFCNTR(LRZ_VISIBLE_PIXEL_AFTER_LRZ, LRZ),
DERIVED_COUNTER_PERFCNTR(LRZ_TILE_KILLED, LRZ),
DERIVED_COUNTER_PERFCNTR(LRZ_PRIM_KILLED_BY_LRZ, LRZ),
};
static uint64_t
@ -985,50 +982,3 @@ const struct fd_derived_counter *a7xx_derived_counters[] = {
const unsigned a7xx_num_derived_counters = ARRAY_SIZE(a7xx_derived_counters);
static_assert(ARRAY_SIZE(a7xx_derived_counters) <= FD_DERIVED_COUNTER_COLLECTION_MAX_DERIVED_COUNTERS, "");
/* Prototype for linking purposes. */
void
a7xx_generate_derived_counter_collection(const struct fd_dev_id *id, struct fd_derived_counter_collection *collection);

/* Populate the enabled-perfcntr map/array of 'collection' for the a7xx
 * derived counters it requests, and fill in a7xx derivation context.
 */
void
a7xx_generate_derived_counter_collection(const struct fd_dev_id *id, struct fd_derived_counter_collection *collection)
{
   /* The provided collection should already specify the derived counters that will be measured.
    * This function will set up enabled_perfcntrs_map and enabled_perfcntrs array so that each
    * used DERIVED_COUNTER_PERFCNTR_* enum value will map to the corresponding index in the
    * array where the relevant fd_perfcntr_counter and fd_perfcntr_countable are stored.
    */
   collection->num_enabled_perfcntrs = 0;

   /* 0xff marks "not enabled".  NOTE(review): ARRAY_SIZE (not sizeof) is
    * passed as the memset length, which is only correct if the map's
    * element type is one byte -- the uint8_t indices used below suggest
    * it is, but confirm against the struct definition.
    */
   memset(collection->enabled_perfcntrs_map, 0xff, ARRAY_SIZE(collection->enabled_perfcntrs_map));

   /* First pass: mark every perfcntr referenced by a requested derived counter: */
   for (unsigned i = 0; i < collection->num_counters; ++i) {
      const struct fd_derived_counter *counter = collection->counters[i];

      for (unsigned j = 0; j < counter->num_perfcntrs; ++j) {
         uint8_t perfcntr = counter->perfcntrs[j];
         collection->enabled_perfcntrs_map[perfcntr] = 0x00;
      }
   }

   /* Note if CP_ALWAYS_COUNT is enabled. This is the zero-index perfcntr. */
   collection->cp_always_count_enabled = !collection->enabled_perfcntrs_map[0];

   /* Second pass: assign compact indices and record each enabled
    * perfcntr's counter/countable from the a7xx table:
    */
   for (unsigned i = 0; i < ARRAY_SIZE(collection->enabled_perfcntrs_map); ++i) {
      if (collection->enabled_perfcntrs_map[i] == 0xff)
         continue;

      uint8_t enabled_perfcntr_index = collection->num_enabled_perfcntrs++;

      collection->enabled_perfcntrs_map[i] = enabled_perfcntr_index;
      collection->enabled_perfcntrs[enabled_perfcntr_index].counter =
         a7xx_derived_counter_perfcntrs[i].counter;
      collection->enabled_perfcntrs[enabled_perfcntr_index].countable =
         a7xx_derived_counter_perfcntrs[i].countable;
   }

   const struct fd_dev_info *info = fd_dev_info_raw(id);

   /* assumes 2 USPTPs per SP core -- TODO confirm for all a7xx variants */
   collection->derivation_context.a7xx.number_of_usptp = info->num_sp_cores * 2;
   collection->derivation_context.a7xx.number_of_alus_per_usptp = 128;
}

View file

@ -11,6 +11,8 @@
#include <inttypes.h>
#include <libconfig.h>
#include <locale.h>
#include <poll.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
@ -24,9 +26,12 @@
#include "util/os_file.h"
#include "freedreno_common.h"
#include "freedreno_dt.h"
#include "freedreno_perfcntr.h"
#include "drm-uapi/msm_drm.h"
#define MAX_CNTR_PER_GROUP 24
#define REFRESH_MS 500
@ -45,6 +50,11 @@ static struct {
struct counter_group {
const struct fd_perfcntr_group *group;
/* We initially try to use all counters, but can reduce this if
* not all counters are available.
*/
unsigned num_counters;
struct {
const struct fd_perfcntr_counter *counter;
uint16_t select_val;
@ -75,11 +85,30 @@ static struct {
const struct fd_dev_id *dev_id;
struct fd_submit *submit;
struct fd_ringbuffer *ring;
} dev;
/* This is used for PERFCNTR_CONFIG if supported by kernel. In
* this case, dev.io is not used.
*/
struct drm_msm_perfcntr_config perfcntr_config;
int perfcntr_stream_fd;
int num_configured_counters;
uint32_t seqno;
bool discontinuity;
} dev = {
.perfcntr_config = {
.flags = MSM_PERFCNTR_STREAM | MSM_PERFCNTR_UPDATE,
.bufsz_shift = 12,
.group_stride = sizeof(struct drm_msm_perfcntr_group),
},
.perfcntr_stream_fd = -1,
};
static void config_save(void);
static void config_restore(void);
static void restore_counter_groups(void);
static void setup_counter_groups(const struct fd_perfcntr_group *groups);
/*
* helpers
@ -113,6 +142,27 @@ delta(uint64_t a, uint64_t b)
return b - a;
}
/* (Re)issue the PERFCNTR_CONFIG ioctl, closing any previously opened
 * stream fd first.  On success the new stream fd is stored in
 * dev.perfcntr_stream_fd and zero is returned; on failure a negative
 * errno is returned.
 */
static int
perfcntr_config(void)
{
   /* Discard any stale stream before asking the kernel for a new one: */
   if (dev.perfcntr_stream_fd >= 0) {
      close(dev.perfcntr_stream_fd);
      dev.perfcntr_stream_fd = -1;
   }

   errno = 0;

   int stream_fd = drmIoctl(fd_device_fd(dev.dev),
                            DRM_IOCTL_MSM_PERFCNTR_CONFIG,
                            &dev.perfcntr_config);
   if (stream_fd < 0)
      return -errno;

   dev.perfcntr_stream_fd = stream_fd;

   return 0;
}
static void
find_device(void)
{
@ -146,6 +196,42 @@ find_device(void)
printf("min_freq=%u, max_freq=%u\n", dev.min_freq, dev.max_freq);
const struct fd_perfcntr_group *groups;
groups = fd_perfcntrs(dev.dev_id, &dev.ngroups);
if (!groups) {
errx(1, "no perfcntr support");
}
dev.groups = calloc(dev.ngroups, sizeof(struct counter_group));
setup_counter_groups(groups);
ret = perfcntr_config();
if (ret == -E2BIG) {
struct drm_msm_perfcntr_group *g = U642VOID(dev.perfcntr_config.groups);
/* we are trying to use too many counters, back off: */
for (unsigned i = 0; i < dev.ngroups; i++) {
if (g[i].nr_countables < dev.groups[i].num_counters) {
printf("reducing %s counters %u -> %u\n",
groups[i].name, dev.groups[i].num_counters, g[i].nr_countables);
dev.num_configured_counters -=
dev.groups[i].num_counters - g[i].nr_countables;
dev.groups[i].num_counters = g[i].nr_countables;
}
}
ret = perfcntr_config();
}
if (!ret) {
return;
}
/* mmio not supported on gen8+: */
if (fd_dev_gen(dev.dev_id) >= 8) {
err(1, "mmio fallback not supported");
}
dev.io = fd_dt_find_io();
if (!dev.io) {
err(1, "could not map device");
@ -161,6 +247,13 @@ find_device(void)
static void
flush_ring(void)
{
if (!dev.io) {
int ret = perfcntr_config();
if (ret < 0)
errx(1, "perfcntr_config() failed");
return;
}
if (!dev.submit)
return;
@ -181,7 +274,7 @@ flush_ring(void)
static void
select_counter(struct counter_group *group, int ctr, int countable_val)
{
assert(ctr < group->group->num_counters);
assert(ctr < group->num_counters);
unsigned countable_idx = UINT32_MAX;
for (unsigned i = 0; i < group->group->num_countables; i++) {
@ -198,6 +291,20 @@ select_counter(struct counter_group *group, int ctr, int countable_val)
group->label[ctr] = group->group->countables[countable_idx].name;
group->counter[ctr].select_val = countable_val;
/* If using PERFCNTR_CONFIG, then update the ioctl structure: */
if (!dev.io) {
struct drm_msm_perfcntr_group *g = U642VOID(dev.perfcntr_config.groups);
for (int i = 0; i < dev.ngroups; i++) {
if (&dev.groups[i] == group) {
uint32_t *countables = U642VOID(g[i].countables);
countables[ctr] = countable_val;
break;
}
}
return;
}
if (!dev.submit) {
dev.submit = fd_submit_new(dev.pipe);
dev.ring = fd_submit_new_ringbuffer(
@ -311,6 +418,82 @@ check_counter_invalid(struct counter_group *group, int ctr)
group->counter[ctr].is_invalid = (hw_selector != group->counter[ctr].select_val);
}
/* Non-blocking check for whether a sample is waiting on the stream fd: */
static bool
perfcntr_stream_ready(void)
{
   struct pollfd pfd = {
      .fd = dev.perfcntr_stream_fd,
      .events = POLLIN,
   };

   if (poll(&pfd, 1, 0) < 0)
      return false;

   return !!(pfd.revents & POLLIN);
}
/* GPU always-on timer constants */
static const uint64_t ALWAYS_ON_FREQUENCY_HZ = 19200000;
static const double GPU_TICKS_PER_US = ALWAYS_ON_FREQUENCY_HZ / 1000000.0;

/* Convert always-on counter ticks (19.2 MHz) to microseconds (truncating): */
static uint64_t
ticks_to_us(uint64_t ticks)
{
   double us = ticks / GPU_TICKS_PER_US;
   return (uint64_t)us;
}
static void
resample_perfcntr_stream(void)
{
if (!perfcntr_stream_ready()) {
dev.discontinuity = true;
return;
}
uint64_t buf[dev.num_configured_counters + 2]; /* include 128b header */
void *ptr = buf;
size_t sz = sizeof(buf);
while (sz > 0) {
ssize_t ret = read(dev.perfcntr_stream_fd, ptr, sz);
if (ret < 0)
ret = -errno;
if (ret == -EINTR || ret == -EAGAIN)
continue;
if (ret < 0)
errx(ret, "read failed");
sz -= ret;
ptr += ret;
}
int idx = 0;
uint64_t ts = ticks_to_us(buf[idx++]);
uint32_t seqno = buf[idx++] & 0xffffffff;
dev.discontinuity = (seqno == 0);
for (unsigned i = 0; i < dev.ngroups; i++) {
struct counter_group *group = &dev.groups[i];
for (unsigned ctr = 0; ctr < group->num_counters; ctr++) {
uint64_t previous_value = group->value[ctr];
group->value[ctr] = buf[idx++];
group->value_delta[ctr] = delta(previous_value, group->value[ctr]);
uint64_t previous_sample_time = group->sample_time[ctr];
group->sample_time[ctr] = ts;
group->sample_time_delta[ctr] = delta(previous_sample_time, ts);
}
}
}
/* sample all the counters: */
static void
resample(void)
@ -323,9 +506,14 @@ resample(void)
last_time = current_time;
if (!dev.io) {
resample_perfcntr_stream();
return;
}
for (unsigned i = 0; i < dev.ngroups; i++) {
struct counter_group *group = &dev.groups[i];
for (unsigned j = 0; j < group->group->num_counters; j++) {
for (unsigned j = 0; j < group->num_counters; j++) {
resample_counter(group, j, current_time);
check_counter_invalid(group, j);
}
@ -469,7 +657,7 @@ static void
redraw_counter(WINDOW *win, int row, struct counter_group *group, int ctr,
bool selected)
{
bool is_invalid = group->counter[ctr].is_invalid;
bool is_invalid = group->counter[ctr].is_invalid || dev.discontinuity;
redraw_counter_label(win, row, group->label[ctr], selected, is_invalid);
redraw_counter_value(win, row, group, ctr, is_invalid);
}
@ -513,13 +701,13 @@ redraw(WINDOW *win)
if (group->counter[0].is_gpufreq_counter)
j++;
if (j < group->group->num_counters) {
if (j < group->num_counters) {
if ((scroll <= row) && ((row - scroll) < max))
redraw_group_header(win, row - scroll, group->group->name);
row++;
}
for (; j < group->group->num_counters; j++) {
for (; j < group->num_counters; j++) {
if ((scroll <= row) && ((row - scroll) < max))
redraw_counter(win, row - scroll, group, j, row == current_cntr);
row++;
@ -554,7 +742,7 @@ current_counter(int *ctr)
j++;
/* account for group header: */
if (j < group->group->num_counters) {
if (j < group->num_counters) {
/* cannot select group header.. return null to indicate this
* main_ui():
*/
@ -563,7 +751,7 @@ current_counter(int *ctr)
n++;
}
for (; j < group->group->num_counters; j++) {
for (; j < group->num_counters; j++) {
if (n == current_cntr) {
if (ctr)
*ctr = j;
@ -734,6 +922,9 @@ main_ui(void)
resample();
redraw(mainwin);
if (!dev.io)
continue;
/* restore the counters every 0.5s in case the GPU has suspended,
* in which case the current selected countables will have reset:
*/
@ -761,7 +952,7 @@ dump_counters(void)
for (unsigned i = 0; i < dev.ngroups; i++) {
const struct counter_group *group = &dev.groups[i];
for (unsigned j = 0; j < group->group->num_counters; j++) {
for (unsigned j = 0; j < group->num_counters; j++) {
const char *label = group->label[j];
float val = (float) group->value_delta[j] * 1000000.0 /
(float) group->sample_time_delta[j];
@ -798,7 +989,7 @@ restore_counter_groups(void)
for (unsigned i = 0; i < dev.ngroups; i++) {
struct counter_group *group = &dev.groups[i];
for (unsigned j = 0; j < group->group->num_counters; j++) {
for (unsigned j = 0; j < group->num_counters; j++) {
/* This should also write the CP_ALWAYS_COUNT selectable value into
* the reserved CP counter we use for GPU frequency measurement,
* avoiding someone else writing a different value there.
@ -811,12 +1002,29 @@ restore_counter_groups(void)
static void
setup_counter_groups(const struct fd_perfcntr_group *groups)
{
/* pre-allocate memory needed for PERFCNTR_CONFIG ioctl: */
struct drm_msm_perfcntr_group *g = calloc(sizeof(struct drm_msm_perfcntr_group), dev.ngroups);
dev.perfcntr_config.nr_groups = dev.ngroups;
dev.perfcntr_config.period = options.refresh_ms * 1000000;
dev.perfcntr_config.groups = VOID2U64(g);
for (unsigned i = 0; i < dev.ngroups; i++) {
struct counter_group *group = &dev.groups[i];
group->group = &groups[i];
if (strlen(groups[i].name) > sizeof(g[i].group_name))
errx(1, "group name too large: %s", groups[i].name);
max_rows += group->group->num_counters + 1;
strncpy(g[i].group_name, groups[i].name, sizeof(g[i].group_name));
g[i].nr_countables = groups[i].num_counters;
g[i].countables = VOID2U64(calloc(sizeof(uint32_t), g[i].nr_countables));
dev.num_configured_counters += g[i].nr_countables;
group->group = &groups[i];
group->num_counters = group->group->num_counters;
max_rows += group->num_counters + 1;
/* We reserve the first counter of the CP group (first in the list) for
* measuring GPU frequency that's displayed in the footer.
@ -846,7 +1054,7 @@ setup_counter_groups(const struct fd_perfcntr_group *groups)
}
}
for (unsigned j = 0; j < group->group->num_counters; j++) {
for (unsigned j = 0; j < group->num_counters; j++) {
group->counter[j].counter = &group->group->counters[j];
if (!group->counter[j].is_gpufreq_counter)
@ -889,7 +1097,7 @@ config_save(void)
config_setting_t *sect =
config_setting_get_member(setting, group->group->name);
for (unsigned j = 0; j < group->group->num_counters; j++) {
for (unsigned j = 0; j < group->num_counters; j++) {
/* Don't save the GPU frequency measurement counter. */
if (group->counter[j].is_gpufreq_counter)
continue;
@ -936,7 +1144,7 @@ config_restore(void)
config_setting_add(setting, group->group->name, CONFIG_TYPE_GROUP);
}
for (unsigned j = 0; j < group->group->num_counters; j++) {
for (unsigned j = 0; j < group->num_counters; j++) {
/* Don't restore the GPU frequency measurement counter. */
if (group->counter[j].is_gpufreq_counter)
continue;
@ -997,17 +1205,8 @@ main(int argc, char **argv)
find_device();
const struct fd_perfcntr_group *groups;
groups = fd_perfcntrs(dev.dev_id, &dev.ngroups);
if (!groups) {
errx(1, "no perfcntr support");
}
dev.groups = calloc(dev.ngroups, sizeof(struct counter_group));
setlocale(LC_NUMERIC, "en_US.UTF-8");
setup_counter_groups(groups);
restore_counter_groups();
config_restore();
flush_ring();

View file

@ -7,6 +7,16 @@
*/
#include <stddef.h>
#include <xf86drm.h>
#include "util/hash_table.h"
#include "util/ralloc.h"
#include "drm-uapi/msm_drm.h"
#include "util/bitset.h"
#include "util/log.h"
#include "util/simple_mtx.h"
#include "freedreno_common.h"
#include "freedreno_perfcntr.h"
@ -41,12 +51,269 @@ fd_perfcntrs(const struct fd_dev_id *id, unsigned *count)
case 7:
*count = a7xx_num_perfcntr_groups;
return a7xx_perfcntr_groups;
case 8:
*count = a8xx_num_perfcntr_groups;
return a8xx_perfcntr_groups;
default:
*count = 0;
return NULL;
}
}
/* Per-assigned-counter bookkeeping: which (group, counter) slot is programmed
 * with which countable, and how many users currently share it.
 */
struct fd_perfcntr_counter_state {
   int group;         /* index into fd_perfcntr_state::groups */
   int counter;       /* counter slot index within the group */
   int countable;     /* index into group->countables programmed on this slot */
   unsigned nr_users; /* refcount; slot is released when this drops to zero */
};

/* NOTE(review): assumes no group exposes more than 32 counters -- the
 * assigned-counter bitsets are sized by this.  TODO confirm against the
 * per-generation group tables.
 */
#define MAX_COUNTERS_PER_GROUP 32
typedef BITSET_DECLARE(assigned_counters_t, MAX_COUNTERS_PER_GROUP);

/**
 * Helper to manage assigning counters, tracking if there are multiple users
 * for the same countable (to avoid assigning duplicate counters for the
 * same countable, etc)
 */
struct fd_perfcntr_state {
   simple_mtx_t lock; /* serializes reserve/release and kernel updates */
   int fd;            /* drm fd, or -1 when kernel PERFCNTR_CONFIG is unsupported */
   const struct fd_dev_id *id;
   unsigned nr_groups;
   const struct fd_perfcntr_group *groups;
   /* per-group reservation config handed to the PERFCNTR_CONFIG ioctl: */
   struct drm_msm_perfcntr_group *group_configs;
   struct drm_msm_perfcntr_config config;
   /* bitmask of assigned counters per group: */
   assigned_counters_t *assigned_counters;
   /* maps counter to fd_perfcntr_counter_state: */
   struct hash_table *counter_state;
};
/* Push the current reservation config to the kernel.  Returns the ioctl
 * result (0 on success).
 */
static int
update_reserved_counters(struct fd_perfcntr_state *perfcntrs)
{
   /* Without kernel PERFCNTR_CONFIG support, act as if every counter is
    * ours to use:
    */
   if (perfcntrs->fd < 0)
      return 0;

   return drmIoctl(perfcntrs->fd, DRM_IOCTL_MSM_PERFCNTR_CONFIG,
                   &perfcntrs->config);
}
static int
update_group_counters(struct fd_perfcntr_state *perfcntrs, int group_idx)
{
int ret = 0;
/* Update reserved config with kernel if it changes. We might not
* be assiging/releasing the last counter (and we cannot feasibly
* re-map existing assigned counters to compact away gaps in the
* used counters, as cmdstream might already
* be built encoding the other assigned counters), but if we do
* let the kernel know:
*/
unsigned nr = BITSET_LAST_BIT(perfcntrs->assigned_counters[group_idx]);
if (nr != perfcntrs->group_configs[group_idx].nr_countables) {
mesa_logi("%s: %u -> %u counters", perfcntrs->groups[group_idx].name, perfcntrs->group_configs[group_idx].nr_countables, nr);
perfcntrs->group_configs[group_idx].nr_countables = nr;
ret = update_reserved_counters(perfcntrs);
}
return ret;
}
/**
 * Allocate perfcntr reservation state for the given device.
 *
 * @id: device id used to look up the per-generation counter group tables
 * @fd: drm fd used for the PERFCNTR_CONFIG ioctl, or -1 to disable kernel
 *      reservation (eg. kgsl)
 *
 * Returns NULL if the device has no perfcntr support.  Free with
 * fd_perfcntr_state_free().
 */
struct fd_perfcntr_state *
fd_perfcntr_state_alloc(const struct fd_dev_id *id, int fd)
{
   const struct fd_perfcntr_group *groups;
   unsigned nr_groups;

   groups = fd_perfcntrs(id, &nr_groups);
   if (!groups)
      return NULL;

   struct fd_perfcntr_state *perfcntrs = rzalloc(NULL, struct fd_perfcntr_state);

   simple_mtx_init(&perfcntrs->lock, mtx_plain);

   perfcntrs->fd = fd;
   perfcntrs->id = id;
   perfcntrs->nr_groups = nr_groups;
   perfcntrs->groups = groups;
   perfcntrs->group_configs =
      rzalloc_array(perfcntrs, struct drm_msm_perfcntr_group, nr_groups);

   for (unsigned i = 0; i < nr_groups; i++) {
      assert(strlen(groups[i].name) < sizeof(perfcntrs->group_configs[i].group_name));
      /* Bounded copy so an over-long name cannot overflow group_name[] in
       * release builds (the assert above is compiled out with NDEBUG);
       * rzalloc_array zero-initialized the buffer, so the result is always
       * NUL terminated:
       */
      strncpy(perfcntrs->group_configs[i].group_name, groups[i].name,
              sizeof(perfcntrs->group_configs[i].group_name) - 1);
   }

   perfcntrs->config = (struct drm_msm_perfcntr_config) {
      .nr_groups = nr_groups,
      .groups = VOID2U64(perfcntrs->group_configs),
      .group_stride = sizeof(struct drm_msm_perfcntr_group),
   };

   perfcntrs->assigned_counters = rzalloc_array(perfcntrs, assigned_counters_t, nr_groups);
   perfcntrs->counter_state = _mesa_pointer_hash_table_create(perfcntrs);

   /* Probe for kernel PERFCNTR_CONFIG support with empty config: */
   if (update_reserved_counters(perfcntrs))
      perfcntrs->fd = -1;

   return perfcntrs;
}
/* Tear down reservation state allocated by fd_perfcntr_state_alloc().
 * NULL is tolerated.
 */
void
fd_perfcntr_state_free(struct fd_perfcntr_state *perfcntrs)
{
   if (!perfcntrs)
      return;

   /* Drop all kernel-side reservations before freeing our bookkeeping: */
   perfcntrs->config.nr_groups = 0;
   update_reserved_counters(perfcntrs);

   ralloc_free(perfcntrs);
}
/**
 * Does KMD support perfcntr reservation (ie. PERFCNTR_CONFIG)
 */
bool
fd_perfcntr_has_reservation(struct fd_perfcntr_state *perfcntrs)
{
   /* fd is forced negative when the PERFCNTR_CONFIG probe fails: */
   return !(perfcntrs->fd < 0);
}
/* Map a group pointer back to its index in perfcntrs->groups.  The groups
 * all live in a single table, so matching by address suffices.
 */
static int
find_group_idx(struct fd_perfcntr_state *perfcntrs,
               const struct fd_perfcntr_group *group)
{
   unsigned idx;
   for (idx = 0; idx < perfcntrs->nr_groups; idx++) {
      if (group == &perfcntrs->groups[idx])
         return idx;
   }
   UNREACHABLE("invalid group");
}
/* Map a countable pointer back to its index in group->countables: */
static int
find_countable_idx(const struct fd_perfcntr_group *group,
                   const struct fd_perfcntr_countable *countable)
{
   unsigned idx;
   for (idx = 0; idx < group->num_countables; idx++) {
      if (countable == &group->countables[idx])
         return idx;
   }
   UNREACHABLE("invalid countable");
}
/**
 * Reserve a counter in @group configured for @countable.
 *
 * If another user already has a counter assigned to the same countable the
 * existing counter is shared (refcounted); otherwise the lowest unassigned
 * counter slot is claimed and the kernel is notified via PERFCNTR_CONFIG.
 *
 * Returns NULL if no counter is available (or the kernel rejects the new
 * config).  Pair with fd_perfcntr_release().
 */
const struct fd_perfcntr_counter *
fd_perfcntr_reserve(struct fd_perfcntr_state *perfcntrs,
                    const struct fd_perfcntr_group *group,
                    const struct fd_perfcntr_countable *countable)
{
   struct fd_perfcntr_counter_state *state = NULL;
   int c, g = find_group_idx(perfcntrs, group);

   simple_mtx_lock(&perfcntrs->lock);

   /* Check if requested countable is already configured: */
   BITSET_FOREACH_SET (c, perfcntrs->assigned_counters[g], MAX_COUNTERS_PER_GROUP) {
      struct hash_entry *e =
         _mesa_hash_table_search(perfcntrs->counter_state, &group->counters[c]);
      assert(e);
      struct fd_perfcntr_counter_state *s = e->data;
      if (&group->countables[s->countable] == countable) {
         state = s;
         break;
      }
   }

   /* If we didn't find a counter assigned to this countable, assign a new one: */
   if (!state) {
      assigned_counters_t *assigned_counters = &perfcntrs->assigned_counters[g];

      /* Pick lowest #ed unassigned counter: */
      assigned_counters_t free_counters;
      memcpy(free_counters, *assigned_counters, sizeof(free_counters));
      BITSET_NOT(free_counters);
      c = BITSET_FFS(free_counters) - 1;

      /* c is -1 when every bitset slot is taken, and can exceed the group's
       * actual counter count (the bitset is sized for the worst case), so
       * treat both as "no counter available" instead of asserting -- the
       * previous assert(c >= 0) aborted debug builds when a group was fully
       * subscribed:
       */
      if (c >= 0 && (unsigned)c < group->num_counters) {
         mesa_logi("pick counter %d", c);

         state = rzalloc(perfcntrs, struct fd_perfcntr_counter_state);

         state->group = g;
         state->counter = c;
         state->countable = find_countable_idx(group, countable);

         assert(!BITSET_TEST(*assigned_counters, state->counter));
         BITSET_SET(*assigned_counters, state->counter);

         if (update_group_counters(perfcntrs, state->group)) {
            /* Kernel rejected the new config, back out the assignment: */
            BITSET_CLEAR(*assigned_counters, state->counter);
            ralloc_free(state);
            state = NULL;
         } else {
            _mesa_hash_table_insert(perfcntrs->counter_state,
                                    &group->counters[state->counter],
                                    state);
         }
      }
   }

   if (state)
      state->nr_users++;

   simple_mtx_unlock(&perfcntrs->lock);

   if (!state)
      return NULL;

   mesa_logi("%s.%s: assigned %d (%d users)", group->name, countable->name, state->counter, state->nr_users);

   return &group->counters[state->counter];
}
/* Drop one reference on a counter obtained from fd_perfcntr_reserve().
 * When the last user releases it, the slot returns to the free pool and the
 * kernel config is updated.  NULL is tolerated.
 */
void
fd_perfcntr_release(struct fd_perfcntr_state *perfcntrs,
                    const struct fd_perfcntr_counter *counter)
{
   if (!counter)
      return;

   simple_mtx_lock(&perfcntrs->lock);

   struct hash_entry *e = _mesa_hash_table_search(perfcntrs->counter_state, counter);
   if (e) {
      struct fd_perfcntr_counter_state *state = e->data;
      const struct fd_perfcntr_group *group = &perfcntrs->groups[state->group];

      assert(state->nr_users > 0);
      mesa_logi("%s.%s: released %d (%d users)", group->name,
                group->countables[state->countable].name, state->counter,
                state->nr_users);

      state->nr_users--;
      if (state->nr_users == 0) {
         /* dropping last user of the counter: */
         _mesa_hash_table_remove(perfcntrs->counter_state, e);

         assigned_counters_t *assigned_counters =
            &perfcntrs->assigned_counters[state->group];
         assert(BITSET_TEST(*assigned_counters, state->counter));
         BITSET_CLEAR(*assigned_counters, state->counter);

         update_group_counters(perfcntrs, state->group);

         ralloc_free(state);
      }
   }

   simple_mtx_unlock(&perfcntrs->lock);
}
extern const struct fd_derived_counter_perfcntr a7xx_derived_counter_perfcntrs[];
extern const struct fd_derived_counter *a7xx_derived_counters[];
extern const unsigned a7xx_num_derived_counters;
@ -63,16 +330,73 @@ fd_derived_counters(const struct fd_dev_id *id, unsigned *count)
}
}
extern void a7xx_generate_derived_counter_collection(const struct fd_dev_id *id, struct fd_derived_counter_collection *collection);
void
fd_generate_derived_counter_collection(const struct fd_dev_id *id, struct fd_derived_counter_collection *collection)
fd_reserve_derived_counter_collection(struct fd_perfcntr_state *perfcntrs, struct fd_derived_counter_collection *collection)
{
const struct fd_derived_counter_perfcntr *derived_counter_perfcntrs = NULL;
const struct fd_dev_id *id = perfcntrs->id;
switch (fd_dev_gen(id)) {
case 7:
a7xx_generate_derived_counter_collection(id, collection);
derived_counter_perfcntrs = a7xx_derived_counter_perfcntrs;
break;
default:
return;
}
/* The provided collection should already specify the derived counters that will be measured.
* This function will set up enabled_perfcntrs_map and enabled_perfcntrs array so that each
* used DERIVED_COUNTER_PERFCNTR_* enum value will map to the corresponding index in the
* array where the relevant fd_perfcntr_counter and fd_perfcntr_countable are stored.
*/
collection->num_enabled_perfcntrs = 0;
memset(collection->enabled_perfcntrs_map, 0xff, ARRAY_SIZE(collection->enabled_perfcntrs_map));
for (unsigned i = 0; i < collection->num_counters; ++i) {
const struct fd_derived_counter *counter = collection->counters[i];
for (unsigned j = 0; j < counter->num_perfcntrs; ++j) {
uint8_t perfcntr = counter->perfcntrs[j];
collection->enabled_perfcntrs_map[perfcntr] = 0x00;
}
}
/* Note if CP_ALWAYS_COUNT is enabled. This is the zero-index perfcntr. */
collection->cp_always_count_enabled = !collection->enabled_perfcntrs_map[0];
for (unsigned i = 0; i < ARRAY_SIZE(collection->enabled_perfcntrs_map); ++i) {
if (collection->enabled_perfcntrs_map[i] == 0xff)
continue;
uint8_t enabled_perfcntr_index = collection->num_enabled_perfcntrs++;
collection->enabled_perfcntrs_map[i] = enabled_perfcntr_index;
const struct fd_perfcntr_group *group =
fd_perfcntrs_group(perfcntrs->id, derived_counter_perfcntrs[i].group);
const struct fd_perfcntr_countable *countable =
fd_perfcntrs_countable(group, derived_counter_perfcntrs[i].countable);
const struct fd_perfcntr_counter *counter =
fd_perfcntr_reserve(perfcntrs, group, countable);
collection->enabled_perfcntrs[enabled_perfcntr_index].counter = counter;
collection->enabled_perfcntrs[enabled_perfcntr_index].countable = countable->selector;
}
const struct fd_dev_info *info = fd_dev_info_raw(id);
switch (fd_dev_gen(id)) {
case 7:
collection->derivation_context.a7xx.number_of_usptp = info->num_sp_cores * 2;
collection->derivation_context.a7xx.number_of_alus_per_usptp = 128;
break;
default:
break;
}
}
void
fd_release_derived_counter_collection(struct fd_perfcntr_state *perfcntrs, struct fd_derived_counter_collection *collection)
{
for (unsigned i = 0; i < collection->num_enabled_perfcntrs; i++)
fd_perfcntr_release(perfcntrs, collection->enabled_perfcntrs[i].counter);
}

View file

@ -89,6 +89,48 @@ const struct fd_perfcntr_group *fd_perfcntrs(const struct fd_dev_id *id, unsigne
.countables = _countables, \
}
/* Look up a perfcntr group by name.  Returns NULL when the name is unknown
 * or the device has no perfcntr support.
 */
static inline const struct fd_perfcntr_group *
fd_perfcntrs_group(const struct fd_dev_id *id, const char *name)
{
   unsigned count;
   const struct fd_perfcntr_group *groups = fd_perfcntrs(id, &count);

   for (unsigned i = 0; groups && (i < count); i++) {
      if (strcmp(groups[i].name, name) == 0)
         return &groups[i];
   }

   return NULL;
}
static inline const struct fd_perfcntr_countable *
fd_perfcntrs_countable(const struct fd_perfcntr_group *group, const char *name)
{
for (unsigned i = 0; i < group->num_countables; i++)
if (!strcmp(group->countables[i].name, name))
return &group->countables[i];
return NULL;
}
struct fd_perfcntr_state;
struct fd_perfcntr_state *
fd_perfcntr_state_alloc(const struct fd_dev_id *id, int fd);
void fd_perfcntr_state_free(struct fd_perfcntr_state *perfcntrs);
bool fd_perfcntr_has_reservation(struct fd_perfcntr_state *perfcntrs);
const struct fd_perfcntr_counter *
fd_perfcntr_reserve(struct fd_perfcntr_state *perfcntrs,
const struct fd_perfcntr_group *group,
const struct fd_perfcntr_countable *countable);
void fd_perfcntr_release(struct fd_perfcntr_state *perfcntrs,
const struct fd_perfcntr_counter *counter);
#define FD_DERIVED_COUNTER_MAX_PERFCNTRS 8
struct fd_derivation_context {
@ -110,6 +152,11 @@ struct fd_derived_counter {
uint64_t (*derive)(struct fd_derivation_context *context, uint64_t *values);
};
struct fd_derived_counter_perfcntr {
const char *countable;
const char *group;
};
const struct fd_derived_counter **fd_derived_counters(const struct fd_dev_id *id, unsigned *count);
#define FD_DERIVED_COUNTER_COLLECTION_MAX_DERIVED_COUNTERS 64
@ -130,7 +177,8 @@ struct fd_derived_counter_collection {
struct fd_derivation_context derivation_context;
};
void fd_generate_derived_counter_collection(const struct fd_dev_id *id, struct fd_derived_counter_collection *collection);
void fd_reserve_derived_counter_collection(struct fd_perfcntr_state *perfcntrs, struct fd_derived_counter_collection *collection);
void fd_release_derived_counter_collection(struct fd_perfcntr_state *perfcntrs, struct fd_derived_counter_collection *collection);
#ifdef __cplusplus
} /* end of extern "C" */

View file

@ -20,7 +20,11 @@ libfreedreno_perfcntrs = static_library(
c_args : [no_override_init_args],
gnu_symbol_visibility : 'hidden',
link_with : [libfreedreno_common],
dependencies : idep_nir_headers,
dependencies : [
dep_libdrm,
idep_mesautil,
idep_nir_headers,
],
build_by_default : false,
)
@ -51,3 +55,23 @@ if dep_libconfig.found() and dep_curses.found()
install : with_tools.contains('freedreno'),
)
endif
dumpctrs = executable(
'dumpctrs',
['dumpctrs.c', freedreno_xml_header_files],
include_directories : [
inc_freedreno,
inc_include,
inc_src,
],
link_with : [
libfreedreno_common,
libfreedreno_drm,
libfreedreno_perfcntrs,
],
dependencies : [
dep_libdrm,
idep_mesautil,
],
build_by_default : with_tools.contains('freedreno'),
)

View file

@ -1003,7 +1003,7 @@ def dump_c(args, guard, func):
# TODO figure out what to do about fd_reg_stomp_allowed()
# vs gcc.. for now only enable the warnings with clang:
print("#if defined(__clang__) && !defined(FD_NO_DEPRECATED_PACK)")
print("#if defined(__clang__) && !defined(FD_NO_DEPRECATED_PACK) && !defined(__KERNEL__)")
print("#define __FD_DEPRECATED _Pragma (\"GCC warning \\\"Deprecated reg builder\\\"\")")
print("#else")
print("#define __FD_DEPRECATED")

View file

@ -1633,41 +1633,20 @@ tu_autotune::tu_autotune(struct tu_device *device, VkResult &result)
tu_bo_suballocator_init(&suballoc, device, 128 * 1024, TU_BO_ALLOC_INTERNAL_RESOURCE, "autotune_suballoc");
if (supports_preempt_latency_tracking()) {
uint32_t group_count;
const struct fd_perfcntr_group *groups = fd_perfcntrs(&device->physical_device->dev_id, &group_count);
const char *fail_reason = nullptr;
const fd_perfcntr_group *cp_group = nullptr;
for (uint32_t i = 0; i < group_count; i++) {
if (strcmp(groups[i].name, "CP") == 0) {
cp_group = &groups[i];
break;
}
}
const fd_perfcntr_group *cp_group = fd_perfcntrs_group(&device->physical_device->dev_id, "CP");
if (cp_group) {
auto get_perfcntr_countable = [](const struct fd_perfcntr_group *group,
const char *name) -> const struct fd_perfcntr_countable * {
for (uint32_t i = 0; i < group->num_countables; i++) {
if (strcmp(group->countables[i].name, name) == 0)
return &group->countables[i];
}
return nullptr;
};
auto preemption_latency_countable = get_perfcntr_countable(cp_group, "PERF_CP_PREEMPTION_REACTION_DELAY");
auto always_count_countable = get_perfcntr_countable(cp_group, "PERF_CP_ALWAYS_COUNT");
auto preemption_latency_countable = fd_perfcntrs_countable(cp_group, "PERF_CP_PREEMPTION_REACTION_DELAY");
auto always_count_countable = fd_perfcntrs_countable(cp_group, "PERF_CP_ALWAYS_COUNT");
if (preemption_latency_countable && always_count_countable) {
if (cp_group->num_counters >= 2) {
preemption_latency_selector_reg = cp_group->counters[0].select_reg;
preemption_latency_selector = preemption_latency_countable->selector;
preemption_latency_counter_reg_lo = cp_group->counters[0].counter_reg_lo;
preemption_latency_counter =
fd_perfcntr_reserve(device->perfcntrs, cp_group, preemption_latency_countable);
always_count_counter =
fd_perfcntr_reserve(device->perfcntrs, cp_group, always_count_countable);
always_count_selector_reg = cp_group->counters[1].select_reg;
always_count_selector = always_count_countable->selector;
always_count_counter_reg_lo = cp_group->counters[1].counter_reg_lo;
} else {
if (!preemption_latency_counter || !always_count_counter) {
fail_reason = "not enough counters in CP group for preemption latency tracking";
}
} else {
@ -1699,6 +1678,9 @@ tu_autotune::~tu_autotune()
}
tu_bo_suballocator_finish(&suballoc);
fd_perfcntr_release(device->perfcntrs, preemption_latency_counter);
fd_perfcntr_release(device->perfcntrs, always_count_counter);
}
tu_autotune::cmd_buf_ctx::cmd_buf_ctx(struct tu_autotune &autotune): batch(autotune.create_batch())
@ -1952,22 +1934,22 @@ tu_autotune::write_preempt_counters_to_iova(struct tu_cs *cs,
uint64_t aon_iova) const
{
if (emit_selector) {
tu_cs_emit_pkt4(cs, preemption_latency_selector_reg, 1);
tu_cs_emit(cs, preemption_latency_selector);
tu_cs_emit_pkt4(cs, preemption_latency_counter->select_reg, 1);
tu_cs_emit(cs, preemption_latency_countable->selector);
tu_cs_emit_pkt4(cs, always_count_selector_reg, 1);
tu_cs_emit(cs, always_count_selector);
tu_cs_emit_pkt4(cs, always_count_counter->select_reg, 1);
tu_cs_emit(cs, always_count_countable->selector);
}
if (emit_wfi)
tu_cs_emit_wfi(cs);
tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(preemption_latency_counter_reg_lo) | CP_REG_TO_MEM_0_64B);
tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(preemption_latency_counter->counter_reg_lo) | CP_REG_TO_MEM_0_64B);
tu_cs_emit_qw(cs, latency_iova);
tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(always_count_counter_reg_lo) | CP_REG_TO_MEM_0_64B);
tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(always_count_counter->counter_reg_lo) | CP_REG_TO_MEM_0_64B);
tu_cs_emit_qw(cs, always_count_iova);
tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
@ -2060,11 +2042,11 @@ tu_autotune::emit_switch_away_amble(struct tu_cs *cs) const
static size_t counter = 0;
if (counter++ % 2 == 0) {
tu_cs_emit_pkt4(cs, preemption_latency_selector_reg, 1);
tu_cs_emit(cs, preemption_latency_selector);
tu_cs_emit_pkt4(cs, preemption_latency_counter->select_reg, 1);
tu_cs_emit(cs, preemption_latency_countable->selector);
tu_cs_emit_pkt4(cs, always_count_selector_reg, 1);
tu_cs_emit(cs, always_count_selector);
tu_cs_emit_pkt4(cs, always_count_counter->select_reg, 1);
tu_cs_emit(cs, always_count_countable->selector);
}
tu_cond_exec_end(cs);
@ -2231,4 +2213,4 @@ tu_autotune::emit_preempt_latency_tracking_rp_hash(struct tu_cmd_buffer *cmd)
tu_cs_emit_draw_state(&cmd->cs, TU_DRAW_STATE_AT_WRITE_RP_HASH, ds);
return rp_key;
}
}

View file

@ -242,13 +242,11 @@ struct tu_autotune {
std::mutex rp_latency_mutex; /* Protects rp_latency_tracking */
uint64_t last_latency_cleanup_ts = 0;
uint32_t preemption_latency_selector_reg;
uint32_t preemption_latency_selector;
uint32_t preemption_latency_counter_reg_lo;
const struct fd_perfcntr_counter *preemption_latency_counter;
const struct fd_perfcntr_countable *preemption_latency_countable;
uint32_t always_count_selector_reg;
uint32_t always_count_selector;
uint32_t always_count_counter_reg_lo;
const struct fd_perfcntr_counter *always_count_counter;
const struct fd_perfcntr_countable *always_count_countable;
struct tu_draw_state reset_rp_hash_draw_state;

View file

@ -11,6 +11,7 @@
#include "drm-uapi/drm_fourcc.h"
#include "git_sha1.h"
#include "perfcntrs/freedreno_perfcntr.h"
#include "common/freedreno_stompable_regs.h"
/* for fd_get_driver/device_uuid() */
@ -3081,6 +3082,10 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice,
}
}
device->perfcntrs = fd_perfcntr_state_alloc(
&physical_device->dev_id,
is_kgsl(physical_device->instance) ? -1 : device->fd);
device->autotune = new tu_autotune(device, result);
if (result != VK_SUCCESS)
goto fail_autotune;
@ -3181,6 +3186,7 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice,
fail_timeline_cond:
fail_a725_workaround:
fail_autotune:
fd_perfcntr_state_free(device->perfcntrs);
delete device->autotune;
fail_bin_preamble:
fail_prepare_perfcntrs_pass_cs:
@ -3287,6 +3293,8 @@ tu_DestroyDevice(VkDevice _device, const VkAllocationCallbacks *pAllocator)
delete device->autotune;
fd_perfcntr_state_free(device->perfcntrs);
tu_bo_suballocator_finish(&device->pipeline_suballoc);
tu_bo_suballocator_finish(&device->kgsl_profiling_suballoc);
tu_bo_suballocator_finish(&device->event_suballoc);

View file

@ -11,6 +11,7 @@
#define TU_DEVICE_H
#include "tu_common.h"
#include "perfcntrs/freedreno_perfcntr.h"
#include "radix_sort/radix_sort_vk.h"
#include "util/rwlock.h"
@ -486,6 +487,8 @@ struct tu_device
pthread_cond_t timeline_cond;
pthread_mutex_t submit_mutex;
struct fd_perfcntr_state *perfcntrs;
struct tu_autotune *autotune;
struct breadcrumbs_context *breadcrumbs_ctx;

View file

@ -7,6 +7,7 @@
*/
#include "tu_query_pool.h"
#include "perfcntrs/freedreno_perfcntr.h"
#include <fcntl.h>
@ -249,21 +250,6 @@ perfcntr_index(const struct fd_perfcntr_group *group, uint32_t group_count,
assert(i < group_count);
}
static uint32_t
perfcntr_reserved_counters(const struct fd_perfcntr_group *group)
{
/* Keep raw perf queries off the CP slots reserved by autotune latency optimization.
* TODO: We need to do this in a more robust way.
*/
return strcmp(group->name, "CP") == 0 ? 2 : 0;
}
static uint32_t
perfcntr_available_counters(const struct fd_perfcntr_group *group)
{
return group->num_counters - MIN2(group->num_counters, perfcntr_reserved_counters(group));
}
static int
compare_perfcntr_pass(const void *a, const void *b)
{
@ -271,6 +257,27 @@ compare_perfcntr_pass(const void *a, const void *b)
((struct tu_perf_query_raw_data *)b)->pass;
}
static void
tu_query_pool_destroy(struct tu_device *device, struct tu_query_pool *pool,
const VkAllocationCallbacks *pAllocator)
{
if (is_perf_query_raw(pool)) {
struct tu_perf_query_raw *perf_query = &pool->perf_query.raw;
for (uint32_t i = 0; i < perf_query->counter_index_count; i++)
fd_perfcntr_release(device->perfcntrs, perf_query->data[i].counter);
} else if (is_perf_query_raw(pool)) {
struct tu_perf_query_derived *perf_query = &pool->perf_query.derived;
struct fd_derived_counter_collection *collection = perf_query->collection;
fd_release_derived_counter_collection(device->perfcntrs, collection);
}
if (pool->bo)
tu_bo_finish(device, pool->bo);
vk_query_pool_destroy(&device->vk, pAllocator, &pool->vk);
}
VKAPI_ATTR VkResult VKAPI_CALL
tu_CreateQueryPool(VkDevice _device,
const VkQueryPoolCreateInfo *pCreateInfo,
@ -353,50 +360,26 @@ tu_CreateQueryPool(VkDevice _device,
perf_query->counter_index_count = perf_query_info->counterIndexCount;
/* Build all perf counters data that is requested, so we could get
* correct group id, countable id, counter register and pass index with
* only a counter index provided by applications at each command submit.
*
* Also, since this built data will be sorted by pass index later, we
* should keep the original indices and store perfcntrs results according
* to them so apps can get correct results with their own indices.
*/
uint32_t regs[perf_query->perf_group_count], pass[perf_query->perf_group_count];
memset(regs, 0x00, perf_query->perf_group_count * sizeof(regs[0]));
memset(pass, 0x00, perf_query->perf_group_count * sizeof(pass[0]));
for (uint32_t i = 0; i < perf_query->counter_index_count; i++) {
uint32_t gid = 0, cid = 0;
perfcntr_index(perf_query->perf_group, perf_query->perf_group_count,
perf_query_info->pCounterIndices[i], &gid, &cid);
perf_query->data[i].gid = gid;
perf_query->data[i].cid = cid;
perf_query->data[i].app_idx = i;
const struct fd_perfcntr_group *group = &perf_query->perf_group[gid];
uint32_t reserved_counters = perfcntr_reserved_counters(group);
uint32_t available_counters = perfcntr_available_counters(group);
const struct fd_perfcntr_countable *countable = &group->countables[cid];
if (available_counters == 0) {
vk_query_pool_destroy(&device->vk, pAllocator, &pool->vk);
perf_query->data[i].countable = countable;
perf_query->data[i].counter =
fd_perfcntr_reserve(device->perfcntrs, group, countable);
if (!perf_query->data[i].counter) {
tu_query_pool_destroy(device, pool, pAllocator);
return vk_errorf(device, VK_ERROR_FEATURE_NOT_PRESENT, "No raw perf counters available in group %s",
group->name);
}
/* When a counter register is over the capacity(num_counters),
* reset it for next pass.
*/
if (regs[gid] < available_counters) {
perf_query->data[i].cntr_reg = reserved_counters + regs[gid]++;
perf_query->data[i].pass = pass[gid];
} else {
perf_query->data[i].pass = ++pass[gid];
perf_query->data[i].cntr_reg = reserved_counters;
regs[gid] = 0;
regs[gid]++;
}
}
/* Sort by pass index so we could easily prepare a command stream
@ -422,21 +405,20 @@ tu_CreateQueryPool(VkDevice _device,
collection->counters[i] = perf_query->derived_counters[counter_index];
}
fd_generate_derived_counter_collection(&device->physical_device->dev_id, collection);
fd_reserve_derived_counter_collection(device->perfcntrs, collection);
slot_size += sizeof(struct perfcntr_query_slot) * collection->num_enabled_perfcntrs;
}
VkResult result = tu_bo_init_new_cached(device, &pool->vk.base, &pool->bo,
pCreateInfo->queryCount * slot_size, TU_BO_ALLOC_NO_FLAGS, "query pool");
if (result != VK_SUCCESS) {
vk_query_pool_destroy(&device->vk, pAllocator, &pool->vk);
tu_query_pool_destroy(device, pool, pAllocator);
return result;
}
result = tu_bo_map(device, pool->bo, NULL);
if (result != VK_SUCCESS) {
tu_bo_finish(device, pool->bo);
vk_query_pool_destroy(&device->vk, pAllocator, &pool->vk);
tu_query_pool_destroy(device, pool, pAllocator);
return result;
}
@ -463,8 +445,7 @@ tu_DestroyQueryPool(VkDevice _device,
TU_RMV(resource_destroy, device, pool);
tu_bo_finish(device, pool->bo);
vk_query_pool_destroy(&device->vk, pAllocator, &pool->vk);
tu_query_pool_destroy(device, pool, pAllocator);
}
static uint32_t
@ -1259,7 +1240,7 @@ emit_begin_perf_query_raw(struct tu_cmd_buffer *cmdbuf,
* changes in perfcounter values should only apply to work done during
* this query.
*/
if (CHIP == A7XX) {
if (CHIP >= A7XX) {
tu_cs_emit_pkt7(cs, CP_SCOPE_CNTL, 1);
tu_cs_emit(cs, CP_SCOPE_CNTL_0(.disable_preemption = true,
.scope = INTERRUPTS).value);
@ -1276,13 +1257,15 @@ emit_begin_perf_query_raw(struct tu_cmd_buffer *cmdbuf,
emit_perfcntrs_pass_start<CHIP>(has_pred_bit, cs, data->pass);
}
const struct fd_perfcntr_counter *counter =
&perf_query->perf_group[data->gid].counters[data->cntr_reg];
const struct fd_perfcntr_countable *countable =
&perf_query->perf_group[data->gid].countables[data->cid];
tu_cs_emit_pkt4(cs, data->counter->select_reg, 1);
tu_cs_emit(cs, data->countable->selector);
tu_cs_emit_pkt4(cs, counter->select_reg, 1);
tu_cs_emit(cs, countable->selector);
for (unsigned s = 0; s < ARRAY_SIZE(data->counter->slice_select_regs); s++) {
if (!data->counter->slice_select_regs[s])
break;
tu_cs_emit_pkt4(cs, data->counter->slice_select_regs[s], 1);
tu_cs_emit(cs, data->countable->selector);
}
}
tu_cond_exec_end(cs);
@ -1300,8 +1283,7 @@ emit_begin_perf_query_raw(struct tu_cmd_buffer *cmdbuf,
emit_perfcntrs_pass_start<CHIP>(has_pred_bit, cs, data->pass);
}
const struct fd_perfcntr_counter *counter =
&perf_query->perf_group[data->gid].counters[data->cntr_reg];
const struct fd_perfcntr_counter *counter = data->counter;
uint64_t begin_iova = perf_query_iova(pool, query, begin, data->app_idx);
@ -1328,7 +1310,7 @@ emit_begin_perf_query_derived(struct tu_cmd_buffer *cmdbuf,
* changes in perfcounter values should only apply to work done during
* this query.
*/
if (CHIP == A7XX) {
if (CHIP >= A7XX) {
tu_cs_emit_pkt7(cs, CP_SCOPE_CNTL, 1);
tu_cs_emit(cs, CP_SCOPE_CNTL_0(.disable_preemption = true,
.scope = INTERRUPTS).value);
@ -1340,6 +1322,13 @@ emit_begin_perf_query_derived(struct tu_cmd_buffer *cmdbuf,
tu_cs_emit_pkt4(cs, counter->select_reg, 1);
tu_cs_emit(cs, countable);
for (unsigned s = 0; s < ARRAY_SIZE(counter->slice_select_regs); s++) {
if (!counter->slice_select_regs[s])
break;
tu_cs_emit_pkt4(cs, counter->slice_select_regs[s], 1);
tu_cs_emit(cs, countable);
}
}
emit_counter_barrier<CHIP>(cs);
@ -1749,8 +1738,7 @@ emit_end_perf_query_raw(struct tu_cmd_buffer *cmdbuf,
emit_perfcntrs_pass_start<CHIP>(has_pred_bit, cs, data->pass);
}
const struct fd_perfcntr_counter *counter =
&perf_query->perf_group[data->gid].counters[data->cntr_reg];
const struct fd_perfcntr_counter *counter = data->counter;
end_iova = perf_query_iova(pool, query, end, data->app_idx);
@ -1799,7 +1787,7 @@ emit_end_perf_query_raw(struct tu_cmd_buffer *cmdbuf,
/* This reverts the preemption disablement done at the start
* of the query.
*/
if (CHIP == A7XX) {
if (CHIP >= A7XX) {
tu_cs_emit_pkt7(cs, CP_SCOPE_CNTL, 1);
tu_cs_emit(cs, CP_SCOPE_CNTL_0(.disable_preemption = false,
.scope = INTERRUPTS).value);
@ -1876,7 +1864,7 @@ emit_end_perf_query_derived(struct tu_cmd_buffer *cmdbuf,
/* This reverts the preemption disablement done at the start
* of the query.
*/
if (CHIP == A7XX) {
if (CHIP >= A7XX) {
tu_cs_emit_pkt7(cs, CP_SCOPE_CNTL, 1);
tu_cs_emit(cs, CP_SCOPE_CNTL_0(.disable_preemption = false,
.scope = INTERRUPTS).value);
@ -2317,9 +2305,12 @@ tu_GetPhysicalDeviceQueueFamilyPerformanceQueryPassesKHR(
}
for (uint32_t i = 0; i < group_count; i++) {
uint32_t available_counters = perfcntr_available_counters(&group[i]);
if (available_counters == 0)
continue;
/* Some counters may be unavailable at the time the query is
* created due to runtime factors (pps/fdperf using some counters,
* autotune or other queries, etc). But we don't know that up
* front.
*/
uint32_t available_counters = group[i].num_counters;
n_passes = DIV_ROUND_UP(counters_requested[i], available_counters);
*pNumPasses = MAX2(*pNumPasses, n_passes);

View file

@ -11,6 +11,7 @@
#define TU_QUERY_POOL_H
#include "tu_common.h"
#include "perfcntrs/freedreno_perfcntr.h"
#include "vk_query_pool.h"
@ -24,9 +25,8 @@ enum tu_perf_query_type {
/* Per-counter bookkeeping for a raw (KHR_performance_query) perf query.
 * One entry exists per counter index the application requested.
 */
struct tu_perf_query_raw_data
{
uint32_t gid; /* group-id */
uint32_t cid; /* countable-id within the group */
uint32_t cntr_reg; /* counter register within the group */
const struct fd_perfcntr_counter *counter;   /* HW counter reserved for this entry */
const struct fd_perfcntr_countable *countable; /* countable programmed into the counter */
uint32_t pass; /* pass index in which this countable is sampled */
uint32_t app_idx; /* index provided by apps */
};

View file

@ -824,6 +824,7 @@ static const struct fd_acc_sample_provider so_overflow_predicate = {
/* One requested counter within a batch (AMD_performance_monitor style) query. */
struct fd_batch_query_entry {
uint8_t gid; /* group-id */
uint8_t cid; /* countable-id within the group */
/* HW counter reserved via fd_perfcntr_reserve(); released in
 * perfcntr_cleanup().
 */
const struct fd_perfcntr_counter *counter;
};
struct fd_batch_query_data {
@ -839,33 +840,32 @@ perfcntr_resume(struct fd_acc_query *aq, struct fd_batch *batch) assert_dt
struct fd_screen *screen = data->screen;
fd_cs cs(batch->draw);
unsigned counters_per_group[screen->num_perfcntr_groups];
memset(counters_per_group, 0, sizeof(counters_per_group));
fd_pkt7(cs, CP_WAIT_FOR_IDLE, 0);
/* configure performance counters for the requested queries: */
for (unsigned i = 0; i < data->num_query_entries; i++) {
struct fd_batch_query_entry *entry = &data->query_entries[i];
const struct fd_perfcntr_group *g = &screen->perfcntr_groups[entry->gid];
unsigned counter_idx = counters_per_group[entry->gid]++;
assert(counter_idx < g->num_counters);
fd_pkt4(cs, 1).add((fd_reg_pair){
.reg = g->counters[counter_idx].select_reg,
.reg = entry->counter->select_reg,
.value = g->countables[entry->cid].selector,
});
}
memset(counters_per_group, 0, sizeof(counters_per_group));
for (unsigned s = 0; s < ARRAY_SIZE(entry->counter->slice_select_regs); s++) {
if (!entry->counter->slice_select_regs[s])
break;
fd_pkt4(cs, 1).add((fd_reg_pair){
.reg = entry->counter->slice_select_regs[s],
.value = g->countables[entry->cid].selector,
});
}
}
/* and snapshot the start values */
for (unsigned i = 0; i < data->num_query_entries; i++) {
struct fd_batch_query_entry *entry = &data->query_entries[i];
const struct fd_perfcntr_group *g = &screen->perfcntr_groups[entry->gid];
unsigned counter_idx = counters_per_group[entry->gid]++;
const struct fd_perfcntr_counter *counter = &g->counters[counter_idx];
const struct fd_perfcntr_counter *counter = entry->counter;
fd_pkt7(cs, CP_REG_TO_MEM, 3)
.add(CP_REG_TO_MEM_0(.reg = counter->counter_reg_lo, ._64b = true))
@ -877,12 +877,8 @@ static void
perfcntr_pause(struct fd_acc_query *aq, struct fd_batch *batch) assert_dt
{
struct fd_batch_query_data *data = (struct fd_batch_query_data *)aq->query_data;
struct fd_screen *screen = data->screen;
fd_cs cs(batch->draw);
unsigned counters_per_group[screen->num_perfcntr_groups];
memset(counters_per_group, 0, sizeof(counters_per_group));
fd_pkt7(cs, CP_WAIT_FOR_IDLE, 0);
/* TODO do we need to bother to turn anything off? */
@ -890,9 +886,7 @@ perfcntr_pause(struct fd_acc_query *aq, struct fd_batch *batch) assert_dt
/* snapshot the end values: */
for (unsigned i = 0; i < data->num_query_entries; i++) {
struct fd_batch_query_entry *entry = &data->query_entries[i];
const struct fd_perfcntr_group *g = &screen->perfcntr_groups[entry->gid];
unsigned counter_idx = counters_per_group[entry->gid]++;
const struct fd_perfcntr_counter *counter = &g->counters[counter_idx];
const struct fd_perfcntr_counter *counter = entry->counter;
fd_pkt7(cs, CP_REG_TO_MEM, 3)
.add(CP_REG_TO_MEM_0(.reg = counter->counter_reg_lo, ._64b = true))
@ -925,12 +919,24 @@ perfcntr_accumulate_result(struct fd_acc_query *aq,
}
}
/* Release the HW counters reserved by fd6_create_batch_query().
 *
 * Called both from the acc-query cleanup hook on query destruction and
 * directly on the fd6_create_batch_query() error path.  On the error
 * path, the entry whose fd_perfcntr_reserve() failed has a NULL counter
 * (and later entries were never assigned one), so skip NULL entries
 * rather than handing them to fd_perfcntr_release().
 * NOTE(review): assumes query_entries past the failing reservation are
 * zero-initialized by the allocator — confirm against the allocation in
 * fd6_create_batch_query().
 */
static void
perfcntr_cleanup(void *query_data)
{
   struct fd_batch_query_data *data = (struct fd_batch_query_data *)query_data;

   for (unsigned i = 0; i < data->num_query_entries; i++) {
      struct fd_batch_query_entry *entry = &data->query_entries[i];
      if (entry->counter)
         fd_perfcntr_release(data->screen->perfcntrs, entry->counter);
   }
}
/* Sample provider for raw perfcntr batch queries.  'always' because
 * perf counters must be snapshotted around every batch regardless of
 * whether the query is actively polled.
 */
static const struct fd_acc_sample_provider perfcntr = {
.query_type = FD_QUERY_FIRST_PERFCNTR,
.always = true,
.resume = perfcntr_resume,
.pause = perfcntr_pause,
.result = perfcntr_accumulate_result,
.cleanup = perfcntr_cleanup, /* releases counters reserved at query creation */
};
static struct pipe_query *
@ -949,13 +955,6 @@ fd6_create_batch_query(struct pipe_context *pctx, unsigned num_queries,
data->screen = screen;
data->num_query_entries = num_queries;
/* validate the requested query_types and ensure we don't try
* to request more query_types of a given group than we have
* counters:
*/
unsigned counters_per_group[screen->num_perfcntr_groups];
memset(counters_per_group, 0, sizeof(counters_per_group));
for (unsigned i = 0; i < num_queries; i++) {
unsigned idx = query_types[i] - FD_QUERY_FIRST_PERFCNTR;
@ -985,13 +984,15 @@ fd6_create_batch_query(struct pipe_context *pctx, unsigned num_queries,
entry->cid++;
}
if (counters_per_group[entry->gid] >=
screen->perfcntr_groups[entry->gid].num_counters) {
mesa_loge("too many counters for group %u", entry->gid);
const struct fd_perfcntr_group *g = &screen->perfcntr_groups[entry->gid];
const struct fd_perfcntr_countable *c = &g->countables[entry->cid];
entry->counter = fd_perfcntr_reserve(screen->perfcntrs, g, c);
if (!entry->counter) {
mesa_loge("Could not reserve counter for %s.%s", g->name, c->name);
goto error;
}
counters_per_group[entry->gid]++;
}
q = fd_acc_create_query2(ctx, 0, 0, &perfcntr);
@ -1004,6 +1005,7 @@ fd6_create_batch_query(struct pipe_context *pctx, unsigned num_queries,
return (struct pipe_query *)q;
error:
perfcntr_cleanup(data);
free(data);
return NULL;
}

View file

@ -184,8 +184,12 @@ setup_perfcntr_query_info(struct fd_screen *screen)
{
unsigned num_queries = 0;
for (unsigned i = 0; i < screen->num_perfcntr_groups; i++)
num_queries += screen->perfcntr_groups[i].num_countables;
for (unsigned i = 0; i < screen->num_perfcntr_groups; i++) {
const struct fd_perfcntr_group *g = &screen->perfcntr_groups[i];
if (g->pipe > PIPE_BR)
continue;
num_queries += g->num_countables;
}
screen->perfcntr_queries =
calloc(num_queries, sizeof(screen->perfcntr_queries[0]));
@ -194,6 +198,8 @@ setup_perfcntr_query_info(struct fd_screen *screen)
unsigned idx = 0;
for (unsigned i = 0; i < screen->num_perfcntr_groups; i++) {
const struct fd_perfcntr_group *g = &screen->perfcntr_groups[i];
if (g->pipe > PIPE_BR)
continue;
for (unsigned j = 0; j < g->num_countables; j++) {
struct pipe_driver_query_info *info = &screen->perfcntr_queries[idx];
const struct fd_perfcntr_countable *c = &g->countables[j];

View file

@ -21,6 +21,9 @@ fd_acc_destroy_query(struct fd_context *ctx, struct fd_query *q) assert_dt
DBG("%p", q);
if (aq->provider->cleanup)
aq->provider->cleanup(aq->query_data);
pipe_resource_reference(&aq->prsc, NULL);
list_del(&aq->node);

View file

@ -72,6 +72,7 @@ struct fd_acc_sample_provider {
void (*result_resource)(struct fd_acc_query *aq, struct fd_ringbuffer *ring,
enum pipe_query_value_type result_type, int index,
struct fd_resource *dst, unsigned offset);
void (*cleanup)(void *query_data); /* optional cleanup */
};
struct fd_acc_query {

View file

@ -165,6 +165,8 @@ fd_screen_destroy(struct pipe_screen *pscreen)
if (screen->ro)
screen->ro->destroy(screen->ro);
fd_perfcntr_state_free(screen->perfcntrs);
fd_bc_fini(&screen->batch_cache);
fd_gmem_screen_fini(pscreen);
@ -1057,7 +1059,10 @@ fd_screen_create(int fd,
if (screen->primtypes[i])
screen->primtypes_mask |= (1 << i);
if (FD_DBG(PERFC)) {
screen->perfcntrs = fd_perfcntr_state_alloc(screen->dev_id, fd);
if (FD_DBG(PERFC) ||
(screen->perfcntrs && fd_perfcntr_has_reservation(screen->perfcntrs))) {
screen->perfcntr_groups =
fd_perfcntrs(screen->dev_id, &screen->num_perfcntr_groups);
}

View file

@ -106,6 +106,7 @@ struct fd_screen {
unsigned num_perfcntr_groups;
const struct fd_perfcntr_group *perfcntr_groups;
struct fd_perfcntr_state *perfcntrs;
/* generated at startup from the perfcntr groups: */
unsigned num_perfcntr_queries;