From f52f68d548e377ca6eacf8a10ec5a09a9df92dbb Mon Sep 17 00:00:00 2001 From: Rob Clark Date: Tue, 17 Mar 2026 12:53:40 -0700 Subject: [PATCH 01/21] WIP: freedreno/perfcntrs: Add tool to dump perfctr tables Just so we have a sane way to check for errors while migrating. --- src/freedreno/perfcntrs/dumpctrs.c | 82 +++++++++++++++++++++++++++++ src/freedreno/perfcntrs/meson.build | 20 +++++++ 2 files changed, 102 insertions(+) create mode 100644 src/freedreno/perfcntrs/dumpctrs.c diff --git a/src/freedreno/perfcntrs/dumpctrs.c b/src/freedreno/perfcntrs/dumpctrs.c new file mode 100644 index 00000000000..62b9fe4e1fe --- /dev/null +++ b/src/freedreno/perfcntrs/dumpctrs.c @@ -0,0 +1,82 @@ +/* + * Copyright © 2016 Rob Clark + * All Rights Reserved. + * SPDX-License-Identifier: MIT + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "drm/freedreno_drmif.h" +#include "drm/freedreno_ringbuffer.h" + +#include "util/os_file.h" + +#include "freedreno_dt.h" +#include "freedreno_perfcntr.h" + +/* + * Simple tool to dump perfctr tables (so we can make sure nothing gets + * missed while converting to generated tables) + */ + +int +main(int argc, char **argv) +{ + struct fd_dev_id dev_id = {}; + unsigned ngroups = 0; + const struct fd_perfcntr_group *groups; + + if (argc != 2) + return -1; + + if (!strcmp(argv[1], "a2xx")) { + dev_id.gpu_id = 200; + } else if (!strcmp(argv[1], "a5xx")) { + dev_id.gpu_id = 530; + } else if (!strcmp(argv[1], "a6xx")) { + dev_id.gpu_id = 630; + } else if (!strcmp(argv[1], "a7xx")) { + dev_id.chip_id = 0xffff07030001; + } + + groups = fd_perfcntrs(&dev_id, &ngroups); + if (!groups) { + errx(1, "no perfcntr support"); + } + + for (int i = 0; i < ngroups; i++) { + const struct fd_perfcntr_group *g = &groups[i]; + printf("GROUP[%s]: num_counters=%u, num_countables=%u\n", + g->name, g->num_counters, g->num_countables); + + for (int j = 0; j < 
g->num_counters; j++) { + const struct fd_perfcntr_counter *counter = &g->counters[j]; + printf("COUNTER: %04x, %04x, %04x, %04x, %04x\n", + counter->select_reg, counter->counter_reg_lo, counter->counter_reg_hi, + counter->enable, counter->clear); + } + + for (int j = 0; j < g->num_countables; j++) { + const struct fd_perfcntr_countable *countable = &g->countables[j]; + printf("COUNTABLE[%s]: %04x\n", countable->name, countable->selector); + } + + printf("\n"); + } + + + return 0; +} diff --git a/src/freedreno/perfcntrs/meson.build b/src/freedreno/perfcntrs/meson.build index 5b0dbf8a0d2..5caa91fbf30 100644 --- a/src/freedreno/perfcntrs/meson.build +++ b/src/freedreno/perfcntrs/meson.build @@ -51,3 +51,23 @@ if dep_libconfig.found() and dep_curses.found() install : with_tools.contains('freedreno'), ) endif + +dumpctrs = executable( + 'dumpctrs', + ['dumpctrs.c', freedreno_xml_header_files], + include_directories : [ + inc_freedreno, + inc_include, + inc_src, + ], + link_with : [ + libfreedreno_common, + libfreedreno_drm, + libfreedreno_perfcntrs, + ], + dependencies : [ + dep_libdrm, + idep_mesautil, + ], + build_by_default : with_tools.contains('freedreno'), +) From 38b029cb7d8bc8c6e76bd1ebbde4630d439e7d9b Mon Sep 17 00:00:00 2001 From: Rob Clark Date: Sat, 25 Apr 2026 07:25:26 -0700 Subject: [PATCH 02/21] freedreno/registers: Skip deprecated warns for kernel Signed-off-by: Rob Clark --- src/freedreno/registers/gen_header.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/freedreno/registers/gen_header.py b/src/freedreno/registers/gen_header.py index 07e6f0cb4e6..d3b56a9d84f 100644 --- a/src/freedreno/registers/gen_header.py +++ b/src/freedreno/registers/gen_header.py @@ -1003,7 +1003,7 @@ def dump_c(args, guard, func): # TODO figure out what to do about fd_reg_stomp_allowed() # vs gcc.. 
for now only enable the warnings with clang: - print("#if defined(__clang__) && !defined(FD_NO_DEPRECATED_PACK)") + print("#if defined(__clang__) && !defined(FD_NO_DEPRECATED_PACK) && !defined(__KERNEL__)") print("#define __FD_DEPRECATED _Pragma (\"GCC warning \\\"Deprecated reg builder\\\"\")") print("#else") print("#define __FD_DEPRECATED") From 3358e40fed51d17c4c5503555aa629d667605658 Mon Sep 17 00:00:00 2001 From: Rob Clark Date: Wed, 15 Apr 2026 08:29:23 -0700 Subject: [PATCH 03/21] WIP: freedreno/drm: Import new UABI for PERFCNTR_CONFIG --- include/drm-uapi/msm_drm.h | 49 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) diff --git a/include/drm-uapi/msm_drm.h b/include/drm-uapi/msm_drm.h index 5c67294edc9..289cf228b87 100644 --- a/include/drm-uapi/msm_drm.h +++ b/include/drm-uapi/msm_drm.h @@ -117,6 +117,7 @@ struct drm_msm_timespec { * ioctl will throw -EPIPE. */ #define MSM_PARAM_EN_VM_BIND 0x16 /* WO, once */ +#define MSM_PARAM_AQE 0x17 /* RO */ /* For backwards compat. The original support for preemption was based on * a single ring per priority level so # of priority levels equals the # @@ -490,6 +491,52 @@ struct drm_msm_submitqueue_query { __u32 pad; }; +#define MSM_PERFCNTR_STREAM 0x00000001 +#define MSM_PERFCNTR_UPDATE 0x00000002 +#define MSM_PERFCNTR_FLAGS ( \ + MSM_PERFCNTR_STREAM | \ + MSM_PERFCNTR_UPDATE | \ + 0) + +struct drm_msm_perfcntr_group { + char group_name[16]; + __u32 nr_countables; + __u32 pad; + __u64 countables; /* pointer to an array of nr_countables u32 */ +}; + +/* + * Note, for MSM_PERFCNTR_STREAM, the ioctl returns an fd to read recorded + * counters. This only works because the ioctl is DRM_IOW(), if we returned + * a out param in the ioctl struct the copy_to_user() (in drm_ioctl()) + * could fault, causing us to leak the fd. + * + * If the ioctl returns with error E2BIG, that means more counters/countables + * are requested than are currently available. 
If MSM_PERFCNTR_UPDATE flag + * is set, drm_msm_perfcntr_group::nr_countables will be updated to return + * the actual # of counters available. + * + * The data read from the has the following format for each sampling period: + * + * uint64_t timestamp; // CP_ALWAYS_ON_COUNTER captured at sample time + * uint32_t seqno; // increments by 1 each period, reset to 0 on discontinuity + * uint32_t mbz; // pad out counters to 64b + * struct { + * uint64_t counter[nr_countables]; + * } groups[nr_groups]; + * + * The ordering of groups and counters matches the order in PERFCNTR_CONFIG + * ioctl. + */ +struct drm_msm_perfcntr_config { + __u32 flags; /* bitmask of MSM_PERFCNTR_x */ + __u32 nr_groups; /* # of entries in groups array */ + __u64 groups; /* pointer to array of drm_msm_perfcntr_group */ + __u64 period; /* sampling period in ns */ + __u32 bufsz_shift; /* sample buffer size in bytes is 1< Date: Sat, 25 Apr 2026 07:30:42 -0700 Subject: [PATCH 04/21] freedreno/common: Add ioctl ptr helpers Signed-off-by: Rob Clark --- src/freedreno/common/freedreno_common.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/freedreno/common/freedreno_common.h b/src/freedreno/common/freedreno_common.h index eea5cabe718..36dea4e12b5 100644 --- a/src/freedreno/common/freedreno_common.h +++ b/src/freedreno/common/freedreno_common.h @@ -158,6 +158,9 @@ struct BitmaskEnum { #define BIT(bit) BITFIELD64_BIT(bit) +#define U642VOID(x) ((void *)(unsigned long)(x)) +#define VOID2U64(x) ((uint64_t)(unsigned long)(x)) + /** * Helper for allocating sequence #s where zero is a non-valid seqno */ From 0b78c907b1780fab8307ac9d6c82e12578b65b89 Mon Sep 17 00:00:00 2001 From: Rob Clark Date: Wed, 15 Apr 2026 09:23:03 -0700 Subject: [PATCH 05/21] freedreno/fdperf: Move where we setup counter groups Move this earlier so we have the counter config early enough to probe kernel support for PERFCNTR_CONFIG with a valid config. 
Signed-off-by: Rob Clark --- src/freedreno/perfcntrs/fdperf.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/src/freedreno/perfcntrs/fdperf.c b/src/freedreno/perfcntrs/fdperf.c index c0f34ed0385..18ad59222fd 100644 --- a/src/freedreno/perfcntrs/fdperf.c +++ b/src/freedreno/perfcntrs/fdperf.c @@ -80,6 +80,7 @@ static struct { static void config_save(void); static void config_restore(void); static void restore_counter_groups(void); +static void setup_counter_groups(const struct fd_perfcntr_group *groups); /* * helpers @@ -146,6 +147,15 @@ find_device(void) printf("min_freq=%u, max_freq=%u\n", dev.min_freq, dev.max_freq); + const struct fd_perfcntr_group *groups; + groups = fd_perfcntrs(dev.dev_id, &dev.ngroups); + if (!groups) { + errx(1, "no perfcntr support"); + } + + dev.groups = calloc(dev.ngroups, sizeof(struct counter_group)); + setup_counter_groups(groups); + dev.io = fd_dt_find_io(); if (!dev.io) { err(1, "could not map device"); @@ -997,17 +1007,8 @@ main(int argc, char **argv) find_device(); - const struct fd_perfcntr_group *groups; - groups = fd_perfcntrs(dev.dev_id, &dev.ngroups); - if (!groups) { - errx(1, "no perfcntr support"); - } - - dev.groups = calloc(dev.ngroups, sizeof(struct counter_group)); - setlocale(LC_NUMERIC, "en_US.UTF-8"); - setup_counter_groups(groups); restore_counter_groups(); config_restore(); flush_ring(); From 0777a8e3ff9049c91781f5e2b604c9b0f08bacef Mon Sep 17 00:00:00 2001 From: Rob Clark Date: Mon, 4 May 2026 11:15:30 -0700 Subject: [PATCH 06/21] freedreno/fdperf: Prepare for partial-counter usage With PERFCNTR_CONFIG, some other process may have already reserved some counters, so not all will be available to fdperf. Prepare for this by using num_counters in counter_group. 
Signed-off-by: Rob Clark --- src/freedreno/perfcntrs/fdperf.c | 30 ++++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/src/freedreno/perfcntrs/fdperf.c b/src/freedreno/perfcntrs/fdperf.c index 18ad59222fd..83be5602098 100644 --- a/src/freedreno/perfcntrs/fdperf.c +++ b/src/freedreno/perfcntrs/fdperf.c @@ -45,6 +45,11 @@ static struct { struct counter_group { const struct fd_perfcntr_group *group; + /* We initially try to use all counters, but can reduce this if + * not all counters are available. + */ + unsigned num_counters; + struct { const struct fd_perfcntr_counter *counter; uint16_t select_val; @@ -191,7 +196,7 @@ flush_ring(void) static void select_counter(struct counter_group *group, int ctr, int countable_val) { - assert(ctr < group->group->num_counters); + assert(ctr < group->num_counters); unsigned countable_idx = UINT32_MAX; for (unsigned i = 0; i < group->group->num_countables; i++) { @@ -335,7 +340,7 @@ resample(void) for (unsigned i = 0; i < dev.ngroups; i++) { struct counter_group *group = &dev.groups[i]; - for (unsigned j = 0; j < group->group->num_counters; j++) { + for (unsigned j = 0; j < group->num_counters; j++) { resample_counter(group, j, current_time); check_counter_invalid(group, j); } @@ -523,13 +528,13 @@ redraw(WINDOW *win) if (group->counter[0].is_gpufreq_counter) j++; - if (j < group->group->num_counters) { + if (j < group->num_counters) { if ((scroll <= row) && ((row - scroll) < max)) redraw_group_header(win, row - scroll, group->group->name); row++; } - for (; j < group->group->num_counters; j++) { + for (; j < group->num_counters; j++) { if ((scroll <= row) && ((row - scroll) < max)) redraw_counter(win, row - scroll, group, j, row == current_cntr); row++; @@ -564,7 +569,7 @@ current_counter(int *ctr) j++; /* account for group header: */ - if (j < group->group->num_counters) { + if (j < group->num_counters) { /* cannot select group header.. 
return null to indicate this * main_ui(): */ @@ -573,7 +578,7 @@ current_counter(int *ctr) n++; } - for (; j < group->group->num_counters; j++) { + for (; j < group->num_counters; j++) { if (n == current_cntr) { if (ctr) *ctr = j; @@ -771,7 +776,7 @@ dump_counters(void) for (unsigned i = 0; i < dev.ngroups; i++) { const struct counter_group *group = &dev.groups[i]; - for (unsigned j = 0; j < group->group->num_counters; j++) { + for (unsigned j = 0; j < group->num_counters; j++) { const char *label = group->label[j]; float val = (float) group->value_delta[j] * 1000000.0 / (float) group->sample_time_delta[j]; @@ -808,7 +813,7 @@ restore_counter_groups(void) for (unsigned i = 0; i < dev.ngroups; i++) { struct counter_group *group = &dev.groups[i]; - for (unsigned j = 0; j < group->group->num_counters; j++) { + for (unsigned j = 0; j < group->num_counters; j++) { /* This should also write the CP_ALWAYS_COUNT selectable value into * the reserved CP counter we use for GPU frequency measurement, * avoiding someone else writing a different value there. @@ -825,8 +830,9 @@ setup_counter_groups(const struct fd_perfcntr_group *groups) struct counter_group *group = &dev.groups[i]; group->group = &groups[i]; + group->num_counters = group->group->num_counters; - max_rows += group->group->num_counters + 1; + max_rows += group->num_counters + 1; /* We reserve the first counter of the CP group (first in the list) for * measuring GPU frequency that's displayed in the footer. 
@@ -856,7 +862,7 @@ setup_counter_groups(const struct fd_perfcntr_group *groups) } } - for (unsigned j = 0; j < group->group->num_counters; j++) { + for (unsigned j = 0; j < group->num_counters; j++) { group->counter[j].counter = &group->group->counters[j]; if (!group->counter[j].is_gpufreq_counter) @@ -899,7 +905,7 @@ config_save(void) config_setting_t *sect = config_setting_get_member(setting, group->group->name); - for (unsigned j = 0; j < group->group->num_counters; j++) { + for (unsigned j = 0; j < group->num_counters; j++) { /* Don't save the GPU frequency measurement counter. */ if (group->counter[j].is_gpufreq_counter) continue; @@ -946,7 +952,7 @@ config_restore(void) config_setting_add(setting, group->group->name, CONFIG_TYPE_GROUP); } - for (unsigned j = 0; j < group->group->num_counters; j++) { + for (unsigned j = 0; j < group->num_counters; j++) { /* Don't restore the GPU frequency measurement counter. */ if (group->counter[j].is_gpufreq_counter) continue; From 91f94113c876ea533a726f21d6cffa7cd7a6627a Mon Sep 17 00:00:00 2001 From: Rob Clark Date: Wed, 15 Apr 2026 09:48:44 -0700 Subject: [PATCH 07/21] freedreno/fdperf: Add PERFCNTR_CONFIG support Add support for the new ioctl for KMD global counter collection. This avoids needing hacks to parse dtb and mmap the GPU's i/o space. 
Signed-off-by: Rob Clark --- src/freedreno/perfcntrs/fdperf.c | 196 ++++++++++++++++++++++++++++++- 1 file changed, 194 insertions(+), 2 deletions(-) diff --git a/src/freedreno/perfcntrs/fdperf.c b/src/freedreno/perfcntrs/fdperf.c index 83be5602098..751c7dc3408 100644 --- a/src/freedreno/perfcntrs/fdperf.c +++ b/src/freedreno/perfcntrs/fdperf.c @@ -11,6 +11,8 @@ #include #include #include +#include +#include #include #include #include @@ -24,9 +26,12 @@ #include "util/os_file.h" +#include "freedreno_common.h" #include "freedreno_dt.h" #include "freedreno_perfcntr.h" +#include "drm-uapi/msm_drm.h" + #define MAX_CNTR_PER_GROUP 24 #define REFRESH_MS 500 @@ -80,7 +85,25 @@ static struct { const struct fd_dev_id *dev_id; struct fd_submit *submit; struct fd_ringbuffer *ring; -} dev; + + /* This is used for PERFCNTR_CONFIG if supported by kernel. In + * this case, dev.io is not used. + */ + struct drm_msm_perfcntr_config perfcntr_config; + int perfcntr_stream_fd; + + int num_configured_counters; + + uint32_t seqno; + bool discontinuity; +} dev = { + .perfcntr_config = { + .flags = MSM_PERFCNTR_STREAM | MSM_PERFCNTR_UPDATE, + .bufsz_shift = 12, + .group_stride = sizeof(struct drm_msm_perfcntr_group), + }, + .perfcntr_stream_fd = -1, +}; static void config_save(void); static void config_restore(void); @@ -119,6 +142,27 @@ delta(uint64_t a, uint64_t b) return b - a; } +static int +perfcntr_config(void) +{ + if (dev.perfcntr_stream_fd >= 0) { + close(dev.perfcntr_stream_fd); + dev.perfcntr_stream_fd = -1; + } + + errno = 0; + + int fd = drmIoctl(fd_device_fd(dev.dev), + DRM_IOCTL_MSM_PERFCNTR_CONFIG, + &dev.perfcntr_config); + if (fd < 0) + return -errno; + + dev.perfcntr_stream_fd = fd; + + return 0; +} + static void find_device(void) { @@ -161,6 +205,33 @@ find_device(void) dev.groups = calloc(dev.ngroups, sizeof(struct counter_group)); setup_counter_groups(groups); + ret = perfcntr_config(); + if (ret == -E2BIG) { + struct drm_msm_perfcntr_group *g = 
U642VOID(dev.perfcntr_config.groups); + + /* we are trying to use too many counters, back off: */ + for (unsigned i = 0; i < dev.ngroups; i++) { + if (g[i].nr_countables < dev.groups[i].num_counters) { + printf("reducing %s counters %u -> %u\n", + groups[i].name, dev.groups[i].num_counters, g[i].nr_countables); + dev.num_configured_counters -= + dev.groups[i].num_counters - g[i].nr_countables; + dev.groups[i].num_counters = g[i].nr_countables; + } + } + + ret = perfcntr_config(); + } + + if (!ret) { + return; + } + + /* mmio not supported on gen8+: */ + if (fd_dev_gen(dev.dev_id) >= 8) { + err(1, "mmio fallback not supported"); + } + dev.io = fd_dt_find_io(); if (!dev.io) { err(1, "could not map device"); @@ -176,6 +247,13 @@ find_device(void) static void flush_ring(void) { + if (!dev.io) { + int ret = perfcntr_config(); + if (ret < 0) + errx(1, "perfcntr_config() failed"); + return; + } + if (!dev.submit) return; @@ -213,6 +291,20 @@ select_counter(struct counter_group *group, int ctr, int countable_val) group->label[ctr] = group->group->countables[countable_idx].name; group->counter[ctr].select_val = countable_val; + /* If using PERFCNTR_CONFIG, then update the ioctl structure: */ + if (!dev.io) { + struct drm_msm_perfcntr_group *g = U642VOID(dev.perfcntr_config.groups); + + for (int i = 0; i < dev.ngroups; i++) { + if (&dev.groups[i] == group) { + uint32_t *countables = U642VOID(g[i].countables); + countables[ctr] = countable_val; + break; + } + } + return; + } + if (!dev.submit) { dev.submit = fd_submit_new(dev.pipe); dev.ring = fd_submit_new_ringbuffer( @@ -326,6 +418,82 @@ check_counter_invalid(struct counter_group *group, int ctr) group->counter[ctr].is_invalid = (hw_selector != group->counter[ctr].select_val); } +static bool +perfcntr_stream_ready(void) +{ + struct pollfd pfd; + + pfd.fd = dev.perfcntr_stream_fd; + pfd.events = POLLIN; + pfd.revents = 0; + + if (poll(&pfd, 1, 0) < 0) + return false; + + if (!(pfd.revents & POLLIN)) + return false; + + 
return true; +} + +/* GPU always-on timer constants */ +static const uint64_t ALWAYS_ON_FREQUENCY_HZ = 19200000; +static const double GPU_TICKS_PER_US = ALWAYS_ON_FREQUENCY_HZ / 1000000.0; + +static uint64_t +ticks_to_us(uint64_t ticks) +{ + return ticks / GPU_TICKS_PER_US; +} + +static void +resample_perfcntr_stream(void) +{ + if (!perfcntr_stream_ready()) { + dev.discontinuity = true; + return; + } + + uint64_t buf[dev.num_configured_counters + 2]; /* include 128b header */ + void *ptr = buf; + size_t sz = sizeof(buf); + + while (sz > 0) { + ssize_t ret = read(dev.perfcntr_stream_fd, ptr, sz); + + if (ret < 0) + ret = -errno; + + if (ret == -EINTR || ret == -EAGAIN) + continue; + + if (ret < 0) + errx(ret, "read failed"); + + sz -= ret; + ptr += ret; + } + + int idx = 0; + uint64_t ts = ticks_to_us(buf[idx++]); + uint32_t seqno = buf[idx++] & 0xffffffff; + + dev.discontinuity = (seqno == 0); + + for (unsigned i = 0; i < dev.ngroups; i++) { + struct counter_group *group = &dev.groups[i]; + for (unsigned ctr = 0; ctr < group->num_counters; ctr++) { + uint64_t previous_value = group->value[ctr]; + group->value[ctr] = buf[idx++]; + group->value_delta[ctr] = delta(previous_value, group->value[ctr]); + + uint64_t previous_sample_time = group->sample_time[ctr]; + group->sample_time[ctr] = ts; + group->sample_time_delta[ctr] = delta(previous_sample_time, ts); + } + } +} + /* sample all the counters: */ static void resample(void) @@ -338,6 +506,11 @@ resample(void) last_time = current_time; + if (!dev.io) { + resample_perfcntr_stream(); + return; + } + for (unsigned i = 0; i < dev.ngroups; i++) { struct counter_group *group = &dev.groups[i]; for (unsigned j = 0; j < group->num_counters; j++) { @@ -484,7 +657,7 @@ static void redraw_counter(WINDOW *win, int row, struct counter_group *group, int ctr, bool selected) { - bool is_invalid = group->counter[ctr].is_invalid; + bool is_invalid = group->counter[ctr].is_invalid || dev.discontinuity; redraw_counter_label(win, row, 
group->label[ctr], selected, is_invalid); redraw_counter_value(win, row, group, ctr, is_invalid); } @@ -749,6 +922,9 @@ main_ui(void) resample(); redraw(mainwin); + if (!dev.io) + continue; + /* restore the counters every 0.5s in case the GPU has suspended, * in which case the current selected countables will have reset: */ @@ -826,9 +1002,25 @@ restore_counter_groups(void) static void setup_counter_groups(const struct fd_perfcntr_group *groups) { + /* pre-allocate memory needed for PERFCNTR_CONFIG ioctl: */ + struct drm_msm_perfcntr_group *g = calloc(sizeof(struct drm_msm_perfcntr_group), dev.ngroups); + + dev.perfcntr_config.nr_groups = dev.ngroups; + dev.perfcntr_config.period = options.refresh_ms * 1000000; + dev.perfcntr_config.groups = VOID2U64(g); + for (unsigned i = 0; i < dev.ngroups; i++) { struct counter_group *group = &dev.groups[i]; + if (strlen(groups[i].name) > sizeof(g[i].group_name)) + errx(1, "group name too large: %s", groups[i].name); + + strncpy(g[i].group_name, groups[i].name, sizeof(g[i].group_name)); + g[i].nr_countables = groups[i].num_counters; + g[i].countables = VOID2U64(calloc(sizeof(uint32_t), g[i].nr_countables)); + + dev.num_configured_counters += g[i].nr_countables; + group->group = &groups[i]; group->num_counters = group->group->num_counters; From 663fcb412bda32f7039b9ea6c5a9e2cc0bb0aee6 Mon Sep 17 00:00:00 2001 From: Rob Clark Date: Sat, 25 Apr 2026 18:07:24 -0700 Subject: [PATCH 08/21] freedreno/ds: PERFCNTR_CONFIG support Signed-off-by: Rob Clark --- src/freedreno/ds/fd_pps_driver.cc | 254 ++++++++++++++++++++++++++++-- src/freedreno/ds/fd_pps_driver.h | 30 +++- 2 files changed, 273 insertions(+), 11 deletions(-) diff --git a/src/freedreno/ds/fd_pps_driver.cc b/src/freedreno/ds/fd_pps_driver.cc index ad201d8f00c..5c3c5675409 100644 --- a/src/freedreno/ds/fd_pps_driver.cc +++ b/src/freedreno/ds/fd_pps_driver.cc @@ -7,9 +7,16 @@ #include #include -#include +#include +#include +#include + +#include + +#include 
"common/freedreno_common.h" #include "common/freedreno_dev_info.h" +#include "drm-uapi/msm_drm.h" #include "drm/freedreno_drmif.h" #include "drm/freedreno_ringbuffer.h" #include "perfcntrs/freedreno_dt.h" @@ -46,6 +53,8 @@ FreedrenoDriver::configure_counters(bool reset, bool wait) (enum fd_ringbuffer_flags)(FD_RINGBUFFER_PRIMARY | FD_RINGBUFFER_GROWABLE); struct fd_ringbuffer *ring = fd_submit_new_ringbuffer(submit, 0x1000, flags); + assert(io); /* This is legacy path only */ + for (const auto &countable : countables) countable.configure(ring, reset); @@ -67,12 +76,85 @@ FreedrenoDriver::configure_counters(bool reset, bool wait) void FreedrenoDriver::collect_countables() { + assert(io); /* This is legacy path only */ + last_dump_ts = gpu_timestamp(); for (const auto &countable : countables) countable.collect(); } +int +FreedrenoDriver::configure_counters_stream() +{ + if (perfcntr_stream_fd >= 0) { + close(perfcntr_stream_fd); + perfcntr_stream_fd = -1; + } + + unsigned sample_size = sizeof(uint64_t) * (2 + countables.size()); + unsigned bufsz = 2 * sample_size; + unsigned bufsz_shift = ffs(util_next_power_of_two(bufsz)) - 1; + + struct drm_msm_perfcntr_group groups[num_perfcntrs]; + memset(groups, 0, sizeof(groups)); + + struct drm_msm_perfcntr_config req = { + .flags = MSM_PERFCNTR_STREAM, + .groups = VOID2U64(groups), + .period = sampling_period_ns_, + .bufsz_shift = bufsz_shift, + .group_stride = sizeof(struct drm_msm_perfcntr_group), + }; + + assert(req.period); + + for (const auto &countable : countables) + countable.configure_stream(&req); + + /* Now that the groups are fully populated, resolve the sample indices: */ + for (const auto &countable : countables) + countable.resolve_sample_idx(&req); + + int fd = drmIoctl(fd_device_fd(dev), DRM_IOCTL_MSM_PERFCNTR_CONFIG, &req); + if (fd < 0) + return fd; + + sample_buf = malloc(sample_size); + + perfcntr_stream_fd = fd; + + /* Unlike the legacy path, the kernel handles reconfiguring counters + * after power 
collapse for us, so we won't need to configure the + * stream again. So cleanup allocated memory now: + */ + for (unsigned i = 0; i < num_perfcntrs; i++) { + if (!groups[i].countables) + break; + free(U642VOID(groups[i].countables)); + } + + return 0; +} + +static bool +perfcntr_stream_ready(int perfcntr_stream_fd) +{ + struct pollfd pfd; + + pfd.fd = perfcntr_stream_fd; + pfd.events = POLLIN; + pfd.revents = 0; + + if (poll(&pfd, 1, 0) < 0) + return false; + + if (!(pfd.revents & POLLIN)) + return false; + + return true; +} + static uint64_t ticks_to_ns(uint64_t ticks) { @@ -82,6 +164,61 @@ ticks_to_ns(uint64_t ticks) return ticks / GPU_TICKS_PER_NS; } +bool +FreedrenoDriver::collect_countables_stream() +{ + unsigned nsamples = 0; + bool discontinuity = false; + + assert(perfcntr_stream_fd >= 0); + + while (perfcntr_stream_ready(perfcntr_stream_fd)) { + unsigned sample_size = sizeof(uint64_t) * (2 + countables.size()); + size_t sz = sample_size; + void *ptr = sample_buf; + + while (sz > 0) { + ssize_t ret = read(perfcntr_stream_fd, ptr, sz); + + if (ret < 0) + ret = -errno; + + if (ret == -EINTR || ret == -EAGAIN) + continue; + + if (ret < 0) + errx(ret, "read failed"); + + sz -= ret; + ptr = static_cast(ptr) + ret; + } + + uint64_t *buf = (uint64_t *)sample_buf; + uint64_t ts = buf[0]; + uint32_t seqno = buf[1] & 0xffffffff; + + discontinuity = seqno == 0; + + /* Capture the timestamp from the *start* of the sampling period: */ + last_capture_ts = last_dump_ts; + last_dump_ts = ts; + + auto elapsed_time_ns = ticks_to_ns(last_dump_ts - last_capture_ts); + + time = (float)elapsed_time_ns / 1000000000.0; + + /* advance past header: */ + buf += 2; + + for (const auto &countable : countables) + countable.collect_stream(buf); + + nsamples++; + } + + return (nsamples > 0) && !discontinuity; +} + bool FreedrenoDriver::init_perfcnt() { @@ -107,9 +244,7 @@ FreedrenoDriver::init_perfcnt() has_suspend_count = true; } - fd_pipe_set_param(pipe, FD_SYSPROF, 1); - - perfcntrs = 
fd_perfcntrs(fd_pipe_dev_id(pipe), &num_perfcntrs); + perfcntrs = fd_perfcntrs(dev_id, &num_perfcntrs); if (num_perfcntrs == 0) { PERFETTO_FATAL("No hw counters available"); return false; @@ -137,12 +272,20 @@ FreedrenoDriver::init_perfcnt() for (const auto &countable : countables) countable.resolve(); + if (!configure_counters_stream()) { + close(perfcntr_stream_fd); + perfcntr_stream_fd = -1; + return true; + } + io = fd_dt_find_io(); if (!io) { PERFETTO_FATAL("Could not map GPU I/O space"); return false; } + fd_pipe_set_param(pipe, FD_SYSPROF, 1); + configure_counters(true, true); collect_countables(); @@ -165,14 +308,26 @@ FreedrenoDriver::enable_all_counters() } void -FreedrenoDriver::enable_perfcnt(const uint64_t /* sampling_period_ns */) +FreedrenoDriver::enable_perfcnt(const uint64_t sampling_period_ns) { + sampling_period_ns_ = sampling_period_ns; + + if (!io) { + /* reconfigure counter stream: */ + configure_counters_stream(); + collect_countables_stream(); + } } bool FreedrenoDriver::dump_perfcnt() { - if (has_suspend_count) { + /* Note, when using perfcntr stream instead of mmio basec counter + * reads, we can skip this (since the seqno in the data read from + * the stream will tell us if there is a discontinuity, and the + * kernel will handle reconfiguring counters on resume) + */ + if (has_suspend_count && io) { uint64_t val; fd_pipe_get_param(pipe, FD_SUSPEND_COUNT, &val); @@ -193,6 +348,9 @@ FreedrenoDriver::dump_perfcnt() } } + if (!io) + return collect_countables_stream(); + auto last_ts = last_dump_ts; /* Capture the timestamp from the *start* of the sampling period: */ @@ -223,11 +381,13 @@ uint64_t FreedrenoDriver::next() return ret; } -void FreedrenoDriver::disable_perfcnt() +void +FreedrenoDriver::disable_perfcnt() { - /* There isn't really any disable, only reconfiguring which countables - * get muxed to which counters - */ + if (perfcntr_stream_fd >= 0) { + close(perfcntr_stream_fd); + perfcntr_stream_fd = -1; + } } /* @@ -278,6 +438,80 @@ 
FreedrenoDriver::Countable::configure(struct fd_ringbuffer *ring, bool reset) co } } +void +FreedrenoDriver::Countable::configure_stream(struct drm_msm_perfcntr_config *req) const +{ + const struct fd_perfcntr_countable *countable = d->state[id].countable; + struct drm_msm_perfcntr_group *groups = + (struct drm_msm_perfcntr_group *)U642VOID(req->groups); + + /* Find group: */ + struct drm_msm_perfcntr_group *g = NULL; + + for (unsigned i = 0; i < req->nr_groups; i++) { + if (!strcmp(groups[i].group_name, group.c_str())) { + g = &groups[i]; + break; + } + } + + /* If not found, append a new group: */ + if (!g) { + g = &groups[req->nr_groups++]; + strcpy(g->group_name, group.c_str()); + + /* allocate countables for max # of counters in the group */ + for (unsigned i = 0; i < d->num_perfcntrs; i++) { + if (!strcmp(d->perfcntrs[i].name, group.c_str())) { + void *countables = calloc(sizeof(uint32_t), d->perfcntrs[i].num_counters); + g->countables = VOID2U64(countables); + break; + } + } + + assert(g->countables); + } + + /* Initially, just store the index within the group, since earlier groups + * are not yet fully populated (ie. 
we don't yet know the offset of the + * first sample in the group) + */ + d->state[id].idx = g->nr_countables; + + /* And last, append the countable: */ + uint32_t *countables = (uint32_t *)U642VOID(g->countables); + countables[g->nr_countables++] = countable->selector; +} + +static unsigned +find_group_offset(const struct drm_msm_perfcntr_config *req, const char *group) +{ + struct drm_msm_perfcntr_group *groups = + (struct drm_msm_perfcntr_group *)U642VOID(req->groups); + unsigned off = 0; + + for (unsigned i = 0; i < req->nr_groups; i++) { + if (!strcmp(groups[i].group_name, group)) + break; + off += groups[i].nr_countables; + } + + return off; +} + +void +FreedrenoDriver::Countable::resolve_sample_idx(const struct drm_msm_perfcntr_config *req) const +{ + d->state[id].idx += find_group_offset(req, group.c_str()); +} + +void +FreedrenoDriver::Countable::collect_stream(const uint64_t *buf) const +{ + d->state[id].last_value = d->state[id].value; + d->state[id].value = buf[d->state[id].idx]; +} + /* Collect current counter value and calculate delta since last sample: */ void FreedrenoDriver::Countable::collect() const diff --git a/src/freedreno/ds/fd_pps_driver.h b/src/freedreno/ds/fd_pps_driver.h index 81395714581..4da5cb6d808 100644 --- a/src/freedreno/ds/fd_pps_driver.h +++ b/src/freedreno/ds/fd_pps_driver.h @@ -6,6 +6,7 @@ #pragma once #include "pps/pps_driver.h" +#include "drm-uapi/msm_drm.h" extern "C" { struct fd_dev_id; @@ -54,10 +55,26 @@ private: const struct fd_dev_info *info; /** - * The memory mapped i/o space for counter readback: + * The memory mapped i/o space for counter readback (legacy): */ void *io; + /** + * perfcntr stream fd, if not using memory mapped i/o for counter + * readback. 
+ */ + int perfcntr_stream_fd = -1; + + /** + * The configured sampling period + */ + uint64_t sampling_period_ns_ = 1000000000; + + /** + * Buffer used to read samples + */ + void *sample_buf; + const struct fd_perfcntr_group *perfcntrs; unsigned num_perfcntrs; @@ -79,6 +96,9 @@ private: void configure_counters(bool reset, bool wait); void collect_countables(); + int configure_counters_stream(); + bool collect_countables_stream(); + /** * Split out countable mutable state from the class so that copy- * constructor does something sane when lambda derive function @@ -88,6 +108,9 @@ private: uint64_t last_value, value; const struct fd_perfcntr_countable *countable; const struct fd_perfcntr_counter *counter; + + /* index into perfcntr stream sample buf: */ + unsigned idx; }; std::vector state; @@ -115,6 +138,11 @@ private: void collect() const; void resolve() const; + /* perfcntr stream related APIs */ + void configure_stream(struct drm_msm_perfcntr_config *req) const; + void resolve_sample_idx(const struct drm_msm_perfcntr_config *req) const; + void collect_stream(const uint64_t *buf) const; + private: uint64_t get_value() const; From 8fbc34bdc5f42ffa347c779c3c61ac118c1f3e11 Mon Sep 17 00:00:00 2001 From: Rob Clark Date: Mon, 27 Apr 2026 09:39:52 -0700 Subject: [PATCH 09/21] freedreno/ds: Add a8xx derived counters Mostly just some counter renames (slice vs unslice, etc) Signed-off-by: Rob Clark --- src/freedreno/ds/fd_pps_a8xx.cc | 1063 +++++++++++++++++++++++++++++ src/freedreno/ds/fd_pps_driver.cc | 3 + src/freedreno/ds/fd_pps_driver.h | 1 + src/freedreno/ds/meson.build | 1 + 4 files changed, 1068 insertions(+) create mode 100644 src/freedreno/ds/fd_pps_a8xx.cc diff --git a/src/freedreno/ds/fd_pps_a8xx.cc b/src/freedreno/ds/fd_pps_a8xx.cc new file mode 100644 index 00000000000..2972d48673d --- /dev/null +++ b/src/freedreno/ds/fd_pps_a8xx.cc @@ -0,0 +1,1063 @@ +/* + * Copyright © 2021 Google, Inc. 
+ * SPDX-License-Identifier: MIT + */ + +#include "fd_pps_driver.h" + +#include +#include +#include + +#include "common/freedreno_dev_info.h" +#include "drm/freedreno_drmif.h" +#include "drm/freedreno_ringbuffer.h" +#include "perfcntrs/freedreno_dt.h" +#include "perfcntrs/freedreno_perfcntr.h" + +#include "pps/pps.h" +#include "pps/pps_algorithm.h" + +namespace pps +{ + +void +FreedrenoDriver::setup_a8xx_counters() +{ + /* TODO is there a reason to want more than one group? */ + CounterGroup group = {}; + group.name = "counters"; + groups.clear(); + counters.clear(); + countables.clear(); + enabled_counters.clear(); + groups.emplace_back(std::move(group)); + + /* So far, all a7xx devices seem to have two uSPTPs in each SP core + * and 128 ALUs in each uSPTP. + */ + const unsigned number_of_usptp = info->num_sp_cores * 2; + const unsigned number_of_alus_per_usptp = 128; + + /* The enumeration and two helper lambdas serve to handle countables + * that can be sampled from either rendering or visibility bins. + */ + enum { + BR = 0, + BV = 1, + }; + + auto cbCountable = [=](std::string group, std::string name) { + return std::array { + countable(group, name), + countable("BV_" + group, name), + }; + }; + + auto cbSum = [](const std::array& countable) { + return countable[BR] + countable[BV]; + }; + + /* This is a helper no-op lambda to handle known and understood counters + * that we can't currently implement for a variety of reasons. 
+ */ + auto disabledCounter = [](std::string, Counter::Units, std::function) { }; + + /* CP: 3/14 counters */ + auto PERF_CP_ALWAYS_COUNT = countable("CP", "PERF_CP_ALWAYS_COUNT"); + auto PERF_CP_NUM_PREEMPTIONS = countable("CP", "PERF_CP_NUM_PREEMPTIONS"); + auto PERF_CP_PREEMPTION_REACTION_DELAY = countable("CP", "PERF_CP_PREEMPTION_REACTION_DELAY"); + + /* RBBM: 1/4 counters */ + auto PERF_RBBM_US_STATUS_MASKED = countable("RBBM", "PERF_RBBM_US_STATUS_MASKED"); + + /* PC: 3/8 counters, BV_PC: 3/8 counters */ + auto PERF_PC_S_STALL_CYCLES_VFD = cbCountable("PC", "PERF_PC_S_STALL_CYCLES_VFD"); + auto PERF_PC_S_VERTEX_HITS = cbCountable("PC", "PERF_PC_S_VERTEX_HITS"); + auto PERF_PC_S_VS_INVOCATIONS = cbCountable("PC", "PERF_PC_S_VS_INVOCATIONS"); + + /* TSE: 4/8 counters */ + auto PERF_TSE_BE_INPUT_PRIM = countable("TSE", "PERF_TSE_BE_INPUT_PRIM"); + auto PERF_TSE_BE_TRIVAL_REJ_PRIM = countable("TSE", "PERF_TSE_BE_TRIVAL_REJ_PRIM"); + auto PERF_TSE_BE_CLIPPED_PRIM = countable("TSE", "PERF_TSE_BE_CLIPPED_PRIM"); + auto PERF_TSE_BE_OUTPUT_VISIBLE_PRIM = countable("TSE", "PERF_TSE_BE_OUTPUT_VISIBLE_PRIM"); + + /* UCHE: 8/12 counters */ + auto PERF_UCHE_STALL_CYCLES_ARBITER = countable("UCHE", "PERF_UCHE_STALL_CYCLES_ARBITER"); + auto PERF_UCHE_VBIF_READ_BEATS_TP = countable("UCHE", "PERF_UCHE_VBIF_READ_BEATS_TP"); + auto PERF_UCHE_VBIF_READ_BEATS_VFD = countable("UCHE", "PERF_UCHE_VBIF_READ_BEATS_VFD"); + auto PERF_UCHE_VBIF_READ_BEATS_SP = countable("UCHE", "PERF_UCHE_VBIF_READ_BEATS_SP"); + auto PERF_UCHE_READ_REQUESTS_TP = countable("UCHE", "PERF_UCHE_READ_REQUESTS_TP"); + auto PERF_UCHE_READ_REQUESTS_SP = countable("UCHE", "PERF_UCHE_READ_REQUESTS_SP"); + auto PERF_UCHE_WRITE_REQUESTS_SP = countable("UCHE", "PERF_UCHE_WRITE_REQUESTS_SP"); + auto PERF_UCHE_EVICTS = countable("UCHE", "PERF_UCHE_EVICTS"); + + /* TP: 7/12 counters, BV_TP: 6/6 counters */ + auto PERF_TP_BUSY_CYCLES = countable("TP", "PERF_TP_BUSY_CYCLES"); + auto PERF_TP_L1_CACHELINE_REQUESTS = 
cbCountable("TP", "PERF_TP_L1_CACHELINE_REQUESTS"); + auto PERF_TP_L1_CACHELINE_MISSES = cbCountable("TP", "PERF_TP_L1_CACHELINE_MISSES"); + auto PERF_TP_OUTPUT_PIXELS = cbCountable("TP", "PERF_TP_OUTPUT_PIXELS"); + auto PERF_TP_OUTPUT_PIXELS_POINT = cbCountable("TP", "PERF_TP_OUTPUT_PIXELS_POINT"); + auto PERF_TP_OUTPUT_PIXELS_BILINEAR = cbCountable("TP", "PERF_TP_OUTPUT_PIXELS_BILINEAR"); + auto PERF_TP_OUTPUT_PIXELS_ANISO = cbCountable("TP", "PERF_TP_OUTPUT_PIXELS_ANISO"); + + /* SP: 24/24 counters, BV_SP: 7/12 counters */ + auto PERF_SP_BUSY_CYCLES = countable("SP", "PERF_SP_BUSY_CYCLES"); + auto PERF_SP_ALU_WORKING_CYCLES = countable("SP", "PERF_SP_ALU_WORKING_CYCLES"); + auto PERF_SP_EFU_WORKING_CYCLES = countable("SP", "PERF_SP_EFU_WORKING_CYCLES"); + auto PERF_SP_STALL_CYCLES_TP = cbCountable("SP", "PERF_SP_STALL_CYCLES_TP"); + auto PERF_SP_NON_EXECUTION_CYCLES = countable("SP", "PERF_SP_NON_EXECUTION_CYCLES"); + auto PERF_SP_VS_STAGE_TEX_INSTRUCTIONS = cbCountable("SP", "PERF_SP_VS_STAGE_TEX_INSTRUCTIONS"); + auto PERF_SP_VS_STAGE_EFU_INSTRUCTIONS = cbCountable("SP", "PERF_SP_VS_STAGE_EFU_INSTRUCTIONS"); + auto PERF_SP_VS_STAGE_FULL_ALU_INSTRUCTIONS = cbCountable("SP", "PERF_SP_VS_STAGE_FULL_ALU_INSTRUCTIONS"); + auto PERF_SP_FS_STAGE_EFU_INSTRUCTIONS = countable("SP", "PERF_SP_FS_STAGE_EFU_INSTRUCTIONS"); + auto PERF_SP_FS_STAGE_FULL_ALU_INSTRUCTIONS = countable("SP", "PERF_SP_FS_STAGE_FULL_ALU_INSTRUCTIONS"); + auto PERF_SP_FS_STAGE_HALF_ALU_INSTRUCTIONS = countable("SP", "PERF_SP_FS_STAGE_HALF_ALU_INSTRUCTIONS"); + auto PERF_SP_ICL1_REQUESTS = cbCountable("SP", "PERF_SP_ICL1_REQUESTS"); + auto PERF_SP_ICL1_MISSES = cbCountable("SP", "PERF_SP_ICL1_MISSES"); + auto PERF_SP_ANY_EU_WORKING_FS_STAGE = countable("SP", "PERF_SP_ANY_EU_WORKING_FS_STAGE"); + auto PERF_SP_ANY_EU_WORKING_VS_STAGE = cbCountable("SP", "PERF_SP_ANY_EU_WORKING_VS_STAGE"); + auto PERF_SP_ANY_EU_WORKING_CS_STAGE = countable("SP", "PERF_SP_ANY_EU_WORKING_CS_STAGE"); + auto PERF_SP_PIXELS 
= countable("SP", "PERF_SP_PIXELS"); + auto PERF_SP_RAY_QUERY_INSTRUCTIONS = countable("SP", "PERF_SP_RAY_QUERY_INSTRUCTIONS"); + auto PERF_SP_RTU_BUSY_CYCLES = countable("SP", "PERF_SP_RTU_BUSY_CYCLES"); + auto PERF_SP_RTU_BVH_FETCH_LATENCY_CYCLES = countable("SP", "PERF_SP_RTU_BVH_FETCH_LATENCY_CYCLES"); + auto PERF_SP_RTU_BVH_FETCH_LATENCY_SAMPLES = countable("SP", "PERF_SP_RTU_BVH_FETCH_LATENCY_SAMPLES"); + auto PERF_SP_RTU_RAY_BOX_INTERSECTIONS = countable("SP", "PERF_SP_RTU_RAY_BOX_INTERSECTIONS"); + auto PERF_SP_RTU_RAY_TRIANGLE_INTERSECTIONS = countable("SP", "PERF_SP_RTU_RAY_TRIANGLE_INTERSECTIONS"); + auto PERF_SP_SCH_STALL_CYCLES_RTU = countable("SP", "PERF_SP_SCH_STALL_CYCLES_RTU"); + + /* CMP: 1/4 counters */ + auto PERF_CMPDECMP_VBIF_READ_DATA = countable("CMP", "PERF_CMPDECMP_VBIF_READ_DATA"); + + /* LRZ: 4/4 counters */ + auto PERF_LRZ_TOTAL_PIXEL = countable("LRZ", "PERF_LRZ_TOTAL_PIXEL"); + auto PERF_LRZ_VISIBLE_PIXEL_AFTER_LRZ = countable("LRZ", "PERF_LRZ_VISIBLE_PIXEL_AFTER_LRZ"); + auto PERF_LRZ_TILE_KILLED = countable("LRZ", "PERF_LRZ_TILE_KILLED"); + auto PERF_LRZ_PRIM_KILLED_BY_LRZ = countable("LRZ", "PERF_LRZ_PRIM_KILLED_BY_LRZ"); + + /** + * GPU Compute + */ + disabledCounter("Avg Load-Store Instructions Per Cycle", Counter::Units::None, [=]() { + /* Number of average Load-Store instructions per cycle. 
*/ + /* Countables: + * PERFCOUNTER_GROUP_SP::COUNTABLE_27 = PERF_SP_LM_LOAD_INSTRUCTIONS + * PERFCOUNTER_GROUP_SP::COUNTABLE_28 = PERF_SP_LM_STORE_INSTRUCTIONS + * PERFCOUNTER_GROUP_SP::COUNTABLE_30 = PERF_SP_GM_LOAD_INSTRUCTIONS + * PERFCOUNTER_GROUP_SP::COUNTABLE_31 = PERF_SP_GM_STORE_INSTRUCTIONS + * PERFCOUNTER_GROUP_SP::COUNTABLE_0 = PERF_SP_BUSY_CYCLES + * Notes: + * - FIXME: disabled due to lack of SP counter capacity + * - Equation: 4*sum(PERF_SP_{LM,GM}_{LOAD,STORE}_INSTRUCTIONS) / PERF_SP_BUSY_CYCLES + */ + return 42; + } + ); + counter("Bytes Data Actually Written", Counter::Units::Byte, [=]() { + /* Number of bytes requested to be written by the GPU. */ + /* Countables: + * PERFCOUNTER_GROUP_UCHE::COUNTABLE_18 = PERF_UCHE_EVICTS + * Notes: + * - Equation: PERF_UCHE_EVICTS * 64 + */ + return PERF_UCHE_EVICTS * 64; + } + ); + counter("Bytes Data Write Requested", Counter::Units::Byte, [=]() { + /* Number of bytes requested to be written by the GPU. */ + /* Countables: + * PERFCOUNTER_GROUP_UCHE::COUNTABLE_15 = PERF_UCHE_WRITE_REQUESTS_SP + * Notes: + * - Equation: PERF_UCHE_WRITE_REQUESTS_SP * 16 + */ + return PERF_UCHE_WRITE_REQUESTS_SP * 16; + } + ); + counter("Global Buffer Data Read BW (Bytes/sec)", Counter::Units::Byte, [=]() { + /* Number of bytes of global buffer data read in by the GPU, per second from the system memory (when the data is not found in L2 cache). */ + /* Countables: + * PERFCOUNTER_GROUP_UCHE::COUNTABLE_8 = PERF_UCHE_VBIF_READ_BEATS_SP + * Notes: + * - Equation: (PERF_UCHE_VBIF_READ_BEATS_SP * 32) / time + */ + return (PERF_UCHE_VBIF_READ_BEATS_SP * 32) / time; + } + ); + counter("Global Buffer Data Read Request BW (Bytes/sec)", Counter::Units::Byte, [=]() { + /* Number of bytes of global buffer read requests, made by a compute kernel to the L2 cache, per second. 
*/ + /* Countables: + * PERFCOUNTER_GROUP_UCHE::COUNTABLE_13 = PERF_UCHE_READ_REQUESTS_SP + * Notes: + * - Equation: (PERF_UCHE_READ_REQUESTS_SP * 16) / time + */ + return (PERF_UCHE_READ_REQUESTS_SP * 16) / time; + } + ); + counter("% Global Buffer Read L2 Hit", Counter::Units::Percent, [=]() { + /* Percentage of total global buffer read requests that were fulfilled by L2 cache hit which is populated by looking at the number of read requests that were forwarded to VBIF to read from the system memory. */ + /* Countables: + * PERFCOUNTER_GROUP_UCHE::COUNTABLE_8 = PERF_UCHE_VBIF_READ_BEATS_SP + * PERFCOUNTER_GROUP_UCHE::COUNTABLE_13 = PERF_UCHE_READ_REQUESTS_SP + * Notes: + * - Equation: (PERF_UCHE_READ_REQUESTS_SP - (PERF_UCHE_VBIF_READ_BEATS_SP / 2)) / PERF_UCHE_READ_REQUESTS_SP + */ + return percent(PERF_UCHE_READ_REQUESTS_SP - (PERF_UCHE_VBIF_READ_BEATS_SP / 2), PERF_UCHE_READ_REQUESTS_SP); + } + ); + counter("% Global Buffer Write L2 Hit", Counter::Units::Percent, [=]() { + /* Percentage of global write L2 Hit. */ + /* Countables: + * PERFCOUNTER_GROUP_UCHE::COUNTABLE_18 = PERF_UCHE_EVICTS + * PERFCOUNTER_GROUP_UCHE::COUNTABLE_15 = PERF_UCHE_WRITE_REQUESTS_SP + * Notes: + * - Equation: (PERF_UCHE_WRITE_REQUESTS_SP - PERF_UCHE_EVICTS) / PERF_UCHE_WRITE_REQUESTS_SP + */ + return percent(PERF_UCHE_WRITE_REQUESTS_SP - PERF_UCHE_EVICTS, PERF_UCHE_WRITE_REQUESTS_SP); + } + ); + counter("Global Image Compressed Data Read BW (Bytes/sec)", Counter::Units::Byte, [=]() { + /* Number of bytes of global Image data (compressed) read in by the GPU per second from the system memory (when the data is not found in L2 cache). 
*/ + /* Countables: + * PERFCOUNTER_GROUP_CMP::COUNTABLE_7 = PERF_CMPDECMP_VBIF_READ_DATA + * Notes: + * - Equation: (PERF_CMPDECMP_VBIF_READ_DATA * 32) / time + */ + return (PERF_CMPDECMP_VBIF_READ_DATA * 32) / time; + } + ); + counter("Global Image Data Read Request BW (Bytes/sec)", Counter::Units::Byte, [=]() { + /* Number of bytes of image buffer read requests, made by a compute kernel to the L2 cache, per second. */ + /* Countables: + * PERFCOUNTER_GROUP_UCHE::COUNTABLE_9 = PERF_UCHE_READ_REQUESTS_TP + * Notes: + * - Equation: (PERF_UCHE_READ_REQUESTS_TP * 16) / time + */ + return (PERF_UCHE_READ_REQUESTS_TP * 16) / time; + } + ); + counter("Global Image Uncompressed Data Read BW (Bytes/sec)", Counter::Units::Byte, [=]() { + /* Number of bytes of global Image data (uncompressed) read in by the GPU per second from the system memory (when the data is not found in L2 cache). */ + /* Countables: + * PERFCOUNTER_GROUP_UCHE::COUNTABLE_4 = PERF_UCHE_VBIF_READ_BEATS_TP + * Notes: + * - Equation: (PERF_UCHE_VBIF_READ_BEATS_TP * 32) / time + */ + return (PERF_UCHE_VBIF_READ_BEATS_TP * 32) / time; + } + ); + disabledCounter("Global Memory Atomic Instructions", Counter::Units::None, [=]() { + /* Number of Global Memory Atomic Instructions executed by SP during a given sample period. */ + /* Countables: + * PERFCOUNTER_GROUP_SP::COUNTABLE_32 = PERF_SP_GM_ATOMICS + * Notes: + * - FIXME: disabled due to lack of SP counter capacity + * - Equation: PERF_SP_GM_ATOMICS * 4 + */ + return 42; + } + ); + disabledCounter("Global Memory Load Instructions", Counter::Units::None, [=]() { + /* Number of Global Memory Load Instructions executed by SP during a given sample period. 
*/ + /* Countables: + * PERFCOUNTER_GROUP_SP::COUNTABLE_30 = PERF_SP_GM_LOAD_INSTRUCTIONS + * Notes: + * - FIXME: disabled due to lack of SP counter capacity + * - Equation: PERF_SP_GM_LOAD_INSTRUCTIONS * 4 + */ + return 42; + } + ); + disabledCounter("Global Memory Store Instructions", Counter::Units::None, [=]() { + /* Number of Global Memory Store Instructions executed by SP during a given sample period. */ + /* Countables: + * PERFCOUNTER_GROUP_SP::COUNTABLE_31 = PERF_SP_GM_STORE_INSTRUCTIONS + * Notes: + * - FIXME: disabled due to lack of SP counter capacity + * - Equation: PERF_SP_GM_STORE_INSTRUCTIONS * 4 + */ + return 42; + } + ); + counter("% Image Read L2 Hit", Counter::Units::Percent, [=]() { + /* Percentage of total image read requests that were fulfilled by L2 cache hit which is populated by looking at the number of read requests that were forwarded to VBIF to read from the system memory. */ + /* Countables: + * PERFCOUNTER_GROUP_UCHE::COUNTABLE_4 = PERF_UCHE_VBIF_READ_BEATS_TP + * PERFCOUNTER_GROUP_UCHE::COUNTABLE_9 = PERF_UCHE_READ_REQUESTS_TP + * Notes: + * - Equation: (PERF_UCHE_READ_REQUESTS_TP - (PERF_UCHE_VBIF_READ_BEATS_TP / 2)) / PERF_UCHE_READ_REQUESTS_TP + */ + return percent(PERF_UCHE_READ_REQUESTS_TP - (PERF_UCHE_VBIF_READ_BEATS_TP / 2), PERF_UCHE_READ_REQUESTS_TP); + } + ); + counter("% Kernel Load Cycles", Counter::Units::Percent, [=]() { + /* Percentage of cycles used for a compute kernel loading; excludes execution cycles. 
*/ + /* Countables: + * PERFCOUNTER_GROUP_CP::COUNTABLE_0 = PERF_CP_ALWAYS_COUNT + * PERFCOUNTER_GROUP_SP::COUNTABLE_0 = PERF_SP_BUSY_CYCLES + * PERFCOUNTER_GROUP_RBBM::COUNTABLE_6 = PERF_RBBM_STATUS_MASKED + * Notes: + * - Equation: (PERF_RBBM_STATUS_MASKED - (PERF_SP_BUSY_CYCLES * #uSPTP)) / PERF_CP_ALWAYS_COUNT + */ + return percent(PERF_RBBM_US_STATUS_MASKED - (PERF_SP_BUSY_CYCLES * number_of_usptp), PERF_CP_ALWAYS_COUNT); + } + ); + counter("% L1 Hit", Counter::Units::Percent, [=]() { + /* Percentage of L1 texture cache requests that were hits. */ + /* Countables: + * PERFCOUNTER_GROUP_TP::COUNTABLE_6 = PERF_TP_L1_CACHELINE_REQUESTS + * PERFCOUNTER_GROUP_TP::COUNTABLE_7 = PERF_TP_L1_CACHELINE_MISSES + * Notes: + * - Equation: (PERF_TP_L1_CACHELINE_REQUESTS - PERF_TP_L1_CACHELINE_MISSES) / PERF_TP_L1_CACHELINE_REQUESTS + */ + return percent(PERF_TP_L1_CACHELINE_REQUESTS[BR] - PERF_TP_L1_CACHELINE_MISSES[BR], PERF_TP_L1_CACHELINE_REQUESTS[BR]); + } + ); + disabledCounter("Load-Store Utilization", Counter::Units::Percent, [=]() { + /* Percentage of the Load-Store unit is utilized compared to theoretical Load/Store throughput. */ + /* Countables: + * PERFCOUNTER_GROUP_SP::COUNTABLE_63 = PERF_SP_LOAD_CONTROL_WORKING_CYCLES + * PERFCOUNTER_GROUP_SP::COUNTABLE_0 = PERF_SP_BUSY_CYCLES + * Notes: + * - FIXME: disabled due to lack of SP counter capacity + * - Equation: PERF_SP_LOAD_CONTROL_WORKING_CYCLES / PERF_SP_BUSY_CYCLES + */ + return 42; + } + ); + disabledCounter("Local Memory Atomic Instructions", Counter::Units::None, [=]() { + /* Number of Local Memory Atomic Instructions executed by SP during a given sample period. 
*/ + /* Countables: + * PERFCOUNTER_GROUP_SP::COUNTABLE_29 = PERF_SP_LM_ATOMICS + * Notes: + * - FIXME: disabled due to lack of SP counter capacity + * - Equation: PERF_SP_LM_ATOMICS * 4 + */ + return 42; + } + ); + disabledCounter("Local Memory Load Instructions", Counter::Units::None, [=]() { + /* Number of Local Memory Load Instructions executed by SP during a given sample period. */ + /* Countables: + * PERFCOUNTER_GROUP_SP::COUNTABLE_27 = PERF_SP_LM_LOAD_INSTRUCTIONS + * Notes: + * - FIXME: disabled due to lack of SP counter capacity + * - Equation: PERF_SP_LM_LOAD_INSTRUCTIONS * 4 + */ + return 42; + } + ); + disabledCounter("Local Memory Store Instructions", Counter::Units::None, [=]() { + /* Number of Local Memory Store Instructions executed by SP during a given sample period. */ + /* Countables: + * PERFCOUNTER_GROUP_SP::COUNTABLE_28 = PERF_SP_LM_STORE_INSTRUCTIONS + * Notes: + * - FIXME: disabled due to lack of SP counter capacity + * - Equation: PERF_SP_LM_STORE_INSTRUCTIONS * 4 + */ + return 42; + } + ); + + /** + * GPU General + */ + disabledCounter("Clocks / Second", Counter::Units::None, [=]() { + /* Number of GPU clocks per second. */ + /* Countables: + * PERFCOUNTER_GROUP_CP::COUNTABLE_0 = PERF_CP_ALWAYS_COUNT + * Notes: + * - TODO: with Adaptive Clock Distribution, the measured values are much more varied + * than the constant GPU frequency value we currently get, so this counter is disabled + * for now in favor of the GPU Frequency counter below. + * - Equation: PERF_CP_ALWAYS_COUNT / time + */ + return 42; + } + ); + disabledCounter("GPU % Bus Busy", Counter::Units::Percent, [=]() { + /* Approximate Percentage of time the GPU's bus to system memory is busy. 
*/ + /* Countables: + * PERFCOUNTER_GROUP_RBBM::COUNTABLE_6 = PERF_RBBM_STATUS_MASKED + * PERFCOUNTER_GROUP_UCHE::COUNTABLE_1 = PERF_UCHE_STALL_CYCLES_ARBITER + * PERFCOUNTER_GROUP_VBIF::COUNTABLE_34 = PERF_GBIF_AXI0_READ_DATA_BEATS_TOTAL + * PERFCOUNTER_GROUP_VBIF::COUNTABLE_35 = PERF_GBIF_AXI1_READ_DATA_BEATS_TOTAL + * PERFCOUNTER_GROUP_VBIF::COUNTABLE_46 = PERF_GBIF_AXI0_WRITE_DATA_BEATS_TOTAL + * PERFCOUNTER_GROUP_VBIF::COUNTABLE_47 = PERF_GBIF_AXI1_WRITE_DATA_BEATS_TOTAL + * Notes: + * - TODO: requires VBIF perfcounter group exposure which isn't trivial because of + * more complex way that those counters are enabled + * - Equation: (PERF_UCHE_STALL_CYCLES_ARBITER + sum(PERF_GBIF_AXI{0,1}_{READ,WRITE}_DATA_BEATS_TOTAL)) / (4 * PERF_RBBM_STATUS_MASKED) + */ + return 42; + } + ); + counter("GPU Frequency", Counter::Units::None, [=]() { + /* Notes: + * - TODO: Should read from (an equivalent of) /sys/class/kgsl/kgsl-3d0/gpuclk + * - Same value can be retrieved through PERF_CP_ALWAYS_COUNT, until ACD enables adaptive + * GPU frequencies that would be covered by the Clocks / Second counter above. + */ + return PERF_CP_ALWAYS_COUNT / time; + } + ); + disabledCounter("GPU Temperature", Counter::Units::None, [=]() { + /* TODO: Should read from (an equivalent of) /sys/class/kgsl/kgsl-3d0/temp */ + return 42; + } + ); + counter("GPU % Utilization", Counter::Units::Percent, [=]() { + /* Percentage utilization of the GPU. */ + /* Countables: + * PERFCOUNTER_GROUP_RBBM::COUNTABLE_6 = PERF_RBBM_STATUS_MASKED + */ + return percent(PERF_RBBM_US_STATUS_MASKED, max_freq); + } + ); + + /** + * GPU Memory Stats + */ + counter("Avg Bytes / Fragment", Counter::Units::Byte, [=]() { + /* Average number of bytes transferred from main memory for each fragment. 
*/ + /* Countables: + * PERFCOUNTER_GROUP_UCHE::COUNTABLE_4 = PERF_UCHE_VBIF_READ_BEATS_TP + * PERFCOUNTER_GROUP_SP::COUNTABLE_101 = PERF_SP_PIXELS + */ + return safe_div(PERF_UCHE_VBIF_READ_BEATS_TP * 32, PERF_SP_PIXELS); + } + ); + counter("Avg Bytes / Vertex", Counter::Units::Byte, [=]() { + /* Average number of bytes transferred from main memory for each vertex. */ + /* Countables: + * PERFCOUNTER_GROUP_UCHE::COUNTABLE_5 = PERF_UCHE_VBIF_READ_BEATS_VFD + * PERFCOUNTER_GROUP_PC::COUNTABLE_25 = PERF_PC_VS_INVOCATIONS + * PERFCOUNTER_GROUP_BV_PC::COUNTABLE_25 = PERF_PC_VS_INVOCATIONS + */ + return safe_div(PERF_UCHE_VBIF_READ_BEATS_VFD * 32, cbSum(PERF_PC_S_VS_INVOCATIONS)); + } + ); + disabledCounter("Read Total (Bytes/sec)", Counter::Units::Byte, [=]() { + /* Total number of bytes read by the GPU from memory, per second. */ + /* Countables: + * PERFCOUNTER_GROUP_VBIF::COUNTABLE_34 = PERF_GBIF_AXI0_READ_DATA_BEATS_TOTAL + * PERFCOUNTER_GROUP_VBIF::COUNTABLE_35 = PERF_GBIF_AXI1_READ_DATA_BEATS_TOTAL + * Notes: + * - TODO: requires VBIF perfcounter group exposure which isn't trivial because of + * more complex way that those counters are enabled + * - Equation: (PERF_GBIF_AXI0_READ_DATA_BEATS_TOTAL + PERF_GBIF_AXI1_READ_DATA_BEATS_TOTAL) * 32 / time + */ + return 42; + } + ); + counter("SP Memory Read (Bytes/sec)", Counter::Units::Byte, [=]() { + /* Bytes of data read from memory by the Shader Processors, per second. */ + /* Countables: + * PERFCOUNTER_GROUP_UCHE::COUNTABLE_8 = PERF_UCHE_VBIF_READ_BEATS_SP + */ + return (PERF_UCHE_VBIF_READ_BEATS_SP * 32) / time; + } + ); + counter("Texture Memory Read BW (Bytes/sec)", Counter::Units::Byte, [=]() { + /* Bytes of texture data read from memory per second. 
*/ + /* Countables: + * PERFCOUNTER_GROUP_UCHE::COUNTABLE_4 = PERF_UCHE_VBIF_READ_BEATS_TP + * PERFCOUNTER_GROUP_CMP::COUNTABLE_7 = PERF_CMPDECMP_VBIF_READ_DATA + */ + return ((PERF_UCHE_VBIF_READ_BEATS_TP + PERF_CMPDECMP_VBIF_READ_DATA) * 32) / time; + } + ); + counter("Vertex Memory Read (Bytes/sec)", Counter::Units::Byte, [=]() { + /* Bytes of vertex data read from memory per second. */ + /* Countables: + * PERFCOUNTER_GROUP_UCHE::COUNTABLE_5 = PERF_UCHE_VBIF_READ_BEATS_VFD + */ + return (PERF_UCHE_VBIF_READ_BEATS_VFD * 32) / time; + } + ); + disabledCounter("Write Total (Bytes/sec)", Counter::Units::Byte, [=]() { + /* Total number of bytes written by the GPU to memory, per second. */ + /* Countables: + * PERFCOUNTER_GROUP_VBIF::COUNTABLE_46 = PERF_GBIF_AXI0_WRITE_DATA_BEATS_TOTAL + * PERFCOUNTER_GROUP_VBIF::COUNTABLE_47 = PERF_GBIF_AXI1_WRITE_DATA_BEATS_TOTAL + * Notes: + * - TODO: requires VBIF perfcounter group exposure which isn't trivial because of + * more complex way that those counters are enabled + * - Equation: (PERF_GBIF_AXI0_WRITE_DATA_BEATS_TOTAL + PERF_GBIF_AXI1_WRITE_DATA_BEATS_TOTAL) * 32 / time + */ + return 42; + } + ); + + /** + * GPU Preemption + */ + counter("Avg Preemption Delay", Counter::Units::None, [=]() { + /* Average time (us) from the preemption request to preemption start. 
*/ + /* Countables: + * PERFCOUNTER_GROUP_CP::COUNTABLE_4 = PERF_CP_PREEMPTION_REACTION_DELAY + * PERFCOUNTER_GROUP_CP::COUNTABLE_3 = PERF_CP_NUM_PREEMPTIONS + * PERFCOUNTER_GROUP_CP::COUNTABLE_0 = PERF_CP_ALWAYS_COUNT + * Note: + * - PERF_CP_NUM_PREEMPTIONS has to be divided by 2 + */ + if (!PERF_CP_ALWAYS_COUNT || !PERF_CP_NUM_PREEMPTIONS) + return 0.0; + + double clocks_per_us = (double)PERF_CP_ALWAYS_COUNT / (time * 1000000); + double delay_us = PERF_CP_PREEMPTION_REACTION_DELAY / clocks_per_us; + return delay_us / ((double)PERF_CP_NUM_PREEMPTIONS / 2); + } + ); + counter("Preemptions / second", Counter::Units::None, [=]() { + /* The number of GPU preemptions that occurred, per second. */ + /* Countables: + * PERFCOUNTER_GROUP_CP::COUNTABLE_3 = PERF_CP_NUM_PREEMPTIONS + * Note: + * - PERF_CP_NUM_PREEMPTIONS has to be divided by 2 + */ + return PERF_CP_NUM_PREEMPTIONS / (2 * time); + } + ); + + /** + * GPU Primitive Processing + */ + counter("Average Polygon Area", Counter::Units::None, [=]() { + /* Average number of pixels per polygon. */ + /* Countables: + * PERFCOUNTER_GROUP_TSE::COUNTABLE_14 = PERF_TSE_OUTPUT_VISIBLE_PRIM + * PERFCOUNTER_GROUP_SP::COUNTABLE_101 = PERF_SP_PIXELS + */ + return safe_div(PERF_SP_PIXELS, PERF_TSE_BE_OUTPUT_VISIBLE_PRIM); + } + ); + counter("Average Vertices / Polygon", Counter::Units::None, [=]() { + /* Average number of vertices per polygon. */ + /* Countables: + * PERFCOUNTER_GROUP_PC::COUNTABLE_25 = PERF_PC_VS_INVOCATIONS + * PERFCOUNTER_GROUP_BV_PC::COUNTABLE_25 = PERF_PC_VS_INVOCATIONS + * PERFCOUNTER_GROUP_TSE::COUNTABLE_6 = PERF_TSE_INPUT_PRIM + */ + return safe_div(cbSum(PERF_PC_S_VS_INVOCATIONS), PERF_TSE_BE_INPUT_PRIM); + } + ); + counter("Pre-clipped Polygons / Second", Counter::Units::None, [=]() { + /* Number of polygons submitted to the GPU, per second, before any hardware clipping. 
*/ + /* Countables: + * PERFCOUNTER_GROUP_TSE::COUNTABLE_6 = PERF_TSE_INPUT_PRIM + */ + return PERF_TSE_BE_INPUT_PRIM / time; + } + ); + counter("% Prims Clipped", Counter::Units::Percent, [=]() { + /* Percentage of primitives clipped by the GPU (where new primitives are generated). */ + /* Countables: + * PERFCOUNTER_GROUP_TSE::COUNTABLE_9 = PERF_TSE_CLIPPED_PRIM + * PERFCOUNTER_GROUP_TSE::COUNTABLE_6 = PERF_TSE_INPUT_PRIM + */ + return percent(PERF_TSE_BE_CLIPPED_PRIM, PERF_TSE_BE_INPUT_PRIM); + } + ); + counter("% Prims Trivially Rejected", Counter::Units::Percent, [=]() { + /* Percentage of primitives that are trivially rejected. */ + /* Countables: + * PERFCOUNTER_GROUP_TSE::COUNTABLE_8 = PERF_TSE_TRIVAL_REJ_PRIM + * PERFCOUNTER_GROUP_TSE::COUNTABLE_6 = PERF_TSE_INPUT_PRIM + */ + return percent(PERF_TSE_BE_TRIVAL_REJ_PRIM, PERF_TSE_BE_INPUT_PRIM); + } + ); + counter("Reused Vertices / Second", Counter::Units::None, [=]() { + /* Number of vertices used from the post-transform vertex buffer cache, per second. */ + /* Countables: + * PERFCOUNTER_GROUP_PC::COUNTABLE_19 = PERF_PC_VERTEX_HITS + * PERFCOUNTER_GROUP_BV_PC::COUNTABLE_19 = PERF_PC_VERTEX_HITS + */ + return cbSum(PERF_PC_S_VERTEX_HITS) / time; + } + ); + + /** + * GPU Shader Processing + */ + counter("ALU / Fragment", Counter::Units::None, [=]() { + /* Average number of scalar fragment shader ALU instructions issued per shaded fragment, expressed as full precision ALUs (2 mediump = 1 fullp). */ + /* Countables: + * PERFCOUNTER_GROUP_SP::COUNTABLE_40 = PERF_SP_FS_STAGE_FULL_ALU_INSTRUCTIONS + * PERFCOUNTER_GROUP_SP::COUNTABLE_41 = PERF_SP_FS_STAGE_HALF_ALU_INSTRUCTIONS + * PERFCOUNTER_GROUP_SP::COUNTABLE_99 = PERF_SP_QUADS + * Notes: + * - PERF_SP_PIXELS is used instead of PERF_SP_QUADS to avoid SP counter group overcapacity. + * - PERF_SP_PIXELS ~ PERF_SP_QUADS * 4 + * - original equation uses unmultiplied QUADS as denominator, we use PIXELS ~ QUADS * 4 + * to match other per-fragment counters. 
+ */ + return safe_div(PERF_SP_FS_STAGE_FULL_ALU_INSTRUCTIONS + PERF_SP_FS_STAGE_HALF_ALU_INSTRUCTIONS / 2, + PERF_SP_PIXELS); + } + ); + counter("ALU / Vertex", Counter::Units::None, [=]() { + /* Average number of vertex scalar shader ALU instructions issued per shaded vertex. */ + /* Countables: + * PERFCOUNTER_GROUP_PC::COUNTABLE_25 = PERF_PC_VS_INVOCATIONS + * PERFCOUNTER_GROUP_BV_PC::COUNTABLE_25 = PERF_PC_VS_INVOCATIONS + * PERFCOUNTER_GROUP_SP::COUNTABLE_35 = PERF_SP_VS_STAGE_FULL_ALU_INSTRUCTIONS + * PERFCOUNTER_GROUP_BV_SP::COUNTABLE_35 = PERF_SP_VS_STAGE_FULL_ALU_INSTRUCTIONS + * Notes: + * - Numerator has to be multiplied by four. + * - For some reason half-precision ALUs are not counted. + */ + return safe_div(4 * cbSum(PERF_SP_VS_STAGE_FULL_ALU_INSTRUCTIONS), cbSum(PERF_PC_S_VS_INVOCATIONS)); + } + ); + counter("% Anisotropic Filtered", Counter::Units::Percent, [=]() { + /* Percent of texels filtered using the 'Anisotropic' sampling method. */ + /* Countables: + * PERFCOUNTER_GROUP_TP::COUNTABLE_10 = PERF_TP_OUTPUT_PIXELS + * PERFCOUNTER_GROUP_BV_TP::COUNTABLE_10 = PERF_TP_OUTPUT_PIXELS + * PERFCOUNTER_GROUP_TP::COUNTABLE_28 = PERF_TP_OUTPUT_PIXELS_ANISO + * PERFCOUNTER_GROUP_BV_TP::COUNTABLE_28 = PERF_TP_OUTPUT_PIXELS_ANISO + */ + return safe_div(cbSum(PERF_TP_OUTPUT_PIXELS_ANISO), cbSum(PERF_TP_OUTPUT_PIXELS)); + } + ); + counter("Average BVH Fetch Latency Cycles", Counter::Units::None, [=]() { + /* The Average BVH Fetch Latency cycles is the latency counted from start of BVH query request till getting BVH Query result back. */ + /* Countables: + * PERFCOUNTER_GROUP_SP::COUNTABLE_139 = PERF_SP_RTU_BVH_FETCH_LATENCY_CYCLES + * PERFCOUNTER_GROUP_SP::COUNTABLE_140 = PERF_SP_RTU_BVH_FETCH_LATENCY_SAMPLES + * Notes: + * - TODO: provisional implementation, wasn't able to verify. 
+ */ + return safe_div(PERF_SP_RTU_BVH_FETCH_LATENCY_CYCLES, PERF_SP_RTU_BVH_FETCH_LATENCY_SAMPLES); + } + ); + counter("EFU / Fragment", Counter::Units::None, [=]() { + /* Average number of scalar fragment shader EFU instructions issued per shaded fragment. */ + /* Countables: + * PERFCOUNTER_GROUP_SP::COUNTABLE_39 = PERF_SP_FS_STAGE_EFU_INSTRUCTIONS + * PERFCOUNTER_GROUP_SP::COUNTABLE_99 = PERF_SP_QUADS + * Notes: + * - PERF_SP_PIXELS is used instead of PERF_SP_QUADS to avoid SP counter group overcapacity. + * - PERF_SP_PIXELS ~ PERF_SP_QUADS * 4 + * - original equation uses unmultiplied QUADS as denominator, we use PIXELS ~ QUADS * 4 + * to match other per-fragment counters. + */ + return safe_div(PERF_SP_FS_STAGE_EFU_INSTRUCTIONS, PERF_SP_PIXELS); + } + ); + counter("EFU / Vertex", Counter::Units::None, [=]() { + /* Average number of scalar vertex shader EFU instructions issued per shaded vertex. */ + /* Countables: + * PERFCOUNTER_GROUP_PC::COUNTABLE_25 = PERF_PC_VS_INVOCATIONS + * PERFCOUNTER_GROUP_BV_PC::COUNTABLE_25 = PERF_PC_VS_INVOCATIONS + * PERFCOUNTER_GROUP_SP::COUNTABLE_34 = PERF_SP_VS_STAGE_EFU_INSTRUCTIONS + * PERFCOUNTER_GROUP_BV_SP::COUNTABLE_34 = PERF_SP_VS_STAGE_EFU_INSTRUCTIONS + * Notes: + * - Numerator has to be multiplied by four. + */ + return safe_div(4 * cbSum(PERF_SP_VS_STAGE_EFU_INSTRUCTIONS), cbSum(PERF_PC_S_VS_INVOCATIONS)); + } + ); + counter("Fragment ALU Instructions / Sec (Full)", Counter::Units::None, [=]() { + /* Total number of full precision fragment shader instructions issued, per second. */ + /* Countables: + * PERFCOUNTER_GROUP_SP::COUNTABLE_40 = PERF_SP_FS_STAGE_FULL_ALU_INSTRUCTIONS + * Notes: + * - Numerator has to be multiplied by four. + */ + return (PERF_SP_FS_STAGE_FULL_ALU_INSTRUCTIONS * 4) / time; + } + ); + counter("Fragment ALU Instructions / Sec (Half)", Counter::Units::None, [=]() { + /* Total number of half precision Scalar fragment shader instructions issued, per second. 
*/ + /* Countables: + * PERFCOUNTER_GROUP_SP::COUNTABLE_41 = PERF_SP_FS_STAGE_HALF_ALU_INSTRUCTIONS + * Notes: + * - Numerator has to be multiplied by four. + */ + return (PERF_SP_FS_STAGE_HALF_ALU_INSTRUCTIONS * 4) / time; + } + ); + counter("Fragment EFU Instructions / Second", Counter::Units::None, [=]() { + /* Total number of Scalar fragment shader Elementary Function Unit (EFU) instructions issued, per second. */ + /* Countables: + * PERFCOUNTER_GROUP_SP::COUNTABLE_39 = PERF_SP_FS_STAGE_EFU_INSTRUCTIONS + * Notes: + * - Numerator has to be multiplied by four. + */ + return (PERF_SP_FS_STAGE_EFU_INSTRUCTIONS * 4) / time; + } + ); + counter("Fragment Instructions / Second", Counter::Units::None, [=]() { + /* Total number of fragment shader instructions issued, per second. */ + /* Countables: + * PERFCOUNTER_GROUP_SP::COUNTABLE_39 = PERF_SP_FS_STAGE_EFU_INSTRUCTIONS + * PERFCOUNTER_GROUP_SP::COUNTABLE_40 = PERF_SP_FS_STAGE_FULL_ALU_INSTRUCTIONS + * PERFCOUNTER_GROUP_SP::COUNTABLE_41 = PERF_SP_FS_STAGE_HALF_ALU_INSTRUCTIONS + * Notes: + * - Numerator has to be multiplied by four. + */ + return (4 * (PERF_SP_FS_STAGE_EFU_INSTRUCTIONS + PERF_SP_FS_STAGE_FULL_ALU_INSTRUCTIONS + + + PERF_SP_FS_STAGE_HALF_ALU_INSTRUCTIONS / 2)) / time; + } + ); + counter("Fragments Shaded / Second", Counter::Units::None, [=]() { + /* Number of fragments submitted to the shader engine, per second. */ + /* Countables: + * PERFCOUNTER_GROUP_SP::COUNTABLE_101 = PERF_SP_PIXELS + */ + return PERF_SP_PIXELS / time; + } + ); + counter("% Linear Filtered", Counter::Units::Percent, [=]() { + /* Percent of texels filtered using the 'Linear' sampling method. 
*/ + /* Countables: + * PERFCOUNTER_GROUP_TP::COUNTABLE_10 = PERF_TP_OUTPUT_PIXELS + * PERFCOUNTER_GROUP_BV_TP::COUNTABLE_10 = PERF_TP_OUTPUT_PIXELS + * PERFCOUNTER_GROUP_TP::COUNTABLE_26 = PERF_TP_OUTPUT_PIXELS_BILINEAR + * PERFCOUNTER_GROUP_BV_TP::COUNTABLE_26 = PERF_TP_OUTPUT_PIXELS_BILINEAR + */ + return safe_div(cbSum(PERF_TP_OUTPUT_PIXELS_BILINEAR), cbSum(PERF_TP_OUTPUT_PIXELS)); + } + ); + counter("% Nearest Filtered", Counter::Units::Percent, [=]() { + /* Percent of texels filtered using the 'Nearest' sampling method. */ + /* Countables: + * PERFCOUNTER_GROUP_TP::COUNTABLE_10 = PERF_TP_OUTPUT_PIXELS + * PERFCOUNTER_GROUP_BV_TP::COUNTABLE_10 = PERF_TP_OUTPUT_PIXELS + * PERFCOUNTER_GROUP_TP::COUNTABLE_25 = PERF_TP_OUTPUT_PIXELS_POINT + * PERFCOUNTER_GROUP_BV_TP::COUNTABLE_25 = PERF_TP_OUTPUT_PIXELS_POINT + */ + return safe_div(cbSum(PERF_TP_OUTPUT_PIXELS_POINT), cbSum(PERF_TP_OUTPUT_PIXELS)); + } + ); + disabledCounter("% Non-Base Level Textures", Counter::Units::Percent, [=]() { + /* Percent of texels coming from a non-base MIP level. */ + /* Countables: + * PERFCOUNTER_GROUP_TP::COUNTABLE_10 = PERF_TP_OUTPUT_PIXELS + * PERFCOUNTER_GROUP_BV_TP::COUNTABLE_10 = PERF_TP_OUTPUT_PIXELS + * PERFCOUNTER_GROUP_TP::COUNTABLE_29 = PERF_TP_OUTPUT_PIXELS_ZERO_LOD + * PERFCOUNTER_GROUP_BV_TP::COUNTABLE_29 = PERF_TP_OUTPUT_PIXELS_ZERO_LOD + * Notes: + * - FIXME: disabled due to lack of TP counter capacity + * - Equation: 100.0 - percent(cbSum(PERF_TP_OUTPUT_PIXELS_ZERO_LOD), cbSum(PERF_TP_OUTPUT_PIXELS)); + */ + return 42; + } + ); + counter("% RTU Busy", Counter::Units::Percent, [=]() { + /* Percentage of time that Ray Tracing Unit in SP is busy compared to whole SP. */ + /* Countables: + * PERFCOUNTER_GROUP_SP::COUNTABLE_125 = PERF_SP_RTU_BUSY_CYCLES + * PERFCOUNTER_GROUP_SP::COUNTABLE_0 = PERF_SP_BUSY_CYCLES + * Notes: + * - TODO: provisional implementation, wasn't able to verify. 
+ */ + return percent(PERF_SP_RTU_BUSY_CYCLES, PERF_SP_BUSY_CYCLES); + } + ); + counter("RTU Ray Box Intersections Per Instruction", Counter::Units::None, [=]() { + /* Number of Ray Box intersections per instruction. */ + /* Countables: + * PERFCOUNTER_GROUP_SP::COUNTABLE_148 = PERF_SP_RTU_RAY_BOX_INTERSECTIONS + * PERFCOUNTER_GROUP_SP::COUNTABLE_122 = PERF_SP_RAY_QUERY_INSTRUCTIONS + * Notes: + * - TODO: provisional implementation, wasn't able to verify. + */ + return safe_div(PERF_SP_RTU_RAY_BOX_INTERSECTIONS, PERF_SP_RAY_QUERY_INSTRUCTIONS); + } + ); + counter("RTU Ray Triangle Intersections Per Instruction", Counter::Units::None, [=]() { + /* Number of Ray Triangle intersections per instruction. */ + /* Countables: + * PERFCOUNTER_GROUP_SP::COUNTABLE_149 = PERF_SP_RTU_RAY_TRIANGLE_INTERSECTIONS + * PERFCOUNTER_GROUP_SP::COUNTABLE_122 = PERF_SP_RAY_QUERY_INSTRUCTIONS + * Notes: + * - TODO: provisional implementation, wasn't able to verify. + */ + return safe_div(PERF_SP_RTU_RAY_TRIANGLE_INTERSECTIONS, PERF_SP_RAY_QUERY_INSTRUCTIONS); + } + ); + counter("% Shader ALU Capacity Utilized", Counter::Units::Percent, [=]() { + /* Percent of maximum shader capacity (ALU operations) utilized. */ + /* Countables: + * PERFCOUNTER_GROUP_SP::COUNTABLE_0 = PERF_SP_BUSY_CYCLES + * PERFCOUNTER_GROUP_SP::COUNTABLE_35 = PERF_SP_VS_STAGE_FULL_ALU_INSTRUCTIONS + * PERFCOUNTER_GROUP_BV_SP::COUNTABLE_35 = PERF_SP_VS_STAGE_FULL_ALU_INSTRUCTIONS + * PERFCOUNTER_GROUP_SP::COUNTABLE_40 = PERF_SP_FS_STAGE_FULL_ALU_INSTRUCTIONS + * PERFCOUNTER_GROUP_SP::COUNTABLE_41 = PERF_SP_FS_STAGE_HALF_ALU_INSTRUCTIONS + * Notes: + * - Numerator has to be multiplied by four. 
+ */
+ int64_t numerator = cbSum(PERF_SP_VS_STAGE_FULL_ALU_INSTRUCTIONS) +
+ PERF_SP_FS_STAGE_FULL_ALU_INSTRUCTIONS + PERF_SP_FS_STAGE_HALF_ALU_INSTRUCTIONS / 2;
+ int64_t denominator = PERF_SP_BUSY_CYCLES * number_of_alus_per_usptp;
+ return percent(numerator, denominator);
+ }
+ );
+ counter("% Shaders Busy", Counter::Units::Percent, [=]() {
+ /* Percentage of time that all Shader cores are busy. */
+ /* Countables:
+ * PERFCOUNTER_GROUP_SP::COUNTABLE_0 = PERF_SP_BUSY_CYCLES
+ * PERFCOUNTER_GROUP_TP::COUNTABLE_0 = PERF_TP_BUSY_CYCLES
+ * PERFCOUNTER_GROUP_RBBM::COUNTABLE_6 = PERF_RBBM_STATUS_MASKED
+ * Notes:
+ * - SP_BUSY_CYCLES seems to be used as the numerator -- unless it's zero,
+ * at which point TP_BUSY_CYCLES seems to be used instead.
+ */
+
+ int64_t numerator = PERF_SP_BUSY_CYCLES;
+ if (!numerator)
+ numerator = PERF_TP_BUSY_CYCLES;
+ return percent(numerator, number_of_usptp * PERF_RBBM_US_STATUS_MASKED);
+ }
+ );
+ counter("% Shaders Stalled", Counter::Units::Percent, [=]() {
+ /* Percentage of time that all shader cores are idle with at least one active wave. */
+ /* Countables:
+ * PERFCOUNTER_GROUP_SP::COUNTABLE_7 = PERF_SP_NON_EXECUTION_CYCLES
+ * PERFCOUNTER_GROUP_RBBM::COUNTABLE_6 = PERF_RBBM_STATUS_MASKED
+ */
+ return percent(PERF_SP_NON_EXECUTION_CYCLES, number_of_usptp * PERF_RBBM_US_STATUS_MASKED);
+ }
+ );
+ counter("% Texture Pipes Busy", Counter::Units::Percent, [=]() {
+ /* Percentage of time that any texture pipe is busy. */
+ /* Countables:
+ * PERFCOUNTER_GROUP_TP::COUNTABLE_0 = PERF_TP_BUSY_CYCLES
+ * PERFCOUNTER_GROUP_RBBM::COUNTABLE_6 = PERF_RBBM_STATUS_MASKED
+ */
+ return percent(PERF_TP_BUSY_CYCLES, number_of_usptp * PERF_RBBM_US_STATUS_MASKED);
+ }
+ );
+ counter("Textures / Fragment", Counter::Units::None, [=]() {
+ /* Average number of textures referenced per fragment. 
*/ + /* Countables: + * PERFCOUNTER_GROUP_SP::COUNTABLE_33 = PERF_SP_VS_STAGE_TEX_INSTRUCTIONS + * PERFCOUNTER_GROUP_TP::COUNTABLE_10 = PERF_TP_OUTPUT_PIXELS + * PERFCOUNTER_GROUP_SP::COUNTABLE_101 = PERF_SP_PIXELS + */ + return safe_div(PERF_TP_OUTPUT_PIXELS[BR], PERF_SP_PIXELS); + } + ); + counter("Textures / Vertex", Counter::Units::None, [=]() { + /* Average number of textures referenced per vertex. */ + /* Countables: + * PERFCOUNTER_GROUP_PC::COUNTABLE_25 = PERF_PC_VS_INVOCATIONS + * PERFCOUNTER_GROUP_BV_PC::COUNTABLE_25 = PERF_PC_VS_INVOCATIONS + * PERFCOUNTER_GROUP_SP::COUNTABLE_33 = PERF_SP_VS_STAGE_TEX_INSTRUCTIONS + * PERFCOUNTER_GROUP_BV_SP::COUNTABLE_33 = PERF_SP_VS_STAGE_TEX_INSTRUCTIONS + * Notes: + * - Numerator has to be multiplied by four. + */ + return safe_div(4 * cbSum(PERF_SP_VS_STAGE_TEX_INSTRUCTIONS), cbSum(PERF_PC_S_VS_INVOCATIONS)); + } + ); + counter("% Time ALUs Working", Counter::Units::Percent, [=]() { + /* Percentage of time the ALUs are working while the Shaders are busy. */ + /* Countables: + * PERFCOUNTER_GROUP_SP::COUNTABLE_0 = PERF_SP_BUSY_CYCLES + * PERFCOUNTER_GROUP_SP::COUNTABLE_1 = PERF_SP_ALU_WORKING_CYCLES + * Notes: + * - ALU working cycles have to be halved. + */ + return percent(PERF_SP_ALU_WORKING_CYCLES / 2, PERF_SP_BUSY_CYCLES); + } + ); + counter("% Time Compute", Counter::Units::Percent, [=]() { + /* Amount of time spent in compute work compared to the total time spent shading everything. */ + /* Countables: + * PERFCOUNTER_GROUP_SP::COUNTABLE_74 = PERF_SP_ANY_EU_WORKING_FS_STAGE + * PERFCOUNTER_GROUP_SP::COUNTABLE_76 = PERF_SP_ANY_EU_WORKING_VS_STAGE + * PERFCOUNTER_GROUP_BV_SP::COUNTABLE_76 = PERF_SP_ANY_EU_WORKING_VS_STAGE + * PERFCOUNTER_GROUP_SP::COUNTABLE_78 = PERF_SP_ANY_EU_WORKING_CS_STAGE + * CS_STAGE amount is also counted in FS_STAGE, so it shouldn't be summed into the total value. 
+ */ + int64_t total = PERF_SP_ANY_EU_WORKING_FS_STAGE + + cbSum(PERF_SP_ANY_EU_WORKING_VS_STAGE); + return percent(PERF_SP_ANY_EU_WORKING_CS_STAGE, total); + } + ); + counter("% Time EFUs Working", Counter::Units::Percent, [=]() { + /* Percentage of time the EFUs are working while the Shaders are busy. */ + /* Countables: + * PERFCOUNTER_GROUP_SP::COUNTABLE_0 = PERF_SP_BUSY_CYCLES + * PERFCOUNTER_GROUP_SP::COUNTABLE_2 = PERF_SP_EFU_WORKING_CYCLES + */ + return percent(PERF_SP_EFU_WORKING_CYCLES, PERF_SP_BUSY_CYCLES); + } + ); + counter("% Time Shading Fragments", Counter::Units::Percent, [=]() { + /* Amount of time spent shading fragments compared to the total time spent shading everything. */ + /* Countables: + * PERFCOUNTER_GROUP_SP::COUNTABLE_74 = PERF_SP_ANY_EU_WORKING_FS_STAGE + * PERFCOUNTER_GROUP_SP::COUNTABLE_76 = PERF_SP_ANY_EU_WORKING_VS_STAGE + * PERFCOUNTER_GROUP_BV_SP::COUNTABLE_76 = PERF_SP_ANY_EU_WORKING_VS_STAGE + * PERFCOUNTER_GROUP_SP::COUNTABLE_78 = PERF_SP_ANY_EU_WORKING_CS_STAGE + * Notes: + * - CS_STAGE amount is also counted in FS_STAGE, so fragment time has to be retrieved + * through subtraction and the compute time shouldn't be summed into the total value. + */ + int64_t fragments = PERF_SP_ANY_EU_WORKING_FS_STAGE - PERF_SP_ANY_EU_WORKING_CS_STAGE; + int64_t total = PERF_SP_ANY_EU_WORKING_FS_STAGE + + cbSum(PERF_SP_ANY_EU_WORKING_VS_STAGE); + return percent(fragments, total); + } + ); + counter("% Time Shading Vertices", Counter::Units::Percent, [=]() { + /* Amount of time spent shading vertices compared to the total time spent shading everything. 
*/
+ /* Countables:
+ * PERFCOUNTER_GROUP_SP::COUNTABLE_74 = PERF_SP_ANY_EU_WORKING_FS_STAGE
+ * PERFCOUNTER_GROUP_BV_SP::COUNTABLE_74 = PERF_SP_ANY_EU_WORKING_FS_STAGE
+ * PERFCOUNTER_GROUP_SP::COUNTABLE_76 = PERF_SP_ANY_EU_WORKING_VS_STAGE
+ * PERFCOUNTER_GROUP_BV_SP::COUNTABLE_76 = PERF_SP_ANY_EU_WORKING_VS_STAGE
+ * Notes:
+ * - CS_STAGE amount is also counted in FS_STAGE, so it shouldn't be summed into the total value.
+ */
+ int64_t total = PERF_SP_ANY_EU_WORKING_FS_STAGE +
+ cbSum(PERF_SP_ANY_EU_WORKING_VS_STAGE);
+ return percent(cbSum(PERF_SP_ANY_EU_WORKING_VS_STAGE), total);
+ }
+ );
+ counter("Vertex Instructions / Second", Counter::Units::None, [=]() {
+ /* Total number of scalar vertex shader instructions issued, per second. */
+ /* Countables:
+ * PERFCOUNTER_GROUP_SP::COUNTABLE_34 = PERF_SP_VS_STAGE_EFU_INSTRUCTIONS
+ * PERFCOUNTER_GROUP_SP::COUNTABLE_35 = PERF_SP_VS_STAGE_FULL_ALU_INSTRUCTIONS
+ * PERFCOUNTER_GROUP_BV_SP::COUNTABLE_34 = PERF_SP_VS_STAGE_EFU_INSTRUCTIONS
+ * PERFCOUNTER_GROUP_BV_SP::COUNTABLE_35 = PERF_SP_VS_STAGE_FULL_ALU_INSTRUCTIONS
+ * Notes:
+ * - Numerator has to be multiplied by four.
+ */
+ return (4 * (cbSum(PERF_SP_VS_STAGE_EFU_INSTRUCTIONS) + cbSum(PERF_SP_VS_STAGE_FULL_ALU_INSTRUCTIONS))) / time;
+ }
+ );
+ counter("Vertices Shaded / Second", Counter::Units::None, [=]() {
+ /* Number of vertices submitted to the shader engine, per second. */
+ /* Countables:
+ * PERFCOUNTER_GROUP_PC::COUNTABLE_25 = PERF_PC_VS_INVOCATIONS
+ * PERFCOUNTER_GROUP_BV_PC::COUNTABLE_25 = PERF_PC_VS_INVOCATIONS
+ */
+ return cbSum(PERF_PC_S_VS_INVOCATIONS) / time;
+ }
+ );
+ disabledCounter("% Wave Context Occupancy", Counter::Units::Percent, [=]() {
+ /* Average percentage of wave context occupancy per cycle. 
*/ + /* Countables: + * PERFCOUNTER_GROUP_SP::COUNTABLE_8 = PERF_SP_WAVE_CONTEXTS + * PERFCOUNTER_GROUP_SP::COUNTABLE_9 = PERF_SP_WAVE_CONTEXT_CYCLES + * Note: + * - FIXME: disabled due to lack of SP counter capacity + * - the quotient has to be divided by the number of execution wave slots per SP (16 on a7xx) + * - Equation: (PERF_SP_WAVE_CONTEXTS / PERF_SP_WAVE_CONTEXT_CYCLES) / number_of_execution_wave_slots_per_sp; + */ + return 42; + } + ); + + /** + * GPU Stalls + */ + counter("% BVH Fetch Stall", Counter::Units::Percent, [=]() { + /* Percentage of clock cycles where the RTU could not make any more requests for BVH fetch from scheduler. */ + /* Countables: + * PERFCOUNTER_GROUP_SP::COUNTABLE_150 = PERF_SP_SCH_STALL_CYCLES_RTU + * PERFCOUNTER_GROUP_RBBM::COUNTABLE_6 = PERF_RBBM_STATUS_MASKED + * Notes: + * - TODO: provisional implementation, wasn't able to verify. + */ + return percent(PERF_SP_SCH_STALL_CYCLES_RTU, PERF_RBBM_US_STATUS_MASKED); + } + ); + counter("% Instruction Cache Miss", Counter::Units::Percent, [=]() { + /* Number of L1 instruction cache misses divided by L1 instruction cache requests. */ + /* Countables: + * PERFCOUNTER_GROUP_SP::COUNTABLE_51 = PERF_SP_ICL1_REQUESTS + * PERFCOUNTER_GROUP_SP::COUNTABLE_52 = PERF_SP_ICL1_MISSES + * PERFCOUNTER_GROUP_BV_SP::COUNTABLE_51 = PERF_SP_ICL1_REQUESTS + * PERFCOUNTER_GROUP_BV_SP::COUNTABLE_52 = PERF_SP_ICL1_MISSES + */ + return percent(cbSum(PERF_SP_ICL1_MISSES), cbSum(PERF_SP_ICL1_REQUESTS)); + } + ); + counter("L1 Texture Cache Miss Per Pixel", Counter::Units::None, [=]() { + /* Average number of Texture L1 cache misses per pixel. 
*/ + /* Countables: + * PERFCOUNTER_GROUP_TP::COUNTABLE_7 = PERF_TP_L1_CACHELINE_MISSES + * PERFCOUNTER_GROUP_BV_TP::COUNTABLE_7 = PERF_TP_L1_CACHELINE_MISSES + * PERFCOUNTER_GROUP_SP::COUNTABLE_101 = PERF_SP_PIXELS + */ + return safe_div(cbSum(PERF_TP_L1_CACHELINE_MISSES), PERF_SP_PIXELS); + } + ); + counter("% Stalled On System Memory", Counter::Units::Percent, [=]() { + /* Percentage of cycles the L2 cache is stalled waiting for data from system memory. */ + /* Countables: + * PERFCOUNTER_GROUP_UCHE::COUNTABLE_1 = PERF_UCHE_STALL_CYCLES_ARBITER + * PERFCOUNTER_GROUP_RBBM::COUNTABLE_6 = PERF_RBBM_STATUS_MASKED + * Notes: + * - denominator has to be multiplied by four, for unknown reasons. + */ + return safe_div(PERF_UCHE_STALL_CYCLES_ARBITER, 4 * PERF_RBBM_US_STATUS_MASKED); + } + ); + counter("% Texture Fetch Stall", Counter::Units::Percent, [=]() { + /* Percentage of clock cycles where the shader processors cannot make any more requests for texture data. */ + /* Countables: + * PERFCOUNTER_GROUP_SP::COUNTABLE_4 = PERF_SP_STALL_CYCLES_TP + * PERFCOUNTER_GROUP_BV_SP::COUNTABLE_4 = PERF_SP_STALL_CYCLES_TP + * PERFCOUNTER_GROUP_RBBM::COUNTABLE_6 = PERF_RBBM_STATUS_MASKED + */ + return percent(cbSum(PERF_SP_STALL_CYCLES_TP), number_of_usptp * PERF_RBBM_US_STATUS_MASKED); + } + ); + counter("% Texture L1 Miss", Counter::Units::Percent, [=]() { + /* Number of L1 texture cache misses divided by L1 texture cache requests. */ + /* Countables: + * PERFCOUNTER_GROUP_TP::COUNTABLE_6 = PERF_TP_L1_CACHELINE_REQUESTS + * PERFCOUNTER_GROUP_TP::COUNTABLE_7 = PERF_TP_L1_CACHELINE_MISSES + * PERFCOUNTER_GROUP_BV_TP::COUNTABLE_6 = PERF_TP_L1_CACHELINE_REQUESTS + * PERFCOUNTER_GROUP_BV_TP::COUNTABLE_7 = PERF_TP_L1_CACHELINE_MISSES + */ + return percent(cbSum(PERF_TP_L1_CACHELINE_MISSES), cbSum(PERF_TP_L1_CACHELINE_REQUESTS)); + } + ); + counter("% Texture L2 Miss", Counter::Units::Percent, [=]() { + /* Number of L2 texture cache misses divided by L2 texture cache requests. 
*/ + /* Countables: + * PERFCOUNTER_GROUP_UCHE::COUNTABLE_4 = PERF_UCHE_VBIF_READ_BEATS_TP + * PERFCOUNTER_GROUP_UCHE::COUNTABLE_9 = PERF_UCHE_READ_REQUESTS_TP + * Notes: + * - ratio has to be multiplied by two. Unsure how this constant comes up. + */ + return percent(2 * PERF_UCHE_VBIF_READ_BEATS_TP, PERF_UCHE_READ_REQUESTS_TP); + } + ); + counter("% Vertex Fetch Stall", Counter::Units::Percent, [=]() { + /* Percentage of clock cycles where the GPU cannot make any more requests for vertex data. */ + /* Countables: + * PERFCOUNTER_GROUP_PC::COUNTABLE_2 = PERF_PC_STALL_CYCLES_VFD + * PERFCOUNTER_GROUP_BV_PC::COUNTABLE_2 = PERF_PC_STALL_CYCLES_VFD + * PERFCOUNTER_GROUP_RBBM::COUNTABLE_6 = PERF_RBBM_STATUS_MASKED + */ + return percent(cbSum(PERF_PC_S_STALL_CYCLES_VFD), PERF_RBBM_US_STATUS_MASKED); + } + ); + + counter("% LRZ Pixel Killed", Counter::Units::Percent, [=]() { + return percent(PERF_LRZ_TOTAL_PIXEL - PERF_LRZ_VISIBLE_PIXEL_AFTER_LRZ, + PERF_LRZ_TOTAL_PIXEL); + }); + + counter("LRZ Primitives Killed", Counter::Units::None, [=]() { + return PERF_LRZ_PRIM_KILLED_BY_LRZ; + }); + + counter("LRZ Tiles Killed", Counter::Units::None, [=]() { + return PERF_LRZ_TILE_KILLED; + }); +} + +} // namespace pps diff --git a/src/freedreno/ds/fd_pps_driver.cc b/src/freedreno/ds/fd_pps_driver.cc index 5c3c5675409..2da6073fb31 100644 --- a/src/freedreno/ds/fd_pps_driver.cc +++ b/src/freedreno/ds/fd_pps_driver.cc @@ -262,6 +262,9 @@ FreedrenoDriver::init_perfcnt() case 7: setup_a7xx_counters(); break; + case 8: + setup_a8xx_counters(); + break; default: PERFETTO_FATAL("Unsupported GPU: a%03u", fd_dev_gpu_id(dev_id)); return false; diff --git a/src/freedreno/ds/fd_pps_driver.h b/src/freedreno/ds/fd_pps_driver.h index 4da5cb6d808..b552c5fbda3 100644 --- a/src/freedreno/ds/fd_pps_driver.h +++ b/src/freedreno/ds/fd_pps_driver.h @@ -92,6 +92,7 @@ private: void setup_a6xx_counters(); void setup_a7xx_counters(); + void setup_a8xx_counters(); void configure_counters(bool reset, bool 
wait); void collect_countables(); diff --git a/src/freedreno/ds/meson.build b/src/freedreno/ds/meson.build index f569311c4a9..13211f0e27c 100644 --- a/src/freedreno/ds/meson.build +++ b/src/freedreno/ds/meson.build @@ -7,6 +7,7 @@ pps_freedreno_lib = static_library( sources: [ 'fd_pps_a6xx.cc', 'fd_pps_a7xx.cc', + 'fd_pps_a8xx.cc', 'fd_pps_driver.cc', 'fd_pps_driver.h', freedreno_xml_header_files, From cc743feb525c2e8ae1f39c5c40358d932bd853bd Mon Sep 17 00:00:00 2001 From: Rob Clark Date: Fri, 1 May 2026 11:07:05 -0700 Subject: [PATCH 10/21] freedreno/perfcntrs: Add helpers to resolve group and countable We were duplicating this in a few places. Add helpers instead. Signed-off-by: Rob Clark --- src/freedreno/computerator/main.cc | 9 +++---- src/freedreno/ds/fd_pps_driver.cc | 7 +++-- src/freedreno/perfcntrs/freedreno_perfcntr.h | 27 ++++++++++++++++++++ src/freedreno/vulkan/tu_autotune.cc | 24 +++-------------- 4 files changed, 36 insertions(+), 31 deletions(-) diff --git a/src/freedreno/computerator/main.cc b/src/freedreno/computerator/main.cc index 31c839af1df..f33d136f2ee 100644 --- a/src/freedreno/computerator/main.cc +++ b/src/freedreno/computerator/main.cc @@ -134,13 +134,10 @@ setup_counter(const char *name, struct perfcntr *c) { for (int i = 0; i < num_groups; i++) { const struct fd_perfcntr_group *group = &groups[i]; + const struct fd_perfcntr_countable *countable = + fd_perfcntrs_countable(group, name); - for (int j = 0; j < group->num_countables; j++) { - const struct fd_perfcntr_countable *countable = &group->countables[j]; - - if (strcmp(name, countable->name) != 0) - continue; - + if (countable) { /* * Allocate a counter to use to monitor the requested countable: */ diff --git a/src/freedreno/ds/fd_pps_driver.cc b/src/freedreno/ds/fd_pps_driver.cc index 2da6073fb31..8d73813ae41 100644 --- a/src/freedreno/ds/fd_pps_driver.cc +++ b/src/freedreno/ds/fd_pps_driver.cc @@ -539,11 +539,10 @@ FreedrenoDriver::Countable::resolve() const if (group != g->name) 
continue; - for (unsigned j = 0; j < g->num_countables; j++) { - const struct fd_perfcntr_countable *c = &g->countables[j]; - if (name != c->name) - continue; + const struct fd_perfcntr_countable *c = + fd_perfcntrs_countable(g, name.c_str()); + if (c) { d->state[id].countable = c; /* Assign counters from high to low to reduce conflicts with UMD-owned diff --git a/src/freedreno/perfcntrs/freedreno_perfcntr.h b/src/freedreno/perfcntrs/freedreno_perfcntr.h index a0b6e99acde..5ab0630abf5 100644 --- a/src/freedreno/perfcntrs/freedreno_perfcntr.h +++ b/src/freedreno/perfcntrs/freedreno_perfcntr.h @@ -89,6 +89,33 @@ const struct fd_perfcntr_group *fd_perfcntrs(const struct fd_dev_id *id, unsigne .countables = _countables, \ } +static inline const struct fd_perfcntr_group * +fd_perfcntrs_group(const struct fd_dev_id *id, const char *name) +{ + const struct fd_perfcntr_group *groups; + unsigned count; + + groups = fd_perfcntrs(id, &count); + if (!groups) + return NULL; + + for (unsigned i = 0; i < count; i++) + if (!strcmp(groups[i].name, name)) + return &groups[i]; + + return NULL; +} + +static inline const struct fd_perfcntr_countable * +fd_perfcntrs_countable(const struct fd_perfcntr_group *group, const char *name) +{ + for (unsigned i = 0; i < group->num_countables; i++) + if (!strcmp(group->countables[i].name, name)) + return &group->countables[i]; + + return NULL; +} + #define FD_DERIVED_COUNTER_MAX_PERFCNTRS 8 struct fd_derivation_context { diff --git a/src/freedreno/vulkan/tu_autotune.cc b/src/freedreno/vulkan/tu_autotune.cc index aa6c7f816dd..421cd26c3d1 100644 --- a/src/freedreno/vulkan/tu_autotune.cc +++ b/src/freedreno/vulkan/tu_autotune.cc @@ -1633,31 +1633,13 @@ tu_autotune::tu_autotune(struct tu_device *device, VkResult &result) tu_bo_suballocator_init(&suballoc, device, 128 * 1024, TU_BO_ALLOC_INTERNAL_RESOURCE, "autotune_suballoc"); if (supports_preempt_latency_tracking()) { - uint32_t group_count; - const struct fd_perfcntr_group *groups = 
fd_perfcntrs(&device->physical_device->dev_id, &group_count); const char *fail_reason = nullptr; - const fd_perfcntr_group *cp_group = nullptr; - for (uint32_t i = 0; i < group_count; i++) { - if (strcmp(groups[i].name, "CP") == 0) { - cp_group = &groups[i]; - break; - } - } + const fd_perfcntr_group *cp_group = fd_perfcntrs_group(&device->physical_device->dev_id, "CP"); if (cp_group) { - auto get_perfcntr_countable = [](const struct fd_perfcntr_group *group, - const char *name) -> const struct fd_perfcntr_countable * { - for (uint32_t i = 0; i < group->num_countables; i++) { - if (strcmp(group->countables[i].name, name) == 0) - return &group->countables[i]; - } - - return nullptr; - }; - - auto preemption_latency_countable = get_perfcntr_countable(cp_group, "PERF_CP_PREEMPTION_REACTION_DELAY"); - auto always_count_countable = get_perfcntr_countable(cp_group, "PERF_CP_ALWAYS_COUNT"); + auto preemption_latency_countable = fd_perfcntrs_countable(cp_group, "PERF_CP_PREEMPTION_REACTION_DELAY"); + auto always_count_countable = fd_perfcntrs_countable(cp_group, "PERF_CP_ALWAYS_COUNT"); if (preemption_latency_countable && always_count_countable) { if (cp_group->num_counters >= 2) { preemption_latency_selector_reg = cp_group->counters[0].select_reg; From 74cfe319b7e9ee41fd774ea967805cd779dc07f6 Mon Sep 17 00:00:00 2001 From: Rob Clark Date: Wed, 29 Apr 2026 15:32:59 -0700 Subject: [PATCH 11/21] freedreno/perfcntrs: Add helper to assign counters Add a helper to allocate a counter for a requested countable, and (if supported by KMD) do the PERFCNTR_CONFIG ioctl to reserve the counter for UMD local (inline) usage. 
Signed-off-by: Rob Clark --- src/freedreno/perfcntrs/freedreno_perfcntr.c | 255 +++++++++++++++++++ src/freedreno/perfcntrs/freedreno_perfcntr.h | 15 ++ src/freedreno/perfcntrs/meson.build | 6 +- 3 files changed, 275 insertions(+), 1 deletion(-) diff --git a/src/freedreno/perfcntrs/freedreno_perfcntr.c b/src/freedreno/perfcntrs/freedreno_perfcntr.c index aa984903a6b..99239058295 100644 --- a/src/freedreno/perfcntrs/freedreno_perfcntr.c +++ b/src/freedreno/perfcntrs/freedreno_perfcntr.c @@ -7,6 +7,15 @@ */ #include +#include + +#include "util/hash_table.h" +#include "util/ralloc.h" + +#include "drm-uapi/msm_drm.h" +#include "util/bitset.h" +#include "util/simple_mtx.h" +#include "freedreno_common.h" #include "freedreno_perfcntr.h" @@ -47,6 +56,252 @@ fd_perfcntrs(const struct fd_dev_id *id, unsigned *count) } } +struct fd_perfcntr_counter_state { + int group; + int counter; + int countable; + unsigned nr_users; +}; + +#define MAX_COUNTERS_PER_GROUP 32 +typedef BITSET_DECLARE(assigned_counters_t, MAX_COUNTERS_PER_GROUP); + +/** + * Helper to manage assigning counters, tracking if there are multiple users + * for the same countable (to avoid assigning duplicate counters for the + * same countable, etc) + */ +struct fd_perfcntr_state { + simple_mtx_t lock; + int fd; + const struct fd_dev_id *id; + + unsigned nr_groups; + const struct fd_perfcntr_group *groups; + + struct drm_msm_perfcntr_group *group_configs; + struct drm_msm_perfcntr_config config; + + /* bitmask of assigned counters per group: */ + assigned_counters_t *assigned_counters; + + /* maps counter to fd_perfcntr_counter_state: */ + struct hash_table *counter_state; +}; + +static int +update_reserved_counters(struct fd_perfcntr_state *perfcntrs) +{ + /* If no kernel support, just carry on and assume we can use all counters: */ + if (perfcntrs->fd < 0) + return 0; + + return drmIoctl(perfcntrs->fd, DRM_IOCTL_MSM_PERFCNTR_CONFIG, &perfcntrs->config); +} + +static int +update_group_counters(struct 
fd_perfcntr_state *perfcntrs, int group_idx)
+{
+ int ret = 0;
+
+ /* Update reserved config with kernel if it changes. We might not
+ * be assigning/releasing the last counter (and we cannot feasibly
+ * re-map existing assigned counters to compact away gaps in the
+ * used counters, as cmdstream might already
+ * be built encoding the other assigned counters), but if we do
+ * let the kernel know:
+ */
+ unsigned nr = BITSET_LAST_BIT(perfcntrs->assigned_counters[group_idx]);
+ if (nr != perfcntrs->group_configs[group_idx].nr_countables) {
+ perfcntrs->group_configs[group_idx].nr_countables = nr;
+ ret = update_reserved_counters(perfcntrs);
+ }
+
+ return ret;
+}
+
+struct fd_perfcntr_state *
+fd_perfcntr_state_alloc(const struct fd_dev_id *id, int fd)
+{
+ const struct fd_perfcntr_group *groups;
+ unsigned nr_groups;
+
+ groups = fd_perfcntrs(id, &nr_groups);
+ if (!groups)
+ return NULL;
+
+ struct fd_perfcntr_state *perfcntrs = rzalloc(NULL, struct fd_perfcntr_state);
+
+ simple_mtx_init(&perfcntrs->lock, mtx_plain);
+ perfcntrs->fd = fd;
+ perfcntrs->id = id;
+ perfcntrs->nr_groups = nr_groups;
+ perfcntrs->groups = groups;
+ perfcntrs->group_configs =
+ rzalloc_array(perfcntrs, struct drm_msm_perfcntr_group, nr_groups);
+
+ for (unsigned i = 0; i < nr_groups; i++) {
+ assert(strlen(groups[i].name) < sizeof(perfcntrs->group_configs[i].group_name));
+ strcpy(perfcntrs->group_configs[i].group_name, groups[i].name);
+ }
+
+ perfcntrs->config = (struct drm_msm_perfcntr_config) {
+ .nr_groups = nr_groups,
+ .groups = VOID2U64(perfcntrs->group_configs),
+ .group_stride = sizeof(struct drm_msm_perfcntr_group),
+ };
+
+ perfcntrs->assigned_counters = rzalloc_array(perfcntrs, assigned_counters_t, nr_groups);
+ perfcntrs->counter_state = _mesa_pointer_hash_table_create(perfcntrs);
+
+ /* Probe for kernel PERFCNTR_CONFIG support with empty config: */
+ if (update_reserved_counters(perfcntrs))
+ perfcntrs->fd = -1;
+
+ return perfcntrs;
+}
+
+void 
+fd_perfcntr_state_free(struct fd_perfcntr_state *perfcntrs) +{ + if (!perfcntrs) + return; + + perfcntrs->config.nr_groups = 0; + update_reserved_counters(perfcntrs); + ralloc_free(perfcntrs); +} + +/** + * Does KMD support perfcntr reservation (ie. PERFCNTR_CONFIG) + */ +bool +fd_perfcntr_has_reservation(struct fd_perfcntr_state *perfcntrs) +{ + return perfcntrs->fd >= 0; +} + +static int +find_group_idx(struct fd_perfcntr_state *perfcntrs, + const struct fd_perfcntr_group *group) +{ + for (unsigned i = 0; i < perfcntrs->nr_groups; i++) + if (&perfcntrs->groups[i] == group) + return i; + UNREACHABLE("invalid group"); +} + +static int +find_countable_idx(const struct fd_perfcntr_group *group, + const struct fd_perfcntr_countable *countable) +{ + for (unsigned i = 0; i < group->num_countables; i++) + if (&group->countables[i] == countable) + return i; + UNREACHABLE("invalid countable"); +} + +const struct fd_perfcntr_counter * +fd_perfcntr_reserve(struct fd_perfcntr_state *perfcntrs, + const struct fd_perfcntr_group *group, + const struct fd_perfcntr_countable *countable) +{ + struct fd_perfcntr_counter_state *state = NULL; + int c, g = find_group_idx(perfcntrs, group); + + simple_mtx_lock(&perfcntrs->lock); + + /* Check if requested countable is already configured: */ + BITSET_FOREACH_SET (c, perfcntrs->assigned_counters[g], MAX_COUNTERS_PER_GROUP) { + struct hash_entry *e = + _mesa_hash_table_search(perfcntrs->counter_state, &group->counters[c]); + + assert(e); + struct fd_perfcntr_counter_state *s = e->data; + + if (&group->countables[s->countable] == countable) { + state = s; + break; + } + } + + /* If we didn't find a counter assigned to this countable, assign a new one: */ + if (!state) { + assigned_counters_t *assigned_counters = &perfcntrs->assigned_counters[g]; + + /* Pick lowest #ed unassigned counter: */ + assigned_counters_t free_counters; + memcpy(free_counters, *assigned_counters, sizeof(free_counters)); + BITSET_NOT(free_counters); + + c = 
BITSET_FFS(free_counters) - 1; + assert(c >= 0); + + if (c < group->num_counters) { + state = rzalloc(perfcntrs, struct fd_perfcntr_counter_state); + state->group = g; + state->counter = c; + state->countable = find_countable_idx(group, countable); + + assert(!BITSET_TEST(*assigned_counters, state->counter)); + + BITSET_SET(*assigned_counters, state->counter); + + if (update_group_counters(perfcntrs, state->group)) { + BITSET_CLEAR(*assigned_counters, state->counter); + ralloc_free(state); + state = NULL; + } else { + _mesa_hash_table_insert(perfcntrs->counter_state, + &group->counters[state->counter], + state); + } + } + } + + if (state) + state->nr_users++; + + simple_mtx_unlock(&perfcntrs->lock); + + if (!state) + return NULL; + + return &group->counters[state->counter]; +} + +void +fd_perfcntr_release(struct fd_perfcntr_state *perfcntrs, + const struct fd_perfcntr_counter *counter) +{ + if (!counter) + return; + + simple_mtx_lock(&perfcntrs->lock); + struct hash_entry *e = _mesa_hash_table_search(perfcntrs->counter_state, counter); + if (e) { + struct fd_perfcntr_counter_state *state = e->data; + + assert(state->nr_users > 0); + + if (--state->nr_users == 0) { + /* dropping last user of the counter: */ + _mesa_hash_table_remove(perfcntrs->counter_state, e); + + assigned_counters_t *assigned_counters = + &perfcntrs->assigned_counters[state->group]; + + assert(BITSET_TEST(*assigned_counters, state->counter)); + + BITSET_CLEAR(*assigned_counters, state->counter); + update_group_counters(perfcntrs, state->group); + + ralloc_free(state); + } + } + simple_mtx_unlock(&perfcntrs->lock); +} + extern const struct fd_derived_counter *a7xx_derived_counters[]; extern const unsigned a7xx_num_derived_counters; diff --git a/src/freedreno/perfcntrs/freedreno_perfcntr.h b/src/freedreno/perfcntrs/freedreno_perfcntr.h index 5ab0630abf5..5b109b038e0 100644 --- a/src/freedreno/perfcntrs/freedreno_perfcntr.h +++ b/src/freedreno/perfcntrs/freedreno_perfcntr.h @@ -116,6 +116,21 @@ 
fd_perfcntrs_countable(const struct fd_perfcntr_group *group, const char *name) return NULL; } +struct fd_perfcntr_state; + +struct fd_perfcntr_state * +fd_perfcntr_state_alloc(const struct fd_dev_id *id, int fd); +void fd_perfcntr_state_free(struct fd_perfcntr_state *perfcntrs); + +bool fd_perfcntr_has_reservation(struct fd_perfcntr_state *perfcntrs); + +const struct fd_perfcntr_counter * +fd_perfcntr_reserve(struct fd_perfcntr_state *perfcntrs, + const struct fd_perfcntr_group *group, + const struct fd_perfcntr_countable *countable); +void fd_perfcntr_release(struct fd_perfcntr_state *perfcntrs, + const struct fd_perfcntr_counter *counter); + #define FD_DERIVED_COUNTER_MAX_PERFCNTRS 8 struct fd_derivation_context { diff --git a/src/freedreno/perfcntrs/meson.build b/src/freedreno/perfcntrs/meson.build index 5caa91fbf30..61d39e8afa4 100644 --- a/src/freedreno/perfcntrs/meson.build +++ b/src/freedreno/perfcntrs/meson.build @@ -20,7 +20,11 @@ libfreedreno_perfcntrs = static_library( c_args : [no_override_init_args], gnu_symbol_visibility : 'hidden', link_with : [libfreedreno_common], - dependencies : idep_nir_headers, + dependencies : [ + dep_libdrm, + idep_mesautil, + idep_nir_headers, + ], build_by_default : false, ) From cb27d2e1b26650043a41f3e6650d7b2fa27819c6 Mon Sep 17 00:00:00 2001 From: Rob Clark Date: Thu, 30 Apr 2026 10:20:52 -0700 Subject: [PATCH 12/21] tu: Use counter allocation helper Signed-off-by: Rob Clark --- src/freedreno/vulkan/tu_autotune.cc | 38 +++++----- src/freedreno/vulkan/tu_autotune.h | 10 +-- src/freedreno/vulkan/tu_device.cc | 8 ++ src/freedreno/vulkan/tu_device.h | 3 + src/freedreno/vulkan/tu_query_pool.cc | 102 ++++++++++---------------- src/freedreno/vulkan/tu_query_pool.h | 6 +- 6 files changed, 74 insertions(+), 93 deletions(-) diff --git a/src/freedreno/vulkan/tu_autotune.cc b/src/freedreno/vulkan/tu_autotune.cc index 421cd26c3d1..700ba8b7cf1 100644 --- a/src/freedreno/vulkan/tu_autotune.cc +++ b/src/freedreno/vulkan/tu_autotune.cc 
@@ -1641,15 +1641,12 @@ tu_autotune::tu_autotune(struct tu_device *device, VkResult &result) auto preemption_latency_countable = fd_perfcntrs_countable(cp_group, "PERF_CP_PREEMPTION_REACTION_DELAY"); auto always_count_countable = fd_perfcntrs_countable(cp_group, "PERF_CP_ALWAYS_COUNT"); if (preemption_latency_countable && always_count_countable) { - if (cp_group->num_counters >= 2) { - preemption_latency_selector_reg = cp_group->counters[0].select_reg; - preemption_latency_selector = preemption_latency_countable->selector; - preemption_latency_counter_reg_lo = cp_group->counters[0].counter_reg_lo; + preemption_latency_counter = + fd_perfcntr_reserve(device->perfcntrs, cp_group, preemption_latency_countable); + always_count_counter = + fd_perfcntr_reserve(device->perfcntrs, cp_group, always_count_countable); - always_count_selector_reg = cp_group->counters[1].select_reg; - always_count_selector = always_count_countable->selector; - always_count_counter_reg_lo = cp_group->counters[1].counter_reg_lo; - } else { + if (!preemption_latency_counter || !always_count_counter) { fail_reason = "not enough counters in CP group for preemption latency tracking"; } } else { @@ -1681,6 +1678,9 @@ tu_autotune::~tu_autotune() } tu_bo_suballocator_finish(&suballoc); + + fd_perfcntr_release(device->perfcntrs, preemption_latency_counter); + fd_perfcntr_release(device->perfcntrs, always_count_counter); } tu_autotune::cmd_buf_ctx::cmd_buf_ctx(struct tu_autotune &autotune): batch(autotune.create_batch()) @@ -1934,22 +1934,22 @@ tu_autotune::write_preempt_counters_to_iova(struct tu_cs *cs, uint64_t aon_iova) const { if (emit_selector) { - tu_cs_emit_pkt4(cs, preemption_latency_selector_reg, 1); - tu_cs_emit(cs, preemption_latency_selector); + tu_cs_emit_pkt4(cs, preemption_latency_counter->select_reg, 1); + tu_cs_emit(cs, preemption_latency_countable->selector); - tu_cs_emit_pkt4(cs, always_count_selector_reg, 1); - tu_cs_emit(cs, always_count_selector); + tu_cs_emit_pkt4(cs, 
always_count_counter->select_reg, 1); + tu_cs_emit(cs, always_count_countable->selector); } if (emit_wfi) tu_cs_emit_wfi(cs); tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3); - tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(preemption_latency_counter_reg_lo) | CP_REG_TO_MEM_0_64B); + tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(preemption_latency_counter->counter_reg_lo) | CP_REG_TO_MEM_0_64B); tu_cs_emit_qw(cs, latency_iova); tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3); - tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(always_count_counter_reg_lo) | CP_REG_TO_MEM_0_64B); + tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(always_count_counter->counter_reg_lo) | CP_REG_TO_MEM_0_64B); tu_cs_emit_qw(cs, always_count_iova); tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3); @@ -2042,11 +2042,11 @@ tu_autotune::emit_switch_away_amble(struct tu_cs *cs) const static size_t counter = 0; if (counter++ % 2 == 0) { - tu_cs_emit_pkt4(cs, preemption_latency_selector_reg, 1); - tu_cs_emit(cs, preemption_latency_selector); + tu_cs_emit_pkt4(cs, preemption_latency_counter->select_reg, 1); + tu_cs_emit(cs, preemption_latency_countable->selector); - tu_cs_emit_pkt4(cs, always_count_selector_reg, 1); - tu_cs_emit(cs, always_count_selector); + tu_cs_emit_pkt4(cs, always_count_counter->select_reg, 1); + tu_cs_emit(cs, always_count_countable->selector); } tu_cond_exec_end(cs); @@ -2213,4 +2213,4 @@ tu_autotune::emit_preempt_latency_tracking_rp_hash(struct tu_cmd_buffer *cmd) tu_cs_emit_draw_state(&cmd->cs, TU_DRAW_STATE_AT_WRITE_RP_HASH, ds); return rp_key; -} \ No newline at end of file +} diff --git a/src/freedreno/vulkan/tu_autotune.h b/src/freedreno/vulkan/tu_autotune.h index 55e579ae93a..8bf231edd04 100644 --- a/src/freedreno/vulkan/tu_autotune.h +++ b/src/freedreno/vulkan/tu_autotune.h @@ -242,13 +242,11 @@ struct tu_autotune { std::mutex rp_latency_mutex; /* Protects rp_latency_tracking */ uint64_t last_latency_cleanup_ts = 0; - uint32_t preemption_latency_selector_reg; - uint32_t preemption_latency_selector; - uint32_t preemption_latency_counter_reg_lo; 
+ const struct fd_perfcntr_counter *preemption_latency_counter; + const struct fd_perfcntr_countable *preemption_latency_countable; - uint32_t always_count_selector_reg; - uint32_t always_count_selector; - uint32_t always_count_counter_reg_lo; + const struct fd_perfcntr_counter *always_count_counter; + const struct fd_perfcntr_countable *always_count_countable; struct tu_draw_state reset_rp_hash_draw_state; diff --git a/src/freedreno/vulkan/tu_device.cc b/src/freedreno/vulkan/tu_device.cc index 2e755551dca..ec79b9790b6 100644 --- a/src/freedreno/vulkan/tu_device.cc +++ b/src/freedreno/vulkan/tu_device.cc @@ -11,6 +11,7 @@ #include "drm-uapi/drm_fourcc.h" #include "git_sha1.h" +#include "perfcntrs/freedreno_perfcntr.h" #include "common/freedreno_stompable_regs.h" /* for fd_get_driver/device_uuid() */ @@ -3081,6 +3082,10 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice, } } + device->perfcntrs = fd_perfcntr_state_alloc( + &physical_device->dev_id, + is_kgsl(physical_device->instance) ? -1 : device->fd); + device->autotune = new tu_autotune(device, result); if (result != VK_SUCCESS) goto fail_autotune; @@ -3181,6 +3186,7 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice, fail_timeline_cond: fail_a725_workaround: fail_autotune: + fd_perfcntr_state_free(device->perfcntrs); delete device->autotune; fail_bin_preamble: fail_prepare_perfcntrs_pass_cs: @@ -3287,6 +3293,8 @@ tu_DestroyDevice(VkDevice _device, const VkAllocationCallbacks *pAllocator) delete device->autotune; + fd_perfcntr_state_free(device->perfcntrs); + tu_bo_suballocator_finish(&device->pipeline_suballoc); tu_bo_suballocator_finish(&device->kgsl_profiling_suballoc); tu_bo_suballocator_finish(&device->event_suballoc); diff --git a/src/freedreno/vulkan/tu_device.h b/src/freedreno/vulkan/tu_device.h index c9f521fcc15..b65b3d5b4e7 100644 --- a/src/freedreno/vulkan/tu_device.h +++ b/src/freedreno/vulkan/tu_device.h @@ -11,6 +11,7 @@ #define TU_DEVICE_H #include "tu_common.h" +#include 
"perfcntrs/freedreno_perfcntr.h" #include "radix_sort/radix_sort_vk.h" #include "util/rwlock.h" @@ -486,6 +487,8 @@ struct tu_device pthread_cond_t timeline_cond; pthread_mutex_t submit_mutex; + struct fd_perfcntr_state *perfcntrs; + struct tu_autotune *autotune; struct breadcrumbs_context *breadcrumbs_ctx; diff --git a/src/freedreno/vulkan/tu_query_pool.cc b/src/freedreno/vulkan/tu_query_pool.cc index 3d0851de7c4..e3f6b9fae4b 100644 --- a/src/freedreno/vulkan/tu_query_pool.cc +++ b/src/freedreno/vulkan/tu_query_pool.cc @@ -7,6 +7,7 @@ */ #include "tu_query_pool.h" +#include "perfcntrs/freedreno_perfcntr.h" #include @@ -249,21 +250,6 @@ perfcntr_index(const struct fd_perfcntr_group *group, uint32_t group_count, assert(i < group_count); } -static uint32_t -perfcntr_reserved_counters(const struct fd_perfcntr_group *group) -{ - /* Keep raw perf queries off the CP slots reserved by autotune latency optimization. - * TODO: We need to do this in a more robust way. - */ - return strcmp(group->name, "CP") == 0 ? 
2 : 0; -} - -static uint32_t -perfcntr_available_counters(const struct fd_perfcntr_group *group) -{ - return group->num_counters - MIN2(group->num_counters, perfcntr_reserved_counters(group)); -} - static int compare_perfcntr_pass(const void *a, const void *b) { @@ -271,6 +257,22 @@ compare_perfcntr_pass(const void *a, const void *b) ((struct tu_perf_query_raw_data *)b)->pass; } +static void +tu_query_pool_destroy(struct tu_device *device, struct tu_query_pool *pool, + const VkAllocationCallbacks *pAllocator) +{ + if (is_perf_query_raw(pool)) { + struct tu_perf_query_raw *perf_query = &pool->perf_query.raw; + + for (uint32_t i = 0; i < perf_query->counter_index_count; i++) + fd_perfcntr_release(device->perfcntrs, perf_query->data[i].counter); + } + + if (pool->bo) + tu_bo_finish(device, pool->bo); + vk_query_pool_destroy(&device->vk, pAllocator, &pool->vk); +} + VKAPI_ATTR VkResult VKAPI_CALL tu_CreateQueryPool(VkDevice _device, const VkQueryPoolCreateInfo *pCreateInfo, @@ -353,50 +355,26 @@ tu_CreateQueryPool(VkDevice _device, perf_query->counter_index_count = perf_query_info->counterIndexCount; - /* Build all perf counters data that is requested, so we could get - * correct group id, countable id, counter register and pass index with - * only a counter index provided by applications at each command submit. - * - * Also, since this built data will be sorted by pass index later, we - * should keep the original indices and store perfcntrs results according - * to them so apps can get correct results with their own indices. 
- */ - uint32_t regs[perf_query->perf_group_count], pass[perf_query->perf_group_count]; - memset(regs, 0x00, perf_query->perf_group_count * sizeof(regs[0])); - memset(pass, 0x00, perf_query->perf_group_count * sizeof(pass[0])); - for (uint32_t i = 0; i < perf_query->counter_index_count; i++) { uint32_t gid = 0, cid = 0; perfcntr_index(perf_query->perf_group, perf_query->perf_group_count, perf_query_info->pCounterIndices[i], &gid, &cid); - perf_query->data[i].gid = gid; - perf_query->data[i].cid = cid; perf_query->data[i].app_idx = i; const struct fd_perfcntr_group *group = &perf_query->perf_group[gid]; - uint32_t reserved_counters = perfcntr_reserved_counters(group); - uint32_t available_counters = perfcntr_available_counters(group); + const struct fd_perfcntr_countable *countable = &group->countables[cid]; - if (available_counters == 0) { - vk_query_pool_destroy(&device->vk, pAllocator, &pool->vk); + perf_query->data[i].countable = countable; + perf_query->data[i].counter = + fd_perfcntr_reserve(device->perfcntrs, group, countable); + + if (!perf_query->data[i].counter) { + tu_query_pool_destroy(device, pool, pAllocator); return vk_errorf(device, VK_ERROR_FEATURE_NOT_PRESENT, "No raw perf counters available in group %s", group->name); } - - /* When a counter register is over the capacity(num_counters), - * reset it for next pass. 
- */ - if (regs[gid] < available_counters) { - perf_query->data[i].cntr_reg = reserved_counters + regs[gid]++; - perf_query->data[i].pass = pass[gid]; - } else { - perf_query->data[i].pass = ++pass[gid]; - perf_query->data[i].cntr_reg = reserved_counters; - regs[gid] = 0; - regs[gid]++; - } } /* Sort by pass index so we could easily prepare a command stream @@ -429,14 +407,13 @@ tu_CreateQueryPool(VkDevice _device, VkResult result = tu_bo_init_new_cached(device, &pool->vk.base, &pool->bo, pCreateInfo->queryCount * slot_size, TU_BO_ALLOC_NO_FLAGS, "query pool"); if (result != VK_SUCCESS) { - vk_query_pool_destroy(&device->vk, pAllocator, &pool->vk); + tu_query_pool_destroy(device, pool, pAllocator); return result; } result = tu_bo_map(device, pool->bo, NULL); if (result != VK_SUCCESS) { - tu_bo_finish(device, pool->bo); - vk_query_pool_destroy(&device->vk, pAllocator, &pool->vk); + tu_query_pool_destroy(device, pool, pAllocator); return result; } @@ -463,8 +440,7 @@ tu_DestroyQueryPool(VkDevice _device, TU_RMV(resource_destroy, device, pool); - tu_bo_finish(device, pool->bo); - vk_query_pool_destroy(&device->vk, pAllocator, &pool->vk); + tu_query_pool_destroy(device, pool, pAllocator); } static uint32_t @@ -1276,13 +1252,8 @@ emit_begin_perf_query_raw(struct tu_cmd_buffer *cmdbuf, emit_perfcntrs_pass_start(has_pred_bit, cs, data->pass); } - const struct fd_perfcntr_counter *counter = - &perf_query->perf_group[data->gid].counters[data->cntr_reg]; - const struct fd_perfcntr_countable *countable = - &perf_query->perf_group[data->gid].countables[data->cid]; - - tu_cs_emit_pkt4(cs, counter->select_reg, 1); - tu_cs_emit(cs, countable->selector); + tu_cs_emit_pkt4(cs, data->counter->select_reg, 1); + tu_cs_emit(cs, data->countable->selector); } tu_cond_exec_end(cs); @@ -1300,8 +1271,7 @@ emit_begin_perf_query_raw(struct tu_cmd_buffer *cmdbuf, emit_perfcntrs_pass_start(has_pred_bit, cs, data->pass); } - const struct fd_perfcntr_counter *counter = - 
&perf_query->perf_group[data->gid].counters[data->cntr_reg]; + const struct fd_perfcntr_counter *counter = data->counter; uint64_t begin_iova = perf_query_iova(pool, query, begin, data->app_idx); @@ -1749,8 +1719,7 @@ emit_end_perf_query_raw(struct tu_cmd_buffer *cmdbuf, emit_perfcntrs_pass_start(has_pred_bit, cs, data->pass); } - const struct fd_perfcntr_counter *counter = - &perf_query->perf_group[data->gid].counters[data->cntr_reg]; + const struct fd_perfcntr_counter *counter = data->counter; end_iova = perf_query_iova(pool, query, end, data->app_idx); @@ -2317,9 +2286,12 @@ tu_GetPhysicalDeviceQueueFamilyPerformanceQueryPassesKHR( } for (uint32_t i = 0; i < group_count; i++) { - uint32_t available_counters = perfcntr_available_counters(&group[i]); - if (available_counters == 0) - continue; + /* Some counters may be unavailable at the time the query is + * created due to runtime factors (pps/fdperf using some counters, + * autotune or other queries, etc). But we don't know that up + * front. 
+ */ + uint32_t available_counters = group[i].num_counters; n_passes = DIV_ROUND_UP(counters_requested[i], available_counters); *pNumPasses = MAX2(*pNumPasses, n_passes); diff --git a/src/freedreno/vulkan/tu_query_pool.h b/src/freedreno/vulkan/tu_query_pool.h index b642934f130..b1c004fd484 100644 --- a/src/freedreno/vulkan/tu_query_pool.h +++ b/src/freedreno/vulkan/tu_query_pool.h @@ -11,6 +11,7 @@ #define TU_QUERY_POOL_H #include "tu_common.h" +#include "perfcntrs/freedreno_perfcntr.h" #include "vk_query_pool.h" @@ -24,9 +25,8 @@ enum tu_perf_query_type { struct tu_perf_query_raw_data { - uint32_t gid; /* group-id */ - uint32_t cid; /* countable-id within the group */ - uint32_t cntr_reg; /* counter register within the group */ + const struct fd_perfcntr_counter *counter; + const struct fd_perfcntr_countable *countable; uint32_t pass; /* pass index that countables can be requested */ uint32_t app_idx; /* index provided by apps */ }; From c642bc8c276e58f4c6d64ad4230a2a1c57366b54 Mon Sep 17 00:00:00 2001 From: Rob Clark Date: Fri, 1 May 2026 08:12:44 -0700 Subject: [PATCH 13/21] freedreno/a6xx: Use counter allocation helper If the kernel supports PERFCNTR_CONFIG for counter reservation, we can expose perfcntrs by default. 
Signed-off-by: Rob Clark --- .../drivers/freedreno/a6xx/fd6_query.cc | 55 ++++++++----------- .../drivers/freedreno/freedreno_query_acc.c | 3 + .../drivers/freedreno/freedreno_query_acc.h | 1 + .../drivers/freedreno/freedreno_screen.c | 7 ++- .../drivers/freedreno/freedreno_screen.h | 1 + 5 files changed, 35 insertions(+), 32 deletions(-) diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_query.cc b/src/gallium/drivers/freedreno/a6xx/fd6_query.cc index 223758fcf12..fa2d79c262c 100644 --- a/src/gallium/drivers/freedreno/a6xx/fd6_query.cc +++ b/src/gallium/drivers/freedreno/a6xx/fd6_query.cc @@ -824,6 +824,7 @@ static const struct fd_acc_sample_provider so_overflow_predicate = { struct fd_batch_query_entry { uint8_t gid; /* group-id */ uint8_t cid; /* countable-id within the group */ + const struct fd_perfcntr_counter *counter; }; struct fd_batch_query_data { @@ -839,33 +840,23 @@ perfcntr_resume(struct fd_acc_query *aq, struct fd_batch *batch) assert_dt struct fd_screen *screen = data->screen; fd_cs cs(batch->draw); - unsigned counters_per_group[screen->num_perfcntr_groups]; - memset(counters_per_group, 0, sizeof(counters_per_group)); - fd_pkt7(cs, CP_WAIT_FOR_IDLE, 0); /* configure performance counters for the requested queries: */ for (unsigned i = 0; i < data->num_query_entries; i++) { struct fd_batch_query_entry *entry = &data->query_entries[i]; const struct fd_perfcntr_group *g = &screen->perfcntr_groups[entry->gid]; - unsigned counter_idx = counters_per_group[entry->gid]++; - - assert(counter_idx < g->num_counters); fd_pkt4(cs, 1).add((fd_reg_pair){ - .reg = g->counters[counter_idx].select_reg, + .reg = entry->counter->select_reg, .value = g->countables[entry->cid].selector, }); } - memset(counters_per_group, 0, sizeof(counters_per_group)); - /* and snapshot the start values */ for (unsigned i = 0; i < data->num_query_entries; i++) { struct fd_batch_query_entry *entry = &data->query_entries[i]; - const struct fd_perfcntr_group *g = 
&screen->perfcntr_groups[entry->gid]; - unsigned counter_idx = counters_per_group[entry->gid]++; - const struct fd_perfcntr_counter *counter = &g->counters[counter_idx]; + const struct fd_perfcntr_counter *counter = entry->counter; fd_pkt7(cs, CP_REG_TO_MEM, 3) .add(CP_REG_TO_MEM_0(.reg = counter->counter_reg_lo, ._64b = true)) @@ -877,12 +868,8 @@ static void perfcntr_pause(struct fd_acc_query *aq, struct fd_batch *batch) assert_dt { struct fd_batch_query_data *data = (struct fd_batch_query_data *)aq->query_data; - struct fd_screen *screen = data->screen; fd_cs cs(batch->draw); - unsigned counters_per_group[screen->num_perfcntr_groups]; - memset(counters_per_group, 0, sizeof(counters_per_group)); - fd_pkt7(cs, CP_WAIT_FOR_IDLE, 0); /* TODO do we need to bother to turn anything off? */ @@ -890,9 +877,7 @@ perfcntr_pause(struct fd_acc_query *aq, struct fd_batch *batch) assert_dt /* snapshot the end values: */ for (unsigned i = 0; i < data->num_query_entries; i++) { struct fd_batch_query_entry *entry = &data->query_entries[i]; - const struct fd_perfcntr_group *g = &screen->perfcntr_groups[entry->gid]; - unsigned counter_idx = counters_per_group[entry->gid]++; - const struct fd_perfcntr_counter *counter = &g->counters[counter_idx]; + const struct fd_perfcntr_counter *counter = entry->counter; fd_pkt7(cs, CP_REG_TO_MEM, 3) .add(CP_REG_TO_MEM_0(.reg = counter->counter_reg_lo, ._64b = true)) @@ -925,12 +910,24 @@ perfcntr_accumulate_result(struct fd_acc_query *aq, } } +static void +perfcntr_cleanup(void *query_data) +{ + struct fd_batch_query_data *data = (struct fd_batch_query_data *)query_data; + + for (unsigned i = 0; i < data->num_query_entries; i++) { + struct fd_batch_query_entry *entry = &data->query_entries[i]; + fd_perfcntr_release(data->screen->perfcntrs, entry->counter); + } +} + static const struct fd_acc_sample_provider perfcntr = { .query_type = FD_QUERY_FIRST_PERFCNTR, .always = true, .resume = perfcntr_resume, .pause = perfcntr_pause, .result = 
perfcntr_accumulate_result, + .cleanup = perfcntr_cleanup, }; static struct pipe_query * @@ -949,13 +946,6 @@ fd6_create_batch_query(struct pipe_context *pctx, unsigned num_queries, data->screen = screen; data->num_query_entries = num_queries; - /* validate the requested query_types and ensure we don't try - * to request more query_types of a given group than we have - * counters: - */ - unsigned counters_per_group[screen->num_perfcntr_groups]; - memset(counters_per_group, 0, sizeof(counters_per_group)); - for (unsigned i = 0; i < num_queries; i++) { unsigned idx = query_types[i] - FD_QUERY_FIRST_PERFCNTR; @@ -985,13 +975,15 @@ fd6_create_batch_query(struct pipe_context *pctx, unsigned num_queries, entry->cid++; } - if (counters_per_group[entry->gid] >= - screen->perfcntr_groups[entry->gid].num_counters) { - mesa_loge("too many counters for group %u", entry->gid); + const struct fd_perfcntr_group *g = &screen->perfcntr_groups[entry->gid]; + const struct fd_perfcntr_countable *c = &g->countables[entry->cid]; + + entry->counter = fd_perfcntr_reserve(screen->perfcntrs, g, c); + + if (!entry->counter) { + mesa_loge("Could not reserve counter for %s.%s", g->name, c->name); goto error; } - - counters_per_group[entry->gid]++; } q = fd_acc_create_query2(ctx, 0, 0, &perfcntr); @@ -1004,6 +996,7 @@ fd6_create_batch_query(struct pipe_context *pctx, unsigned num_queries, return (struct pipe_query *)q; error: + perfcntr_cleanup(data); free(data); return NULL; } diff --git a/src/gallium/drivers/freedreno/freedreno_query_acc.c b/src/gallium/drivers/freedreno/freedreno_query_acc.c index 6af0e9697ad..51051c81b97 100644 --- a/src/gallium/drivers/freedreno/freedreno_query_acc.c +++ b/src/gallium/drivers/freedreno/freedreno_query_acc.c @@ -21,6 +21,9 @@ fd_acc_destroy_query(struct fd_context *ctx, struct fd_query *q) assert_dt DBG("%p", q); + if (aq->provider->cleanup) + aq->provider->cleanup(aq->query_data); + pipe_resource_reference(&aq->prsc, NULL); list_del(&aq->node); diff --git 
a/src/gallium/drivers/freedreno/freedreno_query_acc.h b/src/gallium/drivers/freedreno/freedreno_query_acc.h index f06511e2dd8..cc4daefd32e 100644 --- a/src/gallium/drivers/freedreno/freedreno_query_acc.h +++ b/src/gallium/drivers/freedreno/freedreno_query_acc.h @@ -72,6 +72,7 @@ struct fd_acc_sample_provider { void (*result_resource)(struct fd_acc_query *aq, struct fd_ringbuffer *ring, enum pipe_query_value_type result_type, int index, struct fd_resource *dst, unsigned offset); + void (*cleanup)(void *query_data); /* optional cleanup */ }; struct fd_acc_query { diff --git a/src/gallium/drivers/freedreno/freedreno_screen.c b/src/gallium/drivers/freedreno/freedreno_screen.c index 4b128d437f1..0efd70af3d9 100644 --- a/src/gallium/drivers/freedreno/freedreno_screen.c +++ b/src/gallium/drivers/freedreno/freedreno_screen.c @@ -165,6 +165,8 @@ fd_screen_destroy(struct pipe_screen *pscreen) if (screen->ro) screen->ro->destroy(screen->ro); + fd_perfcntr_state_free(screen->perfcntrs); + fd_bc_fini(&screen->batch_cache); fd_gmem_screen_fini(pscreen); @@ -1057,7 +1059,10 @@ fd_screen_create(int fd, if (screen->primtypes[i]) screen->primtypes_mask |= (1 << i); - if (FD_DBG(PERFC)) { + screen->perfcntrs = fd_perfcntr_state_alloc(screen->dev_id, fd); + + if (FD_DBG(PERFC) || + (screen->perfcntrs && fd_perfcntr_has_reservation(screen->perfcntrs))) { screen->perfcntr_groups = fd_perfcntrs(screen->dev_id, &screen->num_perfcntr_groups); } diff --git a/src/gallium/drivers/freedreno/freedreno_screen.h b/src/gallium/drivers/freedreno/freedreno_screen.h index 137fea1c5b8..4ae53ac0b10 100644 --- a/src/gallium/drivers/freedreno/freedreno_screen.h +++ b/src/gallium/drivers/freedreno/freedreno_screen.h @@ -106,6 +106,7 @@ struct fd_screen { unsigned num_perfcntr_groups; const struct fd_perfcntr_group *perfcntr_groups; + struct fd_perfcntr_state *perfcntrs; /* generated at startup from the perfcntr groups: */ unsigned num_perfcntr_queries; From b5461ca40b9ddacd96c03a876ae27ae383b67a58 Mon Sep 
17 00:00:00 2001 From: Rob Clark Date: Fri, 1 May 2026 16:36:41 -0700 Subject: [PATCH 14/21] freedreno/perfcntrs: Refactor derived counter setup Most of what is done here doesn't need to be duplicated per hw gen. Signed-off-by: Rob Clark --- src/freedreno/perfcntrs/fd7_perfcntr.c | 52 +------------------- src/freedreno/perfcntrs/freedreno_perfcntr.c | 51 +++++++++++++++++-- src/freedreno/perfcntrs/freedreno_perfcntr.h | 5 ++ 3 files changed, 54 insertions(+), 54 deletions(-) diff --git a/src/freedreno/perfcntrs/fd7_perfcntr.c b/src/freedreno/perfcntrs/fd7_perfcntr.c index 60db54dc7fb..041e3f1397a 100644 --- a/src/freedreno/perfcntrs/fd7_perfcntr.c +++ b/src/freedreno/perfcntrs/fd7_perfcntr.c @@ -102,10 +102,7 @@ static_assert(DERIVED_COUNTER_PERFCNTR_MAX_VALUE <= FD_DERIVED_COUNTER_COLLECTIO #define DERIVED_COUNTER_PERFCNTR_BV(_enum, _counter) \ [DERIVED_COUNTER_PERFCNTR_BV_##_enum] = { .counter = _counter, .countable = A7XX_PERF_##_enum } -static const struct { - const struct fd_perfcntr_counter *counter; - unsigned countable; -} a7xx_derived_counter_perfcntrs[] = { +const struct fd_derived_counter_perfcntr a7xx_derived_counter_perfcntrs[] = { /* CP: 3/14 counters */ DERIVED_COUNTER_PERFCNTR(CP_ALWAYS_COUNT, &cp_counters[0]), DERIVED_COUNTER_PERFCNTR(CP_NUM_PREEMPTIONS, &cp_counters[1]), @@ -985,50 +982,3 @@ const struct fd_derived_counter *a7xx_derived_counters[] = { const unsigned a7xx_num_derived_counters = ARRAY_SIZE(a7xx_derived_counters); static_assert(ARRAY_SIZE(a7xx_derived_counters) <= FD_DERIVED_COUNTER_COLLECTION_MAX_DERIVED_COUNTERS, ""); - -/* Prototype for linking purposes. */ -void -a7xx_generate_derived_counter_collection(const struct fd_dev_id *id, struct fd_derived_counter_collection *collection); - -void -a7xx_generate_derived_counter_collection(const struct fd_dev_id *id, struct fd_derived_counter_collection *collection) -{ - /* The provided collection should already specify the derived counters that will be measured. 
- * This function will set up enabled_perfcntrs_map and enabled_perfcntrs array so that each - * used DERIVED_COUNTER_PERFCNTR_* enum value will map to the corresponding index in the - * array where the relevant fd_perfcntr_counter and fd_perfcntr_countable are stored. - */ - - collection->num_enabled_perfcntrs = 0; - memset(collection->enabled_perfcntrs_map, 0xff, ARRAY_SIZE(collection->enabled_perfcntrs_map)); - - for (unsigned i = 0; i < collection->num_counters; ++i) { - const struct fd_derived_counter *counter = collection->counters[i]; - - for (unsigned j = 0; j < counter->num_perfcntrs; ++j) { - uint8_t perfcntr = counter->perfcntrs[j]; - collection->enabled_perfcntrs_map[perfcntr] = 0x00; - } - } - - /* Note if CP_ALWAYS_COUNT is enabled. This is the zero-index perfcntr. */ - collection->cp_always_count_enabled = !collection->enabled_perfcntrs_map[0]; - - for (unsigned i = 0; i < ARRAY_SIZE(collection->enabled_perfcntrs_map); ++i) { - if (collection->enabled_perfcntrs_map[i] == 0xff) - continue; - - uint8_t enabled_perfcntr_index = collection->num_enabled_perfcntrs++; - collection->enabled_perfcntrs_map[i] = enabled_perfcntr_index; - - collection->enabled_perfcntrs[enabled_perfcntr_index].counter = - a7xx_derived_counter_perfcntrs[i].counter; - collection->enabled_perfcntrs[enabled_perfcntr_index].countable = - a7xx_derived_counter_perfcntrs[i].countable; - } - - const struct fd_dev_info *info = fd_dev_info_raw(id); - collection->derivation_context.a7xx.number_of_usptp = info->num_sp_cores * 2; - collection->derivation_context.a7xx.number_of_alus_per_usptp = 128; -} - diff --git a/src/freedreno/perfcntrs/freedreno_perfcntr.c b/src/freedreno/perfcntrs/freedreno_perfcntr.c index 99239058295..4a26aefa0dd 100644 --- a/src/freedreno/perfcntrs/freedreno_perfcntr.c +++ b/src/freedreno/perfcntrs/freedreno_perfcntr.c @@ -302,6 +302,7 @@ fd_perfcntr_release(struct fd_perfcntr_state *perfcntrs, simple_mtx_unlock(&perfcntrs->lock); } +extern const struct 
fd_derived_counter_perfcntr a7xx_derived_counter_perfcntrs[]; extern const struct fd_derived_counter *a7xx_derived_counters[]; extern const unsigned a7xx_num_derived_counters; @@ -318,14 +319,58 @@ fd_derived_counters(const struct fd_dev_id *id, unsigned *count) } } -extern void a7xx_generate_derived_counter_collection(const struct fd_dev_id *id, struct fd_derived_counter_collection *collection); - void fd_generate_derived_counter_collection(const struct fd_dev_id *id, struct fd_derived_counter_collection *collection) { + const struct fd_derived_counter_perfcntr *derived_counter_perfcntrs = NULL; + switch (fd_dev_gen(id)) { case 7: - a7xx_generate_derived_counter_collection(id, collection); + derived_counter_perfcntrs = a7xx_derived_counter_perfcntrs; + break; + default: + return; + } + + /* The provided collection should already specify the derived counters that will be measured. + * This function will set up enabled_perfcntrs_map and enabled_perfcntrs array so that each + * used DERIVED_COUNTER_PERFCNTR_* enum value will map to the corresponding index in the + * array where the relevant fd_perfcntr_counter and fd_perfcntr_countable are stored. + */ + + collection->num_enabled_perfcntrs = 0; + memset(collection->enabled_perfcntrs_map, 0xff, ARRAY_SIZE(collection->enabled_perfcntrs_map)); + + for (unsigned i = 0; i < collection->num_counters; ++i) { + const struct fd_derived_counter *counter = collection->counters[i]; + + for (unsigned j = 0; j < counter->num_perfcntrs; ++j) { + uint8_t perfcntr = counter->perfcntrs[j]; + collection->enabled_perfcntrs_map[perfcntr] = 0x00; + } + } + + /* Note if CP_ALWAYS_COUNT is enabled. This is the zero-index perfcntr. 
*/ + collection->cp_always_count_enabled = !collection->enabled_perfcntrs_map[0]; + + for (unsigned i = 0; i < ARRAY_SIZE(collection->enabled_perfcntrs_map); ++i) { + if (collection->enabled_perfcntrs_map[i] == 0xff) + continue; + + uint8_t enabled_perfcntr_index = collection->num_enabled_perfcntrs++; + collection->enabled_perfcntrs_map[i] = enabled_perfcntr_index; + + collection->enabled_perfcntrs[enabled_perfcntr_index].counter = + derived_counter_perfcntrs[i].counter; + collection->enabled_perfcntrs[enabled_perfcntr_index].countable = + derived_counter_perfcntrs[i].countable; + } + + const struct fd_dev_info *info = fd_dev_info_raw(id); + switch (fd_dev_gen(id)) { + case 7: + collection->derivation_context.a7xx.number_of_usptp = info->num_sp_cores * 2; + collection->derivation_context.a7xx.number_of_alus_per_usptp = 128; break; default: break; diff --git a/src/freedreno/perfcntrs/freedreno_perfcntr.h b/src/freedreno/perfcntrs/freedreno_perfcntr.h index 5b109b038e0..6647ca81582 100644 --- a/src/freedreno/perfcntrs/freedreno_perfcntr.h +++ b/src/freedreno/perfcntrs/freedreno_perfcntr.h @@ -152,6 +152,11 @@ struct fd_derived_counter { uint64_t (*derive)(struct fd_derivation_context *context, uint64_t *values); }; +struct fd_derived_counter_perfcntr { + const struct fd_perfcntr_counter *counter; + unsigned countable; +}; + const struct fd_derived_counter **fd_derived_counters(const struct fd_dev_id *id, unsigned *count); #define FD_DERIVED_COUNTER_COLLECTION_MAX_DERIVED_COUNTERS 64 From f86f48ee96e2a37ee2bff911f57baf02735d63f5 Mon Sep 17 00:00:00 2001 From: Rob Clark Date: Fri, 1 May 2026 14:14:05 -0700 Subject: [PATCH 15/21] freedreno/perfcntrs: Use helper for derived counters Use helper to assign/reserve counters for derived counters. 
Signed-off-by: Rob Clark --- src/freedreno/perfcntrs/fd7_perfcntr.c | 146 +++++++++---------- src/freedreno/perfcntrs/freedreno_perfcntr.c | 23 ++- src/freedreno/perfcntrs/freedreno_perfcntr.h | 7 +- src/freedreno/vulkan/tu_query_pool.cc | 7 +- 4 files changed, 101 insertions(+), 82 deletions(-) diff --git a/src/freedreno/perfcntrs/fd7_perfcntr.c b/src/freedreno/perfcntrs/fd7_perfcntr.c index 041e3f1397a..6724b539398 100644 --- a/src/freedreno/perfcntrs/fd7_perfcntr.c +++ b/src/freedreno/perfcntrs/fd7_perfcntr.c @@ -97,104 +97,104 @@ enum { static_assert(DERIVED_COUNTER_PERFCNTR_MAX_VALUE <= FD_DERIVED_COUNTER_COLLECTION_MAX_ENABLED_PERFCNTRS, ""); -#define DERIVED_COUNTER_PERFCNTR(_enum, _counter) \ - [DERIVED_COUNTER_PERFCNTR_##_enum] = { .counter = _counter, .countable = A7XX_PERF_##_enum } -#define DERIVED_COUNTER_PERFCNTR_BV(_enum, _counter) \ - [DERIVED_COUNTER_PERFCNTR_BV_##_enum] = { .counter = _counter, .countable = A7XX_PERF_##_enum } +#define DERIVED_COUNTER_PERFCNTR(_countable, _group) \ + [DERIVED_COUNTER_PERFCNTR_##_countable] = { .countable = "PERF_" #_countable, .group = #_group } +#define DERIVED_COUNTER_PERFCNTR_BV(_countable, _group) \ + [DERIVED_COUNTER_PERFCNTR_BV_##_countable] = { .countable = "PERF_" #_countable, .group = "BV_" #_group } const struct fd_derived_counter_perfcntr a7xx_derived_counter_perfcntrs[] = { /* CP: 3/14 counters */ - DERIVED_COUNTER_PERFCNTR(CP_ALWAYS_COUNT, &cp_counters[0]), - DERIVED_COUNTER_PERFCNTR(CP_NUM_PREEMPTIONS, &cp_counters[1]), - DERIVED_COUNTER_PERFCNTR(CP_PREEMPTION_REACTION_DELAY, &cp_counters[2]), + DERIVED_COUNTER_PERFCNTR(CP_ALWAYS_COUNT, CP), + DERIVED_COUNTER_PERFCNTR(CP_NUM_PREEMPTIONS, CP), + DERIVED_COUNTER_PERFCNTR(CP_PREEMPTION_REACTION_DELAY, CP), /* RBBM: 1/4 counters */ - DERIVED_COUNTER_PERFCNTR(RBBM_STATUS_MASKED, &rbbm_counters[0]), + DERIVED_COUNTER_PERFCNTR(RBBM_STATUS_MASKED, RBBM), /* PC: 3/8 counters */ - DERIVED_COUNTER_PERFCNTR(PC_STALL_CYCLES_VFD, &pc_counters[0]), - 
DERIVED_COUNTER_PERFCNTR(PC_VERTEX_HITS, &pc_counters[1]), - DERIVED_COUNTER_PERFCNTR(PC_VS_INVOCATIONS, &pc_counters[2]), + DERIVED_COUNTER_PERFCNTR(PC_STALL_CYCLES_VFD, PC), + DERIVED_COUNTER_PERFCNTR(PC_VERTEX_HITS, PC), + DERIVED_COUNTER_PERFCNTR(PC_VS_INVOCATIONS, PC), /* TSE: 4/4 counters */ - DERIVED_COUNTER_PERFCNTR(TSE_INPUT_PRIM, &tse_counters[0]), - DERIVED_COUNTER_PERFCNTR(TSE_TRIVAL_REJ_PRIM, &tse_counters[1]), - DERIVED_COUNTER_PERFCNTR(TSE_CLIPPED_PRIM, &tse_counters[2]), - DERIVED_COUNTER_PERFCNTR(TSE_OUTPUT_VISIBLE_PRIM, &tse_counters[3]), + DERIVED_COUNTER_PERFCNTR(TSE_INPUT_PRIM, TSE), + DERIVED_COUNTER_PERFCNTR(TSE_TRIVAL_REJ_PRIM, TSE), + DERIVED_COUNTER_PERFCNTR(TSE_CLIPPED_PRIM, TSE), + DERIVED_COUNTER_PERFCNTR(TSE_OUTPUT_VISIBLE_PRIM, TSE), /* UCHE: 5/12 counters */ - DERIVED_COUNTER_PERFCNTR(UCHE_STALL_CYCLES_ARBITER, &uche_counters[0]), - DERIVED_COUNTER_PERFCNTR(UCHE_VBIF_READ_BEATS_TP, &uche_counters[1]), - DERIVED_COUNTER_PERFCNTR(UCHE_VBIF_READ_BEATS_VFD, &uche_counters[2]), - DERIVED_COUNTER_PERFCNTR(UCHE_VBIF_READ_BEATS_SP, &uche_counters[3]), - DERIVED_COUNTER_PERFCNTR(UCHE_READ_REQUESTS_TP, &uche_counters[4]), + DERIVED_COUNTER_PERFCNTR(UCHE_STALL_CYCLES_ARBITER, UCHE), + DERIVED_COUNTER_PERFCNTR(UCHE_VBIF_READ_BEATS_TP, UCHE), + DERIVED_COUNTER_PERFCNTR(UCHE_VBIF_READ_BEATS_VFD, UCHE), + DERIVED_COUNTER_PERFCNTR(UCHE_VBIF_READ_BEATS_SP, UCHE), + DERIVED_COUNTER_PERFCNTR(UCHE_READ_REQUESTS_TP, UCHE), /* TP: 7/12 counters */ - DERIVED_COUNTER_PERFCNTR(TP_BUSY_CYCLES, &tp_counters[0]), - DERIVED_COUNTER_PERFCNTR(TP_L1_CACHELINE_REQUESTS, &tp_counters[1]), - DERIVED_COUNTER_PERFCNTR(TP_L1_CACHELINE_MISSES, &tp_counters[2]), - DERIVED_COUNTER_PERFCNTR(TP_OUTPUT_PIXELS, &tp_counters[3]), - DERIVED_COUNTER_PERFCNTR(TP_OUTPUT_PIXELS_POINT, &tp_counters[4]), - DERIVED_COUNTER_PERFCNTR(TP_OUTPUT_PIXELS_BILINEAR, &tp_counters[5]), - DERIVED_COUNTER_PERFCNTR(TP_OUTPUT_PIXELS_ANISO, &tp_counters[6]), + DERIVED_COUNTER_PERFCNTR(TP_BUSY_CYCLES, 
TP), + DERIVED_COUNTER_PERFCNTR(TP_L1_CACHELINE_REQUESTS, TP), + DERIVED_COUNTER_PERFCNTR(TP_L1_CACHELINE_MISSES, TP), + DERIVED_COUNTER_PERFCNTR(TP_OUTPUT_PIXELS, TP), + DERIVED_COUNTER_PERFCNTR(TP_OUTPUT_PIXELS_POINT, TP), + DERIVED_COUNTER_PERFCNTR(TP_OUTPUT_PIXELS_BILINEAR, TP), + DERIVED_COUNTER_PERFCNTR(TP_OUTPUT_PIXELS_ANISO, TP), /* SP: 24/24 counters */ - DERIVED_COUNTER_PERFCNTR(SP_BUSY_CYCLES, &sp_counters[ 0]), - DERIVED_COUNTER_PERFCNTR(SP_ALU_WORKING_CYCLES, &sp_counters[ 1]), - DERIVED_COUNTER_PERFCNTR(SP_EFU_WORKING_CYCLES, &sp_counters[ 2]), - DERIVED_COUNTER_PERFCNTR(SP_STALL_CYCLES_TP, &sp_counters[ 3]), - DERIVED_COUNTER_PERFCNTR(SP_NON_EXECUTION_CYCLES, &sp_counters[ 4]), - DERIVED_COUNTER_PERFCNTR(SP_VS_STAGE_TEX_INSTRUCTIONS, &sp_counters[ 5]), - DERIVED_COUNTER_PERFCNTR(SP_VS_STAGE_EFU_INSTRUCTIONS, &sp_counters[ 6]), - DERIVED_COUNTER_PERFCNTR(SP_VS_STAGE_FULL_ALU_INSTRUCTIONS, &sp_counters[ 7]), - DERIVED_COUNTER_PERFCNTR(SP_FS_STAGE_EFU_INSTRUCTIONS, &sp_counters[ 8]), - DERIVED_COUNTER_PERFCNTR(SP_FS_STAGE_FULL_ALU_INSTRUCTIONS, &sp_counters[ 9]), - DERIVED_COUNTER_PERFCNTR(SP_FS_STAGE_HALF_ALU_INSTRUCTIONS, &sp_counters[10]), - DERIVED_COUNTER_PERFCNTR(SP_ICL1_REQUESTS, &sp_counters[11]), - DERIVED_COUNTER_PERFCNTR(SP_ICL1_MISSES, &sp_counters[12]), - DERIVED_COUNTER_PERFCNTR(SP_ANY_EU_WORKING_FS_STAGE, &sp_counters[13]), - DERIVED_COUNTER_PERFCNTR(SP_ANY_EU_WORKING_VS_STAGE, &sp_counters[14]), - DERIVED_COUNTER_PERFCNTR(SP_ANY_EU_WORKING_CS_STAGE, &sp_counters[15]), - DERIVED_COUNTER_PERFCNTR(SP_PIXELS, &sp_counters[16]), - DERIVED_COUNTER_PERFCNTR(SP_RAY_QUERY_INSTRUCTIONS, &sp_counters[17]), - DERIVED_COUNTER_PERFCNTR(SP_RTU_BUSY_CYCLES, &sp_counters[18]), - DERIVED_COUNTER_PERFCNTR(SP_RTU_BVH_FETCH_LATENCY_CYCLES, &sp_counters[19]), - DERIVED_COUNTER_PERFCNTR(SP_RTU_BVH_FETCH_LATENCY_SAMPLES, &sp_counters[20]), - DERIVED_COUNTER_PERFCNTR(SP_RTU_RAY_BOX_INTERSECTIONS, &sp_counters[21]), - 
DERIVED_COUNTER_PERFCNTR(SP_RTU_RAY_TRIANGLE_INTERSECTIONS, &sp_counters[22]), - DERIVED_COUNTER_PERFCNTR(SP_SCH_STALL_CYCLES_RTU, &sp_counters[23]), + DERIVED_COUNTER_PERFCNTR(SP_BUSY_CYCLES, SP), + DERIVED_COUNTER_PERFCNTR(SP_ALU_WORKING_CYCLES, SP), + DERIVED_COUNTER_PERFCNTR(SP_EFU_WORKING_CYCLES, SP), + DERIVED_COUNTER_PERFCNTR(SP_STALL_CYCLES_TP, SP), + DERIVED_COUNTER_PERFCNTR(SP_NON_EXECUTION_CYCLES, SP), + DERIVED_COUNTER_PERFCNTR(SP_VS_STAGE_TEX_INSTRUCTIONS, SP), + DERIVED_COUNTER_PERFCNTR(SP_VS_STAGE_EFU_INSTRUCTIONS, SP), + DERIVED_COUNTER_PERFCNTR(SP_VS_STAGE_FULL_ALU_INSTRUCTIONS, SP), + DERIVED_COUNTER_PERFCNTR(SP_FS_STAGE_EFU_INSTRUCTIONS, SP), + DERIVED_COUNTER_PERFCNTR(SP_FS_STAGE_FULL_ALU_INSTRUCTIONS, SP), + DERIVED_COUNTER_PERFCNTR(SP_FS_STAGE_HALF_ALU_INSTRUCTIONS, SP), + DERIVED_COUNTER_PERFCNTR(SP_ICL1_REQUESTS, SP), + DERIVED_COUNTER_PERFCNTR(SP_ICL1_MISSES, SP), + DERIVED_COUNTER_PERFCNTR(SP_ANY_EU_WORKING_FS_STAGE, SP), + DERIVED_COUNTER_PERFCNTR(SP_ANY_EU_WORKING_VS_STAGE, SP), + DERIVED_COUNTER_PERFCNTR(SP_ANY_EU_WORKING_CS_STAGE, SP), + DERIVED_COUNTER_PERFCNTR(SP_PIXELS, SP), + DERIVED_COUNTER_PERFCNTR(SP_RAY_QUERY_INSTRUCTIONS, SP), + DERIVED_COUNTER_PERFCNTR(SP_RTU_BUSY_CYCLES, SP), + DERIVED_COUNTER_PERFCNTR(SP_RTU_BVH_FETCH_LATENCY_CYCLES, SP), + DERIVED_COUNTER_PERFCNTR(SP_RTU_BVH_FETCH_LATENCY_SAMPLES, SP), + DERIVED_COUNTER_PERFCNTR(SP_RTU_RAY_BOX_INTERSECTIONS, SP), + DERIVED_COUNTER_PERFCNTR(SP_RTU_RAY_TRIANGLE_INTERSECTIONS, SP), + DERIVED_COUNTER_PERFCNTR(SP_SCH_STALL_CYCLES_RTU, SP), /* CMP: 1/4 counters */ - DERIVED_COUNTER_PERFCNTR(CMPDECMP_VBIF_READ_DATA, &cmp_counters[0]), + DERIVED_COUNTER_PERFCNTR(CMPDECMP_VBIF_READ_DATA, CMP), /* BV_PC: 3/8 counters */ - DERIVED_COUNTER_PERFCNTR_BV(PC_STALL_CYCLES_VFD, &bv_pc_counters[0]), - DERIVED_COUNTER_PERFCNTR_BV(PC_VERTEX_HITS, &bv_pc_counters[1]), - DERIVED_COUNTER_PERFCNTR_BV(PC_VS_INVOCATIONS, &bv_pc_counters[2]), + DERIVED_COUNTER_PERFCNTR_BV(PC_STALL_CYCLES_VFD, PC), + 
DERIVED_COUNTER_PERFCNTR_BV(PC_VERTEX_HITS, PC), + DERIVED_COUNTER_PERFCNTR_BV(PC_VS_INVOCATIONS, PC), /* BV_TP: 6/6 counters */ - DERIVED_COUNTER_PERFCNTR_BV(TP_L1_CACHELINE_REQUESTS, &bv_tp_counters[0]), - DERIVED_COUNTER_PERFCNTR_BV(TP_L1_CACHELINE_MISSES, &bv_tp_counters[1]), - DERIVED_COUNTER_PERFCNTR_BV(TP_OUTPUT_PIXELS, &bv_tp_counters[2]), - DERIVED_COUNTER_PERFCNTR_BV(TP_OUTPUT_PIXELS_POINT, &bv_tp_counters[3]), - DERIVED_COUNTER_PERFCNTR_BV(TP_OUTPUT_PIXELS_BILINEAR, &bv_tp_counters[4]), - DERIVED_COUNTER_PERFCNTR_BV(TP_OUTPUT_PIXELS_ANISO, &bv_tp_counters[5]), + DERIVED_COUNTER_PERFCNTR_BV(TP_L1_CACHELINE_REQUESTS, TP), + DERIVED_COUNTER_PERFCNTR_BV(TP_L1_CACHELINE_MISSES, TP), + DERIVED_COUNTER_PERFCNTR_BV(TP_OUTPUT_PIXELS, TP), + DERIVED_COUNTER_PERFCNTR_BV(TP_OUTPUT_PIXELS_POINT, TP), + DERIVED_COUNTER_PERFCNTR_BV(TP_OUTPUT_PIXELS_BILINEAR, TP), + DERIVED_COUNTER_PERFCNTR_BV(TP_OUTPUT_PIXELS_ANISO, TP), /* GP: 8/12 counters */ - DERIVED_COUNTER_PERFCNTR_BV(SP_STALL_CYCLES_TP, &bv_sp_counters[0]), - DERIVED_COUNTER_PERFCNTR_BV(SP_VS_STAGE_TEX_INSTRUCTIONS, &bv_sp_counters[1]), - DERIVED_COUNTER_PERFCNTR_BV(SP_VS_STAGE_EFU_INSTRUCTIONS, &bv_sp_counters[2]), - DERIVED_COUNTER_PERFCNTR_BV(SP_VS_STAGE_FULL_ALU_INSTRUCTIONS, &bv_sp_counters[3]), - DERIVED_COUNTER_PERFCNTR_BV(SP_ICL1_REQUESTS, &bv_sp_counters[4]), - DERIVED_COUNTER_PERFCNTR_BV(SP_ICL1_MISSES, &bv_sp_counters[5]), - DERIVED_COUNTER_PERFCNTR_BV(SP_ANY_EU_WORKING_FS_STAGE, &bv_sp_counters[6]), - DERIVED_COUNTER_PERFCNTR_BV(SP_ANY_EU_WORKING_VS_STAGE, &bv_sp_counters[7]), + DERIVED_COUNTER_PERFCNTR_BV(SP_STALL_CYCLES_TP, SP), + DERIVED_COUNTER_PERFCNTR_BV(SP_VS_STAGE_TEX_INSTRUCTIONS, SP), + DERIVED_COUNTER_PERFCNTR_BV(SP_VS_STAGE_EFU_INSTRUCTIONS, SP), + DERIVED_COUNTER_PERFCNTR_BV(SP_VS_STAGE_FULL_ALU_INSTRUCTIONS, SP), + DERIVED_COUNTER_PERFCNTR_BV(SP_ICL1_REQUESTS, SP), + DERIVED_COUNTER_PERFCNTR_BV(SP_ICL1_MISSES, SP), + DERIVED_COUNTER_PERFCNTR_BV(SP_ANY_EU_WORKING_FS_STAGE, SP), + 
DERIVED_COUNTER_PERFCNTR_BV(SP_ANY_EU_WORKING_VS_STAGE, SP), /* LRZ: 4/4 counters */ - DERIVED_COUNTER_PERFCNTR(LRZ_TOTAL_PIXEL, &lrz_counters[0]), - DERIVED_COUNTER_PERFCNTR(LRZ_VISIBLE_PIXEL_AFTER_LRZ, &lrz_counters[1]), - DERIVED_COUNTER_PERFCNTR(LRZ_TILE_KILLED, &lrz_counters[2]), - DERIVED_COUNTER_PERFCNTR(LRZ_PRIM_KILLED_BY_LRZ, &lrz_counters[3]), + DERIVED_COUNTER_PERFCNTR(LRZ_TOTAL_PIXEL, LRZ), + DERIVED_COUNTER_PERFCNTR(LRZ_VISIBLE_PIXEL_AFTER_LRZ, LRZ), + DERIVED_COUNTER_PERFCNTR(LRZ_TILE_KILLED, LRZ), + DERIVED_COUNTER_PERFCNTR(LRZ_PRIM_KILLED_BY_LRZ, LRZ), }; static uint64_t diff --git a/src/freedreno/perfcntrs/freedreno_perfcntr.c b/src/freedreno/perfcntrs/freedreno_perfcntr.c index 4a26aefa0dd..a7cac5b957e 100644 --- a/src/freedreno/perfcntrs/freedreno_perfcntr.c +++ b/src/freedreno/perfcntrs/freedreno_perfcntr.c @@ -320,9 +320,10 @@ fd_derived_counters(const struct fd_dev_id *id, unsigned *count) } void -fd_generate_derived_counter_collection(const struct fd_dev_id *id, struct fd_derived_counter_collection *collection) +fd_reserve_derived_counter_collection(struct fd_perfcntr_state *perfcntrs, struct fd_derived_counter_collection *collection) { const struct fd_derived_counter_perfcntr *derived_counter_perfcntrs = NULL; + const struct fd_dev_id *id = perfcntrs->id; switch (fd_dev_gen(id)) { case 7: @@ -360,10 +361,15 @@ fd_generate_derived_counter_collection(const struct fd_dev_id *id, struct fd_der uint8_t enabled_perfcntr_index = collection->num_enabled_perfcntrs++; collection->enabled_perfcntrs_map[i] = enabled_perfcntr_index; - collection->enabled_perfcntrs[enabled_perfcntr_index].counter = - derived_counter_perfcntrs[i].counter; - collection->enabled_perfcntrs[enabled_perfcntr_index].countable = - derived_counter_perfcntrs[i].countable; + const struct fd_perfcntr_group *group = + fd_perfcntrs_group(perfcntrs->id, derived_counter_perfcntrs[i].group); + const struct fd_perfcntr_countable *countable = + fd_perfcntrs_countable(group, 
derived_counter_perfcntrs[i].countable); + const struct fd_perfcntr_counter *counter = + fd_perfcntr_reserve(perfcntrs, group, countable); + + collection->enabled_perfcntrs[enabled_perfcntr_index].counter = counter; + collection->enabled_perfcntrs[enabled_perfcntr_index].countable = countable->selector; } const struct fd_dev_info *info = fd_dev_info_raw(id); @@ -376,3 +382,10 @@ fd_generate_derived_counter_collection(const struct fd_dev_id *id, struct fd_der break; } } + +void +fd_release_derived_counter_collection(struct fd_perfcntr_state *perfcntrs, struct fd_derived_counter_collection *collection) +{ + for (unsigned i = 0; i < collection->num_enabled_perfcntrs; i++) + fd_perfcntr_release(perfcntrs, collection->enabled_perfcntrs[i].counter); +} diff --git a/src/freedreno/perfcntrs/freedreno_perfcntr.h b/src/freedreno/perfcntrs/freedreno_perfcntr.h index 6647ca81582..047bbdfe960 100644 --- a/src/freedreno/perfcntrs/freedreno_perfcntr.h +++ b/src/freedreno/perfcntrs/freedreno_perfcntr.h @@ -153,8 +153,8 @@ struct fd_derived_counter { }; struct fd_derived_counter_perfcntr { - const struct fd_perfcntr_counter *counter; - unsigned countable; + const char *countable; + const char *group; }; const struct fd_derived_counter **fd_derived_counters(const struct fd_dev_id *id, unsigned *count); @@ -177,7 +177,8 @@ struct fd_derived_counter_collection { struct fd_derivation_context derivation_context; }; -void fd_generate_derived_counter_collection(const struct fd_dev_id *id, struct fd_derived_counter_collection *collection); +void fd_reserve_derived_counter_collection(struct fd_perfcntr_state *perfcntrs, struct fd_derived_counter_collection *collection); +void fd_release_derived_counter_collection(struct fd_perfcntr_state *perfcntrs, struct fd_derived_counter_collection *collection); #ifdef __cplusplus } /* end of extern "C" */ diff --git a/src/freedreno/vulkan/tu_query_pool.cc b/src/freedreno/vulkan/tu_query_pool.cc index e3f6b9fae4b..1d66b2775ac 100644 --- 
a/src/freedreno/vulkan/tu_query_pool.cc +++ b/src/freedreno/vulkan/tu_query_pool.cc @@ -266,6 +266,11 @@ tu_query_pool_destroy(struct tu_device *device, struct tu_query_pool *pool, for (uint32_t i = 0; i < perf_query->counter_index_count; i++) fd_perfcntr_release(device->perfcntrs, perf_query->data[i].counter); + } else if (is_perf_query_raw(pool)) { + struct tu_perf_query_derived *perf_query = &pool->perf_query.derived; + struct fd_derived_counter_collection *collection = perf_query->collection; + + fd_release_derived_counter_collection(device->perfcntrs, collection); } if (pool->bo) @@ -400,7 +405,7 @@ tu_CreateQueryPool(VkDevice _device, collection->counters[i] = perf_query->derived_counters[counter_index]; } - fd_generate_derived_counter_collection(&device->physical_device->dev_id, collection); + fd_reserve_derived_counter_collection(device->perfcntrs, collection); slot_size += sizeof(struct perfcntr_query_slot) * collection->num_enabled_perfcntrs; } From e9e83b48f837513146e765334820b6fb7b181bb4 Mon Sep 17 00:00:00 2001 From: Rob Clark Date: Mon, 4 May 2026 06:52:03 -0700 Subject: [PATCH 16/21] freedreno: Skip BV perfcntrs Not useful unless we expose concurrent binning. 
Signed-off-by: Rob Clark --- src/gallium/drivers/freedreno/freedreno_query.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/gallium/drivers/freedreno/freedreno_query.c b/src/gallium/drivers/freedreno/freedreno_query.c index 2a41ddea8d0..5982fe87252 100644 --- a/src/gallium/drivers/freedreno/freedreno_query.c +++ b/src/gallium/drivers/freedreno/freedreno_query.c @@ -184,8 +184,12 @@ setup_perfcntr_query_info(struct fd_screen *screen) { unsigned num_queries = 0; - for (unsigned i = 0; i < screen->num_perfcntr_groups; i++) - num_queries += screen->perfcntr_groups[i].num_countables; + for (unsigned i = 0; i < screen->num_perfcntr_groups; i++) { + const struct fd_perfcntr_group *g = &screen->perfcntr_groups[i]; + if (g->pipe > PIPE_BR) + continue; + num_queries += g->num_countables; + } screen->perfcntr_queries = calloc(num_queries, sizeof(screen->perfcntr_queries[0])); @@ -194,6 +198,8 @@ setup_perfcntr_query_info(struct fd_screen *screen) unsigned idx = 0; for (unsigned i = 0; i < screen->num_perfcntr_groups; i++) { const struct fd_perfcntr_group *g = &screen->perfcntr_groups[i]; + if (g->pipe > PIPE_BR) + continue; for (unsigned j = 0; j < g->num_countables; j++) { struct pipe_driver_query_info *info = &screen->perfcntr_queries[idx]; const struct fd_perfcntr_countable *c = &g->countables[j]; From 31a99b2226a51c75419bdd39ed0633878df29150 Mon Sep 17 00:00:00 2001 From: Rob Clark Date: Mon, 4 May 2026 06:59:08 -0700 Subject: [PATCH 17/21] tu: Disable preemption for counters on gen8 Extend the CP_SCOPE_CNTL to gen8 and newer. 
Signed-off-by: Rob Clark --- src/freedreno/vulkan/tu_query_pool.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/freedreno/vulkan/tu_query_pool.cc b/src/freedreno/vulkan/tu_query_pool.cc index 1d66b2775ac..f93a72cd1a7 100644 --- a/src/freedreno/vulkan/tu_query_pool.cc +++ b/src/freedreno/vulkan/tu_query_pool.cc @@ -1240,7 +1240,7 @@ emit_begin_perf_query_raw(struct tu_cmd_buffer *cmdbuf, * changes in perfcounter values should only apply to work done during * this query. */ - if (CHIP == A7XX) { + if (CHIP >= A7XX) { tu_cs_emit_pkt7(cs, CP_SCOPE_CNTL, 1); tu_cs_emit(cs, CP_SCOPE_CNTL_0(.disable_preemption = true, .scope = INTERRUPTS).value); @@ -1303,7 +1303,7 @@ emit_begin_perf_query_derived(struct tu_cmd_buffer *cmdbuf, * changes in perfcounter values should only apply to work done during * this query. */ - if (CHIP == A7XX) { + if (CHIP >= A7XX) { tu_cs_emit_pkt7(cs, CP_SCOPE_CNTL, 1); tu_cs_emit(cs, CP_SCOPE_CNTL_0(.disable_preemption = true, .scope = INTERRUPTS).value); @@ -1773,7 +1773,7 @@ emit_end_perf_query_raw(struct tu_cmd_buffer *cmdbuf, /* This reverts the preemption disablement done at the start * of the query. */ - if (CHIP == A7XX) { + if (CHIP >= A7XX) { tu_cs_emit_pkt7(cs, CP_SCOPE_CNTL, 1); tu_cs_emit(cs, CP_SCOPE_CNTL_0(.disable_preemption = false, .scope = INTERRUPTS).value); @@ -1850,7 +1850,7 @@ emit_end_perf_query_derived(struct tu_cmd_buffer *cmdbuf, /* This reverts the preemption disablement done at the start * of the query. 
*/ - if (CHIP == A7XX) { + if (CHIP >= A7XX) { tu_cs_emit_pkt7(cs, CP_SCOPE_CNTL, 1); tu_cs_emit(cs, CP_SCOPE_CNTL_0(.disable_preemption = false, .scope = INTERRUPTS).value); From 8717118da351797f6d20c6445a3b588a0d468ad5 Mon Sep 17 00:00:00 2001 From: Rob Clark Date: Mon, 4 May 2026 07:39:00 -0700 Subject: [PATCH 18/21] tu/gen8: Program slice selector regs Signed-off-by: Rob Clark --- src/freedreno/vulkan/tu_query_pool.cc | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/src/freedreno/vulkan/tu_query_pool.cc b/src/freedreno/vulkan/tu_query_pool.cc index f93a72cd1a7..ab04b0cac22 100644 --- a/src/freedreno/vulkan/tu_query_pool.cc +++ b/src/freedreno/vulkan/tu_query_pool.cc @@ -1259,6 +1259,13 @@ emit_begin_perf_query_raw(struct tu_cmd_buffer *cmdbuf, tu_cs_emit_pkt4(cs, data->counter->select_reg, 1); tu_cs_emit(cs, data->countable->selector); + + for (unsigned s = 0; s < ARRAY_SIZE(data->counter->slice_select_regs); s++) { + if (!data->counter->slice_select_regs[s]) + break; + tu_cs_emit_pkt4(cs, data->counter->slice_select_regs[s], 1); + tu_cs_emit(cs, data->countable->selector); + } } tu_cond_exec_end(cs); @@ -1315,6 +1322,13 @@ emit_begin_perf_query_derived(struct tu_cmd_buffer *cmdbuf, tu_cs_emit_pkt4(cs, counter->select_reg, 1); tu_cs_emit(cs, countable); + + for (unsigned s = 0; s < ARRAY_SIZE(counter->slice_select_regs); s++) { + if (!counter->slice_select_regs[s]) + break; + tu_cs_emit_pkt4(cs, counter->slice_select_regs[s], 1); + tu_cs_emit(cs, countable); + } } emit_counter_barrier(cs); From 4d114a6bf7602ae1039fbeda22fb04bed466fe81 Mon Sep 17 00:00:00 2001 From: Rob Clark Date: Mon, 4 May 2026 09:20:38 -0700 Subject: [PATCH 19/21] freedreno/a6xx: Program gen8+ slice SEL regs Signed-off-by: Rob Clark --- src/gallium/drivers/freedreno/a6xx/fd6_query.cc | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_query.cc b/src/gallium/drivers/freedreno/a6xx/fd6_query.cc index fa2d79c262c..2f54f09a144 100644 
--- a/src/gallium/drivers/freedreno/a6xx/fd6_query.cc +++ b/src/gallium/drivers/freedreno/a6xx/fd6_query.cc @@ -851,6 +851,15 @@ perfcntr_resume(struct fd_acc_query *aq, struct fd_batch *batch) assert_dt .reg = entry->counter->select_reg, .value = g->countables[entry->cid].selector, }); + + for (unsigned s = 0; s < ARRAY_SIZE(entry->counter->slice_select_regs); s++) { + if (!entry->counter->slice_select_regs[s]) + break; + fd_pkt4(cs, 1).add((fd_reg_pair){ + .reg = entry->counter->slice_select_regs[s], + .value = g->countables[entry->cid].selector, + }); + } } /* and snapshot the start values */ From 1a237a9c677739fe65a95e18d234976d8e2ffea4 Mon Sep 17 00:00:00 2001 From: Rob Clark Date: Fri, 24 Apr 2026 07:07:34 -0700 Subject: [PATCH 20/21] freedreno/perfcntrs: Expose gen8 counters Signed-off-by: Rob Clark --- src/freedreno/perfcntrs/freedreno_perfcntr.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/freedreno/perfcntrs/freedreno_perfcntr.c b/src/freedreno/perfcntrs/freedreno_perfcntr.c index a7cac5b957e..73aa0d20606 100644 --- a/src/freedreno/perfcntrs/freedreno_perfcntr.c +++ b/src/freedreno/perfcntrs/freedreno_perfcntr.c @@ -50,6 +50,9 @@ fd_perfcntrs(const struct fd_dev_id *id, unsigned *count) case 7: *count = a7xx_num_perfcntr_groups; return a7xx_perfcntr_groups; + case 8: + *count = a8xx_num_perfcntr_groups; + return a8xx_perfcntr_groups; default: *count = 0; return NULL; From 5e0ee1c0eca41ba8fa97587da03eab7630e15a30 Mon Sep 17 00:00:00 2001 From: Rob Clark Date: Thu, 30 Apr 2026 09:37:43 -0700 Subject: [PATCH 21/21] DEBUG --- src/freedreno/perfcntrs/freedreno_perfcntr.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/freedreno/perfcntrs/freedreno_perfcntr.c b/src/freedreno/perfcntrs/freedreno_perfcntr.c index 73aa0d20606..cb4ea0b2fd7 100644 --- a/src/freedreno/perfcntrs/freedreno_perfcntr.c +++ b/src/freedreno/perfcntrs/freedreno_perfcntr.c @@ -14,6 +14,7 @@ #include "drm-uapi/msm_drm.h" #include "util/bitset.h" +#include 
"util/log.h" #include "util/simple_mtx.h" #include "freedreno_common.h" @@ -116,6 +117,7 @@ update_group_counters(struct fd_perfcntr_state *perfcntrs, int group_idx) */ unsigned nr = BITSET_LAST_BIT(perfcntrs->assigned_counters[group_idx]); if (nr != perfcntrs->group_configs[group_idx].nr_countables) { + mesa_logi("%s: %u -> %u counters", perfcntrs->groups[group_idx].name, perfcntrs->group_configs[group_idx].nr_countables, nr); perfcntrs->group_configs[group_idx].nr_countables = nr; ret = update_reserved_counters(perfcntrs); } @@ -239,6 +241,7 @@ fd_perfcntr_reserve(struct fd_perfcntr_state *perfcntrs, c = BITSET_FFS(free_counters) - 1; assert(c >= 0); +mesa_logi("pick counter %d", c); if (c < group->num_counters) { state = rzalloc(perfcntrs, struct fd_perfcntr_counter_state); @@ -270,6 +273,8 @@ fd_perfcntr_reserve(struct fd_perfcntr_state *perfcntrs, if (!state) return NULL; + mesa_logi("%s.%s: assigned %d (%d users)", group->name, countable->name, state->counter, state->nr_users); + return &group->counters[state->counter]; } @@ -287,6 +292,9 @@ fd_perfcntr_release(struct fd_perfcntr_state *perfcntrs, assert(state->nr_users > 0); + const struct fd_perfcntr_group *group = &perfcntrs->groups[state->group]; + mesa_logi("%s.%s: released %d (%d users)", group->name, group->countables[state->countable].name, state->counter, state->nr_users); + if (--state->nr_users == 0) { /* dropping last user of the counter: */ _mesa_hash_table_remove(perfcntrs->counter_state, e);