diff --git a/include/drm-uapi/msm_drm.h b/include/drm-uapi/msm_drm.h
index 5c67294edc9..289cf228b87 100644
--- a/include/drm-uapi/msm_drm.h
+++ b/include/drm-uapi/msm_drm.h
@@ -117,6 +117,7 @@ struct drm_msm_timespec {
  * ioctl will throw -EPIPE.
  */
 #define MSM_PARAM_EN_VM_BIND 0x16 /* WO, once */
+#define MSM_PARAM_AQE 0x17 /* RO */
 
 /* For backwards compat. The original support for preemption was based on
  * a single ring per priority level so # of priority levels equals the #
@@ -490,6 +491,52 @@ struct drm_msm_submitqueue_query {
	__u32 pad;
 };
 
+#define MSM_PERFCNTR_STREAM 0x00000001
+#define MSM_PERFCNTR_UPDATE 0x00000002
+#define MSM_PERFCNTR_FLAGS ( \
+		MSM_PERFCNTR_STREAM | \
+		MSM_PERFCNTR_UPDATE | \
+		0)
+
+struct drm_msm_perfcntr_group {
+	char group_name[16];
+	__u32 nr_countables;
+	__u32 pad;
+	__u64 countables; /* pointer to an array of nr_countables u32 */
+};
+
+/*
+ * Note, for MSM_PERFCNTR_STREAM, the ioctl returns an fd to read recorded
+ * counters.  This only works because the ioctl is DRM_IOW(); if we returned
+ * an out param in the ioctl struct, the copy_to_user() (in drm_ioctl())
+ * could fault, causing us to leak the fd.
+ *
+ * If the ioctl returns with error E2BIG, that means more counters/countables
+ * are requested than are currently available.  If the MSM_PERFCNTR_UPDATE
+ * flag is set, drm_msm_perfcntr_group::nr_countables will be updated to
+ * return the actual # of counters available.
+ *
+ * The data read from the fd has the following format for each sampling
+ * period:
+ *
+ *   uint64_t timestamp;  // CP_ALWAYS_ON_COUNTER captured at sample time
+ *   uint32_t seqno;      // increments by 1 each period, reset to 0 on discontinuity
+ *   uint32_t mbz;        // pad out counters to 64b
+ *   struct {
+ *      uint64_t counter[nr_countables];
+ *   } groups[nr_groups];
+ *
+ * The ordering of groups and counters matches the order in the
+ * PERFCNTR_CONFIG ioctl.
+ */
+struct drm_msm_perfcntr_config {
+	__u32 flags;        /* bitmask of MSM_PERFCNTR_x */
+	__u32 nr_groups;    /* # of entries in groups array */
+	__u64 groups;       /* pointer to array of drm_msm_perfcntr_group */
+	__u64 period;       /* sampling period in ns */
+	__u32 bufsz_shift;  /* sample buffer size in bytes is 1 << bufsz_shift */
+	__u32 group_stride; /* sizeof(struct drm_msm_perfcntr_group) */
+};

[...]

-   for (unsigned j = 0; j < group->num_countables; j++) {
-      const struct fd_perfcntr_countable *countable = &group->countables[j];
-
-      if (strcmp(name, countable->name) != 0)
-         continue;
-
+   if (countable) {
       /*
        * Allocate a counter to use to monitor the requested countable:
        */
diff --git a/src/freedreno/ds/fd_pps_a8xx.cc b/src/freedreno/ds/fd_pps_a8xx.cc
new file mode 100644
index 00000000000..2972d48673d
--- /dev/null
+++ b/src/freedreno/ds/fd_pps_a8xx.cc
@@ -0,0 +1,1063 @@
+/*
+ * Copyright © 2021 Google, Inc.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "fd_pps_driver.h"
+
+#include
+#include
+#include
+
+#include "common/freedreno_dev_info.h"
+#include "drm/freedreno_drmif.h"
+#include "drm/freedreno_ringbuffer.h"
+#include "perfcntrs/freedreno_dt.h"
+#include "perfcntrs/freedreno_perfcntr.h"
+
+#include "pps/pps.h"
+#include "pps/pps_algorithm.h"
+
+namespace pps
+{
+
+void
+FreedrenoDriver::setup_a8xx_counters()
+{
+   /* TODO is there a reason to want more than one group? */
+   CounterGroup group = {};
+   group.name = "counters";
+   groups.clear();
+   counters.clear();
+   countables.clear();
+   enabled_counters.clear();
+   groups.emplace_back(std::move(group));
+
+   /* So far, a8xx devices seem to follow a7xx in having two uSPTPs in each
+    * SP core and 128 ALUs in each uSPTP.
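+    * (num_sp_cores comes from the queried device info, so e.g. a part with
+    * six SP cores has twelve uSPTPs; this is the scale factor applied to
+    * the RBBM-based busy/stall denominators below.)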
+ */ + const unsigned number_of_usptp = info->num_sp_cores * 2; + const unsigned number_of_alus_per_usptp = 128; + + /* The enumeration and two helper lambdas serve to handle countables + * that can be sampled from either rendering or visibility bins. + */ + enum { + BR = 0, + BV = 1, + }; + + auto cbCountable = [=](std::string group, std::string name) { + return std::array { + countable(group, name), + countable("BV_" + group, name), + }; + }; + + auto cbSum = [](const std::array& countable) { + return countable[BR] + countable[BV]; + }; + + /* This is a helper no-op lambda to handle known and understood counters + * that we can't currently implement for a variety of reasons. + */ + auto disabledCounter = [](std::string, Counter::Units, std::function) { }; + + /* CP: 3/14 counters */ + auto PERF_CP_ALWAYS_COUNT = countable("CP", "PERF_CP_ALWAYS_COUNT"); + auto PERF_CP_NUM_PREEMPTIONS = countable("CP", "PERF_CP_NUM_PREEMPTIONS"); + auto PERF_CP_PREEMPTION_REACTION_DELAY = countable("CP", "PERF_CP_PREEMPTION_REACTION_DELAY"); + + /* RBBM: 1/4 counters */ + auto PERF_RBBM_US_STATUS_MASKED = countable("RBBM", "PERF_RBBM_US_STATUS_MASKED"); + + /* PC: 3/8 counters, BV_PC: 3/8 counters */ + auto PERF_PC_S_STALL_CYCLES_VFD = cbCountable("PC", "PERF_PC_S_STALL_CYCLES_VFD"); + auto PERF_PC_S_VERTEX_HITS = cbCountable("PC", "PERF_PC_S_VERTEX_HITS"); + auto PERF_PC_S_VS_INVOCATIONS = cbCountable("PC", "PERF_PC_S_VS_INVOCATIONS"); + + /* TSE: 4/8 counters */ + auto PERF_TSE_BE_INPUT_PRIM = countable("TSE", "PERF_TSE_BE_INPUT_PRIM"); + auto PERF_TSE_BE_TRIVAL_REJ_PRIM = countable("TSE", "PERF_TSE_BE_TRIVAL_REJ_PRIM"); + auto PERF_TSE_BE_CLIPPED_PRIM = countable("TSE", "PERF_TSE_BE_CLIPPED_PRIM"); + auto PERF_TSE_BE_OUTPUT_VISIBLE_PRIM = countable("TSE", "PERF_TSE_BE_OUTPUT_VISIBLE_PRIM"); + + /* UCHE: 8/12 counters */ + auto PERF_UCHE_STALL_CYCLES_ARBITER = countable("UCHE", "PERF_UCHE_STALL_CYCLES_ARBITER"); + auto PERF_UCHE_VBIF_READ_BEATS_TP = countable("UCHE", "PERF_UCHE_VBIF_READ_BEATS_TP"); + auto PERF_UCHE_VBIF_READ_BEATS_VFD = countable("UCHE", "PERF_UCHE_VBIF_READ_BEATS_VFD"); + auto PERF_UCHE_VBIF_READ_BEATS_SP = countable("UCHE", "PERF_UCHE_VBIF_READ_BEATS_SP"); + auto PERF_UCHE_READ_REQUESTS_TP = countable("UCHE", "PERF_UCHE_READ_REQUESTS_TP"); + auto PERF_UCHE_READ_REQUESTS_SP = countable("UCHE", "PERF_UCHE_READ_REQUESTS_SP"); + auto PERF_UCHE_WRITE_REQUESTS_SP = countable("UCHE", "PERF_UCHE_WRITE_REQUESTS_SP"); + auto PERF_UCHE_EVICTS = countable("UCHE", "PERF_UCHE_EVICTS"); + + /* TP: 7/12 counters, BV_TP: 6/6 counters */ + auto PERF_TP_BUSY_CYCLES = countable("TP", "PERF_TP_BUSY_CYCLES"); + auto PERF_TP_L1_CACHELINE_REQUESTS = cbCountable("TP", "PERF_TP_L1_CACHELINE_REQUESTS"); + auto PERF_TP_L1_CACHELINE_MISSES = cbCountable("TP", "PERF_TP_L1_CACHELINE_MISSES"); + auto PERF_TP_OUTPUT_PIXELS = cbCountable("TP", "PERF_TP_OUTPUT_PIXELS"); + auto PERF_TP_OUTPUT_PIXELS_POINT = cbCountable("TP", "PERF_TP_OUTPUT_PIXELS_POINT"); + auto PERF_TP_OUTPUT_PIXELS_BILINEAR = cbCountable("TP", "PERF_TP_OUTPUT_PIXELS_BILINEAR"); + auto PERF_TP_OUTPUT_PIXELS_ANISO = cbCountable("TP", "PERF_TP_OUTPUT_PIXELS_ANISO"); + + /* SP: 24/24 counters, BV_SP: 7/12 counters */ + auto PERF_SP_BUSY_CYCLES = countable("SP", "PERF_SP_BUSY_CYCLES"); + auto PERF_SP_ALU_WORKING_CYCLES = countable("SP", "PERF_SP_ALU_WORKING_CYCLES"); + auto PERF_SP_EFU_WORKING_CYCLES = countable("SP", "PERF_SP_EFU_WORKING_CYCLES"); + auto PERF_SP_STALL_CYCLES_TP = cbCountable("SP", "PERF_SP_STALL_CYCLES_TP"); + auto 
PERF_SP_NON_EXECUTION_CYCLES = countable("SP", "PERF_SP_NON_EXECUTION_CYCLES"); + auto PERF_SP_VS_STAGE_TEX_INSTRUCTIONS = cbCountable("SP", "PERF_SP_VS_STAGE_TEX_INSTRUCTIONS"); + auto PERF_SP_VS_STAGE_EFU_INSTRUCTIONS = cbCountable("SP", "PERF_SP_VS_STAGE_EFU_INSTRUCTIONS"); + auto PERF_SP_VS_STAGE_FULL_ALU_INSTRUCTIONS = cbCountable("SP", "PERF_SP_VS_STAGE_FULL_ALU_INSTRUCTIONS"); + auto PERF_SP_FS_STAGE_EFU_INSTRUCTIONS = countable("SP", "PERF_SP_FS_STAGE_EFU_INSTRUCTIONS"); + auto PERF_SP_FS_STAGE_FULL_ALU_INSTRUCTIONS = countable("SP", "PERF_SP_FS_STAGE_FULL_ALU_INSTRUCTIONS"); + auto PERF_SP_FS_STAGE_HALF_ALU_INSTRUCTIONS = countable("SP", "PERF_SP_FS_STAGE_HALF_ALU_INSTRUCTIONS"); + auto PERF_SP_ICL1_REQUESTS = cbCountable("SP", "PERF_SP_ICL1_REQUESTS"); + auto PERF_SP_ICL1_MISSES = cbCountable("SP", "PERF_SP_ICL1_MISSES"); + auto PERF_SP_ANY_EU_WORKING_FS_STAGE = countable("SP", "PERF_SP_ANY_EU_WORKING_FS_STAGE"); + auto PERF_SP_ANY_EU_WORKING_VS_STAGE = cbCountable("SP", "PERF_SP_ANY_EU_WORKING_VS_STAGE"); + auto PERF_SP_ANY_EU_WORKING_CS_STAGE = countable("SP", "PERF_SP_ANY_EU_WORKING_CS_STAGE"); + auto PERF_SP_PIXELS = countable("SP", "PERF_SP_PIXELS"); + auto PERF_SP_RAY_QUERY_INSTRUCTIONS = countable("SP", "PERF_SP_RAY_QUERY_INSTRUCTIONS"); + auto PERF_SP_RTU_BUSY_CYCLES = countable("SP", "PERF_SP_RTU_BUSY_CYCLES"); + auto PERF_SP_RTU_BVH_FETCH_LATENCY_CYCLES = countable("SP", "PERF_SP_RTU_BVH_FETCH_LATENCY_CYCLES"); + auto PERF_SP_RTU_BVH_FETCH_LATENCY_SAMPLES = countable("SP", "PERF_SP_RTU_BVH_FETCH_LATENCY_SAMPLES"); + auto PERF_SP_RTU_RAY_BOX_INTERSECTIONS = countable("SP", "PERF_SP_RTU_RAY_BOX_INTERSECTIONS"); + auto PERF_SP_RTU_RAY_TRIANGLE_INTERSECTIONS = countable("SP", "PERF_SP_RTU_RAY_TRIANGLE_INTERSECTIONS"); + auto PERF_SP_SCH_STALL_CYCLES_RTU = countable("SP", "PERF_SP_SCH_STALL_CYCLES_RTU"); + + /* CMP: 1/4 counters */ + auto PERF_CMPDECMP_VBIF_READ_DATA = countable("CMP", "PERF_CMPDECMP_VBIF_READ_DATA"); + + /* LRZ: 4/4 counters */ + auto PERF_LRZ_TOTAL_PIXEL = countable("LRZ", "PERF_LRZ_TOTAL_PIXEL"); + auto PERF_LRZ_VISIBLE_PIXEL_AFTER_LRZ = countable("LRZ", "PERF_LRZ_VISIBLE_PIXEL_AFTER_LRZ"); + auto PERF_LRZ_TILE_KILLED = countable("LRZ", "PERF_LRZ_TILE_KILLED"); + auto PERF_LRZ_PRIM_KILLED_BY_LRZ = countable("LRZ", "PERF_LRZ_PRIM_KILLED_BY_LRZ"); + + /** + * GPU Compute + */ + disabledCounter("Avg Load-Store Instructions Per Cycle", Counter::Units::None, [=]() { + /* Number of average Load-Store instructions per cycle. */ + /* Countables: + * PERFCOUNTER_GROUP_SP::COUNTABLE_27 = PERF_SP_LM_LOAD_INSTRUCTIONS + * PERFCOUNTER_GROUP_SP::COUNTABLE_28 = PERF_SP_LM_STORE_INSTRUCTIONS + * PERFCOUNTER_GROUP_SP::COUNTABLE_30 = PERF_SP_GM_LOAD_INSTRUCTIONS + * PERFCOUNTER_GROUP_SP::COUNTABLE_31 = PERF_SP_GM_STORE_INSTRUCTIONS + * PERFCOUNTER_GROUP_SP::COUNTABLE_0 = PERF_SP_BUSY_CYCLES + * Notes: + * - FIXME: disabled due to lack of SP counter capacity + * - Equation: 4*sum(PERF_SP_{LM,GM}_{LOAD,STORE}_INSTRUCTIONS) / PERF_SP_BUSY_CYCLES + */ + return 42; + } + ); + counter("Bytes Data Actually Written", Counter::Units::Byte, [=]() { + /* Number of bytes requested to be written by the GPU. */ + /* Countables: + * PERFCOUNTER_GROUP_UCHE::COUNTABLE_18 = PERF_UCHE_EVICTS + * Notes: + * - Equation: PERF_UCHE_EVICTS * 64 + */ + return PERF_UCHE_EVICTS * 64; + } + ); + counter("Bytes Data Write Requested", Counter::Units::Byte, [=]() { + /* Number of bytes requested to be written by the GPU. 
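+       * (Worked example with a hypothetical delta: 1024 write requests in
+       * one sample period would report 1024 * 16 = 16 KiB requested.)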
*/ + /* Countables: + * PERFCOUNTER_GROUP_UCHE::COUNTABLE_15 = PERF_UCHE_WRITE_REQUESTS_SP + * Notes: + * - Equation: PERF_UCHE_WRITE_REQUESTS_SP * 16 + */ + return PERF_UCHE_WRITE_REQUESTS_SP * 16; + } + ); + counter("Global Buffer Data Read BW (Bytes/sec)", Counter::Units::Byte, [=]() { + /* Number of bytes of global buffer data read in by the GPU, per second from the system memory (when the data is not found in L2 cache). */ + /* Countables: + * PERFCOUNTER_GROUP_UCHE::COUNTABLE_8 = PERF_UCHE_VBIF_READ_BEATS_SP + * Notes: + * - Equation: (PERF_UCHE_VBIF_READ_BEATS_SP * 32) / time + */ + return (PERF_UCHE_VBIF_READ_BEATS_SP * 32) / time; + } + ); + counter("Global Buffer Data Read Request BW (Bytes/sec)", Counter::Units::Byte, [=]() { + /* Number of bytes of global buffer read requests, made by a compute kernel to the L2 cache, per second. */ + /* Countables: + * PERFCOUNTER_GROUP_UCHE::COUNTABLE_13 = PERF_UCHE_READ_REQUESTS_SP + * Notes: + * - Equation: (PERF_UCHE_READ_REQUESTS_SP * 16) / time + */ + return (PERF_UCHE_READ_REQUESTS_SP * 16) / time; + } + ); + counter("% Global Buffer Read L2 Hit", Counter::Units::Percent, [=]() { + /* Percentage of total global buffer read requests that were fulfilled by L2 cache hit which is populated by looking at the number of read requests that were forwarded to VBIF to read from the system memory. */ + /* Countables: + * PERFCOUNTER_GROUP_UCHE::COUNTABLE_8 = PERF_UCHE_VBIF_READ_BEATS_SP + * PERFCOUNTER_GROUP_UCHE::COUNTABLE_13 = PERF_UCHE_READ_REQUESTS_SP + * Notes: + * - Equation: (PERF_UCHE_READ_REQUESTS_SP - (PERF_UCHE_VBIF_READ_BEATS_SP / 2)) / PERF_UCHE_READ_REQUESTS_SP + */ + return percent(PERF_UCHE_READ_REQUESTS_SP - (PERF_UCHE_VBIF_READ_BEATS_SP / 2), PERF_UCHE_READ_REQUESTS_SP); + } + ); + counter("% Global Buffer Write L2 Hit", Counter::Units::Percent, [=]() { + /* Percentage of global write L2 Hit. */ + /* Countables: + * PERFCOUNTER_GROUP_UCHE::COUNTABLE_18 = PERF_UCHE_EVICTS + * PERFCOUNTER_GROUP_UCHE::COUNTABLE_15 = PERF_UCHE_WRITE_REQUESTS_SP + * Notes: + * - Equation: (PERF_UCHE_WRITE_REQUESTS_SP - PERF_UCHE_EVICTS) / PERF_UCHE_WRITE_REQUESTS_SP + */ + return percent(PERF_UCHE_WRITE_REQUESTS_SP - PERF_UCHE_EVICTS, PERF_UCHE_WRITE_REQUESTS_SP); + } + ); + counter("Global Image Compressed Data Read BW (Bytes/sec)", Counter::Units::Byte, [=]() { + /* Number of bytes of global Image data (compressed) read in by the GPU per second from the system memory (when the data is not found in L2 cache). */ + /* Countables: + * PERFCOUNTER_GROUP_CMP::COUNTABLE_7 = PERF_CMPDECMP_VBIF_READ_DATA + * Notes: + * - Equation: (PERF_CMPDECMP_VBIF_READ_DATA * 32) / time + */ + return (PERF_CMPDECMP_VBIF_READ_DATA * 32) / time; + } + ); + counter("Global Image Data Read Request BW (Bytes/sec)", Counter::Units::Byte, [=]() { + /* Number of bytes of image buffer read requests, made by a compute kernel to the L2 cache, per second. */ + /* Countables: + * PERFCOUNTER_GROUP_UCHE::COUNTABLE_9 = PERF_UCHE_READ_REQUESTS_TP + * Notes: + * - Equation: (PERF_UCHE_READ_REQUESTS_TP * 16) / time + */ + return (PERF_UCHE_READ_REQUESTS_TP * 16) / time; + } + ); + counter("Global Image Uncompressed Data Read BW (Bytes/sec)", Counter::Units::Byte, [=]() { + /* Number of bytes of global Image data (uncompressed) read in by the GPU per second from the system memory (when the data is not found in L2 cache). 
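+       * (Worked example with hypothetical numbers: 1000 read beats over a
+       * 0.5 s sample period reads as (1000 * 32) / 0.5 = 64000 bytes/sec.)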
*/ + /* Countables: + * PERFCOUNTER_GROUP_UCHE::COUNTABLE_4 = PERF_UCHE_VBIF_READ_BEATS_TP + * Notes: + * - Equation: (PERF_UCHE_VBIF_READ_BEATS_TP * 32) / time + */ + return (PERF_UCHE_VBIF_READ_BEATS_TP * 32) / time; + } + ); + disabledCounter("Global Memory Atomic Instructions", Counter::Units::None, [=]() { + /* Number of Global Memory Atomic Instructions executed by SP during a given sample period. */ + /* Countables: + * PERFCOUNTER_GROUP_SP::COUNTABLE_32 = PERF_SP_GM_ATOMICS + * Notes: + * - FIXME: disabled due to lack of SP counter capacity + * - Equation: PERF_SP_GM_ATOMICS * 4 + */ + return 42; + } + ); + disabledCounter("Global Memory Load Instructions", Counter::Units::None, [=]() { + /* Number of Global Memory Load Instructions executed by SP during a given sample period. */ + /* Countables: + * PERFCOUNTER_GROUP_SP::COUNTABLE_30 = PERF_SP_GM_LOAD_INSTRUCTIONS + * Notes: + * - FIXME: disabled due to lack of SP counter capacity + * - Equation: PERF_SP_GM_LOAD_INSTRUCTIONS * 4 + */ + return 42; + } + ); + disabledCounter("Global Memory Store Instructions", Counter::Units::None, [=]() { + /* Number of Global Memory Store Instructions executed by SP during a given sample period. */ + /* Countables: + * PERFCOUNTER_GROUP_SP::COUNTABLE_31 = PERF_SP_GM_STORE_INSTRUCTIONS + * Notes: + * - FIXME: disabled due to lack of SP counter capacity + * - Equation: PERF_SP_GM_STORE_INSTRUCTIONS * 4 + */ + return 42; + } + ); + counter("% Image Read L2 Hit", Counter::Units::Percent, [=]() { + /* Percentage of total image read requests that were fulfilled by L2 cache hit which is populated by looking at the number of read requests that were forwarded to VBIF to read from the system memory. */ + /* Countables: + * PERFCOUNTER_GROUP_UCHE::COUNTABLE_4 = PERF_UCHE_VBIF_READ_BEATS_TP + * PERFCOUNTER_GROUP_UCHE::COUNTABLE_9 = PERF_UCHE_READ_REQUESTS_TP + * Notes: + * - Equation: (PERF_UCHE_READ_REQUESTS_TP - (PERF_UCHE_VBIF_READ_BEATS_TP / 2)) / PERF_UCHE_READ_REQUESTS_TP + */ + return percent(PERF_UCHE_READ_REQUESTS_TP - (PERF_UCHE_VBIF_READ_BEATS_TP / 2), PERF_UCHE_READ_REQUESTS_TP); + } + ); + counter("% Kernel Load Cycles", Counter::Units::Percent, [=]() { + /* Percentage of cycles used for a compute kernel loading; excludes execution cycles. */ + /* Countables: + * PERFCOUNTER_GROUP_CP::COUNTABLE_0 = PERF_CP_ALWAYS_COUNT + * PERFCOUNTER_GROUP_SP::COUNTABLE_0 = PERF_SP_BUSY_CYCLES + * PERFCOUNTER_GROUP_RBBM::COUNTABLE_6 = PERF_RBBM_STATUS_MASKED + * Notes: + * - Equation: (PERF_RBBM_STATUS_MASKED - (PERF_SP_BUSY_CYCLES * #uSPTP)) / PERF_CP_ALWAYS_COUNT + */ + return percent(PERF_RBBM_US_STATUS_MASKED - (PERF_SP_BUSY_CYCLES * number_of_usptp), PERF_CP_ALWAYS_COUNT); + } + ); + counter("% L1 Hit", Counter::Units::Percent, [=]() { + /* Percentage of L1 texture cache requests that were hits. */ + /* Countables: + * PERFCOUNTER_GROUP_TP::COUNTABLE_6 = PERF_TP_L1_CACHELINE_REQUESTS + * PERFCOUNTER_GROUP_TP::COUNTABLE_7 = PERF_TP_L1_CACHELINE_MISSES + * Notes: + * - Equation: (PERF_TP_L1_CACHELINE_REQUESTS - PERF_TP_L1_CACHELINE_MISSES) / PERF_TP_L1_CACHELINE_REQUESTS + */ + return percent(PERF_TP_L1_CACHELINE_REQUESTS[BR] - PERF_TP_L1_CACHELINE_MISSES[BR], PERF_TP_L1_CACHELINE_REQUESTS[BR]); + } + ); + disabledCounter("Load-Store Utilization", Counter::Units::Percent, [=]() { + /* Percentage of the Load-Store unit is utilized compared to theoretical Load/Store throughput. 
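+       * (Were this enabled, the equation below would read roughly 50% for
+       * a kernel issuing a load/store instruction every other busy cycle.)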
*/ + /* Countables: + * PERFCOUNTER_GROUP_SP::COUNTABLE_63 = PERF_SP_LOAD_CONTROL_WORKING_CYCLES + * PERFCOUNTER_GROUP_SP::COUNTABLE_0 = PERF_SP_BUSY_CYCLES + * Notes: + * - FIXME: disabled due to lack of SP counter capacity + * - Equation: PERF_SP_LOAD_CONTROL_WORKING_CYCLES / PERF_SP_BUSY_CYCLES + */ + return 42; + } + ); + disabledCounter("Local Memory Atomic Instructions", Counter::Units::None, [=]() { + /* Number of Local Memory Atomic Instructions executed by SP during a given sample period. */ + /* Countables: + * PERFCOUNTER_GROUP_SP::COUNTABLE_29 = PERF_SP_LM_ATOMICS + * Notes: + * - FIXME: disabled due to lack of SP counter capacity + * - Equation: PERF_SP_LM_ATOMICS * 4 + */ + return 42; + } + ); + disabledCounter("Local Memory Load Instructions", Counter::Units::None, [=]() { + /* Number of Local Memory Load Instructions executed by SP during a given sample period. */ + /* Countables: + * PERFCOUNTER_GROUP_SP::COUNTABLE_27 = PERF_SP_LM_LOAD_INSTRUCTIONS + * Notes: + * - FIXME: disabled due to lack of SP counter capacity + * - Equation: PERF_SP_LM_LOAD_INSTRUCTIONS * 4 + */ + return 42; + } + ); + disabledCounter("Local Memory Store Instructions", Counter::Units::None, [=]() { + /* Number of Local Memory Store Instructions executed by SP during a given sample period. */ + /* Countables: + * PERFCOUNTER_GROUP_SP::COUNTABLE_28 = PERF_SP_LM_STORE_INSTRUCTIONS + * Notes: + * - FIXME: disabled due to lack of SP counter capacity + * - Equation: PERF_SP_LM_STORE_INSTRUCTIONS * 4 + */ + return 42; + } + ); + + /** + * GPU General + */ + disabledCounter("Clocks / Second", Counter::Units::None, [=]() { + /* Number of GPU clocks per second. */ + /* Countables: + * PERFCOUNTER_GROUP_CP::COUNTABLE_0 = PERF_CP_ALWAYS_COUNT + * Notes: + * - TODO: with Adaptive Clock Distribution, the measured values are much more varied + * than the constant GPU frequency value we currently get, so this counter is disabled + * for now in favor of the GPU Frequency counter below. + * - Equation: PERF_CP_ALWAYS_COUNT / time + */ + return 42; + } + ); + disabledCounter("GPU % Bus Busy", Counter::Units::Percent, [=]() { + /* Approximate Percentage of time the GPU's bus to system memory is busy. */ + /* Countables: + * PERFCOUNTER_GROUP_RBBM::COUNTABLE_6 = PERF_RBBM_STATUS_MASKED + * PERFCOUNTER_GROUP_UCHE::COUNTABLE_1 = PERF_UCHE_STALL_CYCLES_ARBITER + * PERFCOUNTER_GROUP_VBIF::COUNTABLE_34 = PERF_GBIF_AXI0_READ_DATA_BEATS_TOTAL + * PERFCOUNTER_GROUP_VBIF::COUNTABLE_35 = PERF_GBIF_AXI1_READ_DATA_BEATS_TOTAL + * PERFCOUNTER_GROUP_VBIF::COUNTABLE_46 = PERF_GBIF_AXI0_WRITE_DATA_BEATS_TOTAL + * PERFCOUNTER_GROUP_VBIF::COUNTABLE_47 = PERF_GBIF_AXI1_WRITE_DATA_BEATS_TOTAL + * Notes: + * - TODO: requires VBIF perfcounter group exposure which isn't trivial because of + * more complex way that those counters are enabled + * - Equation: (PERF_UCHE_STALL_CYCLES_ARBITER + sum(PERF_GBIF_AXI{0,1}_{READ,WRITE}_DATA_BEATS_TOTAL)) / (4 * PERF_RBBM_STATUS_MASKED) + */ + return 42; + } + ); + counter("GPU Frequency", Counter::Units::None, [=]() { + /* Notes: + * - TODO: Should read from (an equivalent of) /sys/class/kgsl/kgsl-3d0/gpuclk + * - Same value can be retrieved through PERF_CP_ALWAYS_COUNT, until ACD enables adaptive + * GPU frequencies that would be covered by the Clocks / Second counter above. 
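+       * - Worked example: a hypothetical 8.5e8 always-on ticks over a
+       *   1.0 s sample period reads back as 850 MHz.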
+ */ + return PERF_CP_ALWAYS_COUNT / time; + } + ); + disabledCounter("GPU Temperature", Counter::Units::None, [=]() { + /* TODO: Should read from (an equivalent of) /sys/class/kgsl/kgsl-3d0/temp */ + return 42; + } + ); + counter("GPU % Utilization", Counter::Units::Percent, [=]() { + /* Percentage utilization of the GPU. */ + /* Countables: + * PERFCOUNTER_GROUP_RBBM::COUNTABLE_6 = PERF_RBBM_STATUS_MASKED + */ + return percent(PERF_RBBM_US_STATUS_MASKED, max_freq); + } + ); + + /** + * GPU Memory Stats + */ + counter("Avg Bytes / Fragment", Counter::Units::Byte, [=]() { + /* Average number of bytes transferred from main memory for each fragment. */ + /* Countables: + * PERFCOUNTER_GROUP_UCHE::COUNTABLE_4 = PERF_UCHE_VBIF_READ_BEATS_TP + * PERFCOUNTER_GROUP_SP::COUNTABLE_101 = PERF_SP_PIXELS + */ + return safe_div(PERF_UCHE_VBIF_READ_BEATS_TP * 32, PERF_SP_PIXELS); + } + ); + counter("Avg Bytes / Vertex", Counter::Units::Byte, [=]() { + /* Average number of bytes transferred from main memory for each vertex. */ + /* Countables: + * PERFCOUNTER_GROUP_UCHE::COUNTABLE_5 = PERF_UCHE_VBIF_READ_BEATS_VFD + * PERFCOUNTER_GROUP_PC::COUNTABLE_25 = PERF_PC_VS_INVOCATIONS + * PERFCOUNTER_GROUP_BV_PC::COUNTABLE_25 = PERF_PC_VS_INVOCATIONS + */ + return safe_div(PERF_UCHE_VBIF_READ_BEATS_VFD * 32, cbSum(PERF_PC_S_VS_INVOCATIONS)); + } + ); + disabledCounter("Read Total (Bytes/sec)", Counter::Units::Byte, [=]() { + /* Total number of bytes read by the GPU from memory, per second. */ + /* Countables: + * PERFCOUNTER_GROUP_VBIF::COUNTABLE_34 = PERF_GBIF_AXI0_READ_DATA_BEATS_TOTAL + * PERFCOUNTER_GROUP_VBIF::COUNTABLE_35 = PERF_GBIF_AXI1_READ_DATA_BEATS_TOTAL + * Notes: + * - TODO: requires VBIF perfcounter group exposure which isn't trivial because of + * more complex way that those counters are enabled + * - Equation: (PERF_GBIF_AXI0_READ_DATA_BEATS_TOTAL + PERF_GBIF_AXI1_READ_DATA_BEATS_TOTAL) * 32 / time + */ + return 42; + } + ); + counter("SP Memory Read (Bytes/sec)", Counter::Units::Byte, [=]() { + /* Bytes of data read from memory by the Shader Processors, per second. */ + /* Countables: + * PERFCOUNTER_GROUP_UCHE::COUNTABLE_8 = PERF_UCHE_VBIF_READ_BEATS_SP + */ + return (PERF_UCHE_VBIF_READ_BEATS_SP * 32) / time; + } + ); + counter("Texture Memory Read BW (Bytes/sec)", Counter::Units::Byte, [=]() { + /* Bytes of texture data read from memory per second. */ + /* Countables: + * PERFCOUNTER_GROUP_UCHE::COUNTABLE_4 = PERF_UCHE_VBIF_READ_BEATS_TP + * PERFCOUNTER_GROUP_CMP::COUNTABLE_7 = PERF_CMPDECMP_VBIF_READ_DATA + */ + return ((PERF_UCHE_VBIF_READ_BEATS_TP + PERF_CMPDECMP_VBIF_READ_DATA) * 32) / time; + } + ); + counter("Vertex Memory Read (Bytes/sec)", Counter::Units::Byte, [=]() { + /* Bytes of vertex data read from memory per second. */ + /* Countables: + * PERFCOUNTER_GROUP_UCHE::COUNTABLE_5 = PERF_UCHE_VBIF_READ_BEATS_VFD + */ + return (PERF_UCHE_VBIF_READ_BEATS_VFD * 32) / time; + } + ); + disabledCounter("Write Total (Bytes/sec)", Counter::Units::Byte, [=]() { + /* Total number of bytes written by the GPU to memory, per second. 
*/ + /* Countables: + * PERFCOUNTER_GROUP_VBIF::COUNTABLE_46 = PERF_GBIF_AXI0_WRITE_DATA_BEATS_TOTAL + * PERFCOUNTER_GROUP_VBIF::COUNTABLE_47 = PERF_GBIF_AXI1_WRITE_DATA_BEATS_TOTAL + * Notes: + * - TODO: requires VBIF perfcounter group exposure which isn't trivial because of + * more complex way that those counters are enabled + * - Equation: (PERF_GBIF_AXI0_WRITE_DATA_BEATS_TOTAL + PERF_GBIF_AXI1_WRITE_DATA_BEATS_TOTAL) * 32 / time + */ + return 42; + } + ); + + /** + * GPU Preemption + */ + counter("Avg Preemption Delay", Counter::Units::None, [=]() { + /* Average time (us) from the preemption request to preemption start. */ + /* Countables: + * PERFCOUNTER_GROUP_CP::COUNTABLE_4 = PERF_CP_PREEMPTION_REACTION_DELAY + * PERFCOUNTER_GROUP_CP::COUNTABLE_3 = PERF_CP_NUM_PREEMPTIONS + * PERFCOUNTER_GROUP_CP::COUNTABLE_0 = PERF_CP_ALWAYS_COUNT + * Note: + * - PERF_CP_NUM_PREEMPTIONS has to be divided by 2 + */ + if (!PERF_CP_ALWAYS_COUNT || !PERF_CP_NUM_PREEMPTIONS) + return 0.0; + + double clocks_per_us = (double)PERF_CP_ALWAYS_COUNT / (time * 1000000); + double delay_us = PERF_CP_PREEMPTION_REACTION_DELAY / clocks_per_us; + return delay_us / ((double)PERF_CP_NUM_PREEMPTIONS / 2); + } + ); + counter("Preemptions / second", Counter::Units::None, [=]() { + /* The number of GPU preemptions that occurred, per second. */ + /* Countables: + * PERFCOUNTER_GROUP_CP::COUNTABLE_3 = PERF_CP_NUM_PREEMPTIONS + * Note: + * - PERF_CP_NUM_PREEMPTIONS has to be divided by 2 + */ + return PERF_CP_NUM_PREEMPTIONS / (2 * time); + } + ); + + /** + * GPU Primitive Processing + */ + counter("Average Polygon Area", Counter::Units::None, [=]() { + /* Average number of pixels per polygon. */ + /* Countables: + * PERFCOUNTER_GROUP_TSE::COUNTABLE_14 = PERF_TSE_OUTPUT_VISIBLE_PRIM + * PERFCOUNTER_GROUP_SP::COUNTABLE_101 = PERF_SP_PIXELS + */ + return safe_div(PERF_SP_PIXELS, PERF_TSE_BE_OUTPUT_VISIBLE_PRIM); + } + ); + counter("Average Vertices / Polygon", Counter::Units::None, [=]() { + /* Average number of vertices per polygon. */ + /* Countables: + * PERFCOUNTER_GROUP_PC::COUNTABLE_25 = PERF_PC_VS_INVOCATIONS + * PERFCOUNTER_GROUP_BV_PC::COUNTABLE_25 = PERF_PC_VS_INVOCATIONS + * PERFCOUNTER_GROUP_TSE::COUNTABLE_6 = PERF_TSE_INPUT_PRIM + */ + return safe_div(cbSum(PERF_PC_S_VS_INVOCATIONS), PERF_TSE_BE_INPUT_PRIM); + } + ); + counter("Pre-clipped Polygons / Second", Counter::Units::None, [=]() { + /* Number of polygons submitted to the GPU, per second, before any hardware clipping. */ + /* Countables: + * PERFCOUNTER_GROUP_TSE::COUNTABLE_6 = PERF_TSE_INPUT_PRIM + */ + return PERF_TSE_BE_INPUT_PRIM / time; + } + ); + counter("% Prims Clipped", Counter::Units::Percent, [=]() { + /* Percentage of primitives clipped by the GPU (where new primitives are generated). */ + /* Countables: + * PERFCOUNTER_GROUP_TSE::COUNTABLE_9 = PERF_TSE_CLIPPED_PRIM + * PERFCOUNTER_GROUP_TSE::COUNTABLE_6 = PERF_TSE_INPUT_PRIM + */ + return percent(PERF_TSE_BE_CLIPPED_PRIM, PERF_TSE_BE_INPUT_PRIM); + } + ); + counter("% Prims Trivially Rejected", Counter::Units::Percent, [=]() { + /* Percentage of primitives that are trivially rejected. */ + /* Countables: + * PERFCOUNTER_GROUP_TSE::COUNTABLE_8 = PERF_TSE_TRIVAL_REJ_PRIM + * PERFCOUNTER_GROUP_TSE::COUNTABLE_6 = PERF_TSE_INPUT_PRIM + */ + return percent(PERF_TSE_BE_TRIVAL_REJ_PRIM, PERF_TSE_BE_INPUT_PRIM); + } + ); + counter("Reused Vertices / Second", Counter::Units::None, [=]() { + /* Number of vertices used from the post-transform vertex buffer cache, per second. 
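+       * (e.g. a hypothetical 30000 post-transform cache hits over a 0.1 s
+       * sample period reads as 300000 reused vertices per second.)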
*/ + /* Countables: + * PERFCOUNTER_GROUP_PC::COUNTABLE_19 = PERF_PC_VERTEX_HITS + * PERFCOUNTER_GROUP_BV_PC::COUNTABLE_19 = PERF_PC_VERTEX_HITS + */ + return cbSum(PERF_PC_S_VERTEX_HITS) / time; + } + ); + + /** + * GPU Shader Processing + */ + counter("ALU / Fragment", Counter::Units::None, [=]() { + /* Average number of scalar fragment shader ALU instructions issued per shaded fragment, expressed as full precision ALUs (2 mediump = 1 fullp). */ + /* Countables: + * PERFCOUNTER_GROUP_SP::COUNTABLE_40 = PERF_SP_FS_STAGE_FULL_ALU_INSTRUCTIONS + * PERFCOUNTER_GROUP_SP::COUNTABLE_41 = PERF_SP_FS_STAGE_HALF_ALU_INSTRUCTIONS + * PERFCOUNTER_GROUP_SP::COUNTABLE_99 = PERF_SP_QUADS + * Notes: + * - PERF_SP_PIXELS is used instead of PERF_SP_QUADS to avoid SP counter group overcapacity. + * - PERF_SP_PIXELS ~ PERF_SP_QUADS * 4 + * - original equation uses unmultiplied QUADS as denominator, we use PIXELS ~ QUADS * 4 + * to match other per-fragment counters. + */ + return safe_div(PERF_SP_FS_STAGE_FULL_ALU_INSTRUCTIONS + PERF_SP_FS_STAGE_HALF_ALU_INSTRUCTIONS / 2, + PERF_SP_PIXELS); + } + ); + counter("ALU / Vertex", Counter::Units::None, [=]() { + /* Average number of vertex scalar shader ALU instructions issued per shaded vertex. */ + /* Countables: + * PERFCOUNTER_GROUP_PC::COUNTABLE_25 = PERF_PC_VS_INVOCATIONS + * PERFCOUNTER_GROUP_BV_PC::COUNTABLE_25 = PERF_PC_VS_INVOCATIONS + * PERFCOUNTER_GROUP_SP::COUNTABLE_35 = PERF_SP_VS_STAGE_FULL_ALU_INSTRUCTIONS + * PERFCOUNTER_GROUP_BV_SP::COUNTABLE_35 = PERF_SP_VS_STAGE_FULL_ALU_INSTRUCTIONS + * Notes: + * - Numerator has to be multiplied by four. + * - For some reason half-precision ALUs are not counted. + */ + return safe_div(4 * cbSum(PERF_SP_VS_STAGE_FULL_ALU_INSTRUCTIONS), cbSum(PERF_PC_S_VS_INVOCATIONS)); + } + ); + counter("% Anisotropic Filtered", Counter::Units::Percent, [=]() { + /* Percent of texels filtered using the 'Anisotropic' sampling method. */ + /* Countables: + * PERFCOUNTER_GROUP_TP::COUNTABLE_10 = PERF_TP_OUTPUT_PIXELS + * PERFCOUNTER_GROUP_BV_TP::COUNTABLE_10 = PERF_TP_OUTPUT_PIXELS + * PERFCOUNTER_GROUP_TP::COUNTABLE_28 = PERF_TP_OUTPUT_PIXELS_ANISO + * PERFCOUNTER_GROUP_BV_TP::COUNTABLE_28 = PERF_TP_OUTPUT_PIXELS_ANISO + */ + return safe_div(cbSum(PERF_TP_OUTPUT_PIXELS_ANISO), cbSum(PERF_TP_OUTPUT_PIXELS)); + } + ); + counter("Average BVH Fetch Latency Cycles", Counter::Units::None, [=]() { + /* The Average BVH Fetch Latency cycles is the latency counted from start of BVH query request till getting BVH Query result back. */ + /* Countables: + * PERFCOUNTER_GROUP_SP::COUNTABLE_139 = PERF_SP_RTU_BVH_FETCH_LATENCY_CYCLES + * PERFCOUNTER_GROUP_SP::COUNTABLE_140 = PERF_SP_RTU_BVH_FETCH_LATENCY_SAMPLES + * Notes: + * - TODO: provisional implementation, wasn't able to verify. + */ + return safe_div(PERF_SP_RTU_BVH_FETCH_LATENCY_CYCLES, PERF_SP_RTU_BVH_FETCH_LATENCY_SAMPLES); + } + ); + counter("EFU / Fragment", Counter::Units::None, [=]() { + /* Average number of scalar fragment shader EFU instructions issued per shaded fragment. */ + /* Countables: + * PERFCOUNTER_GROUP_SP::COUNTABLE_39 = PERF_SP_FS_STAGE_EFU_INSTRUCTIONS + * PERFCOUNTER_GROUP_SP::COUNTABLE_99 = PERF_SP_QUADS + * Notes: + * - PERF_SP_PIXELS is used instead of PERF_SP_QUADS to avoid SP counter group overcapacity. + * - PERF_SP_PIXELS ~ PERF_SP_QUADS * 4 + * - original equation uses unmultiplied QUADS as denominator, we use PIXELS ~ QUADS * 4 + * to match other per-fragment counters. 
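+       * - Worked example: hypothetical deltas of 2M EFU instructions and
+       *   8M pixels read as 0.25 EFU instructions per fragment.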
+ */ + return safe_div(PERF_SP_FS_STAGE_EFU_INSTRUCTIONS, PERF_SP_PIXELS); + } + ); + counter("EFU / Vertex", Counter::Units::None, [=]() { + /* Average number of scalar vertex shader EFU instructions issued per shaded vertex. */ + /* Countables: + * PERFCOUNTER_GROUP_PC::COUNTABLE_25 = PERF_PC_VS_INVOCATIONS + * PERFCOUNTER_GROUP_BV_PC::COUNTABLE_25 = PERF_PC_VS_INVOCATIONS + * PERFCOUNTER_GROUP_SP::COUNTABLE_34 = PERF_SP_VS_STAGE_EFU_INSTRUCTIONS + * PERFCOUNTER_GROUP_BV_SP::COUNTABLE_34 = PERF_SP_VS_STAGE_EFU_INSTRUCTIONS + * Notes: + * - Numerator has to be multiplied by four. + */ + return safe_div(4 * cbSum(PERF_SP_VS_STAGE_EFU_INSTRUCTIONS), cbSum(PERF_PC_S_VS_INVOCATIONS)); + } + ); + counter("Fragment ALU Instructions / Sec (Full)", Counter::Units::None, [=]() { + /* Total number of full precision fragment shader instructions issued, per second. */ + /* Countables: + * PERFCOUNTER_GROUP_SP::COUNTABLE_40 = PERF_SP_FS_STAGE_FULL_ALU_INSTRUCTIONS + * Notes: + * - Numerator has to be multiplied by four. + */ + return (PERF_SP_FS_STAGE_FULL_ALU_INSTRUCTIONS * 4) / time; + } + ); + counter("Fragment ALU Instructions / Sec (Half)", Counter::Units::None, [=]() { + /* Total number of half precision Scalar fragment shader instructions issued, per second. */ + /* Countables: + * PERFCOUNTER_GROUP_SP::COUNTABLE_41 = PERF_SP_FS_STAGE_HALF_ALU_INSTRUCTIONS + * Notes: + * - Numerator has to be multiplied by four. + */ + return (PERF_SP_FS_STAGE_HALF_ALU_INSTRUCTIONS * 4) / time; + } + ); + counter("Fragment EFU Instructions / Second", Counter::Units::None, [=]() { + /* Total number of Scalar fragment shader Elementary Function Unit (EFU) instructions issued, per second. */ + /* Countables: + * PERFCOUNTER_GROUP_SP::COUNTABLE_39 = PERF_SP_FS_STAGE_EFU_INSTRUCTIONS + * Notes: + * - Numerator has to be multiplied by four. + */ + return (PERF_SP_FS_STAGE_EFU_INSTRUCTIONS * 4) / time; + } + ); + counter("Fragment Instructions / Second", Counter::Units::None, [=]() { + /* Total number of fragment shader instructions issued, per second. */ + /* Countables: + * PERFCOUNTER_GROUP_SP::COUNTABLE_39 = PERF_SP_FS_STAGE_EFU_INSTRUCTIONS + * PERFCOUNTER_GROUP_SP::COUNTABLE_40 = PERF_SP_FS_STAGE_FULL_ALU_INSTRUCTIONS + * PERFCOUNTER_GROUP_SP::COUNTABLE_41 = PERF_SP_FS_STAGE_HALF_ALU_INSTRUCTIONS + * Notes: + * - Numerator has to be multiplied by four. + */ + return (4 * (PERF_SP_FS_STAGE_EFU_INSTRUCTIONS + PERF_SP_FS_STAGE_FULL_ALU_INSTRUCTIONS + + + PERF_SP_FS_STAGE_HALF_ALU_INSTRUCTIONS / 2)) / time; + } + ); + counter("Fragments Shaded / Second", Counter::Units::None, [=]() { + /* Number of fragments submitted to the shader engine, per second. */ + /* Countables: + * PERFCOUNTER_GROUP_SP::COUNTABLE_101 = PERF_SP_PIXELS + */ + return PERF_SP_PIXELS / time; + } + ); + counter("% Linear Filtered", Counter::Units::Percent, [=]() { + /* Percent of texels filtered using the 'Linear' sampling method. */ + /* Countables: + * PERFCOUNTER_GROUP_TP::COUNTABLE_10 = PERF_TP_OUTPUT_PIXELS + * PERFCOUNTER_GROUP_BV_TP::COUNTABLE_10 = PERF_TP_OUTPUT_PIXELS + * PERFCOUNTER_GROUP_TP::COUNTABLE_26 = PERF_TP_OUTPUT_PIXELS_BILINEAR + * PERFCOUNTER_GROUP_BV_TP::COUNTABLE_26 = PERF_TP_OUTPUT_PIXELS_BILINEAR + */ + return safe_div(cbSum(PERF_TP_OUTPUT_PIXELS_BILINEAR), cbSum(PERF_TP_OUTPUT_PIXELS)); + } + ); + counter("% Nearest Filtered", Counter::Units::Percent, [=]() { + /* Percent of texels filtered using the 'Nearest' sampling method. 
*/ + /* Countables: + * PERFCOUNTER_GROUP_TP::COUNTABLE_10 = PERF_TP_OUTPUT_PIXELS + * PERFCOUNTER_GROUP_BV_TP::COUNTABLE_10 = PERF_TP_OUTPUT_PIXELS + * PERFCOUNTER_GROUP_TP::COUNTABLE_25 = PERF_TP_OUTPUT_PIXELS_POINT + * PERFCOUNTER_GROUP_BV_TP::COUNTABLE_25 = PERF_TP_OUTPUT_PIXELS_POINT + */ + return safe_div(cbSum(PERF_TP_OUTPUT_PIXELS_POINT), cbSum(PERF_TP_OUTPUT_PIXELS)); + } + ); + disabledCounter("% Non-Base Level Textures", Counter::Units::Percent, [=]() { + /* Percent of texels coming from a non-base MIP level. */ + /* Countables: + * PERFCOUNTER_GROUP_TP::COUNTABLE_10 = PERF_TP_OUTPUT_PIXELS + * PERFCOUNTER_GROUP_BV_TP::COUNTABLE_10 = PERF_TP_OUTPUT_PIXELS + * PERFCOUNTER_GROUP_TP::COUNTABLE_29 = PERF_TP_OUTPUT_PIXELS_ZERO_LOD + * PERFCOUNTER_GROUP_BV_TP::COUNTABLE_29 = PERF_TP_OUTPUT_PIXELS_ZERO_LOD + * Notes: + * - FIXME: disabled due to lack of TP counter capacity + * - Equation: 100.0 - percent(cbSum(PERF_TP_OUTPUT_PIXELS_ZERO_LOD), cbSum(PERF_TP_OUTPUT_PIXELS)); + */ + return 42; + } + ); + counter("% RTU Busy", Counter::Units::Percent, [=]() { + /* Percentage of time that Ray Tracing Unit in SP is busy compared to whole SP. */ + /* Countables: + * PERFCOUNTER_GROUP_SP::COUNTABLE_125 = PERF_SP_RTU_BUSY_CYCLES + * PERFCOUNTER_GROUP_SP::COUNTABLE_0 = PERF_SP_BUSY_CYCLES + * Notes: + * - TODO: provisional implementation, wasn't able to verify. + */ + return percent(PERF_SP_RTU_BUSY_CYCLES, PERF_SP_BUSY_CYCLES); + } + ); + counter("RTU Ray Box Intersections Per Instruction", Counter::Units::None, [=]() { + /* Number of Ray Box intersections per instruction. */ + /* Countables: + * PERFCOUNTER_GROUP_SP::COUNTABLE_148 = PERF_SP_RTU_RAY_BOX_INTERSECTIONS + * PERFCOUNTER_GROUP_SP::COUNTABLE_122 = PERF_SP_RAY_QUERY_INSTRUCTIONS + * Notes: + * - TODO: provisional implementation, wasn't able to verify. + */ + return safe_div(PERF_SP_RTU_RAY_BOX_INTERSECTIONS, PERF_SP_RAY_QUERY_INSTRUCTIONS); + } + ); + counter("RTU Ray Triangle Intersections Per Instruction", Counter::Units::None, [=]() { + /* Number of Ray Triangle intersections per instruction. */ + /* Countables: + * PERFCOUNTER_GROUP_SP::COUNTABLE_149 = PERF_SP_RTU_RAY_TRIANGLE_INTERSECTIONS + * PERFCOUNTER_GROUP_SP::COUNTABLE_122 = PERF_SP_RAY_QUERY_INSTRUCTIONS + * Notes: + * - TODO: provisional implementation, wasn't able to verify. + */ + return safe_div(PERF_SP_RTU_RAY_TRIANGLE_INTERSECTIONS, PERF_SP_RAY_QUERY_INSTRUCTIONS); + } + ); + counter("% Shader ALU Capacity Utilized", Counter::Units::Percent, [=]() { + /* Percent of maximum shader capacity (ALU operations) utilized. */ + /* Countables: + * PERFCOUNTER_GROUP_SP::COUNTABLE_0 = PERF_SP_BUSY_CYCLES + * PERFCOUNTER_GROUP_SP::COUNTABLE_35 = PERF_SP_VS_STAGE_FULL_ALU_INSTRUCTIONS + * PERFCOUNTER_GROUP_BV_SP::COUNTABLE_35 = PERF_SP_VS_STAGE_FULL_ALU_INSTRUCTIONS + * PERFCOUNTER_GROUP_SP::COUNTABLE_40 = PERF_SP_FS_STAGE_FULL_ALU_INSTRUCTIONS + * PERFCOUNTER_GROUP_SP::COUNTABLE_41 = PERF_SP_FS_STAGE_HALF_ALU_INSTRUCTIONS + * Notes: + * - Numerator has to be multiplied by four. + */ + int64_t numerator = cbSum(PERF_SP_VS_STAGE_FULL_ALU_INSTRUCTIONS) + + PERF_SP_FS_STAGE_FULL_ALU_INSTRUCTIONS + PERF_SP_FS_STAGE_HALF_ALU_INSTRUCTIONS / 2; + int64_t denominator = PERF_SP_BUSY_CYCLES * number_of_alus_per_usptp; + return percent(numerator, denominator); + } + ); + counter("% Shaders Busy", Counter::Units::Percent, [=]() { + /* Percentage of time that all Shader cores are busy. 
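+       * (Worked example: hypothetical PERF_SP_BUSY_CYCLES = 1M against
+       * PERF_RBBM_US_STATUS_MASKED = 1M with two uSPTPs reads as 50%.)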
+       */
+      /* Countables:
+       * PERFCOUNTER_GROUP_SP::COUNTABLE_0 = PERF_SP_BUSY_CYCLES
+       * PERFCOUNTER_GROUP_TP::COUNTABLE_0 = PERF_TP_BUSY_CYCLES
+       * PERFCOUNTER_GROUP_RBBM::COUNTABLE_6 = PERF_RBBM_STATUS_MASKED
+       * Notes:
+       * - SP_BUSY_CYCLES seems to be used as the numerator -- unless it's zero,
+       *   at which point TP_BUSY_CYCLES seems to be used instead.
+       */
+
+      int64_t numerator = PERF_SP_BUSY_CYCLES;
+      if (!numerator)
+         numerator = PERF_TP_BUSY_CYCLES;
+      return percent(numerator, number_of_usptp * PERF_RBBM_US_STATUS_MASKED);
+   }
+   );
+   counter("% Shaders Stalled", Counter::Units::Percent, [=]() {
+      /* Percentage of time that all shader cores are idle with at least one active wave. */
+      /* Countables:
+       * PERFCOUNTER_GROUP_SP::COUNTABLE_7 = PERF_SP_NON_EXECUTION_CYCLES
+       * PERFCOUNTER_GROUP_RBBM::COUNTABLE_6 = PERF_RBBM_STATUS_MASKED
+       */
+      return percent(PERF_SP_NON_EXECUTION_CYCLES, number_of_usptp * PERF_RBBM_US_STATUS_MASKED);
+   }
+   );
+   counter("% Texture Pipes Busy", Counter::Units::Percent, [=]() {
+      /* Percentage of time that any texture pipe is busy. */
+      /* Countables:
+       * PERFCOUNTER_GROUP_TP::COUNTABLE_0 = PERF_TP_BUSY_CYCLES
+       * PERFCOUNTER_GROUP_RBBM::COUNTABLE_6 = PERF_RBBM_STATUS_MASKED
+       */
+      return percent(PERF_TP_BUSY_CYCLES, number_of_usptp * PERF_RBBM_US_STATUS_MASKED);
+   }
+   );
+   counter("Textures / Fragment", Counter::Units::None, [=]() {
+      /* Average number of textures referenced per fragment. */
+      /* Countables:
+       * PERFCOUNTER_GROUP_SP::COUNTABLE_33 = PERF_SP_VS_STAGE_TEX_INSTRUCTIONS
+       * PERFCOUNTER_GROUP_TP::COUNTABLE_10 = PERF_TP_OUTPUT_PIXELS
+       * PERFCOUNTER_GROUP_SP::COUNTABLE_101 = PERF_SP_PIXELS
+       */
+      return safe_div(PERF_TP_OUTPUT_PIXELS[BR], PERF_SP_PIXELS);
+   }
+   );
+   counter("Textures / Vertex", Counter::Units::None, [=]() {
+      /* Average number of textures referenced per vertex. */
+      /* Countables:
+       * PERFCOUNTER_GROUP_PC::COUNTABLE_25 = PERF_PC_VS_INVOCATIONS
+       * PERFCOUNTER_GROUP_BV_PC::COUNTABLE_25 = PERF_PC_VS_INVOCATIONS
+       * PERFCOUNTER_GROUP_SP::COUNTABLE_33 = PERF_SP_VS_STAGE_TEX_INSTRUCTIONS
+       * PERFCOUNTER_GROUP_BV_SP::COUNTABLE_33 = PERF_SP_VS_STAGE_TEX_INSTRUCTIONS
+       * Notes:
+       * - Numerator has to be multiplied by four.
+       */
+      return safe_div(4 * cbSum(PERF_SP_VS_STAGE_TEX_INSTRUCTIONS), cbSum(PERF_PC_S_VS_INVOCATIONS));
+   }
+   );
+   counter("% Time ALUs Working", Counter::Units::Percent, [=]() {
+      /* Percentage of time the ALUs are working while the Shaders are busy. */
+      /* Countables:
+       * PERFCOUNTER_GROUP_SP::COUNTABLE_0 = PERF_SP_BUSY_CYCLES
+       * PERFCOUNTER_GROUP_SP::COUNTABLE_1 = PERF_SP_ALU_WORKING_CYCLES
+       * Notes:
+       * - ALU working cycles have to be halved.
+       */
+      return percent(PERF_SP_ALU_WORKING_CYCLES / 2, PERF_SP_BUSY_CYCLES);
+   }
+   );
+   counter("% Time Compute", Counter::Units::Percent, [=]() {
+      /* Amount of time spent in compute work compared to the total time spent shading everything. */
+      /* Countables:
+       * PERFCOUNTER_GROUP_SP::COUNTABLE_74 = PERF_SP_ANY_EU_WORKING_FS_STAGE
+       * PERFCOUNTER_GROUP_SP::COUNTABLE_76 = PERF_SP_ANY_EU_WORKING_VS_STAGE
+       * PERFCOUNTER_GROUP_BV_SP::COUNTABLE_76 = PERF_SP_ANY_EU_WORKING_VS_STAGE
+       * PERFCOUNTER_GROUP_SP::COUNTABLE_78 = PERF_SP_ANY_EU_WORKING_CS_STAGE
+       * Notes:
+       * - CS_STAGE amount is also counted in FS_STAGE, so it shouldn't be
+       *   summed into the total value.
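+       * - Worked example: hypothetical FS_STAGE = 600 (of which compute
+       *   accounts for 200) plus summed VS_STAGE = 400 gives
+       *   percent(200, 1000) = 20% compute.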
+ */ + int64_t total = PERF_SP_ANY_EU_WORKING_FS_STAGE + + cbSum(PERF_SP_ANY_EU_WORKING_VS_STAGE); + return percent(PERF_SP_ANY_EU_WORKING_CS_STAGE, total); + } + ); + counter("% Time EFUs Working", Counter::Units::Percent, [=]() { + /* Percentage of time the EFUs are working while the Shaders are busy. */ + /* Countables: + * PERFCOUNTER_GROUP_SP::COUNTABLE_0 = PERF_SP_BUSY_CYCLES + * PERFCOUNTER_GROUP_SP::COUNTABLE_2 = PERF_SP_EFU_WORKING_CYCLES + */ + return percent(PERF_SP_EFU_WORKING_CYCLES, PERF_SP_BUSY_CYCLES); + } + ); + counter("% Time Shading Fragments", Counter::Units::Percent, [=]() { + /* Amount of time spent shading fragments compared to the total time spent shading everything. */ + /* Countables: + * PERFCOUNTER_GROUP_SP::COUNTABLE_74 = PERF_SP_ANY_EU_WORKING_FS_STAGE + * PERFCOUNTER_GROUP_SP::COUNTABLE_76 = PERF_SP_ANY_EU_WORKING_VS_STAGE + * PERFCOUNTER_GROUP_BV_SP::COUNTABLE_76 = PERF_SP_ANY_EU_WORKING_VS_STAGE + * PERFCOUNTER_GROUP_SP::COUNTABLE_78 = PERF_SP_ANY_EU_WORKING_CS_STAGE + * Notes: + * - CS_STAGE amount is also counted in FS_STAGE, so fragment time has to be retrieved + * through subtraction and the compute time shouldn't be summed into the total value. + */ + int64_t fragments = PERF_SP_ANY_EU_WORKING_FS_STAGE - PERF_SP_ANY_EU_WORKING_CS_STAGE; + int64_t total = PERF_SP_ANY_EU_WORKING_FS_STAGE + + cbSum(PERF_SP_ANY_EU_WORKING_VS_STAGE); + return percent(fragments, total); + } + ); + counter("% Time Shading Vertices", Counter::Units::Percent, [=]() { + /* Amount of time spent shading vertices compared to the total time spent shading everything. */ + /* Countables: + * PERFCOUNTER_GROUP_SP::COUNTABLE_74 = PERF_SP_ANY_EU_WORKING_FS_STAGE + * PERFCOUNTER_GROUP_BV_SP::COUNTABLE_74 = PERF_SP_ANY_EU_WORKING_FS_STAGE + * PERFCOUNTER_GROUP_SP::COUNTABLE_76 = PERF_SP_ANY_EU_WORKING_VS_STAGE + * PERFCOUNTER_GROUP_BV_SP::COUNTABLE_76 = PERF_SP_ANY_EU_WORKING_VS_STAGE + * Notes: + * - CS_STAGE amount is also counted in FS_STAGE, so it shouldn't be summed into the total value. + */ + int64_t total = PERF_SP_ANY_EU_WORKING_FS_STAGE + + cbSum(PERF_SP_ANY_EU_WORKING_VS_STAGE); + return percent(cbSum(PERF_SP_ANY_EU_WORKING_VS_STAGE), total); + } + ); + counter("Vertex Instructions / Second", Counter::Units::None, [=]() { + /* Total number of scalar vertex shader instructions issued, per second. */ + /* Countables: + * PERFCOUNTER_GROUP_SP::COUNTABLE_34 = PERF_SP_VS_STAGE_EFU_INSTRUCTIONS + * PERFCOUNTER_GROUP_SP::COUNTABLE_35 = PERF_SP_VS_STAGE_FULL_ALU_INSTRUCTIONS + * PERFCOUNTER_GROUP_BV_SP::COUNTABLE_34 = PERF_SP_VS_STAGE_EFU_INSTRUCTIONS + * PERFCOUNTER_GROUP_BV_SP::COUNTABLE_35 = PERF_SP_VS_STAGE_FULL_ALU_INSTRUCTIONS + * Notes: + - Numerator has to be multiplied by four. + */ + return (4 * (cbSum(PERF_SP_VS_STAGE_EFU_INSTRUCTIONS) + cbSum(PERF_SP_VS_STAGE_FULL_ALU_INSTRUCTIONS))) / time; + } + ); + counter("Vertices Shaded / Second", Counter::Units::None, [=]() { + /* Number of vertices submitted to the shader engine, per second. */ + /* Countables: + * PERFCOUNTER_GROUP_PC::COUNTABLE_25 = PERF_PC_VS_INVOCATIONS + * PERFCOUNTER_GROUP_BV_PC::COUNTABLE_25 = PERF_PC_VS_INVOCATIONS + */ + return cbSum(PERF_PC_S_VS_INVOCATIONS) / time; + } + ); + disabledCounter("% Wave Context Occupancy", Counter::Units::Percent, [=]() { + /* Average percentage of wave context occupancy per cycle. 
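+       * (With the 16 wave slots per SP noted below, a raw cycle-quotient
+       * of 8 would read as 50% occupancy.)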
*/ + /* Countables: + * PERFCOUNTER_GROUP_SP::COUNTABLE_8 = PERF_SP_WAVE_CONTEXTS + * PERFCOUNTER_GROUP_SP::COUNTABLE_9 = PERF_SP_WAVE_CONTEXT_CYCLES + * Note: + * - FIXME: disabled due to lack of SP counter capacity + * - the quotient has to be divided by the number of execution wave slots per SP (16 on a7xx) + * - Equation: (PERF_SP_WAVE_CONTEXTS / PERF_SP_WAVE_CONTEXT_CYCLES) / number_of_execution_wave_slots_per_sp; + */ + return 42; + } + ); + + /** + * GPU Stalls + */ + counter("% BVH Fetch Stall", Counter::Units::Percent, [=]() { + /* Percentage of clock cycles where the RTU could not make any more requests for BVH fetch from scheduler. */ + /* Countables: + * PERFCOUNTER_GROUP_SP::COUNTABLE_150 = PERF_SP_SCH_STALL_CYCLES_RTU + * PERFCOUNTER_GROUP_RBBM::COUNTABLE_6 = PERF_RBBM_STATUS_MASKED + * Notes: + * - TODO: provisional implementation, wasn't able to verify. + */ + return percent(PERF_SP_SCH_STALL_CYCLES_RTU, PERF_RBBM_US_STATUS_MASKED); + } + ); + counter("% Instruction Cache Miss", Counter::Units::Percent, [=]() { + /* Number of L1 instruction cache misses divided by L1 instruction cache requests. */ + /* Countables: + * PERFCOUNTER_GROUP_SP::COUNTABLE_51 = PERF_SP_ICL1_REQUESTS + * PERFCOUNTER_GROUP_SP::COUNTABLE_52 = PERF_SP_ICL1_MISSES + * PERFCOUNTER_GROUP_BV_SP::COUNTABLE_51 = PERF_SP_ICL1_REQUESTS + * PERFCOUNTER_GROUP_BV_SP::COUNTABLE_52 = PERF_SP_ICL1_MISSES + */ + return percent(cbSum(PERF_SP_ICL1_MISSES), cbSum(PERF_SP_ICL1_REQUESTS)); + } + ); + counter("L1 Texture Cache Miss Per Pixel", Counter::Units::None, [=]() { + /* Average number of Texture L1 cache misses per pixel. */ + /* Countables: + * PERFCOUNTER_GROUP_TP::COUNTABLE_7 = PERF_TP_L1_CACHELINE_MISSES + * PERFCOUNTER_GROUP_BV_TP::COUNTABLE_7 = PERF_TP_L1_CACHELINE_MISSES + * PERFCOUNTER_GROUP_SP::COUNTABLE_101 = PERF_SP_PIXELS + */ + return safe_div(cbSum(PERF_TP_L1_CACHELINE_MISSES), PERF_SP_PIXELS); + } + ); + counter("% Stalled On System Memory", Counter::Units::Percent, [=]() { + /* Percentage of cycles the L2 cache is stalled waiting for data from system memory. */ + /* Countables: + * PERFCOUNTER_GROUP_UCHE::COUNTABLE_1 = PERF_UCHE_STALL_CYCLES_ARBITER + * PERFCOUNTER_GROUP_RBBM::COUNTABLE_6 = PERF_RBBM_STATUS_MASKED + * Notes: + * - denominator has to be multiplied by four, for unknown reasons. + */ + return safe_div(PERF_UCHE_STALL_CYCLES_ARBITER, 4 * PERF_RBBM_US_STATUS_MASKED); + } + ); + counter("% Texture Fetch Stall", Counter::Units::Percent, [=]() { + /* Percentage of clock cycles where the shader processors cannot make any more requests for texture data. */ + /* Countables: + * PERFCOUNTER_GROUP_SP::COUNTABLE_4 = PERF_SP_STALL_CYCLES_TP + * PERFCOUNTER_GROUP_BV_SP::COUNTABLE_4 = PERF_SP_STALL_CYCLES_TP + * PERFCOUNTER_GROUP_RBBM::COUNTABLE_6 = PERF_RBBM_STATUS_MASKED + */ + return percent(cbSum(PERF_SP_STALL_CYCLES_TP), number_of_usptp * PERF_RBBM_US_STATUS_MASKED); + } + ); + counter("% Texture L1 Miss", Counter::Units::Percent, [=]() { + /* Number of L1 texture cache misses divided by L1 texture cache requests. 
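+       * (e.g. hypothetical deltas of 500 misses against 10000 requests,
+       * BR and BV summed, read as 5%.)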
*/ + /* Countables: + * PERFCOUNTER_GROUP_TP::COUNTABLE_6 = PERF_TP_L1_CACHELINE_REQUESTS + * PERFCOUNTER_GROUP_TP::COUNTABLE_7 = PERF_TP_L1_CACHELINE_MISSES + * PERFCOUNTER_GROUP_BV_TP::COUNTABLE_6 = PERF_TP_L1_CACHELINE_REQUESTS + * PERFCOUNTER_GROUP_BV_TP::COUNTABLE_7 = PERF_TP_L1_CACHELINE_MISSES + */ + return percent(cbSum(PERF_TP_L1_CACHELINE_MISSES), cbSum(PERF_TP_L1_CACHELINE_REQUESTS)); + } + ); + counter("% Texture L2 Miss", Counter::Units::Percent, [=]() { + /* Number of L2 texture cache misses divided by L2 texture cache requests. */ + /* Countables: + * PERFCOUNTER_GROUP_UCHE::COUNTABLE_4 = PERF_UCHE_VBIF_READ_BEATS_TP + * PERFCOUNTER_GROUP_UCHE::COUNTABLE_9 = PERF_UCHE_READ_REQUESTS_TP + * Notes: + * - ratio has to be multiplied by two. Unsure how this constant comes up. + */ + return percent(2 * PERF_UCHE_VBIF_READ_BEATS_TP, PERF_UCHE_READ_REQUESTS_TP); + } + ); + counter("% Vertex Fetch Stall", Counter::Units::Percent, [=]() { + /* Percentage of clock cycles where the GPU cannot make any more requests for vertex data. */ + /* Countables: + * PERFCOUNTER_GROUP_PC::COUNTABLE_2 = PERF_PC_STALL_CYCLES_VFD + * PERFCOUNTER_GROUP_BV_PC::COUNTABLE_2 = PERF_PC_STALL_CYCLES_VFD + * PERFCOUNTER_GROUP_RBBM::COUNTABLE_6 = PERF_RBBM_STATUS_MASKED + */ + return percent(cbSum(PERF_PC_S_STALL_CYCLES_VFD), PERF_RBBM_US_STATUS_MASKED); + } + ); + + counter("% LRZ Pixel Killed", Counter::Units::Percent, [=]() { + return percent(PERF_LRZ_TOTAL_PIXEL - PERF_LRZ_VISIBLE_PIXEL_AFTER_LRZ, + PERF_LRZ_TOTAL_PIXEL); + }); + + counter("LRZ Primitives Killed", Counter::Units::None, [=]() { + return PERF_LRZ_PRIM_KILLED_BY_LRZ; + }); + + counter("LRZ Tiles Killed", Counter::Units::None, [=]() { + return PERF_LRZ_TILE_KILLED; + }); +} + +} // namespace pps diff --git a/src/freedreno/ds/fd_pps_driver.cc b/src/freedreno/ds/fd_pps_driver.cc index ad201d8f00c..8d73813ae41 100644 --- a/src/freedreno/ds/fd_pps_driver.cc +++ b/src/freedreno/ds/fd_pps_driver.cc @@ -7,9 +7,16 @@ #include #include -#include +#include +#include +#include + +#include + +#include "common/freedreno_common.h" #include "common/freedreno_dev_info.h" +#include "drm-uapi/msm_drm.h" #include "drm/freedreno_drmif.h" #include "drm/freedreno_ringbuffer.h" #include "perfcntrs/freedreno_dt.h" @@ -46,6 +53,8 @@ FreedrenoDriver::configure_counters(bool reset, bool wait) (enum fd_ringbuffer_flags)(FD_RINGBUFFER_PRIMARY | FD_RINGBUFFER_GROWABLE); struct fd_ringbuffer *ring = fd_submit_new_ringbuffer(submit, 0x1000, flags); + assert(io); /* This is legacy path only */ + for (const auto &countable : countables) countable.configure(ring, reset); @@ -67,12 +76,85 @@ FreedrenoDriver::configure_counters(bool reset, bool wait) void FreedrenoDriver::collect_countables() { + assert(io); /* This is legacy path only */ + last_dump_ts = gpu_timestamp(); for (const auto &countable : countables) countable.collect(); } +int +FreedrenoDriver::configure_counters_stream() +{ + if (perfcntr_stream_fd >= 0) { + close(perfcntr_stream_fd); + perfcntr_stream_fd = -1; + } + + unsigned sample_size = sizeof(uint64_t) * (2 + countables.size()); + unsigned bufsz = 2 * sample_size; + unsigned bufsz_shift = ffs(util_next_power_of_two(bufsz)) - 1; + + struct drm_msm_perfcntr_group groups[num_perfcntrs]; + memset(groups, 0, sizeof(groups)); + + struct drm_msm_perfcntr_config req = { + .flags = MSM_PERFCNTR_STREAM, + .groups = VOID2U64(groups), + .period = sampling_period_ns_, + .bufsz_shift = bufsz_shift, + .group_stride = sizeof(struct drm_msm_perfcntr_group), + }; + + 
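+   /* sampling_period_ns_ defaults to 1 s (see fd_pps_driver.h) until
+    * enable_perfcnt() supplies the profiler's requested period: */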
assert(req.period); + + for (const auto &countable : countables) + countable.configure_stream(&req); + + /* Now that the groups are fully populated, resolve the sample indices: */ + for (const auto &countable : countables) + countable.resolve_sample_idx(&req); + + int fd = drmIoctl(fd_device_fd(dev), DRM_IOCTL_MSM_PERFCNTR_CONFIG, &req); + if (fd < 0) + return fd; + + sample_buf = malloc(sample_size); + + perfcntr_stream_fd = fd; + + /* Unlike the legacy path, the kernel handles reconfiguring counters + * after power collapse for us, so we won't need to configure the + * stream again. So cleanup allocated memory now: + */ + for (unsigned i = 0; i < num_perfcntrs; i++) { + if (!groups[i].countables) + break; + free(U642VOID(groups[i].countables)); + } + + return 0; +} + +static bool +perfcntr_stream_ready(int perfcntr_stream_fd) +{ + struct pollfd pfd; + + pfd.fd = perfcntr_stream_fd; + pfd.events = POLLIN; + pfd.revents = 0; + + if (poll(&pfd, 1, 0) < 0) + return false; + + if (!(pfd.revents & POLLIN)) + return false; + + return true; +} + static uint64_t ticks_to_ns(uint64_t ticks) { @@ -82,6 +164,61 @@ ticks_to_ns(uint64_t ticks) return ticks / GPU_TICKS_PER_NS; } +bool +FreedrenoDriver::collect_countables_stream() +{ + unsigned nsamples = 0; + bool discontinuity = false; + + assert(perfcntr_stream_fd >= 0); + + while (perfcntr_stream_ready(perfcntr_stream_fd)) { + unsigned sample_size = sizeof(uint64_t) * (2 + countables.size()); + size_t sz = sample_size; + void *ptr = sample_buf; + + while (sz > 0) { + ssize_t ret = read(perfcntr_stream_fd, ptr, sz); + + if (ret < 0) + ret = -errno; + + if (ret == -EINTR || ret == -EAGAIN) + continue; + + if (ret < 0) + errx(ret, "read failed"); + + sz -= ret; + ptr = static_cast(ptr) + ret; + } + + uint64_t *buf = (uint64_t *)sample_buf; + uint64_t ts = buf[0]; + uint32_t seqno = buf[1] & 0xffffffff; + + discontinuity = seqno == 0; + + /* Capture the timestamp from the *start* of the sampling period: */ + last_capture_ts = last_dump_ts; + last_dump_ts = ts; + + auto elapsed_time_ns = ticks_to_ns(last_dump_ts - last_capture_ts); + + time = (float)elapsed_time_ns / 1000000000.0; + + /* advance past header: */ + buf += 2; + + for (const auto &countable : countables) + countable.collect_stream(buf); + + nsamples++; + } + + return (nsamples > 0) && !discontinuity; +} + bool FreedrenoDriver::init_perfcnt() { @@ -107,9 +244,7 @@ FreedrenoDriver::init_perfcnt() has_suspend_count = true; } - fd_pipe_set_param(pipe, FD_SYSPROF, 1); - - perfcntrs = fd_perfcntrs(fd_pipe_dev_id(pipe), &num_perfcntrs); + perfcntrs = fd_perfcntrs(dev_id, &num_perfcntrs); if (num_perfcntrs == 0) { PERFETTO_FATAL("No hw counters available"); return false; @@ -127,6 +262,9 @@ FreedrenoDriver::init_perfcnt() case 7: setup_a7xx_counters(); break; + case 8: + setup_a8xx_counters(); + break; default: PERFETTO_FATAL("Unsupported GPU: a%03u", fd_dev_gpu_id(dev_id)); return false; @@ -137,12 +275,20 @@ FreedrenoDriver::init_perfcnt() for (const auto &countable : countables) countable.resolve(); + if (!configure_counters_stream()) { + close(perfcntr_stream_fd); + perfcntr_stream_fd = -1; + return true; + } + io = fd_dt_find_io(); if (!io) { PERFETTO_FATAL("Could not map GPU I/O space"); return false; } + fd_pipe_set_param(pipe, FD_SYSPROF, 1); + configure_counters(true, true); collect_countables(); @@ -165,14 +311,26 @@ FreedrenoDriver::enable_all_counters() } void -FreedrenoDriver::enable_perfcnt(const uint64_t /* sampling_period_ns */) +FreedrenoDriver::enable_perfcnt(const uint64_t 
sampling_period_ns)
 {
+   sampling_period_ns_ = sampling_period_ns;
+
+   if (!io) {
+      /* reconfigure counter stream: */
+      configure_counters_stream();
+      collect_countables_stream();
+   }
 }
 
 bool
 FreedrenoDriver::dump_perfcnt()
 {
-   if (has_suspend_count) {
+   /* Note, when using the perfcntr stream instead of mmio-based counter
+    * reads, we can skip this (since the seqno in the data read from
+    * the stream will tell us if there is a discontinuity, and the
+    * kernel will handle reconfiguring counters on resume)
+    */
+   if (has_suspend_count && io) {
       uint64_t val;
 
       fd_pipe_get_param(pipe, FD_SUSPEND_COUNT, &val);
@@ -193,6 +351,9 @@ FreedrenoDriver::dump_perfcnt()
       }
    }
 
+   if (!io)
+      return collect_countables_stream();
+
    auto last_ts = last_dump_ts;
 
    /* Capture the timestamp from the *start* of the sampling period: */
@@ -223,11 +384,13 @@ uint64_t FreedrenoDriver::next()
    return ret;
 }
 
-void FreedrenoDriver::disable_perfcnt()
+void
+FreedrenoDriver::disable_perfcnt()
 {
-   /* There isn't really any disable, only reconfiguring which countables
-    * get muxed to which counters
-    */
+   if (perfcntr_stream_fd >= 0) {
+      close(perfcntr_stream_fd);
+      perfcntr_stream_fd = -1;
+   }
 }
 
 /*
@@ -278,6 +441,80 @@ FreedrenoDriver::Countable::configure(struct fd_ringbuffer *ring, bool reset) co
    }
 }
 
+void
+FreedrenoDriver::Countable::configure_stream(struct drm_msm_perfcntr_config *req) const
+{
+   const struct fd_perfcntr_countable *countable = d->state[id].countable;
+   struct drm_msm_perfcntr_group *groups =
+      (struct drm_msm_perfcntr_group *)U642VOID(req->groups);
+
+   /* Find group: */
+   struct drm_msm_perfcntr_group *g = NULL;
+
+   for (unsigned i = 0; i < req->nr_groups; i++) {
+      if (!strcmp(groups[i].group_name, group.c_str())) {
+         g = &groups[i];
+         break;
+      }
+   }
+
+   /* If not found, append a new group: */
+   if (!g) {
+      g = &groups[req->nr_groups++];
+      strcpy(g->group_name, group.c_str());
+
+      /* allocate countables for max # of counters in the group */
+      for (unsigned i = 0; i < d->num_perfcntrs; i++) {
+         if (!strcmp(d->perfcntrs[i].name, group.c_str())) {
+            void *countables = calloc(sizeof(uint32_t), d->perfcntrs[i].num_counters);
+            g->countables = VOID2U64(countables);
+            break;
+         }
+      }
+
+      assert(g->countables);
+   }
+
+   /* Initially, just store the index within the group, since earlier groups
+    * are not yet fully populated (ie.
we don't yet know the offset of the + * first sample in the group) + */ + d->state[id].idx = g->nr_countables; + + /* And last, append the countable: */ + uint32_t *countables = (uint32_t *)U642VOID(g->countables); + countables[g->nr_countables++] = countable->selector; +} + +static unsigned +find_group_offset(const struct drm_msm_perfcntr_config *req, const char *group) +{ + struct drm_msm_perfcntr_group *groups = + (struct drm_msm_perfcntr_group *)U642VOID(req->groups); + unsigned off = 0; + + for (unsigned i = 0; i < req->nr_groups; i++) { + if (!strcmp(groups[i].group_name, group)) + break; + off += groups[i].nr_countables; + } + + return off; +} + +void +FreedrenoDriver::Countable::resolve_sample_idx(const struct drm_msm_perfcntr_config *req) const +{ + d->state[id].idx += find_group_offset(req, group.c_str()); +} + +void +FreedrenoDriver::Countable::collect_stream(const uint64_t *buf) const +{ + d->state[id].last_value = d->state[id].value; + d->state[id].value = buf[d->state[id].idx]; +} + /* Collect current counter value and calculate delta since last sample: */ void FreedrenoDriver::Countable::collect() const @@ -302,11 +539,10 @@ FreedrenoDriver::Countable::resolve() const if (group != g->name) continue; - for (unsigned j = 0; j < g->num_countables; j++) { - const struct fd_perfcntr_countable *c = &g->countables[j]; - if (name != c->name) - continue; + const struct fd_perfcntr_countable *c = + fd_perfcntrs_countable(g, name.c_str()); + if (c) { d->state[id].countable = c; /* Assign counters from high to low to reduce conflicts with UMD-owned diff --git a/src/freedreno/ds/fd_pps_driver.h b/src/freedreno/ds/fd_pps_driver.h index 81395714581..b552c5fbda3 100644 --- a/src/freedreno/ds/fd_pps_driver.h +++ b/src/freedreno/ds/fd_pps_driver.h @@ -6,6 +6,7 @@ #pragma once #include "pps/pps_driver.h" +#include "drm-uapi/msm_drm.h" extern "C" { struct fd_dev_id; @@ -54,10 +55,26 @@ private: const struct fd_dev_info *info; /** - * The memory mapped i/o space for counter readback: + * The memory mapped i/o space for counter readback (legacy): */ void *io; + /** + * perfcntr stream fd, if not using memory mapped i/o for counter + * readback. 
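+    * Stays at -1 when the stream could not be configured, in which case
+    * init_perfcnt() falls back to the legacy mmio path.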
+ */ + int perfcntr_stream_fd = -1; + + /** + * The configured sampling period + */ + uint64_t sampling_period_ns_ = 1000000000; + + /** + * Buffer used to read samples + */ + void *sample_buf; + const struct fd_perfcntr_group *perfcntrs; unsigned num_perfcntrs; @@ -75,10 +92,14 @@ private: void setup_a6xx_counters(); void setup_a7xx_counters(); + void setup_a8xx_counters(); void configure_counters(bool reset, bool wait); void collect_countables(); + int configure_counters_stream(); + bool collect_countables_stream(); + /** * Split out countable mutable state from the class so that copy- * constructor does something sane when lambda derive function @@ -88,6 +109,9 @@ private: uint64_t last_value, value; const struct fd_perfcntr_countable *countable; const struct fd_perfcntr_counter *counter; + + /* index into perfcntr stream sample buf: */ + unsigned idx; }; std::vector state; @@ -115,6 +139,11 @@ private: void collect() const; void resolve() const; + /* perfcntr stream related APIs */ + void configure_stream(struct drm_msm_perfcntr_config *req) const; + void resolve_sample_idx(const struct drm_msm_perfcntr_config *req) const; + void collect_stream(const uint64_t *buf) const; + private: uint64_t get_value() const; diff --git a/src/freedreno/ds/meson.build b/src/freedreno/ds/meson.build index f569311c4a9..13211f0e27c 100644 --- a/src/freedreno/ds/meson.build +++ b/src/freedreno/ds/meson.build @@ -7,6 +7,7 @@ pps_freedreno_lib = static_library( sources: [ 'fd_pps_a6xx.cc', 'fd_pps_a7xx.cc', + 'fd_pps_a8xx.cc', 'fd_pps_driver.cc', 'fd_pps_driver.h', freedreno_xml_header_files, diff --git a/src/freedreno/perfcntrs/dumpctrs.c b/src/freedreno/perfcntrs/dumpctrs.c new file mode 100644 index 00000000000..62b9fe4e1fe --- /dev/null +++ b/src/freedreno/perfcntrs/dumpctrs.c @@ -0,0 +1,82 @@ +/* + * Copyright © 2016 Rob Clark + * All Rights Reserved. 
+ * SPDX-License-Identifier: MIT + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "drm/freedreno_drmif.h" +#include "drm/freedreno_ringbuffer.h" + +#include "util/os_file.h" + +#include "freedreno_dt.h" +#include "freedreno_perfcntr.h" + +/* + * Simple tool to dump perfctr tables (so we can make sure nothing gets + * missed while converting to generated tables) + */ + +int +main(int argc, char **argv) +{ + struct fd_dev_id dev_id = {}; + unsigned ngroups = 0; + const struct fd_perfcntr_group *groups; + + if (argc != 2) + return -1; + + if (!strcmp(argv[1], "a2xx")) { + dev_id.gpu_id = 200; + } else if (!strcmp(argv[1], "a5xx")) { + dev_id.gpu_id = 530; + } else if (!strcmp(argv[1], "a6xx")) { + dev_id.gpu_id = 630; + } else if (!strcmp(argv[1], "a7xx")) { + dev_id.chip_id = 0xffff07030001; + } + + groups = fd_perfcntrs(&dev_id, &ngroups); + if (!groups) { + errx(1, "no perfcntr support"); + } + + for (int i = 0; i < ngroups; i++) { + const struct fd_perfcntr_group *g = &groups[i]; + printf("GROUP[%s]: num_counters=%u, num_countables=%u\n", + g->name, g->num_counters, g->num_countables); + + for (int j = 0; j < g->num_counters; j++) { + const struct fd_perfcntr_counter *counter = &g->counters[j]; + printf("COUNTER: %04x, %04x, %04x, %04x, %04x\n", + counter->select_reg, counter->counter_reg_lo, counter->counter_reg_hi, + counter->enable, counter->clear); + } + + for (int j = 0; j < g->num_countables; j++) { + const struct fd_perfcntr_countable *countable = &g->countables[j]; + printf("COUNTABLE[%s]: %04x\n", countable->name, countable->selector); + } + + printf("\n"); + } + + + return 0; +} diff --git a/src/freedreno/perfcntrs/fd7_perfcntr.c b/src/freedreno/perfcntrs/fd7_perfcntr.c index 60db54dc7fb..6724b539398 100644 --- a/src/freedreno/perfcntrs/fd7_perfcntr.c +++ b/src/freedreno/perfcntrs/fd7_perfcntr.c @@ -97,107 +97,104 @@ enum { static_assert(DERIVED_COUNTER_PERFCNTR_MAX_VALUE <= FD_DERIVED_COUNTER_COLLECTION_MAX_ENABLED_PERFCNTRS, ""); -#define DERIVED_COUNTER_PERFCNTR(_enum, _counter) \ - [DERIVED_COUNTER_PERFCNTR_##_enum] = { .counter = _counter, .countable = A7XX_PERF_##_enum } -#define DERIVED_COUNTER_PERFCNTR_BV(_enum, _counter) \ - [DERIVED_COUNTER_PERFCNTR_BV_##_enum] = { .counter = _counter, .countable = A7XX_PERF_##_enum } +#define DERIVED_COUNTER_PERFCNTR(_countable, _group) \ + [DERIVED_COUNTER_PERFCNTR_##_countable] = { .countable = "PERF_" #_countable, .group = #_group } +#define DERIVED_COUNTER_PERFCNTR_BV(_countable, _group) \ + [DERIVED_COUNTER_PERFCNTR_BV_##_countable] = { .countable = "PERF_" #_countable, .group = "BV_" #_group } -static const struct { - const struct fd_perfcntr_counter *counter; - unsigned countable; -} a7xx_derived_counter_perfcntrs[] = { +const struct fd_derived_counter_perfcntr a7xx_derived_counter_perfcntrs[] = { /* CP: 3/14 counters */ - DERIVED_COUNTER_PERFCNTR(CP_ALWAYS_COUNT, &cp_counters[0]), - DERIVED_COUNTER_PERFCNTR(CP_NUM_PREEMPTIONS, &cp_counters[1]), - DERIVED_COUNTER_PERFCNTR(CP_PREEMPTION_REACTION_DELAY, &cp_counters[2]), + DERIVED_COUNTER_PERFCNTR(CP_ALWAYS_COUNT, CP), + DERIVED_COUNTER_PERFCNTR(CP_NUM_PREEMPTIONS, CP), + DERIVED_COUNTER_PERFCNTR(CP_PREEMPTION_REACTION_DELAY, CP), /* RBBM: 1/4 counters */ - DERIVED_COUNTER_PERFCNTR(RBBM_STATUS_MASKED, &rbbm_counters[0]), + DERIVED_COUNTER_PERFCNTR(RBBM_STATUS_MASKED, RBBM), /* PC: 3/8 counters */ - DERIVED_COUNTER_PERFCNTR(PC_STALL_CYCLES_VFD, &pc_counters[0]), - 
DERIVED_COUNTER_PERFCNTR(PC_VERTEX_HITS, &pc_counters[1]), - DERIVED_COUNTER_PERFCNTR(PC_VS_INVOCATIONS, &pc_counters[2]), + DERIVED_COUNTER_PERFCNTR(PC_STALL_CYCLES_VFD, PC), + DERIVED_COUNTER_PERFCNTR(PC_VERTEX_HITS, PC), + DERIVED_COUNTER_PERFCNTR(PC_VS_INVOCATIONS, PC), /* TSE: 4/4 counters */ - DERIVED_COUNTER_PERFCNTR(TSE_INPUT_PRIM, &tse_counters[0]), - DERIVED_COUNTER_PERFCNTR(TSE_TRIVAL_REJ_PRIM, &tse_counters[1]), - DERIVED_COUNTER_PERFCNTR(TSE_CLIPPED_PRIM, &tse_counters[2]), - DERIVED_COUNTER_PERFCNTR(TSE_OUTPUT_VISIBLE_PRIM, &tse_counters[3]), + DERIVED_COUNTER_PERFCNTR(TSE_INPUT_PRIM, TSE), + DERIVED_COUNTER_PERFCNTR(TSE_TRIVAL_REJ_PRIM, TSE), + DERIVED_COUNTER_PERFCNTR(TSE_CLIPPED_PRIM, TSE), + DERIVED_COUNTER_PERFCNTR(TSE_OUTPUT_VISIBLE_PRIM, TSE), /* UCHE: 5/12 counters */ - DERIVED_COUNTER_PERFCNTR(UCHE_STALL_CYCLES_ARBITER, &uche_counters[0]), - DERIVED_COUNTER_PERFCNTR(UCHE_VBIF_READ_BEATS_TP, &uche_counters[1]), - DERIVED_COUNTER_PERFCNTR(UCHE_VBIF_READ_BEATS_VFD, &uche_counters[2]), - DERIVED_COUNTER_PERFCNTR(UCHE_VBIF_READ_BEATS_SP, &uche_counters[3]), - DERIVED_COUNTER_PERFCNTR(UCHE_READ_REQUESTS_TP, &uche_counters[4]), + DERIVED_COUNTER_PERFCNTR(UCHE_STALL_CYCLES_ARBITER, UCHE), + DERIVED_COUNTER_PERFCNTR(UCHE_VBIF_READ_BEATS_TP, UCHE), + DERIVED_COUNTER_PERFCNTR(UCHE_VBIF_READ_BEATS_VFD, UCHE), + DERIVED_COUNTER_PERFCNTR(UCHE_VBIF_READ_BEATS_SP, UCHE), + DERIVED_COUNTER_PERFCNTR(UCHE_READ_REQUESTS_TP, UCHE), /* TP: 7/12 counters */ - DERIVED_COUNTER_PERFCNTR(TP_BUSY_CYCLES, &tp_counters[0]), - DERIVED_COUNTER_PERFCNTR(TP_L1_CACHELINE_REQUESTS, &tp_counters[1]), - DERIVED_COUNTER_PERFCNTR(TP_L1_CACHELINE_MISSES, &tp_counters[2]), - DERIVED_COUNTER_PERFCNTR(TP_OUTPUT_PIXELS, &tp_counters[3]), - DERIVED_COUNTER_PERFCNTR(TP_OUTPUT_PIXELS_POINT, &tp_counters[4]), - DERIVED_COUNTER_PERFCNTR(TP_OUTPUT_PIXELS_BILINEAR, &tp_counters[5]), - DERIVED_COUNTER_PERFCNTR(TP_OUTPUT_PIXELS_ANISO, &tp_counters[6]), + DERIVED_COUNTER_PERFCNTR(TP_BUSY_CYCLES, TP), + DERIVED_COUNTER_PERFCNTR(TP_L1_CACHELINE_REQUESTS, TP), + DERIVED_COUNTER_PERFCNTR(TP_L1_CACHELINE_MISSES, TP), + DERIVED_COUNTER_PERFCNTR(TP_OUTPUT_PIXELS, TP), + DERIVED_COUNTER_PERFCNTR(TP_OUTPUT_PIXELS_POINT, TP), + DERIVED_COUNTER_PERFCNTR(TP_OUTPUT_PIXELS_BILINEAR, TP), + DERIVED_COUNTER_PERFCNTR(TP_OUTPUT_PIXELS_ANISO, TP), /* SP: 24/24 counters */ - DERIVED_COUNTER_PERFCNTR(SP_BUSY_CYCLES, &sp_counters[ 0]), - DERIVED_COUNTER_PERFCNTR(SP_ALU_WORKING_CYCLES, &sp_counters[ 1]), - DERIVED_COUNTER_PERFCNTR(SP_EFU_WORKING_CYCLES, &sp_counters[ 2]), - DERIVED_COUNTER_PERFCNTR(SP_STALL_CYCLES_TP, &sp_counters[ 3]), - DERIVED_COUNTER_PERFCNTR(SP_NON_EXECUTION_CYCLES, &sp_counters[ 4]), - DERIVED_COUNTER_PERFCNTR(SP_VS_STAGE_TEX_INSTRUCTIONS, &sp_counters[ 5]), - DERIVED_COUNTER_PERFCNTR(SP_VS_STAGE_EFU_INSTRUCTIONS, &sp_counters[ 6]), - DERIVED_COUNTER_PERFCNTR(SP_VS_STAGE_FULL_ALU_INSTRUCTIONS, &sp_counters[ 7]), - DERIVED_COUNTER_PERFCNTR(SP_FS_STAGE_EFU_INSTRUCTIONS, &sp_counters[ 8]), - DERIVED_COUNTER_PERFCNTR(SP_FS_STAGE_FULL_ALU_INSTRUCTIONS, &sp_counters[ 9]), - DERIVED_COUNTER_PERFCNTR(SP_FS_STAGE_HALF_ALU_INSTRUCTIONS, &sp_counters[10]), - DERIVED_COUNTER_PERFCNTR(SP_ICL1_REQUESTS, &sp_counters[11]), - DERIVED_COUNTER_PERFCNTR(SP_ICL1_MISSES, &sp_counters[12]), - DERIVED_COUNTER_PERFCNTR(SP_ANY_EU_WORKING_FS_STAGE, &sp_counters[13]), - DERIVED_COUNTER_PERFCNTR(SP_ANY_EU_WORKING_VS_STAGE, &sp_counters[14]), - DERIVED_COUNTER_PERFCNTR(SP_ANY_EU_WORKING_CS_STAGE, &sp_counters[15]), - 
DERIVED_COUNTER_PERFCNTR(SP_PIXELS, &sp_counters[16]), - DERIVED_COUNTER_PERFCNTR(SP_RAY_QUERY_INSTRUCTIONS, &sp_counters[17]), - DERIVED_COUNTER_PERFCNTR(SP_RTU_BUSY_CYCLES, &sp_counters[18]), - DERIVED_COUNTER_PERFCNTR(SP_RTU_BVH_FETCH_LATENCY_CYCLES, &sp_counters[19]), - DERIVED_COUNTER_PERFCNTR(SP_RTU_BVH_FETCH_LATENCY_SAMPLES, &sp_counters[20]), - DERIVED_COUNTER_PERFCNTR(SP_RTU_RAY_BOX_INTERSECTIONS, &sp_counters[21]), - DERIVED_COUNTER_PERFCNTR(SP_RTU_RAY_TRIANGLE_INTERSECTIONS, &sp_counters[22]), - DERIVED_COUNTER_PERFCNTR(SP_SCH_STALL_CYCLES_RTU, &sp_counters[23]), + DERIVED_COUNTER_PERFCNTR(SP_BUSY_CYCLES, SP), + DERIVED_COUNTER_PERFCNTR(SP_ALU_WORKING_CYCLES, SP), + DERIVED_COUNTER_PERFCNTR(SP_EFU_WORKING_CYCLES, SP), + DERIVED_COUNTER_PERFCNTR(SP_STALL_CYCLES_TP, SP), + DERIVED_COUNTER_PERFCNTR(SP_NON_EXECUTION_CYCLES, SP), + DERIVED_COUNTER_PERFCNTR(SP_VS_STAGE_TEX_INSTRUCTIONS, SP), + DERIVED_COUNTER_PERFCNTR(SP_VS_STAGE_EFU_INSTRUCTIONS, SP), + DERIVED_COUNTER_PERFCNTR(SP_VS_STAGE_FULL_ALU_INSTRUCTIONS, SP), + DERIVED_COUNTER_PERFCNTR(SP_FS_STAGE_EFU_INSTRUCTIONS, SP), + DERIVED_COUNTER_PERFCNTR(SP_FS_STAGE_FULL_ALU_INSTRUCTIONS, SP), + DERIVED_COUNTER_PERFCNTR(SP_FS_STAGE_HALF_ALU_INSTRUCTIONS, SP), + DERIVED_COUNTER_PERFCNTR(SP_ICL1_REQUESTS, SP), + DERIVED_COUNTER_PERFCNTR(SP_ICL1_MISSES, SP), + DERIVED_COUNTER_PERFCNTR(SP_ANY_EU_WORKING_FS_STAGE, SP), + DERIVED_COUNTER_PERFCNTR(SP_ANY_EU_WORKING_VS_STAGE, SP), + DERIVED_COUNTER_PERFCNTR(SP_ANY_EU_WORKING_CS_STAGE, SP), + DERIVED_COUNTER_PERFCNTR(SP_PIXELS, SP), + DERIVED_COUNTER_PERFCNTR(SP_RAY_QUERY_INSTRUCTIONS, SP), + DERIVED_COUNTER_PERFCNTR(SP_RTU_BUSY_CYCLES, SP), + DERIVED_COUNTER_PERFCNTR(SP_RTU_BVH_FETCH_LATENCY_CYCLES, SP), + DERIVED_COUNTER_PERFCNTR(SP_RTU_BVH_FETCH_LATENCY_SAMPLES, SP), + DERIVED_COUNTER_PERFCNTR(SP_RTU_RAY_BOX_INTERSECTIONS, SP), + DERIVED_COUNTER_PERFCNTR(SP_RTU_RAY_TRIANGLE_INTERSECTIONS, SP), + DERIVED_COUNTER_PERFCNTR(SP_SCH_STALL_CYCLES_RTU, SP), /* CMP: 1/4 counters */ - DERIVED_COUNTER_PERFCNTR(CMPDECMP_VBIF_READ_DATA, &cmp_counters[0]), + DERIVED_COUNTER_PERFCNTR(CMPDECMP_VBIF_READ_DATA, CMP), /* BV_PC: 3/8 counters */ - DERIVED_COUNTER_PERFCNTR_BV(PC_STALL_CYCLES_VFD, &bv_pc_counters[0]), - DERIVED_COUNTER_PERFCNTR_BV(PC_VERTEX_HITS, &bv_pc_counters[1]), - DERIVED_COUNTER_PERFCNTR_BV(PC_VS_INVOCATIONS, &bv_pc_counters[2]), + DERIVED_COUNTER_PERFCNTR_BV(PC_STALL_CYCLES_VFD, PC), + DERIVED_COUNTER_PERFCNTR_BV(PC_VERTEX_HITS, PC), + DERIVED_COUNTER_PERFCNTR_BV(PC_VS_INVOCATIONS, PC), /* BV_TP: 6/6 counters */ - DERIVED_COUNTER_PERFCNTR_BV(TP_L1_CACHELINE_REQUESTS, &bv_tp_counters[0]), - DERIVED_COUNTER_PERFCNTR_BV(TP_L1_CACHELINE_MISSES, &bv_tp_counters[1]), - DERIVED_COUNTER_PERFCNTR_BV(TP_OUTPUT_PIXELS, &bv_tp_counters[2]), - DERIVED_COUNTER_PERFCNTR_BV(TP_OUTPUT_PIXELS_POINT, &bv_tp_counters[3]), - DERIVED_COUNTER_PERFCNTR_BV(TP_OUTPUT_PIXELS_BILINEAR, &bv_tp_counters[4]), - DERIVED_COUNTER_PERFCNTR_BV(TP_OUTPUT_PIXELS_ANISO, &bv_tp_counters[5]), + DERIVED_COUNTER_PERFCNTR_BV(TP_L1_CACHELINE_REQUESTS, TP), + DERIVED_COUNTER_PERFCNTR_BV(TP_L1_CACHELINE_MISSES, TP), + DERIVED_COUNTER_PERFCNTR_BV(TP_OUTPUT_PIXELS, TP), + DERIVED_COUNTER_PERFCNTR_BV(TP_OUTPUT_PIXELS_POINT, TP), + DERIVED_COUNTER_PERFCNTR_BV(TP_OUTPUT_PIXELS_BILINEAR, TP), + DERIVED_COUNTER_PERFCNTR_BV(TP_OUTPUT_PIXELS_ANISO, TP), /* GP: 8/12 counters */ - DERIVED_COUNTER_PERFCNTR_BV(SP_STALL_CYCLES_TP, &bv_sp_counters[0]), - DERIVED_COUNTER_PERFCNTR_BV(SP_VS_STAGE_TEX_INSTRUCTIONS, &bv_sp_counters[1]), - 
DERIVED_COUNTER_PERFCNTR_BV(SP_VS_STAGE_EFU_INSTRUCTIONS, &bv_sp_counters[2]), - DERIVED_COUNTER_PERFCNTR_BV(SP_VS_STAGE_FULL_ALU_INSTRUCTIONS, &bv_sp_counters[3]), - DERIVED_COUNTER_PERFCNTR_BV(SP_ICL1_REQUESTS, &bv_sp_counters[4]), - DERIVED_COUNTER_PERFCNTR_BV(SP_ICL1_MISSES, &bv_sp_counters[5]), - DERIVED_COUNTER_PERFCNTR_BV(SP_ANY_EU_WORKING_FS_STAGE, &bv_sp_counters[6]), - DERIVED_COUNTER_PERFCNTR_BV(SP_ANY_EU_WORKING_VS_STAGE, &bv_sp_counters[7]), + DERIVED_COUNTER_PERFCNTR_BV(SP_STALL_CYCLES_TP, SP), + DERIVED_COUNTER_PERFCNTR_BV(SP_VS_STAGE_TEX_INSTRUCTIONS, SP), + DERIVED_COUNTER_PERFCNTR_BV(SP_VS_STAGE_EFU_INSTRUCTIONS, SP), + DERIVED_COUNTER_PERFCNTR_BV(SP_VS_STAGE_FULL_ALU_INSTRUCTIONS, SP), + DERIVED_COUNTER_PERFCNTR_BV(SP_ICL1_REQUESTS, SP), + DERIVED_COUNTER_PERFCNTR_BV(SP_ICL1_MISSES, SP), + DERIVED_COUNTER_PERFCNTR_BV(SP_ANY_EU_WORKING_FS_STAGE, SP), + DERIVED_COUNTER_PERFCNTR_BV(SP_ANY_EU_WORKING_VS_STAGE, SP), /* LRZ: 4/4 counters */ - DERIVED_COUNTER_PERFCNTR(LRZ_TOTAL_PIXEL, &lrz_counters[0]), - DERIVED_COUNTER_PERFCNTR(LRZ_VISIBLE_PIXEL_AFTER_LRZ, &lrz_counters[1]), - DERIVED_COUNTER_PERFCNTR(LRZ_TILE_KILLED, &lrz_counters[2]), - DERIVED_COUNTER_PERFCNTR(LRZ_PRIM_KILLED_BY_LRZ, &lrz_counters[3]), + DERIVED_COUNTER_PERFCNTR(LRZ_TOTAL_PIXEL, LRZ), + DERIVED_COUNTER_PERFCNTR(LRZ_VISIBLE_PIXEL_AFTER_LRZ, LRZ), + DERIVED_COUNTER_PERFCNTR(LRZ_TILE_KILLED, LRZ), + DERIVED_COUNTER_PERFCNTR(LRZ_PRIM_KILLED_BY_LRZ, LRZ), }; static uint64_t @@ -985,50 +982,3 @@ const struct fd_derived_counter *a7xx_derived_counters[] = { const unsigned a7xx_num_derived_counters = ARRAY_SIZE(a7xx_derived_counters); static_assert(ARRAY_SIZE(a7xx_derived_counters) <= FD_DERIVED_COUNTER_COLLECTION_MAX_DERIVED_COUNTERS, ""); - -/* Prototype for linking purposes. */ -void -a7xx_generate_derived_counter_collection(const struct fd_dev_id *id, struct fd_derived_counter_collection *collection); - -void -a7xx_generate_derived_counter_collection(const struct fd_dev_id *id, struct fd_derived_counter_collection *collection) -{ - /* The provided collection should already specify the derived counters that will be measured. - * This function will set up enabled_perfcntrs_map and enabled_perfcntrs array so that each - * used DERIVED_COUNTER_PERFCNTR_* enum value will map to the corresponding index in the - * array where the relevant fd_perfcntr_counter and fd_perfcntr_countable are stored. - */ - - collection->num_enabled_perfcntrs = 0; - memset(collection->enabled_perfcntrs_map, 0xff, ARRAY_SIZE(collection->enabled_perfcntrs_map)); - - for (unsigned i = 0; i < collection->num_counters; ++i) { - const struct fd_derived_counter *counter = collection->counters[i]; - - for (unsigned j = 0; j < counter->num_perfcntrs; ++j) { - uint8_t perfcntr = counter->perfcntrs[j]; - collection->enabled_perfcntrs_map[perfcntr] = 0x00; - } - } - - /* Note if CP_ALWAYS_COUNT is enabled. This is the zero-index perfcntr. 
*/ - collection->cp_always_count_enabled = !collection->enabled_perfcntrs_map[0]; - - for (unsigned i = 0; i < ARRAY_SIZE(collection->enabled_perfcntrs_map); ++i) { - if (collection->enabled_perfcntrs_map[i] == 0xff) - continue; - - uint8_t enabled_perfcntr_index = collection->num_enabled_perfcntrs++; - collection->enabled_perfcntrs_map[i] = enabled_perfcntr_index; - - collection->enabled_perfcntrs[enabled_perfcntr_index].counter = - a7xx_derived_counter_perfcntrs[i].counter; - collection->enabled_perfcntrs[enabled_perfcntr_index].countable = - a7xx_derived_counter_perfcntrs[i].countable; - } - - const struct fd_dev_info *info = fd_dev_info_raw(id); - collection->derivation_context.a7xx.number_of_usptp = info->num_sp_cores * 2; - collection->derivation_context.a7xx.number_of_alus_per_usptp = 128; -} - diff --git a/src/freedreno/perfcntrs/fdperf.c b/src/freedreno/perfcntrs/fdperf.c index c0f34ed0385..751c7dc3408 100644 --- a/src/freedreno/perfcntrs/fdperf.c +++ b/src/freedreno/perfcntrs/fdperf.c @@ -11,6 +11,8 @@ #include #include #include +#include +#include #include #include #include @@ -24,9 +26,12 @@ #include "util/os_file.h" +#include "freedreno_common.h" #include "freedreno_dt.h" #include "freedreno_perfcntr.h" +#include "drm-uapi/msm_drm.h" + #define MAX_CNTR_PER_GROUP 24 #define REFRESH_MS 500 @@ -45,6 +50,11 @@ static struct { struct counter_group { const struct fd_perfcntr_group *group; + /* We initially try to use all counters, but can reduce this if + * not all counters are available. + */ + unsigned num_counters; + struct { const struct fd_perfcntr_counter *counter; uint16_t select_val; @@ -75,11 +85,30 @@ static struct { const struct fd_dev_id *dev_id; struct fd_submit *submit; struct fd_ringbuffer *ring; -} dev; + + /* This is used for PERFCNTR_CONFIG if supported by kernel. In + * this case, dev.io is not used. 
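+ * + * Rough flow (see perfcntr_config() and find_device() below): the ioctl + * returns a stream fd on success, or fails with E2BIG when more counters + * are requested than available, in which case each group's nr_countables + * is trimmed to what the kernel wrote back and the ioctl is retried: + * + *   fd = drmIoctl(fd_device_fd(dev.dev), DRM_IOCTL_MSM_PERFCNTR_CONFIG, + *                 &dev.perfcntr_config); + *   if (fd < 0 && errno == E2BIG) { trim nr_countables; retry; }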
+ */ + struct drm_msm_perfcntr_config perfcntr_config; + int perfcntr_stream_fd; + + int num_configured_counters; + + uint32_t seqno; + bool discontinuity; +} dev = { + .perfcntr_config = { + .flags = MSM_PERFCNTR_STREAM | MSM_PERFCNTR_UPDATE, + .bufsz_shift = 12, + .group_stride = sizeof(struct drm_msm_perfcntr_group), + }, + .perfcntr_stream_fd = -1, +}; static void config_save(void); static void config_restore(void); static void restore_counter_groups(void); +static void setup_counter_groups(const struct fd_perfcntr_group *groups); /* * helpers @@ -113,6 +142,27 @@ delta(uint64_t a, uint64_t b) return b - a; } +static int +perfcntr_config(void) +{ + if (dev.perfcntr_stream_fd >= 0) { + close(dev.perfcntr_stream_fd); + dev.perfcntr_stream_fd = -1; + } + + errno = 0; + + int fd = drmIoctl(fd_device_fd(dev.dev), + DRM_IOCTL_MSM_PERFCNTR_CONFIG, + &dev.perfcntr_config); + if (fd < 0) + return -errno; + + dev.perfcntr_stream_fd = fd; + + return 0; +} + static void find_device(void) { @@ -146,6 +196,42 @@ find_device(void) printf("min_freq=%u, max_freq=%u\n", dev.min_freq, dev.max_freq); + const struct fd_perfcntr_group *groups; + groups = fd_perfcntrs(dev.dev_id, &dev.ngroups); + if (!groups) { + errx(1, "no perfcntr support"); + } + + dev.groups = calloc(dev.ngroups, sizeof(struct counter_group)); + setup_counter_groups(groups); + + ret = perfcntr_config(); + if (ret == -E2BIG) { + struct drm_msm_perfcntr_group *g = U642VOID(dev.perfcntr_config.groups); + + /* we are trying to use too many counters, back off: */ + for (unsigned i = 0; i < dev.ngroups; i++) { + if (g[i].nr_countables < dev.groups[i].num_counters) { + printf("reducing %s counters %u -> %u\n", + groups[i].name, dev.groups[i].num_counters, g[i].nr_countables); + dev.num_configured_counters -= + dev.groups[i].num_counters - g[i].nr_countables; + dev.groups[i].num_counters = g[i].nr_countables; + } + } + + ret = perfcntr_config(); + } + + if (!ret) { + return; + } + + /* mmio not supported on gen8+: */ + if (fd_dev_gen(dev.dev_id) >= 8) { + err(1, "mmio fallback not supported"); + } + dev.io = fd_dt_find_io(); if (!dev.io) { err(1, "could not map device"); @@ -161,6 +247,13 @@ find_device(void) static void flush_ring(void) { + if (!dev.io) { + int ret = perfcntr_config(); + if (ret < 0) + errx(1, "perfcntr_config() failed"); + return; + } + if (!dev.submit) return; @@ -181,7 +274,7 @@ flush_ring(void) static void select_counter(struct counter_group *group, int ctr, int countable_val) { - assert(ctr < group->group->num_counters); + assert(ctr < group->num_counters); unsigned countable_idx = UINT32_MAX; for (unsigned i = 0; i < group->group->num_countables; i++) { @@ -198,6 +291,20 @@ select_counter(struct counter_group *group, int ctr, int countable_val) group->label[ctr] = group->group->countables[countable_idx].name; group->counter[ctr].select_val = countable_val; + /* If using PERFCNTR_CONFIG, then update the ioctl structure: */ + if (!dev.io) { + struct drm_msm_perfcntr_group *g = U642VOID(dev.perfcntr_config.groups); + + for (int i = 0; i < dev.ngroups; i++) { + if (&dev.groups[i] == group) { + uint32_t *countables = U642VOID(g[i].countables); + countables[ctr] = countable_val; + break; + } + } + return; + } + if (!dev.submit) { dev.submit = fd_submit_new(dev.pipe); dev.ring = fd_submit_new_ringbuffer( @@ -311,6 +418,82 @@ check_counter_invalid(struct counter_group *group, int ctr) group->counter[ctr].is_invalid = (hw_selector != group->counter[ctr].select_val); } +static bool +perfcntr_stream_ready(void) +{ + struct pollfd 
pfd; + + pfd.fd = dev.perfcntr_stream_fd; + pfd.events = POLLIN; + pfd.revents = 0; + + if (poll(&pfd, 1, 0) < 0) + return false; + + if (!(pfd.revents & POLLIN)) + return false; + + return true; +} + +/* GPU always-on timer constants */ +static const uint64_t ALWAYS_ON_FREQUENCY_HZ = 19200000; +static const double GPU_TICKS_PER_US = ALWAYS_ON_FREQUENCY_HZ / 1000000.0; + +static uint64_t +ticks_to_us(uint64_t ticks) +{ + return ticks / GPU_TICKS_PER_US; +} + +static void +resample_perfcntr_stream(void) +{ + if (!perfcntr_stream_ready()) { + dev.discontinuity = true; + return; + } + + uint64_t buf[dev.num_configured_counters + 2]; /* include 128-bit header */ + void *ptr = buf; + size_t sz = sizeof(buf); + + while (sz > 0) { + ssize_t ret = read(dev.perfcntr_stream_fd, ptr, sz); + + if (ret < 0) + ret = -errno; + + if (ret == -EINTR || ret == -EAGAIN) + continue; + + if (ret < 0) + errx(1, "read failed: %d", (int)ret); + + sz -= ret; + ptr += ret; + } + + int idx = 0; + uint64_t ts = ticks_to_us(buf[idx++]); + uint32_t seqno = buf[idx++] & 0xffffffff; + + /* seqno advances by one each period and resets to 0 on a + * discontinuity: + */ + dev.discontinuity = (seqno != dev.seqno + 1); + dev.seqno = seqno; + + for (unsigned i = 0; i < dev.ngroups; i++) { + struct counter_group *group = &dev.groups[i]; + for (unsigned ctr = 0; ctr < group->num_counters; ctr++) { + uint64_t previous_value = group->value[ctr]; + group->value[ctr] = buf[idx++]; + group->value_delta[ctr] = delta(previous_value, group->value[ctr]); + + uint64_t previous_sample_time = group->sample_time[ctr]; + group->sample_time[ctr] = ts; + group->sample_time_delta[ctr] = delta(previous_sample_time, ts); + } + } +} + /* sample all the counters: */ static void resample(void) @@ -323,9 +506,14 @@ resample(void) last_time = current_time; + if (!dev.io) { + resample_perfcntr_stream(); + return; + } + for (unsigned i = 0; i < dev.ngroups; i++) { struct counter_group *group = &dev.groups[i]; - for (unsigned j = 0; j < group->group->num_counters; j++) { + for (unsigned j = 0; j < group->num_counters; j++) { resample_counter(group, j, current_time); check_counter_invalid(group, j); } @@ -469,7 +657,7 @@ static void redraw_counter(WINDOW *win, int row, struct counter_group *group, int ctr, bool selected) { - bool is_invalid = group->counter[ctr].is_invalid; + bool is_invalid = group->counter[ctr].is_invalid || dev.discontinuity; redraw_counter_label(win, row, group->label[ctr], selected, is_invalid); redraw_counter_value(win, row, group, ctr, is_invalid); } @@ -513,13 +701,13 @@ redraw(WINDOW *win) if (group->counter[0].is_gpufreq_counter) j++; - if (j < group->group->num_counters) { + if (j < group->num_counters) { if ((scroll <= row) && ((row - scroll) < max)) redraw_group_header(win, row - scroll, group->group->name); row++; } - for (; j < group->group->num_counters; j++) { + for (; j < group->num_counters; j++) { if ((scroll <= row) && ((row - scroll) < max)) redraw_counter(win, row - scroll, group, j, row == current_cntr); row++; @@ -554,7 +742,7 @@ current_counter(int *ctr) j++; /* account for group header: */ - if (j < group->group->num_counters) { + if (j < group->num_counters) { /* cannot select group header.. 
return null to indicate this * main_ui(): */ @@ -563,7 +751,7 @@ n++; } - for (; j < group->group->num_counters; j++) { + for (; j < group->num_counters; j++) { if (n == current_cntr) { if (ctr) *ctr = j; @@ -734,6 +922,9 @@ main_ui(void) resample(); redraw(mainwin); + if (!dev.io) + continue; + /* restore the counters every 0.5s in case the GPU has suspended, * in which case the current selected countables will have reset: */ @@ -761,7 +952,7 @@ dump_counters(void) for (unsigned i = 0; i < dev.ngroups; i++) { const struct counter_group *group = &dev.groups[i]; - for (unsigned j = 0; j < group->group->num_counters; j++) { + for (unsigned j = 0; j < group->num_counters; j++) { const char *label = group->label[j]; float val = (float) group->value_delta[j] * 1000000.0 / (float) group->sample_time_delta[j]; @@ -798,7 +989,7 @@ restore_counter_groups(void) for (unsigned i = 0; i < dev.ngroups; i++) { struct counter_group *group = &dev.groups[i]; - for (unsigned j = 0; j < group->group->num_counters; j++) { + for (unsigned j = 0; j < group->num_counters; j++) { /* This should also write the CP_ALWAYS_COUNT selectable value into * the reserved CP counter we use for GPU frequency measurement, * avoiding someone else writing a different value there. @@ -811,12 +1002,29 @@ static void setup_counter_groups(const struct fd_perfcntr_group *groups) { + /* pre-allocate memory needed for PERFCNTR_CONFIG ioctl: */ + struct drm_msm_perfcntr_group *g = calloc(sizeof(struct drm_msm_perfcntr_group), dev.ngroups); + + dev.perfcntr_config.nr_groups = dev.ngroups; + dev.perfcntr_config.period = options.refresh_ms * 1000000; + dev.perfcntr_config.groups = VOID2U64(g); + for (unsigned i = 0; i < dev.ngroups; i++) { struct counter_group *group = &dev.groups[i]; - group->group = &groups[i]; + if (strlen(groups[i].name) >= sizeof(g[i].group_name)) + errx(1, "group name too large: %s", groups[i].name); - max_rows += group->group->num_counters + 1; + strncpy(g[i].group_name, groups[i].name, sizeof(g[i].group_name)); + g[i].nr_countables = groups[i].num_counters; + g[i].countables = VOID2U64(calloc(sizeof(uint32_t), g[i].nr_countables)); + + dev.num_configured_counters += g[i].nr_countables; + + group->group = &groups[i]; + group->num_counters = group->group->num_counters; + + max_rows += group->num_counters + 1; /* We reserve the first counter of the CP group (first in the list) for * measuring GPU frequency that's displayed in the footer. @@ -846,7 +1054,7 @@ setup_counter_groups(const struct fd_perfcntr_group *groups) } } - for (unsigned j = 0; j < group->group->num_counters; j++) { + for (unsigned j = 0; j < group->num_counters; j++) { group->counter[j].counter = &group->group->counters[j]; if (!group->counter[j].is_gpufreq_counter) @@ -889,7 +1097,7 @@ config_save(void) config_setting_t *sect = config_setting_get_member(setting, group->group->name); - for (unsigned j = 0; j < group->group->num_counters; j++) { + for (unsigned j = 0; j < group->num_counters; j++) { /* Don't save the GPU frequency measurement counter. */ if (group->counter[j].is_gpufreq_counter) continue; @@ -936,7 +1144,7 @@ config_restore(void) config_setting_add(setting, group->group->name, CONFIG_TYPE_GROUP); } - for (unsigned j = 0; j < group->group->num_counters; j++) { + for (unsigned j = 0; j < group->num_counters; j++) { /* Don't restore the GPU frequency measurement counter. 
*/ if (group->counter[j].is_gpufreq_counter) continue; @@ -997,17 +1205,8 @@ main(int argc, char **argv) find_device(); - const struct fd_perfcntr_group *groups; - groups = fd_perfcntrs(dev.dev_id, &dev.ngroups); - if (!groups) { - errx(1, "no perfcntr support"); - } - - dev.groups = calloc(dev.ngroups, sizeof(struct counter_group)); - setlocale(LC_NUMERIC, "en_US.UTF-8"); - setup_counter_groups(groups); restore_counter_groups(); config_restore(); flush_ring(); diff --git a/src/freedreno/perfcntrs/freedreno_perfcntr.c b/src/freedreno/perfcntrs/freedreno_perfcntr.c index aa984903a6b..cb4ea0b2fd7 100644 --- a/src/freedreno/perfcntrs/freedreno_perfcntr.c +++ b/src/freedreno/perfcntrs/freedreno_perfcntr.c @@ -7,6 +7,16 @@ */ #include +#include + +#include "util/hash_table.h" +#include "util/ralloc.h" + +#include "drm-uapi/msm_drm.h" +#include "util/bitset.h" +#include "util/log.h" +#include "util/simple_mtx.h" +#include "freedreno_common.h" #include "freedreno_perfcntr.h" @@ -41,12 +51,269 @@ fd_perfcntrs(const struct fd_dev_id *id, unsigned *count) case 7: *count = a7xx_num_perfcntr_groups; return a7xx_perfcntr_groups; + case 8: + *count = a8xx_num_perfcntr_groups; + return a8xx_perfcntr_groups; default: *count = 0; return NULL; } } +struct fd_perfcntr_counter_state { + int group; + int counter; + int countable; + unsigned nr_users; +}; + +#define MAX_COUNTERS_PER_GROUP 32 +typedef BITSET_DECLARE(assigned_counters_t, MAX_COUNTERS_PER_GROUP); + +/** + * Helper to manage assigning counters, tracking if there are multiple users + * for the same countable (to avoid assigning duplicate counters for the + * same countable, etc) + */ +struct fd_perfcntr_state { + simple_mtx_t lock; + int fd; + const struct fd_dev_id *id; + + unsigned nr_groups; + const struct fd_perfcntr_group *groups; + + struct drm_msm_perfcntr_group *group_configs; + struct drm_msm_perfcntr_config config; + + /* bitmask of assigned counters per group: */ + assigned_counters_t *assigned_counters; + + /* maps counter to fd_perfcntr_counter_state: */ + struct hash_table *counter_state; +}; + +static int +update_reserved_counters(struct fd_perfcntr_state *perfcntrs) +{ + /* If no kernel support, just carry on and assume we can use all counters: */ + if (perfcntrs->fd < 0) + return 0; + + return drmIoctl(perfcntrs->fd, DRM_IOCTL_MSM_PERFCNTR_CONFIG, &perfcntrs->config); +} + +static int +update_group_counters(struct fd_perfcntr_state *perfcntrs, int group_idx) +{ + int ret = 0; + + /* Update reserved config with kernel if it changes. 
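+ * BITSET_LAST_BIT() yields the highest assigned counter index plus one + * (e.g. an assigned mask of 0b1001 gives nr = 4), so counters below the + * highest assigned one stay reserved even when currently unused.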
We might not + * be assigning/releasing the last counter (and we cannot feasibly + * re-map existing assigned counters to compact away gaps in the + * used counters, as cmdstream might already + * be built encoding the other assigned counters), but if we do, + * let the kernel know: + */ + unsigned nr = BITSET_LAST_BIT(perfcntrs->assigned_counters[group_idx]); + if (nr != perfcntrs->group_configs[group_idx].nr_countables) { + mesa_logi("%s: %u -> %u counters", perfcntrs->groups[group_idx].name, perfcntrs->group_configs[group_idx].nr_countables, nr); + perfcntrs->group_configs[group_idx].nr_countables = nr; + ret = update_reserved_counters(perfcntrs); + } + + return ret; +} + +struct fd_perfcntr_state * +fd_perfcntr_state_alloc(const struct fd_dev_id *id, int fd) +{ + const struct fd_perfcntr_group *groups; + unsigned nr_groups; + + groups = fd_perfcntrs(id, &nr_groups); + if (!groups) + return NULL; + + struct fd_perfcntr_state *perfcntrs = rzalloc(NULL, struct fd_perfcntr_state); + + simple_mtx_init(&perfcntrs->lock, mtx_plain); + perfcntrs->fd = fd; + perfcntrs->id = id; + perfcntrs->nr_groups = nr_groups; + perfcntrs->groups = groups; + perfcntrs->group_configs = + rzalloc_array(perfcntrs, struct drm_msm_perfcntr_group, nr_groups); + + for (unsigned i = 0; i < nr_groups; i++) { + assert(strlen(groups[i].name) < sizeof(perfcntrs->group_configs[i].group_name)); + strcpy(perfcntrs->group_configs[i].group_name, groups[i].name); + } + + perfcntrs->config = (struct drm_msm_perfcntr_config) { + .nr_groups = nr_groups, + .groups = VOID2U64(perfcntrs->group_configs), + .group_stride = sizeof(struct drm_msm_perfcntr_group), + }; + + perfcntrs->assigned_counters = rzalloc_array(perfcntrs, assigned_counters_t, nr_groups); + perfcntrs->counter_state = _mesa_pointer_hash_table_create(perfcntrs); + + /* Probe for kernel PERFCNTR_CONFIG support with empty config: */ + if (update_reserved_counters(perfcntrs)) + perfcntrs->fd = -1; + + return perfcntrs; +} + +void +fd_perfcntr_state_free(struct fd_perfcntr_state *perfcntrs) +{ + if (!perfcntrs) + return; + + perfcntrs->config.nr_groups = 0; + update_reserved_counters(perfcntrs); + ralloc_free(perfcntrs); +} + +/** + * Does KMD support perfcntr reservation (ie. 
PERFCNTR_CONFIG) + */ +bool +fd_perfcntr_has_reservation(struct fd_perfcntr_state *perfcntrs) +{ + return perfcntrs->fd >= 0; +} + +static int +find_group_idx(struct fd_perfcntr_state *perfcntrs, + const struct fd_perfcntr_group *group) +{ + for (unsigned i = 0; i < perfcntrs->nr_groups; i++) + if (&perfcntrs->groups[i] == group) + return i; + UNREACHABLE("invalid group"); +} + +static int +find_countable_idx(const struct fd_perfcntr_group *group, + const struct fd_perfcntr_countable *countable) +{ + for (unsigned i = 0; i < group->num_countables; i++) + if (&group->countables[i] == countable) + return i; + UNREACHABLE("invalid countable"); +} + +const struct fd_perfcntr_counter * +fd_perfcntr_reserve(struct fd_perfcntr_state *perfcntrs, + const struct fd_perfcntr_group *group, + const struct fd_perfcntr_countable *countable) +{ + struct fd_perfcntr_counter_state *state = NULL; + int c, g = find_group_idx(perfcntrs, group); + + simple_mtx_lock(&perfcntrs->lock); + + /* Check if requested countable is already configured: */ + BITSET_FOREACH_SET (c, perfcntrs->assigned_counters[g], MAX_COUNTERS_PER_GROUP) { + struct hash_entry *e = + _mesa_hash_table_search(perfcntrs->counter_state, &group->counters[c]); + + assert(e); + struct fd_perfcntr_counter_state *s = e->data; + + if (&group->countables[s->countable] == countable) { + state = s; + break; + } + } + + /* If we didn't find a counter assigned to this countable, assign a new one: */ + if (!state) { + assigned_counters_t *assigned_counters = &perfcntrs->assigned_counters[g]; + + /* Pick lowest #ed unassigned counter: */ + assigned_counters_t free_counters; + memcpy(free_counters, *assigned_counters, sizeof(free_counters)); + BITSET_NOT(free_counters); + + c = BITSET_FFS(free_counters) - 1; + assert(c >= 0); + mesa_logi("pick counter %d", c); + + if (c < group->num_counters) { + state = rzalloc(perfcntrs, struct fd_perfcntr_counter_state); + state->group = g; + state->counter = c; + state->countable = find_countable_idx(group, countable); + + assert(!BITSET_TEST(*assigned_counters, state->counter)); + + BITSET_SET(*assigned_counters, state->counter); + + if (update_group_counters(perfcntrs, state->group)) { + BITSET_CLEAR(*assigned_counters, state->counter); + ralloc_free(state); + state = NULL; + } else { + _mesa_hash_table_insert(perfcntrs->counter_state, + &group->counters[state->counter], + state); + } + } + } + + if (state) + state->nr_users++; + + simple_mtx_unlock(&perfcntrs->lock); + + if (!state) + return NULL; + + mesa_logi("%s.%s: assigned %d (%d users)", group->name, countable->name, state->counter, state->nr_users); + + return &group->counters[state->counter]; +} + +void +fd_perfcntr_release(struct fd_perfcntr_state *perfcntrs, + const struct fd_perfcntr_counter *counter) +{ + if (!counter) + return; + + simple_mtx_lock(&perfcntrs->lock); + struct hash_entry *e = _mesa_hash_table_search(perfcntrs->counter_state, counter); + if (e) { + struct fd_perfcntr_counter_state *state = e->data; + + assert(state->nr_users > 0); + + const struct fd_perfcntr_group *group = &perfcntrs->groups[state->group]; + mesa_logi("%s.%s: released %d (%d users)", group->name, group->countables[state->countable].name, state->counter, state->nr_users); + + if (--state->nr_users == 0) { + /* dropping last user of the counter: */ + _mesa_hash_table_remove(perfcntrs->counter_state, e); + + assigned_counters_t *assigned_counters = + &perfcntrs->assigned_counters[state->group]; + + assert(BITSET_TEST(*assigned_counters, state->counter)); + + 
BITSET_CLEAR(*assigned_counters, state->counter); + update_group_counters(perfcntrs, state->group); + + ralloc_free(state); + } + } + simple_mtx_unlock(&perfcntrs->lock); +} + +extern const struct fd_derived_counter_perfcntr a7xx_derived_counter_perfcntrs[]; extern const struct fd_derived_counter *a7xx_derived_counters[]; extern const unsigned a7xx_num_derived_counters; @@ -63,16 +330,73 @@ fd_derived_counters(const struct fd_dev_id *id, unsigned *count) } } -extern void a7xx_generate_derived_counter_collection(const struct fd_dev_id *id, struct fd_derived_counter_collection *collection); - void -fd_generate_derived_counter_collection(const struct fd_dev_id *id, struct fd_derived_counter_collection *collection) +fd_reserve_derived_counter_collection(struct fd_perfcntr_state *perfcntrs, struct fd_derived_counter_collection *collection) { + const struct fd_derived_counter_perfcntr *derived_counter_perfcntrs = NULL; + const struct fd_dev_id *id = perfcntrs->id; + switch (fd_dev_gen(id)) { case 7: - a7xx_generate_derived_counter_collection(id, collection); + derived_counter_perfcntrs = a7xx_derived_counter_perfcntrs; + break; + default: + return; + } + + /* The provided collection should already specify the derived counters that will be measured. + * This function will set up enabled_perfcntrs_map and enabled_perfcntrs array so that each + * used DERIVED_COUNTER_PERFCNTR_* enum value will map to the corresponding index in the + * array where the relevant fd_perfcntr_counter and fd_perfcntr_countable are stored. + */ + + collection->num_enabled_perfcntrs = 0; + memset(collection->enabled_perfcntrs_map, 0xff, ARRAY_SIZE(collection->enabled_perfcntrs_map)); + + for (unsigned i = 0; i < collection->num_counters; ++i) { + const struct fd_derived_counter *counter = collection->counters[i]; + + for (unsigned j = 0; j < counter->num_perfcntrs; ++j) { + uint8_t perfcntr = counter->perfcntrs[j]; + collection->enabled_perfcntrs_map[perfcntr] = 0x00; + } + } + + /* Note if CP_ALWAYS_COUNT is enabled. This is the zero-index perfcntr. 
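+ * The sparse 0xff/0x00 map is compacted into dense indices below, e.g. + * enabled perfcntrs {0, 2, 5} land in slots {0, 1, 2} and + * num_enabled_perfcntrs becomes 3.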
*/ + collection->cp_always_count_enabled = !collection->enabled_perfcntrs_map[0]; + + for (unsigned i = 0; i < ARRAY_SIZE(collection->enabled_perfcntrs_map); ++i) { + if (collection->enabled_perfcntrs_map[i] == 0xff) + continue; + + uint8_t enabled_perfcntr_index = collection->num_enabled_perfcntrs++; + collection->enabled_perfcntrs_map[i] = enabled_perfcntr_index; + + const struct fd_perfcntr_group *group = + fd_perfcntrs_group(perfcntrs->id, derived_counter_perfcntrs[i].group); + const struct fd_perfcntr_countable *countable = + fd_perfcntrs_countable(group, derived_counter_perfcntrs[i].countable); + const struct fd_perfcntr_counter *counter = + fd_perfcntr_reserve(perfcntrs, group, countable); + + collection->enabled_perfcntrs[enabled_perfcntr_index].counter = counter; + collection->enabled_perfcntrs[enabled_perfcntr_index].countable = countable->selector; + } + + const struct fd_dev_info *info = fd_dev_info_raw(id); + switch (fd_dev_gen(id)) { + case 7: + collection->derivation_context.a7xx.number_of_usptp = info->num_sp_cores * 2; + collection->derivation_context.a7xx.number_of_alus_per_usptp = 128; break; default: break; } } + +void +fd_release_derived_counter_collection(struct fd_perfcntr_state *perfcntrs, struct fd_derived_counter_collection *collection) +{ + for (unsigned i = 0; i < collection->num_enabled_perfcntrs; i++) + fd_perfcntr_release(perfcntrs, collection->enabled_perfcntrs[i].counter); +} diff --git a/src/freedreno/perfcntrs/freedreno_perfcntr.h b/src/freedreno/perfcntrs/freedreno_perfcntr.h index a0b6e99acde..047bbdfe960 100644 --- a/src/freedreno/perfcntrs/freedreno_perfcntr.h +++ b/src/freedreno/perfcntrs/freedreno_perfcntr.h @@ -89,6 +89,48 @@ const struct fd_perfcntr_group *fd_perfcntrs(const struct fd_dev_id *id, unsigne .countables = _countables, \ } +static inline const struct fd_perfcntr_group * +fd_perfcntrs_group(const struct fd_dev_id *id, const char *name) +{ + const struct fd_perfcntr_group *groups; + unsigned count; + + groups = fd_perfcntrs(id, &count); + if (!groups) + return NULL; + + for (unsigned i = 0; i < count; i++) + if (!strcmp(groups[i].name, name)) + return &groups[i]; + + return NULL; +} + +static inline const struct fd_perfcntr_countable * +fd_perfcntrs_countable(const struct fd_perfcntr_group *group, const char *name) +{ + for (unsigned i = 0; i < group->num_countables; i++) + if (!strcmp(group->countables[i].name, name)) + return &group->countables[i]; + + return NULL; +} + +struct fd_perfcntr_state; + +struct fd_perfcntr_state * +fd_perfcntr_state_alloc(const struct fd_dev_id *id, int fd); +void fd_perfcntr_state_free(struct fd_perfcntr_state *perfcntrs); + +bool fd_perfcntr_has_reservation(struct fd_perfcntr_state *perfcntrs); + +const struct fd_perfcntr_counter * +fd_perfcntr_reserve(struct fd_perfcntr_state *perfcntrs, + const struct fd_perfcntr_group *group, + const struct fd_perfcntr_countable *countable); +void fd_perfcntr_release(struct fd_perfcntr_state *perfcntrs, + const struct fd_perfcntr_counter *counter); + #define FD_DERIVED_COUNTER_MAX_PERFCNTRS 8 struct fd_derivation_context { @@ -110,6 +152,11 @@ struct fd_derived_counter { uint64_t (*derive)(struct fd_derivation_context *context, uint64_t *values); }; +struct fd_derived_counter_perfcntr { + const char *countable; + const char *group; +}; + const struct fd_derived_counter **fd_derived_counters(const struct fd_dev_id *id, unsigned *count); #define FD_DERIVED_COUNTER_COLLECTION_MAX_DERIVED_COUNTERS 64 @@ -130,7 +177,8 @@ struct fd_derived_counter_collection { struct 
fd_derivation_context derivation_context; }; -void fd_generate_derived_counter_collection(const struct fd_dev_id *id, struct fd_derived_counter_collection *collection); +void fd_reserve_derived_counter_collection(struct fd_perfcntr_state *perfcntrs, struct fd_derived_counter_collection *collection); +void fd_release_derived_counter_collection(struct fd_perfcntr_state *perfcntrs, struct fd_derived_counter_collection *collection); #ifdef __cplusplus } /* end of extern "C" */ diff --git a/src/freedreno/perfcntrs/meson.build b/src/freedreno/perfcntrs/meson.build index 5b0dbf8a0d2..61d39e8afa4 100644 --- a/src/freedreno/perfcntrs/meson.build +++ b/src/freedreno/perfcntrs/meson.build @@ -20,7 +20,11 @@ libfreedreno_perfcntrs = static_library( c_args : [no_override_init_args], gnu_symbol_visibility : 'hidden', link_with : [libfreedreno_common], - dependencies : idep_nir_headers, + dependencies : [ + dep_libdrm, + idep_mesautil, + idep_nir_headers, + ], build_by_default : false, ) @@ -51,3 +55,23 @@ if dep_libconfig.found() and dep_curses.found() install : with_tools.contains('freedreno'), ) endif + +dumpctrs = executable( + 'dumpctrs', + ['dumpctrs.c', freedreno_xml_header_files], + include_directories : [ + inc_freedreno, + inc_include, + inc_src, + ], + link_with : [ + libfreedreno_common, + libfreedreno_drm, + libfreedreno_perfcntrs, + ], + dependencies : [ + dep_libdrm, + idep_mesautil, + ], + build_by_default : with_tools.contains('freedreno'), +) diff --git a/src/freedreno/registers/gen_header.py b/src/freedreno/registers/gen_header.py index 07e6f0cb4e6..d3b56a9d84f 100644 --- a/src/freedreno/registers/gen_header.py +++ b/src/freedreno/registers/gen_header.py @@ -1003,7 +1003,7 @@ def dump_c(args, guard, func): # TODO figure out what to do about fd_reg_stomp_allowed() # vs gcc.. 
for now only enable the warnings with clang: - print("#if defined(__clang__) && !defined(FD_NO_DEPRECATED_PACK)") + print("#if defined(__clang__) && !defined(FD_NO_DEPRECATED_PACK) && !defined(__KERNEL__)") print("#define __FD_DEPRECATED _Pragma (\"GCC warning \\\"Deprecated reg builder\\\"\")") print("#else") print("#define __FD_DEPRECATED") diff --git a/src/freedreno/vulkan/tu_autotune.cc b/src/freedreno/vulkan/tu_autotune.cc index aa6c7f816dd..700ba8b7cf1 100644 --- a/src/freedreno/vulkan/tu_autotune.cc +++ b/src/freedreno/vulkan/tu_autotune.cc @@ -1633,41 +1633,20 @@ tu_autotune::tu_autotune(struct tu_device *device, VkResult &result) tu_bo_suballocator_init(&suballoc, device, 128 * 1024, TU_BO_ALLOC_INTERNAL_RESOURCE, "autotune_suballoc"); if (supports_preempt_latency_tracking()) { - uint32_t group_count; - const struct fd_perfcntr_group *groups = fd_perfcntrs(&device->physical_device->dev_id, &group_count); const char *fail_reason = nullptr; - const fd_perfcntr_group *cp_group = nullptr; - for (uint32_t i = 0; i < group_count; i++) { - if (strcmp(groups[i].name, "CP") == 0) { - cp_group = &groups[i]; - break; - } - } + const fd_perfcntr_group *cp_group = fd_perfcntrs_group(&device->physical_device->dev_id, "CP"); if (cp_group) { - auto get_perfcntr_countable = [](const struct fd_perfcntr_group *group, - const char *name) -> const struct fd_perfcntr_countable * { - for (uint32_t i = 0; i < group->num_countables; i++) { - if (strcmp(group->countables[i].name, name) == 0) - return &group->countables[i]; - } - - return nullptr; - }; - - auto preemption_latency_countable = get_perfcntr_countable(cp_group, "PERF_CP_PREEMPTION_REACTION_DELAY"); - auto always_count_countable = get_perfcntr_countable(cp_group, "PERF_CP_ALWAYS_COUNT"); + preemption_latency_countable = fd_perfcntrs_countable(cp_group, "PERF_CP_PREEMPTION_REACTION_DELAY"); + always_count_countable = fd_perfcntrs_countable(cp_group, "PERF_CP_ALWAYS_COUNT"); if (preemption_latency_countable && always_count_countable) { - if (cp_group->num_counters >= 2) { - preemption_latency_selector_reg = cp_group->counters[0].select_reg; - preemption_latency_selector = preemption_latency_countable->selector; - preemption_latency_counter_reg_lo = cp_group->counters[0].counter_reg_lo; + preemption_latency_counter = + fd_perfcntr_reserve(device->perfcntrs, cp_group, preemption_latency_countable); + always_count_counter = + fd_perfcntr_reserve(device->perfcntrs, cp_group, always_count_countable); - always_count_selector_reg = cp_group->counters[1].select_reg; - always_count_selector = always_count_countable->selector; - always_count_counter_reg_lo = cp_group->counters[1].counter_reg_lo; - } else { + if (!preemption_latency_counter || !always_count_counter) { fail_reason = "not enough counters in CP group for preemption latency tracking"; } } else { @@ -1699,6 +1678,9 @@ tu_autotune::~tu_autotune() } tu_bo_suballocator_finish(&suballoc); + + fd_perfcntr_release(device->perfcntrs, preemption_latency_counter); + fd_perfcntr_release(device->perfcntrs, always_count_counter); } tu_autotune::cmd_buf_ctx::cmd_buf_ctx(struct tu_autotune &autotune): batch(autotune.create_batch()) @@ -1952,22 +1934,22 @@ tu_autotune::write_preempt_counters_to_iova(struct tu_cs *cs, uint64_t aon_iova) const { if (emit_selector) { - tu_cs_emit_pkt4(cs, preemption_latency_selector_reg, 1); - tu_cs_emit(cs, preemption_latency_selector); + tu_cs_emit_pkt4(cs, preemption_latency_counter->select_reg, 1); + tu_cs_emit(cs, preemption_latency_countable->selector); - 
tu_cs_emit_pkt4(cs, always_count_selector_reg, 1); - tu_cs_emit(cs, always_count_selector); + tu_cs_emit_pkt4(cs, always_count_counter->select_reg, 1); + tu_cs_emit(cs, always_count_countable->selector); } if (emit_wfi) tu_cs_emit_wfi(cs); tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3); - tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(preemption_latency_counter_reg_lo) | CP_REG_TO_MEM_0_64B); + tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(preemption_latency_counter->counter_reg_lo) | CP_REG_TO_MEM_0_64B); tu_cs_emit_qw(cs, latency_iova); tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3); - tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(always_count_counter_reg_lo) | CP_REG_TO_MEM_0_64B); + tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(always_count_counter->counter_reg_lo) | CP_REG_TO_MEM_0_64B); tu_cs_emit_qw(cs, always_count_iova); tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3); @@ -2060,11 +2042,11 @@ tu_autotune::emit_switch_away_amble(struct tu_cs *cs) const static size_t counter = 0; if (counter++ % 2 == 0) { - tu_cs_emit_pkt4(cs, preemption_latency_selector_reg, 1); - tu_cs_emit(cs, preemption_latency_selector); + tu_cs_emit_pkt4(cs, preemption_latency_counter->select_reg, 1); + tu_cs_emit(cs, preemption_latency_countable->selector); - tu_cs_emit_pkt4(cs, always_count_selector_reg, 1); - tu_cs_emit(cs, always_count_selector); + tu_cs_emit_pkt4(cs, always_count_counter->select_reg, 1); + tu_cs_emit(cs, always_count_countable->selector); } tu_cond_exec_end(cs); @@ -2231,4 +2213,4 @@ tu_autotune::emit_preempt_latency_tracking_rp_hash(struct tu_cmd_buffer *cmd) tu_cs_emit_draw_state(&cmd->cs, TU_DRAW_STATE_AT_WRITE_RP_HASH, ds); return rp_key; -} \ No newline at end of file +} diff --git a/src/freedreno/vulkan/tu_autotune.h b/src/freedreno/vulkan/tu_autotune.h index 55e579ae93a..8bf231edd04 100644 --- a/src/freedreno/vulkan/tu_autotune.h +++ b/src/freedreno/vulkan/tu_autotune.h @@ -242,13 +242,11 @@ struct tu_autotune { std::mutex rp_latency_mutex; /* Protects rp_latency_tracking */ uint64_t last_latency_cleanup_ts = 0; - uint32_t preemption_latency_selector_reg; - uint32_t preemption_latency_selector; - uint32_t preemption_latency_counter_reg_lo; + const struct fd_perfcntr_counter *preemption_latency_counter = nullptr; + const struct fd_perfcntr_countable *preemption_latency_countable = nullptr; - uint32_t always_count_selector_reg; - uint32_t always_count_selector; - uint32_t always_count_counter_reg_lo; + const struct fd_perfcntr_counter *always_count_counter = nullptr; + const struct fd_perfcntr_countable *always_count_countable = nullptr; struct tu_draw_state reset_rp_hash_draw_state; diff --git a/src/freedreno/vulkan/tu_device.cc b/src/freedreno/vulkan/tu_device.cc index 2e755551dca..ec79b9790b6 100644 --- a/src/freedreno/vulkan/tu_device.cc +++ b/src/freedreno/vulkan/tu_device.cc @@ -11,6 +11,7 @@ #include "drm-uapi/drm_fourcc.h" #include "git_sha1.h" +#include "perfcntrs/freedreno_perfcntr.h" #include "common/freedreno_stompable_regs.h" /* for fd_get_driver/device_uuid() */ @@ -3081,6 +3082,10 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice, } } + device->perfcntrs = fd_perfcntr_state_alloc( + &physical_device->dev_id, + is_kgsl(physical_device->instance) ? 
-1 : device->fd); + + device->autotune = new tu_autotune(device, result); if (result != VK_SUCCESS) goto fail_autotune; @@ -3181,6 +3186,7 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice, fail_timeline_cond: fail_a725_workaround: fail_autotune: delete device->autotune; + fd_perfcntr_state_free(device->perfcntrs); fail_bin_preamble: fail_prepare_perfcntrs_pass_cs: @@ -3287,6 +3293,8 @@ tu_DestroyDevice(VkDevice _device, const VkAllocationCallbacks *pAllocator) delete device->autotune; + + fd_perfcntr_state_free(device->perfcntrs); + tu_bo_suballocator_finish(&device->pipeline_suballoc); tu_bo_suballocator_finish(&device->kgsl_profiling_suballoc); tu_bo_suballocator_finish(&device->event_suballoc); diff --git a/src/freedreno/vulkan/tu_device.h b/src/freedreno/vulkan/tu_device.h index c9f521fcc15..b65b3d5b4e7 100644 --- a/src/freedreno/vulkan/tu_device.h +++ b/src/freedreno/vulkan/tu_device.h @@ -11,6 +11,7 @@ #define TU_DEVICE_H #include "tu_common.h" +#include "perfcntrs/freedreno_perfcntr.h" #include "radix_sort/radix_sort_vk.h" #include "util/rwlock.h" @@ -486,6 +487,8 @@ struct tu_device pthread_cond_t timeline_cond; pthread_mutex_t submit_mutex; + struct fd_perfcntr_state *perfcntrs; + struct tu_autotune *autotune; struct breadcrumbs_context *breadcrumbs_ctx; diff --git a/src/freedreno/vulkan/tu_query_pool.cc b/src/freedreno/vulkan/tu_query_pool.cc index 3d0851de7c4..ab04b0cac22 100644 --- a/src/freedreno/vulkan/tu_query_pool.cc +++ b/src/freedreno/vulkan/tu_query_pool.cc @@ -7,6 +7,7 @@ */ #include "tu_query_pool.h" +#include "perfcntrs/freedreno_perfcntr.h" #include @@ -249,21 +250,6 @@ perfcntr_index(const struct fd_perfcntr_group *group, uint32_t group_count, assert(i < group_count); } -static uint32_t -perfcntr_reserved_counters(const struct fd_perfcntr_group *group) -{ - /* Keep raw perf queries off the CP slots reserved by autotune latency optimization. - * TODO: We need to do this in a more robust way. - */ - return strcmp(group->name, "CP") == 0 ? 
2 : 0; -} - -static uint32_t -perfcntr_available_counters(const struct fd_perfcntr_group *group) -{ - return group->num_counters - MIN2(group->num_counters, perfcntr_reserved_counters(group)); -} - static int compare_perfcntr_pass(const void *a, const void *b) { @@ -271,6 +257,27 @@ compare_perfcntr_pass(const void *a, const void *b) ((struct tu_perf_query_raw_data *)b)->pass; } +static void +tu_query_pool_destroy(struct tu_device *device, struct tu_query_pool *pool, + const VkAllocationCallbacks *pAllocator) +{ + if (is_perf_query_raw(pool)) { + struct tu_perf_query_raw *perf_query = &pool->perf_query.raw; + + for (uint32_t i = 0; i < perf_query->counter_index_count; i++) + fd_perfcntr_release(device->perfcntrs, perf_query->data[i].counter); + } else if (is_perf_query_derived(pool)) { + struct tu_perf_query_derived *perf_query = &pool->perf_query.derived; + struct fd_derived_counter_collection *collection = perf_query->collection; + + fd_release_derived_counter_collection(device->perfcntrs, collection); + } + + if (pool->bo) + tu_bo_finish(device, pool->bo); + vk_query_pool_destroy(&device->vk, pAllocator, &pool->vk); +} + VKAPI_ATTR VkResult VKAPI_CALL tu_CreateQueryPool(VkDevice _device, const VkQueryPoolCreateInfo *pCreateInfo, @@ -353,50 +360,26 @@ tu_CreateQueryPool(VkDevice _device, perf_query->counter_index_count = perf_query_info->counterIndexCount; - /* Build all perf counters data that is requested, so we could get - * correct group id, countable id, counter register and pass index with - * only a counter index provided by applications at each command submit. - * - * Also, since this built data will be sorted by pass index later, we - * should keep the original indices and store perfcntrs results according - * to them so apps can get correct results with their own indices. - */ - uint32_t regs[perf_query->perf_group_count], pass[perf_query->perf_group_count]; - memset(regs, 0x00, perf_query->perf_group_count * sizeof(regs[0])); - memset(pass, 0x00, perf_query->perf_group_count * sizeof(pass[0])); - for (uint32_t i = 0; i < perf_query->counter_index_count; i++) { uint32_t gid = 0, cid = 0; perfcntr_index(perf_query->perf_group, perf_query->perf_group_count, perf_query_info->pCounterIndices[i], &gid, &cid); - perf_query->data[i].gid = gid; - perf_query->data[i].cid = cid; perf_query->data[i].app_idx = i; const struct fd_perfcntr_group *group = &perf_query->perf_group[gid]; - uint32_t reserved_counters = perfcntr_reserved_counters(group); - uint32_t available_counters = perfcntr_available_counters(group); + const struct fd_perfcntr_countable *countable = &group->countables[cid]; - if (available_counters == 0) { - vk_query_pool_destroy(&device->vk, pAllocator, &pool->vk); + perf_query->data[i].countable = countable; + perf_query->data[i].counter = + fd_perfcntr_reserve(device->perfcntrs, group, countable); + + if (!perf_query->data[i].counter) { + tu_query_pool_destroy(device, pool, pAllocator); return vk_errorf(device, VK_ERROR_FEATURE_NOT_PRESENT, "No raw perf counters available in group %s", group->name); } - - /* When a counter register is over the capacity(num_counters), - * reset it for next pass. 
- */ - if (regs[gid] < available_counters) { - perf_query->data[i].cntr_reg = reserved_counters + regs[gid]++; - perf_query->data[i].pass = pass[gid]; - } else { - perf_query->data[i].pass = ++pass[gid]; - perf_query->data[i].cntr_reg = reserved_counters; - regs[gid] = 0; - regs[gid]++; - } } /* Sort by pass index so we could easily prepare a command stream @@ -422,21 +405,20 @@ tu_CreateQueryPool(VkDevice _device, collection->counters[i] = perf_query->derived_counters[counter_index]; } - fd_generate_derived_counter_collection(&device->physical_device->dev_id, collection); + fd_reserve_derived_counter_collection(device->perfcntrs, collection); slot_size += sizeof(struct perfcntr_query_slot) * collection->num_enabled_perfcntrs; } VkResult result = tu_bo_init_new_cached(device, &pool->vk.base, &pool->bo, pCreateInfo->queryCount * slot_size, TU_BO_ALLOC_NO_FLAGS, "query pool"); if (result != VK_SUCCESS) { - vk_query_pool_destroy(&device->vk, pAllocator, &pool->vk); + tu_query_pool_destroy(device, pool, pAllocator); return result; } result = tu_bo_map(device, pool->bo, NULL); if (result != VK_SUCCESS) { - tu_bo_finish(device, pool->bo); - vk_query_pool_destroy(&device->vk, pAllocator, &pool->vk); + tu_query_pool_destroy(device, pool, pAllocator); return result; } @@ -463,8 +445,7 @@ tu_DestroyQueryPool(VkDevice _device, TU_RMV(resource_destroy, device, pool); - tu_bo_finish(device, pool->bo); - vk_query_pool_destroy(&device->vk, pAllocator, &pool->vk); + tu_query_pool_destroy(device, pool, pAllocator); } static uint32_t @@ -1259,7 +1240,7 @@ emit_begin_perf_query_raw(struct tu_cmd_buffer *cmdbuf, * changes in perfcounter values should only apply to work done during * this query. */ - if (CHIP == A7XX) { + if (CHIP >= A7XX) { tu_cs_emit_pkt7(cs, CP_SCOPE_CNTL, 1); tu_cs_emit(cs, CP_SCOPE_CNTL_0(.disable_preemption = true, .scope = INTERRUPTS).value); @@ -1276,13 +1257,15 @@ emit_begin_perf_query_raw(struct tu_cmd_buffer *cmdbuf, emit_perfcntrs_pass_start(has_pred_bit, cs, data->pass); } - const struct fd_perfcntr_counter *counter = - &perf_query->perf_group[data->gid].counters[data->cntr_reg]; - const struct fd_perfcntr_countable *countable = - &perf_query->perf_group[data->gid].countables[data->cid]; + tu_cs_emit_pkt4(cs, data->counter->select_reg, 1); + tu_cs_emit(cs, data->countable->selector); - tu_cs_emit_pkt4(cs, counter->select_reg, 1); - tu_cs_emit(cs, countable->selector); + for (unsigned s = 0; s < ARRAY_SIZE(data->counter->slice_select_regs); s++) { + if (!data->counter->slice_select_regs[s]) + break; + tu_cs_emit_pkt4(cs, data->counter->slice_select_regs[s], 1); + tu_cs_emit(cs, data->countable->selector); + } } tu_cond_exec_end(cs); @@ -1300,8 +1283,7 @@ emit_begin_perf_query_raw(struct tu_cmd_buffer *cmdbuf, emit_perfcntrs_pass_start(has_pred_bit, cs, data->pass); } - const struct fd_perfcntr_counter *counter = - &perf_query->perf_group[data->gid].counters[data->cntr_reg]; + const struct fd_perfcntr_counter *counter = data->counter; uint64_t begin_iova = perf_query_iova(pool, query, begin, data->app_idx); @@ -1328,7 +1310,7 @@ emit_begin_perf_query_derived(struct tu_cmd_buffer *cmdbuf, * changes in perfcounter values should only apply to work done during * this query. 
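* I.e. the begin/end pair brackets the query like so: * *   CP_SCOPE_CNTL(.disable_preemption = true,  .scope = INTERRUPTS)   // begin *   ...counter selects + CP_REG_TO_MEM snapshots... *   CP_SCOPE_CNTL(.disable_preemption = false, .scope = INTERRUPTS)   // end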
*/ - if (CHIP == A7XX) { + if (CHIP >= A7XX) { tu_cs_emit_pkt7(cs, CP_SCOPE_CNTL, 1); tu_cs_emit(cs, CP_SCOPE_CNTL_0(.disable_preemption = true, .scope = INTERRUPTS).value); @@ -1340,6 +1322,13 @@ emit_begin_perf_query_derived(struct tu_cmd_buffer *cmdbuf, tu_cs_emit_pkt4(cs, counter->select_reg, 1); tu_cs_emit(cs, countable); + + for (unsigned s = 0; s < ARRAY_SIZE(counter->slice_select_regs); s++) { + if (!counter->slice_select_regs[s]) + break; + tu_cs_emit_pkt4(cs, counter->slice_select_regs[s], 1); + tu_cs_emit(cs, countable); + } } emit_counter_barrier(cs); @@ -1749,8 +1738,7 @@ emit_end_perf_query_raw(struct tu_cmd_buffer *cmdbuf, emit_perfcntrs_pass_start(has_pred_bit, cs, data->pass); } - const struct fd_perfcntr_counter *counter = - &perf_query->perf_group[data->gid].counters[data->cntr_reg]; + const struct fd_perfcntr_counter *counter = data->counter; end_iova = perf_query_iova(pool, query, end, data->app_idx); @@ -1799,7 +1787,7 @@ emit_end_perf_query_raw(struct tu_cmd_buffer *cmdbuf, /* This reverts the preemption disablement done at the start * of the query. */ - if (CHIP == A7XX) { + if (CHIP >= A7XX) { tu_cs_emit_pkt7(cs, CP_SCOPE_CNTL, 1); tu_cs_emit(cs, CP_SCOPE_CNTL_0(.disable_preemption = false, .scope = INTERRUPTS).value); @@ -1876,7 +1864,7 @@ emit_end_perf_query_derived(struct tu_cmd_buffer *cmdbuf, /* This reverts the preemption disablement done at the start * of the query. */ - if (CHIP == A7XX) { + if (CHIP >= A7XX) { tu_cs_emit_pkt7(cs, CP_SCOPE_CNTL, 1); tu_cs_emit(cs, CP_SCOPE_CNTL_0(.disable_preemption = false, .scope = INTERRUPTS).value); @@ -2317,9 +2305,12 @@ tu_GetPhysicalDeviceQueueFamilyPerformanceQueryPassesKHR( } for (uint32_t i = 0; i < group_count; i++) { - uint32_t available_counters = perfcntr_available_counters(&group[i]); - if (available_counters == 0) - continue; + /* Some counters may be unavailable at the time the query is + * created due to runtime factors (pps/fdperf using some counters, + * autotune or other queries, etc.). But we don't know that up + * front.
+ */ + uint32_t available_counters = group[i].num_counters; n_passes = DIV_ROUND_UP(counters_requested[i], available_counters); *pNumPasses = MAX2(*pNumPasses, n_passes); diff --git a/src/freedreno/vulkan/tu_query_pool.h b/src/freedreno/vulkan/tu_query_pool.h index b642934f130..b1c004fd484 100644 --- a/src/freedreno/vulkan/tu_query_pool.h +++ b/src/freedreno/vulkan/tu_query_pool.h @@ -11,6 +11,7 @@ #define TU_QUERY_POOL_H #include "tu_common.h" +#include "perfcntrs/freedreno_perfcntr.h" #include "vk_query_pool.h" @@ -24,9 +25,8 @@ enum tu_perf_query_type { struct tu_perf_query_raw_data { - uint32_t gid; /* group-id */ - uint32_t cid; /* countable-id within the group */ - uint32_t cntr_reg; /* counter register within the group */ + const struct fd_perfcntr_counter *counter; + const struct fd_perfcntr_countable *countable; uint32_t pass; /* pass index that countables can be requested */ uint32_t app_idx; /* index provided by apps */ }; diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_query.cc b/src/gallium/drivers/freedreno/a6xx/fd6_query.cc index 223758fcf12..2f54f09a144 100644 --- a/src/gallium/drivers/freedreno/a6xx/fd6_query.cc +++ b/src/gallium/drivers/freedreno/a6xx/fd6_query.cc @@ -824,6 +824,7 @@ static const struct fd_acc_sample_provider so_overflow_predicate = { struct fd_batch_query_entry { uint8_t gid; /* group-id */ uint8_t cid; /* countable-id within the group */ + const struct fd_perfcntr_counter *counter; }; struct fd_batch_query_data { @@ -839,33 +840,32 @@ perfcntr_resume(struct fd_acc_query *aq, struct fd_batch *batch) assert_dt struct fd_screen *screen = data->screen; fd_cs cs(batch->draw); - unsigned counters_per_group[screen->num_perfcntr_groups]; - memset(counters_per_group, 0, sizeof(counters_per_group)); - fd_pkt7(cs, CP_WAIT_FOR_IDLE, 0); /* configure performance counters for the requested queries: */ for (unsigned i = 0; i < data->num_query_entries; i++) { struct fd_batch_query_entry *entry = &data->query_entries[i]; const struct fd_perfcntr_group *g = &screen->perfcntr_groups[entry->gid]; - unsigned counter_idx = counters_per_group[entry->gid]++; - - assert(counter_idx < g->num_counters); fd_pkt4(cs, 1).add((fd_reg_pair){ - .reg = g->counters[counter_idx].select_reg, + .reg = entry->counter->select_reg, .value = g->countables[entry->cid].selector, }); - } - memset(counters_per_group, 0, sizeof(counters_per_group)); + for (unsigned s = 0; s < ARRAY_SIZE(entry->counter->slice_select_regs); s++) { + if (!entry->counter->slice_select_regs[s]) + break; + fd_pkt4(cs, 1).add((fd_reg_pair){ + .reg = entry->counter->slice_select_regs[s], + .value = g->countables[entry->cid].selector, + }); + } + } /* and snapshot the start values */ for (unsigned i = 0; i < data->num_query_entries; i++) { struct fd_batch_query_entry *entry = &data->query_entries[i]; - const struct fd_perfcntr_group *g = &screen->perfcntr_groups[entry->gid]; - unsigned counter_idx = counters_per_group[entry->gid]++; - const struct fd_perfcntr_counter *counter = &g->counters[counter_idx]; + const struct fd_perfcntr_counter *counter = entry->counter; fd_pkt7(cs, CP_REG_TO_MEM, 3) .add(CP_REG_TO_MEM_0(.reg = counter->counter_reg_lo, ._64b = true)) @@ -877,12 +877,8 @@ static void perfcntr_pause(struct fd_acc_query *aq, struct fd_batch *batch) assert_dt { struct fd_batch_query_data *data = (struct fd_batch_query_data *)aq->query_data; - struct fd_screen *screen = data->screen; fd_cs cs(batch->draw); - unsigned counters_per_group[screen->num_perfcntr_groups]; - memset(counters_per_group, 0,
sizeof(counters_per_group)); - fd_pkt7(cs, CP_WAIT_FOR_IDLE, 0); /* TODO do we need to bother to turn anything off? */ @@ -890,9 +886,7 @@ perfcntr_pause(struct fd_acc_query *aq, struct fd_batch *batch) assert_dt /* snapshot the end values: */ for (unsigned i = 0; i < data->num_query_entries; i++) { struct fd_batch_query_entry *entry = &data->query_entries[i]; - const struct fd_perfcntr_group *g = &screen->perfcntr_groups[entry->gid]; - unsigned counter_idx = counters_per_group[entry->gid]++; - const struct fd_perfcntr_counter *counter = &g->counters[counter_idx]; + const struct fd_perfcntr_counter *counter = entry->counter; fd_pkt7(cs, CP_REG_TO_MEM, 3) .add(CP_REG_TO_MEM_0(.reg = counter->counter_reg_lo, ._64b = true)) @@ -925,12 +919,24 @@ perfcntr_accumulate_result(struct fd_acc_query *aq, } } +static void +perfcntr_cleanup(void *query_data) +{ + struct fd_batch_query_data *data = (struct fd_batch_query_data *)query_data; + + for (unsigned i = 0; i < data->num_query_entries; i++) { + struct fd_batch_query_entry *entry = &data->query_entries[i]; + fd_perfcntr_release(data->screen->perfcntrs, entry->counter); + } +} + static const struct fd_acc_sample_provider perfcntr = { .query_type = FD_QUERY_FIRST_PERFCNTR, .always = true, .resume = perfcntr_resume, .pause = perfcntr_pause, .result = perfcntr_accumulate_result, + .cleanup = perfcntr_cleanup, }; static struct pipe_query * @@ -949,13 +955,6 @@ fd6_create_batch_query(struct pipe_context *pctx, unsigned num_queries, data->screen = screen; data->num_query_entries = num_queries; - /* validate the requested query_types and ensure we don't try - * to request more query_types of a given group than we have - * counters: - */ - unsigned counters_per_group[screen->num_perfcntr_groups]; - memset(counters_per_group, 0, sizeof(counters_per_group)); - for (unsigned i = 0; i < num_queries; i++) { unsigned idx = query_types[i] - FD_QUERY_FIRST_PERFCNTR; @@ -985,13 +984,15 @@ fd6_create_batch_query(struct pipe_context *pctx, unsigned num_queries, entry->cid++; } - if (counters_per_group[entry->gid] >= - screen->perfcntr_groups[entry->gid].num_counters) { - mesa_loge("too many counters for group %u", entry->gid); + const struct fd_perfcntr_group *g = &screen->perfcntr_groups[entry->gid]; + const struct fd_perfcntr_countable *c = &g->countables[entry->cid]; + + entry->counter = fd_perfcntr_reserve(screen->perfcntrs, g, c); + + if (!entry->counter) { + mesa_loge("Could not reserve counter for %s.%s", g->name, c->name); goto error; } - - counters_per_group[entry->gid]++; } q = fd_acc_create_query2(ctx, 0, 0, &perfcntr); @@ -1004,6 +1005,7 @@ fd6_create_batch_query(struct pipe_context *pctx, unsigned num_queries, return (struct pipe_query *)q; error: + perfcntr_cleanup(data); free(data); return NULL; } diff --git a/src/gallium/drivers/freedreno/freedreno_query.c b/src/gallium/drivers/freedreno/freedreno_query.c index 2a41ddea8d0..5982fe87252 100644 --- a/src/gallium/drivers/freedreno/freedreno_query.c +++ b/src/gallium/drivers/freedreno/freedreno_query.c @@ -184,8 +184,12 @@ setup_perfcntr_query_info(struct fd_screen *screen) { unsigned num_queries = 0; - for (unsigned i = 0; i < screen->num_perfcntr_groups; i++) - num_queries += screen->perfcntr_groups[i].num_countables; + for (unsigned i = 0; i < screen->num_perfcntr_groups; i++) { + const struct fd_perfcntr_group *g = &screen->perfcntr_groups[i]; + if (g->pipe > PIPE_BR) + continue; + num_queries += g->num_countables; + } screen->perfcntr_queries = calloc(num_queries,
sizeof(screen->perfcntr_queries[0])); @@ -194,6 +198,8 @@ setup_perfcntr_query_info(struct fd_screen *screen) unsigned idx = 0; for (unsigned i = 0; i < screen->num_perfcntr_groups; i++) { const struct fd_perfcntr_group *g = &screen->perfcntr_groups[i]; + if (g->pipe > PIPE_BR) + continue; for (unsigned j = 0; j < g->num_countables; j++) { struct pipe_driver_query_info *info = &screen->perfcntr_queries[idx]; const struct fd_perfcntr_countable *c = &g->countables[j]; diff --git a/src/gallium/drivers/freedreno/freedreno_query_acc.c b/src/gallium/drivers/freedreno/freedreno_query_acc.c index 6af0e9697ad..51051c81b97 100644 --- a/src/gallium/drivers/freedreno/freedreno_query_acc.c +++ b/src/gallium/drivers/freedreno/freedreno_query_acc.c @@ -21,6 +21,9 @@ fd_acc_destroy_query(struct fd_context *ctx, struct fd_query *q) assert_dt DBG("%p", q); + if (aq->provider->cleanup) + aq->provider->cleanup(aq->query_data); + pipe_resource_reference(&aq->prsc, NULL); list_del(&aq->node); diff --git a/src/gallium/drivers/freedreno/freedreno_query_acc.h b/src/gallium/drivers/freedreno/freedreno_query_acc.h index f06511e2dd8..cc4daefd32e 100644 --- a/src/gallium/drivers/freedreno/freedreno_query_acc.h +++ b/src/gallium/drivers/freedreno/freedreno_query_acc.h @@ -72,6 +72,7 @@ struct fd_acc_sample_provider { void (*result_resource)(struct fd_acc_query *aq, struct fd_ringbuffer *ring, enum pipe_query_value_type result_type, int index, struct fd_resource *dst, unsigned offset); + void (*cleanup)(void *query_data); /* optional cleanup */ }; struct fd_acc_query { diff --git a/src/gallium/drivers/freedreno/freedreno_screen.c b/src/gallium/drivers/freedreno/freedreno_screen.c index 4b128d437f1..0efd70af3d9 100644 --- a/src/gallium/drivers/freedreno/freedreno_screen.c +++ b/src/gallium/drivers/freedreno/freedreno_screen.c @@ -165,6 +165,8 @@ fd_screen_destroy(struct pipe_screen *pscreen) if (screen->ro) screen->ro->destroy(screen->ro); + fd_perfcntr_state_free(screen->perfcntrs); + fd_bc_fini(&screen->batch_cache); fd_gmem_screen_fini(pscreen); @@ -1057,7 +1059,10 @@ fd_screen_create(int fd, if (screen->primtypes[i]) screen->primtypes_mask |= (1 << i); - if (FD_DBG(PERFC)) { + screen->perfcntrs = fd_perfcntr_state_alloc(screen->dev_id, fd); + + if (FD_DBG(PERFC) || + (screen->perfcntrs && fd_perfcntr_has_reservation(screen->perfcntrs))) { screen->perfcntr_groups = fd_perfcntrs(screen->dev_id, &screen->num_perfcntr_groups); } diff --git a/src/gallium/drivers/freedreno/freedreno_screen.h b/src/gallium/drivers/freedreno/freedreno_screen.h index 137fea1c5b8..4ae53ac0b10 100644 --- a/src/gallium/drivers/freedreno/freedreno_screen.h +++ b/src/gallium/drivers/freedreno/freedreno_screen.h @@ -106,6 +106,7 @@ struct fd_screen { unsigned num_perfcntr_groups; const struct fd_perfcntr_group *perfcntr_groups; + struct fd_perfcntr_state *perfcntrs; /* generated at startup from the perfcntr groups: */ unsigned num_perfcntr_queries;
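
The hunks above all converge on the same reserve/release lifecycle, so a condensed sketch of it follows. This is illustrative only: perfcntr_lifecycle_example() is a hypothetical function, error handling is minimal, and it assumes the fd_perfcntr_* entry points behave exactly as the tu_query_pool.cc, fd6_query.cc, and freedreno_screen.c hunks use them.

#include "perfcntrs/freedreno_perfcntr.h"

/* Hypothetical consumer of the reservation API added by this series;
 * not a drop-in replacement for any driver code.
 */
static void
perfcntr_lifecycle_example(const struct fd_dev_id *dev_id, int drm_fd)
{
   unsigned num_groups;
   const struct fd_perfcntr_group *groups = fd_perfcntrs(dev_id, &num_groups);

   /* One shared reservation state per screen/device, as in
    * fd_screen_create():
    */
   struct fd_perfcntr_state *state = fd_perfcntr_state_alloc(dev_id, drm_fd);
   if (!state || !fd_perfcntr_has_reservation(state) || !num_groups)
      goto out;

   const struct fd_perfcntr_group *group = &groups[0];
   const struct fd_perfcntr_countable *countable = &group->countables[0];

   /* Claim a free physical counter register in the group; NULL means
    * they are all in use (pps/fdperf, autotune, another query, ...):
    */
   const struct fd_perfcntr_counter *counter =
      fd_perfcntr_reserve(state, group, countable);

   if (counter) {
      /* Between reserve and release the caller programs
       * counter->select_reg (and any populated slice_select_regs[])
       * with countable->selector, then snapshots counter_reg_lo/hi
       * with CP_REG_TO_MEM, as the query hunks above do.
       */
      fd_perfcntr_release(state, counter);
   }

out:
   /* fd_screen_destroy() calls this unconditionally, so it is assumed
    * to tolerate a NULL state:
    */
   fd_perfcntr_state_free(state);
}

The point of routing everything through fd_perfcntr_state is that raw Vulkan queries, gallium batch queries, and external tools all contend for the same physical counter registers, so the old local per-group round-robin assignment (the removed counters_per_group[] logic) can no longer decide counter placement on its own.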