mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-05-21 04:38:09 +02:00
Timestamp read is not in any hot path, so there is no downside to sharing the same function between iris, crocus, anv and hasvk. While at it, also drop the functions to read MMIO from the kernel; their only use was reading the render timestamp, so we don't need them. v2: - fix compilation of ds Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com> Signed-off-by: José Roberto de Souza <jose.souza@intel.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/18920>
341 lines
9.9 KiB
C++
341 lines
9.9 KiB
C++
/*
|
|
* Copyright © 2020-2021 Collabora, Ltd.
|
|
* Author: Antonio Caggiano <antonio.caggiano@collabora.com>
|
|
* Author: Corentin Noël <corentin.noel@collabora.com>
|
|
*
|
|
* SPDX-License-Identifier: MIT
|
|
*/
|
|
|
|
#include "intel_pps_driver.h"
|
|
|
|
#include <dirent.h>
|
|
#include <fcntl.h>
|
|
#include <math.h>
|
|
#include <poll.h>
|
|
#include <strings.h>
|
|
#include <sys/ioctl.h>
|
|
#include <unistd.h>
|
|
|
|
#include "drm-uapi/i915_drm.h"
|
|
|
|
#include "common/intel_gem.h"
|
|
#include "dev/intel_device_info.h"
|
|
#include "perf/intel_perf.h"
|
|
#include "perf/intel_perf_query.h"
|
|
|
|
#include <pps/pps.h>
|
|
#include <pps/pps_algorithm.h>
|
|
|
|
#include "intel_pps_perf.h"
|
|
#include "intel_pps_priv.h"
|
|
|
|
namespace pps
|
|
{
|
|
|
|
// The HW sampling period is programmed using period_exponent following this
|
|
// formula:
|
|
// sample_period = timestamp_period * 2^(period_exponent + 1)
|
|
// So our minimum sampling period is twice the timestamp period
|
|
|
|
uint64_t IntelDriver::get_min_sampling_period_ns()
|
|
{
|
|
return (2.f * perf->devinfo.timestamp_frequency) / 1000000000ull;
|
|
}
|
|
|
|
/// @brief Nothing to set up at construction time
IntelDriver::IntelDriver() = default;
|
|
|
|
/// @brief Nothing to release explicitly; members clean up after themselves
IntelDriver::~IntelDriver() = default;
|
|
|
|
void IntelDriver::enable_counter(uint32_t counter_id)
|
|
{
|
|
auto &counter = counters[counter_id];
|
|
|
|
enabled_counters.emplace_back(counter);
|
|
}
|
|
|
|
void IntelDriver::enable_all_counters()
|
|
{
|
|
// We should only have one group
|
|
assert(groups.size() == 1);
|
|
for (uint32_t counter_id : groups[0].counters) {
|
|
auto &counter = counters[counter_id];
|
|
enabled_counters.emplace_back(counter);
|
|
}
|
|
}
|
|
|
|
bool IntelDriver::init_perfcnt()
|
|
{
|
|
/* Note: clock_id's below 128 are reserved.. for custom clock sources,
|
|
* using the hash of a namespaced string is the recommended approach.
|
|
* See: https://perfetto.dev/docs/concepts/clock-sync
|
|
*/
|
|
this->clock_id = intel_pps_clock_id(drm_device.gpu_num);
|
|
|
|
assert(!perf && "Intel perf should not be initialized at this point");
|
|
|
|
perf = std::make_unique<IntelPerf>(drm_device.fd);
|
|
|
|
const char *metric_set_name = getenv("INTEL_PERFETTO_METRIC_SET");
|
|
|
|
struct intel_perf_query_info *default_query = nullptr;
|
|
selected_query = nullptr;
|
|
for (auto &query : perf->get_queries()) {
|
|
if (!strcmp(query->symbol_name, "RenderBasic"))
|
|
default_query = query;
|
|
if (metric_set_name && !strcmp(query->symbol_name, metric_set_name))
|
|
selected_query = query;
|
|
}
|
|
|
|
assert(default_query);
|
|
|
|
if (!selected_query) {
|
|
if (metric_set_name) {
|
|
PPS_LOG_ERROR("Available metric sets:");
|
|
for (auto &query : perf->get_queries())
|
|
PPS_LOG_ERROR(" %s", query->symbol_name);
|
|
PPS_LOG_FATAL("Metric set '%s' not available.", metric_set_name);
|
|
}
|
|
selected_query = default_query;
|
|
}
|
|
|
|
PPS_LOG("Using metric set '%s': %s",
|
|
selected_query->symbol_name, selected_query->name);
|
|
|
|
// Create group
|
|
CounterGroup group = {};
|
|
group.id = groups.size();
|
|
group.name = selected_query->symbol_name;
|
|
|
|
for (int i = 0; i < selected_query->n_counters; ++i) {
|
|
intel_perf_query_counter &counter = selected_query->counters[i];
|
|
|
|
// Create counter
|
|
Counter counter_desc = {};
|
|
counter_desc.id = counters.size();
|
|
counter_desc.name = counter.symbol_name;
|
|
counter_desc.group = group.id;
|
|
counter_desc.getter = [counter, this](
|
|
const Counter &c, const Driver &dri) -> Counter::Value {
|
|
switch (counter.data_type) {
|
|
case INTEL_PERF_COUNTER_DATA_TYPE_UINT64:
|
|
case INTEL_PERF_COUNTER_DATA_TYPE_UINT32:
|
|
case INTEL_PERF_COUNTER_DATA_TYPE_BOOL32:
|
|
return (int64_t)counter.oa_counter_read_uint64(perf->cfg,
|
|
selected_query,
|
|
&perf->result);
|
|
break;
|
|
case INTEL_PERF_COUNTER_DATA_TYPE_DOUBLE:
|
|
case INTEL_PERF_COUNTER_DATA_TYPE_FLOAT:
|
|
return counter.oa_counter_read_float(perf->cfg,
|
|
selected_query,
|
|
&perf->result);
|
|
break;
|
|
}
|
|
|
|
return {};
|
|
};
|
|
|
|
// Add counter id to the group
|
|
group.counters.emplace_back(counter_desc.id);
|
|
|
|
// Store counter
|
|
counters.emplace_back(std::move(counter_desc));
|
|
}
|
|
|
|
// Store group
|
|
groups.emplace_back(std::move(group));
|
|
|
|
assert(counters.size() && "Failed to query counters");
|
|
|
|
// Clear accumulations
|
|
intel_perf_query_result_clear(&perf->result);
|
|
|
|
return true;
|
|
}
|
|
|
|
void IntelDriver::enable_perfcnt(uint64_t sampling_period_ns)
|
|
{
|
|
this->sampling_period_ns = sampling_period_ns;
|
|
|
|
intel_gem_read_render_timestamp(drm_device.fd, &gpu_timestamp_udw);
|
|
gpu_timestamp_udw &= ~perf->cfg->oa_timestamp_mask;
|
|
if (!perf->open(sampling_period_ns, selected_query)) {
|
|
PPS_LOG_FATAL("Failed to open intel perf");
|
|
}
|
|
}
|
|
|
|
void IntelDriver::disable_perfcnt()
|
|
{
|
|
gpu_timestamp_udw = 0;
|
|
perf = nullptr;
|
|
groups.clear();
|
|
counters.clear();
|
|
enabled_counters.clear();
|
|
}
|
|
|
|
/// @brief Some perf record durations can be really short
/// @param duration Time elapsed since the last kept record
/// @param sampling_period Sampling period, in the same unit as duration
/// @return True if the duration is at least close to the sampling period,
/// i.e. it is longer than the sampling period minus a fixed tolerance
static bool close_enough(uint64_t duration, uint64_t sampling_period)
{
   constexpr uint64_t tolerance = 100000;

   // With a sampling period shorter than the tolerance the threshold would be
   // negative; computing it with unsigned arithmetic wraps around to a huge
   // value and rejects every record. Any duration is close enough in that case.
   if (sampling_period < tolerance) {
      return true;
   }

   return duration > sampling_period - tolerance;
}
|
|
|
|
/// @brief Transforms the raw data received in from the driver into records
|
|
std::vector<PerfRecord> IntelDriver::parse_perf_records(const std::vector<uint8_t> &data,
|
|
const size_t byte_count)
|
|
{
|
|
std::vector<PerfRecord> records;
|
|
records.reserve(128);
|
|
|
|
PerfRecord record;
|
|
record.data.reserve(512);
|
|
|
|
const uint8_t *iter = data.data();
|
|
const uint8_t *end = iter + byte_count;
|
|
|
|
uint64_t prev_gpu_timestamp = last_gpu_timestamp;
|
|
|
|
while (iter < end) {
|
|
// Iterate a record at a time
|
|
auto header = reinterpret_cast<const drm_i915_perf_record_header *>(iter);
|
|
|
|
if (header->type == DRM_I915_PERF_RECORD_SAMPLE) {
|
|
// Report is next to the header
|
|
const uint32_t *report = reinterpret_cast<const uint32_t *>(header + 1);
|
|
uint64_t gpu_timestamp_ldw =
|
|
intel_perf_report_timestamp(selected_query, report);
|
|
|
|
/* Our HW only provides us with the lower 32 bits of the 36bits
|
|
* timestamp counter value. If we haven't captured the top bits yet,
|
|
* do it now. If we see a roll over the lower 32bits capture it
|
|
* again.
|
|
*/
|
|
if (gpu_timestamp_udw == 0 ||
|
|
(gpu_timestamp_udw | gpu_timestamp_ldw) < last_gpu_timestamp) {
|
|
intel_gem_read_render_timestamp(drm_device.fd, &gpu_timestamp_udw);
|
|
gpu_timestamp_udw &= ~perf->cfg->oa_timestamp_mask;
|
|
}
|
|
|
|
uint64_t gpu_timestamp = gpu_timestamp_udw | gpu_timestamp_ldw;
|
|
|
|
auto duration = intel_device_info_timebase_scale(&perf->devinfo,
|
|
gpu_timestamp - prev_gpu_timestamp);
|
|
|
|
// Skip perf-records that are too short by checking
|
|
// the distance between last report and this one
|
|
if (close_enough(duration, sampling_period_ns)) {
|
|
prev_gpu_timestamp = gpu_timestamp;
|
|
|
|
// Add the new record to the list
|
|
record.timestamp = gpu_timestamp;
|
|
record.data.resize(header->size); // Possibly 264?
|
|
memcpy(record.data.data(), iter, header->size);
|
|
records.emplace_back(record);
|
|
}
|
|
}
|
|
|
|
// Go to the next record
|
|
iter += header->size;
|
|
}
|
|
|
|
return records;
|
|
}
|
|
|
|
/// @brief Read all the available data from the metric set currently in use
|
|
void IntelDriver::read_data_from_metric_set()
|
|
{
|
|
assert(metric_buffer.size() >= 1024 && "Metric buffer should have space for reading");
|
|
|
|
ssize_t bytes_read = 0;
|
|
while ((bytes_read = perf->read_oa_stream(metric_buffer.data() + total_bytes_read,
|
|
metric_buffer.size() - total_bytes_read)) > 0 ||
|
|
errno == EINTR) {
|
|
total_bytes_read += std::max(ssize_t(0), bytes_read);
|
|
|
|
// Increase size of the buffer for the next read
|
|
if (metric_buffer.size() / 2 < total_bytes_read) {
|
|
metric_buffer.resize(metric_buffer.size() * 2);
|
|
}
|
|
}
|
|
|
|
assert(total_bytes_read < metric_buffer.size() && "Buffer not big enough");
|
|
}
|
|
|
|
bool IntelDriver::dump_perfcnt()
|
|
{
|
|
if (!perf->oa_stream_ready()) {
|
|
return false;
|
|
}
|
|
|
|
read_data_from_metric_set();
|
|
|
|
auto new_records = parse_perf_records(metric_buffer, total_bytes_read);
|
|
if (new_records.empty()) {
|
|
// No new records from the GPU yet
|
|
return false;
|
|
} else {
|
|
// Records are parsed correctly, so we can reset the
|
|
// number of bytes read so far from the metric set
|
|
total_bytes_read = 0;
|
|
}
|
|
|
|
APPEND(records, new_records);
|
|
|
|
if (records.size() < 2) {
|
|
// Not enough records to accumulate
|
|
return false;
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
uint64_t IntelDriver::gpu_next()
|
|
{
|
|
if (records.size() < 2) {
|
|
// Not enough records to accumulate
|
|
return 0;
|
|
}
|
|
|
|
// Get first and second
|
|
auto record_a = reinterpret_cast<const drm_i915_perf_record_header *>(records[0].data.data());
|
|
auto record_b = reinterpret_cast<const drm_i915_perf_record_header *>(records[1].data.data());
|
|
|
|
intel_perf_query_result_accumulate_fields(&perf->result,
|
|
selected_query,
|
|
record_a + 1,
|
|
record_b + 1,
|
|
false /* no_oa_accumulate */);
|
|
|
|
// Get last timestamp
|
|
auto gpu_timestamp = records[1].timestamp;
|
|
|
|
// Consume first record
|
|
records.erase(std::begin(records), std::begin(records) + 1);
|
|
|
|
return intel_device_info_timebase_scale(&perf->devinfo, gpu_timestamp);
|
|
}
|
|
|
|
uint64_t IntelDriver::next()
|
|
{
|
|
// Reset accumulation
|
|
intel_perf_query_result_clear(&perf->result);
|
|
return gpu_next();
|
|
}
|
|
|
|
uint32_t IntelDriver::gpu_clock_id() const
|
|
{
|
|
return this->clock_id;
|
|
}
|
|
|
|
uint64_t IntelDriver::gpu_timestamp() const
|
|
{
|
|
uint64_t timestamp;
|
|
intel_gem_read_render_timestamp(drm_device.fd, ×tamp);
|
|
return intel_device_info_timebase_scale(&perf->devinfo, timestamp);
|
|
}
|
|
|
|
} // namespace pps
|