mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2025-12-22 11:20:11 +01:00
intel/ds: drop timestamp correlation code
Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com> Reviewed-by: Antonio Caggiano <antonio.caggiano@collabora.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/13831>
This commit is contained in:
parent
21a1c6995c
commit
4ef6698a26
2 changed files with 88 additions and 159 deletions
|
|
@ -37,6 +37,45 @@ uint64_t IntelDriver::get_min_sampling_period_ns()
|
||||||
return (2.f * perf->devinfo.timestamp_frequency) / 1000000000ull;
|
return (2.f * perf->devinfo.timestamp_frequency) / 1000000000ull;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
uint64_t scale_gpu_timestamp(uint64_t ts, uint64_t timestamp_frequency)
|
||||||
|
{
|
||||||
|
// Try to avoid going over the 64bits when doing the scaling
|
||||||
|
uint64_t lower_ts = ts >> 6;
|
||||||
|
uint64_t scaled_ts = lower_ts * 1000000000ull / timestamp_frequency;
|
||||||
|
scaled_ts <<= 6;
|
||||||
|
scaled_ts += (ts & 0x3f) * 1000000000ull / timestamp_frequency;
|
||||||
|
return scaled_ts;
|
||||||
|
}
|
||||||
|
|
||||||
|
uint64_t read_gpu_timestamp(int drm_fd)
|
||||||
|
{
|
||||||
|
drm_i915_reg_read reg_read = {};
|
||||||
|
const uint64_t render_ring_timestamp = 0x2358;
|
||||||
|
reg_read.offset = render_ring_timestamp | I915_REG_READ_8B_WA;
|
||||||
|
|
||||||
|
if (perf_ioctl(drm_fd, DRM_IOCTL_I915_REG_READ, ®_read) < 0) {
|
||||||
|
PPS_LOG_ERROR("Unable to read GPU clock");
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
return reg_read.val;
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
IntelDriver::IntelDriver()
|
||||||
|
{
|
||||||
|
/* Note: clock_id's below 128 are reserved.. for custom clock sources,
|
||||||
|
* using the hash of a namespaced string is the recommended approach.
|
||||||
|
* See: https://perfetto.dev/docs/concepts/clock-sync
|
||||||
|
*/
|
||||||
|
this->clock_id =
|
||||||
|
_mesa_hash_string("org.freedesktop.mesa.intel") | 0x80000000;
|
||||||
|
}
|
||||||
|
|
||||||
|
IntelDriver::~IntelDriver()
|
||||||
|
{
|
||||||
|
}
|
||||||
|
|
||||||
void IntelDriver::enable_counter(uint32_t counter_id)
|
void IntelDriver::enable_counter(uint32_t counter_id)
|
||||||
{
|
{
|
||||||
auto &counter = counters[counter_id];
|
auto &counter = counters[counter_id];
|
||||||
|
|
@ -75,71 +114,6 @@ void IntelDriver::enable_all_counters()
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static uint64_t timespec_diff(timespec *begin, timespec *end)
|
|
||||||
{
|
|
||||||
return 1000000000ull * (end->tv_sec - begin->tv_sec) + end->tv_nsec - begin->tv_nsec;
|
|
||||||
}
|
|
||||||
|
|
||||||
/// @brief This function tries to correlate CPU time with GPU time
|
|
||||||
std::optional<TimestampCorrelation> IntelDriver::query_correlation_timestamps() const
|
|
||||||
{
|
|
||||||
TimestampCorrelation corr = {};
|
|
||||||
|
|
||||||
clock_t correlation_clock_id = CLOCK_BOOTTIME;
|
|
||||||
|
|
||||||
drm_i915_reg_read reg_read = {};
|
|
||||||
const uint64_t render_ring_timestamp = 0x2358;
|
|
||||||
reg_read.offset = render_ring_timestamp | I915_REG_READ_8B_WA;
|
|
||||||
|
|
||||||
constexpr size_t attempt_count = 3;
|
|
||||||
struct {
|
|
||||||
timespec cpu_ts_begin;
|
|
||||||
timespec cpu_ts_end;
|
|
||||||
uint64_t gpu_ts;
|
|
||||||
} attempts[attempt_count] = {};
|
|
||||||
|
|
||||||
uint32_t best = 0;
|
|
||||||
|
|
||||||
// Gather 3 correlations
|
|
||||||
for (uint32_t i = 0; i < attempt_count; i++) {
|
|
||||||
clock_gettime(correlation_clock_id, &attempts[i].cpu_ts_begin);
|
|
||||||
if (perf_ioctl(drm_device.fd, DRM_IOCTL_I915_REG_READ, ®_read) < 0) {
|
|
||||||
return std::nullopt;
|
|
||||||
}
|
|
||||||
clock_gettime(correlation_clock_id, &attempts[i].cpu_ts_end);
|
|
||||||
|
|
||||||
attempts[i].gpu_ts = reg_read.val;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Now select the best
|
|
||||||
for (uint32_t i = 1; i < attempt_count; i++) {
|
|
||||||
if (timespec_diff(&attempts[i].cpu_ts_begin, &attempts[i].cpu_ts_end) <
|
|
||||||
timespec_diff(&attempts[best].cpu_ts_begin, &attempts[best].cpu_ts_end)) {
|
|
||||||
best = i;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
corr.cpu_timestamp =
|
|
||||||
(attempts[best].cpu_ts_begin.tv_sec * 1000000000ull + attempts[best].cpu_ts_begin.tv_nsec) +
|
|
||||||
timespec_diff(&attempts[best].cpu_ts_begin, &attempts[best].cpu_ts_end) / 2;
|
|
||||||
corr.gpu_timestamp = attempts[best].gpu_ts;
|
|
||||||
|
|
||||||
return corr;
|
|
||||||
}
|
|
||||||
|
|
||||||
void IntelDriver::get_new_correlation()
|
|
||||||
{
|
|
||||||
// Rotate left correlations by one position so to make space at the end
|
|
||||||
std::rotate(correlations.begin(), correlations.begin() + 1, correlations.end());
|
|
||||||
|
|
||||||
// Then we overwrite the last correlation with a new one
|
|
||||||
if (auto corr = query_correlation_timestamps()) {
|
|
||||||
correlations.back() = *corr;
|
|
||||||
} else {
|
|
||||||
PPS_LOG_FATAL("Failed to get correlation timestamps");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
bool IntelDriver::init_perfcnt()
|
bool IntelDriver::init_perfcnt()
|
||||||
{
|
{
|
||||||
assert(!perf && "Intel perf should not be initialized at this point");
|
assert(!perf && "Intel perf should not be initialized at this point");
|
||||||
|
|
@ -201,41 +175,11 @@ void IntelDriver::enable_perfcnt(uint64_t sampling_period_ns)
|
||||||
{
|
{
|
||||||
this->sampling_period_ns = sampling_period_ns;
|
this->sampling_period_ns = sampling_period_ns;
|
||||||
|
|
||||||
// Fill correlations with an initial one
|
|
||||||
if (auto corr = query_correlation_timestamps()) {
|
|
||||||
correlations.fill(*corr);
|
|
||||||
} else {
|
|
||||||
PPS_LOG_FATAL("Failed to get correlation timestamps");
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!perf->open(sampling_period_ns)) {
|
if (!perf->open(sampling_period_ns)) {
|
||||||
PPS_LOG_FATAL("Failed to open intel perf");
|
PPS_LOG_FATAL("Failed to open intel perf");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// @brief Transforms the GPU timestop into a CPU timestamp equivalent
|
|
||||||
uint64_t IntelDriver::correlate_gpu_timestamp(const uint32_t gpu_ts)
|
|
||||||
{
|
|
||||||
auto &corr_a = correlations[0];
|
|
||||||
auto &corr_b = correlations[correlations.size() - 1];
|
|
||||||
|
|
||||||
// A correlation timestamp has 36 bits, so get the first 32 to make it work with gpu_ts
|
|
||||||
uint64_t mask = 0xffffffff;
|
|
||||||
uint32_t corr_a_gpu_ts = corr_a.gpu_timestamp & mask;
|
|
||||||
uint32_t corr_b_gpu_ts = corr_b.gpu_timestamp & mask;
|
|
||||||
|
|
||||||
// Make sure it is within the interval [a,b)
|
|
||||||
assert(gpu_ts >= corr_a_gpu_ts && "GPU TS < Corr a");
|
|
||||||
assert(gpu_ts < corr_b_gpu_ts && "GPU TS >= Corr b");
|
|
||||||
|
|
||||||
uint32_t gpu_delta = gpu_ts - corr_a_gpu_ts;
|
|
||||||
// Factor to convert gpu time to cpu time
|
|
||||||
double gpu_to_cpu = (corr_b.cpu_timestamp - corr_a.cpu_timestamp) /
|
|
||||||
double(corr_b.gpu_timestamp - corr_a.gpu_timestamp);
|
|
||||||
uint64_t cpu_delta = gpu_delta * gpu_to_cpu;
|
|
||||||
return corr_a.cpu_timestamp + cpu_delta;
|
|
||||||
}
|
|
||||||
|
|
||||||
void IntelDriver::disable_perfcnt()
|
void IntelDriver::disable_perfcnt()
|
||||||
{
|
{
|
||||||
perf = nullptr;
|
perf = nullptr;
|
||||||
|
|
@ -244,12 +188,6 @@ void IntelDriver::disable_perfcnt()
|
||||||
enabled_counters.clear();
|
enabled_counters.clear();
|
||||||
}
|
}
|
||||||
|
|
||||||
struct Report {
|
|
||||||
uint32_t version;
|
|
||||||
uint32_t timestamp;
|
|
||||||
uint32_t id;
|
|
||||||
};
|
|
||||||
|
|
||||||
/// @brief Some perf record durations can be really short
|
/// @brief Some perf record durations can be really short
|
||||||
/// @return True if the duration is at least close to the sampling period
|
/// @return True if the duration is at least close to the sampling period
|
||||||
static bool close_enough(uint64_t duration, uint64_t sampling_period)
|
static bool close_enough(uint64_t duration, uint64_t sampling_period)
|
||||||
|
|
@ -265,12 +203,12 @@ std::vector<PerfRecord> IntelDriver::parse_perf_records(const std::vector<uint8_
|
||||||
records.reserve(128);
|
records.reserve(128);
|
||||||
|
|
||||||
PerfRecord record;
|
PerfRecord record;
|
||||||
record.reserve(512);
|
record.data.reserve(512);
|
||||||
|
|
||||||
const uint8_t *iter = data.data();
|
const uint8_t *iter = data.data();
|
||||||
const uint8_t *end = iter + byte_count;
|
const uint8_t *end = iter + byte_count;
|
||||||
|
|
||||||
uint64_t prev_cpu_timestamp = last_cpu_timestamp;
|
uint64_t prev_gpu_timestamp = last_gpu_timestamp;
|
||||||
|
|
||||||
while (iter < end) {
|
while (iter < end) {
|
||||||
// Iterate a record at a time
|
// Iterate a record at a time
|
||||||
|
|
@ -278,18 +216,32 @@ std::vector<PerfRecord> IntelDriver::parse_perf_records(const std::vector<uint8_
|
||||||
|
|
||||||
if (header->type == DRM_I915_PERF_RECORD_SAMPLE) {
|
if (header->type == DRM_I915_PERF_RECORD_SAMPLE) {
|
||||||
// Report is next to the header
|
// Report is next to the header
|
||||||
auto report = reinterpret_cast<const Report *>(header + 1);
|
const uint32_t *report = reinterpret_cast<const uint32_t *>(header + 1);
|
||||||
auto cpu_timestamp = correlate_gpu_timestamp(report->timestamp);
|
uint64_t gpu_timestamp_ldw =
|
||||||
auto duration = cpu_timestamp - prev_cpu_timestamp;
|
intel_perf_report_timestamp(&perf->query.value(), report);
|
||||||
|
|
||||||
|
/* Our HW only provides us with the lower 32 bits of the 36bits
|
||||||
|
* timestamp counter value. If we haven't captured the top bits yet,
|
||||||
|
* do it now. If we see a roll over the lower 32bits capture it
|
||||||
|
* again.
|
||||||
|
*/
|
||||||
|
if (gpu_timestamp_udw == 0 || (gpu_timestamp_udw + gpu_timestamp_ldw) < last_gpu_timestamp)
|
||||||
|
gpu_timestamp_udw = read_gpu_timestamp(drm_device.fd) & 0xffffffff00000000;
|
||||||
|
|
||||||
|
uint64_t gpu_timestamp = gpu_timestamp_udw + gpu_timestamp_ldw;
|
||||||
|
|
||||||
|
auto duration = scale_gpu_timestamp(gpu_timestamp - prev_gpu_timestamp,
|
||||||
|
perf->devinfo.timestamp_frequency);
|
||||||
|
|
||||||
// Skip perf-records that are too short by checking
|
// Skip perf-records that are too short by checking
|
||||||
// the distance between last report and this one
|
// the distance between last report and this one
|
||||||
if (close_enough(duration, sampling_period_ns)) {
|
if (close_enough(duration, sampling_period_ns)) {
|
||||||
prev_cpu_timestamp = cpu_timestamp;
|
prev_gpu_timestamp = gpu_timestamp;
|
||||||
|
|
||||||
// Add the new record to the list
|
// Add the new record to the list
|
||||||
record.resize(header->size); // Possibly 264?
|
record.timestamp = gpu_timestamp;
|
||||||
memcpy(record.data(), iter, header->size);
|
record.data.resize(header->size); // Possibly 264?
|
||||||
|
memcpy(record.data.data(), iter, header->size);
|
||||||
records.emplace_back(record);
|
records.emplace_back(record);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -329,8 +281,6 @@ bool IntelDriver::dump_perfcnt()
|
||||||
|
|
||||||
read_data_from_metric_set();
|
read_data_from_metric_set();
|
||||||
|
|
||||||
get_new_correlation();
|
|
||||||
|
|
||||||
auto new_records = parse_perf_records(metric_buffer, total_bytes_read);
|
auto new_records = parse_perf_records(metric_buffer, total_bytes_read);
|
||||||
if (new_records.empty()) {
|
if (new_records.empty()) {
|
||||||
PPS_LOG("No new records");
|
PPS_LOG("No new records");
|
||||||
|
|
@ -353,7 +303,7 @@ bool IntelDriver::dump_perfcnt()
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
uint32_t IntelDriver::gpu_next()
|
uint64_t IntelDriver::gpu_next()
|
||||||
{
|
{
|
||||||
if (records.size() < 2) {
|
if (records.size() < 2) {
|
||||||
// Not enough records to accumulate
|
// Not enough records to accumulate
|
||||||
|
|
@ -361,8 +311,8 @@ uint32_t IntelDriver::gpu_next()
|
||||||
}
|
}
|
||||||
|
|
||||||
// Get first and second
|
// Get first and second
|
||||||
auto record_a = reinterpret_cast<const drm_i915_perf_record_header *>(records[0].data());
|
auto record_a = reinterpret_cast<const drm_i915_perf_record_header *>(records[0].data.data());
|
||||||
auto record_b = reinterpret_cast<const drm_i915_perf_record_header *>(records[1].data());
|
auto record_b = reinterpret_cast<const drm_i915_perf_record_header *>(records[1].data.data());
|
||||||
|
|
||||||
intel_perf_query_result_accumulate_fields(&result,
|
intel_perf_query_result_accumulate_fields(&result,
|
||||||
&perf->query.value(),
|
&perf->query.value(),
|
||||||
|
|
@ -372,42 +322,30 @@ uint32_t IntelDriver::gpu_next()
|
||||||
false /* no_oa_accumulate */);
|
false /* no_oa_accumulate */);
|
||||||
|
|
||||||
// Get last timestamp
|
// Get last timestamp
|
||||||
auto report_b = reinterpret_cast<const Report *>(record_b + 1);
|
auto gpu_timestamp = records[1].timestamp;
|
||||||
auto gpu_timestamp = report_b->timestamp;
|
|
||||||
|
|
||||||
// Consume first record
|
// Consume first record
|
||||||
records.erase(std::begin(records), std::begin(records) + 1);
|
records.erase(std::begin(records), std::begin(records) + 1);
|
||||||
|
|
||||||
return gpu_timestamp;
|
return scale_gpu_timestamp(gpu_timestamp, perf->devinfo.timestamp_frequency);
|
||||||
}
|
|
||||||
|
|
||||||
uint64_t IntelDriver::cpu_next()
|
|
||||||
{
|
|
||||||
if (auto gpu_timestamp = gpu_next()) {
|
|
||||||
auto cpu_timestamp = correlate_gpu_timestamp(gpu_timestamp);
|
|
||||||
|
|
||||||
last_cpu_timestamp = cpu_timestamp;
|
|
||||||
return cpu_timestamp;
|
|
||||||
}
|
|
||||||
|
|
||||||
return 0;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
uint64_t IntelDriver::next()
|
uint64_t IntelDriver::next()
|
||||||
{
|
{
|
||||||
// Reset accumulation
|
// Reset accumulation
|
||||||
intel_perf_query_result_clear(&result);
|
intel_perf_query_result_clear(&result);
|
||||||
return cpu_next();
|
return gpu_next();
|
||||||
}
|
}
|
||||||
|
|
||||||
uint32_t IntelDriver::gpu_clock_id() const
|
uint32_t IntelDriver::gpu_clock_id() const
|
||||||
{
|
{
|
||||||
return perfetto::protos::pbzero::BUILTIN_CLOCK_BOOTTIME;
|
return this->clock_id;
|
||||||
}
|
}
|
||||||
|
|
||||||
uint64_t IntelDriver::gpu_timestamp() const
|
uint64_t IntelDriver::gpu_timestamp() const
|
||||||
{
|
{
|
||||||
return perfetto::base::GetBootTimeNs().count();
|
return scale_gpu_timestamp(read_gpu_timestamp(drm_device.fd),
|
||||||
|
perf->devinfo.timestamp_frequency);
|
||||||
}
|
}
|
||||||
|
|
||||||
} // namespace pps
|
} // namespace pps
|
||||||
|
|
|
||||||
|
|
@ -13,17 +13,15 @@
|
||||||
|
|
||||||
namespace pps
|
namespace pps
|
||||||
{
|
{
|
||||||
/// Timestamp correlation between CPU/GPU.
|
|
||||||
struct TimestampCorrelation {
|
|
||||||
/// In CLOCK_MONOTONIC
|
|
||||||
uint64_t cpu_timestamp;
|
|
||||||
|
|
||||||
/// Engine timestamp associated with the OA unit
|
|
||||||
uint64_t gpu_timestamp;
|
|
||||||
};
|
|
||||||
|
|
||||||
/// @brief Variable length sequence of bytes generated by Intel Obstervation Architecture (OA)
|
/// @brief Variable length sequence of bytes generated by Intel Obstervation Architecture (OA)
|
||||||
using PerfRecord = std::vector<uint8_t>;
|
struct PerfRecord {
|
||||||
|
/// Timestamp in the GPU clock domain
|
||||||
|
uint64_t timestamp;
|
||||||
|
|
||||||
|
/// drm_i915_perf_record_header + report data
|
||||||
|
std::vector<uint8_t> data;
|
||||||
|
};
|
||||||
|
|
||||||
/// @brief PPS Driver implementation for Intel graphics devices.
|
/// @brief PPS Driver implementation for Intel graphics devices.
|
||||||
/// When sampling it may collect multiple perf-records at once. Each perf-record holds multiple
|
/// When sampling it may collect multiple perf-records at once. Each perf-record holds multiple
|
||||||
|
|
@ -34,14 +32,8 @@ using PerfRecord = std::vector<uint8_t>;
|
||||||
class IntelDriver : public Driver
|
class IntelDriver : public Driver
|
||||||
{
|
{
|
||||||
public:
|
public:
|
||||||
std::optional<TimestampCorrelation> query_correlation_timestamps() const;
|
IntelDriver();
|
||||||
void get_new_correlation();
|
~IntelDriver();
|
||||||
|
|
||||||
/// @brief OA reports only have the lower 32 bits of the timestamp
|
|
||||||
/// register, while correlation data has the whole 36 bits.
|
|
||||||
/// @param gpu_ts a 32 bit OA report GPU timestamp
|
|
||||||
/// @return The CPU timestamp relative to the argument
|
|
||||||
uint64_t correlate_gpu_timestamp(uint32_t gpu_ts);
|
|
||||||
|
|
||||||
uint64_t get_min_sampling_period_ns() override;
|
uint64_t get_min_sampling_period_ns() override;
|
||||||
bool init_perfcnt() override;
|
bool init_perfcnt() override;
|
||||||
|
|
@ -57,12 +49,7 @@ class IntelDriver : public Driver
|
||||||
private:
|
private:
|
||||||
/// @brief Requests the next perf sample
|
/// @brief Requests the next perf sample
|
||||||
/// @return The sample GPU timestamp
|
/// @return The sample GPU timestamp
|
||||||
uint32_t gpu_next();
|
uint64_t gpu_next();
|
||||||
|
|
||||||
/// @brief Requests the next perf sample accumulating those which
|
|
||||||
/// which duration is shorter than the requested sampling period
|
|
||||||
/// @return The sample CPU timestamp
|
|
||||||
uint64_t cpu_next();
|
|
||||||
|
|
||||||
/// @param data Buffer of bytes to parse
|
/// @param data Buffer of bytes to parse
|
||||||
/// @param byte_count Number of bytes to parse
|
/// @param byte_count Number of bytes to parse
|
||||||
|
|
@ -75,11 +62,12 @@ class IntelDriver : public Driver
|
||||||
/// Sampling period in nanoseconds requested by the datasource
|
/// Sampling period in nanoseconds requested by the datasource
|
||||||
uint64_t sampling_period_ns = 0;
|
uint64_t sampling_period_ns = 0;
|
||||||
|
|
||||||
/// Keep track of the timestamp of the last sample generated
|
/// Last upper 32bits of the GPU timestamp in the parsed reports
|
||||||
uint64_t last_cpu_timestamp = 0;
|
uint64_t gpu_timestamp_udw = 0;
|
||||||
|
|
||||||
/// This is used to correlate CPU and GPU timestamps
|
/// Keep track of the timestamp of the last sample generated (upper & lower
|
||||||
std::array<TimestampCorrelation, 64> correlations;
|
/// 32bits)
|
||||||
|
uint64_t last_gpu_timestamp = 0;
|
||||||
|
|
||||||
/// Data buffer used to store data read from the metric set
|
/// Data buffer used to store data read from the metric set
|
||||||
std::vector<uint8_t> metric_buffer = std::vector<uint8_t>(1024, 0);
|
std::vector<uint8_t> metric_buffer = std::vector<uint8_t>(1024, 0);
|
||||||
|
|
@ -94,6 +82,9 @@ class IntelDriver : public Driver
|
||||||
|
|
||||||
// Accumulations are stored here
|
// Accumulations are stored here
|
||||||
struct intel_perf_query_result result = {};
|
struct intel_perf_query_result result = {};
|
||||||
|
|
||||||
|
// Gpu clock ID used to correlate GPU/CPU timestamps
|
||||||
|
uint32_t clock_id = 0;
|
||||||
};
|
};
|
||||||
|
|
||||||
} // namespace pps
|
} // namespace pps
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue