diff --git a/src/intel/ds/intel_pps_driver.cc b/src/intel/ds/intel_pps_driver.cc index 6966db8fdc1..031097e22df 100644 --- a/src/intel/ds/intel_pps_driver.cc +++ b/src/intel/ds/intel_pps_driver.cc @@ -37,6 +37,45 @@ uint64_t IntelDriver::get_min_sampling_period_ns() return (2.f * perf->devinfo.timestamp_frequency) / 1000000000ull; } +uint64_t scale_gpu_timestamp(uint64_t ts, uint64_t timestamp_frequency) +{ + // Try to avoid going over the 64bits when doing the scaling + uint64_t lower_ts = ts >> 6; + uint64_t scaled_ts = lower_ts * 1000000000ull / timestamp_frequency; + scaled_ts <<= 6; + scaled_ts += (ts & 0x3f) * 1000000000ull / timestamp_frequency; + return scaled_ts; +} + +uint64_t read_gpu_timestamp(int drm_fd) +{ + drm_i915_reg_read reg_read = {}; + const uint64_t render_ring_timestamp = 0x2358; + reg_read.offset = render_ring_timestamp | I915_REG_READ_8B_WA; + + if (perf_ioctl(drm_fd, DRM_IOCTL_I915_REG_READ, ®_read) < 0) { + PPS_LOG_ERROR("Unable to read GPU clock"); + return 0; + } + + return reg_read.val; + +} + +IntelDriver::IntelDriver() +{ + /* Note: clock_id's below 128 are reserved.. for custom clock sources, + * using the hash of a namespaced string is the recommended approach. + * See: https://perfetto.dev/docs/concepts/clock-sync + */ + this->clock_id = + _mesa_hash_string("org.freedesktop.mesa.intel") | 0x80000000; +} + +IntelDriver::~IntelDriver() +{ +} + void IntelDriver::enable_counter(uint32_t counter_id) { auto &counter = counters[counter_id]; @@ -75,71 +114,6 @@ void IntelDriver::enable_all_counters() } } -static uint64_t timespec_diff(timespec *begin, timespec *end) -{ - return 1000000000ull * (end->tv_sec - begin->tv_sec) + end->tv_nsec - begin->tv_nsec; -} - -/// @brief This function tries to correlate CPU time with GPU time -std::optional IntelDriver::query_correlation_timestamps() const -{ - TimestampCorrelation corr = {}; - - clock_t correlation_clock_id = CLOCK_BOOTTIME; - - drm_i915_reg_read reg_read = {}; - const uint64_t render_ring_timestamp = 0x2358; - reg_read.offset = render_ring_timestamp | I915_REG_READ_8B_WA; - - constexpr size_t attempt_count = 3; - struct { - timespec cpu_ts_begin; - timespec cpu_ts_end; - uint64_t gpu_ts; - } attempts[attempt_count] = {}; - - uint32_t best = 0; - - // Gather 3 correlations - for (uint32_t i = 0; i < attempt_count; i++) { - clock_gettime(correlation_clock_id, &attempts[i].cpu_ts_begin); - if (perf_ioctl(drm_device.fd, DRM_IOCTL_I915_REG_READ, ®_read) < 0) { - return std::nullopt; - } - clock_gettime(correlation_clock_id, &attempts[i].cpu_ts_end); - - attempts[i].gpu_ts = reg_read.val; - } - - // Now select the best - for (uint32_t i = 1; i < attempt_count; i++) { - if (timespec_diff(&attempts[i].cpu_ts_begin, &attempts[i].cpu_ts_end) < - timespec_diff(&attempts[best].cpu_ts_begin, &attempts[best].cpu_ts_end)) { - best = i; - } - } - - corr.cpu_timestamp = - (attempts[best].cpu_ts_begin.tv_sec * 1000000000ull + attempts[best].cpu_ts_begin.tv_nsec) + - timespec_diff(&attempts[best].cpu_ts_begin, &attempts[best].cpu_ts_end) / 2; - corr.gpu_timestamp = attempts[best].gpu_ts; - - return corr; -} - -void IntelDriver::get_new_correlation() -{ - // Rotate left correlations by one position so to make space at the end - std::rotate(correlations.begin(), correlations.begin() + 1, correlations.end()); - - // Then we overwrite the last correlation with a new one - if (auto corr = query_correlation_timestamps()) { - correlations.back() = *corr; - } else { - PPS_LOG_FATAL("Failed to get correlation timestamps"); - } -} - bool IntelDriver::init_perfcnt() { assert(!perf && "Intel perf should not be initialized at this point"); @@ -201,41 +175,11 @@ void IntelDriver::enable_perfcnt(uint64_t sampling_period_ns) { this->sampling_period_ns = sampling_period_ns; - // Fill correlations with an initial one - if (auto corr = query_correlation_timestamps()) { - correlations.fill(*corr); - } else { - PPS_LOG_FATAL("Failed to get correlation timestamps"); - } - if (!perf->open(sampling_period_ns)) { PPS_LOG_FATAL("Failed to open intel perf"); } } -/// @brief Transforms the GPU timestop into a CPU timestamp equivalent -uint64_t IntelDriver::correlate_gpu_timestamp(const uint32_t gpu_ts) -{ - auto &corr_a = correlations[0]; - auto &corr_b = correlations[correlations.size() - 1]; - - // A correlation timestamp has 36 bits, so get the first 32 to make it work with gpu_ts - uint64_t mask = 0xffffffff; - uint32_t corr_a_gpu_ts = corr_a.gpu_timestamp & mask; - uint32_t corr_b_gpu_ts = corr_b.gpu_timestamp & mask; - - // Make sure it is within the interval [a,b) - assert(gpu_ts >= corr_a_gpu_ts && "GPU TS < Corr a"); - assert(gpu_ts < corr_b_gpu_ts && "GPU TS >= Corr b"); - - uint32_t gpu_delta = gpu_ts - corr_a_gpu_ts; - // Factor to convert gpu time to cpu time - double gpu_to_cpu = (corr_b.cpu_timestamp - corr_a.cpu_timestamp) / - double(corr_b.gpu_timestamp - corr_a.gpu_timestamp); - uint64_t cpu_delta = gpu_delta * gpu_to_cpu; - return corr_a.cpu_timestamp + cpu_delta; -} - void IntelDriver::disable_perfcnt() { perf = nullptr; @@ -244,12 +188,6 @@ void IntelDriver::disable_perfcnt() enabled_counters.clear(); } -struct Report { - uint32_t version; - uint32_t timestamp; - uint32_t id; -}; - /// @brief Some perf record durations can be really short /// @return True if the duration is at least close to the sampling period static bool close_enough(uint64_t duration, uint64_t sampling_period) @@ -265,12 +203,12 @@ std::vector IntelDriver::parse_perf_records(const std::vector IntelDriver::parse_perf_records(const std::vectortype == DRM_I915_PERF_RECORD_SAMPLE) { // Report is next to the header - auto report = reinterpret_cast(header + 1); - auto cpu_timestamp = correlate_gpu_timestamp(report->timestamp); - auto duration = cpu_timestamp - prev_cpu_timestamp; + const uint32_t *report = reinterpret_cast(header + 1); + uint64_t gpu_timestamp_ldw = + intel_perf_report_timestamp(&perf->query.value(), report); + + /* Our HW only provides us with the lower 32 bits of the 36bits + * timestamp counter value. If we haven't captured the top bits yet, + * do it now. If we see a roll over the lower 32bits capture it + * again. + */ + if (gpu_timestamp_udw == 0 || (gpu_timestamp_udw + gpu_timestamp_ldw) < last_gpu_timestamp) + gpu_timestamp_udw = read_gpu_timestamp(drm_device.fd) & 0xffffffff00000000; + + uint64_t gpu_timestamp = gpu_timestamp_udw + gpu_timestamp_ldw; + + auto duration = scale_gpu_timestamp(gpu_timestamp - prev_gpu_timestamp, + perf->devinfo.timestamp_frequency); // Skip perf-records that are too short by checking // the distance between last report and this one if (close_enough(duration, sampling_period_ns)) { - prev_cpu_timestamp = cpu_timestamp; + prev_gpu_timestamp = gpu_timestamp; // Add the new record to the list - record.resize(header->size); // Possibly 264? - memcpy(record.data(), iter, header->size); + record.timestamp = gpu_timestamp; + record.data.resize(header->size); // Possibly 264? + memcpy(record.data.data(), iter, header->size); records.emplace_back(record); } } @@ -329,8 +281,6 @@ bool IntelDriver::dump_perfcnt() read_data_from_metric_set(); - get_new_correlation(); - auto new_records = parse_perf_records(metric_buffer, total_bytes_read); if (new_records.empty()) { PPS_LOG("No new records"); @@ -353,7 +303,7 @@ bool IntelDriver::dump_perfcnt() return true; } -uint32_t IntelDriver::gpu_next() +uint64_t IntelDriver::gpu_next() { if (records.size() < 2) { // Not enough records to accumulate @@ -361,8 +311,8 @@ uint32_t IntelDriver::gpu_next() } // Get first and second - auto record_a = reinterpret_cast(records[0].data()); - auto record_b = reinterpret_cast(records[1].data()); + auto record_a = reinterpret_cast(records[0].data.data()); + auto record_b = reinterpret_cast(records[1].data.data()); intel_perf_query_result_accumulate_fields(&result, &perf->query.value(), @@ -372,42 +322,30 @@ uint32_t IntelDriver::gpu_next() false /* no_oa_accumulate */); // Get last timestamp - auto report_b = reinterpret_cast(record_b + 1); - auto gpu_timestamp = report_b->timestamp; + auto gpu_timestamp = records[1].timestamp; // Consume first record records.erase(std::begin(records), std::begin(records) + 1); - return gpu_timestamp; -} - -uint64_t IntelDriver::cpu_next() -{ - if (auto gpu_timestamp = gpu_next()) { - auto cpu_timestamp = correlate_gpu_timestamp(gpu_timestamp); - - last_cpu_timestamp = cpu_timestamp; - return cpu_timestamp; - } - - return 0; + return scale_gpu_timestamp(gpu_timestamp, perf->devinfo.timestamp_frequency); } uint64_t IntelDriver::next() { // Reset accumulation intel_perf_query_result_clear(&result); - return cpu_next(); + return gpu_next(); } uint32_t IntelDriver::gpu_clock_id() const { - return perfetto::protos::pbzero::BUILTIN_CLOCK_BOOTTIME; + return this->clock_id; } uint64_t IntelDriver::gpu_timestamp() const { - return perfetto::base::GetBootTimeNs().count(); + return scale_gpu_timestamp(read_gpu_timestamp(drm_device.fd), + perf->devinfo.timestamp_frequency); } } // namespace pps diff --git a/src/intel/ds/intel_pps_driver.h b/src/intel/ds/intel_pps_driver.h index cac029a973e..4f0754c732a 100644 --- a/src/intel/ds/intel_pps_driver.h +++ b/src/intel/ds/intel_pps_driver.h @@ -13,17 +13,15 @@ namespace pps { -/// Timestamp correlation between CPU/GPU. -struct TimestampCorrelation { - /// In CLOCK_MONOTONIC - uint64_t cpu_timestamp; - - /// Engine timestamp associated with the OA unit - uint64_t gpu_timestamp; -}; /// @brief Variable length sequence of bytes generated by Intel Obstervation Architecture (OA) -using PerfRecord = std::vector; +struct PerfRecord { + /// Timestamp in the GPU clock domain + uint64_t timestamp; + + /// drm_i915_perf_record_header + report data + std::vector data; +}; /// @brief PPS Driver implementation for Intel graphics devices. /// When sampling it may collect multiple perf-records at once. Each perf-record holds multiple @@ -34,14 +32,8 @@ using PerfRecord = std::vector; class IntelDriver : public Driver { public: - std::optional query_correlation_timestamps() const; - void get_new_correlation(); - - /// @brief OA reports only have the lower 32 bits of the timestamp - /// register, while correlation data has the whole 36 bits. - /// @param gpu_ts a 32 bit OA report GPU timestamp - /// @return The CPU timestamp relative to the argument - uint64_t correlate_gpu_timestamp(uint32_t gpu_ts); + IntelDriver(); + ~IntelDriver(); uint64_t get_min_sampling_period_ns() override; bool init_perfcnt() override; @@ -57,12 +49,7 @@ class IntelDriver : public Driver private: /// @brief Requests the next perf sample /// @return The sample GPU timestamp - uint32_t gpu_next(); - - /// @brief Requests the next perf sample accumulating those which - /// which duration is shorter than the requested sampling period - /// @return The sample CPU timestamp - uint64_t cpu_next(); + uint64_t gpu_next(); /// @param data Buffer of bytes to parse /// @param byte_count Number of bytes to parse @@ -75,11 +62,12 @@ class IntelDriver : public Driver /// Sampling period in nanoseconds requested by the datasource uint64_t sampling_period_ns = 0; - /// Keep track of the timestamp of the last sample generated - uint64_t last_cpu_timestamp = 0; + /// Last upper 32bits of the GPU timestamp in the parsed reports + uint64_t gpu_timestamp_udw = 0; - /// This is used to correlate CPU and GPU timestamps - std::array correlations; + /// Keep track of the timestamp of the last sample generated (upper & lower + /// 32bits) + uint64_t last_gpu_timestamp = 0; /// Data buffer used to store data read from the metric set std::vector metric_buffer = std::vector(1024, 0); @@ -94,6 +82,9 @@ class IntelDriver : public Driver // Accumulations are stored here struct intel_perf_query_result result = {}; + + // Gpu clock ID used to correlate GPU/CPU timestamps + uint32_t clock_id = 0; }; } // namespace pps