From 0621d5cd392538c4c7e21ce4d71b58b5411e1dc5 Mon Sep 17 00:00:00 2001 From: Danylo Piliaiev Date: Thu, 18 Sep 2025 18:53:56 +0200 Subject: [PATCH] tu/perfetto: Make GPU clock sequence-scoped When the CPU clock is the same as the authoritative trace clock (normally defaulting to CLOCK_BOOTTIME), perfetto drops the non-monotonic snapshots to ensure validity of the global source clock in the resolution graph. When they are different, the clocks are marked invalid and the rest of the clock syncs will fail during trace processing. There's no central daemon emitting consistent snapshots for synchronization between CPU and GPU clocks on behalf of renderstages and counters producers. The sequence-scoped clock (64 <= ID < 128) is unique per producer + writer pair within the tracing session. Turnip is a bit tricky here, since clocks may be synchronized before `tu_perfetto_end_submit` is called (in case of KGSL), but the emission of the perfetto event has to happen on the same thread as other renderstage events. To solve this I save the clocks in `tu_perfetto_state` and emit them in `stage_end` when needed. 
Signed-off-by: Danylo Piliaiev Part-of: --- src/freedreno/vulkan/tu_device.cc | 6 +++ src/freedreno/vulkan/tu_perfetto.cc | 73 +++++++++++++++++++++-------- src/freedreno/vulkan/tu_perfetto.h | 19 +++++--- 3 files changed, 72 insertions(+), 26 deletions(-) diff --git a/src/freedreno/vulkan/tu_device.cc b/src/freedreno/vulkan/tu_device.cc index 9b486eb7608..5bad3fc16b3 100644 --- a/src/freedreno/vulkan/tu_device.cc +++ b/src/freedreno/vulkan/tu_device.cc @@ -2583,6 +2583,9 @@ tu_device_destroy_mutexes(struct tu_device *device) mtx_destroy(&device->fiber_pvtmem_bo.mtx); mtx_destroy(&device->wave_pvtmem_bo.mtx); mtx_destroy(&device->mutex); +#ifdef HAVE_PERFETTO + mtx_destroy(&device->perfetto.pending_clocks_sync_mtx); +#endif for (unsigned i = 0; i < ARRAY_SIZE(device->scratch_bos); i++) mtx_destroy(&device->scratch_bos[i].construct_mtx); @@ -2689,6 +2692,9 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice, mtx_init(&device->fiber_pvtmem_bo.mtx, mtx_plain); mtx_init(&device->wave_pvtmem_bo.mtx, mtx_plain); mtx_init(&device->mutex, mtx_plain); +#ifdef HAVE_PERFETTO + mtx_init(&device->perfetto.pending_clocks_sync_mtx, mtx_plain); +#endif for (unsigned i = 0; i < ARRAY_SIZE(device->scratch_bos); i++) mtx_init(&device->scratch_bos[i].construct_mtx, mtx_plain); diff --git a/src/freedreno/vulkan/tu_perfetto.cc b/src/freedreno/vulkan/tu_perfetto.cc index 0ce97703c67..8effc177c33 100644 --- a/src/freedreno/vulkan/tu_perfetto.cc +++ b/src/freedreno/vulkan/tu_perfetto.cc @@ -108,18 +108,44 @@ class TuRenderpassDataSource : public MesaRenderpassDataSource::OnStart(args); - /* Note: clock_id's below 128 are reserved.. for custom clock sources, - * using the hash of a namespaced string is the recommended approach. 
- * See: https://perfetto.dev/docs/concepts/clock-sync + /* See: https://perfetto.dev/docs/concepts/clock-sync + * + * Use sequence-scoped clock (64 <= ID < 128) for GPU clock because + * there's no central daemon emitting consistent snapshots for + * synchronization between CPU and GPU clocks on behalf of renderstages + * and counters producers. + * + * When CPU clock is the same with the authoritative trace clock + * (normally default to CLOCK_BOOTTIME), perfetto drops the + * non-monotonic snapshots to ensure validity of the global source clock + * in the resolution graph. When they are different, the clocks are + * marked invalid and the rest of the clock syncs will fail during trace + * processing. + * + * Meanwhile, since the clock is now sequence-scoped (unique per + * producer + writer pair within the tracing session), we can simply + * pick 64. */ - gpu_clock_id = - _mesa_hash_string("org.freedesktop.mesa.freedreno") | 0x80000000; + gpu_clock_id = 64; } }; PERFETTO_DECLARE_DATA_SOURCE_STATIC_MEMBERS(TuRenderpassDataSource); PERFETTO_DEFINE_DATA_SOURCE_STATIC_MEMBERS(TuRenderpassDataSource); +static void +emit_sync_timestamp(struct tu_perfetto_clocks &clocks) +{ + uint32_t cpu_clock_id = perfetto::protos::pbzero::BUILTIN_CLOCK_BOOTTIME; + uint64_t gpu_ts = clocks.gpu_ts + clocks.gpu_ts_offset; + TuRenderpassDataSource::Trace([=](auto tctx) { + MesaRenderpassDataSource::EmitClockSync(tctx, clocks.cpu, + gpu_ts, cpu_clock_id, + gpu_clock_id); + }); +} + static void setup_incremental_state(TuRenderpassDataSource::TraceContext &ctx) { @@ -258,6 +284,19 @@ stage_end(struct tu_device *dev, uint64_t ts_ns, enum tu_stage_id stage_id, return; } + /* We use sequence-scoped clock for GPU time with perfetto. + * Different threads have different scopes, so we have to sync clocks + * in the same thread where renderstage events are emitted. 
+ */ + if (state->has_pending_clocks_sync) { + mtx_lock(&state->pending_clocks_sync_mtx); + struct tu_perfetto_clocks clocks = state->pending_clocks_sync; + state->has_pending_clocks_sync = false; + mtx_unlock(&state->pending_clocks_sync_mtx); + + emit_sync_timestamp(clocks); + } + TuRenderpassDataSource::Trace([=](TuRenderpassDataSource::TraceContext tctx) { setup_incremental_state(tctx); @@ -337,18 +376,6 @@ tu_perfetto_init(void) } } -static void -emit_sync_timestamp(uint64_t cpu_ts, uint64_t gpu_ts) -{ - uint32_t cpu_clock_id = perfetto::protos::pbzero::BUILTIN_CLOCK_BOOTTIME; - TuRenderpassDataSource::Trace([=](auto tctx) { - MesaRenderpassDataSource::EmitClockSync(tctx, cpu_ts, - gpu_ts, cpu_clock_id, - gpu_clock_id); - }); -} - uint64_t tu_perfetto_begin_submit() { @@ -439,17 +466,24 @@ tu_perfetto_end_submit(struct tu_queue *queue, struct tu_perfetto_clocks *gpu_clocks) { struct tu_device *dev = queue->device; + struct tu_perfetto_state *state = &dev->perfetto; if (!u_trace_perfetto_active(tu_device_get_u_trace(dev))) return {}; struct tu_perfetto_clocks clocks = sync_clocks(dev, gpu_clocks); - if (clocks.gpu_ts > 0) - emit_sync_timestamp(clocks.cpu, clocks.gpu_ts + clocks.gpu_ts_offset); + + if (clocks.gpu_ts > 0) { + mtx_lock(&state->pending_clocks_sync_mtx); + state->pending_clocks_sync = clocks; + state->has_pending_clocks_sync = true; + mtx_unlock(&state->pending_clocks_sync_mtx); + } TuRenderpassDataSource::Trace([=](TuRenderpassDataSource::TraceContext tctx) { auto packet = tctx.NewTracePacket(); packet->set_timestamp(start_ts); + packet->set_timestamp_clock_id(perfetto::protos::pbzero::BUILTIN_CLOCK_BOOTTIME); auto event = packet->set_vulkan_api_event(); auto submit = event->set_vk_queue_submit(); @@ -582,6 +616,7 @@ log_mem(struct tu_device *dev, struct tu_buffer *buffer, struct tu_image *image, auto packet = tctx.NewTracePacket(); packet->set_timestamp(perfetto::base::GetBootTimeNs().count()); + 
packet->set_timestamp_clock_id(perfetto::protos::pbzero::BUILTIN_CLOCK_BOOTTIME); auto event = packet->set_vulkan_memory_event(); diff --git a/src/freedreno/vulkan/tu_perfetto.h b/src/freedreno/vulkan/tu_perfetto.h index c5a9c3feb3c..17de140efeb 100644 --- a/src/freedreno/vulkan/tu_perfetto.h +++ b/src/freedreno/vulkan/tu_perfetto.h @@ -12,6 +12,7 @@ #include #include +#include "c11/threads.h" #ifdef __cplusplus extern "C" { @@ -34,11 +35,22 @@ struct tu_perfetto_stage { void* start_payload_function; }; +struct tu_perfetto_clocks +{ + uint64_t cpu; + uint64_t gpu_ts; + uint64_t gpu_ts_offset; +}; + struct tu_perfetto_state { struct tu_perfetto_stage stages[TU_PERFETTO_MAX_STACK_DEPTH]; unsigned stage_depth; unsigned skipped_depth; + bool has_pending_clocks_sync; + mtx_t pending_clocks_sync_mtx; + struct tu_perfetto_clocks pending_clocks_sync; + uint64_t next_clock_sync_ns; /* cpu time of next clk sync */ uint64_t last_sync_gpu_ts; @@ -50,13 +62,6 @@ struct tu_perfetto_state { void tu_perfetto_init(void); -struct tu_perfetto_clocks -{ - uint64_t cpu; - uint64_t gpu_ts; - uint64_t gpu_ts_offset; -}; - uint64_t tu_perfetto_begin_submit();