diff --git a/src/freedreno/vulkan/tu_device.cc b/src/freedreno/vulkan/tu_device.cc index 9b486eb7608..5bad3fc16b3 100644 --- a/src/freedreno/vulkan/tu_device.cc +++ b/src/freedreno/vulkan/tu_device.cc @@ -2583,6 +2583,9 @@ tu_device_destroy_mutexes(struct tu_device *device) mtx_destroy(&device->fiber_pvtmem_bo.mtx); mtx_destroy(&device->wave_pvtmem_bo.mtx); mtx_destroy(&device->mutex); +#ifdef HAVE_PERFETTO + mtx_destroy(&device->perfetto.pending_clocks_sync_mtx); +#endif for (unsigned i = 0; i < ARRAY_SIZE(device->scratch_bos); i++) mtx_destroy(&device->scratch_bos[i].construct_mtx); @@ -2689,6 +2692,9 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice, mtx_init(&device->fiber_pvtmem_bo.mtx, mtx_plain); mtx_init(&device->wave_pvtmem_bo.mtx, mtx_plain); mtx_init(&device->mutex, mtx_plain); +#ifdef HAVE_PERFETTO + mtx_init(&device->perfetto.pending_clocks_sync_mtx, mtx_plain); +#endif for (unsigned i = 0; i < ARRAY_SIZE(device->scratch_bos); i++) mtx_init(&device->scratch_bos[i].construct_mtx, mtx_plain); diff --git a/src/freedreno/vulkan/tu_perfetto.cc b/src/freedreno/vulkan/tu_perfetto.cc index 0ce97703c67..8effc177c33 100644 --- a/src/freedreno/vulkan/tu_perfetto.cc +++ b/src/freedreno/vulkan/tu_perfetto.cc @@ -108,18 +108,44 @@ class TuRenderpassDataSource : public MesaRenderpassDataSource::OnStart(args); - /* Note: clock_id's below 128 are reserved.. for custom clock sources, - * using the hash of a namespaced string is the recommended approach. - * See: https://perfetto.dev/docs/concepts/clock-sync + /* See: https://perfetto.dev/docs/concepts/clock-sync + * + * Use sequence-scoped clock (64 <= ID < 128) for GPU clock because + * there's no central daemon emitting consistent snapshots for + * synchronization between CPU and GPU clocks on behalf of renderstages + * and counters producers. + * + * When CPU clock is the same with the authoritative trace clock + * (normally default to CLOCK_BOOTTIME), perfetto drops the + * non-monotonic snapshots to ensure validity of the global source clock + * in the resolution graph. When they are different, the clocks are + * marked invalid and the rest of the clock syncs will fail during trace + * processing. + * + * Meanwhile, since the clock is now sequence-scoped (unique per + * producer + writer pair within the tracing session), we can simply + * pick 64. */ - gpu_clock_id = - _mesa_hash_string("org.freedesktop.mesa.freedreno") | 0x80000000; + gpu_clock_id = 64; } }; PERFETTO_DECLARE_DATA_SOURCE_STATIC_MEMBERS(TuRenderpassDataSource); PERFETTO_DEFINE_DATA_SOURCE_STATIC_MEMBERS(TuRenderpassDataSource); +static void +emit_sync_timestamp(struct tu_perfetto_clocks &clocks) +{ + uint32_t cpu_clock_id = perfetto::protos::pbzero::BUILTIN_CLOCK_BOOTTIME; + uint64_t gpu_ts = clocks.gpu_ts + clocks.gpu_ts_offset; + TuRenderpassDataSource::Trace([=](auto tctx) { + MesaRenderpassDataSource::EmitClockSync(tctx, clocks.cpu, + gpu_ts, cpu_clock_id, + gpu_clock_id); + }); +} + static void setup_incremental_state(TuRenderpassDataSource::TraceContext &ctx) { @@ -258,6 +284,19 @@ stage_end(struct tu_device *dev, uint64_t ts_ns, enum tu_stage_id stage_id, return; } + /* We use sequence-scoped clock for GPU time with perfetto. + * Different threads have different scopes, so we have to sync clocks + * in the same thread where renderstage events are emitted. + */ + if (state->has_pending_clocks_sync) { + mtx_lock(&state->pending_clocks_sync_mtx); + struct tu_perfetto_clocks clocks = state->pending_clocks_sync; + state->has_pending_clocks_sync = false; + mtx_unlock(&state->pending_clocks_sync_mtx); + + emit_sync_timestamp(clocks); + } + TuRenderpassDataSource::Trace([=](TuRenderpassDataSource::TraceContext tctx) { setup_incremental_state(tctx); @@ -337,18 +376,6 @@ tu_perfetto_init(void) } } -static void -emit_sync_timestamp(uint64_t cpu_ts, uint64_t gpu_ts) -{ - uint32_t cpu_clock_id = perfetto::protos::pbzero::BUILTIN_CLOCK_BOOTTIME; - TuRenderpassDataSource::Trace([=](auto tctx) { - MesaRenderpassDataSource::EmitClockSync(tctx, cpu_ts, - gpu_ts, cpu_clock_id, - gpu_clock_id); - }); -} - uint64_t tu_perfetto_begin_submit() { @@ -439,17 +466,24 @@ tu_perfetto_end_submit(struct tu_queue *queue, struct tu_perfetto_clocks *gpu_clocks) { struct tu_device *dev = queue->device; + struct tu_perfetto_state *state = &dev->perfetto; if (!u_trace_perfetto_active(tu_device_get_u_trace(dev))) return {}; struct tu_perfetto_clocks clocks = sync_clocks(dev, gpu_clocks); - if (clocks.gpu_ts > 0) - emit_sync_timestamp(clocks.cpu, clocks.gpu_ts + clocks.gpu_ts_offset); + + if (clocks.gpu_ts > 0) { + mtx_lock(&state->pending_clocks_sync_mtx); + state->pending_clocks_sync = clocks; + state->has_pending_clocks_sync = true; + mtx_unlock(&state->pending_clocks_sync_mtx); + } TuRenderpassDataSource::Trace([=](TuRenderpassDataSource::TraceContext tctx) { auto packet = tctx.NewTracePacket(); packet->set_timestamp(start_ts); + packet->set_timestamp_clock_id(perfetto::protos::pbzero::BUILTIN_CLOCK_BOOTTIME); auto event = packet->set_vulkan_api_event(); auto submit = event->set_vk_queue_submit(); @@ -582,6 +616,7 @@ log_mem(struct tu_device *dev, struct tu_buffer *buffer, struct tu_image *image, auto packet = tctx.NewTracePacket(); packet->set_timestamp(perfetto::base::GetBootTimeNs().count()); + packet->set_timestamp_clock_id(perfetto::protos::pbzero::BUILTIN_CLOCK_BOOTTIME); auto event = packet->set_vulkan_memory_event(); diff --git a/src/freedreno/vulkan/tu_perfetto.h b/src/freedreno/vulkan/tu_perfetto.h index c5a9c3feb3c..17de140efeb 100644 --- a/src/freedreno/vulkan/tu_perfetto.h +++ b/src/freedreno/vulkan/tu_perfetto.h @@ -12,6 +12,7 @@ #include #include +#include "c11/threads.h" #ifdef __cplusplus extern "C" { @@ -34,11 +35,22 @@ struct tu_perfetto_stage { void* start_payload_function; }; +struct tu_perfetto_clocks +{ + uint64_t cpu; + uint64_t gpu_ts; + uint64_t gpu_ts_offset; +}; + struct tu_perfetto_state { struct tu_perfetto_stage stages[TU_PERFETTO_MAX_STACK_DEPTH]; unsigned stage_depth; unsigned skipped_depth; + bool has_pending_clocks_sync; + mtx_t pending_clocks_sync_mtx; + struct tu_perfetto_clocks pending_clocks_sync; + uint64_t next_clock_sync_ns; /* cpu time of next clk sync */ uint64_t last_sync_gpu_ts; @@ -50,13 +62,6 @@ struct tu_perfetto_state { void tu_perfetto_init(void); -struct tu_perfetto_clocks -{ - uint64_t cpu; - uint64_t gpu_ts; - uint64_t gpu_ts_offset; -}; - uint64_t tu_perfetto_begin_submit();