mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-05-09 11:08:03 +02:00
tu/perfetto: Make GPU clock sequence-scoped
When the CPU clock is the same as the authoritative trace clock (which normally defaults to CLOCK_BOOTTIME), perfetto drops the non-monotonic snapshots to ensure the validity of the global source clock in the resolution graph. When they are different, the clocks are marked invalid and the rest of the clock syncs will fail during trace processing. There is no central daemon emitting consistent snapshots for synchronization between the CPU and GPU clocks on behalf of renderstages and counters producers, so the GPU clock is made sequence-scoped instead. A sequence-scoped clock (64 <= ID < 128) is unique per producer + writer pair within the tracing session. Turnip is a bit tricky here, since the clocks may be synchronized before `tu_perfetto_end_submit` is called (in the case of KGSL), but the perfetto clock-sync event has to be emitted on the same thread as the other renderstage events. To solve this, I save the clocks in `tu_perfetto_state` and emit them in `stage_end` when needed. Signed-off-by: Danylo Piliaiev <dpiliaiev@igalia.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/37465>
This commit is contained in:
parent
09f5c9d0ad
commit
0621d5cd39
3 changed files with 72 additions and 26 deletions
|
|
@ -2583,6 +2583,9 @@ tu_device_destroy_mutexes(struct tu_device *device)
|
|||
mtx_destroy(&device->fiber_pvtmem_bo.mtx);
|
||||
mtx_destroy(&device->wave_pvtmem_bo.mtx);
|
||||
mtx_destroy(&device->mutex);
|
||||
#ifdef HAVE_PERFETTO
|
||||
mtx_destroy(&device->perfetto.pending_clocks_sync_mtx);
|
||||
#endif
|
||||
for (unsigned i = 0; i < ARRAY_SIZE(device->scratch_bos); i++)
|
||||
mtx_destroy(&device->scratch_bos[i].construct_mtx);
|
||||
|
||||
|
|
@ -2689,6 +2692,9 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice,
|
|||
mtx_init(&device->fiber_pvtmem_bo.mtx, mtx_plain);
|
||||
mtx_init(&device->wave_pvtmem_bo.mtx, mtx_plain);
|
||||
mtx_init(&device->mutex, mtx_plain);
|
||||
#ifdef HAVE_PERFETTO
|
||||
mtx_init(&device->perfetto.pending_clocks_sync_mtx, mtx_plain);
|
||||
#endif
|
||||
for (unsigned i = 0; i < ARRAY_SIZE(device->scratch_bos); i++)
|
||||
mtx_init(&device->scratch_bos[i].construct_mtx, mtx_plain);
|
||||
|
||||
|
|
|
|||
|
|
@ -108,18 +108,44 @@ class TuRenderpassDataSource : public MesaRenderpassDataSource<TuRenderpassDataS
|
|||
{
|
||||
MesaRenderpassDataSource<TuRenderpassDataSource, TuRenderpassTraits>::OnStart(args);
|
||||
|
||||
/* Note: clock_id's below 128 are reserved.. for custom clock sources,
|
||||
* using the hash of a namespaced string is the recommended approach.
|
||||
* See: https://perfetto.dev/docs/concepts/clock-sync
|
||||
/* See: https://perfetto.dev/docs/concepts/clock-sync
|
||||
*
|
||||
* Use sequence-scoped clock (64 <= ID < 128) for GPU clock because
|
||||
* there's no central daemon emitting consistent snapshots for
|
||||
* synchronization between CPU and GPU clocks on behalf of renderstages
|
||||
* and counters producers.
|
||||
*
|
||||
* When CPU clock is the same with the authoritative trace clock
|
||||
* (normally default to CLOCK_BOOTTIME), perfetto drops the
|
||||
* non-monotonic snapshots to ensure validity of the global source clock
|
||||
* in the resolution graph. When they are different, the clocks are
|
||||
* marked invalid and the rest of the clock syncs will fail during trace
|
||||
* processing.
|
||||
*
|
||||
* Meanwhile, since the clock is now sequence-scoped (unique per
|
||||
* producer + writer pair within the tracing session), we can simply
|
||||
* pick 64.
|
||||
*/
|
||||
gpu_clock_id =
|
||||
_mesa_hash_string("org.freedesktop.mesa.freedreno") | 0x80000000;
|
||||
gpu_clock_id = 64;
|
||||
}
|
||||
};
|
||||
|
||||
PERFETTO_DECLARE_DATA_SOURCE_STATIC_MEMBERS(TuRenderpassDataSource);
|
||||
PERFETTO_DEFINE_DATA_SOURCE_STATIC_MEMBERS(TuRenderpassDataSource);
|
||||
|
||||
static void
|
||||
emit_sync_timestamp(struct tu_perfetto_clocks &clocks)
|
||||
{
|
||||
uint32_t cpu_clock_id = perfetto::protos::pbzero::BUILTIN_CLOCK_BOOTTIME;
|
||||
uint64_t gpu_ts = clocks.gpu_ts + clocks.gpu_ts_offset;
|
||||
TuRenderpassDataSource::Trace([=](auto tctx) {
|
||||
MesaRenderpassDataSource<TuRenderpassDataSource,
|
||||
TuRenderpassTraits>::EmitClockSync(tctx, clocks.cpu,
|
||||
gpu_ts, cpu_clock_id,
|
||||
gpu_clock_id);
|
||||
});
|
||||
}
|
||||
|
||||
static void
|
||||
setup_incremental_state(TuRenderpassDataSource::TraceContext &ctx)
|
||||
{
|
||||
|
|
@ -258,6 +284,19 @@ stage_end(struct tu_device *dev, uint64_t ts_ns, enum tu_stage_id stage_id,
|
|||
return;
|
||||
}
|
||||
|
||||
/* We use sequence-scoped clock for GPU time with perfetto.
|
||||
* Different threads have different scopes, so we have to sync clocks
|
||||
* in the same thread where renderstage events are emitted.
|
||||
*/
|
||||
if (state->has_pending_clocks_sync) {
|
||||
mtx_lock(&state->pending_clocks_sync_mtx);
|
||||
struct tu_perfetto_clocks clocks = state->pending_clocks_sync;
|
||||
state->has_pending_clocks_sync = false;
|
||||
mtx_unlock(&state->pending_clocks_sync_mtx);
|
||||
|
||||
emit_sync_timestamp(clocks);
|
||||
}
|
||||
|
||||
TuRenderpassDataSource::Trace([=](TuRenderpassDataSource::TraceContext tctx) {
|
||||
setup_incremental_state(tctx);
|
||||
|
||||
|
|
@ -337,18 +376,6 @@ tu_perfetto_init(void)
|
|||
}
|
||||
}
|
||||
|
||||
static void
|
||||
emit_sync_timestamp(uint64_t cpu_ts, uint64_t gpu_ts)
|
||||
{
|
||||
uint32_t cpu_clock_id = perfetto::protos::pbzero::BUILTIN_CLOCK_BOOTTIME;
|
||||
TuRenderpassDataSource::Trace([=](auto tctx) {
|
||||
MesaRenderpassDataSource<TuRenderpassDataSource,
|
||||
TuRenderpassTraits>::EmitClockSync(tctx, cpu_ts,
|
||||
gpu_ts, cpu_clock_id,
|
||||
gpu_clock_id);
|
||||
});
|
||||
}
|
||||
|
||||
uint64_t
|
||||
tu_perfetto_begin_submit()
|
||||
{
|
||||
|
|
@ -439,17 +466,24 @@ tu_perfetto_end_submit(struct tu_queue *queue,
|
|||
struct tu_perfetto_clocks *gpu_clocks)
|
||||
{
|
||||
struct tu_device *dev = queue->device;
|
||||
struct tu_perfetto_state *state = &dev->perfetto;
|
||||
if (!u_trace_perfetto_active(tu_device_get_u_trace(dev)))
|
||||
return {};
|
||||
|
||||
struct tu_perfetto_clocks clocks = sync_clocks(dev, gpu_clocks);
|
||||
if (clocks.gpu_ts > 0)
|
||||
emit_sync_timestamp(clocks.cpu, clocks.gpu_ts + clocks.gpu_ts_offset);
|
||||
|
||||
if (clocks.gpu_ts > 0) {
|
||||
mtx_lock(&state->pending_clocks_sync_mtx);
|
||||
state->pending_clocks_sync = clocks;
|
||||
state->has_pending_clocks_sync = true;
|
||||
mtx_unlock(&state->pending_clocks_sync_mtx);
|
||||
}
|
||||
|
||||
TuRenderpassDataSource::Trace([=](TuRenderpassDataSource::TraceContext tctx) {
|
||||
auto packet = tctx.NewTracePacket();
|
||||
|
||||
packet->set_timestamp(start_ts);
|
||||
packet->set_timestamp_clock_id(perfetto::protos::pbzero::BUILTIN_CLOCK_BOOTTIME);
|
||||
|
||||
auto event = packet->set_vulkan_api_event();
|
||||
auto submit = event->set_vk_queue_submit();
|
||||
|
|
@ -582,6 +616,7 @@ log_mem(struct tu_device *dev, struct tu_buffer *buffer, struct tu_image *image,
|
|||
auto packet = tctx.NewTracePacket();
|
||||
|
||||
packet->set_timestamp(perfetto::base::GetBootTimeNs().count());
|
||||
packet->set_timestamp_clock_id(perfetto::protos::pbzero::BUILTIN_CLOCK_BOOTTIME);
|
||||
|
||||
auto event = packet->set_vulkan_memory_event();
|
||||
|
||||
|
|
|
|||
|
|
@ -12,6 +12,7 @@
|
|||
#include <stdint.h>
|
||||
|
||||
#include <vulkan/vulkan.h>
|
||||
#include "c11/threads.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
|
|
@ -34,11 +35,22 @@ struct tu_perfetto_stage {
|
|||
void* start_payload_function;
|
||||
};
|
||||
|
||||
struct tu_perfetto_clocks
|
||||
{
|
||||
uint64_t cpu;
|
||||
uint64_t gpu_ts;
|
||||
uint64_t gpu_ts_offset;
|
||||
};
|
||||
|
||||
struct tu_perfetto_state {
|
||||
struct tu_perfetto_stage stages[TU_PERFETTO_MAX_STACK_DEPTH];
|
||||
unsigned stage_depth;
|
||||
unsigned skipped_depth;
|
||||
|
||||
bool has_pending_clocks_sync;
|
||||
mtx_t pending_clocks_sync_mtx;
|
||||
struct tu_perfetto_clocks pending_clocks_sync;
|
||||
|
||||
uint64_t next_clock_sync_ns; /* cpu time of next clk sync */
|
||||
uint64_t last_sync_gpu_ts;
|
||||
|
||||
|
|
@ -50,13 +62,6 @@ struct tu_perfetto_state {
|
|||
|
||||
void tu_perfetto_init(void);
|
||||
|
||||
struct tu_perfetto_clocks
|
||||
{
|
||||
uint64_t cpu;
|
||||
uint64_t gpu_ts;
|
||||
uint64_t gpu_ts_offset;
|
||||
};
|
||||
|
||||
uint64_t
|
||||
tu_perfetto_begin_submit();
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue