tu/perfetto: Move away from single timeline for all apps

This moves from deprecated stage_id/hw_queue_id to per-context
stage_iid/hw_queue_iid, which leads to separate timelines per app.
There are several benefits to this:
- Different driver versions could be used by different apps and perfetto
  won't confuse tracepoints.
- Tracepoints from different apps may not align perfectly, so previously
  we got a fair amount of weird vertical ordering of tracepoints.

The downside is that the information is now spread across several
timelines, multiplied by the number of queues, but I think that's better
since it is easier to understand which tracepoints correspond to which app.

The changes are mostly copied from radeon/intel perfetto integration.

This also fixes app_event emission along the way: previously,
debug_marker_stage was called _before_ SEQ_INCREMENTAL_STATE_CLEARED was set.

Signed-off-by: Danylo Piliaiev <dpiliaiev@igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/41105>
This commit is contained in:
Danylo Piliaiev 2026-04-13 18:17:51 +02:00 committed by Marge Bot
parent 117f3cb1fc
commit 5b5bc956df
5 changed files with 179 additions and 125 deletions

View file

@ -2712,9 +2712,6 @@ tu_device_destroy_mutexes(struct tu_device *device)
mtx_destroy(&device->wave_pvtmem_bo.mtx);
mtx_destroy(&device->mutex);
mtx_destroy(&device->copy_timestamp_cs_pool_mutex);
#ifdef HAVE_PERFETTO
mtx_destroy(&device->perfetto.pending_clocks_sync_mtx);
#endif
for (unsigned i = 0; i < ARRAY_SIZE(device->scratch_bos); i++)
mtx_destroy(&device->scratch_bos[i].construct_mtx);
@ -2830,7 +2827,7 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice,
mtx_init(&device->copy_timestamp_cs_pool_mutex, mtx_plain);
mtx_init(&device->softfloat_mutex, mtx_plain);
#ifdef HAVE_PERFETTO
mtx_init(&device->perfetto.pending_clocks_sync_mtx, mtx_plain);
tu_perfetto_init_state(&device->perfetto);
#endif
for (unsigned i = 0; i < ARRAY_SIZE(device->scratch_bos); i++)
mtx_init(&device->scratch_bos[i].construct_mtx, mtx_plain);
@ -3203,6 +3200,9 @@ fail_queues:
vk_free(&device->vk.alloc, device->queues[i]);
}
#ifdef HAVE_PERFETTO
tu_perfetto_destroy_state(&device->perfetto);
#endif
tu_device_destroy_mutexes(device);
tu_drm_device_finish(device);
vk_device_finish(&device->vk);
@ -4544,13 +4544,10 @@ tu_CmdBeginDebugUtilsLabelEXT(VkCommandBuffer _commandBuffer,
* buffers. Still, getting the simple case of cmd buffer annotation into
* perfetto should prove useful.
*/
const char *label = pLabelInfo->pLabelName;
if (cmd_buffer->state.pass) {
trace_start_cmd_buffer_annotation_rp(
&cmd_buffer->trace, &cmd_buffer->draw_cs, cmd_buffer, strlen(label), label);
trace_start_cmd_buffer_annotation_rp(&cmd_buffer->trace, &cmd_buffer->draw_cs, cmd_buffer);
} else {
trace_start_cmd_buffer_annotation(&cmd_buffer->trace, &cmd_buffer->cs,
cmd_buffer, strlen(label), label);
trace_start_cmd_buffer_annotation(&cmd_buffer->trace, &cmd_buffer->cs, cmd_buffer);
}
}
@ -4559,11 +4556,16 @@ tu_CmdEndDebugUtilsLabelEXT(VkCommandBuffer _commandBuffer)
{
VK_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, _commandBuffer);
if (cmd_buffer->state.pass) {
trace_end_cmd_buffer_annotation_rp(&cmd_buffer->trace,
&cmd_buffer->draw_cs);
} else {
trace_end_cmd_buffer_annotation(&cmd_buffer->trace, &cmd_buffer->cs);
if (cmd_buffer->vk.labels.size > 0) {
const VkDebugUtilsLabelEXT *label = util_dynarray_top_ptr(&cmd_buffer->vk.labels, VkDebugUtilsLabelEXT);
if (cmd_buffer->state.pass) {
trace_end_cmd_buffer_annotation_rp(&cmd_buffer->trace, &cmd_buffer->draw_cs, strlen(label->pLabelName),
label->pLabelName);
} else {
trace_end_cmd_buffer_annotation(&cmd_buffer->trace, &cmd_buffer->cs, strlen(label->pLabelName),
label->pLabelName);
}
}
vk_common_CmdEndDebugUtilsLabelEXT(_commandBuffer);
@ -4574,11 +4576,12 @@ tu_SetDebugUtilsObjectNameEXT(
VkDevice device,
const VkDebugUtilsObjectNameInfoEXT *pNameInfo)
{
UNUSED VK_FROM_HANDLE(tu_device, dev, device);
VkResult result = vk_common_SetDebugUtilsObjectNameEXT(device, pNameInfo);
#ifdef HAVE_PERFETTO
if (result == VK_SUCCESS)
tu_perfetto_set_debug_utils_object_name(pNameInfo);
tu_perfetto_set_debug_utils_object_name(dev, pNameInfo);
#endif
return result;

View file

@ -7,6 +7,7 @@
#include <perfetto.h>
#include "util/u_process.h"
#include "util/perf/u_perfetto_renderpass.h"
#include "tu_buffer.h"
@ -30,75 +31,13 @@ tu_device_ticks_to_ns(struct tu_device *dev, uint64_t ts);
struct u_trace_context *
tu_device_get_u_trace(struct tu_device *device);
/**
* Queue-id's
*/
enum tu_queue_id {
BR_HW_QUEUE_ID,
BV_HW_QUEUE_ID,
/* Labels set via VK_EXT_debug_utils are in a separate track due to the
* following part of the spec:
* "An application may open a debug label region in one command buffer and
* close it in another, or otherwise split debug label regions across
* multiple command buffers or multiple queue submissions."
*
* This means labels can start in one renderpass and end in another command
* buffer, which breaks our assumption that stages can be modeled as a stack.
* While applications aren't expected to use labels in such extreme ways,
* even simpler cases can break our assumptions.
*
* Having annotations in a separate track prevents the main track(s) from
* entering an invalid state.
*/
ANNOTATIONS_QUEUE_ID,
};
/**
* Render-stage id's
*/
enum tu_stage_id {
CMD_BUFFER_STAGE_ID,
CMD_BUFFER_ANNOTATION_STAGE_ID,
RENDER_PASS_STAGE_ID,
SECONDARY_CMD_BUFFER_STAGE_ID,
CMD_BUFFER_ANNOTATION_RENDER_PASS_STAGE_ID,
BINNING_STAGE_ID,
CONCURRENT_BINNING_STAGE_ID,
CONCURRENT_BINNING_BARRIER_STAGE_ID,
GMEM_STAGE_ID,
BYPASS_STAGE_ID,
BLIT_STAGE_ID,
DRAW_STAGE_ID,
COMPUTE_STAGE_ID,
CLEAR_SYSMEM_STAGE_ID,
CLEAR_GMEM_STAGE_ID,
GENERIC_CLEAR_STAGE_ID,
GMEM_LOAD_STAGE_ID,
GMEM_STORE_STAGE_ID,
SYSMEM_RESOLVE_STAGE_ID,
CUSTOM_RESOLVE_STAGE_ID,
CLEAR_COLOR_IMAGE_STAGE_ID,
CLEAR_DEPTH_STENCIL_IMAGE_STAGE_ID,
COPY_BUFFER_TO_IMAGE_STAGE_ID,
COPY_IMAGE_TO_BUFFER_STAGE_ID,
COPY_IMAGE_STAGE_ID,
RESOLVE_IMAGE_STAGE_ID,
FILL_BUFFER_STAGE_ID,
COPY_BUFFER_STAGE_ID,
UPDATE_BUFFER_STAGE_ID,
SLOW_CLEAR_LRZ_STAGE_ID,
DISABLE_LRZ_STAGE_ID,
// TODO add the rest from fd_stage_id
};
static const struct {
const char *name;
const char *desc;
} queues[] = {
[ANNOTATIONS_QUEUE_ID] = {"Annotations", "Annotations Queue"},
[BR_HW_QUEUE_ID] = {"GPU Queue 0", "Default Adreno Hardware Queue"},
[BV_HW_QUEUE_ID] = {"GPU Queue 1", "Adreno Bin Visibility Queue"},
[ANNOTATIONS_QUEUE_ID] = {"Annotations", "Annotations Queue"},
};
static const struct {
@ -140,6 +79,14 @@ static const struct {
};
static uint32_t gpu_clock_id;
/* Hands out a fresh interning id (iid) from one function-local counter that
 * every caller shares, so no two callers receive the same value.
 *
 * The counter starts at 1 and is bumped through p_atomic_inc_return();
 * assuming that helper yields the post-increment value, the first id handed
 * out is 2.
 */
static uint64_t
get_iid()
{
   static uint64_t last_iid = 1;
   const uint64_t fresh_iid = p_atomic_inc_return(&last_iid);
   return fresh_iid;
}
struct TuRenderpassTraits : public perfetto::DefaultDataSourceTraits {
using IncrementalStateType = MesaRenderpassIncrementalState;
};
@ -189,7 +136,8 @@ emit_sync_timestamp(struct tu_perfetto_clocks &clocks)
}
static void
setup_incremental_state(TuRenderpassDataSource::TraceContext &ctx)
setup_incremental_state(TuRenderpassDataSource::TraceContext &ctx,
struct tu_device *dev)
{
auto state = ctx.GetIncrementalState();
if (!state->was_cleared)
@ -201,32 +149,69 @@ setup_incremental_state(TuRenderpassDataSource::TraceContext &ctx)
auto packet = ctx.NewTracePacket();
packet->set_timestamp(perfetto::base::GetBootTimeNs().count());
packet->set_timestamp_clock_id(perfetto::protos::pbzero::BUILTIN_CLOCK_BOOTTIME);
/* This must be set before interned data is sent. */
packet->set_sequence_flags(perfetto::protos::pbzero::TracePacket::SEQ_INCREMENTAL_STATE_CLEARED);
packet->set_timestamp(0);
auto interned_data = packet->set_interned_data();
auto event = packet->set_gpu_render_stage_event();
event->set_gpu_id(0);
auto spec = event->set_specifications();
{
auto desc = interned_data->add_graphics_contexts();
desc->set_iid(dev->perfetto.context_iid);
desc->set_pid(getpid());
desc->set_api(perfetto::protos::pbzero::InternedGraphicsContext_Api::VULKAN);
}
for (unsigned i = 0; i < ARRAY_SIZE(queues); i++) {
auto desc = spec->add_hw_queue();
char name[100];
auto desc = interned_data->add_gpu_specifications();
desc->set_name(queues[i].name);
snprintf(name, sizeof(name), "%.10s-%02u-%s",
util_get_process_name(), i, queues[i].name);
desc->set_iid(dev->perfetto.queue_iids[i]);
desc->set_name(name);
desc->set_description(queues[i].desc);
}
for (unsigned i = 0; i < ARRAY_SIZE(stages); i++) {
auto desc = spec->add_stage();
auto desc = interned_data->add_gpu_specifications();
desc->set_iid(dev->perfetto.stage_iids[i]);
desc->set_name(stages[i].name);
if (stages[i].desc)
desc->set_description(stages[i].desc);
}
}
/* Prepares a perfetto state block for use: creates the clock-sync mutex,
 * resets the event counter and reserves fresh iids for the graphics context,
 * for every queue and for every stage.
 *
 * Must be paired with tu_perfetto_destroy_state().
 */
void
tu_perfetto_init_state(struct tu_perfetto_state *state)
{
   mtx_init(&state->pending_clocks_sync_mtx, mtx_plain);

   state->event_id = 0;

   /* iids are drawn in a fixed order: context first, then queues, then
    * stages.
    */
   state->context_iid = get_iid();

   for (uint64_t &queue_iid : state->queue_iids)
      queue_iid = get_iid();

   for (uint64_t &stage_iid : state->stage_iids)
      stage_iid = get_iid();
}
/* Counterpart of tu_perfetto_init_state(): destroys the clock-sync mutex,
 * which is the only resource the init function creates.
 */
void
tu_perfetto_destroy_state(struct tu_perfetto_state *state)
{
   mtx_t *clocks_sync_mtx = &state->pending_clocks_sync_mtx;

   mtx_destroy(clocks_sync_mtx);
}
/* Releases what a stage entry still holds once it is finished or abandoned:
 * frees the payload buffer and clears both the payload pointer and the
 * start-payload callback, so the entry carries no stale pointers afterwards.
 */
static void
stage_cleanup(struct tu_perfetto_stage *stage)
{
   void *owned_payload = (void *) stage->payload;

   stage->start_payload_function = nullptr;
   stage->payload = nullptr;

   /* free(NULL) is a no-op, so a stage without payload needs no guard. */
   free(owned_payload);
}
static struct tu_perfetto_stage *
stage_push(struct tu_perfetto_stage_stack *stack)
{
@ -268,7 +253,6 @@ static void
stage_start(struct tu_device *dev,
uint64_t ts_ns,
enum tu_stage_id stage_id,
const char *app_event,
const void *payload = nullptr,
size_t payload_size = 0,
const void *indirect = nullptr,
@ -293,22 +277,15 @@ stage_start(struct tu_device *dev,
*stage = (struct tu_perfetto_stage) {
.stage_id = stage_id,
.stage_iid = 0,
.start_ts = ts_ns,
.payload = payload,
.start_payload_function = (void *) payload_as_extra,
};
if (app_event) {
TuRenderpassDataSource::Trace([=](auto tctx) {
stage->stage_iid =
tctx.GetDataSourceLocked()->debug_marker_stage(tctx, app_event);
});
}
}
static void
stage_end(struct tu_device *dev, uint64_t ts_ns, enum tu_stage_id stage_id,
const char *app_event,
const void *flush_data,
const void* payload = nullptr,
const void *indirect = nullptr,
@ -327,12 +304,15 @@ stage_end(struct tu_device *dev, uint64_t ts_ns, enum tu_stage_id stage_id,
uint64_t duration = ts_ns - stage->start_ts;
/* Zero duration can only happen when tracepoints did not happen on GPU. */
if (duration == 0)
if (duration == 0) {
stage_cleanup(stage);
return;
}
if (stage->stage_id != stage_id) {
PERFETTO_ELOG("stage %d ended while stage %d is expected",
stage_id, stage->stage_id);
stage_cleanup(stage);
return;
}
@ -366,7 +346,11 @@ stage_end(struct tu_device *dev, uint64_t ts_ns, enum tu_stage_id stage_id,
}
TuRenderpassDataSource::Trace([=](TuRenderpassDataSource::TraceContext tctx) {
setup_incremental_state(tctx);
setup_incremental_state(tctx, dev);
uint64_t stage_iid = app_event ?
tctx.GetDataSourceLocked()->debug_marker_stage(tctx, app_event) :
state->stage_iids[stage->stage_id];
auto packet = tctx.NewTracePacket();
@ -376,26 +360,24 @@ stage_end(struct tu_device *dev, uint64_t ts_ns, enum tu_stage_id stage_id,
packet->set_timestamp_clock_id(gpu_clock_id);
auto event = packet->set_gpu_render_stage_event();
event->set_event_id(0); // ???
event->set_hw_queue_id(queue_id);
event->set_event_id(state->event_id++);
event->set_hw_queue_iid(state->queue_iids[queue_id]);
event->set_duration(ts_ns - stage->start_ts);
if (stage->stage_iid)
event->set_stage_iid(stage->stage_iid);
else
event->set_stage_id(stage->stage_id);
event->set_context((uintptr_t) dev);
event->set_stage_iid(stage_iid);
event->set_context(state->context_iid);
event->set_submission_id(submission_id);
if (stage->payload) {
if (stage->start_payload_function)
((trace_payload_as_extra_func) stage->start_payload_function)(
event, stage->payload, nullptr);
free((void *)stage->payload);
}
if (payload && payload_as_extra)
payload_as_extra(event, payload, indirect);
});
stage_cleanup(stage);
}
class TuMemoryDataSource : public perfetto::DataSource<TuMemoryDataSource> {
@ -591,7 +573,7 @@ tu_perfetto_end_submit(struct tu_queue *queue,
const void *indirect_data) \
{ \
stage_start( \
dev, ts_ns, stage_id, NULL, payload, sizeof(*payload), indirect_data, \
dev, ts_ns, stage_id, payload, sizeof(*payload), indirect_data, \
(trace_payload_as_extra_func) &trace_payload_as_extra_start_##event_name); \
} \
\
@ -601,7 +583,7 @@ tu_perfetto_end_submit(struct tu_queue *queue,
const void *indirect_data) \
{ \
stage_end( \
dev, ts_ns, stage_id, flush_data, payload, indirect_data, \
dev, ts_ns, stage_id, NULL, flush_data, payload, indirect_data, \
(trace_payload_as_extra_func) &trace_payload_as_extra_end_##event_name); \
}
@ -647,7 +629,7 @@ tu_perfetto_start_cmd_buffer_annotation(
const void *indirect_data)
{
/* No extra func necessary, the only arg is in the end payload.*/
stage_start(dev, ts_ns, CMD_BUFFER_ANNOTATION_STAGE_ID, payload->str, payload,
stage_start(dev, ts_ns, CMD_BUFFER_ANNOTATION_STAGE_ID, payload,
sizeof(*payload), NULL);
}
@ -663,7 +645,7 @@ tu_perfetto_end_cmd_buffer_annotation(
/* Pass the payload string as the app_event, which will appear right on the
* event block, rather than as metadata inside.
*/
stage_end(dev, ts_ns, CMD_BUFFER_ANNOTATION_STAGE_ID, flush_data,
stage_end(dev, ts_ns, CMD_BUFFER_ANNOTATION_STAGE_ID, payload->str, flush_data,
payload, NULL);
}
@ -678,7 +660,7 @@ tu_perfetto_start_cmd_buffer_annotation_rp(
{
/* No extra func necessary, the only arg is in the end payload.*/
stage_start(dev, ts_ns, CMD_BUFFER_ANNOTATION_RENDER_PASS_STAGE_ID,
payload->str, payload, sizeof(*payload), NULL);
payload, sizeof(*payload), NULL);
}
void
@ -694,7 +676,7 @@ tu_perfetto_end_cmd_buffer_annotation_rp(
* event block, rather than as metadata inside.
*/
stage_end(dev, ts_ns, CMD_BUFFER_ANNOTATION_RENDER_PASS_STAGE_ID,
flush_data, payload, NULL);
payload->str, flush_data, payload, NULL);
}
@ -773,22 +755,22 @@ tu_perfetto_log_destroy_image(struct tu_device *dev, struct tu_image *image)
void
tu_perfetto_set_debug_utils_object_name(const VkDebugUtilsObjectNameInfoEXT *pNameInfo)
tu_perfetto_set_debug_utils_object_name(struct tu_device *dev, const VkDebugUtilsObjectNameInfoEXT *pNameInfo)
{
TuRenderpassDataSource::Trace([=](auto tctx) {
/* Do we need this for SEQ_INCREMENTAL_STATE_CLEARED for the object name to stick? */
setup_incremental_state(tctx);
setup_incremental_state(tctx, dev);
tctx.GetDataSourceLocked()->SetDebugUtilsObjectNameEXT(tctx, pNameInfo);
});
}
void
tu_perfetto_refresh_debug_utils_object_name(const struct vk_object_base *object)
tu_perfetto_refresh_debug_utils_object_name(struct tu_device *dev, const struct vk_object_base *object)
{
TuRenderpassDataSource::Trace([=](auto tctx) {
/* Do we need this for SEQ_INCREMENTAL_STATE_CLEARED for the object name to stick? */
setup_incremental_state(tctx);
setup_incremental_state(tctx, dev);
tctx.GetDataSourceLocked()->RefreshSetDebugUtilsObjectNameEXT(tctx, object);
});

View file

@ -20,16 +20,76 @@ extern "C" {
#define TU_PERFETTO_MAX_STACK_DEPTH 8
/**
 * Queue ids.
 *
 * Identifies the queue a render stage event is reported on. The values are
 * used as indices into tu_perfetto_state::queue_iids and into the queues[]
 * name/description table, so the enumerator order must stay in sync with
 * that table.
 */
enum tu_queue_id {
/* Labels set via VK_EXT_debug_utils are in a separate track due to the
 * following part of the spec:
 * "An application may open a debug label region in one command buffer and
 * close it in another, or otherwise split debug label regions across
 * multiple command buffers or multiple queue submissions."
 *
 * This means labels can start in one renderpass and end in another command
 * buffer, which breaks our assumption that stages can be modeled as a stack.
 * While applications aren't expected to use labels in such extreme ways,
 * even simpler cases can break our assumptions.
 *
 * Having annotations in a separate track prevents the main track(s) from
 * entering an invalid state.
 */
ANNOTATIONS_QUEUE_ID,
/* Default Adreno hardware queue. */
BR_HW_QUEUE_ID,
/* Adreno bin visibility queue. */
BV_HW_QUEUE_ID,
/* Number of queue ids; sizes tu_perfetto_state::queue_iids. Keep last. */
TU_QUEUE_ID_COUNT,
};
/**
 * Render-stage ids.
 *
 * Each value names one kind of stage that can be reported to perfetto. The
 * values are used as indices into tu_perfetto_state::stage_iids, and
 * setup_incremental_state() pairs stages[i] with stage_iids[i], so the
 * enumerator order must match the stages[] name table.
 */
enum tu_stage_id {
CMD_BUFFER_STAGE_ID,
CMD_BUFFER_ANNOTATION_STAGE_ID,
RENDER_PASS_STAGE_ID,
SECONDARY_CMD_BUFFER_STAGE_ID,
CMD_BUFFER_ANNOTATION_RENDER_PASS_STAGE_ID,
BINNING_STAGE_ID,
CONCURRENT_BINNING_STAGE_ID,
CONCURRENT_BINNING_BARRIER_STAGE_ID,
GMEM_STAGE_ID,
BYPASS_STAGE_ID,
BLIT_STAGE_ID,
DRAW_STAGE_ID,
COMPUTE_STAGE_ID,
CLEAR_SYSMEM_STAGE_ID,
CLEAR_GMEM_STAGE_ID,
GENERIC_CLEAR_STAGE_ID,
GMEM_LOAD_STAGE_ID,
GMEM_STORE_STAGE_ID,
SYSMEM_RESOLVE_STAGE_ID,
CUSTOM_RESOLVE_STAGE_ID,
CLEAR_COLOR_IMAGE_STAGE_ID,
CLEAR_DEPTH_STENCIL_IMAGE_STAGE_ID,
COPY_BUFFER_TO_IMAGE_STAGE_ID,
COPY_IMAGE_TO_BUFFER_STAGE_ID,
COPY_IMAGE_STAGE_ID,
RESOLVE_IMAGE_STAGE_ID,
FILL_BUFFER_STAGE_ID,
COPY_BUFFER_STAGE_ID,
UPDATE_BUFFER_STAGE_ID,
SLOW_CLEAR_LRZ_STAGE_ID,
DISABLE_LRZ_STAGE_ID,
/* Number of stage ids; sizes tu_perfetto_state::stage_iids. Keep last. */
TU_STAGE_ID_COUNT,
};
struct tu_device;
struct tu_queue;
struct tu_u_trace_submission_data;
struct tu_perfetto_stage {
int stage_id;
/* dynamically allocated stage iid, for app_events. 0 if stage_id should be
* used instead.
*/
uint64_t stage_iid;
enum tu_stage_id stage_id;
uint64_t start_ts;
const void* payload;
void* start_payload_function;
@ -52,6 +112,11 @@ struct tu_perfetto_state {
struct tu_perfetto_stage_stack annotations_stack;
struct tu_perfetto_stage_stack render_stack;
uint64_t context_iid;
uint64_t queue_iids[TU_QUEUE_ID_COUNT];
uint64_t stage_iids[TU_STAGE_ID_COUNT];
uint64_t event_id;
bool has_pending_clocks_sync;
mtx_t pending_clocks_sync_mtx;
struct tu_perfetto_clocks pending_clocks_sync;
@ -66,6 +131,8 @@ struct tu_perfetto_state {
};
void tu_perfetto_init(void);
void tu_perfetto_init_state(struct tu_perfetto_state *state);
void tu_perfetto_destroy_state(struct tu_perfetto_state *state);
uint64_t
tu_perfetto_begin_submit();
@ -86,10 +153,12 @@ void tu_perfetto_log_destroy_image(struct tu_device *dev, struct tu_image *image
void
tu_perfetto_set_debug_utils_object_name(
struct tu_device *dev,
const VkDebugUtilsObjectNameInfoEXT *pNameInfo);
void
tu_perfetto_refresh_debug_utils_object_name(
struct tu_device *dev,
const struct vk_object_base *object);
#ifdef __cplusplus

View file

@ -548,7 +548,7 @@ queue_submit(struct vk_queue *_queue, struct vk_queue_submit *vk_submit)
#ifdef HAVE_PERFETTO
if (u_trace_should_process(&device->trace_context)) {
for (int i = 0; i < vk_submit->command_buffer_count; i++)
tu_perfetto_refresh_debug_utils_object_name(
tu_perfetto_refresh_debug_utils_object_name(device,
&vk_submit->command_buffers[i]->base);
}
#endif

View file

@ -266,9 +266,9 @@ begin_end_tp('compute_indirect',
# Annotations for Cmd(Begin|End)DebugUtilsLabelEXT
for suffix in ["", "_rp"]:
begin_end_tp('cmd_buffer_annotation' + suffix,
args=[Arg(type='unsigned', var='len'),
Arg(type='str', var='str', c_format='%s', length_arg='len + 1', copy_func='strncpy'),],
tp_struct=[Arg(type='uint8_t', name='dummy', var='0'),])
end_args=[Arg(type='unsigned', var='len'),
Arg(type='str', var='str', c_format='%s', length_arg='len + 1', copy_func='strncpy'),],
end_tp_struct=[Arg(type='uint8_t', name='dummy', var='0'),])
utrace_generate(cpath=args.utrace_src,
hpath=args.utrace_hdr,