tu: Cache copy timestamp cs to avoid allocations on submit

For non-ONE_TIME_SUBMIT command buffers we have to create a CS that
copies u_trace timestamps. Keeping a pool of such CS allows us to
avoid allocations in most cases.

Together with the previous commit, this makes BO allocations on
submit rare during normal u_trace tracing.

Signed-off-by: Danylo Piliaiev <dpiliaiev@igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/35637>
Author: Danylo Piliaiev <dpiliaiev@igalia.com>
Date:   2025-06-19 15:00:14 +02:00, committed by Marge Bot
parent a1b9aa3385
commit 6e5944ec4b
3 changed files with 100 additions and 69 deletions
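Before the diffs, a minimal standalone sketch of the caching pattern this commit applies: consult a device-level free list before hitting the allocator, and push entries back after use. The pool_entry/pool_acquire/pool_release names are illustrative only; the driver itself uses Mesa's intrusive util/list.h lists, vk_zalloc, and tu_copy_timestamp_data entries.

#include <stdlib.h>
#include <stdio.h>

/* One pooled entry; the driver's equivalent is tu_copy_timestamp_data,
 * which holds a command stream and a u_trace. */
struct pool_entry {
   struct pool_entry *next; /* free-list link; the driver embeds a
                             * struct list_head node here instead */
};

static struct pool_entry *pool_head;

/* Fast path: pop a cached entry. Slow path: fall back to the allocator,
 * mirroring the list_is_empty() check in tu_create_copy_timestamp_cs. */
static struct pool_entry *
pool_acquire(void)
{
   if (pool_head) {
      struct pool_entry *e = pool_head;
      pool_head = e->next;
      return e;
   }
   return calloc(1, sizeof(struct pool_entry));
}

/* Once a submission is done with the entry, push it back for reuse,
 * mirroring the list_addtail() in tu_u_trace_submission_data_finish. */
static void
pool_release(struct pool_entry *e)
{
   e->next = pool_head;
   pool_head = e;
}

int
main(void)
{
   struct pool_entry *a = pool_acquire(); /* allocates: pool is empty */
   pool_release(a);
   struct pool_entry *b = pool_acquire(); /* no allocation: reuses 'a' */
   printf("reused: %s\n", a == b ? "yes" : "no");
   free(b);
   return 0;
}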


@@ -2173,43 +2173,56 @@ tu_cmd_end_iterator(struct tu_cmd_buffer *cmdbuf)
       return u_trace_end_iterator(&cmdbuf->trace);
    }
 }

 VkResult
-tu_create_copy_timestamp_cs(struct tu_cmd_buffer *cmdbuf, struct tu_cs** cs,
-                            struct u_trace **trace_copy)
+tu_create_copy_timestamp_cs(struct tu_u_trace_submission_data *submission_data,
+                            struct tu_cmd_buffer **cmd_buffers,
+                            uint32_t cmd_buffer_count,
+                            uint32_t trace_chunks_to_copy)
 {
-   *cs = (struct tu_cs *) vk_zalloc(&cmdbuf->device->vk.alloc,
-                                    sizeof(struct tu_cs), 8,
-                                    VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
-
-   if (*cs == NULL) {
-      return VK_ERROR_OUT_OF_HOST_MEMORY;
-   }
-
-   tu_cs_init(*cs, cmdbuf->device, TU_CS_MODE_GROW,
-              list_length(&cmdbuf->trace.trace_chunks) * 6 * 2 + 3, "trace copy timestamp cs");
-
-   tu_cs_begin(*cs);
-
-   tu_cs_emit_wfi(*cs);
-   tu_cs_emit_pkt7(*cs, CP_WAIT_FOR_ME, 0);
-
-   *trace_copy = (struct u_trace *) vk_zalloc(
-      &cmdbuf->device->vk.alloc, sizeof(struct u_trace), 8,
-      VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
-
-   if (*trace_copy == NULL) {
-      return VK_ERROR_OUT_OF_HOST_MEMORY;
-   }
-
-   u_trace_init(*trace_copy, cmdbuf->trace.utctx);
-   u_trace_clone_append(tu_cmd_begin_iterator(cmdbuf),
-                        tu_cmd_end_iterator(cmdbuf),
-                        *trace_copy, *cs,
-                        tu_copy_buffer);
-
-   tu_cs_emit_wfi(*cs);
-   tu_cs_end(*cs);
+   submission_data->last_buffer_with_tracepoints = -1;
+
+   tu_device *device = cmd_buffers[0]->device;
+   uint32_t cs_size = trace_chunks_to_copy * 6 * 2 + 3;
+
+   if (!list_is_empty(&device->copy_timestamp_cs_pool)) {
+      submission_data->timestamp_copy_data =
+         list_first_entry(&device->copy_timestamp_cs_pool,
+                          struct tu_copy_timestamp_data, node);
+      list_del(&submission_data->timestamp_copy_data->node);
+   } else {
+      submission_data->timestamp_copy_data =
+         (struct tu_copy_timestamp_data *) vk_zalloc(
+            &device->vk.alloc, sizeof(struct tu_copy_timestamp_data), 8,
+            VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
+      tu_cs_init(&submission_data->timestamp_copy_data->cs, device,
+                 TU_CS_MODE_GROW, cs_size, "trace copy timestamp cs");
+      u_trace_init(&submission_data->timestamp_copy_data->trace,
+                   &device->trace_context);
+   }
+
+   tu_cs *cs = &submission_data->timestamp_copy_data->cs;
+
+   tu_cs_begin(cs);
+
+   tu_cs_emit_wfi(cs);
+   tu_cs_emit_pkt7(cs, CP_WAIT_FOR_ME, 0);
+
+   for (uint32_t i = 0; i < cmd_buffer_count; i++) {
+      struct tu_cmd_buffer *cmdbuf = cmd_buffers[i];
+      if (!u_trace_has_points(&cmdbuf->trace) ||
+          (cmdbuf->usage_flags & VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT))
+         continue;
+
+      u_trace_clone_append(
+         tu_cmd_begin_iterator(cmdbuf), tu_cmd_end_iterator(cmdbuf),
+         &submission_data->timestamp_copy_data->trace, cs, tu_copy_buffer);
+   }
+
+   tu_cs_emit_wfi(cs);
+   tu_cs_end(cs);

    return VK_SUCCESS;
 }
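The function now builds at most one copy CS per submission: the timestamps of every reusable (non-ONE_TIME_SUBMIT) command buffer are cloned into a single tu_copy_timestamp_data, taken from device->copy_timestamp_cs_pool when one is available. A recycled entry deliberately skips tu_cs_init/u_trace_init; it was only tu_cs_reset on its way into the pool, and (as far as I can tell from turnip's tu_cs code) a reset growable CS keeps its largest backing BO, which is what makes BO allocations on submit rare, per the commit message.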
@@ -2231,13 +2244,14 @@ tu_u_trace_submission_data_create(
    }

    struct tu_u_trace_submission_data *data = *submission_data;
+   uint32_t trace_chunks_to_copy = 0;

-   data->cmd_trace_data = (struct tu_u_trace_cmd_data *) vk_zalloc(
+   data->trace_per_cmd_buffer = (struct u_trace **) vk_zalloc(
       &device->vk.alloc,
-      cmd_buffer_count * sizeof(struct tu_u_trace_cmd_data), 8,
+      cmd_buffer_count * sizeof(struct u_trace *), 8,
       VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);

-   if (!data->cmd_trace_data) {
+   if (!data->trace_per_cmd_buffer) {
       goto fail;
    }
@@ -2258,19 +2272,16 @@ tu_u_trace_submission_data_create(
          * single-use. Therefor we have to copy trace points and create
          * a new timestamp buffer on every submit of reusable command buffer.
          */
-         if (tu_create_copy_timestamp_cs(cmdbuf,
-                                         &data->cmd_trace_data[i].timestamp_copy_cs,
-                                         &data->cmd_trace_data[i].trace) != VK_SUCCESS) {
-            goto fail;
-         }
-
-         assert(data->cmd_trace_data[i].timestamp_copy_cs->entry_count == 1);
+         trace_chunks_to_copy += list_length(&cmdbuf->trace.trace_chunks);
       } else {
-         data->cmd_trace_data[i].trace = &cmdbuf->trace;
+         data->trace_per_cmd_buffer[i] = &cmdbuf->trace;
       }
    }

    assert(data->last_buffer_with_tracepoints != -1);

+   if (trace_chunks_to_copy > 0) {
+      tu_create_copy_timestamp_cs(data, cmd_buffers, cmd_buffer_count,
+                                  trace_chunks_to_copy);
+   }
+
    return VK_SUCCESS;
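Sizing became a two-pass affair here: instead of a fallible per-command-buffer tu_create_copy_timestamp_cs call inside the loop, the loop now only sums trace chunks, and a single copy CS sized from trace_chunks_to_copy is set up once afterwards, and only when there is actually something to copy.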
@@ -2281,20 +2292,31 @@ fail:
    return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
 }

+void
+tu_free_copy_timestamp_data(struct tu_device *device,
+                            struct tu_copy_timestamp_data *data)
+{
+   if (list_is_linked(&data->node))
+      list_del(&data->node);
+
+   tu_cs_finish(&data->cs);
+   u_trace_fini(&data->trace);
+   vk_free(&device->vk.alloc, data);
+}
+
 void
 tu_u_trace_submission_data_finish(
    struct tu_device *device,
    struct tu_u_trace_submission_data *submission_data)
 {
-   for (uint32_t i = 0; i < submission_data->cmd_buffer_count; ++i) {
-      /* Only if we had to create a copy of trace we should free it */
-      struct tu_u_trace_cmd_data *cmd_data = &submission_data->cmd_trace_data[i];
-      if (cmd_data->timestamp_copy_cs) {
-         tu_cs_finish(cmd_data->timestamp_copy_cs);
-         vk_free(&device->vk.alloc, cmd_data->timestamp_copy_cs);
-         u_trace_fini(cmd_data->trace);
-         vk_free(&device->vk.alloc, cmd_data->trace);
-      }
+   if (submission_data->timestamp_copy_data) {
+      if (u_trace_enabled(&device->trace_context)) {
+         tu_cs_reset(&submission_data->timestamp_copy_data->cs);
+         u_trace_fini(&submission_data->timestamp_copy_data->trace);
+         list_addtail(&submission_data->timestamp_copy_data->node,
+                      &device->copy_timestamp_cs_pool);
+      } else {
+         tu_free_copy_timestamp_data(device,
+                                     submission_data->timestamp_copy_data);
+      }
    }
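Recycling is gated on u_trace_enabled(): while tracing is still active the entry goes back on the pool with its CS reset, otherwise it is freed outright, so the pool does not keep BOs alive after tracing stops. The list_is_linked() check in tu_free_copy_timestamp_data lets the same helper handle both pooled entries and ones that never made it onto the list.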
@@ -2305,7 +2327,7 @@ tu_u_trace_submission_data_finish(
       mtx_unlock(&device->kgsl_profiling_mutex);
    }

-   vk_free(&device->vk.alloc, submission_data->cmd_trace_data);
+   vk_free(&device->vk.alloc, submission_data->trace_per_cmd_buffer);
    vk_free(&device->vk.alloc, submission_data);
 }
@@ -2866,6 +2888,7 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice,
    tu_gpu_tracepoint_config_variable();

    device->submit_count = 0;
+   list_inithead(&device->copy_timestamp_cs_pool);

    u_trace_context_init(&device->trace_context, device,
                         sizeof(uint64_t),
                         12,
@@ -3025,6 +3048,12 @@ tu_DestroyDevice(VkDevice _device, const VkAllocationCallbacks *pAllocator)
    if (device->null_accel_struct_bo)
       tu_bo_finish(device, device->null_accel_struct_bo);

+   list_for_each_entry_safe(struct tu_copy_timestamp_data, data,
+                            &device->copy_timestamp_cs_pool, node)
+   {
+      tu_free_copy_timestamp_data(device, data);
+   }
+
    if (device->trace_suballoc) {
       tu_bo_suballocator_finish(device->trace_suballoc);
       vk_free(&device->vk.alloc, device->trace_suballoc);
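The drain uses list_for_each_entry_safe since tu_free_copy_timestamp_data unlinks each node while the loop is still iterating over the list.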


@@ -442,6 +442,7 @@ struct tu_device
    uint64_t fault_count;

    struct u_trace_context trace_context;
+   struct list_head copy_timestamp_cs_pool;

 #ifdef HAVE_PERFETTO
    struct tu_perfetto_state perfetto;
@@ -557,16 +558,16 @@ tu_copy_buffer(struct u_trace_context *utctx, void *cmdstream,
                void *ts_to, uint64_t to_offset_B,
                uint64_t size_B);

 VkResult
-tu_create_copy_timestamp_cs(struct tu_cmd_buffer *cmdbuf, struct tu_cs** cs,
-                            struct u_trace **trace_copy);
+tu_create_copy_timestamp_cs(struct tu_u_trace_submission_data *submission_data,
+                            struct tu_cmd_buffer **cmd_buffers,
+                            uint32_t cmd_buffer_count,
+                            uint32_t trace_chunks_to_copy);

-/* If we copy trace and timestamps we will have to free them. */
-struct tu_u_trace_cmd_data
-{
-   struct tu_cs *timestamp_copy_cs;
-   struct u_trace *trace;
+struct tu_copy_timestamp_data {
+   struct list_head node;
+   struct tu_cs cs;
+   struct u_trace trace;
 };

 /* Data necessary to retrieve timestamps and clean all
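tu_copy_timestamp_data embeds its pool link (struct list_head node) directly in the struct — an intrusive list, so pooling needs no auxiliary allocations. A minimal sketch of how list_first_entry() gets from the embedded node back to the owning struct; copy_data and the hand-rolled container_of below are illustrative, Mesa's util/list.h provides the real macros:

#include <stddef.h>
#include <stdio.h>

/* Minimal intrusive list node, in the style of util/list.h. */
struct list_head {
   struct list_head *prev, *next;
};

struct copy_data {           /* stand-in for tu_copy_timestamp_data */
   int payload;
   struct list_head node;    /* embedded link, no separate allocation */
};

/* Step back from the embedded node to the owning struct; this is what
 * list_first_entry() expands to under the hood. */
#define container_of(ptr, type, member) \
   ((type *) ((char *) (ptr) - offsetof(type, member)))

int
main(void)
{
   struct copy_data d = { .payload = 42 };
   struct list_head *n = &d.node;   /* what the pool list stores */
   struct copy_data *back = container_of(n, struct copy_data, node);
   printf("%d\n", back->payload);   /* prints 42 */
   return 0;
}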
@@ -584,7 +585,9 @@ struct tu_u_trace_submission_data
    uint32_t cmd_buffer_count;
    uint32_t last_buffer_with_tracepoints;
-   struct tu_u_trace_cmd_data *cmd_trace_data;
    void *mem_ctx;
+   struct u_trace **trace_per_cmd_buffer;
+   struct tu_copy_timestamp_data *timestamp_copy_data;

    /* GPU time is reset on GPU power cycle and the GPU time
     * offset may change between submissions due to power cycle.


@@ -124,11 +124,10 @@ queue_submit(struct vk_queue *_queue, struct vk_queue_submit *vk_submit)
                          cs->entry_count);

       if (u_trace_submission_data &&
-          u_trace_submission_data->cmd_trace_data[i].timestamp_copy_cs) {
-         struct tu_cs_entry *trace_cs_entry =
-            &u_trace_submission_data->cmd_trace_data[i]
-                .timestamp_copy_cs->entries[0];
-         submit_add_entries(device, submit, &dump_cmds, trace_cs_entry, 1);
+          u_trace_submission_data->timestamp_copy_data) {
+         struct tu_cs *cs = &u_trace_submission_data->timestamp_copy_data->cs;
+         submit_add_entries(device, submit, &dump_cmds, cs->entries,
+                            cs->entry_count);
       }
    }
@@ -209,15 +208,15 @@ queue_submit(struct vk_queue *_queue, struct vk_queue_submit *vk_submit)
       for (uint32_t i = 0; i < u_trace_submission_data->cmd_buffer_count; i++) {
          bool free_data = i == u_trace_submission_data->last_buffer_with_tracepoints;

-         if (u_trace_submission_data->cmd_trace_data[i].trace)
-            u_trace_flush(u_trace_submission_data->cmd_trace_data[i].trace,
+         if (u_trace_submission_data->trace_per_cmd_buffer[i])
+            u_trace_flush(u_trace_submission_data->trace_per_cmd_buffer[i],
                           u_trace_submission_data, queue->device->vk.current_frame,
                           free_data);
-
-         if (!u_trace_submission_data->cmd_trace_data[i].timestamp_copy_cs) {
-            /* u_trace is owned by cmd_buffer */
-            u_trace_submission_data->cmd_trace_data[i].trace = NULL;
-         }
       }
+
+      if (u_trace_submission_data->timestamp_copy_data) {
+         u_trace_flush(&u_trace_submission_data->timestamp_copy_data->trace,
+                       u_trace_submission_data, queue->device->vk.current_frame,
+                       true);
+      }
    }
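Note the ownership hand-off that makes the final u_trace_flush(..., true) safe: tu_create_copy_timestamp_cs resets last_buffer_with_tracepoints to -1 whenever a copy CS exists, so none of the per-command-buffer flushes above can pass free_data = true; the copied trace's flush is then the single point that releases (or, via tu_u_trace_submission_data_finish, recycles) the submission data.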