From 5c6f0d46e7a44e30c743e97adabbfde45c5c3188 Mon Sep 17 00:00:00 2001 From: Danylo Piliaiev Date: Mon, 7 Jun 2021 13:16:25 +0300 Subject: [PATCH] turnip/perfetto: reusable command buffers support The limitation is that the reusable command buffer should be created when perfetto is already connected in order to write timestamps. Otherwise such cmd buffer won't be traced. Signed-off-by: Danylo Piliaiev Reviewed-by: Rob Clark Reviewed-by: Hyunjun Ko Part-of: --- src/freedreno/vulkan/tu_device.c | 76 ++++++++++++++++++++ src/freedreno/vulkan/tu_drm.c | 113 +++++++++++++++++++++++------- src/freedreno/vulkan/tu_private.h | 25 +++++++ 3 files changed, 187 insertions(+), 27 deletions(-) diff --git a/src/freedreno/vulkan/tu_device.c b/src/freedreno/vulkan/tu_device.c index c579f391065..0538915063e 100644 --- a/src/freedreno/vulkan/tu_device.c +++ b/src/freedreno/vulkan/tu_device.c @@ -1326,10 +1326,86 @@ tu_trace_delete_flush_data(struct u_trace_context *utctx, void *flush_data) container_of(utctx, struct tu_device, trace_context); struct tu_u_trace_flush_data *trace_flush_data = flush_data; + tu_u_trace_cmd_data_finish(device, trace_flush_data->cmd_trace_data, + trace_flush_data->trace_count); vk_free(&device->vk.alloc, trace_flush_data->syncobj); vk_free(&device->vk.alloc, trace_flush_data); } +void +tu_copy_timestamp_buffer(struct u_trace_context *utctx, void *cmdstream, + void *ts_from, uint32_t from_offset, + void *ts_to, uint32_t to_offset, + uint32_t count) +{ + struct tu_cs *cs = cmdstream; + struct tu_bo *bo_from = ts_from; + struct tu_bo *bo_to = ts_to; + + tu_cs_emit_pkt7(cs, CP_MEMCPY, 5); + tu_cs_emit(cs, count * sizeof(uint64_t) / sizeof(uint32_t)); + tu_cs_emit_qw(cs, bo_from->iova + from_offset * sizeof(uint64_t)); + tu_cs_emit_qw(cs, bo_to->iova + to_offset * sizeof(uint64_t)); +} +VkResult +tu_create_copy_timestamp_cs(struct tu_cmd_buffer *cmdbuf, struct tu_cs** cs, + struct u_trace **trace_copy) +{ + *cs = vk_zalloc(&cmdbuf->device->vk.alloc, 
sizeof(struct tu_cs), 8, + VK_SYSTEM_ALLOCATION_SCOPE_DEVICE); + + if (*cs == NULL) { + return VK_ERROR_OUT_OF_HOST_MEMORY; + } + + tu_cs_init(*cs, cmdbuf->device, TU_CS_MODE_GROW, + list_length(&cmdbuf->trace.trace_chunks) * 6 + 3); + + tu_cs_begin(*cs); + + tu_cs_emit_wfi(*cs); + tu_cs_emit_pkt7(*cs, CP_WAIT_FOR_ME, 0); + + *trace_copy = vk_zalloc(&cmdbuf->device->vk.alloc, sizeof(struct u_trace), 8, + VK_SYSTEM_ALLOCATION_SCOPE_DEVICE); + + if (*trace_copy == NULL) { + return VK_ERROR_OUT_OF_HOST_MEMORY; + } + + u_trace_init(*trace_copy, cmdbuf->trace.utctx); + u_trace_clone_append(u_trace_begin_iterator(&cmdbuf->trace), + u_trace_end_iterator(&cmdbuf->trace), + *trace_copy, *cs, + tu_copy_timestamp_buffer); + + tu_cs_emit_wfi(*cs); + + tu_cs_end(*cs); + + return VK_SUCCESS; +} + +void +tu_u_trace_cmd_data_finish(struct tu_device *device, + struct tu_u_trace_cmd_data *trace_data, + uint32_t entry_count) +{ + for (uint32_t i = 0; i < entry_count; ++i) { + /* Only if we had to create a copy of trace we should free it */ + if (trace_data[i].timestamp_copy_cs != NULL) { + tu_cs_finish(trace_data[i].timestamp_copy_cs); + vk_free(&device->vk.alloc, trace_data[i].timestamp_copy_cs); + + u_trace_fini(trace_data[i].trace); + vk_free(&device->vk.alloc, trace_data[i].trace); + } + } + + vk_free(&device->vk.alloc, trace_data); +} + VKAPI_ATTR VkResult VKAPI_CALL tu_CreateDevice(VkPhysicalDevice physicalDevice, const VkDeviceCreateInfo *pCreateInfo, diff --git a/src/freedreno/vulkan/tu_drm.c b/src/freedreno/vulkan/tu_drm.c index d11c3164fd6..5ec0d531fe4 100644 --- a/src/freedreno/vulkan/tu_drm.c +++ b/src/freedreno/vulkan/tu_drm.c @@ -38,6 +38,8 @@ #include "tu_private.h" +#include "tu_cs.h" + struct tu_binary_syncobj { uint32_t permanent, temporary; }; @@ -85,6 +87,7 @@ struct tu_queue_submit struct list_head link; VkCommandBuffer *cmd_buffers; + struct tu_u_trace_cmd_data *cmd_buffer_trace_data; uint32_t cmd_buffer_count; struct tu_syncobj **wait_semaphores; @@ -938,6 
+941,9 @@ tu_queue_submit_create_locked(struct tu_queue *queue, } } + bool u_trace_enabled = u_trace_context_tracing(&queue->device->trace_context); + bool has_trace_points = false; + uint32_t entry_count = 0; for (uint32_t j = 0; j < new_submit->cmd_buffer_count; ++j) { TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, new_submit->cmd_buffers[j]); @@ -946,6 +952,13 @@ tu_queue_submit_create_locked(struct tu_queue *queue, entry_count++; entry_count += cmdbuf->cs.entry_count; + + if (u_trace_enabled && u_trace_has_points(&cmdbuf->trace)) { + if (!(cmdbuf->usage_flags & VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT)) + entry_count++; + + has_trace_points = true; + } } new_submit->cmds = vk_zalloc(&queue->device->vk.alloc, @@ -957,6 +970,39 @@ tu_queue_submit_create_locked(struct tu_queue *queue, goto fail_cmds; } + if (has_trace_points) { + new_submit->cmd_buffer_trace_data = vk_zalloc(&queue->device->vk.alloc, + new_submit->cmd_buffer_count * sizeof(struct tu_u_trace_cmd_data), 8, + VK_SYSTEM_ALLOCATION_SCOPE_DEVICE); + + if (new_submit->cmd_buffer_trace_data == NULL) { + result = vk_error(queue->device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + goto fail_cmd_trace_data; + } + + for (uint32_t i = 0; i < new_submit->cmd_buffer_count; ++i) { + TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, new_submit->cmd_buffers[i]); + + if (!(cmdbuf->usage_flags & VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT) && + u_trace_has_points(&cmdbuf->trace)) { + /* A single command buffer could be submitted several times, but we + * already baked timestamp iova addresses and trace points are + * single-use. Therefore we have to copy trace points and create + * a new timestamp buffer on every submit of reusable command buffer. 
+ */ + if (tu_create_copy_timestamp_cs(cmdbuf, + &new_submit->cmd_buffer_trace_data[i].timestamp_copy_cs, + &new_submit->cmd_buffer_trace_data[i].trace) != VK_SUCCESS) { + result = vk_error(queue->device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + goto fail_copy_timestamp_cs; + } + assert(new_submit->cmd_buffer_trace_data[i].timestamp_copy_cs->entry_count == 1); + } else { + new_submit->cmd_buffer_trace_data[i].trace = &cmdbuf->trace; + } + } + } + /* Allocate without wait timeline semaphores */ new_submit->in_syncobjs = vk_zalloc(&queue->device->vk.alloc, (nr_in_syncobjs - new_submit->wait_timeline_count) * @@ -992,6 +1038,12 @@ tu_queue_submit_create_locked(struct tu_queue *queue, fail_out_syncobjs: vk_free(&queue->device->vk.alloc, new_submit->in_syncobjs); fail_in_syncobjs: + if (new_submit->cmd_buffer_trace_data) + tu_u_trace_cmd_data_finish(queue->device, new_submit->cmd_buffer_trace_data, + new_submit->cmd_buffer_count); +fail_copy_timestamp_cs: + vk_free(&queue->device->vk.alloc, new_submit->cmd_buffer_trace_data); +fail_cmd_trace_data: vk_free(&queue->device->vk.alloc, new_submit->cmds); fail_cmds: fail_signal_timelines: @@ -1059,6 +1111,23 @@ tu_queue_build_msm_gem_submit_cmds(struct tu_queue *queue, cmds[entry_idx].nr_relocs = 0; cmds[entry_idx].relocs = 0; } + + if (submit->cmd_buffer_trace_data) { + struct tu_cs *ts_cs = submit->cmd_buffer_trace_data[j].timestamp_copy_cs; + if (ts_cs) { + cmds[entry_idx].type = MSM_SUBMIT_CMD_BUF; + cmds[entry_idx].submit_idx = + queue->device->bo_idx[ts_cs->entries[0].bo->gem_handle]; + + assert(cmds[entry_idx].submit_idx < queue->device->bo_count); + + cmds[entry_idx].submit_offset = ts_cs->entries[0].offset; + cmds[entry_idx].size = ts_cs->entries[0].size; + cmds[entry_idx].pad = 0; + cmds[entry_idx].nr_relocs = 0; + cmds[entry_idx++].relocs = 0; + } + } } } @@ -1137,32 +1206,24 @@ tu_queue_submit_locked(struct tu_queue *queue, struct tu_queue_submit *submit) sem->timeline.highest_submitted = signal_value; } - if 
(u_trace_context_tracing(&queue->device->trace_context)) { - bool has_chunks = false; + if (submit->cmd_buffer_trace_data) { + struct tu_u_trace_flush_data *flush_data = + vk_alloc(&queue->device->vk.alloc, sizeof(struct tu_u_trace_flush_data), + 8, VK_SYSTEM_ALLOCATION_SCOPE_DEVICE); + flush_data->submission_id = queue->device->submit_count; + flush_data->syncobj = + vk_alloc(&queue->device->vk.alloc, sizeof(struct tu_u_trace_syncobj), + 8, VK_SYSTEM_ALLOCATION_SCOPE_DEVICE); + flush_data->syncobj->fence = req.fence; + flush_data->syncobj->msm_queue_id = queue->msm_queue_id; + + flush_data->cmd_trace_data = submit->cmd_buffer_trace_data; + flush_data->trace_count = submit->cmd_buffer_count; + submit->cmd_buffer_trace_data = NULL; + for (uint32_t i = 0; i < submit->cmd_buffer_count; i++) { - TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, submit->cmd_buffers[i]); - if (!list_is_empty(&cmdbuf->trace.trace_chunks)) { - has_chunks = true; - break; - } - } - - if (has_chunks) { - struct tu_u_trace_flush_data *flush_data = - vk_alloc(&queue->device->vk.alloc, sizeof(struct tu_u_trace_flush_data), - 8, VK_SYSTEM_ALLOCATION_SCOPE_DEVICE); - flush_data->submission_id = queue->device->submit_count; - flush_data->syncobj = - vk_alloc(&queue->device->vk.alloc, sizeof(struct tu_u_trace_syncobj), - 8, VK_SYSTEM_ALLOCATION_SCOPE_DEVICE); - flush_data->syncobj->fence = req.fence; - flush_data->syncobj->msm_queue_id = queue->msm_queue_id; - - for (uint32_t i = 0; i < submit->cmd_buffer_count; i++) { - TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, submit->cmd_buffers[i]); - bool free_data = i == (submit->cmd_buffer_count - 1); - u_trace_flush(&cmdbuf->trace, flush_data, free_data); - } + bool free_data = i == (submit->cmd_buffer_count - 1); + u_trace_flush(flush_data->cmd_trace_data[i].trace, flush_data, free_data); } } @@ -1320,8 +1381,6 @@ tu_device_wait_u_trace(struct tu_device *dev, struct tu_u_trace_syncobj *syncobj return VK_TIMEOUT; } - close(syncobj->fence); - return VK_SUCCESS; } diff --git 
a/src/freedreno/vulkan/tu_private.h b/src/freedreno/vulkan/tu_private.h index 03b4b2ebc07..150b6d28f1c 100644 --- a/src/freedreno/vulkan/tu_private.h +++ b/src/freedreno/vulkan/tu_private.h @@ -1728,10 +1728,35 @@ tu_signal_fences(struct tu_device *device, struct tu_syncobj *fence1, struct tu_ int tu_syncobj_to_fd(struct tu_device *device, struct tu_syncobj *sync); + +void +tu_copy_timestamp_buffer(struct u_trace_context *utctx, void *cmdstream, + void *ts_from, uint32_t from_offset, + void *ts_to, uint32_t to_offset, + uint32_t count); + + +VkResult +tu_create_copy_timestamp_cs(struct tu_cmd_buffer *cmdbuf, struct tu_cs** cs, + struct u_trace **trace_copy); + +struct tu_u_trace_cmd_data +{ + struct tu_cs *timestamp_copy_cs; + struct u_trace *trace; +}; + +void +tu_u_trace_cmd_data_finish(struct tu_device *device, + struct tu_u_trace_cmd_data *trace_data, + uint32_t entry_count); + struct tu_u_trace_flush_data { uint32_t submission_id; struct tu_u_trace_syncobj *syncobj; + uint32_t trace_count; + struct tu_u_trace_cmd_data *cmd_trace_data; }; #define TU_DEFINE_HANDLE_CASTS(__tu_type, __VkType) \