From 6615cbfeaf0b02ca8234b56bcde600716f42bfbc Mon Sep 17 00:00:00 2001 From: Connor Abbott Date: Fri, 13 Dec 2024 15:41:43 -0500 Subject: [PATCH] tu: Make userspace RD dump generic Stop relying on the submit BO list, which won't exist with the new "VM_BIND" uAPI. Instead, create a separate list in generic code, only when dumping is enabled. As a bonus this means that it should work on virtio and kgsl too, and more code is removed from the kernel backend. We need to use the generic fence wait introduced in the previous commit. Part-of: --- src/freedreno/vulkan/tu_device.cc | 32 +++++++++++ src/freedreno/vulkan/tu_device.h | 8 +++ src/freedreno/vulkan/tu_knl.cc | 6 +- src/freedreno/vulkan/tu_knl.h | 3 + src/freedreno/vulkan/tu_knl_drm.cc | 1 + src/freedreno/vulkan/tu_knl_drm_msm.cc | 43 +------------- src/freedreno/vulkan/tu_knl_drm_virtio.cc | 2 + src/freedreno/vulkan/tu_knl_kgsl.cc | 5 +- src/freedreno/vulkan/tu_queue.cc | 70 +++++++++++++++++++++-- 9 files changed, 121 insertions(+), 49 deletions(-) diff --git a/src/freedreno/vulkan/tu_device.cc b/src/freedreno/vulkan/tu_device.cc index 7ef7ee88637..514487e4c21 100644 --- a/src/freedreno/vulkan/tu_device.cc +++ b/src/freedreno/vulkan/tu_device.cc @@ -2719,6 +2719,7 @@ fail_global_bo_map: TU_RMV(resource_destroy, device, device->global_bo); tu_bo_finish(device, device->global_bo); vk_free(&device->vk.alloc, device->submit_bo_list); + util_dynarray_fini(&device->dump_bo_list); fail_global_bo: ir3_compiler_destroy(device->compiler); util_sparse_array_finish(&device->bo_map); @@ -2823,6 +2824,7 @@ tu_DestroyDevice(VkDevice _device, const VkAllocationCallbacks *pAllocator) pthread_cond_destroy(&device->timeline_cond); _mesa_hash_table_destroy(device->bo_sizes, NULL); vk_free(&device->vk.alloc, device->submit_bo_list); + util_dynarray_fini(&device->dump_bo_list); vk_device_finish(&device->vk); vk_free(&device->vk.alloc, device); } @@ -3426,6 +3428,36 @@ tu_debug_bos_print_stats(struct tu_device *dev) 
mtx_unlock(&dev->bo_mutex); } +void +tu_dump_bo_init(struct tu_device *dev, struct tu_bo *bo) +{ /* Register bo on dev->dump_bo_list when RD dumping is enabled. */ + bo->dump_bo_list_idx = ~0; /* ~0 means not on the dump list; tu_dump_bo_del checks for it */ + + if (!FD_RD_DUMP(ENABLE)) /* the list is only populated while dumping is enabled */ + return; + + mtx_lock(&dev->bo_mutex); /* list updates happen with bo_mutex held */ + uint32_t idx = + util_dynarray_num_elements(&dev->dump_bo_list, struct tu_bo *); + bo->dump_bo_list_idx = idx; /* bo is appended, so its slot is the current element count */ + util_dynarray_append(&dev->dump_bo_list, struct tu_bo *, bo); + mtx_unlock(&dev->bo_mutex); +} + +void +tu_dump_bo_del(struct tu_device *dev, struct tu_bo *bo) +{ /* Remove bo from dev->dump_bo_list; does nothing if it was never added. */ + if (bo->dump_bo_list_idx != ~0) { + mtx_lock(&dev->bo_mutex); + struct tu_bo *exchanging_bo = /* last list entry, moved into the slot bo vacates */ + util_dynarray_pop(&dev->dump_bo_list, struct tu_bo *); + *util_dynarray_element(&dev->dump_bo_list, struct tu_bo *, + bo->dump_bo_list_idx) = exchanging_bo; + exchanging_bo->dump_bo_list_idx = bo->dump_bo_list_idx; /* moved entry now lives at the vacated index */ + mtx_unlock(&dev->bo_mutex); + } +} + void tu_CmdBeginDebugUtilsLabelEXT(VkCommandBuffer _commandBuffer, const VkDebugUtilsLabelEXT *pLabelInfo) diff --git a/src/freedreno/vulkan/tu_device.h b/src/freedreno/vulkan/tu_device.h index ac9dc35b51c..27652f82652 100644 --- a/src/freedreno/vulkan/tu_device.h +++ b/src/freedreno/vulkan/tu_device.h @@ -347,6 +347,8 @@ struct tu_device struct drm_msm_gem_submit_bo *submit_bo_list; /* map bo handles to bo list index: */ uint32_t submit_bo_count, submit_bo_list_size; + /* bo list for dumping: */ + struct util_dynarray dump_bo_list; /* elements are struct tu_bo pointers */ mtx_t bo_mutex; /* protects imported BOs creation/freeing */ struct u_rwlock dma_bo_lock; @@ -581,4 +583,10 @@ tu_debug_bos_del(struct tu_device *dev, struct tu_bo *bo); void tu_debug_bos_print_stats(struct tu_device *dev); +void +tu_dump_bo_init(struct tu_device *dev, struct tu_bo *bo); +void +tu_dump_bo_del(struct tu_device *dev, struct tu_bo *bo); + + #endif /* TU_DEVICE_H */ diff --git a/src/freedreno/vulkan/tu_knl.cc b/src/freedreno/vulkan/tu_knl.cc index bc797b32cbf..80e3900c3a9 100644 --- a/src/freedreno/vulkan/tu_knl.cc +++ b/src/freedreno/vulkan/tu_knl.cc @@ -53,6 +53,8 @@ tu_bo_init_new_explicit_iova(struct tu_device 
*dev, (*out_bo)->iova, (*out_bo)->size, VK_DEVICE_ADDRESS_BINDING_TYPE_BIND_EXT); + (*out_bo)->dump = flags & TU_BO_ALLOC_ALLOW_DUMP; /* BOs with dump set get their contents written into RD dumps by queue_submit */ + return VK_SUCCESS; } @@ -73,7 +75,7 @@ tu_bo_init_dmabuf(struct tu_device *dev, */ if (dev->physical_device->has_cached_non_coherent_memory) (*bo)->cached_non_coherent = true; - + return VK_SUCCESS; } @@ -208,6 +210,8 @@ if (!(DETECT_ARCH_AARCH64 || DETECT_ARCH_X86 || DETECT_ARCH_X86_64)) void tu_bo_allow_dump(struct tu_device *dev, struct tu_bo *bo) { dev->instance->knl->bo_allow_dump(dev, bo); + + p_atomic_set(&bo->dump, true); /* also set the generic per-BO flag that the userspace RD dump reads */ } void diff --git a/src/freedreno/vulkan/tu_knl.h b/src/freedreno/vulkan/tu_knl.h index 5914de2e7c4..b41f203b04b 100644 --- a/src/freedreno/vulkan/tu_knl.h +++ b/src/freedreno/vulkan/tu_knl.h @@ -58,6 +58,7 @@ struct tu_bo { int32_t refcnt; uint32_t submit_bo_list_idx; + uint32_t dump_bo_list_idx; /* slot in tu_device dump_bo_list, ~0 when not on the list */ #ifdef TU_HAS_KGSL /* We have to store fd returned by ion_fd_data @@ -71,6 +72,8 @@ struct tu_bo { bool never_unmap : 1; bool cached_non_coherent : 1; + bool dump; + /* Pointer to the vk_object_base associated with the BO * for the purposes of VK_EXT_device_address_binding_report */ diff --git a/src/freedreno/vulkan/tu_knl_drm.cc b/src/freedreno/vulkan/tu_knl_drm.cc index ec64249d2b9..27916d0fee8 100644 --- a/src/freedreno/vulkan/tu_knl_drm.cc +++ b/src/freedreno/vulkan/tu_knl_drm.cc @@ -77,6 +77,7 @@ tu_drm_bo_finish(struct tu_device *dev, struct tu_bo *bo) TU_RMV(bo_destroy, dev, bo); tu_debug_bos_del(dev, bo); + tu_dump_bo_del(dev, bo); /* drop bo from the RD dump list before it is torn down */ mtx_lock(&dev->bo_mutex); dev->submit_bo_count--; diff --git a/src/freedreno/vulkan/tu_knl_drm_msm.cc b/src/freedreno/vulkan/tu_knl_drm_msm.cc index e98b960e289..5388b321e2b 100644 --- a/src/freedreno/vulkan/tu_knl_drm_msm.cc +++ b/src/freedreno/vulkan/tu_knl_drm_msm.cc @@ -537,6 +537,8 @@ tu_bo_init(struct tu_device *dev, mtx_unlock(&dev->bo_mutex); + tu_dump_bo_init(dev, bo); /* called after bo_mutex is released; tu_dump_bo_init takes it itself */ + TU_RMV(bo_allocate, dev, bo); return VK_SUCCESS; @@ -798,7 +800,6 @@ msm_queue_submit(struct 
tu_queue *queue, void *_submit, (struct tu_msm_queue_submit *)_submit; struct drm_msm_gem_submit_syncobj *in_syncobjs, *out_syncobjs; struct drm_msm_gem_submit req; - uint32_t submit_idx = queue->device->submit_count; uint64_t gpu_offset = 0; uint32_t entry_count = util_dynarray_num_elements(&submit->commands, struct drm_msm_gem_submit_cmd); @@ -889,46 +890,6 @@ msm_queue_submit(struct tu_queue *queue, void *_submit, .syncobj_stride = sizeof(struct drm_msm_gem_submit_syncobj), }; - if (req.nr_cmds && FD_RD_DUMP(ENABLE) && - fd_rd_output_begin(&queue->device->rd_output, submit_idx)) { - struct tu_device *device = queue->device; - struct fd_rd_output *rd_output = &device->rd_output; - - if (FD_RD_DUMP(FULL)) { - VkResult result = tu_wait_fence(device, queue->msm_queue_id, queue->fence, ~0); - if (result != VK_SUCCESS) { - mesa_loge("FD_RD_DUMP_FULL: wait on previous submission for device %u and queue %d failed: %u", - device->device_idx, queue->msm_queue_id, 0); - } - } - - fd_rd_output_write_section(rd_output, RD_CHIP_ID, &device->physical_device->dev_id.chip_id, 8); - fd_rd_output_write_section(rd_output, RD_CMD, "tu-dump", 8); - - for (unsigned i = 0; i < device->submit_bo_count; i++) { - struct drm_msm_gem_submit_bo bo = device->submit_bo_list[i]; - struct tu_bo *tu_bo = tu_device_lookup_bo(device, bo.handle); - uint64_t iova = bo.presumed; - - uint32_t buf[3] = { iova, tu_bo->size, iova >> 32 }; - fd_rd_output_write_section(rd_output, RD_GPUADDR, buf, 12); - if (bo.flags & MSM_SUBMIT_BO_DUMP || FD_RD_DUMP(FULL)) { - tu_bo_map(device, tu_bo, NULL); /* note: this would need locking to be safe */ - fd_rd_output_write_section(rd_output, RD_BUFFER_CONTENTS, tu_bo->map, tu_bo->size); - } - } - - util_dynarray_foreach (&submit->commands, struct drm_msm_gem_submit_cmd, - cmd) { - uint64_t iova = device->submit_bo_list[cmd->submit_idx].presumed + cmd->submit_offset; - uint32_t size = cmd->size >> 2; - uint32_t buf[3] = { iova, size, iova >> 32 }; - 
fd_rd_output_write_section(rd_output, RD_CMDSTREAM_ADDR, buf, 12); - } - - fd_rd_output_end(rd_output); - } - ret = drmCommandWriteRead(queue->device->fd, DRM_MSM_GEM_SUBMIT, &req, sizeof(req)); diff --git a/src/freedreno/vulkan/tu_knl_drm_virtio.cc b/src/freedreno/vulkan/tu_knl_drm_virtio.cc index ab466dae950..c4630f3ea48 100644 --- a/src/freedreno/vulkan/tu_knl_drm_virtio.cc +++ b/src/freedreno/vulkan/tu_knl_drm_virtio.cc @@ -582,6 +582,8 @@ tu_bo_init(struct tu_device *dev, mtx_unlock(&dev->bo_mutex); + tu_dump_bo_init(dev, bo); /* called after bo_mutex is released; tu_dump_bo_init takes it itself */ + return VK_SUCCESS; } diff --git a/src/freedreno/vulkan/tu_knl_kgsl.cc b/src/freedreno/vulkan/tu_knl_kgsl.cc index 26284929e15..b6e17fd4ebd 100644 --- a/src/freedreno/vulkan/tu_knl_kgsl.cc +++ b/src/freedreno/vulkan/tu_knl_kgsl.cc @@ -261,9 +261,9 @@ kgsl_bo_init(struct tu_device *dev, * and the CPU mapping must stay fixed for the lifetime of the BO. */ bo->never_unmap = true; - } + tu_dump_bo_init(dev, bo); /* add bo to the generic RD dump list (only when dumping is enabled) */ *out_bo = bo; @@ -321,6 +321,8 @@ kgsl_bo_init_dmabuf(struct tu_device *dev, .shared_fd = os_dupfd_cloexec(fd), }; + tu_dump_bo_init(dev, bo); /* imported BOs are added to the dump list as well */ + *out_bo = bo; return VK_SUCCESS; @@ -380,6 +382,7 @@ kgsl_bo_finish(struct tu_device *dev, struct tu_bo *bo) TU_RMV(bo_destroy, dev, bo); tu_debug_bos_del(dev, bo); + tu_dump_bo_del(dev, bo); /* drop bo from the RD dump list before it is freed */ struct kgsl_gpumem_free_id req = { .id = bo->gem_handle diff --git a/src/freedreno/vulkan/tu_queue.cc b/src/freedreno/vulkan/tu_queue.cc index dcefcb94ce7..6c5632ba311 100644 --- a/src/freedreno/vulkan/tu_queue.cc +++ b/src/freedreno/vulkan/tu_queue.cc @@ -51,12 +51,27 @@ tu_get_submitqueue_priority(const struct tu_physical_device *pdevice, return priority; } +static void +submit_add_entries(struct tu_device *dev, void *submit, + struct util_dynarray *dump_cmds, + struct tu_cs_entry *entries, unsigned num_entries) +{ /* Add entries to the submit and, when RD dumping is enabled, also record them in dump_cmds. */ + tu_submit_add_entries(dev, submit, entries, num_entries); /* the submit itself always gets the entries */ + if (FD_RD_DUMP(ENABLE)) { /* dump_cmds stays empty when dumping is disabled */ + util_dynarray_append_array(dump_cmds, struct tu_cs_entry, entries, + num_entries); /* entries are copied by value as struct tu_cs_entry */ + } +} + +static 
VkResult queue_submit(struct vk_queue *_queue, struct vk_queue_submit *vk_submit) { struct tu_queue *queue = list_entry(_queue, struct tu_queue, vk); struct tu_device *device = queue->device; bool u_trace_enabled = u_trace_should_process(&queue->device->trace_context); + struct util_dynarray dump_cmds; + + util_dynarray_init(&dump_cmds, NULL); uint32_t perf_pass_index = device->perfcntrs_pass_cs_entries ? vk_submit->perf_pass_index : ~0; @@ -102,28 +117,71 @@ queue_submit(struct vk_queue *_queue, struct vk_queue_submit *vk_submit) struct tu_cs_entry *perf_cs_entry = &cmd_buffer->device->perfcntrs_pass_cs_entries[perf_pass_index]; - tu_submit_add_entries(device, submit, perf_cs_entry, 1); + submit_add_entries(device, submit, &dump_cmds, perf_cs_entry, 1); } - tu_submit_add_entries(device, submit, cs->entries, - cs->entry_count); + submit_add_entries(device, submit, &dump_cmds, cs->entries, + cs->entry_count); if (u_trace_submission_data && u_trace_submission_data->cmd_trace_data[i].timestamp_copy_cs) { struct tu_cs_entry *trace_cs_entry = &u_trace_submission_data->cmd_trace_data[i] .timestamp_copy_cs->entries[0]; - tu_submit_add_entries(device, submit, trace_cs_entry, 1); + submit_add_entries(device, submit, &dump_cmds, trace_cs_entry, 1); } } if (tu_autotune_submit_requires_fence(cmd_buffers, cmdbuf_count)) { struct tu_cs *autotune_cs = tu_autotune_on_submit( device, &device->autotune, cmd_buffers, cmdbuf_count); - tu_submit_add_entries(device, submit, autotune_cs->entries, - autotune_cs->entry_count); + submit_add_entries(device, submit, &dump_cmds, autotune_cs->entries, + autotune_cs->entry_count); } + if (cmdbuf_count && FD_RD_DUMP(ENABLE) && + fd_rd_output_begin(&queue->device->rd_output, + queue->device->submit_count)) { + struct tu_device *device = queue->device; + struct fd_rd_output *rd_output = &device->rd_output; + + if (FD_RD_DUMP(FULL)) { + VkResult result = tu_queue_wait_fence(queue, queue->fence, ~0); + if (result != VK_SUCCESS) { + 
mesa_loge("FD_RD_DUMP_FULL: wait on previous submission for device %u and queue %d failed: %d", + device->device_idx, queue->msm_queue_id, result); /* log the VkResult of the failed wait, not a constant 0 */ + } + } + + fd_rd_output_write_section(rd_output, RD_CHIP_ID, &device->physical_device->dev_id.chip_id, 8); + fd_rd_output_write_section(rd_output, RD_CMD, "tu-dump", 8); + + mtx_lock(&device->bo_mutex); /* hold bo_mutex while walking dump_bo_list, which BO init and del modify under the same lock */ + util_dynarray_foreach (&device->dump_bo_list, struct tu_bo *, bo_ptr) { + struct tu_bo *bo = *bo_ptr; + uint64_t iova = bo->iova; + + uint32_t buf[3] = { iova, bo->size, iova >> 32 }; /* low 32 address bits, size, high 32 address bits */ + fd_rd_output_write_section(rd_output, RD_GPUADDR, buf, 12); /* every listed BO gets an address section */ + if (bo->dump || FD_RD_DUMP(FULL)) { /* contents only for BOs flagged for dumping, or for all BOs in FULL mode */ + tu_bo_map(device, bo, NULL); /* note: this would need locking to be safe */ + fd_rd_output_write_section(rd_output, RD_BUFFER_CONTENTS, bo->map, bo->size); + } + } + mtx_unlock(&device->bo_mutex); + + util_dynarray_foreach (&dump_cmds, struct tu_cs_entry, cmd) { /* one RD_CMDSTREAM_ADDR section per entry recorded by submit_add_entries */ + uint64_t iova = cmd->bo->iova + cmd->offset; + uint32_t size = cmd->size >> 2; /* size divided by 4; presumably bytes to dwords, TODO confirm */ + uint32_t buf[3] = { iova, size, iova >> 32 }; + fd_rd_output_write_section(rd_output, RD_CMDSTREAM_ADDR, buf, 12); + } + + fd_rd_output_end(rd_output); + } + + util_dynarray_fini(&dump_cmds); /* the recorded entries are only needed for the dump above */ + result = tu_queue_submit(queue, submit, vk_submit->waits, vk_submit->wait_count, vk_submit->signals, vk_submit->signal_count,