tu: Make userspace RD dump generic

Stop relying on the submit BO list, which won't exist with the new
"VM_BIND" uAPI. Instead, create a separate list in generic code, only
when dumping is enabled. As a bonus this means that it should work on
virtio and kgsl too, and more code is removed from the kernel backend.
We need to use the generic fence wait introduced in the previous commit.

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/32165>
This commit is contained in:
Connor Abbott 2024-12-13 15:41:43 -05:00 committed by Marge Bot
parent a8b2f45346
commit 6615cbfeaf
9 changed files with 121 additions and 49 deletions

View file

@ -2719,6 +2719,7 @@ fail_global_bo_map:
TU_RMV(resource_destroy, device, device->global_bo);
tu_bo_finish(device, device->global_bo);
vk_free(&device->vk.alloc, device->submit_bo_list);
util_dynarray_fini(&device->dump_bo_list);
fail_global_bo:
ir3_compiler_destroy(device->compiler);
util_sparse_array_finish(&device->bo_map);
@ -2823,6 +2824,7 @@ tu_DestroyDevice(VkDevice _device, const VkAllocationCallbacks *pAllocator)
pthread_cond_destroy(&device->timeline_cond);
_mesa_hash_table_destroy(device->bo_sizes, NULL);
vk_free(&device->vk.alloc, device->submit_bo_list);
util_dynarray_fini(&device->dump_bo_list);
vk_device_finish(&device->vk);
vk_free(&device->vk.alloc, device);
}
@ -3426,6 +3428,36 @@ tu_debug_bos_print_stats(struct tu_device *dev)
mtx_unlock(&dev->bo_mutex);
}
/* Register a freshly created BO on the device-wide dump list so the
 * generic RD dump path in queue submission can enumerate it.  No-op
 * (the BO stays marked untracked) unless RD dumping is enabled.
 */
void
tu_dump_bo_init(struct tu_device *dev, struct tu_bo *bo)
{
   /* ~0 == "not on the dump list"; checked by tu_dump_bo_del(). */
   bo->dump_bo_list_idx = ~0;

   if (!FD_RD_DUMP(ENABLE))
      return;

   mtx_lock(&dev->bo_mutex);
   /* The BO's slot is the current tail of the list. */
   bo->dump_bo_list_idx =
      util_dynarray_num_elements(&dev->dump_bo_list, struct tu_bo *);
   util_dynarray_append(&dev->dump_bo_list, struct tu_bo *, bo);
   mtx_unlock(&dev->bo_mutex);
}
/* Remove a BO from the device-wide dump list (O(1) swap-remove with the
 * last element).  Safe to call for BOs that were never registered
 * (dumping disabled at creation time): they keep dump_bo_list_idx == ~0.
 */
void
tu_dump_bo_del(struct tu_device *dev, struct tu_bo *bo)
{
   /* Compare against ~0u to keep the comparison fully unsigned. */
   if (bo->dump_bo_list_idx == ~0u)
      return;

   mtx_lock(&dev->bo_mutex);
   /* Swap-remove: move the last entry into this BO's slot so the list
    * stays dense.  When bo itself is the last entry this harmlessly
    * rewrites the just-popped slot.
    */
   struct tu_bo *exchanging_bo =
      util_dynarray_pop(&dev->dump_bo_list, struct tu_bo *);
   *util_dynarray_element(&dev->dump_bo_list, struct tu_bo *,
                          bo->dump_bo_list_idx) = exchanging_bo;
   exchanging_bo->dump_bo_list_idx = bo->dump_bo_list_idx;
   /* Reset the index so an accidental double-del cannot pop a live BO
    * and corrupt another BO's slot.
    */
   bo->dump_bo_list_idx = ~0;
   mtx_unlock(&dev->bo_mutex);
}
void
tu_CmdBeginDebugUtilsLabelEXT(VkCommandBuffer _commandBuffer,
const VkDebugUtilsLabelEXT *pLabelInfo)

View file

@ -347,6 +347,8 @@ struct tu_device
struct drm_msm_gem_submit_bo *submit_bo_list;
/* map bo handles to bo list index: */
uint32_t submit_bo_count, submit_bo_list_size;
/* bo list for dumping: */
struct util_dynarray dump_bo_list;
mtx_t bo_mutex;
/* protects imported BOs creation/freeing */
struct u_rwlock dma_bo_lock;
@ -581,4 +583,10 @@ tu_debug_bos_del(struct tu_device *dev, struct tu_bo *bo);
void
tu_debug_bos_print_stats(struct tu_device *dev);
void
tu_dump_bo_init(struct tu_device *dev, struct tu_bo *bo);
void
tu_dump_bo_del(struct tu_device *dev, struct tu_bo *bo);
#endif /* TU_DEVICE_H */

View file

@ -53,6 +53,8 @@ tu_bo_init_new_explicit_iova(struct tu_device *dev,
(*out_bo)->iova, (*out_bo)->size,
VK_DEVICE_ADDRESS_BINDING_TYPE_BIND_EXT);
(*out_bo)->dump = flags & TU_BO_ALLOC_ALLOW_DUMP;
return VK_SUCCESS;
}
@ -73,7 +75,7 @@ tu_bo_init_dmabuf(struct tu_device *dev,
*/
if (dev->physical_device->has_cached_non_coherent_memory)
(*bo)->cached_non_coherent = true;
return VK_SUCCESS;
}
@ -208,6 +210,8 @@ if (!(DETECT_ARCH_AARCH64 || DETECT_ARCH_X86 || DETECT_ARCH_X86_64))
/* Flag an existing BO so its contents are included in RD dumps: tell the
 * kernel backend via the knl vtable, and mirror that in the generic
 * bo->dump flag read by the userspace dump path.
 */
void tu_bo_allow_dump(struct tu_device *dev, struct tu_bo *bo)
{
dev->instance->knl->bo_allow_dump(dev, bo);
/* Atomic store: presumably bo->dump may be read concurrently by a
 * submitting thread — TODO(review): confirm against the dump loop.
 */
p_atomic_set(&bo->dump, true);
}
void

View file

@ -58,6 +58,7 @@ struct tu_bo {
int32_t refcnt;
uint32_t submit_bo_list_idx;
uint32_t dump_bo_list_idx;
#ifdef TU_HAS_KGSL
/* We have to store fd returned by ion_fd_data
@ -71,6 +72,8 @@ struct tu_bo {
bool never_unmap : 1;
bool cached_non_coherent : 1;
bool dump;
/* Pointer to the vk_object_base associated with the BO
* for the purposes of VK_EXT_device_address_binding_report
*/

View file

@ -77,6 +77,7 @@ tu_drm_bo_finish(struct tu_device *dev, struct tu_bo *bo)
TU_RMV(bo_destroy, dev, bo);
tu_debug_bos_del(dev, bo);
tu_dump_bo_del(dev, bo);
mtx_lock(&dev->bo_mutex);
dev->submit_bo_count--;

View file

@ -537,6 +537,8 @@ tu_bo_init(struct tu_device *dev,
mtx_unlock(&dev->bo_mutex);
tu_dump_bo_init(dev, bo);
TU_RMV(bo_allocate, dev, bo);
return VK_SUCCESS;
@ -798,7 +800,6 @@ msm_queue_submit(struct tu_queue *queue, void *_submit,
(struct tu_msm_queue_submit *)_submit;
struct drm_msm_gem_submit_syncobj *in_syncobjs, *out_syncobjs;
struct drm_msm_gem_submit req;
uint32_t submit_idx = queue->device->submit_count;
uint64_t gpu_offset = 0;
uint32_t entry_count =
util_dynarray_num_elements(&submit->commands, struct drm_msm_gem_submit_cmd);
@ -889,46 +890,6 @@ msm_queue_submit(struct tu_queue *queue, void *_submit,
.syncobj_stride = sizeof(struct drm_msm_gem_submit_syncobj),
};
if (req.nr_cmds && FD_RD_DUMP(ENABLE) &&
fd_rd_output_begin(&queue->device->rd_output, submit_idx)) {
struct tu_device *device = queue->device;
struct fd_rd_output *rd_output = &device->rd_output;
if (FD_RD_DUMP(FULL)) {
VkResult result = tu_wait_fence(device, queue->msm_queue_id, queue->fence, ~0);
if (result != VK_SUCCESS) {
mesa_loge("FD_RD_DUMP_FULL: wait on previous submission for device %u and queue %d failed: %u",
device->device_idx, queue->msm_queue_id, 0);
}
}
fd_rd_output_write_section(rd_output, RD_CHIP_ID, &device->physical_device->dev_id.chip_id, 8);
fd_rd_output_write_section(rd_output, RD_CMD, "tu-dump", 8);
for (unsigned i = 0; i < device->submit_bo_count; i++) {
struct drm_msm_gem_submit_bo bo = device->submit_bo_list[i];
struct tu_bo *tu_bo = tu_device_lookup_bo(device, bo.handle);
uint64_t iova = bo.presumed;
uint32_t buf[3] = { iova, tu_bo->size, iova >> 32 };
fd_rd_output_write_section(rd_output, RD_GPUADDR, buf, 12);
if (bo.flags & MSM_SUBMIT_BO_DUMP || FD_RD_DUMP(FULL)) {
tu_bo_map(device, tu_bo, NULL); /* note: this would need locking to be safe */
fd_rd_output_write_section(rd_output, RD_BUFFER_CONTENTS, tu_bo->map, tu_bo->size);
}
}
util_dynarray_foreach (&submit->commands, struct drm_msm_gem_submit_cmd,
cmd) {
uint64_t iova = device->submit_bo_list[cmd->submit_idx].presumed + cmd->submit_offset;
uint32_t size = cmd->size >> 2;
uint32_t buf[3] = { iova, size, iova >> 32 };
fd_rd_output_write_section(rd_output, RD_CMDSTREAM_ADDR, buf, 12);
}
fd_rd_output_end(rd_output);
}
ret = drmCommandWriteRead(queue->device->fd,
DRM_MSM_GEM_SUBMIT,
&req, sizeof(req));

View file

@ -582,6 +582,8 @@ tu_bo_init(struct tu_device *dev,
mtx_unlock(&dev->bo_mutex);
tu_dump_bo_init(dev, bo);
return VK_SUCCESS;
}

View file

@ -261,9 +261,9 @@ kgsl_bo_init(struct tu_device *dev,
* and the CPU mapping must stay fixed for the lifetime of the BO.
*/
bo->never_unmap = true;
}
tu_dump_bo_init(dev, bo);
*out_bo = bo;
@ -321,6 +321,8 @@ kgsl_bo_init_dmabuf(struct tu_device *dev,
.shared_fd = os_dupfd_cloexec(fd),
};
tu_dump_bo_init(dev, bo);
*out_bo = bo;
return VK_SUCCESS;
@ -380,6 +382,7 @@ kgsl_bo_finish(struct tu_device *dev, struct tu_bo *bo)
TU_RMV(bo_destroy, dev, bo);
tu_debug_bos_del(dev, bo);
tu_dump_bo_del(dev, bo);
struct kgsl_gpumem_free_id req = {
.id = bo->gem_handle

View file

@ -51,12 +51,27 @@ tu_get_submitqueue_priority(const struct tu_physical_device *pdevice,
return priority;
}
/* Add command-stream entries to the backend submit, and — when RD
 * dumping is enabled — mirror them into dump_cmds so the dump code can
 * emit RD_CMDSTREAM_ADDR sections for them later.
 */
static void
submit_add_entries(struct tu_device *dev, void *submit,
                   struct util_dynarray *dump_cmds,
                   struct tu_cs_entry *entries, unsigned num_entries)
{
   /* The backend always gets the entries, dumping or not. */
   tu_submit_add_entries(dev, submit, entries, num_entries);

   if (!FD_RD_DUMP(ENABLE))
      return;

   util_dynarray_append_array(dump_cmds, struct tu_cs_entry, entries,
                              num_entries);
}
static VkResult
queue_submit(struct vk_queue *_queue, struct vk_queue_submit *vk_submit)
{
struct tu_queue *queue = list_entry(_queue, struct tu_queue, vk);
struct tu_device *device = queue->device;
bool u_trace_enabled = u_trace_should_process(&queue->device->trace_context);
struct util_dynarray dump_cmds;
util_dynarray_init(&dump_cmds, NULL);
uint32_t perf_pass_index =
device->perfcntrs_pass_cs_entries ? vk_submit->perf_pass_index : ~0;
@ -102,28 +117,71 @@ queue_submit(struct vk_queue *_queue, struct vk_queue_submit *vk_submit)
struct tu_cs_entry *perf_cs_entry =
&cmd_buffer->device->perfcntrs_pass_cs_entries[perf_pass_index];
tu_submit_add_entries(device, submit, perf_cs_entry, 1);
submit_add_entries(device, submit, &dump_cmds, perf_cs_entry, 1);
}
tu_submit_add_entries(device, submit, cs->entries,
cs->entry_count);
submit_add_entries(device, submit, &dump_cmds, cs->entries,
cs->entry_count);
if (u_trace_submission_data &&
u_trace_submission_data->cmd_trace_data[i].timestamp_copy_cs) {
struct tu_cs_entry *trace_cs_entry =
&u_trace_submission_data->cmd_trace_data[i]
.timestamp_copy_cs->entries[0];
tu_submit_add_entries(device, submit, trace_cs_entry, 1);
submit_add_entries(device, submit, &dump_cmds, trace_cs_entry, 1);
}
}
if (tu_autotune_submit_requires_fence(cmd_buffers, cmdbuf_count)) {
struct tu_cs *autotune_cs = tu_autotune_on_submit(
device, &device->autotune, cmd_buffers, cmdbuf_count);
tu_submit_add_entries(device, submit, autotune_cs->entries,
autotune_cs->entry_count);
submit_add_entries(device, submit, &dump_cmds, autotune_cs->entries,
autotune_cs->entry_count);
}
if (cmdbuf_count && FD_RD_DUMP(ENABLE) &&
fd_rd_output_begin(&queue->device->rd_output,
queue->device->submit_count)) {
struct tu_device *device = queue->device;
struct fd_rd_output *rd_output = &device->rd_output;
if (FD_RD_DUMP(FULL)) {
VkResult result = tu_queue_wait_fence(queue, queue->fence, ~0);
if (result != VK_SUCCESS) {
mesa_loge("FD_RD_DUMP_FULL: wait on previous submission for device %u and queue %d failed: %u",
device->device_idx, queue->msm_queue_id, 0);
}
}
fd_rd_output_write_section(rd_output, RD_CHIP_ID, &device->physical_device->dev_id.chip_id, 8);
fd_rd_output_write_section(rd_output, RD_CMD, "tu-dump", 8);
mtx_lock(&device->bo_mutex);
util_dynarray_foreach (&device->dump_bo_list, struct tu_bo *, bo_ptr) {
struct tu_bo *bo = *bo_ptr;
uint64_t iova = bo->iova;
uint32_t buf[3] = { iova, bo->size, iova >> 32 };
fd_rd_output_write_section(rd_output, RD_GPUADDR, buf, 12);
if (bo->dump || FD_RD_DUMP(FULL)) {
tu_bo_map(device, bo, NULL); /* note: this would need locking to be safe */
fd_rd_output_write_section(rd_output, RD_BUFFER_CONTENTS, bo->map, bo->size);
}
}
mtx_unlock(&device->bo_mutex);
util_dynarray_foreach (&dump_cmds, struct tu_cs_entry, cmd) {
uint64_t iova = cmd->bo->iova + cmd->offset;
uint32_t size = cmd->size >> 2;
uint32_t buf[3] = { iova, size, iova >> 32 };
fd_rd_output_write_section(rd_output, RD_CMDSTREAM_ADDR, buf, 12);
}
fd_rd_output_end(rd_output);
}
util_dynarray_fini(&dump_cmds);
result =
tu_queue_submit(queue, submit, vk_submit->waits, vk_submit->wait_count,
vk_submit->signals, vk_submit->signal_count,