tu: Make userspace RD dump generic

Stop relying on the submit BO list, which won't exist with the new
"VM_BIND" uAPI. Instead, create a separate list in generic code, only
when dumping is enabled. As a bonus this means that it should work on
virtio and kgsl too, and more code is removed from the kernel backend.
We need to use the generic fence wait introduced in the previous commit.

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/32165>
This commit is contained in:
Connor Abbott 2024-12-13 15:41:43 -05:00 committed by Marge Bot
parent a8b2f45346
commit 6615cbfeaf
9 changed files with 121 additions and 49 deletions

View file

@ -2719,6 +2719,7 @@ fail_global_bo_map:
TU_RMV(resource_destroy, device, device->global_bo);
tu_bo_finish(device, device->global_bo);
vk_free(&device->vk.alloc, device->submit_bo_list);
util_dynarray_fini(&device->dump_bo_list);
fail_global_bo:
ir3_compiler_destroy(device->compiler);
util_sparse_array_finish(&device->bo_map);
@ -2823,6 +2824,7 @@ tu_DestroyDevice(VkDevice _device, const VkAllocationCallbacks *pAllocator)
pthread_cond_destroy(&device->timeline_cond);
_mesa_hash_table_destroy(device->bo_sizes, NULL);
vk_free(&device->vk.alloc, device->submit_bo_list);
util_dynarray_fini(&device->dump_bo_list);
vk_device_finish(&device->vk);
vk_free(&device->vk.alloc, device);
}
@ -3426,6 +3428,36 @@ tu_debug_bos_print_stats(struct tu_device *dev)
mtx_unlock(&dev->bo_mutex);
}
/* Register a freshly created BO on the device-wide dump list so the
 * generic RD dump path in queue submission can enumerate it.  No-op
 * (the BO stays marked untracked) unless RD dumping is enabled.
 */
void
tu_dump_bo_init(struct tu_device *dev, struct tu_bo *bo)
{
   /* ~0 == "not on the dump list"; checked by tu_dump_bo_del(). */
   bo->dump_bo_list_idx = ~0;

   if (!FD_RD_DUMP(ENABLE))
      return;

   mtx_lock(&dev->bo_mutex);
   /* The BO's slot is the current tail of the list. */
   bo->dump_bo_list_idx =
      util_dynarray_num_elements(&dev->dump_bo_list, struct tu_bo *);
   util_dynarray_append(&dev->dump_bo_list, struct tu_bo *, bo);
   mtx_unlock(&dev->bo_mutex);
}
/* Remove a BO from the device-wide dump list (O(1) swap-remove with the
 * last element).  Safe to call for BOs that were never registered
 * (dumping disabled at creation time): they keep dump_bo_list_idx == ~0.
 */
void
tu_dump_bo_del(struct tu_device *dev, struct tu_bo *bo)
{
   /* Compare against ~0u to keep the comparison fully unsigned. */
   if (bo->dump_bo_list_idx == ~0u)
      return;

   mtx_lock(&dev->bo_mutex);
   /* Swap-remove: move the last entry into this BO's slot so the list
    * stays dense.  When bo itself is the last entry this harmlessly
    * rewrites the just-popped slot.
    */
   struct tu_bo *exchanging_bo =
      util_dynarray_pop(&dev->dump_bo_list, struct tu_bo *);
   *util_dynarray_element(&dev->dump_bo_list, struct tu_bo *,
                          bo->dump_bo_list_idx) = exchanging_bo;
   exchanging_bo->dump_bo_list_idx = bo->dump_bo_list_idx;
   /* Reset the index so an accidental double-del cannot pop a live BO
    * and corrupt another BO's slot.
    */
   bo->dump_bo_list_idx = ~0;
   mtx_unlock(&dev->bo_mutex);
}
void
tu_CmdBeginDebugUtilsLabelEXT(VkCommandBuffer _commandBuffer,
const VkDebugUtilsLabelEXT *pLabelInfo)

View file

@ -347,6 +347,8 @@ struct tu_device
struct drm_msm_gem_submit_bo *submit_bo_list;
/* map bo handles to bo list index: */
uint32_t submit_bo_count, submit_bo_list_size;
/* bo list for dumping: */
struct util_dynarray dump_bo_list;
mtx_t bo_mutex;
/* protects imported BOs creation/freeing */
struct u_rwlock dma_bo_lock;
@ -581,4 +583,10 @@ tu_debug_bos_del(struct tu_device *dev, struct tu_bo *bo);
void
tu_debug_bos_print_stats(struct tu_device *dev);
void
tu_dump_bo_init(struct tu_device *dev, struct tu_bo *bo);
void
tu_dump_bo_del(struct tu_device *dev, struct tu_bo *bo);
#endif /* TU_DEVICE_H */

View file

@ -53,6 +53,8 @@ tu_bo_init_new_explicit_iova(struct tu_device *dev,
(*out_bo)->iova, (*out_bo)->size,
VK_DEVICE_ADDRESS_BINDING_TYPE_BIND_EXT);
(*out_bo)->dump = flags & TU_BO_ALLOC_ALLOW_DUMP;
return VK_SUCCESS;
}
@ -73,7 +75,7 @@ tu_bo_init_dmabuf(struct tu_device *dev,
*/
if (dev->physical_device->has_cached_non_coherent_memory)
(*bo)->cached_non_coherent = true;
return VK_SUCCESS;
}
@ -208,6 +210,8 @@ if (!(DETECT_ARCH_AARCH64 || DETECT_ARCH_X86 || DETECT_ARCH_X86_64))
/* Flag an existing BO so its contents are included in RD dumps: tell the
 * kernel backend via the knl vtable, and mirror that in the generic
 * bo->dump flag read by the userspace dump path.
 */
void tu_bo_allow_dump(struct tu_device *dev, struct tu_bo *bo)
{
dev->instance->knl->bo_allow_dump(dev, bo);
/* Atomic store: presumably bo->dump may be read concurrently by a
 * submitting thread — TODO(review): confirm against the dump loop.
 */
p_atomic_set(&bo->dump, true);
}
void

View file

@ -58,6 +58,7 @@ struct tu_bo {
int32_t refcnt;
uint32_t submit_bo_list_idx;
uint32_t dump_bo_list_idx;
#ifdef TU_HAS_KGSL
/* We have to store fd returned by ion_fd_data
@ -71,6 +72,8 @@ struct tu_bo {
bool never_unmap : 1;
bool cached_non_coherent : 1;
bool dump;
/* Pointer to the vk_object_base associated with the BO
* for the purposes of VK_EXT_device_address_binding_report
*/

View file

@ -77,6 +77,7 @@ tu_drm_bo_finish(struct tu_device *dev, struct tu_bo *bo)
TU_RMV(bo_destroy, dev, bo);
tu_debug_bos_del(dev, bo);
tu_dump_bo_del(dev, bo);
mtx_lock(&dev->bo_mutex);
dev->submit_bo_count--;

View file

@ -537,6 +537,8 @@ tu_bo_init(struct tu_device *dev,
mtx_unlock(&dev->bo_mutex);
tu_dump_bo_init(dev, bo);
TU_RMV(bo_allocate, dev, bo);
return VK_SUCCESS;
@ -798,7 +800,6 @@ msm_queue_submit(struct tu_queue *queue, void *_submit,
(struct tu_msm_queue_submit *)_submit;
struct drm_msm_gem_submit_syncobj *in_syncobjs, *out_syncobjs;
struct drm_msm_gem_submit req;
uint32_t submit_idx = queue->device->submit_count;
uint64_t gpu_offset = 0;
uint32_t entry_count =
util_dynarray_num_elements(&submit->commands, struct drm_msm_gem_submit_cmd);
@ -889,46 +890,6 @@ msm_queue_submit(struct tu_queue *queue, void *_submit,
.syncobj_stride = sizeof(struct drm_msm_gem_submit_syncobj),
};
if (req.nr_cmds && FD_RD_DUMP(ENABLE) &&
fd_rd_output_begin(&queue->device->rd_output, submit_idx)) {
struct tu_device *device = queue->device;
struct fd_rd_output *rd_output = &device->rd_output;
if (FD_RD_DUMP(FULL)) {
VkResult result = tu_wait_fence(device, queue->msm_queue_id, queue->fence, ~0);
if (result != VK_SUCCESS) {
mesa_loge("FD_RD_DUMP_FULL: wait on previous submission for device %u and queue %d failed: %u",
device->device_idx, queue->msm_queue_id, 0);
}
}
fd_rd_output_write_section(rd_output, RD_CHIP_ID, &device->physical_device->dev_id.chip_id, 8);
fd_rd_output_write_section(rd_output, RD_CMD, "tu-dump", 8);
for (unsigned i = 0; i < device->submit_bo_count; i++) {
struct drm_msm_gem_submit_bo bo = device->submit_bo_list[i];
struct tu_bo *tu_bo = tu_device_lookup_bo(device, bo.handle);
uint64_t iova = bo.presumed;
uint32_t buf[3] = { iova, tu_bo->size, iova >> 32 };
fd_rd_output_write_section(rd_output, RD_GPUADDR, buf, 12);
if (bo.flags & MSM_SUBMIT_BO_DUMP || FD_RD_DUMP(FULL)) {
tu_bo_map(device, tu_bo, NULL); /* note: this would need locking to be safe */
fd_rd_output_write_section(rd_output, RD_BUFFER_CONTENTS, tu_bo->map, tu_bo->size);
}
}
util_dynarray_foreach (&submit->commands, struct drm_msm_gem_submit_cmd,
cmd) {
uint64_t iova = device->submit_bo_list[cmd->submit_idx].presumed + cmd->submit_offset;
uint32_t size = cmd->size >> 2;
uint32_t buf[3] = { iova, size, iova >> 32 };
fd_rd_output_write_section(rd_output, RD_CMDSTREAM_ADDR, buf, 12);
}
fd_rd_output_end(rd_output);
}
ret = drmCommandWriteRead(queue->device->fd,
DRM_MSM_GEM_SUBMIT,
&req, sizeof(req));

View file

@ -582,6 +582,8 @@ tu_bo_init(struct tu_device *dev,
mtx_unlock(&dev->bo_mutex);
tu_dump_bo_init(dev, bo);
return VK_SUCCESS;
}

View file

@ -261,9 +261,9 @@ kgsl_bo_init(struct tu_device *dev,
* and the CPU mapping must stay fixed for the lifetime of the BO.
*/
bo->never_unmap = true;
}
tu_dump_bo_init(dev, bo);
*out_bo = bo;
@ -321,6 +321,8 @@ kgsl_bo_init_dmabuf(struct tu_device *dev,
.shared_fd = os_dupfd_cloexec(fd),
};
tu_dump_bo_init(dev, bo);
*out_bo = bo;
return VK_SUCCESS;
@ -380,6 +382,7 @@ kgsl_bo_finish(struct tu_device *dev, struct tu_bo *bo)
TU_RMV(bo_destroy, dev, bo);
tu_debug_bos_del(dev, bo);
tu_dump_bo_del(dev, bo);
struct kgsl_gpumem_free_id req = {
.id = bo->gem_handle

View file

@ -51,12 +51,27 @@ tu_get_submitqueue_priority(const struct tu_physical_device *pdevice,
return priority;
}
/* Add command-stream entries to the backend submit, and — when RD
 * dumping is enabled — mirror them into dump_cmds so the dump code can
 * emit RD_CMDSTREAM_ADDR sections for them later.
 */
static void
submit_add_entries(struct tu_device *dev, void *submit,
                   struct util_dynarray *dump_cmds,
                   struct tu_cs_entry *entries, unsigned num_entries)
{
   /* The backend always gets the entries, dumping or not. */
   tu_submit_add_entries(dev, submit, entries, num_entries);

   if (!FD_RD_DUMP(ENABLE))
      return;

   util_dynarray_append_array(dump_cmds, struct tu_cs_entry, entries,
                              num_entries);
}
static VkResult
queue_submit(struct vk_queue *_queue, struct vk_queue_submit *vk_submit)
{
struct tu_queue *queue = list_entry(_queue, struct tu_queue, vk);
struct tu_device *device = queue->device;
bool u_trace_enabled = u_trace_should_process(&queue->device->trace_context);
struct util_dynarray dump_cmds;
util_dynarray_init(&dump_cmds, NULL);
uint32_t perf_pass_index =
device->perfcntrs_pass_cs_entries ? vk_submit->perf_pass_index : ~0;
@ -102,28 +117,71 @@ queue_submit(struct vk_queue *_queue, struct vk_queue_submit *vk_submit)
struct tu_cs_entry *perf_cs_entry =
&cmd_buffer->device->perfcntrs_pass_cs_entries[perf_pass_index];
tu_submit_add_entries(device, submit, perf_cs_entry, 1);
submit_add_entries(device, submit, &dump_cmds, perf_cs_entry, 1);
}
tu_submit_add_entries(device, submit, cs->entries,
cs->entry_count);
submit_add_entries(device, submit, &dump_cmds, cs->entries,
cs->entry_count);
if (u_trace_submission_data &&
u_trace_submission_data->cmd_trace_data[i].timestamp_copy_cs) {
struct tu_cs_entry *trace_cs_entry =
&u_trace_submission_data->cmd_trace_data[i]
.timestamp_copy_cs->entries[0];
tu_submit_add_entries(device, submit, trace_cs_entry, 1);
submit_add_entries(device, submit, &dump_cmds, trace_cs_entry, 1);
}
}
if (tu_autotune_submit_requires_fence(cmd_buffers, cmdbuf_count)) {
struct tu_cs *autotune_cs = tu_autotune_on_submit(
device, &device->autotune, cmd_buffers, cmdbuf_count);
tu_submit_add_entries(device, submit, autotune_cs->entries,
autotune_cs->entry_count);
submit_add_entries(device, submit, &dump_cmds, autotune_cs->entries,
autotune_cs->entry_count);
}
if (cmdbuf_count && FD_RD_DUMP(ENABLE) &&
fd_rd_output_begin(&queue->device->rd_output,
queue->device->submit_count)) {
struct tu_device *device = queue->device;
struct fd_rd_output *rd_output = &device->rd_output;
if (FD_RD_DUMP(FULL)) {
VkResult result = tu_queue_wait_fence(queue, queue->fence, ~0);
if (result != VK_SUCCESS) {
mesa_loge("FD_RD_DUMP_FULL: wait on previous submission for device %u and queue %d failed: %u",
device->device_idx, queue->msm_queue_id, 0);
}
}
fd_rd_output_write_section(rd_output, RD_CHIP_ID, &device->physical_device->dev_id.chip_id, 8);
fd_rd_output_write_section(rd_output, RD_CMD, "tu-dump", 8);
mtx_lock(&device->bo_mutex);
util_dynarray_foreach (&device->dump_bo_list, struct tu_bo *, bo_ptr) {
struct tu_bo *bo = *bo_ptr;
uint64_t iova = bo->iova;
uint32_t buf[3] = { iova, bo->size, iova >> 32 };
fd_rd_output_write_section(rd_output, RD_GPUADDR, buf, 12);
if (bo->dump || FD_RD_DUMP(FULL)) {
tu_bo_map(device, bo, NULL); /* note: this would need locking to be safe */
fd_rd_output_write_section(rd_output, RD_BUFFER_CONTENTS, bo->map, bo->size);
}
}
mtx_unlock(&device->bo_mutex);
util_dynarray_foreach (&dump_cmds, struct tu_cs_entry, cmd) {
uint64_t iova = cmd->bo->iova + cmd->offset;
uint32_t size = cmd->size >> 2;
uint32_t buf[3] = { iova, size, iova >> 32 };
fd_rd_output_write_section(rd_output, RD_CMDSTREAM_ADDR, buf, 12);
}
fd_rd_output_end(rd_output);
}
util_dynarray_fini(&dump_cmds);
result =
tu_queue_submit(queue, submit, vk_submit->waits, vk_submit->wait_count,
vk_submit->signals, vk_submit->signal_count,