tu/drm: Enable VM_BIND
Use a new driver-internal VM_BIND submit queue for mapping and unmapping "normal" BOs. This will be required for sparse, because we can't mix the old and new interface, but it should also allow us to stop using "zombie" VMAs and the bo list.

Also use MSM_BO_NO_SHARE, which we assume is available when VM_BIND is. This should significantly reduce kernel submit overhead, in parallel to the userspace submit overhead cut by using VM_BIND.

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/32533>
parent 460ed35916
commit 4efbfa1441
4 changed files with 262 additions and 81 deletions
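For orientation before the diff: the change boils down to two new kernel interactions, distilled here as a minimal standalone sketch. This is not the driver code itself (that is msm_device_init() and tu_map_vm_bind() in the hunks below); the helper names and the include path are illustrative assumptions, while the structs and flags (drm_msm_submitqueue, drm_msm_vm_bind, MSM_SUBMITQUEUE_VM_BIND, MSM_VM_BIND_OP_MAP, MSM_VM_BIND_FENCE_FD_OUT) come from the new msm_drm.h VM_BIND uAPI this commit depends on. Error handling and fence bookkeeping are trimmed.

/* Sketch only: create the driver-internal VM_BIND submit queue once at
 * device init, then map one BO into the GPU VM with a single MAP op.
 */
#include <xf86drm.h>
#include "drm-uapi/msm_drm.h"   /* illustrative include path */

static int
sketch_create_vm_bind_queue(int fd, uint32_t *queue_id)
{
   struct drm_msm_submitqueue req = {
      .flags = MSM_SUBMITQUEUE_VM_BIND,
   };
   int ret = drmCommandWriteRead(fd, DRM_MSM_SUBMITQUEUE_NEW,
                                 &req, sizeof(req));
   if (ret == 0)
      *queue_id = req.id;   /* stored as dev->vm_bind_queue_id in the diff */
   return ret;
}

static int
sketch_map_bo(int fd, uint32_t queue_id, uint32_t gem_handle,
              uint64_t iova, uint64_t size)
{
   struct drm_msm_vm_bind req = {
      .flags = MSM_VM_BIND_FENCE_FD_OUT,
      .nr_ops = 1,
      .queue_id = queue_id,
      .op_stride = sizeof(struct drm_msm_vm_bind_op),
      .op = {
         .op = MSM_VM_BIND_OP_MAP,
         .handle = gem_handle,
         .obj_offset = 0,
         .iova = iova,
         .range = size,
      },
   };
   /* On success req.fence_fd carries a fence for the mapping. */
   return drmCommandWriteRead(fd, DRM_MSM_VM_BIND, &req, sizeof(req));
}

The fence fd returned by the MAP op is what the driver stashes in vm_bind_fence_fd and later passes with MSM_SUBMIT_FENCE_FD_IN, so command submission waits for the mapping to complete before executing.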
@@ -2612,7 +2612,9 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice,
   mtx_init(&device->event_mutex, mtx_plain);
   mtx_init(&device->trace_mutex, mtx_plain);
   u_rwlock_init(&device->dma_bo_lock);
   u_rwlock_init(&device->vm_bind_fence_lock);
   pthread_mutex_init(&device->submit_mutex, NULL);
   device->vm_bind_fence_fd = -1;

   if (physical_device->has_set_iova) {
      mtx_init(&device->vma_mutex, mtx_plain);
@@ -2693,8 +2695,7 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice,
   /* Initialize sparse array for refcounting imported BOs */
   util_sparse_array_init(&device->bo_map, sizeof(struct tu_bo), 512);

   if (physical_device->has_set_iova) {
      STATIC_ASSERT(TU_MAX_QUEUE_FAMILIES == 1);
   if (physical_device->has_set_iova && !physical_device->has_vm_bind) {
      if (!u_vector_init(&device->zombie_vmas, 64,
                         sizeof(struct tu_zombie_vma))) {
         result = vk_startup_errorf(physical_device->instance,
@@ -3036,6 +3037,9 @@ tu_DestroyDevice(VkDevice _device, const VkAllocationCallbacks *pAllocator)

   tu_bo_finish(device, device->global_bo);

   if (device->vm_bind_fence_fd != -1)
      close(device->vm_bind_fence_fd);

   if (device->null_accel_struct_bo)
      tu_bo_finish(device, device->null_accel_struct_bo);

@@ -3064,6 +3068,7 @@ tu_DestroyDevice(VkDevice _device, const VkAllocationCallbacks *pAllocator)

   util_sparse_array_finish(&device->bo_map);
   u_rwlock_destroy(&device->dma_bo_lock);
   u_rwlock_destroy(&device->vm_bind_fence_lock);

   u_vector_finish(&device->zombie_vmas);

@@ -121,6 +121,7 @@ struct tu_physical_device

   bool has_set_iova;
   bool has_raytracing;
   bool has_vm_bind;
   uint64_t va_start;
   uint64_t va_size;

@@ -376,6 +377,9 @@ struct tu_device
   mtx_t bo_mutex;
   /* protects imported BOs creation/freeing */
   struct u_rwlock dma_bo_lock;
   int vm_bind_fence_fd;
   /* protects vm_bind_fence_fd */
   struct u_rwlock vm_bind_fence_lock;

   /* Tracking of name -> size allocated for TU_DEBUG_BOS */
   struct hash_table *bo_sizes;
@@ -449,6 +453,9 @@ struct tu_device
   bool use_lrz;

   struct fd_rd_output rd_output;

   /* This is an internal queue for mapping/unmapping non-sparse BOs */
   uint32_t vm_bind_queue_id;
};

VK_DEFINE_HANDLE_CASTS(tu_device, vk.base, VkDevice, VK_OBJECT_TYPE_DEVICE)

@@ -122,6 +122,8 @@ msm_submit_add_entries(struct tu_device *device, void *_submit,
   struct tu_msm_queue_submit *submit =
      (struct tu_msm_queue_submit *)_submit;

   bool has_vm_bind = device->physical_device->has_vm_bind;

   struct drm_msm_gem_submit_cmd *cmds = (struct drm_msm_gem_submit_cmd *)
      util_dynarray_grow(&submit->commands, struct drm_msm_gem_submit_cmd,
                         num_entries);
@@ -132,12 +134,15 @@ msm_submit_add_entries(struct tu_device *device, void *_submit,

   for (unsigned i = 0; i < num_entries; i++) {
      cmds[i].type = MSM_SUBMIT_CMD_BUF;
      cmds[i].submit_idx = entries[i].bo->submit_bo_list_idx;
      cmds[i].submit_offset = entries[i].offset;
      cmds[i].submit_idx = has_vm_bind ? 0 : entries[i].bo->submit_bo_list_idx;
      cmds[i].submit_offset = has_vm_bind ? 0 : entries[i].offset;
      cmds[i].size = entries[i].size;
      cmds[i].pad = 0;
      cmds[i].nr_relocs = 0;
      cmds[i].relocs = 0;
      if (has_vm_bind)
         cmds[i].iova = entries[i].bo->iova + entries[i].offset;
      else
         cmds[i].relocs = 0;
      bos[i] = entries[i].bo;
   }
}

@@ -18,6 +18,7 @@
#include "util/u_debug.h"
#include "util/u_process.h"
#include "util/hash_table.h"
#include "util/libsync.h"

#include "tu_cmd_buffer.h"
#include "tu_cs.h"
@@ -142,6 +143,12 @@ tu_drm_set_param(int fd, uint32_t param, uint64_t value, uint32_t len)
   return ret;
}

static int
tu_try_enable_vm_bind(int fd)
{
   return tu_drm_set_param(fd, MSM_PARAM_EN_VM_BIND, 1, 0);
}

static void
tu_drm_set_debuginfo(int fd)
{
@@ -240,9 +247,34 @@ msm_device_init(struct tu_device *dev)
                               "failed to open device %s", dev->physical_device->fd_path);
   }

   int ret;
   if (dev->physical_device->has_vm_bind) {
      ret = tu_try_enable_vm_bind(fd);
      if (ret != 0) {
         return vk_startup_errorf(dev->physical_device->instance,
                                  VK_ERROR_INITIALIZATION_FAILED,
                                  "Failed to enable VM_BIND mode: %d", ret);
      }

      struct drm_msm_submitqueue submit_req = {
         .flags = MSM_SUBMITQUEUE_VM_BIND,
      };

      ret = drmCommandWriteRead(fd, DRM_MSM_SUBMITQUEUE_NEW, &submit_req,
                                sizeof(submit_req));
      if (ret != 0) {
         close(fd);
         return vk_startup_errorf(dev->physical_device->instance,
                                  VK_ERROR_INITIALIZATION_FAILED,
                                  "Failed to create VM_BIND queue: %d", ret);
      }

      dev->vm_bind_queue_id = submit_req.id;
   }

   tu_drm_set_debuginfo(fd);

   int ret = tu_drm_get_param(fd, MSM_PARAM_FAULTS, &dev->fault_count);
   ret = tu_drm_get_param(fd, MSM_PARAM_FAULTS, &dev->fault_count);
   if (ret != 0) {
      close(fd);
      return vk_startup_errorf(dev->physical_device->instance,
@@ -525,35 +557,74 @@ tu_allocate_kernel_iova(struct tu_device *dev,
}

static VkResult
tu_bo_init(struct tu_device *dev,
           struct vk_object_base *base,
           struct tu_bo *bo,
           uint32_t gem_handle,
           uint64_t size,
           uint64_t client_iova,
           enum tu_bo_alloc_flags flags,
           const char *name)
tu_map_vm_bind(struct tu_device *dev, uint32_t map_op, uint32_t map_op_flags,
               uint64_t iova, uint32_t gem_handle, uint64_t bo_offset,
               uint64_t range)
{
   VkResult result = VK_SUCCESS;
   uint64_t iova = 0;
   struct drm_msm_vm_bind req = {
      .flags = MSM_VM_BIND_FENCE_FD_OUT,
      .nr_ops = 1,
      .queue_id = dev->vm_bind_queue_id,
      .op_stride = sizeof(drm_msm_vm_bind_op),
      .op = {
         .op = map_op,
         .handle = gem_handle,
         .obj_offset = bo_offset,
         .iova = iova,
         .range = range,
         .flags = map_op_flags,
      },
   };

   assert(!client_iova || dev->physical_device->has_set_iova);
   int ret = drmCommandWriteRead(dev->fd,
                                 DRM_MSM_VM_BIND,
                                 &req, sizeof(req));

   if (dev->physical_device->has_set_iova) {
      result = msm_allocate_userspace_iova_locked(dev, gem_handle, size,
                                                  client_iova, flags, &iova);
   } else {
      result = tu_allocate_kernel_iova(dev, gem_handle, &iova);
   }
   /* When failing to map a BO, the kernel marks the VM as dead */
   if (ret)
      return vk_device_set_lost(&dev->vk, "BO map failed: %m");

   if (result != VK_SUCCESS) {
      tu_gem_close(dev, gem_handle);
   int old_fence;
   u_rwlock_wrlock(&dev->vm_bind_fence_lock);
   old_fence = dev->vm_bind_fence_fd;
   dev->vm_bind_fence_fd = req.fence_fd;
   u_rwlock_wrunlock(&dev->vm_bind_fence_lock);

   if (old_fence != -1)
      close(old_fence);

   return VK_SUCCESS;
}

static VkResult
msm_allocate_vm_bind(struct tu_device *dev,
                     uint32_t gem_handle,
                     uint64_t size,
                     uint64_t client_iova,
                     enum tu_bo_alloc_flags flags,
                     uint64_t *iova)
{
   VkResult result;

   *iova = 0;

   result = tu_allocate_userspace_iova(dev, size, client_iova, flags, iova);

   if (result != VK_SUCCESS)
      return result;
}

   name = tu_debug_bos_add(dev, size, name);
   uint32_t map_op_flags = 0;
   if (flags & TU_BO_ALLOC_ALLOW_DUMP)
      map_op_flags |= MSM_VM_BIND_OP_DUMP;
   return tu_map_vm_bind(dev, MSM_VM_BIND_OP_MAP, map_op_flags, *iova,
                         gem_handle, 0, size);
}

   mtx_lock(&dev->bo_mutex);
static VkResult
tu_bo_add_to_bo_list(struct tu_device *dev,
                     uint32_t gem_handle, uint32_t flags, uint64_t iova,
                     uint32_t *bo_list_idx)
{
   uint32_t idx = dev->submit_bo_count++;

   /* grow the bo list if needed */
@@ -564,10 +635,6 @@ tu_bo_init(struct tu_device *dev,
                                      8, VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
      if (!new_ptr) {
         dev->submit_bo_count--;
         mtx_unlock(&dev->bo_mutex);
         if (dev->physical_device->has_set_iova)
            util_vma_heap_free(&dev->vma, iova, size);
         tu_gem_close(dev, gem_handle);
         return VK_ERROR_OUT_OF_HOST_MEMORY;
      }

@@ -588,6 +655,58 @@ tu_bo_init(struct tu_device *dev,
   if (implicit_sync)
      dev->implicit_sync_bo_count++;

   *bo_list_idx = idx;
   return VK_SUCCESS;
}

static VkResult
tu_bo_init(struct tu_device *dev,
           struct vk_object_base *base,
           struct tu_bo *bo,
           uint32_t gem_handle,
           uint64_t size,
           uint64_t client_iova,
           enum tu_bo_alloc_flags flags,
           const char *name)
{
   VkResult result = VK_SUCCESS;
   uint64_t iova = 0;

   assert(!client_iova || dev->physical_device->has_set_iova);

   if (dev->physical_device->has_vm_bind) {
      result = msm_allocate_vm_bind(dev, gem_handle, size, client_iova, flags,
                                    &iova);
   } else if (dev->physical_device->has_set_iova) {
      result = msm_allocate_userspace_iova_locked(dev, gem_handle, size,
                                                  client_iova, flags, &iova);
   } else {
      result = tu_allocate_kernel_iova(dev, gem_handle, &iova);
   }

   if (result != VK_SUCCESS) {
      tu_gem_close(dev, gem_handle);
      return result;
   }

   name = tu_debug_bos_add(dev, size, name);

   uint32_t idx = 0;

   if (!dev->physical_device->has_vm_bind) {
      mtx_lock(&dev->bo_mutex);

      result = tu_bo_add_to_bo_list(dev, gem_handle, flags, iova, &idx);
      if (result != VK_SUCCESS) {
         mtx_unlock(&dev->bo_mutex);
         if (dev->physical_device->has_set_iova)
            util_vma_heap_free(&dev->vma, iova, size);
         tu_gem_close(dev, gem_handle);
         return result;
      }
   }

   bool implicit_sync = flags & TU_BO_ALLOC_IMPLICIT_SYNC;
   *bo = (struct tu_bo) {
      .gem_handle = gem_handle,
      .size = size,
@@ -599,7 +718,8 @@ tu_bo_init(struct tu_device *dev,
      .base = base,
   };

   mtx_unlock(&dev->bo_mutex);
   if (!dev->physical_device->has_vm_bind)
      mtx_unlock(&dev->bo_mutex);

   tu_dump_bo_init(dev, bo);

@@ -682,6 +802,9 @@ msm_bo_init(struct tu_device *dev,
   if (flags & TU_BO_ALLOC_GPU_READ_ONLY)
      req.flags |= MSM_BO_GPU_READONLY;

   if (dev->physical_device->has_vm_bind && !(flags & TU_BO_ALLOC_SHAREABLE))
      req.flags |= MSM_BO_NO_SHARE;

   int ret = drmCommandWriteRead(dev->fd,
                                 DRM_MSM_GEM_NEW, &req, sizeof(req));
   if (ret)
@@ -809,9 +932,14 @@ msm_bo_map(struct tu_device *dev, struct tu_bo *bo, void *placed_addr)
static void
msm_bo_allow_dump(struct tu_device *dev, struct tu_bo *bo)
{
   mtx_lock(&dev->bo_mutex);
   dev->submit_bo_list[bo->submit_bo_list_idx].flags |= MSM_SUBMIT_BO_DUMP;
   mtx_unlock(&dev->bo_mutex);
   if (dev->physical_device->has_vm_bind) {
      tu_map_vm_bind(dev, MSM_VM_BIND_OP_MAP, MSM_VM_BIND_OP_DUMP,
                     bo->iova, bo->gem_handle, 0, bo->size);
   } else {
      mtx_lock(&dev->bo_mutex);
      dev->submit_bo_list[bo->submit_bo_list_idx].flags |= MSM_SUBMIT_BO_DUMP;
      mtx_unlock(&dev->bo_mutex);
   }
}

@@ -853,6 +981,23 @@ msm_bo_get_metadata(struct tu_device *dev, struct tu_bo *bo,
   return ret;
}

static void
msm_bo_gem_close(struct tu_device *dev, struct tu_bo *bo)
{
   /* Our BO structs are stored in a sparse array in the physical device,
    * so we don't want to free the BO pointer, instead we want to reset it
    * to 0, to signal that array entry as being free.
    */
   uint32_t gem_handle = bo->gem_handle;
   memset(bo, 0, sizeof(*bo));

   struct drm_gem_close req = {
      .handle = gem_handle,
   };

   drmIoctl(dev->fd, DRM_IOCTL_GEM_CLOSE, &req);
}

static void
msm_bo_finish(struct tu_device *dev, struct tu_bo *bo)
{
@@ -875,23 +1020,27 @@ msm_bo_finish(struct tu_device *dev, struct tu_bo *bo)

   TU_RMV(bo_destroy, dev, bo);

   tu_bo_list_del(dev, bo);
   if (dev->physical_device->has_vm_bind) {
      tu_map_vm_bind(dev, MSM_VM_BIND_OP_UNMAP, 0, bo->iova, 0, 0,
                     bo->size);

   if (dev->physical_device->has_set_iova) {
      mtx_lock(&dev->bo_mutex);
      if (bo->implicit_sync)
         dev->implicit_sync_bo_count--;
      mtx_unlock(&dev->bo_mutex);

      mtx_lock(&dev->vma_mutex);
      util_vma_heap_free(&dev->vma, bo->iova, bo->size);
      mtx_unlock(&dev->vma_mutex);

      msm_bo_gem_close(dev, bo);
   } else if (dev->physical_device->has_set_iova) {
      tu_bo_list_del(dev, bo);
      tu_bo_make_zombie(dev, bo);
   } else {
      /* Our BO structs are stored in a sparse array in the physical device,
       * so we don't want to free the BO pointer, instead we want to reset it
       * to 0, to signal that array entry as being free.
       */
      uint32_t gem_handle = bo->gem_handle;
      memset(bo, 0, sizeof(*bo));
      tu_bo_list_del(dev, bo);

      struct drm_gem_close req = {
         .handle = gem_handle,
      };

      drmIoctl(dev->fd, DRM_IOCTL_GEM_CLOSE, &req);
      msm_bo_gem_close(dev, bo);
   }

   u_rwlock_rdunlock(&dev->dma_bo_lock);
@@ -912,6 +1061,7 @@ msm_queue_submit(struct tu_queue *queue, void *_submit,
   uint64_t gpu_offset = 0;
   uint32_t entry_count =
      util_dynarray_num_elements(&submit->commands, struct drm_msm_gem_submit_cmd);
   bool has_vm_bind = queue->device->physical_device->has_vm_bind;
#if HAVE_PERFETTO
   struct tu_perfetto_clocks clocks;
   uint64_t start_ts = tu_perfetto_begin_submit();
@@ -967,39 +1117,47 @@ msm_queue_submit(struct tu_queue *queue, void *_submit,
   if (signal_count)
      flags |= MSM_SUBMIT_SYNCOBJ_OUT;

   mtx_lock(&queue->device->bo_mutex);
   if (has_vm_bind) {
      u_rwlock_rdlock(&queue->device->vm_bind_fence_lock);

   /* MSM_SUBMIT_NO_IMPLICIT skips having the scheduler wait on the previous dma
    * fences attached to the BO (such as from the window system server's command
    * queue) before submitting the job. Our fence will always get attached to
    * the BO, because it gets used for synchronization for the shrinker.
    *
    * If the flag is not set, then the kernel falls back to checking each BO's
    * MSM_SUBMIT_NO_IMPLICIT flag for its implicit sync handling.
    *
    * As of kernel 6.0, the core wsi code will be generating appropriate syncobj
    * export-and-waits/signal-and-imports for implict syncing (on implicit sync
    * WSI backends) and not allocating any
    * wsi_memory_allocate_info->implicit_sync BOs from the driver. However, on
    * older kernels with that flag set, we have to submit without NO_IMPLICIT
    * set to do have the kernel do pre-submit waits on whatever the last fence
    * was.
    */
   if (queue->device->implicit_sync_bo_count == 0)
      flags |= MSM_SUBMIT_NO_IMPLICIT;
      if (queue->device->vm_bind_fence_fd != -1)
         flags |= MSM_SUBMIT_FENCE_FD_IN;
   } else {
      mtx_lock(&queue->device->bo_mutex);

   /* drm_msm_gem_submit_cmd requires index of bo which could change at any
    * time when bo_mutex is not locked. So we update the index here under the
    * lock.
    */
   util_dynarray_foreach (&submit->commands, struct drm_msm_gem_submit_cmd,
                          cmd) {
      unsigned i = cmd -
         util_dynarray_element(&submit->commands,
                               struct drm_msm_gem_submit_cmd, 0);
      struct tu_bo **bo = util_dynarray_element(&submit->command_bos,
                                                struct tu_bo *, i);
      cmd->submit_idx = (*bo)->submit_bo_list_idx;
      /* MSM_SUBMIT_NO_IMPLICIT skips having the scheduler wait on the
       * previous dma fences attached to the BO (such as from the window
       * system server's command queue) before submitting the job. Our fence
       * will always get attached to the BO, because it gets used for
       * synchronization for the shrinker.
       *
       * If the flag is not set, then the kernel falls back to checking each
       * BO's MSM_SUBMIT_NO_IMPLICIT flag for its implicit sync handling.
       *
       * As of kernel 6.0, the core wsi code will be generating appropriate
       * syncobj export-and-waits/signal-and-imports for implict syncing (on
       * implicit sync WSI backends) and not allocating any
       * wsi_memory_allocate_info->implicit_sync BOs from the driver. However,
       * on older kernels with that flag set, we have to submit without
       * NO_IMPLICIT set to do have the kernel do pre-submit waits on whatever
       * the last fence was.
       */
      if (queue->device->implicit_sync_bo_count == 0)
         flags |= MSM_SUBMIT_NO_IMPLICIT;

      /* drm_msm_gem_submit_cmd requires index of bo which could change at any
       * time when bo_mutex is not locked. So we update the index here under the
       * lock.
       */
      util_dynarray_foreach (&submit->commands, struct drm_msm_gem_submit_cmd,
                             cmd) {
         unsigned i = cmd -
            util_dynarray_element(&submit->commands,
                                  struct drm_msm_gem_submit_cmd, 0);
         struct tu_bo **bo = util_dynarray_element(&submit->command_bos,
                                                   struct tu_bo *, i);
         cmd->submit_idx = (*bo)->submit_bo_list_idx;
      }
   }

   req = (struct drm_msm_gem_submit) {
@@ -1008,6 +1166,7 @@ msm_queue_submit(struct tu_queue *queue, void *_submit,
      .nr_cmds = entry_count,
      .bos = (uint64_t)(uintptr_t) queue->device->submit_bo_list,
      .cmds = (uint64_t)(uintptr_t)submit->commands.data,
      .fence_fd = queue->device->vm_bind_fence_fd,
      .queueid = queue->msm_queue_id,
      .in_syncobjs = (uint64_t)(uintptr_t)in_syncobjs,
      .out_syncobjs = (uint64_t)(uintptr_t)out_syncobjs,
@@ -1023,7 +1182,10 @@ msm_queue_submit(struct tu_queue *queue, void *_submit,
                              &req, sizeof(req));
   }

   mtx_unlock(&queue->device->bo_mutex);
   if (has_vm_bind)
      u_rwlock_rdunlock(&queue->device->vm_bind_fence_lock);
   else
      mtx_unlock(&queue->device->bo_mutex);

   if (ret) {
      result = vk_device_set_lost(&queue->device->vk, "submit failed: %m");
@@ -1111,6 +1273,8 @@ tu_knl_drm_msm_load(struct tu_instance *instance,
   device->instance = instance;
   device->local_fd = fd;

   device->has_vm_bind = tu_try_enable_vm_bind(fd) == 0;

   if (tu_drm_get_gpu_id(device, &device->dev_id.gpu_id)) {
      result = vk_startup_errorf(instance, VK_ERROR_INITIALIZATION_FAILED,
                                 "could not get GPU ID");