tu/kgsl: Add support for sparse binding

Use the "virtual BO" interface. Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/32533>
2026-05-06 09:28:07 +02:00 · 2024-12-06 13:24:43 -05:00 · 2024-12-06 13:24:43 -05:00 · 71ef46717c
commit 71ef46717c
parent 797c74452f
1 changed files with 333 additions and 46 deletions
--- a/src/freedreno/vulkan/tu_knl_kgsl.cc
+++ b/src/freedreno/vulkan/tu_knl_kgsl.cc
@ -166,6 +166,37 @@ bo_init_new_ion_legacy(struct tu_device *dev, struct tu_bo **out_bo, uint64_t si
   return tu_bo_init_dmabuf(dev, out_bo, -1, share.fd);
 }
 static VkResult
 kgsl_bo_user_map(struct tu_device *dev, struct tu_bo *bo, uint64_t client_iova)
 {
   uint64_t offset = bo->gem_handle << 12;
   void *map = mmap((void *)client_iova, bo->size, PROT_READ | PROT_WRITE,
                    MAP_SHARED, dev->physical_device->local_fd, offset);
   if (map == MAP_FAILED) {
      kgsl_bo_finish(dev, bo);
      return vk_errorf(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY,
                       "mmap failed (%s)", strerror(errno));
   }
   if (client_iova && (uint64_t)map != client_iova) {
      kgsl_bo_finish(dev, bo);
      return vk_errorf(dev, VK_ERROR_INVALID_OPAQUE_CAPTURE_ADDRESS,
                       "mmap could not map the given address");
   }
   bo->map = map;
   bo->iova = (uint64_t)map;
   /* Because we're using SVM, the CPU mapping and GPU mapping are the same
    * and the CPU mapping must stay fixed for the lifetime of the BO.
    */
   bo->never_unmap = true;
   return VK_SUCCESS;
 }
 static VkResult
 kgsl_bo_init(struct tu_device *dev,
             struct vk_object_base *base,
@ -239,30 +270,9 @@ kgsl_bo_init(struct tu_device *dev,
   };
   if (flags & TU_BO_ALLOC_REPLAYABLE) {
-      uint64_t offset = req.id << 12;
+      VkResult result = kgsl_bo_user_map(dev, bo, client_iova);
-      void *map = mmap((void *)client_iova, bo->size, PROT_READ | PROT_WRITE,
+      if (result != VK_SUCCESS)
-                       MAP_SHARED, dev->physical_device->local_fd, offset);
+         return result;
      if (map == MAP_FAILED) {
         kgsl_bo_finish(dev, bo);
         return vk_errorf(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY,
                          "mmap failed (%s)", strerror(errno));
      }
      if (client_iova && (uint64_t)map != client_iova) {
         kgsl_bo_finish(dev, bo);
         return vk_errorf(dev, VK_ERROR_INVALID_OPAQUE_CAPTURE_ADDRESS,
                          "mmap could not map the given address");
      }
      bo->map = map;
      bo->iova = (uint64_t)map;
      /* Because we're using SVM, the CPU mapping and GPU mapping are the same
       * and the CPU mapping must stay fixed for the lifetime of the BO.
       */
      bo->never_unmap = true;
   }
   tu_dump_bo_init(dev, bo);
@ -397,6 +407,124 @@ kgsl_bo_finish(struct tu_device *dev, struct tu_bo *bo)
   safe_ioctl(dev->physical_device->local_fd, IOCTL_KGSL_GPUMEM_FREE_ID, &req);
 }
 static VkResult
 kgsl_sparse_vma_init(struct tu_device *dev,
                     struct vk_object_base *base,
                     struct tu_sparse_vma *out_vma,
                     uint64_t *out_iova,
                     enum tu_sparse_vma_flags flags,
                     uint64_t size, uint64_t client_iova)
 {
   /* Note: we cannot use kgsl_gpumem_alloc_id because it only has a 32-bit
    * flags value. kgsl_gpuobj_alloc seems to be the only ioctl we can use.
    */
   struct kgsl_gpuobj_alloc req = {
      .size = size,
      .flags = KGSL_MEMFLAGS_VBO,
      .va_len = 0, /* seems to be unused? */
   };
   if (flags & TU_SPARSE_VMA_REPLAYABLE)
      req.flags |= KGSL_MEMFLAGS_USE_CPU_MAP;
   if (!(flags & TU_SPARSE_VMA_MAP_ZERO))
      req.flags |= KGSL_MEMFLAGS_VBO_NO_MAP_ZERO;
   int ret;
   ret = safe_ioctl(dev->physical_device->local_fd,
                    IOCTL_KGSL_GPUOBJ_ALLOC, &req);
   if (ret) {
      return vk_errorf(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY,
                       "GPUOBJ_ALLOC failed (%s)", strerror(errno));
   }
   struct tu_bo *bo = tu_device_lookup_bo(dev, req.id);
   assert(bo && bo->gem_handle == 0);
   *bo = (struct tu_bo) {
      .gem_handle = req.id,
      .size = req.mmapsize,
      .name = NULL,
      .refcnt = 1,
      .shared_fd = -1,
      .base = base,
   };
   if (flags & TU_SPARSE_VMA_REPLAYABLE) {
      VkResult result = kgsl_bo_user_map(dev, bo, client_iova);
      if (result != VK_SUCCESS)
         return result;
   } else {
      /* For some cursed reason, the ioctl doesn't return the GPU address so
       * we have to query it.
       */
      struct kgsl_gpumem_get_info info = {
         .id = req.id,
      };
      ret = safe_ioctl(dev->physical_device->local_fd,
                       IOCTL_KGSL_GPUMEM_GET_INFO, &info);
      if (ret) {
         return vk_errorf(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY,
                          "GPUMEM_GET_INFO failed (%s)", strerror(errno));
      }
      bo->iova = info.gpuaddr;
   }
   out_vma->kgsl.virtual_bo = bo;
   *out_iova = bo->iova;
   return VK_SUCCESS;
 }
 static VkResult
 kgsl_sparse_vma_map(struct tu_device *dev,
                    struct tu_sparse_vma *vma,
                    struct tu_bo *bo, uint64_t bo_offset)
 {
   struct kgsl_gpumem_bind_range range = {
      .child_offset = bo_offset,
      .target_offset = 0,
      .length = vma->kgsl.virtual_bo->size,
      .child_id = bo->gem_handle,
      .op = KGSL_GPUMEM_RANGE_OP_BIND,
   };
   struct kgsl_gpumem_bind_ranges req = {
      .ranges = (uint64_t)(uintptr_t)&range,
      .ranges_nents = 1,
      .ranges_size = sizeof(range),
      .id = vma->kgsl.virtual_bo->gem_handle,
      .flags = 0,
   };
   int ret;
   ret = safe_ioctl(dev->physical_device->local_fd,
                    IOCTL_KGSL_GPUMEM_BIND_RANGES, &req);
   if (ret) {
      return vk_errorf(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY,
                       "GPUMEM_BIND_RANGES failed (%s)", strerror(errno));
   }
   return VK_SUCCESS;
 }
 static void
 kgsl_sparse_vma_finish(struct tu_device *dev,
                       struct tu_sparse_vma *vma)
 {
   struct kgsl_gpuobj_free req = {
      .id = vma->kgsl.virtual_bo->gem_handle
   };
   /* Tell sparse array that entry is free */
   memset(vma->kgsl.virtual_bo, 0, sizeof(*vma->kgsl.virtual_bo));
   safe_ioctl(dev->physical_device->local_fd, IOCTL_KGSL_GPUOBJ_FREE, &req);
 }
 static VkResult
 get_kgsl_prop(int fd, unsigned int type, void *value, size_t size)
 {
@ -431,6 +559,26 @@ kgsl_is_memory_type_supported(int fd, uint32_t flags)
   return true;
 }
 static bool
 kgsl_is_virtual_bo_supported(int fd)
 {
   struct kgsl_gpuobj_alloc req_alloc = {
      .size = 0x1000,
      .flags = KGSL_MEMFLAGS_VBO,
   };
   int ret = safe_ioctl(fd, IOCTL_KGSL_GPUOBJ_ALLOC, &req_alloc);
   if (ret) {
      return false;
   }
   struct kgsl_gpuobj_free req_free = { .id = req_alloc.id };
   safe_ioctl(fd, IOCTL_KGSL_GPUOBJ_FREE, &req_free);
   return true;
 }
 enum kgsl_syncobj_state {
   KGSL_SYNCOBJ_STATE_UNSIGNALED,
   KGSL_SYNCOBJ_STATE_SIGNALED,
@ -1047,6 +1195,10 @@ const struct vk_sync_type vk_kgsl_sync_type = {
 struct tu_kgsl_queue_submit {
   struct util_dynarray commands;
   struct util_dynarray ranges;
   struct util_dynarray bind_cmds;
   struct tu_sparse_vma *cur_vma;
   unsigned cur_vma_range_start;
 };
 static void *
@ -1064,6 +1216,8 @@ kgsl_submit_finish(struct tu_device *device,
      (struct tu_kgsl_queue_submit *)_submit;
   util_dynarray_fini(&submit->commands);
   util_dynarray_fini(&submit->ranges);
   util_dynarray_fini(&submit->bind_cmds);
   vk_free(&device->vk.alloc, submit);
 }
@ -1088,6 +1242,76 @@ kgsl_submit_add_entries(struct tu_device *device, void *_submit,
   }
 }
 static void
 kgsl_submit_add_bind(struct tu_device *device,
                     void *_submit,
                     struct tu_sparse_vma *vma, uint64_t vma_offset,
                     struct tu_bo *bo, uint64_t bo_offset,
                     uint64_t size)
 {
   struct tu_kgsl_queue_submit *submit =
      (struct tu_kgsl_queue_submit *)_submit;
   if (vma != submit->cur_vma) {
      unsigned range_count =
         util_dynarray_num_elements(&submit->ranges,
                                    struct kgsl_gpumem_bind_range);
      if (submit->cur_vma) {
         struct kgsl_gpu_aux_command_bind *last_bind =
            util_dynarray_top_ptr(&submit->bind_cmds,
                                  struct kgsl_gpu_aux_command_bind);
         last_bind->numranges = range_count - submit->cur_vma_range_start;
      }
      struct kgsl_gpu_aux_command_bind bind = {
         .rangeslist = submit->ranges.size,
         .numranges = 0,
         .rangesize = sizeof(struct kgsl_gpumem_bind_range),
         .target = vma->kgsl.virtual_bo->gem_handle,
      };
      util_dynarray_append(&submit->bind_cmds,
                           struct kgsl_gpu_aux_command_bind, bind);
      submit->cur_vma = vma;
      submit->cur_vma_range_start = range_count;
   }
   struct kgsl_gpumem_bind_range range = {
      .child_offset = bo_offset,
      .target_offset = vma_offset,
      .length = size,
      .child_id = bo ? bo->gem_handle : 0,
      .op = bo ? KGSL_GPUMEM_RANGE_OP_BIND : KGSL_GPUMEM_RANGE_OP_UNBIND,
   };
   util_dynarray_append(&submit->ranges, struct kgsl_gpumem_bind_range,
                        range);
 }
 /* We don't know the actual CPU pointers until we've finished adding all the
 * bind commands, so we put the offset from the base instead. We need to write
 * the actual pointer after all the ranges are added. We also need to fill out
 * of the size of the last command.
 */
 static void
 kgsl_bind_finalize(struct tu_kgsl_queue_submit *submit)
 {
   unsigned range_count =
      util_dynarray_num_elements(&submit->ranges,
                                 struct kgsl_gpumem_bind_range);
   struct kgsl_gpu_aux_command_bind *last_bind =
      util_dynarray_top_ptr(&submit->bind_cmds,
                            struct kgsl_gpu_aux_command_bind);
   last_bind->numranges = range_count - submit->cur_vma_range_start;
   util_dynarray_foreach (&submit->bind_cmds,
                          struct kgsl_gpu_aux_command_bind, bind) {
      bind->rangeslist += (uint64_t)(uintptr_t)submit->ranges.data;
   }
 }
 static VkResult
 kgsl_queue_submit(struct tu_queue *queue, void *_submit,
                  struct vk_sync_wait *waits, uint32_t wait_count,
@ -1164,6 +1388,9 @@ kgsl_queue_submit(struct tu_queue *queue, void *_submit,
   VkResult result = VK_SUCCESS;
   if (submit->bind_cmds.size != 0)
      kgsl_bind_finalize(submit);
   if (u_trace_submission_data) {
      mtx_lock(&queue->device->kgsl_profiling_mutex);
      tu_suballoc_bo_alloc(&u_trace_submission_data->kgsl_timestamp_bo,
@ -1240,29 +1467,76 @@ kgsl_queue_submit(struct tu_queue *queue, void *_submit,
      UNREACHABLE("invalid syncobj state");
   }
-   struct kgsl_gpu_command req = {
+   int ret;
-      .flags = KGSL_CMDBATCH_SUBMIT_IB_LIST,
+   uint32_t timestamp = 0;
-      .cmdlist = (uintptr_t) submit->commands.data,
+   uint64_t gpu_offset = 0;
      .cmdsize = sizeof(struct kgsl_command_object),
      .numcmds = util_dynarray_num_elements(&submit->commands,
                                            struct kgsl_command_object),
      .synclist = (uintptr_t) &sync,
      .syncsize = sizeof(sync),
      .numsyncs = has_sync != 0 ? 1 : 0,
      .context_id = queue->msm_queue_id,
   };
-   if (obj_idx) {
+   if (submit->bind_cmds.size == 0) {
-      req.flags |= KGSL_CMDBATCH_PROFILING;
+      struct kgsl_gpu_command req = {
-      req.objlist = (uintptr_t) objs;
+         .flags = KGSL_CMDBATCH_SUBMIT_IB_LIST,
-      req.objsize = sizeof(struct kgsl_command_object);
+         .cmdlist = (uintptr_t) submit->commands.data,
-      req.numobjs = obj_idx;
+         .cmdsize = sizeof(struct kgsl_command_object),
         .numcmds = util_dynarray_num_elements(&submit->commands,
                                               struct kgsl_command_object),
         .synclist = (uintptr_t) &sync,
         .syncsize = sizeof(sync),
         .numsyncs = has_sync != 0 ? 1 : 0,
         .context_id = queue->msm_queue_id,
      };
      if (obj_idx) {
         req.flags |= KGSL_CMDBATCH_PROFILING;
         req.objlist = (uintptr_t) objs;
         req.objsize = sizeof(struct kgsl_command_object);
         req.numobjs = obj_idx;
      }
      ret = safe_ioctl(queue->device->physical_device->local_fd,
                       IOCTL_KGSL_GPU_COMMAND, &req);
      timestamp = req.timestamp;
   } else {
      /* kgsl doesn't support multiple bind commands at once */
      uint32_t i = 0;
      util_dynarray_foreach(&submit->bind_cmds,
                            struct kgsl_gpu_aux_command_bind, bind) {
         bool do_sync = has_sync && i == 0;
         struct kgsl_gpu_aux_command_generic aux = {
            .priv = (uintptr_t) bind,
            .size = sizeof(*bind),
            .type = KGSL_GPU_AUX_COMMAND_BIND,
         };
         uint32_t flags = KGSL_GPU_AUX_COMMAND_BIND;
         if (do_sync)
            flags |= KGSL_GPU_AUX_COMMAND_SYNC;
         struct kgsl_gpu_aux_command req = {
            .flags = flags,
            .cmdlist = (uintptr_t) &aux,
            .cmdsize = sizeof(aux),
            .numcmds = 1,
            .synclist = (uintptr_t) &sync,
            .syncsize = sizeof(sync),
            .numsyncs = do_sync ? 1 : 0,
            .context_id = queue->msm_queue_id,
         };
         ret = safe_ioctl(queue->device->physical_device->local_fd,
                          IOCTL_KGSL_GPU_AUX_COMMAND, &req);
         if (ret) {
            result = vk_device_set_lost(&queue->device->vk,
                                        "bind submit failed: %s\n",
                                        strerror(errno));
            goto fail_submit;
         }
         timestamp = req.timestamp;
         i++;
      }
   }
   int ret = safe_ioctl(queue->device->physical_device->local_fd,
                        IOCTL_KGSL_GPU_COMMAND, &req);
   uint64_t gpu_offset = 0;
 #if HAVE_PERFETTO
   if (profiling_buffer) {
      /* We need to wait for KGSL to queue the GPU command before we can read
@ -1310,7 +1584,7 @@ kgsl_queue_submit(struct tu_queue *queue, void *_submit,
      goto fail_submit;
   }
-   p_atomic_set(&queue->fence, req.timestamp);
+   p_atomic_set(&queue->fence, timestamp);
   for (uint32_t i = 0; i < signal_count; i++) {
      struct kgsl_syncobj *signal_sync =
@ -1320,7 +1594,7 @@ kgsl_queue_submit(struct tu_queue *queue, void *_submit,
      kgsl_syncobj_reset(signal_sync);
      signal_sync->state = KGSL_SYNCOBJ_STATE_TS;
      signal_sync->queue = queue;
-      signal_sync->timestamp = req.timestamp;
+      signal_sync->timestamp = timestamp;
   }
   if (u_trace_submission_data) {
@ -1412,8 +1686,11 @@ static const struct tu_knl kgsl_knl_funcs = {
      .submit_create = kgsl_submit_create,
      .submit_finish = kgsl_submit_finish,
      .submit_add_entries = kgsl_submit_add_entries,
      .submit_add_bind = kgsl_submit_add_bind,
      .queue_submit = kgsl_queue_submit,
      .queue_wait_fence = kgsl_queue_wait_fence,
      .sparse_vma_init = kgsl_sparse_vma_init,
      .sparse_vma_finish = kgsl_sparse_vma_finish,
 };
 static bool
@ -1530,6 +1807,16 @@ tu_knl_kgsl_load(struct tu_instance *instance, int fd)
      fd, KGSL_MEMFLAGS_IOCOHERENT |
             (KGSL_CACHEMODE_WRITEBACK << KGSL_CACHEMODE_SHIFT));
   device->has_sparse = kgsl_is_virtual_bo_supported(fd);
   device->has_sparse_prr = device->has_sparse;
   get_kgsl_prop(fd, KGSL_PROP_GPU_VA64_SIZE, &device->va_size,
                 sizeof(device->va_size));
   /* We don't actually use the VMA, but set a fake offset so that it doesn't
    * think we're trying to allocate 0 and assert.
    */
   device->va_start = 0x100000000;
   /* preemption is always supported on kgsl */
   device->has_preemption = true;