diff --git a/src/freedreno/vulkan/tu_device.cc b/src/freedreno/vulkan/tu_device.cc
index d30a4d19d37..e8e20f61569 100644
--- a/src/freedreno/vulkan/tu_device.cc
+++ b/src/freedreno/vulkan/tu_device.cc
@@ -2612,7 +2612,9 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice,
    mtx_init(&device->event_mutex, mtx_plain);
    mtx_init(&device->trace_mutex, mtx_plain);
    u_rwlock_init(&device->dma_bo_lock);
+   u_rwlock_init(&device->vm_bind_fence_lock);
    pthread_mutex_init(&device->submit_mutex, NULL);
+   device->vm_bind_fence_fd = -1;
 
    if (physical_device->has_set_iova) {
       mtx_init(&device->vma_mutex, mtx_plain);
@@ -2693,8 +2695,7 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice,
    /* Initialize sparse array for refcounting imported BOs */
    util_sparse_array_init(&device->bo_map, sizeof(struct tu_bo), 512);
 
-   if (physical_device->has_set_iova) {
-      STATIC_ASSERT(TU_MAX_QUEUE_FAMILIES == 1);
+   if (physical_device->has_set_iova && !physical_device->has_vm_bind) {
       if (!u_vector_init(&device->zombie_vmas, 64,
                          sizeof(struct tu_zombie_vma))) {
          result = vk_startup_errorf(physical_device->instance,
@@ -3036,6 +3037,9 @@ tu_DestroyDevice(VkDevice _device, const VkAllocationCallbacks *pAllocator)
 
    tu_bo_finish(device, device->global_bo);
 
+   if (device->vm_bind_fence_fd != -1)
+      close(device->vm_bind_fence_fd);
+
    if (device->null_accel_struct_bo)
       tu_bo_finish(device, device->null_accel_struct_bo);
 
@@ -3064,6 +3068,7 @@ tu_DestroyDevice(VkDevice _device, const VkAllocationCallbacks *pAllocator)
    util_sparse_array_finish(&device->bo_map);
    u_rwlock_destroy(&device->dma_bo_lock);
+   u_rwlock_destroy(&device->vm_bind_fence_lock);
 
    u_vector_finish(&device->zombie_vmas);
 
diff --git a/src/freedreno/vulkan/tu_device.h b/src/freedreno/vulkan/tu_device.h
index 18bebb6c76c..fb05910e3ab 100644
--- a/src/freedreno/vulkan/tu_device.h
+++ b/src/freedreno/vulkan/tu_device.h
@@ -121,6 +121,7 @@ struct tu_physical_device
 
    bool has_set_iova;
    bool has_raytracing;
+   bool has_vm_bind;
 
    uint64_t va_start;
    uint64_t va_size;
@@ -376,6 +377,9 @@ struct tu_device
    mtx_t bo_mutex; /* protects imported BOs creation/freeing */
    struct u_rwlock dma_bo_lock;
 
+   int vm_bind_fence_fd;
+   /* protects vm_bind_fence_fd */
+   struct u_rwlock vm_bind_fence_lock;
+
    /* Tracking of name -> size allocated for TU_DEBUG_BOS */
    struct hash_table *bo_sizes;
@@ -449,6 +453,9 @@ struct tu_device
    bool use_lrz;
 
    struct fd_rd_output rd_output;
+
+   /* This is an internal queue for mapping/unmapping non-sparse BOs */
+   uint32_t vm_bind_queue_id;
 };
 
 VK_DEFINE_HANDLE_CASTS(tu_device, vk.base, VkDevice, VK_OBJECT_TYPE_DEVICE)
diff --git a/src/freedreno/vulkan/tu_knl_drm.cc b/src/freedreno/vulkan/tu_knl_drm.cc
index ab06b8d53be..d50b5da3064 100644
--- a/src/freedreno/vulkan/tu_knl_drm.cc
+++ b/src/freedreno/vulkan/tu_knl_drm.cc
@@ -122,6 +122,8 @@ msm_submit_add_entries(struct tu_device *device, void *_submit,
    struct tu_msm_queue_submit *submit =
       (struct tu_msm_queue_submit *)_submit;
 
+   bool has_vm_bind = device->physical_device->has_vm_bind;
+
    struct drm_msm_gem_submit_cmd *cmds = (struct drm_msm_gem_submit_cmd *)
       util_dynarray_grow(&submit->commands, struct drm_msm_gem_submit_cmd,
                         num_entries);
@@ -132,12 +134,15 @@ msm_submit_add_entries(struct tu_device *device, void *_submit,
 
    for (unsigned i = 0; i < num_entries; i++) {
       cmds[i].type = MSM_SUBMIT_CMD_BUF;
-      cmds[i].submit_idx = entries[i].bo->submit_bo_list_idx;
-      cmds[i].submit_offset = entries[i].offset;
+      cmds[i].submit_idx = has_vm_bind ? 0 : entries[i].bo->submit_bo_list_idx;
+      cmds[i].submit_offset = has_vm_bind ? 0 : entries[i].offset;
       cmds[i].size = entries[i].size;
       cmds[i].pad = 0;
       cmds[i].nr_relocs = 0;
-      cmds[i].relocs = 0;
+      if (has_vm_bind)
+         cmds[i].iova = entries[i].bo->iova + entries[i].offset;
+      else
+         cmds[i].relocs = 0;
       bos[i] = entries[i].bo;
    }
 }
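With VM_BIND, the hunk above stops describing command buffers relative to a per-submit BO table: there is no table for the kernel to index, so each command instead carries the absolute GPU address of its command stream. A minimal sketch of the two addressing modes, using stand-in types rather than the real tu_cs_entry and drm_msm_gem_submit_cmd definitions:

   #include <stdbool.h>
   #include <stdint.h>

   struct bo_stub    { uint32_t submit_bo_list_idx; uint64_t iova; };
   struct entry_stub { struct bo_stub *bo; uint64_t offset; uint32_t size; };
   struct cmd_stub {
      uint32_t submit_idx;    /* legacy: index into the submit's BO table */
      uint64_t submit_offset; /* legacy: offset inside that BO */
      uint64_t iova;          /* VM_BIND: absolute GPU virtual address */
      uint32_t size;
   };

   static void
   fill_cmd(struct cmd_stub *cmd, const struct entry_stub *e, bool has_vm_bind)
   {
      if (has_vm_bind) {
         /* No BO table exists, so point at the mapping directly. */
         cmd->submit_idx = 0;
         cmd->submit_offset = 0;
         cmd->iova = e->bo->iova + e->offset;
      } else {
         cmd->submit_idx = e->bo->submit_bo_list_idx;
         cmd->submit_offset = e->offset;
      }
      cmd->size = e->size;
   }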
diff --git a/src/freedreno/vulkan/tu_knl_drm_msm.cc b/src/freedreno/vulkan/tu_knl_drm_msm.cc
index 701ed7580e6..8c7079f9fae 100644
--- a/src/freedreno/vulkan/tu_knl_drm_msm.cc
+++ b/src/freedreno/vulkan/tu_knl_drm_msm.cc
@@ -18,6 +18,7 @@
 #include "util/u_debug.h"
 #include "util/u_process.h"
 #include "util/hash_table.h"
+#include "util/libsync.h"
 
 #include "tu_cmd_buffer.h"
 #include "tu_cs.h"
@@ -142,6 +143,12 @@ tu_drm_set_param(int fd, uint32_t param, uint64_t value, uint32_t len)
    return ret;
 }
 
+static int
+tu_try_enable_vm_bind(int fd)
+{
+   return tu_drm_set_param(fd, MSM_PARAM_EN_VM_BIND, 1, 0);
+}
+
 static void
 tu_drm_set_debuginfo(int fd)
 {
@@ -240,9 +247,35 @@ msm_device_init(struct tu_device *dev)
                                "failed to open device %s", dev->physical_device->fd_path);
    }
 
+   int ret;
+   if (dev->physical_device->has_vm_bind) {
+      ret = tu_try_enable_vm_bind(fd);
+      if (ret != 0) {
+         close(fd);
+         return vk_startup_errorf(dev->physical_device->instance,
+                                  VK_ERROR_INITIALIZATION_FAILED,
+                                  "Failed to enable VM_BIND mode: %d", ret);
+      }
+
+      struct drm_msm_submitqueue submit_req = {
+         .flags = MSM_SUBMITQUEUE_VM_BIND,
+      };
+
+      ret = drmCommandWriteRead(fd, DRM_MSM_SUBMITQUEUE_NEW, &submit_req,
+                                sizeof(submit_req));
+      if (ret != 0) {
+         close(fd);
+         return vk_startup_errorf(dev->physical_device->instance,
+                                  VK_ERROR_INITIALIZATION_FAILED,
+                                  "Failed to create VM_BIND queue: %d", ret);
+      }
+
+      dev->vm_bind_queue_id = submit_req.id;
+   }
+
    tu_drm_set_debuginfo(fd);
 
-   int ret = tu_drm_get_param(fd, MSM_PARAM_FAULTS, &dev->fault_count);
+   ret = tu_drm_get_param(fd, MSM_PARAM_FAULTS, &dev->fault_count);
    if (ret != 0) {
       close(fd);
       return vk_startup_errorf(dev->physical_device->instance,
@@ -525,35 +557,74 @@ tu_allocate_kernel_iova(struct tu_device *dev,
 }
 
 static VkResult
-tu_bo_init(struct tu_device *dev,
-           struct vk_object_base *base,
-           struct tu_bo *bo,
-           uint32_t gem_handle,
-           uint64_t size,
-           uint64_t client_iova,
-           enum tu_bo_alloc_flags flags,
-           const char *name)
+tu_map_vm_bind(struct tu_device *dev, uint32_t map_op, uint32_t map_op_flags,
+               uint64_t iova, uint32_t gem_handle, uint64_t bo_offset,
+               uint64_t range)
 {
-   VkResult result = VK_SUCCESS;
-   uint64_t iova = 0;
+   struct drm_msm_vm_bind req = {
+      .flags = MSM_VM_BIND_FENCE_FD_OUT,
+      .nr_ops = 1,
+      .queue_id = dev->vm_bind_queue_id,
+      .op_stride = sizeof(drm_msm_vm_bind_op),
+      .op = {
+         .op = map_op,
+         .handle = gem_handle,
+         .obj_offset = bo_offset,
+         .iova = iova,
+         .range = range,
+         .flags = map_op_flags,
+      },
+   };
 
-   assert(!client_iova || dev->physical_device->has_set_iova);
+   int ret = drmCommandWriteRead(dev->fd,
+                                 DRM_MSM_VM_BIND,
+                                 &req, sizeof(req));
 
-   if (dev->physical_device->has_set_iova) {
-      result = msm_allocate_userspace_iova_locked(dev, gem_handle, size,
-                                                  client_iova, flags, &iova);
-   } else {
-      result = tu_allocate_kernel_iova(dev, gem_handle, &iova);
-   }
+   /* When failing to map a BO, the kernel marks the VM as dead */
+   if (ret)
+      return vk_device_set_lost(&dev->vk, "BO map failed: %m");
 
-   if (result != VK_SUCCESS) {
-      tu_gem_close(dev, gem_handle);
+   int old_fence;
+   u_rwlock_wrlock(&dev->vm_bind_fence_lock);
+   old_fence = dev->vm_bind_fence_fd;
+   dev->vm_bind_fence_fd = req.fence_fd;
+   u_rwlock_wrunlock(&dev->vm_bind_fence_lock);
+
+   if (old_fence != -1)
+      close(old_fence);
+
+   return VK_SUCCESS;
+}
+
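tu_map_vm_bind keeps only the newest FENCE_FD_OUT fence because all bind ops go through a single in-order submit queue: once the newest fence signals, every older one must already have signalled, so the superseded fd can simply be closed. A minimal sketch of that swap-and-close, with pthread locking standing in for u_rwlock:

   #include <pthread.h>
   #include <unistd.h>

   struct bind_fence {
      pthread_rwlock_t lock;
      int fd; /* -1 while no bind op is pending */
   };

   static void
   bind_fence_replace(struct bind_fence *bf, int new_fd)
   {
      pthread_rwlock_wrlock(&bf->lock);
      int old_fd = bf->fd;
      bf->fd = new_fd;
      pthread_rwlock_unlock(&bf->lock);

      /* In-order queue: new_fd cannot signal before old_fd, so the old
       * fence carries no extra information and can be dropped. */
      if (old_fd != -1)
         close(old_fd);
   }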
+static VkResult
+msm_allocate_vm_bind(struct tu_device *dev,
+                     uint32_t gem_handle,
+                     uint64_t size,
+                     uint64_t client_iova,
+                     enum tu_bo_alloc_flags flags,
+                     uint64_t *iova)
+{
+   VkResult result;
+
+   *iova = 0;
+
+   result = tu_allocate_userspace_iova(dev, size, client_iova, flags, iova);
+
+   if (result != VK_SUCCESS)
       return result;
-   }
 
-   name = tu_debug_bos_add(dev, size, name);
+   uint32_t map_op_flags = 0;
+   if (flags & TU_BO_ALLOC_ALLOW_DUMP)
+      map_op_flags |= MSM_VM_BIND_OP_DUMP;
 
+   return tu_map_vm_bind(dev, MSM_VM_BIND_OP_MAP, map_op_flags, *iova,
+                         gem_handle, 0, size);
+}
 
-   mtx_lock(&dev->bo_mutex);
+static VkResult
+tu_bo_add_to_bo_list(struct tu_device *dev,
+                     uint32_t gem_handle, uint32_t flags, uint64_t iova,
+                     uint32_t *bo_list_idx)
+{
    uint32_t idx = dev->submit_bo_count++;
 
    /* grow the bo list if needed */
@@ -564,10 +635,6 @@ tu_bo_init(struct tu_device *dev,
                                  8, VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
       if (!new_ptr) {
          dev->submit_bo_count--;
-         mtx_unlock(&dev->bo_mutex);
-         if (dev->physical_device->has_set_iova)
-            util_vma_heap_free(&dev->vma, iova, size);
-         tu_gem_close(dev, gem_handle);
          return VK_ERROR_OUT_OF_HOST_MEMORY;
       }
 
@@ -588,6 +655,58 @@ tu_bo_init(struct tu_device *dev,
    if (implicit_sync)
       dev->implicit_sync_bo_count++;
 
+   *bo_list_idx = idx;
+   return VK_SUCCESS;
+}
+
+static VkResult
+tu_bo_init(struct tu_device *dev,
+           struct vk_object_base *base,
+           struct tu_bo *bo,
+           uint32_t gem_handle,
+           uint64_t size,
+           uint64_t client_iova,
+           enum tu_bo_alloc_flags flags,
+           const char *name)
+{
+   VkResult result = VK_SUCCESS;
+   uint64_t iova = 0;
+
+   assert(!client_iova || dev->physical_device->has_set_iova);
+
+   if (dev->physical_device->has_vm_bind) {
+      result = msm_allocate_vm_bind(dev, gem_handle, size, client_iova, flags,
+                                    &iova);
+   } else if (dev->physical_device->has_set_iova) {
+      result = msm_allocate_userspace_iova_locked(dev, gem_handle, size,
+                                                  client_iova, flags, &iova);
+   } else {
+      result = tu_allocate_kernel_iova(dev, gem_handle, &iova);
+   }
+
+   if (result != VK_SUCCESS) {
+      tu_gem_close(dev, gem_handle);
+      return result;
+   }
+
+   name = tu_debug_bos_add(dev, size, name);
+
+   uint32_t idx = 0;
+
+   if (!dev->physical_device->has_vm_bind) {
+      mtx_lock(&dev->bo_mutex);
+
+      result = tu_bo_add_to_bo_list(dev, gem_handle, flags, iova, &idx);
+      if (result != VK_SUCCESS) {
+         mtx_unlock(&dev->bo_mutex);
+         if (dev->physical_device->has_set_iova)
+            util_vma_heap_free(&dev->vma, iova, size);
+         tu_gem_close(dev, gem_handle);
+         return result;
+      }
+   }
+
+   bool implicit_sync = flags & TU_BO_ALLOC_IMPLICIT_SYNC;
    *bo = (struct tu_bo) {
       .gem_handle = gem_handle,
       .size = size,
@@ -599,7 +718,8 @@ tu_bo_init(struct tu_device *dev,
       .base = base,
    };
 
-   mtx_unlock(&dev->bo_mutex);
+   if (!dev->physical_device->has_vm_bind)
+      mtx_unlock(&dev->bo_mutex);
 
    tu_dump_bo_init(dev, bo);
 
@@ -682,6 +802,9 @@ msm_bo_init(struct tu_device *dev,
    if (flags & TU_BO_ALLOC_GPU_READ_ONLY)
       req.flags |= MSM_BO_GPU_READONLY;
 
+   if (dev->physical_device->has_vm_bind && !(flags & TU_BO_ALLOC_SHAREABLE))
+      req.flags |= MSM_BO_NO_SHARE;
+
    int ret = drmCommandWriteRead(dev->fd,
                                  DRM_MSM_GEM_NEW, &req, sizeof(req));
    if (ret)
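MSM_BO_NO_SHARE marks the BO as private to this process's VM, letting the kernel treat it as never exportable and skip the associated bookkeeping; only allocations that might be exported leave it clear. A compile-only sketch of the flag translation, assuming a msm_drm.h that already defines the new flag, with stand-ins for the tu_bo_alloc_flags bits:

   #include <stdbool.h>
   #include <stdint.h>
   #include "drm-uapi/msm_drm.h" /* MSM_BO_GPU_READONLY, MSM_BO_NO_SHARE */

   enum alloc_flags { /* stand-ins for the tu_bo_alloc_flags bits used here */
      ALLOC_GPU_READ_ONLY = 1 << 0,
      ALLOC_SHAREABLE     = 1 << 1,
   };

   static uint32_t
   gem_new_flags(uint32_t cache_bits, uint32_t flags, bool has_vm_bind)
   {
      uint32_t f = cache_bits; /* caching mode picked elsewhere */
      if (flags & ALLOC_GPU_READ_ONLY)
         f |= MSM_BO_GPU_READONLY;
      /* A BO that can never be exported may live purely in this VM. */
      if (has_vm_bind && !(flags & ALLOC_SHAREABLE))
         f |= MSM_BO_NO_SHARE;
      return f;
   }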
@@ -809,9 +932,14 @@ msm_bo_map(struct tu_device *dev, struct tu_bo *bo, void *placed_addr)
 static void
 msm_bo_allow_dump(struct tu_device *dev, struct tu_bo *bo)
 {
-   mtx_lock(&dev->bo_mutex);
-   dev->submit_bo_list[bo->submit_bo_list_idx].flags |= MSM_SUBMIT_BO_DUMP;
-   mtx_unlock(&dev->bo_mutex);
+   if (dev->physical_device->has_vm_bind) {
+      tu_map_vm_bind(dev, MSM_VM_BIND_OP_MAP, MSM_VM_BIND_OP_DUMP,
+                     bo->iova, bo->gem_handle, 0, bo->size);
+   } else {
+      mtx_lock(&dev->bo_mutex);
+      dev->submit_bo_list[bo->submit_bo_list_idx].flags |= MSM_SUBMIT_BO_DUMP;
+      mtx_unlock(&dev->bo_mutex);
+   }
 }
 
@@ -853,6 +981,23 @@ msm_bo_get_metadata(struct tu_device *dev, struct tu_bo *bo,
    return ret;
 }
 
+static void
+msm_bo_gem_close(struct tu_device *dev, struct tu_bo *bo)
+{
+   /* Our BO structs are stored in a sparse array in the physical device,
+    * so we don't want to free the BO pointer, instead we want to reset it
+    * to 0, to signal that array entry as being free.
+    */
+   uint32_t gem_handle = bo->gem_handle;
+   memset(bo, 0, sizeof(*bo));
+
+   struct drm_gem_close req = {
+      .handle = gem_handle,
+   };
+
+   drmIoctl(dev->fd, DRM_IOCTL_GEM_CLOSE, &req);
+}
+
 static void
 msm_bo_finish(struct tu_device *dev, struct tu_bo *bo)
 {
@@ -875,23 +1020,27 @@ msm_bo_finish(struct tu_device *dev, struct tu_bo *bo)
 
    TU_RMV(bo_destroy, dev, bo);
 
-   tu_bo_list_del(dev, bo);
+   if (dev->physical_device->has_vm_bind) {
+      tu_map_vm_bind(dev, MSM_VM_BIND_OP_UNMAP, 0, bo->iova, 0, 0,
+                     bo->size);
 
-   if (dev->physical_device->has_set_iova) {
+      mtx_lock(&dev->bo_mutex);
+      if (bo->implicit_sync)
+         dev->implicit_sync_bo_count--;
+      mtx_unlock(&dev->bo_mutex);
+
+      mtx_lock(&dev->vma_mutex);
+      util_vma_heap_free(&dev->vma, bo->iova, bo->size);
+      mtx_unlock(&dev->vma_mutex);
+
+      msm_bo_gem_close(dev, bo);
+   } else if (dev->physical_device->has_set_iova) {
+      tu_bo_list_del(dev, bo);
       tu_bo_make_zombie(dev, bo);
    } else {
-      /* Our BO structs are stored in a sparse array in the physical device,
-       * so we don't want to free the BO pointer, instead we want to reset it
-       * to 0, to signal that array entry as being free.
-       */
-      uint32_t gem_handle = bo->gem_handle;
-      memset(bo, 0, sizeof(*bo));
+      tu_bo_list_del(dev, bo);
 
-      struct drm_gem_close req = {
-         .handle = gem_handle,
-      };
-
-      drmIoctl(dev->fd, DRM_IOCTL_GEM_CLOSE, &req);
+      msm_bo_gem_close(dev, bo);
    }
 
    u_rwlock_rdunlock(&dev->dma_bo_lock);
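Two details above are worth spelling out. msm_bo_finish can return the BO's VMA to the heap immediately after queuing the UNMAP, because any later MAP that recycles the range is submitted to the same in-order bind queue and therefore executes after it. And msm_bo_gem_close depends on the convention that a zero-filled slot in the device's BO sparse array means "free": entries are never deallocated individually, only zeroed. A minimal sketch of that slot convention, assuming Mesa's util/sparse_array.h and an illustrative struct in place of struct tu_bo:

   #include <stdint.h>
   #include <string.h>
   #include "util/sparse_array.h" /* util_sparse_array_get */

   struct bo_slot { uint32_t gem_handle; uint64_t iova, size; };

   static struct bo_slot *
   bo_slot_lookup(struct util_sparse_array *map, uint32_t gem_handle)
   {
      /* Slots are materialized on first access and start out zeroed,
       * so gem_handle == 0 doubles as the "free" marker. */
      return (struct bo_slot *)util_sparse_array_get(map, gem_handle);
   }

   static void
   bo_slot_release(struct bo_slot *slot)
   {
      /* Never freed: zeroing returns the slot to the allocator. */
      memset(slot, 0, sizeof(*slot));
   }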
@@ -912,6 +1061,7 @@ msm_queue_submit(struct tu_queue *queue, void *_submit,
    uint64_t gpu_offset = 0;
    uint32_t entry_count =
       util_dynarray_num_elements(&submit->commands, struct drm_msm_gem_submit_cmd);
+   bool has_vm_bind = queue->device->physical_device->has_vm_bind;
 #if HAVE_PERFETTO
    struct tu_perfetto_clocks clocks;
    uint64_t start_ts = tu_perfetto_begin_submit();
@@ -967,39 +1117,47 @@ msm_queue_submit(struct tu_queue *queue, void *_submit,
    if (signal_count)
       flags |= MSM_SUBMIT_SYNCOBJ_OUT;
 
-   mtx_lock(&queue->device->bo_mutex);
+   if (has_vm_bind) {
+      u_rwlock_rdlock(&queue->device->vm_bind_fence_lock);
 
-   /* MSM_SUBMIT_NO_IMPLICIT skips having the scheduler wait on the previous dma
-    * fences attached to the BO (such as from the window system server's command
-    * queue) before submitting the job. Our fence will always get attached to
-    * the BO, because it gets used for synchronization for the shrinker.
-    *
-    * If the flag is not set, then the kernel falls back to checking each BO's
-    * MSM_SUBMIT_NO_IMPLICIT flag for its implicit sync handling.
-    *
-    * As of kernel 6.0, the core wsi code will be generating appropriate syncobj
-    * export-and-waits/signal-and-imports for implict syncing (on implicit sync
-    * WSI backends) and not allocating any
-    * wsi_memory_allocate_info->implicit_sync BOs from the driver. However, on
-    * older kernels with that flag set, we have to submit without NO_IMPLICIT
-    * set to do have the kernel do pre-submit waits on whatever the last fence
-    * was.
-    */
-   if (queue->device->implicit_sync_bo_count == 0)
-      flags |= MSM_SUBMIT_NO_IMPLICIT;
+      if (queue->device->vm_bind_fence_fd != -1)
+         flags |= MSM_SUBMIT_FENCE_FD_IN;
+   } else {
+      mtx_lock(&queue->device->bo_mutex);
 
-   /* drm_msm_gem_submit_cmd requires index of bo which could change at any
-    * time when bo_mutex is not locked. So we update the index here under the
-    * lock.
-    */
-   util_dynarray_foreach (&submit->commands, struct drm_msm_gem_submit_cmd,
-                          cmd) {
-      unsigned i = cmd -
-                   util_dynarray_element(&submit->commands,
-                                         struct drm_msm_gem_submit_cmd, 0);
-      struct tu_bo **bo = util_dynarray_element(&submit->command_bos,
-                                                struct tu_bo *, i);
-      cmd->submit_idx = (*bo)->submit_bo_list_idx;
+      /* MSM_SUBMIT_NO_IMPLICIT skips having the scheduler wait on the
+       * previous dma fences attached to the BO (such as from the window
+       * system server's command queue) before submitting the job. Our fence
+       * will always get attached to the BO, because it gets used for
+       * synchronization for the shrinker.
+       *
+       * If the flag is not set, then the kernel falls back to checking each
+       * BO's MSM_SUBMIT_NO_IMPLICIT flag for its implicit sync handling.
+       *
+       * As of kernel 6.0, the core wsi code will be generating appropriate
+       * syncobj export-and-waits/signal-and-imports for implicit syncing (on
+       * implicit sync WSI backends) and not allocating any
+       * wsi_memory_allocate_info->implicit_sync BOs from the driver. However,
+       * on older kernels with that flag set, we have to submit without
+       * NO_IMPLICIT set to have the kernel do pre-submit waits on whatever
+       * the last fence was.
+       */
+      if (queue->device->implicit_sync_bo_count == 0)
+         flags |= MSM_SUBMIT_NO_IMPLICIT;
+
+      /* drm_msm_gem_submit_cmd requires index of bo which could change at any
+       * time when bo_mutex is not locked. So we update the index here under
+       * the lock.
+       */
+      util_dynarray_foreach (&submit->commands, struct drm_msm_gem_submit_cmd,
+                             cmd) {
+         unsigned i = cmd -
+                      util_dynarray_element(&submit->commands,
+                                            struct drm_msm_gem_submit_cmd, 0);
+         struct tu_bo **bo = util_dynarray_element(&submit->command_bos,
+                                                   struct tu_bo *, i);
+         cmd->submit_idx = (*bo)->submit_bo_list_idx;
+      }
    }
 
    req = (struct drm_msm_gem_submit) {
@@ -1008,6 +1166,7 @@ msm_queue_submit(struct tu_queue *queue, void *_submit,
       .flags = flags,
       .nr_cmds = entry_count,
       .bos = (uint64_t)(uintptr_t) queue->device->submit_bo_list,
       .cmds = (uint64_t)(uintptr_t)submit->commands.data,
+      .fence_fd = queue->device->vm_bind_fence_fd,
       .queueid = queue->msm_queue_id,
       .in_syncobjs = (uint64_t)(uintptr_t)in_syncobjs,
       .out_syncobjs = (uint64_t)(uintptr_t)out_syncobjs,
@@ -1023,7 +1182,10 @@ msm_queue_submit(struct tu_queue *queue, void *_submit,
                               &req, sizeof(req));
    }
 
-   mtx_unlock(&queue->device->bo_mutex);
+   if (has_vm_bind)
+      u_rwlock_rdunlock(&queue->device->vm_bind_fence_lock);
+   else
+      mtx_unlock(&queue->device->bo_mutex);
 
    if (ret) {
       result = vk_device_set_lost(&queue->device->vk, "submit failed: %m");
@@ -1111,6 +1273,8 @@ tu_knl_drm_msm_load(struct tu_instance *instance,
    device->instance = instance;
    device->local_fd = fd;
 
+   device->has_vm_bind = tu_try_enable_vm_bind(fd) == 0;
+
    if (tu_drm_get_gpu_id(device, &device->dev_id.gpu_id)) {
       result = vk_startup_errorf(instance, VK_ERROR_INITIALIZATION_FAILED,
                                  "could not get GPU ID");
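End to end, ordering between binds and rendering works like this: each map/unmap publishes its out-fence under the write side of vm_bind_fence_lock, and every render submit reads the fd under the read side and hands it to the kernel as an in-fence, holding the lock across the ioctl so the fd cannot be closed underneath it. A minimal sketch of that reader side, with stand-in names for everything except the MSM_SUBMIT_FENCE_FD_IN semantics:

   #include <pthread.h>
   #include <stdint.h>

   #define FENCE_FD_IN_FLAG (1u << 0) /* stand-in for MSM_SUBMIT_FENCE_FD_IN */

   struct bind_fence { pthread_rwlock_t lock; int fd; /* -1: none pending */ };
   struct submit_stub { uint32_t flags; int fence_fd; };

   static int
   submit_after_binds(struct bind_fence *bf, struct submit_stub *req,
                      int (*submit_ioctl)(struct submit_stub *))
   {
      /* Hold the lock across the ioctl so a concurrent map/unmap cannot
       * swap in a newer fence and close the fd we are passing down. */
      pthread_rwlock_rdlock(&bf->lock);
      if (bf->fd != -1) {
         req->flags |= FENCE_FD_IN_FLAG;
         req->fence_fd = bf->fd;
      }
      int ret = submit_ioctl(req);
      pthread_rwlock_unlock(&bf->lock);
      return ret;
   }

Read locks allow queues to submit concurrently; only a bind op briefly takes the write side to publish its new fence.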