From 71ef46717cd091f0a12a12cf5a8da3ec8a6fe96a Mon Sep 17 00:00:00 2001
From: Connor Abbott
Date: Fri, 6 Dec 2024 13:24:43 -0500
Subject: [PATCH] tu/kgsl: Add support for sparse binding

Use the "virtual BO" interface.
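
Sparse VMAs are implemented on top of kgsl's "virtual BO" interface:
IOCTL_KGSL_GPUOBJ_ALLOC with KGSL_MEMFLAGS_VBO allocates a GPU address
range with no backing memory of its own (with unbound ranges reading as
zero unless KGSL_MEMFLAGS_VBO_NO_MAP_ZERO is set), and ranges of
ordinary BOs are then bound into it with IOCTL_KGSL_GPUMEM_BIND_RANGES.
Binds queued on a sparse queue are instead submitted as
KGSL_GPU_AUX_COMMAND_BIND aux commands, which return a timestamp so
that fences can be signaled the same way as for a normal command
submission.

A rough sketch of the immediate (non-queued) path, with error handling
omitted and fd/size/bo_id/bo_offset/vma_offset/length standing in for
the real parameters:

   /* Reserve a GPU VA range only; memory is bound into it later. */
   struct kgsl_gpuobj_alloc alloc = {
      .size = size,
      .flags = KGSL_MEMFLAGS_VBO,
   };
   safe_ioctl(fd, IOCTL_KGSL_GPUOBJ_ALLOC, &alloc);

   /* Bind one range of a normal BO into the virtual BO. */
   struct kgsl_gpumem_bind_range range = {
      .child_id = bo_id,           /* BO that provides the pages */
      .child_offset = bo_offset,
      .target_offset = vma_offset,
      .length = length,
      .op = KGSL_GPUMEM_RANGE_OP_BIND,
   };
   struct kgsl_gpumem_bind_ranges bind = {
      .id = alloc.id,              /* the virtual BO */
      .ranges = (uint64_t)(uintptr_t)&range,
      .ranges_nents = 1,
      .ranges_size = sizeof(range),
   };
   safe_ioctl(fd, IOCTL_KGSL_GPUMEM_BIND_RANGES, &bind);
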
Part-of:
---
 src/freedreno/vulkan/tu_knl_kgsl.cc | 379 ++++++++++++++++++++++++----
 1 file changed, 333 insertions(+), 46 deletions(-)

diff --git a/src/freedreno/vulkan/tu_knl_kgsl.cc b/src/freedreno/vulkan/tu_knl_kgsl.cc
index 9b2f4c102d4..28b5acff0cd 100644
--- a/src/freedreno/vulkan/tu_knl_kgsl.cc
+++ b/src/freedreno/vulkan/tu_knl_kgsl.cc
@@ -166,6 +166,37 @@ bo_init_new_ion_legacy(struct tu_device *dev, struct tu_bo **out_bo, uint64_t si
    return tu_bo_init_dmabuf(dev, out_bo, -1, share.fd);
 }
 
+static VkResult
+kgsl_bo_user_map(struct tu_device *dev, struct tu_bo *bo, uint64_t client_iova)
+{
+   uint64_t offset = bo->gem_handle << 12;
+   void *map = mmap((void *)client_iova, bo->size, PROT_READ | PROT_WRITE,
+                    MAP_SHARED, dev->physical_device->local_fd, offset);
+   if (map == MAP_FAILED) {
+      kgsl_bo_finish(dev, bo);
+
+      return vk_errorf(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY,
+                       "mmap failed (%s)", strerror(errno));
+   }
+
+   if (client_iova && (uint64_t)map != client_iova) {
+      kgsl_bo_finish(dev, bo);
+
+      return vk_errorf(dev, VK_ERROR_INVALID_OPAQUE_CAPTURE_ADDRESS,
+                       "mmap could not map the given address");
+   }
+
+   bo->map = map;
+   bo->iova = (uint64_t)map;
+
+   /* Because we're using SVM, the CPU mapping and GPU mapping are the same
+    * and the CPU mapping must stay fixed for the lifetime of the BO.
+    */
+   bo->never_unmap = true;
+
+   return VK_SUCCESS;
+}
+
 static VkResult
 kgsl_bo_init(struct tu_device *dev,
              struct vk_object_base *base,
@@ -239,30 +270,9 @@ kgsl_bo_init(struct tu_device *dev,
    };
 
    if (flags & TU_BO_ALLOC_REPLAYABLE) {
-      uint64_t offset = req.id << 12;
-      void *map = mmap((void *)client_iova, bo->size, PROT_READ | PROT_WRITE,
-                       MAP_SHARED, dev->physical_device->local_fd, offset);
-      if (map == MAP_FAILED) {
-         kgsl_bo_finish(dev, bo);
-
-         return vk_errorf(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY,
-                          "mmap failed (%s)", strerror(errno));
-      }
-
-      if (client_iova && (uint64_t)map != client_iova) {
-         kgsl_bo_finish(dev, bo);
-
-         return vk_errorf(dev, VK_ERROR_INVALID_OPAQUE_CAPTURE_ADDRESS,
-                          "mmap could not map the given address");
-      }
-
-      bo->map = map;
-      bo->iova = (uint64_t)map;
-
-      /* Because we're using SVM, the CPU mapping and GPU mapping are the same
-       * and the CPU mapping must stay fixed for the lifetime of the BO.
-       */
-      bo->never_unmap = true;
+      VkResult result = kgsl_bo_user_map(dev, bo, client_iova);
+      if (result != VK_SUCCESS)
+         return result;
    }
 
    tu_dump_bo_init(dev, bo);
@@ -397,6 +407,124 @@ kgsl_bo_finish(struct tu_device *dev, struct tu_bo *bo)
    safe_ioctl(dev->physical_device->local_fd, IOCTL_KGSL_GPUMEM_FREE_ID, &req);
 }
 
+static VkResult
+kgsl_sparse_vma_init(struct tu_device *dev,
+                     struct vk_object_base *base,
+                     struct tu_sparse_vma *out_vma,
+                     uint64_t *out_iova,
+                     enum tu_sparse_vma_flags flags,
+                     uint64_t size, uint64_t client_iova)
+{
+   /* Note: we cannot use kgsl_gpumem_alloc_id because it only has a 32-bit
+    * flags value. kgsl_gpuobj_alloc seems to be the only ioctl we can use.
+    */
+   struct kgsl_gpuobj_alloc req = {
+      .size = size,
+      .flags = KGSL_MEMFLAGS_VBO,
+      .va_len = 0, /* seems to be unused? */
+   };
+
+   if (flags & TU_SPARSE_VMA_REPLAYABLE)
+      req.flags |= KGSL_MEMFLAGS_USE_CPU_MAP;
+
+   if (!(flags & TU_SPARSE_VMA_MAP_ZERO))
+      req.flags |= KGSL_MEMFLAGS_VBO_NO_MAP_ZERO;
+
+   int ret;
+
+   ret = safe_ioctl(dev->physical_device->local_fd,
+                    IOCTL_KGSL_GPUOBJ_ALLOC, &req);
+   if (ret) {
+      return vk_errorf(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY,
+                       "GPUOBJ_ALLOC failed (%s)", strerror(errno));
+   }
+
+   struct tu_bo *bo = tu_device_lookup_bo(dev, req.id);
+   assert(bo && bo->gem_handle == 0);
+
+   *bo = (struct tu_bo) {
+      .gem_handle = req.id,
+      .size = req.mmapsize,
+      .name = NULL,
+      .refcnt = 1,
+      .shared_fd = -1,
+      .base = base,
+   };
+
+   if (flags & TU_SPARSE_VMA_REPLAYABLE) {
+      VkResult result = kgsl_bo_user_map(dev, bo, client_iova);
+      if (result != VK_SUCCESS)
+         return result;
+   } else {
+      /* For some cursed reason, the ioctl doesn't return the GPU address so
+       * we have to query it.
+       */
+      struct kgsl_gpumem_get_info info = {
+         .id = req.id,
+      };
+
+      ret = safe_ioctl(dev->physical_device->local_fd,
+                       IOCTL_KGSL_GPUMEM_GET_INFO, &info);
+      if (ret) {
+         return vk_errorf(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY,
+                          "GPUMEM_GET_INFO failed (%s)", strerror(errno));
+      }
+
+      bo->iova = info.gpuaddr;
+   }
+
+   out_vma->kgsl.virtual_bo = bo;
+   *out_iova = bo->iova;
+   return VK_SUCCESS;
+}
+
+static VkResult
+kgsl_sparse_vma_map(struct tu_device *dev,
+                    struct tu_sparse_vma *vma,
+                    struct tu_bo *bo, uint64_t bo_offset)
+{
+   struct kgsl_gpumem_bind_range range = {
+      .child_offset = bo_offset,
+      .target_offset = 0,
+      .length = vma->kgsl.virtual_bo->size,
+      .child_id = bo->gem_handle,
+      .op = KGSL_GPUMEM_RANGE_OP_BIND,
+   };
+
+   struct kgsl_gpumem_bind_ranges req = {
+      .ranges = (uint64_t)(uintptr_t)&range,
+      .ranges_nents = 1,
+      .ranges_size = sizeof(range),
+      .id = vma->kgsl.virtual_bo->gem_handle,
+      .flags = 0,
+   };
+
+   int ret;
+
+   ret = safe_ioctl(dev->physical_device->local_fd,
+                    IOCTL_KGSL_GPUMEM_BIND_RANGES, &req);
+   if (ret) {
+      return vk_errorf(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY,
+                       "GPUMEM_BIND_RANGES failed (%s)", strerror(errno));
+   }
+
+   return VK_SUCCESS;
+}
+
+static void
+kgsl_sparse_vma_finish(struct tu_device *dev,
+                       struct tu_sparse_vma *vma)
+{
+   struct kgsl_gpuobj_free req = {
+      .id = vma->kgsl.virtual_bo->gem_handle
+   };
+
+   /* Tell sparse array that entry is free */
+   memset(vma->kgsl.virtual_bo, 0, sizeof(*vma->kgsl.virtual_bo));
+
+   safe_ioctl(dev->physical_device->local_fd, IOCTL_KGSL_GPUOBJ_FREE, &req);
+}
+
 static VkResult
 get_kgsl_prop(int fd, unsigned int type, void *value, size_t size)
 {
@@ -431,6 +559,26 @@ kgsl_is_memory_type_supported(int fd, uint32_t flags)
    return true;
 }
 
+static bool
+kgsl_is_virtual_bo_supported(int fd)
+{
+   struct kgsl_gpuobj_alloc req_alloc = {
+      .size = 0x1000,
+      .flags = KGSL_MEMFLAGS_VBO,
+   };
+
+   int ret = safe_ioctl(fd, IOCTL_KGSL_GPUOBJ_ALLOC, &req_alloc);
+   if (ret) {
+      return false;
+   }
+
+   struct kgsl_gpuobj_free req_free = { .id = req_alloc.id };
+
+   safe_ioctl(fd, IOCTL_KGSL_GPUOBJ_FREE, &req_free);
+
+   return true;
+}
+
 enum kgsl_syncobj_state {
    KGSL_SYNCOBJ_STATE_UNSIGNALED,
    KGSL_SYNCOBJ_STATE_SIGNALED,
@@ -1047,6 +1195,10 @@ const struct vk_sync_type vk_kgsl_sync_type = {
 
 struct tu_kgsl_queue_submit {
    struct util_dynarray commands;
+   struct util_dynarray ranges;
+   struct util_dynarray bind_cmds;
+   struct tu_sparse_vma *cur_vma;
+   unsigned cur_vma_range_start;
 };
 
 static void *
@@ -1064,6 +1216,8 @@ kgsl_submit_finish(struct tu_device *device,
       (struct tu_kgsl_queue_submit *)_submit;
 
    util_dynarray_fini(&submit->commands);
+   util_dynarray_fini(&submit->ranges);
+   util_dynarray_fini(&submit->bind_cmds);
 
    vk_free(&device->vk.alloc, submit);
 }
@@ -1088,6 +1242,76 @@ kgsl_submit_add_entries(struct tu_device *device, void *_submit,
    }
 }
 
+static void
+kgsl_submit_add_bind(struct tu_device *device,
+                     void *_submit,
+                     struct tu_sparse_vma *vma, uint64_t vma_offset,
+                     struct tu_bo *bo, uint64_t bo_offset,
+                     uint64_t size)
+{
+   struct tu_kgsl_queue_submit *submit =
+      (struct tu_kgsl_queue_submit *)_submit;
+
+   if (vma != submit->cur_vma) {
+      unsigned range_count =
+         util_dynarray_num_elements(&submit->ranges,
+                                    struct kgsl_gpumem_bind_range);
+      if (submit->cur_vma) {
+         struct kgsl_gpu_aux_command_bind *last_bind =
+            util_dynarray_top_ptr(&submit->bind_cmds,
+                                  struct kgsl_gpu_aux_command_bind);
+         last_bind->numranges = range_count - submit->cur_vma_range_start;
+      }
+
+      struct kgsl_gpu_aux_command_bind bind = {
+         .rangeslist = submit->ranges.size,
+         .numranges = 0,
+         .rangesize = sizeof(struct kgsl_gpumem_bind_range),
+         .target = vma->kgsl.virtual_bo->gem_handle,
+      };
+
+
+      util_dynarray_append(&submit->bind_cmds,
+                           struct kgsl_gpu_aux_command_bind, bind);
+
+      submit->cur_vma = vma;
+      submit->cur_vma_range_start = range_count;
+   }
+
+   struct kgsl_gpumem_bind_range range = {
+      .child_offset = bo_offset,
+      .target_offset = vma_offset,
+      .length = size,
+      .child_id = bo ? bo->gem_handle : 0,
+      .op = bo ? KGSL_GPUMEM_RANGE_OP_BIND : KGSL_GPUMEM_RANGE_OP_UNBIND,
+   };
+
+   util_dynarray_append(&submit->ranges, struct kgsl_gpumem_bind_range,
+                        range);
+}
+
+/* We don't know the actual CPU pointer of the ranges array until we've
+ * finished adding all the bind commands, so we store offsets from its base
+ * instead and patch in the real pointer once all the ranges have been
+ * added. We also need to fill in the range count of the last bind command.
+ */
+static void
+kgsl_bind_finalize(struct tu_kgsl_queue_submit *submit)
+{
+   unsigned range_count =
+      util_dynarray_num_elements(&submit->ranges,
+                                 struct kgsl_gpumem_bind_range);
+   struct kgsl_gpu_aux_command_bind *last_bind =
+      util_dynarray_top_ptr(&submit->bind_cmds,
+                            struct kgsl_gpu_aux_command_bind);
+   last_bind->numranges = range_count - submit->cur_vma_range_start;
+
+   util_dynarray_foreach (&submit->bind_cmds,
+                          struct kgsl_gpu_aux_command_bind, bind) {
+      bind->rangeslist += (uint64_t)(uintptr_t)submit->ranges.data;
+   }
+}
+
 static VkResult
 kgsl_queue_submit(struct tu_queue *queue, void *_submit,
                   struct vk_sync_wait *waits, uint32_t wait_count,
@@ -1164,6 +1388,9 @@ kgsl_queue_submit(struct tu_queue *queue, void *_submit,
 
    VkResult result = VK_SUCCESS;
 
+   if (submit->bind_cmds.size != 0)
+      kgsl_bind_finalize(submit);
+
    if (u_trace_submission_data) {
       mtx_lock(&queue->device->kgsl_profiling_mutex);
       tu_suballoc_bo_alloc(&u_trace_submission_data->kgsl_timestamp_bo,
@@ -1240,29 +1467,76 @@ kgsl_queue_submit(struct tu_queue *queue, void *_submit,
       UNREACHABLE("invalid syncobj state");
    }
 
-   struct kgsl_gpu_command req = {
-      .flags = KGSL_CMDBATCH_SUBMIT_IB_LIST,
-      .cmdlist = (uintptr_t) submit->commands.data,
-      .cmdsize = sizeof(struct kgsl_command_object),
-      .numcmds = util_dynarray_num_elements(&submit->commands,
-                                            struct kgsl_command_object),
-      .synclist = (uintptr_t) &sync,
-      .syncsize = sizeof(sync),
-      .numsyncs = has_sync != 0 ? 1 : 0,
-      .context_id = queue->msm_queue_id,
-   };
+   int ret;
+   uint32_t timestamp = 0;
+   uint64_t gpu_offset = 0;
 
-   if (obj_idx) {
-      req.flags |= KGSL_CMDBATCH_PROFILING;
-      req.objlist = (uintptr_t) objs;
-      req.objsize = sizeof(struct kgsl_command_object);
-      req.numobjs = obj_idx;
+   if (submit->bind_cmds.size == 0) {
+      struct kgsl_gpu_command req = {
+         .flags = KGSL_CMDBATCH_SUBMIT_IB_LIST,
+         .cmdlist = (uintptr_t) submit->commands.data,
+         .cmdsize = sizeof(struct kgsl_command_object),
+         .numcmds = util_dynarray_num_elements(&submit->commands,
+                                               struct kgsl_command_object),
+         .synclist = (uintptr_t) &sync,
+         .syncsize = sizeof(sync),
+         .numsyncs = has_sync != 0 ? 1 : 0,
+         .context_id = queue->msm_queue_id,
+      };
+
+      if (obj_idx) {
+         req.flags |= KGSL_CMDBATCH_PROFILING;
+         req.objlist = (uintptr_t) objs;
+         req.objsize = sizeof(struct kgsl_command_object);
+         req.numobjs = obj_idx;
+      }
+
+      ret = safe_ioctl(queue->device->physical_device->local_fd,
+                       IOCTL_KGSL_GPU_COMMAND, &req);
+
+      timestamp = req.timestamp;
+   } else {
+      /* kgsl doesn't support multiple bind commands at once */
+      uint32_t i = 0;
+      util_dynarray_foreach(&submit->bind_cmds,
+                            struct kgsl_gpu_aux_command_bind, bind) {
+         bool do_sync = has_sync && i == 0;
+
+         struct kgsl_gpu_aux_command_generic aux = {
+            .priv = (uintptr_t) bind,
+            .size = sizeof(*bind),
+            .type = KGSL_GPU_AUX_COMMAND_BIND,
+         };
+
+         uint32_t flags = KGSL_GPU_AUX_COMMAND_BIND;
+         if (do_sync)
+            flags |= KGSL_GPU_AUX_COMMAND_SYNC;
+
+         struct kgsl_gpu_aux_command req = {
+            .flags = flags,
+            .cmdlist = (uintptr_t) &aux,
+            .cmdsize = sizeof(aux),
+            .numcmds = 1,
+            .synclist = (uintptr_t) &sync,
+            .syncsize = sizeof(sync),
+            .numsyncs = do_sync ? 1 : 0,
+            .context_id = queue->msm_queue_id,
+         };
+         ret = safe_ioctl(queue->device->physical_device->local_fd,
+                          IOCTL_KGSL_GPU_AUX_COMMAND, &req);
+
+         if (ret) {
+            result = vk_device_set_lost(&queue->device->vk,
+                                        "bind submit failed: %s\n",
+                                        strerror(errno));
+            goto fail_submit;
+         }
+
+         timestamp = req.timestamp;
+         i++;
+      }
    }
 
-   int ret = safe_ioctl(queue->device->physical_device->local_fd,
-                        IOCTL_KGSL_GPU_COMMAND, &req);
-
-   uint64_t gpu_offset = 0;
 #if HAVE_PERFETTO
    if (profiling_buffer) {
       /* We need to wait for KGSL to queue the GPU command before we can read
@@ -1310,7 +1584,7 @@ kgsl_queue_submit(struct tu_queue *queue, void *_submit,
       goto fail_submit;
    }
 
-   p_atomic_set(&queue->fence, req.timestamp);
+   p_atomic_set(&queue->fence, timestamp);
 
    for (uint32_t i = 0; i < signal_count; i++) {
       struct kgsl_syncobj *signal_sync =
@@ -1320,7 +1594,7 @@ kgsl_queue_submit(struct tu_queue *queue, void *_submit,
       kgsl_syncobj_reset(signal_sync);
       signal_sync->state = KGSL_SYNCOBJ_STATE_TS;
       signal_sync->queue = queue;
-      signal_sync->timestamp = req.timestamp;
+      signal_sync->timestamp = timestamp;
    }
 
    if (u_trace_submission_data) {
@@ -1412,8 +1686,11 @@ static const struct tu_knl kgsl_knl_funcs = {
    .submit_create = kgsl_submit_create,
    .submit_finish = kgsl_submit_finish,
    .submit_add_entries = kgsl_submit_add_entries,
+   .submit_add_bind = kgsl_submit_add_bind,
    .queue_submit = kgsl_queue_submit,
    .queue_wait_fence = kgsl_queue_wait_fence,
+   .sparse_vma_init = kgsl_sparse_vma_init,
+   .sparse_vma_finish = kgsl_sparse_vma_finish,
 };
 
 static bool
@@ -1530,6 +1807,16 @@ tu_knl_kgsl_load(struct tu_instance *instance, int fd)
       fd, KGSL_MEMFLAGS_IOCOHERENT |
             (KGSL_CACHEMODE_WRITEBACK << KGSL_CACHEMODE_SHIFT));
 
+   device->has_sparse = kgsl_is_virtual_bo_supported(fd);
+   device->has_sparse_prr = device->has_sparse;
+   get_kgsl_prop(fd, KGSL_PROP_GPU_VA64_SIZE, &device->va_size,
+                 sizeof(device->va_size));
+   /* We don't actually use the VMA allocator, but set a fake start address
+    * so it doesn't think we're trying to allocate address 0 and assert.
+    */
+   device->va_start = 0x100000000;
+
+
    /* preemption is always supported on kgsl */
    device->has_preemption = true;