diff --git a/docs/envvars.rst b/docs/envvars.rst index 9e90cad565b..98731cb04ee 100644 --- a/docs/envvars.rst +++ b/docs/envvars.rst @@ -193,6 +193,10 @@ Core Mesa environment variables causes the Vulkan driver to call abort() immediately after detecting a lost device. This is extremely useful when testing as it prevents the test suite from continuing on with a lost device. +:envvar:`MESA_VK_ENABLE_SUBMIT_THREAD` + for Vulkan drivers which support real timeline semaphores, this forces + them to use a submit thread from the beginning, regardless of whether or + not they ever see a wait-before-signal condition. :envvar:`MESA_LOADER_DRIVER_OVERRIDE` chooses a different driver binary such as ``etnaviv`` or ``zink``. diff --git a/src/vulkan/runtime/vk_device.c b/src/vulkan/runtime/vk_device.c index a39c56917e3..3bbe89a1381 100644 --- a/src/vulkan/runtime/vk_device.c +++ b/src/vulkan/runtime/vk_device.c @@ -28,11 +28,53 @@ #include "vk_log.h" #include "vk_physical_device.h" #include "vk_queue.h" +#include "vk_sync.h" +#include "vk_sync_timeline.h" #include "vk_util.h" #include "util/debug.h" #include "util/hash_table.h" #include "util/ralloc.h" +static enum vk_device_timeline_mode +get_timeline_mode(struct vk_physical_device *physical_device) +{ + if (physical_device->supported_sync_types == NULL) + return VK_DEVICE_TIMELINE_MODE_NONE; + + const struct vk_sync_type *timeline_type = NULL; + for (const struct vk_sync_type *const *t = + physical_device->supported_sync_types; *t; t++) { + if ((*t)->features & VK_SYNC_FEATURE_TIMELINE) { + /* We can only have one timeline mode */ + assert(timeline_type == NULL); + timeline_type = *t; + } + } + + if (timeline_type == NULL) + return VK_DEVICE_TIMELINE_MODE_NONE; + + if (vk_sync_type_is_vk_sync_timeline(timeline_type)) + return VK_DEVICE_TIMELINE_MODE_EMULATED; + + if (timeline_type->features & VK_SYNC_FEATURE_WAIT_BEFORE_SIGNAL) + return VK_DEVICE_TIMELINE_MODE_NATIVE; + + /* For assisted mode, we require a few additional things of all sync types + * which may be used as semaphores. 
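+    *
+    * Specifically, every sync type the GPU can wait on must support a CPU
+    * wait-for-pending operation (the submit thread and the sync-file export
+    * paths rely on it), and any such binary type must also support CPU reset
+    * so that vkQueueSubmit() can return binary semaphore payloads to the
+    * unsignaled state. The asserts below encode those requirements.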
+    */
+   for (const struct vk_sync_type *const *t =
+        physical_device->supported_sync_types; *t; t++) {
+      if ((*t)->features & VK_SYNC_FEATURE_GPU_WAIT) {
+         assert((*t)->features & VK_SYNC_FEATURE_WAIT_PENDING);
+         if ((*t)->features & VK_SYNC_FEATURE_BINARY)
+            assert((*t)->features & VK_SYNC_FEATURE_CPU_RESET);
+      }
+   }
+
+   return VK_DEVICE_TIMELINE_MODE_ASSISTED;
+}
+
 VkResult
 vk_device_init(struct vk_device *device,
                struct vk_physical_device *physical_device,
@@ -95,6 +137,8 @@ vk_device_init(struct vk_device *device,
 
    device->drm_fd = -1;
 
+   device->timeline_mode = get_timeline_mode(physical_device);
+
 #ifdef ANDROID
    mtx_init(&device->swapchain_private_mtx, mtx_plain);
    device->swapchain_private = NULL;
@@ -120,6 +164,30 @@ vk_device_finish(UNUSED struct vk_device *device)
    vk_object_base_finish(&device->base);
 }
 
+VkResult
+vk_device_flush(struct vk_device *device)
+{
+   if (device->timeline_mode != VK_DEVICE_TIMELINE_MODE_EMULATED)
+      return VK_SUCCESS;
+
+   bool progress;
+   do {
+      progress = false;
+
+      vk_foreach_queue(queue, device) {
+         uint32_t queue_submit_count;
+         VkResult result = vk_queue_flush(queue, &queue_submit_count);
+         if (unlikely(result != VK_SUCCESS))
+            return result;
+
+         if (queue_submit_count)
+            progress = true;
+      }
+   } while (progress);
+
+   return VK_SUCCESS;
+}
+
 void
 _vk_device_report_lost(struct vk_device *device)
 {
diff --git a/src/vulkan/runtime/vk_device.h b/src/vulkan/runtime/vk_device.h
index 14426abe498..85c0cb93bf5 100644
--- a/src/vulkan/runtime/vk_device.h
+++ b/src/vulkan/runtime/vk_device.h
@@ -68,6 +68,61 @@ struct vk_device {
    /* Set by vk_device_set_drm_fd() */
    int drm_fd;
 
+   /** An enum describing how timeline semaphores work */
+   enum vk_device_timeline_mode {
+      /** Timeline semaphores are not supported */
+      VK_DEVICE_TIMELINE_MODE_NONE,
+
+      /** Timeline semaphores are emulated with vk_sync_timeline
+       *
+       * In this mode, timeline semaphores are emulated using
+       * vk_sync_timeline, which is a collection of binary semaphores, one
+       * per time point. These timeline semaphores cannot be shared because
+       * the data structure exists entirely in userspace. These timelines
+       * are virtually invisible to the driver; all it sees are the binary
+       * vk_syncs, one per time point.
+       *
+       * To handle wait-before-signal, we place all vk_queue_submits in the
+       * queue's submit list in vkQueueSubmit() and call vk_device_flush() at
+       * key points such as the end of vkQueueSubmit() and vkSemaphoreSignal().
+       * This ensures that, as soon as a given submit's dependencies are fully
+       * resolvable, it gets submitted to the driver.
+       */
+      VK_DEVICE_TIMELINE_MODE_EMULATED,
+
+      /** Timeline semaphores are a kernel-assisted emulation
+       *
+       * In this mode, timeline semaphores are still technically an emulation
+       * in the sense that they don't support wait-before-signal natively.
+       * Instead, all GPU-waitable objects support a CPU wait-for-pending
+       * operation which lets the userspace driver wait until a given event
+       * on the (possibly shared) vk_sync is pending. The event is "pending"
+       * if a job has been submitted to the kernel (possibly from a different
+       * process) which will signal it. In vkQueueSubmit, we use this wait
+       * mode to detect waits which are not yet pending and, the first time we
+       * do, spawn a thread to manage the queue. That thread waits for each
+       * submit's waits to all be pending before submitting to the driver
+       * queue.
+       *
+       * We have to be a bit more careful about a few things in this mode. 
+       * In particular, we can never assume that any given wait operation is
+       * pending. For instance, when we go to export a sync file from a
+       * binary semaphore, we need to first wait for it to be pending. The
+       * spec guarantees that the vast majority of these waits return almost
+       * immediately, but we do need to insert them for correctness.
+       */
+      VK_DEVICE_TIMELINE_MODE_ASSISTED,
+
+      /** Timeline semaphores are 100% native
+       *
+       * In this mode, wait-before-signal is natively supported by the
+       * underlying timeline implementation. We can submit-and-forget and
+       * assume that dependencies will get resolved for us by the kernel.
+       * Currently, this isn't supported by any Linux primitives.
+       */
+      VK_DEVICE_TIMELINE_MODE_NATIVE,
+   } timeline_mode;
+
 #ifdef ANDROID
    mtx_t swapchain_private_mtx;
    struct hash_table *swapchain_private;
@@ -93,6 +148,8 @@ vk_device_set_drm_fd(struct vk_device *device, int drm_fd)
 void
 vk_device_finish(struct vk_device *device);
 
+VkResult vk_device_flush(struct vk_device *device);
+
 VkResult PRINTFLIKE(4, 5)
 _vk_device_set_lost(struct vk_device *device,
                     const char *file, int line,
diff --git a/src/vulkan/runtime/vk_fence.c b/src/vulkan/runtime/vk_fence.c
index 602f3fae64b..cf5e8914498 100644
--- a/src/vulkan/runtime/vk_fence.c
+++ b/src/vulkan/runtime/vk_fence.c
@@ -434,6 +434,18 @@ vk_common_GetFenceFdKHR(VkDevice _device,
       break;
 
    case VK_EXTERNAL_FENCE_HANDLE_TYPE_SYNC_FD_BIT:
+      /* There's no direct spec quote for this but the same rules as for
+       * semaphore export apply. We can't export a sync file from a fence
+       * if the fence event hasn't been submitted to the kernel yet.
+       */
+      if (device->timeline_mode == VK_DEVICE_TIMELINE_MODE_ASSISTED) {
+         result = vk_sync_wait(device, sync, 0,
+                               VK_SYNC_WAIT_PENDING,
+                               UINT64_MAX);
+         if (unlikely(result != VK_SUCCESS))
+            return result;
+      }
+
       result = vk_sync_export_sync_file(device, sync, pFd);
       if (unlikely(result != VK_SUCCESS))
         return result;
diff --git a/src/vulkan/runtime/vk_queue.c b/src/vulkan/runtime/vk_queue.c
index 7f4a5ce2043..6a2e44c87b6 100644
--- a/src/vulkan/runtime/vk_queue.c
+++ b/src/vulkan/runtime/vk_queue.c
@@ -24,14 +24,28 @@
 #include "vk_queue.h"
 
 #include "util/debug.h"
+#include <inttypes.h>
+#include "vk_alloc.h"
+#include "vk_command_buffer.h"
+#include "vk_common_entrypoints.h"
 #include "vk_device.h"
+#include "vk_fence.h"
+#include "vk_log.h"
+#include "vk_physical_device.h"
+#include "vk_semaphore.h"
+#include "vk_sync.h"
+#include "vk_sync_timeline.h"
+#include "vk_util.h"
 
 VkResult
 vk_queue_init(struct vk_queue *queue, struct vk_device *device,
               const VkDeviceQueueCreateInfo *pCreateInfo,
               uint32_t index_in_family)
 {
+   VkResult result = VK_SUCCESS;
+   int ret;
+
    memset(queue, 0, sizeof(*queue));
    vk_object_base_init(device, &queue->base, VK_OBJECT_TYPE_QUEUE);
 
@@ -43,18 +57,43 @@ vk_queue_init(struct vk_queue *queue, struct vk_device *device,
    assert(index_in_family < pCreateInfo->queueCount);
    queue->index_in_family = index_in_family;
 
+   list_inithead(&queue->submit.submits);
+
+   ret = mtx_init(&queue->submit.mutex, mtx_plain);
+   if (ret == thrd_error) {
+      result = vk_errorf(queue, VK_ERROR_UNKNOWN, "mtx_init failed");
+      goto fail_mutex;
+   }
+
+   ret = cnd_init(&queue->submit.push);
+   if (ret == thrd_error) {
+      result = vk_errorf(queue, VK_ERROR_UNKNOWN, "cnd_init failed");
+      goto fail_push;
+   }
+
+   ret = cnd_init(&queue->submit.pop);
+   if (ret == thrd_error) {
+      result = vk_errorf(queue, VK_ERROR_UNKNOWN, "cnd_init failed");
+      goto fail_pop;
+   }
+
    util_dynarray_init(&queue->labels, NULL);
    queue->region_begin = 
true; return VK_SUCCESS; + +fail_pop: + cnd_destroy(&queue->submit.push); +fail_push: + mtx_destroy(&queue->submit.mutex); +fail_mutex: + return result; } -void -vk_queue_finish(struct vk_queue *queue) +static bool +vk_queue_has_submit_thread(struct vk_queue *queue) { - util_dynarray_fini(&queue->labels); - list_del(&queue->link); - vk_object_base_finish(&queue->base); + return queue->submit.has_thread; } VkResult @@ -83,3 +122,869 @@ _vk_queue_set_lost(struct vk_queue *queue, return VK_ERROR_DEVICE_LOST; } + +static struct vk_queue_submit * +vk_queue_submit_alloc(struct vk_queue *queue, + uint32_t wait_count, + uint32_t command_buffer_count, + uint32_t signal_count) +{ + VK_MULTIALLOC(ma); + VK_MULTIALLOC_DECL(&ma, struct vk_queue_submit, submit, 1); + VK_MULTIALLOC_DECL(&ma, struct vk_sync_wait, waits, wait_count); + VK_MULTIALLOC_DECL(&ma, struct vk_command_buffer *, command_buffers, + command_buffer_count); + VK_MULTIALLOC_DECL(&ma, struct vk_sync_signal, signals, signal_count); + VK_MULTIALLOC_DECL(&ma, struct vk_sync *, wait_temps, wait_count); + + struct vk_sync_timeline_point **wait_points = NULL, **signal_points = NULL; + if (queue->base.device->timeline_mode == VK_DEVICE_TIMELINE_MODE_EMULATED) { + vk_multialloc_add(&ma, &wait_points, + struct vk_sync_timeline_point *, wait_count); + vk_multialloc_add(&ma, &signal_points, + struct vk_sync_timeline_point *, signal_count); + } + + if (!vk_multialloc_zalloc(&ma, &queue->base.device->alloc, + VK_SYSTEM_ALLOCATION_SCOPE_DEVICE)) + return NULL; + + submit->wait_count = wait_count; + submit->command_buffer_count = command_buffer_count; + submit->signal_count = signal_count; + + submit->waits = waits; + submit->command_buffers = command_buffers; + submit->signals = signals; + submit->_wait_temps = wait_temps; + submit->_wait_points = wait_points; + submit->_signal_points = signal_points; + + return submit; +} + +static void +vk_queue_submit_cleanup(struct vk_queue *queue, + struct vk_queue_submit *submit) +{ + for (uint32_t i = 0; i < submit->wait_count; i++) { + if (submit->_wait_temps[i] != NULL) + vk_sync_destroy(queue->base.device, submit->_wait_temps[i]); + } + + if (submit->_wait_points != NULL) { + for (uint32_t i = 0; i < submit->wait_count; i++) { + if (unlikely(submit->_wait_points[i] != NULL)) { + vk_sync_timeline_point_release(queue->base.device, + submit->_wait_points[i]); + } + } + } + + if (submit->_signal_points != NULL) { + for (uint32_t i = 0; i < submit->signal_count; i++) { + if (unlikely(submit->_signal_points[i] != NULL)) { + vk_sync_timeline_point_free(queue->base.device, + submit->_signal_points[i]); + } + } + } +} + +static void +vk_queue_submit_free(struct vk_queue *queue, + struct vk_queue_submit *submit) +{ + vk_free(&queue->base.device->alloc, submit); +} + +static void +vk_queue_submit_destroy(struct vk_queue *queue, + struct vk_queue_submit *submit) +{ + vk_queue_submit_cleanup(queue, submit); + vk_queue_submit_free(queue, submit); +} + +static void +vk_queue_push_submit(struct vk_queue *queue, + struct vk_queue_submit *submit) +{ + mtx_lock(&queue->submit.mutex); + list_addtail(&submit->link, &queue->submit.submits); + cnd_signal(&queue->submit.push); + mtx_unlock(&queue->submit.mutex); +} + +static VkResult +vk_queue_drain(struct vk_queue *queue) +{ + VkResult result = VK_SUCCESS; + + mtx_lock(&queue->submit.mutex); + while (!list_is_empty(&queue->submit.submits)) { + if (vk_device_is_lost(queue->base.device)) { + result = VK_ERROR_DEVICE_LOST; + break; + } + + int ret = cnd_wait(&queue->submit.pop, 
&queue->submit.mutex);
+      if (ret == thrd_error) {
+         result = vk_queue_set_lost(queue, "cnd_wait failed");
+         break;
+      }
+   }
+   mtx_unlock(&queue->submit.mutex);
+
+   return result;
+}
+
+static VkResult
+vk_queue_submit_final(struct vk_queue *queue,
+                      struct vk_queue_submit *submit)
+{
+   VkResult result;
+
+   /* Now that we know all our time points exist, fetch the time point syncs
+    * from any vk_sync_timelines. While we're here, also compact down the
+    * list of waits to get rid of any trivial timeline waits.
+    */
+   uint32_t wait_count = 0;
+   for (uint32_t i = 0; i < submit->wait_count; i++) {
+      /* A timeline wait on 0 is always a no-op */
+      if ((submit->waits[i].sync->flags & VK_SYNC_IS_TIMELINE) &&
+          submit->waits[i].wait_value == 0)
+         continue;
+
+      /* For emulated timelines, we have a binary vk_sync associated with
+       * each time point and pass the binary vk_sync to the driver.
+       */
+      struct vk_sync_timeline *timeline =
+         vk_sync_as_timeline(submit->waits[i].sync);
+      if (timeline) {
+         assert(queue->base.device->timeline_mode ==
+                VK_DEVICE_TIMELINE_MODE_EMULATED);
+         result = vk_sync_timeline_get_point(queue->base.device, timeline,
+                                             submit->waits[i].wait_value,
+                                             &submit->_wait_points[i]);
+         if (unlikely(result != VK_SUCCESS)) {
+            return vk_queue_set_lost(queue,
+                                     "Time point >= %"PRIu64" not found",
+                                     submit->waits[i].wait_value);
+         }
+
+         /* This can happen if the point is long past */
+         if (submit->_wait_points[i] == NULL)
+            continue;
+
+         submit->waits[i].sync = &submit->_wait_points[i]->sync;
+         submit->waits[i].wait_value = 0;
+      }
+
+      assert((submit->waits[i].sync->flags & VK_SYNC_IS_TIMELINE) ||
+             submit->waits[i].wait_value == 0);
+
+      assert(wait_count <= i);
+      if (wait_count < i) {
+         submit->waits[wait_count] = submit->waits[i];
+         submit->_wait_temps[wait_count] = submit->_wait_temps[i];
+         if (submit->_wait_points)
+            submit->_wait_points[wait_count] = submit->_wait_points[i];
+      }
+      wait_count++;
+   }
+
+   assert(wait_count <= submit->wait_count);
+   submit->wait_count = wait_count;
+
+   for (uint32_t i = 0; i < submit->signal_count; i++) {
+      assert((submit->signals[i].sync->flags & VK_SYNC_IS_TIMELINE) ||
+             submit->signals[i].signal_value == 0);
+   }
+
+   result = queue->driver_submit(queue, submit);
+   if (unlikely(result != VK_SUCCESS))
+      return result;
+
+   if (submit->_signal_points) {
+      for (uint32_t i = 0; i < submit->signal_count; i++) {
+         if (submit->_signal_points[i] == NULL)
+            continue;
+
+         vk_sync_timeline_point_install(queue->base.device,
+                                        submit->_signal_points[i]);
+         submit->_signal_points[i] = NULL;
+      }
+   }
+
+   return VK_SUCCESS;
+}
+
+VkResult
+vk_queue_flush(struct vk_queue *queue, uint32_t *submit_count_out)
+{
+   VkResult result = VK_SUCCESS;
+
+   assert(queue->base.device->timeline_mode ==
+          VK_DEVICE_TIMELINE_MODE_EMULATED);
+
+   mtx_lock(&queue->submit.mutex);
+
+   uint32_t submit_count = 0;
+   while (!list_is_empty(&queue->submit.submits)) {
+      struct vk_queue_submit *submit =
+         list_first_entry(&queue->submit.submits,
+                          struct vk_queue_submit, link);
+
+      for (uint32_t i = 0; i < submit->wait_count; i++) {
+         /* In emulated timeline mode, only emulated timelines are allowed */
+         if (!vk_sync_type_is_vk_sync_timeline(submit->waits[i].sync->type)) {
+            assert(!(submit->waits[i].sync->flags & VK_SYNC_IS_TIMELINE));
+            continue;
+         }
+
+         result = vk_sync_wait(queue->base.device,
+                               submit->waits[i].sync,
+                               submit->waits[i].wait_value,
+                               VK_SYNC_WAIT_PENDING, 0);
+         if (result == VK_TIMEOUT) {
+            /* This one's not ready yet */
+            result = VK_SUCCESS;
+            goto done;
+         } else if (result != VK_SUCCESS) {
+            result = vk_queue_set_lost(queue, "Wait for time points failed");
+            goto done;
+         }
+      }
+
+      result = vk_queue_submit_final(queue, submit);
+      if (unlikely(result != VK_SUCCESS)) {
+         result = vk_queue_set_lost(queue, "queue::driver_submit failed");
+         goto done;
+      }
+
+      submit_count++;
+
+      list_del(&submit->link);
+
+      vk_queue_submit_destroy(queue, submit);
+   }
+
+done:
+   if (submit_count)
+      cnd_broadcast(&queue->submit.pop);
+
+   mtx_unlock(&queue->submit.mutex);
+
+   if (submit_count_out)
+      *submit_count_out = submit_count;
+
+   return result;
+}
+
+static int
+vk_queue_submit_thread_func(void *_data)
+{
+   struct vk_queue *queue = _data;
+   VkResult result;
+
+   assert(queue->base.device->timeline_mode ==
+          VK_DEVICE_TIMELINE_MODE_ASSISTED);
+
+   mtx_lock(&queue->submit.mutex);
+
+   while (queue->submit.thread_run) {
+      if (list_is_empty(&queue->submit.submits)) {
+         int ret = cnd_wait(&queue->submit.push, &queue->submit.mutex);
+         if (ret == thrd_error) {
+            mtx_unlock(&queue->submit.mutex);
+            vk_queue_set_lost(queue, "cnd_wait failed");
+            return 1;
+         }
+         continue;
+      }
+
+      struct vk_queue_submit *submit =
+         list_first_entry(&queue->submit.submits,
+                          struct vk_queue_submit, link);
+
+      /* Drop the lock while we wait */
+      mtx_unlock(&queue->submit.mutex);
+
+      result = vk_sync_wait_many(queue->base.device,
+                                 submit->wait_count, submit->waits,
+                                 VK_SYNC_WAIT_PENDING, UINT64_MAX);
+      if (unlikely(result != VK_SUCCESS)) {
+         vk_queue_set_lost(queue, "Wait for time points failed");
+         return 1;
+      }
+
+      result = vk_queue_submit_final(queue, submit);
+      if (unlikely(result != VK_SUCCESS)) {
+         vk_queue_set_lost(queue, "queue::driver_submit failed");
+         return 1;
+      }
+
+      /* Do all our cleanup of individual fences etc. outside the lock.
+       * We can't actually remove it from the list yet. We have to do
+       * that under the lock.
+       */
+      vk_queue_submit_cleanup(queue, submit);
+
+      mtx_lock(&queue->submit.mutex);
+
+      /* Only remove the submit from the list and free it after
+       * queue->driver_submit() has completed. This ensures that, when
+       * vk_queue_drain() completes, there are no more pending jobs.
+       */
+      list_del(&submit->link);
+      vk_queue_submit_free(queue, submit);
+
+      cnd_broadcast(&queue->submit.pop);
+   }
+
+   mtx_unlock(&queue->submit.mutex);
+
+   return 0;
+}
+
+static VkResult
+vk_queue_enable_submit_thread(struct vk_queue *queue)
+{
+   int ret;
+
+   queue->submit.thread_run = true;
+
+   ret = thrd_create(&queue->submit.thread,
+                     vk_queue_submit_thread_func,
+                     queue);
+   if (ret == thrd_error)
+      return vk_errorf(queue, VK_ERROR_UNKNOWN, "thrd_create failed");
+
+   queue->submit.has_thread = true;
+
+   return VK_SUCCESS;
+}
+
+static void
+vk_queue_disable_submit_thread(struct vk_queue *queue)
+{
+   vk_queue_drain(queue);
+
+   /* Kick the thread to disable it */
+   mtx_lock(&queue->submit.mutex);
+   queue->submit.thread_run = false;
+   cnd_signal(&queue->submit.push);
+   mtx_unlock(&queue->submit.mutex);
+
+   thrd_join(queue->submit.thread, NULL);
+
+   queue->submit.has_thread = false;
+}
+
+static VkResult
+vk_queue_submit(struct vk_queue *queue,
+                const VkSubmitInfo2KHR *info,
+                struct vk_fence *fence)
+{
+   VkResult result;
+
+   struct vk_queue_submit *submit =
+      vk_queue_submit_alloc(queue, info->waitSemaphoreInfoCount,
+                            info->commandBufferInfoCount,
+                            info->signalSemaphoreInfoCount + (fence != NULL));
+   if (unlikely(submit == NULL))
+      return vk_error(queue, VK_ERROR_OUT_OF_HOST_MEMORY);
+
+   /* From the Vulkan 1.2.194 spec:
+    *
+    *    "If the VkSubmitInfo::pNext chain does not include this structure,
+    *    the batch defaults to use counter pass index 0." 
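+    *
+    * so, when this struct is absent from the pNext chain, we simply default
+    * perf_pass_index to 0 below.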
+    */
+   const VkPerformanceQuerySubmitInfoKHR *perf_info =
+      vk_find_struct_const(info->pNext, PERFORMANCE_QUERY_SUBMIT_INFO_KHR);
+   submit->perf_pass_index = perf_info ? perf_info->counterPassIndex : 0;
+
+   bool has_binary_permanent_semaphore_wait = false;
+   for (uint32_t i = 0; i < info->waitSemaphoreInfoCount; i++) {
+      VK_FROM_HANDLE(vk_semaphore, semaphore,
+                     info->pWaitSemaphoreInfos[i].semaphore);
+
+      /* From the Vulkan 1.2.194 spec:
+       *
+       *    "Applications can import a semaphore payload into an existing
+       *    semaphore using an external semaphore handle. The effects of the
+       *    import operation will be either temporary or permanent, as
+       *    specified by the application. If the import is temporary, the
+       *    implementation must restore the semaphore to its prior permanent
+       *    state after submitting the next semaphore wait operation."
+       *
+       * and
+       *
+       *    VUID-VkImportSemaphoreFdInfoKHR-flags-03323
+       *
+       *    "If flags contains VK_SEMAPHORE_IMPORT_TEMPORARY_BIT, the
+       *    VkSemaphoreTypeCreateInfo::semaphoreType field of the semaphore
+       *    from which handle or name was exported must not be
+       *    VK_SEMAPHORE_TYPE_TIMELINE"
+       */
+      struct vk_sync *sync;
+      if (semaphore->temporary) {
+         assert(semaphore->type == VK_SEMAPHORE_TYPE_BINARY);
+         sync = submit->_wait_temps[i] = semaphore->temporary;
+         semaphore->temporary = NULL;
+      } else {
+         if (semaphore->type == VK_SEMAPHORE_TYPE_BINARY) {
+            if (queue->base.device->timeline_mode ==
+                VK_DEVICE_TIMELINE_MODE_ASSISTED)
+               assert(semaphore->permanent.type->move);
+            has_binary_permanent_semaphore_wait = true;
+         }
+
+         sync = &semaphore->permanent;
+      }
+
+      uint64_t wait_value = semaphore->type == VK_SEMAPHORE_TYPE_TIMELINE ?
+                            info->pWaitSemaphoreInfos[i].value : 0;
+
+      submit->waits[i] = (struct vk_sync_wait) {
+         .sync = sync,
+         .stage_mask = info->pWaitSemaphoreInfos[i].stageMask,
+         .wait_value = wait_value,
+      };
+   }
+
+   for (uint32_t i = 0; i < info->commandBufferInfoCount; i++) {
+      VK_FROM_HANDLE(vk_command_buffer, cmd_buffer,
+                     info->pCommandBufferInfos[i].commandBuffer);
+      assert(info->pCommandBufferInfos[i].deviceMask == 0 ||
+             info->pCommandBufferInfos[i].deviceMask == 1);
+      submit->command_buffers[i] = cmd_buffer;
+   }
+
+   for (uint32_t i = 0; i < info->signalSemaphoreInfoCount; i++) {
+      VK_FROM_HANDLE(vk_semaphore, semaphore,
+                     info->pSignalSemaphoreInfos[i].semaphore);
+
+      struct vk_sync *sync = vk_semaphore_get_active_sync(semaphore);
+      uint64_t signal_value = info->pSignalSemaphoreInfos[i].value;
+      if (semaphore->type == VK_SEMAPHORE_TYPE_TIMELINE) {
+         if (signal_value == 0) {
+            result = vk_queue_set_lost(queue,
+                                       "Tried to signal a timeline with value 0");
+            goto fail;
+         }
+      } else {
+         signal_value = 0;
+      }
+
+      /* For emulated timelines, we need to associate a binary vk_sync with
+       * each time point and pass the binary vk_sync to the driver. We could
+       * do this in vk_queue_submit_final but it might require doing memory
+       * allocation and we don't want to add extra failure paths there.
+       * Instead, allocate and replace the driver-visible vk_sync now and
+       * we'll insert it into the timeline in vk_queue_submit_final. The
+       * insert step is guaranteed to not fail. 
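+       * Keeping vk_queue_submit_final() allocation-free matters because it
+       * may run on the submit thread, where an out-of-memory error could not
+       * be returned to the application; any VK_ERROR_OUT_OF_HOST_MEMORY is
+       * reported here instead, on the client thread.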
+       */
+      struct vk_sync_timeline *timeline = vk_sync_as_timeline(sync);
+      if (timeline) {
+         assert(queue->base.device->timeline_mode ==
+                VK_DEVICE_TIMELINE_MODE_EMULATED);
+         result = vk_sync_timeline_alloc_point(queue->base.device, timeline,
+                                               signal_value,
+                                               &submit->_signal_points[i]);
+         if (unlikely(result != VK_SUCCESS))
+            goto fail;
+
+         sync = &submit->_signal_points[i]->sync;
+         signal_value = 0;
+      }
+
+      submit->signals[i] = (struct vk_sync_signal) {
+         .sync = sync,
+         .stage_mask = info->pSignalSemaphoreInfos[i].stageMask,
+         .signal_value = signal_value,
+      };
+   }
+
+   if (fence != NULL) {
+      uint32_t fence_idx = info->signalSemaphoreInfoCount;
+      assert(submit->signal_count == fence_idx + 1);
+      assert(submit->signals[fence_idx].sync == NULL);
+      submit->signals[fence_idx] = (struct vk_sync_signal) {
+         .sync = vk_fence_get_active_sync(fence),
+         .stage_mask = ~(VkPipelineStageFlags2KHR)0,
+      };
+   }
+
+   switch (queue->base.device->timeline_mode) {
+   case VK_DEVICE_TIMELINE_MODE_ASSISTED:
+      if (!vk_queue_has_submit_thread(queue)) {
+         static int force_submit_thread = -1;
+         if (unlikely(force_submit_thread < 0)) {
+            force_submit_thread =
+               env_var_as_boolean("MESA_VK_ENABLE_SUBMIT_THREAD", false);
+         }
+
+         if (unlikely(force_submit_thread)) {
+            result = vk_queue_enable_submit_thread(queue);
+         } else {
+            /* Otherwise, only enable the submit thread if we need it in order
+             * to resolve timeline semaphore wait-before-signal issues.
+             */
+            result = vk_sync_wait_many(queue->base.device,
+                                       submit->wait_count, submit->waits,
+                                       VK_SYNC_WAIT_PENDING, 0);
+            if (result == VK_TIMEOUT)
+               result = vk_queue_enable_submit_thread(queue);
+         }
+         if (unlikely(result != VK_SUCCESS))
+            goto fail;
+      }
+
+      if (vk_queue_has_submit_thread(queue)) {
+         if (has_binary_permanent_semaphore_wait) {
+            for (uint32_t i = 0; i < info->waitSemaphoreInfoCount; i++) {
+               VK_FROM_HANDLE(vk_semaphore, semaphore,
+                              info->pWaitSemaphoreInfos[i].semaphore);
+
+               if (semaphore->type != VK_SEMAPHORE_TYPE_BINARY)
+                  continue;
+
+               /* From the Vulkan 1.2.194 spec:
+                *
+                *    "When a batch is submitted to a queue via a queue
+                *    submission, and it includes semaphores to be waited on,
+                *    it defines a memory dependency between prior semaphore
+                *    signal operations and the batch, and defines semaphore
+                *    wait operations.
+                *
+                *    Such semaphore wait operations set the semaphores
+                *    created with a VkSemaphoreType of
+                *    VK_SEMAPHORE_TYPE_BINARY to the unsignaled state."
+                *
+                * For threaded submit, we depend on tracking the unsignaled
+                * state of binary semaphores to determine when we can safely
+                * submit. The VK_SYNC_WAIT_PENDING check above as well as the
+                * one in the submit thread depend on all binary semaphores
+                * being reset when they're not in active use from the point
+                * of view of the client's CPU timeline. This means we need to
+                * reset them inside vkQueueSubmit and cannot wait until the
+                * actual submit which happens later in the thread.
+                *
+                * We've already stolen temporary semaphore payloads above as
+                * part of basic semaphore processing. We steal permanent
+                * semaphore payloads here by way of vk_sync_move. For shared
+                * semaphores, this can be a bit expensive (sync file import
+                * and export) but, for non-shared semaphores, it can be made
+                * fairly cheap. 
Also, we only do this semaphore swapping in + * the case where you have real timelines AND the client is + * using timeline semaphores with wait-before-signal (that's + * the only way to get a submit thread) AND mixing those with + * waits on binary semaphores AND said binary semaphore is + * using its permanent payload. In other words, this code + * should basically only ever get executed in CTS tests. + */ + if (submit->_wait_temps[i] != NULL) + continue; + + assert(submit->waits[i].sync == &semaphore->permanent); + + /* From the Vulkan 1.2.194 spec: + * + * VUID-vkQueueSubmit-pWaitSemaphores-03238 + * + * "All elements of the pWaitSemaphores member of all + * elements of pSubmits created with a VkSemaphoreType of + * VK_SEMAPHORE_TYPE_BINARY must reference a semaphore + * signal operation that has been submitted for execution + * and any semaphore signal operations on which it depends + * (if any) must have also been submitted for execution." + * + * Therefore, we can safely do a blocking wait here and it + * won't actually block for long. This ensures that the + * vk_sync_move below will succeed. + */ + result = vk_sync_wait(queue->base.device, + submit->waits[i].sync, 0, + VK_SYNC_WAIT_PENDING, UINT64_MAX); + if (unlikely(result != VK_SUCCESS)) + goto fail; + + result = vk_sync_create(queue->base.device, + semaphore->permanent.type, + 0 /* flags */, + 0 /* initial value */, + &submit->_wait_temps[i]); + if (unlikely(result != VK_SUCCESS)) + goto fail; + + result = vk_sync_move(queue->base.device, + submit->_wait_temps[i], + &semaphore->permanent); + if (unlikely(result != VK_SUCCESS)) + goto fail; + + submit->waits[i].sync = submit->_wait_temps[i]; + } + } + + vk_queue_push_submit(queue, submit); + return VK_SUCCESS; + } else { + result = vk_queue_submit_final(queue, submit); + if (unlikely(result != VK_SUCCESS)) + goto fail; + + /* If we don't have a submit thread, we can more directly ensure + * that binary semaphore payloads get reset. If we also signal the + * vk_sync, then we can consider it to have been both reset and + * signaled. A reset in this case would be wrong because it would + * throw away our signal operation. If we don't signal the vk_sync, + * then we need to reset it. 
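+          *
+          * Temporary payloads are not a concern here: they were stolen into
+          * _wait_temps above and get destroyed with the submit. Only
+          * permanent binary payloads which were not also signaled by this
+          * same submit are reset below.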
+ */ + if (has_binary_permanent_semaphore_wait) { + for (uint32_t i = 0; i < submit->wait_count; i++) { + if ((submit->waits[i].sync->flags & VK_SYNC_IS_TIMELINE) || + submit->_wait_temps[i] != NULL) + continue; + + bool was_signaled = false; + for (uint32_t j = 0; j < submit->signal_count; j++) { + if (submit->signals[j].sync == submit->waits[i].sync) { + was_signaled = true; + break; + } + } + + if (!was_signaled) { + result = vk_sync_reset(queue->base.device, + submit->waits[i].sync); + if (unlikely(result != VK_SUCCESS)) + goto fail; + } + } + } + + vk_queue_submit_destroy(queue, submit); + return VK_SUCCESS; + } + unreachable("Should have returned"); + + case VK_DEVICE_TIMELINE_MODE_EMULATED: + vk_queue_push_submit(queue, submit); + return vk_device_flush(queue->base.device); + + case VK_DEVICE_TIMELINE_MODE_NONE: + case VK_DEVICE_TIMELINE_MODE_NATIVE: + result = vk_queue_submit_final(queue, submit); + vk_queue_submit_destroy(queue, submit); + return result; + } + unreachable("Invalid timeline mode"); + +fail: + vk_queue_submit_destroy(queue, submit); + return result; +} + +VkResult +vk_queue_wait_before_present(struct vk_queue *queue, + const VkPresentInfoKHR *pPresentInfo) +{ + if (vk_device_is_lost(queue->base.device)) + return VK_ERROR_DEVICE_LOST; + + /* From the Vulkan 1.2.194 spec: + * + * VUID-vkQueuePresentKHR-pWaitSemaphores-03268 + * + * "All elements of the pWaitSemaphores member of pPresentInfo must + * reference a semaphore signal operation that has been submitted for + * execution and any semaphore signal operations on which it depends (if + * any) must have also been submitted for execution." + * + * As with vkQueueSubmit above, we need to ensure that any binary + * semaphores we use in this present actually exist. If we don't have + * timeline semaphores, this is a non-issue. If they're emulated, then + * this is ensured for us by the vk_device_flush() at the end of every + * vkQueueSubmit() and every vkSignalSemaphore(). For real timeline + * semaphores, however, we need to do a wait. Thanks to the above bit of + * spec text, that wait should never block for long. + */ + if (queue->base.device->timeline_mode != VK_DEVICE_TIMELINE_MODE_ASSISTED) + return VK_SUCCESS; + + const uint32_t wait_count = pPresentInfo->waitSemaphoreCount; + STACK_ARRAY(struct vk_sync_wait, waits, wait_count); + + for (uint32_t i = 0; i < wait_count; i++) { + VK_FROM_HANDLE(vk_semaphore, semaphore, + pPresentInfo->pWaitSemaphores[i]); + + /* From the Vulkan 1.2.194 spec: + * + * VUID-vkQueuePresentKHR-pWaitSemaphores-03267 + * + * "All elements of the pWaitSemaphores member of pPresentInfo must + * be created with a VkSemaphoreType of VK_SEMAPHORE_TYPE_BINARY." 
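+       *
+       * so the assert below holds and the wait-for-pending on each payload
+       * is guaranteed not to block for long.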
+ */ + assert(semaphore->type == VK_SEMAPHORE_TYPE_BINARY); + + waits[i] = (struct vk_sync_wait) { + .sync = vk_semaphore_get_active_sync(semaphore), + .stage_mask = ~(VkPipelineStageFlags2KHR)0, + }; + } + + VkResult result = vk_sync_wait_many(queue->base.device, wait_count, waits, + VK_SYNC_WAIT_PENDING, UINT64_MAX); + + STACK_ARRAY_FINISH(waits); + + /* Check again, just in case */ + if (vk_device_is_lost(queue->base.device)) + return VK_ERROR_DEVICE_LOST; + + return result; +} + +static VkResult +vk_queue_signal_sync(struct vk_queue *queue, + struct vk_sync *sync, + uint32_t signal_value) +{ + struct vk_queue_submit *submit = vk_queue_submit_alloc(queue, 0, 0, 1); + if (unlikely(submit == NULL)) + return vk_error(queue, VK_ERROR_OUT_OF_HOST_MEMORY); + + submit->signals[0] = (struct vk_sync_signal) { + .sync = sync, + .stage_mask = ~(VkPipelineStageFlags2KHR)0, + .signal_value = signal_value, + }; + + VkResult result; + switch (queue->base.device->timeline_mode) { + case VK_DEVICE_TIMELINE_MODE_ASSISTED: + if (vk_queue_has_submit_thread(queue)) { + vk_queue_push_submit(queue, submit); + return VK_SUCCESS; + } else { + result = vk_queue_submit_final(queue, submit); + vk_queue_submit_destroy(queue, submit); + return result; + } + + case VK_DEVICE_TIMELINE_MODE_EMULATED: + vk_queue_push_submit(queue, submit); + return vk_device_flush(queue->base.device); + + case VK_DEVICE_TIMELINE_MODE_NONE: + case VK_DEVICE_TIMELINE_MODE_NATIVE: + result = vk_queue_submit_final(queue, submit); + vk_queue_submit_destroy(queue, submit); + return result; + } + unreachable("Invalid timeline mode"); +} + +void +vk_queue_finish(struct vk_queue *queue) +{ + if (vk_queue_has_submit_thread(queue)) + vk_queue_disable_submit_thread(queue); + + while (!list_is_empty(&queue->submit.submits)) { + assert(vk_device_is_lost_no_report(queue->base.device)); + + struct vk_queue_submit *submit = + list_first_entry(&queue->submit.submits, + struct vk_queue_submit, link); + + list_del(&submit->link); + vk_queue_submit_destroy(queue, submit); + } + + cnd_destroy(&queue->submit.pop); + cnd_destroy(&queue->submit.push); + mtx_destroy(&queue->submit.mutex); + + util_dynarray_fini(&queue->labels); + list_del(&queue->link); + vk_object_base_finish(&queue->base); +} + +VKAPI_ATTR VkResult VKAPI_CALL +vk_common_QueueSubmit2KHR(VkQueue _queue, + uint32_t submitCount, + const VkSubmitInfo2KHR *pSubmits, + VkFence _fence) +{ + VK_FROM_HANDLE(vk_queue, queue, _queue); + VK_FROM_HANDLE(vk_fence, fence, _fence); + + if (vk_device_is_lost(queue->base.device)) + return VK_ERROR_DEVICE_LOST; + + if (submitCount == 0) { + if (fence == NULL) { + return VK_SUCCESS; + } else { + return vk_queue_signal_sync(queue, vk_fence_get_active_sync(fence), 0); + } + } + + for (uint32_t i = 0; i < submitCount; i++) { + VkResult result = vk_queue_submit(queue, &pSubmits[i], + i == submitCount - 1 ? 
fence : NULL);
+      if (unlikely(result != VK_SUCCESS))
+         return result;
+   }
+
+   return VK_SUCCESS;
+}
+
+static const struct vk_sync_type *
+get_cpu_wait_type(struct vk_physical_device *pdevice)
+{
+   for (const struct vk_sync_type *const *t =
+        pdevice->supported_sync_types; *t; t++) {
+      if (((*t)->features & VK_SYNC_FEATURE_BINARY) &&
+          ((*t)->features & VK_SYNC_FEATURE_CPU_WAIT))
+         return *t;
+   }
+
+   unreachable("You must have a non-timeline CPU wait sync type");
+}
+
+VKAPI_ATTR VkResult VKAPI_CALL
+vk_common_QueueWaitIdle(VkQueue _queue)
+{
+   VK_FROM_HANDLE(vk_queue, queue, _queue);
+   VkResult result;
+
+   if (vk_device_is_lost(queue->base.device))
+      return VK_ERROR_DEVICE_LOST;
+
+   const struct vk_sync_type *sync_type =
+      get_cpu_wait_type(queue->base.device->physical);
+
+   struct vk_sync *sync;
+   result = vk_sync_create(queue->base.device, sync_type, 0, 0, &sync);
+   if (unlikely(result != VK_SUCCESS))
+      return result;
+
+   result = vk_queue_signal_sync(queue, sync, 0);
+   if (unlikely(result != VK_SUCCESS)) {
+      vk_sync_destroy(queue->base.device, sync);
+      return result;
+   }
+
+   result = vk_sync_wait(queue->base.device, sync, 0,
+                         VK_SYNC_WAIT_COMPLETE, UINT64_MAX);
+
+   vk_sync_destroy(queue->base.device, sync);
+
+   VkResult device_status = vk_device_check_status(queue->base.device);
+   if (device_status != VK_SUCCESS)
+      return device_status;
+
+   return result;
+}
diff --git a/src/vulkan/runtime/vk_queue.h b/src/vulkan/runtime/vk_queue.h
index 7998a277c39..f3854f3dc1f 100644
--- a/src/vulkan/runtime/vk_queue.h
+++ b/src/vulkan/runtime/vk_queue.h
@@ -26,6 +26,8 @@
 
 #include "vk_object.h"
 
+#include "c11/threads.h"
+
 #include "util/list.h"
 #include "util/u_dynarray.h"
 
@@ -33,6 +35,13 @@ extern "C" {
 #endif
 
+struct vk_command_buffer;
+struct vk_queue_submit;
+struct vk_sync;
+struct vk_sync_wait;
+struct vk_sync_signal;
+struct vk_sync_timeline_point;
+
 struct vk_queue {
    struct vk_object_base base;
 
@@ -48,6 +57,32 @@ struct vk_queue {
    /* Which queue this is within the queue family */
    uint32_t index_in_family;
 
+   /** Driver queue submit hook
+    *
+    * When using the common implementation of vkQueueSubmit(), this function
+    * is called to do the final submit to the kernel driver after all
+    * semaphore dependencies have been resolved. Depending on the timeline
+    * mode and application usage, this function may be called directly from
+    * the client thread on which vkQueueSubmit was called or from a runtime-
+    * managed submit thread. We do, however, guarantee that as long as the
+    * client follows the Vulkan threading rules, this function will never be
+    * called by the runtime concurrently on the same queue. 
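+    *
+    * As a sketch (not part of this patch; drv_queue and
+    * drv_queue_driver_submit are hypothetical driver code), a driver would
+    * typically install this hook right after initializing the queue:
+    *
+    *    result = vk_queue_init(&queue->vk, &device->vk, pCreateInfo,
+    *                           index_in_family);
+    *    if (result != VK_SUCCESS)
+    *       return result;
+    *
+    *    queue->vk.driver_submit = drv_queue_driver_submit;
+    *
+    * where the hook consumes the waits, command_buffers, and signals arrays
+    * of the vk_queue_submit it is handed.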
+ */ + VkResult (*driver_submit)(struct vk_queue *queue, + struct vk_queue_submit *submit); + + struct { + mtx_t mutex; + cnd_t push; + cnd_t pop; + + struct list_head submits; + + bool thread_run; + bool has_thread; + thrd_t thread; + } submit; + struct { /* Only set once atomically by the queue */ int lost; @@ -107,6 +142,17 @@ vk_queue_init(struct vk_queue *queue, struct vk_device *device, void vk_queue_finish(struct vk_queue *queue); +static inline bool +vk_queue_is_empty(struct vk_queue *queue) +{ + return list_is_empty(&queue->submit.submits); +} + +VkResult vk_queue_flush(struct vk_queue *queue, uint32_t *submit_count_out); + +VkResult vk_queue_wait_before_present(struct vk_queue *queue, + const VkPresentInfoKHR *pPresentInfo); + VkResult PRINTFLIKE(4, 5) _vk_queue_set_lost(struct vk_queue *queue, const char *file, int line, @@ -127,6 +173,25 @@ vk_queue_is_lost(struct vk_queue *queue) #define vk_foreach_queue_safe(queue, device) \ list_for_each_entry_safe(struct vk_queue, queue, &(device)->queues, link) +struct vk_queue_submit { + struct list_head link; + + uint32_t wait_count; + uint32_t command_buffer_count; + uint32_t signal_count; + + struct vk_sync_wait *waits; + struct vk_command_buffer **command_buffers; + struct vk_sync_signal *signals; + + uint32_t perf_pass_index; + + /* Used internally; should be ignored by drivers */ + struct vk_sync **_wait_temps; + struct vk_sync_timeline_point **_wait_points; + struct vk_sync_timeline_point **_signal_points; +}; + #ifdef __cplusplus } #endif diff --git a/src/vulkan/runtime/vk_semaphore.c b/src/vulkan/runtime/vk_semaphore.c index ee5580bc2ae..ee0588b4616 100644 --- a/src/vulkan/runtime/vk_semaphore.c +++ b/src/vulkan/runtime/vk_semaphore.c @@ -132,6 +132,9 @@ vk_common_CreateSemaphore(VkDevice _device, const VkSemaphoreType semaphore_type = get_semaphore_type(pCreateInfo->pNext, &initial_value); + if (semaphore_type == VK_SEMAPHORE_TYPE_TIMELINE) + assert(device->timeline_mode != VK_DEVICE_TIMELINE_MODE_NONE); + const VkExportSemaphoreCreateInfo *export = vk_find_struct_const(pCreateInfo->pNext, EXPORT_SEMAPHORE_CREATE_INFO); VkExternalSemaphoreHandleTypeFlags handle_types = @@ -147,6 +150,15 @@ vk_common_CreateSemaphore(VkDevice _device, "for VkSemaphore creation."); } + /* If the timeline mode is ASSISTED, then any permanent binary semaphore + * types need to be able to support move. We don't require this for + * temporary unless that temporary is also used as a semaphore signal + * operation which is much trickier to assert early. + */ + if (semaphore_type == VK_SEMAPHORE_TYPE_BINARY && + device->timeline_mode == VK_DEVICE_TIMELINE_MODE_ASSISTED) + assert(sync_type->move); + /* Allocate a vk_semaphore + vk_sync implementation. Because the permanent * field of vk_semaphore is the base field of the vk_sync implementation, * we can make the 2 structures overlap. 
@@ -359,6 +371,12 @@ vk_common_SignalSemaphore(VkDevice _device, if (unlikely(result != VK_SUCCESS)) return result; + if (device->timeline_mode == VK_DEVICE_TIMELINE_MODE_EMULATED) { + result = vk_device_flush(device); + if (unlikely(result != VK_SUCCESS)) + return result; + } + return VK_SUCCESS; } @@ -489,6 +507,28 @@ vk_common_GetSemaphoreFdKHR(VkDevice _device, "Cannot export a timeline semaphore as SYNC_FD"); } + /* From the Vulkan 1.2.194 spec: + * VUID-VkSemaphoreGetFdInfoKHR-handleType-03254 + * + * "If handleType refers to a handle type with copy payload + * transference semantics, semaphore must have an associated + * semaphore signal operation that has been submitted for execution + * and any semaphore signal operations on which it depends (if any) + * must have also been submitted for execution." + * + * If we have real timelines, it's possible that the time point doesn't + * exist yet and is waiting for one of our submit threads to trigger. + * However, thanks to the above bit of spec text, that wait should never + * block for long. + */ + if (device->timeline_mode == VK_DEVICE_TIMELINE_MODE_ASSISTED) { + result = vk_sync_wait(device, sync, 0, + VK_SYNC_WAIT_PENDING, + UINT64_MAX); + if (unlikely(result != VK_SUCCESS)) + return result; + } + result = vk_sync_export_sync_file(device, sync, pFd); if (unlikely(result != VK_SUCCESS)) return result; diff --git a/src/vulkan/runtime/vk_sync.h b/src/vulkan/runtime/vk_sync.h index 84cbb5b9991..cd14ed6569a 100644 --- a/src/vulkan/runtime/vk_sync.h +++ b/src/vulkan/runtime/vk_sync.h @@ -297,6 +297,13 @@ struct vk_sync_wait { uint64_t wait_value; }; +/* See VkSemaphoreSubmitInfoKHR */ +struct vk_sync_signal { + struct vk_sync *sync; + VkPipelineStageFlags2KHR stage_mask; + uint64_t signal_value; +}; + VkResult MUST_CHECK vk_sync_init(struct vk_device *device, struct vk_sync *sync, const struct vk_sync_type *type,