diff --git a/src/virtio/vulkan/vn_device.c b/src/virtio/vulkan/vn_device.c
index 5fc070a9ee4..ff17999d70e 100644
--- a/src/virtio/vulkan/vn_device.c
+++ b/src/virtio/vulkan/vn_device.c
@@ -310,7 +310,7 @@ vn_device_feedback_pool_init(struct vn_device *dev)
    static const uint32_t pool_size = 4096;
    const VkAllocationCallbacks *alloc = &dev->base.base.alloc;
 
-   if (VN_PERF(NO_EVENT_FEEDBACK))
+   if (VN_PERF(NO_EVENT_FEEDBACK) && VN_PERF(NO_FENCE_FEEDBACK))
       return VK_SUCCESS;
 
    return vn_feedback_pool_init(dev, &dev->feedback_pool, pool_size, alloc);
@@ -319,7 +319,7 @@ vn_device_feedback_pool_init(struct vn_device *dev)
 static inline void
 vn_device_feedback_pool_fini(struct vn_device *dev)
 {
-   if (VN_PERF(NO_EVENT_FEEDBACK))
+   if (VN_PERF(NO_EVENT_FEEDBACK) && VN_PERF(NO_FENCE_FEEDBACK))
       return;
 
    vn_feedback_pool_fini(&dev->feedback_pool);
diff --git a/src/virtio/vulkan/vn_feedback.c b/src/virtio/vulkan/vn_feedback.c
index 31d653357de..7e2d1a65950 100644
--- a/src/virtio/vulkan/vn_feedback.c
+++ b/src/virtio/vulkan/vn_feedback.c
@@ -323,6 +323,112 @@ vn_feedback_event_cmd_record(VkCommandBuffer cmd_handle,
                          &buf_barrier_after, 0, NULL);
 }
 
+static VkResult
+vn_feedback_fence_cmd_record(VkCommandBuffer cmd_handle,
+                             struct vn_feedback_slot *slot)
+
+{
+   STATIC_ASSERT(sizeof(*slot->status) == 4);
+
+   static const VkCommandBufferBeginInfo begin_info = {
+      .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO,
+      .pNext = NULL,
+      .flags = 0,
+      .pInheritanceInfo = NULL,
+   };
+   VkResult result = vn_BeginCommandBuffer(cmd_handle, &begin_info);
+   if (result != VK_SUCCESS)
+      return result;
+
+   static const VkMemoryBarrier mem_barrier_before = {
+      .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER,
+      .pNext = NULL,
+      /* make pending writes available to stay close to fence signal op */
+      .srcAccessMask = VK_ACCESS_MEMORY_WRITE_BIT,
+      /* no need to make all memory visible for feedback update */
+      .dstAccessMask = 0,
+   };
+   const VkBufferMemoryBarrier buf_barrier_before = {
+      .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER,
+      .pNext = NULL,
+      /* slot memory has been made available via mem_barrier_before */
+      .srcAccessMask = 0,
+      .dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT,
+      .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+      .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+      .buffer = slot->buffer,
+      .offset = slot->offset,
+      .size = 4,
+   };
+   vn_CmdPipelineBarrier(cmd_handle, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT,
+                         VK_PIPELINE_STAGE_TRANSFER_BIT, 0, 1,
+                         &mem_barrier_before, 1, &buf_barrier_before, 0,
+                         NULL);
+   vn_CmdFillBuffer(cmd_handle, slot->buffer, slot->offset, 4, VK_SUCCESS);
+
+   const VkBufferMemoryBarrier buf_barrier_after = {
+      .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER,
+      .pNext = NULL,
+      .srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT,
+      .dstAccessMask = VK_ACCESS_HOST_READ_BIT | VK_ACCESS_HOST_WRITE_BIT,
+      .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+      .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+      .buffer = slot->buffer,
+      .offset = slot->offset,
+      .size = 4,
+   };
+   vn_CmdPipelineBarrier(cmd_handle, VK_PIPELINE_STAGE_TRANSFER_BIT,
+                         VK_PIPELINE_STAGE_HOST_BIT, 0, 0, NULL, 1,
+                         &buf_barrier_after, 0, NULL);
+
+   return vn_EndCommandBuffer(cmd_handle);
+}
+
+VkResult
+vn_feedback_fence_cmd_alloc(VkDevice dev_handle,
+                            struct vn_feedback_cmd_pool *pool,
+                            struct vn_feedback_slot *slot,
+                            VkCommandBuffer *out_cmd_handle)
+{
+   const VkCommandBufferAllocateInfo info = {
+      .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO,
+      .pNext = NULL,
+      .commandPool = pool->pool,
+      .level = VK_COMMAND_BUFFER_LEVEL_PRIMARY,
+      .commandBufferCount = 1,
+   };
+   VkCommandBuffer cmd_handle;
+   VkResult result;
+
+   simple_mtx_lock(&pool->mutex);
+   result = vn_AllocateCommandBuffers(dev_handle, &info, &cmd_handle);
+   if (result != VK_SUCCESS)
+      goto out_unlock;
+
+   result = vn_feedback_fence_cmd_record(cmd_handle, slot);
+   if (result != VK_SUCCESS) {
+      vn_FreeCommandBuffers(dev_handle, pool->pool, 1, &cmd_handle);
+      goto out_unlock;
+   }
+
+   *out_cmd_handle = cmd_handle;
+
+out_unlock:
+   simple_mtx_unlock(&pool->mutex);
+
+   return result;
+}
+
+void
+vn_feedback_fence_cmd_free(VkDevice dev_handle,
+                           struct vn_feedback_cmd_pool *pool,
+                           VkCommandBuffer cmd_handle)
+{
+   simple_mtx_lock(&pool->mutex);
+   vn_FreeCommandBuffers(dev_handle, pool->pool, 1, &cmd_handle);
+   simple_mtx_unlock(&pool->mutex);
+}
+
 VkResult
 vn_feedback_cmd_pools_init(struct vn_device *dev)
 {
diff --git a/src/virtio/vulkan/vn_feedback.h b/src/virtio/vulkan/vn_feedback.h
index c391ed8d78e..8f7870c894b 100644
--- a/src/virtio/vulkan/vn_feedback.h
+++ b/src/virtio/vulkan/vn_feedback.h
@@ -113,6 +113,17 @@ vn_feedback_event_cmd_record(VkCommandBuffer cmd_handle,
                              VkPipelineStageFlags stage_mask,
                              VkResult status);
 
+VkResult
+vn_feedback_fence_cmd_alloc(VkDevice dev_handle,
+                            struct vn_feedback_cmd_pool *pool,
+                            struct vn_feedback_slot *slot,
+                            VkCommandBuffer *out_cmd_handle);
+
+void
+vn_feedback_fence_cmd_free(VkDevice dev_handle,
+                           struct vn_feedback_cmd_pool *pool,
+                           VkCommandBuffer cmd_handle);
+
 VkResult
 vn_feedback_cmd_pools_init(struct vn_device *dev);
 
diff --git a/src/virtio/vulkan/vn_queue.c b/src/virtio/vulkan/vn_queue.c
index e50d28b2eaf..7408ba50941 100644
--- a/src/virtio/vulkan/vn_queue.c
+++ b/src/virtio/vulkan/vn_queue.c
@@ -312,6 +312,38 @@ vn_queue_submission_cleanup(struct vn_queue_submission *submit)
    vk_free(alloc, submit->temp.storage);
 }
 
+static inline uint32_t
+vn_queue_family_array_index(struct vn_queue *queue)
+{
+   for (uint32_t i = 0; i < queue->device->queue_family_count; i++) {
+      if (queue->device->queue_families[i] == queue->family)
+         return i;
+   }
+   unreachable("invalid queue");
+}
+
+static VkResult
+vn_queue_submit(struct vn_instance *instance,
+                VkQueue queue_handle,
+                uint32_t batch_count,
+                const VkSubmitInfo *batches,
+                VkFence fence_handle,
+                bool sync_submit)
+{
+   /* skip no-op submit */
+   if (!batch_count && fence_handle == VK_NULL_HANDLE)
+      return VK_SUCCESS;
+
+   if (sync_submit) {
+      return vn_call_vkQueueSubmit(instance, queue_handle, batch_count,
+                                   batches, fence_handle);
+   }
+
+   vn_async_vkQueueSubmit(instance, queue_handle, batch_count, batches,
+                          fence_handle);
+   return VK_SUCCESS;
+}
+
 VkResult
 vn_QueueSubmit(VkQueue _queue,
                uint32_t submitCount,
@@ -322,15 +354,18 @@ vn_QueueSubmit(VkQueue _queue,
    struct vn_queue *queue = vn_queue_from_handle(_queue);
    struct vn_device *dev = queue->device;
    struct vn_fence *fence = vn_fence_from_handle(_fence);
-   const bool is_fence_external = fence && fence->is_external;
-
+   const bool external_fence = fence && fence->is_external;
+   const bool feedback_fence = fence && fence->feedback.slot;
    struct vn_queue_submission submit;
-   VkResult result = vn_queue_submission_prepare_submit(
-      &submit, _queue, submitCount, pSubmits, _fence);
+   const struct vn_device_memory *wsi_mem = NULL;
+   bool sync_submit;
+   VkResult result;
+
+   result = vn_queue_submission_prepare_submit(&submit, _queue, submitCount,
+                                               pSubmits, _fence);
    if (result != VK_SUCCESS)
       return vn_error(dev->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
 
-   const struct vn_device_memory *wsi_mem = NULL;
    if (submit.batch_count == 1) {
       const struct wsi_memory_signal_submit_info *info = vk_find_struct_const(
          submit.submit_batches[0].pNext, WSI_MEMORY_SIGNAL_SUBMIT_INFO_MESA);
@@ -340,22 +375,51 @@ vn_QueueSubmit(VkQueue _queue,
       }
    }
 
-   /* TODO defer roundtrip for external fence until the next sync operation */
-   if (!wsi_mem && !is_fence_external && !VN_PERF(NO_ASYNC_QUEUE_SUBMIT)) {
-      vn_async_vkQueueSubmit(dev->instance, submit.queue, submit.batch_count,
-                             submit.submit_batches, submit.fence);
-      vn_queue_submission_cleanup(&submit);
-      return VK_SUCCESS;
-   }
+   /* force synchronous submission if any of the below applies:
+    * - struct wsi_memory_signal_submit_info
+    * - fence is an external fence
+    * - NO_ASYNC_QUEUE_SUBMIT perf option enabled
+    */
+   sync_submit = wsi_mem || external_fence || VN_PERF(NO_ASYNC_QUEUE_SUBMIT);
 
-   result =
-      vn_call_vkQueueSubmit(dev->instance, submit.queue, submit.batch_count,
-                            submit.submit_batches, submit.fence);
+   /* if the original submission involves a feedback fence:
+    * - defer the feedback fence to another submit to avoid deep copy
+    * - defer the potential sync_submit to the feedback fence submission
+    */
+   result = vn_queue_submit(dev->instance, submit.queue, submit.batch_count,
+                            submit.submit_batches,
+                            feedback_fence ? VK_NULL_HANDLE : submit.fence,
+                            !feedback_fence && sync_submit);
    if (result != VK_SUCCESS) {
       vn_queue_submission_cleanup(&submit);
       return vn_error(dev->instance, result);
    }
 
+   /* TODO intercept original submit batches to append the fence feedback cmd
+    * with a per-queue cached submission builder to avoid transient allocs.
+    *
+    * vn_queue_submission bits must be fixed for VkTimelineSemaphoreSubmitInfo
+    * before adding timeline semaphore feedback.
+    */
+   if (feedback_fence) {
+      const uint32_t feedback_cmd_index = vn_queue_family_array_index(queue);
+      const VkSubmitInfo info = {
+         .sType = VK_STRUCTURE_TYPE_SUBMIT_INFO,
+         .pNext = NULL,
+         .waitSemaphoreCount = 0,
+         .pWaitSemaphores = NULL,
+         .pWaitDstStageMask = NULL,
+         .commandBufferCount = 1,
+         .pCommandBuffers = &fence->feedback.commands[feedback_cmd_index],
+      };
+      result = vn_queue_submit(dev->instance, submit.queue, 1, &info,
+                               submit.fence, sync_submit);
+      if (result != VK_SUCCESS) {
+         vn_queue_submission_cleanup(&submit);
+         return vn_error(dev->instance, result);
+      }
+   }
+
    if (wsi_mem) {
       /* XXX this is always false and kills the performance */
       if (dev->instance->renderer->info.has_implicit_fencing) {
@@ -463,6 +527,84 @@ vn_fence_signal_wsi(struct vn_device *dev, struct vn_fence *fence)
    fence->payload = temp;
 }
 
+static VkResult
+vn_fence_feedback_init(struct vn_device *dev,
+                       struct vn_fence *fence,
+                       bool signaled,
+                       const VkAllocationCallbacks *alloc)
+{
+   VkDevice dev_handle = vn_device_to_handle(dev);
+   struct vn_feedback_slot *slot;
+   VkCommandBuffer *cmd_handles;
+   VkResult result;
+
+   /* Fence feedback implementation relies on vkWaitForFences to cover the gap
+    * between feedback slot signaling and the actual fence signal operation.
+    */
+   if (unlikely(!dev->instance->renderer->info.allow_vk_wait_syncs))
+      return VK_SUCCESS;
+
+   if (VN_PERF(NO_FENCE_FEEDBACK))
+      return VK_SUCCESS;
+
+   slot = vn_feedback_pool_alloc(&dev->feedback_pool, VN_FEEDBACK_TYPE_FENCE);
+   if (!slot)
+      return VK_ERROR_OUT_OF_HOST_MEMORY;
+
+   vn_feedback_set_status(slot, signaled ? VK_SUCCESS : VK_NOT_READY);
+
+   cmd_handles =
+      vk_zalloc(alloc, sizeof(*cmd_handles) * dev->queue_family_count,
+                VN_DEFAULT_ALIGN, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
+   if (!cmd_handles) {
+      vn_feedback_pool_free(&dev->feedback_pool, slot);
+      return VK_ERROR_OUT_OF_HOST_MEMORY;
+   }
+
+   for (uint32_t i = 0; i < dev->queue_family_count; i++) {
+      result = vn_feedback_fence_cmd_alloc(dev_handle, &dev->cmd_pools[i],
+                                           slot, &cmd_handles[i]);
+      if (result != VK_SUCCESS) {
+         for (uint32_t j = 0; j < i; j++) {
+            vn_feedback_fence_cmd_free(dev_handle, &dev->cmd_pools[j],
+                                       cmd_handles[j]);
+         }
+         break;
+      }
+   }
+
+   if (result != VK_SUCCESS) {
+      vk_free(alloc, cmd_handles);
+      vn_feedback_pool_free(&dev->feedback_pool, slot);
+      return result;
+   }
+
+   fence->feedback.slot = slot;
+   fence->feedback.commands = cmd_handles;
+
+   return VK_SUCCESS;
+}
+
+static void
+vn_fence_feedback_fini(struct vn_device *dev,
+                       struct vn_fence *fence,
+                       const VkAllocationCallbacks *alloc)
+{
+   VkDevice dev_handle = vn_device_to_handle(dev);
+
+   if (!fence->feedback.slot)
+      return;
+
+   for (uint32_t i = 0; i < dev->queue_family_count; i++) {
+      vn_feedback_fence_cmd_free(dev_handle, &dev->cmd_pools[i],
+                                 fence->feedback.commands[i]);
+   }
+
+   vn_feedback_pool_free(&dev->feedback_pool, fence->feedback.slot);
+
+   vk_free(alloc, fence->feedback.commands);
+}
+
 VkResult
 vn_CreateFence(VkDevice device,
                const VkFenceCreateInfo *pCreateInfo,
@@ -472,6 +614,8 @@ vn_CreateFence(VkDevice device,
    struct vn_device *dev = vn_device_from_handle(device);
    const VkAllocationCallbacks *alloc =
       pAllocator ? pAllocator : &dev->base.base.alloc;
+   const bool signaled = pCreateInfo->flags & VK_FENCE_CREATE_SIGNALED_BIT;
+   VkResult result;
 
    struct vn_fence *fence = vk_zalloc(alloc, sizeof(*fence), VN_DEFAULT_ALIGN,
                                       VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
@@ -491,21 +635,27 @@ vn_CreateFence(VkDevice device,
       fence->is_external = !!export_info->handleTypes;
    }
 
-   VkResult result = vn_fence_init_payloads(
-      dev, fence, pCreateInfo->flags & VK_FENCE_CREATE_SIGNALED_BIT, alloc);
-   if (result != VK_SUCCESS) {
-      vn_object_base_fini(&fence->base);
-      vk_free(alloc, fence);
-      return vn_error(dev->instance, result);
-   }
+   result = vn_fence_init_payloads(dev, fence, signaled, alloc);
+   if (result != VK_SUCCESS)
+      goto out_object_base_fini;
 
-   VkFence fence_handle = vn_fence_to_handle(fence);
-   vn_async_vkCreateFence(dev->instance, device, pCreateInfo, NULL,
-                          &fence_handle);
+   result = vn_fence_feedback_init(dev, fence, signaled, alloc);
+   if (result != VK_SUCCESS)
+      goto out_payloads_fini;
 
-   *pFence = fence_handle;
+   *pFence = vn_fence_to_handle(fence);
+   vn_async_vkCreateFence(dev->instance, device, pCreateInfo, NULL, pFence);
 
    return VK_SUCCESS;
+
+out_payloads_fini:
+   vn_sync_payload_release(dev, &fence->permanent);
+   vn_sync_payload_release(dev, &fence->temporary);
+
+out_object_base_fini:
+   vn_object_base_fini(&fence->base);
+   vk_free(alloc, fence);
+   return vn_error(dev->instance, result);
 }
 
 void
@@ -523,6 +673,8 @@ vn_DestroyFence(VkDevice device,
 
    vn_async_vkDestroyFence(dev->instance, device, _fence, NULL);
 
+   vn_fence_feedback_fini(dev, fence, alloc);
+
    vn_sync_payload_release(dev, &fence->permanent);
    vn_sync_payload_release(dev, &fence->temporary);
 
@@ -549,6 +701,9 @@ vn_ResetFences(VkDevice device, uint32_t fenceCount, const VkFence *pFences)
       assert(perm->type == VN_SYNC_TYPE_DEVICE_ONLY);
       fence->payload = perm;
+
+      if (fence->feedback.slot)
+         vn_feedback_reset_status(fence->feedback.slot);
    }
 
    return VK_SUCCESS;
@@ -564,7 +719,23 @@ vn_GetFenceStatus(VkDevice device, VkFence _fence)
    VkResult result;
    switch (payload->type) {
    case VN_SYNC_TYPE_DEVICE_ONLY:
-      result = vn_call_vkGetFenceStatus(dev->instance, device, _fence);
+      if (fence->feedback.slot) {
+         result = vn_feedback_get_status(fence->feedback.slot);
+         if (result == VK_SUCCESS) {
+            /* When fence feedback slot gets signaled, the real fence
+             * signal operation follows after but the signaling isr can be
+             * deferred or preempted. To avoid theoretical racing, we let
+             * the renderer wait for the fence. This also helps resolve
+             * synchronization validation errors, because the layer no
+             * longer sees any fence status checks and falsely believes the
+             * caller does not sync.
+             */
+            vn_async_vkWaitForFences(dev->instance, device, 1, &_fence,
+                                     VK_TRUE, UINT64_MAX);
+         }
+      } else {
+         result = vn_call_vkGetFenceStatus(dev->instance, device, _fence);
+      }
       break;
    case VN_SYNC_TYPE_WSI_SIGNALED:
      result = VK_SUCCESS;
diff --git a/src/virtio/vulkan/vn_queue.h b/src/virtio/vulkan/vn_queue.h
index a66697b4f9c..594ca226e53 100644
--- a/src/virtio/vulkan/vn_queue.h
+++ b/src/virtio/vulkan/vn_queue.h
@@ -50,6 +50,12 @@ struct vn_fence {
    struct vn_sync_payload permanent;
    struct vn_sync_payload temporary;
 
+   struct {
+      /* non-NULL if VN_PERF_NO_FENCE_FEEDBACK is disabled */
+      struct vn_feedback_slot *slot;
+      VkCommandBuffer *commands;
+   } feedback;
+
    bool is_external;
 };
 
 VK_DEFINE_NONDISP_HANDLE_CASTS(vn_fence,
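
Reviewer note, not part of the patch: the vn_QueueSubmit change above splits a fenced submission in two so the caller's batches never need a deep copy. The original batches go out first with the fence deferred, then a one-command submit writes the feedback slot and carries the real fence, so the slot write and the fence signal complete back to back on the same queue; vn_GetFenceStatus can then poll the slot from the host and only hands the wait to the renderer once the slot reads VK_SUCCESS. A minimal sketch of that flow in core Vulkan terms follows; submit_with_fence_feedback, app_batches and feedback_cmd are placeholder names for illustration (the driver itself goes through vn_queue_submit() and the vn_async_*/vn_call_* wrappers).

#include <vulkan/vulkan.h>

static VkResult
submit_with_fence_feedback(VkQueue queue,
                           uint32_t batch_count,
                           const VkSubmitInfo *app_batches,
                           VkCommandBuffer feedback_cmd,
                           VkFence fence)
{
   /* first submit: the caller's batches, fence deferred */
   VkResult result =
      vkQueueSubmit(queue, batch_count, app_batches, VK_NULL_HANDLE);
   if (result != VK_SUCCESS)
      return result;

   /* second submit: the pre-recorded command buffer that fills the 4-byte
    * feedback slot with VK_SUCCESS, carrying the real fence
    */
   const VkSubmitInfo feedback_submit = {
      .sType = VK_STRUCTURE_TYPE_SUBMIT_INFO,
      .commandBufferCount = 1,
      .pCommandBuffers = &feedback_cmd,
   };
   return vkQueueSubmit(queue, 1, &feedback_submit, fence);
}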