venus: add fence feedback

- intercept to record feedback cmds for:
  - vkQueueSubmit
- add feedback code paths for:
  - vkGetFenceStatus
  - vkResetFences
- VN_PERF_NO_FENCE_FEEDBACK can disable fence feedback
  (the mechanism is sketched just before the diffs below)

Test: dEQP-VK.synchronization.basic.fence.*
Test: dEQP-VK.wsi.android.swapchain.render.basic*
Test: dEQP-VK.api.object_management.*
Test: dEQP-VK.api.external.fence.sync_fd.*

Signed-off-by: Yiwei Zhang <zzyiwei@chromium.org>
Reviewed-by: Ryan Neph <ryanneph@google.com>
Reviewed-by: Chad Versace <chadversary@chromium.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/16731>
Author:    Yiwei Zhang <zzyiwei@chromium.org>
Committed: 2022-05-25 07:13:13 +00:00 by Marge Bot
Commit:    d7f2e6c8d0 (parent 9f9d543b12)
5 changed files with 323 additions and 29 deletions
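
In a nutshell, the mechanism the diffs below implement: each fence owns a
4-byte feedback slot in host-visible, host-coherent memory; a pre-recorded
command buffer appended to the fenced submission fills the slot with
VK_SUCCESS on the GPU timeline; fence status checks then become guest-local
reads of the slot instead of synchronous calls into the host renderer. A
minimal sketch with hypothetical names (the driver's actual types and
plumbing are in the diffs themselves):

#include <vulkan/vulkan.h>

/* Hypothetical stand-in for the driver's feedback slot: the same 4 bytes
 * are visible to the GPU through `buffer` and to the guest CPU through a
 * persistent, host-coherent mapping. */
struct feedback_slot_sketch {
   VkBuffer buffer;
   VkDeviceSize offset;
   volatile uint32_t *status; /* mapped view of buffer[offset..offset+4) */
};

/* Recorded once per fence (per queue family) and appended to the fenced
 * submission: the GPU stamps VK_SUCCESS into the slot as it drains. */
static void
record_feedback_cmd_sketch(VkCommandBuffer cmd,
                           const struct feedback_slot_sketch *slot)
{
   /* the real recording brackets this with barriers; see
    * vn_feedback_fence_cmd_record below */
   vkCmdFillBuffer(cmd, slot->buffer, slot->offset, 4, VK_SUCCESS);
}

/* The fence status check becomes a local memory read. */
static VkResult
get_fence_status_sketch(const struct feedback_slot_sketch *slot)
{
   return (VkResult)*slot->status; /* VK_NOT_READY until the GPU write lands */
}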


@@ -310,7 +310,7 @@ vn_device_feedback_pool_init(struct vn_device *dev)
    static const uint32_t pool_size = 4096;
    const VkAllocationCallbacks *alloc = &dev->base.base.alloc;
 
-   if (VN_PERF(NO_EVENT_FEEDBACK))
+   if (VN_PERF(NO_EVENT_FEEDBACK) && VN_PERF(NO_FENCE_FEEDBACK))
       return VK_SUCCESS;
 
    return vn_feedback_pool_init(dev, &dev->feedback_pool, pool_size, alloc);
@@ -319,7 +319,7 @@ vn_device_feedback_pool_init(struct vn_device *dev)
 static inline void
 vn_device_feedback_pool_fini(struct vn_device *dev)
 {
-   if (VN_PERF(NO_EVENT_FEEDBACK))
+   if (VN_PERF(NO_EVENT_FEEDBACK) && VN_PERF(NO_FENCE_FEEDBACK))
       return;
 
    vn_feedback_pool_fini(&dev->feedback_pool);


@@ -323,6 +323,112 @@ vn_feedback_event_cmd_record(VkCommandBuffer cmd_handle,
                          &buf_barrier_after, 0, NULL);
 }
 
+static VkResult
+vn_feedback_fence_cmd_record(VkCommandBuffer cmd_handle,
+                             struct vn_feedback_slot *slot)
+{
+   STATIC_ASSERT(sizeof(*slot->status) == 4);
+
+   static const VkCommandBufferBeginInfo begin_info = {
+      .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO,
+      .pNext = NULL,
+      .flags = 0,
+      .pInheritanceInfo = NULL,
+   };
+   VkResult result = vn_BeginCommandBuffer(cmd_handle, &begin_info);
+   if (result != VK_SUCCESS)
+      return result;
+
+   static const VkMemoryBarrier mem_barrier_before = {
+      .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER,
+      .pNext = NULL,
+      /* make pending writes available to stay close to fence signal op */
+      .srcAccessMask = VK_ACCESS_MEMORY_WRITE_BIT,
+      /* no need to make all memory visible for feedback update */
+      .dstAccessMask = 0,
+   };
+   const VkBufferMemoryBarrier buf_barrier_before = {
+      .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER,
+      .pNext = NULL,
+      /* slot memory has been made available via mem_barrier_before */
+      .srcAccessMask = 0,
+      .dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT,
+      .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+      .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+      .buffer = slot->buffer,
+      .offset = slot->offset,
+      .size = 4,
+   };
+   vn_CmdPipelineBarrier(cmd_handle, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT,
+                         VK_PIPELINE_STAGE_TRANSFER_BIT, 0, 1,
+                         &mem_barrier_before, 1, &buf_barrier_before, 0,
+                         NULL);
+
+   vn_CmdFillBuffer(cmd_handle, slot->buffer, slot->offset, 4, VK_SUCCESS);
+
+   const VkBufferMemoryBarrier buf_barrier_after = {
+      .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER,
+      .pNext = NULL,
+      .srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT,
+      .dstAccessMask = VK_ACCESS_HOST_READ_BIT | VK_ACCESS_HOST_WRITE_BIT,
+      .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+      .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+      .buffer = slot->buffer,
+      .offset = slot->offset,
+      .size = 4,
+   };
+   vn_CmdPipelineBarrier(cmd_handle, VK_PIPELINE_STAGE_TRANSFER_BIT,
+                         VK_PIPELINE_STAGE_HOST_BIT, 0, 0, NULL, 1,
+                         &buf_barrier_after, 0, NULL);
+
+   return vn_EndCommandBuffer(cmd_handle);
+}
+
+VkResult
+vn_feedback_fence_cmd_alloc(VkDevice dev_handle,
+                            struct vn_feedback_cmd_pool *pool,
+                            struct vn_feedback_slot *slot,
+                            VkCommandBuffer *out_cmd_handle)
+{
+   const VkCommandBufferAllocateInfo info = {
+      .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO,
+      .pNext = NULL,
+      .commandPool = pool->pool,
+      .level = VK_COMMAND_BUFFER_LEVEL_PRIMARY,
+      .commandBufferCount = 1,
+   };
+   VkCommandBuffer cmd_handle;
+   VkResult result;
+
+   simple_mtx_lock(&pool->mutex);
+   result = vn_AllocateCommandBuffers(dev_handle, &info, &cmd_handle);
+   if (result != VK_SUCCESS)
+      goto out_unlock;
+
+   result = vn_feedback_fence_cmd_record(cmd_handle, slot);
+   if (result != VK_SUCCESS) {
+      vn_FreeCommandBuffers(dev_handle, pool->pool, 1, &cmd_handle);
+      goto out_unlock;
+   }
+
+   *out_cmd_handle = cmd_handle;
+
+out_unlock:
+   simple_mtx_unlock(&pool->mutex);
+
+   return result;
+}
+
+void
+vn_feedback_fence_cmd_free(VkDevice dev_handle,
+                           struct vn_feedback_cmd_pool *pool,
+                           VkCommandBuffer cmd_handle)
+{
+   simple_mtx_lock(&pool->mutex);
+   vn_FreeCommandBuffers(dev_handle, pool->pool, 1, &cmd_handle);
+   simple_mtx_unlock(&pool->mutex);
+}
+
 VkResult
 vn_feedback_cmd_pools_init(struct vn_device *dev)
 {


@@ -113,6 +113,17 @@ vn_feedback_event_cmd_record(VkCommandBuffer cmd_handle,
                              VkPipelineStageFlags stage_mask,
                              VkResult status);
 
+VkResult
+vn_feedback_fence_cmd_alloc(VkDevice dev_handle,
+                            struct vn_feedback_cmd_pool *pool,
+                            struct vn_feedback_slot *slot,
+                            VkCommandBuffer *out_cmd_handle);
+
+void
+vn_feedback_fence_cmd_free(VkDevice dev_handle,
+                           struct vn_feedback_cmd_pool *pool,
+                           VkCommandBuffer cmd_handle);
+
 VkResult
 vn_feedback_cmd_pools_init(struct vn_device *dev);


@@ -312,6 +312,38 @@ vn_queue_submission_cleanup(struct vn_queue_submission *submit)
    vk_free(alloc, submit->temp.storage);
 }
 
+static inline uint32_t
+vn_queue_family_array_index(struct vn_queue *queue)
+{
+   for (uint32_t i = 0; i < queue->device->queue_family_count; i++) {
+      if (queue->device->queue_families[i] == queue->family)
+         return i;
+   }
+   unreachable("invalid queue");
+}
+
+static VkResult
+vn_queue_submit(struct vn_instance *instance,
+                VkQueue queue_handle,
+                uint32_t batch_count,
+                const VkSubmitInfo *batches,
+                VkFence fence_handle,
+                bool sync_submit)
+{
+   /* skip no-op submit */
+   if (!batch_count && fence_handle == VK_NULL_HANDLE)
+      return VK_SUCCESS;
+
+   if (sync_submit) {
+      return vn_call_vkQueueSubmit(instance, queue_handle, batch_count,
+                                   batches, fence_handle);
+   }
+
+   vn_async_vkQueueSubmit(instance, queue_handle, batch_count, batches,
+                          fence_handle);
+   return VK_SUCCESS;
+}
+
 VkResult
 vn_QueueSubmit(VkQueue _queue,
                uint32_t submitCount,
@@ -322,15 +354,18 @@ vn_QueueSubmit(VkQueue _queue,
    struct vn_queue *queue = vn_queue_from_handle(_queue);
    struct vn_device *dev = queue->device;
    struct vn_fence *fence = vn_fence_from_handle(_fence);
-   const bool is_fence_external = fence && fence->is_external;
+   const bool external_fence = fence && fence->is_external;
+   const bool feedback_fence = fence && fence->feedback.slot;
    struct vn_queue_submission submit;
-   VkResult result = vn_queue_submission_prepare_submit(
-      &submit, _queue, submitCount, pSubmits, _fence);
+   const struct vn_device_memory *wsi_mem = NULL;
+   bool sync_submit;
+   VkResult result;
+
+   result = vn_queue_submission_prepare_submit(&submit, _queue, submitCount,
+                                               pSubmits, _fence);
    if (result != VK_SUCCESS)
       return vn_error(dev->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
 
-   const struct vn_device_memory *wsi_mem = NULL;
    if (submit.batch_count == 1) {
       const struct wsi_memory_signal_submit_info *info = vk_find_struct_const(
          submit.submit_batches[0].pNext, WSI_MEMORY_SIGNAL_SUBMIT_INFO_MESA);
@@ -340,22 +375,51 @@ vn_QueueSubmit(VkQueue _queue,
       }
    }
 
-   /* TODO defer roundtrip for external fence until the next sync operation */
-   if (!wsi_mem && !is_fence_external && !VN_PERF(NO_ASYNC_QUEUE_SUBMIT)) {
-      vn_async_vkQueueSubmit(dev->instance, submit.queue, submit.batch_count,
-                             submit.submit_batches, submit.fence);
-      vn_queue_submission_cleanup(&submit);
-      return VK_SUCCESS;
-   }
+   /* force synchronous submission if any of the below applies:
+    * - struct wsi_memory_signal_submit_info
+    * - fence is an external fence
+    * - NO_ASYNC_QUEUE_SUBMIT perf option enabled
+    */
+   sync_submit = wsi_mem || external_fence || VN_PERF(NO_ASYNC_QUEUE_SUBMIT);
 
-   result =
-      vn_call_vkQueueSubmit(dev->instance, submit.queue, submit.batch_count,
-                            submit.submit_batches, submit.fence);
+   /* if the original submission involves a feedback fence:
+    * - defer the feedback fence to another submit to avoid deep copy
+    * - defer the potential sync_submit to the feedback fence submission
+    */
+   result = vn_queue_submit(dev->instance, submit.queue, submit.batch_count,
+                            submit.submit_batches,
+                            feedback_fence ? VK_NULL_HANDLE : submit.fence,
+                            !feedback_fence && sync_submit);
    if (result != VK_SUCCESS) {
       vn_queue_submission_cleanup(&submit);
       return vn_error(dev->instance, result);
    }
 
+   /* TODO intercept original submit batches to append the fence feedback cmd
+    * with a per-queue cached submission builder to avoid transient allocs.
+    *
+    * vn_queue_submission bits must be fixed for VkTimelineSemaphoreSubmitInfo
+    * before adding timeline semaphore feedback.
+    */
+   if (feedback_fence) {
+      const uint32_t feedback_cmd_index = vn_queue_family_array_index(queue);
+      const VkSubmitInfo info = {
+         .sType = VK_STRUCTURE_TYPE_SUBMIT_INFO,
+         .pNext = NULL,
+         .waitSemaphoreCount = 0,
+         .pWaitSemaphores = NULL,
+         .pWaitDstStageMask = NULL,
+         .commandBufferCount = 1,
+         .pCommandBuffers = &fence->feedback.commands[feedback_cmd_index],
+      };
+      result = vn_queue_submit(dev->instance, submit.queue, 1, &info,
+                               submit.fence, sync_submit);
+      if (result != VK_SUCCESS) {
+         vn_queue_submission_cleanup(&submit);
+         return vn_error(dev->instance, result);
+      }
+   }
+
    if (wsi_mem) {
       /* XXX this is always false and kills the performance */
       if (dev->instance->renderer->info.has_implicit_fencing) {
@@ -463,6 +527,84 @@ vn_fence_signal_wsi(struct vn_device *dev, struct vn_fence *fence)
    fence->payload = temp;
 }
 
+static VkResult
+vn_fence_feedback_init(struct vn_device *dev,
+                       struct vn_fence *fence,
+                       bool signaled,
+                       const VkAllocationCallbacks *alloc)
+{
+   VkDevice dev_handle = vn_device_to_handle(dev);
+   struct vn_feedback_slot *slot;
+   VkCommandBuffer *cmd_handles;
+   VkResult result;
+
+   /* Fence feedback implementation relies on vkWaitForFences to cover the
+    * gap between feedback slot signaling and the actual fence signal
+    * operation.
+    */
+   if (unlikely(!dev->instance->renderer->info.allow_vk_wait_syncs))
+      return VK_SUCCESS;
+
+   if (VN_PERF(NO_FENCE_FEEDBACK))
+      return VK_SUCCESS;
+
+   slot = vn_feedback_pool_alloc(&dev->feedback_pool, VN_FEEDBACK_TYPE_FENCE);
+   if (!slot)
+      return VK_ERROR_OUT_OF_HOST_MEMORY;
+
+   vn_feedback_set_status(slot, signaled ? VK_SUCCESS : VK_NOT_READY);
+
+   cmd_handles =
+      vk_zalloc(alloc, sizeof(*cmd_handles) * dev->queue_family_count,
+                VN_DEFAULT_ALIGN, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
+   if (!cmd_handles) {
+      vn_feedback_pool_free(&dev->feedback_pool, slot);
+      return VK_ERROR_OUT_OF_HOST_MEMORY;
+   }
+
+   for (uint32_t i = 0; i < dev->queue_family_count; i++) {
+      result = vn_feedback_fence_cmd_alloc(dev_handle, &dev->cmd_pools[i],
+                                           slot, &cmd_handles[i]);
+      if (result != VK_SUCCESS) {
+         for (uint32_t j = 0; j < i; j++) {
+            vn_feedback_fence_cmd_free(dev_handle, &dev->cmd_pools[j],
+                                       cmd_handles[j]);
+         }
+         break;
+      }
+   }
+
+   if (result != VK_SUCCESS) {
+      vk_free(alloc, cmd_handles);
+      vn_feedback_pool_free(&dev->feedback_pool, slot);
+      return result;
+   }
+
+   fence->feedback.slot = slot;
+   fence->feedback.commands = cmd_handles;
+
+   return VK_SUCCESS;
+}
+
+static void
+vn_fence_feedback_fini(struct vn_device *dev,
+                       struct vn_fence *fence,
+                       const VkAllocationCallbacks *alloc)
+{
+   VkDevice dev_handle = vn_device_to_handle(dev);
+
+   if (!fence->feedback.slot)
+      return;
+
+   for (uint32_t i = 0; i < dev->queue_family_count; i++) {
+      vn_feedback_fence_cmd_free(dev_handle, &dev->cmd_pools[i],
+                                 fence->feedback.commands[i]);
+   }
+
+   vn_feedback_pool_free(&dev->feedback_pool, fence->feedback.slot);
+
+   vk_free(alloc, fence->feedback.commands);
+}
+
 VkResult
 vn_CreateFence(VkDevice device,
                const VkFenceCreateInfo *pCreateInfo,
@@ -472,6 +614,8 @@ vn_CreateFence(VkDevice device,
    struct vn_device *dev = vn_device_from_handle(device);
    const VkAllocationCallbacks *alloc =
       pAllocator ? pAllocator : &dev->base.base.alloc;
+   const bool signaled = pCreateInfo->flags & VK_FENCE_CREATE_SIGNALED_BIT;
+   VkResult result;
 
    struct vn_fence *fence = vk_zalloc(alloc, sizeof(*fence), VN_DEFAULT_ALIGN,
                                       VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
@@ -491,21 +635,27 @@ vn_CreateFence(VkDevice device,
       fence->is_external = !!export_info->handleTypes;
    }
 
-   VkResult result = vn_fence_init_payloads(
-      dev, fence, pCreateInfo->flags & VK_FENCE_CREATE_SIGNALED_BIT, alloc);
-   if (result != VK_SUCCESS) {
-      vn_object_base_fini(&fence->base);
-      vk_free(alloc, fence);
-      return vn_error(dev->instance, result);
-   }
+   result = vn_fence_init_payloads(dev, fence, signaled, alloc);
+   if (result != VK_SUCCESS)
+      goto out_object_base_fini;
 
-   VkFence fence_handle = vn_fence_to_handle(fence);
-   vn_async_vkCreateFence(dev->instance, device, pCreateInfo, NULL,
-                          &fence_handle);
+   result = vn_fence_feedback_init(dev, fence, signaled, alloc);
+   if (result != VK_SUCCESS)
+      goto out_payloads_fini;
 
-   *pFence = fence_handle;
+   *pFence = vn_fence_to_handle(fence);
+   vn_async_vkCreateFence(dev->instance, device, pCreateInfo, NULL, pFence);
 
    return VK_SUCCESS;
+
+out_payloads_fini:
+   vn_sync_payload_release(dev, &fence->permanent);
+   vn_sync_payload_release(dev, &fence->temporary);
+out_object_base_fini:
+   vn_object_base_fini(&fence->base);
+   vk_free(alloc, fence);
+   return vn_error(dev->instance, result);
 }
 
 void
@@ -523,6 +673,8 @@ vn_DestroyFence(VkDevice device,
 
    vn_async_vkDestroyFence(dev->instance, device, _fence, NULL);
 
+   vn_fence_feedback_fini(dev, fence, alloc);
+
    vn_sync_payload_release(dev, &fence->permanent);
    vn_sync_payload_release(dev, &fence->temporary);
@@ -549,6 +701,9 @@ vn_ResetFences(VkDevice device, uint32_t fenceCount, const VkFence *pFences)
 
       assert(perm->type == VN_SYNC_TYPE_DEVICE_ONLY);
       fence->payload = perm;
+
+      if (fence->feedback.slot)
+         vn_feedback_reset_status(fence->feedback.slot);
    }
 
    return VK_SUCCESS;
@@ -564,7 +719,23 @@ vn_GetFenceStatus(VkDevice device, VkFence _fence)
    VkResult result;
    switch (payload->type) {
    case VN_SYNC_TYPE_DEVICE_ONLY:
-      result = vn_call_vkGetFenceStatus(dev->instance, device, _fence);
+      if (fence->feedback.slot) {
+         result = vn_feedback_get_status(fence->feedback.slot);
+         if (result == VK_SUCCESS) {
+            /* When fence feedback slot gets signaled, the real fence
+             * signal operation follows after but the signaling isr can be
+             * deferred or preempted. To avoid theoretical racing, we let
+             * the renderer wait for the fence. This also helps resolve
+             * synchronization validation errors, because the layer no
+             * longer sees any fence status checks and falsely believes the
+             * caller does not sync.
+             */
+            vn_async_vkWaitForFences(dev->instance, device, 1, &_fence,
+                                     VK_TRUE, UINT64_MAX);
+         }
+      } else {
+         result = vn_call_vkGetFenceStatus(dev->instance, device, _fence);
+      }
       break;
    case VN_SYNC_TYPE_WSI_SIGNALED:
       result = VK_SUCCESS;
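
Taken together, the queue and fence hunks above change what a routine
polling loop costs: vkResetFences also resets the slot, vkQueueSubmit splits
into an async batch submit plus a deferred feedback-fence submit, and each
vkGetFenceStatus is serviced from the mapped slot. A sketch of the
application-visible sequence (plain Vulkan; the handles are assumed to be
set up elsewhere):

#include <vulkan/vulkan.h>

/* Poll a fence to completion; under fence feedback, each status check is
 * a guest-local slot read rather than a renderer roundtrip. */
static VkResult
submit_and_poll_sketch(VkDevice dev, VkQueue queue,
                       const VkSubmitInfo *submit_info, VkFence fence)
{
   VkResult result = vkResetFences(dev, 1, &fence); /* resets the slot too */
   if (result != VK_SUCCESS)
      return result;

   /* the driver submits the batches asynchronously and defers the fence to
    * a second submit carrying the pre-recorded feedback command */
   result = vkQueueSubmit(queue, 1, submit_info, fence);
   if (result != VK_SUCCESS)
      return result;

   /* busy-wait purely for illustration; real code should prefer
    * vkWaitForFences */
   while ((result = vkGetFenceStatus(dev, fence)) == VK_NOT_READY)
      ;
   return result;
}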


@@ -50,6 +50,12 @@ struct vn_fence {
    struct vn_sync_payload permanent;
    struct vn_sync_payload temporary;
 
+   struct {
+      /* non-NULL if VN_PERF_NO_FENCE_FEEDBACK is disabled */
+      struct vn_feedback_slot *slot;
+      VkCommandBuffer *commands;
+   } feedback;
+
    bool is_external;
 };
 
 VK_DEFINE_NONDISP_HANDLE_CASTS(vn_fence,
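
For reference, the slot status helpers used throughout
(vn_feedback_set_status, vn_feedback_reset_status, vn_feedback_get_status)
predate this commit and are not part of the diff. Since the slot lives in
host-visible, host-coherent memory, they plausibly reduce to volatile 4-byte
accesses along these lines (a hypothetical sketch, not the actual
implementation):

#include <vulkan/vulkan.h>

/* hypothetical mapped view of a feedback slot's 4-byte status word */
struct slot_status_sketch {
   volatile VkResult *status;
};

static inline void
set_status_sketch(struct slot_status_sketch *slot, VkResult status)
{
   *slot->status = status; /* e.g. VK_SUCCESS for a signaled fence */
}

static inline void
reset_status_sketch(struct slot_status_sketch *slot)
{
   *slot->status = VK_NOT_READY; /* what vn_ResetFences needs per slot */
}

static inline VkResult
get_status_sketch(const struct slot_status_sketch *slot)
{
   return *slot->status;
}

This pairing is also why vn_fence_feedback_init bails out when
allow_vk_wait_syncs is false: the slot write can land ahead of the
renderer-side fence signal, and the renderer-side vkWaitForFences is what
closes that gap.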