diff --git a/src/vulkan/wsi/wsi_common.c b/src/vulkan/wsi/wsi_common.c index 0c804562c78..6a2c5c80141 100644 --- a/src/vulkan/wsi/wsi_common.c +++ b/src/vulkan/wsi/wsi_common.c @@ -95,6 +95,7 @@ wsi_device_init(struct wsi_device *wsi, WSI_GET_CB(GetPhysicalDeviceProperties2); WSI_GET_CB(GetPhysicalDeviceMemoryProperties); WSI_GET_CB(GetPhysicalDeviceQueueFamilyProperties); + WSI_GET_CB(GetPhysicalDeviceProperties); #undef WSI_GET_CB wsi->drm_info.sType = @@ -121,10 +122,18 @@ wsi_device_init(struct wsi_device *wsi, VkQueueFamilyProperties queue_properties[64]; GetPhysicalDeviceQueueFamilyProperties(pdevice, &wsi->queue_family_count, queue_properties); + VkPhysicalDeviceProperties properties; + GetPhysicalDeviceProperties(pdevice, &properties); + wsi->timestamp_period = properties.limits.timestampPeriod; + for (unsigned i = 0; i < wsi->queue_family_count; i++) { VkFlags req_flags = VK_QUEUE_GRAPHICS_BIT | VK_QUEUE_COMPUTE_BIT | VK_QUEUE_TRANSFER_BIT; if (queue_properties[i].queueFlags & req_flags) wsi->queue_supports_blit |= BITFIELD64_BIT(i); + + /* Don't want to consider timestamp wrapping logic. */ + if (queue_properties[i].timestampValidBits == 64) + wsi->queue_supports_timestamps |= BITFIELD64_BIT(i); } for (VkExternalSemaphoreHandleTypeFlags handle_type = 1; @@ -180,15 +189,19 @@ wsi_device_init(struct wsi_device *wsi, WSI_GET_CB(CmdPipelineBarrier); WSI_GET_CB(CmdCopyImage); WSI_GET_CB(CmdCopyImageToBuffer); + WSI_GET_CB(CmdResetQueryPool); + WSI_GET_CB(CmdWriteTimestamp); WSI_GET_CB(CreateBuffer); WSI_GET_CB(CreateCommandPool); WSI_GET_CB(CreateFence); WSI_GET_CB(CreateImage); + WSI_GET_CB(CreateQueryPool); WSI_GET_CB(CreateSemaphore); WSI_GET_CB(DestroyBuffer); WSI_GET_CB(DestroyCommandPool); WSI_GET_CB(DestroyFence); WSI_GET_CB(DestroyImage); + WSI_GET_CB(DestroyQueryPool); WSI_GET_CB(DestroySemaphore); WSI_GET_CB(EndCommandBuffer); WSI_GET_CB(FreeMemory); @@ -200,9 +213,14 @@ wsi_device_init(struct wsi_device *wsi, WSI_GET_CB(GetImageSubresourceLayout); if (!wsi->sw) WSI_GET_CB(GetMemoryFdKHR); + WSI_GET_CB(GetPhysicalDeviceCalibrateableTimeDomainsKHR); + WSI_GET_CB(GetPhysicalDeviceProperties); WSI_GET_CB(GetPhysicalDeviceFormatProperties); WSI_GET_CB(GetPhysicalDeviceFormatProperties2); WSI_GET_CB(GetPhysicalDeviceImageFormatProperties2); + WSI_GET_CB(GetPhysicalDeviceQueueFamilyProperties); + WSI_GET_CB(GetCalibratedTimestampsKHR); + WSI_GET_CB(GetQueryPoolResults); WSI_GET_CB(GetSemaphoreFdKHR); WSI_GET_CB(ResetFences); WSI_GET_CB(QueueSubmit2); @@ -459,6 +477,10 @@ configure_image(const struct wsi_swapchain *chain, } } +static void +wsi_surface_capabilities_add_present_stages(const struct wsi_device *wsi_device, + VkPresentTimingSurfaceCapabilitiesEXT *present_timing); + VkResult wsi_swapchain_init(const struct wsi_device *wsi, struct wsi_swapchain *chain, @@ -484,8 +506,10 @@ wsi_swapchain_init(const struct wsi_device *wsi, (pCreateInfo->flags & VK_SWAPCHAIN_CREATE_PRESENT_WAIT_2_BIT_KHR); chain->blit.queue = NULL; - if (chain->blit.type != WSI_SWAPCHAIN_NO_BLIT) { - if (wsi->get_blit_queue) { + if (chain->blit.type != WSI_SWAPCHAIN_NO_BLIT || + (pCreateInfo->flags & VK_SWAPCHAIN_CREATE_PRESENT_TIMING_BIT_EXT)) { + + if (chain->blit.type != WSI_SWAPCHAIN_NO_BLIT && wsi->get_blit_queue) { chain->blit.queue = wsi->get_blit_queue(_device); } @@ -506,10 +530,18 @@ wsi_swapchain_init(const struct wsi_device *wsi, if (chain->blit.queue != NULL) { queue_family_index = chain->blit.queue->queue_family_index; } else { + uint64_t effective_queues = wsi->queue_supports_blit; + if (pCreateInfo->flags & VK_SWAPCHAIN_CREATE_PRESENT_TIMING_BIT_EXT) + effective_queues &= wsi->queue_supports_timestamps; + + /* Fallback. If this happens we don't advertise support for queue complete times. */ + if (!effective_queues) + effective_queues = wsi->queue_supports_blit; + /* Queues returned by get_blit_queue() might not be listed in * GetPhysicalDeviceQueueFamilyProperties, so this check is skipped for those queues. */ - if (!(wsi->queue_supports_blit & BITFIELD64_BIT(queue_family_index))) + if (!(effective_queues & BITFIELD64_BIT(queue_family_index))) continue; } @@ -537,6 +569,24 @@ wsi_swapchain_init(const struct wsi_device *wsi, goto fail; #endif + if (pCreateInfo->flags & VK_SWAPCHAIN_CREATE_PRESENT_TIMING_BIT_EXT) { + ICD_FROM_HANDLE(VkIcdSurfaceBase, surface, pCreateInfo->surface); + struct wsi_interface *iface = wsi->wsi[surface->platform]; + + VkPresentTimingSurfaceCapabilitiesEXT timing_caps = { + .sType = VK_STRUCTURE_TYPE_PRESENT_TIMING_SURFACE_CAPABILITIES_EXT, + }; + + VkSurfaceCapabilities2KHR caps2 = { + .sType = VK_STRUCTURE_TYPE_SURFACE_CAPABILITIES_2_KHR, + .pNext = &timing_caps, + }; + + iface->get_capabilities2(surface, (struct wsi_device *)wsi, NULL, &caps2); + wsi_surface_capabilities_add_present_stages(wsi, &timing_caps); + chain->present_timing.supported_query_stages = timing_caps.presentStageQueries; + } + return VK_SUCCESS; fail: @@ -619,7 +669,7 @@ wsi_swapchain_finish(struct wsi_swapchain *chain) chain->wsi->DestroySemaphore(chain->device, chain->present_id_timeline, &chain->alloc); - if (chain->blit.type != WSI_SWAPCHAIN_NO_BLIT) { + if (chain->cmd_pools) { int cmd_pools_count = chain->blit.queue != NULL ? 1 : chain->wsi->queue_family_count; for (uint32_t i = 0; i < cmd_pools_count; i++) { @@ -824,6 +874,88 @@ fail: return result; } +/** + * Creates the timestamp-query command buffers for the end of rendering, that + * will be used to report QUEUE_COMPLETE timestamp for EXT_present_timing. + * + * Unless the swapchain is blitting, we don't know what queue family a Present + * will happen on. So we make a timestamp command buffer for each so they're + * ready to go at present time. + */ +VkResult +wsi_image_init_timestamp(const struct wsi_swapchain *chain, + struct wsi_image *image) +{ + const struct wsi_device *wsi = chain->wsi; + VkResult result; + /* Set up command buffer to get timestamp info */ + + result = wsi->CreateQueryPool( + chain->device, + &(const VkQueryPoolCreateInfo){ + .sType = VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO, + .queryType = VK_QUERY_TYPE_TIMESTAMP, + .queryCount = 1, + }, + NULL, + &image->query_pool); + + if (result != VK_SUCCESS) + goto fail; + + uint32_t family_count = chain->blit.queue ? 1 : wsi->queue_family_count; + + if (!image->timestamp_cmd_buffers) { + image->timestamp_cmd_buffers = + vk_zalloc(&chain->alloc, sizeof(VkCommandBuffer) * family_count, 8, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + if (!image->timestamp_cmd_buffers) + return VK_ERROR_OUT_OF_HOST_MEMORY; + } + + for (uint32_t i = 0; i < family_count; i++) { + /* We can only use timestamps on a queue that reports timestamp bits != 0. + * Since we don't consider device timestamp wrapping in this implementation (unclear how that would ever work), + * only report queue done where timestamp bits == 64. */ + if (!chain->cmd_pools[i]) + continue; + + result = wsi->AllocateCommandBuffers( + chain->device, + &(const VkCommandBufferAllocateInfo){ + .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO, + .pNext = NULL, + .commandPool = chain->cmd_pools[i], + .level = VK_COMMAND_BUFFER_LEVEL_PRIMARY, + .commandBufferCount = 1, + }, &image->timestamp_cmd_buffers[i]); + + if (result != VK_SUCCESS) + goto fail; + + wsi->BeginCommandBuffer( + image->timestamp_cmd_buffers[i], + &(VkCommandBufferBeginInfo) { + .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO, + }); + + wsi->CmdResetQueryPool(image->timestamp_cmd_buffers[i], + image->query_pool, + 0, 1); + + wsi->CmdWriteTimestamp(image->timestamp_cmd_buffers[i], + VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, + image->query_pool, + 0); + + wsi->EndCommandBuffer(image->timestamp_cmd_buffers[i]); + } + + return VK_SUCCESS; +fail: + return result; +} + void wsi_destroy_image(const struct wsi_swapchain *chain, struct wsi_image *image) @@ -859,6 +991,19 @@ wsi_destroy_image(const struct wsi_swapchain *chain, vk_free(&chain->alloc, image->blit.cmd_buffers); } + wsi->DestroyQueryPool(chain->device, image->query_pool, NULL); + + if (image->timestamp_cmd_buffers) { + uint32_t family_count = chain->blit.queue ? 1 : wsi->queue_family_count; + for (uint32_t i = 0; i < family_count; i++) { + if (image->timestamp_cmd_buffers[i]) { + wsi->FreeCommandBuffers(chain->device, chain->cmd_pools[i], + 1, &image->timestamp_cmd_buffers[i]); + } + } + vk_free(&chain->alloc, image->timestamp_cmd_buffers); + } + wsi->FreeMemory(chain->device, image->memory, &chain->alloc); wsi->DestroyImage(chain->device, image->image, &chain->alloc); wsi->DestroyImage(chain->device, image->blit.image, &chain->alloc); @@ -910,6 +1055,37 @@ wsi_GetPhysicalDeviceSurfaceCapabilitiesKHR( return result; } +static void +wsi_surface_capabilities_add_present_stages(const struct wsi_device *wsi_device, + VkPresentTimingSurfaceCapabilitiesEXT *present_timing) +{ + if (wsi_device->queue_supports_blit & wsi_device->queue_supports_timestamps) { + /* Make sure the implementation is capable of calibrating timestamps. */ + if (wsi_device->GetPhysicalDeviceCalibrateableTimeDomainsKHR && wsi_device->GetCalibratedTimestampsKHR) { + VkTimeDomainKHR domains[64]; + uint32_t count = ARRAY_SIZE(domains); + wsi_device->GetPhysicalDeviceCalibrateableTimeDomainsKHR(wsi_device->pdevice, &count, domains); + + bool supports_device = false, supports_monotonic = false, supports_monotonic_raw = false; + + for (uint32_t i = 0; i < count; i++) { + if (domains[i] == VK_TIME_DOMAIN_DEVICE_KHR) + supports_device = true; + else if (domains[i] == VK_TIME_DOMAIN_CLOCK_MONOTONIC_KHR) + supports_monotonic = true; + else if (domains[i] == VK_TIME_DOMAIN_CLOCK_MONOTONIC_RAW_KHR) + supports_monotonic_raw = true; + } + + /* Current present timing implementations do not use anything outside these. + * QPC might be relevant for Dozen at some point, but for now, we only consider Linux-centric + * platforms for present timing. */ + if (supports_device && supports_monotonic && supports_monotonic_raw) + present_timing->presentStageQueries |= VK_PRESENT_STAGE_QUEUE_OPERATIONS_END_BIT_EXT; + } + } +} + VKAPI_ATTR VkResult VKAPI_CALL wsi_GetPhysicalDeviceSurfaceCapabilities2KHR( VkPhysicalDevice physicalDevice, @@ -921,8 +1097,26 @@ wsi_GetPhysicalDeviceSurfaceCapabilities2KHR( struct wsi_device *wsi_device = device->wsi_device; struct wsi_interface *iface = wsi_device->wsi[surface->platform]; - return iface->get_capabilities2(surface, wsi_device, pSurfaceInfo->pNext, - pSurfaceCapabilities); + VkResult vr = iface->get_capabilities2(surface, wsi_device, pSurfaceInfo->pNext, + pSurfaceCapabilities); + if (vr != VK_SUCCESS) + return vr; + + VkPresentTimingSurfaceCapabilitiesEXT *present_timing = + vk_find_struct(pSurfaceCapabilities, PRESENT_TIMING_SURFACE_CAPABILITIES_EXT); + + if (present_timing && present_timing->presentTimingSupported) { + wsi_surface_capabilities_add_present_stages(wsi_device, present_timing); + if (!(present_timing->presentStageQueries & VK_PRESENT_STAGE_QUEUE_OPERATIONS_END_BIT_EXT)) { + /* CTS and proposal document assert that QUEUE_OPERATIONS_END must be supported. */ + present_timing->presentTimingSupported = VK_FALSE; + present_timing->presentAtAbsoluteTimeSupported = VK_FALSE; + present_timing->presentAtRelativeTimeSupported = VK_FALSE; + present_timing->presentStageQueries = 0; + } + } + + return vr; } VKAPI_ATTR VkResult VKAPI_CALL @@ -1126,6 +1320,15 @@ wsi_CreateSwapchainKHR(VkDevice _device, swapchain->present_timing.active = true; mtx_init(&swapchain->present_timing.lock, 0); + for (uint32_t i = 0; i < swapchain->image_count; i++) { + struct wsi_image *image = swapchain->get_wsi_image(swapchain, i); + result = wsi_image_init_timestamp(swapchain, image); + if (result != VK_SUCCESS) { + swapchain->destroy(swapchain, alloc); + return result; + } + } + if (swapchain->poll_early_refresh) { /* If we can query the display directly, we should report something reasonable on first query * before we even present the first time. */ @@ -1195,8 +1398,87 @@ wsi_ReleaseSwapchainImagesKHR(VkDevice _device, return VK_SUCCESS; } +static uint64_t +wsi_swapchain_present_convert_device_to_cpu(struct wsi_swapchain *chain, + uint64_t device_timestamp_ns) +{ + if (device_timestamp_ns == 0) + return 0; + + /* This is only relevant if application is requesting that QUEUE_DONE present stage + * is used as target time domain (targetTimeDomainPresentStage). */ + + /* We have already made sure that the implementation supports these. */ + const VkCalibratedTimestampInfoKHR infos[2] = { + { + .sType = VK_STRUCTURE_TYPE_CALIBRATED_TIMESTAMP_INFO_KHR, + .timeDomain = VK_TIME_DOMAIN_DEVICE_KHR, + }, + { + .sType = VK_STRUCTURE_TYPE_CALIBRATED_TIMESTAMP_INFO_KHR, + .timeDomain = chain->present_timing.time_domain, + }, + }; + + uint64_t timestamps[2]; + uint64_t max_deviation; + + /* Ignore target time if this fails for whatever reason. */ + if (chain->wsi->GetCalibratedTimestampsKHR(chain->device, 2, infos, timestamps, &max_deviation) != VK_SUCCESS) + return 0; + + /* PRESENT_STAGE_LOCAL_EXT is in terms of nanoseconds, so we don't scale that. */ + timestamps[0] = (uint64_t)((double)chain->wsi->timestamp_period * (double)timestamps[0]); + int64_t device_delta_ns = (int64_t)device_timestamp_ns - (int64_t)timestamps[0]; + uint64_t target_timestamp = timestamps[1] + device_delta_ns; + return target_timestamp; +} + +static void +wsi_swapchain_present_timing_sample_query_pool(struct wsi_swapchain *chain, + struct wsi_presentation_timing *timing, + struct wsi_image *image, + uint64_t upper_bound) +{ + /* Application can query for stages which are not supported. We need to return 0 here. */ + if (!(timing->requested_feedback & VK_PRESENT_STAGE_QUEUE_OPERATIONS_END_BIT_EXT) || + !(chain->present_timing.supported_query_stages & VK_PRESENT_STAGE_QUEUE_OPERATIONS_END_BIT_EXT)) + return; + + /* The GPU really should be done by now, and we should be able to read the timestamp, + * but it's possible that the present was discarded and we have a 0 timestamp here for the present. + * In this case, we should not block to wait on the queue dispatch timestamp. */ + uint64_t queue_ts; + + if (chain->wsi->GetQueryPoolResults(chain->device, image->query_pool, 0, 1, sizeof(uint64_t), + &queue_ts, sizeof(uint64_t), VK_QUERY_RESULT_64_BIT) == VK_SUCCESS) { + /* This is still reported in nanoseconds. */ + timing->queue_done_time = (uint64_t)((double)queue_ts * (double)chain->wsi->timestamp_period); + } +} + +static void +wsi_swapchain_present_timing_notify_recycle_locked(struct wsi_swapchain *chain, + struct wsi_image *image) +{ + assert(chain->present_timing.active); + + for (size_t i = 0; i < chain->present_timing.timings_count; i++) { + if (chain->present_timing.timings[i].image == image) { + /* A different present takes ownership of the image's query pool index now. */ + chain->present_timing.timings[i].image = NULL; + chain->present_timing.timings[i].queue_done_time = 0; + + /* We waited on progress fence, so the timestamp query is guaranteed to be done. */ + wsi_swapchain_present_timing_sample_query_pool(chain, &chain->present_timing.timings[i], image, 0); + break; + } + } +} + static VkResult wsi_common_allocate_timing_request( - struct wsi_swapchain *swapchain, const VkPresentTimingInfoEXT *timing, uint64_t present_id) + struct wsi_swapchain *swapchain, const VkPresentTimingInfoEXT *timing, + uint64_t present_id, struct wsi_image *image) { VkResult vr = VK_SUCCESS; mtx_lock(&swapchain->present_timing.lock); @@ -1206,6 +1488,8 @@ static VkResult wsi_common_allocate_timing_request( goto err; } + wsi_swapchain_present_timing_notify_recycle_locked(swapchain, image); + struct wsi_presentation_timing *wsi_timing = &swapchain->present_timing.timings[swapchain->present_timing.timings_count++]; @@ -1213,8 +1497,12 @@ static VkResult wsi_common_allocate_timing_request( wsi_timing->serial = ++swapchain->present_timing.serial; wsi_timing->target_time = timing->targetTime; wsi_timing->present_id = present_id; + + /* It is allowed to ask for more stages than is supported, but we need to ignore them later. */ wsi_timing->requested_feedback = timing->presentStageQueries; + wsi_timing->image = image; + /* Ignore the time domain since we have a static domain. */ err: @@ -1225,7 +1513,8 @@ err: void wsi_swapchain_present_timing_notify_completion(struct wsi_swapchain *chain, uint64_t timing_serial, - uint64_t timestamp) + uint64_t timestamp, + struct wsi_image *image) { assert(chain->present_timing.active); mtx_lock(&chain->present_timing.lock); @@ -1234,6 +1523,17 @@ wsi_swapchain_present_timing_notify_completion(struct wsi_swapchain *chain, if (chain->present_timing.timings[i].serial == timing_serial) { chain->present_timing.timings[i].complete_time = timestamp; chain->present_timing.timings[i].complete = VK_TRUE; + + /* It's possible that QueuePresentKHR already handled the queue done timestamp for us, + * since the image was recycled before presentation could fully complete. + * In this case, we no longer own the timestamp query pool index, so just skip. */ + if (chain->present_timing.timings[i].image != image) + break; + + /* 0 means unknown. Application can probably fall back to its own timestamps if it wants to. */ + chain->present_timing.timings[i].queue_done_time = 0; + wsi_swapchain_present_timing_sample_query_pool(chain, &chain->present_timing.timings[i], image, timestamp); + chain->present_timing.timings[i].image = NULL; break; } } @@ -1314,7 +1614,7 @@ wsi_GetPastPresentationTimingEXT( bool stop_timing_removal = false; for (uint32_t i = 0; i < swapchain->present_timing.timings_count; i++) { - const struct wsi_presentation_timing *in_timing = &swapchain->present_timing.timings[i]; + struct wsi_presentation_timing *in_timing = &swapchain->present_timing.timings[i]; if (!swapchain->present_timing.timings[i].complete || stop_timing_removal) { /* Keep output ordered to be compliant without having to re-sort every time. @@ -1325,6 +1625,26 @@ wsi_GetPastPresentationTimingEXT( continue; } + /* In some odd cases, completions can happen out of order, + * and CTS tests that completion times for IMMEDIATE/MAILBOX are monotonic. + * We always retire in-order, so this is fine. */ + if (in_timing->queue_done_time) { + /* CTS questionably checks that timing is strictly increasing in places. + * Bumping times by one nanosecond works around this. + * In practice, the only scenario where this happens is when + * there is calibration jitter or multiple images are retired + * at exact same time due to MAILBOX/IMMEDIATE. */ + in_timing->queue_done_time = MAX2(in_timing->queue_done_time, + swapchain->present_timing.minimum_queue_done_time + 1); + swapchain->present_timing.minimum_queue_done_time = in_timing->queue_done_time; + } + + if (in_timing->complete_time) { + in_timing->complete_time = MAX2(in_timing->complete_time, + swapchain->present_timing.minimum_complete_time + 1); + swapchain->present_timing.minimum_complete_time = in_timing->complete_time; + } + vk_outarray_append_typed(VkPastPresentationTimingEXT, &timings, timing) { timing->targetTime = swapchain->present_timing.timings[i].target_time; timing->presentId = in_timing->present_id; @@ -1350,12 +1670,23 @@ wsi_GetPastPresentationTimingEXT( } } - if (in_timing->requested_feedback & ~VK_PRESENT_STAGE_QUEUE_OPERATIONS_END_BIT_EXT) { - vk_outarray_append_typed(VkPresentStageTimeEXT, &stages, stage) { - stage->stage = in_timing->requested_feedback & ~VK_PRESENT_STAGE_QUEUE_OPERATIONS_END_BIT_EXT; - /* It is expected that implementation will only expose one timing value. */ - assert(util_bitcount(stage->stage) == 1); - stage->time = in_timing->complete_time; + /* CTS expects that we are able to return something for all stages, even if they are not supported. */ + static const VkPresentStageFlagBitsEXT candidate_stages[] = { + VK_PRESENT_STAGE_REQUEST_DEQUEUED_BIT_EXT, + VK_PRESENT_STAGE_IMAGE_FIRST_PIXEL_OUT_BIT_EXT, + VK_PRESENT_STAGE_IMAGE_FIRST_PIXEL_VISIBLE_BIT_EXT, + }; + + for (int stage_index = 0; stage_index < ARRAY_SIZE(candidate_stages); stage_index++) { + bool requested = (in_timing->requested_feedback & candidate_stages[stage_index]) != 0; + bool supported = (swapchain->present_timing.supported_query_stages & candidate_stages[stage_index]) != 0; + + if (requested) { + vk_outarray_append_typed(VkPresentStageTimeEXT, &stages, stage) { + stage->stage = candidate_stages[stage_index]; + /* It is expected that implementation will only expose one timing value. */ + stage->time = supported ? in_timing->complete_time : 0; + } } } } @@ -1811,6 +2142,8 @@ wsi_common_queue_present(const struct wsi_device *wsi, const VkPresentTimingsInfoEXT *present_timings_info = vk_find_struct_const(pPresentInfo->pNext, PRESENT_TIMINGS_INFO_EXT); + bool needs_timing_command_buffer = false; + if (present_timings_info) { /* If we fail a present due to full queue, it's a little unclear from * spec if we should treat it as OUT_OF_DATE or OUT_OF_HOST_MEMORY for @@ -1824,19 +2157,40 @@ wsi_common_queue_present(const struct wsi_device *wsi, assert(swapchain->present_timing.active); + uint32_t image_index = pPresentInfo->pImageIndices[i]; + /* EXT_present_timing is defined to only work with present_id2. * It's only used when reporting back timings. */ results[i] = wsi_common_allocate_timing_request( - swapchain, info, present_ids2 ? present_ids2->pPresentIds[i] : 0); + swapchain, info, present_ids2 ? present_ids2->pPresentIds[i] : 0, + swapchain->get_wsi_image(swapchain, image_index)); /* Application is responsible for allocating sufficient size here. * We fail with VK_ERROR_PRESENT_TIMING_QUEUE_FULL_EXT if application is bugged. */ if (results[i] == VK_SUCCESS) { + /* We may have to rewrite the timestamp if application requests to use timestamps + * in terms of the QUEUE_OPERATIONS_END time domain, which is actually DEVICE. */ + uint64_t target_time = info->targetTime; + + if (info->targetTimeDomainPresentStage == VK_PRESENT_STAGE_QUEUE_OPERATIONS_END_BIT_EXT && + !(info->flags & VK_PRESENT_TIMING_INFO_PRESENT_AT_RELATIVE_TIME_BIT_EXT)) { + /* For relative, it's all nanoseconds anyway, so no need to do anything. */ + target_time = wsi_swapchain_present_convert_device_to_cpu(swapchain, target_time); + } + swapchain->set_timing_request(swapchain, &(struct wsi_image_timing_request) { .serial = swapchain->present_timing.serial, - .time = info->targetTime, + .time = target_time, .flags = info->flags, }); + + if (info->presentStageQueries & swapchain->present_timing.supported_query_stages & + VK_PRESENT_STAGE_QUEUE_OPERATIONS_END_BIT_EXT) { + /* It's not a problem if we redundantly submit timing command buffers. + * VUID-12234 also says all swapchains in this present must have been + * created with present timing enabled. */ + needs_timing_command_buffer = true; + } } } } @@ -1919,15 +2273,15 @@ wsi_common_queue_present(const struct wsi_device *wsi, * the per-image semaphores and fences with the blit. */ { - STACK_ARRAY(VkCommandBufferSubmitInfo, blit_command_buffer_infos, - pPresentInfo->swapchainCount); + STACK_ARRAY(VkCommandBufferSubmitInfo, command_buffer_infos, + pPresentInfo->swapchainCount * 2); STACK_ARRAY(VkSemaphoreSubmitInfo, signal_semaphore_infos, pPresentInfo->swapchainCount * ARRAY_SIZE(image_signal_infos[0].semaphore_infos)); STACK_ARRAY(VkFence, fences, pPresentInfo->swapchainCount * ARRAY_SIZE(image_signal_infos[0].fences)); - uint32_t blit_count = 0, signal_semaphore_count = 0, fence_count = 0; + uint32_t command_buffer_count = 0, signal_semaphore_count = 0, fence_count = 0; for (uint32_t i = 0; i < pPresentInfo->swapchainCount; i++) { VK_FROM_HANDLE(wsi_swapchain, swapchain, pPresentInfo->pSwapchains[i]); @@ -1981,20 +2335,27 @@ wsi_common_queue_present(const struct wsi_device *wsi, } if (swapchain->blit.type != WSI_SWAPCHAIN_NO_BLIT) { - blit_command_buffer_infos[blit_count++] = (VkCommandBufferSubmitInfo) { + command_buffer_infos[command_buffer_count++] = (VkCommandBufferSubmitInfo) { .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_SUBMIT_INFO, .commandBuffer = image->blit.cmd_buffers[queue->queue_family_index], }; } + + if (needs_timing_command_buffer) { + command_buffer_infos[command_buffer_count++] = (VkCommandBufferSubmitInfo) { + .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_SUBMIT_INFO, + .commandBuffer = image->timestamp_cmd_buffers[queue->queue_family_index], + }; + } } const VkSubmitInfo2 submit_info = { .sType = VK_STRUCTURE_TYPE_SUBMIT_INFO_2, .waitSemaphoreInfoCount = pPresentInfo->waitSemaphoreCount, .pWaitSemaphoreInfos = semaphore_wait_infos, - .commandBufferInfoCount = blit_count, - .pCommandBufferInfos = blit_command_buffer_infos, + .commandBufferInfoCount = command_buffer_count, + .pCommandBufferInfos = command_buffer_infos, .signalSemaphoreInfoCount = signal_semaphore_count, .pSignalSemaphoreInfos = signal_semaphore_infos, }; @@ -2010,7 +2371,7 @@ wsi_common_queue_present(const struct wsi_device *wsi, STACK_ARRAY_FINISH(fences); STACK_ARRAY_FINISH(signal_semaphore_infos); - STACK_ARRAY_FINISH(blit_command_buffer_infos); + STACK_ARRAY_FINISH(command_buffer_infos); } /* Now do blits on any blit queues */ @@ -2035,17 +2396,27 @@ wsi_common_queue_present(const struct wsi_device *wsi, .semaphore = swapchain->blit.semaphores[image_index], }; - const VkCommandBufferSubmitInfo blit_command_buffer_info = { + VkCommandBufferSubmitInfo command_buffer_infos[2]; + uint32_t command_buffer_count = 0; + + command_buffer_infos[command_buffer_count++] = (VkCommandBufferSubmitInfo) { .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_SUBMIT_INFO, .commandBuffer = image->blit.cmd_buffers[0], }; + if (needs_timing_command_buffer) { + command_buffer_infos[command_buffer_count++] = (VkCommandBufferSubmitInfo) { + .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_SUBMIT_INFO, + .commandBuffer = image->timestamp_cmd_buffers[0], + }; + } + const VkSubmitInfo2 submit_info = { .sType = VK_STRUCTURE_TYPE_SUBMIT_INFO_2, .waitSemaphoreInfoCount = 1, .pWaitSemaphoreInfos = &blit_semaphore_info, - .commandBufferInfoCount = 1, - .pCommandBufferInfos = &blit_command_buffer_info, + .commandBufferInfoCount = command_buffer_count, + .pCommandBufferInfos = command_buffer_infos, .signalSemaphoreInfoCount = image_signal_infos[i].semaphore_count, .pSignalSemaphoreInfos = image_signal_infos[i].semaphore_infos, }; diff --git a/src/vulkan/wsi/wsi_common.h b/src/vulkan/wsi/wsi_common.h index 2692497122b..892345d6344 100644 --- a/src/vulkan/wsi/wsi_common.h +++ b/src/vulkan/wsi/wsi_common.h @@ -62,6 +62,8 @@ struct wsi_device { VkPhysicalDeviceMemoryProperties memory_props; uint32_t queue_family_count; uint64_t queue_supports_blit; + uint64_t queue_supports_timestamps; + float timestamp_period; VkPhysicalDeviceDrmPropertiesEXT drm_info; VkPhysicalDevicePCIBusInfoPropertiesEXT pci_bus_info; @@ -201,28 +203,37 @@ struct wsi_device { WSI_CB(CmdPipelineBarrier); WSI_CB(CmdCopyImage); WSI_CB(CmdCopyImageToBuffer); + WSI_CB(CmdResetQueryPool); + WSI_CB(CmdWriteTimestamp); WSI_CB(CreateBuffer); WSI_CB(CreateCommandPool); WSI_CB(CreateFence); WSI_CB(CreateImage); + WSI_CB(CreateQueryPool); WSI_CB(CreateSemaphore); WSI_CB(DestroyBuffer); WSI_CB(DestroyCommandPool); WSI_CB(DestroyFence); WSI_CB(DestroyImage); + WSI_CB(DestroyQueryPool); WSI_CB(DestroySemaphore); WSI_CB(EndCommandBuffer); WSI_CB(FreeMemory); WSI_CB(FreeCommandBuffers); WSI_CB(GetBufferMemoryRequirements); + WSI_CB(GetCalibratedTimestampsKHR); WSI_CB(GetFenceStatus); WSI_CB(GetImageDrmFormatModifierPropertiesEXT); WSI_CB(GetImageMemoryRequirements); WSI_CB(GetImageSubresourceLayout); WSI_CB(GetMemoryFdKHR); + WSI_CB(GetPhysicalDeviceCalibrateableTimeDomainsKHR); + WSI_CB(GetPhysicalDeviceProperties); WSI_CB(GetPhysicalDeviceFormatProperties); WSI_CB(GetPhysicalDeviceFormatProperties2); WSI_CB(GetPhysicalDeviceImageFormatProperties2); + WSI_CB(GetPhysicalDeviceQueueFamilyProperties); + WSI_CB(GetQueryPoolResults); WSI_CB(GetSemaphoreFdKHR); WSI_CB(ResetFences); WSI_CB(QueueSubmit2); diff --git a/src/vulkan/wsi/wsi_common_private.h b/src/vulkan/wsi/wsi_common_private.h index 513b78ce5a0..a65e7c534b6 100644 --- a/src/vulkan/wsi/wsi_common_private.h +++ b/src/vulkan/wsi/wsi_common_private.h @@ -188,6 +188,9 @@ struct wsi_image { int dma_buf_fd; #endif void *cpu_map; + + VkQueryPool query_pool; + VkCommandBuffer *timestamp_cmd_buffers; }; struct wsi_presentation_timing { @@ -196,6 +199,10 @@ struct wsi_presentation_timing { uint64_t serial; uint64_t queue_done_time; /* GPU timestamp based. */ uint64_t complete_time; /* Best effort timestamp we get from backend. */ + /* If we're rendering with IMMEDIATE, it's possible for images to IDLE long before they complete. + * In this case, we have to ensure that queue_done_time is sampled at QueuePresentKHR time + * before we recycle an image. */ + struct wsi_image *image; VkPresentStageFlagsEXT requested_feedback; VkBool32 complete; }; @@ -272,6 +279,11 @@ struct wsi_swapchain { uint64_t refresh_counter; VkTimeDomainKHR time_domain; + + VkPresentStageFlagsEXT supported_query_stages; + /* Ensures monotonicity for complete_time. */ + uint64_t minimum_queue_done_time; + uint64_t minimum_complete_time; } present_timing; bool capture_key_pressed; @@ -410,6 +422,10 @@ wsi_create_image(const struct wsi_swapchain *chain, void wsi_image_init(struct wsi_image *image); +VkResult +wsi_image_init_timestamp(const struct wsi_swapchain *chain, + struct wsi_image *image); + void wsi_destroy_image(const struct wsi_swapchain *chain, struct wsi_image *image); @@ -420,7 +436,8 @@ wsi_swapchain_wait_for_present_semaphore(const struct wsi_swapchain *chain, void wsi_swapchain_present_timing_notify_completion(struct wsi_swapchain *chain, - uint64_t timing_serial, uint64_t timestamp); + uint64_t timing_serial, uint64_t timestamp, + struct wsi_image *image); void wsi_swapchain_present_timing_update_refresh_rate(struct wsi_swapchain *chain, diff --git a/src/vulkan/wsi/wsi_common_wayland.c b/src/vulkan/wsi/wsi_common_wayland.c index 29149fbf4a1..85a82c91f54 100644 --- a/src/vulkan/wsi/wsi_common_wayland.c +++ b/src/vulkan/wsi/wsi_common_wayland.c @@ -1670,7 +1670,15 @@ wsi_GetPhysicalDeviceWaylandPresentationSupportKHR(VkPhysicalDevice physicalDevi struct wsi_wayland *wsi = (struct wsi_wayland *)wsi_device->wsi[VK_ICD_WSI_PLATFORM_WAYLAND]; - if (!(wsi_device->queue_supports_blit & BITFIELD64_BIT(queueFamilyIndex))) + /* These should overlap. */ + uint64_t effective_queues = wsi_device->queue_supports_blit & wsi_device->queue_supports_timestamps; + + /* If there are no queues that support both blits and timestamps, + * don't report support for queue timestamps. */ + if (!effective_queues) + effective_queues = wsi_device->queue_supports_blit; + + if (!(effective_queues & BITFIELD64_BIT(queueFamilyIndex))) return false; struct wsi_wl_display display; @@ -2469,6 +2477,7 @@ struct wsi_wl_present_id { uint64_t target_time; uint64_t correction; struct wl_list link; + struct wsi_image *img; bool user_target_time; }; @@ -2903,7 +2912,7 @@ presentation_handle_presented(void *data, /* Notify this before present wait to reduce latency of presentation timing requests * if the application is driving its queries based off present waits. */ if (id->timing_serial) - wsi_swapchain_present_timing_notify_completion(&chain->base, id->timing_serial, presentation_time); + wsi_swapchain_present_timing_notify_completion(&chain->base, id->timing_serial, presentation_time, id->img); mtx_lock(&chain->present_ids.lock); chain->present_ids.refresh_nsec = refresh; @@ -2940,7 +2949,7 @@ presentation_handle_discarded(void *data) * applications may start to latch onto that timestamp as ground truth, which * is obviously not correct. */ if (id->timing_serial) - wsi_swapchain_present_timing_notify_completion(&chain->base, id->timing_serial, 0); + wsi_swapchain_present_timing_notify_completion(&chain->base, id->timing_serial, 0, id->img); mtx_lock(&chain->present_ids.lock); if (!chain->present_ids.valid_refresh_nsec) { @@ -3217,6 +3226,7 @@ wsi_wl_swapchain_queue_present(struct wsi_swapchain *wsi_chain, id->present_id = present_id; id->alloc = chain->wsi_wl_surface->display->wsi_wl->alloc; id->timing_serial = chain->timing_request.serial; + id->img = &chain->images[image_index].base; id->user_target_time = chain->timing_request.time != 0; mtx_lock(&chain->present_ids.lock);