diff --git a/docs/features.txt b/docs/features.txt
index 5e907be8ff9..e9d52ffb567 100644
--- a/docs/features.txt
+++ b/docs/features.txt
@@ -658,6 +658,7 @@ Khronos extensions that are not part of any Vulkan version:
   VK_EXT_physical_device_drm DONE (anv, hasvk, hk, nvk, panvk, pvr, radv, tu, v3dv, vn)
   VK_EXT_pipeline_library_group_handles DONE (anv, lvp, radv, vn)
   VK_EXT_post_depth_coverage DONE (anv/gfx11+, lvp, nvk, radv/gfx10+, tu, vn)
+  VK_EXT_present_timing DONE (anv, hk, nvk, radv, tu)
   VK_EXT_primitive_topology_list_restart DONE (anv, hasvk, lvp, nvk, panvk, radv, tu, v3dv, vn)
   VK_EXT_primitives_generated_query DONE (anv, hasvk, lvp, nvk, radv, tu, vn)
   VK_EXT_provoking_vertex DONE (anv, hasvk, hk, lvp, nvk, panvk, pvr, radv, tu, v3dv, vn)
diff --git a/docs/relnotes/new_features.txt b/docs/relnotes/new_features.txt
index efff05ca884..0adf258b751 100644
--- a/docs/relnotes/new_features.txt
+++ b/docs/relnotes/new_features.txt
@@ -20,3 +20,4 @@ VK_KHR_surface_maintenance1 promotion everywhere EXT is exposed
 VK_KHR_swapchain_maintenance1 promotion everywhere EXT is exposed
 VK_KHR_dynamic_rendering on PowerVR
 VK_EXT_multisampled_render_to_single_sampled on panvk
+VK_EXT_present_timing on RADV, NVK, Turnip, ANV, Honeykrisp
diff --git a/src/amd/vulkan/radv_physical_device.c b/src/amd/vulkan/radv_physical_device.c
index 91473b82817..0f6700ff6ef 100644
--- a/src/amd/vulkan/radv_physical_device.c
+++ b/src/amd/vulkan/radv_physical_device.c
@@ -791,6 +791,10 @@ radv_physical_device_get_supported_extensions(const struct radv_physical_device
    .EXT_pipeline_library_group_handles = radv_enable_rt(pdev),
    .EXT_pipeline_robustness = !pdev->use_llvm,
    .EXT_post_depth_coverage = pdev->info.gfx_level >= GFX10,
+#ifdef RADV_USE_WSI_PLATFORM
+   /* KHR_calibrated_timestamps is a requirement to expose EXT_present_timing. */
+   .EXT_present_timing = radv_calibrated_timestamps_enabled(pdev),
+#endif
    .EXT_primitive_topology_list_restart = true,
    .EXT_primitives_generated_query = true,
    .EXT_private_data = true,
@@ -1481,6 +1485,14 @@ radv_physical_device_get_features(const struct radv_physical_device *pdev, struc
    /* VK_EXT_custom_resolve */
    .customResolve = true,
+
+#ifdef RADV_USE_WSI_PLATFORM
+   /* VK_EXT_present_timing */
+   /* The actual query is deferred to surface time.
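+    * A minimal sketch of that surface-time check from the application's
+    * side (pdev and surface_info stand in for the app's physical device
+    * and VkPhysicalDeviceSurfaceInfo2KHR; everything else is core WSI):
+    *
+    *   VkPresentTimingSurfaceCapabilitiesEXT timing_caps = {
+    *      .sType = VK_STRUCTURE_TYPE_PRESENT_TIMING_SURFACE_CAPABILITIES_EXT,
+    *   };
+    *   VkSurfaceCapabilities2KHR caps = {
+    *      .sType = VK_STRUCTURE_TYPE_SURFACE_CAPABILITIES_2_KHR,
+    *      .pNext = &timing_caps,
+    *   };
+    *   vkGetPhysicalDeviceSurfaceCapabilities2KHR(pdev, &surface_info, &caps);
+    *
+    * timing_caps.presentTimingSupported then reflects what the WSI backend
+    * for that particular surface can actually deliver.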
*/ + .presentTiming = true, + .presentAtAbsoluteTime = true, + .presentAtRelativeTime = true, +#endif }; } diff --git a/src/asahi/vulkan/hk_physical_device.c b/src/asahi/vulkan/hk_physical_device.c index c49fac86e21..7fad0103906 100644 --- a/src/asahi/vulkan/hk_physical_device.c +++ b/src/asahi/vulkan/hk_physical_device.c @@ -181,6 +181,9 @@ hk_get_device_extensions(const struct hk_instance *instance, .EXT_pipeline_protected_access = true, .EXT_pipeline_robustness = true, .EXT_physical_device_drm = true, +#ifdef HK_USE_WSI_PLATFORM + .EXT_present_timing = true, +#endif .EXT_primitive_topology_list_restart = true, .EXT_private_data = true, .EXT_primitives_generated_query = false, @@ -623,6 +626,13 @@ hk_get_device_features( /* VK_KHR_shader_relaxed_extended_instruction */ .shaderRelaxedExtendedInstruction = true, + +#ifdef HK_USE_WSI_PLATFORM + /* VK_EXT_present_timing */ + .presentTiming = true, + .presentAtRelativeTime = true, + .presentAtAbsoluteTime = true, +#endif }; } diff --git a/src/egl/drivers/dri2/platform_x11.c b/src/egl/drivers/dri2/platform_x11.c index 25d6ed20754..9dd6a956313 100644 --- a/src/egl/drivers/dri2/platform_x11.c +++ b/src/egl/drivers/dri2/platform_x11.c @@ -792,17 +792,6 @@ dri2_fourcc_for_depth(struct dri2_egl_display *dri2_dpy, uint32_t depth) } } -static int -box_intersection_area(int16_t a_x, int16_t a_y, int16_t a_width, - int16_t a_height, int16_t b_x, int16_t b_y, - int16_t b_width, int16_t b_height) -{ - int w = MIN2(a_x + a_width, b_x + b_width) - MAX2(a_x, b_x); - int h = MIN2(a_y + a_height, b_y + b_height) - MAX2(a_y, b_y); - - return (w < 0 || h < 0) ? 0 : w * h; -} - EGLBoolean dri2_x11_get_msc_rate(_EGLDisplay *display, _EGLSurface *surface, EGLint *numerator, EGLint *denominator) diff --git a/src/freedreno/vulkan/tu_device.cc b/src/freedreno/vulkan/tu_device.cc index dceb5227116..af341c83a60 100644 --- a/src/freedreno/vulkan/tu_device.cc +++ b/src/freedreno/vulkan/tu_device.cc @@ -314,6 +314,9 @@ get_device_extensions(const struct tu_physical_device *device, .EXT_physical_device_drm = !is_kgsl(device->instance), .EXT_pipeline_creation_cache_control = true, .EXT_pipeline_creation_feedback = true, +#ifdef TU_USE_WSI_PLATFORM + .EXT_present_timing = device->info->props.has_persistent_counter, +#endif .EXT_primitive_topology_list_restart = true, .EXT_primitives_generated_query = true, .EXT_private_data = true, @@ -825,6 +828,13 @@ tu_get_features(struct tu_physical_device *pdevice, /* VK_EXT_custom_resolve */ features->customResolve = true; + +#ifdef TU_USE_WSI_PLATFORM + /* VK_EXT_present_timing */ + features->presentTiming = true; + features->presentAtRelativeTime = true; + features->presentAtAbsoluteTime = true; +#endif } static void diff --git a/src/intel/vulkan/anv_physical_device.c b/src/intel/vulkan/anv_physical_device.c index 9c25067e95c..9a89d23d4bd 100644 --- a/src/intel/vulkan/anv_physical_device.c +++ b/src/intel/vulkan/anv_physical_device.c @@ -354,6 +354,9 @@ get_device_extensions(const struct anv_physical_device *device, .EXT_pipeline_protected_access = device->has_protected_contexts, .EXT_pipeline_robustness = true, .EXT_post_depth_coverage = true, +#ifdef ANV_USE_WSI_PLATFORM + .EXT_present_timing = device->has_reg_timestamp, +#endif .EXT_primitive_topology_list_restart = true, .EXT_primitives_generated_query = true, .EXT_private_data = true, @@ -1005,6 +1008,13 @@ get_features(const struct anv_physical_device *pdevice, /* VK_KHR_pipeline_binary */ .pipelineBinaries = true, + +#ifdef ANV_USE_WSI_PLATFORM + /* VK_EXT_present_timing */ + 
.presentTiming = true, + .presentAtRelativeTime = true, + .presentAtAbsoluteTime = true, +#endif }; /* The new DOOM and Wolfenstein games require depthBounds without diff --git a/src/loader/loader_dri_helper.h b/src/loader/loader_dri_helper.h index 169e36b5d80..5837dde16cd 100644 --- a/src/loader/loader_dri_helper.h +++ b/src/loader/loader_dri_helper.h @@ -29,36 +29,7 @@ #include #include "util/format/u_formats.h" -#ifdef HAVE_X11_PLATFORM -#include -#include -#include - -struct loader_crtc_info { - xcb_randr_crtc_t id; - xcb_timestamp_t timestamp; - - int16_t x, y; - uint16_t width, height; - - unsigned refresh_numerator; - unsigned refresh_denominator; -}; - -struct loader_screen_resources { - mtx_t mtx; - - xcb_connection_t *conn; - xcb_screen_t *screen; - - xcb_timestamp_t config_timestamp; - - /* Number of CRTCs with an active mode set */ - unsigned num_crtcs; - struct loader_crtc_info *crtcs; -}; -#endif - +#include "loader_dri_helper_screen.h" /** * These formats are endian independent they result in the same layout @@ -110,16 +81,4 @@ loader_pipe_format_to_fourcc(enum pipe_format pipe); enum pipe_format loader_fourcc_to_pipe_format(uint32_t fourcc); -#ifdef HAVE_X11_PLATFORM -void -loader_init_screen_resources(struct loader_screen_resources *res, - xcb_connection_t *conn, - xcb_screen_t *screen); -bool -loader_update_screen_resources(struct loader_screen_resources *res); - -void -loader_destroy_screen_resources(struct loader_screen_resources *res); -#endif - #endif /* LOADER_DRI_HELPER_H */ diff --git a/src/loader/loader_dri_helper_screen.h b/src/loader/loader_dri_helper_screen.h new file mode 100644 index 00000000000..290e8dd111c --- /dev/null +++ b/src/loader/loader_dri_helper_screen.h @@ -0,0 +1,76 @@ +/* + * Permission to use, copy, modify, distribute, and sell this software and its + * documentation for any purpose is hereby granted without fee, provided that + * the above copyright notice appear in all copies and that both that copyright + * notice and this permission notice appear in supporting documentation, and + * that the name of the copyright holders not be used in advertising or + * publicity pertaining to distribution of the software without specific, + * written prior permission. The copyright holders make no representations + * about the suitability of this software for any purpose. It is provided "as + * is" without express or implied warranty. + * + * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, + * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO + * EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY SPECIAL, INDIRECT OR + * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, + * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER + * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THIS SOFTWARE. 
+ */ + +#ifndef LOADER_DRI_HELPER_SCREEN_H +#define LOADER_DRI_HELPER_SCREEN_H + +#ifdef HAVE_X11_PLATFORM +#include +#include +#include + +struct loader_crtc_info { + xcb_randr_crtc_t id; + xcb_timestamp_t timestamp; + + int16_t x, y; + uint16_t width, height; + + unsigned refresh_numerator; + unsigned refresh_denominator; +}; + +struct loader_screen_resources { + mtx_t mtx; + + xcb_connection_t *conn; + xcb_screen_t *screen; + + xcb_timestamp_t config_timestamp; + + /* Number of CRTCs with an active mode set */ + unsigned num_crtcs; + struct loader_crtc_info *crtcs; +}; + +void +loader_init_screen_resources(struct loader_screen_resources *res, + xcb_connection_t *conn, + xcb_screen_t *screen); +bool +loader_update_screen_resources(struct loader_screen_resources *res); + +void +loader_destroy_screen_resources(struct loader_screen_resources *res); + +#endif + +static inline int +box_intersection_area(int16_t a_x, int16_t a_y, int16_t a_width, + int16_t a_height, int16_t b_x, int16_t b_y, + int16_t b_width, int16_t b_height) +{ + int w = MIN2(a_x + a_width, b_x + b_width) - MAX2(a_x, b_x); + int h = MIN2(a_y + a_height, b_y + b_height) - MAX2(a_y, b_y); + + return (w < 0 || h < 0) ? 0 : w * h; +} + +#endif diff --git a/src/meson.build b/src/meson.build index f27dae33631..1568cfa656e 100644 --- a/src/meson.build +++ b/src/meson.build @@ -49,7 +49,7 @@ endif if with_platform_x11 subdir('x11') endif -if with_gallium_or_lvp or with_gbm or with_platform_wayland +if with_gallium_or_lvp or with_gbm or with_platform_wayland or with_platform_x11 or with_platform_xcb subdir('loader') endif subdir('compiler') diff --git a/src/nouveau/vulkan/nvk_physical_device.c b/src/nouveau/vulkan/nvk_physical_device.c index 663c1a804e4..81482a90b90 100644 --- a/src/nouveau/vulkan/nvk_physical_device.c +++ b/src/nouveau/vulkan/nvk_physical_device.c @@ -262,6 +262,9 @@ nvk_get_device_extensions(const struct nvk_instance *instance, .EXT_pipeline_robustness = true, .EXT_physical_device_drm = true, .EXT_post_depth_coverage = info->cls_eng3d >= MAXWELL_B, +#ifdef NVK_USE_WSI_PLATFORM + .EXT_present_timing = true, +#endif .EXT_primitive_topology_list_restart = true, .EXT_private_data = true, .EXT_primitives_generated_query = true, @@ -753,6 +756,11 @@ nvk_get_device_features(const struct nv_device_info *info, /* VK_KHR_present_wait2 */ .presentWait2 = true, + + /* VK_EXT_present_timing */ + .presentTiming = true, + .presentAtRelativeTime = true, + .presentAtAbsoluteTime = true, #endif }; } diff --git a/src/vulkan/wsi/meson.build b/src/vulkan/wsi/meson.build index 38a83d08eab..4fc2a9205f1 100644 --- a/src/vulkan/wsi/meson.build +++ b/src/vulkan/wsi/meson.build @@ -26,6 +26,10 @@ if with_platform_wayland files_vulkan_wsi += wp_files['color-management-v1'] endif +if with_platform_x11 or with_platform_xcb + links_vulkan_wsi += libloader +endif + if with_platform_windows files_vulkan_wsi += files('wsi_common_win32.cpp') platform_deps += dep_dxheaders diff --git a/src/vulkan/wsi/wsi_common.c b/src/vulkan/wsi/wsi_common.c index b861bb75ed7..0188e6fda64 100644 --- a/src/vulkan/wsi/wsi_common.c +++ b/src/vulkan/wsi/wsi_common.c @@ -95,6 +95,7 @@ wsi_device_init(struct wsi_device *wsi, WSI_GET_CB(GetPhysicalDeviceProperties2); WSI_GET_CB(GetPhysicalDeviceMemoryProperties); WSI_GET_CB(GetPhysicalDeviceQueueFamilyProperties); + WSI_GET_CB(GetPhysicalDeviceProperties); #undef WSI_GET_CB wsi->drm_info.sType = @@ -121,10 +122,18 @@ wsi_device_init(struct wsi_device *wsi, VkQueueFamilyProperties queue_properties[64]; 
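+   /* The property and queue-family scan below caches everything the
+    * present-timing path needs later: timestampPeriod (device ticks ->
+    * nanoseconds) and a mask of queue families whose timestamps never
+    * wrap.  A 48-bit counter ticking at 1 GHz wraps after roughly 3.3
+    * days, so requiring timestampValidBits == 64 sidesteps wrap handling
+    * entirely. */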
GetPhysicalDeviceQueueFamilyProperties(pdevice, &wsi->queue_family_count, queue_properties); + VkPhysicalDeviceProperties properties; + GetPhysicalDeviceProperties(pdevice, &properties); + wsi->timestamp_period = properties.limits.timestampPeriod; + for (unsigned i = 0; i < wsi->queue_family_count; i++) { VkFlags req_flags = VK_QUEUE_GRAPHICS_BIT | VK_QUEUE_COMPUTE_BIT | VK_QUEUE_TRANSFER_BIT; if (queue_properties[i].queueFlags & req_flags) wsi->queue_supports_blit |= BITFIELD64_BIT(i); + + /* Don't want to consider timestamp wrapping logic. */ + if (queue_properties[i].timestampValidBits == 64) + wsi->queue_supports_timestamps |= BITFIELD64_BIT(i); } for (VkExternalSemaphoreHandleTypeFlags handle_type = 1; @@ -180,15 +189,19 @@ wsi_device_init(struct wsi_device *wsi, WSI_GET_CB(CmdPipelineBarrier); WSI_GET_CB(CmdCopyImage); WSI_GET_CB(CmdCopyImageToBuffer); + WSI_GET_CB(CmdResetQueryPool); + WSI_GET_CB(CmdWriteTimestamp); WSI_GET_CB(CreateBuffer); WSI_GET_CB(CreateCommandPool); WSI_GET_CB(CreateFence); WSI_GET_CB(CreateImage); + WSI_GET_CB(CreateQueryPool); WSI_GET_CB(CreateSemaphore); WSI_GET_CB(DestroyBuffer); WSI_GET_CB(DestroyCommandPool); WSI_GET_CB(DestroyFence); WSI_GET_CB(DestroyImage); + WSI_GET_CB(DestroyQueryPool); WSI_GET_CB(DestroySemaphore); WSI_GET_CB(EndCommandBuffer); WSI_GET_CB(FreeMemory); @@ -200,9 +213,14 @@ wsi_device_init(struct wsi_device *wsi, WSI_GET_CB(GetImageSubresourceLayout); if (!wsi->sw) WSI_GET_CB(GetMemoryFdKHR); + WSI_GET_CB(GetPhysicalDeviceCalibrateableTimeDomainsKHR); + WSI_GET_CB(GetPhysicalDeviceProperties); WSI_GET_CB(GetPhysicalDeviceFormatProperties); WSI_GET_CB(GetPhysicalDeviceFormatProperties2); WSI_GET_CB(GetPhysicalDeviceImageFormatProperties2); + WSI_GET_CB(GetPhysicalDeviceQueueFamilyProperties); + WSI_GET_CB(GetCalibratedTimestampsKHR); + WSI_GET_CB(GetQueryPoolResults); WSI_GET_CB(GetSemaphoreFdKHR); WSI_GET_CB(ResetFences); WSI_GET_CB(QueueSubmit2); @@ -481,8 +499,10 @@ wsi_swapchain_init(const struct wsi_device *wsi, chain->blit.type = get_blit_type(wsi, image_params, _device); chain->blit.queue = NULL; - if (chain->blit.type != WSI_SWAPCHAIN_NO_BLIT) { - if (wsi->get_blit_queue) { + if (chain->blit.type != WSI_SWAPCHAIN_NO_BLIT || + (pCreateInfo->flags & VK_SWAPCHAIN_CREATE_PRESENT_TIMING_BIT_EXT)) { + + if (chain->blit.type != WSI_SWAPCHAIN_NO_BLIT && wsi->get_blit_queue) { chain->blit.queue = wsi->get_blit_queue(_device); } @@ -503,10 +523,18 @@ wsi_swapchain_init(const struct wsi_device *wsi, if (chain->blit.queue != NULL) { queue_family_index = chain->blit.queue->queue_family_index; } else { + uint64_t effective_queues = wsi->queue_supports_blit; + if (pCreateInfo->flags & VK_SWAPCHAIN_CREATE_PRESENT_TIMING_BIT_EXT) + effective_queues &= wsi->queue_supports_timestamps; + + /* Fallback. If this happens we don't advertise support for queue complete times. */ + if (!effective_queues) + effective_queues = wsi->queue_supports_blit; + /* Queues returned by get_blit_queue() might not be listed in * GetPhysicalDeviceQueueFamilyProperties, so this check is skipped for those queues. */ - if (!(wsi->queue_supports_blit & BITFIELD64_BIT(queue_family_index))) + if (!(effective_queues & BITFIELD64_BIT(queue_family_index))) continue; } @@ -616,7 +644,7 @@ wsi_swapchain_finish(struct wsi_swapchain *chain) chain->wsi->DestroySemaphore(chain->device, chain->present_id_timeline, &chain->alloc); - if (chain->blit.type != WSI_SWAPCHAIN_NO_BLIT) { + if (chain->cmd_pools) { int cmd_pools_count = chain->blit.queue != NULL ? 
1 : chain->wsi->queue_family_count; for (uint32_t i = 0; i < cmd_pools_count; i++) { @@ -628,6 +656,12 @@ wsi_swapchain_finish(struct wsi_swapchain *chain) vk_free(&chain->alloc, chain->cmd_pools); } + if (chain->present_timing.active) { + mtx_destroy(&chain->present_timing.lock); + if (chain->present_timing.timings) + vk_free(&chain->alloc, chain->present_timing.timings); + } + vk_object_base_finish(&chain->base); } @@ -815,6 +849,88 @@ fail: return result; } +/** + * Creates the timestamp-query command buffers for the end of rendering, that + * will be used to report QUEUE_COMPLETE timestamp for EXT_present_timing. + * + * Unless the swapchain is blitting, we don't know what queue family a Present + * will happen on. So we make a timestamp command buffer for each so they're + * ready to go at present time. + */ +VkResult +wsi_image_init_timestamp(const struct wsi_swapchain *chain, + struct wsi_image *image) +{ + const struct wsi_device *wsi = chain->wsi; + VkResult result; + /* Set up command buffer to get timestamp info */ + + result = wsi->CreateQueryPool( + chain->device, + &(const VkQueryPoolCreateInfo){ + .sType = VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO, + .queryType = VK_QUERY_TYPE_TIMESTAMP, + .queryCount = 1, + }, + NULL, + &image->query_pool); + + if (result != VK_SUCCESS) + goto fail; + + uint32_t family_count = chain->blit.queue ? 1 : wsi->queue_family_count; + + if (!image->timestamp_cmd_buffers) { + image->timestamp_cmd_buffers = + vk_zalloc(&chain->alloc, sizeof(VkCommandBuffer) * family_count, 8, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + if (!image->timestamp_cmd_buffers) + return VK_ERROR_OUT_OF_HOST_MEMORY; + } + + for (uint32_t i = 0; i < family_count; i++) { + /* We can only use timestamps on a queue that reports timestamp bits != 0. + * Since we don't consider device timestamp wrapping in this implementation (unclear how that would ever work), + * only report queue done where timestamp bits == 64. */ + if (!chain->cmd_pools[i]) + continue; + + result = wsi->AllocateCommandBuffers( + chain->device, + &(const VkCommandBufferAllocateInfo){ + .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO, + .pNext = NULL, + .commandPool = chain->cmd_pools[i], + .level = VK_COMMAND_BUFFER_LEVEL_PRIMARY, + .commandBufferCount = 1, + }, &image->timestamp_cmd_buffers[i]); + + if (result != VK_SUCCESS) + goto fail; + + wsi->BeginCommandBuffer( + image->timestamp_cmd_buffers[i], + &(VkCommandBufferBeginInfo) { + .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO, + }); + + wsi->CmdResetQueryPool(image->timestamp_cmd_buffers[i], + image->query_pool, + 0, 1); + + wsi->CmdWriteTimestamp(image->timestamp_cmd_buffers[i], + VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, + image->query_pool, + 0); + + wsi->EndCommandBuffer(image->timestamp_cmd_buffers[i]); + } + + return VK_SUCCESS; +fail: + return result; +} + void wsi_destroy_image(const struct wsi_swapchain *chain, struct wsi_image *image) @@ -850,6 +966,19 @@ wsi_destroy_image(const struct wsi_swapchain *chain, vk_free(&chain->alloc, image->blit.cmd_buffers); } + wsi->DestroyQueryPool(chain->device, image->query_pool, NULL); + + if (image->timestamp_cmd_buffers) { + uint32_t family_count = chain->blit.queue ? 
1 : wsi->queue_family_count; + for (uint32_t i = 0; i < family_count; i++) { + if (image->timestamp_cmd_buffers[i]) { + wsi->FreeCommandBuffers(chain->device, chain->cmd_pools[i], + 1, &image->timestamp_cmd_buffers[i]); + } + } + vk_free(&chain->alloc, image->timestamp_cmd_buffers); + } + wsi->FreeMemory(chain->device, image->memory, &chain->alloc); wsi->DestroyImage(chain->device, image->image, &chain->alloc); wsi->DestroyImage(chain->device, image->blit.image, &chain->alloc); @@ -912,8 +1041,43 @@ wsi_GetPhysicalDeviceSurfaceCapabilities2KHR( struct wsi_device *wsi_device = device->wsi_device; struct wsi_interface *iface = wsi_device->wsi[surface->platform]; - return iface->get_capabilities2(surface, wsi_device, pSurfaceInfo->pNext, - pSurfaceCapabilities); + VkResult vr = iface->get_capabilities2(surface, wsi_device, pSurfaceInfo->pNext, + pSurfaceCapabilities); + if (vr != VK_SUCCESS) + return vr; + + struct VkPresentTimingSurfaceCapabilitiesEXT *present_timing = + vk_find_struct(pSurfaceCapabilities, PRESENT_TIMING_SURFACE_CAPABILITIES_EXT); + + if (present_timing && present_timing->presentTimingSupported) { + if (wsi_device->queue_supports_blit & wsi_device->queue_supports_timestamps) { + /* Make sure the implementation is capable of calibrating timestamps. */ + if (wsi_device->GetPhysicalDeviceCalibrateableTimeDomainsKHR && wsi_device->GetCalibratedTimestampsKHR) { + VkTimeDomainKHR domains[64]; + uint32_t count = ARRAY_SIZE(domains); + wsi_device->GetPhysicalDeviceCalibrateableTimeDomainsKHR(wsi_device->pdevice, &count, domains); + + bool supports_device = false, supports_monotonic = false, supports_monotonic_raw = false; + + for (uint32_t i = 0; i < count; i++) { + if (domains[i] == VK_TIME_DOMAIN_DEVICE_KHR) + supports_device = true; + else if (domains[i] == VK_TIME_DOMAIN_CLOCK_MONOTONIC_KHR) + supports_monotonic = true; + else if (domains[i] == VK_TIME_DOMAIN_CLOCK_MONOTONIC_RAW_KHR) + supports_monotonic_raw = true; + } + + /* Current present timing implementations do not use anything outside these. + * QPC might be relevant for Dozen at some point, but for now, we only consider Linux-centric + * platforms for present timing. */ + if (supports_device && supports_monotonic && supports_monotonic_raw) + present_timing->presentStageQueries |= VK_PRESENT_STAGE_QUEUE_OPERATIONS_END_BIT_EXT; + } + } + } + + return vr; } VKAPI_ATTR VkResult VKAPI_CALL @@ -1112,6 +1276,32 @@ wsi_CreateSwapchainKHR(VkDevice _device, *pSwapchain = wsi_swapchain_to_handle(swapchain); + if (pCreateInfo->flags & VK_SWAPCHAIN_CREATE_PRESENT_TIMING_BIT_EXT) { + swapchain->present_timing.active = true; + mtx_init(&swapchain->present_timing.lock, 0); + + for (uint32_t i = 0; i < swapchain->image_count; i++) { + struct wsi_image *image = swapchain->get_wsi_image(swapchain, i); + result = wsi_image_init_timestamp(swapchain, image); + if (result != VK_SUCCESS) { + swapchain->destroy(swapchain, alloc); + return result; + } + } + + if (swapchain->poll_early_refresh) { + /* If we can query the display directly, we should report something reasonable on first query + * before we even present the first time. 
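+          * For a fixed-rate 60 Hz mode this seeds refresh_duration with
+          * 1e9 / 60 ~= 16'666'667 ns (and, for FRR, the same
+          * refresh_interval), so the very first
+          * vkGetSwapchainTimingPropertiesEXT call can return VK_SUCCESS
+          * instead of VK_NOT_READY.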
*/ + uint64_t interval; + uint64_t refresh_ns = swapchain->poll_early_refresh(swapchain, &interval); + if (refresh_ns) { + swapchain->present_timing.refresh_duration = refresh_ns; + swapchain->present_timing.refresh_interval = interval; + swapchain->present_timing.refresh_counter++; + } + } + } + return VK_SUCCESS; } @@ -1168,6 +1358,353 @@ wsi_ReleaseSwapchainImagesKHR(VkDevice _device, return VK_SUCCESS; } +static void +wsi_swapchain_present_timing_sample_query_pool(struct wsi_swapchain *chain, + struct wsi_presentation_timing *timing, + struct wsi_image *image, + uint64_t upper_bound) +{ + if (!(timing->requested_feedback & VK_PRESENT_STAGE_QUEUE_OPERATIONS_END_BIT_EXT)) + return; + + /* The GPU really should be done by now, and we should be able to read the timestamp, + * but it's possible that the present was discarded and we have a 0 timestamp here for the present. + * In this case, we should not block to wait on the queue dispatch timestamp. */ + uint64_t queue_ts; + + if (chain->wsi->GetQueryPoolResults(chain->device, image->query_pool, 0, 1, sizeof(uint64_t), + &queue_ts, sizeof(uint64_t), VK_QUERY_RESULT_64_BIT) != VK_SUCCESS) + return; + + /* There are two ways to deal with DEVICE timestamp domain. + * Either we can report PRESENT_STAGE_LOCAL domain and let application + * calibrate the timestamps on its own. However, this creates an annoying situation + * where application is able to QueuePresentKHR requesting we use QUEUE_OPERATIONS_END time domain as + * the reference (targetTimeDomainPresentStage). + * In that case, we are forced to re-calibrate the timestamp anyway. + * We will also need to implement per-driver plumbing to forward SWAPCHAIN_LOCAL and PRESENT_STAGE_LOCAL + * time domains to the swapchain and query the underlying time domain. + * Instead of dealing with this mess, just recalibrate the timestamp. The accuracy of queue_operations_end + * is not particularly important. */ + + /* We have already made sure that the implementation supports these. */ + const VkCalibratedTimestampInfoKHR infos[2] = { + { + .sType = VK_STRUCTURE_TYPE_CALIBRATED_TIMESTAMP_INFO_KHR, + .timeDomain = VK_TIME_DOMAIN_DEVICE_KHR, + }, + { + .sType = VK_STRUCTURE_TYPE_CALIBRATED_TIMESTAMP_INFO_KHR, + .timeDomain = chain->present_timing.time_domain, + }, + }; + + uint64_t timestamps[2]; + uint64_t max_deviation; + if (chain->wsi->GetCalibratedTimestampsKHR(chain->device, 2, infos, timestamps, &max_deviation) == VK_SUCCESS) { + int64_t device_delta_ticks = (int64_t)queue_ts - (int64_t)timestamps[0]; + int64_t device_delta_ns = (int64_t)((double)chain->wsi->timestamp_period * (double)device_delta_ticks); + uint64_t queue_timestamp = timestamps[1] + device_delta_ns; + + /* Make sure we don't report GPU completing after we flip the request. + * Avoids any weird precision issues creeping through. */ + if (upper_bound) + queue_timestamp = MIN2(queue_timestamp, upper_bound); + + timing->queue_done_time = queue_timestamp; + } +} + +static void +wsi_swapchain_present_timing_notify_recycle_locked(struct wsi_swapchain *chain, + struct wsi_image *image) +{ + assert(chain->present_timing.active); + + for (size_t i = 0; i < chain->present_timing.timings_count; i++) { + if (chain->present_timing.timings[i].image == image) { + /* A different present takes ownership of the image's query pool index now. */ + chain->present_timing.timings[i].image = NULL; + chain->present_timing.timings[i].queue_done_time = 0; + + /* We waited on progress fence, so the timestamp query is guaranteed to be done. 
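+          * The conversion done by wsi_swapchain_present_timing_sample_query_pool()
+          * reduces to (a sketch using that function's own variable names,
+          * abridged):
+          *
+          *   int64_t delta_ticks = (int64_t)queue_ts - (int64_t)timestamps[0];
+          *   int64_t delta_ns    = (int64_t)((double)wsi->timestamp_period *
+          *                                   (double)delta_ticks);
+          *   uint64_t reported   = timestamps[1] + delta_ns;
+          *
+          * i.e. the device timestamp is re-expressed in the swapchain time
+          * domain, clamped to upper_bound when one is known.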
*/ + wsi_swapchain_present_timing_sample_query_pool(chain, &chain->present_timing.timings[i], image, 0); + break; + } + } +} + +static VkResult wsi_common_allocate_timing_request( + struct wsi_swapchain *swapchain, const VkPresentTimingInfoEXT *timing, + uint64_t present_id, struct wsi_image *image) +{ + VkResult vr = VK_SUCCESS; + mtx_lock(&swapchain->present_timing.lock); + + if (swapchain->present_timing.timings_count >= swapchain->present_timing.timings_capacity) { + vr = VK_ERROR_PRESENT_TIMING_QUEUE_FULL_EXT; + goto err; + } + + wsi_swapchain_present_timing_notify_recycle_locked(swapchain, image); + + struct wsi_presentation_timing *wsi_timing = + &swapchain->present_timing.timings[swapchain->present_timing.timings_count++]; + + memset(wsi_timing, 0, sizeof(*wsi_timing)); + wsi_timing->serial = ++swapchain->present_timing.serial; + wsi_timing->target_time = timing->targetTime; + wsi_timing->present_id = present_id; + wsi_timing->requested_feedback = timing->presentStageQueries; + wsi_timing->image = image; + + /* Ignore the time domain since we have a static domain. */ + +err: + mtx_unlock(&swapchain->present_timing.lock); + return vr; +} + +void +wsi_swapchain_present_timing_notify_completion(struct wsi_swapchain *chain, + uint64_t timing_serial, + uint64_t timestamp, + struct wsi_image *image) +{ + assert(chain->present_timing.active); + mtx_lock(&chain->present_timing.lock); + + for (size_t i = 0; i < chain->present_timing.timings_count; i++) { + if (chain->present_timing.timings[i].serial == timing_serial) { + chain->present_timing.timings[i].complete_time = timestamp; + chain->present_timing.timings[i].complete = VK_TRUE; + + /* It's possible that QueuePresentKHR already handled the queue done timestamp for us, + * since the image was recycled before presentation could fully complete. + * In this case, we no longer own the timestamp query pool index, so just skip. */ + if (chain->present_timing.timings[i].image != image) + break; + + /* 0 means unknown. Application can probably fall back to its own timestamps if it wants to. */ + chain->present_timing.timings[i].queue_done_time = 0; + wsi_swapchain_present_timing_sample_query_pool(chain, &chain->present_timing.timings[i], image, timestamp); + chain->present_timing.timings[i].image = NULL; + break; + } + } + + mtx_unlock(&chain->present_timing.lock); +} + +void +wsi_swapchain_present_timing_update_refresh_rate(struct wsi_swapchain *chain, + uint64_t refresh_duration, + uint64_t refresh_interval, + int minimum_delta_for_update) +{ + mtx_lock(&chain->present_timing.lock); + + int64_t duration_delta = llabs((int64_t)refresh_duration - (int64_t)chain->present_timing.refresh_duration); + int64_t interval_delta = llabs((int64_t)refresh_interval - (int64_t)chain->present_timing.refresh_interval); + + /* When the refresh rate is an estimate, the value may fluctuate slightly frame to frame, + * don't spam refresh counter updates unless there is a meaningful delta. + * Applications that use absolute timings are expected to recalibrate based on feedback. */ + if (duration_delta > minimum_delta_for_update || interval_delta > minimum_delta_for_update || + chain->present_timing.refresh_counter == 0) { + /* We'll report this updated refresh counter in feedback, + * so that application knows to requery the refresh rate. 
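+       * E.g. with a minimum_delta_for_update of, say, 1'000'000 ns: a VRR
+       * estimate jittering between 6'944'000 and 6'952'000 ns (~144 Hz)
+       * differs by only 8'000 ns and keeps the counter stable, while a real
+       * 144 Hz -> 60 Hz mode switch (~6.9 ms -> ~16.7 ms) bumps it.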
*/ + chain->present_timing.refresh_counter++; + chain->present_timing.refresh_duration = refresh_duration; + chain->present_timing.refresh_interval = refresh_interval; + } + + mtx_unlock(&chain->present_timing.lock); +} + +VKAPI_ATTR VkResult VKAPI_CALL +wsi_GetPastPresentationTimingEXT( + VkDevice device, + const VkPastPresentationTimingInfoEXT* pPastPresentationTimingInfo, + VkPastPresentationTimingPropertiesEXT* pPastPresentationTimingProperties) +{ + VK_FROM_HANDLE(wsi_swapchain, swapchain, pPastPresentationTimingInfo->swapchain); + VkResult vr = VK_SUCCESS; + bool out_of_order = (pPastPresentationTimingInfo->flags & + VK_PAST_PRESENTATION_TIMING_ALLOW_OUT_OF_ORDER_RESULTS_BIT_EXT) != 0; + + if (swapchain->poll_timing_request) + swapchain->poll_timing_request(swapchain); + + mtx_lock(&swapchain->present_timing.lock); + + pPastPresentationTimingProperties->timingPropertiesCounter = swapchain->present_timing.refresh_counter; + pPastPresentationTimingProperties->timeDomainsCounter = 1; + + /* This implementation always returns results in-order, so can ignore the out-of-order flag. + * TODO: Honor the partial results flag. */ + + uint32_t done_count = 0; + for (uint32_t i = 0; i < swapchain->present_timing.timings_count; i++) { + /* If different presents request different kinds of state, we may get completion out of order. + * If flag is not set, we cannot report frame N until we have completed all frames M < N. */ + if (swapchain->present_timing.timings[i].complete) + done_count++; + else if (!out_of_order) + break; + } + + /* We don't remove timing info from queue until it is consumed. */ + if (!pPastPresentationTimingProperties->pPresentationTimings) { + pPastPresentationTimingProperties->presentationTimingCount = done_count; + mtx_unlock(&swapchain->present_timing.lock); + return VK_SUCCESS; + } + + VK_OUTARRAY_MAKE_TYPED(VkPastPresentationTimingEXT, timings, + pPastPresentationTimingProperties->pPresentationTimings, + &pPastPresentationTimingProperties->presentationTimingCount); + + uint32_t new_timings_count = 0; + bool stop_timing_removal = false; + + for (uint32_t i = 0; i < swapchain->present_timing.timings_count; i++) { + const struct wsi_presentation_timing *in_timing = &swapchain->present_timing.timings[i]; + + if (!swapchain->present_timing.timings[i].complete || stop_timing_removal) { + /* Keep output ordered to be compliant without having to re-sort every time. + * Queue depth for timestamps is expected to be small. */ + swapchain->present_timing.timings[new_timings_count++] = swapchain->present_timing.timings[i]; + if (!out_of_order) + stop_timing_removal = true; + continue; + } + + vk_outarray_append_typed(VkPastPresentationTimingEXT, &timings, timing) { + timing->targetTime = swapchain->present_timing.timings[i].target_time; + timing->presentId = in_timing->present_id; + timing->timeDomain = swapchain->present_timing.time_domain; + timing->timeDomainId = 0; + timing->reportComplete = in_timing->complete; + + /* No INCOMPLETE is reported here. Failures are silent. + * However, application already knows upper bound for stage count based on the query, + * so this should never fail. 
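+         * E.g. a present that requested QUEUE_OPERATIONS_END plus one
+         * backend stage (IMAGE_FIRST_PIXEL_OUT on the display backend)
+         * gets exactly two entries appended below, matching the two bits
+         * the application saw in presentStageQueries.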
*/ + VK_OUTARRAY_MAKE_TYPED(VkPresentStageTimeEXT, stages, timing->pPresentStages, &timing->presentStageCount); + + if (in_timing->requested_feedback & VK_PRESENT_STAGE_QUEUE_OPERATIONS_END_BIT_EXT) { + vk_outarray_append_typed(VkPresentStageTimeEXT, &stages, stage) { + stage->stage = VK_PRESENT_STAGE_QUEUE_OPERATIONS_END_BIT_EXT; + stage->time = in_timing->queue_done_time; + } + } + + if (in_timing->requested_feedback & ~VK_PRESENT_STAGE_QUEUE_OPERATIONS_END_BIT_EXT) { + vk_outarray_append_typed(VkPresentStageTimeEXT, &stages, stage) { + stage->stage = in_timing->requested_feedback & ~VK_PRESENT_STAGE_QUEUE_OPERATIONS_END_BIT_EXT; + /* It is expected that implementation will only expose one timing value. */ + assert(util_bitcount(stage->stage) == 1); + stage->time = in_timing->complete_time; + } + } + } + } + + swapchain->present_timing.timings_count = new_timings_count; + vr = vk_outarray_status(&timings); + + /* This function is fully atomic within implementation, so have to be thread safe. */ + mtx_unlock(&swapchain->present_timing.lock); + return vr; +} + +VKAPI_ATTR VkResult VKAPI_CALL +wsi_GetSwapchainTimeDomainPropertiesEXT( + VkDevice device, + VkSwapchainKHR swapchain_, + VkSwapchainTimeDomainPropertiesEXT* pSwapchainTimeDomainProperties, + uint64_t* pTimeDomainsCounter) +{ + VK_FROM_HANDLE(wsi_swapchain, swapchain, swapchain_); + + /* We don't change time domains. Everything is static. */ + if (pTimeDomainsCounter) + *pTimeDomainsCounter = 1; + + /* This style is a bit goofy and doesn't map cleanly to anything. */ + if (!pSwapchainTimeDomainProperties->pTimeDomainIds && !pSwapchainTimeDomainProperties->pTimeDomains) { + pSwapchainTimeDomainProperties->timeDomainCount = 1; + return VK_SUCCESS; + } else if (pSwapchainTimeDomainProperties->timeDomainCount == 0) { + return VK_INCOMPLETE; + } + + pSwapchainTimeDomainProperties->timeDomainCount = 1; + if (pSwapchainTimeDomainProperties->pTimeDomains) + *pSwapchainTimeDomainProperties->pTimeDomains = swapchain->present_timing.time_domain; + if (pSwapchainTimeDomainProperties->pTimeDomainIds) + *pSwapchainTimeDomainProperties->pTimeDomainIds = 0; + + return VK_SUCCESS; +} + +VKAPI_ATTR VkResult VKAPI_CALL +wsi_GetSwapchainTimingPropertiesEXT( + VkDevice device, + VkSwapchainKHR swapchain_, + VkSwapchainTimingPropertiesEXT* pSwapchainTimingProperties, + uint64_t* pSwapchainTimingPropertiesCounter) +{ + VK_FROM_HANDLE(wsi_swapchain, swapchain, swapchain_); + VkResult vr; + + mtx_lock(&swapchain->present_timing.lock); + /* If we don't have data yet, must return VK_NOT_READY. */ + vr = swapchain->present_timing.refresh_counter ? 
VK_SUCCESS : VK_NOT_READY; + pSwapchainTimingProperties->refreshInterval = swapchain->present_timing.refresh_interval; + pSwapchainTimingProperties->refreshDuration = swapchain->present_timing.refresh_duration; + if (pSwapchainTimingPropertiesCounter) + *pSwapchainTimingPropertiesCounter = swapchain->present_timing.refresh_counter; + mtx_unlock(&swapchain->present_timing.lock); + return vr; +} + +VKAPI_ATTR VkResult VKAPI_CALL +wsi_SetSwapchainPresentTimingQueueSizeEXT( + VkDevice device, + VkSwapchainKHR swapchain_, + uint32_t size) +{ + VK_FROM_HANDLE(wsi_swapchain, swapchain, swapchain_); + assert(swapchain->present_timing.active); + VkResult vr = VK_SUCCESS; + + mtx_lock(&swapchain->present_timing.lock); + + if (size < swapchain->present_timing.timings_count) { + vr = VK_NOT_READY; + goto error; + } + + if (size > swapchain->present_timing.timings_capacity) { + void *new_ptr = vk_realloc(&swapchain->alloc, swapchain->present_timing.timings, + sizeof(*swapchain->present_timing.timings) * size, 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + if (new_ptr) { + swapchain->present_timing.timings = new_ptr; + swapchain->present_timing.timings_capacity = size; + } else { + vr = VK_ERROR_OUT_OF_HOST_MEMORY; + goto error; + } + } else { + swapchain->present_timing.timings_capacity = size; + } + +error: + mtx_unlock(&swapchain->present_timing.lock); + return vr; +} + VkDeviceMemory wsi_common_get_memory(VkSwapchainKHR _swapchain, uint32_t index) { @@ -1521,6 +2058,50 @@ wsi_common_queue_present(const struct wsi_device *wsi, vk_find_struct_const(pPresentInfo->pNext, PRESENT_ID_2_KHR); const VkSwapchainPresentFenceInfoKHR *present_fence_info = vk_find_struct_const(pPresentInfo->pNext, SWAPCHAIN_PRESENT_FENCE_INFO_KHR); + const VkPresentTimingsInfoEXT *present_timings_info = + vk_find_struct_const(pPresentInfo->pNext, PRESENT_TIMINGS_INFO_EXT); + + bool needs_timing_command_buffer = false; + + if (present_timings_info) { + /* If we fail a present due to full queue, it's a little unclear from + * spec if we should treat it as OUT_OF_DATE or OUT_OF_HOST_MEMORY for + * purposes of signaling. Validation layers and at least one other implementation + * in the wild seems to treat it as OUT_OF_DATE, so do that. */ + for (uint32_t i = 0; i < present_timings_info->swapchainCount; i++) { + const VkPresentTimingInfoEXT *info = &present_timings_info->pTimingInfos[i]; + VK_FROM_HANDLE(wsi_swapchain, swapchain, pPresentInfo->pSwapchains[i]); + if (results[i] != VK_SUCCESS || !swapchain->set_timing_request) + continue; + + assert(swapchain->present_timing.active); + + uint32_t image_index = pPresentInfo->pImageIndices[i]; + + /* EXT_present_timing is defined to only work with present_id2. + * It's only used when reporting back timings. */ + results[i] = wsi_common_allocate_timing_request( + swapchain, info, present_ids2 ? present_ids2->pPresentIds[i] : 0, + swapchain->get_wsi_image(swapchain, image_index)); + + /* Application is responsible for allocating sufficient size here. + * We fail with VK_ERROR_PRESENT_TIMING_QUEUE_FULL_EXT if application is bugged. */ + if (results[i] == VK_SUCCESS) { + swapchain->set_timing_request(swapchain, &(struct wsi_image_timing_request) { + .serial = swapchain->present_timing.serial, + .time = info->targetTime, + .flags = info->flags, + }); + + if (info->presentStageQueries & VK_PRESENT_STAGE_QUEUE_OPERATIONS_END_BIT_EXT) { + /* It's not a problem if we redundantly submit timing command buffers. 
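+            * Each image's timestamp command buffer resets its
+            * single-entry query pool before writing the new timestamp,
+            * so a re-submission simply overwrites the previous value.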
+ * VUID-12234 also says all swapchains in this present must have been + * created with present timing enabled. */ + needs_timing_command_buffer = true; + } + } + } + } /* Gather up all the semaphores and fences we need to signal per-image */ STACK_ARRAY(struct wsi_image_signal_info, image_signal_infos, @@ -1596,15 +2177,15 @@ wsi_common_queue_present(const struct wsi_device *wsi, * the per-image semaphores and fences with the blit. */ { - STACK_ARRAY(VkCommandBufferSubmitInfo, blit_command_buffer_infos, - pPresentInfo->swapchainCount); + STACK_ARRAY(VkCommandBufferSubmitInfo, command_buffer_infos, + pPresentInfo->swapchainCount * 2); STACK_ARRAY(VkSemaphoreSubmitInfo, signal_semaphore_infos, pPresentInfo->swapchainCount * ARRAY_SIZE(image_signal_infos[0].semaphore_infos)); STACK_ARRAY(VkFence, fences, pPresentInfo->swapchainCount * ARRAY_SIZE(image_signal_infos[0].fences)); - uint32_t blit_count = 0, signal_semaphore_count = 0, fence_count = 0; + uint32_t command_buffer_count = 0, signal_semaphore_count = 0, fence_count = 0; for (uint32_t i = 0; i < pPresentInfo->swapchainCount; i++) { VK_FROM_HANDLE(wsi_swapchain, swapchain, pPresentInfo->pSwapchains[i]); @@ -1612,14 +2193,27 @@ wsi_common_queue_present(const struct wsi_device *wsi, struct wsi_image *image = swapchain->get_wsi_image(swapchain, image_index); + bool separate_queue_blit = swapchain->blit.type != WSI_SWAPCHAIN_NO_BLIT && + swapchain->blit.queue != NULL; + + /* For TIMING_QUEUE_FULL_EXT, ensure sync objects are signaled, + * but don't do any real work. */ + if (results[i] == VK_ERROR_PRESENT_TIMING_QUEUE_FULL_EXT || !separate_queue_blit) { + for (uint32_t j = 0; j < image_signal_infos[i].semaphore_count; j++) { + signal_semaphore_infos[signal_semaphore_count++] = + image_signal_infos[i].semaphore_infos[j]; + } + for (uint32_t j = 0; j < image_signal_infos[i].fence_count; j++) + fences[fence_count++] = image_signal_infos[i].fences[j]; + } + if (results[i] != VK_SUCCESS) continue; /* If we're blitting on another swapchain, just signal the blit * semaphore for now. 
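+       * The actual blit (plus, when requested, the timestamp write) is
+       * submitted on the dedicated blit queue in the second loop below,
+       * gated on this semaphore.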
*/ - if (swapchain->blit.type != WSI_SWAPCHAIN_NO_BLIT && - swapchain->blit.queue != NULL) { + if (separate_queue_blit) { /* Create the blit semaphore if needed */ if (swapchain->blit.semaphores[image_index] == VK_NULL_HANDLE) { const VkSemaphoreCreateInfo sem_info = { @@ -1644,27 +2238,27 @@ wsi_common_queue_present(const struct wsi_device *wsi, } if (swapchain->blit.type != WSI_SWAPCHAIN_NO_BLIT) { - blit_command_buffer_infos[blit_count++] = (VkCommandBufferSubmitInfo) { + command_buffer_infos[command_buffer_count++] = (VkCommandBufferSubmitInfo) { .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_SUBMIT_INFO, .commandBuffer = image->blit.cmd_buffers[queue->queue_family_index], }; } - for (uint32_t j = 0; j < image_signal_infos[i].semaphore_count; j++) { - signal_semaphore_infos[signal_semaphore_count++] = - image_signal_infos[i].semaphore_infos[j]; + if (needs_timing_command_buffer) { + command_buffer_infos[command_buffer_count++] = (VkCommandBufferSubmitInfo) { + .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_SUBMIT_INFO, + .commandBuffer = image->timestamp_cmd_buffers[queue->queue_family_index], + }; } - for (uint32_t j = 0; j < image_signal_infos[i].fence_count; j++) - fences[fence_count++] = image_signal_infos[i].fences[j]; } const VkSubmitInfo2 submit_info = { .sType = VK_STRUCTURE_TYPE_SUBMIT_INFO_2, .waitSemaphoreInfoCount = pPresentInfo->waitSemaphoreCount, .pWaitSemaphoreInfos = semaphore_wait_infos, - .commandBufferInfoCount = blit_count, - .pCommandBufferInfos = blit_command_buffer_infos, + .commandBufferInfoCount = command_buffer_count, + .pCommandBufferInfos = command_buffer_infos, .signalSemaphoreInfoCount = signal_semaphore_count, .pSignalSemaphoreInfos = signal_semaphore_infos, }; @@ -1680,7 +2274,7 @@ wsi_common_queue_present(const struct wsi_device *wsi, STACK_ARRAY_FINISH(fences); STACK_ARRAY_FINISH(signal_semaphore_infos); - STACK_ARRAY_FINISH(blit_command_buffer_infos); + STACK_ARRAY_FINISH(command_buffer_infos); } /* Now do blits on any blit queues */ @@ -1693,8 +2287,10 @@ wsi_common_queue_present(const struct wsi_device *wsi, if (results[i] != VK_SUCCESS) continue; - if (swapchain->blit.type == WSI_SWAPCHAIN_NO_BLIT || - swapchain->blit.queue == NULL) + bool separate_queue_blit = swapchain->blit.type != WSI_SWAPCHAIN_NO_BLIT && + swapchain->blit.queue != NULL; + + if (!separate_queue_blit) continue; const VkSemaphoreSubmitInfo blit_semaphore_info = { @@ -1703,17 +2299,27 @@ wsi_common_queue_present(const struct wsi_device *wsi, .semaphore = swapchain->blit.semaphores[image_index], }; - const VkCommandBufferSubmitInfo blit_command_buffer_info = { + VkCommandBufferSubmitInfo command_buffer_infos[2]; + uint32_t command_buffer_count = 0; + + command_buffer_infos[command_buffer_count++] = (VkCommandBufferSubmitInfo) { .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_SUBMIT_INFO, .commandBuffer = image->blit.cmd_buffers[0], }; + if (needs_timing_command_buffer) { + command_buffer_infos[command_buffer_count++] = (VkCommandBufferSubmitInfo) { + .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_SUBMIT_INFO, + .commandBuffer = image->timestamp_cmd_buffers[0], + }; + } + const VkSubmitInfo2 submit_info = { .sType = VK_STRUCTURE_TYPE_SUBMIT_INFO_2, .waitSemaphoreInfoCount = 1, .pWaitSemaphoreInfos = &blit_semaphore_info, - .commandBufferInfoCount = 1, - .pCommandBufferInfos = &blit_command_buffer_info, + .commandBufferInfoCount = command_buffer_count, + .pCommandBufferInfos = command_buffer_infos, .signalSemaphoreInfoCount = image_signal_infos[i].semaphore_count, .pSignalSemaphoreInfos = 
image_signal_infos[i].semaphore_infos, }; diff --git a/src/vulkan/wsi/wsi_common.h b/src/vulkan/wsi/wsi_common.h index ae6b3f7b87c..4a86f35c1de 100644 --- a/src/vulkan/wsi/wsi_common.h +++ b/src/vulkan/wsi/wsi_common.h @@ -62,6 +62,8 @@ struct wsi_device { VkPhysicalDeviceMemoryProperties memory_props; uint32_t queue_family_count; uint64_t queue_supports_blit; + uint64_t queue_supports_timestamps; + float timestamp_period; VkPhysicalDeviceDrmPropertiesEXT drm_info; VkPhysicalDevicePCIBusInfoPropertiesEXT pci_bus_info; @@ -201,28 +203,37 @@ struct wsi_device { WSI_CB(CmdPipelineBarrier); WSI_CB(CmdCopyImage); WSI_CB(CmdCopyImageToBuffer); + WSI_CB(CmdResetQueryPool); + WSI_CB(CmdWriteTimestamp); WSI_CB(CreateBuffer); WSI_CB(CreateCommandPool); WSI_CB(CreateFence); WSI_CB(CreateImage); + WSI_CB(CreateQueryPool); WSI_CB(CreateSemaphore); WSI_CB(DestroyBuffer); WSI_CB(DestroyCommandPool); WSI_CB(DestroyFence); WSI_CB(DestroyImage); + WSI_CB(DestroyQueryPool); WSI_CB(DestroySemaphore); WSI_CB(EndCommandBuffer); WSI_CB(FreeMemory); WSI_CB(FreeCommandBuffers); WSI_CB(GetBufferMemoryRequirements); + WSI_CB(GetCalibratedTimestampsKHR); WSI_CB(GetFenceStatus); WSI_CB(GetImageDrmFormatModifierPropertiesEXT); WSI_CB(GetImageMemoryRequirements); WSI_CB(GetImageSubresourceLayout); WSI_CB(GetMemoryFdKHR); + WSI_CB(GetPhysicalDeviceCalibrateableTimeDomainsKHR); + WSI_CB(GetPhysicalDeviceProperties); WSI_CB(GetPhysicalDeviceFormatProperties); WSI_CB(GetPhysicalDeviceFormatProperties2); WSI_CB(GetPhysicalDeviceImageFormatProperties2); + WSI_CB(GetPhysicalDeviceQueueFamilyProperties); + WSI_CB(GetQueryPoolResults); WSI_CB(GetSemaphoreFdKHR); WSI_CB(ResetFences); WSI_CB(QueueSubmit2); diff --git a/src/vulkan/wsi/wsi_common_display.c b/src/vulkan/wsi/wsi_common_display.c index 4c9451a6d00..dc00551afb0 100644 --- a/src/vulkan/wsi/wsi_common_display.c +++ b/src/vulkan/wsi/wsi_common_display.c @@ -156,6 +156,12 @@ enum colorspace_enum { COLORSPACE_ENUM_MAX, }; +enum vrr_tristate { + VRR_TRISTATE_UNKNOWN, + VRR_TRISTATE_DISABLED, + VRR_TRISTATE_ENABLED, +}; + typedef struct wsi_display_connector_metadata { VkHdrMetadataEXT hdr_metadata; bool supports_st2084; @@ -185,6 +191,10 @@ typedef struct wsi_display_connector { struct wsi_display_connector_metadata metadata; uint32_t count_formats; uint32_t *formats; + enum vrr_tristate vrr_capable; + enum vrr_tristate vrr_enabled; + uint64_t last_frame; + uint64_t last_nsec; } wsi_display_connector; struct wsi_display { @@ -370,6 +380,11 @@ find_properties(struct wsi_display_connector *connector, uint32_t count_props, u } } + if (!strcmp(prop->name, "vrr_capable")) + connector->vrr_capable = prop_values[p] != 0 ? VRR_TRISTATE_ENABLED : VRR_TRISTATE_DISABLED; + if (!strcmp(prop->name, "VRR_ENABLED")) + connector->vrr_enabled = prop_values[p] != 0 ? 
VRR_TRISTATE_ENABLED : VRR_TRISTATE_DISABLED; + drmModeFreeProperty(prop); } @@ -431,38 +446,45 @@ find_connector_properties(struct wsi_display_connector *connector, drmModeConnec enum wsi_image_state { WSI_IMAGE_IDLE, WSI_IMAGE_DRAWING, + WSI_IMAGE_WAITING, + WSI_IMAGE_QUEUED_AFTER_WAIT, WSI_IMAGE_QUEUED, WSI_IMAGE_FLIPPING, WSI_IMAGE_DISPLAYING }; struct wsi_display_image { - struct wsi_image base; - struct wsi_display_swapchain *chain; - enum wsi_image_state state; - uint32_t fb_id; - uint32_t buffer[4]; - uint64_t flip_sequence; - uint64_t present_id; + struct wsi_image base; + struct wsi_display_swapchain *chain; + enum wsi_image_state state; + uint32_t fb_id; + uint32_t buffer[4]; + uint64_t flip_sequence; + uint64_t present_id; + struct wsi_image_timing_request timing_request; + struct wsi_display_fence *fence; + uint64_t minimum_ns; }; struct wsi_display_swapchain { - struct wsi_swapchain base; - struct wsi_display *wsi; - VkIcdSurfaceDisplay *surface; - uint64_t flip_sequence; - VkResult status; + struct wsi_swapchain base; + struct wsi_display *wsi; + VkIcdSurfaceDisplay *surface; + uint64_t flip_sequence; + VkResult status; - mtx_t present_id_mutex; - struct u_cnd_monotonic present_id_cond; - uint64_t present_id; - VkResult present_id_error; + mtx_t present_id_mutex; + struct u_cnd_monotonic present_id_cond; + uint64_t present_id; + VkResult present_id_error; /* A unique ID for the color outcome of the swapchain. A serial of 0 means unset/default. */ - uint64_t color_outcome_serial; - VkHdrMetadataEXT hdr_metadata; + uint64_t color_outcome_serial; + VkHdrMetadataEXT hdr_metadata; - struct wsi_display_image images[0]; + struct wsi_image_timing_request timing_request; + + struct wsi_display_image images[0]; }; struct wsi_display_fence { @@ -473,6 +495,9 @@ struct wsi_display_fence { uint32_t syncobj; /* syncobj to signal on event */ uint64_t sequence; bool device_event; /* fence is used for device events */ + struct wsi_display_connector *connector; + /* Image to be flipped, if this fence is for an image in the WSI_IMAGE_WAITING state that will need to move to QUEUED. */ + struct wsi_display_image *image; }; struct wsi_display_sync { @@ -1319,6 +1344,16 @@ wsi_display_surface_get_capabilities2(VkIcdSurfaceBase *icd_surface, break; } + case VK_STRUCTURE_TYPE_PRESENT_TIMING_SURFACE_CAPABILITIES_EXT: { + VkPresentTimingSurfaceCapabilitiesEXT *wait = (void *)ext; + + wait->presentStageQueries = VK_PRESENT_STAGE_IMAGE_FIRST_PIXEL_OUT_BIT_EXT; + wait->presentTimingSupported = VK_TRUE; + wait->presentAtAbsoluteTimeSupported = VK_TRUE; + wait->presentAtRelativeTimeSupported = VK_TRUE; + break; + } + default: /* Ignored */ break; @@ -1678,6 +1713,8 @@ wsi_display_image_init(struct wsi_swapchain *drv_chain, image->chain = chain; image->state = WSI_IMAGE_IDLE; + image->fence = NULL; + image->minimum_ns = 0; image->fb_id = 0; uint64_t *fb_modifiers = NULL; @@ -1789,6 +1826,12 @@ wsi_display_idle_old_displaying(struct wsi_display_image *active_image) static VkResult _wsi_display_queue_next(struct wsi_swapchain *drv_chain); +static uint64_t +widen_32_to_64(uint32_t narrow, uint64_t near) +{ + return near + (int32_t)(narrow - near); +} + /** * Wakes up any vkWaitForPresentKHR() waiters on the last present to this * image. @@ -1817,6 +1860,17 @@ wsi_display_surface_error(struct wsi_display_swapchain *swapchain, VkResult resu mtx_unlock(&swapchain->present_id_mutex); } +/** + * libdrm callback for when we get a DRM_EVENT_PAGE_FLIP in response to our + * atomic commit with DRM_MODE_PAGE_FLIP_EVENT. 
That event can happen at any + * point after vblank, when the old image is no longer being scanned out and + * that commit is set up to be scanned out next. + * + * This means that we can queue up a new atomic commit, if there were presents + * that we hadn't submitted yet (the event queue is driven by + * wsi_display_wait_thread(), so that's what ends up submitting atomic commits + * most of the time). + **/ static void wsi_display_page_flip_handler2(int fd, unsigned int frame, @@ -1828,6 +1882,28 @@ wsi_display_page_flip_handler2(int fd, struct wsi_display_image *image = data; struct wsi_display_swapchain *chain = image->chain; + VkIcdSurfaceDisplay *surface = chain->surface; + wsi_display_mode *display_mode = + wsi_display_mode_from_handle(surface->displayMode); + wsi_display_connector *connector = display_mode->connector; + + uint64_t nsec = 1000000000ull * sec + 1000ull * usec; + /* If we're on VRR timing path, ensure we get a stable pace. */ + nsec = MAX2(nsec, image->minimum_ns); + + uint64_t frame64 = widen_32_to_64(frame, connector->last_frame); + connector->last_frame = frame64; + connector->last_nsec = nsec; + + /* Never update the refresh rate estimate. It's static based on the mode. + * Update this before we signal present wait so that applications + * get lowest possible latency for present time. */ + if (image->timing_request.serial) { + wsi_swapchain_present_timing_notify_completion( + &chain->base, image->timing_request.serial, + nsec, &image->base); + } + wsi_display_debug("image %ld displayed at %d\n", image - &(image->chain->images[0]), frame); image->state = WSI_IMAGE_DISPLAYING; @@ -1841,42 +1917,29 @@ wsi_display_page_flip_handler2(int fd, chain->status = result; } -static void wsi_display_fence_event_handler(struct wsi_display_fence *fence); - -static void wsi_display_page_flip_handler(int fd, - unsigned int frame, - unsigned int sec, - unsigned int usec, - void *data) -{ - wsi_display_page_flip_handler2(fd, frame, sec, usec, 0, data); -} - -static void wsi_display_vblank_handler(int fd, unsigned int frame, - unsigned int sec, unsigned int usec, - void *data) -{ - struct wsi_display_fence *fence = data; - - wsi_display_fence_event_handler(fence); -} +static void wsi_display_fence_event_handler(struct wsi_display_fence *fence, + uint64_t nsec, + uint64_t frame); +/** + * libdrm callback for when we get a DRM_EVENT_CRTC_SEQUENCE in response to a + * drmCrtcQueueSequence(), indicating that the first pixel of a new frame is + * being scanned out. 
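+ * Unlike the legacy vblank event, the sequence event carries a 64-bit
+ * frame count and a nanosecond timestamp, which is what the
+ * present-timing bookkeeping wants; the legacy page_flip_handler and
+ * vblank_handler entries are accordingly NULL in event_context below.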
+ **/ static void wsi_display_sequence_handler(int fd, uint64_t frame, uint64_t nsec, uint64_t user_data) { struct wsi_display_fence *fence = (struct wsi_display_fence *) (uintptr_t) user_data; - wsi_display_fence_event_handler(fence); + wsi_display_fence_event_handler(fence, nsec, frame); } static drmEventContext event_context = { .version = DRM_EVENT_CONTEXT_VERSION, - .page_flip_handler = wsi_display_page_flip_handler, -#if DRM_EVENT_CONTEXT_VERSION >= 3 + .page_flip_handler = NULL, .page_flip_handler2 = wsi_display_page_flip_handler2, -#endif - .vblank_handler = wsi_display_vblank_handler, + .vblank_handler = NULL, .sequence_handler = wsi_display_sequence_handler, }; @@ -2383,13 +2446,30 @@ wsi_display_fence_check_free(struct wsi_display_fence *fence) vk_free(fence->wsi->alloc, fence); } -static void wsi_display_fence_event_handler(struct wsi_display_fence *fence) +static void wsi_display_fence_event_handler(struct wsi_display_fence *fence, + uint64_t nsec, uint64_t frame) { + struct wsi_display_connector *connector = fence->connector; + struct wsi_display_image *image = fence->image; + if (fence->syncobj) { (void) drmSyncobjSignal(fence->wsi->syncobj_fd, &fence->syncobj, 1); (void) drmSyncobjDestroy(fence->wsi->syncobj_fd, fence->syncobj); } + if (connector) { + connector->last_nsec = nsec; + connector->last_frame = frame; + } + + if (image && image->state == WSI_IMAGE_WAITING) { + /* We may need to do the final sleep on CPU to resolve VRR timings. */ + image->state = WSI_IMAGE_QUEUED_AFTER_WAIT; + VkResult result = _wsi_display_queue_next(&image->chain->base); + if (result != VK_SUCCESS) + image->chain->status = result; + } + fence->event_received = true; wsi_display_fence_check_free(fence); } @@ -2822,9 +2902,11 @@ _wsi_display_queue_next(struct wsi_swapchain *drv_chain) switch (tmp_image->state) { case WSI_IMAGE_FLIPPING: - /* already flipping, don't send another to the kernel yet */ + case WSI_IMAGE_WAITING: + /* already flipping or waiting for a flip, don't send another to the kernel yet */ return VK_SUCCESS; case WSI_IMAGE_QUEUED: + case WSI_IMAGE_QUEUED_AFTER_WAIT: /* find the oldest queued */ if (!image || tmp_image->flip_sequence < image->flip_sequence) image = tmp_image; @@ -2837,6 +2919,95 @@ _wsi_display_queue_next(struct wsi_swapchain *drv_chain) if (!image) return VK_SUCCESS; + if (image->fence) { + image->fence->image = NULL; + wsi_display_fence_destroy(image->fence); + image->fence = NULL; + } + + unsigned num_cycles_to_skip = 0; + int64_t target_relative_ns = 0; + bool skip_timing = false; + bool nearest_cycle = + (image->timing_request.flags & VK_PRESENT_TIMING_INFO_PRESENT_AT_NEAREST_REFRESH_CYCLE_BIT_EXT) != 0; + + if (image->timing_request.time != 0) { + /* Ensure we have some kind of timebase to work from. */ + if (!connector->last_frame) + drmCrtcGetSequence(wsi->fd, connector->crtc_id, &connector->last_frame, &connector->last_nsec); + + if (!connector->last_frame || chain->base.present_timing.refresh_duration == 0) { + /* Something has gone very wrong. Just ignore present timing for safety. */ + skip_timing = true; + wsi_display_debug("Cannot get a stable timebase, last frame = %"PRIu64", refresh_duration = %"PRIu64".\n", + connector->last_frame, chain->base.present_timing.refresh_duration); + } + } + + if (!skip_timing && image->state == WSI_IMAGE_QUEUED && image->timing_request.time != 0) { + target_relative_ns = (int64_t)image->timing_request.time; + + /* We need to estimate number of refresh cycles to wait for. 
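+       * E.g. with refresh_duration = 16'666'667 ns (60 Hz) and a relative
+       * target of 40 ms (2.4 cycles), the default path computes
+       * (40'000'000 - 1'000) / 16'666'667 = 2 cycles to skip, landing on
+       * the first vblank at or after the target, while
+       * NEAREST_REFRESH_CYCLE first pulls the target back half a cycle,
+       * (40'000'000 - 8'333'333) / 16'666'667 = 1, landing on whichever
+       * vblank is closest to the target.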
*/ + if (!(image->timing_request.flags & VK_PRESENT_TIMING_INFO_PRESENT_AT_RELATIVE_TIME_BIT_EXT)) { + target_relative_ns -= (int64_t)connector->last_nsec; + } + + if (nearest_cycle) { + /* No need to lock, we never update refresh_duration dynamically. */ + target_relative_ns -= (int64_t)chain->base.present_timing.refresh_duration / 2; + } else { + /* If application is computing an exact value that lands exactly on the refresh cycle, + * pull back the estimate a little bit since DRM precision is 1us. */ + target_relative_ns -= 1000; + } + } + + target_relative_ns = MAX2(target_relative_ns, 0); + if (target_relative_ns && chain->base.present_timing.refresh_duration) + num_cycles_to_skip = target_relative_ns / chain->base.present_timing.refresh_duration; + + /* CRTC cycles is not reliable on VRR. We cannot use that as a time base. */ + bool is_vrr = connector->vrr_enabled == VRR_TRISTATE_ENABLED && + connector->vrr_capable == VRR_TRISTATE_ENABLED; + + if (num_cycles_to_skip) { + if (!is_vrr) { + /* On FRR, we can rely on vblank events to guide time progression. */ + VkDisplayKHR display = wsi_display_connector_to_handle(connector); + image->fence = wsi_display_fence_alloc(wsi, -1); + + if (image->fence) { + image->fence->connector = connector; + image->fence->image = image; + + uint64_t frame_queued; + uint64_t target_frame = connector->last_frame + num_cycles_to_skip; + VkResult result = wsi_register_vblank_event(image->fence, chain->base.wsi, display, + 0, target_frame, &frame_queued); + + if (result == VK_SUCCESS && frame_queued <= target_frame) { + /* Wait until the vblank fence signals and the event handler will attempt to requeue us. */ + image->state = WSI_IMAGE_WAITING; + return VK_SUCCESS; + } + } + } else { + /* On a VRR display, applications can request frame times which are fractional, + * and there is no good way to target absolute time with atomic commits it seems ... */ + int64_t target_ns = target_relative_ns + (int64_t)connector->last_nsec; + image->minimum_ns = target_ns; + + /* Account for some minimum delay in submitting a page flip until it's processed and sleep jitter. + * We will compensate for the difference if there is any, so that we don't report completion + * times in the past. */ + target_ns -= 1 * 1000 * 1000; + + os_time_nanosleep_until(target_ns); + } + } + + image->state = WSI_IMAGE_QUEUED; + int ret = drm_atomic_commit(connector, image); if (ret == 0) { image->state = WSI_IMAGE_FLIPPING; @@ -2859,6 +3030,44 @@ _wsi_display_queue_next(struct wsi_swapchain *drv_chain) } } +static void +wsi_display_set_timing_request(struct wsi_swapchain *drv_chain, + const struct wsi_image_timing_request *request) +{ + struct wsi_display_swapchain *chain = + (struct wsi_display_swapchain *) drv_chain; + chain->timing_request = *request; +} + +static uint64_t +wsi_display_poll_refresh_duration(struct wsi_swapchain *drv_chain, uint64_t *interval) +{ + struct wsi_display_swapchain *chain = + (struct wsi_display_swapchain *)drv_chain; + VkIcdSurfaceDisplay *surface = chain->surface; + wsi_display_mode *display_mode = + wsi_display_mode_from_handle(surface->displayMode); + double refresh = wsi_display_mode_refresh(display_mode); + wsi_display_connector *connector = display_mode->connector; + + uint64_t refresh_ns = (uint64_t)(floor(1.0 / refresh * 1e9 + 0.5)); + + /* Assume FRR by default. */ + *interval = refresh_ns; + + /* If VRR is not enabled on the target CRTC, we should honor that. 
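+    * The *interval encoding used here: equal to the refresh duration
+    * means fixed rate, UINT64_MAX means fully variable, and 0 means the
+    * VRR state could not be determined.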
+ * There is no mechanism to clearly request that VRR is desired, + * so we must assume that user might force us into FRR mode. */ + if (connector->vrr_capable == VRR_TRISTATE_ENABLED) { + if (connector->vrr_enabled == VRR_TRISTATE_UNKNOWN) + *interval = 0; /* Somehow we don't know if the connector is VRR or FRR. Report unknown. */ + else if (connector->vrr_enabled == VRR_TRISTATE_ENABLED) + *interval = UINT64_MAX; + } + + return refresh_ns; +} + static VkResult wsi_display_queue_present(struct wsi_swapchain *drv_chain, uint32_t image_index, @@ -2876,16 +3085,19 @@ wsi_display_queue_present(struct wsi_swapchain *drv_chain, return chain->status; image->present_id = present_id; + image->timing_request = chain->timing_request; assert(image->state == WSI_IMAGE_DRAWING); wsi_display_debug("present %d\n", image_index); mtx_lock(&wsi->wait_mutex); - /* Make sure that the page flip handler is processed in finite time if using present wait. */ - if (present_id) + /* Make sure that the page flip handler is processed in finite time if using present wait + * or presentation time. */ + if (present_id || chain->timing_request.serial) wsi_display_start_wait_thread(wsi); + memset(&chain->timing_request, 0, sizeof(chain->timing_request)); image->flip_sequence = ++chain->flip_sequence; image->state = WSI_IMAGE_QUEUED; @@ -3045,6 +3257,9 @@ wsi_display_surface_create_swapchain( chain->base.acquire_next_image = wsi_display_acquire_next_image; chain->base.release_images = wsi_display_release_images; chain->base.queue_present = wsi_display_queue_present; + chain->base.set_timing_request = wsi_display_set_timing_request; + chain->base.poll_early_refresh = wsi_display_poll_refresh_duration; + chain->base.present_timing.time_domain = VK_TIME_DOMAIN_CLOCK_MONOTONIC_KHR; chain->base.wait_for_present = wsi_display_wait_for_present; chain->base.wait_for_present2 = wsi_display_wait_for_present; chain->base.set_hdr_metadata = wsi_display_set_hdr_metadata; diff --git a/src/vulkan/wsi/wsi_common_headless.c b/src/vulkan/wsi/wsi_common_headless.c index 46db5a49f09..807148b4f30 100644 --- a/src/vulkan/wsi/wsi_common_headless.c +++ b/src/vulkan/wsi/wsi_common_headless.c @@ -112,6 +112,16 @@ wsi_headless_surface_get_capabilities2(VkIcdSurfaceBase *surface, break; } + case VK_STRUCTURE_TYPE_PRESENT_TIMING_SURFACE_CAPABILITIES_EXT: { + VkPresentTimingSurfaceCapabilitiesEXT *wait = (void *)ext; + + wait->presentStageQueries = 0; + wait->presentTimingSupported = VK_FALSE; + wait->presentAtAbsoluteTimeSupported = VK_FALSE; + wait->presentAtRelativeTimeSupported = VK_FALSE; + break; + } + default: /* Ignored */ break; diff --git a/src/vulkan/wsi/wsi_common_metal.c b/src/vulkan/wsi/wsi_common_metal.c index 857de8959e1..2b9176c6719 100644 --- a/src/vulkan/wsi/wsi_common_metal.c +++ b/src/vulkan/wsi/wsi_common_metal.c @@ -139,6 +139,16 @@ wsi_metal_surface_get_capabilities2(VkIcdSurfaceBase *surface, break; } + case VK_STRUCTURE_TYPE_PRESENT_TIMING_SURFACE_CAPABILITIES_EXT: { + VkPresentTimingSurfaceCapabilitiesEXT *wait = (void *)ext; + + wait->presentStageQueries = 0; + wait->presentTimingSupported = VK_FALSE; + wait->presentAtAbsoluteTimeSupported = VK_FALSE; + wait->presentAtRelativeTimeSupported = VK_FALSE; + break; + } + default: /* Ignored */ break; diff --git a/src/vulkan/wsi/wsi_common_private.h b/src/vulkan/wsi/wsi_common_private.h index 9fe64cfba95..000c6d952c8 100644 --- a/src/vulkan/wsi/wsi_common_private.h +++ b/src/vulkan/wsi/wsi_common_private.h @@ -188,6 +188,29 @@ struct wsi_image { int dma_buf_fd; #endif void 
*cpu_map; + + VkQueryPool query_pool; + VkCommandBuffer *timestamp_cmd_buffers; +}; + +struct wsi_presentation_timing { + uint64_t present_id; + uint64_t target_time; + uint64_t serial; + uint64_t queue_done_time; /* GPU timestamp based. */ + uint64_t complete_time; /* Best effort timestamp we get from backend. */ + /* If we're rendering with IMMEDIATE, it's possible for images to IDLE long before they complete. + * In this case, we have to ensure that queue_done_time is sampled at QueuePresentKHR time + * before we recycle an image. */ + struct wsi_image *image; + VkPresentStageFlagsEXT requested_feedback; + VkBool32 complete; +}; + +struct wsi_image_timing_request { + uint64_t serial; + uint64_t time; + VkPresentTimingInfoFlagsEXT flags; }; struct wsi_swapchain { @@ -237,7 +260,28 @@ struct wsi_swapchain { struct vk_queue *queue; } blit; + struct { + mtx_t lock; + bool active; + + struct wsi_presentation_timing *timings; + size_t timings_capacity; + size_t timings_count; + + size_t serial; + + /* Maps to Vulkan spec definitions. */ + uint64_t refresh_duration; + uint64_t refresh_interval; + /* When 0, we don't know yet. Every time the refresh rate changes, + * increase this counter. This counter must also be passed in GetPastTimings. */ + uint64_t refresh_counter; + + VkTimeDomainKHR time_domain; + } present_timing; + bool capture_key_pressed; + float timestamp_period; /* Command pools, one per queue family */ VkCommandPool *cmd_pools; @@ -266,6 +310,10 @@ struct wsi_swapchain { VkPresentModeKHR mode); void (*set_hdr_metadata)(struct wsi_swapchain *swap_chain, const VkHdrMetadataEXT* pMetadata); + void (*set_timing_request)(struct wsi_swapchain *swap_chain, + const struct wsi_image_timing_request *request); + void (*poll_timing_request)(struct wsi_swapchain *swap_chain); + uint64_t (*poll_early_refresh)(struct wsi_swapchain *swap_chain, uint64_t *interval); }; bool @@ -369,6 +417,10 @@ wsi_create_image(const struct wsi_swapchain *chain, void wsi_image_init(struct wsi_image *image); +VkResult +wsi_image_init_timestamp(const struct wsi_swapchain *chain, + struct wsi_image *image); + void wsi_destroy_image(const struct wsi_swapchain *chain, struct wsi_image *image); @@ -377,6 +429,16 @@ VkResult wsi_swapchain_wait_for_present_semaphore(const struct wsi_swapchain *chain, uint64_t present_id, uint64_t timeout); +void +wsi_swapchain_present_timing_notify_completion(struct wsi_swapchain *chain, + uint64_t timing_serial, uint64_t timestamp, + struct wsi_image *image); + +void +wsi_swapchain_present_timing_update_refresh_rate(struct wsi_swapchain *chain, + uint64_t refresh_duration, uint64_t refresh_interval, + int minimum_delta_for_update); + #ifdef HAVE_LIBDRM VkResult wsi_prepare_signal_dma_buf_from_semaphore(struct wsi_swapchain *chain, diff --git a/src/vulkan/wsi/wsi_common_wayland.c b/src/vulkan/wsi/wsi_common_wayland.c index 083777f42ac..141ddf6fc45 100644 --- a/src/vulkan/wsi/wsi_common_wayland.c +++ b/src/vulkan/wsi/wsi_common_wayland.c @@ -254,6 +254,8 @@ struct wsi_wl_swapchain { bool has_hdr_metadata; } color; + struct wsi_image_timing_request timing_request; + struct wsi_wl_image images[0]; }; VK_DEFINE_NONDISP_HANDLE_CASTS(wsi_wl_swapchain, base.base, VkSwapchainKHR, @@ -1668,7 +1670,15 @@ wsi_GetPhysicalDeviceWaylandPresentationSupportKHR(VkPhysicalDevice physicalDevi struct wsi_wayland *wsi = (struct wsi_wayland *)wsi_device->wsi[VK_ICD_WSI_PLATFORM_WAYLAND]; - if (!(wsi_device->queue_supports_blit & BITFIELD64_BIT(queueFamilyIndex))) + /* These should overlap. 
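+    * "Overlap" meaning: on typical hardware every queue family that can
+    * blit can also write timestamps, so the AND below usually equals the
+    * blit mask itself; the fallback only matters where the masks diverge.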
*/ + uint64_t effective_queues = wsi_device->queue_supports_blit & wsi_device->queue_supports_timestamps; + + /* If there are no queues that support both blits and timestamps, + * don't report support for queue timestamps. */ + if (!effective_queues) + effective_queues = wsi_device->queue_supports_blit; + + if (!(effective_queues & BITFIELD64_BIT(queueFamilyIndex))) return false; struct wsi_wl_display display; @@ -1789,7 +1799,8 @@ wsi_wl_surface_get_capabilities(VkIcdSurfaceBase *icd_surface, static VkResult wsi_wl_surface_check_presentation(VkIcdSurfaceBase *icd_surface, struct wsi_device *wsi_device, - bool *has_wp_presentation) + bool *has_wp_presentation, clockid_t *clock_id, + bool *has_commit_timing, bool *has_fifo) { VkIcdSurfaceWayland *surface = (VkIcdSurfaceWayland *)icd_surface; struct wsi_wayland *wsi = @@ -1800,7 +1811,17 @@ wsi_wl_surface_check_presentation(VkIcdSurfaceBase *icd_surface, wsi_device->sw, "mesa check wp_presentation")) return VK_ERROR_SURFACE_LOST_KHR; - *has_wp_presentation = !!display.wp_presentation_notwrapped; + if (has_wp_presentation) + *has_wp_presentation = !!display.wp_presentation_notwrapped; + + if (clock_id) + *clock_id = display.presentation_clock_id; + + if (has_commit_timing) + *has_commit_timing = !!display.commit_timing_manager; + + if (has_fifo) + *has_fifo = !!display.fifo_manager; wsi_wl_display_finish(&display); @@ -1893,7 +1914,7 @@ wsi_wl_surface_get_capabilities2(VkIcdSurfaceBase *surface, bool has_feedback; result = wsi_wl_surface_check_presentation(surface, wsi_device, - &has_feedback); + &has_feedback, NULL, NULL, NULL); if (result != VK_SUCCESS) return result; @@ -1906,7 +1927,7 @@ wsi_wl_surface_get_capabilities2(VkIcdSurfaceBase *surface, bool has_feedback; result = wsi_wl_surface_check_presentation(surface, wsi_device, - &has_feedback); + &has_feedback, NULL, NULL, NULL); if (result != VK_SUCCESS) return result; @@ -1914,6 +1935,50 @@ wsi_wl_surface_get_capabilities2(VkIcdSurfaceBase *surface, break; } + case VK_STRUCTURE_TYPE_PRESENT_TIMING_SURFACE_CAPABILITIES_EXT: { + VkPresentTimingSurfaceCapabilitiesEXT *wait = (void *)ext; + bool has_feedback, has_commit_timing, has_fifo; + + wait->presentStageQueries = 0; + wait->presentTimingSupported = VK_FALSE; + wait->presentAtAbsoluteTimeSupported = VK_FALSE; + wait->presentAtRelativeTimeSupported = VK_FALSE; + + clockid_t clock_id; + + result = wsi_wl_surface_check_presentation(surface, wsi_device, + &has_feedback, &clock_id, + &has_commit_timing, &has_fifo); + + if (result != VK_SUCCESS) + return result; + + if (!has_feedback) + break; + + /* We could deal with esoteric clock domains by exposing VK_TIME_DOMAIN_SWAPCHAIN or PRESENT_STAGE_LOCAL, + * but that requires a lot more scaffolding, and there's no need to add extra complexity if we can + * get away with this. */ + if (clock_id != CLOCK_MONOTONIC && clock_id != CLOCK_MONOTONIC_RAW) + break; + + /* Presentation timing spec talks about the reported time targeting "pixel being visible". + * From presentation-time spec: "Note, that if the display path has a non-zero latency, + * the time instant specified by this counter may differ from the timestamp's." + * No compositor I know of reports where it takes display latency into account, + * so it's a little unclear if we should actually be reporting PIXEL_OUT or PIXEL_VISIBLE. + * Choose PIXEL_OUT for now since no known compositor out there actually implements + * PIXEL_VISIBLE as intended, and we don't want to promise something we cannot hold. 
*/ + wait->presentTimingSupported = VK_TRUE; + wait->presentStageQueries = VK_PRESENT_STAGE_IMAGE_FIRST_PIXEL_OUT_BIT_EXT; + + /* We cannot reliably implement FIFO guarantee + absolute time without the FIFO barrier. + * Presentation timing is only defined to work with FIFO (and its variants like RELAXED and LATEST_READY). */ + wait->presentAtAbsoluteTimeSupported = has_commit_timing && has_fifo; + + break; + } + default: /* Ignored */ break; @@ -2404,6 +2469,7 @@ struct wsi_wl_present_id { * which uses frame callback to signal DRI3 COMPLETE. */ struct wl_callback *frame; uint64_t present_id; + uint64_t timing_serial; struct mesa_trace_flow flow; uint64_t submission_time; const VkAllocationCallbacks *alloc; @@ -2411,6 +2477,8 @@ struct wsi_wl_present_id { uint64_t target_time; uint64_t correction; struct wl_list link; + struct wsi_image *img; + bool user_target_time; }; static struct wsi_image * @@ -2441,6 +2509,14 @@ wsi_wl_swapchain_set_present_mode(struct wsi_swapchain *wsi_chain, chain->base.present_mode = mode; } +static void +wsi_wl_swapchain_set_timing_request(struct wsi_swapchain *wsi_chain, + const struct wsi_image_timing_request *request) +{ + struct wsi_wl_swapchain *chain = (struct wsi_wl_swapchain *)wsi_chain; + chain->timing_request = *request; +} + static VkResult dispatch_present_id_queue(struct wsi_swapchain *wsi_chain, struct timespec *end_time) { @@ -2514,6 +2590,15 @@ dispatch_present_id_queue(struct wsi_swapchain *wsi_chain, struct timespec *end_ return VK_SUCCESS; } +static void +wsi_wl_swapchain_poll_timing_request(struct wsi_swapchain *wsi_chain) +{ + /* Timing requests must complete in finite time, and if we're not calling present wait + * or queue present regularly, timing requests will never come back. */ + struct timespec instant = {0}; + dispatch_present_id_queue(wsi_chain, &instant); +} + static bool wsi_wl_swapchain_present_id_completes_in_finite_time_locked(struct wsi_wl_swapchain *chain, uint64_t present_id) @@ -2794,16 +2879,13 @@ wsi_wl_swapchain_acquire_next_image_implicit(struct wsi_swapchain *wsi_chain, } static void -wsi_wl_presentation_update_present_id(struct wsi_wl_present_id *id) +wsi_wl_presentation_update_present_id_locked(struct wsi_wl_present_id *id) { - mtx_lock(&id->chain->present_ids.lock); id->chain->present_ids.outstanding_count--; if (id->present_id > id->chain->present_ids.max_completed) id->chain->present_ids.max_completed = id->present_id; id->chain->present_ids.display_time_correction -= id->correction; - mtx_unlock(&id->chain->present_ids.lock); - vk_free(id->alloc, id); } static void @@ -2815,6 +2897,20 @@ presentation_handle_presented(void *data, struct wsi_wl_swapchain *chain = id->chain; uint64_t target_time = id->target_time; + /* In v1 of presentation time, we can know if we're likely running VRR, given refresh is 0. + * However, we cannot know what the base refresh rate is without some kind of external information. + * We also cannot know if we're actually driving the display in a VRR fashion. + * In v2, we should always know the "base refresh" rate, but that means we cannot know if we're driving + * the display VRR or FRR. We could try to deduce it based on timestamps, but that is too brittle. + * There is a v3 proposal that adds this information more formally so we don't have to guess. + * Knowing VRR or FRR is not mission critical for most use cases, so just report "Unknown" for now. 
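+    * ("Unknown" maps to passing refresh_interval = 0 in the update call
+    * below, the same convention the X11 and display backends use when
+    * they cannot tell FRR from VRR.)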
*/ + wsi_swapchain_present_timing_update_refresh_rate(&chain->base, refresh, 0, 0); + + /* Notify this before present wait to reduce latency of presentation timing requests + * if the application is driving its queries based off present waits. */ + if (id->timing_serial) + wsi_swapchain_present_timing_notify_completion(&chain->base, id->timing_serial, presentation_time, id->img); + mtx_lock(&chain->present_ids.lock); chain->present_ids.refresh_nsec = refresh; if (!chain->present_ids.valid_refresh_nsec) { @@ -2826,13 +2922,16 @@ presentation_handle_presented(void *data, if (presentation_time > chain->present_ids.displayed_time) chain->present_ids.displayed_time = presentation_time; - if (target_time && presentation_time > target_time) + /* If we have user-defined target time it can be arbitrarily early, and we don't + * want to start compensating for that error if application stops requesting specific time. */ + if (!id->user_target_time && target_time && presentation_time > target_time) chain->present_ids.display_time_error = presentation_time - target_time; else chain->present_ids.display_time_error = 0; - mtx_unlock(&chain->present_ids.lock); - wsi_wl_presentation_update_present_id(id); + wsi_wl_presentation_update_present_id_locked(id); + mtx_unlock(&chain->present_ids.lock); + vk_free(id->alloc, id); } static void @@ -2841,6 +2940,15 @@ presentation_handle_discarded(void *data) struct wsi_wl_present_id *id = data; struct wsi_wl_swapchain *chain = id->chain; + /* From Vulkan spec: + * "Timing information for some present stages may have a time value of 0, + * indicating that results for that present stage are not available." + * Worst case we can simply take a timestamp of clock_id and pretend, but + * applications may start to latch onto that timestamp as ground truth, which + * is obviously not correct. */ + if (id->timing_serial) + wsi_swapchain_present_timing_notify_completion(&chain->base, id->timing_serial, 0, id->img); + mtx_lock(&chain->present_ids.lock); if (!chain->present_ids.valid_refresh_nsec) { /* We've started occluded, so make up some safe values to throttle us */ @@ -2849,9 +2957,10 @@ presentation_handle_discarded(void *data) chain->present_ids.refresh_nsec = 16666666; chain->present_ids.valid_refresh_nsec = true; } - mtx_unlock(&chain->present_ids.lock); - wsi_wl_presentation_update_present_id(id); + wsi_wl_presentation_update_present_id_locked(id); + mtx_unlock(&chain->present_ids.lock); + vk_free(id->alloc, id); } static void @@ -2870,9 +2979,10 @@ presentation_frame_handle_done(void *data, struct wl_callback *callback, uint32_ mtx_lock(&chain->present_ids.lock); wl_list_remove(&id->link); - mtx_unlock(&chain->present_ids.lock); - wsi_wl_presentation_update_present_id(id); + wsi_wl_presentation_update_present_id_locked(id); + mtx_unlock(&chain->present_ids.lock); + vk_free(id->alloc, id); wl_callback_destroy(callback); } @@ -2895,6 +3005,29 @@ static const struct wl_callback_listener frame_listener = { frame_handle_done, }; +static bool +set_application_driven_timestamp(struct wsi_wl_swapchain *chain, + uint64_t *timestamp, + uint64_t *correction) +{ + if (chain->timing_request.serial && chain->timing_request.time) { + /* Absolute time is requested before we have been able to report a reasonable refresh rate + * to application. This is valid, but we should not try to perform any rounding. + * NEAREST_REFRESH_CYCLE flag cannot be honored because it's impossible to know at this time. 
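+    * (Note on the call below: wp_commit_timer_v1 takes the seconds field
+    * split into hi/lo 32-bit halves, hence tv_sec >> 32 plus the
+    * implicitly truncated tv_sec as the low word.)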
*/ + struct timespec target_ts; + timespec_from_nsec(&target_ts, chain->timing_request.time); + wp_commit_timer_v1_set_timestamp(chain->commit_timer, + (uint64_t)target_ts.tv_sec >> 32, target_ts.tv_sec, + target_ts.tv_nsec); + *timestamp = chain->timing_request.time; + *correction = 0; + chain->present_ids.last_target_time = chain->timing_request.time; + return true; + } else { + return false; + } +} + /* The present_ids lock must be held */ static bool set_timestamp(struct wsi_wl_swapchain *chain, @@ -2908,7 +3041,7 @@ set_timestamp(struct wsi_wl_swapchain *chain, int32_t error = 0; if (!chain->present_ids.valid_refresh_nsec) - return false; + return set_application_driven_timestamp(chain, timestamp, correction); displayed_time = chain->present_ids.displayed_time; refresh = chain->present_ids.refresh_nsec; @@ -2918,7 +3051,7 @@ set_timestamp(struct wsi_wl_swapchain *chain, * timestamps at all, so bail out. */ if (!refresh) - return false; + return set_application_driven_timestamp(chain, timestamp, correction); /* We assume we're being fed at the display's refresh rate, but * if that doesn't happen our timestamps fall into the past. @@ -2936,6 +3069,10 @@ set_timestamp(struct wsi_wl_swapchain *chain, error = chain->present_ids.display_time_error - chain->present_ids.display_time_correction; + /* If we're driving timestamps from application, this is somewhat redundant + * but it will drain out any accumulated display_time_error over time. + * Accumulated errors are expected since application might not + * align the target time perfectly against a refresh cycle. */ target = chain->present_ids.last_target_time; if (error > 0) { target += (error / refresh) * refresh; @@ -2945,19 +3082,41 @@ set_timestamp(struct wsi_wl_swapchain *chain, } chain->present_ids.display_time_correction += *correction; - target = next_phase_locked_time(displayed_time, - refresh, - target); - /* Take back 500 us as a safety margin, to ensure we don't miss our - * target due to round-off error. - */ - timespec_from_nsec(&target_ts, target - 500000); + + if (chain->timing_request.serial && chain->timing_request.time) { + target = chain->timing_request.time; + chain->present_ids.last_target_time = target; + *timestamp = target; + + if (chain->timing_request.flags & VK_PRESENT_TIMING_INFO_PRESENT_AT_NEAREST_REFRESH_CYCLE_BIT_EXT) + target -= chain->present_ids.refresh_nsec / 2; + + /* Without the flag, the application is supposed to deal with any safety margins on its own. */ + timespec_from_nsec(&target_ts, target); + + /* If we're using commit timing path, we always have FIFO protocol, so we don't have to + * consider scenarios where application is passing a very low present time. + * I.e., there is no need to max() the application timestamp against our estimated next refresh cycle. + * If the surface is occluded, it's possible to render at a higher rate than display refresh rate, + * but that's okay. Those presents will be discarded anyway, and we won't report odd timestamps to application. */ + } else { + target = next_phase_locked_time(displayed_time, + refresh, + target); + + chain->present_ids.last_target_time = target; + *timestamp = target; + + /* Take back 500 us as a safety margin, to ensure we don't miss our + * target due to round-off error. 
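+       * (So a computed target of T is programmed as T - 500000 ns; at
+       * 60 Hz that is ~3% of a refresh cycle, which is cheap insurance
+       * against slipping a whole cycle late.)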
+ */ + timespec_from_nsec(&target_ts, target - 500000); + } + wp_commit_timer_v1_set_timestamp(chain->commit_timer, (uint64_t)target_ts.tv_sec >> 32, target_ts.tv_sec, target_ts.tv_nsec); - chain->present_ids.last_target_time = target; - *timestamp = target; return true; } @@ -3059,13 +3218,16 @@ wsi_wl_swapchain_queue_present(struct wsi_swapchain *wsi_chain, } if (present_id > 0 || (mode_fifo && chain->commit_timer) || - util_perfetto_is_tracing_enabled()) { + util_perfetto_is_tracing_enabled() || chain->timing_request.serial) { struct wsi_wl_present_id *id = vk_zalloc(chain->wsi_wl_surface->display->wsi_wl->alloc, sizeof(*id), sizeof(uintptr_t), VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); id->chain = chain; id->present_id = present_id; id->alloc = chain->wsi_wl_surface->display->wsi_wl->alloc; + id->timing_serial = chain->timing_request.serial; + id->img = &chain->images[image_index].base; + id->user_target_time = chain->timing_request.time != 0; mtx_lock(&chain->present_ids.lock); @@ -3193,6 +3355,8 @@ wsi_wl_swapchain_queue_present(struct wsi_swapchain *wsi_chain, wsi_wl_surface->display->queue); } + memset(&chain->timing_request, 0, sizeof(chain->timing_request)); + return VK_SUCCESS; } @@ -3427,6 +3591,20 @@ wsi_wl_swapchain_destroy(struct wsi_swapchain *wsi_chain, return VK_SUCCESS; } +static VkTimeDomainKHR +clock_id_to_vk_time_domain(clockid_t id) +{ + switch (id) { + case CLOCK_MONOTONIC: + return VK_TIME_DOMAIN_CLOCK_MONOTONIC_KHR; + case CLOCK_MONOTONIC_RAW: + return VK_TIME_DOMAIN_CLOCK_MONOTONIC_RAW_KHR; + default: + /* Default fallback. Will not be used. */ + return VK_TIME_DOMAIN_DEVICE_KHR; + } +} + static VkResult wsi_wl_surface_create_swapchain(VkIcdSurfaceBase *icd_surface, VkDevice device, @@ -3605,6 +3783,12 @@ wsi_wl_surface_create_swapchain(VkIcdSurfaceBase *icd_surface, chain->base.queue_present = wsi_wl_swapchain_queue_present; chain->base.release_images = wsi_wl_swapchain_release_images; chain->base.set_present_mode = wsi_wl_swapchain_set_present_mode; + chain->base.set_timing_request = wsi_wl_swapchain_set_timing_request; + chain->base.poll_timing_request = wsi_wl_swapchain_poll_timing_request; + if (pCreateInfo->flags & VK_SWAPCHAIN_CREATE_PRESENT_TIMING_BIT_EXT) { + chain->base.present_timing.time_domain = + clock_id_to_vk_time_domain(wsi_wl_surface->display->presentation_clock_id); + } chain->base.wait_for_present = wsi_wl_swapchain_wait_for_present; chain->base.wait_for_present2 = wsi_wl_swapchain_wait_for_present2; chain->base.present_mode = present_mode; diff --git a/src/vulkan/wsi/wsi_common_win32.cpp b/src/vulkan/wsi/wsi_common_win32.cpp index 03b84771b36..df41fd967c4 100644 --- a/src/vulkan/wsi/wsi_common_win32.cpp +++ b/src/vulkan/wsi/wsi_common_win32.cpp @@ -276,6 +276,16 @@ wsi_win32_surface_get_capabilities2(VkIcdSurfaceBase *surface, break; } + case VK_STRUCTURE_TYPE_PRESENT_TIMING_SURFACE_CAPABILITIES_EXT: { + VkPresentTimingSurfaceCapabilitiesEXT *wait = (VkPresentTimingSurfaceCapabilitiesEXT *)ext; + + wait->presentStageQueries = 0; + wait->presentTimingSupported = VK_FALSE; + wait->presentAtAbsoluteTimeSupported = VK_FALSE; + wait->presentAtRelativeTimeSupported = VK_FALSE; + break; + } + default: /* Ignored */ break; diff --git a/src/vulkan/wsi/wsi_common_x11.c b/src/vulkan/wsi/wsi_common_x11.c index 544ddceca1d..329e745184c 100644 --- a/src/vulkan/wsi/wsi_common_x11.c +++ b/src/vulkan/wsi/wsi_common_x11.c @@ -64,6 +64,7 @@ #include "wsi_common_entrypoints.h" #include "wsi_common_private.h" #include "wsi_common_queue.h" +#include 
"loader/loader_dri_helper_screen.h" #ifdef HAVE_SYS_SHM_H #include @@ -79,7 +80,14 @@ #define MAX_DAMAGE_RECTS 64 -struct wsi_x11_connection { +struct x11_icd_surface_key { + xcb_connection_t *conn; + xcb_window_t window; + uint32_t padding; /* Makes struct memcmp compatible. */ +}; + +struct wsi_x11_icd_surface { + struct x11_icd_surface_key key; bool has_dri3; bool has_dri3_modifiers; bool has_dri3_explicit_sync; @@ -88,13 +96,80 @@ struct wsi_x11_connection { bool is_xwayland; bool has_mit_shm; bool has_xfixes; + + struct loader_screen_resources screen_resources; + bool screen_resources_valid; + mtx_t mtx; + + /* This holds the fallback for MSC rate, i.e. refresh rate. + * If we cannot get ahold of a stable estimate based on real feedback, + * we defer to using this. With multi-monitors and other potential effects affecting actual rates, + * we shouldn't trust this blindly. */ + uint64_t current_refresh_ns; }; +static uint64_t +x11_icd_surface_update_present_timing(struct wsi_x11_icd_surface *surface, uint32_t width, uint32_t height) +{ + uint64_t ret; + + if (!surface->screen_resources_valid) + return 0; + + mtx_lock(&surface->mtx); + loader_update_screen_resources(&surface->screen_resources); + + if (surface->screen_resources.num_crtcs == 0) { + surface->current_refresh_ns = 0; + goto out; + } + + surface->current_refresh_ns = + 1000000000ull * surface->screen_resources.crtcs[0].refresh_denominator / + surface->screen_resources.crtcs[0].refresh_numerator; + + /* Don't need to ponder multi-monitor. */ + if (surface->screen_resources.num_crtcs == 1) + goto out; + + /* Find the best matching screen for the window. */ + xcb_translate_coordinates_cookie_t cookie = + xcb_translate_coordinates_unchecked(surface->key.conn, surface->key.window, + surface->screen_resources.screen->root, 0, 0); + xcb_translate_coordinates_reply_t *reply = + xcb_translate_coordinates_reply(surface->key.conn, cookie, NULL); + + if (!reply) + goto out; + + int area = 0; + + for (unsigned c = 0; c < surface->screen_resources.num_crtcs; c++) { + struct loader_crtc_info *crtc = &surface->screen_resources.crtcs[c]; + + int c_area = box_intersection_area( + reply->dst_x, reply->dst_y, width, height, crtc->x, + crtc->y, crtc->width, crtc->height); + + if (c_area > area) { + surface->current_refresh_ns = 1000000000ull * crtc->refresh_denominator / crtc->refresh_numerator; + area = c_area; + } + } + + free(reply); + +out: + ret = surface->current_refresh_ns; + mtx_unlock(&surface->mtx); + return ret; +} + struct wsi_x11 { struct wsi_interface base; mtx_t mutex; - /* Hash table of xcb_connection -> wsi_x11_connection mappings */ + /* Hash table of xcb_connection -> wsi_x11_icd_surface mappings */ struct hash_table *connections; }; @@ -224,9 +299,9 @@ wsi_x11_detect_xwayland(xcb_connection_t *conn, return is_xwayland; } -static struct wsi_x11_connection * -wsi_x11_connection_create(struct wsi_device *wsi_dev, - xcb_connection_t *conn) +static struct wsi_x11_icd_surface * +wsi_x11_icd_surface_create(struct wsi_device *wsi_dev, + xcb_connection_t *conn, xcb_window_t window) { xcb_query_extension_cookie_t dri3_cookie, pres_cookie, randr_cookie, amd_cookie, nv_cookie, shm_cookie, sync_cookie, @@ -241,16 +316,19 @@ wsi_x11_connection_create(struct wsi_device *wsi_dev, bool has_dri3_v1_4 = false; bool has_present_v1_4 = false; - /* wsi_x11_get_connection may be called from a thread, but we will never end up here on a worker thread, + /* wsi_x11_get_icd_surface may be called from a thread, but we will never end up here on a worker 
thread, * since the connection will always be in the hash-map, * so we will not violate Vulkan's rule on allocation callbacks w.r.t. * when it is allowed to call the allocation callbacks. */ - struct wsi_x11_connection *wsi_conn = - vk_alloc(&wsi_dev->instance_alloc, sizeof(*wsi_conn), 8, + struct wsi_x11_icd_surface *wsi_conn = + vk_zalloc(&wsi_dev->instance_alloc, sizeof(*wsi_conn), 8, VK_SYSTEM_ALLOCATION_SCOPE_INSTANCE); if (!wsi_conn) return NULL; + wsi_conn->key.conn = conn; + wsi_conn->key.window = window; + sync_cookie = xcb_query_extension(conn, 4, "SYNC"); dri3_cookie = xcb_query_extension(conn, 4, "DRI3"); pres_cookie = xcb_query_extension(conn, 7, "Present"); @@ -378,6 +456,27 @@ wsi_x11_connection_create(struct wsi_device *wsi_dev, } #endif + if (window) { + /* This state is only necessary for dealing with present timing, and if we fail, we simply won't expose support. */ + xcb_get_geometry_cookie_t geometry_cookie = xcb_get_geometry_unchecked(conn, window); + xcb_get_geometry_reply_t *geometry_reply = xcb_get_geometry_reply(conn, geometry_cookie, NULL); + if (geometry_reply) { + xcb_screen_iterator_t it = xcb_setup_roots_iterator(xcb_get_setup(conn)); + xcb_screen_t *screen; + + for (screen = it.data; it.rem != 0; xcb_screen_next(&it), screen = it.data) { + if (screen->root == geometry_reply->root) { + loader_init_screen_resources(&wsi_conn->screen_resources, conn, screen); + wsi_conn->screen_resources_valid = true; + mtx_init(&wsi_conn->mtx, 0); + break; + } + } + + free(geometry_reply); + } + } + free(dri3_reply); free(pres_reply); free(randr_reply); @@ -392,14 +491,18 @@ wsi_x11_connection_create(struct wsi_device *wsi_dev, } static void -wsi_x11_connection_destroy(struct wsi_device *wsi_dev, - struct wsi_x11_connection *conn) +wsi_x11_icd_surface_destroy(struct wsi_device *wsi_dev, + struct wsi_x11_icd_surface *conn) { + if (conn->screen_resources_valid) { + loader_destroy_screen_resources(&conn->screen_resources); + mtx_destroy(&conn->mtx); + } vk_free(&wsi_dev->instance_alloc, conn); } static bool -wsi_x11_check_for_dri3(struct wsi_x11_connection *wsi_conn) +wsi_x11_check_for_dri3(struct wsi_x11_icd_surface *wsi_conn) { if (wsi_conn->has_dri3) return true; @@ -418,35 +521,37 @@ wsi_x11_check_for_dri3(struct wsi_x11_connection *wsi_conn) * * If the allocation fails NULL is returned. */ -static struct wsi_x11_connection * -wsi_x11_get_connection(struct wsi_device *wsi_dev, - xcb_connection_t *conn) +static struct wsi_x11_icd_surface * +wsi_x11_get_icd_surface(struct wsi_device *wsi_dev, + xcb_connection_t *conn, xcb_window_t window) { struct wsi_x11 *wsi = (struct wsi_x11 *)wsi_dev->wsi[VK_ICD_WSI_PLATFORM_XCB]; mtx_lock(&wsi->mutex); - struct hash_entry *entry = _mesa_hash_table_search(wsi->connections, conn); + struct x11_icd_surface_key key = { .conn = conn, .window = window }; + + struct hash_entry *entry = _mesa_hash_table_search(wsi->connections, &key); if (!entry) { /* We're about to make a bunch of blocking calls. Let's drop the * mutex for now so we don't block up too badly. 
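       * If another thread creates the same (connection, window) entry
       * while we are unlocked, we detect that after re-locking and destroy
       * our copy; see the "raced us" path below.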
*/ mtx_unlock(&wsi->mutex); - struct wsi_x11_connection *wsi_conn = - wsi_x11_connection_create(wsi_dev, conn); + struct wsi_x11_icd_surface *wsi_conn = + wsi_x11_icd_surface_create(wsi_dev, conn, window); if (!wsi_conn) return NULL; mtx_lock(&wsi->mutex); - entry = _mesa_hash_table_search(wsi->connections, conn); + entry = _mesa_hash_table_search(wsi->connections, &wsi_conn->key); if (entry) { /* Oops, someone raced us to it */ - wsi_x11_connection_destroy(wsi_dev, wsi_conn); + wsi_x11_icd_surface_destroy(wsi_dev, wsi_conn); } else { - entry = _mesa_hash_table_insert(wsi->connections, conn, wsi_conn); + entry = _mesa_hash_table_insert(wsi->connections, &wsi_conn->key, wsi_conn); } } @@ -590,11 +695,20 @@ wsi_GetPhysicalDeviceXcbPresentationSupportKHR(VkPhysicalDevice physicalDevice, { VK_FROM_HANDLE(vk_physical_device, pdevice, physicalDevice); struct wsi_device *wsi_device = pdevice->wsi_device; - if (!(wsi_device->queue_supports_blit & BITFIELD64_BIT(queueFamilyIndex))) + + /* These should overlap. */ + uint64_t effective_queues = wsi_device->queue_supports_blit & wsi_device->queue_supports_timestamps; + + /* If there are no queues that support both blits and timestamps, + * don't report support for queue timestamps. */ + if (!effective_queues) + effective_queues = wsi_device->queue_supports_blit; + + if (!(effective_queues & BITFIELD64_BIT(queueFamilyIndex))) return false; - struct wsi_x11_connection *wsi_conn = - wsi_x11_get_connection(wsi_device, connection); + struct wsi_x11_icd_surface *wsi_conn = + wsi_x11_get_icd_surface(wsi_device, connection, 0); if (!wsi_conn) return false; @@ -669,8 +783,8 @@ x11_surface_get_support(VkIcdSurfaceBase *icd_surface, xcb_connection_t *conn = x11_surface_get_connection(icd_surface); xcb_window_t window = x11_surface_get_window(icd_surface); - struct wsi_x11_connection *wsi_conn = - wsi_x11_get_connection(wsi_device, conn); + struct wsi_x11_icd_surface *wsi_conn = + wsi_x11_get_icd_surface(wsi_device, conn, window); if (!wsi_conn) return VK_ERROR_OUT_OF_HOST_MEMORY; @@ -722,7 +836,7 @@ x11_get_min_image_count(const struct wsi_device *wsi_device, bool is_xwayland) static unsigned x11_get_min_image_count_for_present_mode(struct wsi_device *wsi_device, - struct wsi_x11_connection *wsi_conn, + struct wsi_x11_icd_surface *wsi_conn, VkPresentModeKHR present_mode); static VkResult @@ -734,8 +848,8 @@ x11_surface_get_capabilities(VkIcdSurfaceBase *icd_surface, xcb_connection_t *conn = x11_surface_get_connection(icd_surface); xcb_window_t window = x11_surface_get_window(icd_surface); struct wsi_x11_vk_surface *surface = (struct wsi_x11_vk_surface*)icd_surface; - struct wsi_x11_connection *wsi_conn = - wsi_x11_get_connection(wsi_device, conn); + struct wsi_x11_icd_surface *wsi_conn = + wsi_x11_get_icd_surface(wsi_device, conn, window); xcb_get_geometry_cookie_t geom_cookie; xcb_generic_error_t *err; xcb_get_geometry_reply_t *geom; @@ -863,6 +977,52 @@ x11_surface_get_capabilities2(VkIcdSurfaceBase *icd_surface, break; } + case VK_STRUCTURE_TYPE_PRESENT_TIMING_SURFACE_CAPABILITIES_EXT: { + VkPresentTimingSurfaceCapabilitiesEXT *wait = (void *)ext; + + xcb_connection_t *conn = x11_surface_get_connection(icd_surface); + xcb_window_t window = x11_surface_get_window(icd_surface); + struct wsi_x11_icd_surface *wsi_conn = wsi_x11_get_icd_surface(wsi_device, conn, window); + + wait->presentStageQueries = 0; + wait->presentTimingSupported = VK_FALSE; + wait->presentAtAbsoluteTimeSupported = VK_FALSE; + wait->presentAtRelativeTimeSupported = VK_FALSE; + + /* If we 
cannot query modes for a screen, it's not possible to get reliable timings. */ + if (!wsi_conn->screen_resources_valid) + break; + + wait->presentTimingSupported = VK_TRUE; + + if (wsi_conn->is_xwayland) { + /* Wayland COMPLETE is tied to fence callback, so that's what we'll report. + * For pure frame pacing support, this is likely fine. */ + wait->presentStageQueries = VK_PRESENT_STAGE_REQUEST_DEQUEUED_BIT_EXT; + + /* Xwayland cannot get a reliable refresh rate estimate since MSC is not tied to monitor refresh at all. + * However, it's pragmatically very important to expose some baseline Xwl support since + * a large amount of applications (mostly games) rely on X11 APIs. + * + * Relative timings are easier to deal with since errors against an absolute timer are more or less expected, + * and it's sufficient for implementing present intervals in GL/D3D, etc, but likely not for + * tight A/V sync in e.g. media players, but those should be using Wayland when available anyway. + * As per-spec the timing request we provide should correlate with PIXEL_VISIBLE_BIT stage, + * but when we only observe dequeue, that's not really possible, but relative timings don't have that problem. + * + * There is PRESENT_CAPABILITY_UST, which would help, but xserver does not implement it at all. + */ + wait->presentAtRelativeTimeSupported = VK_TRUE; + } else { + /* COMPLETE should be tied to page flip on native X11. */ + wait->presentStageQueries = VK_PRESENT_STAGE_IMAGE_FIRST_PIXEL_OUT_BIT_EXT; + wait->presentAtAbsoluteTimeSupported = VK_TRUE; + wait->presentAtRelativeTimeSupported = VK_TRUE; + } + + break; + } + default: /* Ignored */ break; @@ -1092,6 +1252,7 @@ wsi_CreateXlibSurfaceKHR(VkInstance _instance, struct x11_image_pending_completion { uint32_t serial; uint64_t signal_present_id; + uint64_t timing_serial; }; struct x11_image { @@ -1108,6 +1269,7 @@ struct x11_image { VkPresentModeKHR present_mode; xcb_rectangle_t rects[MAX_DAMAGE_RECTS]; int rectangle_count; + struct wsi_image_timing_request timing_request; /* In IMMEDIATE and MAILBOX modes, we can have multiple pending presentations per image. * We need to keep track of them when considering present ID. 
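    * The same bookkeeping now also carries the present-timing serial, so a
    * COMPLETE event can be matched back to the timing slot it belongs to.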
*/ @@ -1125,12 +1287,19 @@ struct x11_image { #endif }; +struct x11_present_timing_entry { + uint64_t msc; + uint64_t ust; +}; +#define X11_SWAPCHAIN_REFRESH_RATE_WINDOW_SIZE 16 + struct x11_swapchain { struct wsi_swapchain base; bool has_dri3_modifiers; bool has_mit_shm; bool has_async_may_tear; + bool has_reliable_msc; xcb_connection_t * conn; xcb_window_t window; @@ -1144,9 +1313,13 @@ struct x11_swapchain { xcb_special_event_t * special_event; uint64_t send_sbc; uint64_t last_present_msc; + uint64_t next_present_ust_lower_bound; uint32_t stamp; uint32_t sent_image_count; + struct x11_present_timing_entry present_timing_window[X11_SWAPCHAIN_REFRESH_RATE_WINDOW_SIZE]; + uint32_t present_timing_window_index; + atomic_int status; bool copy_is_suboptimal; struct wsi_queue present_queue; @@ -1168,14 +1341,121 @@ struct x11_swapchain { uint64_t present_id; VkResult present_progress_error; + struct wsi_image_timing_request timing_request; + bool msc_estimate_is_stable; + struct x11_image images[0]; }; VK_DEFINE_NONDISP_HANDLE_CASTS(x11_swapchain, base.base, VkSwapchainKHR, VK_OBJECT_TYPE_SWAPCHAIN_KHR) -static void x11_present_complete(struct x11_swapchain *swapchain, - struct x11_image *image, uint32_t index) +static bool x11_refresh_rate_estimate_is_stable(struct x11_swapchain *swapchain, uint64_t base_rate) { + /* Only accept a refresh rate estimate if it's *very* stable. + * Keith's old GOOGLE_display_timing MR suggests that using this estimate is better than blindly + * accepting the modeline in some cases. + * When running in VRR modes, the MSC will appear to be highly unstable, and we cannot accept those estimates. */ + + for (int i = 0; i < X11_SWAPCHAIN_REFRESH_RATE_WINDOW_SIZE; i++) { + const struct x11_present_timing_entry *a = + &swapchain->present_timing_window[i]; + const struct x11_present_timing_entry *b = + &swapchain->present_timing_window[(i + 1) % X11_SWAPCHAIN_REFRESH_RATE_WINDOW_SIZE]; + + if (!a->msc || !b->msc) + continue; + + uint64_t ust_delta = MAX2(a->ust, b->ust) - MIN2(a->ust, b->ust); + uint64_t msc_delta = MAX2(a->msc, b->msc) - MIN2(a->msc, b->msc); + + if (msc_delta == 0) + continue; + + uint64_t refresh_ns = 1000 * ust_delta / msc_delta; + + /* The true UST values are expected to be quite accurate. + * Anything more than 10us difference in rate is considered unstable. + * If the MSC is driven by GPU progress in VRR mode, + * it's extremely unlikely that they are paced *perfectly* for 16 frames in a row. */ + if (llabs((int64_t)base_rate - (int64_t)refresh_ns) > 10000) + return false; + } + + return true; +} + +static void x11_present_update_refresh_cycle_estimate(struct x11_swapchain *swapchain, + uint64_t msc, uint64_t ust) +{ + struct wsi_x11_icd_surface *surface = wsi_x11_get_icd_surface( + (struct wsi_device*)swapchain->base.wsi, swapchain->conn, swapchain->window); + + mtx_lock(&surface->mtx); + uint64_t randr_refresh_ns = surface->current_refresh_ns; + mtx_unlock(&surface->mtx); + + swapchain->present_timing_window_index = + (swapchain->present_timing_window_index + 1) % X11_SWAPCHAIN_REFRESH_RATE_WINDOW_SIZE; + struct x11_present_timing_entry *entry = &swapchain->present_timing_window[swapchain->present_timing_window_index]; + + if (!swapchain->has_reliable_msc) { + /* If we don't have reliable MSC, we always trust the fallback RANDR query. + * We have no idea if we're FRR or VRR. 
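+       * Hence refresh_interval = 0 ("unknown") in the update below, while
+       * the RANDR modeline still supplies a usable refresh_duration.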
*/ + wsi_swapchain_present_timing_update_refresh_rate(&swapchain->base, randr_refresh_ns, 0, 0); + entry->msc = msc; + entry->ust = ust; + return; + } + + /* Try to get an initial estimate as quickly as possible, we will refine it over time. */ + if (entry->msc == 0) + entry = &swapchain->present_timing_window[1]; + + if (entry->msc != 0) { + uint64_t msc_delta = msc - entry->msc; + + /* Safeguard against any weird interactions with IMMEDIATE. */ + if (msc_delta != 0) { + uint64_t ust_delta = 1000 * (ust - entry->ust); + uint64_t refresh_ns = ust_delta / msc_delta; + + swapchain->msc_estimate_is_stable = x11_refresh_rate_estimate_is_stable(swapchain, refresh_ns); + + if (swapchain->msc_estimate_is_stable) { + /* If MSC is tightly locked in, we can safely make the assumption we're in FRR mode. + * It's possible we're technically doing VRR, but if we're rendering at above monitor refresh + * rate consistently, then there is no meaningful difference anyway. */ + + /* Our refresh rates are only estimates, so expect some deviation (+/- 1us). */ + wsi_swapchain_present_timing_update_refresh_rate(&swapchain->base, refresh_ns, refresh_ns, 1000); + } else { + /* If we have enabled adaptive sync, and we're seeing highly irregular MSC values, we assume + * we're driving the display VRR. */ + uint64_t refresh_interval = swapchain->base.wsi->enable_adaptive_sync ? UINT64_MAX : 0; + wsi_swapchain_present_timing_update_refresh_rate(&swapchain->base, randr_refresh_ns, refresh_interval, 0); + } + } + } + + entry = &swapchain->present_timing_window[swapchain->present_timing_window_index]; + entry->msc = msc; + entry->ust = ust; +} + +static void x11_present_complete(struct x11_swapchain *swapchain, + struct x11_image *image, uint32_t index, + uint64_t msc, uint64_t ust) +{ + /* Update estimate for refresh rate. */ + if (swapchain->base.present_timing.active) + x11_present_update_refresh_cycle_estimate(swapchain, msc, ust); + + /* Make sure to signal present timings before signalling present wait, + * this way we get minimal latency for reports. */ + uint64_t timing_serial = image->pending_completions[index].timing_serial; + if (timing_serial) + wsi_swapchain_present_timing_notify_completion(&swapchain->base, timing_serial, ust * 1000, &image->base); + uint64_t signal_present_id = image->pending_completions[index].signal_present_id; if (signal_present_id) { mtx_lock(&swapchain->present_progress_mutex); @@ -1327,6 +1607,16 @@ x11_handle_dri3_present_event(struct x11_swapchain *chain, return VK_SUBOPTIMAL_KHR; } + if (chain->base.present_timing.active) { + /* It's possible that we have multiple monitors and moving windows around change the effective rate. + * Lots of logic reused from platform_x11.c. */ + + /* TODO: Should we rate-limit this query? 
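+      * As-is, every ConfigureNotify costs a RANDR resource update plus a
+      * coordinate-translation round-trip under surface->mtx; throttling
+      * would only be worth it if this shows up in profiles.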
*/ + struct wsi_x11_icd_surface *surface = wsi_x11_get_icd_surface( + (struct wsi_device *)chain->base.wsi, chain->conn, chain->window); + x11_icd_surface_update_present_timing(surface, config->width, config->height); + } + break; } @@ -1348,13 +1638,14 @@ x11_handle_dri3_present_event(struct x11_swapchain *chain, case XCB_PRESENT_EVENT_COMPLETE_NOTIFY: { xcb_present_complete_notify_event_t *complete = (void *) event; + uint64_t ust = MAX2(complete->ust, chain->next_present_ust_lower_bound); if (complete->kind == XCB_PRESENT_COMPLETE_KIND_PIXMAP) { unsigned i, j; for (i = 0; i < chain->base.image_count; i++) { struct x11_image *image = &chain->images[i]; for (j = 0; j < image->present_queued_count; j++) { if (image->pending_completions[j].serial == complete->serial) { - x11_present_complete(chain, image, j); + x11_present_complete(chain, image, j, complete->msc, ust); } } } @@ -1424,8 +1715,8 @@ x11_present_to_x11_dri3(struct x11_swapchain *chain, uint32_t image_index, int64_t divisor = 0; int64_t remainder = 0; - struct wsi_x11_connection *wsi_conn = - wsi_x11_get_connection((struct wsi_device*)chain->base.wsi, chain->conn); + struct wsi_x11_icd_surface *wsi_conn = + wsi_x11_get_icd_surface((struct wsi_device*)chain->base.wsi, chain->conn, chain->window); if (!wsi_conn) return VK_ERROR_OUT_OF_HOST_MEMORY; @@ -1457,6 +1748,7 @@ x11_present_to_x11_dri3(struct x11_swapchain *chain, uint32_t image_index, (struct x11_image_pending_completion) { .signal_present_id = image->present_id, .serial = serial, + .timing_serial = image->timing_request.serial, }; xcb_void_cookie_t cookie; @@ -1654,7 +1946,7 @@ static VkResult x11_swapchain_read_status_atomic(struct x11_swapchain *chain) */ static bool x11_needs_wait_for_fences(const struct wsi_device *wsi_device, - struct wsi_x11_connection *wsi_conn, + struct wsi_x11_icd_surface *wsi_conn, VkPresentModeKHR present_mode) { if (wsi_conn->is_xwayland && !wsi_device->x11.xwaylandWaitReady) { @@ -1676,7 +1968,7 @@ x11_needs_wait_for_fences(const struct wsi_device *wsi_device, static bool x11_requires_mailbox_image_count(const struct wsi_device *device, - struct wsi_x11_connection *wsi_conn, + struct wsi_x11_icd_surface *wsi_conn, VkPresentModeKHR present_mode) { /* If we're resorting to wait for fences, we're assuming a MAILBOX-like model, @@ -1773,6 +2065,26 @@ x11_set_present_mode(struct wsi_swapchain *wsi_chain, chain->base.present_mode = mode; } +static void +x11_set_timing_request(struct wsi_swapchain *wsi_chain, + const struct wsi_image_timing_request *request) +{ + struct x11_swapchain *chain = (struct x11_swapchain *)wsi_chain; + chain->timing_request = *request; +} + +static uint64_t +x11_poll_early_refresh(struct wsi_swapchain *wsi_chain, uint64_t *interval) +{ + struct x11_swapchain *chain = (struct x11_swapchain *)wsi_chain; + struct wsi_x11_icd_surface *wsi_conn = + wsi_x11_get_icd_surface((struct wsi_device*)chain->base.wsi, chain->conn, chain->window); + + /* We don't know yet. */ + *interval = 0; + return x11_icd_surface_update_present_timing(wsi_conn, chain->extent.width, chain->extent.height); +} + /** * Acquire a ready-to-use image from the swapchain. * @@ -1874,6 +2186,8 @@ x11_queue_present(struct wsi_swapchain *wsi_chain, chain->images[image_index].present_id = present_id; /* With KHR_swapchain_maintenance1, the present mode can change per present. 
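    * The timing request below follows the same per-present pattern: it is
    * snapshotted into the image and the chain-level copy is cleared so it
    * cannot leak into the next present.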
*/ chain->images[image_index].present_mode = chain->base.present_mode; + chain->images[image_index].timing_request = chain->timing_request; + memset(&chain->timing_request, 0, sizeof(chain->timing_request)); wsi_queue_push(&chain->present_queue, image_index); return x11_swapchain_read_status_atomic(chain); @@ -1974,6 +2288,125 @@ x11_manage_event_queue(void *state) return 0; } +static uint64_t +x11_present_compute_target_msc(struct x11_swapchain *chain, + const struct wsi_image_timing_request *request, + uint64_t minimum_msc) +{ + const struct x11_present_timing_entry *entry = &chain->present_timing_window[chain->present_timing_window_index]; + bool relative = (request->flags & VK_PRESENT_TIMING_INFO_PRESENT_AT_RELATIVE_TIME_BIT_EXT) != 0; + + /* Just use the FIFO derived MSC. From spec on relative: + * "If the swapchain has never been used to present an image, the provided targetTime is ignored." */ + if (!request->serial || !request->time || (relative && !entry->ust)) + return minimum_msc; + + int64_t target_ns; + + mtx_lock(&chain->base.present_timing.lock); + + /* Present timing is only defined to work with FIFO modes, so we can rely on having + * reliable relative timings, since we block for COMPLETE to come through before we queue up more presents. */ + if (relative) { + /* If application is trying to drive us at refresh rate, FIFO will take care of it. + * Don't end up in a situation where we sleep and miss the deadline by mistake. */ + if (!chain->has_reliable_msc) { + uint64_t relative_threshold; + if (request->flags & VK_PRESENT_TIMING_INFO_PRESENT_AT_NEAREST_REFRESH_CYCLE_BIT_EXT) + relative_threshold = 3 * chain->base.present_timing.refresh_duration / 2; + else + relative_threshold = chain->base.present_timing.refresh_duration; + + if (request->time <= relative_threshold) { + mtx_unlock(&chain->base.present_timing.lock); + return minimum_msc; + } + } + target_ns = 1000 * (int64_t)entry->ust + (int64_t)request->time; + } else { + target_ns = (int64_t)request->time; + } + + /* Snap to nearest half refresh. This only makes sense for FRR, but it is the application's + * responsibility to not use this for VRR. If this flag is not used, this is strictly a "not before". */ + if (request->flags & VK_PRESENT_TIMING_INFO_PRESENT_AT_NEAREST_REFRESH_CYCLE_BIT_EXT) + target_ns -= (int64_t)chain->base.present_timing.refresh_duration / 2; + + if (entry->msc && chain->base.present_timing.refresh_duration != 0 && + chain->msc_estimate_is_stable && chain->has_reliable_msc) { + /* If we can trust MSC to be a stable FRR heartbeat, we sync to that. */ + uint64_t delta_time_ns = MAX2(target_ns - 1000 * (int64_t)entry->ust, 0); + uint64_t periods = (delta_time_ns + chain->base.present_timing.refresh_duration - 1) / + chain->base.present_timing.refresh_duration; + mtx_unlock(&chain->base.present_timing.lock); + + /* Xwl cannot understand MSC that jumps by more than 1. It appears that if there are MSC jumps above 1, + * each MSC cycle is padded by 16.6ms or something like that. + * If we want to target specific time, we must sleep to achieve that until Xwl improves. + * Fortunately, we're on a submit thread, so that is mostly an acceptable solution. */ + minimum_msc = MAX2(minimum_msc, entry->msc + periods); + } else { + /* If we don't have a stable estimate (e.g. true VRR, or Xwl) we just sleep until deadline. + * This relies on timebase on os_time_nanosleep is MONOTONIC as well as UST being MONOTONIC. 
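+       * (Both assumptions are believed to hold in practice: Mesa's
+       * os_time helpers are CLOCK_MONOTONIC based on Linux, and X servers
+       * report UST in microseconds from the same clock; see the
+       * time_domain note at swapchain creation.)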
*/ + + if (request->flags & VK_PRESENT_TIMING_INFO_PRESENT_AT_NEAREST_REFRESH_CYCLE_BIT_EXT) { + if (!chain->has_reliable_msc && chain->base.present_timing.refresh_duration) { + uint64_t delta_time_ns = MAX2(target_ns - 1000 * (int64_t)entry->ust, 0); + uint64_t periods = delta_time_ns / chain->base.present_timing.refresh_duration; + + target_ns = 1000ull * entry->ust + periods * chain->base.present_timing.refresh_duration; + + /* Set a minimum target that is very close to the real estimate. + * This way, we ensure that we don't regularly round estimates up in + * chain->next_present_ust_lower_bound. */ + target_ns += 63 * chain->base.present_timing.refresh_duration / 64; + } + } + + if (chain->has_reliable_msc) { + /* Very regular sleeping can trigger a strange feedback loop where MSC estimates becomes stable enough + * that we accept it as stable MSC. Perturb the rates enough to make it extremely unlikely + * we accept sleeping patterns as ground truth rate, introduce a 50 us error between each timestamp, + * which should avoid the 10 us check reliably. If sleep quantas are not as accurate, it's extremely unlikely + * we get a stable pace anyway. TODO: Is there a more reliable way? */ + + /* On Xwl we never accept MSC estimates as ground truth, so ignore this perturbation. */ + target_ns += 50000ll * (chain->present_timing_window_index & 1) - 25000; + target_ns = MAX2(target_ns, 0); + } + + /* If we're on Xwl or VRR X11 and trying to target a specific cycle by sleeping, pull back the sleep a bit. + * We will be racing against time once we wake up to send the request to Xwl -> Wayland -> frame callback -> COMPLETE. + * If target_ns syncs well to a refresh cycle, we speculate that COMPLETE will come through at about target_ns. */ + + /* To get proper pace on an actual VRR display, we will have to detect if we're presenting too early + * compared to what application actually expected. + * In that case, we need to remove this compensation if we detect that presents come in too early. + * Effectively, we will need to adjust the report UST up if we somehow end up seeing a timestamp too early. + * The relative refresh will feed off this adjustment in a tight loop, so this should be pretty solid + * for both VRR and FRR. Present timing can only be used with FIFO modes, i.e. we will not overwrite this + * until the present is actually complete. */ + chain->next_present_ust_lower_bound = target_ns / 1000; + + /* We also need to pull back the sleep a bit to account for X.org roundtrip delays. + * Allow up to 4ms of error here. */ + int64_t eager_present_ns = MIN2((int64_t)chain->base.present_timing.refresh_duration / 4, 4 * 1000 * 1000); + target_ns -= eager_present_ns; + target_ns = MAX2(target_ns, 0); + + mtx_unlock(&chain->base.present_timing.lock); + mtx_unlock(&chain->thread_state_lock); + + os_time_nanosleep_until(target_ns); + + /* Reacquiring the lock won't change any invariants for us, so this is fine. + * We make sure to check chain->status after this function in case that got updated while we were sleeping. */ + mtx_lock(&chain->thread_state_lock); + } + + return minimum_msc; +} + /** * Presentation thread. 
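 * With present timing enabled, this thread may additionally sleep in
 * x11_present_compute_target_msc() (via os_time_nanosleep_until) before
 * queueing a present, so a request can intentionally be held back until
 * close to its target time.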
* @@ -1991,8 +2424,8 @@ static int x11_manage_present_queue(void *state) { struct x11_swapchain *chain = state; - struct wsi_x11_connection *wsi_conn = - wsi_x11_get_connection((struct wsi_device*)chain->base.wsi, chain->conn); + struct wsi_x11_icd_surface *wsi_conn = + wsi_x11_get_icd_surface((struct wsi_device*)chain->base.wsi, chain->conn, chain->window); VkResult result = VK_SUCCESS; u_thread_setname("WSI swapchain queue"); @@ -2040,6 +2473,8 @@ x11_manage_present_queue(void *state) u_cnd_monotonic_wait(&chain->thread_state_cond, &chain->thread_state_lock); } + target_msc = x11_present_compute_target_msc(chain, &chain->images[image_index].timing_request, target_msc); + if (chain->status < 0) { mtx_unlock(&chain->thread_state_lock); break; @@ -2315,7 +2750,7 @@ wsi_x11_recompute_dri3_modifier_hash(blake3_hash *hash, const struct wsi_drm_ima } static void -wsi_x11_get_dri3_modifiers(struct wsi_x11_connection *wsi_conn, +wsi_x11_get_dri3_modifiers(struct wsi_x11_icd_surface *wsi_conn, xcb_connection_t *conn, xcb_window_t window, uint8_t depth, uint8_t bpp, uint64_t **modifiers_in, uint32_t *num_modifiers_in, @@ -2402,8 +2837,8 @@ wsi_x11_swapchain_query_dri3_modifiers_changed(struct x11_swapchain *chain) uint64_t *modifiers[2] = {NULL, NULL}; uint32_t num_modifiers[2] = {0, 0}; - struct wsi_x11_connection *wsi_conn = - wsi_x11_get_connection((struct wsi_device*)chain->base.wsi, chain->conn); + struct wsi_x11_icd_surface *wsi_conn = + wsi_x11_get_icd_surface((struct wsi_device*)chain->base.wsi, chain->conn, chain->window); xcb_get_geometry_reply_t *geometry = xcb_get_geometry_reply(chain->conn, xcb_get_geometry(chain->conn, chain->window), NULL); @@ -2551,7 +2986,7 @@ static VkResult x11_wait_for_present(struct wsi_swapchain *wsi_chain, static unsigned x11_get_min_image_count_for_present_mode(struct wsi_device *wsi_device, - struct wsi_x11_connection *wsi_conn, + struct wsi_x11_icd_surface *wsi_conn, VkPresentModeKHR present_mode) { uint32_t min_image_count = x11_get_min_image_count(wsi_device, wsi_conn->is_xwayland); @@ -2592,8 +3027,9 @@ x11_surface_create_swapchain(VkIcdSurfaceBase *icd_surface, * representing it. */ xcb_connection_t *conn = x11_surface_get_connection(icd_surface); - struct wsi_x11_connection *wsi_conn = - wsi_x11_get_connection(wsi_device, conn); + xcb_window_t window = x11_surface_get_window(icd_surface); + struct wsi_x11_icd_surface *wsi_conn = + wsi_x11_get_icd_surface(wsi_device, conn, window); if (!wsi_conn) return VK_ERROR_OUT_OF_HOST_MEMORY; @@ -2613,7 +3049,6 @@ x11_surface_create_swapchain(VkIcdSurfaceBase *icd_surface, } /* Check that we have a window up-front. It is an error to not have one. */ - xcb_window_t window = x11_surface_get_window(icd_surface); /* Get the geometry of that window. The bit depth of the swapchain will be fitted and the * chain's images extents should fit it for performance-optimizing flips. @@ -2736,8 +3171,14 @@ x11_surface_create_swapchain(VkIcdSurfaceBase *icd_surface, chain->base.wait_for_present2 = x11_wait_for_present; chain->base.release_images = x11_release_images; chain->base.set_present_mode = x11_set_present_mode; + chain->base.set_timing_request = x11_set_timing_request; + chain->base.poll_early_refresh = x11_poll_early_refresh; chain->base.present_mode = present_mode; chain->base.image_count = num_images; + + /* This is what Xserver is using. We cannot really query it, but we rely on it working. 
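+    * (Present UST values are in microseconds; the completion path above
+    * converts them with ust * 1000 before handing them to the common
+    * present-timing code, which expects nanoseconds in this domain.)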
*/ + chain->base.present_timing.time_domain = VK_TIME_DOMAIN_CLOCK_MONOTONIC_KHR; + chain->conn = conn; chain->window = window; chain->depth = bit_depth; @@ -2749,6 +3190,7 @@ x11_surface_create_swapchain(VkIcdSurfaceBase *icd_surface, chain->has_dri3_modifiers = wsi_conn->has_dri3_modifiers; chain->has_mit_shm = wsi_conn->has_mit_shm; chain->has_async_may_tear = present_caps & XCB_PRESENT_CAPABILITY_ASYNC_MAY_TEAR; + chain->has_reliable_msc = !wsi_conn->is_xwayland; /* When images in the swapchain don't fit the window, X can still present them, but it won't * happen by flip, only by copy. So this is a suboptimal copy, because if the client would change @@ -2856,6 +3298,9 @@ x11_surface_create_swapchain(VkIcdSurfaceBase *icd_surface, /* It is safe to set it here as only one swapchain can be associated with * the window, and swapchain creation does the association. At this point * we know the creation is going to succeed. */ + + /* If we have present timing, we need to make sure we get a useable estimate for refresh rate + * before we let the window run in full VRR. Once we have locked in the rate, we can enable VRR property. */ wsi_x11_set_adaptive_sync_property(conn, window, wsi_device->enable_adaptive_sync); @@ -2889,6 +3334,18 @@ fail_alloc: return result; } +static uint32_t x11_hash_icd_surface(const void *key) +{ + return _mesa_hash_data(key, sizeof(struct x11_icd_surface_key)); +} + +static bool x11_icd_surface_equal(const void *a_, const void *b_) +{ + const struct x11_icd_surface_key *a = a_; + const struct x11_icd_surface_key *b = b_; + return a->conn == b->conn && a->window == b->window; +} + VkResult wsi_x11_init_wsi(struct wsi_device *wsi_device, const VkAllocationCallbacks *alloc, @@ -2916,8 +3373,7 @@ wsi_x11_init_wsi(struct wsi_device *wsi_device, goto fail_alloc; } - wsi->connections = _mesa_hash_table_create(NULL, _mesa_hash_pointer, - _mesa_key_pointer_equal); + wsi->connections = _mesa_hash_table_create(NULL, x11_hash_icd_surface, x11_icd_surface_equal); if (!wsi->connections) { result = VK_ERROR_OUT_OF_HOST_MEMORY; goto fail_mutex; @@ -2981,7 +3437,7 @@ wsi_x11_finish_wsi(struct wsi_device *wsi_device, if (wsi) { hash_table_foreach(wsi->connections, entry) - wsi_x11_connection_destroy(wsi_device, entry->data); + wsi_x11_icd_surface_destroy(wsi_device, entry->data); _mesa_hash_table_destroy(wsi->connections, NULL);