mirror of https://gitlab.freedesktop.org/mesa/mesa.git
synced 2025-12-20 05:10:11 +01:00

commit 32a5663d35

Merge branch 'present-timing' into 'main'

vulkan/wsi: Implement EXT_present_timing. See merge request mesa/mesa!38770

21 changed files with 1845 additions and 201 deletions
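For orientation before the per-file hunks: the application-facing flow this merge request implements looks roughly like the sketch below. It is a minimal sketch assembled only from names that appear in this diff (VkPresentTimingInfoEXT, VkPresentTimingsInfoEXT, VkPastPresentationTimingPropertiesEXT, vkGetPastPresentationTimingEXT); the sType constants are inferred from the usual Vulkan naming convention, the extension is provisional, and a real caller would also chain VkPresentId2KHR, which the implementation notes EXT_present_timing is defined to work with.

/* Sketch: present one image with timing feedback requested, then poll the
 * accumulated feedback with the usual two-call idiom. Error handling omitted. */
static void
present_with_timing(VkDevice dev, VkQueue queue, VkSwapchainKHR swapchain,
                    uint32_t image_index, VkSemaphore render_done,
                    uint64_t target_time_ns)
{
   const VkPresentTimingInfoEXT timing = {
      .targetTime = target_time_ns,   /* absolute target in the swapchain time domain */
      .presentStageQueries = VK_PRESENT_STAGE_QUEUE_OPERATIONS_END_BIT_EXT,
   };
   const VkPresentTimingsInfoEXT timings = {
      .sType = VK_STRUCTURE_TYPE_PRESENT_TIMINGS_INFO_EXT,   /* inferred name */
      .swapchainCount = 1,
      .pTimingInfos = &timing,
   };
   const VkPresentInfoKHR present = {
      .sType = VK_STRUCTURE_TYPE_PRESENT_INFO_KHR,
      .pNext = &timings,
      .waitSemaphoreCount = 1,
      .pWaitSemaphores = &render_done,
      .swapchainCount = 1,
      .pSwapchains = &swapchain,
      .pImageIndices = &image_index,
   };
   vkQueuePresentKHR(queue, &present);

   /* Later: with pPresentationTimings == NULL this only reports how many
    * completed entries are ready; a second call would fetch them. */
   VkPastPresentationTimingInfoEXT past_info = {
      .sType = VK_STRUCTURE_TYPE_PAST_PRESENTATION_TIMING_INFO_EXT,   /* inferred name */
      .swapchain = swapchain,
   };
   VkPastPresentationTimingPropertiesEXT props = {
      .sType = VK_STRUCTURE_TYPE_PAST_PRESENTATION_TIMING_PROPERTIES_EXT,   /* inferred name */
   };
   vkGetPastPresentationTimingEXT(dev, &past_info, &props);
}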
@@ -658,6 +658,7 @@ Khronos extensions that are not part of any Vulkan version:
  VK_EXT_physical_device_drm                 DONE (anv, hasvk, hk, nvk, panvk, pvr, radv, tu, v3dv, vn)
  VK_EXT_pipeline_library_group_handles      DONE (anv, lvp, radv, vn)
  VK_EXT_post_depth_coverage                 DONE (anv/gfx11+, lvp, nvk, radv/gfx10+, tu, vn)
  VK_EXT_present_timing                      DONE (anv, hk, nvk, radv, tu)
  VK_EXT_primitive_topology_list_restart     DONE (anv, hasvk, lvp, nvk, panvk, radv, tu, v3dv, vn)
  VK_EXT_primitives_generated_query          DONE (anv, hasvk, lvp, nvk, radv, tu, vn)
  VK_EXT_provoking_vertex                    DONE (anv, hasvk, hk, lvp, nvk, panvk, pvr, radv, tu, v3dv, vn)
@@ -20,3 +20,4 @@ VK_KHR_surface_maintenance1 promotion everywhere EXT is exposed
VK_KHR_swapchain_maintenance1 promotion everywhere EXT is exposed
VK_KHR_dynamic_rendering on PowerVR
VK_EXT_multisampled_render_to_single_sampled on panvk
VK_EXT_present_timing on RADV, NVK, Turnip, ANV, Honeykrisp
@@ -791,6 +791,10 @@ radv_physical_device_get_supported_extensions(const struct radv_physical_device
      .EXT_pipeline_library_group_handles = radv_enable_rt(pdev),
      .EXT_pipeline_robustness = !pdev->use_llvm,
      .EXT_post_depth_coverage = pdev->info.gfx_level >= GFX10,
#ifdef RADV_USE_WSI_PLATFORM
      /* KHR_calibrated_timestamps is a requirement to expose EXT_present_timing. */
      .EXT_present_timing = radv_calibrated_timestamps_enabled(pdev),
#endif
      .EXT_primitive_topology_list_restart = true,
      .EXT_primitives_generated_query = true,
      .EXT_private_data = true,

@@ -1481,6 +1485,14 @@ radv_physical_device_get_features(const struct radv_physical_device *pdev, struc

      /* VK_EXT_custom_resolve */
      .customResolve = true,

#ifdef RADV_USE_WSI_PLATFORM
      /* VK_EXT_present_timing */
      /* The actual query is deferred to surface time. */
      .presentTiming = true,
      .presentAtAbsoluteTime = true,
      .presentAtRelativeTime = true,
#endif
   };
}
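The feature bits advertised above (and in the hk, tu, anv and nvk hunks below) are what an application tests before enabling the extension; a sketch, assuming the feature-struct name VkPhysicalDevicePresentTimingFeaturesEXT, which follows the extension naming convention but does not itself appear in this diff:

/* Sketch: check the EXT_present_timing feature bits. Only the member names
 * (presentTiming, presentAtAbsoluteTime, presentAtRelativeTime) appear in
 * this diff; the struct and sType names are assumptions. */
VkPhysicalDevicePresentTimingFeaturesEXT timing_features = {
   .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PRESENT_TIMING_FEATURES_EXT,
};
VkPhysicalDeviceFeatures2 features2 = {
   .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2,
   .pNext = &timing_features,
};
vkGetPhysicalDeviceFeatures2(physical_device, &features2);

bool can_target_absolute_time = timing_features.presentTiming &&
                                timing_features.presentAtAbsoluteTime;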
@@ -181,6 +181,9 @@ hk_get_device_extensions(const struct hk_instance *instance,
      .EXT_pipeline_protected_access = true,
      .EXT_pipeline_robustness = true,
      .EXT_physical_device_drm = true,
#ifdef HK_USE_WSI_PLATFORM
      .EXT_present_timing = true,
#endif
      .EXT_primitive_topology_list_restart = true,
      .EXT_private_data = true,
      .EXT_primitives_generated_query = false,

@@ -623,6 +626,13 @@ hk_get_device_features(

      /* VK_KHR_shader_relaxed_extended_instruction */
      .shaderRelaxedExtendedInstruction = true,

#ifdef HK_USE_WSI_PLATFORM
      /* VK_EXT_present_timing */
      .presentTiming = true,
      .presentAtRelativeTime = true,
      .presentAtAbsoluteTime = true,
#endif
   };
}
@@ -792,17 +792,6 @@ dri2_fourcc_for_depth(struct dri2_egl_display *dri2_dpy, uint32_t depth)
   }
}

static int
box_intersection_area(int16_t a_x, int16_t a_y, int16_t a_width,
                      int16_t a_height, int16_t b_x, int16_t b_y,
                      int16_t b_width, int16_t b_height)
{
   int w = MIN2(a_x + a_width, b_x + b_width) - MAX2(a_x, b_x);
   int h = MIN2(a_y + a_height, b_y + b_height) - MAX2(a_y, b_y);

   return (w < 0 || h < 0) ? 0 : w * h;
}

EGLBoolean
dri2_x11_get_msc_rate(_EGLDisplay *display, _EGLSurface *surface,
                      EGLint *numerator, EGLint *denominator)
@@ -314,6 +314,9 @@ get_device_extensions(const struct tu_physical_device *device,
      .EXT_physical_device_drm = !is_kgsl(device->instance),
      .EXT_pipeline_creation_cache_control = true,
      .EXT_pipeline_creation_feedback = true,
#ifdef TU_USE_WSI_PLATFORM
      .EXT_present_timing = device->info->props.has_persistent_counter,
#endif
      .EXT_primitive_topology_list_restart = true,
      .EXT_primitives_generated_query = true,
      .EXT_private_data = true,

@@ -825,6 +828,13 @@ tu_get_features(struct tu_physical_device *pdevice,

   /* VK_EXT_custom_resolve */
   features->customResolve = true;

#ifdef TU_USE_WSI_PLATFORM
   /* VK_EXT_present_timing */
   features->presentTiming = true;
   features->presentAtRelativeTime = true;
   features->presentAtAbsoluteTime = true;
#endif
}

static void
@@ -354,6 +354,9 @@ get_device_extensions(const struct anv_physical_device *device,
      .EXT_pipeline_protected_access = device->has_protected_contexts,
      .EXT_pipeline_robustness = true,
      .EXT_post_depth_coverage = true,
#ifdef ANV_USE_WSI_PLATFORM
      .EXT_present_timing = device->has_reg_timestamp,
#endif
      .EXT_primitive_topology_list_restart = true,
      .EXT_primitives_generated_query = true,
      .EXT_private_data = true,

@@ -1005,6 +1008,13 @@ get_features(const struct anv_physical_device *pdevice,

      /* VK_KHR_pipeline_binary */
      .pipelineBinaries = true,

#ifdef ANV_USE_WSI_PLATFORM
      /* VK_EXT_present_timing */
      .presentTiming = true,
      .presentAtRelativeTime = true,
      .presentAtAbsoluteTime = true,
#endif
   };

   /* The new DOOM and Wolfenstein games require depthBounds without
@@ -29,36 +29,7 @@
#include <c11/threads.h>
#include "util/format/u_formats.h"

#ifdef HAVE_X11_PLATFORM
#include <xcb/xcb.h>
#include <xcb/dri3.h>
#include <xcb/present.h>

struct loader_crtc_info {
   xcb_randr_crtc_t id;
   xcb_timestamp_t timestamp;

   int16_t x, y;
   uint16_t width, height;

   unsigned refresh_numerator;
   unsigned refresh_denominator;
};

struct loader_screen_resources {
   mtx_t mtx;

   xcb_connection_t *conn;
   xcb_screen_t *screen;

   xcb_timestamp_t config_timestamp;

   /* Number of CRTCs with an active mode set */
   unsigned num_crtcs;
   struct loader_crtc_info *crtcs;
};
#endif

#include "loader_dri_helper_screen.h"

/**
 * These formats are endian independent; they result in the same layout

@@ -110,16 +81,4 @@ loader_pipe_format_to_fourcc(enum pipe_format pipe);
enum pipe_format
loader_fourcc_to_pipe_format(uint32_t fourcc);

#ifdef HAVE_X11_PLATFORM
void
loader_init_screen_resources(struct loader_screen_resources *res,
                             xcb_connection_t *conn,
                             xcb_screen_t *screen);
bool
loader_update_screen_resources(struct loader_screen_resources *res);

void
loader_destroy_screen_resources(struct loader_screen_resources *res);
#endif

#endif /* LOADER_DRI_HELPER_H */
src/loader/loader_dri_helper_screen.h (new file, 76 lines)

@@ -0,0 +1,76 @@
/*
 * Permission to use, copy, modify, distribute, and sell this software and its
 * documentation for any purpose is hereby granted without fee, provided that
 * the above copyright notice appear in all copies and that both that copyright
 * notice and this permission notice appear in supporting documentation, and
 * that the name of the copyright holders not be used in advertising or
 * publicity pertaining to distribution of the software without specific,
 * written prior permission.  The copyright holders make no representations
 * about the suitability of this software for any purpose.  It is provided "as
 * is" without express or implied warranty.
 *
 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
 * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO
 * EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY SPECIAL, INDIRECT OR
 * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
 * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
 * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
 * OF THIS SOFTWARE.
 */

#ifndef LOADER_DRI_HELPER_SCREEN_H
#define LOADER_DRI_HELPER_SCREEN_H

#ifdef HAVE_X11_PLATFORM
#include <xcb/xcb.h>
#include <xcb/dri3.h>
#include <xcb/present.h>

struct loader_crtc_info {
   xcb_randr_crtc_t id;
   xcb_timestamp_t timestamp;

   int16_t x, y;
   uint16_t width, height;

   unsigned refresh_numerator;
   unsigned refresh_denominator;
};

struct loader_screen_resources {
   mtx_t mtx;

   xcb_connection_t *conn;
   xcb_screen_t *screen;

   xcb_timestamp_t config_timestamp;

   /* Number of CRTCs with an active mode set */
   unsigned num_crtcs;
   struct loader_crtc_info *crtcs;
};

void
loader_init_screen_resources(struct loader_screen_resources *res,
                             xcb_connection_t *conn,
                             xcb_screen_t *screen);
bool
loader_update_screen_resources(struct loader_screen_resources *res);

void
loader_destroy_screen_resources(struct loader_screen_resources *res);

#endif

static inline int
box_intersection_area(int16_t a_x, int16_t a_y, int16_t a_width,
                      int16_t a_height, int16_t b_x, int16_t b_y,
                      int16_t b_width, int16_t b_height)
{
   int w = MIN2(a_x + a_width, b_x + b_width) - MAX2(a_x, b_x);
   int h = MIN2(a_y + a_height, b_y + b_height) - MAX2(a_y, b_y);

   return (w < 0 || h < 0) ? 0 : w * h;
}

#endif
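box_intersection_area(), moved here from the EGL X11 code, is the standard axis-aligned overlap computation: the intersection spans from the larger of the two left/top edges to the smaller of the two right/bottom edges, and a negative extent means the boxes are disjoint. A quick numeric check (illustration only):

/* Two 100x100 boxes at (0,0) and (50,50):
 *   w = MIN2(0 + 100, 50 + 100) - MAX2(0, 50) = 100 - 50 = 50
 *   h = 50 likewise, so the overlap is 50 * 50 = 2500 pixels.
 * Disjoint boxes, e.g. (0,0,10,10) vs (20,20,5,5), give w = -10 -> area 0. */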
@@ -49,7 +49,7 @@ endif
if with_platform_x11
  subdir('x11')
endif
if with_gallium_or_lvp or with_gbm or with_platform_wayland
if with_gallium_or_lvp or with_gbm or with_platform_wayland or with_platform_x11 or with_platform_xcb
  subdir('loader')
endif
subdir('compiler')
@@ -262,6 +262,9 @@ nvk_get_device_extensions(const struct nvk_instance *instance,
      .EXT_pipeline_robustness = true,
      .EXT_physical_device_drm = true,
      .EXT_post_depth_coverage = info->cls_eng3d >= MAXWELL_B,
#ifdef NVK_USE_WSI_PLATFORM
      .EXT_present_timing = true,
#endif
      .EXT_primitive_topology_list_restart = true,
      .EXT_private_data = true,
      .EXT_primitives_generated_query = true,

@@ -753,6 +756,11 @@ nvk_get_device_features(const struct nv_device_info *info,

      /* VK_KHR_present_wait2 */
      .presentWait2 = true,

      /* VK_EXT_present_timing */
      .presentTiming = true,
      .presentAtRelativeTime = true,
      .presentAtAbsoluteTime = true,
#endif
   };
}
@@ -26,6 +26,10 @@ if with_platform_wayland
  files_vulkan_wsi += wp_files['color-management-v1']
endif

if with_platform_x11 or with_platform_xcb
  links_vulkan_wsi += libloader
endif

if with_platform_windows
  files_vulkan_wsi += files('wsi_common_win32.cpp')
  platform_deps += dep_dxheaders
@@ -95,6 +95,7 @@ wsi_device_init(struct wsi_device *wsi,
   WSI_GET_CB(GetPhysicalDeviceProperties2);
   WSI_GET_CB(GetPhysicalDeviceMemoryProperties);
   WSI_GET_CB(GetPhysicalDeviceQueueFamilyProperties);
   WSI_GET_CB(GetPhysicalDeviceProperties);
#undef WSI_GET_CB

   wsi->drm_info.sType =

@@ -121,10 +122,18 @@ wsi_device_init(struct wsi_device *wsi,
   VkQueueFamilyProperties queue_properties[64];
   GetPhysicalDeviceQueueFamilyProperties(pdevice, &wsi->queue_family_count, queue_properties);

   VkPhysicalDeviceProperties properties;
   GetPhysicalDeviceProperties(pdevice, &properties);
   wsi->timestamp_period = properties.limits.timestampPeriod;

   for (unsigned i = 0; i < wsi->queue_family_count; i++) {
      VkFlags req_flags = VK_QUEUE_GRAPHICS_BIT | VK_QUEUE_COMPUTE_BIT | VK_QUEUE_TRANSFER_BIT;
      if (queue_properties[i].queueFlags & req_flags)
         wsi->queue_supports_blit |= BITFIELD64_BIT(i);

      /* Don't want to consider timestamp wrapping logic. */
      if (queue_properties[i].timestampValidBits == 64)
         wsi->queue_supports_timestamps |= BITFIELD64_BIT(i);
   }

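   /* Illustration of the masks built above (invented device layout): if family 0
    * is GRAPHICS|COMPUTE|TRANSFER with timestampValidBits == 64 and family 1 is a
    * video-only queue with timestampValidBits == 32, the loop leaves
    *   wsi->queue_supports_blit       == BITFIELD64_BIT(0)
    *   wsi->queue_supports_timestamps == BITFIELD64_BIT(0)
    * so only family 0 can both blit and report QUEUE_OPERATIONS_END times. */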
   for (VkExternalSemaphoreHandleTypeFlags handle_type = 1;
@@ -180,15 +189,19 @@ wsi_device_init(struct wsi_device *wsi,
   WSI_GET_CB(CmdPipelineBarrier);
   WSI_GET_CB(CmdCopyImage);
   WSI_GET_CB(CmdCopyImageToBuffer);
   WSI_GET_CB(CmdResetQueryPool);
   WSI_GET_CB(CmdWriteTimestamp);
   WSI_GET_CB(CreateBuffer);
   WSI_GET_CB(CreateCommandPool);
   WSI_GET_CB(CreateFence);
   WSI_GET_CB(CreateImage);
   WSI_GET_CB(CreateQueryPool);
   WSI_GET_CB(CreateSemaphore);
   WSI_GET_CB(DestroyBuffer);
   WSI_GET_CB(DestroyCommandPool);
   WSI_GET_CB(DestroyFence);
   WSI_GET_CB(DestroyImage);
   WSI_GET_CB(DestroyQueryPool);
   WSI_GET_CB(DestroySemaphore);
   WSI_GET_CB(EndCommandBuffer);
   WSI_GET_CB(FreeMemory);

@@ -200,9 +213,14 @@ wsi_device_init(struct wsi_device *wsi,
   WSI_GET_CB(GetImageSubresourceLayout);
   if (!wsi->sw)
      WSI_GET_CB(GetMemoryFdKHR);
   WSI_GET_CB(GetPhysicalDeviceCalibrateableTimeDomainsKHR);
   WSI_GET_CB(GetPhysicalDeviceProperties);
   WSI_GET_CB(GetPhysicalDeviceFormatProperties);
   WSI_GET_CB(GetPhysicalDeviceFormatProperties2);
   WSI_GET_CB(GetPhysicalDeviceImageFormatProperties2);
   WSI_GET_CB(GetPhysicalDeviceQueueFamilyProperties);
   WSI_GET_CB(GetCalibratedTimestampsKHR);
   WSI_GET_CB(GetQueryPoolResults);
   WSI_GET_CB(GetSemaphoreFdKHR);
   WSI_GET_CB(ResetFences);
   WSI_GET_CB(QueueSubmit2);
@@ -481,8 +499,10 @@ wsi_swapchain_init(const struct wsi_device *wsi,
   chain->blit.type = get_blit_type(wsi, image_params, _device);

   chain->blit.queue = NULL;
   if (chain->blit.type != WSI_SWAPCHAIN_NO_BLIT) {
      if (wsi->get_blit_queue) {
   if (chain->blit.type != WSI_SWAPCHAIN_NO_BLIT ||
       (pCreateInfo->flags & VK_SWAPCHAIN_CREATE_PRESENT_TIMING_BIT_EXT)) {

      if (chain->blit.type != WSI_SWAPCHAIN_NO_BLIT && wsi->get_blit_queue) {
         chain->blit.queue = wsi->get_blit_queue(_device);
      }

@@ -503,10 +523,18 @@ wsi_swapchain_init(const struct wsi_device *wsi,
      if (chain->blit.queue != NULL) {
         queue_family_index = chain->blit.queue->queue_family_index;
      } else {
         uint64_t effective_queues = wsi->queue_supports_blit;
         if (pCreateInfo->flags & VK_SWAPCHAIN_CREATE_PRESENT_TIMING_BIT_EXT)
            effective_queues &= wsi->queue_supports_timestamps;

         /* Fallback. If this happens we don't advertise support for queue complete times. */
         if (!effective_queues)
            effective_queues = wsi->queue_supports_blit;

         /* Queues returned by get_blit_queue() might not be listed in
          * GetPhysicalDeviceQueueFamilyProperties, so this check is skipped for those queues.
          */
         if (!(wsi->queue_supports_blit & BITFIELD64_BIT(queue_family_index)))
         if (!(effective_queues & BITFIELD64_BIT(queue_family_index)))
            continue;
      }

@@ -616,7 +644,7 @@ wsi_swapchain_finish(struct wsi_swapchain *chain)
   chain->wsi->DestroySemaphore(chain->device, chain->present_id_timeline,
                                &chain->alloc);

   if (chain->blit.type != WSI_SWAPCHAIN_NO_BLIT) {
   if (chain->cmd_pools) {
      int cmd_pools_count = chain->blit.queue != NULL ?
         1 : chain->wsi->queue_family_count;
      for (uint32_t i = 0; i < cmd_pools_count; i++) {

@@ -628,6 +656,12 @@ wsi_swapchain_finish(struct wsi_swapchain *chain)
      vk_free(&chain->alloc, chain->cmd_pools);
   }

   if (chain->present_timing.active) {
      mtx_destroy(&chain->present_timing.lock);
      if (chain->present_timing.timings)
         vk_free(&chain->alloc, chain->present_timing.timings);
   }

   vk_object_base_finish(&chain->base);
}

@@ -815,6 +849,88 @@ fail:
   return result;
}

/**
 * Creates the timestamp-query command buffers for the end of rendering, which
 * will be used to report the QUEUE_OPERATIONS_END timestamp for
 * EXT_present_timing.
 *
 * Unless the swapchain is blitting, we don't know what queue family a Present
 * will happen on. So we make a timestamp command buffer for each so they're
 * ready to go at present time.
 */
VkResult
wsi_image_init_timestamp(const struct wsi_swapchain *chain,
                         struct wsi_image *image)
{
   const struct wsi_device *wsi = chain->wsi;
   VkResult result;

   /* Set up a command buffer to get timestamp info. */
   result = wsi->CreateQueryPool(
      chain->device,
      &(const VkQueryPoolCreateInfo){
         .sType = VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO,
         .queryType = VK_QUERY_TYPE_TIMESTAMP,
         .queryCount = 1,
      },
      NULL,
      &image->query_pool);

   if (result != VK_SUCCESS)
      goto fail;

   uint32_t family_count = chain->blit.queue ? 1 : wsi->queue_family_count;

   if (!image->timestamp_cmd_buffers) {
      image->timestamp_cmd_buffers =
         vk_zalloc(&chain->alloc, sizeof(VkCommandBuffer) * family_count, 8,
                   VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
      if (!image->timestamp_cmd_buffers)
         return VK_ERROR_OUT_OF_HOST_MEMORY;
   }

   for (uint32_t i = 0; i < family_count; i++) {
      /* We can only use timestamps on a queue that reports timestamp bits != 0.
       * Since we don't consider device timestamp wrapping in this implementation
       * (unclear how that would ever work), only report queue done where
       * timestamp bits == 64. */
      if (!chain->cmd_pools[i])
         continue;

      result = wsi->AllocateCommandBuffers(
         chain->device,
         &(const VkCommandBufferAllocateInfo){
            .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO,
            .pNext = NULL,
            .commandPool = chain->cmd_pools[i],
            .level = VK_COMMAND_BUFFER_LEVEL_PRIMARY,
            .commandBufferCount = 1,
         }, &image->timestamp_cmd_buffers[i]);

      if (result != VK_SUCCESS)
         goto fail;

      wsi->BeginCommandBuffer(
         image->timestamp_cmd_buffers[i],
         &(VkCommandBufferBeginInfo) {
            .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO,
         });

      wsi->CmdResetQueryPool(image->timestamp_cmd_buffers[i],
                             image->query_pool,
                             0, 1);

      wsi->CmdWriteTimestamp(image->timestamp_cmd_buffers[i],
                             VK_PIPELINE_STAGE_ALL_COMMANDS_BIT,
                             image->query_pool,
                             0);

      wsi->EndCommandBuffer(image->timestamp_cmd_buffers[i]);
   }

   return VK_SUCCESS;
fail:
   return result;
}

void
wsi_destroy_image(const struct wsi_swapchain *chain,
                  struct wsi_image *image)
@@ -850,6 +966,19 @@ wsi_destroy_image(const struct wsi_swapchain *chain,
      vk_free(&chain->alloc, image->blit.cmd_buffers);
   }

   wsi->DestroyQueryPool(chain->device, image->query_pool, NULL);

   if (image->timestamp_cmd_buffers) {
      uint32_t family_count = chain->blit.queue ? 1 : wsi->queue_family_count;
      for (uint32_t i = 0; i < family_count; i++) {
         if (image->timestamp_cmd_buffers[i]) {
            wsi->FreeCommandBuffers(chain->device, chain->cmd_pools[i],
                                    1, &image->timestamp_cmd_buffers[i]);
         }
      }
      vk_free(&chain->alloc, image->timestamp_cmd_buffers);
   }

   wsi->FreeMemory(chain->device, image->memory, &chain->alloc);
   wsi->DestroyImage(chain->device, image->image, &chain->alloc);
   wsi->DestroyImage(chain->device, image->blit.image, &chain->alloc);
@@ -912,8 +1041,43 @@ wsi_GetPhysicalDeviceSurfaceCapabilities2KHR(
   struct wsi_device *wsi_device = device->wsi_device;
   struct wsi_interface *iface = wsi_device->wsi[surface->platform];

   return iface->get_capabilities2(surface, wsi_device, pSurfaceInfo->pNext,
                                   pSurfaceCapabilities);
   VkResult vr = iface->get_capabilities2(surface, wsi_device, pSurfaceInfo->pNext,
                                          pSurfaceCapabilities);
   if (vr != VK_SUCCESS)
      return vr;

   struct VkPresentTimingSurfaceCapabilitiesEXT *present_timing =
      vk_find_struct(pSurfaceCapabilities, PRESENT_TIMING_SURFACE_CAPABILITIES_EXT);

   if (present_timing && present_timing->presentTimingSupported) {
      if (wsi_device->queue_supports_blit & wsi_device->queue_supports_timestamps) {
         /* Make sure the implementation is capable of calibrating timestamps. */
         if (wsi_device->GetPhysicalDeviceCalibrateableTimeDomainsKHR && wsi_device->GetCalibratedTimestampsKHR) {
            VkTimeDomainKHR domains[64];
            uint32_t count = ARRAY_SIZE(domains);
            wsi_device->GetPhysicalDeviceCalibrateableTimeDomainsKHR(wsi_device->pdevice, &count, domains);

            bool supports_device = false, supports_monotonic = false, supports_monotonic_raw = false;

            for (uint32_t i = 0; i < count; i++) {
               if (domains[i] == VK_TIME_DOMAIN_DEVICE_KHR)
                  supports_device = true;
               else if (domains[i] == VK_TIME_DOMAIN_CLOCK_MONOTONIC_KHR)
                  supports_monotonic = true;
               else if (domains[i] == VK_TIME_DOMAIN_CLOCK_MONOTONIC_RAW_KHR)
                  supports_monotonic_raw = true;
            }

            /* Current present timing implementations do not use anything outside these.
             * QPC might be relevant for Dozen at some point, but for now, we only consider
             * Linux-centric platforms for present timing. */
            if (supports_device && supports_monotonic && supports_monotonic_raw)
               present_timing->presentStageQueries |= VK_PRESENT_STAGE_QUEUE_OPERATIONS_END_BIT_EXT;
         }
      }
   }

   return vr;
}

VKAPI_ATTR VkResult VKAPI_CALL
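On the application side, the capability patched in above surfaces through vkGetPhysicalDeviceSurfaceCapabilities2KHR; a minimal sketch using the structure names from this diff (error handling omitted):

/* Sketch: ask a surface whether present timing is supported and which
 * present stages can be queried. */
VkPresentTimingSurfaceCapabilitiesEXT timing_caps = {
   .sType = VK_STRUCTURE_TYPE_PRESENT_TIMING_SURFACE_CAPABILITIES_EXT,
};
VkSurfaceCapabilities2KHR caps = {
   .sType = VK_STRUCTURE_TYPE_SURFACE_CAPABILITIES_2_KHR,
   .pNext = &timing_caps,
};
const VkPhysicalDeviceSurfaceInfo2KHR surface_info = {
   .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SURFACE_INFO_2_KHR,
   .surface = surface,
};
vkGetPhysicalDeviceSurfaceCapabilities2KHR(physical_device, &surface_info, &caps);

if (timing_caps.presentTimingSupported &&
    (timing_caps.presentStageQueries & VK_PRESENT_STAGE_QUEUE_OPERATIONS_END_BIT_EXT)) {
   /* GPU queue-completion feedback is available for swapchains on this surface. */
}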
@@ -1112,6 +1276,32 @@ wsi_CreateSwapchainKHR(VkDevice _device,

   *pSwapchain = wsi_swapchain_to_handle(swapchain);

   if (pCreateInfo->flags & VK_SWAPCHAIN_CREATE_PRESENT_TIMING_BIT_EXT) {
      swapchain->present_timing.active = true;
      mtx_init(&swapchain->present_timing.lock, mtx_plain);

      for (uint32_t i = 0; i < swapchain->image_count; i++) {
         struct wsi_image *image = swapchain->get_wsi_image(swapchain, i);
         result = wsi_image_init_timestamp(swapchain, image);
         if (result != VK_SUCCESS) {
            swapchain->destroy(swapchain, alloc);
            return result;
         }
      }

      if (swapchain->poll_early_refresh) {
         /* If we can query the display directly, we should report something reasonable
          * on the first query, before we even present the first time. */
         uint64_t interval;
         uint64_t refresh_ns = swapchain->poll_early_refresh(swapchain, &interval);
         if (refresh_ns) {
            swapchain->present_timing.refresh_duration = refresh_ns;
            swapchain->present_timing.refresh_interval = interval;
            swapchain->present_timing.refresh_counter++;
         }
      }
   }

   return VK_SUCCESS;
}

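The branch above only runs for swapchains created with the timing flag; on the application side that is a single extra bit at creation time (sketch; all other fields are the usual swapchain parameters):

/* Sketch: opt a swapchain into present timing at creation. */
VkSwapchainCreateInfoKHR create_info = {
   .sType = VK_STRUCTURE_TYPE_SWAPCHAIN_CREATE_INFO_KHR,
   .flags = VK_SWAPCHAIN_CREATE_PRESENT_TIMING_BIT_EXT,
   .surface = surface,
   /* format, extent, usage, present mode, etc. as usual */
};
vkCreateSwapchainKHR(device, &create_info, NULL, &swapchain);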
@@ -1168,6 +1358,353 @@ wsi_ReleaseSwapchainImagesKHR(VkDevice _device,
   return VK_SUCCESS;
}

static void
wsi_swapchain_present_timing_sample_query_pool(struct wsi_swapchain *chain,
                                               struct wsi_presentation_timing *timing,
                                               struct wsi_image *image,
                                               uint64_t upper_bound)
{
   if (!(timing->requested_feedback & VK_PRESENT_STAGE_QUEUE_OPERATIONS_END_BIT_EXT))
      return;

   /* The GPU really should be done by now, and we should be able to read the timestamp,
    * but it's possible that the present was discarded and we have a 0 timestamp here for
    * the present. In this case, we should not block to wait on the queue dispatch
    * timestamp. */
   uint64_t queue_ts;

   if (chain->wsi->GetQueryPoolResults(chain->device, image->query_pool, 0, 1, sizeof(uint64_t),
                                       &queue_ts, sizeof(uint64_t), VK_QUERY_RESULT_64_BIT) != VK_SUCCESS)
      return;

   /* There are two ways to deal with the DEVICE timestamp domain.
    * Either we can report the PRESENT_STAGE_LOCAL domain and let the application
    * calibrate the timestamps on its own; however, this creates an annoying situation
    * where the application is able to call QueuePresentKHR requesting that we use the
    * QUEUE_OPERATIONS_END time domain as the reference (targetTimeDomainPresentStage).
    * In that case, we are forced to re-calibrate the timestamp anyway.
    * We would also need to implement per-driver plumbing to forward the SWAPCHAIN_LOCAL
    * and PRESENT_STAGE_LOCAL time domains to the swapchain and query the underlying
    * time domain. Instead of dealing with this mess, just recalibrate the timestamp.
    * The accuracy of queue_operations_end is not particularly important. */

   /* We have already made sure that the implementation supports these. */
   const VkCalibratedTimestampInfoKHR infos[2] = {
      {
         .sType = VK_STRUCTURE_TYPE_CALIBRATED_TIMESTAMP_INFO_KHR,
         .timeDomain = VK_TIME_DOMAIN_DEVICE_KHR,
      },
      {
         .sType = VK_STRUCTURE_TYPE_CALIBRATED_TIMESTAMP_INFO_KHR,
         .timeDomain = chain->present_timing.time_domain,
      },
   };

   uint64_t timestamps[2];
   uint64_t max_deviation;
   if (chain->wsi->GetCalibratedTimestampsKHR(chain->device, 2, infos, timestamps, &max_deviation) == VK_SUCCESS) {
      int64_t device_delta_ticks = (int64_t)queue_ts - (int64_t)timestamps[0];
      int64_t device_delta_ns = (int64_t)((double)chain->wsi->timestamp_period * (double)device_delta_ticks);
      uint64_t queue_timestamp = timestamps[1] + device_delta_ns;

      /* Make sure we don't report the GPU completing after we flip the request.
       * Avoids any weird precision issues creeping through. */
      if (upper_bound)
         queue_timestamp = MIN2(queue_timestamp, upper_bound);

      timing->queue_done_time = queue_timestamp;
   }
}

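/* Worked example of the rebasing above (invented numbers, illustration only):
 * timestamp_period = 80 ns/tick; the query pool returns queue_ts = 1000100 ticks;
 * the calibrated pair reads DEVICE = 1000000 ticks <-> host clock = 5000000000 ns.
 * Then device_delta_ticks = 100, device_delta_ns = 80 * 100 = 8000, and the
 * reported queue_done_time = 5000000000 + 8000 = 5000008000 ns (host domain),
 * clamped to upper_bound if the flip was somehow reported earlier. */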
static void
wsi_swapchain_present_timing_notify_recycle_locked(struct wsi_swapchain *chain,
                                                   struct wsi_image *image)
{
   assert(chain->present_timing.active);

   for (size_t i = 0; i < chain->present_timing.timings_count; i++) {
      if (chain->present_timing.timings[i].image == image) {
         /* A different present takes ownership of the image's query pool index now. */
         chain->present_timing.timings[i].image = NULL;
         chain->present_timing.timings[i].queue_done_time = 0;

         /* We waited on the progress fence, so the timestamp query is guaranteed to be done. */
         wsi_swapchain_present_timing_sample_query_pool(chain, &chain->present_timing.timings[i], image, 0);
         break;
      }
   }
}

static VkResult wsi_common_allocate_timing_request(
   struct wsi_swapchain *swapchain, const VkPresentTimingInfoEXT *timing,
   uint64_t present_id, struct wsi_image *image)
{
   VkResult vr = VK_SUCCESS;
   mtx_lock(&swapchain->present_timing.lock);

   if (swapchain->present_timing.timings_count >= swapchain->present_timing.timings_capacity) {
      vr = VK_ERROR_PRESENT_TIMING_QUEUE_FULL_EXT;
      goto err;
   }

   wsi_swapchain_present_timing_notify_recycle_locked(swapchain, image);

   struct wsi_presentation_timing *wsi_timing =
      &swapchain->present_timing.timings[swapchain->present_timing.timings_count++];

   memset(wsi_timing, 0, sizeof(*wsi_timing));
   wsi_timing->serial = ++swapchain->present_timing.serial;
   wsi_timing->target_time = timing->targetTime;
   wsi_timing->present_id = present_id;
   wsi_timing->requested_feedback = timing->presentStageQueries;
   wsi_timing->image = image;

   /* Ignore the time domain since we have a static domain. */

err:
   mtx_unlock(&swapchain->present_timing.lock);
   return vr;
}

void
wsi_swapchain_present_timing_notify_completion(struct wsi_swapchain *chain,
                                               uint64_t timing_serial,
                                               uint64_t timestamp,
                                               struct wsi_image *image)
{
   assert(chain->present_timing.active);
   mtx_lock(&chain->present_timing.lock);

   for (size_t i = 0; i < chain->present_timing.timings_count; i++) {
      if (chain->present_timing.timings[i].serial == timing_serial) {
         chain->present_timing.timings[i].complete_time = timestamp;
         chain->present_timing.timings[i].complete = VK_TRUE;

         /* It's possible that QueuePresentKHR already handled the queue done timestamp
          * for us, since the image was recycled before presentation could fully complete.
          * In this case, we no longer own the timestamp query pool index, so just skip. */
         if (chain->present_timing.timings[i].image != image)
            break;

         /* 0 means unknown. The application can probably fall back to its own timestamps
          * if it wants to. */
         chain->present_timing.timings[i].queue_done_time = 0;
         wsi_swapchain_present_timing_sample_query_pool(chain, &chain->present_timing.timings[i], image, timestamp);
         chain->present_timing.timings[i].image = NULL;
         break;
      }
   }

   mtx_unlock(&chain->present_timing.lock);
}

void
wsi_swapchain_present_timing_update_refresh_rate(struct wsi_swapchain *chain,
                                                 uint64_t refresh_duration,
                                                 uint64_t refresh_interval,
                                                 int minimum_delta_for_update)
{
   mtx_lock(&chain->present_timing.lock);

   int64_t duration_delta = llabs((int64_t)refresh_duration - (int64_t)chain->present_timing.refresh_duration);
   int64_t interval_delta = llabs((int64_t)refresh_interval - (int64_t)chain->present_timing.refresh_interval);

   /* When the refresh rate is an estimate, the value may fluctuate slightly from frame
    * to frame; don't spam refresh counter updates unless there is a meaningful delta.
    * Applications that use absolute timings are expected to recalibrate based on feedback. */
   if (duration_delta > minimum_delta_for_update || interval_delta > minimum_delta_for_update ||
       chain->present_timing.refresh_counter == 0) {
      /* We'll report this updated refresh counter in feedback,
       * so that the application knows to requery the refresh rate. */
      chain->present_timing.refresh_counter++;
      chain->present_timing.refresh_duration = refresh_duration;
      chain->present_timing.refresh_interval = refresh_interval;
   }

   mtx_unlock(&chain->present_timing.lock);
}

VKAPI_ATTR VkResult VKAPI_CALL
wsi_GetPastPresentationTimingEXT(
   VkDevice device,
   const VkPastPresentationTimingInfoEXT* pPastPresentationTimingInfo,
   VkPastPresentationTimingPropertiesEXT* pPastPresentationTimingProperties)
{
   VK_FROM_HANDLE(wsi_swapchain, swapchain, pPastPresentationTimingInfo->swapchain);
   VkResult vr = VK_SUCCESS;
   bool out_of_order = (pPastPresentationTimingInfo->flags &
                        VK_PAST_PRESENTATION_TIMING_ALLOW_OUT_OF_ORDER_RESULTS_BIT_EXT) != 0;

   if (swapchain->poll_timing_request)
      swapchain->poll_timing_request(swapchain);

   mtx_lock(&swapchain->present_timing.lock);

   pPastPresentationTimingProperties->timingPropertiesCounter = swapchain->present_timing.refresh_counter;
   pPastPresentationTimingProperties->timeDomainsCounter = 1;

   /* This implementation always returns results in-order, so we can ignore the
    * out-of-order flag. TODO: Honor the partial results flag. */

   uint32_t done_count = 0;
   for (uint32_t i = 0; i < swapchain->present_timing.timings_count; i++) {
      /* If different presents request different kinds of state, we may get completion
       * out of order. If the flag is not set, we cannot report frame N until we have
       * completed all frames M < N. */
      if (swapchain->present_timing.timings[i].complete)
         done_count++;
      else if (!out_of_order)
         break;
   }

   /* We don't remove timing info from the queue until it is consumed. */
   if (!pPastPresentationTimingProperties->pPresentationTimings) {
      pPastPresentationTimingProperties->presentationTimingCount = done_count;
      mtx_unlock(&swapchain->present_timing.lock);
      return VK_SUCCESS;
   }

   VK_OUTARRAY_MAKE_TYPED(VkPastPresentationTimingEXT, timings,
                          pPastPresentationTimingProperties->pPresentationTimings,
                          &pPastPresentationTimingProperties->presentationTimingCount);

   uint32_t new_timings_count = 0;
   bool stop_timing_removal = false;

   for (uint32_t i = 0; i < swapchain->present_timing.timings_count; i++) {
      const struct wsi_presentation_timing *in_timing = &swapchain->present_timing.timings[i];

      if (!swapchain->present_timing.timings[i].complete || stop_timing_removal) {
         /* Keep the output ordered to be compliant without having to re-sort every time.
          * The queue depth for timestamps is expected to be small. */
         swapchain->present_timing.timings[new_timings_count++] = swapchain->present_timing.timings[i];
         if (!out_of_order)
            stop_timing_removal = true;
         continue;
      }

      vk_outarray_append_typed(VkPastPresentationTimingEXT, &timings, timing) {
         timing->targetTime = swapchain->present_timing.timings[i].target_time;
         timing->presentId = in_timing->present_id;
         timing->timeDomain = swapchain->present_timing.time_domain;
         timing->timeDomainId = 0;
         timing->reportComplete = in_timing->complete;

         /* No INCOMPLETE is reported here. Failures are silent.
          * However, the application already knows the upper bound for the stage count
          * based on the query, so this should never fail. */
         VK_OUTARRAY_MAKE_TYPED(VkPresentStageTimeEXT, stages, timing->pPresentStages, &timing->presentStageCount);

         if (in_timing->requested_feedback & VK_PRESENT_STAGE_QUEUE_OPERATIONS_END_BIT_EXT) {
            vk_outarray_append_typed(VkPresentStageTimeEXT, &stages, stage) {
               stage->stage = VK_PRESENT_STAGE_QUEUE_OPERATIONS_END_BIT_EXT;
               stage->time = in_timing->queue_done_time;
            }
         }

         if (in_timing->requested_feedback & ~VK_PRESENT_STAGE_QUEUE_OPERATIONS_END_BIT_EXT) {
            vk_outarray_append_typed(VkPresentStageTimeEXT, &stages, stage) {
               stage->stage = in_timing->requested_feedback & ~VK_PRESENT_STAGE_QUEUE_OPERATIONS_END_BIT_EXT;
               /* It is expected that the implementation will only expose one timing value. */
               assert(util_bitcount(stage->stage) == 1);
               stage->time = in_timing->complete_time;
            }
         }
      }
   }

   swapchain->present_timing.timings_count = new_timings_count;
   vr = vk_outarray_status(&timings);

   /* This function must be fully atomic within the implementation, so it has to be thread safe. */
   mtx_unlock(&swapchain->present_timing.lock);
   return vr;
}

VKAPI_ATTR VkResult VKAPI_CALL
wsi_GetSwapchainTimeDomainPropertiesEXT(
   VkDevice device,
   VkSwapchainKHR swapchain_,
   VkSwapchainTimeDomainPropertiesEXT* pSwapchainTimeDomainProperties,
   uint64_t* pTimeDomainsCounter)
{
   VK_FROM_HANDLE(wsi_swapchain, swapchain, swapchain_);

   /* We don't change time domains. Everything is static. */
   if (pTimeDomainsCounter)
      *pTimeDomainsCounter = 1;

   /* This style is a bit goofy and doesn't map cleanly to anything. */
   if (!pSwapchainTimeDomainProperties->pTimeDomainIds && !pSwapchainTimeDomainProperties->pTimeDomains) {
      pSwapchainTimeDomainProperties->timeDomainCount = 1;
      return VK_SUCCESS;
   } else if (pSwapchainTimeDomainProperties->timeDomainCount == 0) {
      return VK_INCOMPLETE;
   }

   pSwapchainTimeDomainProperties->timeDomainCount = 1;
   if (pSwapchainTimeDomainProperties->pTimeDomains)
      *pSwapchainTimeDomainProperties->pTimeDomains = swapchain->present_timing.time_domain;
   if (pSwapchainTimeDomainProperties->pTimeDomainIds)
      *pSwapchainTimeDomainProperties->pTimeDomainIds = 0;

   return VK_SUCCESS;
}

VKAPI_ATTR VkResult VKAPI_CALL
wsi_GetSwapchainTimingPropertiesEXT(
   VkDevice device,
   VkSwapchainKHR swapchain_,
   VkSwapchainTimingPropertiesEXT* pSwapchainTimingProperties,
   uint64_t* pSwapchainTimingPropertiesCounter)
{
   VK_FROM_HANDLE(wsi_swapchain, swapchain, swapchain_);
   VkResult vr;

   mtx_lock(&swapchain->present_timing.lock);
   /* If we don't have data yet, must return VK_NOT_READY. */
   vr = swapchain->present_timing.refresh_counter ? VK_SUCCESS : VK_NOT_READY;
   pSwapchainTimingProperties->refreshInterval = swapchain->present_timing.refresh_interval;
   pSwapchainTimingProperties->refreshDuration = swapchain->present_timing.refresh_duration;
   if (pSwapchainTimingPropertiesCounter)
      *pSwapchainTimingPropertiesCounter = swapchain->present_timing.refresh_counter;
   mtx_unlock(&swapchain->present_timing.lock);
   return vr;
}

VKAPI_ATTR VkResult VKAPI_CALL
wsi_SetSwapchainPresentTimingQueueSizeEXT(
   VkDevice device,
   VkSwapchainKHR swapchain_,
   uint32_t size)
{
   VK_FROM_HANDLE(wsi_swapchain, swapchain, swapchain_);
   assert(swapchain->present_timing.active);
   VkResult vr = VK_SUCCESS;

   mtx_lock(&swapchain->present_timing.lock);

   if (size < swapchain->present_timing.timings_count) {
      vr = VK_NOT_READY;
      goto error;
   }

   if (size > swapchain->present_timing.timings_capacity) {
      void *new_ptr = vk_realloc(&swapchain->alloc, swapchain->present_timing.timings,
                                 sizeof(*swapchain->present_timing.timings) * size, 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
      if (new_ptr) {
         swapchain->present_timing.timings = new_ptr;
         swapchain->present_timing.timings_capacity = size;
      } else {
         vr = VK_ERROR_OUT_OF_HOST_MEMORY;
         goto error;
      }
   } else {
      swapchain->present_timing.timings_capacity = size;
   }

error:
   mtx_unlock(&swapchain->present_timing.lock);
   return vr;
}

VkDeviceMemory
wsi_common_get_memory(VkSwapchainKHR _swapchain, uint32_t index)
{
@@ -1521,6 +2058,50 @@ wsi_common_queue_present(const struct wsi_device *wsi,
      vk_find_struct_const(pPresentInfo->pNext, PRESENT_ID_2_KHR);
   const VkSwapchainPresentFenceInfoKHR *present_fence_info =
      vk_find_struct_const(pPresentInfo->pNext, SWAPCHAIN_PRESENT_FENCE_INFO_KHR);
   const VkPresentTimingsInfoEXT *present_timings_info =
      vk_find_struct_const(pPresentInfo->pNext, PRESENT_TIMINGS_INFO_EXT);

   bool needs_timing_command_buffer = false;

   if (present_timings_info) {
      /* If we fail a present due to a full queue, it's a little unclear from the
       * spec if we should treat it as OUT_OF_DATE or OUT_OF_HOST_MEMORY for
       * purposes of signaling. Validation layers and at least one other implementation
       * in the wild seem to treat it as OUT_OF_DATE, so do that. */
      for (uint32_t i = 0; i < present_timings_info->swapchainCount; i++) {
         const VkPresentTimingInfoEXT *info = &present_timings_info->pTimingInfos[i];
         VK_FROM_HANDLE(wsi_swapchain, swapchain, pPresentInfo->pSwapchains[i]);
         if (results[i] != VK_SUCCESS || !swapchain->set_timing_request)
            continue;

         assert(swapchain->present_timing.active);

         uint32_t image_index = pPresentInfo->pImageIndices[i];

         /* EXT_present_timing is defined to only work with present_id2.
          * It's only used when reporting back timings. */
         results[i] = wsi_common_allocate_timing_request(
            swapchain, info, present_ids2 ? present_ids2->pPresentIds[i] : 0,
            swapchain->get_wsi_image(swapchain, image_index));

         /* The application is responsible for allocating sufficient size here.
          * We fail with VK_ERROR_PRESENT_TIMING_QUEUE_FULL_EXT if the application is bugged. */
         if (results[i] == VK_SUCCESS) {
            swapchain->set_timing_request(swapchain, &(struct wsi_image_timing_request) {
               .serial = swapchain->present_timing.serial,
               .time = info->targetTime,
               .flags = info->flags,
            });

            if (info->presentStageQueries & VK_PRESENT_STAGE_QUEUE_OPERATIONS_END_BIT_EXT) {
               /* It's not a problem if we redundantly submit timing command buffers.
                * VUID-12234 also says all swapchains in this present must have been
                * created with present timing enabled. */
               needs_timing_command_buffer = true;
            }
         }
      }
   }

   /* Gather up all the semaphores and fences we need to signal per-image */
   STACK_ARRAY(struct wsi_image_signal_info, image_signal_infos,
@@ -1596,15 +2177,15 @@ wsi_common_queue_present(const struct wsi_device *wsi,
    * the per-image semaphores and fences with the blit.
    */
   {
      STACK_ARRAY(VkCommandBufferSubmitInfo, blit_command_buffer_infos,
                  pPresentInfo->swapchainCount);
      STACK_ARRAY(VkCommandBufferSubmitInfo, command_buffer_infos,
                  pPresentInfo->swapchainCount * 2);
      STACK_ARRAY(VkSemaphoreSubmitInfo, signal_semaphore_infos,
                  pPresentInfo->swapchainCount *
                  ARRAY_SIZE(image_signal_infos[0].semaphore_infos));
      STACK_ARRAY(VkFence, fences,
                  pPresentInfo->swapchainCount *
                  ARRAY_SIZE(image_signal_infos[0].fences));
      uint32_t blit_count = 0, signal_semaphore_count = 0, fence_count = 0;
      uint32_t command_buffer_count = 0, signal_semaphore_count = 0, fence_count = 0;

      for (uint32_t i = 0; i < pPresentInfo->swapchainCount; i++) {
         VK_FROM_HANDLE(wsi_swapchain, swapchain, pPresentInfo->pSwapchains[i]);

@@ -1612,14 +2193,27 @@ wsi_common_queue_present(const struct wsi_device *wsi,
         struct wsi_image *image =
            swapchain->get_wsi_image(swapchain, image_index);

         bool separate_queue_blit = swapchain->blit.type != WSI_SWAPCHAIN_NO_BLIT &&
                                    swapchain->blit.queue != NULL;

         /* For TIMING_QUEUE_FULL_EXT, ensure sync objects are signaled,
          * but don't do any real work. */
         if (results[i] == VK_ERROR_PRESENT_TIMING_QUEUE_FULL_EXT || !separate_queue_blit) {
            for (uint32_t j = 0; j < image_signal_infos[i].semaphore_count; j++) {
               signal_semaphore_infos[signal_semaphore_count++] =
                  image_signal_infos[i].semaphore_infos[j];
            }
            for (uint32_t j = 0; j < image_signal_infos[i].fence_count; j++)
               fences[fence_count++] = image_signal_infos[i].fences[j];
         }

         if (results[i] != VK_SUCCESS)
            continue;

         /* If we're blitting on another swapchain, just signal the blit
          * semaphore for now.
          */
         if (swapchain->blit.type != WSI_SWAPCHAIN_NO_BLIT &&
             swapchain->blit.queue != NULL) {
         if (separate_queue_blit) {
            /* Create the blit semaphore if needed */
            if (swapchain->blit.semaphores[image_index] == VK_NULL_HANDLE) {
               const VkSemaphoreCreateInfo sem_info = {
@@ -1644,27 +2238,27 @@ wsi_common_queue_present(const struct wsi_device *wsi,
         }

         if (swapchain->blit.type != WSI_SWAPCHAIN_NO_BLIT) {
            blit_command_buffer_infos[blit_count++] = (VkCommandBufferSubmitInfo) {
            command_buffer_infos[command_buffer_count++] = (VkCommandBufferSubmitInfo) {
               .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_SUBMIT_INFO,
               .commandBuffer =
                  image->blit.cmd_buffers[queue->queue_family_index],
            };
         }

         for (uint32_t j = 0; j < image_signal_infos[i].semaphore_count; j++) {
            signal_semaphore_infos[signal_semaphore_count++] =
               image_signal_infos[i].semaphore_infos[j];
         if (needs_timing_command_buffer) {
            command_buffer_infos[command_buffer_count++] = (VkCommandBufferSubmitInfo) {
               .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_SUBMIT_INFO,
               .commandBuffer = image->timestamp_cmd_buffers[queue->queue_family_index],
            };
         }
         for (uint32_t j = 0; j < image_signal_infos[i].fence_count; j++)
            fences[fence_count++] = image_signal_infos[i].fences[j];
      }

      const VkSubmitInfo2 submit_info = {
         .sType = VK_STRUCTURE_TYPE_SUBMIT_INFO_2,
         .waitSemaphoreInfoCount = pPresentInfo->waitSemaphoreCount,
         .pWaitSemaphoreInfos = semaphore_wait_infos,
         .commandBufferInfoCount = blit_count,
         .pCommandBufferInfos = blit_command_buffer_infos,
         .commandBufferInfoCount = command_buffer_count,
         .pCommandBufferInfos = command_buffer_infos,
         .signalSemaphoreInfoCount = signal_semaphore_count,
         .pSignalSemaphoreInfos = signal_semaphore_infos,
      };

@@ -1680,7 +2274,7 @@ wsi_common_queue_present(const struct wsi_device *wsi,

      STACK_ARRAY_FINISH(fences);
      STACK_ARRAY_FINISH(signal_semaphore_infos);
      STACK_ARRAY_FINISH(blit_command_buffer_infos);
      STACK_ARRAY_FINISH(command_buffer_infos);
   }

   /* Now do blits on any blit queues */

@@ -1693,8 +2287,10 @@ wsi_common_queue_present(const struct wsi_device *wsi,
      if (results[i] != VK_SUCCESS)
         continue;

      if (swapchain->blit.type == WSI_SWAPCHAIN_NO_BLIT ||
          swapchain->blit.queue == NULL)
      bool separate_queue_blit = swapchain->blit.type != WSI_SWAPCHAIN_NO_BLIT &&
                                 swapchain->blit.queue != NULL;

      if (!separate_queue_blit)
         continue;

      const VkSemaphoreSubmitInfo blit_semaphore_info = {

@@ -1703,17 +2299,27 @@ wsi_common_queue_present(const struct wsi_device *wsi,
         .semaphore = swapchain->blit.semaphores[image_index],
      };

      const VkCommandBufferSubmitInfo blit_command_buffer_info = {
      VkCommandBufferSubmitInfo command_buffer_infos[2];
      uint32_t command_buffer_count = 0;

      command_buffer_infos[command_buffer_count++] = (VkCommandBufferSubmitInfo) {
         .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_SUBMIT_INFO,
         .commandBuffer = image->blit.cmd_buffers[0],
      };

      if (needs_timing_command_buffer) {
         command_buffer_infos[command_buffer_count++] = (VkCommandBufferSubmitInfo) {
            .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_SUBMIT_INFO,
            .commandBuffer = image->timestamp_cmd_buffers[0],
         };
      }

      const VkSubmitInfo2 submit_info = {
         .sType = VK_STRUCTURE_TYPE_SUBMIT_INFO_2,
         .waitSemaphoreInfoCount = 1,
         .pWaitSemaphoreInfos = &blit_semaphore_info,
         .commandBufferInfoCount = 1,
         .pCommandBufferInfos = &blit_command_buffer_info,
         .commandBufferInfoCount = command_buffer_count,
         .pCommandBufferInfos = command_buffer_infos,
         .signalSemaphoreInfoCount = image_signal_infos[i].semaphore_count,
         .pSignalSemaphoreInfos = image_signal_infos[i].semaphore_infos,
      };

@@ -62,6 +62,8 @@ struct wsi_device {
   VkPhysicalDeviceMemoryProperties memory_props;
   uint32_t queue_family_count;
   uint64_t queue_supports_blit;
   uint64_t queue_supports_timestamps;
   float timestamp_period;

   VkPhysicalDeviceDrmPropertiesEXT drm_info;
   VkPhysicalDevicePCIBusInfoPropertiesEXT pci_bus_info;

@@ -201,28 +203,37 @@ struct wsi_device {
   WSI_CB(CmdPipelineBarrier);
   WSI_CB(CmdCopyImage);
   WSI_CB(CmdCopyImageToBuffer);
   WSI_CB(CmdResetQueryPool);
   WSI_CB(CmdWriteTimestamp);
   WSI_CB(CreateBuffer);
   WSI_CB(CreateCommandPool);
   WSI_CB(CreateFence);
   WSI_CB(CreateImage);
   WSI_CB(CreateQueryPool);
   WSI_CB(CreateSemaphore);
   WSI_CB(DestroyBuffer);
   WSI_CB(DestroyCommandPool);
   WSI_CB(DestroyFence);
   WSI_CB(DestroyImage);
   WSI_CB(DestroyQueryPool);
   WSI_CB(DestroySemaphore);
   WSI_CB(EndCommandBuffer);
   WSI_CB(FreeMemory);
   WSI_CB(FreeCommandBuffers);
   WSI_CB(GetBufferMemoryRequirements);
   WSI_CB(GetCalibratedTimestampsKHR);
   WSI_CB(GetFenceStatus);
   WSI_CB(GetImageDrmFormatModifierPropertiesEXT);
   WSI_CB(GetImageMemoryRequirements);
   WSI_CB(GetImageSubresourceLayout);
   WSI_CB(GetMemoryFdKHR);
   WSI_CB(GetPhysicalDeviceCalibrateableTimeDomainsKHR);
   WSI_CB(GetPhysicalDeviceProperties);
   WSI_CB(GetPhysicalDeviceFormatProperties);
   WSI_CB(GetPhysicalDeviceFormatProperties2);
   WSI_CB(GetPhysicalDeviceImageFormatProperties2);
   WSI_CB(GetPhysicalDeviceQueueFamilyProperties);
   WSI_CB(GetQueryPoolResults);
   WSI_CB(GetSemaphoreFdKHR);
   WSI_CB(ResetFences);
   WSI_CB(QueueSubmit2);
@@ -156,6 +156,12 @@ enum colorspace_enum {
   COLORSPACE_ENUM_MAX,
};

enum vrr_tristate {
   VRR_TRISTATE_UNKNOWN,
   VRR_TRISTATE_DISABLED,
   VRR_TRISTATE_ENABLED,
};

typedef struct wsi_display_connector_metadata {
   VkHdrMetadataEXT hdr_metadata;
   bool supports_st2084;

@@ -185,6 +191,10 @@ typedef struct wsi_display_connector {
   struct wsi_display_connector_metadata metadata;
   uint32_t count_formats;
   uint32_t *formats;
   enum vrr_tristate vrr_capable;
   enum vrr_tristate vrr_enabled;
   uint64_t last_frame;
   uint64_t last_nsec;
} wsi_display_connector;

struct wsi_display {

@@ -370,6 +380,11 @@ find_properties(struct wsi_display_connector *connector, uint32_t count_props, u
      }
   }

   if (!strcmp(prop->name, "vrr_capable"))
      connector->vrr_capable = prop_values[p] != 0 ? VRR_TRISTATE_ENABLED : VRR_TRISTATE_DISABLED;
   if (!strcmp(prop->name, "VRR_ENABLED"))
      connector->vrr_enabled = prop_values[p] != 0 ? VRR_TRISTATE_ENABLED : VRR_TRISTATE_DISABLED;

   drmModeFreeProperty(prop);
}

@@ -431,38 +446,45 @@ find_connector_properties(struct wsi_display_connector *connector, drmModeConnec
enum wsi_image_state {
   WSI_IMAGE_IDLE,
   WSI_IMAGE_DRAWING,
   WSI_IMAGE_WAITING,
   WSI_IMAGE_QUEUED_AFTER_WAIT,
   WSI_IMAGE_QUEUED,
   WSI_IMAGE_FLIPPING,
   WSI_IMAGE_DISPLAYING
};

struct wsi_display_image {
   struct wsi_image base;
   struct wsi_display_swapchain *chain;
   enum wsi_image_state state;
   uint32_t fb_id;
   uint32_t buffer[4];
   uint64_t flip_sequence;
   uint64_t present_id;
   struct wsi_image_timing_request timing_request;
   struct wsi_display_fence *fence;
   uint64_t minimum_ns;
};

struct wsi_display_swapchain {
   struct wsi_swapchain base;
   struct wsi_display *wsi;
   VkIcdSurfaceDisplay *surface;
   uint64_t flip_sequence;
   VkResult status;

   mtx_t present_id_mutex;
   struct u_cnd_monotonic present_id_cond;
   uint64_t present_id;
   VkResult present_id_error;

   /* A unique ID for the color outcome of the swapchain. A serial of 0 means unset/default. */
   uint64_t color_outcome_serial;
   VkHdrMetadataEXT hdr_metadata;

   struct wsi_image_timing_request timing_request;

   struct wsi_display_image images[0];
};

struct wsi_display_fence {

@@ -473,6 +495,9 @@ struct wsi_display_fence {
   uint32_t syncobj; /* syncobj to signal on event */
   uint64_t sequence;
   bool device_event; /* fence is used for device events */
   struct wsi_display_connector *connector;
   /* Image to be flipped, if this fence is for an image in the WSI_IMAGE_WAITING
    * state that will need to move to QUEUED. */
   struct wsi_display_image *image;
};

struct wsi_display_sync {

@@ -1319,6 +1344,16 @@ wsi_display_surface_get_capabilities2(VkIcdSurfaceBase *icd_surface,
         break;
      }

      case VK_STRUCTURE_TYPE_PRESENT_TIMING_SURFACE_CAPABILITIES_EXT: {
         VkPresentTimingSurfaceCapabilitiesEXT *wait = (void *)ext;

         wait->presentStageQueries = VK_PRESENT_STAGE_IMAGE_FIRST_PIXEL_OUT_BIT_EXT;
         wait->presentTimingSupported = VK_TRUE;
         wait->presentAtAbsoluteTimeSupported = VK_TRUE;
         wait->presentAtRelativeTimeSupported = VK_TRUE;
         break;
      }

      default:
         /* Ignored */
         break;
@@ -1678,6 +1713,8 @@ wsi_display_image_init(struct wsi_swapchain *drv_chain,

   image->chain = chain;
   image->state = WSI_IMAGE_IDLE;
   image->fence = NULL;
   image->minimum_ns = 0;
   image->fb_id = 0;

   uint64_t *fb_modifiers = NULL;

@@ -1789,6 +1826,12 @@ wsi_display_idle_old_displaying(struct wsi_display_image *active_image)
static VkResult
_wsi_display_queue_next(struct wsi_swapchain *drv_chain);

static uint64_t
widen_32_to_64(uint32_t narrow, uint64_t near)
{
   return near + (int32_t)(narrow - near);
}

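/* Wraparound behaviour of widen_32_to_64() (illustration only):
 *   widen_32_to_64(2, 0xffffffff)           == 0x100000002  (counter wrapped forward)
 *   widen_32_to_64(0xfffffffe, 0x100000001) == 0xfffffffe   (slightly stale event)
 * The (int32_t) cast turns the 32-bit difference into a signed delta, so the
 * result is the 64-bit value nearest to `near` that matches `narrow` mod 2^32. */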
/**
 * Wakes up any vkWaitForPresentKHR() waiters on the last present to this
 * image.

@@ -1817,6 +1860,17 @@ wsi_display_surface_error(struct wsi_display_swapchain *swapchain, VkResult resu
   mtx_unlock(&swapchain->present_id_mutex);
}

/**
 * libdrm callback for when we get a DRM_EVENT_PAGE_FLIP in response to our
 * atomic commit with DRM_MODE_PAGE_FLIP_EVENT. That event can happen at any
 * point after vblank, when the old image is no longer being scanned out and
 * that commit is set up to be scanned out next.
 *
 * This means that we can queue up a new atomic commit, if there were presents
 * that we hadn't submitted yet (the event queue is driven by
 * wsi_display_wait_thread(), so that's what ends up submitting atomic commits
 * most of the time).
 **/
static void
wsi_display_page_flip_handler2(int fd,
                               unsigned int frame,
@@ -1828,6 +1882,28 @@ wsi_display_page_flip_handler2(int fd,
   struct wsi_display_image *image = data;
   struct wsi_display_swapchain *chain = image->chain;

   VkIcdSurfaceDisplay *surface = chain->surface;
   wsi_display_mode *display_mode =
      wsi_display_mode_from_handle(surface->displayMode);
   wsi_display_connector *connector = display_mode->connector;

   uint64_t nsec = 1000000000ull * sec + 1000ull * usec;
   /* If we're on the VRR timing path, ensure we get a stable pace. */
   nsec = MAX2(nsec, image->minimum_ns);

   uint64_t frame64 = widen_32_to_64(frame, connector->last_frame);
   connector->last_frame = frame64;
   connector->last_nsec = nsec;

   /* No need to update the refresh rate estimate here; it's static, based on the
    * mode. Notify completion before we signal present wait so that applications
    * get the lowest possible latency for present time. */
   if (image->timing_request.serial) {
      wsi_swapchain_present_timing_notify_completion(
         &chain->base, image->timing_request.serial,
         nsec, &image->base);
   }

   wsi_display_debug("image %ld displayed at %d\n",
                     image - &(image->chain->images[0]), frame);
   image->state = WSI_IMAGE_DISPLAYING;
@@ -1841,42 +1917,29 @@ wsi_display_page_flip_handler2(int fd,
   chain->status = result;
}

static void wsi_display_fence_event_handler(struct wsi_display_fence *fence,
                                            uint64_t nsec,
                                            uint64_t frame);

/**
 * libdrm callback for when we get a DRM_EVENT_CRTC_SEQUENCE in response to a
 * drmCrtcQueueSequence(), indicating that the first pixel of a new frame is
 * being scanned out.
 **/
static void wsi_display_sequence_handler(int fd, uint64_t frame,
                                         uint64_t nsec, uint64_t user_data)
{
   struct wsi_display_fence *fence =
      (struct wsi_display_fence *) (uintptr_t) user_data;

   wsi_display_fence_event_handler(fence, nsec, frame);
}

static drmEventContext event_context = {
   .version = DRM_EVENT_CONTEXT_VERSION,
#if DRM_EVENT_CONTEXT_VERSION >= 3
   .page_flip_handler = NULL,
   .page_flip_handler2 = wsi_display_page_flip_handler2,
#endif
   .vblank_handler = NULL,
   .sequence_handler = wsi_display_sequence_handler,
};
@@ -2383,13 +2446,30 @@ wsi_display_fence_check_free(struct wsi_display_fence *fence)
   vk_free(fence->wsi->alloc, fence);
}

static void wsi_display_fence_event_handler(struct wsi_display_fence *fence,
                                            uint64_t nsec, uint64_t frame)
{
   struct wsi_display_connector *connector = fence->connector;
   struct wsi_display_image *image = fence->image;

   if (fence->syncobj) {
      (void) drmSyncobjSignal(fence->wsi->syncobj_fd, &fence->syncobj, 1);
      (void) drmSyncobjDestroy(fence->wsi->syncobj_fd, fence->syncobj);
   }

   if (connector) {
      connector->last_nsec = nsec;
      connector->last_frame = frame;
   }

   if (image && image->state == WSI_IMAGE_WAITING) {
      /* We may need to do the final sleep on the CPU to resolve VRR timings. */
      image->state = WSI_IMAGE_QUEUED_AFTER_WAIT;
      VkResult result = _wsi_display_queue_next(&image->chain->base);
      if (result != VK_SUCCESS)
         image->chain->status = result;
   }

   fence->event_received = true;
   wsi_display_fence_check_free(fence);
}
@@ -2822,9 +2902,11 @@ _wsi_display_queue_next(struct wsi_swapchain *drv_chain)

   switch (tmp_image->state) {
   case WSI_IMAGE_FLIPPING:
   case WSI_IMAGE_WAITING:
      /* already flipping or waiting for a flip, don't send another to the kernel yet */
      return VK_SUCCESS;
   case WSI_IMAGE_QUEUED:
   case WSI_IMAGE_QUEUED_AFTER_WAIT:
      /* find the oldest queued */
      if (!image || tmp_image->flip_sequence < image->flip_sequence)
         image = tmp_image;
@@ -2837,6 +2919,95 @@ _wsi_display_queue_next(struct wsi_swapchain *drv_chain)
   if (!image)
      return VK_SUCCESS;

   if (image->fence) {
      image->fence->image = NULL;
      wsi_display_fence_destroy(image->fence);
      image->fence = NULL;
   }

   unsigned num_cycles_to_skip = 0;
   int64_t target_relative_ns = 0;
   bool skip_timing = false;
   bool nearest_cycle =
      (image->timing_request.flags & VK_PRESENT_TIMING_INFO_PRESENT_AT_NEAREST_REFRESH_CYCLE_BIT_EXT) != 0;

   if (image->timing_request.time != 0) {
      /* Ensure we have some kind of timebase to work from. */
      if (!connector->last_frame)
         drmCrtcGetSequence(wsi->fd, connector->crtc_id, &connector->last_frame, &connector->last_nsec);

      if (!connector->last_frame || chain->base.present_timing.refresh_duration == 0) {
         /* Something has gone very wrong. Just ignore present timing for safety. */
         skip_timing = true;
         wsi_display_debug("Cannot get a stable timebase, last frame = %"PRIu64", refresh_duration = %"PRIu64".\n",
                           connector->last_frame, chain->base.present_timing.refresh_duration);
      }
   }

   if (!skip_timing && image->state == WSI_IMAGE_QUEUED && image->timing_request.time != 0) {
      target_relative_ns = (int64_t)image->timing_request.time;

      /* We need to estimate the number of refresh cycles to wait for. */
      if (!(image->timing_request.flags & VK_PRESENT_TIMING_INFO_PRESENT_AT_RELATIVE_TIME_BIT_EXT)) {
         target_relative_ns -= (int64_t)connector->last_nsec;
      }

      if (nearest_cycle) {
         /* No need to lock, we never update refresh_duration dynamically. */
         target_relative_ns -= (int64_t)chain->base.present_timing.refresh_duration / 2;
      } else {
         /* If the application computes an exact value that lands exactly on the refresh cycle,
          * pull the estimate back a little, since DRM precision is 1us. */
         target_relative_ns -= 1000;
      }
   }

   target_relative_ns = MAX2(target_relative_ns, 0);
   if (target_relative_ns && chain->base.present_timing.refresh_duration)
      num_cycles_to_skip = target_relative_ns / chain->base.present_timing.refresh_duration;

   /* The CRTC cycle counter is not reliable on VRR. We cannot use it as a time base. */
   bool is_vrr = connector->vrr_enabled == VRR_TRISTATE_ENABLED &&
                 connector->vrr_capable == VRR_TRISTATE_ENABLED;

   if (num_cycles_to_skip) {
      if (!is_vrr) {
         /* On FRR, we can rely on vblank events to guide time progression. */
         VkDisplayKHR display = wsi_display_connector_to_handle(connector);
         image->fence = wsi_display_fence_alloc(wsi, -1);

         if (image->fence) {
            image->fence->connector = connector;
            image->fence->image = image;

            uint64_t frame_queued;
            uint64_t target_frame = connector->last_frame + num_cycles_to_skip;
            VkResult result = wsi_register_vblank_event(image->fence, chain->base.wsi, display,
                                                        0, target_frame, &frame_queued);

            if (result == VK_SUCCESS && frame_queued <= target_frame) {
               /* Wait until the vblank fence signals; the event handler will attempt to requeue us. */
               image->state = WSI_IMAGE_WAITING;
               return VK_SUCCESS;
            }
         }
      } else {
         /* On a VRR display, applications can request fractional frame times,
          * and there seems to be no good way to target an absolute time with atomic commits ... */
         int64_t target_ns = target_relative_ns + (int64_t)connector->last_nsec;
         image->minimum_ns = target_ns;

         /* Account for some minimum delay between submitting a page flip and it being
          * processed, plus sleep jitter. We will compensate for the difference if there
          * is any, so that we don't report completion times in the past. */
         target_ns -= 1 * 1000 * 1000;

         os_time_nanosleep_until(target_ns);
      }
   }
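   /* Worked example with assumed numbers (not taken from this change):
    * refresh_duration = 16666667 ns (60 Hz), last vblank at
    * last_nsec = 1000000000 ns, absolute target = 1050000000 ns with
    * NEAREST_REFRESH_CYCLE set.
    *
    *   target_relative_ns = 1050000000 - 1000000000   = 50000000
    *   nearest-cycle bias: 50000000 - 16666667 / 2    = 41666667
    *   num_cycles_to_skip = 41666667 / 16666667       = 2
    *
    * The vblank event is registered for last_frame + 2; the flip queued from
    * there scans out on the following vblank, right at the 1.05 s mark. */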
   image->state = WSI_IMAGE_QUEUED;

   int ret = drm_atomic_commit(connector, image);
   if (ret == 0) {
      image->state = WSI_IMAGE_FLIPPING;
@@ -2859,6 +3030,44 @@ _wsi_display_queue_next(struct wsi_swapchain *drv_chain)
   }
}

static void
wsi_display_set_timing_request(struct wsi_swapchain *drv_chain,
                               const struct wsi_image_timing_request *request)
{
   struct wsi_display_swapchain *chain =
      (struct wsi_display_swapchain *) drv_chain;
   chain->timing_request = *request;
}

static uint64_t
wsi_display_poll_refresh_duration(struct wsi_swapchain *drv_chain, uint64_t *interval)
{
   struct wsi_display_swapchain *chain =
      (struct wsi_display_swapchain *)drv_chain;
   VkIcdSurfaceDisplay *surface = chain->surface;
   wsi_display_mode *display_mode =
      wsi_display_mode_from_handle(surface->displayMode);
   double refresh = wsi_display_mode_refresh(display_mode);
   wsi_display_connector *connector = display_mode->connector;

   uint64_t refresh_ns = (uint64_t)(floor(1.0 / refresh * 1e9 + 0.5));

   /* Assume FRR by default. */
   *interval = refresh_ns;

   /* If VRR is not enabled on the target CRTC, we should honor that.
    * There is no mechanism to clearly request that VRR is desired,
    * so we must assume that the user might force us into FRR mode. */
   if (connector->vrr_capable == VRR_TRISTATE_ENABLED) {
      if (connector->vrr_enabled == VRR_TRISTATE_UNKNOWN)
         *interval = 0; /* Somehow we don't know if the connector is VRR or FRR. Report unknown. */
      else if (connector->vrr_enabled == VRR_TRISTATE_ENABLED)
         *interval = UINT64_MAX;
   }

   return refresh_ns;
}
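As a sanity check of the rounding above (mode numbers assumed, not from this change): a 59.94 Hz mode gives 1 / 59.94 * 1e9 = 16683350.016... ns, and floor(x + 0.5) rounds that to refresh_ns = 16683350. On a fixed-rate connector *interval is reported as the same value; with VRR enabled it becomes UINT64_MAX, and when the VRR state cannot be determined it is reported as 0 (unknown).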
static VkResult
wsi_display_queue_present(struct wsi_swapchain *drv_chain,
                          uint32_t image_index,
@@ -2876,16 +3085,19 @@ wsi_display_queue_present(struct wsi_swapchain *drv_chain,
      return chain->status;

   image->present_id = present_id;
   image->timing_request = chain->timing_request;

   assert(image->state == WSI_IMAGE_DRAWING);
   wsi_display_debug("present %d\n", image_index);

   mtx_lock(&wsi->wait_mutex);

   /* Make sure that the page flip handler is processed in finite time if using present wait
    * or presentation time. */
   if (present_id || chain->timing_request.serial)
      wsi_display_start_wait_thread(wsi);

   memset(&chain->timing_request, 0, sizeof(chain->timing_request));
   image->flip_sequence = ++chain->flip_sequence;
   image->state = WSI_IMAGE_QUEUED;
@@ -3045,6 +3257,9 @@ wsi_display_surface_create_swapchain(
   chain->base.acquire_next_image = wsi_display_acquire_next_image;
   chain->base.release_images = wsi_display_release_images;
   chain->base.queue_present = wsi_display_queue_present;
   chain->base.set_timing_request = wsi_display_set_timing_request;
   chain->base.poll_early_refresh = wsi_display_poll_refresh_duration;
   chain->base.present_timing.time_domain = VK_TIME_DOMAIN_CLOCK_MONOTONIC_KHR;
   chain->base.wait_for_present = wsi_display_wait_for_present;
   chain->base.wait_for_present2 = wsi_display_wait_for_present;
   chain->base.set_hdr_metadata = wsi_display_set_hdr_metadata;
@@ -112,6 +112,16 @@ wsi_headless_surface_get_capabilities2(VkIcdSurfaceBase *surface,
      break;
   }

   case VK_STRUCTURE_TYPE_PRESENT_TIMING_SURFACE_CAPABILITIES_EXT: {
      VkPresentTimingSurfaceCapabilitiesEXT *wait = (void *)ext;

      wait->presentStageQueries = 0;
      wait->presentTimingSupported = VK_FALSE;
      wait->presentAtAbsoluteTimeSupported = VK_FALSE;
      wait->presentAtRelativeTimeSupported = VK_FALSE;
      break;
   }

   default:
      /* Ignored */
      break;
@@ -139,6 +139,16 @@ wsi_metal_surface_get_capabilities2(VkIcdSurfaceBase *surface,
      break;
   }

   case VK_STRUCTURE_TYPE_PRESENT_TIMING_SURFACE_CAPABILITIES_EXT: {
      VkPresentTimingSurfaceCapabilitiesEXT *wait = (void *)ext;

      wait->presentStageQueries = 0;
      wait->presentTimingSupported = VK_FALSE;
      wait->presentAtAbsoluteTimeSupported = VK_FALSE;
      wait->presentAtRelativeTimeSupported = VK_FALSE;
      break;
   }

   default:
      /* Ignored */
      break;
@@ -188,6 +188,29 @@ struct wsi_image {
   int dma_buf_fd;
#endif
   void *cpu_map;

   VkQueryPool query_pool;
   VkCommandBuffer *timestamp_cmd_buffers;
};

struct wsi_presentation_timing {
   uint64_t present_id;
   uint64_t target_time;
   uint64_t serial;
   uint64_t queue_done_time; /* GPU timestamp based. */
   uint64_t complete_time; /* Best effort timestamp we get from backend. */
   /* If we're rendering with IMMEDIATE, it's possible for images to IDLE long before they complete.
    * In this case, we have to ensure that queue_done_time is sampled at QueuePresentKHR time
    * before we recycle an image. */
   struct wsi_image *image;
   VkPresentStageFlagsEXT requested_feedback;
   VkBool32 complete;
};

struct wsi_image_timing_request {
   uint64_t serial;
   uint64_t time;
   VkPresentTimingInfoFlagsEXT flags;
};
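A sketch of the contract these structs imply between the common layer and a backend; the values below are invented for illustration and this is not a complete driver flow:

/* A timing request is latched before queue_present and identified by its
 * serial; the backend echoes the serial back when the present completes:
 *
 *   struct wsi_image_timing_request req = {
 *      .serial = 42,          // matched later in notify_completion
 *      .time   = 16666667,    // relative: one 60 Hz frame from now
 *      .flags  = VK_PRESENT_TIMING_INFO_PRESENT_AT_RELATIVE_TIME_BIT_EXT,
 *   };
 *   chain->set_timing_request(chain, &req);
 *   chain->queue_present(chain, image_index, ...);
 *   ...
 *   // later, from the backend's completion path:
 *   wsi_swapchain_present_timing_notify_completion(chain, 42, actual_ns, image);
 */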
struct wsi_swapchain {
@@ -237,7 +260,28 @@ struct wsi_swapchain {
      struct vk_queue *queue;
   } blit;

   struct {
      mtx_t lock;
      bool active;

      struct wsi_presentation_timing *timings;
      size_t timings_capacity;
      size_t timings_count;

      size_t serial;

      /* Maps to Vulkan spec definitions. */
      uint64_t refresh_duration;
      uint64_t refresh_interval;
      /* When 0, we don't know yet. Every time the refresh rate changes,
       * increase this counter. This counter must also be passed in GetPastTimings. */
      uint64_t refresh_counter;

      VkTimeDomainKHR time_domain;
   } present_timing;

   bool capture_key_pressed;
   float timestamp_period;

   /* Command pools, one per queue family */
   VkCommandPool *cmd_pools;
@@ -266,6 +310,10 @@ struct wsi_swapchain {
                            VkPresentModeKHR mode);
   void (*set_hdr_metadata)(struct wsi_swapchain *swap_chain,
                            const VkHdrMetadataEXT* pMetadata);
   void (*set_timing_request)(struct wsi_swapchain *swap_chain,
                              const struct wsi_image_timing_request *request);
   void (*poll_timing_request)(struct wsi_swapchain *swap_chain);
   uint64_t (*poll_early_refresh)(struct wsi_swapchain *swap_chain, uint64_t *interval);
};

bool
@@ -369,6 +417,10 @@ wsi_create_image(const struct wsi_swapchain *chain,
void
wsi_image_init(struct wsi_image *image);

VkResult
wsi_image_init_timestamp(const struct wsi_swapchain *chain,
                         struct wsi_image *image);

void
wsi_destroy_image(const struct wsi_swapchain *chain,
                  struct wsi_image *image);
@@ -377,6 +429,16 @@ VkResult
wsi_swapchain_wait_for_present_semaphore(const struct wsi_swapchain *chain,
                                         uint64_t present_id, uint64_t timeout);

void
wsi_swapchain_present_timing_notify_completion(struct wsi_swapchain *chain,
                                               uint64_t timing_serial, uint64_t timestamp,
                                               struct wsi_image *image);

void
wsi_swapchain_present_timing_update_refresh_rate(struct wsi_swapchain *chain,
                                                 uint64_t refresh_duration, uint64_t refresh_interval,
                                                 int minimum_delta_for_update);

#ifdef HAVE_LIBDRM
VkResult
wsi_prepare_signal_dma_buf_from_semaphore(struct wsi_swapchain *chain,
@@ -254,6 +254,8 @@ struct wsi_wl_swapchain {
      bool has_hdr_metadata;
   } color;

   struct wsi_image_timing_request timing_request;

   struct wsi_wl_image images[0];
};
VK_DEFINE_NONDISP_HANDLE_CASTS(wsi_wl_swapchain, base.base, VkSwapchainKHR,
@@ -1668,7 +1670,15 @@ wsi_GetPhysicalDeviceWaylandPresentationSupportKHR(VkPhysicalDevice physicalDevice,
   struct wsi_wayland *wsi =
      (struct wsi_wayland *)wsi_device->wsi[VK_ICD_WSI_PLATFORM_WAYLAND];

   /* These should overlap. */
   uint64_t effective_queues = wsi_device->queue_supports_blit & wsi_device->queue_supports_timestamps;

   /* If there are no queues that support both blits and timestamps,
    * don't report support for queue timestamps. */
   if (!effective_queues)
      effective_queues = wsi_device->queue_supports_blit;

   if (!(effective_queues & BITFIELD64_BIT(queueFamilyIndex)))
      return false;

   struct wsi_wl_display display;
@@ -1789,7 +1799,8 @@ wsi_wl_surface_get_capabilities(VkIcdSurfaceBase *icd_surface,
static VkResult
wsi_wl_surface_check_presentation(VkIcdSurfaceBase *icd_surface,
                                  struct wsi_device *wsi_device,
                                  bool *has_wp_presentation, clockid_t *clock_id,
                                  bool *has_commit_timing, bool *has_fifo)
{
   VkIcdSurfaceWayland *surface = (VkIcdSurfaceWayland *)icd_surface;
   struct wsi_wayland *wsi =
@@ -1800,7 +1811,17 @@ wsi_wl_surface_check_presentation(VkIcdSurfaceBase *icd_surface,
                     wsi_device->sw, "mesa check wp_presentation"))
      return VK_ERROR_SURFACE_LOST_KHR;

   if (has_wp_presentation)
      *has_wp_presentation = !!display.wp_presentation_notwrapped;

   if (clock_id)
      *clock_id = display.presentation_clock_id;

   if (has_commit_timing)
      *has_commit_timing = !!display.commit_timing_manager;

   if (has_fifo)
      *has_fifo = !!display.fifo_manager;

   wsi_wl_display_finish(&display);
@@ -1893,7 +1914,7 @@ wsi_wl_surface_get_capabilities2(VkIcdSurfaceBase *surface,
   bool has_feedback;

   result = wsi_wl_surface_check_presentation(surface, wsi_device,
                                              &has_feedback, NULL, NULL, NULL);
   if (result != VK_SUCCESS)
      return result;
@@ -1906,7 +1927,7 @@ wsi_wl_surface_get_capabilities2(VkIcdSurfaceBase *surface,
   bool has_feedback;

   result = wsi_wl_surface_check_presentation(surface, wsi_device,
                                              &has_feedback, NULL, NULL, NULL);
   if (result != VK_SUCCESS)
      return result;
@@ -1914,6 +1935,50 @@ wsi_wl_surface_get_capabilities2(VkIcdSurfaceBase *surface,
      break;
   }

   case VK_STRUCTURE_TYPE_PRESENT_TIMING_SURFACE_CAPABILITIES_EXT: {
      VkPresentTimingSurfaceCapabilitiesEXT *wait = (void *)ext;
      bool has_feedback, has_commit_timing, has_fifo;

      wait->presentStageQueries = 0;
      wait->presentTimingSupported = VK_FALSE;
      wait->presentAtAbsoluteTimeSupported = VK_FALSE;
      wait->presentAtRelativeTimeSupported = VK_FALSE;

      clockid_t clock_id;

      result = wsi_wl_surface_check_presentation(surface, wsi_device,
                                                 &has_feedback, &clock_id,
                                                 &has_commit_timing, &has_fifo);

      if (result != VK_SUCCESS)
         return result;

      if (!has_feedback)
         break;

      /* We could deal with esoteric clock domains by exposing VK_TIME_DOMAIN_SWAPCHAIN or PRESENT_STAGE_LOCAL,
       * but that requires a lot more scaffolding, and there's no need to add extra complexity if we can
       * get away with this. */
      if (clock_id != CLOCK_MONOTONIC && clock_id != CLOCK_MONOTONIC_RAW)
         break;

      /* The present timing spec says the reported time targets "the pixel being visible".
       * From the presentation-time spec: "Note, that if the display path has a non-zero latency,
       * the time instant specified by this counter may differ from the timestamp's."
       * No compositor I know of takes display latency into account when reporting,
       * so it's a little unclear whether we should actually report PIXEL_OUT or PIXEL_VISIBLE.
       * Choose PIXEL_OUT for now, since no known compositor actually implements
       * PIXEL_VISIBLE as intended, and we don't want to promise something we cannot keep. */
      wait->presentTimingSupported = VK_TRUE;
      wait->presentStageQueries = VK_PRESENT_STAGE_IMAGE_FIRST_PIXEL_OUT_BIT_EXT;

      /* We cannot reliably implement the FIFO guarantee + absolute time without the FIFO barrier.
       * Presentation timing is only defined to work with FIFO (and its variants like RELAXED and LATEST_READY). */
      wait->presentAtAbsoluteTimeSupported = has_commit_timing && has_fifo;

      break;
   }

   default:
      /* Ignored */
      break;
@@ -2404,6 +2469,7 @@ struct wsi_wl_present_id {
    * which uses a frame callback to signal DRI3 COMPLETE. */
   struct wl_callback *frame;
   uint64_t present_id;
   uint64_t timing_serial;
   struct mesa_trace_flow flow;
   uint64_t submission_time;
   const VkAllocationCallbacks *alloc;

@@ -2411,6 +2477,8 @@ struct wsi_wl_present_id {
   uint64_t target_time;
   uint64_t correction;
   struct wl_list link;
   struct wsi_image *img;
   bool user_target_time;
};

static struct wsi_image *
@@ -2441,6 +2509,14 @@ wsi_wl_swapchain_set_present_mode(struct wsi_swapchain *wsi_chain,
   chain->base.present_mode = mode;
}

static void
wsi_wl_swapchain_set_timing_request(struct wsi_swapchain *wsi_chain,
                                    const struct wsi_image_timing_request *request)
{
   struct wsi_wl_swapchain *chain = (struct wsi_wl_swapchain *)wsi_chain;
   chain->timing_request = *request;
}

static VkResult
dispatch_present_id_queue(struct wsi_swapchain *wsi_chain, struct timespec *end_time)
{
@@ -2514,6 +2590,15 @@ dispatch_present_id_queue(struct wsi_swapchain *wsi_chain, struct timespec *end_time)
   return VK_SUCCESS;
}

static void
wsi_wl_swapchain_poll_timing_request(struct wsi_swapchain *wsi_chain)
{
   /* Timing requests must complete in finite time, and if we're not calling present wait
    * or queue present regularly, timing requests will never come back. */
   struct timespec instant = {0};
   dispatch_present_id_queue(wsi_chain, &instant);
}

static bool
wsi_wl_swapchain_present_id_completes_in_finite_time_locked(struct wsi_wl_swapchain *chain,
                                                            uint64_t present_id)
@@ -2794,16 +2879,13 @@ wsi_wl_swapchain_acquire_next_image_implicit(struct wsi_swapchain *wsi_chain,
}

static void
wsi_wl_presentation_update_present_id_locked(struct wsi_wl_present_id *id)
{
   id->chain->present_ids.outstanding_count--;
   if (id->present_id > id->chain->present_ids.max_completed)
      id->chain->present_ids.max_completed = id->present_id;

   id->chain->present_ids.display_time_correction -= id->correction;
}

static void
@@ -2815,6 +2897,20 @@ presentation_handle_presented(void *data,
   struct wsi_wl_swapchain *chain = id->chain;
   uint64_t target_time = id->target_time;

   /* In v1 of presentation-time, we can know that we're likely running VRR if refresh is 0.
    * However, we cannot know the base refresh rate without some kind of external information.
    * We also cannot know if we're actually driving the display in a VRR fashion.
    * In v2, we should always know the "base refresh" rate, but that means we cannot know whether we're
    * driving the display VRR or FRR. We could try to deduce it from timestamps, but that is too brittle.
    * There is a v3 proposal that adds this information more formally, so we don't have to guess.
    * Knowing VRR vs. FRR is not mission critical for most use cases, so just report "unknown" for now. */
   wsi_swapchain_present_timing_update_refresh_rate(&chain->base, refresh, 0, 0);

   /* Notify this before present wait to reduce the latency of presentation timing requests
    * if the application is driving its queries off present waits. */
   if (id->timing_serial)
      wsi_swapchain_present_timing_notify_completion(&chain->base, id->timing_serial, presentation_time, id->img);

   mtx_lock(&chain->present_ids.lock);
   chain->present_ids.refresh_nsec = refresh;
   if (!chain->present_ids.valid_refresh_nsec) {
@@ -2826,13 +2922,16 @@ presentation_handle_presented(void *data,
   if (presentation_time > chain->present_ids.displayed_time)
      chain->present_ids.displayed_time = presentation_time;

   /* If we have a user-defined target time, it can be arbitrarily early, and we don't
    * want to start compensating for that error if the application stops requesting a specific time. */
   if (!id->user_target_time && target_time && presentation_time > target_time)
      chain->present_ids.display_time_error = presentation_time - target_time;
   else
      chain->present_ids.display_time_error = 0;

   wsi_wl_presentation_update_present_id_locked(id);
   mtx_unlock(&chain->present_ids.lock);
   vk_free(id->alloc, id);
}

static void
@@ -2841,6 +2940,15 @@ presentation_handle_discarded(void *data)
   struct wsi_wl_present_id *id = data;
   struct wsi_wl_swapchain *chain = id->chain;

   /* From the Vulkan spec:
    * "Timing information for some present stages may have a time value of 0,
    * indicating that results for that present stage are not available."
    * Worst case, we could simply take a timestamp of clock_id and pretend, but
    * applications may start to latch onto that timestamp as ground truth, which
    * is obviously not correct. */
   if (id->timing_serial)
      wsi_swapchain_present_timing_notify_completion(&chain->base, id->timing_serial, 0, id->img);

   mtx_lock(&chain->present_ids.lock);
   if (!chain->present_ids.valid_refresh_nsec) {
      /* We've started occluded, so make up some safe values to throttle us */
@@ -2849,9 +2957,10 @@ presentation_handle_discarded(void *data)
      chain->present_ids.refresh_nsec = 16666666;
      chain->present_ids.valid_refresh_nsec = true;
   }

   wsi_wl_presentation_update_present_id_locked(id);
   mtx_unlock(&chain->present_ids.lock);
   vk_free(id->alloc, id);
}

static void
@@ -2870,9 +2979,10 @@ presentation_frame_handle_done(void *data, struct wl_callback *callback, uint32_t ts)

   mtx_lock(&chain->present_ids.lock);
   wl_list_remove(&id->link);
   wsi_wl_presentation_update_present_id_locked(id);
   mtx_unlock(&chain->present_ids.lock);
   vk_free(id->alloc, id);
   wl_callback_destroy(callback);
}
@@ -2895,6 +3005,29 @@ static const struct wl_callback_listener frame_listener = {
   frame_handle_done,
};

static bool
set_application_driven_timestamp(struct wsi_wl_swapchain *chain,
                                 uint64_t *timestamp,
                                 uint64_t *correction)
{
   if (chain->timing_request.serial && chain->timing_request.time) {
      /* An absolute time was requested before we have been able to report a reasonable
       * refresh rate to the application. This is valid, but we should not try to perform
       * any rounding. The NEAREST_REFRESH_CYCLE flag cannot be honored, because the
       * refresh cycle is unknown at this point. */
      struct timespec target_ts;
      timespec_from_nsec(&target_ts, chain->timing_request.time);
      wp_commit_timer_v1_set_timestamp(chain->commit_timer,
                                       (uint64_t)target_ts.tv_sec >> 32, target_ts.tv_sec,
                                       target_ts.tv_nsec);
      *timestamp = chain->timing_request.time;
      *correction = 0;
      chain->present_ids.last_target_time = chain->timing_request.time;
      return true;
   } else {
      return false;
   }
}
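One detail worth spelling out: the hi/lo split in the wp_commit_timer_v1_set_timestamp() call above follows the Wayland convention of passing 64-bit seconds as two 32-bit words. With an assumed tv_sec of 0x100000005, the first argument (uint64_t)tv_sec >> 32 yields the high word 1, and the protocol marshalling truncates the second argument to the low word, 5.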
/* The present_ids lock must be held */
static bool
set_timestamp(struct wsi_wl_swapchain *chain,
@@ -2908,7 +3041,7 @@ set_timestamp(struct wsi_wl_swapchain *chain,
   int32_t error = 0;

   if (!chain->present_ids.valid_refresh_nsec)
      return set_application_driven_timestamp(chain, timestamp, correction);

   displayed_time = chain->present_ids.displayed_time;
   refresh = chain->present_ids.refresh_nsec;

@@ -2918,7 +3051,7 @@ set_timestamp(struct wsi_wl_swapchain *chain,
    * timestamps at all, so bail out.
    */
   if (!refresh)
      return set_application_driven_timestamp(chain, timestamp, correction);

   /* We assume we're being fed at the display's refresh rate, but
    * if that doesn't happen our timestamps fall into the past.
@@ -2936,6 +3069,10 @@ set_timestamp(struct wsi_wl_swapchain *chain,
   error = chain->present_ids.display_time_error -
           chain->present_ids.display_time_correction;

   /* If we're driving timestamps from the application, this is somewhat redundant,
    * but it will drain out any accumulated display_time_error over time.
    * Accumulated errors are expected, since the application might not
    * align the target time perfectly against a refresh cycle. */
   target = chain->present_ids.last_target_time;
   if (error > 0) {
      target += (error / refresh) * refresh;
@@ -2945,19 +3082,41 @@ set_timestamp(struct wsi_wl_swapchain *chain,
   }

   chain->present_ids.display_time_correction += *correction;

   if (chain->timing_request.serial && chain->timing_request.time) {
      target = chain->timing_request.time;
      chain->present_ids.last_target_time = target;
      *timestamp = target;

      if (chain->timing_request.flags & VK_PRESENT_TIMING_INFO_PRESENT_AT_NEAREST_REFRESH_CYCLE_BIT_EXT)
         target -= chain->present_ids.refresh_nsec / 2;

      /* Without the flag, the application is supposed to deal with any safety margins on its own. */
      timespec_from_nsec(&target_ts, target);

      /* If we're using the commit-timing path, we always have the FIFO protocol, so we don't have to
       * consider scenarios where the application passes a very low present time.
       * I.e., there is no need to max() the application timestamp against our estimated next refresh cycle.
       * If the surface is occluded, it's possible to render at a higher rate than the display refresh rate,
       * but that's okay. Those presents will be discarded anyway, and we won't report odd timestamps to the application. */
   } else {
      target = next_phase_locked_time(displayed_time,
                                      refresh,
                                      target);

      chain->present_ids.last_target_time = target;
      *timestamp = target;

      /* Take back 500 us as a safety margin, to ensure we don't miss our
       * target due to round-off error.
       */
      timespec_from_nsec(&target_ts, target - 500000);
   }

   wp_commit_timer_v1_set_timestamp(chain->commit_timer,
                                    (uint64_t)target_ts.tv_sec >> 32, target_ts.tv_sec,
                                    target_ts.tv_nsec);

   return true;
}
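For reference, a sketch of what the phase-locking in the else-branch does. next_phase_locked_time() is defined elsewhere in this file; the stand-in below only illustrates the idea (snap the target forward onto the refresh-period grid anchored at the last displayed time) and is not the actual implementation:

#include <stdint.h>

/* Illustrative stand-in: round `target` up to the next point on the
 * refresh-period grid anchored at `displayed_time`. */
static uint64_t
sketch_next_phase_locked_time(uint64_t displayed_time, uint64_t refresh,
                              uint64_t target)
{
   if (target <= displayed_time)
      return displayed_time + refresh;

   uint64_t periods = (target - displayed_time + refresh - 1) / refresh;
   return displayed_time + periods * refresh;
}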
@@ -3059,13 +3218,16 @@ wsi_wl_swapchain_queue_present(struct wsi_swapchain *wsi_chain,
   }

   if (present_id > 0 || (mode_fifo && chain->commit_timer) ||
       util_perfetto_is_tracing_enabled() || chain->timing_request.serial) {
      struct wsi_wl_present_id *id =
         vk_zalloc(chain->wsi_wl_surface->display->wsi_wl->alloc, sizeof(*id), sizeof(uintptr_t),
                   VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
      id->chain = chain;
      id->present_id = present_id;
      id->alloc = chain->wsi_wl_surface->display->wsi_wl->alloc;
      id->timing_serial = chain->timing_request.serial;
      id->img = &chain->images[image_index].base;
      id->user_target_time = chain->timing_request.time != 0;

      mtx_lock(&chain->present_ids.lock);
@@ -3193,6 +3355,8 @@ wsi_wl_swapchain_queue_present(struct wsi_swapchain *wsi_chain,
                     wsi_wl_surface->display->queue);
   }

   memset(&chain->timing_request, 0, sizeof(chain->timing_request));

   return VK_SUCCESS;
}
@@ -3427,6 +3591,20 @@ wsi_wl_swapchain_destroy(struct wsi_swapchain *wsi_chain,
   return VK_SUCCESS;
}

static VkTimeDomainKHR
clock_id_to_vk_time_domain(clockid_t id)
{
   switch (id) {
   case CLOCK_MONOTONIC:
      return VK_TIME_DOMAIN_CLOCK_MONOTONIC_KHR;
   case CLOCK_MONOTONIC_RAW:
      return VK_TIME_DOMAIN_CLOCK_MONOTONIC_RAW_KHR;
   default:
      /* Default fallback. Will not be used. */
      return VK_TIME_DOMAIN_DEVICE_KHR;
   }
}

static VkResult
wsi_wl_surface_create_swapchain(VkIcdSurfaceBase *icd_surface,
                                VkDevice device,
@@ -3605,6 +3783,12 @@ wsi_wl_surface_create_swapchain(VkIcdSurfaceBase *icd_surface,
   chain->base.queue_present = wsi_wl_swapchain_queue_present;
   chain->base.release_images = wsi_wl_swapchain_release_images;
   chain->base.set_present_mode = wsi_wl_swapchain_set_present_mode;
   chain->base.set_timing_request = wsi_wl_swapchain_set_timing_request;
   chain->base.poll_timing_request = wsi_wl_swapchain_poll_timing_request;
   if (pCreateInfo->flags & VK_SWAPCHAIN_CREATE_PRESENT_TIMING_BIT_EXT) {
      chain->base.present_timing.time_domain =
         clock_id_to_vk_time_domain(wsi_wl_surface->display->presentation_clock_id);
   }
   chain->base.wait_for_present = wsi_wl_swapchain_wait_for_present;
   chain->base.wait_for_present2 = wsi_wl_swapchain_wait_for_present2;
   chain->base.present_mode = present_mode;
@@ -276,6 +276,16 @@ wsi_win32_surface_get_capabilities2(VkIcdSurfaceBase *surface,
      break;
   }

   case VK_STRUCTURE_TYPE_PRESENT_TIMING_SURFACE_CAPABILITIES_EXT: {
      VkPresentTimingSurfaceCapabilitiesEXT *wait = (VkPresentTimingSurfaceCapabilitiesEXT *)ext;

      wait->presentStageQueries = 0;
      wait->presentTimingSupported = VK_FALSE;
      wait->presentAtAbsoluteTimeSupported = VK_FALSE;
      wait->presentAtRelativeTimeSupported = VK_FALSE;
      break;
   }

   default:
      /* Ignored */
      break;
@@ -64,6 +64,7 @@
#include "wsi_common_entrypoints.h"
#include "wsi_common_private.h"
#include "wsi_common_queue.h"
#include "loader/loader_dri_helper_screen.h"

#ifdef HAVE_SYS_SHM_H
#include <sys/ipc.h>
@@ -79,7 +80,14 @@

#define MAX_DAMAGE_RECTS 64

struct x11_icd_surface_key {
   xcb_connection_t *conn;
   xcb_window_t window;
   uint32_t padding; /* Makes the struct memcmp-compatible. */
};

struct wsi_x11_icd_surface {
   struct x11_icd_surface_key key;
   bool has_dri3;
   bool has_dri3_modifiers;
   bool has_dri3_explicit_sync;
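The explicit padding matters because the hash table presumably keys on the raw bytes of the struct; with the hole zeroed (the surface is vk_zalloc'd), memcmp()-style comparison is well defined. A hypothetical pair of table callbacks along these lines (names invented, not from this change):

#include <string.h>
#include "util/hash_table.h"

static uint32_t
x11_icd_surface_key_hash(const void *data)
{
   /* _mesa_hash_data hashes the raw bytes, hence the zeroed padding. */
   return _mesa_hash_data(data, sizeof(struct x11_icd_surface_key));
}

static bool
x11_icd_surface_key_equal(const void *a, const void *b)
{
   return memcmp(a, b, sizeof(struct x11_icd_surface_key)) == 0;
}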
@@ -88,13 +96,80 @@ struct wsi_x11_icd_surface {
   bool is_xwayland;
   bool has_mit_shm;
   bool has_xfixes;

   struct loader_screen_resources screen_resources;
   bool screen_resources_valid;
   mtx_t mtx;

   /* This holds the fallback for the MSC rate, i.e. the refresh rate.
    * If we cannot get hold of a stable estimate based on real feedback,
    * we defer to this. With multiple monitors and other effects influencing
    * the actual rate, we shouldn't trust it blindly. */
   uint64_t current_refresh_ns;
};
static uint64_t
x11_icd_surface_update_present_timing(struct wsi_x11_icd_surface *surface, uint32_t width, uint32_t height)
{
   uint64_t ret;

   if (!surface || !surface->screen_resources_valid)
      return 0;

   mtx_lock(&surface->mtx);
   loader_update_screen_resources(&surface->screen_resources);

   if (surface->screen_resources.num_crtcs == 0) {
      surface->current_refresh_ns = 0;
      goto out;
   }

   surface->current_refresh_ns =
      1000000000ull * surface->screen_resources.crtcs[0].refresh_denominator /
      surface->screen_resources.crtcs[0].refresh_numerator;

   /* No need to ponder multi-monitor setups. */
   if (surface->screen_resources.num_crtcs == 1)
      goto out;

   /* Find the best matching screen for the window. */
   xcb_translate_coordinates_cookie_t cookie =
      xcb_translate_coordinates_unchecked(surface->key.conn, surface->key.window,
                                          surface->screen_resources.screen->root, 0, 0);
   xcb_translate_coordinates_reply_t *reply =
      xcb_translate_coordinates_reply(surface->key.conn, cookie, NULL);

   if (!reply)
      goto out;

   int area = 0;

   for (unsigned c = 0; c < surface->screen_resources.num_crtcs; c++) {
      struct loader_crtc_info *crtc = &surface->screen_resources.crtcs[c];

      int c_area = box_intersection_area(
         reply->dst_x, reply->dst_y, width, height, crtc->x,
         crtc->y, crtc->width, crtc->height);

      if (c_area > area) {
         surface->current_refresh_ns = 1000000000ull * crtc->refresh_denominator / crtc->refresh_numerator;
         area = c_area;
      }
   }

   free(reply);

out:
   ret = surface->current_refresh_ns;
   mtx_unlock(&surface->mtx);
   return ret;
}
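A worked example of the conversion above, with assumed RandR mode numbers: a 1080p60 CRTC with refresh_numerator = 148500000 (the pixel clock in Hz) and refresh_denominator = 2475000 (htotal * vtotal) yields 1000000000 * 2475000 / 148500000 = 16666666 ns, i.e. 60 Hz.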
struct wsi_x11 {
   struct wsi_interface base;

   mtx_t mutex;
   /* Hash table of xcb_connection -> wsi_x11_icd_surface mappings */
   struct hash_table *connections;
};
@@ -224,9 +299,9 @@ wsi_x11_detect_xwayland(xcb_connection_t *conn,
   return is_xwayland;
}

static struct wsi_x11_icd_surface *
wsi_x11_icd_surface_create(struct wsi_device *wsi_dev,
                           xcb_connection_t *conn, xcb_window_t window)
{
   xcb_query_extension_cookie_t dri3_cookie, pres_cookie, randr_cookie,
      amd_cookie, nv_cookie, shm_cookie, sync_cookie,
@@ -241,16 +316,19 @@ wsi_x11_icd_surface_create(struct wsi_device *wsi_dev,
   bool has_dri3_v1_4 = false;
   bool has_present_v1_4 = false;

   /* wsi_x11_get_icd_surface may be called from a thread, but we will never end up
    * here on a worker thread, since the connection will always be in the hash map,
    * so we will not violate Vulkan's rule on when the allocation callbacks may be
    * called. */
   struct wsi_x11_icd_surface *wsi_conn =
      vk_zalloc(&wsi_dev->instance_alloc, sizeof(*wsi_conn), 8,
                VK_SYSTEM_ALLOCATION_SCOPE_INSTANCE);
   if (!wsi_conn)
      return NULL;

   wsi_conn->key.conn = conn;
   wsi_conn->key.window = window;

   sync_cookie = xcb_query_extension(conn, 4, "SYNC");
   dri3_cookie = xcb_query_extension(conn, 4, "DRI3");
   pres_cookie = xcb_query_extension(conn, 7, "Present");
@@ -378,6 +456,27 @@ wsi_x11_icd_surface_create(struct wsi_device *wsi_dev,
   }
#endif

   if (window) {
      /* This state is only needed for present timing; if we fail, we simply won't expose support. */
      xcb_get_geometry_cookie_t geometry_cookie = xcb_get_geometry_unchecked(conn, window);
      xcb_get_geometry_reply_t *geometry_reply = xcb_get_geometry_reply(conn, geometry_cookie, NULL);
      if (geometry_reply) {
         xcb_screen_iterator_t it = xcb_setup_roots_iterator(xcb_get_setup(conn));
         xcb_screen_t *screen;

         for (screen = it.data; it.rem != 0; xcb_screen_next(&it), screen = it.data) {
            if (screen->root == geometry_reply->root) {
               loader_init_screen_resources(&wsi_conn->screen_resources, conn, screen);
               wsi_conn->screen_resources_valid = true;
               mtx_init(&wsi_conn->mtx, mtx_plain);
               break;
            }
         }

         free(geometry_reply);
      }
   }

   free(dri3_reply);
   free(pres_reply);
   free(randr_reply);
@@ -392,14 +491,18 @@ wsi_x11_icd_surface_create(struct wsi_device *wsi_dev,
}

static void
wsi_x11_icd_surface_destroy(struct wsi_device *wsi_dev,
                            struct wsi_x11_icd_surface *conn)
{
   if (conn->screen_resources_valid) {
      loader_destroy_screen_resources(&conn->screen_resources);
      mtx_destroy(&conn->mtx);
   }
   vk_free(&wsi_dev->instance_alloc, conn);
}

static bool
wsi_x11_check_for_dri3(struct wsi_x11_icd_surface *wsi_conn)
{
   if (wsi_conn->has_dri3)
      return true;
@@ -418,35 +521,37 @@ wsi_x11_check_for_dri3(struct wsi_x11_icd_surface *wsi_conn)
 *
 * If the allocation fails NULL is returned.
 */
static struct wsi_x11_icd_surface *
wsi_x11_get_icd_surface(struct wsi_device *wsi_dev,
                        xcb_connection_t *conn, xcb_window_t window)
{
   struct wsi_x11 *wsi =
      (struct wsi_x11 *)wsi_dev->wsi[VK_ICD_WSI_PLATFORM_XCB];

   mtx_lock(&wsi->mutex);

   struct x11_icd_surface_key key = { .conn = conn, .window = window };

   struct hash_entry *entry = _mesa_hash_table_search(wsi->connections, &key);
   if (!entry) {
      /* We're about to make a bunch of blocking calls. Let's drop the
       * mutex for now so we don't block up too badly.
       */
      mtx_unlock(&wsi->mutex);

      struct wsi_x11_icd_surface *wsi_conn =
         wsi_x11_icd_surface_create(wsi_dev, conn, window);
      if (!wsi_conn)
         return NULL;

      mtx_lock(&wsi->mutex);

      entry = _mesa_hash_table_search(wsi->connections, &wsi_conn->key);
      if (entry) {
         /* Oops, someone raced us to it */
         wsi_x11_icd_surface_destroy(wsi_dev, wsi_conn);
      } else {
         entry = _mesa_hash_table_insert(wsi->connections, &wsi_conn->key, wsi_conn);
      }
   }
@@ -590,11 +695,20 @@ wsi_GetPhysicalDeviceXcbPresentationSupportKHR(VkPhysicalDevice physicalDevice,
{
   VK_FROM_HANDLE(vk_physical_device, pdevice, physicalDevice);
   struct wsi_device *wsi_device = pdevice->wsi_device;

   /* These should overlap. */
   uint64_t effective_queues = wsi_device->queue_supports_blit & wsi_device->queue_supports_timestamps;

   /* If there are no queues that support both blits and timestamps,
    * don't report support for queue timestamps. */
   if (!effective_queues)
      effective_queues = wsi_device->queue_supports_blit;

   if (!(effective_queues & BITFIELD64_BIT(queueFamilyIndex)))
      return false;

   struct wsi_x11_icd_surface *wsi_conn =
      wsi_x11_get_icd_surface(wsi_device, connection, 0);

   if (!wsi_conn)
      return false;
@@ -669,8 +783,8 @@ x11_surface_get_support(VkIcdSurfaceBase *icd_surface,
   xcb_connection_t *conn = x11_surface_get_connection(icd_surface);
   xcb_window_t window = x11_surface_get_window(icd_surface);

   struct wsi_x11_icd_surface *wsi_conn =
      wsi_x11_get_icd_surface(wsi_device, conn, window);
   if (!wsi_conn)
      return VK_ERROR_OUT_OF_HOST_MEMORY;
@@ -722,7 +836,7 @@ x11_get_min_image_count(const struct wsi_device *wsi_device, bool is_xwayland)

static unsigned
x11_get_min_image_count_for_present_mode(struct wsi_device *wsi_device,
                                         struct wsi_x11_icd_surface *wsi_conn,
                                         VkPresentModeKHR present_mode);

static VkResult
@@ -734,8 +848,8 @@ x11_surface_get_capabilities(VkIcdSurfaceBase *icd_surface,
   xcb_connection_t *conn = x11_surface_get_connection(icd_surface);
   xcb_window_t window = x11_surface_get_window(icd_surface);
   struct wsi_x11_vk_surface *surface = (struct wsi_x11_vk_surface*)icd_surface;
   struct wsi_x11_icd_surface *wsi_conn =
      wsi_x11_get_icd_surface(wsi_device, conn, window);
   xcb_get_geometry_cookie_t geom_cookie;
   xcb_generic_error_t *err;
   xcb_get_geometry_reply_t *geom;
@@ -863,6 +977,52 @@ x11_surface_get_capabilities2(VkIcdSurfaceBase *icd_surface,
      break;
   }

   case VK_STRUCTURE_TYPE_PRESENT_TIMING_SURFACE_CAPABILITIES_EXT: {
      VkPresentTimingSurfaceCapabilitiesEXT *wait = (void *)ext;

      xcb_connection_t *conn = x11_surface_get_connection(icd_surface);
      xcb_window_t window = x11_surface_get_window(icd_surface);
      struct wsi_x11_icd_surface *wsi_conn = wsi_x11_get_icd_surface(wsi_device, conn, window);

      wait->presentStageQueries = 0;
      wait->presentTimingSupported = VK_FALSE;
      wait->presentAtAbsoluteTimeSupported = VK_FALSE;
      wait->presentAtRelativeTimeSupported = VK_FALSE;

      /* If we cannot query modes for a screen, it's not possible to get reliable timings. */
      if (!wsi_conn || !wsi_conn->screen_resources_valid)
         break;

      wait->presentTimingSupported = VK_TRUE;

      if (wsi_conn->is_xwayland) {
         /* The Wayland COMPLETE event is tied to the fence callback, so that's what we'll report.
          * For pure frame-pacing support, this is likely fine. */
         wait->presentStageQueries = VK_PRESENT_STAGE_REQUEST_DEQUEUED_BIT_EXT;

         /* Xwayland cannot produce a reliable refresh rate estimate, since its MSC is not tied
          * to monitor refresh at all. However, it's pragmatically very important to expose some
          * baseline Xwayland support, since a large number of applications (mostly games) rely
          * on X11 APIs.
          *
          * Relative timings are easier to deal with, since errors against an absolute timer are
          * more or less expected. That is sufficient for implementing present intervals in GL/D3D
          * and the like, but likely not for tight A/V sync in e.g. media players; those should be
          * using Wayland when available anyway.
          * Per the spec, the timing we report should correlate with the PIXEL_VISIBLE_BIT stage;
          * that is not really possible when we only observe dequeue, but relative timings don't
          * have that problem.
          *
          * There is PRESENT_CAPABILITY_UST, which would help, but xserver does not implement it at all.
          */
         wait->presentAtRelativeTimeSupported = VK_TRUE;
      } else {
         /* COMPLETE should be tied to the page flip on native X11. */
         wait->presentStageQueries = VK_PRESENT_STAGE_IMAGE_FIRST_PIXEL_OUT_BIT_EXT;
         wait->presentAtAbsoluteTimeSupported = VK_TRUE;
         wait->presentAtRelativeTimeSupported = VK_TRUE;
      }

      break;
   }

   default:
      /* Ignored */
      break;
@@ -1092,6 +1252,7 @@ wsi_CreateXlibSurfaceKHR(VkInstance _instance,
struct x11_image_pending_completion {
   uint32_t serial;
   uint64_t signal_present_id;
   uint64_t timing_serial;
};

struct x11_image {
@@ -1108,6 +1269,7 @@ struct x11_image {
   VkPresentModeKHR present_mode;
   xcb_rectangle_t rects[MAX_DAMAGE_RECTS];
   int rectangle_count;
   struct wsi_image_timing_request timing_request;

   /* In IMMEDIATE and MAILBOX modes, we can have multiple pending presentations per image.
    * We need to keep track of them when considering present ID. */
@@ -1125,12 +1287,19 @@ struct x11_image {
#endif
};

struct x11_present_timing_entry {
   uint64_t msc;
   uint64_t ust;
};
#define X11_SWAPCHAIN_REFRESH_RATE_WINDOW_SIZE 16

struct x11_swapchain {
   struct wsi_swapchain base;

   bool has_dri3_modifiers;
   bool has_mit_shm;
   bool has_async_may_tear;
   bool has_reliable_msc;

   xcb_connection_t *conn;
   xcb_window_t window;
@@ -1144,9 +1313,13 @@ struct x11_swapchain {
   xcb_special_event_t *special_event;
   uint64_t send_sbc;
   uint64_t last_present_msc;
   uint64_t next_present_ust_lower_bound;
   uint32_t stamp;
   uint32_t sent_image_count;

   struct x11_present_timing_entry present_timing_window[X11_SWAPCHAIN_REFRESH_RATE_WINDOW_SIZE];
   uint32_t present_timing_window_index;

   atomic_int status;
   bool copy_is_suboptimal;
   struct wsi_queue present_queue;
@@ -1168,14 +1341,121 @@ struct x11_swapchain {
   uint64_t present_id;
   VkResult present_progress_error;

   struct wsi_image_timing_request timing_request;
   bool msc_estimate_is_stable;

   struct x11_image images[0];
};
VK_DEFINE_NONDISP_HANDLE_CASTS(x11_swapchain, base.base, VkSwapchainKHR,
                               VK_OBJECT_TYPE_SWAPCHAIN_KHR)

static bool x11_refresh_rate_estimate_is_stable(struct x11_swapchain *swapchain, uint64_t base_rate)
{
   /* Only accept a refresh rate estimate if it's *very* stable.
    * Keith's old GOOGLE_display_timing MR suggests that using this estimate is better than blindly
    * accepting the modeline in some cases.
    * When running in VRR modes, the MSC will appear to be highly unstable, and we cannot accept those estimates. */

   for (int i = 0; i < X11_SWAPCHAIN_REFRESH_RATE_WINDOW_SIZE; i++) {
      const struct x11_present_timing_entry *a =
         &swapchain->present_timing_window[i];
      const struct x11_present_timing_entry *b =
         &swapchain->present_timing_window[(i + 1) % X11_SWAPCHAIN_REFRESH_RATE_WINDOW_SIZE];

      if (!a->msc || !b->msc)
         continue;

      uint64_t ust_delta = MAX2(a->ust, b->ust) - MIN2(a->ust, b->ust);
      uint64_t msc_delta = MAX2(a->msc, b->msc) - MIN2(a->msc, b->msc);

      if (msc_delta == 0)
         continue;

      uint64_t refresh_ns = 1000 * ust_delta / msc_delta;

      /* The true UST values are expected to be quite accurate.
       * Anything more than 10us difference in rate is considered unstable.
       * If the MSC is driven by GPU progress in VRR mode, it's extremely
       * unlikely that frames are paced *perfectly* for 16 frames in a row. */
      if (llabs((int64_t)base_rate - (int64_t)refresh_ns) > 10000)
         return false;
   }

   return true;
}
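A hypothetical trace of how the 10 us tolerance plays out (numbers invented for illustration):

/* base_rate = 16666700 ns from the newest pair of samples.
 * Adjacent window entries yield, e.g.:
 *   16666690 ns -> |16666700 - 16666690| = 10      -> ok
 *   16666000 ns -> |16666700 - 16666000| = 700     -> ok
 *   16400000 ns -> |16666700 - 16400000| = 266700  -> > 10000, unstable
 * One VRR-ish outlier anywhere in the 16-entry window is enough to reject
 * the estimate and fall back to the RandR-reported rate. */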

static void x11_present_update_refresh_cycle_estimate(struct x11_swapchain *swapchain,
                                                      uint64_t msc, uint64_t ust)
{
   struct wsi_x11_icd_surface *surface = wsi_x11_get_icd_surface(
      (struct wsi_device*)swapchain->base.wsi, swapchain->conn, swapchain->window);

   mtx_lock(&surface->mtx);
   uint64_t randr_refresh_ns = surface->current_refresh_ns;
   mtx_unlock(&surface->mtx);

   swapchain->present_timing_window_index =
      (swapchain->present_timing_window_index + 1) % X11_SWAPCHAIN_REFRESH_RATE_WINDOW_SIZE;
   struct x11_present_timing_entry *entry = &swapchain->present_timing_window[swapchain->present_timing_window_index];

   if (!swapchain->has_reliable_msc) {
      /* If we don't have a reliable MSC, we always trust the fallback RANDR query.
       * We have no idea whether we're FRR or VRR. */
      wsi_swapchain_present_timing_update_refresh_rate(&swapchain->base, randr_refresh_ns, 0, 0);
      entry->msc = msc;
      entry->ust = ust;
      return;
   }

   /* Try to get an initial estimate as quickly as possible; we will refine it over time. */
   if (entry->msc == 0)
      entry = &swapchain->present_timing_window[1];

   if (entry->msc != 0) {
      uint64_t msc_delta = msc - entry->msc;

      /* Safeguard against any weird interactions with IMMEDIATE. */
      if (msc_delta != 0) {
         uint64_t ust_delta = 1000 * (ust - entry->ust);
         uint64_t refresh_ns = ust_delta / msc_delta;

         swapchain->msc_estimate_is_stable = x11_refresh_rate_estimate_is_stable(swapchain, refresh_ns);

         if (swapchain->msc_estimate_is_stable) {
            /* If the MSC is tightly locked in, we can safely assume we're in FRR mode.
             * It's possible we're technically doing VRR, but if we're consistently rendering
             * above the monitor refresh rate, there is no meaningful difference anyway. */

            /* Our refresh rates are only estimates, so expect some deviation (+/- 1us). */
            wsi_swapchain_present_timing_update_refresh_rate(&swapchain->base, refresh_ns, refresh_ns, 1000);
         } else {
            /* If we have enabled adaptive sync and we're seeing highly irregular MSC values,
             * we assume we're driving the display in VRR mode. */
            uint64_t refresh_interval = swapchain->base.wsi->enable_adaptive_sync ? UINT64_MAX : 0;
            wsi_swapchain_present_timing_update_refresh_rate(&swapchain->base, randr_refresh_ns, refresh_interval, 0);
         }
      }
   }

   entry = &swapchain->present_timing_window[swapchain->present_timing_window_index];
   entry->msc = msc;
   entry->ust = ust;
}
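
The heart of the estimate above is a single division: nanoseconds elapsed over MSC ticks elapsed. A hedged sketch with assumed numbers (estimate_refresh_ns is a hypothetical helper, not a WSI function):

#include <stdint.h>

/* UST deltas are in microseconds, hence the 1000x to nanoseconds. */
static uint64_t estimate_refresh_ns(uint64_t ust_delta_us, uint64_t msc_delta)
{
   return msc_delta ? 1000 * ust_delta_us / msc_delta : 0; /* 0 guards the IMMEDIATE case */
}
/* Two completions 10 vblanks apart on a 59.94 Hz panel:
 * estimate_refresh_ns(166834, 10) == 16683400 ns, i.e. ~16.68 ms per cycle. */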

static void x11_present_complete(struct x11_swapchain *swapchain,
                                 struct x11_image *image, uint32_t index,
                                 uint64_t msc, uint64_t ust)
{
   /* Update the estimate for the refresh rate. */
   if (swapchain->base.present_timing.active)
      x11_present_update_refresh_cycle_estimate(swapchain, msc, ust);

   /* Make sure to signal present timings before signalling present wait;
    * this way we get minimal latency for reports. */
   uint64_t timing_serial = image->pending_completions[index].timing_serial;
   if (timing_serial)
      wsi_swapchain_present_timing_notify_completion(&swapchain->base, timing_serial, ust * 1000, &image->base);

   uint64_t signal_present_id = image->pending_completions[index].signal_present_id;
   if (signal_present_id) {
      mtx_lock(&swapchain->present_progress_mutex);

@@ -1327,6 +1607,16 @@ x11_handle_dri3_present_event(struct x11_swapchain *chain,
         return VK_SUBOPTIMAL_KHR;
      }

      if (chain->base.present_timing.active) {
         /* It's possible that we have multiple monitors, and moving windows around changes the effective rate.
          * Lots of logic is reused from platform_x11.c. */

         /* TODO: Should we rate-limit this query? */
         struct wsi_x11_icd_surface *surface = wsi_x11_get_icd_surface(
            (struct wsi_device *)chain->base.wsi, chain->conn, chain->window);
         x11_icd_surface_update_present_timing(surface, config->width, config->height);
      }

      break;
   }
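
One plausible answer to the TODO above, sketched with hypothetical helpers (this is not what the merge implements, and the 500 ms threshold is an arbitrary assumption): skip the query unless enough time has passed on the same monotonic timebase the UST values use.

#include <stdbool.h>
#include <stdint.h>
#include <time.h>

static uint64_t monotonic_now_ns(void)
{
   struct timespec ts;
   clock_gettime(CLOCK_MONOTONIC, &ts);
   return (uint64_t)ts.tv_sec * 1000000000ull + (uint64_t)ts.tv_nsec;
}

static bool should_requery_refresh(uint64_t *last_query_ns)
{
   uint64_t now = monotonic_now_ns();
   if (now - *last_query_ns < 500ull * 1000 * 1000) /* at most every 500 ms */
      return false;
   *last_query_ns = now;
   return true;
}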

@@ -1348,13 +1638,14 @@ x11_handle_dri3_present_event(struct x11_swapchain *chain,

   case XCB_PRESENT_EVENT_COMPLETE_NOTIFY: {
      xcb_present_complete_notify_event_t *complete = (void *) event;
      uint64_t ust = MAX2(complete->ust, chain->next_present_ust_lower_bound);
      if (complete->kind == XCB_PRESENT_COMPLETE_KIND_PIXMAP) {
         unsigned i, j;
         for (i = 0; i < chain->base.image_count; i++) {
            struct x11_image *image = &chain->images[i];
            for (j = 0; j < image->present_queued_count; j++) {
               if (image->pending_completions[j].serial == complete->serial) {
                  x11_present_complete(chain, image, j);
                  x11_present_complete(chain, image, j, complete->msc, ust);
               }
            }
         }
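
The MAX2() above keeps a reported completion time from predating the time the present targeted. A tiny sketch with assumed numbers (values in microseconds; the helper name is hypothetical):

#include <stdint.h>

static uint64_t clamp_reported_ust(uint64_t complete_ust, uint64_t lower_bound_ust)
{
   /* An early COMPLETE, e.g. caused by the eager-present pull-back in the
    * submit thread, is reported no earlier than the targeted time. */
   return complete_ust > lower_bound_ust ? complete_ust : lower_bound_ust;
}
/* clamp_reported_ust(33100, 33333) == 33333. */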

@@ -1424,8 +1715,8 @@ x11_present_to_x11_dri3(struct x11_swapchain *chain, uint32_t image_index,
   int64_t divisor = 0;
   int64_t remainder = 0;

   struct wsi_x11_connection *wsi_conn =
      wsi_x11_get_connection((struct wsi_device*)chain->base.wsi, chain->conn);
   struct wsi_x11_icd_surface *wsi_conn =
      wsi_x11_get_icd_surface((struct wsi_device*)chain->base.wsi, chain->conn, chain->window);
   if (!wsi_conn)
      return VK_ERROR_OUT_OF_HOST_MEMORY;

@@ -1457,6 +1748,7 @@ x11_present_to_x11_dri3(struct x11_swapchain *chain, uint32_t image_index,
      (struct x11_image_pending_completion) {
         .signal_present_id = image->present_id,
         .serial = serial,
         .timing_serial = image->timing_request.serial,
      };

   xcb_void_cookie_t cookie;

@@ -1654,7 +1946,7 @@ static VkResult x11_swapchain_read_status_atomic(struct x11_swapchain *chain)
 */
static bool
x11_needs_wait_for_fences(const struct wsi_device *wsi_device,
                          struct wsi_x11_connection *wsi_conn,
                          struct wsi_x11_icd_surface *wsi_conn,
                          VkPresentModeKHR present_mode)
{
   if (wsi_conn->is_xwayland && !wsi_device->x11.xwaylandWaitReady) {

@@ -1676,7 +1968,7 @@ x11_needs_wait_for_fences(const struct wsi_device *wsi_device,

static bool
x11_requires_mailbox_image_count(const struct wsi_device *device,
                                 struct wsi_x11_connection *wsi_conn,
                                 struct wsi_x11_icd_surface *wsi_conn,
                                 VkPresentModeKHR present_mode)
{
   /* If we're resorting to wait for fences, we're assuming a MAILBOX-like model,

@@ -1773,6 +2065,26 @@ x11_set_present_mode(struct wsi_swapchain *wsi_chain,
   chain->base.present_mode = mode;
}

static void
x11_set_timing_request(struct wsi_swapchain *wsi_chain,
                       const struct wsi_image_timing_request *request)
{
   struct x11_swapchain *chain = (struct x11_swapchain *)wsi_chain;
   chain->timing_request = *request;
}

static uint64_t
x11_poll_early_refresh(struct wsi_swapchain *wsi_chain, uint64_t *interval)
{
   struct x11_swapchain *chain = (struct x11_swapchain *)wsi_chain;
   struct wsi_x11_icd_surface *wsi_conn =
      wsi_x11_get_icd_surface((struct wsi_device*)chain->base.wsi, chain->conn, chain->window);

   /* We don't know yet. */
   *interval = 0;
   return x11_icd_surface_update_present_timing(wsi_conn, chain->extent.width, chain->extent.height);
}
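
x11_set_timing_request() above latches one timing request on the swapchain; the x11_queue_present() hunk further down consumes and clears it, so a request applies to exactly one present. A simplified standalone sketch of that latch-and-consume pattern (all types here are stand-ins, not the WSI structs):

#include <stdint.h>
#include <string.h>

struct timing_request { uint64_t serial; uint64_t time; uint32_t flags; };

struct fake_chain { struct timing_request latched; };
struct fake_image { struct timing_request timing_request; };

static void set_timing_request(struct fake_chain *c, const struct timing_request *r)
{
   c->latched = *r; /* latch: remember for the next present only */
}

static void queue_present(struct fake_chain *c, struct fake_image *img)
{
   img->timing_request = c->latched;           /* consume onto the image */
   memset(&c->latched, 0, sizeof(c->latched)); /* one-shot: clear the latch */
}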

/**
 * Acquire a ready-to-use image from the swapchain.
 *

@@ -1874,6 +2186,8 @@ x11_queue_present(struct wsi_swapchain *wsi_chain,
   chain->images[image_index].present_id = present_id;
   /* With KHR_swapchain_maintenance1, the present mode can change per present. */
   chain->images[image_index].present_mode = chain->base.present_mode;
   chain->images[image_index].timing_request = chain->timing_request;
   memset(&chain->timing_request, 0, sizeof(chain->timing_request));

   wsi_queue_push(&chain->present_queue, image_index);
   return x11_swapchain_read_status_atomic(chain);

@@ -1974,6 +2288,125 @@ x11_manage_event_queue(void *state)
   return 0;
}

static uint64_t
x11_present_compute_target_msc(struct x11_swapchain *chain,
                               const struct wsi_image_timing_request *request,
                               uint64_t minimum_msc)
{
   const struct x11_present_timing_entry *entry = &chain->present_timing_window[chain->present_timing_window_index];
   bool relative = (request->flags & VK_PRESENT_TIMING_INFO_PRESENT_AT_RELATIVE_TIME_BIT_EXT) != 0;

   /* Just use the FIFO-derived MSC. From the spec on relative time:
    * "If the swapchain has never been used to present an image, the provided targetTime is ignored." */
   if (!request->serial || !request->time || (relative && !entry->ust))
      return minimum_msc;

   int64_t target_ns;

   mtx_lock(&chain->base.present_timing.lock);

   /* Present timing is only defined to work with FIFO modes, so we can rely on having
    * reliable relative timings, since we block for COMPLETE to come through before we queue up more presents. */
   if (relative) {
      /* If the application is trying to drive us at refresh rate, FIFO will take care of it.
       * Don't end up in a situation where we sleep and miss the deadline by mistake. */
      if (!chain->has_reliable_msc) {
         uint64_t relative_threshold;
         if (request->flags & VK_PRESENT_TIMING_INFO_PRESENT_AT_NEAREST_REFRESH_CYCLE_BIT_EXT)
            relative_threshold = 3 * chain->base.present_timing.refresh_duration / 2;
         else
            relative_threshold = chain->base.present_timing.refresh_duration;

         if (request->time <= relative_threshold) {
            mtx_unlock(&chain->base.present_timing.lock);
            return minimum_msc;
         }
      }
      target_ns = 1000 * (int64_t)entry->ust + (int64_t)request->time;
   } else {
      target_ns = (int64_t)request->time;
   }

   /* Snap to the nearest half refresh. This only makes sense for FRR, but it is the application's
    * responsibility not to use this for VRR. If this flag is not used, this is strictly a "not before". */
   if (request->flags & VK_PRESENT_TIMING_INFO_PRESENT_AT_NEAREST_REFRESH_CYCLE_BIT_EXT)
      target_ns -= (int64_t)chain->base.present_timing.refresh_duration / 2;

   if (entry->msc && chain->base.present_timing.refresh_duration != 0 &&
       chain->msc_estimate_is_stable && chain->has_reliable_msc) {
      /* If we can trust the MSC to be a stable FRR heartbeat, we sync to that. */
      uint64_t delta_time_ns = MAX2(target_ns - 1000 * (int64_t)entry->ust, 0);
      uint64_t periods = (delta_time_ns + chain->base.present_timing.refresh_duration - 1) /
                         chain->base.present_timing.refresh_duration;
      mtx_unlock(&chain->base.present_timing.lock);

      /* Xwl cannot understand an MSC that jumps by more than 1. It appears that if there are MSC jumps above 1,
       * each MSC cycle is padded by 16.6 ms or something like that.
       * If we want to target a specific time, we must sleep to achieve that until Xwl improves.
       * Fortunately, we're on a submit thread, so that is a mostly acceptable solution. */
      minimum_msc = MAX2(minimum_msc, entry->msc + periods);
   } else {
      /* If we don't have a stable estimate (e.g. true VRR, or Xwl), we just sleep until the deadline.
       * This relies on the timebase of os_time_nanosleep being MONOTONIC, as well as UST being MONOTONIC. */

      if (request->flags & VK_PRESENT_TIMING_INFO_PRESENT_AT_NEAREST_REFRESH_CYCLE_BIT_EXT) {
         if (!chain->has_reliable_msc && chain->base.present_timing.refresh_duration) {
            uint64_t delta_time_ns = MAX2(target_ns - 1000 * (int64_t)entry->ust, 0);
            uint64_t periods = delta_time_ns / chain->base.present_timing.refresh_duration;

            target_ns = 1000ull * entry->ust + periods * chain->base.present_timing.refresh_duration;

            /* Set a minimum target that is very close to the real estimate.
             * This way, we ensure that we don't regularly round estimates up in
             * chain->next_present_ust_lower_bound. */
            target_ns += 63 * chain->base.present_timing.refresh_duration / 64;
         }
      }

      if (chain->has_reliable_msc) {
         /* Very regular sleeping can trigger a strange feedback loop where MSC estimates become stable enough
          * that we accept them as a stable MSC. Perturb the rates enough to make it extremely unlikely that we
          * accept sleeping patterns as the ground-truth rate: introduce a 50 us error between each timestamp,
          * which should avoid the 10 us check reliably. If sleep quanta are not that accurate, it's extremely
          * unlikely we get a stable pace anyway. TODO: Is there a more reliable way? */

         /* On Xwl we never accept MSC estimates as ground truth, so ignore this perturbation. */
         target_ns += 50000ll * (chain->present_timing_window_index & 1) - 25000;
         target_ns = MAX2(target_ns, 0);
      }

      /* If we're on Xwl or VRR X11 and trying to target a specific cycle by sleeping, pull back the sleep a bit.
       * We will be racing against time once we wake up to send the request to Xwl -> Wayland -> frame callback -> COMPLETE.
       * If target_ns syncs well to a refresh cycle, we speculate that COMPLETE will come through at about target_ns. */

      /* To get a proper pace on an actual VRR display, we will have to detect if we're presenting too early
       * compared to what the application actually expected.
       * In that case, we need to remove this compensation if we detect that presents come in too early.
       * Effectively, we will need to adjust the reported UST up if we somehow end up seeing a timestamp too early.
       * The relative refresh will feed off this adjustment in a tight loop, so this should be pretty solid
       * for both VRR and FRR. Present timing can only be used with FIFO modes, i.e. we will not overwrite this
       * until the present is actually complete. */
      chain->next_present_ust_lower_bound = target_ns / 1000;

      /* We also need to pull back the sleep a bit to account for X.org round-trip delays.
       * Allow up to 4 ms of error here. */
      int64_t eager_present_ns = MIN2((int64_t)chain->base.present_timing.refresh_duration / 4, 4 * 1000 * 1000);
      target_ns -= eager_present_ns;
      target_ns = MAX2(target_ns, 0);

      mtx_unlock(&chain->base.present_timing.lock);
      mtx_unlock(&chain->thread_state_lock);

      os_time_nanosleep_until(target_ns);

      /* Reacquiring the lock won't change any invariants for us, so this is fine.
       * We make sure to check chain->status after this function in case it got updated while we were sleeping. */
      mtx_lock(&chain->thread_state_lock);
   }

   return minimum_msc;
}
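
In the stable-MSC branch above, the period computation is a ceiling division, so a present never lands before the requested time. A worked sketch with assumed numbers (the helper and the sample values are illustrative):

#include <stdint.h>

static uint64_t target_msc(uint64_t entry_msc, uint64_t delta_time_ns,
                           uint64_t refresh_ns)
{
   /* Ceiling division: round up to the first vblank at or after the target. */
   uint64_t periods = (delta_time_ns + refresh_ns - 1) / refresh_ns;
   return entry_msc + periods;
}
/* Last completion at MSC 500 on a 16666667 ns (60 Hz) refresh, target 40 ms out:
 * two periods (33.3 ms) fall short, so we round up to three, i.e.
 * target_msc(500, 40000000, 16666667) == 503. */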

/**
 * Presentation thread.
 *

@@ -1991,8 +2424,8 @@ static int
x11_manage_present_queue(void *state)
{
   struct x11_swapchain *chain = state;
   struct wsi_x11_connection *wsi_conn =
      wsi_x11_get_connection((struct wsi_device*)chain->base.wsi, chain->conn);
   struct wsi_x11_icd_surface *wsi_conn =
      wsi_x11_get_icd_surface((struct wsi_device*)chain->base.wsi, chain->conn, chain->window);
   VkResult result = VK_SUCCESS;

   u_thread_setname("WSI swapchain queue");

@@ -2040,6 +2473,8 @@ x11_manage_present_queue(void *state)
      u_cnd_monotonic_wait(&chain->thread_state_cond, &chain->thread_state_lock);
   }

   target_msc = x11_present_compute_target_msc(chain, &chain->images[image_index].timing_request, target_msc);

   if (chain->status < 0) {
      mtx_unlock(&chain->thread_state_lock);
      break;

@@ -2315,7 +2750,7 @@ wsi_x11_recompute_dri3_modifier_hash(blake3_hash *hash, const struct wsi_drm_ima
}

static void
wsi_x11_get_dri3_modifiers(struct wsi_x11_connection *wsi_conn,
wsi_x11_get_dri3_modifiers(struct wsi_x11_icd_surface *wsi_conn,
                           xcb_connection_t *conn, xcb_window_t window,
                           uint8_t depth, uint8_t bpp,
                           uint64_t **modifiers_in, uint32_t *num_modifiers_in,

@@ -2402,8 +2837,8 @@ wsi_x11_swapchain_query_dri3_modifiers_changed(struct x11_swapchain *chain)
   uint64_t *modifiers[2] = {NULL, NULL};
   uint32_t num_modifiers[2] = {0, 0};

   struct wsi_x11_connection *wsi_conn =
      wsi_x11_get_connection((struct wsi_device*)chain->base.wsi, chain->conn);
   struct wsi_x11_icd_surface *wsi_conn =
      wsi_x11_get_icd_surface((struct wsi_device*)chain->base.wsi, chain->conn, chain->window);

   xcb_get_geometry_reply_t *geometry =
      xcb_get_geometry_reply(chain->conn, xcb_get_geometry(chain->conn, chain->window), NULL);

@@ -2551,7 +2986,7 @@ static VkResult x11_wait_for_present(struct wsi_swapchain *wsi_chain,

static unsigned
x11_get_min_image_count_for_present_mode(struct wsi_device *wsi_device,
                                         struct wsi_x11_connection *wsi_conn,
                                         struct wsi_x11_icd_surface *wsi_conn,
                                         VkPresentModeKHR present_mode)
{
   uint32_t min_image_count = x11_get_min_image_count(wsi_device, wsi_conn->is_xwayland);

@@ -2592,8 +3027,9 @@ x11_surface_create_swapchain(VkIcdSurfaceBase *icd_surface,
    * representing it.
    */
   xcb_connection_t *conn = x11_surface_get_connection(icd_surface);
   struct wsi_x11_connection *wsi_conn =
      wsi_x11_get_connection(wsi_device, conn);
   xcb_window_t window = x11_surface_get_window(icd_surface);
   struct wsi_x11_icd_surface *wsi_conn =
      wsi_x11_get_icd_surface(wsi_device, conn, window);
   if (!wsi_conn)
      return VK_ERROR_OUT_OF_HOST_MEMORY;

@@ -2613,7 +3049,6 @@ x11_surface_create_swapchain(VkIcdSurfaceBase *icd_surface,
   }

   /* Check that we have a window up-front. It is an error to not have one. */
   xcb_window_t window = x11_surface_get_window(icd_surface);

   /* Get the geometry of that window. The bit depth of the swapchain will be fitted and the
    * chain's images extents should fit it for performance-optimizing flips.

@@ -2736,8 +3171,14 @@ x11_surface_create_swapchain(VkIcdSurfaceBase *icd_surface,
   chain->base.wait_for_present2 = x11_wait_for_present;
   chain->base.release_images = x11_release_images;
   chain->base.set_present_mode = x11_set_present_mode;
   chain->base.set_timing_request = x11_set_timing_request;
   chain->base.poll_early_refresh = x11_poll_early_refresh;
   chain->base.present_mode = present_mode;
   chain->base.image_count = num_images;

   /* This is what the X server is using. We cannot really query it, but we rely on it working. */
   chain->base.present_timing.time_domain = VK_TIME_DOMAIN_CLOCK_MONOTONIC_KHR;

   chain->conn = conn;
   chain->window = window;
   chain->depth = bit_depth;

@@ -2749,6 +3190,7 @@ x11_surface_create_swapchain(VkIcdSurfaceBase *icd_surface,
   chain->has_dri3_modifiers = wsi_conn->has_dri3_modifiers;
   chain->has_mit_shm = wsi_conn->has_mit_shm;
   chain->has_async_may_tear = present_caps & XCB_PRESENT_CAPABILITY_ASYNC_MAY_TEAR;
   chain->has_reliable_msc = !wsi_conn->is_xwayland;

   /* When images in the swapchain don't fit the window, X can still present them, but it won't
    * happen by flip, only by copy. So this is a suboptimal copy, because if the client would change

@@ -2856,6 +3298,9 @@ x11_surface_create_swapchain(VkIcdSurfaceBase *icd_surface,
   /* It is safe to set it here as only one swapchain can be associated with
    * the window, and swapchain creation does the association. At this point
    * we know the creation is going to succeed. */

   /* If we have present timing, we need to make sure we get a usable estimate for the refresh rate
    * before we let the window run in full VRR. Once we have locked in the rate, we can enable the VRR property. */
   wsi_x11_set_adaptive_sync_property(conn, window,
                                      wsi_device->enable_adaptive_sync);
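
For context, per-window VRR on X11 is conventionally toggled through the _VARIABLE_REFRESH window property. A minimal sketch of such a toggle, modeled on that convention (illustrative, not code from this change):

#include <stdbool.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <xcb/xcb.h>

static void set_vrr_property(xcb_connection_t *conn, xcb_window_t window, bool on)
{
   const char *name = "_VARIABLE_REFRESH";
   xcb_intern_atom_cookie_t cookie =
      xcb_intern_atom(conn, 0, (uint16_t)strlen(name), name);
   xcb_intern_atom_reply_t *reply = xcb_intern_atom_reply(conn, cookie, NULL);
   if (!reply)
      return;

   if (on) {
      /* A CARDINAL value of 1 marks the window as eligible for VRR flips. */
      uint32_t value = 1;
      xcb_change_property(conn, XCB_PROP_MODE_REPLACE, window, reply->atom,
                          XCB_ATOM_CARDINAL, 32, 1, &value);
   } else {
      xcb_delete_property(conn, window, reply->atom);
   }
   free(reply);
}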

@@ -2889,6 +3334,18 @@ fail_alloc:
   return result;
}

static uint32_t x11_hash_icd_surface(const void *key)
{
   return _mesa_hash_data(key, sizeof(struct x11_icd_surface_key));
}

static bool x11_icd_surface_equal(const void *a_, const void *b_)
{
   const struct x11_icd_surface_key *a = a_;
   const struct x11_icd_surface_key *b = b_;
   return a->conn == b->conn && a->window == b->window;
}
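
The two helpers above let the connection table key on a (connection, window) pair instead of the bare connection pointer. A standalone sketch of the same composite-key idea without the Mesa hash-table API (FNV-1a is an arbitrary choice here; the struct and names are stand-ins):

#include <stdbool.h>
#include <stdint.h>
#include <string.h>

struct surface_key {
   void    *conn;   /* stands in for xcb_connection_t * */
   uint32_t window; /* stands in for xcb_window_t */
};

/* Byte-wise FNV-1a; keys should be zero-initialized before filling so any
 * struct padding hashes consistently, as with hashing the whole struct above. */
static uint32_t fnv1a_hash(const void *data, size_t size)
{
   const uint8_t *bytes = data;
   uint32_t hash = 2166136261u;
   for (size_t i = 0; i < size; i++)
      hash = (hash ^ bytes[i]) * 16777619u;
   return hash;
}

static uint32_t hash_surface_key(const struct surface_key *key)
{
   return fnv1a_hash(key, sizeof(*key));
}

static bool surface_key_equal(const struct surface_key *a, const struct surface_key *b)
{
   return a->conn == b->conn && a->window == b->window;
}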

VkResult
wsi_x11_init_wsi(struct wsi_device *wsi_device,
                 const VkAllocationCallbacks *alloc,

@@ -2916,8 +3373,7 @@ wsi_x11_init_wsi(struct wsi_device *wsi_device,
      goto fail_alloc;
   }

   wsi->connections = _mesa_hash_table_create(NULL, _mesa_hash_pointer,
                                              _mesa_key_pointer_equal);
   wsi->connections = _mesa_hash_table_create(NULL, x11_hash_icd_surface, x11_icd_surface_equal);
   if (!wsi->connections) {
      result = VK_ERROR_OUT_OF_HOST_MEMORY;
      goto fail_mutex;

@@ -2981,7 +3437,7 @@ wsi_x11_finish_wsi(struct wsi_device *wsi_device,

   if (wsi) {
      hash_table_foreach(wsi->connections, entry)
         wsi_x11_connection_destroy(wsi_device, entry->data);
         wsi_x11_icd_surface_destroy(wsi_device, entry->data);

      _mesa_hash_table_destroy(wsi->connections, NULL);