vulkan/wsi: Implement QUEUE_OPERATIONS_END present timing query.

This is mostly provided for convenience, but it's not implementable by
applications when we're using blit queues for PRIME, so it's quite useful
to have.

This is reworked from previous GOOGLE_display_timing
MRs by Keith Packard and Emma Anholt.
See MR 38472 for reference.

Rather than exposing PRESENT_STAGE_LOCAL, we expose all timestamps in
one unified domain to simplify the implementation.
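Concretely, all reported timestamps, including QUEUE_OPERATIONS_END, end up
in the swapchain's CPU time domain (CLOCK_MONOTONIC or CLOCK_MONOTONIC_RAW);
device timestamps are converted into that domain via KHR_calibrated_timestamps.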

Signed-off-by: Hans-Kristian Arntzen <post@arntzen-software.no>
Reviewed-by: Emma Anholt <emma@anholt.net>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/38770>
Hans-Kristian Arntzen, 2025-12-08 14:25:47 +01:00, committed by Marge Bot
parent cb22d413ba
commit bf9cd1546f
4 changed files with 440 additions and 31 deletions


@@ -95,6 +95,7 @@ wsi_device_init(struct wsi_device *wsi,
WSI_GET_CB(GetPhysicalDeviceProperties2);
WSI_GET_CB(GetPhysicalDeviceMemoryProperties);
WSI_GET_CB(GetPhysicalDeviceQueueFamilyProperties);
WSI_GET_CB(GetPhysicalDeviceProperties);
#undef WSI_GET_CB
wsi->drm_info.sType =
@@ -121,10 +122,18 @@ wsi_device_init(struct wsi_device *wsi,
VkQueueFamilyProperties queue_properties[64];
GetPhysicalDeviceQueueFamilyProperties(pdevice, &wsi->queue_family_count, queue_properties);
VkPhysicalDeviceProperties properties;
GetPhysicalDeviceProperties(pdevice, &properties);
wsi->timestamp_period = properties.limits.timestampPeriod;
for (unsigned i = 0; i < wsi->queue_family_count; i++) {
VkFlags req_flags = VK_QUEUE_GRAPHICS_BIT | VK_QUEUE_COMPUTE_BIT | VK_QUEUE_TRANSFER_BIT;
if (queue_properties[i].queueFlags & req_flags)
wsi->queue_supports_blit |= BITFIELD64_BIT(i);
/* Don't want to consider timestamp wrapping logic. */
if (queue_properties[i].timestampValidBits == 64)
wsi->queue_supports_timestamps |= BITFIELD64_BIT(i);
}
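/* For scale, with invented numbers: a queue with 36 timestampValidBits and a
* 1 ns timestampPeriod wraps after 2^36 ns ~= 69 seconds, which a long-lived
* swapchain easily exceeds, while 64 valid bits wrap after 2^64 ns ~= 585
* years, so wrapping can safely be ignored. */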
for (VkExternalSemaphoreHandleTypeFlags handle_type = 1;
@@ -180,15 +189,19 @@ wsi_device_init(struct wsi_device *wsi,
WSI_GET_CB(CmdPipelineBarrier);
WSI_GET_CB(CmdCopyImage);
WSI_GET_CB(CmdCopyImageToBuffer);
WSI_GET_CB(CmdResetQueryPool);
WSI_GET_CB(CmdWriteTimestamp);
WSI_GET_CB(CreateBuffer);
WSI_GET_CB(CreateCommandPool);
WSI_GET_CB(CreateFence);
WSI_GET_CB(CreateImage);
WSI_GET_CB(CreateQueryPool);
WSI_GET_CB(CreateSemaphore);
WSI_GET_CB(DestroyBuffer);
WSI_GET_CB(DestroyCommandPool);
WSI_GET_CB(DestroyFence);
WSI_GET_CB(DestroyImage);
WSI_GET_CB(DestroyQueryPool);
WSI_GET_CB(DestroySemaphore);
WSI_GET_CB(EndCommandBuffer);
WSI_GET_CB(FreeMemory);
@@ -200,9 +213,14 @@ wsi_device_init(struct wsi_device *wsi,
WSI_GET_CB(GetImageSubresourceLayout);
if (!wsi->sw)
WSI_GET_CB(GetMemoryFdKHR);
WSI_GET_CB(GetPhysicalDeviceCalibrateableTimeDomainsKHR);
WSI_GET_CB(GetPhysicalDeviceProperties);
WSI_GET_CB(GetPhysicalDeviceFormatProperties);
WSI_GET_CB(GetPhysicalDeviceFormatProperties2);
WSI_GET_CB(GetPhysicalDeviceImageFormatProperties2);
WSI_GET_CB(GetPhysicalDeviceQueueFamilyProperties);
WSI_GET_CB(GetCalibratedTimestampsKHR);
WSI_GET_CB(GetQueryPoolResults);
WSI_GET_CB(GetSemaphoreFdKHR);
WSI_GET_CB(ResetFences);
WSI_GET_CB(QueueSubmit2);
@@ -459,6 +477,10 @@ configure_image(const struct wsi_swapchain *chain,
}
}
static void
wsi_surface_capabilities_add_present_stages(const struct wsi_device *wsi_device,
VkPresentTimingSurfaceCapabilitiesEXT *present_timing);
VkResult
wsi_swapchain_init(const struct wsi_device *wsi,
struct wsi_swapchain *chain,
@@ -484,8 +506,10 @@ wsi_swapchain_init(const struct wsi_device *wsi,
(pCreateInfo->flags & VK_SWAPCHAIN_CREATE_PRESENT_WAIT_2_BIT_KHR);
chain->blit.queue = NULL;
if (chain->blit.type != WSI_SWAPCHAIN_NO_BLIT) {
if (wsi->get_blit_queue) {
if (chain->blit.type != WSI_SWAPCHAIN_NO_BLIT ||
(pCreateInfo->flags & VK_SWAPCHAIN_CREATE_PRESENT_TIMING_BIT_EXT)) {
if (chain->blit.type != WSI_SWAPCHAIN_NO_BLIT && wsi->get_blit_queue) {
chain->blit.queue = wsi->get_blit_queue(_device);
}
@@ -506,10 +530,18 @@ wsi_swapchain_init(const struct wsi_device *wsi,
if (chain->blit.queue != NULL) {
queue_family_index = chain->blit.queue->queue_family_index;
} else {
uint64_t effective_queues = wsi->queue_supports_blit;
if (pCreateInfo->flags & VK_SWAPCHAIN_CREATE_PRESENT_TIMING_BIT_EXT)
effective_queues &= wsi->queue_supports_timestamps;
/* Fallback: if this happens, we don't advertise support for queue-complete times. */
if (!effective_queues)
effective_queues = wsi->queue_supports_blit;
/* Queues returned by get_blit_queue() might not be listed in
* GetPhysicalDeviceQueueFamilyProperties, so this check is skipped for those queues.
*/
if (!(wsi->queue_supports_blit & BITFIELD64_BIT(queue_family_index)))
if (!(effective_queues & BITFIELD64_BIT(queue_family_index)))
continue;
}
@@ -537,6 +569,24 @@ wsi_swapchain_init(const struct wsi_device *wsi,
goto fail;
#endif
if (pCreateInfo->flags & VK_SWAPCHAIN_CREATE_PRESENT_TIMING_BIT_EXT) {
ICD_FROM_HANDLE(VkIcdSurfaceBase, surface, pCreateInfo->surface);
struct wsi_interface *iface = wsi->wsi[surface->platform];
VkPresentTimingSurfaceCapabilitiesEXT timing_caps = {
.sType = VK_STRUCTURE_TYPE_PRESENT_TIMING_SURFACE_CAPABILITIES_EXT,
};
VkSurfaceCapabilities2KHR caps2 = {
.sType = VK_STRUCTURE_TYPE_SURFACE_CAPABILITIES_2_KHR,
.pNext = &timing_caps,
};
iface->get_capabilities2(surface, (struct wsi_device *)wsi, NULL, &caps2);
wsi_surface_capabilities_add_present_stages(wsi, &timing_caps);
chain->present_timing.supported_query_stages = timing_caps.presentStageQueries;
}
return VK_SUCCESS;
fail:
@@ -619,7 +669,7 @@ wsi_swapchain_finish(struct wsi_swapchain *chain)
chain->wsi->DestroySemaphore(chain->device, chain->present_id_timeline,
&chain->alloc);
if (chain->blit.type != WSI_SWAPCHAIN_NO_BLIT) {
if (chain->cmd_pools) {
int cmd_pools_count = chain->blit.queue != NULL ?
1 : chain->wsi->queue_family_count;
for (uint32_t i = 0; i < cmd_pools_count; i++) {
@@ -824,6 +874,88 @@ fail:
return result;
}
/**
* Creates the timestamp-query command buffers for the end of rendering, which
* will be used to report the QUEUE_OPERATIONS_END timestamp for
* EXT_present_timing.
*
* Unless the swapchain is blitting, we don't know which queue family a
* present will happen on, so we record a timestamp command buffer for each
* queue family so they're ready to go at present time.
*/
VkResult
wsi_image_init_timestamp(const struct wsi_swapchain *chain,
struct wsi_image *image)
{
const struct wsi_device *wsi = chain->wsi;
VkResult result;
/* Set up the query pool and command buffers used to capture the timestamp. */
result = wsi->CreateQueryPool(
chain->device,
&(const VkQueryPoolCreateInfo){
.sType = VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO,
.queryType = VK_QUERY_TYPE_TIMESTAMP,
.queryCount = 1,
},
NULL,
&image->query_pool);
if (result != VK_SUCCESS)
goto fail;
uint32_t family_count = chain->blit.queue ? 1 : wsi->queue_family_count;
if (!image->timestamp_cmd_buffers) {
image->timestamp_cmd_buffers =
vk_zalloc(&chain->alloc, sizeof(VkCommandBuffer) * family_count, 8,
VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
if (!image->timestamp_cmd_buffers)
return VK_ERROR_OUT_OF_HOST_MEMORY;
}
for (uint32_t i = 0; i < family_count; i++) {
/* We can only use timestamps on a queue that reports timestampValidBits != 0.
* Since this implementation doesn't consider device timestamp wrapping
* (it's unclear how that would ever work), we only report queue-done times
* where timestampValidBits == 64. */
if (!chain->cmd_pools[i])
continue;
result = wsi->AllocateCommandBuffers(
chain->device,
&(const VkCommandBufferAllocateInfo){
.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO,
.pNext = NULL,
.commandPool = chain->cmd_pools[i],
.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY,
.commandBufferCount = 1,
}, &image->timestamp_cmd_buffers[i]);
if (result != VK_SUCCESS)
goto fail;
wsi->BeginCommandBuffer(
image->timestamp_cmd_buffers[i],
&(VkCommandBufferBeginInfo) {
.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO,
});
wsi->CmdResetQueryPool(image->timestamp_cmd_buffers[i],
image->query_pool,
0, 1);
wsi->CmdWriteTimestamp(image->timestamp_cmd_buffers[i],
VK_PIPELINE_STAGE_ALL_COMMANDS_BIT,
image->query_pool,
0);
wsi->EndCommandBuffer(image->timestamp_cmd_buffers[i]);
}
return VK_SUCCESS;
fail:
return result;
}
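/* Illustrative sketch, not part of the change: how a recorded
 * QUEUE_OPERATIONS_END tick value can be read back and converted to
 * nanoseconds. It mirrors wsi_swapchain_present_timing_sample_query_pool
 * below, but uses direct Vulkan calls instead of the wsi dispatch table;
 * read_queue_done_time_ns is a hypothetical helper name. Assumes
 * <vulkan/vulkan.h> and <stdint.h>. */
static uint64_t
read_queue_done_time_ns(VkDevice device, VkQueryPool pool, float timestamp_period)
{
   uint64_t ticks = 0;
   /* Without VK_QUERY_RESULT_WAIT_BIT this returns VK_NOT_READY instead of
    * blocking when the GPU hasn't written the timestamp yet. */
   if (vkGetQueryPoolResults(device, pool, 0, 1, sizeof(ticks), &ticks,
                             sizeof(ticks), VK_QUERY_RESULT_64_BIT) != VK_SUCCESS)
      return 0; /* 0 means "unknown", matching the convention used here. */
   /* Device timestamps are in ticks; timestampPeriod converts ticks to ns. */
   return (uint64_t)((double)ticks * (double)timestamp_period);
}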
void
wsi_destroy_image(const struct wsi_swapchain *chain,
struct wsi_image *image)
@@ -859,6 +991,19 @@ wsi_destroy_image(const struct wsi_swapchain *chain,
vk_free(&chain->alloc, image->blit.cmd_buffers);
}
wsi->DestroyQueryPool(chain->device, image->query_pool, NULL);
if (image->timestamp_cmd_buffers) {
uint32_t family_count = chain->blit.queue ? 1 : wsi->queue_family_count;
for (uint32_t i = 0; i < family_count; i++) {
if (image->timestamp_cmd_buffers[i]) {
wsi->FreeCommandBuffers(chain->device, chain->cmd_pools[i],
1, &image->timestamp_cmd_buffers[i]);
}
}
vk_free(&chain->alloc, image->timestamp_cmd_buffers);
}
wsi->FreeMemory(chain->device, image->memory, &chain->alloc);
wsi->DestroyImage(chain->device, image->image, &chain->alloc);
wsi->DestroyImage(chain->device, image->blit.image, &chain->alloc);
@@ -910,6 +1055,37 @@ wsi_GetPhysicalDeviceSurfaceCapabilitiesKHR(
return result;
}
static void
wsi_surface_capabilities_add_present_stages(const struct wsi_device *wsi_device,
VkPresentTimingSurfaceCapabilitiesEXT *present_timing)
{
if (wsi_device->queue_supports_blit & wsi_device->queue_supports_timestamps) {
/* Make sure the implementation is capable of calibrating timestamps. */
if (wsi_device->GetPhysicalDeviceCalibrateableTimeDomainsKHR && wsi_device->GetCalibratedTimestampsKHR) {
VkTimeDomainKHR domains[64];
uint32_t count = ARRAY_SIZE(domains);
wsi_device->GetPhysicalDeviceCalibrateableTimeDomainsKHR(wsi_device->pdevice, &count, domains);
bool supports_device = false, supports_monotonic = false, supports_monotonic_raw = false;
for (uint32_t i = 0; i < count; i++) {
if (domains[i] == VK_TIME_DOMAIN_DEVICE_KHR)
supports_device = true;
else if (domains[i] == VK_TIME_DOMAIN_CLOCK_MONOTONIC_KHR)
supports_monotonic = true;
else if (domains[i] == VK_TIME_DOMAIN_CLOCK_MONOTONIC_RAW_KHR)
supports_monotonic_raw = true;
}
/* Current present timing implementations do not use anything outside these.
* QPC might be relevant for Dozen at some point, but for now, we only consider Linux-centric
* platforms for present timing. */
if (supports_device && supports_monotonic && supports_monotonic_raw)
present_timing->presentStageQueries |= VK_PRESENT_STAGE_QUEUE_OPERATIONS_END_BIT_EXT;
}
}
}
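/* For reference, a minimal sketch (not part of the change) of the canonical
 * count-then-fill pattern for vkGetPhysicalDeviceCalibrateableTimeDomainsKHR;
 * the fixed 64-entry array above is a reasonable shortcut given how few time
 * domains exist. supports_time_domain is a hypothetical helper name, and the
 * entry point is assumed to be resolved by the loader. Assumes <stdbool.h>
 * and <stdlib.h>. */
static bool
supports_time_domain(VkPhysicalDevice pdevice, VkTimeDomainKHR wanted)
{
   uint32_t count = 0;
   vkGetPhysicalDeviceCalibrateableTimeDomainsKHR(pdevice, &count, NULL);
   VkTimeDomainKHR *domains = calloc(count, sizeof(*domains));
   if (!domains)
      return false;
   vkGetPhysicalDeviceCalibrateableTimeDomainsKHR(pdevice, &count, domains);
   bool found = false;
   for (uint32_t i = 0; i < count; i++)
      found |= (domains[i] == wanted);
   free(domains);
   return found;
}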
VKAPI_ATTR VkResult VKAPI_CALL
wsi_GetPhysicalDeviceSurfaceCapabilities2KHR(
VkPhysicalDevice physicalDevice,
@@ -921,8 +1097,26 @@ wsi_GetPhysicalDeviceSurfaceCapabilities2KHR(
struct wsi_device *wsi_device = device->wsi_device;
struct wsi_interface *iface = wsi_device->wsi[surface->platform];
return iface->get_capabilities2(surface, wsi_device, pSurfaceInfo->pNext,
pSurfaceCapabilities);
VkResult vr = iface->get_capabilities2(surface, wsi_device, pSurfaceInfo->pNext,
pSurfaceCapabilities);
if (vr != VK_SUCCESS)
return vr;
VkPresentTimingSurfaceCapabilitiesEXT *present_timing =
vk_find_struct(pSurfaceCapabilities, PRESENT_TIMING_SURFACE_CAPABILITIES_EXT);
if (present_timing && present_timing->presentTimingSupported) {
wsi_surface_capabilities_add_present_stages(wsi_device, present_timing);
if (!(present_timing->presentStageQueries & VK_PRESENT_STAGE_QUEUE_OPERATIONS_END_BIT_EXT)) {
/* CTS and proposal document assert that QUEUE_OPERATIONS_END must be supported. */
present_timing->presentTimingSupported = VK_FALSE;
present_timing->presentAtAbsoluteTimeSupported = VK_FALSE;
present_timing->presentAtRelativeTimeSupported = VK_FALSE;
present_timing->presentStageQueries = 0;
}
}
return vr;
}
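/* A hedged sketch (not part of the change) of what the vk_find_struct() call
 * above does: walk the output structure's pNext chain looking for a matching
 * sType. find_struct is a hypothetical stand-in for the Mesa helper. */
static void *
find_struct(void *chain_start, VkStructureType wanted)
{
   for (VkBaseOutStructure *s = chain_start; s != NULL; s = s->pNext) {
      if (s->sType == wanted)
         return s;
   }
   return NULL;
}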
VKAPI_ATTR VkResult VKAPI_CALL
@@ -1126,6 +1320,15 @@ wsi_CreateSwapchainKHR(VkDevice _device,
swapchain->present_timing.active = true;
mtx_init(&swapchain->present_timing.lock, 0);
for (uint32_t i = 0; i < swapchain->image_count; i++) {
struct wsi_image *image = swapchain->get_wsi_image(swapchain, i);
result = wsi_image_init_timestamp(swapchain, image);
if (result != VK_SUCCESS) {
swapchain->destroy(swapchain, alloc);
return result;
}
}
if (swapchain->poll_early_refresh) {
/* If we can query the display directly, we should report something reasonable on first query
* before we even present the first time. */
@@ -1195,8 +1398,87 @@ wsi_ReleaseSwapchainImagesKHR(VkDevice _device,
return VK_SUCCESS;
}
static uint64_t
wsi_swapchain_present_convert_device_to_cpu(struct wsi_swapchain *chain,
uint64_t device_timestamp_ns)
{
if (device_timestamp_ns == 0)
return 0;
/* This is only relevant if the application requests that the QUEUE_OPERATIONS_END
* present stage be used as the target time domain (targetTimeDomainPresentStage). */
/* We have already made sure that the implementation supports these time domains. */
const VkCalibratedTimestampInfoKHR infos[2] = {
{
.sType = VK_STRUCTURE_TYPE_CALIBRATED_TIMESTAMP_INFO_KHR,
.timeDomain = VK_TIME_DOMAIN_DEVICE_KHR,
},
{
.sType = VK_STRUCTURE_TYPE_CALIBRATED_TIMESTAMP_INFO_KHR,
.timeDomain = chain->present_timing.time_domain,
},
};
uint64_t timestamps[2];
uint64_t max_deviation;
/* Ignore target time if this fails for whatever reason. */
if (chain->wsi->GetCalibratedTimestampsKHR(chain->device, 2, infos, timestamps, &max_deviation) != VK_SUCCESS)
return 0;
/* The target CPU time domain is already expressed in nanoseconds; only the
* device timestamp needs to be scaled from ticks by timestampPeriod. */
timestamps[0] = (uint64_t)((double)chain->wsi->timestamp_period * (double)timestamps[0]);
int64_t device_delta_ns = (int64_t)device_timestamp_ns - (int64_t)timestamps[0];
uint64_t target_timestamp = timestamps[1] + device_delta_ns;
return target_timestamp;
}
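/* Worked example with invented numbers: if the calibrated pair reads
* device = 2,000,000 ticks and CPU = 5,000,000,000 ns with a timestampPeriod
* of 1.0, then a recorded device timestamp of 2,000,500 ns gives
* device_delta_ns = 500 and a converted target of 5,000,000,500 ns. */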
static void
wsi_swapchain_present_timing_sample_query_pool(struct wsi_swapchain *chain,
struct wsi_presentation_timing *timing,
struct wsi_image *image,
uint64_t upper_bound)
{
/* The application can query stages that are not supported. We need to return 0 for those. */
if (!(timing->requested_feedback & VK_PRESENT_STAGE_QUEUE_OPERATIONS_END_BIT_EXT) ||
!(chain->present_timing.supported_query_stages & VK_PRESENT_STAGE_QUEUE_OPERATIONS_END_BIT_EXT))
return;
/* The GPU really should be done by now and we should be able to read the
* timestamp, but it's possible that the present was discarded and we have a
* 0 timestamp for it. In that case, we should not block waiting on the queue
* dispatch timestamp. */
uint64_t queue_ts;
if (chain->wsi->GetQueryPoolResults(chain->device, image->query_pool, 0, 1, sizeof(uint64_t),
&queue_ts, sizeof(uint64_t), VK_QUERY_RESULT_64_BIT) == VK_SUCCESS) {
/* This is still reported in nanoseconds, so scale the raw tick value by timestampPeriod. */
timing->queue_done_time = (uint64_t)((double)queue_ts * (double)chain->wsi->timestamp_period);
}
}
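/* Example with invented numbers: a 19.2 MHz timestamp counter has a
* timestampPeriod of ~52.08 ns/tick, so a raw query value of 1,000,000 ticks
* is reported as roughly 52,083,333 ns. */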
static void
wsi_swapchain_present_timing_notify_recycle_locked(struct wsi_swapchain *chain,
struct wsi_image *image)
{
assert(chain->present_timing.active);
for (size_t i = 0; i < chain->present_timing.timings_count; i++) {
if (chain->present_timing.timings[i].image == image) {
/* A different present takes ownership of the image's query pool index now. */
chain->present_timing.timings[i].image = NULL;
chain->present_timing.timings[i].queue_done_time = 0;
/* We waited on the progress fence, so the timestamp query is guaranteed to be done. */
wsi_swapchain_present_timing_sample_query_pool(chain, &chain->present_timing.timings[i], image, 0);
break;
}
}
}
static VkResult wsi_common_allocate_timing_request(
struct wsi_swapchain *swapchain, const VkPresentTimingInfoEXT *timing, uint64_t present_id)
struct wsi_swapchain *swapchain, const VkPresentTimingInfoEXT *timing,
uint64_t present_id, struct wsi_image *image)
{
VkResult vr = VK_SUCCESS;
mtx_lock(&swapchain->present_timing.lock);
@@ -1206,6 +1488,8 @@ static VkResult wsi_common_allocate_timing_request(
goto err;
}
wsi_swapchain_present_timing_notify_recycle_locked(swapchain, image);
struct wsi_presentation_timing *wsi_timing =
&swapchain->present_timing.timings[swapchain->present_timing.timings_count++];
@@ -1213,8 +1497,12 @@ static VkResult wsi_common_allocate_timing_request(
wsi_timing->serial = ++swapchain->present_timing.serial;
wsi_timing->target_time = timing->targetTime;
wsi_timing->present_id = present_id;
/* It is allowed to ask for more stages than are supported, but we need to ignore the unsupported ones later. */
wsi_timing->requested_feedback = timing->presentStageQueries;
wsi_timing->image = image;
/* Ignore the time domain since we have a static domain. */
err:
@@ -1225,7 +1513,8 @@ err:
void
wsi_swapchain_present_timing_notify_completion(struct wsi_swapchain *chain,
uint64_t timing_serial,
uint64_t timestamp)
uint64_t timestamp,
struct wsi_image *image)
{
assert(chain->present_timing.active);
mtx_lock(&chain->present_timing.lock);
@@ -1234,6 +1523,17 @@ wsi_swapchain_present_timing_notify_completion(struct wsi_swapchain *chain,
if (chain->present_timing.timings[i].serial == timing_serial) {
chain->present_timing.timings[i].complete_time = timestamp;
chain->present_timing.timings[i].complete = VK_TRUE;
/* It's possible that QueuePresentKHR already handled the queue done timestamp for us,
* since the image was recycled before presentation could fully complete.
* In this case, we no longer own the timestamp query pool index, so just skip. */
if (chain->present_timing.timings[i].image != image)
break;
/* 0 means unknown. Application can probably fall back to its own timestamps if it wants to. */
chain->present_timing.timings[i].queue_done_time = 0;
wsi_swapchain_present_timing_sample_query_pool(chain, &chain->present_timing.timings[i], image, timestamp);
chain->present_timing.timings[i].image = NULL;
break;
}
}
@@ -1314,7 +1614,7 @@ wsi_GetPastPresentationTimingEXT(
bool stop_timing_removal = false;
for (uint32_t i = 0; i < swapchain->present_timing.timings_count; i++) {
const struct wsi_presentation_timing *in_timing = &swapchain->present_timing.timings[i];
struct wsi_presentation_timing *in_timing = &swapchain->present_timing.timings[i];
if (!swapchain->present_timing.timings[i].complete || stop_timing_removal) {
/* Keep output ordered to be compliant without having to re-sort every time.
@@ -1325,6 +1625,26 @@
continue;
}
/* In some odd cases, completions can happen out of order,
* and the CTS tests that completion times for IMMEDIATE/MAILBOX are monotonic.
* We always retire in order, so this is fine. */
if (in_timing->queue_done_time) {
/* The CTS questionably checks that timing is strictly increasing in places.
* Bumping times by one nanosecond works around this (e.g., two presents
* retiring at t = 1000 ns are reported as 1000 and 1001).
* In practice, this only happens when there is calibration jitter or when
* multiple images are retired at the exact same time due to
* MAILBOX/IMMEDIATE. */
in_timing->queue_done_time = MAX2(in_timing->queue_done_time,
swapchain->present_timing.minimum_queue_done_time + 1);
swapchain->present_timing.minimum_queue_done_time = in_timing->queue_done_time;
}
if (in_timing->complete_time) {
in_timing->complete_time = MAX2(in_timing->complete_time,
swapchain->present_timing.minimum_complete_time + 1);
swapchain->present_timing.minimum_complete_time = in_timing->complete_time;
}
vk_outarray_append_typed(VkPastPresentationTimingEXT, &timings, timing) {
timing->targetTime = swapchain->present_timing.timings[i].target_time;
timing->presentId = in_timing->present_id;
@@ -1350,12 +1670,23 @@
}
}
if (in_timing->requested_feedback & ~VK_PRESENT_STAGE_QUEUE_OPERATIONS_END_BIT_EXT) {
vk_outarray_append_typed(VkPresentStageTimeEXT, &stages, stage) {
stage->stage = in_timing->requested_feedback & ~VK_PRESENT_STAGE_QUEUE_OPERATIONS_END_BIT_EXT;
/* It is expected that implementation will only expose one timing value. */
assert(util_bitcount(stage->stage) == 1);
stage->time = in_timing->complete_time;
/* CTS expects that we are able to return something for all stages, even if they are not supported. */
static const VkPresentStageFlagBitsEXT candidate_stages[] = {
VK_PRESENT_STAGE_REQUEST_DEQUEUED_BIT_EXT,
VK_PRESENT_STAGE_IMAGE_FIRST_PIXEL_OUT_BIT_EXT,
VK_PRESENT_STAGE_IMAGE_FIRST_PIXEL_VISIBLE_BIT_EXT,
};
for (int stage_index = 0; stage_index < ARRAY_SIZE(candidate_stages); stage_index++) {
bool requested = (in_timing->requested_feedback & candidate_stages[stage_index]) != 0;
bool supported = (swapchain->present_timing.supported_query_stages & candidate_stages[stage_index]) != 0;
if (requested) {
vk_outarray_append_typed(VkPresentStageTimeEXT, &stages, stage) {
stage->stage = candidate_stages[stage_index];
/* It is expected that the implementation will only expose one timing value. */
stage->time = supported ? in_timing->complete_time : 0;
}
}
}
}
@@ -1811,6 +2142,8 @@ wsi_common_queue_present(const struct wsi_device *wsi,
const VkPresentTimingsInfoEXT *present_timings_info =
vk_find_struct_const(pPresentInfo->pNext, PRESENT_TIMINGS_INFO_EXT);
bool needs_timing_command_buffer = false;
if (present_timings_info) {
/* If we fail a present due to full queue, it's a little unclear from
* spec if we should treat it as OUT_OF_DATE or OUT_OF_HOST_MEMORY for
@@ -1824,19 +2157,40 @@ wsi_common_queue_present(const struct wsi_device *wsi,
assert(swapchain->present_timing.active);
uint32_t image_index = pPresentInfo->pImageIndices[i];
/* EXT_present_timing is defined to only work with present_id2.
* It's only used when reporting back timings. */
results[i] = wsi_common_allocate_timing_request(
swapchain, info, present_ids2 ? present_ids2->pPresentIds[i] : 0);
swapchain, info, present_ids2 ? present_ids2->pPresentIds[i] : 0,
swapchain->get_wsi_image(swapchain, image_index));
/* The application is responsible for allocating sufficient size here.
* We fail with VK_ERROR_PRESENT_TIMING_QUEUE_FULL_EXT if the application is buggy. */
if (results[i] == VK_SUCCESS) {
/* We may have to rewrite the timestamp if the application requests timestamps
* in terms of the QUEUE_OPERATIONS_END time domain, which is actually DEVICE. */
uint64_t target_time = info->targetTime;
if (info->targetTimeDomainPresentStage == VK_PRESENT_STAGE_QUEUE_OPERATIONS_END_BIT_EXT &&
!(info->flags & VK_PRESENT_TIMING_INFO_PRESENT_AT_RELATIVE_TIME_BIT_EXT)) {
/* For relative times, it's all nanoseconds anyway, so no need to do anything. */
target_time = wsi_swapchain_present_convert_device_to_cpu(swapchain, target_time);
}
swapchain->set_timing_request(swapchain, &(struct wsi_image_timing_request) {
.serial = swapchain->present_timing.serial,
.time = info->targetTime,
.time = target_time,
.flags = info->flags,
});
if (info->presentStageQueries & swapchain->present_timing.supported_query_stages &
VK_PRESENT_STAGE_QUEUE_OPERATIONS_END_BIT_EXT) {
/* It's not a problem if we redundantly submit timing command buffers.
* VUID-12234 also says all swapchains in this present must have been
* created with present timing enabled. */
needs_timing_command_buffer = true;
}
}
}
}
@@ -1919,15 +2273,15 @@
* the per-image semaphores and fences with the blit.
*/
{
STACK_ARRAY(VkCommandBufferSubmitInfo, blit_command_buffer_infos,
pPresentInfo->swapchainCount);
STACK_ARRAY(VkCommandBufferSubmitInfo, command_buffer_infos,
pPresentInfo->swapchainCount * 2);
STACK_ARRAY(VkSemaphoreSubmitInfo, signal_semaphore_infos,
pPresentInfo->swapchainCount *
ARRAY_SIZE(image_signal_infos[0].semaphore_infos));
STACK_ARRAY(VkFence, fences,
pPresentInfo->swapchainCount *
ARRAY_SIZE(image_signal_infos[0].fences));
uint32_t blit_count = 0, signal_semaphore_count = 0, fence_count = 0;
uint32_t command_buffer_count = 0, signal_semaphore_count = 0, fence_count = 0;
for (uint32_t i = 0; i < pPresentInfo->swapchainCount; i++) {
VK_FROM_HANDLE(wsi_swapchain, swapchain, pPresentInfo->pSwapchains[i]);
@@ -1981,20 +2335,27 @@ wsi_common_queue_present(const struct wsi_device *wsi,
}
if (swapchain->blit.type != WSI_SWAPCHAIN_NO_BLIT) {
blit_command_buffer_infos[blit_count++] = (VkCommandBufferSubmitInfo) {
command_buffer_infos[command_buffer_count++] = (VkCommandBufferSubmitInfo) {
.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_SUBMIT_INFO,
.commandBuffer =
image->blit.cmd_buffers[queue->queue_family_index],
};
}
if (needs_timing_command_buffer) {
command_buffer_infos[command_buffer_count++] = (VkCommandBufferSubmitInfo) {
.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_SUBMIT_INFO,
.commandBuffer = image->timestamp_cmd_buffers[queue->queue_family_index],
};
}
}
const VkSubmitInfo2 submit_info = {
.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO_2,
.waitSemaphoreInfoCount = pPresentInfo->waitSemaphoreCount,
.pWaitSemaphoreInfos = semaphore_wait_infos,
.commandBufferInfoCount = blit_count,
.pCommandBufferInfos = blit_command_buffer_infos,
.commandBufferInfoCount = command_buffer_count,
.pCommandBufferInfos = command_buffer_infos,
.signalSemaphoreInfoCount = signal_semaphore_count,
.pSignalSemaphoreInfos = signal_semaphore_infos,
};
@@ -2010,7 +2371,7 @@ wsi_common_queue_present(const struct wsi_device *wsi,
STACK_ARRAY_FINISH(fences);
STACK_ARRAY_FINISH(signal_semaphore_infos);
STACK_ARRAY_FINISH(blit_command_buffer_infos);
STACK_ARRAY_FINISH(command_buffer_infos);
}
/* Now do blits on any blit queues */
@@ -2035,17 +2396,27 @@ wsi_common_queue_present(const struct wsi_device *wsi,
.semaphore = swapchain->blit.semaphores[image_index],
};
const VkCommandBufferSubmitInfo blit_command_buffer_info = {
VkCommandBufferSubmitInfo command_buffer_infos[2];
uint32_t command_buffer_count = 0;
command_buffer_infos[command_buffer_count++] = (VkCommandBufferSubmitInfo) {
.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_SUBMIT_INFO,
.commandBuffer = image->blit.cmd_buffers[0],
};
if (needs_timing_command_buffer) {
command_buffer_infos[command_buffer_count++] = (VkCommandBufferSubmitInfo) {
.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_SUBMIT_INFO,
.commandBuffer = image->timestamp_cmd_buffers[0],
};
}
const VkSubmitInfo2 submit_info = {
.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO_2,
.waitSemaphoreInfoCount = 1,
.pWaitSemaphoreInfos = &blit_semaphore_info,
.commandBufferInfoCount = 1,
.pCommandBufferInfos = &blit_command_buffer_info,
.commandBufferInfoCount = command_buffer_count,
.pCommandBufferInfos = command_buffer_infos,
.signalSemaphoreInfoCount = image_signal_infos[i].semaphore_count,
.pSignalSemaphoreInfos = image_signal_infos[i].semaphore_infos,
};


@@ -62,6 +62,8 @@ struct wsi_device {
VkPhysicalDeviceMemoryProperties memory_props;
uint32_t queue_family_count;
uint64_t queue_supports_blit;
uint64_t queue_supports_timestamps;
float timestamp_period;
VkPhysicalDeviceDrmPropertiesEXT drm_info;
VkPhysicalDevicePCIBusInfoPropertiesEXT pci_bus_info;
@@ -201,28 +203,37 @@ struct wsi_device {
WSI_CB(CmdPipelineBarrier);
WSI_CB(CmdCopyImage);
WSI_CB(CmdCopyImageToBuffer);
WSI_CB(CmdResetQueryPool);
WSI_CB(CmdWriteTimestamp);
WSI_CB(CreateBuffer);
WSI_CB(CreateCommandPool);
WSI_CB(CreateFence);
WSI_CB(CreateImage);
WSI_CB(CreateQueryPool);
WSI_CB(CreateSemaphore);
WSI_CB(DestroyBuffer);
WSI_CB(DestroyCommandPool);
WSI_CB(DestroyFence);
WSI_CB(DestroyImage);
WSI_CB(DestroyQueryPool);
WSI_CB(DestroySemaphore);
WSI_CB(EndCommandBuffer);
WSI_CB(FreeMemory);
WSI_CB(FreeCommandBuffers);
WSI_CB(GetBufferMemoryRequirements);
WSI_CB(GetCalibratedTimestampsKHR);
WSI_CB(GetFenceStatus);
WSI_CB(GetImageDrmFormatModifierPropertiesEXT);
WSI_CB(GetImageMemoryRequirements);
WSI_CB(GetImageSubresourceLayout);
WSI_CB(GetMemoryFdKHR);
WSI_CB(GetPhysicalDeviceCalibrateableTimeDomainsKHR);
WSI_CB(GetPhysicalDeviceProperties);
WSI_CB(GetPhysicalDeviceFormatProperties);
WSI_CB(GetPhysicalDeviceFormatProperties2);
WSI_CB(GetPhysicalDeviceImageFormatProperties2);
WSI_CB(GetPhysicalDeviceQueueFamilyProperties);
WSI_CB(GetQueryPoolResults);
WSI_CB(GetSemaphoreFdKHR);
WSI_CB(ResetFences);
WSI_CB(QueueSubmit2);


@@ -188,6 +188,9 @@ struct wsi_image {
int dma_buf_fd;
#endif
void *cpu_map;
VkQueryPool query_pool;
VkCommandBuffer *timestamp_cmd_buffers;
};
struct wsi_presentation_timing {
@@ -196,6 +199,10 @@ struct wsi_presentation_timing {
uint64_t serial;
uint64_t queue_done_time; /* GPU timestamp based. */
uint64_t complete_time; /* Best effort timestamp we get from backend. */
/* If we're presenting with IMMEDIATE, it's possible for images to go idle long
* before their presents complete. In this case, we have to ensure that
* queue_done_time is sampled at QueuePresentKHR time before we recycle an image. */
struct wsi_image *image;
VkPresentStageFlagsEXT requested_feedback;
VkBool32 complete;
};
@@ -272,6 +279,11 @@ struct wsi_swapchain {
uint64_t refresh_counter;
VkTimeDomainKHR time_domain;
VkPresentStageFlagsEXT supported_query_stages;
/* Ensures monotonicity for queue_done_time and complete_time. */
uint64_t minimum_queue_done_time;
uint64_t minimum_complete_time;
} present_timing;
bool capture_key_pressed;
@@ -410,6 +422,10 @@ wsi_create_image(const struct wsi_swapchain *chain,
void
wsi_image_init(struct wsi_image *image);
VkResult
wsi_image_init_timestamp(const struct wsi_swapchain *chain,
struct wsi_image *image);
void
wsi_destroy_image(const struct wsi_swapchain *chain,
struct wsi_image *image);
@@ -420,7 +436,8 @@ wsi_swapchain_wait_for_present_semaphore(const struct wsi_swapchain *chain,
void
wsi_swapchain_present_timing_notify_completion(struct wsi_swapchain *chain,
uint64_t timing_serial, uint64_t timestamp);
uint64_t timing_serial, uint64_t timestamp,
struct wsi_image *image);
void
wsi_swapchain_present_timing_update_refresh_rate(struct wsi_swapchain *chain,


@@ -1670,7 +1670,15 @@ wsi_GetPhysicalDeviceWaylandPresentationSupportKHR(VkPhysicalDevice physicalDevi
struct wsi_wayland *wsi =
(struct wsi_wayland *)wsi_device->wsi[VK_ICD_WSI_PLATFORM_WAYLAND];
if (!(wsi_device->queue_supports_blit & BITFIELD64_BIT(queueFamilyIndex)))
/* Blit support and timestamp support should normally overlap. */
uint64_t effective_queues = wsi_device->queue_supports_blit & wsi_device->queue_supports_timestamps;
/* If no queue family supports both blits and timestamps,
* fall back to blit-capable queues; queue timestamps won't be reported then. */
if (!effective_queues)
effective_queues = wsi_device->queue_supports_blit;
if (!(effective_queues & BITFIELD64_BIT(queueFamilyIndex)))
return false;
struct wsi_wl_display display;
@@ -2469,6 +2477,7 @@ struct wsi_wl_present_id {
uint64_t target_time;
uint64_t correction;
struct wl_list link;
struct wsi_image *img;
bool user_target_time;
};
@@ -2903,7 +2912,7 @@ presentation_handle_presented(void *data,
/* Notify this before present wait to reduce the latency of presentation timing requests
* if the application is driving its queries based on present waits. */
if (id->timing_serial)
wsi_swapchain_present_timing_notify_completion(&chain->base, id->timing_serial, presentation_time);
wsi_swapchain_present_timing_notify_completion(&chain->base, id->timing_serial, presentation_time, id->img);
mtx_lock(&chain->present_ids.lock);
chain->present_ids.refresh_nsec = refresh;
@@ -2940,7 +2949,7 @@ presentation_handle_discarded(void *data)
* applications may start to latch onto that timestamp as ground truth, which
* is obviously not correct. */
if (id->timing_serial)
wsi_swapchain_present_timing_notify_completion(&chain->base, id->timing_serial, 0);
wsi_swapchain_present_timing_notify_completion(&chain->base, id->timing_serial, 0, id->img);
mtx_lock(&chain->present_ids.lock);
if (!chain->present_ids.valid_refresh_nsec) {
@@ -3217,6 +3226,7 @@ wsi_wl_swapchain_queue_present(struct wsi_swapchain *wsi_chain,
id->present_id = present_id;
id->alloc = chain->wsi_wl_surface->display->wsi_wl->alloc;
id->timing_serial = chain->timing_request.serial;
id->img = &chain->images[image_index].base;
id->user_target_time = chain->timing_request.time != 0;
mtx_lock(&chain->present_ids.lock);