diff --git a/src/vulkan/wsi/wsi_common.c b/src/vulkan/wsi/wsi_common.c
index 0c804562c78..6a2c5c80141 100644
--- a/src/vulkan/wsi/wsi_common.c
+++ b/src/vulkan/wsi/wsi_common.c
@@ -95,6 +95,7 @@ wsi_device_init(struct wsi_device *wsi,
    WSI_GET_CB(GetPhysicalDeviceProperties2);
    WSI_GET_CB(GetPhysicalDeviceMemoryProperties);
    WSI_GET_CB(GetPhysicalDeviceQueueFamilyProperties);
+   WSI_GET_CB(GetPhysicalDeviceProperties);
 #undef WSI_GET_CB
 
    wsi->drm_info.sType =
@@ -121,10 +122,18 @@ wsi_device_init(struct wsi_device *wsi,
    VkQueueFamilyProperties queue_properties[64];
    GetPhysicalDeviceQueueFamilyProperties(pdevice, &wsi->queue_family_count, queue_properties);
 
+   VkPhysicalDeviceProperties properties;
+   GetPhysicalDeviceProperties(pdevice, &properties);
+   wsi->timestamp_period = properties.limits.timestampPeriod;
+
    for (unsigned i = 0; i < wsi->queue_family_count; i++) {
       VkFlags req_flags = VK_QUEUE_GRAPHICS_BIT | VK_QUEUE_COMPUTE_BIT | VK_QUEUE_TRANSFER_BIT;
       if (queue_properties[i].queueFlags & req_flags)
          wsi->queue_supports_blit |= BITFIELD64_BIT(i);
+
+      /* Don't want to consider timestamp wrapping logic. */
+      if (queue_properties[i].timestampValidBits == 64)
+         wsi->queue_supports_timestamps |= BITFIELD64_BIT(i);
    }
 
    for (VkExternalSemaphoreHandleTypeFlags handle_type = 1;
@@ -180,15 +189,19 @@ wsi_device_init(struct wsi_device *wsi,
    WSI_GET_CB(CmdPipelineBarrier);
    WSI_GET_CB(CmdCopyImage);
    WSI_GET_CB(CmdCopyImageToBuffer);
+   WSI_GET_CB(CmdResetQueryPool);
+   WSI_GET_CB(CmdWriteTimestamp);
    WSI_GET_CB(CreateBuffer);
    WSI_GET_CB(CreateCommandPool);
    WSI_GET_CB(CreateFence);
    WSI_GET_CB(CreateImage);
+   WSI_GET_CB(CreateQueryPool);
    WSI_GET_CB(CreateSemaphore);
    WSI_GET_CB(DestroyBuffer);
    WSI_GET_CB(DestroyCommandPool);
    WSI_GET_CB(DestroyFence);
    WSI_GET_CB(DestroyImage);
+   WSI_GET_CB(DestroyQueryPool);
    WSI_GET_CB(DestroySemaphore);
    WSI_GET_CB(EndCommandBuffer);
    WSI_GET_CB(FreeMemory);
@@ -200,9 +213,14 @@ wsi_device_init(struct wsi_device *wsi,
    WSI_GET_CB(GetImageSubresourceLayout);
    if (!wsi->sw)
       WSI_GET_CB(GetMemoryFdKHR);
+   WSI_GET_CB(GetPhysicalDeviceCalibrateableTimeDomainsKHR);
+   WSI_GET_CB(GetPhysicalDeviceProperties);
    WSI_GET_CB(GetPhysicalDeviceFormatProperties);
    WSI_GET_CB(GetPhysicalDeviceFormatProperties2);
    WSI_GET_CB(GetPhysicalDeviceImageFormatProperties2);
+   WSI_GET_CB(GetPhysicalDeviceQueueFamilyProperties);
+   WSI_GET_CB(GetCalibratedTimestampsKHR);
+   WSI_GET_CB(GetQueryPoolResults);
    WSI_GET_CB(GetSemaphoreFdKHR);
    WSI_GET_CB(ResetFences);
    WSI_GET_CB(QueueSubmit2);
@@ -459,6 +477,10 @@ configure_image(const struct wsi_swapchain *chain,
    }
 }
 
+static void
+wsi_surface_capabilities_add_present_stages(const struct wsi_device *wsi_device,
+                                            VkPresentTimingSurfaceCapabilitiesEXT *present_timing);
+
 VkResult
 wsi_swapchain_init(const struct wsi_device *wsi,
                    struct wsi_swapchain *chain,
@@ -484,8 +506,10 @@ wsi_swapchain_init(const struct wsi_device *wsi,
       (pCreateInfo->flags & VK_SWAPCHAIN_CREATE_PRESENT_WAIT_2_BIT_KHR);
 
    chain->blit.queue = NULL;
-   if (chain->blit.type != WSI_SWAPCHAIN_NO_BLIT) {
-      if (wsi->get_blit_queue) {
+   if (chain->blit.type != WSI_SWAPCHAIN_NO_BLIT ||
+       (pCreateInfo->flags & VK_SWAPCHAIN_CREATE_PRESENT_TIMING_BIT_EXT)) {
+
+      if (chain->blit.type != WSI_SWAPCHAIN_NO_BLIT && wsi->get_blit_queue) {
          chain->blit.queue = wsi->get_blit_queue(_device);
       }
 
@@ -506,10 +530,18 @@ wsi_swapchain_init(const struct wsi_device *wsi,
          if (chain->blit.queue != NULL) {
             queue_family_index = chain->blit.queue->queue_family_index;
          } else {
+            uint64_t effective_queues = wsi->queue_supports_blit;
+            if (pCreateInfo->flags & VK_SWAPCHAIN_CREATE_PRESENT_TIMING_BIT_EXT)
+               effective_queues &= wsi->queue_supports_timestamps;
+
+            /* Fallback. If this happens we don't advertise support for queue complete times. */
+            if (!effective_queues)
+               effective_queues = wsi->queue_supports_blit;
+
             /* Queues returned by get_blit_queue() might not be listed in
             * GetPhysicalDeviceQueueFamilyProperties, so this check is skipped for those queues.
             */
-            if (!(wsi->queue_supports_blit & BITFIELD64_BIT(queue_family_index)))
+            if (!(effective_queues & BITFIELD64_BIT(queue_family_index)))
                continue;
          }
 
@@ -537,6 +569,24 @@ wsi_swapchain_init(const struct wsi_device *wsi,
       goto fail;
 #endif
 
+   if (pCreateInfo->flags & VK_SWAPCHAIN_CREATE_PRESENT_TIMING_BIT_EXT) {
+      ICD_FROM_HANDLE(VkIcdSurfaceBase, surface, pCreateInfo->surface);
+      struct wsi_interface *iface = wsi->wsi[surface->platform];
+
+      VkPresentTimingSurfaceCapabilitiesEXT timing_caps = {
+         .sType = VK_STRUCTURE_TYPE_PRESENT_TIMING_SURFACE_CAPABILITIES_EXT,
+      };
+
+      VkSurfaceCapabilities2KHR caps2 = {
+         .sType = VK_STRUCTURE_TYPE_SURFACE_CAPABILITIES_2_KHR,
+         .pNext = &timing_caps,
+      };
+
+      iface->get_capabilities2(surface, (struct wsi_device *)wsi, NULL, &caps2);
+      wsi_surface_capabilities_add_present_stages(wsi, &timing_caps);
+      chain->present_timing.supported_query_stages = timing_caps.presentStageQueries;
+   }
+
    return VK_SUCCESS;
 
 fail:
@@ -619,7 +669,7 @@ wsi_swapchain_finish(struct wsi_swapchain *chain)
    chain->wsi->DestroySemaphore(chain->device, chain->present_id_timeline,
                                 &chain->alloc);
 
-   if (chain->blit.type != WSI_SWAPCHAIN_NO_BLIT) {
+   if (chain->cmd_pools) {
       int cmd_pools_count = chain->blit.queue != NULL ?
          1 : chain->wsi->queue_family_count;
       for (uint32_t i = 0; i < cmd_pools_count; i++) {
@@ -824,6 +874,88 @@ fail:
    return result;
 }
 
+/**
+ * Creates the per-image timestamp query pool and records the command buffers
+ * that write the end-of-rendering timestamp, which is used to report the
+ * QUEUE_OPERATIONS_END stage for EXT_present_timing.
+ *
+ * Unless the swapchain has a dedicated blit queue, we don't know what queue
+ * family a Present will happen on.  So we record a timestamp command buffer
+ * for every queue family that has a command pool, ready for present time.
+ *
+ * Returns VK_SUCCESS or the first error reported while creating/recording.
+ * Objects created before a failure stay in the image; presumably the caller
+ * releases them through wsi_destroy_image() -- confirm against call sites.
+ */
+VkResult
+wsi_image_init_timestamp(const struct wsi_swapchain *chain,
+                         struct wsi_image *image)
+{
+   const struct wsi_device *wsi = chain->wsi;
+   const uint32_t family_count =
+      chain->blit.queue ? 1 : wsi->queue_family_count;
+
+   /* A single timestamp query per image. */
+   const VkQueryPoolCreateInfo pool_info = {
+      .sType = VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO,
+      .queryType = VK_QUERY_TYPE_TIMESTAMP,
+      .queryCount = 1,
+   };
+   VkResult result = wsi->CreateQueryPool(chain->device, &pool_info, NULL,
+                                          &image->query_pool);
+   if (result != VK_SUCCESS)
+      return result;
+
+   /* One command buffer slot per queue family, zero-initialized. */
+   if (!image->timestamp_cmd_buffers) {
+      image->timestamp_cmd_buffers =
+         vk_zalloc(&chain->alloc, sizeof(VkCommandBuffer) * family_count, 8,
+                   VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
+      if (!image->timestamp_cmd_buffers)
+         return VK_ERROR_OUT_OF_HOST_MEMORY;
+   }
+
+   for (uint32_t i = 0; i < family_count; i++) {
+      /* Queue families without a command pool get no timestamp command
+       * buffer; their slot is left untouched. */
+      if (!chain->cmd_pools[i])
+         continue;
+
+      VkCommandBuffer *cmd = &image->timestamp_cmd_buffers[i];
+      const VkCommandBufferAllocateInfo alloc_info = {
+         .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO,
+         .pNext = NULL,
+         .commandPool = chain->cmd_pools[i],
+         .level = VK_COMMAND_BUFFER_LEVEL_PRIMARY,
+         .commandBufferCount = 1,
+      };
+      result = wsi->AllocateCommandBuffers(chain->device, &alloc_info, cmd);
+      if (result != VK_SUCCESS)
+         return result;
+
+      const VkCommandBufferBeginInfo begin_info = {
+         .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO,
+      };
+      result = wsi->BeginCommandBuffer(*cmd, &begin_info);
+      if (result != VK_SUCCESS)
+         return result;
+
+      /* The reset is recorded right before the write so the query can be
+       * written again each time this command buffer is submitted. */
+      wsi->CmdResetQueryPool(*cmd, image->query_pool, 0, 1);
+      wsi->CmdWriteTimestamp(*cmd, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT,
+                             image->query_pool, 0);
+
+      /* Recording failures surface here; never keep a command buffer that
+       * did not finish recording successfully as if it were usable. */
+      result = wsi->EndCommandBuffer(*cmd);
+      if (result != VK_SUCCESS)
+         return result;
+   }
+
+   return VK_SUCCESS;
+}
+
 void
 wsi_destroy_image(const struct wsi_swapchain *chain,
                   struct wsi_image *image)
@@ -859,6 +991,19 @@ wsi_destroy_image(const struct wsi_swapchain *chain,
       vk_free(&chain->alloc, image->blit.cmd_buffers);
    }
 
+   wsi->DestroyQueryPool(chain->device, image->query_pool, NULL);
+
+   if (image->timestamp_cmd_buffers) {
+      uint32_t family_count = chain->blit.queue ? 1 : wsi->queue_family_count;
+      for (uint32_t i = 0; i < family_count; i++) {
+         if (image->timestamp_cmd_buffers[i]) {
+            wsi->FreeCommandBuffers(chain->device, chain->cmd_pools[i],
+                                    1, &image->timestamp_cmd_buffers[i]);
+         }
+      }
+      vk_free(&chain->alloc, image->timestamp_cmd_buffers);
+   }
+
    wsi->FreeMemory(chain->device, image->memory, &chain->alloc);
    wsi->DestroyImage(chain->device, image->image, &chain->alloc);
    wsi->DestroyImage(chain->device, image->blit.image, &chain->alloc);
@@ -910,6 +1055,37 @@ wsi_GetPhysicalDeviceSurfaceCapabilitiesKHR(
    return result;
 }
 
+/* Adds QUEUE_OPERATIONS_END to presentStageQueries when some queue family supports both blits and
+ * timestamps, and the DEVICE, CLOCK_MONOTONIC and CLOCK_MONOTONIC_RAW domains can be calibrated. */
+static void
+wsi_surface_capabilities_add_present_stages(const struct wsi_device *wsi_device,
+                                            VkPresentTimingSurfaceCapabilitiesEXT *present_timing)
+{
+   if (!(wsi_device->queue_supports_blit & wsi_device->queue_supports_timestamps))
+      return;
+   /* Make sure the implementation is capable of calibrating timestamps. */
+   if (!wsi_device->GetPhysicalDeviceCalibrateableTimeDomainsKHR || !wsi_device->GetCalibratedTimestampsKHR)
+      return;
+
+   VkTimeDomainKHR domains[64];
+   uint32_t count = ARRAY_SIZE(domains);
+   VkResult vr = wsi_device->GetPhysicalDeviceCalibrateableTimeDomainsKHR(wsi_device->pdevice, &count, domains);
+   /* On failure neither count nor domains can be trusted, so don't advertise the stage. */
+   if (vr != VK_SUCCESS && vr != VK_INCOMPLETE)
+      return;
+
+   /* Current present timing implementations do not use anything outside these. QPC might be relevant
+    * for Dozen at some point, but for now, we only consider Linux-centric platforms for present timing. */
+   bool supports_device = false, supports_monotonic = false, supports_monotonic_raw = false;
+   for (uint32_t i = 0; i < count; i++) {
+      supports_device |= domains[i] == VK_TIME_DOMAIN_DEVICE_KHR;
+      supports_monotonic |= domains[i] == VK_TIME_DOMAIN_CLOCK_MONOTONIC_KHR;
+      supports_monotonic_raw |= domains[i] == VK_TIME_DOMAIN_CLOCK_MONOTONIC_RAW_KHR;
+   }
+   if (supports_device && supports_monotonic && supports_monotonic_raw)
+      present_timing->presentStageQueries |= VK_PRESENT_STAGE_QUEUE_OPERATIONS_END_BIT_EXT;
+}
+
 VKAPI_ATTR VkResult VKAPI_CALL
 wsi_GetPhysicalDeviceSurfaceCapabilities2KHR(
    VkPhysicalDevice physicalDevice,
@@ -921,8 +1097,26 @@ wsi_GetPhysicalDeviceSurfaceCapabilities2KHR(
    struct wsi_device *wsi_device = device->wsi_device;
    struct wsi_interface *iface = wsi_device->wsi[surface->platform];
 
-   return iface->get_capabilities2(surface, wsi_device, pSurfaceInfo->pNext,
-                                   pSurfaceCapabilities);
+   VkResult vr = iface->get_capabilities2(surface, wsi_device, pSurfaceInfo->pNext,
+                                          pSurfaceCapabilities);
+   if (vr != VK_SUCCESS)
+      return vr;
+
+   VkPresentTimingSurfaceCapabilitiesEXT *present_timing =
+         vk_find_struct(pSurfaceCapabilities, PRESENT_TIMING_SURFACE_CAPABILITIES_EXT);
+
+   if (present_timing && present_timing->presentTimingSupported) {
+      wsi_surface_capabilities_add_present_stages(wsi_device, present_timing);
+      if (!(present_timing->presentStageQueries & VK_PRESENT_STAGE_QUEUE_OPERATIONS_END_BIT_EXT)) {
+         /* CTS and proposal document assert that QUEUE_OPERATIONS_END must be supported. */
+         present_timing->presentTimingSupported = VK_FALSE;
+         present_timing->presentAtAbsoluteTimeSupported = VK_FALSE;
+         present_timing->presentAtRelativeTimeSupported = VK_FALSE;
+         present_timing->presentStageQueries = 0;
+      }
+   }
+
+   return vr;
 }
 
 VKAPI_ATTR VkResult VKAPI_CALL
@@ -1126,6 +1320,15 @@ wsi_CreateSwapchainKHR(VkDevice _device,
       swapchain->present_timing.active = true;
       mtx_init(&swapchain->present_timing.lock, 0);
 
+      for (uint32_t i = 0; i < swapchain->image_count; i++) {
+         struct wsi_image *image = swapchain->get_wsi_image(swapchain, i);
+         result = wsi_image_init_timestamp(swapchain, image);
+         if (result != VK_SUCCESS) {
+            swapchain->destroy(swapchain, alloc);
+            return result;
+         }
+      }
+
       if (swapchain->poll_early_refresh) {
          /* If we can query the display directly, we should report something reasonable on first query
           * before we even present the first time. */
@@ -1195,8 +1398,87 @@ wsi_ReleaseSwapchainImagesKHR(VkDevice _device,
    return VK_SUCCESS;
 }
 
+/* Converts an absolute target time given in the device time domain (in nanoseconds) to the
+ * swapchain's present timing time domain. This is only relevant if the application requests
+ * QUEUE_OPERATIONS_END as target time domain (targetTimeDomainPresentStage).
+ * Returns 0 when the input is 0 or when calibrated timestamps cannot be sampled. */
+static uint64_t
+wsi_swapchain_present_convert_device_to_cpu(struct wsi_swapchain *chain,
+                                            uint64_t device_timestamp_ns)
+{
+   const struct wsi_device *wsi = chain->wsi;
+   uint64_t now[2]; /* [0] device domain, [1] swapchain time domain. */
+   uint64_t deviation;
+
+   if (!device_timestamp_ns)
+      return 0;
+
+   /* We have already made sure that the implementation supports these. */
+   const VkCalibratedTimestampInfoKHR domain_infos[2] = {
+      [0] = {
+         .sType = VK_STRUCTURE_TYPE_CALIBRATED_TIMESTAMP_INFO_KHR,
+         .timeDomain = VK_TIME_DOMAIN_DEVICE_KHR,
+      },
+      [1] = {
+         .sType = VK_STRUCTURE_TYPE_CALIBRATED_TIMESTAMP_INFO_KHR,
+         .timeDomain = chain->present_timing.time_domain,
+      },
+   };
+
+   /* Ignore target time if this fails for whatever reason. */
+   if (wsi->GetCalibratedTimestampsKHR(chain->device, 2, domain_infos, now, &deviation) != VK_SUCCESS)
+      return 0;
+
+   /* Only the device sample is scaled by timestampPeriod; PRESENT_STAGE_LOCAL_EXT is in terms of nanoseconds. */
+   const uint64_t device_now_ns = (uint64_t)((double)wsi->timestamp_period * (double)now[0]);
+   return now[1] + ((int64_t)device_timestamp_ns - (int64_t)device_now_ns);
+}
+
+/* Fetches the image's timestamp query without waiting and, if a result is available, stores it
+ * (scaled by timestamp_period) in timing->queue_done_time. Does nothing when QUEUE_OPERATIONS_END
+ * was not requested or is unsupported. upper_bound is not used by this function. */
+static void
+wsi_swapchain_present_timing_sample_query_pool(struct wsi_swapchain *chain,
+                                               struct wsi_presentation_timing *timing,
+                                               struct wsi_image *image,
+                                               uint64_t upper_bound)
+{
+   const VkPresentStageFlagsEXT queue_end = VK_PRESENT_STAGE_QUEUE_OPERATIONS_END_BIT_EXT;
+   uint64_t ticks;
+
+   /* Application can query for stages which are not supported. We need to return 0 here. */
+   if (!(timing->requested_feedback & queue_end) || !(chain->present_timing.supported_query_stages & queue_end))
+      return;
+
+   /* The present may have been discarded, leaving no timestamp; don't block waiting for one. */
+   if (chain->wsi->GetQueryPoolResults(chain->device, image->query_pool, 0, 1, sizeof(ticks),
+                                       &ticks, sizeof(ticks), VK_QUERY_RESULT_64_BIT) != VK_SUCCESS)
+      return;
+   timing->queue_done_time = (uint64_t)((double)ticks * (double)chain->wsi->timestamp_period);
+}
+
+/* Expects present_timing.lock to be held. Samples and detaches the timing entry that still references `image`, if any. */
+static void
+wsi_swapchain_present_timing_notify_recycle_locked(struct wsi_swapchain *chain,
+                                                   struct wsi_image *image)
+{
+   assert(chain->present_timing.active);
+   for (size_t idx = 0; idx < chain->present_timing.timings_count; idx++) {
+      struct wsi_presentation_timing *entry = &chain->present_timing.timings[idx];
+      if (entry->image != image)
+         continue;
+      /* A different present takes ownership of the image's query pool index now. */
+      entry->image = NULL;
+      entry->queue_done_time = 0;
+      /* We waited on progress fence, so the timestamp query is guaranteed to be done. */
+      wsi_swapchain_present_timing_sample_query_pool(chain, entry, image, 0);
+      return;
+   }
+}
+
 static VkResult wsi_common_allocate_timing_request(
-      struct wsi_swapchain *swapchain, const VkPresentTimingInfoEXT *timing, uint64_t present_id)
+      struct wsi_swapchain *swapchain, const VkPresentTimingInfoEXT *timing,
+      uint64_t present_id, struct wsi_image *image)
 {
    VkResult vr = VK_SUCCESS;
    mtx_lock(&swapchain->present_timing.lock);
@@ -1206,6 +1488,8 @@ static VkResult wsi_common_allocate_timing_request(
       goto err;
    }
 
+   wsi_swapchain_present_timing_notify_recycle_locked(swapchain, image);
+
    struct wsi_presentation_timing *wsi_timing =
          &swapchain->present_timing.timings[swapchain->present_timing.timings_count++];
 
@@ -1213,8 +1497,12 @@ static VkResult wsi_common_allocate_timing_request(
    wsi_timing->serial = ++swapchain->present_timing.serial;
    wsi_timing->target_time = timing->targetTime;
    wsi_timing->present_id = present_id;
+
+   /* It is allowed to ask for more stages than is supported, but we need to ignore them later. */
    wsi_timing->requested_feedback = timing->presentStageQueries;
 
+   wsi_timing->image = image;
+
    /* Ignore the time domain since we have a static domain. */
 
 err:
@@ -1225,7 +1513,8 @@ err:
 void
 wsi_swapchain_present_timing_notify_completion(struct wsi_swapchain *chain,
                                                uint64_t timing_serial,
-                                               uint64_t timestamp)
+                                               uint64_t timestamp,
+                                               struct wsi_image *image)
 {
    assert(chain->present_timing.active);
    mtx_lock(&chain->present_timing.lock);
@@ -1234,6 +1523,17 @@ wsi_swapchain_present_timing_notify_completion(struct wsi_swapchain *chain,
       if (chain->present_timing.timings[i].serial == timing_serial) {
          chain->present_timing.timings[i].complete_time = timestamp;
          chain->present_timing.timings[i].complete = VK_TRUE;
+
+         /* It's possible that QueuePresentKHR already handled the queue done timestamp for us,
+          * since the image was recycled before presentation could fully complete.
+          * In this case, we no longer own the timestamp query pool index, so just skip. */
+         if (chain->present_timing.timings[i].image != image)
+            break;
+
+         /* 0 means unknown. Application can probably fall back to its own timestamps if it wants to. */
+         chain->present_timing.timings[i].queue_done_time = 0;
+         wsi_swapchain_present_timing_sample_query_pool(chain, &chain->present_timing.timings[i], image, timestamp);
+         chain->present_timing.timings[i].image = NULL;
          break;
       }
    }
@@ -1314,7 +1614,7 @@ wsi_GetPastPresentationTimingEXT(
    bool stop_timing_removal = false;
 
    for (uint32_t i = 0; i < swapchain->present_timing.timings_count; i++) {
-      const struct wsi_presentation_timing *in_timing = &swapchain->present_timing.timings[i];
+      struct wsi_presentation_timing *in_timing = &swapchain->present_timing.timings[i];
 
       if (!swapchain->present_timing.timings[i].complete || stop_timing_removal) {
          /* Keep output ordered to be compliant without having to re-sort every time.
@@ -1325,6 +1625,26 @@ wsi_GetPastPresentationTimingEXT(
          continue;
       }
 
+      /* In some odd cases, completions can happen out of order,
+       * and CTS tests that completion times for IMMEDIATE/MAILBOX are monotonic.
+       * We always retire in-order, so this is fine. */
+      if (in_timing->queue_done_time) {
+         /* CTS questionably checks that timing is strictly increasing in places.
+          * Bumping times by one nanosecond works around this.
+          * In practice, the only scenario where this happens is when
+          * there is calibration jitter or multiple images are retired
+          * at exact same time due to MAILBOX/IMMEDIATE. */
+         in_timing->queue_done_time = MAX2(in_timing->queue_done_time,
+            swapchain->present_timing.minimum_queue_done_time + 1);
+         swapchain->present_timing.minimum_queue_done_time = in_timing->queue_done_time;
+      }
+
+      if (in_timing->complete_time) {
+         in_timing->complete_time = MAX2(in_timing->complete_time,
+            swapchain->present_timing.minimum_complete_time + 1);
+         swapchain->present_timing.minimum_complete_time = in_timing->complete_time;
+      }
+
       vk_outarray_append_typed(VkPastPresentationTimingEXT, &timings, timing) {
          timing->targetTime = swapchain->present_timing.timings[i].target_time;
          timing->presentId = in_timing->present_id;
@@ -1350,12 +1670,23 @@ wsi_GetPastPresentationTimingEXT(
             }
          }
 
-         if (in_timing->requested_feedback & ~VK_PRESENT_STAGE_QUEUE_OPERATIONS_END_BIT_EXT) {
-            vk_outarray_append_typed(VkPresentStageTimeEXT, &stages, stage) {
-               stage->stage = in_timing->requested_feedback & ~VK_PRESENT_STAGE_QUEUE_OPERATIONS_END_BIT_EXT;
-               /* It is expected that implementation will only expose one timing value. */
-               assert(util_bitcount(stage->stage) == 1);
-               stage->time = in_timing->complete_time;
+         /* CTS expects that we are able to return something for all stages, even if they are not supported. */
+         static const VkPresentStageFlagBitsEXT candidate_stages[] = {
+            VK_PRESENT_STAGE_REQUEST_DEQUEUED_BIT_EXT,
+            VK_PRESENT_STAGE_IMAGE_FIRST_PIXEL_OUT_BIT_EXT,
+            VK_PRESENT_STAGE_IMAGE_FIRST_PIXEL_VISIBLE_BIT_EXT,
+         };
+
+         for (int stage_index = 0; stage_index < ARRAY_SIZE(candidate_stages); stage_index++) {
+            bool requested = (in_timing->requested_feedback & candidate_stages[stage_index]) != 0;
+            bool supported = (swapchain->present_timing.supported_query_stages & candidate_stages[stage_index]) != 0;
+
+            if (requested) {
+               vk_outarray_append_typed(VkPresentStageTimeEXT, &stages, stage) {
+                  stage->stage = candidate_stages[stage_index];
+                  /* It is expected that implementation will only expose one timing value. */
+                  stage->time = supported ? in_timing->complete_time : 0;
+               }
             }
          }
       }
@@ -1811,6 +2142,8 @@ wsi_common_queue_present(const struct wsi_device *wsi,
    const VkPresentTimingsInfoEXT *present_timings_info =
          vk_find_struct_const(pPresentInfo->pNext, PRESENT_TIMINGS_INFO_EXT);
 
+   bool needs_timing_command_buffer = false;
+
    if (present_timings_info) {
       /* If we fail a present due to full queue, it's a little unclear from
        * spec if we should treat it as OUT_OF_DATE or OUT_OF_HOST_MEMORY for
@@ -1824,19 +2157,40 @@ wsi_common_queue_present(const struct wsi_device *wsi,
 
          assert(swapchain->present_timing.active);
 
+         uint32_t image_index = pPresentInfo->pImageIndices[i];
+
          /* EXT_present_timing is defined to only work with present_id2.
           * It's only used when reporting back timings. */
          results[i] = wsi_common_allocate_timing_request(
-               swapchain, info, present_ids2 ? present_ids2->pPresentIds[i] : 0);
+               swapchain, info, present_ids2 ? present_ids2->pPresentIds[i] : 0,
+               swapchain->get_wsi_image(swapchain, image_index));
 
          /* Application is responsible for allocating sufficient size here.
           * We fail with VK_ERROR_PRESENT_TIMING_QUEUE_FULL_EXT if application is bugged. */
          if (results[i] == VK_SUCCESS) {
+            /* We may have to rewrite the timestamp if application requests to use timestamps
+             * in terms of the QUEUE_OPERATIONS_END time domain, which is actually DEVICE. */
+            uint64_t target_time = info->targetTime;
+
+            if (info->targetTimeDomainPresentStage == VK_PRESENT_STAGE_QUEUE_OPERATIONS_END_BIT_EXT &&
+                !(info->flags & VK_PRESENT_TIMING_INFO_PRESENT_AT_RELATIVE_TIME_BIT_EXT)) {
+               /* For relative, it's all nanoseconds anyway, so no need to do anything. */
+               target_time = wsi_swapchain_present_convert_device_to_cpu(swapchain, target_time);
+            }
+
             swapchain->set_timing_request(swapchain, &(struct wsi_image_timing_request) {
                .serial = swapchain->present_timing.serial,
-               .time = info->targetTime,
+               .time = target_time,
                .flags = info->flags,
             });
+
+            if (info->presentStageQueries & swapchain->present_timing.supported_query_stages &
+                VK_PRESENT_STAGE_QUEUE_OPERATIONS_END_BIT_EXT) {
+               /* It's not a problem if we redundantly submit timing command buffers.
+                * VUID-12234 also says all swapchains in this present must have been
+                * created with present timing enabled. */
+               needs_timing_command_buffer = true;
+            }
          }
       }
    }
@@ -1919,15 +2273,15 @@ wsi_common_queue_present(const struct wsi_device *wsi,
     * the per-image semaphores and fences with the blit.
     */
    {
-      STACK_ARRAY(VkCommandBufferSubmitInfo, blit_command_buffer_infos,
-                  pPresentInfo->swapchainCount);
+      STACK_ARRAY(VkCommandBufferSubmitInfo, command_buffer_infos,
+                  pPresentInfo->swapchainCount * 2);
       STACK_ARRAY(VkSemaphoreSubmitInfo, signal_semaphore_infos,
                   pPresentInfo->swapchainCount *
                   ARRAY_SIZE(image_signal_infos[0].semaphore_infos));
       STACK_ARRAY(VkFence, fences,
                   pPresentInfo->swapchainCount *
                   ARRAY_SIZE(image_signal_infos[0].fences));
-      uint32_t blit_count = 0, signal_semaphore_count = 0, fence_count = 0;
+      uint32_t command_buffer_count = 0, signal_semaphore_count = 0, fence_count = 0;
 
       for (uint32_t i = 0; i < pPresentInfo->swapchainCount; i++) {
          VK_FROM_HANDLE(wsi_swapchain, swapchain, pPresentInfo->pSwapchains[i]);
@@ -1981,20 +2335,27 @@ wsi_common_queue_present(const struct wsi_device *wsi,
          }
 
          if (swapchain->blit.type != WSI_SWAPCHAIN_NO_BLIT) {
-            blit_command_buffer_infos[blit_count++] = (VkCommandBufferSubmitInfo) {
+            command_buffer_infos[command_buffer_count++] = (VkCommandBufferSubmitInfo) {
                .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_SUBMIT_INFO,
                .commandBuffer =
                   image->blit.cmd_buffers[queue->queue_family_index],
             };
          }
+
+         if (needs_timing_command_buffer) {
+            command_buffer_infos[command_buffer_count++] = (VkCommandBufferSubmitInfo) {
+               .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_SUBMIT_INFO,
+               .commandBuffer = image->timestamp_cmd_buffers[queue->queue_family_index],
+            };
+         }
       }
 
       const VkSubmitInfo2 submit_info = {
          .sType = VK_STRUCTURE_TYPE_SUBMIT_INFO_2,
          .waitSemaphoreInfoCount = pPresentInfo->waitSemaphoreCount,
          .pWaitSemaphoreInfos = semaphore_wait_infos,
-         .commandBufferInfoCount = blit_count,
-         .pCommandBufferInfos = blit_command_buffer_infos,
+         .commandBufferInfoCount = command_buffer_count,
+         .pCommandBufferInfos = command_buffer_infos,
          .signalSemaphoreInfoCount = signal_semaphore_count,
          .pSignalSemaphoreInfos = signal_semaphore_infos,
       };
@@ -2010,7 +2371,7 @@ wsi_common_queue_present(const struct wsi_device *wsi,
 
       STACK_ARRAY_FINISH(fences);
       STACK_ARRAY_FINISH(signal_semaphore_infos);
-      STACK_ARRAY_FINISH(blit_command_buffer_infos);
+      STACK_ARRAY_FINISH(command_buffer_infos);
    }
 
    /* Now do blits on any blit queues */
@@ -2035,17 +2396,27 @@ wsi_common_queue_present(const struct wsi_device *wsi,
          .semaphore = swapchain->blit.semaphores[image_index],
       };
 
-      const VkCommandBufferSubmitInfo blit_command_buffer_info = {
+      VkCommandBufferSubmitInfo command_buffer_infos[2];
+      uint32_t command_buffer_count = 0;
+
+      command_buffer_infos[command_buffer_count++] = (VkCommandBufferSubmitInfo) {
          .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_SUBMIT_INFO,
          .commandBuffer = image->blit.cmd_buffers[0],
       };
 
+      if (needs_timing_command_buffer) {
+         command_buffer_infos[command_buffer_count++] = (VkCommandBufferSubmitInfo) {
+            .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_SUBMIT_INFO,
+            .commandBuffer = image->timestamp_cmd_buffers[0],
+         };
+      }
+
       const VkSubmitInfo2 submit_info = {
          .sType = VK_STRUCTURE_TYPE_SUBMIT_INFO_2,
          .waitSemaphoreInfoCount = 1,
          .pWaitSemaphoreInfos = &blit_semaphore_info,
-         .commandBufferInfoCount = 1,
-         .pCommandBufferInfos = &blit_command_buffer_info,
+         .commandBufferInfoCount = command_buffer_count,
+         .pCommandBufferInfos = command_buffer_infos,
          .signalSemaphoreInfoCount = image_signal_infos[i].semaphore_count,
          .pSignalSemaphoreInfos = image_signal_infos[i].semaphore_infos,
       };
diff --git a/src/vulkan/wsi/wsi_common.h b/src/vulkan/wsi/wsi_common.h
index 2692497122b..892345d6344 100644
--- a/src/vulkan/wsi/wsi_common.h
+++ b/src/vulkan/wsi/wsi_common.h
@@ -62,6 +62,8 @@ struct wsi_device {
    VkPhysicalDeviceMemoryProperties memory_props;
    uint32_t queue_family_count;
    uint64_t queue_supports_blit;
+   uint64_t queue_supports_timestamps;
+   float timestamp_period;
 
    VkPhysicalDeviceDrmPropertiesEXT drm_info;
    VkPhysicalDevicePCIBusInfoPropertiesEXT pci_bus_info;
@@ -201,28 +203,37 @@ struct wsi_device {
    WSI_CB(CmdPipelineBarrier);
    WSI_CB(CmdCopyImage);
    WSI_CB(CmdCopyImageToBuffer);
+   WSI_CB(CmdResetQueryPool);
+   WSI_CB(CmdWriteTimestamp);
    WSI_CB(CreateBuffer);
    WSI_CB(CreateCommandPool);
    WSI_CB(CreateFence);
    WSI_CB(CreateImage);
+   WSI_CB(CreateQueryPool);
    WSI_CB(CreateSemaphore);
    WSI_CB(DestroyBuffer);
    WSI_CB(DestroyCommandPool);
    WSI_CB(DestroyFence);
    WSI_CB(DestroyImage);
+   WSI_CB(DestroyQueryPool);
    WSI_CB(DestroySemaphore);
    WSI_CB(EndCommandBuffer);
    WSI_CB(FreeMemory);
    WSI_CB(FreeCommandBuffers);
    WSI_CB(GetBufferMemoryRequirements);
+   WSI_CB(GetCalibratedTimestampsKHR);
    WSI_CB(GetFenceStatus);
    WSI_CB(GetImageDrmFormatModifierPropertiesEXT);
    WSI_CB(GetImageMemoryRequirements);
    WSI_CB(GetImageSubresourceLayout);
    WSI_CB(GetMemoryFdKHR);
+   WSI_CB(GetPhysicalDeviceCalibrateableTimeDomainsKHR);
+   WSI_CB(GetPhysicalDeviceProperties);
    WSI_CB(GetPhysicalDeviceFormatProperties);
    WSI_CB(GetPhysicalDeviceFormatProperties2);
    WSI_CB(GetPhysicalDeviceImageFormatProperties2);
+   WSI_CB(GetPhysicalDeviceQueueFamilyProperties);
+   WSI_CB(GetQueryPoolResults);
    WSI_CB(GetSemaphoreFdKHR);
    WSI_CB(ResetFences);
    WSI_CB(QueueSubmit2);
diff --git a/src/vulkan/wsi/wsi_common_private.h b/src/vulkan/wsi/wsi_common_private.h
index 513b78ce5a0..a65e7c534b6 100644
--- a/src/vulkan/wsi/wsi_common_private.h
+++ b/src/vulkan/wsi/wsi_common_private.h
@@ -188,6 +188,9 @@ struct wsi_image {
    int dma_buf_fd;
 #endif
    void *cpu_map;
+
+   VkQueryPool query_pool;
+   VkCommandBuffer *timestamp_cmd_buffers;
 };
 
 struct wsi_presentation_timing {
@@ -196,6 +199,10 @@ struct wsi_presentation_timing {
    uint64_t serial;
    uint64_t queue_done_time; /* GPU timestamp based. */
    uint64_t complete_time; /* Best effort timestamp we get from backend. */
+   /* If we're rendering with IMMEDIATE, an image can go IDLE long before its present completes.
+    * In this case, we have to ensure that queue_done_time is sampled at QueuePresentKHR time
+    * before we recycle the image. */
+   struct wsi_image *image;
    VkPresentStageFlagsEXT requested_feedback;
    VkBool32 complete;
 };
@@ -272,6 +279,11 @@ struct wsi_swapchain {
       uint64_t refresh_counter;
 
       VkTimeDomainKHR time_domain;
+
+      VkPresentStageFlagsEXT supported_query_stages;
+      /* Ensures monotonicity for queue_done_time and complete_time. */
+      uint64_t minimum_queue_done_time;
+      uint64_t minimum_complete_time;
    } present_timing;
 
    bool capture_key_pressed;
@@ -410,6 +422,10 @@ wsi_create_image(const struct wsi_swapchain *chain,
 void
 wsi_image_init(struct wsi_image *image);
 
+VkResult
+wsi_image_init_timestamp(const struct wsi_swapchain *chain,
+                         struct wsi_image *image);
+
 void
 wsi_destroy_image(const struct wsi_swapchain *chain,
                   struct wsi_image *image);
@@ -420,7 +436,8 @@ wsi_swapchain_wait_for_present_semaphore(const struct wsi_swapchain *chain,
 
 void
 wsi_swapchain_present_timing_notify_completion(struct wsi_swapchain *chain,
-                                               uint64_t timing_serial, uint64_t timestamp);
+                                               uint64_t timing_serial, uint64_t timestamp,
+                                               struct wsi_image *image);
 
 void
 wsi_swapchain_present_timing_update_refresh_rate(struct wsi_swapchain *chain,
diff --git a/src/vulkan/wsi/wsi_common_wayland.c b/src/vulkan/wsi/wsi_common_wayland.c
index 29149fbf4a1..85a82c91f54 100644
--- a/src/vulkan/wsi/wsi_common_wayland.c
+++ b/src/vulkan/wsi/wsi_common_wayland.c
@@ -1670,7 +1670,15 @@ wsi_GetPhysicalDeviceWaylandPresentationSupportKHR(VkPhysicalDevice physicalDevi
    struct wsi_wayland *wsi =
       (struct wsi_wayland *)wsi_device->wsi[VK_ICD_WSI_PLATFORM_WAYLAND];
 
-   if (!(wsi_device->queue_supports_blit & BITFIELD64_BIT(queueFamilyIndex)))
+   /* The blit-capable and timestamp-capable queue family masks should overlap. */
+   uint64_t effective_queues = wsi_device->queue_supports_blit & wsi_device->queue_supports_timestamps;
+
+   /* If no queue family supports both blits and timestamps, fall back to the
+    * blit-capable families and don't report support for queue timestamps. */
+   if (!effective_queues)
+      effective_queues = wsi_device->queue_supports_blit;
+
+   if (!(effective_queues & BITFIELD64_BIT(queueFamilyIndex)))
       return false;
 
    struct wsi_wl_display display;
@@ -2469,6 +2477,7 @@ struct wsi_wl_present_id {
    uint64_t target_time;
    uint64_t correction;
    struct wl_list link;
+   struct wsi_image *img;
    bool user_target_time;
 };
 
@@ -2903,7 +2912,7 @@ presentation_handle_presented(void *data,
    /* Notify this before present wait to reduce latency of presentation timing requests
     * if the application is driving its queries based off present waits. */
    if (id->timing_serial)
-      wsi_swapchain_present_timing_notify_completion(&chain->base, id->timing_serial, presentation_time);
+      wsi_swapchain_present_timing_notify_completion(&chain->base, id->timing_serial, presentation_time, id->img);
 
    mtx_lock(&chain->present_ids.lock);
    chain->present_ids.refresh_nsec = refresh;
@@ -2940,7 +2949,7 @@ presentation_handle_discarded(void *data)
     * applications may start to latch onto that timestamp as ground truth, which
     * is obviously not correct. */
    if (id->timing_serial)
-      wsi_swapchain_present_timing_notify_completion(&chain->base, id->timing_serial, 0);
+      wsi_swapchain_present_timing_notify_completion(&chain->base, id->timing_serial, 0, id->img);
 
    mtx_lock(&chain->present_ids.lock);
    if (!chain->present_ids.valid_refresh_nsec) {
@@ -3217,6 +3226,7 @@ wsi_wl_swapchain_queue_present(struct wsi_swapchain *wsi_chain,
       id->present_id = present_id;
       id->alloc = chain->wsi_wl_surface->display->wsi_wl->alloc;
       id->timing_serial = chain->timing_request.serial;
+      id->img = &chain->images[image_index].base;
       id->user_target_time = chain->timing_request.time != 0;
 
       mtx_lock(&chain->present_ids.lock);