hk: implement timestamps
Signed-off-by: Alyssa Rosenzweig <alyssa@rosenzweig.io>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/32434>
commit 3f5086016b
parent 55cd987de2

8 changed files with 260 additions and 56 deletions
@@ -245,3 +245,11 @@ struct agx_va *agx_va_alloc(struct agx_device *dev, uint64_t size_B,
                             uint64_t align_B, enum agx_va_flags flags,
                             uint64_t fixed_va);
 void agx_va_free(struct agx_device *dev, struct agx_va *va);
+
+static inline bool
+agx_supports_timestamps(const struct agx_device *dev)
+{
+   /* TODO: Ungate virtio once virglrenderer supports the timestamp uapi */
+   return !dev->is_virtio &&
+          (dev->params.feat_compat & DRM_ASAHI_FEAT_USER_TIMESTAMPS);
+}
@@ -29,7 +29,12 @@ libagx_copy_query(global uint32_t *availability, global uint64_t *results,
    uint i = get_global_id(0);
    uint64_t dst = dst_addr + (((uint64_t)i) * dst_stride);
    uint32_t query = first_query + i;
-   bool available = availability[query];
+
+   bool available;
+   if (availability)
+      available = availability[query];
+   else
+      available = (results[query] != LIBAGX_QUERY_UNAVAILABLE);
 
    if (available || partial) {
       /* For occlusion queries, results[] points to the device global heap. We
@@ -109,6 +114,17 @@ libagx_write_u32s(constant struct libagx_imm_write *p)
    *(p[id].address) = p[id].value;
 }
 
+/*
+ * We set the source as volatile since the caching situation around timestamps
+ * is a bit unclear. It might not be necessary but - absent hardware/firmware
+ * documentation - this gives me peace of mind.
+ */
+KERNEL(1)
+libagx_copy_timestamp(global uint64_t *dest, volatile global uint64_t *src)
+{
+   *dest = *src;
+}
+
 KERNEL(1)
 libagx_write_u32(global uint32_t *address, uint32_t value)
 {
@@ -24,3 +24,5 @@ struct libagx_imm_write {
    GLOBAL(uint32_t) address;
    uint32_t value;
 };
+
+#define LIBAGX_QUERY_UNAVAILABLE (uint64_t)((int64_t)-1)
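Note on the sentinel: timestamp query pools have no separate availability buffer, so availability is carried in-band. A slot seeded with all-ones cannot be confused with a real timestamp, so "still all-ones" means "not yet written". A minimal sketch of the resulting check (hypothetical helper; only LIBAGX_QUERY_UNAVAILABLE comes from the diff):

    #include <stdbool.h>
    #include <stdint.h>

    #define LIBAGX_QUERY_UNAVAILABLE (uint64_t)((int64_t)-1)

    /* In-band availability: a timestamp slot counts as written once it holds
     * anything other than the all-ones sentinel. */
    static bool
    timestamp_slot_available(const uint64_t *results, uint32_t query)
    {
       return results[query] != LIBAGX_QUERY_UNAVAILABLE;
    }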
@@ -300,6 +300,19 @@ struct hk_scratch_req {
    bool preamble;
 };
 
+/*
+ * Represents a firmware timestamp request. Handle is a kernel timestamp object
+ * handle, not a GEM handle.
+ *
+ * The kernel/firmware uses the handle/offset_B to write. We use the address to
+ * read the results back. We could deduplicate this, but this is convenient.
+ */
+struct agx_timestamp_req {
+   uint64_t addr;
+   uint32_t handle;
+   uint32_t offset_B;
+};
+
 /*
  * hk_cs represents a single control stream, to be enqueued either to the
  * CDM or VDM for compute/3D respectively.
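The same 8-byte slot is thus described twice: (handle, offset_B) is what the kernel needs in order to write it, while addr is what the GPU and CPU use to read it back. A sketch of the invariant tying the two views together (make_req, base_va, and the per-slot stride are illustrative, not from the commit):

    #include <stdint.h>

    struct agx_timestamp_req {
       uint64_t addr;     /* GPU VA of the slot, for reading the result back */
       uint32_t handle;   /* kernel timestamp-object handle, not a GEM handle */
       uint32_t offset_B; /* byte offset of the slot within that object */
    };

    /* Both views name the same memory: for an object mapped at base_va,
     * req.addr == base_va + req.offset_B must hold. */
    static struct agx_timestamp_req
    make_req(uint64_t base_va, uint32_t handle, uint32_t slot)
    {
       uint32_t offset_B = slot * sizeof(uint64_t);
       return (struct agx_timestamp_req){
          .addr = base_va + offset_B,
          .handle = handle,
          .offset_B = offset_B,
       };
    }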
@@ -353,6 +366,14 @@ struct hk_cs {
       uint32_t calls, cmds, flushes;
    } stats;
 
+   /* Timestamp writes. Currently just compute end / fragment end. We could
+    * flesh this out later if we want finer info. (We will, but it's not
+    * required for conformance.)
+    */
+   struct {
+      struct agx_timestamp_req end;
+   } timestamp;
+
    /* Remaining state is for graphics only, ignored for compute */
    struct agx_tilebuffer_layout tib;
@@ -23,6 +23,7 @@
 #include "util/simple_mtx.h"
 #include "vulkan/vulkan_core.h"
 #include "vulkan/wsi/wsi_common.h"
+#include "unstable_asahi_drm.h"
 #include "vk_drm_syncobj.h"
 #include "vk_shader_module.h"
 
@@ -714,7 +715,7 @@ hk_get_device_properties(const struct agx_device *dev,
       .sampledImageStencilSampleCounts = sample_counts,
       .storageImageSampleCounts = sample_counts,
       .maxSampleMaskWords = 1,
-      .timestampComputeAndGraphics = false,
+      .timestampComputeAndGraphics = agx_supports_timestamps(dev),
       .timestampPeriod = 1,
       .maxClipDistances = 8,
       .maxCullDistances = 8,
@@ -1378,7 +1379,8 @@ hk_GetPhysicalDeviceQueueFamilyProperties2(
    {
       p->queueFamilyProperties.queueFlags = queue_family->queue_flags;
       p->queueFamilyProperties.queueCount = queue_family->queue_count;
-      p->queueFamilyProperties.timestampValidBits = 0; // TODO 64;
+      p->queueFamilyProperties.timestampValidBits =
+         agx_supports_timestamps(&pdev->dev) ? 64 : 0;
       p->queueFamilyProperties.minImageTransferGranularity =
         (VkExtent3D){1, 1, 1};
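With timestampPeriod = 1 and timestampValidBits = 64, a returned tick is already a nanosecond count and no masking is needed. For reference, the generic conversion an application performs from these two properties (a sketch of the Vulkan-side math, not driver code):

    #include <stdint.h>

    /* Convert a timestamp query result to nanoseconds using the properties
     * advertised above. With period = 1.0 and validBits = 64 this is the
     * identity. */
    static double
    ticks_to_ns(uint64_t ticks, float timestamp_period, uint32_t valid_bits)
    {
       if (valid_bits < 64)
          ticks &= (1ull << valid_bits) - 1; /* bits above valid_bits are undefined */
       return (double)ticks * timestamp_period;
    }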
@@ -31,6 +31,12 @@ struct hk_query_report {
    uint64_t value;
 };
 
+static inline bool
+hk_has_available(const struct hk_query_pool *pool)
+{
+   return pool->vk.query_type != VK_QUERY_TYPE_TIMESTAMP;
+}
+
 static uint16_t *
 hk_pool_oq_index_ptr(const struct hk_query_pool *pool)
 {
@@ -55,6 +61,22 @@ hk_reports_per_query(struct hk_query_pool *pool)
    }
 }
 
+static void
+hk_flush_if_timestamp(struct hk_cmd_buffer *cmd, struct hk_query_pool *pool)
+{
+   struct hk_device *dev = hk_cmd_buffer_device(cmd);
+
+   /* There might not be a barrier between the timestamp write and the copy
+    * otherwise, but we need one to give the CPU a chance to write the
+    * timestamp. This could maybe be optimized.
+    */
+   if (pool->vk.query_type == VK_QUERY_TYPE_TIMESTAMP) {
+      perf_debug(dev, "Flushing for timestamp copy");
+      hk_cmd_buffer_end_graphics(cmd);
+      hk_cmd_buffer_end_compute(cmd);
+   }
+}
+
 VKAPI_ATTR VkResult VKAPI_CALL
 hk_CreateQueryPool(VkDevice device, const VkQueryPoolCreateInfo *pCreateInfo,
                    const VkAllocationCallbacks *pAllocator,
@@ -64,16 +86,24 @@ hk_CreateQueryPool(VkDevice device, const VkQueryPoolCreateInfo *pCreateInfo,
    struct hk_query_pool *pool;
 
    bool occlusion = pCreateInfo->queryType == VK_QUERY_TYPE_OCCLUSION;
+   bool timestamp = pCreateInfo->queryType == VK_QUERY_TYPE_TIMESTAMP;
    unsigned occlusion_queries = occlusion ? pCreateInfo->queryCount : 0;
 
+   /* Workaround for DXVK on old kernels */
+   if (!agx_supports_timestamps(&dev->dev))
+      timestamp = false;
+
    pool =
       vk_query_pool_create(&dev->vk, pCreateInfo, pAllocator, sizeof(*pool));
    if (!pool)
       return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY);
 
    /* We place the availability first and then data */
-   pool->query_start = align(pool->vk.query_count * sizeof(uint32_t),
-                             sizeof(struct hk_query_report));
+   pool->query_start = 0;
+   if (hk_has_available(pool)) {
+      pool->query_start = align(pool->vk.query_count * sizeof(uint32_t),
+                                sizeof(struct hk_query_report));
+   }
 
    uint32_t reports_per_query = hk_reports_per_query(pool);
    pool->query_stride = reports_per_query * sizeof(struct hk_query_report);
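The pool BO layout is [32-bit availability words][aligned report array]; timestamp pools omit the availability prefix, so their reports begin at offset 0 and the in-band sentinel takes over. A sketch of the offset math under those assumptions (stand-in report type; align_pot mirrors the driver's align() helper):

    #include <stdbool.h>
    #include <stdint.h>

    struct report { uint64_t value; }; /* stand-in for struct hk_query_report */

    static uint64_t
    align_pot(uint64_t x, uint64_t a)
    {
       return (x + a - 1) & ~(a - 1);
    }

    /* Offset of the report array within the pool BO: after the availability
     * words for most pools, at 0 for timestamp pools (which have none). */
    static uint64_t
    query_data_start(uint32_t query_count, bool has_available)
    {
       if (!has_available)
          return 0;
       return align_pot(query_count * sizeof(uint32_t), sizeof(struct report));
    }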
@@ -87,12 +117,33 @@ hk_CreateQueryPool(VkDevice device, const VkQueryPoolCreateInfo *pCreateInfo,
    else
       bo_size += pool->query_stride * pool->vk.query_count;
 
-   pool->bo =
-      agx_bo_create(&dev->dev, bo_size, 0, AGX_BO_WRITEBACK, "Query pool");
+   /* The kernel requires that timestamp buffers are SHARED */
+   enum agx_bo_flags flags = AGX_BO_WRITEBACK;
+   if (timestamp)
+      flags |= AGX_BO_SHARED;
+
+   pool->bo = agx_bo_create(&dev->dev, bo_size, 0, flags, "Query pool");
    if (!pool->bo) {
       hk_DestroyQueryPool(device, hk_query_pool_to_handle(pool), pAllocator);
       return vk_error(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY);
    }
 
+   /* Timestamp buffers must be explicitly bound as such before we can use
+    * them.
+    */
+   if (timestamp) {
+      int ret = dev->dev.ops.bo_bind_object(
+         &dev->dev, pool->bo, &pool->handle, pool->bo->size, 0,
+         ASAHI_BIND_OBJECT_USAGE_TIMESTAMPS);
+
+      if (ret) {
+         hk_DestroyQueryPool(device, hk_query_pool_to_handle(pool),
+                             pAllocator);
+         return vk_error(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY);
+      }
+
+      assert(pool->handle && "handles are nonzero");
+   }
+
    uint16_t *oq_index = hk_pool_oq_index_ptr(pool);
@@ -135,6 +186,9 @@ hk_DestroyQueryPool(VkDevice device, VkQueryPool queryPool,
       hk_descriptor_table_remove(dev, &dev->occlusion_queries, oq_index[i]);
    }
 
+   if (pool->handle)
+      dev->dev.ops.bo_unbind_object(&dev->dev, pool->handle, 0);
+
    agx_bo_unreference(&dev->dev, pool->bo);
    vk_query_pool_destroy(&dev->vk, pAllocator, &pool->vk);
 }
@@ -142,6 +196,7 @@ hk_DestroyQueryPool(VkDevice device, VkQueryPool queryPool,
 static uint64_t
 hk_query_available_addr(struct hk_query_pool *pool, uint32_t query)
 {
+   assert(hk_has_available(pool));
    assert(query < pool->vk.query_count);
    return pool->bo->va->addr + query * sizeof(uint32_t);
 }
@@ -149,6 +204,7 @@ hk_query_available_addr(struct hk_query_pool *pool, uint32_t query)
 static uint32_t *
 hk_query_available_map(struct hk_query_pool *pool, uint32_t query)
 {
+   assert(hk_has_available(pool));
    assert(query < pool->vk.query_count);
    return (uint32_t *)agx_bo_map(pool->bo) + query;
 }
@@ -264,16 +320,45 @@ emit_zero_queries(struct hk_cmd_buffer *cmd, struct hk_query_pool *pool,
    struct hk_device *dev = hk_cmd_buffer_device(cmd);
 
    for (uint32_t i = 0; i < num_queries; i++) {
-      uint64_t available = hk_query_available_addr(pool, first_index + i);
       uint64_t report = hk_query_report_addr(dev, pool, first_index + i);
-      hk_queue_write(cmd, available, set_available, false);
+
+      uint64_t value = 0;
+      if (hk_has_available(pool)) {
+         uint64_t available = hk_query_available_addr(pool, first_index + i);
+         hk_queue_write(cmd, available, set_available, false);
+      } else {
+         value = set_available ? 0 : LIBAGX_QUERY_UNAVAILABLE;
+      }
 
       /* XXX: is this supposed to happen on the begin? */
       for (unsigned j = 0; j < hk_reports_per_query(pool); ++j) {
-         hk_queue_write(cmd, report + (j * sizeof(struct hk_query_report)), 0,
-                        false);
+         hk_queue_write(cmd, report + (j * sizeof(struct hk_query_report)),
+                        value, false);
          hk_queue_write(cmd, report + (j * sizeof(struct hk_query_report)) + 4,
-                        0, false);
+                        value >> 32, false);
       }
    }
 }
+
+static void
+host_zero_queries(struct hk_device *dev, struct hk_query_pool *pool,
+                  uint32_t first_index, uint32_t num_queries,
+                  bool set_available)
+{
+   for (uint32_t i = 0; i < num_queries; i++) {
+      struct hk_query_report *reports =
+         hk_query_report_map(dev, pool, first_index + i);
+
+      uint64_t value = 0;
+      if (hk_has_available(pool)) {
+         uint32_t *available = hk_query_available_map(pool, first_index + i);
+         *available = set_available;
+      } else {
+         value = set_available ? 0 : LIBAGX_QUERY_UNAVAILABLE;
+      }
+
+      for (unsigned j = 0; j < hk_reports_per_query(pool); ++j) {
+         reports[j].value = value;
+      }
+   }
+}
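hk_queue_write queues one 32-bit write per call, so seeding a 64-bit report takes two writes: the low word at the report address and the high word 4 bytes above it. For availability-less pools the seeded value is the all-ones sentinel rather than 0, which is what makes the reset visible to the in-band check. A sketch of the split (write32 stands in for hk_queue_write):

    #include <stdint.h>

    /* Seed a 64-bit report slot using a 32-bit write primitive, as the queued
     * path above does. */
    static void
    write64_as_two_words(void (*write32)(uint64_t addr, uint32_t value),
                         uint64_t addr, uint64_t value)
    {
       write32(addr, (uint32_t)value);             /* low word  */
       write32(addr + 4, (uint32_t)(value >> 32)); /* high word */
    }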
@@ -285,11 +370,7 @@ hk_ResetQueryPool(VkDevice device, VkQueryPool queryPool, uint32_t firstQuery,
    VK_FROM_HANDLE(hk_query_pool, pool, queryPool);
    VK_FROM_HANDLE(hk_device, dev, device);
 
-   uint32_t *available = hk_query_available_map(pool, firstQuery);
-   struct hk_query_report *reports = hk_query_report_map(dev, pool, firstQuery);
-
-   memset(available, 0, queryCount * sizeof(*available));
-   memset(reports, 0, queryCount * pool->query_stride);
+   host_zero_queries(dev, pool, firstQuery, queryCount, false);
 }
 
 VKAPI_ATTR void VKAPI_CALL
@@ -300,6 +381,8 @@ hk_CmdResetQueryPool(VkCommandBuffer commandBuffer, VkQueryPool queryPool,
    VK_FROM_HANDLE(hk_query_pool, pool, queryPool);
    struct hk_device *dev = hk_cmd_buffer_device(cmd);
 
+   hk_flush_if_timestamp(cmd, pool);
+
    perf_debug(dev, "Reset query pool");
    emit_zero_queries(cmd, pool, firstQuery, queryCount, false);
 }
@@ -309,35 +392,56 @@ hk_CmdWriteTimestamp2(VkCommandBuffer commandBuffer,
                       VkPipelineStageFlags2 stage, VkQueryPool queryPool,
                       uint32_t query)
 {
-   unreachable("todo");
-#if 0
    VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer);
    VK_FROM_HANDLE(hk_query_pool, pool, queryPool);
    struct hk_device *dev = hk_cmd_buffer_device(cmd);
 
-   struct nv_push *p = hk_cmd_buffer_push(cmd, 10);
+   /* Workaround for DXVK on old kernels */
+   if (!agx_supports_timestamps(&dev->dev))
+      return;
 
-   uint64_t report_addr = hk_query_report_addr(pool, query);
-   P_MTHD(p, NV9097, SET_REPORT_SEMAPHORE_A);
-   P_NV9097_SET_REPORT_SEMAPHORE_A(p, report_addr >> 32);
-   P_NV9097_SET_REPORT_SEMAPHORE_B(p, report_addr);
-   P_NV9097_SET_REPORT_SEMAPHORE_C(p, 0);
-   P_NV9097_SET_REPORT_SEMAPHORE_D(p, {
-      .operation = OPERATION_REPORT_ONLY,
-      .pipeline_location = vk_stage_flags_to_nv9097_pipeline_location(stage),
-      .structure_size = STRUCTURE_SIZE_FOUR_WORDS,
-   });
+   uint64_t report_addr = hk_query_report_addr(dev, pool, query);
 
-   uint64_t available_addr = hk_query_available_addr(pool, query);
-   P_MTHD(p, NV9097, SET_REPORT_SEMAPHORE_A);
-   P_NV9097_SET_REPORT_SEMAPHORE_A(p, available_addr >> 32);
-   P_NV9097_SET_REPORT_SEMAPHORE_B(p, available_addr);
-   P_NV9097_SET_REPORT_SEMAPHORE_C(p, 1);
-   P_NV9097_SET_REPORT_SEMAPHORE_D(p, {
-      .operation = OPERATION_RELEASE,
-      .release = RELEASE_AFTER_ALL_PRECEEDING_WRITES_COMPLETE,
-      .pipeline_location = PIPELINE_LOCATION_ALL,
-      .structure_size = STRUCTURE_SIZE_ONE_WORD,
-   });
+   bool after_gfx = cmd->current_cs.gfx != NULL;
+
+   /* When writing timestamps for compute, we split the control stream at each
+    * write. This ensures we never need to copy compute timestamps, which would
+    * require an extra control stream anyway. Unlike graphics, splitting compute
+    * control streams is inexpensive so there's not a strong performance reason
+    * to do otherwise. Finally, batching multiple timestamp writes (like we do
+    * for graphics) would destroy the ability to profile individual compute
+    * dispatches. While that's allowed by the Vulkan spec, it's pretty mean to
+    * apps. So.. don't do that.
+    */
+   if (!after_gfx && cmd->current_cs.cs &&
+       cmd->current_cs.cs->timestamp.end.addr) {
+
+      perf_debug(dev, "Splitting for compute timestamp");
+      hk_cmd_buffer_end_compute(cmd);
+   }
+
+   struct hk_cs *cs = hk_cmd_buffer_get_cs_general(
+      cmd, after_gfx ? &cmd->current_cs.gfx : &cmd->current_cs.cs, true);
+   if (!cs)
+      return;
+
+   if (cs->timestamp.end.addr) {
+      assert(after_gfx && "compute is handled above");
+
+      struct hk_cs *after =
+         hk_cmd_buffer_get_cs_general(cmd, &cmd->current_cs.post_gfx, true);
+      if (!after)
+         return;
+
+      libagx_copy_timestamp(after, agx_1d(1), report_addr,
+                            cs->timestamp.end.addr);
+   } else {
+      cs->timestamp.end = (struct agx_timestamp_req){
+         .addr = report_addr,
+         .handle = pool->handle,
+         .offset_B = hk_query_offset(pool, query),
+      };
+   }
 
    /* From the Vulkan spec:
    *
@@ -361,7 +465,6 @@ hk_CmdWriteTimestamp2(VkCommandBuffer commandBuffer,
    if (num_queries > 1)
       emit_zero_queries(cmd, pool, query + 1, num_queries - 1, true);
 }
-#endif
-}
 
 static void
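Each control stream carries at most one end-of-stream timestamp request. The first vkCmdWriteTimestamp2 against a stream claims that slot (the firmware fills it when the stream retires); a later write against the same graphics stream is satisfied by a libagx_copy_timestamp dispatch on the post-gfx stream, and for compute the stream is simply split first so the case never arises. A sketch of that rule on stand-in types (not a drop-in implementation):

    #include <stdint.h>

    struct ts_slot { uint64_t addr; };     /* stand-in for cs->timestamp.end */
    struct stream { struct ts_slot end; }; /* stand-in for struct hk_cs */

    /* First write claims the stream's end slot; later writes are redirected
     * to a copy that runs after the stream, modeled here as a callback. */
    static void
    write_timestamp(struct stream *cs, uint64_t report_addr,
                    void (*copy_after_stream)(uint64_t dst, uint64_t src))
    {
       if (cs->end.addr)
          copy_after_stream(report_addr, cs->end.addr);
       else
          cs->end.addr = report_addr;
    }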
@@ -467,10 +570,18 @@ hk_CmdEndQueryIndexedEXT(VkCommandBuffer commandBuffer, VkQueryPool queryPool,
 }
 
 static bool
-hk_query_is_available(struct hk_query_pool *pool, uint32_t query)
+hk_query_is_available(struct hk_device *dev, struct hk_query_pool *pool,
+                      uint32_t query)
 {
-   uint32_t *available = hk_query_available_map(pool, query);
-   return p_atomic_read(available) != 0;
+   if (hk_has_available(pool)) {
+      uint32_t *available = hk_query_available_map(pool, query);
+      return p_atomic_read(available) != 0;
+   } else {
+      const struct hk_query_report *report =
+         hk_query_report_map(dev, pool, query);
+
+      return report->value != LIBAGX_QUERY_UNAVAILABLE;
+   }
 }
 
 #define HK_QUERY_TIMEOUT 2000000000ull
@@ -482,7 +593,7 @@ hk_query_wait_for_available(struct hk_device *dev, struct hk_query_pool *pool,
    uint64_t abs_timeout_ns = os_time_get_absolute_timeout(HK_QUERY_TIMEOUT);
 
    while (os_time_get_nano() < abs_timeout_ns) {
-      if (hk_query_is_available(pool, query))
+      if (hk_query_is_available(dev, pool, query))
          return VK_SUCCESS;
 
       VkResult status = vk_device_check_status(&dev->vk);
@@ -522,7 +633,7 @@ hk_GetQueryPoolResults(VkDevice device, VkQueryPool queryPool,
    for (uint32_t i = 0; i < queryCount; i++) {
       const uint32_t query = firstQuery + i;
 
-      bool available = hk_query_is_available(pool, query);
+      bool available = hk_query_is_available(dev, pool, query);
 
       if (!available && (flags & VK_QUERY_RESULT_WAIT_BIT)) {
          status = hk_query_wait_for_available(dev, pool, query);
@@ -566,6 +677,8 @@ hk_CmdCopyQueryPoolResults(VkCommandBuffer commandBuffer, VkQueryPool queryPool,
    VK_FROM_HANDLE(hk_buffer, dst_buffer, dstBuffer);
 
    struct hk_device *dev = hk_cmd_buffer_device(cmd);
+   hk_flush_if_timestamp(cmd, pool);
+
    struct hk_cs *cs = hk_cmd_buffer_get_cs(cmd, true);
    if (!cs)
       return;
@@ -574,7 +687,7 @@ hk_CmdCopyQueryPoolResults(VkCommandBuffer commandBuffer, VkQueryPool queryPool,
    hk_ensure_cs_has_space(cmd, cs, 0x2000 /* TODO */);
 
    struct libagx_copy_query_args info = {
-      .availability = pool->bo->va->addr,
+      .availability = hk_has_available(pool) ? pool->bo->va->addr : 0,
       .results = pool->oq_queries ? dev->occlusion_queries.bo->va->addr
                                   : pool->bo->va->addr + pool->query_start,
      .oq_index = pool->oq_queries ? pool->bo->va->addr + pool->query_start : 0,
@@ -21,6 +21,11 @@ struct hk_query_pool {
    struct agx_bo *bo;
    void *bo_map;
 
+   /* For timestamp queries, the kernel-assigned timestamp buffer handle.
+    * Unused for all other query types.
+    */
+   uint32_t handle;
+
    unsigned oq_queries;
 };
@@ -68,7 +68,8 @@ queue_submit_empty(struct hk_device *dev, struct hk_queue *queue,
 
 static void
 asahi_fill_cdm_command(struct hk_device *dev, struct hk_cs *cs,
-                       struct drm_asahi_cmd_compute *cmd)
+                       struct drm_asahi_cmd_compute *cmd,
+                       struct drm_asahi_cmd_compute_user_timestamps *timestamps)
 {
    size_t len = cs->stream_linked ? 65536 /* XXX */ : (cs->current - cs->start);
@@ -87,6 +88,18 @@ asahi_fill_cdm_command(struct hk_device *dev, struct hk_cs *cs,
       .unk_mask = 0xffffffff,
    };
 
+   if (cs->timestamp.end.handle) {
+      assert(agx_supports_timestamps(&dev->dev));
+
+      *timestamps = (struct drm_asahi_cmd_compute_user_timestamps){
+         .type = ASAHI_COMPUTE_EXT_TIMESTAMPS,
+         .end_handle = cs->timestamp.end.handle,
+         .end_offset = cs->timestamp.end.offset_B,
+      };
+
+      cmd->extensions = (uint64_t)(uintptr_t)timestamps;
+   }
+
    if (cs->scratch.cs.main || cs->scratch.cs.preamble) {
       cmd->helper_arg = dev->scratch.cs.buf->va->addr;
       cmd->helper_cfg = cs->scratch.cs.preamble ? (1 << 16) : 0;
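The extension struct is linked into the command by address, so it must stay alive until the submit ioctl; that is why queue_submit (later hunks) allocates a ts_inner array parallel to cmds_inner instead of using temporaries local to the fill helpers. A sketch of the chaining under stand-in uapi types (field names follow the diff; the type value is a placeholder):

    #include <stdint.h>

    struct user_ts { uint32_t type, end_handle, end_offset; };
    struct cmd_compute { uint64_t extensions; };

    /* Chain a caller-owned extension struct into the command. 'storage' must
     * outlive the submit ioctl, since only its address is recorded. */
    static void
    attach_timestamps(struct cmd_compute *cmd, struct user_ts *storage,
                      uint32_t handle, uint32_t offset_B)
    {
       *storage = (struct user_ts){
          .type = 0 /* placeholder for ASAHI_COMPUTE_EXT_TIMESTAMPS */,
          .end_handle = handle,
          .end_offset = offset_B,
       };
       cmd->extensions = (uint64_t)(uintptr_t)storage;
    }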
@@ -96,7 +109,8 @@ asahi_fill_cdm_command(struct hk_device *dev, struct hk_cs *cs,
 
 static void
 asahi_fill_vdm_command(struct hk_device *dev, struct hk_cs *cs,
-                       struct drm_asahi_cmd_render *c)
+                       struct drm_asahi_cmd_render *c,
+                       struct drm_asahi_cmd_render_user_timestamps *timestamps)
 {
    unsigned cmd_ta_id = agx_get_global_id(&dev->dev);
    unsigned cmd_3d_id = agx_get_global_id(&dev->dev);
@@ -251,6 +265,18 @@ asahi_fill_vdm_command(struct hk_device *dev, struct hk_cs *cs,
       c->fragment_helper_cfg = cs->scratch.fs.preamble ? (1 << 16) : 0;
       c->fragment_helper_program = agx_helper_program(&dev->bg_eot);
    }
+
+   if (cs->timestamp.end.handle) {
+      assert(agx_supports_timestamps(&dev->dev));
+
+      c->extensions = (uint64_t)(uintptr_t)timestamps;
+
+      *timestamps = (struct drm_asahi_cmd_render_user_timestamps){
+         .type = ASAHI_RENDER_EXT_TIMESTAMPS,
+         .frg_end_handle = cs->timestamp.end.handle,
+         .frg_end_offset = cs->timestamp.end.offset_B,
+      };
+   }
 }
 
 static void
@@ -278,6 +304,11 @@ union drm_asahi_cmd {
    struct drm_asahi_cmd_render render;
 };
 
+union drm_asahi_user_timestamps {
+   struct drm_asahi_cmd_compute_user_timestamps compute;
+   struct drm_asahi_cmd_render_user_timestamps render;
+};
+
 /* XXX: Batching multiple commands per submission is causing rare (7ppm) flakes
  * on the CTS once lossless compression is enabled. This needs to be
  * investigated before we can reenable this mechanism. We are likely missing a
@@ -466,6 +497,8 @@ queue_submit(struct hk_device *dev, struct hk_queue *queue,
    struct drm_asahi_command *cmds = alloca(sizeof(*cmds) * command_count);
    union drm_asahi_cmd *cmds_inner =
       alloca(sizeof(*cmds_inner) * command_count);
+   union drm_asahi_user_timestamps *ts_inner =
+      alloca(sizeof(*ts_inner) * command_count);
 
    unsigned cmd_it = 0;
    unsigned nr_vdm = 0, nr_cdm = 0;
@@ -491,29 +524,33 @@
             "%u: Submitting CDM with %u API calls, %u dispatches, %u flushes",
             i, cs->stats.calls, cs->stats.cmds, cs->stats.flushes);
 
-         assert(cs->stats.cmds > 0 || cs->stats.flushes > 0);
+         assert(cs->stats.cmds > 0 || cs->stats.flushes > 0 ||
+                cs->timestamp.end.handle);
 
          cmd.cmd_type = DRM_ASAHI_CMD_COMPUTE;
          cmd.cmd_buffer_size = sizeof(struct drm_asahi_cmd_compute);
         nr_cdm++;
 
+         asahi_fill_cdm_command(dev, cs, &cmds_inner[cmd_it].compute,
+                                &ts_inner[cmd_it].compute);
+
         /* Work around for shipping 6.11.8 kernels, remove when we bump uapi
          */
-         if (!cmd.extensions)
+         if (!agx_supports_timestamps(&dev->dev))
            cmd.cmd_buffer_size -= 8;
-
-         asahi_fill_cdm_command(dev, cs, &cmds_inner[cmd_it].compute);
       } else {
          assert(cs->type == HK_CS_VDM);
          perf_debug(dev, "%u: Submitting VDM with %u API draws, %u draws", i,
                     cs->stats.calls, cs->stats.cmds);
-         assert(cs->stats.cmds > 0 || cs->cr.process_empty_tiles);
+         assert(cs->stats.cmds > 0 || cs->cr.process_empty_tiles ||
+                cs->timestamp.end.handle);
 
          cmd.cmd_type = DRM_ASAHI_CMD_RENDER;
          cmd.cmd_buffer_size = sizeof(struct drm_asahi_cmd_render);
         nr_vdm++;
 
-         asahi_fill_vdm_command(dev, cs, &cmds_inner[cmd_it].render);
+         asahi_fill_vdm_command(dev, cs, &cmds_inner[cmd_it].render,
+                                &ts_inner[cmd_it].render);
       }
 
       cmds[cmd_it++] = cmd;