mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2025-12-23 22:00:13 +01:00
vulkan: implement VK_AMD_anti_lag as implicit vulkan layer
VkLayer_MESA_anti_lag is a lightweight implicit layer which provides an open-source implementation of the VK_AMD_anti_lag vulkan extension. The algorithm used by this layer is very simplistic and only aims to minimize the delay between calls to vkQueueSubmit or vkQueueSubmit2 and the begin of the execution of the submission. In order to build VkLayer_MESA_anti_lag, pass -Dlayers=anti-lag to meson. It is possible to either install the layer or to use VK_ADD_IMPLICIT_LAYER_PATH=<buildpath>/share/vulkan/implicit_layer.d/ for testing purposes. (Keep in mind that you have to adjust the library_path in the json file in that case.) Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/34242>
This commit is contained in:
parent
699ae0aad9
commit
722ffe9a73
9 changed files with 1715 additions and 1 deletions
|
|
@ -95,6 +95,7 @@ with_vulkan_overlay_layer = get_option('vulkan-layers').contains('overlay')
|
||||||
with_vulkan_device_select_layer = get_option('vulkan-layers').contains('device-select')
|
with_vulkan_device_select_layer = get_option('vulkan-layers').contains('device-select')
|
||||||
with_vulkan_screenshot_layer = get_option('vulkan-layers').contains('screenshot')
|
with_vulkan_screenshot_layer = get_option('vulkan-layers').contains('screenshot')
|
||||||
with_vulkan_vram_report_limit_layer = get_option('vulkan-layers').contains('vram-report-limit')
|
with_vulkan_vram_report_limit_layer = get_option('vulkan-layers').contains('vram-report-limit')
|
||||||
|
with_vulkan_anti_lag_layer = get_option('vulkan-layers').contains('anti-lag')
|
||||||
with_tools = get_option('tools')
|
with_tools = get_option('tools')
|
||||||
if with_tools.contains('all')
|
if with_tools.contains('all')
|
||||||
with_tools = [
|
with_tools = [
|
||||||
|
|
|
||||||
|
|
@ -299,7 +299,7 @@ option(
|
||||||
type : 'array',
|
type : 'array',
|
||||||
value : [],
|
value : [],
|
||||||
choices : [
|
choices : [
|
||||||
'device-select', 'intel-nullhw', 'overlay', 'screenshot',
|
'device-select', 'intel-nullhw', 'overlay', 'screenshot', 'anti-lag',
|
||||||
'vram-report-limit',
|
'vram-report-limit',
|
||||||
],
|
],
|
||||||
description : 'List of vulkan layers to build'
|
description : 'List of vulkan layers to build'
|
||||||
|
|
|
||||||
26
src/vulkan/anti-lag-layer/VkLayer_MESA_anti_lag.json
Normal file
26
src/vulkan/anti-lag-layer/VkLayer_MESA_anti_lag.json
Normal file
|
|
@ -0,0 +1,26 @@
|
||||||
|
{
|
||||||
|
"file_format_version": "1.2.1",
|
||||||
|
"layer": {
|
||||||
|
"name": "VK_LAYER_MESA_anti_lag",
|
||||||
|
"type": "GLOBAL",
|
||||||
|
"library_path": "libVkLayer_MESA_anti_lag.so",
|
||||||
|
"api_version": "1.4.303",
|
||||||
|
"implementation_version": "1",
|
||||||
|
"description": "Open-source implementation of the VK_AMD_anti_lag extension.",
|
||||||
|
"functions": {
|
||||||
|
"vkNegotiateLoaderLayerInterfaceVersion": "anti_lag_NegotiateLoaderLayerInterfaceVersion"
|
||||||
|
},
|
||||||
|
"device_extensions": [
|
||||||
|
{
|
||||||
|
"name": "VK_AMD_anti_lag",
|
||||||
|
"spec_version": "1",
|
||||||
|
"entrypoints": [
|
||||||
|
"vkAntiLagUpdateAMD"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"disable_environment": {
|
||||||
|
"DISABLE_LAYER_MESA_ANTI_LAG": "1"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
590
src/vulkan/anti-lag-layer/anti_lag_layer.c
Normal file
590
src/vulkan/anti-lag-layer/anti_lag_layer.c
Normal file
|
|
@ -0,0 +1,590 @@
|
||||||
|
/*
|
||||||
|
* Copyright © 2025 Valve Corporation
|
||||||
|
*
|
||||||
|
* SPDX-License-Identifier: MIT
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "anti_lag_layer.h"
|
||||||
|
#include <string.h>
|
||||||
|
#include "util/os_time.h"
|
||||||
|
#include "util/simple_mtx.h"
|
||||||
|
#include "vulkan/vulkan_core.h"
|
||||||
|
#include "ringbuffer.h"
|
||||||
|
#include "vk_alloc.h"
|
||||||
|
#include "vk_util.h"
|
||||||
|
|
||||||
|
static bool
|
||||||
|
evaluate_frame(device_context *ctx, frame *frame, bool force_wait)
|
||||||
|
{
|
||||||
|
if (frame->state != FRAME_PRESENT) {
|
||||||
|
/* This frame is not finished yet. */
|
||||||
|
assert(!force_wait);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
int query_flags = VK_QUERY_RESULT_64_BIT | VK_QUERY_RESULT_WAIT_BIT;
|
||||||
|
const uint32_t frame_idx = ringbuffer_index(ctx->frames, frame);
|
||||||
|
|
||||||
|
/* Before we commit to completing a frame, all submits on all queues must have completed. */
|
||||||
|
for (unsigned i = 0; i < ctx->num_queues; i++) {
|
||||||
|
queue_context *queue_ctx = &ctx->queues[i];
|
||||||
|
ringbuffer_lock(queue_ctx->queries);
|
||||||
|
uint64_t expected_signal_value = queue_ctx->semaphore_value - queue_ctx->queries.size +
|
||||||
|
queue_ctx->submissions_per_frame[frame_idx];
|
||||||
|
ringbuffer_unlock(queue_ctx->queries);
|
||||||
|
|
||||||
|
if (force_wait) {
|
||||||
|
/* Wait for the timeline semaphore of the frame to be signaled. */
|
||||||
|
struct VkSemaphoreWaitInfo wait_info = {
|
||||||
|
.sType = VK_STRUCTURE_TYPE_SEMAPHORE_WAIT_INFO,
|
||||||
|
.semaphoreCount = 1,
|
||||||
|
.pSemaphores = &queue_ctx->semaphore,
|
||||||
|
.pValues = &expected_signal_value,
|
||||||
|
};
|
||||||
|
ctx->vtable.WaitSemaphores(ctx->device, &wait_info, 0);
|
||||||
|
} else {
|
||||||
|
/* Return early if the last timeline semaphore of the frame has not been signaled yet. */
|
||||||
|
uint64_t signal_value;
|
||||||
|
ctx->vtable.GetSemaphoreCounterValue(ctx->device, queue_ctx->semaphore, &signal_value);
|
||||||
|
if (signal_value < expected_signal_value)
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* For each queue, retrieve timestamp query results. */
|
||||||
|
for (unsigned i = 0; i < ctx->num_queues; i++) {
|
||||||
|
queue_context *queue_ctx = &ctx->queues[i];
|
||||||
|
|
||||||
|
/* As we hold a global mtx and this is the only place where queries are free'd,
|
||||||
|
* we don't need to lock the query ringbuffer here in order to read the first entry.
|
||||||
|
*/
|
||||||
|
struct query *query = ringbuffer_first(queue_ctx->queries);
|
||||||
|
uint32_t query_idx = ringbuffer_index(queue_ctx->queries, query);
|
||||||
|
int num_timestamps =
|
||||||
|
MIN2(queue_ctx->submissions_per_frame[frame_idx], MAX_QUERIES - query_idx);
|
||||||
|
|
||||||
|
while (num_timestamps > 0) {
|
||||||
|
/* Retreive timestamp results from this queue. */
|
||||||
|
ctx->vtable.GetQueryPoolResults(ctx->device, queue_ctx->queryPool, query_idx,
|
||||||
|
num_timestamps, sizeof(uint64_t), &query->begin_gpu_ts,
|
||||||
|
sizeof(struct query), query_flags);
|
||||||
|
|
||||||
|
ringbuffer_lock(queue_ctx->queries);
|
||||||
|
for (unsigned j = 0; j < num_timestamps; j++) {
|
||||||
|
|
||||||
|
/* Calibrate device timestamps. */
|
||||||
|
query->begin_gpu_ts =
|
||||||
|
ctx->calibration.delta +
|
||||||
|
(uint64_t)(query->begin_gpu_ts * ctx->calibration.timestamp_period);
|
||||||
|
if (query->begin_gpu_ts > query->submit_cpu_ts)
|
||||||
|
frame->min_delay =
|
||||||
|
MIN2(frame->min_delay, query->begin_gpu_ts - query->submit_cpu_ts);
|
||||||
|
|
||||||
|
/* Check if we can reset half of the query pool at once. */
|
||||||
|
uint32_t next_idx = ringbuffer_index(queue_ctx->queries, query) + 1;
|
||||||
|
const bool reset = next_idx == MAX_QUERIES || next_idx == MAX_QUERIES / 2;
|
||||||
|
if (reset) {
|
||||||
|
ringbuffer_unlock(queue_ctx->queries);
|
||||||
|
ctx->vtable.ResetQueryPool(ctx->device, queue_ctx->queryPool,
|
||||||
|
next_idx - MAX_QUERIES / 2, MAX_QUERIES / 2);
|
||||||
|
ringbuffer_lock(queue_ctx->queries);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Free query. */
|
||||||
|
ringbuffer_free(queue_ctx->queries, query);
|
||||||
|
queue_ctx->submissions_per_frame[frame_idx]--;
|
||||||
|
|
||||||
|
query = ringbuffer_first(queue_ctx->queries);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Ensure that the total number of queries across all frames is correct. */
|
||||||
|
ASSERTED uint32_t count = 0;
|
||||||
|
for (unsigned i = 0; i < MAX_FRAMES; i++)
|
||||||
|
count += queue_ctx->submissions_per_frame[i];
|
||||||
|
assert(count == queue_ctx->queries.size);
|
||||||
|
|
||||||
|
query_idx = ringbuffer_index(queue_ctx->queries, query);
|
||||||
|
num_timestamps =
|
||||||
|
MIN2(queue_ctx->submissions_per_frame[frame_idx], MAX_QUERIES - query_idx);
|
||||||
|
|
||||||
|
ringbuffer_unlock(queue_ctx->queries);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
frame->min_delay++; /* wrap UINT64_MAX in case we didn't have any submissions. */
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
static bool
|
||||||
|
calibrate_timestamps(device_context *ctx)
|
||||||
|
{
|
||||||
|
uint64_t ts[2];
|
||||||
|
uint64_t deviation;
|
||||||
|
|
||||||
|
VkCalibratedTimestampInfoKHR info[2] = {
|
||||||
|
{
|
||||||
|
.sType = VK_STRUCTURE_TYPE_CALIBRATED_TIMESTAMP_INFO_KHR,
|
||||||
|
.timeDomain = VK_TIME_DOMAIN_CLOCK_MONOTONIC_KHR,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
.sType = VK_STRUCTURE_TYPE_CALIBRATED_TIMESTAMP_INFO_KHR,
|
||||||
|
.timeDomain = VK_TIME_DOMAIN_DEVICE_KHR,
|
||||||
|
},
|
||||||
|
};
|
||||||
|
|
||||||
|
VkResult result = ctx->vtable.GetCalibratedTimestampsKHR(ctx->device, 2, info, ts, &deviation);
|
||||||
|
if (result == VK_SUCCESS) {
|
||||||
|
/* We take a moving average in order to avoid variance. */
|
||||||
|
int64_t new_delta = ts[0] - (int64_t)(ts[1] * ctx->calibration.timestamp_period);
|
||||||
|
|
||||||
|
if (ctx->calibration.delta == 0) {
|
||||||
|
ctx->calibration.delta = new_delta;
|
||||||
|
} else {
|
||||||
|
int64_t diff = new_delta - ctx->calibration.delta;
|
||||||
|
ctx->calibration.delta += diff / 8;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Take a new calibrated timestamp every second. */
|
||||||
|
ctx->calibration.recalibrate_when = ts[0] + 1000000000ull;
|
||||||
|
}
|
||||||
|
|
||||||
|
return result == VK_SUCCESS;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void
|
||||||
|
begin_next_frame(device_context *ctx)
|
||||||
|
{
|
||||||
|
frame *next_frame;
|
||||||
|
if (ctx->active_frame) {
|
||||||
|
assert(ctx->active_frame->state == FRAME_SUBMIT);
|
||||||
|
ctx->active_frame->state = FRAME_PRESENT;
|
||||||
|
next_frame = ringbuffer_next(ctx->frames, ctx->active_frame);
|
||||||
|
} else {
|
||||||
|
next_frame = ringbuffer_last(ctx->frames);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* If there is a frame ready, it becomes active. */
|
||||||
|
if (next_frame->state == FRAME_INPUT) {
|
||||||
|
next_frame->state = FRAME_SUBMIT;
|
||||||
|
ctx->active_frame = next_frame;
|
||||||
|
} else {
|
||||||
|
ctx->active_frame = NULL;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static void
|
||||||
|
anti_lag_disable(device_context *ctx)
|
||||||
|
{
|
||||||
|
ringbuffer_lock(ctx->frames);
|
||||||
|
while (ctx->frames.size) {
|
||||||
|
/* Set force-wait=true, so that all pending timestamp queries get completed. */
|
||||||
|
begin_next_frame(ctx);
|
||||||
|
frame *frame = ringbuffer_first(ctx->frames);
|
||||||
|
evaluate_frame(ctx, frame, true);
|
||||||
|
frame->state = FRAME_INVALID;
|
||||||
|
ringbuffer_free(ctx->frames, frame);
|
||||||
|
}
|
||||||
|
assert(!ctx->active_frame);
|
||||||
|
ringbuffer_unlock(ctx->frames);
|
||||||
|
}
|
||||||
|
|
||||||
|
#define TARGET_DELAY 4000000ll /* 4 ms */
|
||||||
|
/**
|
||||||
|
* Returns the amount of time that we want the next frame to be delayed.
|
||||||
|
*
|
||||||
|
* The algorithm used by this function is very simplistic and only aims
|
||||||
|
* to minimize the delay between calls to vkQueueSubmit or vkQueueSubmit2
|
||||||
|
* and the begin of the execution of the submission.
|
||||||
|
*/
|
||||||
|
static int64_t
|
||||||
|
get_wait_time(device_context *ctx)
|
||||||
|
{
|
||||||
|
/* Take the previous evaluated frame's delay as baseline. */
|
||||||
|
int64_t imposed_delay = ctx->base_delay;
|
||||||
|
int64_t adaptation = 0;
|
||||||
|
|
||||||
|
ringbuffer_lock(ctx->frames);
|
||||||
|
/* In case our ringbuffer is completely full and no frame is in PRESENT stage,
|
||||||
|
* just move the oldest frame to PRESENT stage, and force-wait.
|
||||||
|
*/
|
||||||
|
bool force_wait = ctx->frames.size == MAX_FRAMES;
|
||||||
|
frame *next_frame = ringbuffer_first(ctx->frames);
|
||||||
|
if (force_wait && next_frame->state != FRAME_PRESENT)
|
||||||
|
begin_next_frame(ctx);
|
||||||
|
|
||||||
|
/* Also force-wait for the oldest frame if there is already 2 frames in PRESENT stage. */
|
||||||
|
force_wait |= ringbuffer_next(ctx->frames, next_frame)->state == FRAME_PRESENT;
|
||||||
|
ringbuffer_unlock(ctx->frames);
|
||||||
|
|
||||||
|
/* Take new evaluated frames into consideration. */
|
||||||
|
while (evaluate_frame(ctx, next_frame, force_wait)) {
|
||||||
|
|
||||||
|
if (next_frame->min_delay < TARGET_DELAY / 2 && ctx->adaptation <= 0) {
|
||||||
|
/* If there is no delay between submission and GPU start, halve the base delay and
|
||||||
|
* set the delay for this frame to zero, in order to account for sudden changes.
|
||||||
|
*/
|
||||||
|
ctx->base_delay = ctx->base_delay / 2;
|
||||||
|
adaptation = -ctx->base_delay;
|
||||||
|
} else {
|
||||||
|
/* We use some kind of exponential weighted moving average function here,
|
||||||
|
* in order to determine a base-delay. We use a smoothing-factor of roughly
|
||||||
|
* 3%, but don't discount the previous value. This helps keeping the delay
|
||||||
|
* slightly below the target of 5 ms, most of the time.
|
||||||
|
*/
|
||||||
|
int64_t diff = (int64_t)next_frame->min_delay - TARGET_DELAY;
|
||||||
|
ctx->base_delay = MAX2(0, ctx->base_delay + diff / 32); /* corresponds to ~3 % */
|
||||||
|
|
||||||
|
/* As the base-delay gets adjusted rather slowly, we additionally use the half of the
|
||||||
|
* diff as adaptation delay to account for sudden changes. A quarter of the adaptation
|
||||||
|
* is then subtracted for the next frame, so that we can avoid overcompensation.
|
||||||
|
*/
|
||||||
|
adaptation = diff / 2 - ctx->adaptation / 4;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* We only need space for one frame. */
|
||||||
|
force_wait = false;
|
||||||
|
|
||||||
|
ringbuffer_lock(ctx->frames);
|
||||||
|
next_frame->state = FRAME_INVALID;
|
||||||
|
ringbuffer_free(ctx->frames, next_frame);
|
||||||
|
next_frame = ringbuffer_first(ctx->frames);
|
||||||
|
ringbuffer_unlock(ctx->frames);
|
||||||
|
}
|
||||||
|
imposed_delay = ctx->base_delay + adaptation;
|
||||||
|
ctx->adaptation = adaptation;
|
||||||
|
|
||||||
|
if (imposed_delay > 100000000) {
|
||||||
|
/* This corresponds to <10 FPS. Something might have gone wrong. */
|
||||||
|
calibrate_timestamps(ctx);
|
||||||
|
ctx->base_delay = ctx->adaptation = imposed_delay = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
return MAX2(0, imposed_delay);
|
||||||
|
}
|
||||||
|
|
||||||
|
static void
|
||||||
|
reset_frame(frame *frame)
|
||||||
|
{
|
||||||
|
assert(frame->state == FRAME_INVALID);
|
||||||
|
frame->frame_idx = 0;
|
||||||
|
frame->frame_start_time = 0;
|
||||||
|
frame->min_delay = UINT64_MAX;
|
||||||
|
frame->state = FRAME_INPUT;
|
||||||
|
}
|
||||||
|
|
||||||
|
VKAPI_ATTR void VKAPI_CALL
|
||||||
|
anti_lag_AntiLagUpdateAMD(VkDevice device, const VkAntiLagDataAMD *pData)
|
||||||
|
{
|
||||||
|
if (pData == NULL)
|
||||||
|
return;
|
||||||
|
|
||||||
|
device_context *ctx = get_device_context(device);
|
||||||
|
if (pData->mode == VK_ANTI_LAG_MODE_OFF_AMD) {
|
||||||
|
/* Application request to disable Anti-Lag. */
|
||||||
|
simple_mtx_lock(&ctx->mtx);
|
||||||
|
anti_lag_disable(ctx);
|
||||||
|
simple_mtx_unlock(&ctx->mtx);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
uint64_t frame_idx = 0;
|
||||||
|
int64_t now = os_time_get_nano();
|
||||||
|
int64_t imposed_delay = 0;
|
||||||
|
int64_t last_frame_begin = 0;
|
||||||
|
|
||||||
|
if (pData->pPresentationInfo) {
|
||||||
|
/* The same frameIndex value should be used with VK_ANTI_LAG_STAGE_INPUT_AMD before
|
||||||
|
* the frame begins and with VK_ANTI_LAG_STAGE_PRESENT_AMD when the frame ends.
|
||||||
|
*/
|
||||||
|
frame_idx = pData->pPresentationInfo->frameIndex;
|
||||||
|
|
||||||
|
/* This marks the end of the current frame. */
|
||||||
|
if (pData->pPresentationInfo->stage == VK_ANTI_LAG_STAGE_PRESENT_AMD) {
|
||||||
|
/* If there is already a new frame pending, any submission that happens afterwards
|
||||||
|
* gets associated with the new frame.
|
||||||
|
*/
|
||||||
|
ringbuffer_lock(ctx->frames);
|
||||||
|
/* Check that the currently active frame is indeed the frame we are ending now. */
|
||||||
|
while (ctx->active_frame && ctx->active_frame->frame_idx <= frame_idx) {
|
||||||
|
begin_next_frame(ctx);
|
||||||
|
}
|
||||||
|
ringbuffer_unlock(ctx->frames);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Lock this function, in order to avoid race conditions on frame allocation. */
|
||||||
|
simple_mtx_lock(&ctx->mtx);
|
||||||
|
|
||||||
|
/* VK_ANTI_LAG_STAGE_INPUT_AMD: This marks the begin of a new frame.
|
||||||
|
* Evaluate previous frames in order to determine the wait time.
|
||||||
|
*/
|
||||||
|
imposed_delay = get_wait_time(ctx);
|
||||||
|
int64_t next_deadline = now + imposed_delay;
|
||||||
|
|
||||||
|
/* Ensure maxFPS adherence. */
|
||||||
|
if (pData->maxFPS) {
|
||||||
|
int64_t frametime_period = 1000000000u / pData->maxFPS;
|
||||||
|
last_frame_begin = ringbuffer_last(ctx->frames)->frame_start_time;
|
||||||
|
next_deadline = MAX2(next_deadline, last_frame_begin + frametime_period);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Recalibrate every now and then. */
|
||||||
|
if (next_deadline > ctx->calibration.recalibrate_when)
|
||||||
|
calibrate_timestamps(ctx);
|
||||||
|
|
||||||
|
/* Sleep until deadline is met. */
|
||||||
|
os_time_nanosleep_until(next_deadline);
|
||||||
|
|
||||||
|
/* Initialize new frame. */
|
||||||
|
ringbuffer_lock(ctx->frames);
|
||||||
|
frame *new_frame = ringbuffer_alloc(ctx->frames);
|
||||||
|
reset_frame(new_frame);
|
||||||
|
new_frame->frame_start_time = next_deadline;
|
||||||
|
new_frame->imposed_delay = imposed_delay;
|
||||||
|
new_frame->frame_idx = frame_idx;
|
||||||
|
|
||||||
|
/* Immediately set the frame active if there is no other frame already active. */
|
||||||
|
if (!ctx->active_frame)
|
||||||
|
begin_next_frame(ctx);
|
||||||
|
|
||||||
|
ringbuffer_unlock(ctx->frames);
|
||||||
|
simple_mtx_unlock(&ctx->mtx);
|
||||||
|
}
|
||||||
|
|
||||||
|
static queue_context *
|
||||||
|
get_queue_context(device_context *ctx, VkQueue queue)
|
||||||
|
{
|
||||||
|
for (unsigned i = 0; i < ctx->num_queues; i++) {
|
||||||
|
if (ctx->queues[i].queue == queue)
|
||||||
|
return &ctx->queues[i];
|
||||||
|
}
|
||||||
|
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
static struct query *
|
||||||
|
allocate_query(device_context *ctx, queue_context *queue_ctx)
|
||||||
|
{
|
||||||
|
if (!ctx->active_frame)
|
||||||
|
return NULL;
|
||||||
|
|
||||||
|
/* Allow for a single frame to use at most half of the query pool. */
|
||||||
|
uint32_t frame_idx = ringbuffer_index(ctx->frames, ctx->active_frame);
|
||||||
|
if (queue_ctx->submissions_per_frame[frame_idx] > MAX_QUERIES / 2)
|
||||||
|
return NULL;
|
||||||
|
|
||||||
|
/* Check that the next query index has been reset properly:
|
||||||
|
*
|
||||||
|
* We use some double-buffering here in order to reduce the number of
|
||||||
|
* VkResetQueryPool commands.
|
||||||
|
* Return false if the next query-index allocation crosses into the half
|
||||||
|
* which still contains active queries,
|
||||||
|
*/
|
||||||
|
if (queue_ctx->queries.size > MAX_QUERIES / 2) {
|
||||||
|
struct query *last_query = ringbuffer_last(queue_ctx->queries);
|
||||||
|
uint32_t next_idx = ringbuffer_index(queue_ctx->queries, last_query) + 1;
|
||||||
|
if (next_idx == MAX_QUERIES || next_idx == MAX_QUERIES / 2)
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
return ringbuffer_alloc(queue_ctx->queries);
|
||||||
|
}
|
||||||
|
|
||||||
|
static bool
|
||||||
|
get_commandbuffer(device_context *ctx, queue_context *queue_ctx, VkCommandBuffer *cmdbuffer)
|
||||||
|
{
|
||||||
|
uint64_t now = os_time_get_nano();
|
||||||
|
|
||||||
|
/* Begin critical section. */
|
||||||
|
ringbuffer_lock(ctx->frames);
|
||||||
|
ringbuffer_lock(queue_ctx->queries);
|
||||||
|
struct query *query = allocate_query(ctx, queue_ctx);
|
||||||
|
if (query == NULL) {
|
||||||
|
ringbuffer_unlock(queue_ctx->queries);
|
||||||
|
ringbuffer_unlock(ctx->frames);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
query->submit_cpu_ts = now;
|
||||||
|
|
||||||
|
/* Assign commandBuffer for timestamp. */
|
||||||
|
*cmdbuffer = query->cmdbuffer;
|
||||||
|
|
||||||
|
/* Increment timeline semaphore count. */
|
||||||
|
queue_ctx->semaphore_value++;
|
||||||
|
|
||||||
|
/* Add new submission entry for the current frame */
|
||||||
|
assert(ctx->active_frame->state == FRAME_SUBMIT);
|
||||||
|
uint32_t frame_idx = ringbuffer_index(ctx->frames, ctx->active_frame);
|
||||||
|
queue_ctx->submissions_per_frame[frame_idx]++;
|
||||||
|
|
||||||
|
ringbuffer_unlock(queue_ctx->queries);
|
||||||
|
ringbuffer_unlock(ctx->frames);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
static VkResult
|
||||||
|
queue_submit2(device_context *ctx, VkQueue queue, uint32_t submitCount,
|
||||||
|
const VkSubmitInfo2 *pSubmits, VkFence fence, PFN_vkQueueSubmit2 queueSubmit2)
|
||||||
|
{
|
||||||
|
queue_context *queue_ctx = get_queue_context(ctx, queue);
|
||||||
|
if (!ctx->active_frame || !queue_ctx)
|
||||||
|
return queueSubmit2(queue, submitCount, pSubmits, fence);
|
||||||
|
|
||||||
|
int first = -1;
|
||||||
|
VkCommandBuffer timestamp_cmdbuffer;
|
||||||
|
/* Check if any submission contains commandbuffers. */
|
||||||
|
for (unsigned i = 0; i < submitCount; i++) {
|
||||||
|
if (pSubmits[i].commandBufferInfoCount) {
|
||||||
|
first = i;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Get timestamp commandbuffer. */
|
||||||
|
if (first == -1 || !get_commandbuffer(ctx, queue_ctx, ×tamp_cmdbuffer))
|
||||||
|
return queueSubmit2(queue, submitCount, pSubmits, fence);
|
||||||
|
|
||||||
|
VkSubmitInfo2 *submits;
|
||||||
|
VkCommandBufferSubmitInfo *cmdbuffers;
|
||||||
|
VkSemaphoreSubmitInfo *semaphores;
|
||||||
|
VK_MULTIALLOC(ma);
|
||||||
|
vk_multialloc_add(&ma, &submits, VkSubmitInfo2, submitCount);
|
||||||
|
vk_multialloc_add(&ma, &cmdbuffers, VkCommandBufferSubmitInfo,
|
||||||
|
pSubmits[first].commandBufferInfoCount + 1);
|
||||||
|
vk_multialloc_add(&ma, &semaphores, VkSemaphoreSubmitInfo,
|
||||||
|
pSubmits[first].signalSemaphoreInfoCount + 1);
|
||||||
|
void *buf = vk_multialloc_zalloc(&ma, &ctx->alloc, VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
|
||||||
|
if (!buf)
|
||||||
|
return VK_ERROR_OUT_OF_HOST_MEMORY;
|
||||||
|
|
||||||
|
memcpy(submits, pSubmits, sizeof(VkSubmitInfo2) * submitCount);
|
||||||
|
VkSubmitInfo2 *submit_info = &submits[first];
|
||||||
|
|
||||||
|
/* Add commandbuffer to submission. */
|
||||||
|
cmdbuffers[0] = (VkCommandBufferSubmitInfo){
|
||||||
|
.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_SUBMIT_INFO,
|
||||||
|
.commandBuffer = timestamp_cmdbuffer,
|
||||||
|
};
|
||||||
|
memcpy(&cmdbuffers[1], submit_info->pCommandBufferInfos,
|
||||||
|
sizeof(VkCommandBufferSubmitInfo) * submit_info->commandBufferInfoCount);
|
||||||
|
submit_info->pCommandBufferInfos = cmdbuffers;
|
||||||
|
submit_info->commandBufferInfoCount++;
|
||||||
|
|
||||||
|
/* Add timeline semaphore to submission. */
|
||||||
|
memcpy(semaphores, submit_info->pSignalSemaphoreInfos,
|
||||||
|
sizeof(VkSemaphoreSubmitInfo) * submit_info->signalSemaphoreInfoCount);
|
||||||
|
semaphores[submit_info->signalSemaphoreInfoCount] = (VkSemaphoreSubmitInfo){
|
||||||
|
.sType = VK_STRUCTURE_TYPE_TIMELINE_SEMAPHORE_SUBMIT_INFO,
|
||||||
|
.semaphore = queue_ctx->semaphore,
|
||||||
|
.value = queue_ctx->semaphore_value,
|
||||||
|
.stageMask = VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
|
||||||
|
};
|
||||||
|
submit_info->pSignalSemaphoreInfos = semaphores;
|
||||||
|
submit_info->signalSemaphoreInfoCount++;
|
||||||
|
|
||||||
|
/* Submit with added timestamp query commandbuffer. */
|
||||||
|
VkResult res = queueSubmit2(queue, submitCount, submits, fence);
|
||||||
|
vk_free(&ctx->alloc, submits);
|
||||||
|
return res;
|
||||||
|
}
|
||||||
|
|
||||||
|
VKAPI_ATTR VkResult VKAPI_CALL
|
||||||
|
anti_lag_QueueSubmit2KHR(VkQueue queue, uint32_t submitCount, const VkSubmitInfo2 *pSubmits,
|
||||||
|
VkFence fence)
|
||||||
|
{
|
||||||
|
device_context *ctx = get_device_context(queue);
|
||||||
|
return queue_submit2(ctx, queue, submitCount, pSubmits, fence, ctx->vtable.QueueSubmit2KHR);
|
||||||
|
}
|
||||||
|
|
||||||
|
VKAPI_ATTR VkResult VKAPI_CALL
|
||||||
|
anti_lag_QueueSubmit2(VkQueue queue, uint32_t submitCount, const VkSubmitInfo2 *pSubmits,
|
||||||
|
VkFence fence)
|
||||||
|
{
|
||||||
|
device_context *ctx = get_device_context(queue);
|
||||||
|
return queue_submit2(ctx, queue, submitCount, pSubmits, fence, ctx->vtable.QueueSubmit2);
|
||||||
|
}
|
||||||
|
|
||||||
|
VKAPI_ATTR VkResult VKAPI_CALL
|
||||||
|
anti_lag_QueueSubmit(VkQueue queue, uint32_t submitCount, const VkSubmitInfo *pSubmits,
|
||||||
|
VkFence fence)
|
||||||
|
{
|
||||||
|
device_context *ctx = get_device_context(queue);
|
||||||
|
queue_context *queue_ctx = get_queue_context(ctx, queue);
|
||||||
|
if (!ctx->active_frame || !queue_ctx)
|
||||||
|
return ctx->vtable.QueueSubmit(queue, submitCount, pSubmits, fence);
|
||||||
|
|
||||||
|
int first = -1;
|
||||||
|
VkCommandBuffer timestamp_cmdbuffer;
|
||||||
|
/* Check if any submission contains commandbuffers. */
|
||||||
|
for (unsigned i = 0; i < submitCount; i++) {
|
||||||
|
if (pSubmits[i].commandBufferCount) {
|
||||||
|
first = i;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Get timestamp commandbuffer. */
|
||||||
|
if (first == -1 || !get_commandbuffer(ctx, queue_ctx, ×tamp_cmdbuffer))
|
||||||
|
return ctx->vtable.QueueSubmit(queue, submitCount, pSubmits, fence);
|
||||||
|
|
||||||
|
VkSubmitInfo *submits;
|
||||||
|
VkCommandBuffer *cmdbuffers;
|
||||||
|
VkSemaphore *semaphores;
|
||||||
|
VkTimelineSemaphoreSubmitInfo *semaphore_info;
|
||||||
|
uint64_t *semaphore_values;
|
||||||
|
VK_MULTIALLOC(ma);
|
||||||
|
vk_multialloc_add(&ma, &submits, VkSubmitInfo, submitCount);
|
||||||
|
vk_multialloc_add(&ma, &cmdbuffers, VkCommandBuffer, pSubmits[first].commandBufferCount + 1);
|
||||||
|
vk_multialloc_add(&ma, &semaphores, VkSemaphore, pSubmits[first].signalSemaphoreCount + 1);
|
||||||
|
vk_multialloc_add(&ma, &semaphore_info, VkTimelineSemaphoreSubmitInfo, 1);
|
||||||
|
vk_multialloc_add(&ma, &semaphore_values, uint64_t, pSubmits[first].signalSemaphoreCount + 1);
|
||||||
|
void *buf = vk_multialloc_zalloc(&ma, &ctx->alloc, VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
|
||||||
|
if (!buf)
|
||||||
|
return VK_ERROR_OUT_OF_HOST_MEMORY;
|
||||||
|
|
||||||
|
memcpy(submits, pSubmits, sizeof(VkSubmitInfo) * submitCount);
|
||||||
|
VkSubmitInfo *submit_info = &submits[first];
|
||||||
|
|
||||||
|
/* Add commandbuffer to submission. */
|
||||||
|
cmdbuffers[0] = timestamp_cmdbuffer;
|
||||||
|
memcpy(&cmdbuffers[1], submit_info->pCommandBuffers,
|
||||||
|
sizeof(VkCommandBuffer) * submit_info->commandBufferCount);
|
||||||
|
submit_info->pCommandBuffers = cmdbuffers;
|
||||||
|
submit_info->commandBufferCount++;
|
||||||
|
|
||||||
|
/* Add timeline semaphore to submission. */
|
||||||
|
const VkTimelineSemaphoreSubmitInfo *tlssi =
|
||||||
|
vk_find_struct_const(pSubmits[first].pNext, TIMELINE_SEMAPHORE_SUBMIT_INFO);
|
||||||
|
semaphores[0] = queue_ctx->semaphore;
|
||||||
|
memcpy(&semaphores[1], submit_info->pSignalSemaphores,
|
||||||
|
sizeof(VkSemaphore) * submit_info->signalSemaphoreCount);
|
||||||
|
submit_info->pSignalSemaphores = semaphores;
|
||||||
|
submit_info->signalSemaphoreCount++;
|
||||||
|
semaphore_values[0] = queue_ctx->semaphore_value;
|
||||||
|
if (tlssi) {
|
||||||
|
*semaphore_info = *tlssi; /* save original values */
|
||||||
|
memcpy(&semaphore_values[1], tlssi->pSignalSemaphoreValues,
|
||||||
|
sizeof(uint64_t) * tlssi->signalSemaphoreValueCount);
|
||||||
|
((VkTimelineSemaphoreSubmitInfo *)tlssi)->pSignalSemaphoreValues = semaphore_values;
|
||||||
|
((VkTimelineSemaphoreSubmitInfo *)tlssi)->signalSemaphoreValueCount =
|
||||||
|
submit_info->signalSemaphoreCount;
|
||||||
|
} else {
|
||||||
|
*semaphore_info = (VkTimelineSemaphoreSubmitInfo){
|
||||||
|
.sType = VK_STRUCTURE_TYPE_TIMELINE_SEMAPHORE_SUBMIT_INFO,
|
||||||
|
.pNext = submit_info->pNext,
|
||||||
|
.signalSemaphoreValueCount = submit_info->signalSemaphoreCount,
|
||||||
|
.pSignalSemaphoreValues = semaphore_values,
|
||||||
|
};
|
||||||
|
submit_info->pNext = semaphore_info;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Submit with added timestamp query commandbuffer. */
|
||||||
|
VkResult res = ctx->vtable.QueueSubmit(queue, submitCount, submits, fence);
|
||||||
|
if (tlssi)
|
||||||
|
*(VkTimelineSemaphoreSubmitInfo *)tlssi = *semaphore_info; /* restore */
|
||||||
|
vk_free(&ctx->alloc, buf);
|
||||||
|
return res;
|
||||||
|
}
|
||||||
111
src/vulkan/anti-lag-layer/anti_lag_layer.h
Normal file
111
src/vulkan/anti-lag-layer/anti_lag_layer.h
Normal file
|
|
@ -0,0 +1,111 @@
|
||||||
|
/*
|
||||||
|
* Copyright © 2025 Valve Corporation
|
||||||
|
*
|
||||||
|
* SPDX-License-Identifier: MIT
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef ANTI_LAG_LAYER_H
|
||||||
|
#define ANTI_LAG_LAYER_H
|
||||||
|
|
||||||
|
#include "util/simple_mtx.h"
|
||||||
|
#include "vulkan/vk_layer.h"
|
||||||
|
#include "vulkan/vulkan_core.h"
|
||||||
|
#include "ringbuffer.h"
|
||||||
|
|
||||||
|
#define MAX_FRAMES 8
|
||||||
|
#define MAX_QUERIES 256
|
||||||
|
|
||||||
|
enum frame_state {
|
||||||
|
FRAME_INVALID = 0,
|
||||||
|
FRAME_INPUT, /* Frame is in input stage. */
|
||||||
|
FRAME_SUBMIT, /* All current queueSubmit calls are associated with this frame. */
|
||||||
|
FRAME_PRESENT, /* Frame is in present stage and latencies can be evaluated. */
|
||||||
|
};
|
||||||
|
|
||||||
|
typedef struct frame {
|
||||||
|
uint64_t frame_idx;
|
||||||
|
uint64_t frame_start_time;
|
||||||
|
uint64_t min_delay;
|
||||||
|
uint64_t imposed_delay;
|
||||||
|
enum frame_state state;
|
||||||
|
} frame;
|
||||||
|
|
||||||
|
struct query {
|
||||||
|
uint64_t begin_gpu_ts;
|
||||||
|
uint64_t submit_cpu_ts;
|
||||||
|
VkCommandBuffer cmdbuffer;
|
||||||
|
};
|
||||||
|
|
||||||
|
typedef struct queue_context {
|
||||||
|
VkQueue queue;
|
||||||
|
uint32_t queue_family_idx;
|
||||||
|
VkCommandPool cmdPool;
|
||||||
|
VkQueryPool queryPool;
|
||||||
|
VkSemaphore semaphore;
|
||||||
|
uint64_t semaphore_value;
|
||||||
|
uint8_t submissions_per_frame[MAX_FRAMES];
|
||||||
|
RINGBUFFER_DECLARE(queries, struct query, MAX_QUERIES);
|
||||||
|
} queue_context;
|
||||||
|
|
||||||
|
typedef struct device_context {
|
||||||
|
|
||||||
|
struct DeviceDispatchTable {
|
||||||
|
#define DECLARE_HOOK(fn) PFN_vk##fn fn
|
||||||
|
DECLARE_HOOK(GetDeviceProcAddr);
|
||||||
|
DECLARE_HOOK(SetDeviceLoaderData);
|
||||||
|
DECLARE_HOOK(DestroyDevice);
|
||||||
|
DECLARE_HOOK(QueueSubmit);
|
||||||
|
DECLARE_HOOK(QueueSubmit2);
|
||||||
|
DECLARE_HOOK(QueueSubmit2KHR);
|
||||||
|
DECLARE_HOOK(GetDeviceQueue);
|
||||||
|
DECLARE_HOOK(CreateCommandPool);
|
||||||
|
DECLARE_HOOK(DestroyCommandPool);
|
||||||
|
DECLARE_HOOK(CreateQueryPool);
|
||||||
|
DECLARE_HOOK(ResetQueryPool);
|
||||||
|
DECLARE_HOOK(DestroyQueryPool);
|
||||||
|
DECLARE_HOOK(GetQueryPoolResults);
|
||||||
|
DECLARE_HOOK(AllocateCommandBuffers);
|
||||||
|
DECLARE_HOOK(FreeCommandBuffers);
|
||||||
|
DECLARE_HOOK(BeginCommandBuffer);
|
||||||
|
DECLARE_HOOK(EndCommandBuffer);
|
||||||
|
DECLARE_HOOK(GetCalibratedTimestampsKHR);
|
||||||
|
DECLARE_HOOK(CmdWriteTimestamp);
|
||||||
|
DECLARE_HOOK(CreateSemaphore);
|
||||||
|
DECLARE_HOOK(DestroySemaphore);
|
||||||
|
DECLARE_HOOK(GetSemaphoreCounterValue);
|
||||||
|
DECLARE_HOOK(WaitSemaphores);
|
||||||
|
#undef DECLARE_HOOK
|
||||||
|
} vtable;
|
||||||
|
|
||||||
|
VkDevice device;
|
||||||
|
VkAllocationCallbacks alloc;
|
||||||
|
simple_mtx_t mtx;
|
||||||
|
|
||||||
|
struct {
|
||||||
|
int64_t delta;
|
||||||
|
uint64_t recalibrate_when;
|
||||||
|
float timestamp_period;
|
||||||
|
} calibration;
|
||||||
|
|
||||||
|
RINGBUFFER_DECLARE(frames, frame, MAX_FRAMES);
|
||||||
|
frame *active_frame;
|
||||||
|
int64_t base_delay;
|
||||||
|
int64_t adaptation;
|
||||||
|
|
||||||
|
unsigned num_queues;
|
||||||
|
queue_context queues[];
|
||||||
|
} device_context;
|
||||||
|
|
||||||
|
device_context *get_device_context(const void *object);
|
||||||
|
|
||||||
|
void anti_lag_AntiLagUpdateAMD(VkDevice device, const VkAntiLagDataAMD *pData);
|
||||||
|
VkResult anti_lag_QueueSubmit2KHR(VkQueue queue, uint32_t submitCount,
|
||||||
|
const VkSubmitInfo2 *pSubmits, VkFence fence);
|
||||||
|
VkResult anti_lag_QueueSubmit2(VkQueue queue, uint32_t submitCount, const VkSubmitInfo2 *pSubmits,
|
||||||
|
VkFence fence);
|
||||||
|
VkResult anti_lag_QueueSubmit(VkQueue queue, uint32_t submitCount, const VkSubmitInfo *pSubmits,
|
||||||
|
VkFence fence);
|
||||||
|
|
||||||
|
VkResult anti_lag_NegotiateLoaderLayerInterfaceVersion(VkNegotiateLayerInterface *pVersionStruct);
|
||||||
|
|
||||||
|
#endif /* ANTI_LAG_LAYER_H */
|
||||||
899
src/vulkan/anti-lag-layer/anti_lag_layer_interface.c
Normal file
899
src/vulkan/anti-lag-layer/anti_lag_layer_interface.c
Normal file
|
|
@ -0,0 +1,899 @@
|
||||||
|
/*
|
||||||
|
* Copyright © 2025 Valve Corporation
|
||||||
|
*
|
||||||
|
* SPDX-License-Identifier: MIT
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "util/simple_mtx.h"
|
||||||
|
#include "vulkan/vk_layer.h"
|
||||||
|
#include "vulkan/vulkan_core.h"
|
||||||
|
#include "anti_lag_layer.h"
|
||||||
|
#include "vk_alloc.h"
|
||||||
|
#include "vk_util.h"
|
||||||
|
|
||||||
|
static uintptr_t
|
||||||
|
object_to_key(const void *object)
|
||||||
|
{
|
||||||
|
return (uintptr_t)*(uintptr_t *)object;
|
||||||
|
}
|
||||||
|
|
||||||
|
typedef struct instance_data {
|
||||||
|
struct InstanceDispatchTable {
|
||||||
|
#define DECLARE_HOOK(fn) PFN_vk##fn fn
|
||||||
|
DECLARE_HOOK(GetInstanceProcAddr);
|
||||||
|
DECLARE_HOOK(CreateInstance);
|
||||||
|
DECLARE_HOOK(DestroyInstance);
|
||||||
|
DECLARE_HOOK(CreateDevice);
|
||||||
|
DECLARE_HOOK(EnumerateDeviceExtensionProperties);
|
||||||
|
DECLARE_HOOK(GetPhysicalDeviceFeatures2KHR);
|
||||||
|
DECLARE_HOOK(GetPhysicalDeviceFeatures2);
|
||||||
|
DECLARE_HOOK(GetPhysicalDeviceProperties);
|
||||||
|
DECLARE_HOOK(GetPhysicalDeviceCalibrateableTimeDomainsEXT);
|
||||||
|
DECLARE_HOOK(GetPhysicalDeviceCalibrateableTimeDomainsKHR);
|
||||||
|
DECLARE_HOOK(GetPhysicalDeviceQueueFamilyProperties);
|
||||||
|
#undef DECLARE_HOOK
|
||||||
|
} vtable;
|
||||||
|
|
||||||
|
VkInstance instance;
|
||||||
|
uint32_t apiVersion;
|
||||||
|
VkAllocationCallbacks alloc;
|
||||||
|
struct instance_data *next;
|
||||||
|
} instance_data;
|
||||||
|
|
||||||
|
static void
|
||||||
|
init_instance_vtable(instance_data *ctx, PFN_vkGetInstanceProcAddr gpa)
|
||||||
|
{
|
||||||
|
ctx->vtable.GetInstanceProcAddr = gpa;
|
||||||
|
#define INIT_HOOK(fn) ctx->vtable.fn = (PFN_vk##fn)gpa(ctx->instance, "vk" #fn)
|
||||||
|
INIT_HOOK(CreateInstance);
|
||||||
|
INIT_HOOK(DestroyInstance);
|
||||||
|
INIT_HOOK(CreateDevice);
|
||||||
|
INIT_HOOK(EnumerateDeviceExtensionProperties);
|
||||||
|
INIT_HOOK(GetPhysicalDeviceFeatures2KHR);
|
||||||
|
INIT_HOOK(GetPhysicalDeviceFeatures2);
|
||||||
|
INIT_HOOK(GetPhysicalDeviceProperties);
|
||||||
|
INIT_HOOK(GetPhysicalDeviceCalibrateableTimeDomainsEXT);
|
||||||
|
INIT_HOOK(GetPhysicalDeviceCalibrateableTimeDomainsKHR);
|
||||||
|
INIT_HOOK(GetPhysicalDeviceQueueFamilyProperties);
|
||||||
|
#undef INIT_HOOK
|
||||||
|
}
|
||||||
|
|
||||||
|
static simple_mtx_t instance_mtx = SIMPLE_MTX_INITIALIZER;
|
||||||
|
static instance_data *instance_list = NULL;
|
||||||
|
|
||||||
|
static void
|
||||||
|
add_instance(instance_data *instance)
|
||||||
|
{
|
||||||
|
simple_mtx_lock(&instance_mtx);
|
||||||
|
instance_data **ptr = &instance_list;
|
||||||
|
while (*ptr != NULL)
|
||||||
|
ptr = &(*ptr)->next;
|
||||||
|
*ptr = instance;
|
||||||
|
simple_mtx_unlock(&instance_mtx);
|
||||||
|
}
|
||||||
|
|
||||||
|
static instance_data *
|
||||||
|
remove_instance(const void *object)
|
||||||
|
{
|
||||||
|
uintptr_t key = object_to_key(object);
|
||||||
|
simple_mtx_lock(&instance_mtx);
|
||||||
|
instance_data **ptr = &instance_list;
|
||||||
|
while (*ptr && key != object_to_key((*ptr)->instance))
|
||||||
|
ptr = &(*ptr)->next;
|
||||||
|
|
||||||
|
instance_data *ctx = *ptr;
|
||||||
|
*ptr = ctx ? ctx->next : NULL;
|
||||||
|
simple_mtx_unlock(&instance_mtx);
|
||||||
|
return ctx;
|
||||||
|
}
|
||||||
|
|
||||||
|
static instance_data *
|
||||||
|
get_instance_data(const void *object)
|
||||||
|
{
|
||||||
|
uintptr_t key = object_to_key(object);
|
||||||
|
simple_mtx_lock(&instance_mtx);
|
||||||
|
instance_data *ctx = instance_list;
|
||||||
|
while (ctx && key != object_to_key(ctx->instance))
|
||||||
|
ctx = ctx->next;
|
||||||
|
simple_mtx_unlock(&instance_mtx);
|
||||||
|
return ctx;
|
||||||
|
}
|
||||||
|
|
||||||
|
static VKAPI_ATTR VkResult VKAPI_CALL
|
||||||
|
anti_lag_CreateInstance(const VkInstanceCreateInfo *pCreateInfo,
|
||||||
|
const VkAllocationCallbacks *pAllocator, VkInstance *pInstance)
|
||||||
|
{
|
||||||
|
VkLayerInstanceCreateInfo *chain_info = (VkLayerInstanceCreateInfo *)(pCreateInfo->pNext);
|
||||||
|
while (chain_info && !(chain_info->sType == VK_STRUCTURE_TYPE_LOADER_INSTANCE_CREATE_INFO &&
|
||||||
|
chain_info->function == VK_LAYER_LINK_INFO)) {
|
||||||
|
chain_info = (VkLayerInstanceCreateInfo *)(chain_info->pNext);
|
||||||
|
}
|
||||||
|
|
||||||
|
assert(chain_info && chain_info->u.pLayerInfo);
|
||||||
|
PFN_vkGetInstanceProcAddr fpGetInstanceProcAddr =
|
||||||
|
chain_info->u.pLayerInfo->pfnNextGetInstanceProcAddr;
|
||||||
|
PFN_vkCreateInstance fpCreateInstance =
|
||||||
|
(PFN_vkCreateInstance)fpGetInstanceProcAddr(NULL, "vkCreateInstance");
|
||||||
|
if (fpCreateInstance == NULL)
|
||||||
|
return VK_ERROR_INITIALIZATION_FAILED;
|
||||||
|
|
||||||
|
/* Advance the link info for the next element on the chain. */
|
||||||
|
chain_info->u.pLayerInfo = chain_info->u.pLayerInfo->pNext;
|
||||||
|
|
||||||
|
/* Create Instance. */
|
||||||
|
VkResult result = fpCreateInstance(pCreateInfo, pAllocator, pInstance);
|
||||||
|
if (result != VK_SUCCESS)
|
||||||
|
return result;
|
||||||
|
|
||||||
|
/* Create Instance context. */
|
||||||
|
const VkAllocationCallbacks *alloc = pAllocator ? pAllocator : vk_default_allocator();
|
||||||
|
void *buf = vk_alloc(alloc, sizeof(instance_data), alignof(instance_data),
|
||||||
|
VK_SYSTEM_ALLOCATION_SCOPE_INSTANCE);
|
||||||
|
if (!buf) {
|
||||||
|
PFN_vkDestroyInstance fpDestroyInstance =
|
||||||
|
(PFN_vkDestroyInstance)fpGetInstanceProcAddr(*pInstance, "vkDestroyInstance");
|
||||||
|
fpDestroyInstance(*pInstance, alloc);
|
||||||
|
return VK_ERROR_OUT_OF_HOST_MEMORY;
|
||||||
|
}
|
||||||
|
instance_data *ctx = (instance_data *)buf;
|
||||||
|
ctx->apiVersion = pCreateInfo->pApplicationInfo && pCreateInfo->pApplicationInfo->apiVersion
|
||||||
|
? pCreateInfo->pApplicationInfo->apiVersion
|
||||||
|
: VK_API_VERSION_1_0;
|
||||||
|
ctx->instance = *pInstance;
|
||||||
|
ctx->alloc = *alloc;
|
||||||
|
ctx->next = NULL;
|
||||||
|
init_instance_vtable(ctx, fpGetInstanceProcAddr);
|
||||||
|
add_instance(ctx);
|
||||||
|
|
||||||
|
return VK_SUCCESS;
|
||||||
|
}
|
||||||
|
|
||||||
|
static VKAPI_ATTR void VKAPI_CALL
|
||||||
|
anti_lag_DestroyInstance(VkInstance instance, const VkAllocationCallbacks *pAllocator)
|
||||||
|
{
|
||||||
|
instance_data *ctx = remove_instance(instance);
|
||||||
|
if (ctx) {
|
||||||
|
ctx->vtable.DestroyInstance(instance, pAllocator);
|
||||||
|
vk_free(&ctx->alloc, ctx);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
typedef struct device_data {
|
||||||
|
VkDevice device;
|
||||||
|
PFN_vkGetDeviceProcAddr GetDeviceProcAddr;
|
||||||
|
device_context *ctx; /* NULL if anti-lag ext is not enabled. */
|
||||||
|
struct device_data *next;
|
||||||
|
} device_data;
|
||||||
|
|
||||||
|
static void
|
||||||
|
init_device_vtable(device_context *ctx, PFN_vkGetDeviceProcAddr gpa, PFN_vkSetDeviceLoaderData sld,
|
||||||
|
bool calibrated_timestamps_khr, bool host_query_reset_ext,
|
||||||
|
bool timeline_semaphore_khr)
|
||||||
|
{
|
||||||
|
ctx->vtable.GetDeviceProcAddr = gpa;
|
||||||
|
ctx->vtable.SetDeviceLoaderData = sld;
|
||||||
|
#define INIT_HOOK(fn) ctx->vtable.fn = (PFN_vk##fn)gpa(ctx->device, "vk" #fn)
|
||||||
|
#define INIT_HOOK_ALIAS(fn, alias, cond) \
|
||||||
|
ctx->vtable.fn = (PFN_vk##fn)gpa(ctx->device, cond ? "vk" #alias : "vk" #fn)
|
||||||
|
INIT_HOOK(DestroyDevice);
|
||||||
|
INIT_HOOK(QueueSubmit);
|
||||||
|
INIT_HOOK(QueueSubmit2);
|
||||||
|
INIT_HOOK(QueueSubmit2KHR);
|
||||||
|
INIT_HOOK(GetDeviceQueue);
|
||||||
|
INIT_HOOK(CreateCommandPool);
|
||||||
|
INIT_HOOK(DestroyCommandPool);
|
||||||
|
INIT_HOOK(CreateQueryPool);
|
||||||
|
INIT_HOOK_ALIAS(ResetQueryPool, ResetQueryPoolEXT, host_query_reset_ext);
|
||||||
|
INIT_HOOK(DestroyQueryPool);
|
||||||
|
INIT_HOOK(GetQueryPoolResults);
|
||||||
|
INIT_HOOK(AllocateCommandBuffers);
|
||||||
|
INIT_HOOK(FreeCommandBuffers);
|
||||||
|
INIT_HOOK(BeginCommandBuffer);
|
||||||
|
INIT_HOOK(EndCommandBuffer);
|
||||||
|
INIT_HOOK_ALIAS(GetCalibratedTimestampsKHR, GetCalibratedTimestampsEXT, !calibrated_timestamps_khr);
|
||||||
|
INIT_HOOK(CmdWriteTimestamp);
|
||||||
|
INIT_HOOK(CreateSemaphore);
|
||||||
|
INIT_HOOK(DestroySemaphore);
|
||||||
|
INIT_HOOK_ALIAS(GetSemaphoreCounterValue, GetSemaphoreCounterValueKHR, timeline_semaphore_khr);
|
||||||
|
INIT_HOOK_ALIAS(WaitSemaphores, WaitSemaphoresKHR, timeline_semaphore_khr);
|
||||||
|
#undef INIT_HOOK
|
||||||
|
#undef INIT_HOOK_ALIAS
|
||||||
|
}
|
||||||
|
|
||||||
|
static simple_mtx_t device_mtx = SIMPLE_MTX_INITIALIZER;
|
||||||
|
static device_data *device_list = NULL;
|
||||||
|
|
||||||
|
static void
|
||||||
|
add_device(device_data *device)
|
||||||
|
{
|
||||||
|
simple_mtx_lock(&device_mtx);
|
||||||
|
device_data **ptr = &device_list;
|
||||||
|
while (*ptr != NULL)
|
||||||
|
ptr = &(*ptr)->next;
|
||||||
|
*ptr = device;
|
||||||
|
simple_mtx_unlock(&device_mtx);
|
||||||
|
}
|
||||||
|
|
||||||
|
static device_data *
|
||||||
|
remove_device(const void *object)
|
||||||
|
{
|
||||||
|
uintptr_t key = object_to_key(object);
|
||||||
|
simple_mtx_lock(&device_mtx);
|
||||||
|
device_data **ptr = &device_list;
|
||||||
|
while (*ptr && key != object_to_key((*ptr)->device))
|
||||||
|
ptr = &(*ptr)->next;
|
||||||
|
|
||||||
|
device_data *ctx = *ptr;
|
||||||
|
*ptr = ctx ? ctx->next : NULL;
|
||||||
|
simple_mtx_unlock(&device_mtx);
|
||||||
|
return ctx;
|
||||||
|
}
|
||||||
|
|
||||||
|
static device_data *
|
||||||
|
get_device_data(const void *object)
|
||||||
|
{
|
||||||
|
uintptr_t key = object_to_key(object);
|
||||||
|
simple_mtx_lock(&device_mtx);
|
||||||
|
device_data *ctx = device_list;
|
||||||
|
while (ctx && key != object_to_key(ctx->device))
|
||||||
|
ctx = ctx->next;
|
||||||
|
simple_mtx_unlock(&device_mtx);
|
||||||
|
return ctx;
|
||||||
|
}
|
||||||
|
|
||||||
|
device_context *
|
||||||
|
get_device_context(const void *object)
|
||||||
|
{
|
||||||
|
device_data *data = get_device_data(object);
|
||||||
|
assert(data && data->ctx);
|
||||||
|
return data->ctx;
|
||||||
|
}
|
||||||
|
|
||||||
|
static VkLayerDeviceCreateInfo *
|
||||||
|
get_device_chain_info(const VkDeviceCreateInfo *pCreateInfo, VkLayerFunction func)
|
||||||
|
{
|
||||||
|
vk_foreach_struct_const (item, pCreateInfo->pNext) {
|
||||||
|
if (item->sType == VK_STRUCTURE_TYPE_LOADER_DEVICE_CREATE_INFO &&
|
||||||
|
((VkLayerDeviceCreateInfo *)item)->function == func)
|
||||||
|
return (VkLayerDeviceCreateInfo *)item;
|
||||||
|
}
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
static bool
|
||||||
|
should_enable_layer(instance_data *ctx, VkPhysicalDevice physicalDevice,
|
||||||
|
VkPhysicalDeviceAntiLagFeaturesAMD ext_feature)
|
||||||
|
{
|
||||||
|
/* The extension is not requested by the application. */
|
||||||
|
if (!ext_feature.antiLag)
|
||||||
|
return false;
|
||||||
|
|
||||||
|
/* Ensure that the underlying implementation does not expose VK_AMD_anti_lag itself. */
|
||||||
|
ext_feature.antiLag = false;
|
||||||
|
VkPhysicalDeviceFeatures2 features = {
|
||||||
|
.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2,
|
||||||
|
.pNext = &ext_feature,
|
||||||
|
};
|
||||||
|
|
||||||
|
if (ctx->vtable.GetPhysicalDeviceFeatures2KHR) {
|
||||||
|
ctx->vtable.GetPhysicalDeviceFeatures2KHR(physicalDevice, &features);
|
||||||
|
return !ext_feature.antiLag;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (ctx->vtable.GetPhysicalDeviceFeatures2) {
|
||||||
|
ctx->vtable.GetPhysicalDeviceFeatures2(physicalDevice, &features);
|
||||||
|
return !ext_feature.antiLag;
|
||||||
|
}
|
||||||
|
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
static bool
|
||||||
|
check_calibrated_timestamps(instance_data *data, VkPhysicalDevice physicalDevice, bool *has_khr)
|
||||||
|
{
|
||||||
|
VkResult res;
|
||||||
|
uint32_t count = 0;
|
||||||
|
res = data->vtable.EnumerateDeviceExtensionProperties(physicalDevice, NULL, &count, NULL);
|
||||||
|
VkExtensionProperties *extensions =
|
||||||
|
vk_alloc(&data->alloc, count * sizeof(VkExtensionProperties), alignof(VkExtensionProperties),
|
||||||
|
VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
|
||||||
|
if (!extensions)
|
||||||
|
return false;
|
||||||
|
|
||||||
|
res |= data->vtable.EnumerateDeviceExtensionProperties(physicalDevice, NULL, &count, extensions);
|
||||||
|
|
||||||
|
*has_khr = false;
|
||||||
|
bool has_ext = false;
|
||||||
|
if (res == VK_SUCCESS) {
|
||||||
|
for (unsigned i = 0; i < count; i++) {
|
||||||
|
if (strcmp(extensions[i].extensionName, VK_KHR_CALIBRATED_TIMESTAMPS_EXTENSION_NAME) == 0)
|
||||||
|
*has_khr = true;
|
||||||
|
if (strcmp(extensions[i].extensionName, VK_EXT_CALIBRATED_TIMESTAMPS_EXTENSION_NAME) == 0)
|
||||||
|
has_ext = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
vk_free(&data->alloc, extensions);
|
||||||
|
return *has_khr || has_ext;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Initialize per-queue context:
|
||||||
|
*
|
||||||
|
* This includes creating one CommandPool and one QueryPool per Queue as well as
|
||||||
|
* recording one CommandBuffer per timestamp query.
|
||||||
|
*/
|
||||||
|
static VkResult
|
||||||
|
init_queue_context(device_context *ctx, queue_context *queue_ctx)
|
||||||
|
{
|
||||||
|
#define CHECK_RESULT(res, label) \
|
||||||
|
if (res != VK_SUCCESS) { \
|
||||||
|
goto label; \
|
||||||
|
}
|
||||||
|
|
||||||
|
VkResult result;
|
||||||
|
|
||||||
|
/* Create command pool */
|
||||||
|
struct VkCommandPoolCreateInfo pool_info = {
|
||||||
|
.sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO,
|
||||||
|
.pNext = NULL,
|
||||||
|
.flags = 0,
|
||||||
|
.queueFamilyIndex = queue_ctx->queue_family_idx,
|
||||||
|
};
|
||||||
|
result =
|
||||||
|
ctx->vtable.CreateCommandPool(ctx->device, &pool_info, &ctx->alloc, &queue_ctx->cmdPool);
|
||||||
|
CHECK_RESULT(result, fail_cmdpool)
|
||||||
|
|
||||||
|
/* Create query pool */
|
||||||
|
VkQueryPoolCreateInfo query_pool_info = {
|
||||||
|
.sType = VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO,
|
||||||
|
.queryType = VK_QUERY_TYPE_TIMESTAMP,
|
||||||
|
.queryCount = MAX_QUERIES,
|
||||||
|
};
|
||||||
|
result = ctx->vtable.CreateQueryPool(ctx->device, &query_pool_info, &ctx->alloc,
|
||||||
|
&queue_ctx->queryPool);
|
||||||
|
CHECK_RESULT(result, fail_querypool)
|
||||||
|
ctx->vtable.ResetQueryPool(ctx->device, queue_ctx->queryPool, 0, MAX_QUERIES);
|
||||||
|
ringbuffer_init(queue_ctx->queries);
|
||||||
|
|
||||||
|
/* Create timeline semaphore */
|
||||||
|
VkSemaphoreTypeCreateInfo timelineCreateInfo = {
|
||||||
|
.sType = VK_STRUCTURE_TYPE_SEMAPHORE_TYPE_CREATE_INFO,
|
||||||
|
.pNext = NULL,
|
||||||
|
.semaphoreType = VK_SEMAPHORE_TYPE_TIMELINE,
|
||||||
|
.initialValue = 0,
|
||||||
|
};
|
||||||
|
VkSemaphoreCreateInfo createInfo = {
|
||||||
|
.sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO,
|
||||||
|
.pNext = &timelineCreateInfo,
|
||||||
|
.flags = 0,
|
||||||
|
};
|
||||||
|
result =
|
||||||
|
ctx->vtable.CreateSemaphore(ctx->device, &createInfo, &ctx->alloc, &queue_ctx->semaphore);
|
||||||
|
CHECK_RESULT(result, fail_semaphore);
|
||||||
|
|
||||||
|
for (unsigned j = 0; j < MAX_QUERIES; j++) {
|
||||||
|
struct query *query = &queue_ctx->queries.data[j];
|
||||||
|
|
||||||
|
/* Allocate commandBuffer for timestamp. */
|
||||||
|
VkCommandBufferAllocateInfo buffer_info = {
|
||||||
|
.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO,
|
||||||
|
.commandPool = queue_ctx->cmdPool,
|
||||||
|
.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY,
|
||||||
|
.commandBufferCount = 1,
|
||||||
|
};
|
||||||
|
result = ctx->vtable.AllocateCommandBuffers(ctx->device, &buffer_info, &query->cmdbuffer);
|
||||||
|
CHECK_RESULT(result, fail)
|
||||||
|
result = ctx->vtable.SetDeviceLoaderData(ctx->device, query->cmdbuffer);
|
||||||
|
CHECK_RESULT(result, fail)
|
||||||
|
|
||||||
|
/* Record commandbuffer. */
|
||||||
|
VkCommandBufferBeginInfo beginInfo = {
|
||||||
|
.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO,
|
||||||
|
};
|
||||||
|
|
||||||
|
result = ctx->vtable.BeginCommandBuffer(query->cmdbuffer, &beginInfo);
|
||||||
|
CHECK_RESULT(result, fail)
|
||||||
|
ctx->vtable.CmdWriteTimestamp(query->cmdbuffer, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT,
|
||||||
|
queue_ctx->queryPool, j);
|
||||||
|
result = ctx->vtable.EndCommandBuffer(query->cmdbuffer);
|
||||||
|
CHECK_RESULT(result, fail)
|
||||||
|
}
|
||||||
|
|
||||||
|
#undef CHECK_RESULT
|
||||||
|
return result;
|
||||||
|
|
||||||
|
fail:
|
||||||
|
ctx->vtable.DestroySemaphore(ctx->device, queue_ctx->semaphore, &ctx->alloc);
|
||||||
|
fail_semaphore:
|
||||||
|
ctx->vtable.DestroyQueryPool(ctx->device, queue_ctx->queryPool, &ctx->alloc);
|
||||||
|
fail_querypool:
|
||||||
|
ctx->vtable.DestroyCommandPool(ctx->device, queue_ctx->cmdPool, &ctx->alloc);
|
||||||
|
fail_cmdpool:
|
||||||
|
for (queue_context *qctx = ctx->queues; qctx != queue_ctx; qctx++) {
|
||||||
|
ctx->vtable.DestroyQueryPool(ctx->device, qctx->queryPool, &ctx->alloc);
|
||||||
|
ctx->vtable.DestroyCommandPool(ctx->device, qctx->cmdPool, &ctx->alloc);
|
||||||
|
}
|
||||||
|
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
static VKAPI_ATTR VkResult VKAPI_CALL
|
||||||
|
anti_lag_CreateDevice(VkPhysicalDevice physicalDevice, const VkDeviceCreateInfo *pCreateInfo,
|
||||||
|
const VkAllocationCallbacks *pAllocator, VkDevice *pDevice)
|
||||||
|
{
|
||||||
|
instance_data *instance_ctx = get_instance_data(physicalDevice);
|
||||||
|
VkLayerDeviceCreateInfo *chain_info = get_device_chain_info(pCreateInfo, VK_LAYER_LINK_INFO);
|
||||||
|
PFN_vkGetDeviceProcAddr fpGetDeviceProcAddr = chain_info->u.pLayerInfo->pfnNextGetDeviceProcAddr;
|
||||||
|
PFN_vkGetInstanceProcAddr fpGetInstanceProcAddr =
|
||||||
|
chain_info->u.pLayerInfo->pfnNextGetInstanceProcAddr;
|
||||||
|
PFN_vkCreateDevice fpCreateDevice =
|
||||||
|
(PFN_vkCreateDevice)fpGetInstanceProcAddr(instance_ctx->instance, "vkCreateDevice");
|
||||||
|
if (fpCreateDevice == NULL)
|
||||||
|
return VK_ERROR_INITIALIZATION_FAILED;
|
||||||
|
|
||||||
|
/* Advance the link info for the next element on the chain. */
|
||||||
|
chain_info->u.pLayerInfo = chain_info->u.pLayerInfo->pNext;
|
||||||
|
|
||||||
|
const VkAllocationCallbacks *alloc = pAllocator ? pAllocator : &instance_ctx->alloc;
|
||||||
|
device_data *data;
|
||||||
|
VkResult result;
|
||||||
|
|
||||||
|
/* Only allocate a context and add to dispatch if the extension is enabled. */
|
||||||
|
const VkPhysicalDeviceAntiLagFeaturesAMD *ext_features =
|
||||||
|
vk_find_struct_const(pCreateInfo->pNext, PHYSICAL_DEVICE_ANTI_LAG_FEATURES_AMD);
|
||||||
|
bool enable = ext_features && should_enable_layer(instance_ctx, physicalDevice, *ext_features);
|
||||||
|
if (enable) {
|
||||||
|
/* Count queues with sufficient timestamp valid bits. */
|
||||||
|
// TODO: make it work with less than 64 valid bits
|
||||||
|
unsigned num_queue_families = 0;
|
||||||
|
unsigned num_queues = 0;
|
||||||
|
for (unsigned i = 0; i < pCreateInfo->queueCreateInfoCount; i++)
|
||||||
|
num_queue_families =
|
||||||
|
MAX2(num_queue_families, pCreateInfo->pQueueCreateInfos[i].queueFamilyIndex + 1);
|
||||||
|
VkQueueFamilyProperties *queue_family_props =
|
||||||
|
vk_alloc(alloc, num_queue_families * sizeof(VkQueueFamilyProperties),
|
||||||
|
alignof(VkQueueFamilyProperties), VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
|
||||||
|
if (!queue_family_props)
|
||||||
|
return VK_ERROR_OUT_OF_HOST_MEMORY;
|
||||||
|
|
||||||
|
instance_ctx->vtable.GetPhysicalDeviceQueueFamilyProperties(
|
||||||
|
         physicalDevice, &num_queue_families, queue_family_props);
      for (unsigned i = 0; i < pCreateInfo->queueCreateInfoCount; i++) {
         uint32_t queue_family_idx = pCreateInfo->pQueueCreateInfos[i].queueFamilyIndex;
         if (queue_family_props[queue_family_idx].timestampValidBits == 64 &&
             (queue_family_props[queue_family_idx].queueFlags &
              (VK_QUEUE_GRAPHICS_BIT | VK_QUEUE_COMPUTE_BIT))) {
            num_queues += pCreateInfo->pQueueCreateInfos[i].queueCount;
         }
      }

      /* Allocate the context. */
      device_context *ctx;
      queue_context *queues;
      VK_MULTIALLOC(ma);
      vk_multialloc_add(&ma, &data, device_data, 1);
      vk_multialloc_add(&ma, &ctx, struct device_context, 1);
      vk_multialloc_add(&ma, &queues, queue_context, num_queues);
      void *buf = vk_multialloc_zalloc(&ma, alloc, VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
      if (!buf) {
         vk_free(alloc, queue_family_props);
         return VK_ERROR_OUT_OF_HOST_MEMORY;
      }

      VkPhysicalDeviceProperties properties;
      instance_ctx->vtable.GetPhysicalDeviceProperties(physicalDevice, &properties);

      /* Ensure that calibrated timestamps and host query reset extensions are enabled. */
      bool has_calibrated_timestamps = false;
      bool has_calibrated_timestamps_khr = false;
      bool has_vk12 = instance_ctx->apiVersion >= VK_API_VERSION_1_2 &&
                      properties.apiVersion >= VK_API_VERSION_1_2;
      bool has_host_query_reset = has_vk12;
      bool has_host_query_reset_ext = false;
      bool has_timeline_semaphore = has_vk12;
      bool has_timeline_semaphore_khr = false;
      for (unsigned i = 0; i < pCreateInfo->enabledExtensionCount; i++) {
         if (strcmp(pCreateInfo->ppEnabledExtensionNames[i],
                    VK_KHR_CALIBRATED_TIMESTAMPS_EXTENSION_NAME) == 0)
            has_calibrated_timestamps = has_calibrated_timestamps_khr = true;
         if (strcmp(pCreateInfo->ppEnabledExtensionNames[i],
                    VK_EXT_CALIBRATED_TIMESTAMPS_EXTENSION_NAME) == 0)
            has_calibrated_timestamps = true;
         if (strcmp(pCreateInfo->ppEnabledExtensionNames[i],
                    VK_EXT_HOST_QUERY_RESET_EXTENSION_NAME) == 0)
            has_host_query_reset = has_host_query_reset_ext = true;
         if (strcmp(pCreateInfo->ppEnabledExtensionNames[i],
                    VK_KHR_TIMELINE_SEMAPHORE_EXTENSION_NAME) == 0)
            has_timeline_semaphore = has_timeline_semaphore_khr = true;
      }

      /* Add missing extensions. */
      VkDeviceCreateInfo create_info = *pCreateInfo;
      const char **ext_names = NULL;
      uint32_t num_extra_extensions =
         !has_calibrated_timestamps + !has_host_query_reset + !has_timeline_semaphore;
      if (num_extra_extensions) {
         ext_names = vk_alloc(
            alloc, (pCreateInfo->enabledExtensionCount + num_extra_extensions) * sizeof(char *),
            alignof(char *), VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
         if (!ext_names) {
            result = VK_ERROR_OUT_OF_HOST_MEMORY;
            goto fail;
         }

         memcpy(ext_names, pCreateInfo->ppEnabledExtensionNames,
                sizeof(char *) * pCreateInfo->enabledExtensionCount);

         if (!has_timeline_semaphore) {
            has_timeline_semaphore_khr = true;
            ext_names[create_info.enabledExtensionCount++] =
               VK_KHR_TIMELINE_SEMAPHORE_EXTENSION_NAME;
         }
         if (!has_host_query_reset) {
            has_host_query_reset_ext = true;
            ext_names[create_info.enabledExtensionCount++] = VK_EXT_HOST_QUERY_RESET_EXTENSION_NAME;
         }
         if (!has_calibrated_timestamps) {
            check_calibrated_timestamps(instance_ctx, physicalDevice,
                                        &has_calibrated_timestamps_khr);
            ext_names[create_info.enabledExtensionCount++] =
               has_calibrated_timestamps_khr ? VK_KHR_CALIBRATED_TIMESTAMPS_EXTENSION_NAME
                                             : VK_EXT_CALIBRATED_TIMESTAMPS_EXTENSION_NAME;
         }
         create_info.ppEnabledExtensionNames = ext_names;
      }

      /* Ensure that hostQueryReset feature is enabled. */
      const VkPhysicalDeviceVulkan12Features *vk12 =
         vk_find_struct_const(pCreateInfo->pNext, PHYSICAL_DEVICE_VULKAN_1_2_FEATURES);
      const VkPhysicalDeviceHostQueryResetFeatures *query_reset =
         vk_find_struct_const(pCreateInfo->pNext, PHYSICAL_DEVICE_HOST_QUERY_RESET_FEATURES);
      const VkPhysicalDeviceTimelineSemaphoreFeatures *timeline_semaphore =
         vk_find_struct_const(pCreateInfo->pNext, PHYSICAL_DEVICE_TIMELINE_SEMAPHORE_FEATURES);
      uint32_t prev_hostQueryReset;
      uint32_t prev_timelineSemaphore;
      if (vk12) {
         prev_hostQueryReset = vk12->hostQueryReset;
         prev_timelineSemaphore = vk12->timelineSemaphore;
         ((VkPhysicalDeviceVulkan12Features *)vk12)->hostQueryReset = VK_TRUE;
         ((VkPhysicalDeviceVulkan12Features *)vk12)->timelineSemaphore = VK_TRUE;
      } else {
         if (query_reset) {
            prev_hostQueryReset = query_reset->hostQueryReset;
            ((VkPhysicalDeviceHostQueryResetFeatures *)query_reset)->hostQueryReset = VK_TRUE;
         } else {
            VkPhysicalDeviceHostQueryResetFeatures *feat =
               alloca(sizeof(VkPhysicalDeviceHostQueryResetFeatures));
            *feat = (VkPhysicalDeviceHostQueryResetFeatures){
               .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_HOST_QUERY_RESET_FEATURES,
               .pNext = (void *)create_info.pNext,
               .hostQueryReset = VK_TRUE,
            };
            create_info.pNext = feat;
         }
         if (timeline_semaphore) {
            prev_timelineSemaphore = timeline_semaphore->timelineSemaphore;
            ((VkPhysicalDeviceTimelineSemaphoreFeatures *)timeline_semaphore)->timelineSemaphore =
               VK_TRUE;
         } else {
            VkPhysicalDeviceTimelineSemaphoreFeatures *feat =
               alloca(sizeof(VkPhysicalDeviceTimelineSemaphoreFeatures));
            *feat = (VkPhysicalDeviceTimelineSemaphoreFeatures){
               .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_TIMELINE_SEMAPHORE_FEATURES,
               .pNext = (void *)create_info.pNext,
               .timelineSemaphore = VK_TRUE,
            };
            create_info.pNext = feat;
         }
      }

      /* Create Device. */
      result = fpCreateDevice(physicalDevice, &create_info, pAllocator, pDevice);

      if (vk12) {
         ((VkPhysicalDeviceVulkan12Features *)vk12)->hostQueryReset = prev_hostQueryReset;
         ((VkPhysicalDeviceVulkan12Features *)vk12)->timelineSemaphore = prev_timelineSemaphore;
      } else {
         if (query_reset)
            ((VkPhysicalDeviceHostQueryResetFeatures *)query_reset)->hostQueryReset =
               prev_hostQueryReset;
         if (timeline_semaphore)
            ((VkPhysicalDeviceTimelineSemaphoreFeatures *)timeline_semaphore)->timelineSemaphore =
               prev_timelineSemaphore;
      }
      if (ext_names)
         vk_free(alloc, ext_names);

      if (result != VK_SUCCESS)
         goto fail;

      /* Initialize Context. */
      data->ctx = ctx;
      ctx->device = *pDevice;
      chain_info = get_device_chain_info(pCreateInfo, VK_LOADER_DATA_CALLBACK);
      PFN_vkSetDeviceLoaderData fpSetDeviceLoaderData =
         (PFN_vkSetDeviceLoaderData)chain_info->u.pfnSetDeviceLoaderData;
      init_device_vtable(ctx, fpGetDeviceProcAddr, fpSetDeviceLoaderData,
                         has_calibrated_timestamps_khr, has_host_query_reset_ext,
                         has_timeline_semaphore_khr);
      simple_mtx_init(&ctx->mtx, mtx_plain);
      ctx->num_queues = num_queues;
      ctx->alloc = *alloc;
      ctx->calibration.timestamp_period = properties.limits.timestampPeriod;
      ringbuffer_init(ctx->frames);

      /* Initialize Queue contexts. */
      unsigned idx = 0;
      for (unsigned i = 0; i < pCreateInfo->queueCreateInfoCount; i++) {
         /* Skip queue families without sufficient timestamp valid bits.
          * Also skip queue families which cannot do GRAPHICS or COMPUTE since
          * they are always heavily async in nature (DMA transfers and sparse
          * binding, for example). Video is also irrelevant here since it
          * should never be a critical path in a game that wants anti-lag. */
         uint32_t queue_family_idx = pCreateInfo->pQueueCreateInfos[i].queueFamilyIndex;
         if (queue_family_props[queue_family_idx].timestampValidBits != 64 ||
             !(queue_family_props[queue_family_idx].queueFlags &
               (VK_QUEUE_GRAPHICS_BIT | VK_QUEUE_COMPUTE_BIT)))
            continue;

         for (unsigned j = 0; j < pCreateInfo->pQueueCreateInfos[i].queueCount; j++) {
            VkQueue queue;
            ctx->vtable.GetDeviceQueue(*pDevice, queue_family_idx, j, &queue);
            ctx->queues[idx].queue = queue;
            ctx->queues[idx].queue_family_idx = queue_family_idx;
            result = init_queue_context(ctx, &ctx->queues[idx]);
            idx++;
            if (result != VK_SUCCESS)
               goto fail;
         }
      }
      assert(idx == num_queues);
   fail:
      vk_free(alloc, queue_family_props);
   } else {
      data = (device_data *)vk_alloc(alloc, sizeof(device_data), alignof(device_data),
                                     VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
      if (!data)
         return VK_ERROR_OUT_OF_HOST_MEMORY;
      result = fpCreateDevice(physicalDevice, pCreateInfo, pAllocator, pDevice);
      data->ctx = NULL;
   }

   if (result == VK_SUCCESS) {
      data->device = *pDevice;
      data->GetDeviceProcAddr = fpGetDeviceProcAddr;
      data->next = NULL;
      add_device(data);
   } else {
      vk_free(alloc, data);
   }

   return result;
}

static VKAPI_ATTR void VKAPI_CALL
anti_lag_DestroyDevice(VkDevice pDevice, const VkAllocationCallbacks *pAllocator)
{
   device_data *data = remove_device(pDevice);
   assert(data && data->ctx);
   device_context *ctx = data->ctx;

   /* Destroy per-queue context.
    * The application must ensure that no work is active on the device.
    */
   for (unsigned i = 0; i < ctx->num_queues; i++) {
      queue_context *queue_ctx = &ctx->queues[i];
      ctx->vtable.DestroyQueryPool(ctx->device, queue_ctx->queryPool, &ctx->alloc);
      ctx->vtable.DestroyCommandPool(ctx->device, queue_ctx->cmdPool, &ctx->alloc);
      ctx->vtable.DestroySemaphore(ctx->device, queue_ctx->semaphore, &ctx->alloc);
   }

   ctx->vtable.DestroyDevice(pDevice, pAllocator);
   vk_free(&ctx->alloc, data);
}
static bool
is_anti_lag_supported(VkPhysicalDevice physicalDevice)
{
   instance_data *data = get_instance_data(physicalDevice);
   VkPhysicalDeviceProperties properties;
   data->vtable.GetPhysicalDeviceProperties(physicalDevice, &properties);
   if (properties.limits.timestampPeriod == 0.0 || !properties.limits.timestampComputeAndGraphics)
      return false;

   /* Check whether calibrated timestamps are supported. */
   bool has_khr;
   if (!check_calibrated_timestamps(data, physicalDevice, &has_khr))
      return false;

   /* Check whether timeline semaphores and host query reset are supported. */
   VkPhysicalDeviceTimelineSemaphoreFeatures timeline_semaphore = {
      .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_TIMELINE_SEMAPHORE_FEATURES,
      .timelineSemaphore = VK_FALSE,
   };
   VkPhysicalDeviceHostQueryResetFeatures query_reset = {
      .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_HOST_QUERY_RESET_FEATURES,
      .pNext = &timeline_semaphore,
      .hostQueryReset = VK_FALSE,
   };
   VkPhysicalDeviceFeatures2 features = {
      .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2,
      .pNext = &query_reset,
   };
   if (data->vtable.GetPhysicalDeviceFeatures2KHR)
      data->vtable.GetPhysicalDeviceFeatures2KHR(physicalDevice, &features);
   else if (data->vtable.GetPhysicalDeviceFeatures2)
      data->vtable.GetPhysicalDeviceFeatures2(physicalDevice, &features);
   if (!timeline_semaphore.timelineSemaphore || !query_reset.hostQueryReset)
      return false;

   /* Check that DEVICE and CLOCK_MONOTONIC time domains are available. */
   VkResult res;
   uint32_t count = 0;
   PFN_vkGetPhysicalDeviceCalibrateableTimeDomainsKHR ctd =
      has_khr ? data->vtable.GetPhysicalDeviceCalibrateableTimeDomainsKHR
              : data->vtable.GetPhysicalDeviceCalibrateableTimeDomainsEXT;
   res = ctd(physicalDevice, &count, NULL);
   VkTimeDomainKHR *time_domains = alloca(count * sizeof(VkTimeDomainKHR));
   res |= ctd(physicalDevice, &count, time_domains);
   if (res != VK_SUCCESS)
      return false;

   bool has_device_domain = false;
   bool has_host_domain = false;
   for (unsigned i = 0; i < count; i++) {
      has_device_domain |= time_domains[i] == VK_TIME_DOMAIN_DEVICE_KHR;
      has_host_domain |= time_domains[i] == VK_TIME_DOMAIN_CLOCK_MONOTONIC_KHR;
   }

   return has_device_domain && has_host_domain;
}
static VKAPI_ATTR VkResult VKAPI_CALL
anti_lag_EnumerateDeviceExtensionProperties(VkPhysicalDevice physicalDevice, const char *pLayerName,
                                            uint32_t *pPropertyCount,
                                            VkExtensionProperties *pProperties)
{
   instance_data *instance_data = get_instance_data(physicalDevice);

   if (pLayerName && strcmp(pLayerName, "VK_LAYER_MESA_anti_lag") == 0) {
      if (!is_anti_lag_supported(physicalDevice)) {
         *pPropertyCount = 0;
         return VK_SUCCESS;
      }

      VK_OUTARRAY_MAKE_TYPED(VkExtensionProperties, out, pProperties, pPropertyCount);
      vk_outarray_append_typed(VkExtensionProperties, &out, prop)
      {
         *prop =
            (VkExtensionProperties){VK_AMD_ANTI_LAG_EXTENSION_NAME, VK_AMD_ANTI_LAG_SPEC_VERSION};
      }
      return vk_outarray_status(&out);
   }

   return instance_data->vtable.EnumerateDeviceExtensionProperties(physicalDevice, pLayerName,
                                                                   pPropertyCount, pProperties);
}

static VKAPI_ATTR void VKAPI_CALL
anti_lag_GetPhysicalDeviceFeatures2(VkPhysicalDevice physicalDevice,
                                    VkPhysicalDeviceFeatures2 *pFeatures)
{
   instance_data *ctx = get_instance_data(physicalDevice);
   ctx->vtable.GetPhysicalDeviceFeatures2(physicalDevice, pFeatures);
   VkPhysicalDeviceAntiLagFeaturesAMD *anti_lag_features =
      vk_find_struct(pFeatures->pNext, PHYSICAL_DEVICE_ANTI_LAG_FEATURES_AMD);

   if (anti_lag_features) {
      anti_lag_features->antiLag |= is_anti_lag_supported(physicalDevice);
   }
}

static VKAPI_ATTR void VKAPI_CALL
anti_lag_GetPhysicalDeviceFeatures2KHR(VkPhysicalDevice physicalDevice,
                                       VkPhysicalDeviceFeatures2 *pFeatures)
{
   instance_data *ctx = get_instance_data(physicalDevice);
   ctx->vtable.GetPhysicalDeviceFeatures2KHR(physicalDevice, pFeatures);
   VkPhysicalDeviceAntiLagFeaturesAMD *anti_lag_features =
      vk_find_struct(pFeatures->pNext, PHYSICAL_DEVICE_ANTI_LAG_FEATURES_AMD);

   if (anti_lag_features) {
      anti_lag_features->antiLag |= is_anti_lag_supported(physicalDevice);
   }
}
static VKAPI_ATTR PFN_vkVoidFunction VKAPI_CALL
anti_lag_GetInstanceProcAddr(VkInstance instance, const char *pName);

static VKAPI_ATTR PFN_vkVoidFunction VKAPI_CALL
anti_lag_GetDeviceProcAddr(VkDevice device, const char *pName);

#define ADD_HOOK(fn) {"vk" #fn, (PFN_vkVoidFunction)anti_lag_##fn}
static const struct {
   const char *name;
   PFN_vkVoidFunction ptr;
} instance_funcptr_map[] = {
   ADD_HOOK(GetInstanceProcAddr),
   ADD_HOOK(CreateInstance),
   ADD_HOOK(DestroyInstance),
   ADD_HOOK(EnumerateDeviceExtensionProperties),
   ADD_HOOK(CreateDevice),
   ADD_HOOK(GetPhysicalDeviceFeatures2),
   ADD_HOOK(GetPhysicalDeviceFeatures2KHR),
};

static const struct {
   const char *name;
   PFN_vkVoidFunction ptr;
} device_funcptr_map[] = {
   ADD_HOOK(GetDeviceProcAddr),
   ADD_HOOK(DestroyDevice),
   ADD_HOOK(AntiLagUpdateAMD),
   ADD_HOOK(QueueSubmit),
   ADD_HOOK(QueueSubmit2),
   ADD_HOOK(QueueSubmit2KHR),
};
#undef ADD_HOOK
static VKAPI_ATTR PFN_vkVoidFunction VKAPI_CALL
anti_lag_GetInstanceProcAddr(VkInstance instance, const char *pName)
{
   if (!pName)
      return NULL;

   PFN_vkVoidFunction result = NULL;
   if (instance) {
      instance_data *ctx = get_instance_data(instance);
      if (ctx)
         result = ctx->vtable.GetInstanceProcAddr(instance, pName);
   }

   /* Only hook instance functions which are exposed by the underlying impl.
    * Ignore instance parameter for vkCreateInstance and vkCreateDevice.
    */
   if (result || strcmp(pName, "vkCreateInstance") == 0 || strcmp(pName, "vkCreateDevice") == 0) {
      for (uint32_t i = 0; i < ARRAY_SIZE(instance_funcptr_map); i++) {
         if (strcmp(pName, instance_funcptr_map[i].name) == 0)
            return instance_funcptr_map[i].ptr;
      }
   }

   return result;
}

static VKAPI_ATTR PFN_vkVoidFunction VKAPI_CALL
anti_lag_GetDeviceProcAddr(VkDevice device, const char *pName)
{
   if (!pName || !device)
      return NULL;

   device_data *data = get_device_data(device);
   PFN_vkVoidFunction result = data->GetDeviceProcAddr(device, pName);

   /* Only hook device functions if the layer extension is enabled. */
   if (data->ctx && (result || strcmp(pName, "vkAntiLagUpdateAMD") == 0)) {
      for (uint32_t i = 0; i < ARRAY_SIZE(device_funcptr_map); i++) {
         if (strcmp(pName, device_funcptr_map[i].name) == 0)
            return device_funcptr_map[i].ptr;
      }
   }

   return result;
}

PUBLIC VKAPI_ATTR VkResult VKAPI_CALL
anti_lag_NegotiateLoaderLayerInterfaceVersion(VkNegotiateLayerInterface *pVersionStruct)
{
   assert(pVersionStruct != NULL);
   assert(pVersionStruct->sType == LAYER_NEGOTIATE_INTERFACE_STRUCT);

   if (pVersionStruct->loaderLayerInterfaceVersion >= 2) {
      pVersionStruct->loaderLayerInterfaceVersion = 2;
      pVersionStruct->pfnGetInstanceProcAddr = anti_lag_GetInstanceProcAddr;
      pVersionStruct->pfnGetDeviceProcAddr = anti_lag_GetDeviceProcAddr;
      pVersionStruct->pfnGetPhysicalDeviceProcAddr = NULL;
   }

   return VK_SUCCESS;
}
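For reference, this is a rough sketch of how an application might drive the extension once the layer advertises it: query VkPhysicalDeviceAntiLagFeaturesAMD, enable VK_AMD_anti_lag at device creation, then call vkAntiLagUpdateAMD around each frame. The helper name, the frame_index variable and the exact placement inside the frame loop are hypothetical; only the VK_AMD_anti_lag entrypoints and structures come from the extension, and this snippet is not part of the layer sources.

#include <vulkan/vulkan.h>

/* Hypothetical application-side frame bracketing (not part of the layer).
 * Assumes VK_AMD_anti_lag was enabled on the device after
 * VkPhysicalDeviceAntiLagFeaturesAMD::antiLag was reported as supported. */
static void
app_frame_with_anti_lag(VkDevice device, uint64_t frame_index)
{
   PFN_vkAntiLagUpdateAMD anti_lag_update =
      (PFN_vkAntiLagUpdateAMD)vkGetDeviceProcAddr(device, "vkAntiLagUpdateAMD");

   VkAntiLagPresentationInfoAMD present_info = {
      .sType = VK_STRUCTURE_TYPE_ANTI_LAG_PRESENTATION_INFO_AMD,
      .stage = VK_ANTI_LAG_STAGE_INPUT_AMD, /* about to sample user input */
      .frameIndex = frame_index,
   };
   VkAntiLagDataAMD anti_lag_data = {
      .sType = VK_STRUCTURE_TYPE_ANTI_LAG_DATA_AMD,
      .mode = VK_ANTI_LAG_MODE_ON_AMD,
      .pPresentationInfo = &present_info,
   };
   anti_lag_update(device, &anti_lag_data);

   /* ... poll input, record command buffers, vkQueueSubmit ... */

   present_info.stage = VK_ANTI_LAG_STAGE_PRESENT_AMD; /* right before presenting */
   anti_lag_update(device, &anti_lag_data);

   /* ... vkQueuePresentKHR ... */
}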
26
src/vulkan/anti-lag-layer/meson.build
Normal file

@@ -0,0 +1,26 @@
# Copyright © 2025 Valve Corporation
# SPDX-License-Identifier: MIT

vklayer_files = files(
  'anti_lag_layer.c',
  'anti_lag_layer_interface.c',
)

shared_library(
  'VkLayer_MESA_anti_lag',
  vklayer_files,
  c_args : [no_override_init_args],
  gnu_symbol_visibility : 'hidden',
  dependencies : [
    idep_vulkan_util, idep_mesautil,
  ],
  include_directories : [inc_include, inc_util, inc_src],
  link_args : cc.get_supported_link_arguments(['-Wl,-Bsymbolic-functions', '-Wl,-z,relro']),
  install : true
)

install_data(
  files('VkLayer_MESA_anti_lag.json'),
  install_dir : join_paths(get_option('datadir'), 'vulkan', 'implicit_layer.d'),
  install_tag : 'runtime',
)
58
src/vulkan/anti-lag-layer/ringbuffer.h
Normal file

@@ -0,0 +1,58 @@
/*
 * Copyright © 2025 Valve Corporation
 *
 * SPDX-License-Identifier: MIT
 */

#ifndef RINGBUFFER_H
#define RINGBUFFER_H

#include "util/macros.h"

#define RINGBUFFER_DECLARE(name, type, N) \
   struct { \
      type data[N]; \
      uint32_t head; \
      uint32_t tail; \
      uint32_t size; \
      simple_mtx_t mtx; \
   } name

#define ringbuffer_init(buffer) \
   (buffer.head = buffer.tail = buffer.size = 0, simple_mtx_init(&buffer.mtx, mtx_plain))

#define ringbuffer_lock(buffer)   simple_mtx_lock(&buffer.mtx)
#define ringbuffer_unlock(buffer) simple_mtx_unlock(&buffer.mtx)

static inline uint32_t
__ringbuffer_add_wrap(uint32_t *val, uint32_t *size, uint32_t N)
{
   uint32_t prev = *val;
   *val = (*val + 1) % N;
   *size = *size + 1;
   assert(*size <= N);
   return prev;
}

#define ringbuffer_alloc(buffer) \
   (buffer.size == ARRAY_SIZE(buffer.data) \
       ? NULL \
       : &buffer.data[__ringbuffer_add_wrap(&buffer.head, &buffer.size, ARRAY_SIZE(buffer.data))])

#define ringbuffer_free(buffer, elem) \
   assert(elem == NULL || elem == &buffer.data[buffer.tail]); \
   buffer.size--; \
   assert(buffer.size < ARRAY_SIZE(buffer.data)); \
   buffer.tail = (buffer.tail + 1) % ARRAY_SIZE(buffer.data)

#define ringbuffer_first(buffer) (&buffer.data[buffer.tail])

#define ringbuffer_last(buffer) \
   (&buffer.data[(buffer.head + ARRAY_SIZE(buffer.data) - 1) % ARRAY_SIZE(buffer.data)])

#define ringbuffer_index(buffer, elem) (elem - buffer.data)

#define ringbuffer_next(buffer, elem) \
   (&buffer.data[(ringbuffer_index(buffer, elem) + 1) % ARRAY_SIZE(buffer.data)])

#endif /* RINGBUFFER_H */
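A minimal usage sketch of these macros, for illustration only: the struct frame element type, the capacity of 4, the frames name and the "util/simple_mtx.h" include are assumptions made here to show the call pattern, not something taken from the layer sources.

#include <assert.h>
#include <stdint.h>
#include "util/simple_mtx.h"
#include "ringbuffer.h"

/* Hypothetical element type; the layer stores its own per-frame data. */
struct frame {
   uint64_t begin_ts;
   uint64_t end_ts;
};

static RINGBUFFER_DECLARE(frames, struct frame, 4);

static void
frames_example(void)
{
   ringbuffer_init(frames);

   /* Producer side: reserve the next slot at the head, NULL when full. */
   ringbuffer_lock(frames);
   struct frame *f = ringbuffer_alloc(frames);
   if (f)
      f->begin_ts = 0;
   ringbuffer_unlock(frames);

   /* Consumer side: entries retire in FIFO order from the tail. */
   ringbuffer_lock(frames);
   if (frames.size) {
      struct frame *oldest = ringbuffer_first(frames);
      ringbuffer_free(frames, oldest);
   }
   ringbuffer_unlock(frames);
}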
@@ -98,3 +98,6 @@ endif
if with_vulkan_vram_report_limit_layer
  subdir('vram-report-limit-layer')
endif
if with_vulkan_anti_lag_layer
  subdir('anti-lag-layer')
endif