diff --git a/meson.build b/meson.build index d2eab192618..4afad99f68d 100644 --- a/meson.build +++ b/meson.build @@ -240,7 +240,7 @@ elif _vulkan_drivers.contains('all') _vulkan_drivers = ['amd', 'intel', 'intel_hasvk', 'swrast', 'freedreno', 'panfrost', 'virtio', 'broadcom', 'imagination-experimental', 'microsoft-experimental', - 'nouveau'] + 'nouveau', 'asahi'] endif with_intel_vk = _vulkan_drivers.contains('intel') @@ -255,6 +255,7 @@ with_imagination_vk = _vulkan_drivers.contains('imagination-experimental') with_imagination_srv = get_option('imagination-srv') with_microsoft_vk = _vulkan_drivers.contains('microsoft-experimental') with_nouveau_vk = _vulkan_drivers.contains('nouveau') +with_asahi_vk = _vulkan_drivers.contains('asahi') with_any_vk = _vulkan_drivers.length() != 0 if with_any_vk and host_machine.system() == 'windows' and meson.version().version_compare('< 1.3') @@ -850,7 +851,7 @@ if with_gallium_rusticl endif with_clover_spirv = with_gallium_clover and get_option('opencl-spirv') -with_clc = with_microsoft_clc or with_intel_clc or with_gallium_asahi or with_gallium_rusticl or with_clover_spirv +with_clc = with_microsoft_clc or with_intel_clc or with_gallium_asahi or with_asahi_vk or with_gallium_rusticl or with_clover_spirv dep_clc = null_dep if with_gallium_clover or with_clc diff --git a/meson_options.txt b/meson_options.txt index f8f4ec29513..ff669621267 100644 --- a/meson_options.txt +++ b/meson_options.txt @@ -228,7 +228,7 @@ option( value : ['auto'], choices : ['auto', 'amd', 'broadcom', 'freedreno', 'intel', 'intel_hasvk', 'panfrost', 'swrast', 'virtio', 'imagination-experimental', - 'microsoft-experimental', 'nouveau', 'all'], + 'microsoft-experimental', 'nouveau', 'asahi', 'all'], description : 'List of vulkan drivers to build. If this is set to auto ' + 'all drivers applicable to the target OS/architecture ' + 'will be built' diff --git a/src/.clang-format b/src/.clang-format index d13cd051cf4..142700a493c 100644 --- a/src/.clang-format +++ b/src/.clang-format @@ -186,6 +186,8 @@ ForEachMacros: # asahi - foreach_active - foreach_submitted + - hk_foreach_view + - hk_foreach_variant - AGX_BATCH_FOREACH_BO_HANDLE - agx_pack - agx_push diff --git a/src/asahi/meson.build b/src/asahi/meson.build index ac58326a822..c5f08ead519 100644 --- a/src/asahi/meson.build +++ b/src/asahi/meson.build @@ -6,7 +6,7 @@ inc_asahi = include_directories([ '.', 'layout', 'lib', 'genxml', 'compiler' ]) -if with_gallium_asahi +if with_gallium_asahi or with_asahi_vk subdir('layout') subdir('compiler') subdir('clc') @@ -14,6 +14,10 @@ if with_gallium_asahi subdir('lib') endif +if with_asahi_vk + subdir('vulkan') +endif + if with_tools.contains('drm-shim') subdir('drm-shim') endif diff --git a/src/asahi/vulkan/hk_buffer.c b/src/asahi/vulkan/hk_buffer.c new file mode 100644 index 00000000000..63bec5a0f70 --- /dev/null +++ b/src/asahi/vulkan/hk_buffer.c @@ -0,0 +1,286 @@ +/* + * Copyright 2024 Valve Corporation + * Copyright 2024 Alyssa Rosenzweig + * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc. 
+ * SPDX-License-Identifier: MIT + */ +#include "hk_buffer.h" + +#include "hk_device.h" +#include "hk_device_memory.h" +#include "hk_entrypoints.h" +#include "hk_physical_device.h" + +static uint32_t +hk_get_buffer_alignment(const struct hk_physical_device *pdev, + VkBufferUsageFlags2KHR usage_flags, + VkBufferCreateFlags create_flags) +{ + uint32_t alignment = 16; + + if (usage_flags & VK_BUFFER_USAGE_2_UNIFORM_BUFFER_BIT_KHR) + alignment = MAX2(alignment, HK_MIN_UBO_ALIGNMENT); + + if (usage_flags & VK_BUFFER_USAGE_2_STORAGE_BUFFER_BIT_KHR) + alignment = MAX2(alignment, HK_MIN_SSBO_ALIGNMENT); + + if (usage_flags & (VK_BUFFER_USAGE_2_UNIFORM_TEXEL_BUFFER_BIT_KHR | + VK_BUFFER_USAGE_2_STORAGE_TEXEL_BUFFER_BIT_KHR)) + alignment = MAX2(alignment, HK_MIN_TEXEL_BUFFER_ALIGNMENT); + + if (create_flags & (VK_BUFFER_CREATE_SPARSE_BINDING_BIT | + VK_BUFFER_CREATE_DEVICE_ADDRESS_CAPTURE_REPLAY_BIT)) + alignment = MAX2(alignment, 4096); + + return alignment; +} + +static uint64_t +hk_get_bda_replay_addr(const VkBufferCreateInfo *pCreateInfo) +{ + uint64_t addr = 0; + vk_foreach_struct_const(ext, pCreateInfo->pNext) { + switch (ext->sType) { + case VK_STRUCTURE_TYPE_BUFFER_OPAQUE_CAPTURE_ADDRESS_CREATE_INFO: { + const VkBufferOpaqueCaptureAddressCreateInfo *bda = (void *)ext; + if (bda->opaqueCaptureAddress != 0) { +#ifdef NDEBUG + return bda->opaqueCaptureAddress; +#else + assert(addr == 0 || bda->opaqueCaptureAddress == addr); + addr = bda->opaqueCaptureAddress; +#endif + } + break; + } + + case VK_STRUCTURE_TYPE_BUFFER_DEVICE_ADDRESS_CREATE_INFO_EXT: { + const VkBufferDeviceAddressCreateInfoEXT *bda = (void *)ext; + if (bda->deviceAddress != 0) { +#ifdef NDEBUG + return bda->deviceAddress; +#else + assert(addr == 0 || bda->deviceAddress == addr); + addr = bda->deviceAddress; +#endif + } + break; + } + + default: + break; + } + } + + return addr; +} + +VKAPI_ATTR VkResult VKAPI_CALL +hk_CreateBuffer(VkDevice device, const VkBufferCreateInfo *pCreateInfo, + const VkAllocationCallbacks *pAllocator, VkBuffer *pBuffer) +{ + VK_FROM_HANDLE(hk_device, dev, device); + struct hk_buffer *buffer; + + if (pCreateInfo->size > HK_MAX_BUFFER_SIZE) + return vk_error(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY); + + buffer = + vk_buffer_create(&dev->vk, pCreateInfo, pAllocator, sizeof(*buffer)); + if (!buffer) + return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY); + + if (buffer->vk.size > 0 && + (buffer->vk.create_flags & + (VK_BUFFER_CREATE_SPARSE_BINDING_BIT | + VK_BUFFER_CREATE_DEVICE_ADDRESS_CAPTURE_REPLAY_BIT))) { + + unreachable("todo"); +#if 0 + const uint32_t alignment = + hk_get_buffer_alignment(hk_device_physical(dev), + buffer->vk.usage, + buffer->vk.create_flags); + assert(alignment >= 4096); + buffer->vma_size_B = align64(buffer->vk.size, alignment); + + const bool sparse_residency = + buffer->vk.create_flags & VK_BUFFER_CREATE_SPARSE_RESIDENCY_BIT; + const bool bda_capture_replay = + buffer->vk.create_flags & VK_BUFFER_CREATE_DEVICE_ADDRESS_CAPTURE_REPLAY_BIT; + + uint64_t bda_replay_addr = 0; + if (bda_capture_replay) + bda_replay_addr = hk_get_bda_replay_addr(pCreateInfo); + + buffer->addr = nouveau_ws_alloc_vma(dev->ws_dev, bda_replay_addr, + buffer->vma_size_B, + alignment, bda_capture_replay, + sparse_residency); +#endif + if (buffer->addr == 0) { + vk_buffer_destroy(&dev->vk, pAllocator, &buffer->vk); + return vk_errorf(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY, + "Sparse VMA allocation failed"); + } + } + + *pBuffer = hk_buffer_to_handle(buffer); + + return VK_SUCCESS; +} + +VKAPI_ATTR void VKAPI_CALL 
+hk_DestroyBuffer(VkDevice device, VkBuffer _buffer, + const VkAllocationCallbacks *pAllocator) +{ + VK_FROM_HANDLE(hk_device, dev, device); + VK_FROM_HANDLE(hk_buffer, buffer, _buffer); + + if (!buffer) + return; + + if (buffer->vma_size_B > 0) { + unreachable("todo"); +#if 0 + const bool sparse_residency = + buffer->vk.create_flags & VK_BUFFER_CREATE_SPARSE_RESIDENCY_BIT; + const bool bda_capture_replay = + buffer->vk.create_flags & + VK_BUFFER_CREATE_DEVICE_ADDRESS_CAPTURE_REPLAY_BIT; + + agx_bo_unbind_vma(dev->ws_dev, buffer->addr, buffer->vma_size_B); + nouveau_ws_free_vma(dev->ws_dev, buffer->addr, buffer->vma_size_B, + bda_capture_replay, sparse_residency); +#endif + } + + vk_buffer_destroy(&dev->vk, pAllocator, &buffer->vk); +} + +VKAPI_ATTR void VKAPI_CALL +hk_GetDeviceBufferMemoryRequirements( + VkDevice device, const VkDeviceBufferMemoryRequirements *pInfo, + VkMemoryRequirements2 *pMemoryRequirements) +{ + VK_FROM_HANDLE(hk_device, dev, device); + struct hk_physical_device *pdev = hk_device_physical(dev); + + const uint32_t alignment = hk_get_buffer_alignment( + hk_device_physical(dev), pInfo->pCreateInfo->usage, + pInfo->pCreateInfo->flags); + + pMemoryRequirements->memoryRequirements = (VkMemoryRequirements){ + .size = align64(pInfo->pCreateInfo->size, alignment), + .alignment = alignment, + .memoryTypeBits = BITFIELD_MASK(pdev->mem_type_count), + }; + + vk_foreach_struct_const(ext, pMemoryRequirements->pNext) { + switch (ext->sType) { + case VK_STRUCTURE_TYPE_MEMORY_DEDICATED_REQUIREMENTS: { + VkMemoryDedicatedRequirements *dedicated = (void *)ext; + dedicated->prefersDedicatedAllocation = false; + dedicated->requiresDedicatedAllocation = false; + break; + } + default: + vk_debug_ignored_stype(ext->sType); + break; + } + } +} + +VKAPI_ATTR void VKAPI_CALL +hk_GetPhysicalDeviceExternalBufferProperties( + VkPhysicalDevice physicalDevice, + const VkPhysicalDeviceExternalBufferInfo *pExternalBufferInfo, + VkExternalBufferProperties *pExternalBufferProperties) +{ + /* The Vulkan 1.3.256 spec says: + * + * VUID-VkPhysicalDeviceExternalBufferInfo-handleType-parameter + * + * "handleType must be a valid VkExternalMemoryHandleTypeFlagBits value" + * + * This differs from VkPhysicalDeviceExternalImageFormatInfo, which + * surprisingly permits handleType == 0. + */ + assert(pExternalBufferInfo->handleType != 0); + + /* All of the current flags are for sparse which we don't support yet. + * Even when we do support it, doing sparse on external memory sounds + * sketchy. Also, just disallowing flags is the safe option. + */ + if (pExternalBufferInfo->flags) + goto unsupported; + + switch (pExternalBufferInfo->handleType) { + case VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT: + case VK_EXTERNAL_MEMORY_HANDLE_TYPE_DMA_BUF_BIT_EXT: + pExternalBufferProperties->externalMemoryProperties = + hk_dma_buf_mem_props; + return; + default: + goto unsupported; + } + +unsupported: + /* From the Vulkan 1.3.256 spec: + * + * compatibleHandleTypes must include at least handleType. 
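+    *
+    * So for handle types we don't support, echo the requested handle type
+    * back and leave exportFromImportedHandleTypes and externalMemoryFeatures
+    * zero-initialized, i.e. the memory is neither importable nor exportable.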
+ */ + pExternalBufferProperties->externalMemoryProperties = + (VkExternalMemoryProperties){ + .compatibleHandleTypes = pExternalBufferInfo->handleType, + }; +} + +VKAPI_ATTR VkResult VKAPI_CALL +hk_BindBufferMemory2(VkDevice device, uint32_t bindInfoCount, + const VkBindBufferMemoryInfo *pBindInfos) +{ + for (uint32_t i = 0; i < bindInfoCount; ++i) { + VK_FROM_HANDLE(hk_device_memory, mem, pBindInfos[i].memory); + VK_FROM_HANDLE(hk_buffer, buffer, pBindInfos[i].buffer); + + if (buffer->vma_size_B) { + unreachable("todo"); +#if 0 + VK_FROM_HANDLE(hk_device, dev, device); + agx_bo_bind_vma(dev->ws_dev, + mem->bo, + buffer->addr, + buffer->vma_size_B, + pBindInfos[i].memoryOffset, + 0 /* pte_kind */); +#endif + } else { + buffer->addr = mem->bo->ptr.gpu + pBindInfos[i].memoryOffset; + } + + const VkBindMemoryStatusKHR *status = + vk_find_struct_const(pBindInfos[i].pNext, BIND_MEMORY_STATUS_KHR); + if (status != NULL && status->pResult != NULL) + *status->pResult = VK_SUCCESS; + } + return VK_SUCCESS; +} + +VKAPI_ATTR VkDeviceAddress VKAPI_CALL +hk_GetBufferDeviceAddress(UNUSED VkDevice device, + const VkBufferDeviceAddressInfo *pInfo) +{ + VK_FROM_HANDLE(hk_buffer, buffer, pInfo->buffer); + + return hk_buffer_address(buffer, 0); +} + +VKAPI_ATTR uint64_t VKAPI_CALL +hk_GetBufferOpaqueCaptureAddress(UNUSED VkDevice device, + const VkBufferDeviceAddressInfo *pInfo) +{ + VK_FROM_HANDLE(hk_buffer, buffer, pInfo->buffer); + + return hk_buffer_address(buffer, 0); +} diff --git a/src/asahi/vulkan/hk_buffer.h b/src/asahi/vulkan/hk_buffer.h new file mode 100644 index 00000000000..f349a3df0e2 --- /dev/null +++ b/src/asahi/vulkan/hk_buffer.h @@ -0,0 +1,45 @@ +/* + * Copyright 2024 Valve Corporation + * Copyright 2024 Alyssa Rosenzweig + * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc. + * SPDX-License-Identifier: MIT + */ +#pragma once + +#include "hk_device_memory.h" +#include "hk_private.h" + +#include "vk_buffer.h" + +struct hk_device_memory; +struct hk_physical_device; + +struct hk_buffer { + struct vk_buffer vk; + uint64_t addr; + + /** Size of the reserved VMA range for sparse buffers, zero otherwise. */ + uint64_t vma_size_B; +}; + +VK_DEFINE_NONDISP_HANDLE_CASTS(hk_buffer, vk.base, VkBuffer, + VK_OBJECT_TYPE_BUFFER) + +static inline uint64_t +hk_buffer_address(const struct hk_buffer *buffer, uint64_t offset) +{ + return buffer->addr + offset; +} + +static inline struct hk_addr_range +hk_buffer_addr_range(const struct hk_buffer *buffer, uint64_t offset, + uint64_t range) +{ + if (buffer == NULL) + return (struct hk_addr_range){.range = 0}; + + return (struct hk_addr_range){ + .addr = hk_buffer_address(buffer, offset), + .range = vk_buffer_range(&buffer->vk, offset, range), + }; +} diff --git a/src/asahi/vulkan/hk_buffer_view.c b/src/asahi/vulkan/hk_buffer_view.c new file mode 100644 index 00000000000..73d32d945ae --- /dev/null +++ b/src/asahi/vulkan/hk_buffer_view.c @@ -0,0 +1,195 @@ +/* + * Copyright 2024 Valve Corporation + * Copyright 2024 Alyssa Rosenzweig + * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc. 
+ * SPDX-License-Identifier: MIT + */ +#include "hk_buffer_view.h" +#include "asahi/lib/agx_formats.h" +#include "asahi/lib/agx_nir_lower_vbo.h" +#include "util/bitscan.h" +#include "util/format/u_format.h" +#include "util/format/u_formats.h" + +#include "agx_helpers.h" +#include "agx_nir_passes.h" +#include "agx_pack.h" +#include "hk_buffer.h" +#include "hk_device.h" +#include "hk_entrypoints.h" +#include "hk_physical_device.h" + +#include "vk_format.h" + +VkFormatFeatureFlags2 +hk_get_buffer_format_features(struct hk_physical_device *pdev, + VkFormat vk_format) +{ + VkFormatFeatureFlags2 features = 0; + enum pipe_format p_format = vk_format_to_pipe_format(vk_format); + + if (p_format == PIPE_FORMAT_NONE) + return 0; + + if (agx_vbo_supports_format(p_format)) + features |= VK_FORMAT_FEATURE_2_VERTEX_BUFFER_BIT; + + if (agx_pixel_format[p_format].texturable && + !util_format_is_depth_or_stencil(p_format)) { + + features |= VK_FORMAT_FEATURE_2_UNIFORM_TEXEL_BUFFER_BIT; + + /* RGB32 specially supported for uniform texel buffers only. */ + if (util_is_power_of_two_nonzero(util_format_get_blocksize(p_format))) { + features |= VK_FORMAT_FEATURE_2_STORAGE_TEXEL_BUFFER_BIT | + VK_FORMAT_FEATURE_2_STORAGE_WRITE_WITHOUT_FORMAT_BIT; + } + + if (p_format == PIPE_FORMAT_R32_UINT || p_format == PIPE_FORMAT_R32_SINT) + features |= VK_FORMAT_FEATURE_2_STORAGE_TEXEL_BUFFER_ATOMIC_BIT; + } + + return features; +} + +VKAPI_ATTR VkResult VKAPI_CALL +hk_CreateBufferView(VkDevice _device, const VkBufferViewCreateInfo *pCreateInfo, + const VkAllocationCallbacks *pAllocator, + VkBufferView *pBufferView) +{ + VK_FROM_HANDLE(hk_device, device, _device); + VK_FROM_HANDLE(hk_buffer, buffer, pCreateInfo->buffer); + struct hk_buffer_view *view; + VkResult result; + + view = vk_buffer_view_create(&device->vk, pCreateInfo, pAllocator, + sizeof(*view)); + if (!view) + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + + enum pipe_format format = vk_format_to_pipe_format(view->vk.format); + const struct util_format_description *desc = util_format_description(format); + + uint8_t format_swizzle[4] = { + desc->swizzle[0], + desc->swizzle[1], + desc->swizzle[2], + desc->swizzle[3], + }; + + if (util_format_is_depth_or_stencil(format)) { + assert(!util_format_is_depth_and_stencil(format) && + "separate stencil always used"); + + /* Broadcast depth and stencil */ + format_swizzle[0] = 0; + format_swizzle[1] = 0; + format_swizzle[2] = 0; + format_swizzle[3] = 0; + } + + /* Decompose the offset into a multiple of 16-bytes (which we can include in + * the address) and an extra texel-aligned tail offset of up to 15 bytes. + * + * This lets us offset partially in the shader instead, getting + * around alignment restrictions on the base address pointer. 
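+    *
+    * For illustration: a view at offset 0x1c into a buffer of 4-byte texels
+    * gives base = buffer address + 0x10, tail_offset_B = 0xc and
+    * tail_offset_el = 3, so the first three texels are skipped in the shader.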
+ */ + uint64_t base = hk_buffer_address(buffer, 0) + (view->vk.offset & ~0xf); + uint32_t tail_offset_B = view->vk.offset & 0xf; + uint32_t tail_offset_el = tail_offset_B / util_format_get_blocksize(format); + assert(tail_offset_el * util_format_get_blocksize(format) == tail_offset_B && + "must be texel aligned"); + + struct agx_texture_packed tex; + agx_pack(&tex, TEXTURE, cfg) { + cfg.dimension = AGX_TEXTURE_DIMENSION_2D; + cfg.layout = AGX_LAYOUT_LINEAR; + cfg.channels = agx_pixel_format[format].channels; + cfg.type = agx_pixel_format[format].type; + cfg.swizzle_r = agx_channel_from_pipe(format_swizzle[0]); + cfg.swizzle_g = agx_channel_from_pipe(format_swizzle[1]); + cfg.swizzle_b = agx_channel_from_pipe(format_swizzle[2]); + cfg.swizzle_a = agx_channel_from_pipe(format_swizzle[3]); + + cfg.width = AGX_TEXTURE_BUFFER_WIDTH; + cfg.height = DIV_ROUND_UP(view->vk.elements, cfg.width); + cfg.first_level = cfg.last_level = 0; + + cfg.address = base; + cfg.buffer_size_sw = view->vk.elements; + cfg.buffer_offset_sw = tail_offset_el; + + cfg.srgb = (desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB); + cfg.srgb_2_channel = cfg.srgb && util_format_colormask(desc) == 0x3; + + cfg.depth = 1; + cfg.stride = (cfg.width * util_format_get_blocksize(format)) - 16; + } + + struct agx_pbe_packed pbe; + agx_pack(&pbe, PBE, cfg) { + cfg.dimension = AGX_TEXTURE_DIMENSION_2D; + cfg.layout = AGX_LAYOUT_LINEAR; + cfg.channels = agx_pixel_format[format].channels; + cfg.type = agx_pixel_format[format].type; + cfg.srgb = util_format_is_srgb(format); + + assert(desc->nr_channels >= 1 && desc->nr_channels <= 4); + + for (unsigned i = 0; i < desc->nr_channels; ++i) { + if (desc->swizzle[i] == 0) + cfg.swizzle_r = i; + else if (desc->swizzle[i] == 1) + cfg.swizzle_g = i; + else if (desc->swizzle[i] == 2) + cfg.swizzle_b = i; + else if (desc->swizzle[i] == 3) + cfg.swizzle_a = i; + } + + cfg.buffer = base; + cfg.buffer_offset_sw = tail_offset_el; + + cfg.width = AGX_TEXTURE_BUFFER_WIDTH; + cfg.height = DIV_ROUND_UP(view->vk.elements, cfg.width); + cfg.level = 0; + cfg.stride = (cfg.width * util_format_get_blocksize(format)) - 4; + cfg.layers = 1; + cfg.levels = 1; + }; + + result = hk_descriptor_table_add(device, &device->images, &tex, sizeof(tex), + &view->tex_desc_index); + if (result != VK_SUCCESS) { + vk_buffer_view_destroy(&device->vk, pAllocator, &view->vk); + return result; + } + + result = hk_descriptor_table_add(device, &device->images, &pbe, sizeof(pbe), + &view->pbe_desc_index); + if (result != VK_SUCCESS) { + hk_descriptor_table_remove(device, &device->images, view->tex_desc_index); + vk_buffer_view_destroy(&device->vk, pAllocator, &view->vk); + return result; + } + + *pBufferView = hk_buffer_view_to_handle(view); + + return VK_SUCCESS; +} + +VKAPI_ATTR void VKAPI_CALL +hk_DestroyBufferView(VkDevice _device, VkBufferView bufferView, + const VkAllocationCallbacks *pAllocator) +{ + VK_FROM_HANDLE(hk_device, device, _device); + VK_FROM_HANDLE(hk_buffer_view, view, bufferView); + + if (!view) + return; + + hk_descriptor_table_remove(device, &device->images, view->tex_desc_index); + hk_descriptor_table_remove(device, &device->images, view->pbe_desc_index); + + vk_buffer_view_destroy(&device->vk, pAllocator, &view->vk); +} diff --git a/src/asahi/vulkan/hk_buffer_view.h b/src/asahi/vulkan/hk_buffer_view.h new file mode 100644 index 00000000000..6b182006f1a --- /dev/null +++ b/src/asahi/vulkan/hk_buffer_view.h @@ -0,0 +1,27 @@ +/* + * Copyright 2024 Valve Corporation + * Copyright 2024 Alyssa Rosenzweig + * 
Copyright 2022-2023 Collabora Ltd. and Red Hat Inc. + * SPDX-License-Identifier: MIT + */ +#pragma once + +#include "hk_private.h" + +#include "vk_buffer_view.h" + +struct hk_physical_device; + +VkFormatFeatureFlags2 +hk_get_buffer_format_features(struct hk_physical_device *pdevice, + VkFormat format); + +struct hk_buffer_view { + struct vk_buffer_view vk; + + /** Index in the image descriptor table */ + uint32_t tex_desc_index, pbe_desc_index; +}; + +VK_DEFINE_NONDISP_HANDLE_CASTS(hk_buffer_view, vk.base, VkBufferView, + VK_OBJECT_TYPE_BUFFER_VIEW) diff --git a/src/asahi/vulkan/hk_cmd_buffer.c b/src/asahi/vulkan/hk_cmd_buffer.c new file mode 100644 index 00000000000..b3b362bf2b7 --- /dev/null +++ b/src/asahi/vulkan/hk_cmd_buffer.c @@ -0,0 +1,811 @@ +/* + * Copyright 2024 Valve Corporation + * Copyright 2024 Alyssa Rosenzweig + * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc. + * SPDX-License-Identifier: MIT + */ +#include "hk_cmd_buffer.h" + +#include "agx_bo.h" +#include "agx_linker.h" +#include "agx_tilebuffer.h" +#include "agx_usc.h" +#include "hk_buffer.h" +#include "hk_cmd_pool.h" +#include "hk_descriptor_set.h" +#include "hk_descriptor_set_layout.h" +#include "hk_device.h" +#include "hk_device_memory.h" +#include "hk_entrypoints.h" +#include "hk_image_view.h" +#include "hk_physical_device.h" +#include "hk_shader.h" + +#include "pool.h" +#include "shader_enums.h" +#include "vk_pipeline_layout.h" +#include "vk_synchronization.h" + +#include "nouveau/nouveau.h" +#include "util/list.h" +#include "util/macros.h" +#include "util/u_dynarray.h" +#include "vulkan/vulkan_core.h" + +static void +hk_descriptor_state_fini(struct hk_cmd_buffer *cmd, + struct hk_descriptor_state *desc) +{ + struct hk_cmd_pool *pool = hk_cmd_buffer_pool(cmd); + + for (unsigned i = 0; i < HK_MAX_SETS; i++) { + vk_free(&pool->vk.alloc, desc->push[i]); + desc->push[i] = NULL; + } +} + +static void +hk_free_resettable_cmd_buffer(struct hk_cmd_buffer *cmd) +{ + struct hk_cmd_pool *pool = hk_cmd_buffer_pool(cmd); + + hk_descriptor_state_fini(cmd, &cmd->state.gfx.descriptors); + hk_descriptor_state_fini(cmd, &cmd->state.cs.descriptors); + + hk_cmd_pool_free_bo_list(pool, &cmd->uploader.main.bos); + hk_cmd_pool_free_usc_bo_list(pool, &cmd->uploader.usc.bos); + + list_for_each_entry_safe(struct hk_cs, it, &cmd->control_streams, node) { + list_del(&it->node); + hk_cs_destroy(it); + } + + util_dynarray_foreach(&cmd->large_bos, struct agx_bo *, bo) { + agx_bo_unreference(*bo); + } + + util_dynarray_clear(&cmd->large_bos); +} + +static void +hk_destroy_cmd_buffer(struct vk_command_buffer *vk_cmd_buffer) +{ + struct hk_cmd_buffer *cmd = + container_of(vk_cmd_buffer, struct hk_cmd_buffer, vk); + struct hk_cmd_pool *pool = hk_cmd_buffer_pool(cmd); + + hk_free_resettable_cmd_buffer(cmd); + vk_command_buffer_finish(&cmd->vk); + vk_free(&pool->vk.alloc, cmd); +} + +static VkResult +hk_create_cmd_buffer(struct vk_command_pool *vk_pool, + VkCommandBufferLevel level, + struct vk_command_buffer **cmd_buffer_out) +{ + struct hk_cmd_pool *pool = container_of(vk_pool, struct hk_cmd_pool, vk); + struct hk_device *dev = hk_cmd_pool_device(pool); + struct hk_cmd_buffer *cmd; + VkResult result; + + cmd = vk_zalloc(&pool->vk.alloc, sizeof(*cmd), 8, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + if (cmd == NULL) + return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY); + + result = + vk_command_buffer_init(&pool->vk, &cmd->vk, &hk_cmd_buffer_ops, level); + if (result != VK_SUCCESS) { + vk_free(&pool->vk.alloc, cmd); + return result; + } + + 
util_dynarray_init(&cmd->large_bos, NULL); + + cmd->vk.dynamic_graphics_state.vi = &cmd->state.gfx._dynamic_vi; + cmd->vk.dynamic_graphics_state.ms.sample_locations = + &cmd->state.gfx._dynamic_sl; + + list_inithead(&cmd->uploader.main.bos); + list_inithead(&cmd->uploader.usc.bos); + list_inithead(&cmd->control_streams); + + *cmd_buffer_out = &cmd->vk; + + return VK_SUCCESS; +} + +static void +hk_reset_cmd_buffer(struct vk_command_buffer *vk_cmd_buffer, + UNUSED VkCommandBufferResetFlags flags) +{ + struct hk_cmd_buffer *cmd = + container_of(vk_cmd_buffer, struct hk_cmd_buffer, vk); + + vk_command_buffer_reset(&cmd->vk); + hk_free_resettable_cmd_buffer(cmd); + + cmd->uploader.main.map = NULL; + cmd->uploader.main.base = 0; + cmd->uploader.main.offset = 0; + cmd->uploader.usc.map = NULL; + cmd->uploader.usc.base = 0; + cmd->uploader.usc.offset = 0; + + cmd->current_cs.gfx = NULL; + cmd->current_cs.cs = NULL; + cmd->current_cs.post_gfx = NULL; + cmd->current_cs.pre_gfx = NULL; + + /* TODO: clear pool! */ + + memset(&cmd->state, 0, sizeof(cmd->state)); +} + +const struct vk_command_buffer_ops hk_cmd_buffer_ops = { + .create = hk_create_cmd_buffer, + .reset = hk_reset_cmd_buffer, + .destroy = hk_destroy_cmd_buffer, +}; + +static VkResult +hk_cmd_buffer_alloc_bo(struct hk_cmd_buffer *cmd, bool usc, + struct hk_cmd_bo **bo_out) +{ + VkResult result = hk_cmd_pool_alloc_bo(hk_cmd_buffer_pool(cmd), usc, bo_out); + if (result != VK_SUCCESS) + return result; + + if (usc) + list_addtail(&(*bo_out)->link, &cmd->uploader.usc.bos); + else + list_addtail(&(*bo_out)->link, &cmd->uploader.main.bos); + + return VK_SUCCESS; +} + +struct agx_ptr +hk_pool_alloc_internal(struct hk_cmd_buffer *cmd, uint32_t size, + uint32_t alignment, bool usc) +{ + struct hk_device *dev = hk_cmd_buffer_device(cmd); + struct hk_uploader *uploader = + usc ? &cmd->uploader.usc : &cmd->uploader.main; + + /* Specially handle large allocations owned by the command buffer, e.g. used + * for statically allocated vertex output buffers with geometry shaders. + */ + if (size > HK_CMD_BO_SIZE) { + uint32_t flags = usc ? AGX_BO_LOW_VA : 0; + struct agx_bo *bo = + agx_bo_create(&dev->dev, size, flags, "Large pool allocation"); + + util_dynarray_append(&cmd->large_bos, struct agx_bo *, bo); + return bo->ptr; + } + + assert(size <= HK_CMD_BO_SIZE); + assert(alignment > 0); + + uint32_t offset = align(uploader->offset, alignment); + + assert(offset <= HK_CMD_BO_SIZE); + if (uploader->map != NULL && size <= HK_CMD_BO_SIZE - offset) { + uploader->offset = offset + size; + + return (struct agx_ptr){ + .gpu = uploader->base + offset, + .cpu = uploader->map + offset, + }; + } + + struct hk_cmd_bo *bo; + VkResult result = hk_cmd_buffer_alloc_bo(cmd, usc, &bo); + if (unlikely(result != VK_SUCCESS)) { + vk_command_buffer_set_error(&cmd->vk, result); + return (struct agx_ptr){0}; + } + + /* Pick whichever of the current upload BO and the new BO will have more + * room left to be the BO for the next upload. If our upload size is + * bigger than the old offset, we're better off burning the whole new + * upload BO on this one allocation and continuing on the current upload + * BO. 
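+    *
+    * For illustration, assuming a 64 KiB pool BO: with 60 KiB already used,
+    * an 8 KiB request switches to the new BO (8 KiB < 60 KiB, leaving 56 KiB
+    * free there versus 4 KiB here), while with only 4 KiB used, a 62 KiB
+    * request burns the new BO on this allocation and keeps the current BO
+    * active for future uploads.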
+ */ + if (uploader->map == NULL || size < uploader->offset) { + uploader->map = bo->bo->ptr.cpu; + uploader->base = bo->bo->ptr.gpu; + uploader->offset = size; + } + + return (struct agx_ptr){ + .gpu = bo->bo->ptr.gpu, + .cpu = bo->map, + }; +} + +uint64_t +hk_pool_upload(struct hk_cmd_buffer *cmd, const void *data, uint32_t size, + uint32_t alignment) +{ + struct agx_ptr T = hk_pool_alloc(cmd, size, alignment); + if (unlikely(T.cpu == NULL)) + return 0; + + memcpy(T.cpu, data, size); + return T.gpu; +} + +VKAPI_ATTR VkResult VKAPI_CALL +hk_BeginCommandBuffer(VkCommandBuffer commandBuffer, + const VkCommandBufferBeginInfo *pBeginInfo) +{ + VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer); + + hk_reset_cmd_buffer(&cmd->vk, 0); + + hk_cmd_buffer_begin_compute(cmd, pBeginInfo); + hk_cmd_buffer_begin_graphics(cmd, pBeginInfo); + + return VK_SUCCESS; +} + +VKAPI_ATTR VkResult VKAPI_CALL +hk_EndCommandBuffer(VkCommandBuffer commandBuffer) +{ + VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer); + + assert(cmd->current_cs.gfx == NULL && cmd->current_cs.pre_gfx == NULL && + "must end rendering before ending the command buffer"); + + hk_cmd_buffer_end_compute(cmd); + hk_cmd_buffer_end_compute_internal(&cmd->current_cs.post_gfx); + + return vk_command_buffer_get_record_result(&cmd->vk); +} + +VKAPI_ATTR void VKAPI_CALL +hk_CmdPipelineBarrier2(VkCommandBuffer commandBuffer, + const VkDependencyInfo *pDependencyInfo) +{ + VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer); + + /* The big hammer. We end both compute and graphics batches. Ending compute + * here is necessary to properly handle graphics->compute dependencies. + * + * XXX: perf. */ + hk_cmd_buffer_end_compute(cmd); + hk_cmd_buffer_end_graphics(cmd); +} + +void +hk_cmd_bind_shaders(struct vk_command_buffer *vk_cmd, uint32_t stage_count, + const gl_shader_stage *stages, + struct vk_shader **const shaders) +{ + struct hk_cmd_buffer *cmd = container_of(vk_cmd, struct hk_cmd_buffer, vk); + + for (uint32_t i = 0; i < stage_count; i++) { + struct hk_api_shader *shader = + container_of(shaders[i], struct hk_api_shader, vk); + + if (stages[i] == MESA_SHADER_COMPUTE || stages[i] == MESA_SHADER_KERNEL) + hk_cmd_bind_compute_shader(cmd, shader); + else + hk_cmd_bind_graphics_shader(cmd, stages[i], shader); + } +} + +static void +hk_bind_descriptor_sets(UNUSED struct hk_cmd_buffer *cmd, + struct hk_descriptor_state *desc, + const VkBindDescriptorSetsInfoKHR *info) +{ + VK_FROM_HANDLE(vk_pipeline_layout, pipeline_layout, info->layout); + + /* Fro the Vulkan 1.3.275 spec: + * + * "When binding a descriptor set (see Descriptor Set Binding) to + * set number N... + * + * If, additionally, the previously bound descriptor set for set + * N was bound using a pipeline layout not compatible for set N, + * then all bindings in sets numbered greater than N are + * disturbed." + * + * This means that, if some earlier set gets bound in such a way that + * it changes set_dynamic_buffer_start[s], this binding is implicitly + * invalidated. Therefore, we can always look at the current value + * of set_dynamic_buffer_start[s] as the base of our dynamic buffer + * range and it's only our responsibility to adjust all + * set_dynamic_buffer_start[p] for p > s as needed. 
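+    *
+    * For illustration: if set 0 declares two dynamic buffers and set 1
+    * declares three, set_dynamic_buffer_start is {0, 2, 5, 5, ...}. Rebinding
+    * set 1 with a layout declaring only one dynamic buffer leaves entry 1 at
+    * 2 but shifts every later entry down to 3.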
+ */ + uint8_t dyn_buffer_start = + desc->root.set_dynamic_buffer_start[info->firstSet]; + + uint32_t next_dyn_offset = 0; + for (uint32_t i = 0; i < info->descriptorSetCount; ++i) { + unsigned s = i + info->firstSet; + VK_FROM_HANDLE(hk_descriptor_set, set, info->pDescriptorSets[i]); + + if (desc->sets[s] != set) { + if (set != NULL) { + desc->root.sets[s] = hk_descriptor_set_addr(set); + desc->set_sizes[s] = set->size; + } else { + desc->root.sets[s] = 0; + desc->set_sizes[s] = 0; + } + desc->sets[s] = set; + desc->sets_dirty |= BITFIELD_BIT(s); + + /* Binding descriptors invalidates push descriptors */ + desc->push_dirty &= ~BITFIELD_BIT(s); + } + + desc->root.set_dynamic_buffer_start[s] = dyn_buffer_start; + + if (pipeline_layout->set_layouts[s] != NULL) { + const struct hk_descriptor_set_layout *set_layout = + vk_to_hk_descriptor_set_layout(pipeline_layout->set_layouts[s]); + + if (set != NULL && set_layout->dynamic_buffer_count > 0) { + for (uint32_t j = 0; j < set_layout->dynamic_buffer_count; j++) { + struct hk_buffer_address addr = set->dynamic_buffers[j]; + addr.base_addr += info->pDynamicOffsets[next_dyn_offset + j]; + desc->root.dynamic_buffers[dyn_buffer_start + j] = addr; + } + next_dyn_offset += set->layout->dynamic_buffer_count; + } + + dyn_buffer_start += set_layout->dynamic_buffer_count; + } else { + assert(set == NULL); + } + } + assert(dyn_buffer_start <= HK_MAX_DYNAMIC_BUFFERS); + assert(next_dyn_offset <= info->dynamicOffsetCount); + + for (uint32_t s = info->firstSet + info->descriptorSetCount; s < HK_MAX_SETS; + s++) + desc->root.set_dynamic_buffer_start[s] = dyn_buffer_start; + + desc->root_dirty = true; +} + +VKAPI_ATTR void VKAPI_CALL +hk_CmdBindDescriptorSets2KHR( + VkCommandBuffer commandBuffer, + const VkBindDescriptorSetsInfoKHR *pBindDescriptorSetsInfo) +{ + VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer); + + if (pBindDescriptorSetsInfo->stageFlags & VK_SHADER_STAGE_ALL_GRAPHICS) { + hk_bind_descriptor_sets(cmd, &cmd->state.gfx.descriptors, + pBindDescriptorSetsInfo); + } + + if (pBindDescriptorSetsInfo->stageFlags & VK_SHADER_STAGE_COMPUTE_BIT) { + hk_bind_descriptor_sets(cmd, &cmd->state.cs.descriptors, + pBindDescriptorSetsInfo); + } +} + +static void +hk_push_constants(UNUSED struct hk_cmd_buffer *cmd, + struct hk_descriptor_state *desc, + const VkPushConstantsInfoKHR *info) +{ + memcpy(desc->root.push + info->offset, info->pValues, info->size); + desc->root_dirty = true; +} + +VKAPI_ATTR void VKAPI_CALL +hk_CmdPushConstants2KHR(VkCommandBuffer commandBuffer, + const VkPushConstantsInfoKHR *pPushConstantsInfo) +{ + VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer); + + if (pPushConstantsInfo->stageFlags & VK_SHADER_STAGE_ALL_GRAPHICS) + hk_push_constants(cmd, &cmd->state.gfx.descriptors, pPushConstantsInfo); + + if (pPushConstantsInfo->stageFlags & VK_SHADER_STAGE_COMPUTE_BIT) + hk_push_constants(cmd, &cmd->state.cs.descriptors, pPushConstantsInfo); +} + +static struct hk_push_descriptor_set * +hk_cmd_push_descriptors(struct hk_cmd_buffer *cmd, + struct hk_descriptor_state *desc, uint32_t set) +{ + assert(set < HK_MAX_SETS); + if (unlikely(desc->push[set] == NULL)) { + desc->push[set] = + vk_zalloc(&cmd->vk.pool->alloc, sizeof(*desc->push[set]), 8, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + if (unlikely(desc->push[set] == NULL)) { + vk_command_buffer_set_error(&cmd->vk, VK_ERROR_OUT_OF_HOST_MEMORY); + return NULL; + } + } + + /* Pushing descriptors replaces whatever sets are bound */ + desc->sets[set] = NULL; + desc->push_dirty |= BITFIELD_BIT(set); + + 
return desc->push[set]; +} + +static void +hk_push_descriptor_set(struct hk_cmd_buffer *cmd, + struct hk_descriptor_state *desc, + const VkPushDescriptorSetInfoKHR *info) +{ + VK_FROM_HANDLE(vk_pipeline_layout, pipeline_layout, info->layout); + + struct hk_push_descriptor_set *push_set = + hk_cmd_push_descriptors(cmd, desc, info->set); + if (unlikely(push_set == NULL)) + return; + + struct hk_descriptor_set_layout *set_layout = + vk_to_hk_descriptor_set_layout(pipeline_layout->set_layouts[info->set]); + + hk_push_descriptor_set_update(push_set, set_layout, + info->descriptorWriteCount, + info->pDescriptorWrites); +} + +VKAPI_ATTR void VKAPI_CALL +hk_CmdPushDescriptorSet2KHR( + VkCommandBuffer commandBuffer, + const VkPushDescriptorSetInfoKHR *pPushDescriptorSetInfo) +{ + VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer); + + if (pPushDescriptorSetInfo->stageFlags & VK_SHADER_STAGE_ALL_GRAPHICS) { + hk_push_descriptor_set(cmd, &cmd->state.gfx.descriptors, + pPushDescriptorSetInfo); + } + + if (pPushDescriptorSetInfo->stageFlags & VK_SHADER_STAGE_COMPUTE_BIT) { + hk_push_descriptor_set(cmd, &cmd->state.cs.descriptors, + pPushDescriptorSetInfo); + } +} + +void +hk_cmd_buffer_flush_push_descriptors(struct hk_cmd_buffer *cmd, + struct hk_descriptor_state *desc) +{ + u_foreach_bit(set_idx, desc->push_dirty) { + struct hk_push_descriptor_set *push_set = desc->push[set_idx]; + uint64_t push_set_addr = hk_pool_upload( + cmd, push_set->data, sizeof(push_set->data), HK_MIN_UBO_ALIGNMENT); + + desc->root.sets[set_idx] = push_set_addr; + desc->set_sizes[set_idx] = sizeof(push_set->data); + } + + desc->root_dirty = true; + desc->push_dirty = 0; +} + +VKAPI_ATTR void VKAPI_CALL +hk_CmdPushDescriptorSetWithTemplate2KHR( + VkCommandBuffer commandBuffer, const VkPushDescriptorSetWithTemplateInfoKHR + *pPushDescriptorSetWithTemplateInfo) +{ + VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer); + VK_FROM_HANDLE(vk_descriptor_update_template, template, + pPushDescriptorSetWithTemplateInfo->descriptorUpdateTemplate); + VK_FROM_HANDLE(vk_pipeline_layout, pipeline_layout, + pPushDescriptorSetWithTemplateInfo->layout); + + struct hk_descriptor_state *desc = + hk_get_descriptors_state(cmd, template->bind_point); + struct hk_push_descriptor_set *push_set = hk_cmd_push_descriptors( + cmd, desc, pPushDescriptorSetWithTemplateInfo->set); + if (unlikely(push_set == NULL)) + return; + + struct hk_descriptor_set_layout *set_layout = vk_to_hk_descriptor_set_layout( + pipeline_layout->set_layouts[pPushDescriptorSetWithTemplateInfo->set]); + + hk_push_descriptor_set_update_template( + push_set, set_layout, template, + pPushDescriptorSetWithTemplateInfo->pData); +} + +uint64_t +hk_cmd_buffer_upload_root(struct hk_cmd_buffer *cmd, + VkPipelineBindPoint bind_point) +{ + struct hk_descriptor_state *desc = hk_get_descriptors_state(cmd, bind_point); + struct hk_root_descriptor_table *root = &desc->root; + + struct agx_ptr root_ptr = hk_pool_alloc(cmd, sizeof(*root), 8); + if (!root_ptr.cpu) + return 0; + + root->root_desc_addr = root_ptr.gpu; + + memcpy(root_ptr.cpu, root, sizeof(*root)); + return root_ptr.gpu; +} + +void +hk_usc_upload_spilled_rt_descs(struct agx_usc_builder *b, + struct hk_cmd_buffer *cmd) +{ + struct hk_rendering_state *render = &cmd->state.gfx.render; + + /* Upload texture/PBE descriptors for each render target so we can clear + * spilled render targets. 
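+    *
+    * The table holds two descriptors per render target: slot 2*i is the
+    * texture view and slot 2*i+1 is the PBE (store) view. Its base address is
+    * bound at u0_u1 below for bindless access.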
+ */ + struct agx_ptr descs = + hk_pool_alloc(cmd, AGX_TEXTURE_LENGTH * 2 * render->color_att_count, 64); + struct agx_texture_packed *desc = descs.cpu; + if (!desc) + return; + + for (unsigned i = 0; i < render->color_att_count; ++i) { + struct hk_image_view *iview = render->color_att[i].iview; + if (!iview) { + /* XXX: probably should emit a null descriptor here...? */ + continue; + } + + memcpy(&desc[(i * 2) + 0], &iview->planes[0].emrt_texture, sizeof(*desc)); + memcpy(&desc[(i * 2) + 1], &iview->planes[0].emrt_pbe, sizeof(*desc)); + } + + desc = descs.cpu; + + /* Bind the base as u0_u1 for bindless access */ + agx_usc_uniform(b, 0, 4, hk_pool_upload(cmd, &descs.gpu, 8, 8)); +} + +void +hk_reserve_scratch(struct hk_cmd_buffer *cmd, struct hk_cs *cs, + struct hk_shader *s) +{ + struct hk_device *dev = hk_cmd_buffer_device(cmd); + uint32_t max_scratch_size = + MAX2(s->b.info.scratch_size, s->b.info.preamble_scratch_size); + + if (max_scratch_size == 0) + return; + + unsigned preamble_size = (s->b.info.preamble_scratch_size > 0) ? 1 : 0; + + /* XXX: need to lock around agx_scratch_alloc... */ + /* Note: this uses the hardware stage, not the software stage */ + switch (s->b.info.stage) { + case PIPE_SHADER_FRAGMENT: + agx_scratch_alloc(&dev->scratch.fs, max_scratch_size, 0); + cs->scratch.fs.main = true; + cs->scratch.fs.preamble = MAX2(cs->scratch.fs.preamble, preamble_size); + break; + case PIPE_SHADER_VERTEX: + agx_scratch_alloc(&dev->scratch.vs, max_scratch_size, 0); + cs->scratch.vs.main = true; + cs->scratch.vs.preamble = MAX2(cs->scratch.vs.preamble, preamble_size); + break; + default: + agx_scratch_alloc(&dev->scratch.cs, max_scratch_size, 0); + cs->scratch.cs.main = true; + cs->scratch.cs.preamble = MAX2(cs->scratch.cs.preamble, preamble_size); + break; + } +} + +uint32_t +hk_upload_usc_words(struct hk_cmd_buffer *cmd, struct hk_shader *s, + struct hk_linked_shader *linked) +{ + enum pipe_shader_type sw_stage = s->info.stage; + enum pipe_shader_type hw_stage = s->b.info.stage; + + unsigned constant_push_ranges = + DIV_ROUND_UP(s->b.info.immediate_size_16, 64); + unsigned push_ranges = 2; + unsigned stage_ranges = 3; + + size_t usc_size = + agx_usc_size(constant_push_ranges + push_ranges + stage_ranges + 4); + struct agx_ptr t = hk_pool_usc_alloc(cmd, usc_size, 64); + if (!t.cpu) + return 0; + + struct agx_usc_builder b = agx_usc_builder(t.cpu, usc_size); + + uint64_t root_ptr; + + if (sw_stage == PIPE_SHADER_COMPUTE) + root_ptr = hk_cmd_buffer_upload_root(cmd, VK_PIPELINE_BIND_POINT_COMPUTE); + else + root_ptr = cmd->state.gfx.root; + + static_assert(offsetof(struct hk_root_descriptor_table, root_desc_addr) == 0, + "self-reflective"); + + agx_usc_uniform(&b, HK_ROOT_UNIFORM, 4, root_ptr); + + if (sw_stage == MESA_SHADER_VERTEX) { + unsigned count = + DIV_ROUND_UP(BITSET_LAST_BIT(s->info.vs.attrib_components_read), 4); + + if (count) { + agx_usc_uniform( + &b, 0, 4 * count, + root_ptr + hk_root_descriptor_offset(draw.attrib_base)); + + agx_usc_uniform( + &b, 4 * count, 2 * count, + root_ptr + hk_root_descriptor_offset(draw.attrib_clamps)); + } + + if (cmd->state.gfx.draw_params) + agx_usc_uniform(&b, 6 * count, 4, cmd->state.gfx.draw_params); + + if (cmd->state.gfx.draw_id_ptr) + agx_usc_uniform(&b, (6 * count) + 4, 1, cmd->state.gfx.draw_id_ptr); + + if (hw_stage == MESA_SHADER_COMPUTE) { + agx_usc_uniform( + &b, (6 * count) + 8, 4, + root_ptr + hk_root_descriptor_offset(draw.input_assembly)); + } + } else if (sw_stage == MESA_SHADER_FRAGMENT) { + if 
(agx_tilebuffer_spills(&cmd->state.gfx.render.tilebuffer)) { + hk_usc_upload_spilled_rt_descs(&b, cmd); + } + + agx_usc_uniform( + &b, 4, 8, root_ptr + hk_root_descriptor_offset(draw.blend_constant)); + + /* The SHARED state is baked into linked->usc for non-fragment shaders. We + * don't pass around the information to bake the tilebuffer layout. + * + * TODO: We probably could with some refactor. + */ + agx_usc_push_packed(&b, SHARED, &cmd->state.gfx.render.tilebuffer.usc); + } + + agx_usc_push_blob(&b, linked->usc.data, linked->usc.size); + return t.gpu; +} + +/* Specialized variant of hk_upload_usc_words for internal dispatches that do + * not use any state except for some directly mapped uniforms. + */ +uint32_t +hk_upload_usc_words_kernel(struct hk_cmd_buffer *cmd, struct hk_shader *s, + void *data, size_t data_size) +{ + assert(s->info.stage == MESA_SHADER_COMPUTE); + assert(s->b.info.scratch_size == 0 && "you shouldn't be spilling!"); + assert(s->b.info.preamble_scratch_size == 0 && "you shouldn't be spilling!"); + + unsigned constant_push_ranges = + DIV_ROUND_UP(s->b.info.immediate_size_16, 64); + + size_t usc_size = agx_usc_size(constant_push_ranges + 7); + struct agx_ptr t = hk_pool_usc_alloc(cmd, usc_size, 64); + if (!t.cpu) + return 0; + + struct agx_usc_builder b = agx_usc_builder(t.cpu, usc_size); + + /* Map the data directly as uniforms starting at u0 */ + agx_usc_uniform(&b, 0, DIV_ROUND_UP(data_size, 2), + hk_pool_upload(cmd, data, data_size, 4)); + + agx_usc_push_blob(&b, s->only_linked->usc.data, s->only_linked->usc.size); + return t.gpu; +} + +void +hk_cs_init_graphics(struct hk_cmd_buffer *cmd, struct hk_cs *cs) +{ + struct hk_rendering_state *render = &cmd->state.gfx.render; + uint8_t *map = cs->current; + + cs->tib = render->tilebuffer; + + /* Assume this is not the first control stream of the render pass, so + * initially use the partial background program and ZLS control. + * hk_BeginRendering will override. + */ + cs->cr = render->cr; + cs->cr.bg.main = render->cr.bg.partial; + cs->cr.zls_control = render->cr.zls_control_partial; + + /* Barrier to enforce GPU-CPU coherency, in case this batch is back to back + * with another that caused stale data to be cached and the CPU wrote to it + * in the meantime. + */ + agx_push(map, VDM_BARRIER, cfg) { + cfg.usc_cache_inval = true; + } + + struct AGX_PPP_HEADER present = { + .w_clamp = true, + .occlusion_query_2 = true, + .output_unknown = true, + .varying_word_2 = true, + .viewport_count = 1, /* irrelevant */ + }; + + size_t size = agx_ppp_update_size(&present); + struct agx_ptr T = hk_pool_alloc(cmd, size, 64); + if (!T.cpu) + return; + + struct agx_ppp_update ppp = agx_new_ppp_update(T, size, &present); + + /* clang-format off */ + agx_ppp_push(&ppp, W_CLAMP, cfg) cfg.w_clamp = 1e-10; + agx_ppp_push(&ppp, FRAGMENT_OCCLUSION_QUERY_2, cfg); + agx_ppp_push(&ppp, OUTPUT_UNKNOWN, cfg); + agx_ppp_push(&ppp, VARYING_2, cfg); + /* clang-format on */ + + agx_ppp_fini(&map, &ppp); + cs->current = map; + + util_dynarray_init(&cs->scissor, NULL); + util_dynarray_init(&cs->depth_bias, NULL); + + /* All graphics state must be reemited in each control stream */ + hk_cmd_buffer_dirty_all(cmd); +} + +void +hk_ensure_cs_has_space(struct hk_cmd_buffer *cmd, struct hk_cs *cs, + size_t space) +{ + bool vdm = cs->type == HK_CS_VDM; + + size_t link_length = + vdm ? 
AGX_VDM_STREAM_LINK_LENGTH : AGX_CDM_STREAM_LINK_LENGTH; + + /* Assert that we have space for a link tag */ + assert((cs->current + link_length) <= cs->end && "Encoder overflowed"); + + /* Always leave room for a link tag, in case we run out of space later, + * plus padding because VDM apparently overreads? + * + * 0x200 is not enough. 0x400 seems to work. 0x800 for safety. + */ + space += link_length + 0x800; + + /* If there is room in the command buffer, we're done */ + if (likely((cs->end - cs->current) >= space)) + return; + + /* Otherwise, we need to allocate a new command buffer. We use memory owned + * by the batch to simplify lifetime management for the BO. + */ + size_t size = 65536; + struct agx_ptr T = hk_pool_alloc(cmd, size, 256); + + /* Jump from the old control stream to the new control stream */ + if (vdm) { + agx_pack(cs->current, VDM_STREAM_LINK, cfg) { + cfg.target_lo = T.gpu & BITFIELD_MASK(32); + cfg.target_hi = T.gpu >> 32; + } + } else { + agx_pack(cs->current, CDM_STREAM_LINK, cfg) { + cfg.target_lo = T.gpu & BITFIELD_MASK(32); + cfg.target_hi = T.gpu >> 32; + } + } + + /* Swap out the control stream */ + cs->current = T.cpu; + cs->end = cs->current + size; + cs->stream_linked = true; +} diff --git a/src/asahi/vulkan/hk_cmd_buffer.h b/src/asahi/vulkan/hk_cmd_buffer.h new file mode 100644 index 00000000000..0b93f0a924f --- /dev/null +++ b/src/asahi/vulkan/hk_cmd_buffer.h @@ -0,0 +1,767 @@ +/* + * Copyright 2024 Valve Corporation + * Copyright 2024 Alyssa Rosenzweig + * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc. + * SPDX-License-Identifier: MIT + */ + +#pragma once + +#include "util/macros.h" + +#include "util/list.h" +#include "agx_helpers.h" +#include "agx_linker.h" +#include "agx_pack.h" +#include "agx_tilebuffer.h" +#include "agx_uvs.h" +#include "pool.h" +#include "shader_enums.h" + +#include "hk_private.h" +#include "hk_shader.h" + +#include "hk_cmd_pool.h" +#include "hk_descriptor_set.h" + +#include "asahi/lib/agx_nir_lower_vbo.h" +#include "util/u_dynarray.h" +#include "vulkan/vulkan_core.h" + +#include "vk_command_buffer.h" + +#include + +struct hk_buffer; +struct hk_cmd_bo; +struct hk_cmd_pool; +struct hk_image_view; +struct hk_push_descriptor_set; +struct hk_shader; +struct hk_linked_shader; +struct agx_usc_builder; +struct vk_shader; + +/** Root descriptor table. */ +struct hk_root_descriptor_table { + uint64_t root_desc_addr; + + union { + struct { + uint32_t view_index; + uint32_t ppp_multisamplectl; + + /* Vertex input state */ + uint64_t attrib_base[AGX_MAX_VBUFS]; + uint32_t attrib_clamps[AGX_MAX_VBUFS]; + + /* Pointer to the VS->TCS, VS->GS, or TES->GS buffer. */ + uint64_t vertex_output_buffer; + + /* Mask of outputs flowing VS->TCS, VS->GS, or TES->GS . */ + uint64_t vertex_outputs; + + /* Address of input assembly buffer if geom/tess is used, else 0 */ + uint64_t input_assembly; + + /* Address of tessellation param buffer if tessellation used, else 0 */ + uint64_t tess_params; + + /* Address of geometry param buffer if GS is used, else 0 */ + uint64_t geometry_params; + + /* Pipeline statistics queries. This is a base address with flags. */ + uint64_t pipeline_stats; + VkQueryPipelineStatisticFlags pipeline_stats_flags; + + float blend_constant[4]; + uint16_t no_epilog_discard; + uint16_t _pad1; + uint16_t api_sample_mask; + uint16_t _pad2; + uint16_t force_never_in_shader; + uint16_t _pad3; + uint16_t provoking; + uint16_t _pad4; + + /* Mapping from varying slots written by the last vertex stage to UVS + * indices. 
This mapping must be compatible with the fragment shader. + */ + uint8_t uvs_index[VARYING_SLOT_MAX]; + } draw; + struct { + uint64_t group_count_addr; + uint32_t base_group[3]; + } cs; + }; + + /* Client push constants */ + uint8_t push[HK_MAX_PUSH_SIZE]; + + /* Descriptor set base addresses */ + uint64_t sets[HK_MAX_SETS]; + + /* Dynamic buffer bindings */ + struct hk_buffer_address dynamic_buffers[HK_MAX_DYNAMIC_BUFFERS]; + + /* Start index in dynamic_buffers where each set starts */ + uint8_t set_dynamic_buffer_start[HK_MAX_SETS]; +}; + +/* helper macro for computing root descriptor byte offsets */ +#define hk_root_descriptor_offset(member) \ + offsetof(struct hk_root_descriptor_table, member) + +struct hk_descriptor_state { + bool root_dirty; + struct hk_root_descriptor_table root; + + uint32_t set_sizes[HK_MAX_SETS]; + struct hk_descriptor_set *sets[HK_MAX_SETS]; + uint32_t sets_dirty; + + struct hk_push_descriptor_set *push[HK_MAX_SETS]; + uint32_t push_dirty; +}; + +struct hk_attachment { + VkFormat vk_format; + struct hk_image_view *iview; + + VkResolveModeFlagBits resolve_mode; + struct hk_image_view *resolve_iview; +}; + +struct hk_bg_eot { + uint64_t usc; + struct agx_counts_packed counts; +}; + +struct hk_render_registers { + uint32_t width, height, layers; + uint32_t isp_bgobjdepth; + uint32_t isp_bgobjvals; + struct agx_zls_control_packed zls_control, zls_control_partial; + uint32_t iogpu_unk_214; + uint32_t depth_dimensions; + + struct { + uint32_t dimensions; + uint64_t buffer, meta; + uint32_t stride, meta_stride; + } depth; + + struct { + uint64_t buffer, meta; + uint32_t stride, meta_stride; + } stencil; + + struct { + struct hk_bg_eot main; + struct hk_bg_eot partial; + } bg; + + struct { + struct hk_bg_eot main; + struct hk_bg_eot partial; + } eot; +}; + +struct hk_rendering_state { + VkRenderingFlagBits flags; + + VkRect2D area; + uint32_t layer_count; + uint32_t view_mask; + + uint32_t color_att_count; + struct hk_attachment color_att[HK_MAX_RTS]; + struct hk_attachment depth_att; + struct hk_attachment stencil_att; + + struct agx_tilebuffer_layout tilebuffer; + struct hk_render_registers cr; +}; + +struct hk_index_buffer_state { + struct hk_addr_range buffer; + enum agx_index_size size; + uint32_t restart; +}; + +/* Dirty tracking bits for state not tracked by vk_dynamic_graphics_state or + * shaders_dirty. + */ +enum hk_dirty { + HK_DIRTY_INDEX = BITFIELD_BIT(0), + HK_DIRTY_VB = BITFIELD_BIT(1), + HK_DIRTY_OCCLUSION = BITFIELD_BIT(2), + HK_DIRTY_PROVOKING = BITFIELD_BIT(3), + HK_DIRTY_VARYINGS = BITFIELD_BIT(4), +}; + +struct hk_graphics_state { + struct hk_rendering_state render; + struct hk_descriptor_state descriptors; + + enum hk_dirty dirty; + + uint64_t root; + uint64_t draw_params; + uint64_t draw_id_ptr; + + uint32_t shaders_dirty; + struct hk_api_shader *shaders[MESA_SHADER_MESH + 1]; + + /* Vertex buffers */ + struct hk_addr_range vb[AGX_MAX_VBUFS]; + + /* Transform feedback buffers */ + struct hk_addr_range xfb[4]; + + /* Is transform feedback enabled? */ + bool xfb_enabled; + + /* Internal transform feedback offset vec4. + * + * TODO: Strictly could be global. + */ + uint64_t xfb_offsets; + + /* Pointer to the GPU memory backing active transform feedback queries, + * per-stream. Zero if no query is bound. + */ + uint64_t xfb_query[4]; + + struct hk_index_buffer_state index; + enum agx_primitive topology; + enum agx_object_type object_type; + + /* Provoking vertex 0, 1, or 2. Usually 0 or 2 for FIRST/LAST. 1 can only be + * set for tri fans. 
+ */ + uint8_t provoking; + + struct { + enum agx_visibility_mode mode; + + /* If enabled, index of the current occlusion query in the occlusion heap. + * There can only be one active at a time (hardware contraint). + */ + uint16_t index; + } occlusion; + + /* Fast linked shader data structures */ + uint64_t varyings; + struct agx_varyings_vs linked_varyings; + + uint32_t linked_dirty; + struct hk_linked_shader *linked[PIPE_SHADER_TYPES]; + bool generate_primitive_id; + + /* Tessellation state */ + uint64_t tess_out_draws; + + /* Needed by vk_command_buffer::dynamic_graphics_state */ + struct vk_vertex_input_state _dynamic_vi; + struct vk_sample_locations_state _dynamic_sl; +}; + +struct hk_compute_state { + struct hk_descriptor_state descriptors; + struct hk_api_shader *shader; +}; + +struct hk_cmd_push { + void *map; + uint64_t addr; + uint32_t range; + bool no_prefetch; +}; + +struct hk_scratch_req { + bool main; + bool preamble; +}; + +/* + * hk_cs represents a single control stream, to be enqueued either to the + * CDM or VDM for compute/3D respectively. + */ +enum hk_cs_type { + HK_CS_CDM, + HK_CS_VDM, +}; + +struct hk_cs { + struct list_head node; + + /* Data master */ + enum hk_cs_type type; + + /* Address of the root control stream for the job */ + uint64_t addr; + + /* Start pointer of the root control stream */ + void *start; + + /* Current pointer within the control stream */ + void *current; + + /* End pointer of the current chunk of the control stream */ + void *end; + + /* Whether there is more than just the root chunk */ + bool stream_linked; + + /* Scratch requirements */ + struct { + union { + struct hk_scratch_req vs; + struct hk_scratch_req cs; + }; + + struct hk_scratch_req fs; + } scratch; + + /* Remaining state is for graphics only, ignored for compute */ + struct agx_tilebuffer_layout tib; + + struct util_dynarray scissor, depth_bias; + uint64_t uploaded_scissor, uploaded_zbias; + + /* We can only set ppp_multisamplectl once per batch. has_sample_locations + * tracks if we've committed to a set of sample locations yet. vk_meta + * operations do not set has_sample_locations since they don't care and it + * would interfere with the app-provided samples. + * + */ + bool has_sample_locations; + uint32_t ppp_multisamplectl; + + struct hk_render_registers cr; +}; + +struct hk_uploader { + /** List of hk_cmd_bo */ + struct list_head bos; + + /* Current addresses */ + uint8_t *map; + uint64_t base; + uint32_t offset; +}; + +struct hk_cmd_buffer { + struct vk_command_buffer vk; + + struct { + struct hk_graphics_state gfx; + struct hk_compute_state cs; + } state; + + struct { + struct hk_uploader main, usc; + } uploader; + + /* List of all recorded control streams */ + struct list_head control_streams; + + /* Current recorded control stream */ + struct { + /* VDM stream for 3D */ + struct hk_cs *gfx; + + /* CDM stream for compute */ + struct hk_cs *cs; + + /* CDM stream that executes immediately before the current graphics + * control stream. Used for geometry shading, tessellation, etc. + */ + struct hk_cs *pre_gfx; + + /* CDM stream that will execute after the current graphics control stream + * finishes. Used for queries. + */ + struct hk_cs *post_gfx; + } current_cs; + + /* Are we currently inside a vk_meta operation? This alters sample location + * behaviour. + */ + bool in_meta; + + /* XXX: move me? + * + * Indirect draw generated by the pre-GS for the geometry shader. + */ + uint64_t geom_indirect; + + /* Does the command buffer use the geometry heap? 
*/ + bool uses_heap; + + /* Owned large BOs */ + struct util_dynarray large_bos; +}; + +VK_DEFINE_HANDLE_CASTS(hk_cmd_buffer, vk.base, VkCommandBuffer, + VK_OBJECT_TYPE_COMMAND_BUFFER) + +extern const struct vk_command_buffer_ops hk_cmd_buffer_ops; + +static inline struct hk_device * +hk_cmd_buffer_device(struct hk_cmd_buffer *cmd) +{ + return (struct hk_device *)cmd->vk.base.device; +} + +static inline struct hk_cmd_pool * +hk_cmd_buffer_pool(struct hk_cmd_buffer *cmd) +{ + return (struct hk_cmd_pool *)cmd->vk.pool; +} + +/* + * The hardware vertex shader is supplied by the last geometry stage. The + * geometry pipeline is vertex->tess->geometry so we search backwards. + */ +static inline struct hk_shader * +hk_bound_hw_vs(struct hk_graphics_state *gfx) +{ + struct hk_api_shader *vs = gfx->shaders[MESA_SHADER_VERTEX]; + struct hk_api_shader *tes = gfx->shaders[MESA_SHADER_TESS_EVAL]; + struct hk_api_shader *gs = gfx->shaders[MESA_SHADER_GEOMETRY]; + + if (gs) + return &gs->variants[HK_GS_VARIANT_RAST]; + else if (tes) + return &tes->variants[HK_VS_VARIANT_HW]; + else + return &vs->variants[HK_VS_VARIANT_HW]; +} + +static inline struct hk_shader * +hk_bound_sw_vs(struct hk_graphics_state *gfx) +{ + struct hk_api_shader *vs = gfx->shaders[MESA_SHADER_VERTEX]; + struct hk_shader *hw_vs = hk_bound_hw_vs(gfx); + + if (hw_vs == &vs->variants[HK_VS_VARIANT_HW]) + return hw_vs; + else + return &vs->variants[HK_VS_VARIANT_SW]; +} + +static inline struct hk_shader * +hk_bound_sw_vs_before_gs(struct hk_graphics_state *gfx) +{ + struct hk_api_shader *vs = gfx->shaders[MESA_SHADER_VERTEX]; + struct hk_api_shader *tes = gfx->shaders[MESA_SHADER_TESS_EVAL]; + struct hk_api_shader *api = tes ?: vs; + + return &api->variants[HK_VS_VARIANT_SW]; +} + +struct agx_ptr hk_pool_alloc_internal(struct hk_cmd_buffer *cmd, uint32_t size, + uint32_t alignment, bool usc); + +uint64_t hk_pool_upload(struct hk_cmd_buffer *cmd, const void *data, + uint32_t size, uint32_t alignment); + +static inline struct agx_ptr +hk_pool_alloc(struct hk_cmd_buffer *cmd, uint32_t size, uint32_t alignment) +{ + return hk_pool_alloc_internal(cmd, size, alignment, false); +} + +static inline struct agx_ptr +hk_pool_usc_alloc(struct hk_cmd_buffer *cmd, uint32_t size, uint32_t alignment) +{ + return hk_pool_alloc_internal(cmd, size, alignment, true); +} + +void hk_cs_init_graphics(struct hk_cmd_buffer *cmd, struct hk_cs *cs); +uint32_t hk_default_sample_positions(unsigned nr_samples); + +static inline struct hk_cs * +hk_cmd_buffer_get_cs_general(struct hk_cmd_buffer *cmd, struct hk_cs **ptr, + bool compute) +{ + if ((*ptr) == NULL) { + /* Allocate root control stream */ + size_t initial_size = 65536; + struct agx_ptr root = hk_pool_alloc(cmd, initial_size, 1024); + if (!root.cpu) + return NULL; + + /* Allocate hk_cs for the new stream */ + struct hk_cs *cs = malloc(sizeof(*cs)); + *cs = (struct hk_cs){ + .type = compute ? HK_CS_CDM : HK_CS_VDM, + .addr = root.gpu, + .start = root.cpu, + .current = root.cpu, + .end = root.cpu + initial_size, + }; + + list_inithead(&cs->node); + + bool before_gfx = (ptr == &cmd->current_cs.pre_gfx); + + /* Insert into the command buffer. We usually append to the end of the + * command buffer, except for pre-graphics streams which go right before + * the graphics workload. (This implies a level of out-of-order processing + * that's allowed by Vulkan and required for efficient + * geometry/tessellation shaders.) 
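+       *
+       * list_addtail against the graphics stream's own node links the new
+       * CDM stream in as that node's predecessor, i.e. immediately before
+       * the VDM stream within control_streams.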
+ */ + if (before_gfx && cmd->current_cs.gfx) { + list_addtail(&cs->node, &cmd->current_cs.gfx->node); + } else { + list_addtail(&cs->node, &cmd->control_streams); + } + + *ptr = cs; + + if (!compute) + hk_cs_init_graphics(cmd, cs); + } + + assert(*ptr != NULL); + return *ptr; +} + +static inline struct hk_cs * +hk_cmd_buffer_get_cs(struct hk_cmd_buffer *cmd, bool compute) +{ + struct hk_cs **ptr = compute ? &cmd->current_cs.cs : &cmd->current_cs.gfx; + return hk_cmd_buffer_get_cs_general(cmd, ptr, compute); +} + +void hk_ensure_cs_has_space(struct hk_cmd_buffer *cmd, struct hk_cs *cs, + size_t space); + +static void +hk_cmd_buffer_dirty_all(struct hk_cmd_buffer *cmd) +{ + struct vk_dynamic_graphics_state *dyn = &cmd->vk.dynamic_graphics_state; + struct hk_graphics_state *gfx = &cmd->state.gfx; + + vk_dynamic_graphics_state_dirty_all(dyn); + gfx->dirty = ~0; + gfx->shaders_dirty = ~0; + gfx->linked_dirty = ~0; + gfx->descriptors.root_dirty = true; +} + +static inline void +hk_cs_destroy(struct hk_cs *cs) +{ + if (cs->type == HK_CS_VDM) { + util_dynarray_fini(&cs->scissor); + util_dynarray_fini(&cs->depth_bias); + } + + free(cs); +} + +static void +hk_cmd_buffer_end_compute_internal(struct hk_cs **ptr) +{ + if (*ptr) { + struct hk_cs *cs = *ptr; + void *map = cs->current; + agx_push(map, CDM_STREAM_TERMINATE, _) + ; + + cs->current = map; + } + + *ptr = NULL; +} + +static void +hk_cmd_buffer_end_compute(struct hk_cmd_buffer *cmd) +{ + hk_cmd_buffer_end_compute_internal(&cmd->current_cs.cs); +} + +static void +hk_cmd_buffer_end_graphics(struct hk_cmd_buffer *cmd) +{ + struct hk_cs *cs = cmd->current_cs.gfx; + + if (cs) { + void *map = cs->current; + agx_push(map, VDM_STREAM_TERMINATE, _) + ; + + /* Scissor and depth bias arrays are staged to dynamic arrays on the CPU. + * When we end the control stream, they're done growing and are ready for + * upload. + */ + cs->uploaded_scissor = + hk_pool_upload(cmd, cs->scissor.data, cs->scissor.size, 64); + + cs->uploaded_zbias = + hk_pool_upload(cmd, cs->depth_bias.data, cs->depth_bias.size, 64); + + /* TODO: maybe free scissor/depth_bias now? */ + + cmd->current_cs.gfx->current = map; + cmd->current_cs.gfx = NULL; + hk_cmd_buffer_end_compute_internal(&cmd->current_cs.pre_gfx); + hk_cmd_buffer_end_compute_internal(&cmd->current_cs.post_gfx); + } + + assert(cmd->current_cs.gfx == NULL); + + /* We just flushed out the heap use. If we want to use it again, we'll need + * to queue a free for it again. 
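+    * (hk_geometry_state() does that: it zeroes heap_bottom with a deferred
+    * hk_queue_write() and sets uses_heap again.)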
+ */ + cmd->uses_heap = false; +} + +static inline uint64_t +hk_pipeline_stat_addr(struct hk_cmd_buffer *cmd, + VkQueryPipelineStatisticFlagBits stat) +{ + struct hk_root_descriptor_table *root = &cmd->state.gfx.descriptors.root; + VkQueryPipelineStatisticFlags flags = root->draw.pipeline_stats_flags; + + if (flags & stat) { + assert(!cmd->in_meta && "queries paused for meta"); + assert(util_bitcount(stat) == 1 && "by construction"); + + /* Prefix sum to determine the compacted index in the query pool */ + uint32_t index = util_bitcount(flags & (stat - 1)); + + return root->draw.pipeline_stats + (sizeof(uint64_t) * index); + } else { + /* Query disabled */ + return 0; + } +} + +void hk_cmd_buffer_begin_graphics(struct hk_cmd_buffer *cmd, + const VkCommandBufferBeginInfo *pBeginInfo); +void hk_cmd_buffer_begin_compute(struct hk_cmd_buffer *cmd, + const VkCommandBufferBeginInfo *pBeginInfo); + +void hk_cmd_invalidate_graphics_state(struct hk_cmd_buffer *cmd); +void hk_cmd_invalidate_compute_state(struct hk_cmd_buffer *cmd); + +void hk_cmd_bind_shaders(struct vk_command_buffer *vk_cmd, uint32_t stage_count, + const gl_shader_stage *stages, + struct vk_shader **const shaders); + +void hk_cmd_bind_graphics_shader(struct hk_cmd_buffer *cmd, + const gl_shader_stage stage, + struct hk_api_shader *shader); + +void hk_cmd_bind_compute_shader(struct hk_cmd_buffer *cmd, + struct hk_api_shader *shader); + +void hk_cmd_bind_vertex_buffer(struct hk_cmd_buffer *cmd, uint32_t vb_idx, + struct hk_addr_range addr_range); + +static inline struct hk_descriptor_state * +hk_get_descriptors_state(struct hk_cmd_buffer *cmd, + VkPipelineBindPoint bind_point) +{ + switch (bind_point) { + case VK_PIPELINE_BIND_POINT_GRAPHICS: + return &cmd->state.gfx.descriptors; + case VK_PIPELINE_BIND_POINT_COMPUTE: + return &cmd->state.cs.descriptors; + default: + unreachable("Unhandled bind point"); + } +}; + +void hk_cmd_flush_wait_dep(struct hk_cmd_buffer *cmd, + const VkDependencyInfo *dep, bool wait); + +void hk_cmd_invalidate_deps(struct hk_cmd_buffer *cmd, uint32_t dep_count, + const VkDependencyInfo *deps); + +void hk_cmd_buffer_flush_push_descriptors(struct hk_cmd_buffer *cmd, + struct hk_descriptor_state *desc); + +void hk_meta_resolve_rendering(struct hk_cmd_buffer *cmd, + const VkRenderingInfo *pRenderingInfo); + +uint64_t hk_cmd_buffer_upload_root(struct hk_cmd_buffer *cmd, + VkPipelineBindPoint bind_point); + +void hk_reserve_scratch(struct hk_cmd_buffer *cmd, struct hk_cs *cs, + struct hk_shader *s); + +uint32_t hk_upload_usc_words(struct hk_cmd_buffer *cmd, struct hk_shader *s, + struct hk_linked_shader *linked); + +uint32_t hk_upload_usc_words_kernel(struct hk_cmd_buffer *cmd, + struct hk_shader *s, void *data, + size_t data_size); + +void hk_usc_upload_spilled_rt_descs(struct agx_usc_builder *b, + struct hk_cmd_buffer *cmd); + +void hk_cdm_cache_flush(struct hk_device *dev, struct hk_cs *cs); + +struct hk_grid { + bool indirect; + union { + uint32_t count[3]; + uint64_t ptr; + }; +}; + +static struct hk_grid +hk_grid(uint32_t x, uint32_t y, uint32_t z) +{ + return (struct hk_grid){.indirect = false, .count = {x, y, z}}; +} + +static struct hk_grid +hk_grid_indirect(uint64_t ptr) +{ + return (struct hk_grid){.indirect = true, .ptr = ptr}; +} + +void hk_dispatch_with_usc(struct hk_device *dev, struct hk_cs *cs, + struct hk_shader *s, uint32_t usc, + struct hk_grid grid, struct hk_grid local_size); + +static inline void +hk_dispatch_with_local_size(struct hk_cmd_buffer *cmd, struct hk_cs *cs, + struct hk_shader *s, 
struct hk_grid grid, + struct hk_grid local_size) +{ + struct hk_device *dev = hk_cmd_buffer_device(cmd); + uint32_t usc = hk_upload_usc_words(cmd, s, s->only_linked); + + hk_reserve_scratch(cmd, cs, s); + hk_dispatch_with_usc(dev, cs, s, usc, grid, local_size); +} + +static inline void +hk_dispatch(struct hk_cmd_buffer *cmd, struct hk_cs *cs, struct hk_shader *s, + struct hk_grid grid) +{ + assert(s->info.stage == MESA_SHADER_COMPUTE); + + struct hk_grid local_size = + hk_grid(s->info.cs.local_size[0], s->info.cs.local_size[1], + s->info.cs.local_size[2]); + + if (!grid.indirect) { + grid.count[0] *= local_size.count[0]; + grid.count[1] *= local_size.count[1]; + grid.count[2] *= local_size.count[2]; + } + + hk_dispatch_with_local_size(cmd, cs, s, grid, local_size); +} + +void hk_queue_write(struct hk_cmd_buffer *cmd, uint64_t address, uint32_t value, + bool after_gfx); diff --git a/src/asahi/vulkan/hk_cmd_clear.c b/src/asahi/vulkan/hk_cmd_clear.c new file mode 100644 index 00000000000..427c5fed2a1 --- /dev/null +++ b/src/asahi/vulkan/hk_cmd_clear.c @@ -0,0 +1,196 @@ +/* + * Copyright 2024 Valve Corporation + * Copyright 2024 Alyssa Rosenzweig + * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc. + * SPDX-License-Identifier: MIT + */ +#include "agx_formats.h" +#include "hk_cmd_buffer.h" + +#include "hk_device.h" +#include "hk_entrypoints.h" +#include "hk_image.h" +#include "hk_image_view.h" +#include "hk_physical_device.h" + +#include "vk_format.h" +#include "vk_meta.h" + +static VkImageViewType +render_view_type(VkImageType image_type, unsigned layer_count) +{ + switch (image_type) { + case VK_IMAGE_TYPE_1D: + return layer_count == 1 ? VK_IMAGE_VIEW_TYPE_1D + : VK_IMAGE_VIEW_TYPE_1D_ARRAY; + case VK_IMAGE_TYPE_2D: + return layer_count == 1 ? VK_IMAGE_VIEW_TYPE_2D + : VK_IMAGE_VIEW_TYPE_2D_ARRAY; + case VK_IMAGE_TYPE_3D: + return VK_IMAGE_VIEW_TYPE_3D; + default: + unreachable("Invalid image type"); + } +} + +static void +clear_image(struct hk_cmd_buffer *cmd, struct hk_image *image, + VkImageLayout image_layout, VkFormat format, + const VkClearValue *clear_value, uint32_t range_count, + const VkImageSubresourceRange *ranges) +{ + struct hk_device *dev = hk_cmd_buffer_device(cmd); + ASSERTED VkResult result; + + for (uint32_t r = 0; r < range_count; r++) { + const uint32_t level_count = + vk_image_subresource_level_count(&image->vk, &ranges[r]); + + for (uint32_t l = 0; l < level_count; l++) { + const uint32_t level = ranges[r].baseMipLevel + l; + + const VkExtent3D level_extent = + vk_image_mip_level_extent(&image->vk, level); + + uint32_t base_array_layer, layer_count; + if (image->vk.image_type == VK_IMAGE_TYPE_3D) { + base_array_layer = 0; + layer_count = level_extent.depth; + } else { + base_array_layer = ranges[r].baseArrayLayer; + layer_count = + vk_image_subresource_layer_count(&image->vk, &ranges[r]); + } + + const VkImageViewUsageCreateInfo view_usage_info = { + .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_USAGE_CREATE_INFO, + .usage = (ranges[r].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) + ? 
VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT + : VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT, + }; + const VkImageViewCreateInfo view_info = { + .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO, + .flags = VK_IMAGE_VIEW_CREATE_INTERNAL_MESA, + .pNext = &view_usage_info, + .image = hk_image_to_handle(image), + .viewType = render_view_type(image->vk.image_type, layer_count), + .format = format, + .subresourceRange = + { + .aspectMask = image->vk.aspects, + .baseMipLevel = level, + .levelCount = 1, + .baseArrayLayer = base_array_layer, + .layerCount = layer_count, + }, + }; + + /* We use vk_meta_create_image_view here for lifetime managemnt */ + VkImageView view; + result = + vk_meta_create_image_view(&cmd->vk, &dev->meta, &view_info, &view); + assert(result == VK_SUCCESS); + + VkRenderingInfo render = { + .sType = VK_STRUCTURE_TYPE_RENDERING_INFO, + .renderArea = + { + .offset = {0, 0}, + .extent = {level_extent.width, level_extent.height}, + }, + .layerCount = layer_count, + }; + + VkRenderingAttachmentInfo vk_att = { + .sType = VK_STRUCTURE_TYPE_RENDERING_ATTACHMENT_INFO, + .imageView = view, + .imageLayout = image_layout, + .loadOp = VK_ATTACHMENT_LOAD_OP_CLEAR, + .storeOp = VK_ATTACHMENT_STORE_OP_STORE, + .clearValue = *clear_value, + }; + + if (ranges[r].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) { + render.colorAttachmentCount = 1; + render.pColorAttachments = &vk_att; + } + if (ranges[r].aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) + render.pDepthAttachment = &vk_att; + if (ranges[r].aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT) + render.pStencilAttachment = &vk_att; + + hk_CmdBeginRendering(hk_cmd_buffer_to_handle(cmd), &render); + hk_CmdEndRendering(hk_cmd_buffer_to_handle(cmd)); + } + } +} + +static VkFormat +vk_packed_int_format_for_size(unsigned size_B) +{ + switch (size_B) { + case 1: + return VK_FORMAT_R8_UINT; + case 2: + return VK_FORMAT_R16_UINT; + case 4: + return VK_FORMAT_R32_UINT; + case 8: + return VK_FORMAT_R32G32_UINT; + case 16: + return VK_FORMAT_R32G32B32A32_UINT; + default: + unreachable("Invalid image format size"); + } +} + +VKAPI_ATTR void VKAPI_CALL +hk_CmdClearColorImage(VkCommandBuffer commandBuffer, VkImage _image, + VkImageLayout imageLayout, + const VkClearColorValue *pColor, uint32_t rangeCount, + const VkImageSubresourceRange *pRanges) +{ + VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer); + VK_FROM_HANDLE(hk_image, image, _image); + + VkClearValue clear_value = { + .color = *pColor, + }; + + VkFormat vk_format = image->vk.format; + if (vk_format == VK_FORMAT_R64_UINT || vk_format == VK_FORMAT_R64_SINT) + vk_format = VK_FORMAT_R32G32_UINT; + + enum pipe_format p_format = vk_format_to_pipe_format(vk_format); + assert(p_format != PIPE_FORMAT_NONE); + + if (!agx_pixel_format[p_format].renderable) { + memset(&clear_value, 0, sizeof(clear_value)); + util_format_pack_rgba(p_format, clear_value.color.uint32, pColor->uint32, + 1); + + unsigned bpp = util_format_get_blocksize(p_format); + vk_format = vk_packed_int_format_for_size(bpp); + } + + clear_image(cmd, image, imageLayout, vk_format, &clear_value, rangeCount, + pRanges); +} + +VKAPI_ATTR void VKAPI_CALL +hk_CmdClearDepthStencilImage(VkCommandBuffer commandBuffer, VkImage _image, + VkImageLayout imageLayout, + const VkClearDepthStencilValue *pDepthStencil, + uint32_t rangeCount, + const VkImageSubresourceRange *pRanges) +{ + VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer); + VK_FROM_HANDLE(hk_image, image, _image); + + const VkClearValue clear_value = { + .depthStencil = *pDepthStencil, + }; + + clear_image(cmd, image, 
imageLayout, image->vk.format, &clear_value, + rangeCount, pRanges); +} diff --git a/src/asahi/vulkan/hk_cmd_dispatch.c b/src/asahi/vulkan/hk_cmd_dispatch.c new file mode 100644 index 00000000000..54c1a454992 --- /dev/null +++ b/src/asahi/vulkan/hk_cmd_dispatch.c @@ -0,0 +1,249 @@ +/* + * Copyright 2024 Valve Corporation + * Copyright 2024 Alyssa Rosenzweig + * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc. + * SPDX-License-Identifier: MIT + */ +#include "shaders/query.h" +#include "vulkan/vulkan_core.h" +#include "agx_helpers.h" +#include "agx_linker.h" +#include "agx_nir_lower_gs.h" +#include "agx_pack.h" +#include "agx_scratch.h" +#include "agx_tilebuffer.h" +#include "hk_buffer.h" +#include "hk_cmd_buffer.h" +#include "hk_descriptor_set.h" +#include "hk_device.h" +#include "hk_entrypoints.h" +#include "hk_physical_device.h" +#include "hk_shader.h" +#include "pool.h" + +void +hk_cmd_buffer_begin_compute(struct hk_cmd_buffer *cmd, + const VkCommandBufferBeginInfo *pBeginInfo) +{ +} + +void +hk_cmd_invalidate_compute_state(struct hk_cmd_buffer *cmd) +{ + memset(&cmd->state.cs, 0, sizeof(cmd->state.cs)); +} + +void +hk_cmd_bind_compute_shader(struct hk_cmd_buffer *cmd, + struct hk_api_shader *shader) +{ + cmd->state.cs.shader = shader; +} + +void +hk_cdm_cache_flush(struct hk_device *dev, struct hk_cs *cs) +{ + assert(cs->type == HK_CS_CDM); + assert(cs->current + AGX_CDM_BARRIER_LENGTH < cs->end && + "caller must ensure space"); + + uint8_t *out = cs->current; + + agx_push(out, CDM_BARRIER, cfg) { + cfg.unk_5 = true; + cfg.unk_6 = true; + cfg.unk_8 = true; + // cfg.unk_11 = true; + // cfg.unk_20 = true; + if (dev->dev.params.num_clusters_total > 1) { + // cfg.unk_24 = true; + if (dev->dev.params.gpu_generation == 13) { + cfg.unk_4 = true; + // cfg.unk_26 = true; + } + } + + /* With multiple launches in the same CDM stream, we can get cache + * coherency (? or sync?) issues. We hit this with blits, which need - in + * between dispatches - need the PBE cache to be flushed and the texture + * cache to be invalidated. Until we know what bits mean what exactly, + * let's just set these after every launch to be safe. We can revisit in + * the future when we figure out what the bits mean. + */ + cfg.unk_0 = true; + cfg.unk_1 = true; + cfg.unk_2 = true; + cfg.usc_cache_inval = true; + cfg.unk_4 = true; + cfg.unk_5 = true; + cfg.unk_6 = true; + cfg.unk_7 = true; + cfg.unk_8 = true; + cfg.unk_9 = true; + cfg.unk_10 = true; + cfg.unk_11 = true; + cfg.unk_12 = true; + cfg.unk_13 = true; + cfg.unk_14 = true; + cfg.unk_15 = true; + cfg.unk_16 = true; + cfg.unk_17 = true; + cfg.unk_18 = true; + cfg.unk_19 = true; + } + + cs->current = out; +} + +/* + * Enqueue workgroups to a given CDM control stream with a given prepared USC + * words. This does not interact with any global state, so it is suitable for + * internal dispatches that do not save/restore state. That can be simpler / + * lower overhead than vk_meta for special operations that logically operate + * as graphics. 
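+ *
+ * A typical internal caller looks roughly like (see dispatch() in this file):
+ *
+ *    uint64_t params = hk_pool_upload(cmd, &p, sizeof(p), 8);
+ *    uint32_t usc = hk_upload_usc_words_kernel(cmd, s, &params, sizeof(params));
+ *    hk_dispatch_with_usc(dev, cs, s, usc, hk_grid(1, 1, 1), hk_grid(1, 1, 1));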
+ */ +void +hk_dispatch_with_usc(struct hk_device *dev, struct hk_cs *cs, + struct hk_shader *s, uint32_t usc, struct hk_grid grid, + struct hk_grid local_size) +{ + assert(cs->current + 0x2000 < cs->end && "should have ensured space"); + uint8_t *out = cs->current; + + agx_push(out, CDM_LAUNCH_WORD_0, cfg) { + if (grid.indirect) + cfg.mode = AGX_CDM_MODE_INDIRECT_GLOBAL; + else + cfg.mode = AGX_CDM_MODE_DIRECT; + + /* For now, always bind the txf sampler and nothing else */ + cfg.sampler_state_register_count = 1; + + cfg.uniform_register_count = s->b.info.push_count; + cfg.preshader_register_count = s->b.info.nr_preamble_gprs; + } + + agx_push(out, CDM_LAUNCH_WORD_1, cfg) { + cfg.pipeline = usc; + } + + /* Added in G14X */ + if (dev->dev.params.gpu_generation >= 14 && + dev->dev.params.num_clusters_total > 1) { + + agx_push(out, CDM_UNK_G14X, cfg) + ; + } + + assert(!local_size.indirect); + + if (grid.indirect) { + agx_push(out, CDM_INDIRECT, cfg) { + cfg.address_hi = grid.ptr >> 32; + cfg.address_lo = grid.ptr & BITFIELD64_MASK(32); + } + } else { + agx_push(out, CDM_GLOBAL_SIZE, cfg) { + cfg.x = grid.count[0]; + cfg.y = grid.count[1]; + cfg.z = grid.count[2]; + } + } + + agx_push(out, CDM_LOCAL_SIZE, cfg) { + cfg.x = local_size.count[0]; + cfg.y = local_size.count[1]; + cfg.z = local_size.count[2]; + } + + cs->current = out; + hk_cdm_cache_flush(dev, cs); +} + +static void +dispatch(struct hk_cmd_buffer *cmd, struct hk_grid grid) +{ + struct hk_device *dev = hk_cmd_buffer_device(cmd); + struct hk_shader *s = hk_only_variant(cmd->state.cs.shader); + struct hk_cs *cs = hk_cmd_buffer_get_cs(cmd, true /* compute */); + if (!cs) + return; + + uint64_t stat = hk_pipeline_stat_addr( + cmd, VK_QUERY_PIPELINE_STATISTIC_COMPUTE_SHADER_INVOCATIONS_BIT); + + if (stat) { + uint32_t local_size_threads = s->info.cs.local_size[0] * + s->info.cs.local_size[1] * + s->info.cs.local_size[2]; + + struct libagx_cs_invocation_params p = { + .grid = cmd->state.cs.descriptors.root.cs.group_count_addr, + .local_size_threads = local_size_threads, + .statistic = stat, + }; + + struct hk_shader *s = + hk_meta_kernel(dev, agx_nir_increment_cs_invocations, NULL, 0); + + uint64_t params = hk_pool_upload(cmd, &p, sizeof(p), 8); + uint32_t usc = + hk_upload_usc_words_kernel(cmd, s, ¶ms, sizeof(params)); + + hk_dispatch_with_usc(dev, cs, s, usc, hk_grid(1, 1, 1), hk_grid(1, 1, 1)); + } + + hk_ensure_cs_has_space(cmd, cs, 0x2000 /* TODO */); + hk_dispatch(cmd, cs, s, grid); +} + +VKAPI_ATTR void VKAPI_CALL +hk_CmdDispatchBase(VkCommandBuffer commandBuffer, uint32_t baseGroupX, + uint32_t baseGroupY, uint32_t baseGroupZ, + uint32_t groupCountX, uint32_t groupCountY, + uint32_t groupCountZ) +{ + VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer); + struct hk_descriptor_state *desc = &cmd->state.cs.descriptors; + if (desc->push_dirty) + hk_cmd_buffer_flush_push_descriptors(cmd, desc); + + desc->root.cs.base_group[0] = baseGroupX; + desc->root.cs.base_group[1] = baseGroupY; + desc->root.cs.base_group[2] = baseGroupZ; + + /* We don't want to key the shader to whether we're indirectly dispatching, + * so treat everything as indirect. 
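+    * For a direct dispatch that means uploading the group counts as a
+    * VkDispatchIndirectCommand and pointing group_count_addr at it, so the
+    * shader always reads its grid from memory.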
+ */ + VkDispatchIndirectCommand group_count = { + .x = groupCountX, + .y = groupCountY, + .z = groupCountZ, + }; + + desc->root.cs.group_count_addr = + hk_pool_upload(cmd, &group_count, sizeof(group_count), 8); + + dispatch(cmd, hk_grid(groupCountX, groupCountY, groupCountZ)); +} + +VKAPI_ATTR void VKAPI_CALL +hk_CmdDispatchIndirect(VkCommandBuffer commandBuffer, VkBuffer _buffer, + VkDeviceSize offset) +{ + VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer); + VK_FROM_HANDLE(hk_buffer, buffer, _buffer); + struct hk_descriptor_state *desc = &cmd->state.cs.descriptors; + if (desc->push_dirty) + hk_cmd_buffer_flush_push_descriptors(cmd, desc); + + desc->root.cs.base_group[0] = 0; + desc->root.cs.base_group[1] = 0; + desc->root.cs.base_group[2] = 0; + + uint64_t dispatch_addr = hk_buffer_address(buffer, offset); + assert(dispatch_addr != 0); + + desc->root.cs.group_count_addr = dispatch_addr; + dispatch(cmd, hk_grid_indirect(dispatch_addr)); +} diff --git a/src/asahi/vulkan/hk_cmd_draw.c b/src/asahi/vulkan/hk_cmd_draw.c new file mode 100644 index 00000000000..78a7a922d15 --- /dev/null +++ b/src/asahi/vulkan/hk_cmd_draw.c @@ -0,0 +1,3737 @@ +/* + * Copyright 2024 Valve Corporation + * Copyright 2024 Alyssa Rosenzweig + * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc. + * SPDX-License-Identifier: MIT + */ +#include +#include "agx_bg_eot.h" +#include "agx_bo.h" +#include "agx_compile.h" +#include "agx_compiler.h" +#include "agx_device.h" +#include "agx_helpers.h" +#include "agx_linker.h" +#include "agx_nir_lower_gs.h" +#include "agx_nir_lower_vbo.h" +#include "agx_ppp.h" +#include "agx_tilebuffer.h" +#include "agx_usc.h" +#include "agx_uvs.h" +#include "hk_buffer.h" +#include "hk_cmd_buffer.h" +#include "hk_device.h" +#include "hk_entrypoints.h" +#include "hk_image.h" +#include "hk_image_view.h" +#include "hk_physical_device.h" +#include "hk_private.h" +#include "hk_shader.h" + +#include "asahi/genxml/agx_pack.h" +#include "asahi/lib/libagx_shaders.h" +#include "asahi/lib/shaders/geometry.h" +#include "shaders/query.h" +#include "shaders/tessellator.h" +#include "util/bitpack_helpers.h" +#include "util/blend.h" +#include "util/format/format_utils.h" +#include "util/format/u_formats.h" +#include "util/macros.h" +#include "util/ralloc.h" +#include "vulkan/vulkan_core.h" +#include "layout.h" +#include "nir.h" +#include "nir_builder.h" +#include "nir_lower_blend.h" +#include "nir_xfb_info.h" +#include "pool.h" +#include "shader_enums.h" +#include "vk_blend.h" +#include "vk_enum_to_str.h" +#include "vk_format.h" +#include "vk_graphics_state.h" +#include "vk_pipeline.h" +#include "vk_render_pass.h" +#include "vk_standard_sample_locations.h" +#include "vk_util.h" + +#define IS_DIRTY(bit) BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_##bit) + +#define IS_SHADER_DIRTY(bit) \ + (cmd->state.gfx.shaders_dirty & BITFIELD_BIT(MESA_SHADER_##bit)) + +#define IS_LINKED_DIRTY(bit) \ + (cmd->state.gfx.linked_dirty & BITFIELD_BIT(MESA_SHADER_##bit)) + +struct hk_draw { + struct hk_grid b; + struct hk_addr_range index; + bool indexed; + uint32_t start; + uint32_t index_bias; + uint32_t start_instance; + + /* Indicates that the indirect draw consists of raw VDM commands and should + * be stream linked to. Used to accelerate tessellation. 
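+    * (The tessellator fast path writes one VDM Index List command per patch;
+    * rather than translating those into an API draw, we just link the VDM
+    * stream to them.)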
+ */ + bool raw; + + /* Set within hk_draw() but here so geometry/tessellation can override */ + bool restart; + enum agx_index_size index_size; +}; + +static struct hk_draw +hk_draw_indirect(uint64_t ptr) +{ + return (struct hk_draw){.b = hk_grid_indirect(ptr)}; +} + +static struct hk_draw +hk_draw_indexed_indirect(uint64_t ptr, struct hk_addr_range index, + enum agx_index_size index_size, bool restart) +{ + return (struct hk_draw){ + .b = hk_grid_indirect(ptr), + .index = index, + .indexed = true, + .index_size = index_size, + .restart = restart, + }; +} + +/* XXX: deduplicate */ +static inline enum mesa_prim +vk_conv_topology(VkPrimitiveTopology topology) +{ + switch (topology) { + case VK_PRIMITIVE_TOPOLOGY_POINT_LIST: + return MESA_PRIM_POINTS; + case VK_PRIMITIVE_TOPOLOGY_LINE_LIST: + return MESA_PRIM_LINES; + case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP: + return MESA_PRIM_LINE_STRIP; + case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST: +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wswitch" + case VK_PRIMITIVE_TOPOLOGY_META_RECT_LIST_MESA: +#pragma GCC diagnostic pop + return MESA_PRIM_TRIANGLES; + case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP: + return MESA_PRIM_TRIANGLE_STRIP; + case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN: + return MESA_PRIM_TRIANGLE_FAN; + case VK_PRIMITIVE_TOPOLOGY_LINE_LIST_WITH_ADJACENCY: + return MESA_PRIM_LINES_ADJACENCY; + case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP_WITH_ADJACENCY: + return MESA_PRIM_LINE_STRIP_ADJACENCY; + case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY: + return MESA_PRIM_TRIANGLES_ADJACENCY; + case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP_WITH_ADJACENCY: + return MESA_PRIM_TRIANGLE_STRIP_ADJACENCY; + case VK_PRIMITIVE_TOPOLOGY_PATCH_LIST: + return MESA_PRIM_PATCHES; + default: + unreachable("invalid"); + } +} + +static void +hk_cmd_buffer_dirty_render_pass(struct hk_cmd_buffer *cmd) +{ + struct vk_dynamic_graphics_state *dyn = &cmd->vk.dynamic_graphics_state; + + /* These depend on color attachment count */ + BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_CB_COLOR_WRITE_ENABLES); + BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_CB_BLEND_ENABLES); + BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_CB_BLEND_EQUATIONS); + BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_CB_WRITE_MASKS); + + /* These depend on the depth/stencil format */ + BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_TEST_ENABLE); + BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_WRITE_ENABLE); + BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_BOUNDS_TEST_ENABLE); + BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_TEST_ENABLE); + BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_RS_DEPTH_BIAS_FACTORS); + + /* This may depend on render targets for ESO */ + BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_MS_RASTERIZATION_SAMPLES); +} + +void +hk_cmd_buffer_begin_graphics(struct hk_cmd_buffer *cmd, + const VkCommandBufferBeginInfo *pBeginInfo) +{ + if (cmd->vk.level != VK_COMMAND_BUFFER_LEVEL_PRIMARY && + (pBeginInfo->flags & VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT)) { + char gcbiar_data[VK_GCBIARR_DATA_SIZE(HK_MAX_RTS)]; + const VkRenderingInfo *resume_info = + vk_get_command_buffer_inheritance_as_rendering_resume( + cmd->vk.level, pBeginInfo, gcbiar_data); + if (resume_info) { + hk_CmdBeginRendering(hk_cmd_buffer_to_handle(cmd), resume_info); + } else { + const VkCommandBufferInheritanceRenderingInfo *inheritance_info = + vk_get_command_buffer_inheritance_rendering_info(cmd->vk.level, + pBeginInfo); + assert(inheritance_info); + + struct hk_rendering_state *render = &cmd->state.gfx.render; + render->flags = 
inheritance_info->flags; + render->area = (VkRect2D){}; + render->layer_count = 0; + render->view_mask = inheritance_info->viewMask; + render->tilebuffer.nr_samples = inheritance_info->rasterizationSamples; + + render->color_att_count = inheritance_info->colorAttachmentCount; + for (uint32_t i = 0; i < render->color_att_count; i++) { + render->color_att[i].vk_format = + inheritance_info->pColorAttachmentFormats[i]; + } + render->depth_att.vk_format = inheritance_info->depthAttachmentFormat; + render->stencil_att.vk_format = + inheritance_info->stencilAttachmentFormat; + + hk_cmd_buffer_dirty_render_pass(cmd); + } + } + + hk_cmd_buffer_dirty_all(cmd); + + /* If multiview is disabled, always read 0. If multiview is enabled, + * hk_set_view_index will dirty the root each draw. + */ + cmd->state.gfx.descriptors.root.draw.view_index = 0; + cmd->state.gfx.descriptors.root_dirty = true; +} + +void +hk_cmd_invalidate_graphics_state(struct hk_cmd_buffer *cmd) +{ + hk_cmd_buffer_dirty_all(cmd); + + /* From the Vulkan 1.3.275 spec: + * + * "...There is one exception to this rule - if the primary command + * buffer is inside a render pass instance, then the render pass and + * subpass state is not disturbed by executing secondary command + * buffers." + * + * We need to reset everything EXCEPT the render pass state. + */ + struct hk_rendering_state render_save = cmd->state.gfx.render; + memset(&cmd->state.gfx, 0, sizeof(cmd->state.gfx)); + cmd->state.gfx.render = render_save; +} + +static void +hk_attachment_init(struct hk_attachment *att, + const VkRenderingAttachmentInfo *info) +{ + if (info == NULL || info->imageView == VK_NULL_HANDLE) { + *att = (struct hk_attachment){ + .iview = NULL, + }; + return; + } + + VK_FROM_HANDLE(hk_image_view, iview, info->imageView); + *att = (struct hk_attachment){ + .vk_format = iview->vk.format, + .iview = iview, + }; + + if (info->resolveMode != VK_RESOLVE_MODE_NONE) { + VK_FROM_HANDLE(hk_image_view, res_iview, info->resolveImageView); + att->resolve_mode = info->resolveMode; + att->resolve_iview = res_iview; + } +} + +VKAPI_ATTR void VKAPI_CALL +hk_GetRenderingAreaGranularityKHR( + VkDevice device, const VkRenderingAreaInfoKHR *pRenderingAreaInfo, + VkExtent2D *pGranularity) +{ + *pGranularity = (VkExtent2D){.width = 1, .height = 1}; +} + +static struct hk_bg_eot +hk_build_bg_eot(struct hk_cmd_buffer *cmd, const VkRenderingInfo *info, + bool store, bool partial_render, bool incomplete_render_area) +{ + struct hk_device *dev = hk_cmd_buffer_device(cmd); + struct hk_rendering_state *render = &cmd->state.gfx.render; + + /* Construct the key */ + struct agx_bg_eot_key key = {.tib = render->tilebuffer}; + static_assert(AGX_BG_EOT_NONE == 0, "default initializer"); + + key.tib.layered = (render->cr.layers > 1); + + bool needs_textures_for_spilled_rts = + agx_tilebuffer_spills(&render->tilebuffer) && !partial_render && !store; + + for (unsigned i = 0; i < info->colorAttachmentCount; ++i) { + const VkRenderingAttachmentInfo *att_info = &info->pColorAttachments[i]; + if (att_info->imageView == VK_NULL_HANDLE) + continue; + + /* Partial render programs exist only to store/load the tilebuffer to + * main memory. When render targets are already spilled to main memory, + * there's nothing to do. + */ + if (key.tib.spilled[i] && (partial_render || store)) + continue; + + if (store) { + bool store = att_info->storeOp == VK_ATTACHMENT_STORE_OP_STORE; + + /* When resolving, we store the intermediate multisampled image as the + * resolve is a separate control stream. 
This could be optimized. + */ + store |= att_info->resolveMode != VK_RESOLVE_MODE_NONE; + + /* Partial renders always need to flush to memory. */ + store |= partial_render; + + key.op[i] = store ? AGX_EOT_STORE : AGX_BG_EOT_NONE; + } else { + bool load = att_info->loadOp == VK_ATTACHMENT_LOAD_OP_LOAD; + bool clear = att_info->loadOp == VK_ATTACHMENT_LOAD_OP_CLEAR; + + /* The background program used for partial renders must always load + * whatever was stored in the mid-frame end-of-tile program. + */ + load |= partial_render; + + /* With an incomplete render area, we're forced to load back tiles and + * then use the 3D pipe for the clear. + */ + load |= incomplete_render_area; + + /* Don't read back spilled render targets, they're already in memory */ + load &= !key.tib.spilled[i]; + + key.op[i] = load ? AGX_BG_LOAD + : clear ? AGX_BG_CLEAR + : AGX_BG_EOT_NONE; + } + } + + /* Begin building the pipeline */ + size_t usc_size = agx_usc_size(3 + HK_MAX_RTS); + struct agx_ptr t = hk_pool_usc_alloc(cmd, usc_size, 64); + if (!t.cpu) + return (struct hk_bg_eot){.usc = t.gpu}; + + struct agx_usc_builder b = agx_usc_builder(t.cpu, usc_size); + + bool uses_txf = false; + unsigned uniforms = 0; + unsigned nr_tex = 0; + + for (unsigned rt = 0; rt < HK_MAX_RTS; ++rt) { + const VkRenderingAttachmentInfo *att_info = &info->pColorAttachments[rt]; + struct hk_image_view *iview = render->color_att[rt].iview; + + if (key.op[rt] == AGX_BG_LOAD) { + uses_txf = true; + + uint32_t index = key.tib.layered + ? iview->planes[0].layered_background_desc_index + : iview->planes[0].background_desc_index; + + agx_usc_pack(&b, TEXTURE, cfg) { + /* Shifted to match eMRT indexing, could be optimized */ + cfg.start = rt * 2; + cfg.count = 1; + cfg.buffer = dev->images.bo->ptr.gpu + index * AGX_TEXTURE_LENGTH; + } + + nr_tex = (rt * 2) + 1; + } else if (key.op[rt] == AGX_BG_CLEAR) { + static_assert(sizeof(att_info->clearValue.color) == 16, "fixed ABI"); + uint64_t colour = + hk_pool_upload(cmd, &att_info->clearValue.color, 16, 16); + + agx_usc_uniform(&b, 4 + (8 * rt), 8, colour); + uniforms = MAX2(uniforms, 4 + (8 * rt) + 8); + } else if (key.op[rt] == AGX_EOT_STORE) { + uint32_t index = key.tib.layered + ? iview->planes[0].layered_eot_pbe_desc_index + : iview->planes[0].eot_pbe_desc_index; + + agx_usc_pack(&b, TEXTURE, cfg) { + cfg.start = rt; + cfg.count = 1; + cfg.buffer = dev->images.bo->ptr.gpu + index * AGX_TEXTURE_LENGTH; + } + + nr_tex = rt + 1; + } + } + + if (needs_textures_for_spilled_rts) { + hk_usc_upload_spilled_rt_descs(&b, cmd); + uniforms = MAX2(uniforms, 4); + } + + if (uses_txf) { + agx_usc_push_packed(&b, SAMPLER, dev->rodata.txf_sampler); + } + + /* For attachmentless rendering, we don't know the sample count until + * draw-time. But we have trivial bg/eot programs in that case too. + */ + if (key.tib.nr_samples >= 1) { + agx_usc_push_packed(&b, SHARED, &key.tib.usc); + } else { + assert(key.tib.sample_size_B == 0); + agx_usc_shared_none(&b); + + key.tib.nr_samples = 1; + } + + /* Get the shader */ + key.reserved_preamble = uniforms; + /* XXX: locking? 
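+    * (dev->bg_eot is presumably a device-level shader cache, and command
+    * buffers may be recorded concurrently, hence the question.)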
*/ + struct agx_bg_eot_shader *shader = agx_get_bg_eot_shader(&dev->bg_eot, &key); + + agx_usc_pack(&b, SHADER, cfg) { + cfg.code = shader->ptr; + cfg.unk_2 = 0; + } + + agx_usc_pack(&b, REGISTERS, cfg) + cfg.register_count = shader->info.nr_gprs; + + if (shader->info.has_preamble) { + agx_usc_pack(&b, PRESHADER, cfg) { + cfg.code = shader->ptr + shader->info.preamble_offset; + } + } else { + agx_usc_pack(&b, NO_PRESHADER, cfg) + ; + } + + struct hk_bg_eot ret = {.usc = t.gpu}; + + agx_pack(&ret.counts, COUNTS, cfg) { + cfg.uniform_register_count = shader->info.push_count; + cfg.preshader_register_count = shader->info.nr_preamble_gprs; + cfg.texture_state_register_count = nr_tex; + cfg.sampler_state_register_count = + agx_translate_sampler_state_count(uses_txf ? 1 : 0, false); + } + + return ret; +} + +static bool +is_aligned(unsigned x, unsigned pot_alignment) +{ + assert(util_is_power_of_two_nonzero(pot_alignment)); + return (x & (pot_alignment - 1)) == 0; +} + +static void +hk_merge_render_iview(struct hk_rendering_state *render, + struct hk_image_view *iview) +{ + if (iview) { + unsigned samples = iview->vk.image->samples; + /* TODO: is this right for ycbcr? */ + unsigned level = iview->vk.base_mip_level; + unsigned width = u_minify(iview->vk.image->extent.width, level); + unsigned height = u_minify(iview->vk.image->extent.height, level); + + assert(render->tilebuffer.nr_samples == 0 || + render->tilebuffer.nr_samples == samples); + render->tilebuffer.nr_samples = samples; + + /* TODO: Is this merging logic sound? Not sure how this is supposed to + * work conceptually. + */ + render->cr.width = MAX2(render->cr.width, width); + render->cr.height = MAX2(render->cr.height, height); + } +} + +static void +hk_pack_zls_control(struct agx_zls_control_packed *packed, + struct ail_layout *z_layout, struct ail_layout *s_layout, + const VkRenderingAttachmentInfo *attach_z, + const VkRenderingAttachmentInfo *attach_s, + bool incomplete_render_area, bool partial_render) +{ + agx_pack(packed, ZLS_CONTROL, zls_control) { + if (z_layout) { + zls_control.z_store_enable = + attach_z->storeOp == VK_ATTACHMENT_STORE_OP_STORE || + attach_z->resolveMode != VK_RESOLVE_MODE_NONE || partial_render; + + zls_control.z_load_enable = + attach_z->loadOp == VK_ATTACHMENT_LOAD_OP_LOAD || partial_render || + incomplete_render_area; + + if (ail_is_compressed(z_layout)) { + zls_control.z_compress_1 = true; + zls_control.z_compress_2 = true; + } + + if (z_layout->format == PIPE_FORMAT_Z16_UNORM) { + zls_control.z_format = AGX_ZLS_FORMAT_16; + } else { + zls_control.z_format = AGX_ZLS_FORMAT_32F; + } + } + + if (s_layout) { + /* TODO: + * Fail + * dEQP-VK.renderpass.dedicated_allocation.formats.d32_sfloat_s8_uint.input.dont_care.store.self_dep_clear_draw_use_input_aspect + * without the force + * .. maybe a VkRenderPass emulation bug. 
+ */ + zls_control.s_store_enable = + attach_s->storeOp == VK_ATTACHMENT_STORE_OP_STORE || + attach_s->resolveMode != VK_RESOLVE_MODE_NONE || partial_render || + true; + + zls_control.s_load_enable = + attach_s->loadOp == VK_ATTACHMENT_LOAD_OP_LOAD || partial_render || + incomplete_render_area; + + if (ail_is_compressed(s_layout)) { + zls_control.s_compress_1 = true; + zls_control.s_compress_2 = true; + } + } + } +} + +VKAPI_ATTR void VKAPI_CALL +hk_CmdBeginRendering(VkCommandBuffer commandBuffer, + const VkRenderingInfo *pRenderingInfo) +{ + VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer); + struct hk_rendering_state *render = &cmd->state.gfx.render; + + memset(render, 0, sizeof(*render)); + + render->flags = pRenderingInfo->flags; + render->area = pRenderingInfo->renderArea; + render->view_mask = pRenderingInfo->viewMask; + render->layer_count = pRenderingInfo->layerCount; + render->tilebuffer.nr_samples = 0; + + const uint32_t layer_count = render->view_mask + ? util_last_bit(render->view_mask) + : render->layer_count; + + render->color_att_count = pRenderingInfo->colorAttachmentCount; + for (uint32_t i = 0; i < render->color_att_count; i++) { + hk_attachment_init(&render->color_att[i], + &pRenderingInfo->pColorAttachments[i]); + } + + hk_attachment_init(&render->depth_att, pRenderingInfo->pDepthAttachment); + hk_attachment_init(&render->stencil_att, pRenderingInfo->pStencilAttachment); + + for (uint32_t i = 0; i < render->color_att_count; i++) { + hk_merge_render_iview(render, render->color_att[i].iview); + } + + hk_merge_render_iview(render, + render->depth_att.iview ?: render->stencil_att.iview); + + /* Infer for attachmentless. samples is inferred at draw-time. */ + render->cr.width = + MAX2(render->cr.width, render->area.offset.x + render->area.extent.width); + + render->cr.height = MAX2(render->cr.height, + render->area.offset.y + render->area.extent.height); + + render->cr.layers = layer_count; + + /* Choose a tilebuffer layout given the framebuffer key */ + enum pipe_format formats[HK_MAX_RTS] = {0}; + for (unsigned i = 0; i < render->color_att_count; ++i) { + formats[i] = vk_format_to_pipe_format(render->color_att[i].vk_format); + } + + /* For now, we force layered=true since it makes compatibility problems way + * easier. + */ + render->tilebuffer = agx_build_tilebuffer_layout( + formats, render->color_att_count, render->tilebuffer.nr_samples, true); + + hk_cmd_buffer_dirty_render_pass(cmd); + + /* Determine whether the render area is complete, enabling us to use a + * fast-clear. + * + * TODO: If it is incomplete but tile aligned, it should be possibly to fast + * clear with the appropriate settings. This is critical for performance. 
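+    *
+    * "Complete" here means the area starts at (0, 0), covers the full
+    * cr.width x cr.height, and (with multiview) the view mask covers every
+    * layer.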
+ */ + bool incomplete_render_area = + render->area.offset.x > 0 || render->area.offset.y > 0 || + render->area.extent.width < render->cr.width || + render->area.extent.height < render->cr.height || + (render->view_mask && + render->view_mask != BITFIELD64_MASK(render->cr.layers)); + + render->cr.bg.main = hk_build_bg_eot(cmd, pRenderingInfo, false, false, + incomplete_render_area); + render->cr.bg.partial = + hk_build_bg_eot(cmd, pRenderingInfo, false, true, incomplete_render_area); + + render->cr.eot.main = + hk_build_bg_eot(cmd, pRenderingInfo, true, false, incomplete_render_area); + render->cr.eot.partial = render->cr.eot.main; + + render->cr.isp_bgobjvals = 0x300; + + const VkRenderingAttachmentInfo *attach_z = pRenderingInfo->pDepthAttachment; + const VkRenderingAttachmentInfo *attach_s = + pRenderingInfo->pStencilAttachment; + + render->cr.iogpu_unk_214 = 0xc000; + + struct ail_layout *z_layout = NULL, *s_layout = NULL; + + if (attach_z != NULL && attach_z != VK_NULL_HANDLE && attach_z->imageView) { + struct hk_image_view *view = render->depth_att.iview; + struct hk_image *image = + container_of(view->vk.image, struct hk_image, vk); + + z_layout = &image->planes[0].layout; + + unsigned level = view->vk.base_mip_level; + unsigned first_layer = view->vk.base_array_layer; + + const struct util_format_description *desc = + util_format_description(vk_format_to_pipe_format(view->vk.format)); + + assert(desc->format == PIPE_FORMAT_Z32_FLOAT || + desc->format == PIPE_FORMAT_Z16_UNORM || + desc->format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT); + + render->cr.depth.buffer = + hk_image_base_address(image, 0) + + ail_get_layer_level_B(z_layout, first_layer, level); + + /* Main stride in pages */ + assert((z_layout->depth_px == 1 || + is_aligned(z_layout->layer_stride_B, AIL_PAGESIZE)) && + "Page aligned Z layers"); + + unsigned stride_pages = z_layout->layer_stride_B / AIL_PAGESIZE; + render->cr.depth.stride = ((stride_pages - 1) << 14) | 1; + + assert(z_layout->tiling != AIL_TILING_LINEAR && "must tile"); + + if (ail_is_compressed(z_layout)) { + render->cr.depth.meta = + hk_image_base_address(image, 0) + z_layout->metadata_offset_B + + (first_layer * z_layout->compression_layer_stride_B) + + z_layout->level_offsets_compressed_B[level]; + + /* Meta stride in cache lines */ + assert( + is_aligned(z_layout->compression_layer_stride_B, AIL_CACHELINE) && + "Cacheline aligned Z meta layers"); + + unsigned stride_lines = + z_layout->compression_layer_stride_B / AIL_CACHELINE; + render->cr.depth.meta_stride = (stride_lines - 1) << 14; + } + + float clear_depth = attach_z->clearValue.depthStencil.depth; + + if (z_layout->format == PIPE_FORMAT_Z16_UNORM) { + render->cr.isp_bgobjdepth = _mesa_float_to_unorm(clear_depth, 16); + render->cr.iogpu_unk_214 |= 0x40000; + } else { + render->cr.isp_bgobjdepth = fui(clear_depth); + } + } + + if (attach_s != NULL && attach_s != VK_NULL_HANDLE && attach_s->imageView) { + struct hk_image_view *view = render->stencil_att.iview; + struct hk_image *image = + container_of(view->vk.image, struct hk_image, vk); + + /* Stencil is always the last plane (possibly the only plane) */ + unsigned plane = image->plane_count - 1; + s_layout = &image->planes[plane].layout; + assert(s_layout->format == PIPE_FORMAT_S8_UINT); + + unsigned level = view->vk.base_mip_level; + unsigned first_layer = view->vk.base_array_layer; + + render->cr.stencil.buffer = + hk_image_base_address(image, plane) + + ail_get_layer_level_B(s_layout, first_layer, level); + + /* Main stride in pages */ + 
assert((s_layout->depth_px == 1 || + is_aligned(s_layout->layer_stride_B, AIL_PAGESIZE)) && + "Page aligned S layers"); + unsigned stride_pages = s_layout->layer_stride_B / AIL_PAGESIZE; + render->cr.stencil.stride = ((stride_pages - 1) << 14) | 1; + + if (ail_is_compressed(s_layout)) { + render->cr.stencil.meta = + hk_image_base_address(image, plane) + s_layout->metadata_offset_B + + (first_layer * s_layout->compression_layer_stride_B) + + s_layout->level_offsets_compressed_B[level]; + + /* Meta stride in cache lines */ + assert( + is_aligned(s_layout->compression_layer_stride_B, AIL_CACHELINE) && + "Cacheline aligned S meta layers"); + + unsigned stride_lines = + s_layout->compression_layer_stride_B / AIL_CACHELINE; + + render->cr.stencil.meta_stride = (stride_lines - 1) << 14; + } + + render->cr.isp_bgobjvals |= attach_s->clearValue.depthStencil.stencil; + } + + hk_pack_zls_control(&render->cr.zls_control, z_layout, s_layout, attach_z, + attach_s, incomplete_render_area, false); + + hk_pack_zls_control(&render->cr.zls_control_partial, z_layout, s_layout, + attach_z, attach_s, incomplete_render_area, true); + + /* If multiview is disabled, always read 0. If multiview is enabled, + * hk_set_view_index will dirty the root each draw. + */ + cmd->state.gfx.descriptors.root.draw.view_index = 0; + cmd->state.gfx.descriptors.root_dirty = true; + + if (render->flags & VK_RENDERING_RESUMING_BIT) + return; + + /* The first control stream of the render pass is special since it gets + * the clears. Create it and swap in the clear. + */ + assert(!cmd->current_cs.gfx && "not already in a render pass"); + struct hk_cs *cs = hk_cmd_buffer_get_cs(cmd, false /* compute */); + if (!cs) + return; + + cs->cr.bg.main = render->cr.bg.main; + cs->cr.zls_control = render->cr.zls_control; + + /* Reordering barrier for post-gfx, in case we had any. */ + hk_cmd_buffer_end_compute_internal(&cmd->current_cs.post_gfx); + + /* Don't reorder compute across render passes. + * + * TODO: Check if this is necessary if the proper PipelineBarriers are + * handled... there may be CTS bugs... + */ + hk_cmd_buffer_end_compute(cmd); + + if (incomplete_render_area) { + uint32_t clear_count = 0; + VkClearAttachment clear_att[HK_MAX_RTS + 1]; + for (uint32_t i = 0; i < pRenderingInfo->colorAttachmentCount; i++) { + const VkRenderingAttachmentInfo *att_info = + &pRenderingInfo->pColorAttachments[i]; + if (att_info->imageView == VK_NULL_HANDLE || + att_info->loadOp != VK_ATTACHMENT_LOAD_OP_CLEAR) + continue; + + clear_att[clear_count++] = (VkClearAttachment){ + .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT, + .colorAttachment = i, + .clearValue = att_info->clearValue, + }; + } + + clear_att[clear_count] = (VkClearAttachment){ + .aspectMask = 0, + }; + + if (attach_z && attach_z->imageView != VK_NULL_HANDLE && + attach_z->loadOp == VK_ATTACHMENT_LOAD_OP_CLEAR) { + clear_att[clear_count].aspectMask |= VK_IMAGE_ASPECT_DEPTH_BIT; + clear_att[clear_count].clearValue.depthStencil.depth = + attach_z->clearValue.depthStencil.depth; + } + + if (attach_s != NULL && attach_s->imageView != VK_NULL_HANDLE && + attach_s->loadOp == VK_ATTACHMENT_LOAD_OP_CLEAR) { + clear_att[clear_count].aspectMask |= VK_IMAGE_ASPECT_STENCIL_BIT; + clear_att[clear_count].clearValue.depthStencil.stencil = + attach_s->clearValue.depthStencil.stencil; + } + + if (clear_att[clear_count].aspectMask != 0) + clear_count++; + + if (clear_count > 0) { + const VkClearRect clear_rect = { + .rect = render->area, + .baseArrayLayer = 0, + .layerCount = render->view_mask ? 
1 : render->layer_count, + }; + + hk_CmdClearAttachments(hk_cmd_buffer_to_handle(cmd), clear_count, + clear_att, 1, &clear_rect); + } + } +} + +VKAPI_ATTR void VKAPI_CALL +hk_CmdEndRendering(VkCommandBuffer commandBuffer) +{ + VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer); + struct hk_rendering_state *render = &cmd->state.gfx.render; + + hk_cmd_buffer_end_graphics(cmd); + + bool need_resolve = false; + + /* Translate render state back to VK for meta */ + VkRenderingAttachmentInfo vk_color_att[HK_MAX_RTS]; + for (uint32_t i = 0; i < render->color_att_count; i++) { + if (render->color_att[i].resolve_mode != VK_RESOLVE_MODE_NONE) + need_resolve = true; + + vk_color_att[i] = (VkRenderingAttachmentInfo){ + .sType = VK_STRUCTURE_TYPE_RENDERING_ATTACHMENT_INFO, + .imageView = hk_image_view_to_handle(render->color_att[i].iview), + .imageLayout = VK_IMAGE_LAYOUT_GENERAL, + .resolveMode = render->color_att[i].resolve_mode, + .resolveImageView = + hk_image_view_to_handle(render->color_att[i].resolve_iview), + .resolveImageLayout = VK_IMAGE_LAYOUT_GENERAL, + }; + } + + const VkRenderingAttachmentInfo vk_depth_att = { + .sType = VK_STRUCTURE_TYPE_RENDERING_ATTACHMENT_INFO, + .imageView = hk_image_view_to_handle(render->depth_att.iview), + .imageLayout = VK_IMAGE_LAYOUT_GENERAL, + .resolveMode = render->depth_att.resolve_mode, + .resolveImageView = + hk_image_view_to_handle(render->depth_att.resolve_iview), + .resolveImageLayout = VK_IMAGE_LAYOUT_GENERAL, + }; + if (render->depth_att.resolve_mode != VK_RESOLVE_MODE_NONE) + need_resolve = true; + + const VkRenderingAttachmentInfo vk_stencil_att = { + .sType = VK_STRUCTURE_TYPE_RENDERING_ATTACHMENT_INFO, + .imageView = hk_image_view_to_handle(render->stencil_att.iview), + .imageLayout = VK_IMAGE_LAYOUT_GENERAL, + .resolveMode = render->stencil_att.resolve_mode, + .resolveImageView = + hk_image_view_to_handle(render->stencil_att.resolve_iview), + .resolveImageLayout = VK_IMAGE_LAYOUT_GENERAL, + }; + if (render->stencil_att.resolve_mode != VK_RESOLVE_MODE_NONE) + need_resolve = true; + + const VkRenderingInfo vk_render = { + .sType = VK_STRUCTURE_TYPE_RENDERING_INFO, + .renderArea = render->area, + .layerCount = render->layer_count, + .viewMask = render->view_mask, + .colorAttachmentCount = render->color_att_count, + .pColorAttachments = vk_color_att, + .pDepthAttachment = &vk_depth_att, + .pStencilAttachment = &vk_stencil_att, + }; + + if (render->flags & VK_RENDERING_SUSPENDING_BIT) + need_resolve = false; + + memset(render, 0, sizeof(*render)); + + if (need_resolve) { + hk_meta_resolve_rendering(cmd, &vk_render); + } +} + +static uint64_t +hk_geometry_state(struct hk_cmd_buffer *cmd) +{ + struct hk_device *dev = hk_cmd_buffer_device(cmd); + + /* We tie heap allocation to geometry state allocation, so allocate now. */ + if (unlikely(!dev->heap)) { + size_t size = 128 * 1024 * 1024; + dev->heap = agx_bo_create(&dev->dev, size, 0, "Geometry heap"); + + /* The geometry state buffer is initialized here and then is treated by + * the CPU as rodata, even though the GPU uses it for scratch internally. 
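+       *
+       * Only heap and heap_size are seeded from the CPU; heap_bottom is
+       * bumped by the GPU and reset per command buffer with hk_queue_write()
+       * below.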
+ */ + off_t off = dev->rodata.geometry_state - dev->rodata.bo->ptr.gpu; + struct agx_geometry_state *map = dev->rodata.bo->ptr.cpu + off; + + *map = (struct agx_geometry_state){ + .heap = dev->heap->ptr.gpu, + .heap_size = size, + }; + } + + /* We need to free all allocations after each command buffer execution */ + if (!cmd->uses_heap) { + uint64_t addr = dev->rodata.geometry_state; + + /* Zeroing the allocated index frees everything */ + hk_queue_write(cmd, + addr + offsetof(struct agx_geometry_state, heap_bottom), 0, + true /* after gfx */); + + cmd->uses_heap = true; + } + + return dev->rodata.geometry_state; +} + +static uint64_t +hk_upload_gsi_params(struct hk_cmd_buffer *cmd, struct hk_draw draw) +{ + struct hk_device *dev = hk_cmd_buffer_device(cmd); + struct hk_descriptor_state *desc = &cmd->state.gfx.descriptors; + struct hk_graphics_state *gfx = &cmd->state.gfx; + struct hk_shader *vs = hk_bound_sw_vs_before_gs(gfx); + + unsigned index_size_B = + draw.indexed ? agx_index_size_to_B(draw.index_size) : 0; + + uint64_t vb; + if (cmd->state.gfx.shaders[MESA_SHADER_TESS_EVAL]) { + assert(index_size_B == 4); + + vb = desc->root.draw.tess_params + + offsetof(struct libagx_tess_args, tes_buffer); + } else { + vb = desc->root.root_desc_addr + + offsetof(struct hk_root_descriptor_table, draw.vertex_output_buffer); + } + + struct agx_gs_setup_indirect_params gsi = { + .index_buffer = draw.index.addr, + .index_size_B = index_size_B, + .index_buffer_range_el = draw.index.range / index_size_B, + .zero_sink = dev->rodata.zero_sink, + .draw = draw.b.ptr, + .vertex_buffer = vb, + .ia = desc->root.draw.input_assembly, + .geom = desc->root.draw.geometry_params, + .vs_outputs = vs->b.info.outputs, + }; + + return hk_pool_upload(cmd, &gsi, sizeof(gsi), 8); +} + +static uint64_t +hk_upload_ia_params(struct hk_cmd_buffer *cmd, struct hk_draw draw) +{ + struct hk_device *dev = hk_cmd_buffer_device(cmd); + assert(!draw.b.indirect && "indirect params written by GPU"); + + struct agx_ia_state ia = {.verts_per_instance = draw.b.count[0]}; + + if (draw.indexed) { + unsigned index_size_B = agx_index_size_to_B(draw.index_size); + unsigned range_el = draw.index.range / index_size_B; + + ia.index_buffer = + libagx_index_buffer(draw.index.addr, range_el, draw.start, + index_size_B, dev->rodata.zero_sink); + + ia.index_buffer_range_el = + libagx_index_buffer_range_el(range_el, draw.start); + } + + return hk_pool_upload(cmd, &ia, sizeof(ia), 8); +} + +static enum mesa_prim +hk_gs_in_prim(struct hk_cmd_buffer *cmd) +{ + struct vk_dynamic_graphics_state *dyn = &cmd->vk.dynamic_graphics_state; + struct hk_graphics_state *gfx = &cmd->state.gfx; + struct hk_api_shader *tes = gfx->shaders[MESA_SHADER_TESS_EVAL]; + + if (tes != NULL) + return tes->variants[HK_GS_VARIANT_RAST].info.ts.out_prim; + else + return vk_conv_topology(dyn->ia.primitive_topology); +} + +static enum mesa_prim +hk_rast_prim(struct hk_cmd_buffer *cmd) +{ + struct hk_graphics_state *gfx = &cmd->state.gfx; + struct hk_api_shader *gs = gfx->shaders[MESA_SHADER_GEOMETRY]; + + if (gs != NULL) + return gs->variants[HK_GS_VARIANT_RAST].info.gs.out_prim; + else + return hk_gs_in_prim(cmd); +} + +static uint64_t +hk_upload_geometry_params(struct hk_cmd_buffer *cmd, struct hk_draw draw) +{ + struct hk_descriptor_state *desc = &cmd->state.gfx.descriptors; + struct vk_dynamic_graphics_state *dyn = &cmd->vk.dynamic_graphics_state; + struct hk_graphics_state *gfx = &cmd->state.gfx; + struct hk_api_shader *gs = gfx->shaders[MESA_SHADER_GEOMETRY]; + struct hk_shader 
*fs = hk_only_variant(gfx->shaders[MESA_SHADER_FRAGMENT]); + + bool rast_disc = dyn->rs.rasterizer_discard_enable; + struct hk_shader *count = hk_count_gs_variant(gs, rast_disc); + + /* XXX: We should deduplicate this logic */ + bool restart = (draw.indexed && draw.restart); + bool indirect = + draw.b.indirect || gfx->shaders[MESA_SHADER_TESS_EVAL] || restart; + enum mesa_prim mode = hk_gs_in_prim(cmd); + + if (restart) { + mode = u_decomposed_prim(mode); + } + + struct agx_geometry_params params = { + .state = hk_geometry_state(cmd), + .indirect_desc = cmd->geom_indirect, + .flat_outputs = fs ? fs->info.fs.interp.flat : 0, + .input_topology = mode, + + /* Overriden by the indirect setup kernel. As tess->GS is always indirect, + * we can assume here that we're VS->GS. + */ + .input_buffer = desc->root.draw.vertex_output_buffer, + .input_mask = desc->root.draw.vertex_outputs, + }; + + if (gfx->xfb_enabled) { + for (unsigned i = 0; i < ARRAY_SIZE(gfx->xfb); ++i) { + params.xfb_base_original[i] = gfx->xfb[i].addr; + params.xfb_size[i] = gfx->xfb[i].range; + params.xfb_offs_ptrs[i] = gfx->xfb_offsets + i * sizeof(uint32_t); + } + } + + for (unsigned i = 0; i < ARRAY_SIZE(gfx->xfb_query); ++i) { + uint64_t q = gfx->xfb_query[i]; + + if (q) { + params.xfb_prims_generated_counter[i] = q; + params.prims_generated_counter[i] = q + sizeof(uint64_t); + } + } + + /* Calculate input primitive count for direct draws, and allocate the vertex + * & count buffers. GPU calculates and allocates for indirect draws. + */ + unsigned count_buffer_stride = count->info.gs.count_words * 4; + + if (indirect) { + params.count_buffer_stride = count_buffer_stride; + params.vs_grid[2] = params.gs_grid[2] = 1; + } else { + uint32_t verts = draw.b.count[0], instances = draw.b.count[1]; + + params.vs_grid[0] = verts; + params.gs_grid[0] = u_decomposed_prims_for_vertices(mode, verts); + + params.primitives_log2 = util_logbase2_ceil(params.gs_grid[0]); + params.input_primitives = params.gs_grid[0] * instances; + + unsigned size = params.input_primitives * count_buffer_stride; + if (size) { + params.count_buffer = hk_pool_alloc(cmd, size, 4).gpu; + } + } + + desc->root_dirty = true; + return hk_pool_upload(cmd, ¶ms, sizeof(params), 8); +} + +/* + * Tessellation has a fast path where the tessellator generates a VDM Index List + * command per patch, as well as a slow path using prefix sums to generate a + * single combined API draw. We need the latter if tessellation is fed into + * another software stage (geometry shading), or if we need accurate primitive + * IDs in the linked fragment shader (since that would require a prefix sum + * anyway). 
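+ *
+ * hk_tess_needs_prefix_sum() below checks exactly those two conditions; the
+ * with_counts path in hk_upload_tess_params() allocates count buffers and a
+ * single combined draw, while the fast path emits raw per-patch VDM commands
+ * that the main stream links to.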
+ */ +static bool +hk_tess_needs_prefix_sum(struct hk_cmd_buffer *cmd) +{ + struct hk_graphics_state *gfx = &cmd->state.gfx; + + return gfx->shaders[MESA_SHADER_GEOMETRY] || gfx->generate_primitive_id; +} + +static uint64_t +hk_upload_tess_params(struct hk_cmd_buffer *cmd, struct hk_draw draw) +{ + struct hk_device *dev = hk_cmd_buffer_device(cmd); + struct vk_dynamic_graphics_state *dyn = &cmd->vk.dynamic_graphics_state; + struct hk_graphics_state *gfx = &cmd->state.gfx; + struct hk_shader *tcs = hk_only_variant(gfx->shaders[MESA_SHADER_TESS_CTRL]); + struct hk_shader *tes = hk_any_variant(gfx->shaders[MESA_SHADER_TESS_EVAL]); + + struct libagx_tess_args args = { + .heap = hk_geometry_state(cmd), + .tcs_stride_el = tcs->info.tcs.output_stride / 4, + .statistic = hk_pipeline_stat_addr( + cmd, + VK_QUERY_PIPELINE_STATISTIC_TESSELLATION_EVALUATION_SHADER_INVOCATIONS_BIT), + + .input_patch_size = dyn->ts.patch_control_points, + .output_patch_size = tcs->info.tcs.output_patch_size, + .tcs_patch_constants = tcs->info.tcs.nr_patch_outputs, + .tcs_per_vertex_outputs = tcs->info.tcs.per_vertex_outputs, + }; + + bool with_counts = hk_tess_needs_prefix_sum(cmd); + + /* This assumes !with_counts, if we have counts it's only one draw */ + uint32_t draw_stride_el = tes->info.ts.point_mode ? 4 : 6; + size_t draw_stride_B = draw_stride_el * sizeof(uint32_t); + + /* heap is allocated by hk_geometry_state */ + args.patch_coord_buffer = dev->heap->ptr.gpu; + + if (!draw.b.indirect) { + unsigned in_patches = draw.b.count[0] / args.input_patch_size; + if (in_patches == 0) + unreachable("todo: drop the draw?"); + + unsigned unrolled_patches = in_patches * draw.b.count[1]; + + uint32_t alloc = 0; + uint32_t tcs_out_offs = alloc; + alloc += unrolled_patches * args.tcs_stride_el * 4 * 32; + + uint32_t patch_coord_offs = alloc; + alloc += unrolled_patches * 4 * 32; + + uint32_t count_offs = alloc; + if (with_counts) + alloc += unrolled_patches * sizeof(uint32_t) * 32; + + uint32_t draw_offs = alloc; + + if (with_counts) { + /* Single API draw */ + alloc += 5 * sizeof(uint32_t); + } else { + /* Padding added because VDM overreads */ + alloc += (draw_stride_B * unrolled_patches) + + (AGX_VDM_BARRIER_LENGTH + 0x800); + } + + struct agx_ptr blob = hk_pool_alloc(cmd, alloc, 4); + args.tcs_buffer = blob.gpu + tcs_out_offs; + args.patches_per_instance = in_patches; + args.coord_allocs = blob.gpu + patch_coord_offs; + args.nr_patches = unrolled_patches; + args.out_draws = blob.gpu + draw_offs; + + gfx->tess_out_draws = args.out_draws; + + if (with_counts) { + args.counts = blob.gpu + count_offs; + } else { + /* Arrange so we return after all generated draws */ + uint8_t *ret = (uint8_t *)blob.cpu + draw_offs + + (draw_stride_B * unrolled_patches); + + agx_pack(ret, VDM_BARRIER, cfg) { + cfg.returns = true; + } + } + } else { + unreachable("todo: indirect with tess"); +#if 0 + args.tcs_statistic = agx_get_query_address( + batch, ctx->pipeline_statistics[PIPE_STAT_QUERY_HS_INVOCATIONS]); + + args.indirect = agx_indirect_buffer_ptr(batch, indirect); + + /* Allocate 3x indirect global+local grids for VS/TCS/tess */ + uint32_t grid_stride = sizeof(uint32_t) * 6; + args.grids = agx_pool_alloc_aligned(&batch->pool, grid_stride * 3, 4).gpu; + + vs_grid = agx_grid_indirect_local(args.grids + 0 * grid_stride); + tcs_grid = agx_grid_indirect_local(args.grids + 1 * grid_stride); + tess_grid = agx_grid_indirect_local(args.grids + 2 * grid_stride); + + args.vertex_outputs = ctx->vs->b.info.outputs; + args.vertex_output_buffer_ptr = + 
agx_pool_alloc_aligned(&batch->pool, 8, 8).gpu; + + batch->uniforms.vertex_output_buffer_ptr = args.vertex_output_buffer_ptr; + + if (with_counts) { + args.out_draws = agx_pool_alloc_aligned_with_bo( + &batch->pool, draw_stride, 4, &draw_bo) + .gpu; + } else { + unreachable("need an extra indirection..."); + } +#endif + } + + return hk_pool_upload(cmd, &args, sizeof(args), 8); +} + +static struct hk_api_shader * +hk_build_meta_shader_locked(struct hk_device *dev, struct hk_internal_key *key, + hk_internal_builder_t builder) +{ + /* Try to get the cached shader */ + struct hash_entry *ent = _mesa_hash_table_search(dev->kernels.ht, key); + if (ent) + return ent->data; + + nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, + &agx_nir_options, NULL); + builder(&b, key->key); + + const struct vk_pipeline_robustness_state rs = { + .images = VK_PIPELINE_ROBUSTNESS_IMAGE_BEHAVIOR_DISABLED_EXT, + .storage_buffers = VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_DISABLED_EXT, + .uniform_buffers = VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_DISABLED_EXT, + .vertex_inputs = VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_DISABLED_EXT, + }; + + struct vk_shader_compile_info info = { + .stage = b.shader->info.stage, + .nir = b.shader, + .robustness = &rs, + }; + + /* We need to link libagx and assign shared before preprocessing, matching + * what the driver would otherwise produce. + */ + agx_link_libagx(b.shader, dev->dev.libagx); + + if (info.stage == MESA_SHADER_COMPUTE) { + NIR_PASS(_, b.shader, nir_lower_vars_to_explicit_types, + nir_var_mem_shared, glsl_get_cl_type_size_align); + + /* Commit to the layout so we don't clobber later */ + b.shader->info.shared_memory_explicit_layout = true; + + NIR_PASS(_, b.shader, nir_lower_explicit_io, nir_var_mem_shared, + nir_address_format_62bit_generic); + } + + hk_preprocess_nir_internal(dev->vk.physical, b.shader); + + struct hk_api_shader *s; + if (hk_compile_shader(dev, &info, NULL, NULL, &s) != VK_SUCCESS) + return NULL; + + /* ..and cache it before we return. The key is on the stack right now, so + * clone it before using it as a hash table key. The clone is logically owned + * by the hash table. + */ + size_t total_key_size = sizeof(*key) + key->key_size; + void *cloned_key = ralloc_memdup(dev->kernels.ht, key, total_key_size); + + _mesa_hash_table_insert(dev->kernels.ht, cloned_key, s); + return s; +} + +struct hk_api_shader * +hk_meta_shader(struct hk_device *dev, hk_internal_builder_t builder, void *data, + size_t data_size) +{ + size_t total_key_size = sizeof(struct hk_internal_key) + data_size; + + struct hk_internal_key *key = alloca(total_key_size); + key->builder = builder; + key->key_size = data_size; + + if (data_size) + memcpy(key->key, data, data_size); + + simple_mtx_lock(&dev->kernels.lock); + struct hk_api_shader *s = hk_build_meta_shader_locked(dev, key, builder); + simple_mtx_unlock(&dev->kernels.lock); + + return s; +} + +static struct hk_draw +hk_draw_without_restart(struct hk_cmd_buffer *cmd, struct hk_cs *cs, + struct hk_draw draw, uint32_t draw_count) +{ + struct hk_device *dev = hk_cmd_buffer_device(cmd); + struct hk_graphics_state *gfx = &cmd->state.gfx; + struct vk_dynamic_graphics_state *dyn = &cmd->vk.dynamic_graphics_state; + + perf_debug(dev, "Unrolling primitive restart due to GS/XFB"); + + /* The unroll kernel assumes an indirect draw. 
Synthesize one if needed */ + if (!draw.b.indirect) { + uint32_t desc[5] = {draw.b.count[0], draw.b.count[1], draw.start, + draw.index_bias, draw.start_instance}; + + draw = + hk_draw_indexed_indirect(hk_pool_upload(cmd, desc, sizeof(desc), 4), + draw.index, draw.index_size, true); + } + + /* Next, we unroll the index buffer used by the indirect draw */ + struct agx_unroll_restart_key key = { + .prim = vk_conv_topology(dyn->ia.primitive_topology), + .index_size_B = agx_index_size_to_B(draw.index_size), + }; + + struct agx_restart_unroll_params ia = { + .heap = hk_geometry_state(cmd), + .index_buffer = draw.index.addr, + .count = hk_pool_upload(cmd, &draw_count, sizeof(uint32_t), 4), + .draws = draw.b.ptr, + .out_draws = hk_pool_alloc(cmd, 5 * sizeof(uint32_t) * draw_count, 4).gpu, + .max_draws = 1 /* TODO: MDI */, + .restart_index = gfx->index.restart, + .index_buffer_size_el = draw.index.range / key.index_size_B, + .flatshade_first = + dyn->rs.provoking_vertex == VK_PROVOKING_VERTEX_MODE_FIRST_VERTEX_EXT, + .zero_sink = dev->rodata.zero_sink, + }; + + struct hk_shader *s = + hk_meta_kernel(dev, agx_nir_unroll_restart, &key, sizeof(key)); + + uint64_t params = hk_pool_upload(cmd, &ia, sizeof(ia), 8); + uint32_t usc = hk_upload_usc_words_kernel(cmd, s, &params, sizeof(params)); + hk_dispatch_with_usc(dev, cs, s, usc, hk_grid(1024 * draw_count, 1, 1), + hk_grid(1024, 1, 1)); + + struct hk_addr_range out_index = { + .addr = dev->heap->ptr.gpu, + .range = dev->heap->size, + }; + + return hk_draw_indexed_indirect(ia.out_draws, out_index, draw.index_size, + false /* restart */); +} + +static struct hk_draw +hk_launch_gs_prerast(struct hk_cmd_buffer *cmd, struct hk_cs *cs, + struct hk_draw draw) +{ + struct hk_device *dev = hk_cmd_buffer_device(cmd); + struct hk_graphics_state *gfx = &cmd->state.gfx; + struct hk_descriptor_state *desc = &cmd->state.gfx.descriptors; + struct hk_api_shader *gs = gfx->shaders[MESA_SHADER_GEOMETRY]; + struct hk_grid grid_vs, grid_gs; + + struct vk_dynamic_graphics_state *dyn = &cmd->vk.dynamic_graphics_state; + bool rast_disc = dyn->rs.rasterizer_discard_enable; + + hk_ensure_cs_has_space(cmd, cs, 0x2000 /*XXX*/); + + struct hk_shader *vs = hk_bound_sw_vs_before_gs(gfx); + struct hk_shader *main = hk_main_gs_variant(gs, rast_disc); + struct hk_shader *count = hk_count_gs_variant(gs, rast_disc); + struct hk_shader *pre_gs = hk_pre_gs_variant(gs, rast_disc); + + unsigned count_words = count->info.gs.count_words; + + if (false /* TODO */) + perf_debug(dev, "Transform feedback"); + else if (count_words) + perf_debug(dev, "Geometry shader with counts"); + else + perf_debug(dev, "Geometry shader without counts"); + + enum mesa_prim mode = hk_gs_in_prim(cmd); + + if (draw.indexed && draw.restart) { + draw = hk_draw_without_restart(cmd, cs, draw, 1); + mode = u_decomposed_prim(mode); + } + + /* Setup grids */ + if (draw.b.indirect) { + struct agx_gs_setup_indirect_key key = {.prim = mode}; + + struct hk_shader *gsi = + hk_meta_kernel(dev, agx_nir_gs_setup_indirect, &key, sizeof(key)); + + uint64_t push = hk_upload_gsi_params(cmd, draw); + uint32_t usc = hk_upload_usc_words_kernel(cmd, gsi, &push, sizeof(push)); + + hk_dispatch_with_usc(dev, cs, gsi, usc, hk_grid(1, 1, 1), + hk_grid(1, 1, 1)); + + uint64_t geometry_params = desc->root.draw.geometry_params; + grid_vs = hk_grid_indirect(geometry_params + + offsetof(struct agx_geometry_params, vs_grid)); + + grid_gs = hk_grid_indirect(geometry_params + + offsetof(struct agx_geometry_params, gs_grid)); + } else { + grid_vs = grid_gs = 
draw.b; + grid_gs.count[0] = u_decomposed_prims_for_vertices(mode, draw.b.count[0]); + } + + /* Launch the vertex shader first */ + hk_reserve_scratch(cmd, cs, vs); + hk_dispatch_with_usc(dev, cs, vs, + hk_upload_usc_words(cmd, vs, + vs->info.stage == MESA_SHADER_VERTEX + ? gfx->linked[MESA_SHADER_VERTEX] + : vs->only_linked), + grid_vs, hk_grid(1, 1, 1)); + + /* If we need counts, launch the count shader and prefix sum the results. */ + if (count_words) { + hk_dispatch_with_local_size(cmd, cs, count, grid_gs, hk_grid(1, 1, 1)); + + struct hk_api_shader *prefix_sum = hk_meta_shader( + dev, agx_nir_prefix_sum_gs, &count_words, sizeof(count_words)); + + /* XXX: hack */ + hk_only_variant(prefix_sum)->info.stage = MESA_SHADER_GEOMETRY; + + hk_dispatch_with_local_size(cmd, cs, hk_only_variant(prefix_sum), + hk_grid(1024 * count_words, 1, 1), + hk_grid(1024, 1, 1)); + } + + /* Pre-GS shader */ + hk_dispatch_with_local_size(cmd, cs, pre_gs, hk_grid(1, 1, 1), + hk_grid(1, 1, 1)); + + /* Pre-rast geometry shader */ + hk_dispatch_with_local_size(cmd, cs, main, grid_gs, hk_grid(1, 1, 1)); + + struct hk_addr_range range = (struct hk_addr_range){ + .addr = dev->heap->ptr.gpu, + .range = dev->heap->size, + }; + + bool restart = cmd->state.gfx.topology != AGX_PRIMITIVE_POINTS; + return hk_draw_indexed_indirect(cmd->geom_indirect, range, + AGX_INDEX_SIZE_U32, restart); +} + +static struct hk_draw +hk_launch_tess(struct hk_cmd_buffer *cmd, struct hk_cs *cs, struct hk_draw draw) +{ + struct hk_device *dev = hk_cmd_buffer_device(cmd); + struct hk_graphics_state *gfx = &cmd->state.gfx; + struct hk_grid grid_vs, grid_tcs, grid_tess; + + struct hk_shader *vs = hk_bound_sw_vs(gfx); + struct hk_shader *tcs = hk_only_variant(gfx->shaders[MESA_SHADER_TESS_CTRL]); + struct hk_shader *tes = hk_any_variant(gfx->shaders[MESA_SHADER_TESS_EVAL]); + + struct vk_dynamic_graphics_state *dyn = &cmd->vk.dynamic_graphics_state; + uint32_t input_patch_size = dyn->ts.patch_control_points; + + hk_ensure_cs_has_space(cmd, cs, 0x2000 /*XXX*/); + + perf_debug(dev, "Tessellation"); + + uint64_t tcs_stat = hk_pipeline_stat_addr( + cmd, VK_QUERY_PIPELINE_STATISTIC_TESSELLATION_CONTROL_SHADER_PATCHES_BIT); + + /* Setup grids */ + if (draw.b.indirect) { + unreachable("todo: indirect tess"); +#if 0 + struct agx_gs_setup_indirect_key key = {.prim = mode}; + + struct hk_shader *gsi = + hk_meta_kernel(dev, agx_nir_gs_setup_indirect, &key, sizeof(key)); + + uint64_t push = hk_upload_gsi_params(cmd, draw); + uint32_t usc = hk_upload_usc_words_kernel(cmd, gsi, &push, sizeof(push)); + + hk_dispatch_with_usc(dev, cs, gsi, usc, hk_grid(1, 1, 1), + hk_grid(1, 1, 1)); + + uint64_t geometry_params = desc->root.draw.geometry_params; + grid_vs = hk_grid_indirect(geometry_params + + offsetof(struct agx_geometry_params, vs_grid)); + + grid_gs = hk_grid_indirect(geometry_params + + offsetof(struct agx_geometry_params, gs_grid)); +#endif + } else { + uint32_t patches = draw.b.count[0] / input_patch_size; + grid_vs = grid_tcs = draw.b; + + grid_tcs.count[0] = patches * tcs->info.tcs.output_patch_size; + grid_tess = hk_grid(patches * draw.b.count[1], 1, 1); + + /* TCS invocation counter increments once per-patch */ + if (tcs_stat) { + perf_debug(dev, "Direct TCS statistic"); + + struct libagx_increment_params args = { + .statistic = tcs_stat, + .delta = patches, + }; + + struct hk_shader *s = + hk_meta_kernel(dev, agx_nir_increment_statistic, NULL, 0); + + uint64_t push = hk_pool_upload(cmd, &args, sizeof(args), 8); + uint32_t usc = 
hk_upload_usc_words_kernel(cmd, s, &push, sizeof(push)); + + hk_dispatch_with_usc(dev, cs, s, usc, hk_grid(1, 1, 1), + hk_grid(1, 1, 1)); + } + } + + /* First launch the VS and TCS */ + hk_reserve_scratch(cmd, cs, vs); + hk_reserve_scratch(cmd, cs, tcs); + + /* XXX perf: grid size */ + hk_dispatch_with_usc( + dev, cs, vs, + hk_upload_usc_words(cmd, vs, gfx->linked[MESA_SHADER_VERTEX]), grid_vs, + hk_grid(64, 1, 1)); + + hk_dispatch_with_usc( + dev, cs, tcs, hk_upload_usc_words(cmd, tcs, tcs->only_linked), grid_tcs, + hk_grid(tcs->info.tcs.output_patch_size, 1, 1)); + + /* TODO indirect */ + + bool with_counts = hk_tess_needs_prefix_sum(cmd); + uint64_t state = gfx->descriptors.root.draw.tess_params; + + /* If the domain is flipped, we need to flip the winding order */ + bool ccw = tes->info.ts.ccw; + ccw ^= dyn->ts.domain_origin == VK_TESSELLATION_DOMAIN_ORIGIN_LOWER_LEFT; + + enum libagx_tess_partitioning partitioning = + tes->info.ts.spacing == TESS_SPACING_EQUAL + ? LIBAGX_TESS_PARTITIONING_INTEGER + : tes->info.ts.spacing == TESS_SPACING_FRACTIONAL_ODD + ? LIBAGX_TESS_PARTITIONING_FRACTIONAL_ODD + : LIBAGX_TESS_PARTITIONING_FRACTIONAL_EVEN; + + enum libagx_tess_output_primitive prim = + tes->info.ts.point_mode ? LIBAGX_TESS_OUTPUT_POINT + : ccw ? LIBAGX_TESS_OUTPUT_TRIANGLE_CCW + : LIBAGX_TESS_OUTPUT_TRIANGLE_CW; + + struct agx_tessellator_key key = { + .prim = tes->info.ts.mode, + .output_primitive = prim, + .partitioning = partitioning, + }; + + if (with_counts) { + perf_debug(dev, "Tessellation with counts"); + + /* Generate counts */ + key.mode = LIBAGX_TESS_MODE_COUNT; + { + struct hk_shader *tess = + hk_meta_kernel(dev, agx_nir_tessellate, &key, sizeof(key)); + + hk_dispatch_with_usc( + dev, cs, tess, + hk_upload_usc_words_kernel(cmd, tess, &state, sizeof(state)), + grid_tess, hk_grid(64, 1, 1)); + } + + /* Prefix sum counts, allocating index buffer space. 
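+ * A single 1024-wide workgroup below computes the sum, so the + * LIBAGX_TESS_MODE_WITH_COUNTS pass that follows knows where each patch + * writes its indices.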
*/ + { + struct hk_shader *sum = + hk_meta_kernel(dev, agx_nir_prefix_sum_tess, NULL, 0); + + hk_dispatch_with_usc( + dev, cs, sum, + hk_upload_usc_words_kernel(cmd, sum, &state, sizeof(state)), + hk_grid(1024, 1, 1), hk_grid(1024, 1, 1)); + } + + key.mode = LIBAGX_TESS_MODE_WITH_COUNTS; + } else { + key.mode = LIBAGX_TESS_MODE_VDM; + } + + /* Now we can tessellate */ + { + struct hk_shader *tess = + hk_meta_kernel(dev, agx_nir_tessellate, &key, sizeof(key)); + + hk_dispatch_with_usc( + dev, cs, tess, + hk_upload_usc_words_kernel(cmd, tess, &state, sizeof(state)), + grid_tess, hk_grid(64, 1, 1)); + } + + struct hk_addr_range range = (struct hk_addr_range){ + .addr = dev->heap->ptr.gpu, + .range = dev->heap->size, + }; + + struct hk_draw out = hk_draw_indexed_indirect(gfx->tess_out_draws, range, + AGX_INDEX_SIZE_U32, false); + out.raw = !with_counts; + return out; +} + +void +hk_cmd_bind_graphics_shader(struct hk_cmd_buffer *cmd, + const gl_shader_stage stage, + struct hk_api_shader *shader) +{ + struct vk_dynamic_graphics_state *dyn = &cmd->vk.dynamic_graphics_state; + + assert(stage < ARRAY_SIZE(cmd->state.gfx.shaders)); + if (cmd->state.gfx.shaders[stage] == shader) + return; + + cmd->state.gfx.shaders[stage] = shader; + cmd->state.gfx.shaders_dirty |= BITFIELD_BIT(stage); + + if (stage == MESA_SHADER_FRAGMENT) { + BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_MS_RASTERIZATION_SAMPLES); + } +} + +static uint32_t +hk_pipeline_bind_group(gl_shader_stage stage) +{ + return stage; +} + +static void +hk_flush_shaders(struct hk_cmd_buffer *cmd) +{ + if (cmd->state.gfx.shaders_dirty == 0) + return; + + /* Map shader types to shaders */ + struct hk_api_shader *type_shader[6] = { + NULL, + }; + uint32_t types_dirty = 0; + + const uint32_t gfx_stages = + BITFIELD_BIT(MESA_SHADER_VERTEX) | BITFIELD_BIT(MESA_SHADER_TESS_CTRL) | + BITFIELD_BIT(MESA_SHADER_TESS_EVAL) | BITFIELD_BIT(MESA_SHADER_GEOMETRY) | + BITFIELD_BIT(MESA_SHADER_FRAGMENT); + + /* Geometry shading overrides the restart index, reemit on rebind */ + if (IS_SHADER_DIRTY(GEOMETRY)) { + cmd->state.gfx.dirty |= HK_DIRTY_INDEX; + } + + u_foreach_bit(stage, cmd->state.gfx.shaders_dirty & gfx_stages) { + /* TODO: compact? */ + uint32_t type = stage; + types_dirty |= BITFIELD_BIT(type); + + /* Only copy non-NULL shaders because mesh/task alias with vertex and + * tessellation stages. + */ + if (cmd->state.gfx.shaders[stage] != NULL) { + assert(type < ARRAY_SIZE(type_shader)); + assert(type_shader[type] == NULL); + type_shader[type] = cmd->state.gfx.shaders[stage]; + } + } + + u_foreach_bit(type, types_dirty) { + struct hk_api_shader *shader = type_shader[type]; + + /* We always map index == type */ + // const uint32_t idx = type; + + if (shader == NULL) + continue; + + /* TODO */ + } + + struct hk_graphics_state *gfx = &cmd->state.gfx; + struct hk_shader *hw_vs = hk_bound_hw_vs(gfx); + struct hk_api_shader *fs = gfx->shaders[MESA_SHADER_FRAGMENT]; + + /* If we have a new VS/FS pair, UVS locations may have changed so need to + * relink. We do this here because there's no dependence on the fast linked + * shaders. + */ + agx_assign_uvs(&gfx->linked_varyings, &hw_vs->info.uvs, + fs ? hk_only_variant(fs)->info.fs.interp.flat : 0, + fs ? 
hk_only_variant(fs)->info.fs.interp.linear : 0); + + struct hk_descriptor_state *desc = &cmd->state.gfx.descriptors; + desc->root_dirty = true; + + for (unsigned i = 0; i < VARYING_SLOT_MAX; ++i) { + desc->root.draw.uvs_index[i] = gfx->linked_varyings.slots[i]; + } +} + +static struct agx_shader_part * +hk_get_prolog_epilog_locked(struct hk_device *dev, struct hk_internal_key *key, + hk_internal_builder_t builder, bool preprocess_nir, + bool stop, unsigned cf_base) +{ + /* Try to get the cached shader */ + struct hash_entry *ent = _mesa_hash_table_search(dev->prolog_epilog.ht, key); + if (ent) + return ent->data; + + nir_builder b = nir_builder_init_simple_shader(0, &agx_nir_options, NULL); + builder(&b, key->key); + + if (preprocess_nir) + agx_preprocess_nir(b.shader, dev->dev.libagx); + + struct agx_shader_key backend_key = { + .needs_g13x_coherency = (dev->dev.params.gpu_generation == 13 && + dev->dev.params.num_clusters_total > 1) || + dev->dev.params.num_dies > 1, + .libagx = dev->dev.libagx, + .secondary = true, + .no_stop = !stop, + }; + + /* We always use dynamic sample shading in the GL driver. Indicate that. */ + if (b.shader->info.stage == MESA_SHADER_FRAGMENT) { + backend_key.fs.cf_base = cf_base; + + if (b.shader->info.fs.uses_sample_shading) + backend_key.fs.inside_sample_loop = true; + } + + struct agx_shader_part *part = + rzalloc(dev->prolog_epilog.ht, struct agx_shader_part); + + agx_compile_shader_nir(b.shader, &backend_key, NULL, part); + + ralloc_free(b.shader); + + /* ..and cache it before we return. The key is on the stack right now, so + * clone it before using it as a hash table key. The clone is logically owned + * by the hash table. + */ + size_t total_key_size = sizeof(*key) + key->key_size; + void *cloned_key = ralloc_memdup(dev->prolog_epilog.ht, key, total_key_size); + + _mesa_hash_table_insert(dev->prolog_epilog.ht, cloned_key, part); + return part; +} + +static struct agx_shader_part * +hk_get_prolog_epilog(struct hk_device *dev, void *data, size_t data_size, + hk_internal_builder_t builder, bool preprocess_nir, + bool stop, unsigned cf_base) +{ + /* Build the meta shader key */ + size_t total_key_size = sizeof(struct hk_internal_key) + data_size; + + struct hk_internal_key *key = alloca(total_key_size); + key->builder = builder; + key->key_size = data_size; + + if (data_size) + memcpy(key->key, data, data_size); + + simple_mtx_lock(&dev->prolog_epilog.lock); + + struct agx_shader_part *part = hk_get_prolog_epilog_locked( + dev, key, builder, preprocess_nir, stop, cf_base); + + simple_mtx_unlock(&dev->prolog_epilog.lock); + return part; +} + +static struct hk_linked_shader * +hk_get_fast_linked_locked_vs(struct hk_device *dev, struct hk_shader *shader, + struct hk_fast_link_key_vs *key) +{ + struct agx_shader_part *prolog = + hk_get_prolog_epilog(dev, &key->prolog, sizeof(key->prolog), + agx_nir_vs_prolog, false, false, 0); + + struct hk_linked_shader *linked = + hk_fast_link(dev, false, shader, prolog, NULL, 0); + + struct hk_fast_link_key *key_clone = + ralloc_memdup(shader->linked.ht, key, sizeof(*key)); + + /* XXX: Fix this higher up the stack */ + linked->b.uses_base_param |= !key->prolog.hw; + + _mesa_hash_table_insert(shader->linked.ht, key_clone, linked); + return linked; +} + +static void +build_fs_prolog(nir_builder *b, const void *key) +{ + agx_nir_fs_prolog(b, key); + + /* Lower load_stat_query_address_agx, needed for FS statistics */ + NIR_PASS(_, b->shader, hk_lower_uvs_index, 0); +} + +static struct hk_linked_shader * 
+hk_get_fast_linked_locked_fs(struct hk_device *dev, struct hk_shader *shader, + struct hk_fast_link_key_fs *key) +{ + /* TODO: prolog without fs needs to work too... */ + bool needs_prolog = key->prolog.statistics || + key->prolog.cull_distance_size || + key->prolog.api_sample_mask != 0xff; + + struct agx_shader_part *prolog = NULL; + if (needs_prolog) { + prolog = hk_get_prolog_epilog(dev, &key->prolog, sizeof(key->prolog), + build_fs_prolog, false, false, + key->prolog.cf_base); + } + + /* If sample shading is used, don't stop at the epilog, there's a + * footer that the fast linker will insert to stop. + */ + bool epilog_stop = (key->nr_samples_shaded == 0); + + struct agx_shader_part *epilog = + hk_get_prolog_epilog(dev, &key->epilog, sizeof(key->epilog), + agx_nir_fs_epilog, true, epilog_stop, 0); + + struct hk_linked_shader *linked = + hk_fast_link(dev, true, shader, prolog, epilog, key->nr_samples_shaded); + + struct hk_fast_link_key *key_clone = + ralloc_memdup(shader->linked.ht, key, sizeof(*key)); + + _mesa_hash_table_insert(shader->linked.ht, key_clone, linked); + return linked; +} + +/* + * First, look for a fully linked variant. Else, build the required shader + * parts and link. + */ +static struct hk_linked_shader * +hk_get_fast_linked(struct hk_device *dev, struct hk_shader *shader, void *key) +{ + struct hk_linked_shader *linked; + simple_mtx_lock(&shader->linked.lock); + + struct hash_entry *ent = _mesa_hash_table_search(shader->linked.ht, key); + + if (ent) + linked = ent->data; + else if (shader->info.stage == MESA_SHADER_VERTEX) + linked = hk_get_fast_linked_locked_vs(dev, shader, key); + else if (shader->info.stage == MESA_SHADER_FRAGMENT) + linked = hk_get_fast_linked_locked_fs(dev, shader, key); + else + unreachable("invalid stage"); + + simple_mtx_unlock(&shader->linked.lock); + return linked; +} + +static void +hk_update_fast_linked(struct hk_cmd_buffer *cmd, struct hk_shader *shader, + void *key) +{ + struct hk_device *dev = hk_cmd_buffer_device(cmd); + struct hk_linked_shader *new = hk_get_fast_linked(dev, shader, key); + gl_shader_stage stage = shader->info.stage; + + if (cmd->state.gfx.linked[stage] != new) { + cmd->state.gfx.linked[stage] = new; + cmd->state.gfx.linked_dirty |= BITFIELD_BIT(stage); + } +} + +static enum agx_polygon_mode +translate_polygon_mode(VkPolygonMode vk_mode) +{ + static_assert((enum agx_polygon_mode)VK_POLYGON_MODE_FILL == + AGX_POLYGON_MODE_FILL); + static_assert((enum agx_polygon_mode)VK_POLYGON_MODE_LINE == + AGX_POLYGON_MODE_LINE); + static_assert((enum agx_polygon_mode)VK_POLYGON_MODE_POINT == + AGX_POLYGON_MODE_POINT); + + assert(vk_mode <= VK_POLYGON_MODE_POINT); + return (enum agx_polygon_mode)vk_mode; +} + +static enum agx_zs_func +translate_compare_op(VkCompareOp vk_mode) +{ + static_assert((enum agx_zs_func)VK_COMPARE_OP_NEVER == AGX_ZS_FUNC_NEVER); + static_assert((enum agx_zs_func)VK_COMPARE_OP_LESS == AGX_ZS_FUNC_LESS); + static_assert((enum agx_zs_func)VK_COMPARE_OP_EQUAL == AGX_ZS_FUNC_EQUAL); + static_assert((enum agx_zs_func)VK_COMPARE_OP_LESS_OR_EQUAL == + AGX_ZS_FUNC_LEQUAL); + static_assert((enum agx_zs_func)VK_COMPARE_OP_GREATER == + AGX_ZS_FUNC_GREATER); + static_assert((enum agx_zs_func)VK_COMPARE_OP_NOT_EQUAL == + AGX_ZS_FUNC_NOT_EQUAL); + static_assert((enum agx_zs_func)VK_COMPARE_OP_GREATER_OR_EQUAL == + AGX_ZS_FUNC_GEQUAL); + static_assert((enum agx_zs_func)VK_COMPARE_OP_ALWAYS == AGX_ZS_FUNC_ALWAYS); + + assert(vk_mode <= VK_COMPARE_OP_ALWAYS); + return (enum agx_zs_func)vk_mode; +} + +static enum 
agx_stencil_op +translate_stencil_op(VkStencilOp vk_op) +{ + static_assert((enum agx_stencil_op)VK_STENCIL_OP_KEEP == + AGX_STENCIL_OP_KEEP); + static_assert((enum agx_stencil_op)VK_STENCIL_OP_ZERO == + AGX_STENCIL_OP_ZERO); + static_assert((enum agx_stencil_op)VK_STENCIL_OP_REPLACE == + AGX_STENCIL_OP_REPLACE); + static_assert((enum agx_stencil_op)VK_STENCIL_OP_INCREMENT_AND_CLAMP == + AGX_STENCIL_OP_INCR_SAT); + static_assert((enum agx_stencil_op)VK_STENCIL_OP_DECREMENT_AND_CLAMP == + AGX_STENCIL_OP_DECR_SAT); + static_assert((enum agx_stencil_op)VK_STENCIL_OP_INVERT == + AGX_STENCIL_OP_INVERT); + static_assert((enum agx_stencil_op)VK_STENCIL_OP_INCREMENT_AND_WRAP == + AGX_STENCIL_OP_INCR_WRAP); + static_assert((enum agx_stencil_op)VK_STENCIL_OP_DECREMENT_AND_WRAP == + AGX_STENCIL_OP_DECR_WRAP); + + return (enum agx_stencil_op)vk_op; +} + +static void +hk_ppp_push_stencil_face(struct agx_ppp_update *ppp, + struct vk_stencil_test_face_state s, bool enabled) +{ + if (enabled) { + agx_ppp_push(ppp, FRAGMENT_STENCIL, cfg) { + cfg.compare = translate_compare_op(s.op.compare); + cfg.write_mask = s.write_mask; + cfg.read_mask = s.compare_mask; + + cfg.depth_pass = translate_stencil_op(s.op.pass); + cfg.depth_fail = translate_stencil_op(s.op.depth_fail); + cfg.stencil_fail = translate_stencil_op(s.op.fail); + } + } else { + agx_ppp_push(ppp, FRAGMENT_STENCIL, cfg) { + cfg.compare = AGX_ZS_FUNC_ALWAYS; + cfg.write_mask = 0xFF; + cfg.read_mask = 0xFF; + + cfg.depth_pass = AGX_STENCIL_OP_KEEP; + cfg.depth_fail = AGX_STENCIL_OP_KEEP; + cfg.stencil_fail = AGX_STENCIL_OP_KEEP; + } + } +} + +static bool +hk_stencil_test_enabled(struct hk_cmd_buffer *cmd) +{ + const struct hk_rendering_state *render = &cmd->state.gfx.render; + struct vk_dynamic_graphics_state *dyn = &cmd->vk.dynamic_graphics_state; + + return dyn->ds.stencil.test_enable && + render->stencil_att.vk_format != VK_FORMAT_UNDEFINED; +} + +static void +hk_flush_vp_state(struct hk_cmd_buffer *cmd, struct hk_cs *cs, uint8_t **out) +{ + const struct vk_dynamic_graphics_state *dyn = + &cmd->vk.dynamic_graphics_state; + + /* We always need at least 1 viewport for the hardware. With rasterizer + * discard the app may not supply any, but we can just program garbage. + */ + unsigned count = MAX2(dyn->vp.viewport_count, 1); + + unsigned minx[HK_MAX_VIEWPORTS] = {0}, miny[HK_MAX_VIEWPORTS] = {0}; + unsigned maxx[HK_MAX_VIEWPORTS] = {0}, maxy[HK_MAX_VIEWPORTS] = {0}; + + /* We implicitly scissor to the viewport. We need to do a min/max dance to + * handle inverted viewports. + */ + for (uint32_t i = 0; i < dyn->vp.viewport_count; i++) { + const VkViewport *vp = &dyn->vp.viewports[i]; + + minx[i] = MIN2(vp->x, vp->x + vp->width); + miny[i] = MIN2(vp->y, vp->y + vp->height); + maxx[i] = MAX2(vp->x, vp->x + vp->width); + maxy[i] = MAX2(vp->y, vp->y + vp->height); + } + + /* Additionally clamp to the framebuffer so we don't rasterize + * off-screen pixels. TODO: Is this necessary? the GL driver does this but + * it might be cargoculted at this point. + * + * Rasterizing off-screen is software-visible and can cause faults with + * eMRT when the framebuffer is not a multiple of the tile size. 
+ */ + for (unsigned i = 0; i < count; ++i) { + minx[i] = MIN2(minx[i], cmd->state.gfx.render.cr.width); + maxx[i] = MIN2(maxx[i], cmd->state.gfx.render.cr.width); + miny[i] = MIN2(miny[i], cmd->state.gfx.render.cr.height); + maxy[i] = MIN2(maxy[i], cmd->state.gfx.render.cr.height); + } + + /* We additionally apply any API scissors */ + for (unsigned i = 0; i < dyn->vp.scissor_count; ++i) { + const VkRect2D *s = &dyn->vp.scissors[i]; + + minx[i] = MAX2(minx[i], s->offset.x); + miny[i] = MAX2(miny[i], s->offset.y); + maxx[i] = MIN2(maxx[i], s->offset.x + s->extent.width); + maxy[i] = MIN2(maxy[i], s->offset.y + s->extent.height); + } + + /* Upload a hardware scissor for each viewport, whether there's a + * corresponding API scissor or not. + */ + unsigned index = cs->scissor.size / AGX_SCISSOR_LENGTH; + struct agx_scissor_packed *scissors = + util_dynarray_grow_bytes(&cs->scissor, count, AGX_SCISSOR_LENGTH); + + for (unsigned i = 0; i < count; ++i) { + const VkViewport *vp = &dyn->vp.viewports[i]; + + agx_pack(scissors + i, SCISSOR, cfg) { + cfg.min_x = minx[i]; + cfg.min_y = miny[i]; + cfg.max_x = maxx[i]; + cfg.max_y = maxy[i]; + + /* These settings in conjunction with the PPP control depth clip/clamp + * settings implement depth clip/clamping. Properly setting them + * together is required for conformant depth clip enable. + * + * TODO: Reverse-engineer the finer interactions here. + */ + if (dyn->rs.depth_clamp_enable) { + cfg.min_z = MIN2(vp->minDepth, vp->maxDepth); + cfg.max_z = MAX2(vp->minDepth, vp->maxDepth); + } else { + cfg.min_z = 0.0; + cfg.max_z = 1.0; + } + } + } + + /* Upload state */ + struct AGX_PPP_HEADER present = { + .depth_bias_scissor = true, + .region_clip = true, + .viewport = true, + .viewport_count = count, + }; + + size_t size = agx_ppp_update_size(&present); + struct agx_ptr T = hk_pool_alloc(cmd, size, 64); + if (!T.cpu) + return; + + struct agx_ppp_update ppp = agx_new_ppp_update(T, size, &present); + + agx_ppp_push(&ppp, DEPTH_BIAS_SCISSOR, cfg) { + cfg.scissor = index; + + /* Use the current depth bias, we allocate linearly */ + unsigned count = cs->depth_bias.size / AGX_DEPTH_BIAS_LENGTH; + cfg.depth_bias = count ? 
count - 1 : 0; + }; + + for (unsigned i = 0; i < count; ++i) { + agx_ppp_push(&ppp, REGION_CLIP, cfg) { + cfg.enable = true; + cfg.min_x = minx[i] / 32; + cfg.min_y = miny[i] / 32; + cfg.max_x = DIV_ROUND_UP(MAX2(maxx[i], 1), 32); + cfg.max_y = DIV_ROUND_UP(MAX2(maxy[i], 1), 32); + } + } + + agx_ppp_push(&ppp, VIEWPORT_CONTROL, cfg) + ; + + /* Upload viewports */ + for (unsigned i = 0; i < count; ++i) { + const VkViewport *vp = &dyn->vp.viewports[i]; + + agx_ppp_push(&ppp, VIEWPORT, cfg) { + cfg.translate_x = vp->x + 0.5f * vp->width; + cfg.translate_y = vp->y + 0.5f * vp->height; + cfg.translate_z = vp->minDepth; + + cfg.scale_x = vp->width * 0.5f; + cfg.scale_y = vp->height * 0.5f; + cfg.scale_z = vp->maxDepth - vp->minDepth; + } + } + + agx_ppp_fini(out, &ppp); +} + +static enum agx_object_type +translate_object_type(enum mesa_prim topology) +{ + static_assert(MESA_PRIM_LINES < MESA_PRIM_LINE_STRIP); + static_assert(MESA_PRIM_TRIANGLES >= MESA_PRIM_LINE_STRIP); + + if (topology == MESA_PRIM_POINTS) + return AGX_OBJECT_TYPE_POINT_SPRITE_UV01; + else if (topology <= MESA_PRIM_LINE_STRIP) + return AGX_OBJECT_TYPE_LINE; + else + return AGX_OBJECT_TYPE_TRIANGLE; +} + +static enum agx_primitive +translate_hw_primitive_topology(enum mesa_prim prim) +{ + switch (prim) { + case MESA_PRIM_POINTS: + return AGX_PRIMITIVE_POINTS; + case MESA_PRIM_LINES: + return AGX_PRIMITIVE_LINES; + case MESA_PRIM_LINE_STRIP: + return AGX_PRIMITIVE_LINE_STRIP; + case MESA_PRIM_TRIANGLES: + return AGX_PRIMITIVE_TRIANGLES; + case MESA_PRIM_TRIANGLE_STRIP: + return AGX_PRIMITIVE_TRIANGLE_STRIP; + case MESA_PRIM_TRIANGLE_FAN: + return AGX_PRIMITIVE_TRIANGLE_FAN; + default: + unreachable("Invalid hardware primitive topology"); + } +} + +static inline enum agx_vdm_vertex +translate_vdm_vertex(unsigned vtx) +{ + static_assert(AGX_VDM_VERTEX_0 == 0); + static_assert(AGX_VDM_VERTEX_1 == 1); + static_assert(AGX_VDM_VERTEX_2 == 2); + + assert(vtx <= 2); + return vtx; +} + +static inline enum agx_ppp_vertex +translate_ppp_vertex(unsigned vtx) +{ + static_assert(AGX_PPP_VERTEX_0 == 0 + 1); + static_assert(AGX_PPP_VERTEX_1 == 1 + 1); + static_assert(AGX_PPP_VERTEX_2 == 2 + 1); + + assert(vtx <= 2); + return vtx + 1; +} + +static void +hk_flush_index(struct hk_cmd_buffer *cmd, struct hk_cs *cs) +{ + uint8_t *out = cs->current; + agx_push(out, VDM_STATE, cfg) { + cfg.restart_index_present = true; + } + + agx_push(out, VDM_STATE_RESTART_INDEX, cfg) { + if (cmd->state.gfx.shaders[MESA_SHADER_GEOMETRY]) + cfg.value = BITFIELD_MASK(32); + else + cfg.value = cmd->state.gfx.index.restart; + } + + cs->current = out; +} + +/* + * Return the given sample positions, packed into a 32-bit word with fixed + * point nibbles for each x/y component of the (at most 4) samples. This is + * suitable for programming the PPP_MULTISAMPLECTL control register. + */ +static uint32_t +hk_pack_ppp_multisamplectrl(const struct vk_sample_locations_state *sl) +{ + uint32_t ctrl = 0; + + for (int32_t i = sl->per_pixel - 1; i >= 0; i--) { + VkSampleLocationEXT loc = sl->locations[i]; + + uint32_t x = CLAMP(loc.x, 0.0f, 0.9375f) * 16.0; + uint32_t y = CLAMP(loc.y, 0.0f, 0.9375f) * 16.0; + + assert(x <= 15); + assert(y <= 15); + + /* Push bytes in reverse order so we can use constant shifts. */ + ctrl = (ctrl << 8) | (y << 4) | x; + } + + return ctrl; +} + +/* + * Return the standard sample positions, prepacked as above for efficiency. 
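+ * + * For example, with 4 samples the standard positions (0.375, 0.125), + * (0.875, 0.375), (0.125, 0.625) and (0.625, 0.875) scale to the nibble + * pairs 0x26, 0x6e, 0xa2 and 0xea, which pack (sample 0 in the low byte) + * into the 0xeaa26e26 word returned below.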
+ */ +uint32_t +hk_default_sample_positions(unsigned nr_samples) +{ + switch (nr_samples) { + case 0: + case 1: + return 0x88; + case 2: + return 0x44cc; + case 4: + return 0xeaa26e26; + default: + unreachable("Invalid sample count"); + } +} + +static void +hk_flush_ppp_state(struct hk_cmd_buffer *cmd, struct hk_cs *cs, uint8_t **out) +{ + const struct hk_rendering_state *render = &cmd->state.gfx.render; + struct vk_dynamic_graphics_state *dyn = &cmd->vk.dynamic_graphics_state; + + struct hk_graphics_state *gfx = &cmd->state.gfx; + struct hk_shader *hw_vs = hk_bound_hw_vs(gfx); + struct hk_shader *fs = hk_only_variant(gfx->shaders[MESA_SHADER_FRAGMENT]); + + bool hw_vs_dirty = IS_SHADER_DIRTY(VERTEX) || IS_SHADER_DIRTY(TESS_EVAL) || + IS_SHADER_DIRTY(GEOMETRY); + bool fs_dirty = IS_SHADER_DIRTY(FRAGMENT); + + struct hk_linked_shader *linked_fs = gfx->linked[MESA_SHADER_FRAGMENT]; + bool linked_fs_dirty = IS_LINKED_DIRTY(FRAGMENT); + + bool varyings_dirty = gfx->dirty & HK_DIRTY_VARYINGS; + + bool face_dirty = + IS_DIRTY(DS_DEPTH_TEST_ENABLE) || IS_DIRTY(DS_DEPTH_WRITE_ENABLE) || + IS_DIRTY(DS_DEPTH_COMPARE_OP) || IS_DIRTY(DS_STENCIL_REFERENCE) || + IS_DIRTY(RS_LINE_WIDTH) || IS_DIRTY(RS_POLYGON_MODE) || fs_dirty; + + bool stencil_face_dirty = + IS_DIRTY(DS_STENCIL_OP) || IS_DIRTY(DS_STENCIL_COMPARE_MASK) || + IS_DIRTY(DS_STENCIL_WRITE_MASK) || IS_DIRTY(DS_STENCIL_TEST_ENABLE); + + struct AGX_PPP_HEADER dirty = { + .fragment_control = + IS_DIRTY(DS_STENCIL_TEST_ENABLE) || IS_DIRTY(IA_PRIMITIVE_TOPOLOGY) || + IS_DIRTY(RS_DEPTH_BIAS_ENABLE) || gfx->dirty & HK_DIRTY_OCCLUSION, + + .fragment_control_2 = + IS_DIRTY(RS_RASTERIZER_DISCARD_ENABLE) || linked_fs_dirty, + + .fragment_front_face = face_dirty, + .fragment_front_face_2 = fs_dirty || IS_DIRTY(IA_PRIMITIVE_TOPOLOGY), + .fragment_front_stencil = stencil_face_dirty, + .fragment_back_face = face_dirty, + .fragment_back_face_2 = fs_dirty || IS_DIRTY(IA_PRIMITIVE_TOPOLOGY), + .fragment_back_stencil = stencil_face_dirty, + .output_select = hw_vs_dirty || linked_fs_dirty || varyings_dirty, + .varying_counts_32 = varyings_dirty, + .varying_counts_16 = varyings_dirty, + .cull = + IS_DIRTY(RS_CULL_MODE) || IS_DIRTY(RS_RASTERIZER_DISCARD_ENABLE) || + IS_DIRTY(RS_FRONT_FACE) || IS_DIRTY(RS_DEPTH_CLIP_ENABLE) || + IS_DIRTY(RS_DEPTH_CLAMP_ENABLE) || IS_DIRTY(RS_LINE_MODE) || + IS_DIRTY(IA_PRIMITIVE_TOPOLOGY) || (gfx->dirty & HK_DIRTY_PROVOKING), + .cull_2 = varyings_dirty, + + /* With a null FS, the fragment shader PPP word is ignored and doesn't + * need to be present. + */ + .fragment_shader = fs && (fs_dirty || linked_fs_dirty || varyings_dirty || + gfx->descriptors.root_dirty), + + .occlusion_query = gfx->dirty & HK_DIRTY_OCCLUSION, + .output_size = hw_vs_dirty, + .viewport_count = 1, /* irrelevant */ + }; + + /* Calculate the update size. If it equals the header, there is nothing to + * update so early-exit. + */ + size_t size = agx_ppp_update_size(&dirty); + if (size == AGX_PPP_HEADER_LENGTH) + return; + + /* Otherwise, allocate enough space for the update and push it. */ + assert(size > AGX_PPP_HEADER_LENGTH); + + struct agx_ptr T = hk_pool_alloc(cmd, size, 64); + if (!T.cpu) + return; + + struct agx_ppp_update ppp = agx_new_ppp_update(T, size, &dirty); + + if (dirty.fragment_control) { + agx_ppp_push(&ppp, FRAGMENT_CONTROL, cfg) { + cfg.visibility_mode = gfx->occlusion.mode; + cfg.stencil_test_enable = hk_stencil_test_enabled(cmd); + + /* TODO: Consider optimizing this? 
*/ + cfg.two_sided_stencil = cfg.stencil_test_enable; + + cfg.depth_bias_enable = dyn->rs.depth_bias.enable && + gfx->object_type == AGX_OBJECT_TYPE_TRIANGLE; + + /* Always enable scissoring so we may scissor to the viewport (TODO: + * optimize this out if the viewport is the default and the app does + * not use the scissor test) + */ + cfg.scissor_enable = true; + + /* This avoids broken derivatives along primitive edges */ + cfg.disable_tri_merging = gfx->object_type != AGX_OBJECT_TYPE_TRIANGLE; + } + } + + if (dirty.fragment_control_2) { + if (linked_fs) { + /* Annoying, rasterizer_discard seems to be ignored (sometimes?) in the + * main fragment control word and has to be combined into the secondary + * word for reliable behaviour. + */ + agx_ppp_push_merged(&ppp, FRAGMENT_CONTROL, cfg, + linked_fs->b.fragment_control) { + + cfg.tag_write_disable = dyn->rs.rasterizer_discard_enable; + } + } else { + /* If there is no fragment shader, we must disable tag writes to avoid + * executing the missing shader. This optimizes depth-only passes. + */ + agx_ppp_push(&ppp, FRAGMENT_CONTROL, cfg) { + cfg.tag_write_disable = true; + cfg.pass_type = AGX_PASS_TYPE_OPAQUE; + } + } + } + + struct agx_fragment_face_packed fragment_face; + struct agx_fragment_face_2_packed fragment_face_2; + + if (dirty.fragment_front_face) { + bool has_z = render->depth_att.vk_format != VK_FORMAT_UNDEFINED; + bool z_test = has_z && dyn->ds.depth.test_enable; + + agx_pack(&fragment_face, FRAGMENT_FACE, cfg) { + cfg.line_width = agx_pack_line_width(dyn->rs.line.width); + cfg.polygon_mode = translate_polygon_mode(dyn->rs.polygon_mode); + cfg.disable_depth_write = !(z_test && dyn->ds.depth.write_enable); + + if (z_test && !gfx->descriptors.root.draw.force_never_in_shader) + cfg.depth_function = translate_compare_op(dyn->ds.depth.compare_op); + else + cfg.depth_function = AGX_ZS_FUNC_ALWAYS; + }; + + agx_ppp_push_merged(&ppp, FRAGMENT_FACE, cfg, fragment_face) { + cfg.stencil_reference = dyn->ds.stencil.front.reference; + } + } + + if (dirty.fragment_front_face_2) { + agx_pack(&fragment_face_2, FRAGMENT_FACE_2, cfg) { + cfg.object_type = gfx->object_type; + + /* TODO: flip the default? 
*/ + if (fs) + cfg.conservative_depth = 0; + } + + if (fs) + agx_merge(fragment_face_2, fs->frag_face, FRAGMENT_FACE_2); + + agx_ppp_push_packed(&ppp, &fragment_face_2, FRAGMENT_FACE_2); + } + + if (dirty.fragment_front_stencil) { + hk_ppp_push_stencil_face(&ppp, dyn->ds.stencil.front, + hk_stencil_test_enabled(cmd)); + } + + if (dirty.fragment_back_face) { + assert(dirty.fragment_front_face); + + agx_ppp_push_merged(&ppp, FRAGMENT_FACE, cfg, fragment_face) { + cfg.stencil_reference = dyn->ds.stencil.back.reference; + } + } + + if (dirty.fragment_back_face_2) { + assert(dirty.fragment_front_face_2); + + agx_ppp_push_packed(&ppp, &fragment_face_2, FRAGMENT_FACE_2); + } + + if (dirty.fragment_back_stencil) { + hk_ppp_push_stencil_face(&ppp, dyn->ds.stencil.back, + hk_stencil_test_enabled(cmd)); + } + + if (dirty.output_select) { + struct agx_output_select_packed osel = hw_vs->info.uvs.osel; + + if (linked_fs) { + agx_ppp_push_merged_blobs(&ppp, AGX_OUTPUT_SELECT_LENGTH, &osel, + &linked_fs->b.osel); + } else { + agx_ppp_push_packed(&ppp, &osel, OUTPUT_SELECT); + } + } + + assert(dirty.varying_counts_32 == dirty.varying_counts_16); + + if (dirty.varying_counts_32) { + agx_ppp_push_packed(&ppp, &gfx->linked_varyings.counts_32, + VARYING_COUNTS); + + agx_ppp_push_packed(&ppp, &gfx->linked_varyings.counts_16, + VARYING_COUNTS); + } + + if (dirty.cull) { + agx_ppp_push(&ppp, CULL, cfg) { + cfg.cull_front = dyn->rs.cull_mode & VK_CULL_MODE_FRONT_BIT; + cfg.cull_back = dyn->rs.cull_mode & VK_CULL_MODE_BACK_BIT; + cfg.front_face_ccw = dyn->rs.front_face != VK_FRONT_FACE_CLOCKWISE; + cfg.flat_shading_vertex = translate_ppp_vertex(gfx->provoking); + cfg.rasterizer_discard = dyn->rs.rasterizer_discard_enable; + + /* We do not support unrestricted depth, so clamping is inverted from + * clipping. This implementation seems to pass CTS without unrestricted + * depth support. + * + * TODO: Make sure this is right with gl_FragDepth. + */ + cfg.depth_clip = vk_rasterization_state_depth_clip_enable(&dyn->rs); + cfg.depth_clamp = !cfg.depth_clip; + + cfg.primitive_msaa = + gfx->object_type == AGX_OBJECT_TYPE_LINE && + dyn->rs.line.mode == VK_LINE_RASTERIZATION_MODE_BRESENHAM_KHR; + } + } + + if (dirty.cull_2) { + agx_ppp_push(&ppp, CULL_2, cfg) { + cfg.needs_primitive_id = gfx->generate_primitive_id; + } + } + + if (dirty.fragment_shader) { + /* TODO: Do less often? 
*/ + hk_reserve_scratch(cmd, cs, fs); + + agx_ppp_push_packed(&ppp, &linked_fs->fs_counts, FRAGMENT_SHADER_WORD_0); + + agx_ppp_push(&ppp, FRAGMENT_SHADER_WORD_1, cfg) { + cfg.pipeline = hk_upload_usc_words(cmd, fs, linked_fs); + } + + agx_ppp_push(&ppp, FRAGMENT_SHADER_WORD_2, cfg) { + cfg.cf_bindings = gfx->varyings; + } + + agx_ppp_push(&ppp, FRAGMENT_SHADER_WORD_3, cfg) + ; + } + + if (dirty.occlusion_query) { + agx_ppp_push(&ppp, FRAGMENT_OCCLUSION_QUERY, cfg) { + cfg.index = gfx->occlusion.index; + } + } + + if (dirty.output_size) { + agx_ppp_push(&ppp, OUTPUT_SIZE, cfg) { + cfg.count = hw_vs->info.uvs.size; + } + } + + agx_ppp_fini(out, &ppp); +} + +static void +hk_flush_dynamic_state(struct hk_cmd_buffer *cmd, struct hk_cs *cs, + uint32_t draw_id, struct hk_draw draw) +{ + struct hk_device *dev = hk_cmd_buffer_device(cmd); + const struct hk_rendering_state *render = &cmd->state.gfx.render; + struct vk_dynamic_graphics_state *dyn = &cmd->vk.dynamic_graphics_state; + + struct hk_graphics_state *gfx = &cmd->state.gfx; + + struct hk_shader *hw_vs = hk_bound_hw_vs(gfx); + struct hk_shader *sw_vs = hk_bound_sw_vs(gfx); + + if (!vk_dynamic_graphics_state_any_dirty(dyn) && + !(gfx->dirty & ~HK_DIRTY_INDEX) && !gfx->descriptors.root_dirty && + !gfx->shaders_dirty && !sw_vs->b.info.uses_draw_id && + !sw_vs->b.info.uses_base_param && + !(gfx->linked[MESA_SHADER_VERTEX] && + gfx->linked[MESA_SHADER_VERTEX]->b.uses_base_param)) + return; + + struct hk_descriptor_state *desc = &cmd->state.gfx.descriptors; + + assert(cs->current + 0x1000 < cs->end && "already ensured space"); + uint8_t *out = cs->current; + + struct hk_shader *fs = hk_only_variant(gfx->shaders[MESA_SHADER_FRAGMENT]); + + bool gt_dirty = IS_SHADER_DIRTY(TESS_CTRL) || IS_SHADER_DIRTY(TESS_EVAL) || + IS_SHADER_DIRTY(GEOMETRY); + bool vgt_dirty = IS_SHADER_DIRTY(VERTEX) || gt_dirty; + bool fs_dirty = IS_SHADER_DIRTY(FRAGMENT); + + if (IS_DIRTY(CB_BLEND_CONSTANTS)) { + static_assert(sizeof(desc->root.draw.blend_constant) == + sizeof(dyn->cb.blend_constants) && + "common size"); + + memcpy(desc->root.draw.blend_constant, dyn->cb.blend_constants, + sizeof(dyn->cb.blend_constants)); + desc->root_dirty = true; + } + + if (IS_DIRTY(MS_SAMPLE_MASK)) { + desc->root.draw.api_sample_mask = dyn->ms.sample_mask; + desc->root_dirty = true; + } + + if (fs_dirty || IS_DIRTY(DS_DEPTH_TEST_ENABLE) || + IS_DIRTY(DS_DEPTH_COMPARE_OP)) { + + const struct hk_rendering_state *render = &cmd->state.gfx.render; + bool has_z = render->depth_att.vk_format != VK_FORMAT_UNDEFINED; + bool z_test = has_z && dyn->ds.depth.test_enable; + + desc->root.draw.force_never_in_shader = + z_test && dyn->ds.depth.compare_op == VK_COMPARE_OP_NEVER && fs && + fs->info.fs.writes_memory; + + desc->root_dirty = true; + } + + /* The main shader must not run tests if the epilog will. */ + bool nontrivial_force_early = + fs && (fs->b.info.early_fragment_tests && + (fs->b.info.writes_sample_mask || fs->info.fs.writes_memory)); + + bool epilog_discards = dyn->ms.alpha_to_coverage_enable || + (fs && (fs->info.fs.epilog_key.write_z || + fs->info.fs.epilog_key.write_s)); + epilog_discards &= !nontrivial_force_early; + + if (fs_dirty || IS_DIRTY(MS_ALPHA_TO_COVERAGE_ENABLE)) { + desc->root.draw.no_epilog_discard = !epilog_discards ? 
~0 : 0; + desc->root_dirty = true; + } + + if (IS_DIRTY(VI) || IS_DIRTY(VI_BINDINGS_VALID) || + IS_DIRTY(VI_BINDING_STRIDES) || vgt_dirty || true /* TODO */) { + + struct hk_fast_link_key_vs key = { + .prolog.hw = (sw_vs == hw_vs), + + /* FIXME: handle pipeline robustness "properly" */ + .prolog.robustness.level = + (dev->vk.enabled_features.robustBufferAccess2 || + dev->vk.enabled_features.pipelineRobustness) + ? AGX_ROBUSTNESS_D3D + : AGX_ROBUSTNESS_GL, + + .prolog.robustness.soft_fault = false /*TODO*/, + }; + + if (!key.prolog.hw) { + key.prolog.sw_index_size_B = + draw.indexed ? agx_index_size_to_B(draw.index_size) : 0; + } + + static_assert(sizeof(key.prolog.component_mask) == + sizeof(sw_vs->info.vs.attrib_components_read)); + BITSET_COPY(key.prolog.component_mask, + sw_vs->info.vs.attrib_components_read); + + u_foreach_bit(a, dyn->vi->attributes_valid) { + struct vk_vertex_attribute_state attr = dyn->vi->attributes[a]; + + assert(dyn->vi->bindings_valid & BITFIELD_BIT(attr.binding)); + struct vk_vertex_binding_state binding = + dyn->vi->bindings[attr.binding]; + + /* nir_assign_io_var_locations compacts vertex inputs, eliminating + * unused inputs. We need to do the same here to match the locations. + */ + unsigned slot = + util_bitcount64(sw_vs->info.vs.attribs_read & BITFIELD_MASK(a)); + + key.prolog.attribs[slot] = (struct agx_velem_key){ + .format = vk_format_to_pipe_format(attr.format), + .stride = dyn->vi_binding_strides[attr.binding], + .divisor = binding.divisor, + .instanced = binding.input_rate == VK_VERTEX_INPUT_RATE_INSTANCE, + }; + } + + hk_update_fast_linked(cmd, sw_vs, &key); + } + + if (IS_DIRTY(VI) || IS_DIRTY(VI_BINDINGS_VALID) || vgt_dirty || + (gfx->dirty & HK_DIRTY_VB)) { + + uint64_t sink = dev->rodata.zero_sink; + + unsigned slot = 0; + u_foreach_bit(a, sw_vs->info.vs.attribs_read) { + if (dyn->vi->attributes_valid & BITFIELD_BIT(a)) { + struct vk_vertex_attribute_state attr = dyn->vi->attributes[a]; + struct hk_addr_range vb = gfx->vb[attr.binding]; + + desc->root.draw.attrib_clamps[slot] = agx_calculate_vbo_clamp( + vb.addr, sink, vk_format_to_pipe_format(attr.format), vb.range, + dyn->vi_binding_strides[attr.binding], attr.offset, + &desc->root.draw.attrib_base[slot]); + } else { + desc->root.draw.attrib_base[slot] = sink; + desc->root.draw.attrib_clamps[slot] = 0; + } + + ++slot; + } + + desc->root_dirty = true; + } + + if (vgt_dirty || IS_SHADER_DIRTY(FRAGMENT) || + IS_DIRTY(MS_RASTERIZATION_SAMPLES) || IS_DIRTY(MS_SAMPLE_MASK) || + IS_DIRTY(MS_ALPHA_TO_COVERAGE_ENABLE) || + IS_DIRTY(MS_ALPHA_TO_ONE_ENABLE) || IS_DIRTY(CB_LOGIC_OP) || + IS_DIRTY(CB_LOGIC_OP_ENABLE) || IS_DIRTY(CB_WRITE_MASKS) || + IS_DIRTY(CB_COLOR_WRITE_ENABLES) || IS_DIRTY(CB_ATTACHMENT_COUNT) || + IS_DIRTY(CB_BLEND_ENABLES) || IS_DIRTY(CB_BLEND_EQUATIONS) || + IS_DIRTY(CB_BLEND_CONSTANTS) || + desc->root_dirty /* for pipeline stats */ || true) { + + if (fs) { + unsigned samples_shaded = 0; + if (fs->info.fs.epilog_key.sample_shading) + samples_shaded = dyn->ms.rasterization_samples; + + unsigned tib_sample_mask = + BITFIELD_MASK(dyn->ms.rasterization_samples); + unsigned api_sample_mask = dyn->ms.sample_mask & tib_sample_mask; + bool has_sample_mask = api_sample_mask != tib_sample_mask; + + struct hk_fast_link_key_fs key = { + .prolog.statistics = hk_pipeline_stat_addr( + cmd, + VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT), + + .prolog.cull_distance_size = + hw_vs->info.vs.cull_distance_array_size, + .prolog.api_sample_mask = has_sample_mask ? 
api_sample_mask : 0xff, + .nr_samples_shaded = samples_shaded, + }; + + bool prolog_discards = + has_sample_mask || key.prolog.cull_distance_size; + + bool needs_prolog = key.prolog.statistics || prolog_discards; + + if (needs_prolog) { + /* With late main shader tests, the prolog runs tests if neither the + * main shader nor epilog will. + * + * With (nontrivial) early main shader tests, the prolog does not + * run tests, the tests will run at the start of the main shader. + * This ensures tests are after API sample mask and cull distance + * discards. + */ + key.prolog.run_zs_tests = !nontrivial_force_early && + !fs->b.info.writes_sample_mask && + !epilog_discards && prolog_discards; + + if (key.prolog.cull_distance_size) { + key.prolog.cf_base = fs->b.info.varyings.fs.nr_cf; + } + } + + key.epilog = (struct agx_fs_epilog_key){ + .link = fs->info.fs.epilog_key, + .nr_samples = MAX2(dyn->ms.rasterization_samples, 1), + .blend.alpha_to_coverage = dyn->ms.alpha_to_coverage_enable, + .blend.alpha_to_one = dyn->ms.alpha_to_one_enable, + .blend.logicop_func = dyn->cb.logic_op_enable + ? vk_logic_op_to_pipe(dyn->cb.logic_op) + : PIPE_LOGICOP_COPY, + }; + + key.epilog.link.already_ran_zs |= nontrivial_force_early; + + struct hk_rendering_state *render = &cmd->state.gfx.render; + for (uint32_t i = 0; i < render->color_att_count; i++) { + key.epilog.rt_formats[i] = + vk_format_to_pipe_format(render->color_att[i].vk_format); + + const struct vk_color_blend_attachment_state *cb = + &dyn->cb.attachments[i]; + + bool write_enable = dyn->cb.color_write_enables & BITFIELD_BIT(i); + unsigned write_mask = write_enable ? cb->write_mask : 0; + + /* nir_lower_blend always blends, so use a default blend state when + * blending is disabled at an API level. + */ + if (!dyn->cb.attachments[i].blend_enable) { + key.epilog.blend.rt[i] = (struct agx_blend_rt_key){ + .colormask = write_mask, + .rgb_func = PIPE_BLEND_ADD, + .alpha_func = PIPE_BLEND_ADD, + .rgb_src_factor = PIPE_BLENDFACTOR_ONE, + .alpha_src_factor = PIPE_BLENDFACTOR_ONE, + .rgb_dst_factor = PIPE_BLENDFACTOR_ZERO, + .alpha_dst_factor = PIPE_BLENDFACTOR_ZERO, + }; + } else { + key.epilog.blend.rt[i] = (struct agx_blend_rt_key){ + .colormask = write_mask, + + .rgb_src_factor = + vk_blend_factor_to_pipe(cb->src_color_blend_factor), + + .rgb_dst_factor = + vk_blend_factor_to_pipe(cb->dst_color_blend_factor), + + .rgb_func = vk_blend_op_to_pipe(cb->color_blend_op), + + .alpha_src_factor = + vk_blend_factor_to_pipe(cb->src_alpha_blend_factor), + + .alpha_dst_factor = + vk_blend_factor_to_pipe(cb->dst_alpha_blend_factor), + + .alpha_func = vk_blend_op_to_pipe(cb->alpha_blend_op), + }; + } + } + + hk_update_fast_linked(cmd, fs, &key); + } else { + /* TODO: prolog without fs needs to work too... */ + if (cmd->state.gfx.linked[MESA_SHADER_FRAGMENT] != NULL) { + cmd->state.gfx.linked_dirty |= BITFIELD_BIT(MESA_SHADER_FRAGMENT); + cmd->state.gfx.linked[MESA_SHADER_FRAGMENT] = NULL; + } + } + } + + /* If the vertex shader uses draw parameters, vertex uniforms are dirty every + * draw. Fragment uniforms are unaffected. + * + * For a direct draw, we upload the draw parameters as-if indirect to + * avoid keying to indirectness. 
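+ * + * Either way, gfx->draw_params ends up pointing at two consecutive 32-bit + * words, {firstVertex or vertexOffset, firstInstance}, matching the tail of + * the corresponding indirect command layout.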
+ */ + if (gfx->linked[MESA_SHADER_VERTEX]->b.uses_base_param) { + if (draw.b.indirect) { + gfx->draw_params = draw.b.ptr; + + if (draw.indexed) { + gfx->draw_params += + offsetof(VkDrawIndexedIndirectCommand, vertexOffset); + } else { + gfx->draw_params += offsetof(VkDrawIndirectCommand, firstVertex); + } + } else { + uint32_t params[] = { + draw.indexed ? draw.index_bias : draw.start, + draw.start_instance, + }; + + gfx->draw_params = hk_pool_upload(cmd, params, sizeof(params), 4); + } + } else { + gfx->draw_params = 0; + } + + if (sw_vs->b.info.uses_draw_id) { + /* TODO: rodata? */ + gfx->draw_id_ptr = hk_pool_upload(cmd, &draw_id, 2, 4); + } else { + gfx->draw_id_ptr = 0; + } + + if (IS_DIRTY(IA_PRIMITIVE_TOPOLOGY) || gt_dirty) { + enum mesa_prim prim = hk_rast_prim(cmd); + + gfx->topology = translate_hw_primitive_topology(prim); + gfx->object_type = translate_object_type(prim); + } + + if (IS_DIRTY(IA_PRIMITIVE_TOPOLOGY) || IS_DIRTY(RS_PROVOKING_VERTEX)) { + unsigned provoking; + if (dyn->rs.provoking_vertex == VK_PROVOKING_VERTEX_MODE_LAST_VERTEX_EXT) + provoking = 2; + else if (gfx->topology == AGX_PRIMITIVE_TRIANGLE_FAN) + provoking = 1; + else + provoking = 0; + + if (provoking != gfx->provoking) { + gfx->provoking = provoking; + gfx->dirty |= HK_DIRTY_PROVOKING; + + gfx->descriptors.root.draw.provoking = provoking; + gfx->descriptors.root_dirty = true; + } + } + + /* With attachmentless rendering, we don't know the sample count until draw + * time, so we do a late tilebuffer fix up. But with rasterizer discard, + * rasterization_samples might be 0. + */ + if (dyn->ms.rasterization_samples && + gfx->render.tilebuffer.nr_samples != dyn->ms.rasterization_samples) { + + assert(gfx->render.tilebuffer.nr_samples == 0); + + unsigned nr_samples = MAX2(dyn->ms.rasterization_samples, 1); + gfx->render.tilebuffer.nr_samples = nr_samples; + agx_tilebuffer_pack_usc(&gfx->render.tilebuffer); + cs->tib = gfx->render.tilebuffer; + } + + if (IS_DIRTY(MS_SAMPLE_LOCATIONS) || IS_DIRTY(MS_SAMPLE_LOCATIONS_ENABLE) || + IS_DIRTY(MS_RASTERIZATION_SAMPLES)) { + + uint32_t ctrl; + if (dyn->ms.sample_locations_enable) { + ctrl = hk_pack_ppp_multisamplectrl(dyn->ms.sample_locations); + } else { + ctrl = hk_default_sample_positions(dyn->ms.rasterization_samples); + } + + bool dont_commit = cmd->in_meta || dyn->ms.rasterization_samples == 0; + + if (!cs->has_sample_locations) { + cs->ppp_multisamplectl = ctrl; + + /* If we're in vk_meta, do not commit to the sample locations yet. + * vk_meta doesn't care, but the app will! + */ + cs->has_sample_locations |= !dont_commit; + } else { + assert(dont_commit || cs->ppp_multisamplectl == ctrl); + } + + gfx->descriptors.root.draw.ppp_multisamplectl = ctrl; + gfx->descriptors.root_dirty = true; + } + + /* Link varyings before uploading tessellation state, because the + * gfx->generate_primitive_id boolean needs to be plumbed. + */ + struct hk_linked_shader *linked_vs = gfx->linked[MESA_SHADER_VERTEX]; + struct hk_linked_shader *linked_fs = gfx->linked[MESA_SHADER_FRAGMENT]; + bool linked_vs_dirty = IS_LINKED_DIRTY(VERTEX); + bool linked_fs_dirty = IS_LINKED_DIRTY(FRAGMENT); + + if ((gfx->dirty & HK_DIRTY_PROVOKING) || vgt_dirty || linked_fs_dirty) { + unsigned bindings = linked_fs ? 
linked_fs->b.cf.nr_bindings : 0; + if (bindings) { + size_t linkage_size = + AGX_CF_BINDING_HEADER_LENGTH + (bindings * AGX_CF_BINDING_LENGTH); + + struct agx_ptr t = hk_pool_usc_alloc(cmd, linkage_size, 16); + if (!t.cpu) + return; + + agx_link_varyings_vs_fs( + t.cpu, &gfx->linked_varyings, hw_vs->info.uvs.user_size, + &linked_fs->b.cf, gfx->provoking, 0, &gfx->generate_primitive_id); + + gfx->varyings = t.gpu; + } else { + gfx->varyings = 0; + } + + gfx->dirty |= HK_DIRTY_VARYINGS; + } + + if (gfx->shaders[MESA_SHADER_TESS_EVAL] || + gfx->shaders[MESA_SHADER_GEOMETRY]) { + + struct hk_shader *vs = hk_bound_sw_vs(gfx); + desc->root.draw.vertex_outputs = vs->b.info.outputs; + + /* XXX: We should deduplicate this logic */ + bool restart = (draw.indexed && draw.restart); + bool indirect = draw.b.indirect || restart; + + desc->root.draw.input_assembly = + indirect ? hk_pool_alloc(cmd, sizeof(struct agx_ia_state), 4).gpu + : hk_upload_ia_params(cmd, draw); + + if (!indirect) { + uint32_t verts = draw.b.count[0], instances = draw.b.count[1]; + unsigned vb_size = + libagx_tcs_in_size(verts * instances, vs->b.info.outputs); + + /* Allocate if there are any outputs, or use the null sink to trap + * reads if there aren't. Those reads are undefined but should not + * fault. Affects: + * + * dEQP-VK.pipeline.monolithic.no_position.explicit_declarations.basic.single_view.v0_g1 + */ + desc->root.draw.vertex_output_buffer = + vb_size ? hk_pool_alloc(cmd, vb_size, 4).gpu + : dev->rodata.null_sink; + } + } + + if (gfx->shaders[MESA_SHADER_TESS_EVAL]) { + gfx->descriptors.root.draw.tess_params = hk_upload_tess_params(cmd, draw); + gfx->descriptors.root_dirty = true; + } + + if (gfx->shaders[MESA_SHADER_GEOMETRY]) { + /* TODO: size */ + cmd->geom_indirect = hk_pool_alloc(cmd, 64, 4).gpu; + + gfx->descriptors.root.draw.geometry_params = + hk_upload_geometry_params(cmd, draw); + + gfx->descriptors.root_dirty = true; + } + + /* Root must be uploaded after the above, which touch the root */ + if (gfx->descriptors.root_dirty) { + gfx->root = + hk_cmd_buffer_upload_root(cmd, VK_PIPELINE_BIND_POINT_GRAPHICS); + } + + /* Hardware dynamic state must be deferred until after the root and fast + * linking, since it will use the root address and the linked shaders. + */ + if ((gfx->dirty & (HK_DIRTY_PROVOKING | HK_DIRTY_VARYINGS)) || + IS_DIRTY(RS_RASTERIZER_DISCARD_ENABLE) || linked_vs_dirty || vgt_dirty || + gfx->descriptors.root_dirty || gfx->draw_id_ptr || gfx->draw_params) { + + /* TODO: Do less often? */ + hk_reserve_scratch(cmd, cs, hw_vs); + + agx_push(out, VDM_STATE, cfg) { + cfg.vertex_shader_word_0_present = true; + cfg.vertex_shader_word_1_present = true; + cfg.vertex_outputs_present = true; + cfg.vertex_unknown_present = true; + } + + agx_push_packed(out, hw_vs->counts, VDM_STATE_VERTEX_SHADER_WORD_0); + + struct hk_linked_shader *linked_hw_vs = + (hw_vs == sw_vs) ? 
linked_vs : hw_vs->only_linked; + + agx_push(out, VDM_STATE_VERTEX_SHADER_WORD_1, cfg) { + cfg.pipeline = hk_upload_usc_words(cmd, hw_vs, linked_hw_vs); + } + + agx_push_packed(out, hw_vs->info.uvs.vdm, VDM_STATE_VERTEX_OUTPUTS); + + agx_push(out, VDM_STATE_VERTEX_UNKNOWN, cfg) { + cfg.flat_shading_control = translate_vdm_vertex(gfx->provoking); + cfg.unknown_4 = cfg.unknown_5 = dyn->rs.rasterizer_discard_enable; + cfg.generate_primitive_id = gfx->generate_primitive_id; + } + + /* Pad up to a multiple of 8 bytes */ + memset(out, 0, 4); + out += 4; + } + + if (IS_DIRTY(RS_DEPTH_BIAS_FACTORS)) { + void *ptr = + util_dynarray_grow_bytes(&cs->depth_bias, 1, AGX_DEPTH_BIAS_LENGTH); + + agx_pack(ptr, DEPTH_BIAS, cfg) { + cfg.depth_bias = dyn->rs.depth_bias.constant; + cfg.slope_scale = dyn->rs.depth_bias.slope; + cfg.clamp = dyn->rs.depth_bias.clamp; + + /* Value from the PowerVR driver. */ + if (render->depth_att.vk_format == VK_FORMAT_D16_UNORM) { + cfg.depth_bias /= (1 << 15); + } + } + } + + /* Hardware viewport/scissor state is entangled with depth bias. */ + if (IS_DIRTY(RS_DEPTH_BIAS_FACTORS) || IS_DIRTY(VP_SCISSORS) || + IS_DIRTY(VP_SCISSOR_COUNT) || IS_DIRTY(VP_VIEWPORTS) || + IS_DIRTY(VP_VIEWPORT_COUNT) || + IS_DIRTY(VP_DEPTH_CLIP_NEGATIVE_ONE_TO_ONE) || + IS_DIRTY(RS_DEPTH_CLIP_ENABLE) || IS_DIRTY(RS_DEPTH_CLAMP_ENABLE)) { + + hk_flush_vp_state(cmd, cs, &out); + } + + hk_flush_ppp_state(cmd, cs, &out); + cs->current = out; + + vk_dynamic_graphics_state_clear_dirty(dyn); + gfx->shaders_dirty = 0; + gfx->linked_dirty = 0; + gfx->dirty = 0; + gfx->descriptors.root_dirty = false; +} + +static bool +hk_needs_index_robustness(struct hk_cmd_buffer *cmd, struct hk_draw draw) +{ + struct hk_graphics_state *gfx = &cmd->state.gfx; + struct hk_device *dev = hk_cmd_buffer_device(cmd); + + if (!draw.indexed) + return false; + + /* If tessellation is used, we'll go through the robust path anyway, don't + * end up with a tess+geom doom combo. + */ + if (gfx->shaders[MESA_SHADER_TESS_EVAL]) + return false; + + /* Allowed with maint6 without robustness features enabled */ + if (draw.index.range == 0) + return true; + + if (!(dev->vk.enabled_features.robustBufferAccess || + dev->vk.enabled_features.robustBufferAccess2 || + dev->vk.enabled_features.pipelineRobustness)) + return false; + + if (draw.b.indirect) { + return true; + } else { + uint32_t range_B = + (draw.start + draw.b.count[0]) * agx_index_size_to_B(draw.index_size); + + return range_B > draw.index.range; + } +} + +static void +hk_handle_passthrough_gs(struct hk_cmd_buffer *cmd, struct hk_draw draw) +{ + struct hk_graphics_state *gfx = &cmd->state.gfx; + struct hk_api_shader *gs = gfx->shaders[MESA_SHADER_GEOMETRY]; + + /* If there's an application geometry shader, there's nothing to un/bind */ + if (gs && !gs->is_passthrough) + return; + + /* Determine if we need a geometry shader to emulate XFB or adjacency */ + struct vk_dynamic_graphics_state *dyn = &cmd->vk.dynamic_graphics_state; + struct hk_shader *last_sw = hk_bound_sw_vs_before_gs(gfx); + uint32_t xfb_outputs = last_sw->info.xfb_info.output_count; + + VkPrimitiveTopology topology = dyn->ia.primitive_topology; + bool adjacency = + (topology == VK_PRIMITIVE_TOPOLOGY_LINE_LIST_WITH_ADJACENCY) || + (topology == VK_PRIMITIVE_TOPOLOGY_LINE_STRIP_WITH_ADJACENCY) || + (topology == VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY) || + (topology == VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP_WITH_ADJACENCY); + + /* TODO: Don't use a whole GS just for index robustness. 
*/ + bool index_robustness = hk_needs_index_robustness(cmd, draw); + + bool needs_gs = xfb_outputs || adjacency || index_robustness; + + /* Various pipeline statistics are implemented in the pre-GS shader. TODO: + * This could easily be optimized. + */ + VkQueryPipelineStatisticFlagBits ia_statistics[] = { + VK_QUERY_PIPELINE_STATISTIC_INPUT_ASSEMBLY_PRIMITIVES_BIT, + VK_QUERY_PIPELINE_STATISTIC_CLIPPING_PRIMITIVES_BIT, + VK_QUERY_PIPELINE_STATISTIC_CLIPPING_INVOCATIONS_BIT, + }; + + bool ia_stats = false; + + for (unsigned i = 0; i < ARRAY_SIZE(ia_statistics); ++i) { + ia_stats |= hk_pipeline_stat_addr(cmd, ia_statistics[i]) != 0; + } + + needs_gs |= ia_stats; + + /* If we already have a matching GS configuration, we're done */ + if ((gs != NULL) == needs_gs) + return; + + /* If we don't need a GS but we do have a passthrough, unbind it */ + if (gs) { + assert(!needs_gs && gs->is_passthrough); + hk_cmd_bind_graphics_shader(cmd, MESA_SHADER_GEOMETRY, NULL); + return; + } + + /* Else, we need to bind a passthrough GS */ + size_t key_size = + sizeof(struct hk_passthrough_gs_key) + nir_xfb_info_size(xfb_outputs); + struct hk_passthrough_gs_key *key = alloca(key_size); + + *key = (struct hk_passthrough_gs_key){ + .prim = u_decomposed_prim(hk_gs_in_prim(cmd)), + .outputs = last_sw->b.info.outputs, + .clip_distance_array_size = last_sw->info.clip_distance_array_size, + .cull_distance_array_size = last_sw->info.cull_distance_array_size, + }; + + if (xfb_outputs) { + typed_memcpy(key->xfb_stride, last_sw->info.xfb_stride, + ARRAY_SIZE(key->xfb_stride)); + + memcpy(&key->xfb_info, &last_sw->info.xfb_info, + nir_xfb_info_size(xfb_outputs)); + } + + struct hk_device *dev = hk_cmd_buffer_device(cmd); + perf_debug(dev, "Binding passthrough GS for%s%s%s%s\n", + xfb_outputs ? " XFB" : "", adjacency ? " adjacency" : "", + index_robustness ? " robustness" : "", + ia_stats ? 
" statistics" : ""); + + gs = hk_meta_shader(dev, hk_nir_passthrough_gs, key, key_size); + gs->is_passthrough = true; + hk_cmd_bind_graphics_shader(cmd, MESA_SHADER_GEOMETRY, gs); +} + +static struct hk_cs * +hk_flush_gfx_state(struct hk_cmd_buffer *cmd, uint32_t draw_id, + struct hk_draw draw) +{ + struct hk_cs *cs = hk_cmd_buffer_get_cs(cmd, false /* compute */); + if (!cs) + return NULL; + + hk_ensure_cs_has_space(cmd, cs, 0x2000 /* TODO */); + + struct hk_graphics_state *gfx = &cmd->state.gfx; + struct hk_descriptor_state *desc = &gfx->descriptors; + struct hk_device *dev = hk_cmd_buffer_device(cmd); + +#ifndef NDEBUG + if (unlikely(dev->dev.debug & AGX_DBG_DIRTY)) { + hk_cmd_buffer_dirty_all(cmd); + } +#endif + + /* TODO: Try to reduce draw overhead of this */ + hk_handle_passthrough_gs(cmd, draw); + + hk_flush_shaders(cmd); + + if (desc->push_dirty) + hk_cmd_buffer_flush_push_descriptors(cmd, desc); + + if ((gfx->dirty & HK_DIRTY_INDEX) && + (gfx->index.restart || gfx->shaders[MESA_SHADER_GEOMETRY])) + hk_flush_index(cmd, cs); + + hk_flush_dynamic_state(cmd, cs, draw_id, draw); + return cs; +} + +VKAPI_ATTR void VKAPI_CALL +hk_CmdBindIndexBuffer2KHR(VkCommandBuffer commandBuffer, VkBuffer _buffer, + VkDeviceSize offset, VkDeviceSize size, + VkIndexType indexType) +{ + VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer); + VK_FROM_HANDLE(hk_buffer, buffer, _buffer); + + cmd->state.gfx.index = (struct hk_index_buffer_state){ + .buffer = hk_buffer_addr_range(buffer, offset, size), + .size = agx_translate_index_size(vk_index_type_to_bytes(indexType)), + .restart = vk_index_to_restart(indexType), + }; + + /* TODO: check if necessary, blob does this */ + cmd->state.gfx.index.buffer.range = + align(cmd->state.gfx.index.buffer.range, 4); + + cmd->state.gfx.dirty |= HK_DIRTY_INDEX; +} + +void +hk_cmd_bind_vertex_buffer(struct hk_cmd_buffer *cmd, uint32_t vb_idx, + struct hk_addr_range addr_range) +{ + cmd->state.gfx.vb[vb_idx] = addr_range; + cmd->state.gfx.dirty |= HK_DIRTY_VB; +} + +VKAPI_ATTR void VKAPI_CALL +hk_CmdBindVertexBuffers2(VkCommandBuffer commandBuffer, uint32_t firstBinding, + uint32_t bindingCount, const VkBuffer *pBuffers, + const VkDeviceSize *pOffsets, + const VkDeviceSize *pSizes, + const VkDeviceSize *pStrides) +{ + VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer); + + if (pStrides) { + vk_cmd_set_vertex_binding_strides(&cmd->vk, firstBinding, bindingCount, + pStrides); + } + + for (uint32_t i = 0; i < bindingCount; i++) { + VK_FROM_HANDLE(hk_buffer, buffer, pBuffers[i]); + uint32_t idx = firstBinding + i; + + uint64_t size = pSizes ? pSizes[i] : VK_WHOLE_SIZE; + const struct hk_addr_range addr_range = + hk_buffer_addr_range(buffer, pOffsets[i], size); + + hk_cmd_bind_vertex_buffer(cmd, idx, addr_range); + } +} + +static bool +hk_set_view_index(struct hk_cmd_buffer *cmd, uint32_t view_idx) +{ + if (cmd->state.gfx.render.view_mask) { + cmd->state.gfx.descriptors.root.draw.view_index = view_idx; + cmd->state.gfx.descriptors.root_dirty = true; + } + + return true; +} + +/* + * Iterator macro to duplicate a draw for each enabled view (when multiview is + * enabled, else always view 0). Along with hk_lower_multiview, this forms the + * world's worst multiview lowering. 
+ */ +#define hk_foreach_view(cmd) \ + u_foreach_bit(view_idx, cmd->state.gfx.render.view_mask ?: 1) \ + if (hk_set_view_index(cmd, view_idx)) + +static void +hk_ia_update(struct hk_cmd_buffer *cmd, struct hk_cs *cs, struct hk_draw draw, + uint64_t ia_vertices, uint64_t vs_invocations) +{ + /* XXX: stream link needed? */ + struct hk_device *dev = hk_cmd_buffer_device(cmd); + perf_debug(dev, "Input assembly counters"); + + struct agx_increment_ia_counters_key key = { + .index_size_B = draw.restart ? agx_index_size_to_B(draw.index_size) : 0, + }; + + uint64_t draw_ptr; + if (draw.b.indirect) { + draw_ptr = draw.b.ptr; + } else { + uint32_t desc[] = {draw.b.count[0], draw.b.count[1], 0}; + draw_ptr = hk_pool_upload(cmd, &desc, sizeof(desc), 4); + } + + struct libagx_increment_ia_counters args = { + .ia_vertices = ia_vertices, + .vs_invocations = vs_invocations, + .restart_index = cmd->state.gfx.index.restart, + .draw = draw_ptr, + .index_buffer = draw.index.addr, + .index_buffer_range_el = + key.index_size_B ? (draw.index.range / key.index_size_B) : 0, + }; + + uint64_t wg_size = key.index_size_B ? 1024 : 1; + + struct hk_shader *s = + hk_meta_kernel(dev, agx_nir_increment_ia_counters, &key, sizeof(key)); + + uint64_t push = hk_pool_upload(cmd, &args, sizeof(args), 8); + uint32_t usc = hk_upload_usc_words_kernel(cmd, s, &push, sizeof(push)); + + hk_dispatch_with_usc(dev, cs, s, usc, hk_grid(wg_size, 1, 1), + hk_grid(wg_size, 1, 1)); +} + +static void +hk_draw(struct hk_cmd_buffer *cmd, uint16_t draw_id, struct hk_draw draw_) +{ + const struct vk_dynamic_graphics_state *dyn = + &cmd->vk.dynamic_graphics_state; + + /* Filter trivial draws so we don't need to worry about null index buffers */ + if (!draw_.b.indirect && (draw_.b.count[0] == 0 || draw_.b.count[1] == 0)) + return; + + draw_.restart = dyn->ia.primitive_restart_enable; + draw_.index_size = cmd->state.gfx.index.size; + + uint64_t stat_ia_verts = hk_pipeline_stat_addr( + cmd, VK_QUERY_PIPELINE_STATISTIC_INPUT_ASSEMBLY_VERTICES_BIT); + + uint64_t stat_vs_inv = hk_pipeline_stat_addr( + cmd, VK_QUERY_PIPELINE_STATISTIC_VERTEX_SHADER_INVOCATIONS_BIT); + + bool ia_stats = stat_ia_verts || stat_vs_inv; + + hk_foreach_view(cmd) { + struct hk_draw draw = draw_; + struct hk_cs *cs = hk_flush_gfx_state(cmd, draw_id, draw); + /* If we failed to allocate a control stream, we've already lost the + * device. Just drop the draw so we don't crash. + */ + if (!cs) + return; + + bool geom = cmd->state.gfx.shaders[MESA_SHADER_GEOMETRY]; + bool tess = cmd->state.gfx.shaders[MESA_SHADER_TESS_EVAL]; + struct hk_cs *ccs = NULL; + uint8_t *out = cs->current; + assert(cs->current + 0x1000 < cs->end); + + if (geom || tess || ia_stats) { + ccs = + hk_cmd_buffer_get_cs_general(cmd, &cmd->current_cs.pre_gfx, true); + if (!ccs) + return; + } + + if (ia_stats) { + hk_ia_update(cmd, ccs, draw, stat_ia_verts, stat_vs_inv); + } + + if (tess) { + draw = hk_launch_tess(cmd, ccs, draw); + + if (draw.raw) { + assert(!geom); + assert(draw.b.indirect); + + agx_push(out, VDM_STREAM_LINK, cfg) { + cfg.target_lo = draw.b.ptr & BITFIELD_MASK(32); + cfg.target_hi = draw.b.ptr >> 32; + cfg.with_return = true; + } + + cs->current = out; + continue; + } + } + + if (geom) { + draw = hk_launch_gs_prerast(cmd, ccs, draw); + + /* We must not draw if the app specified rasterizer discard. This is + * required for both performance (it is pointless to rasterize and + * there are no side effects), but also correctness (no indirect draw + * descriptor will be filled out). 
+ */ + if (dyn->rs.rasterizer_discard_enable) + continue; + } + + uint64_t ib = draw.index.addr; + if (draw.indexed && !draw.b.indirect) + ib += (draw.start << draw.index_size); + + agx_push(out, INDEX_LIST, cfg) { + cfg.primitive = cmd->state.gfx.topology; + + if (draw.b.indirect) { + cfg.indirect_buffer_present = true; + } else { + cfg.instance_count_present = true; + cfg.index_count_present = true; + cfg.start_present = true; + } + + if (draw.indexed) { + cfg.restart_enable = draw.restart; + cfg.index_buffer_hi = ib >> 32; + cfg.index_size = draw.index_size; + + cfg.index_buffer_present = true; + cfg.index_buffer_size_present = true; + } + } + + if (draw.indexed) { + agx_push(out, INDEX_LIST_BUFFER_LO, cfg) { + cfg.buffer_lo = ib; + } + } + + if (draw.b.indirect) { + agx_push(out, INDEX_LIST_INDIRECT_BUFFER, cfg) { + cfg.address_hi = draw.b.ptr >> 32; + cfg.address_lo = draw.b.ptr & BITFIELD_MASK(32); + } + } else { + agx_push(out, INDEX_LIST_COUNT, cfg) { + cfg.count = draw.b.count[0]; + } + + agx_push(out, INDEX_LIST_INSTANCES, cfg) { + cfg.count = draw.b.count[1]; + } + + agx_push(out, INDEX_LIST_START, cfg) { + cfg.start = draw.indexed ? draw.index_bias : draw.start; + } + } + + if (draw.indexed) { + agx_push(out, INDEX_LIST_BUFFER_SIZE, cfg) { + cfg.size = draw.index.range; + } + } + + cs->current = out; + } +} + +VKAPI_ATTR void VKAPI_CALL +hk_CmdDraw(VkCommandBuffer commandBuffer, uint32_t vertexCount, + uint32_t instanceCount, uint32_t firstVertex, uint32_t firstInstance) +{ + VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer); + + struct hk_draw draw = { + .b = hk_grid(vertexCount, instanceCount, 1), + .start = firstVertex, + .start_instance = firstInstance, + }; + + hk_draw(cmd, 0, draw); +} + +VKAPI_ATTR void VKAPI_CALL +hk_CmdDrawMultiEXT(VkCommandBuffer commandBuffer, uint32_t drawCount, + const VkMultiDrawInfoEXT *pVertexInfo, + uint32_t instanceCount, uint32_t firstInstance, + uint32_t stride) +{ + VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer); + + for (unsigned i = 0; i < drawCount; ++i) { + struct hk_draw draw = { + .b = hk_grid(pVertexInfo->vertexCount, instanceCount, 1), + .start = pVertexInfo->firstVertex, + .start_instance = firstInstance, + }; + + hk_draw(cmd, i, draw); + pVertexInfo = ((void *)pVertexInfo) + stride; + } +} + +static void +hk_draw_indexed(VkCommandBuffer commandBuffer, uint16_t draw_id, + uint32_t indexCount, uint32_t instanceCount, + uint32_t firstIndex, int32_t vertexOffset, + uint32_t firstInstance) +{ + VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer); + + struct hk_draw draw = { + .b = hk_grid(indexCount, instanceCount, 1), + .indexed = true, + .index = cmd->state.gfx.index.buffer, + .start = firstIndex, + .index_bias = vertexOffset, + .start_instance = firstInstance, + }; + + hk_draw(cmd, draw_id, draw); +} + +VKAPI_ATTR void VKAPI_CALL +hk_CmdDrawIndexed(VkCommandBuffer commandBuffer, uint32_t indexCount, + uint32_t instanceCount, uint32_t firstIndex, + int32_t vertexOffset, uint32_t firstInstance) +{ + hk_draw_indexed(commandBuffer, 0, indexCount, instanceCount, firstIndex, + vertexOffset, firstInstance); +} + +VKAPI_ATTR void VKAPI_CALL +hk_CmdDrawMultiIndexedEXT(VkCommandBuffer commandBuffer, uint32_t drawCount, + const VkMultiDrawIndexedInfoEXT *pIndexInfo, + uint32_t instanceCount, uint32_t firstInstance, + uint32_t stride, const int32_t *pVertexOffset) +{ + for (unsigned i = 0; i < drawCount; ++i) { + const uint32_t vertex_offset = + pVertexOffset != NULL ? 
*pVertexOffset : pIndexInfo->vertexOffset; + + hk_draw_indexed(commandBuffer, i, pIndexInfo->indexCount, instanceCount, + pIndexInfo->firstIndex, vertex_offset, firstInstance); + + pIndexInfo = ((void *)pIndexInfo) + stride; + } +} + +VKAPI_ATTR void VKAPI_CALL +hk_CmdDrawIndirect(VkCommandBuffer commandBuffer, VkBuffer _buffer, + VkDeviceSize offset, uint32_t drawCount, uint32_t stride) +{ + VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer); + VK_FROM_HANDLE(hk_buffer, buffer, _buffer); + + /* From the Vulkan 1.3.238 spec: + * + * VUID-vkCmdDrawIndirect-drawCount-00476 + * + * "If drawCount is greater than 1, stride must be a multiple of 4 and + * must be greater than or equal to sizeof(VkDrawIndirectCommand)" + * + * and + * + * "If drawCount is less than or equal to one, stride is ignored." + */ + if (drawCount > 1) { + assert(stride % 4 == 0); + assert(stride >= sizeof(VkDrawIndirectCommand)); + } + + for (unsigned draw_id = 0; draw_id < drawCount; ++draw_id) { + uint64_t addr = hk_buffer_address(buffer, offset) + stride * draw_id; + hk_draw(cmd, draw_id, hk_draw_indirect(addr)); + } +} + +VKAPI_ATTR void VKAPI_CALL +hk_CmdDrawIndexedIndirect(VkCommandBuffer commandBuffer, VkBuffer _buffer, + VkDeviceSize offset, uint32_t drawCount, + uint32_t stride) +{ + VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer); + VK_FROM_HANDLE(hk_buffer, buffer, _buffer); + + /* From the Vulkan 1.3.238 spec: + * + * VUID-vkCmdDrawIndexedIndirect-drawCount-00528 + * + * "If drawCount is greater than 1, stride must be a multiple of 4 and + * must be greater than or equal to + * sizeof(VkDrawIndexedIndirectCommand)" + * + * and + * + * "If drawCount is less than or equal to one, stride is ignored." + */ + if (drawCount > 1) { + assert(stride % 4 == 0); + assert(stride >= sizeof(VkDrawIndexedIndirectCommand)); + } + + for (unsigned draw_id = 0; draw_id < drawCount; ++draw_id) { + uint64_t addr = hk_buffer_address(buffer, offset) + stride * draw_id; + + hk_draw( + cmd, draw_id, + hk_draw_indexed_indirect(addr, cmd->state.gfx.index.buffer, 0, 0)); + } +} + +VKAPI_ATTR void VKAPI_CALL +hk_CmdDrawIndirectCount(VkCommandBuffer commandBuffer, VkBuffer _buffer, + VkDeviceSize offset, VkBuffer countBuffer, + VkDeviceSize countBufferOffset, uint32_t maxDrawCount, + uint32_t stride) +{ + unreachable("TODO"); +} + +VKAPI_ATTR void VKAPI_CALL +hk_CmdDrawIndexedIndirectCount(VkCommandBuffer commandBuffer, VkBuffer _buffer, + VkDeviceSize offset, VkBuffer countBuffer, + VkDeviceSize countBufferOffset, + uint32_t maxDrawCount, uint32_t stride) +{ + unreachable("TODO"); +} + +VKAPI_ATTR void VKAPI_CALL +hk_CmdDrawIndirectByteCountEXT(VkCommandBuffer commandBuffer, + uint32_t instanceCount, uint32_t firstInstance, + VkBuffer counterBuffer, + VkDeviceSize counterBufferOffset, + uint32_t counterOffset, uint32_t vertexStride) +{ + unreachable("TODO"); +} + +VKAPI_ATTR void VKAPI_CALL +hk_CmdBindTransformFeedbackBuffersEXT(VkCommandBuffer commandBuffer, + uint32_t firstBinding, + uint32_t bindingCount, + const VkBuffer *pBuffers, + const VkDeviceSize *pOffsets, + const VkDeviceSize *pSizes) +{ + VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer); + struct hk_graphics_state *gfx = &cmd->state.gfx; + + for (uint32_t i = 0; i < bindingCount; i++) { + VK_FROM_HANDLE(hk_buffer, buffer, pBuffers[i]); + uint32_t idx = firstBinding + i; + uint64_t size = pSizes ? 
pSizes[i] : VK_WHOLE_SIZE; + + gfx->xfb[idx] = hk_buffer_addr_range(buffer, pOffsets[i], size); + } +} + +static void +hk_libagx_copy_xfb_counters(nir_builder *b, const void *key) +{ + b->shader->info.workgroup_size_variable = true; + + libagx_copy_xfb_counters(b, nir_load_preamble(b, 1, 64)); +} + +static void +hk_begin_end_xfb(VkCommandBuffer commandBuffer, uint32_t firstCounterBuffer, + uint32_t counterBufferCount, const VkBuffer *pCounterBuffers, + const VkDeviceSize *pCounterBufferOffsets, bool begin) + +{ + VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer); + struct hk_device *dev = hk_cmd_buffer_device(cmd); + struct hk_graphics_state *gfx = &cmd->state.gfx; + + gfx->xfb_enabled = begin; + + /* If we haven't reserved XFB offsets yet for the command buffer, do so. */ + if (!gfx->xfb_offsets) { + gfx->xfb_offsets = hk_pool_alloc(cmd, 4 * sizeof(uint32_t), 4).gpu; + } + + struct hk_cs *cs = + hk_cmd_buffer_get_cs_general(cmd, &cmd->current_cs.pre_gfx, true); + if (!cs) + return; + hk_ensure_cs_has_space(cmd, cs, 0x2000 /* TODO */); + + struct libagx_xfb_counter_copy params = {}; + unsigned copies = 0; + + /* For CmdBeginTransformFeedbackEXT, we need to initialize everything */ + if (begin) { + for (copies = 0; copies < 4; ++copies) { + params.dest[copies] = gfx->xfb_offsets + copies * sizeof(uint32_t); + } + } + + for (unsigned i = 0; i < counterBufferCount; ++i) { + if (pCounterBuffers[i] == VK_NULL_HANDLE) + continue; + + VK_FROM_HANDLE(hk_buffer, buffer, pCounterBuffers[i]); + + uint64_t offset = pCounterBufferOffsets ? pCounterBufferOffsets[i] : 0; + uint64_t cb_addr = hk_buffer_address(buffer, offset); + uint32_t cmd_idx = firstCounterBuffer + i; + + if (begin) { + params.src[cmd_idx] = cb_addr; + } else { + params.dest[copies] = cb_addr; + params.src[copies] = gfx->xfb_offsets + cmd_idx * sizeof(uint32_t); + ++copies; + } + } + + if (begin) + copies = 4; + + if (copies > 0) { + perf_debug(dev, "XFB counter copy"); + + struct hk_shader *s = + hk_meta_kernel(dev, hk_libagx_copy_xfb_counters, NULL, 0); + + uint64_t push = hk_pool_upload(cmd, ¶ms, sizeof(params), 8); + uint32_t usc = hk_upload_usc_words_kernel(cmd, s, &push, sizeof(push)); + + hk_dispatch_with_usc(dev, cs, s, usc, hk_grid(copies, 1, 1), + hk_grid(copies, 1, 1)); + } +} + +VKAPI_ATTR void VKAPI_CALL +hk_CmdBeginTransformFeedbackEXT(VkCommandBuffer commandBuffer, + uint32_t firstCounterBuffer, + uint32_t counterBufferCount, + const VkBuffer *pCounterBuffers, + const VkDeviceSize *pCounterBufferOffsets) +{ + hk_begin_end_xfb(commandBuffer, firstCounterBuffer, counterBufferCount, + pCounterBuffers, pCounterBufferOffsets, true); +} + +VKAPI_ATTR void VKAPI_CALL +hk_CmdEndTransformFeedbackEXT(VkCommandBuffer commandBuffer, + uint32_t firstCounterBuffer, + uint32_t counterBufferCount, + const VkBuffer *pCounterBuffers, + const VkDeviceSize *pCounterBufferOffsets) +{ + hk_begin_end_xfb(commandBuffer, firstCounterBuffer, counterBufferCount, + pCounterBuffers, pCounterBufferOffsets, false); +} + +VKAPI_ATTR void VKAPI_CALL +hk_CmdBeginConditionalRenderingEXT( + VkCommandBuffer commandBuffer, + const VkConditionalRenderingBeginInfoEXT *pConditionalRenderingBegin) +{ + unreachable("stub"); +} + +VKAPI_ATTR void VKAPI_CALL +hk_CmdEndConditionalRenderingEXT(VkCommandBuffer commandBuffer) +{ + unreachable("stub"); +} diff --git a/src/asahi/vulkan/hk_cmd_meta.c b/src/asahi/vulkan/hk_cmd_meta.c new file mode 100644 index 00000000000..ee70d9d0d3c --- /dev/null +++ b/src/asahi/vulkan/hk_cmd_meta.c @@ -0,0 +1,1692 @@ +/* + * Copyright 
2024 Valve Corporation + * Copyright 2024 Alyssa Rosenzweig + * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc. + * SPDX-License-Identifier: MIT + */ +#include "vulkan/vulkan_core.h" +#include "agx_pack.h" +#include "hk_buffer.h" +#include "hk_cmd_buffer.h" +#include "hk_device.h" +#include "hk_entrypoints.h" +#include "hk_image.h" +#include "hk_physical_device.h" + +#include "nir_builder.h" +#include "shader_enums.h" +#include "vk_format.h" +#include "vk_meta.h" +#include "vk_pipeline.h" + +static VkResult +hk_cmd_bind_map_buffer(struct vk_command_buffer *vk_cmd, + struct vk_meta_device *meta, VkBuffer _buffer, + void **map_out) +{ + struct hk_cmd_buffer *cmd = container_of(vk_cmd, struct hk_cmd_buffer, vk); + VK_FROM_HANDLE(hk_buffer, buffer, _buffer); + + assert(buffer->vk.size < UINT_MAX); + struct agx_ptr T = hk_pool_alloc(cmd, buffer->vk.size, 16); + if (unlikely(T.cpu == NULL)) + return VK_ERROR_OUT_OF_POOL_MEMORY; + + buffer->addr = T.gpu; + *map_out = T.cpu; + return VK_SUCCESS; +} + +VkResult +hk_device_init_meta(struct hk_device *dev) +{ + VkResult result = vk_meta_device_init(&dev->vk, &dev->meta); + if (result != VK_SUCCESS) + return result; + + dev->meta.use_gs_for_layer = false; + dev->meta.use_stencil_export = true; + dev->meta.cmd_bind_map_buffer = hk_cmd_bind_map_buffer; + dev->meta.max_bind_map_buffer_size_B = 64 * 1024; + + return VK_SUCCESS; +} + +void +hk_device_finish_meta(struct hk_device *dev) +{ + vk_meta_device_finish(&dev->vk, &dev->meta); +} + +struct hk_meta_save { + struct vk_vertex_input_state _dynamic_vi; + struct vk_sample_locations_state _dynamic_sl; + struct vk_dynamic_graphics_state dynamic; + struct hk_api_shader *shaders[MESA_SHADER_MESH + 1]; + struct hk_addr_range vb0; + struct hk_descriptor_set *desc0; + bool has_push_desc0; + enum agx_visibility_mode occlusion; + struct hk_push_descriptor_set push_desc0; + VkQueryPipelineStatisticFlags pipeline_stats_flags; + uint8_t push[128]; +}; + +static void +hk_meta_begin(struct hk_cmd_buffer *cmd, struct hk_meta_save *save, + VkPipelineBindPoint bind_point) +{ + struct hk_descriptor_state *desc = hk_get_descriptors_state(cmd, bind_point); + + if (bind_point == VK_PIPELINE_BIND_POINT_GRAPHICS) { + save->dynamic = cmd->vk.dynamic_graphics_state; + save->_dynamic_vi = cmd->state.gfx._dynamic_vi; + save->_dynamic_sl = cmd->state.gfx._dynamic_sl; + + static_assert(sizeof(cmd->state.gfx.shaders) == sizeof(save->shaders)); + memcpy(save->shaders, cmd->state.gfx.shaders, sizeof(save->shaders)); + + /* Pause queries */ + save->occlusion = cmd->state.gfx.occlusion.mode; + cmd->state.gfx.occlusion.mode = AGX_VISIBILITY_MODE_NONE; + cmd->state.gfx.dirty |= HK_DIRTY_OCCLUSION; + + save->pipeline_stats_flags = desc->root.draw.pipeline_stats_flags; + desc->root.draw.pipeline_stats_flags = 0; + desc->root_dirty = true; + } else { + save->shaders[MESA_SHADER_COMPUTE] = cmd->state.cs.shader; + } + + save->vb0 = cmd->state.gfx.vb[0]; + + save->desc0 = desc->sets[0]; + save->has_push_desc0 = desc->push[0]; + if (save->has_push_desc0) + save->push_desc0 = *desc->push[0]; + + static_assert(sizeof(save->push) == sizeof(desc->root.push)); + memcpy(save->push, desc->root.push, sizeof(save->push)); + + cmd->in_meta = true; +} + +static void +hk_meta_init_render(struct hk_cmd_buffer *cmd, + struct vk_meta_rendering_info *info) +{ + const struct hk_rendering_state *render = &cmd->state.gfx.render; + + *info = (struct vk_meta_rendering_info){ + .samples = MAX2(render->tilebuffer.nr_samples, 1), + .view_mask = render->view_mask, + 
.color_attachment_count = render->color_att_count, + .depth_attachment_format = render->depth_att.vk_format, + .stencil_attachment_format = render->stencil_att.vk_format, + }; + for (uint32_t a = 0; a < render->color_att_count; a++) + info->color_attachment_formats[a] = render->color_att[a].vk_format; +} + +static void +hk_meta_end(struct hk_cmd_buffer *cmd, struct hk_meta_save *save, + VkPipelineBindPoint bind_point) +{ + struct hk_descriptor_state *desc = hk_get_descriptors_state(cmd, bind_point); + desc->root_dirty = true; + + if (save->desc0) { + desc->sets[0] = save->desc0; + desc->root.sets[0] = hk_descriptor_set_addr(save->desc0); + desc->sets_dirty |= BITFIELD_BIT(0); + desc->push_dirty &= ~BITFIELD_BIT(0); + } else if (save->has_push_desc0) { + *desc->push[0] = save->push_desc0; + desc->push_dirty |= BITFIELD_BIT(0); + } + + if (bind_point == VK_PIPELINE_BIND_POINT_GRAPHICS) { + /* Restore the dynamic state */ + assert(save->dynamic.vi == &cmd->state.gfx._dynamic_vi); + assert(save->dynamic.ms.sample_locations == &cmd->state.gfx._dynamic_sl); + cmd->vk.dynamic_graphics_state = save->dynamic; + cmd->state.gfx._dynamic_vi = save->_dynamic_vi; + cmd->state.gfx._dynamic_sl = save->_dynamic_sl; + memcpy(cmd->vk.dynamic_graphics_state.dirty, + cmd->vk.dynamic_graphics_state.set, + sizeof(cmd->vk.dynamic_graphics_state.set)); + + for (uint32_t stage = 0; stage < ARRAY_SIZE(save->shaders); stage++) { + hk_cmd_bind_graphics_shader(cmd, stage, save->shaders[stage]); + } + + hk_cmd_bind_vertex_buffer(cmd, 0, save->vb0); + + /* Restore queries */ + cmd->state.gfx.occlusion.mode = save->occlusion; + cmd->state.gfx.dirty |= HK_DIRTY_OCCLUSION; + + desc->root.draw.pipeline_stats_flags = save->pipeline_stats_flags; + desc->root_dirty = true; + } else { + hk_cmd_bind_compute_shader(cmd, save->shaders[MESA_SHADER_COMPUTE]); + } + + memcpy(desc->root.push, save->push, sizeof(save->push)); + cmd->in_meta = false; +} + +#define VK_META_OBJECT_KEY_COPY_IMAGE_TO_BUFFER_PIPELINE (0xcafe0000) +#define VK_META_OBJECT_KEY_FILL_PIPELINE (0xcafe0001) + +#define BINDING_OUTPUT 0 +#define BINDING_INPUT 1 + +static VkFormat +aspect_format(VkFormat fmt, VkImageAspectFlags aspect) +{ + bool depth = (aspect & VK_IMAGE_ASPECT_DEPTH_BIT); + bool stencil = (aspect & VK_IMAGE_ASPECT_STENCIL_BIT); + + enum pipe_format p_format = vk_format_to_pipe_format(fmt); + + if (util_format_is_depth_or_stencil(p_format)) { + assert(depth ^ stencil); + if (depth) { + switch (fmt) { + case VK_FORMAT_D32_SFLOAT: + case VK_FORMAT_D32_SFLOAT_S8_UINT: + return VK_FORMAT_D32_SFLOAT; + case VK_FORMAT_D16_UNORM: + case VK_FORMAT_D16_UNORM_S8_UINT: + return VK_FORMAT_D16_UNORM; + default: + unreachable("invalid depth"); + } + } else { + switch (fmt) { + case VK_FORMAT_S8_UINT: + case VK_FORMAT_D32_SFLOAT_S8_UINT: + case VK_FORMAT_D16_UNORM_S8_UINT: + return VK_FORMAT_S8_UINT; + default: + unreachable("invalid stencil"); + } + } + } + + assert(!depth && !stencil); + + const struct vk_format_ycbcr_info *ycbcr_info = + vk_format_get_ycbcr_info(fmt); + + if (ycbcr_info) { + switch (aspect) { + case VK_IMAGE_ASPECT_PLANE_0_BIT: + return ycbcr_info->planes[0].format; + case VK_IMAGE_ASPECT_PLANE_1_BIT: + return ycbcr_info->planes[1].format; + case VK_IMAGE_ASPECT_PLANE_2_BIT: + return ycbcr_info->planes[2].format; + default: + unreachable("invalid ycbcr aspect"); + } + } + + return fmt; +} + +static VkFormat +canonical_format(VkFormat fmt) +{ + enum pipe_format p_format = vk_format_to_pipe_format(fmt); + + if 
(util_format_is_depth_or_stencil(p_format)) + return fmt; + + switch (util_format_get_blocksize(p_format)) { + case 1: + return VK_FORMAT_R8_UINT; + case 2: + return VK_FORMAT_R16_UINT; + case 4: + return VK_FORMAT_R32_UINT; + case 8: + return VK_FORMAT_R32G32_UINT; + case 16: + return VK_FORMAT_R32G32B32A32_UINT; + default: + unreachable("invalid bpp"); + } +} + +enum copy_type { + BUF2IMG, + IMG2BUF, + IMG2IMG, +}; + +struct vk_meta_push_data { + uint32_t buffer_offset; + uint32_t row_extent; + uint32_t slice_or_layer_extent; + + int32_t src_offset_el[4]; + int32_t dst_offset_el[4]; + uint32_t grid_el[3]; +} PACKED; + +#define get_push(b, name) \ + nir_load_push_constant( \ + b, 1, sizeof(((struct vk_meta_push_data *)0)->name) * 8, \ + nir_imm_int(b, offsetof(struct vk_meta_push_data, name))) + +struct vk_meta_image_copy_key { + enum vk_meta_object_key_type key_type; + enum copy_type type; + unsigned block_size; + unsigned nr_samples; +}; + +static nir_def * +linearize_coords(nir_builder *b, nir_def *coord, + const struct vk_meta_image_copy_key *key) +{ + assert(key->nr_samples == 1 && "buffer<-->image copies not multisampled"); + + nir_def *row_extent = get_push(b, row_extent); + nir_def *slice_or_layer_extent = get_push(b, slice_or_layer_extent); + nir_def *x = nir_channel(b, coord, 0); + nir_def *y = nir_channel(b, coord, 1); + nir_def *z_or_layer = nir_channel(b, coord, 2); + + nir_def *v = get_push(b, buffer_offset); + + v = nir_iadd(b, v, nir_imul_imm(b, x, key->block_size)); + v = nir_iadd(b, v, nir_imul(b, y, row_extent)); + v = nir_iadd(b, v, nir_imul(b, z_or_layer, slice_or_layer_extent)); + + return nir_udiv_imm(b, v, key->block_size); +} + +static nir_shader * +build_image_copy_shader(const struct vk_meta_image_copy_key *key) +{ + nir_builder build = + nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, NULL, "vk-meta-copy"); + + nir_builder *b = &build; + b->shader->info.workgroup_size[0] = 32; + b->shader->info.workgroup_size[1] = 32; + + bool src_is_buf = key->type == BUF2IMG; + bool dst_is_buf = key->type == IMG2BUF; + + bool msaa = key->nr_samples > 1; + enum glsl_sampler_dim dim_2d = + msaa ? GLSL_SAMPLER_DIM_MS : GLSL_SAMPLER_DIM_2D; + enum glsl_sampler_dim dim_src = src_is_buf ? GLSL_SAMPLER_DIM_BUF : dim_2d; + enum glsl_sampler_dim dim_dst = dst_is_buf ? 
GLSL_SAMPLER_DIM_BUF : dim_2d; + + const struct glsl_type *texture_type = + glsl_sampler_type(dim_src, false, !src_is_buf, GLSL_TYPE_UINT); + + const struct glsl_type *image_type = + glsl_image_type(dim_dst, !dst_is_buf, GLSL_TYPE_UINT); + + nir_variable *texture = + nir_variable_create(b->shader, nir_var_uniform, texture_type, "source"); + nir_variable *image = + nir_variable_create(b->shader, nir_var_image, image_type, "dest"); + + image->data.descriptor_set = 0; + image->data.binding = BINDING_OUTPUT; + image->data.access = ACCESS_NON_READABLE; + + texture->data.descriptor_set = 0; + texture->data.binding = BINDING_INPUT; + + /* Grab the offset vectors */ + nir_def *src_offset_el = nir_load_push_constant( + b, 3, 32, + nir_imm_int(b, offsetof(struct vk_meta_push_data, src_offset_el))); + + nir_def *dst_offset_el = nir_load_push_constant( + b, 3, 32, + nir_imm_int(b, offsetof(struct vk_meta_push_data, dst_offset_el))); + + nir_def *grid_el = nir_load_push_constant( + b, 3, 32, nir_imm_int(b, offsetof(struct vk_meta_push_data, grid_el))); + + /* We're done setting up variables, do the copy */ + nir_def *coord = nir_load_global_invocation_id(b, 32); + + nir_push_if(b, + nir_ball(b, nir_trim_vector(b, nir_ult(b, coord, grid_el), 2))); + { + nir_def *src_coord = nir_iadd(b, coord, src_offset_el); + nir_def *dst_coord = nir_iadd(b, coord, dst_offset_el); + + /* Special case handle buffer indexing */ + if (dst_is_buf) { + dst_coord = linearize_coords(b, coord, key); + } else if (src_is_buf) { + src_coord = linearize_coords(b, coord, key); + } + + /* Copy formatted texel from texture to storage image */ + for (unsigned s = 0; s < key->nr_samples; ++s) { + nir_deref_instr *deref = nir_build_deref_var(b, texture); + nir_def *ms_index = nir_imm_int(b, s); + + nir_def *value = msaa ? nir_txf_ms_deref(b, deref, src_coord, ms_index) + : nir_txf_deref(b, deref, src_coord, NULL); + + nir_image_deref_store(b, &nir_build_deref_var(b, image)->def, + nir_pad_vec4(b, dst_coord), ms_index, value, + nir_imm_int(b, 0), .image_dim = dim_dst, + .image_array = !dst_is_buf); + } + } + nir_pop_if(b, NULL); + return b->shader; +} + +static VkResult +get_image_copy_descriptor_set_layout(struct vk_device *device, + struct vk_meta_device *meta, + VkDescriptorSetLayout *layout_out, + enum copy_type type) +{ + const char *keys[] = { + [IMG2BUF] = "vk-meta-copy-image-to-buffer-descriptor-set-layout", + [BUF2IMG] = "vk-meta-copy-buffer-to-image-descriptor-set-layout", + [IMG2IMG] = "vk-meta-copy-image-to-image-descriptor-set-layout", + }; + + VkDescriptorSetLayout from_cache = vk_meta_lookup_descriptor_set_layout( + meta, keys[type], strlen(keys[type])); + if (from_cache != VK_NULL_HANDLE) { + *layout_out = from_cache; + return VK_SUCCESS; + } + + const VkDescriptorSetLayoutBinding bindings[] = { + { + .binding = BINDING_OUTPUT, + .descriptorType = type != IMG2BUF + ? VK_DESCRIPTOR_TYPE_STORAGE_IMAGE + : VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER, + .descriptorCount = 1, + .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, + }, + { + .binding = BINDING_INPUT, + .descriptorType = type == BUF2IMG + ? 
VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER + : VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE, + .descriptorCount = 1, + .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, + }, + }; + + const VkDescriptorSetLayoutCreateInfo info = { + .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO, + .bindingCount = ARRAY_SIZE(bindings), + .pBindings = bindings, + }; + + return vk_meta_create_descriptor_set_layout(device, meta, &info, keys[type], + strlen(keys[type]), layout_out); +} + +static VkResult +get_image_copy_pipeline_layout(struct vk_device *device, + struct vk_meta_device *meta, + struct vk_meta_image_copy_key *key, + VkDescriptorSetLayout set_layout, + VkPipelineLayout *layout_out, + enum copy_type type) +{ + const char *keys[] = { + [IMG2BUF] = "vk-meta-copy-image-to-buffer-pipeline-layout", + [BUF2IMG] = "vk-meta-copy-buffer-to-image-pipeline-layout", + [IMG2IMG] = "vk-meta-copy-image-to-image-pipeline-layout", + }; + + VkPipelineLayout from_cache = + vk_meta_lookup_pipeline_layout(meta, keys[type], strlen(keys[type])); + if (from_cache != VK_NULL_HANDLE) { + *layout_out = from_cache; + return VK_SUCCESS; + } + + VkPipelineLayoutCreateInfo info = { + .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO, + .setLayoutCount = 1, + .pSetLayouts = &set_layout, + }; + + const VkPushConstantRange push_range = { + .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, + .offset = 0, + .size = sizeof(struct vk_meta_push_data), + }; + + info.pushConstantRangeCount = 1; + info.pPushConstantRanges = &push_range; + + return vk_meta_create_pipeline_layout(device, meta, &info, keys[type], + strlen(keys[type]), layout_out); +} + +static VkResult +get_image_copy_pipeline(struct vk_device *device, struct vk_meta_device *meta, + const struct vk_meta_image_copy_key *key, + VkPipelineLayout layout, VkPipeline *pipeline_out) +{ + VkPipeline from_cache = vk_meta_lookup_pipeline(meta, key, sizeof(*key)); + if (from_cache != VK_NULL_HANDLE) { + *pipeline_out = from_cache; + return VK_SUCCESS; + } + + const VkPipelineShaderStageNirCreateInfoMESA nir_info = { + .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_NIR_CREATE_INFO_MESA, + .nir = build_image_copy_shader(key), + }; + const VkPipelineShaderStageCreateInfo cs_info = { + .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, + .pNext = &nir_info, + .stage = VK_SHADER_STAGE_COMPUTE_BIT, + .pName = "main", + }; + + const VkComputePipelineCreateInfo info = { + .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO, + .stage = cs_info, + .layout = layout, + }; + + VkResult result = vk_meta_create_compute_pipeline( + device, meta, &info, key, sizeof(*key), pipeline_out); + ralloc_free(nir_info.nir); + + return result; +} + +static void +hk_meta_copy_image_to_buffer2(struct vk_command_buffer *cmd, + struct vk_meta_device *meta, + const VkCopyImageToBufferInfo2 *pCopyBufferInfo) +{ + VK_FROM_HANDLE(vk_image, image, pCopyBufferInfo->srcImage); + VK_FROM_HANDLE(vk_image, src_image, pCopyBufferInfo->srcImage); + + struct vk_device *device = cmd->base.device; + const struct vk_device_dispatch_table *disp = &device->dispatch_table; + + VkResult result; + + VkDescriptorSetLayout set_layout; + result = + get_image_copy_descriptor_set_layout(device, meta, &set_layout, IMG2BUF); + if (unlikely(result != VK_SUCCESS)) { + vk_command_buffer_set_error(cmd, result); + return; + } + + bool per_layer = + util_format_is_compressed(vk_format_to_pipe_format(image->format)); + + for (unsigned i = 0; i < pCopyBufferInfo->regionCount; ++i) { + const VkBufferImageCopy2 *region = 
&pCopyBufferInfo->pRegions[i]; + + unsigned layers = MAX2(region->imageExtent.depth, + vk_image_subresource_layer_count( + src_image, ®ion->imageSubresource)); + unsigned layer_iters = per_layer ? layers : 1; + + for (unsigned layer_offs = 0; layer_offs < layer_iters; ++layer_offs) { + + VkImageAspectFlags aspect = region->imageSubresource.aspectMask; + VkFormat aspect_fmt = aspect_format(image->format, aspect); + VkFormat canonical = canonical_format(aspect_fmt); + + uint32_t blocksize_B = + util_format_get_blocksize(vk_format_to_pipe_format(canonical)); + + enum pipe_format p_format = vk_format_to_pipe_format(image->format); + + unsigned row_extent = util_format_get_nblocksx( + p_format, MAX2(region->bufferRowLength, + region->imageExtent.width)) * + blocksize_B; + unsigned slice_extent = + util_format_get_nblocksy( + p_format, + MAX2(region->bufferImageHeight, region->imageExtent.height)) * + row_extent; + unsigned layer_extent = + util_format_get_nblocksz(p_format, region->imageExtent.depth) * + slice_extent; + + bool is_3d = region->imageExtent.depth > 1; + + struct vk_meta_image_copy_key key = { + .key_type = VK_META_OBJECT_KEY_COPY_IMAGE_TO_BUFFER_PIPELINE, + .type = IMG2BUF, + .block_size = blocksize_B, + .nr_samples = image->samples, + }; + + VkPipelineLayout pipeline_layout; + result = get_image_copy_pipeline_layout(device, meta, &key, set_layout, + &pipeline_layout, false); + if (unlikely(result != VK_SUCCESS)) { + vk_command_buffer_set_error(cmd, result); + return; + } + + VkImageView src_view; + const VkImageViewUsageCreateInfo src_view_usage = { + .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_USAGE_CREATE_INFO, + .usage = VK_IMAGE_USAGE_SAMPLED_BIT, + }; + const VkImageViewCreateInfo src_view_info = { + .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO, + .flags = VK_IMAGE_VIEW_CREATE_INTERNAL_MESA, + .pNext = &src_view_usage, + .image = pCopyBufferInfo->srcImage, + .viewType = VK_IMAGE_VIEW_TYPE_2D_ARRAY, + .format = canonical, + .subresourceRange = + { + .aspectMask = region->imageSubresource.aspectMask, + .baseMipLevel = region->imageSubresource.mipLevel, + .baseArrayLayer = + MAX2(region->imageOffset.z, + region->imageSubresource.baseArrayLayer) + + layer_offs, + .layerCount = per_layer ? 1 : layers, + .levelCount = 1, + }, + }; + + result = + vk_meta_create_image_view(cmd, meta, &src_view_info, &src_view); + if (unlikely(result != VK_SUCCESS)) { + vk_command_buffer_set_error(cmd, result); + return; + } + + VkDescriptorImageInfo src_info = { + .imageLayout = pCopyBufferInfo->srcImageLayout, + .imageView = src_view, + }; + + VkWriteDescriptorSet desc_writes[2]; + + const VkBufferViewCreateInfo dst_view_info = { + .sType = VK_STRUCTURE_TYPE_BUFFER_VIEW_CREATE_INFO, + .buffer = pCopyBufferInfo->dstBuffer, + .format = canonical, + + /* Ideally, this would be region->bufferOffset, but that might not + * be aligned to minTexelBufferOffsetAlignment. Instead, we use a 0 + * offset (which is definitely aligned) and add the offset ourselves + * in the shader. 
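+          * (linearize_coords reads buffer_offset from the push constants and
+          * folds it into the linear texel index.)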
+ */ + .offset = 0, + .range = VK_WHOLE_SIZE, + }; + + VkBufferView dst_view; + VkResult result = + vk_meta_create_buffer_view(cmd, meta, &dst_view_info, &dst_view); + if (unlikely(result != VK_SUCCESS)) { + vk_command_buffer_set_error(cmd, result); + return; + } + + desc_writes[0] = (VkWriteDescriptorSet){ + .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET, + .dstSet = 0, + .dstBinding = BINDING_OUTPUT, + .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER, + .descriptorCount = 1, + .pTexelBufferView = &dst_view, + }; + + desc_writes[1] = (VkWriteDescriptorSet){ + .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET, + .dstSet = 0, + .dstBinding = BINDING_INPUT, + .descriptorType = VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE, + .descriptorCount = 1, + .pImageInfo = &src_info, + }; + + disp->CmdPushDescriptorSetKHR( + vk_command_buffer_to_handle(cmd), VK_PIPELINE_BIND_POINT_COMPUTE, + pipeline_layout, 0, ARRAY_SIZE(desc_writes), desc_writes); + + VkPipeline pipeline; + result = get_image_copy_pipeline(device, meta, &key, pipeline_layout, + &pipeline); + if (unlikely(result != VK_SUCCESS)) { + vk_command_buffer_set_error(cmd, result); + return; + } + + disp->CmdBindPipeline(vk_command_buffer_to_handle(cmd), + VK_PIPELINE_BIND_POINT_COMPUTE, pipeline); + + enum pipe_format p_src_fmt = + vk_format_to_pipe_format(src_image->format); + + struct vk_meta_push_data push = { + .buffer_offset = region->bufferOffset, + .row_extent = row_extent, + .slice_or_layer_extent = is_3d ? slice_extent : layer_extent, + + .src_offset_el[0] = + util_format_get_nblocksx(p_src_fmt, region->imageOffset.x), + .src_offset_el[1] = + util_format_get_nblocksy(p_src_fmt, region->imageOffset.y), + + .grid_el[0] = + util_format_get_nblocksx(p_format, region->imageExtent.width), + .grid_el[1] = + util_format_get_nblocksy(p_format, region->imageExtent.height), + .grid_el[2] = per_layer ? 1 : layers, + }; + + push.buffer_offset += push.slice_or_layer_extent * layer_offs; + + disp->CmdPushConstants(vk_command_buffer_to_handle(cmd), + pipeline_layout, VK_SHADER_STAGE_COMPUTE_BIT, 0, + sizeof(push), &push); + + disp->CmdDispatch(vk_command_buffer_to_handle(cmd), + DIV_ROUND_UP(push.grid_el[0], 32), + DIV_ROUND_UP(push.grid_el[1], 32), push.grid_el[2]); + } + } +} + +static void +hk_meta_copy_buffer_to_image2(struct vk_command_buffer *cmd, + struct vk_meta_device *meta, + const struct VkCopyBufferToImageInfo2 *info) +{ + VK_FROM_HANDLE(vk_image, image, info->dstImage); + + struct vk_device *device = cmd->base.device; + const struct vk_device_dispatch_table *disp = &device->dispatch_table; + + VkDescriptorSetLayout set_layout; + VkResult result = + get_image_copy_descriptor_set_layout(device, meta, &set_layout, BUF2IMG); + if (unlikely(result != VK_SUCCESS)) { + vk_command_buffer_set_error(cmd, result); + return; + } + + bool per_layer = + util_format_is_compressed(vk_format_to_pipe_format(image->format)); + + for (unsigned r = 0; r < info->regionCount; ++r) { + const VkBufferImageCopy2 *region = &info->pRegions[r]; + + unsigned layers = MAX2( + region->imageExtent.depth, + vk_image_subresource_layer_count(image, ®ion->imageSubresource)); + unsigned layer_iters = per_layer ? 
layers : 1; + + for (unsigned layer_offs = 0; layer_offs < layer_iters; ++layer_offs) { + VkImageAspectFlags aspect = region->imageSubresource.aspectMask; + VkFormat aspect_fmt = aspect_format(image->format, aspect); + VkFormat canonical = canonical_format(aspect_fmt); + enum pipe_format p_format = vk_format_to_pipe_format(aspect_fmt); + uint32_t blocksize_B = util_format_get_blocksize(p_format); + bool is_3d = region->imageExtent.depth > 1; + + struct vk_meta_image_copy_key key = { + .key_type = VK_META_OBJECT_KEY_COPY_IMAGE_TO_BUFFER_PIPELINE, + .type = BUF2IMG, + .block_size = blocksize_B, + .nr_samples = image->samples, + }; + + VkPipelineLayout pipeline_layout; + result = get_image_copy_pipeline_layout(device, meta, &key, set_layout, + &pipeline_layout, true); + if (unlikely(result != VK_SUCCESS)) { + vk_command_buffer_set_error(cmd, result); + return; + } + + VkWriteDescriptorSet desc_writes[2]; + + unsigned row_extent = util_format_get_nblocksx( + p_format, MAX2(region->bufferRowLength, + region->imageExtent.width)) * + blocksize_B; + unsigned slice_extent = + util_format_get_nblocksy( + p_format, + MAX2(region->bufferImageHeight, region->imageExtent.height)) * + row_extent; + unsigned layer_extent = + util_format_get_nblocksz(p_format, region->imageExtent.depth) * + slice_extent; + + /* Create a view into the source buffer as a texel buffer */ + const VkBufferViewCreateInfo src_view_info = { + .sType = VK_STRUCTURE_TYPE_BUFFER_VIEW_CREATE_INFO, + .buffer = info->srcBuffer, + .format = canonical, + + /* Ideally, this would be region->bufferOffset, but that might not + * be aligned to minTexelBufferOffsetAlignment. Instead, we use a 0 + * offset (which is definitely aligned) and add the offset ourselves + * in the shader. + */ + .offset = 0, + .range = VK_WHOLE_SIZE, + }; + + assert((region->bufferOffset % blocksize_B) == 0 && "must be aligned"); + + VkBufferView src_view; + result = + vk_meta_create_buffer_view(cmd, meta, &src_view_info, &src_view); + if (unlikely(result != VK_SUCCESS)) { + vk_command_buffer_set_error(cmd, result); + return; + } + + VkImageView dst_view; + const VkImageViewUsageCreateInfo dst_view_usage = { + .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_USAGE_CREATE_INFO, + .usage = VK_IMAGE_USAGE_STORAGE_BIT, + }; + const VkImageViewCreateInfo dst_view_info = { + .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO, + .flags = VK_IMAGE_VIEW_CREATE_INTERNAL_MESA, + .pNext = &dst_view_usage, + .image = info->dstImage, + .viewType = VK_IMAGE_VIEW_TYPE_2D_ARRAY, + .format = canonical, + .subresourceRange = + { + .aspectMask = region->imageSubresource.aspectMask, + .baseMipLevel = region->imageSubresource.mipLevel, + .baseArrayLayer = + MAX2(region->imageOffset.z, + region->imageSubresource.baseArrayLayer) + + layer_offs, + .layerCount = per_layer ? 
1 : layers, + .levelCount = 1, + }, + }; + + result = + vk_meta_create_image_view(cmd, meta, &dst_view_info, &dst_view); + if (unlikely(result != VK_SUCCESS)) { + vk_command_buffer_set_error(cmd, result); + return; + } + + const VkDescriptorImageInfo dst_info = { + .imageView = dst_view, + .imageLayout = info->dstImageLayout, + }; + + desc_writes[0] = (VkWriteDescriptorSet){ + .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET, + .dstSet = 0, + .dstBinding = BINDING_OUTPUT, + .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, + .descriptorCount = 1, + .pImageInfo = &dst_info, + }; + + desc_writes[1] = (VkWriteDescriptorSet){ + .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET, + .dstSet = 0, + .dstBinding = BINDING_INPUT, + .descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER, + .descriptorCount = 1, + .pTexelBufferView = &src_view, + }; + + disp->CmdPushDescriptorSetKHR( + vk_command_buffer_to_handle(cmd), VK_PIPELINE_BIND_POINT_COMPUTE, + pipeline_layout, 0, ARRAY_SIZE(desc_writes), desc_writes); + + VkPipeline pipeline; + result = get_image_copy_pipeline(device, meta, &key, pipeline_layout, + &pipeline); + if (unlikely(result != VK_SUCCESS)) { + vk_command_buffer_set_error(cmd, result); + return; + } + + disp->CmdBindPipeline(vk_command_buffer_to_handle(cmd), + VK_PIPELINE_BIND_POINT_COMPUTE, pipeline); + + struct vk_meta_push_data push = { + .buffer_offset = region->bufferOffset, + .row_extent = row_extent, + .slice_or_layer_extent = is_3d ? slice_extent : layer_extent, + + .dst_offset_el[0] = + util_format_get_nblocksx(p_format, region->imageOffset.x), + .dst_offset_el[1] = + util_format_get_nblocksy(p_format, region->imageOffset.y), + + .grid_el[0] = + util_format_get_nblocksx(p_format, region->imageExtent.width), + .grid_el[1] = + util_format_get_nblocksy(p_format, region->imageExtent.height), + .grid_el[2] = per_layer ? 1 : layers, + }; + + push.buffer_offset += push.slice_or_layer_extent * layer_offs; + + disp->CmdPushConstants(vk_command_buffer_to_handle(cmd), + pipeline_layout, VK_SHADER_STAGE_COMPUTE_BIT, 0, + sizeof(push), &push); + + disp->CmdDispatch(vk_command_buffer_to_handle(cmd), + DIV_ROUND_UP(push.grid_el[0], 32), + DIV_ROUND_UP(push.grid_el[1], 32), push.grid_el[2]); + } + } +} + +static void +hk_meta_copy_image2(struct vk_command_buffer *cmd, struct vk_meta_device *meta, + const struct VkCopyImageInfo2 *info) +{ + VK_FROM_HANDLE(vk_image, src_image, info->srcImage); + VK_FROM_HANDLE(vk_image, dst_image, info->dstImage); + + struct vk_device *device = cmd->base.device; + const struct vk_device_dispatch_table *disp = &device->dispatch_table; + + VkDescriptorSetLayout set_layout; + VkResult result = + get_image_copy_descriptor_set_layout(device, meta, &set_layout, BUF2IMG); + if (unlikely(result != VK_SUCCESS)) { + vk_command_buffer_set_error(cmd, result); + return; + } + + bool per_layer = + util_format_is_compressed(vk_format_to_pipe_format(src_image->format)) || + util_format_is_compressed(vk_format_to_pipe_format(dst_image->format)); + + for (unsigned r = 0; r < info->regionCount; ++r) { + const VkImageCopy2 *region = &info->pRegions[r]; + + unsigned layers = MAX2( + vk_image_subresource_layer_count(src_image, ®ion->srcSubresource), + region->extent.depth); + unsigned layer_iters = per_layer ? 
layers : 1; + + for (unsigned layer_offs = 0; layer_offs < layer_iters; ++layer_offs) { + u_foreach_bit(aspect, region->srcSubresource.aspectMask) { + /* We use the source format throughout for consistent scaling with + * compressed<-->uncompressed copies, where the extents are defined + * to follow the source. + */ + VkFormat aspect_fmt = aspect_format(src_image->format, 1 << aspect); + VkFormat canonical = canonical_format(aspect_fmt); + uint32_t blocksize_B = + util_format_get_blocksize(vk_format_to_pipe_format(canonical)); + + struct vk_meta_image_copy_key key = { + .key_type = VK_META_OBJECT_KEY_COPY_IMAGE_TO_BUFFER_PIPELINE, + .type = IMG2IMG, + .block_size = blocksize_B, + .nr_samples = dst_image->samples, + }; + + assert(key.nr_samples == src_image->samples); + + VkPipelineLayout pipeline_layout; + result = get_image_copy_pipeline_layout( + device, meta, &key, set_layout, &pipeline_layout, true); + if (unlikely(result != VK_SUCCESS)) { + vk_command_buffer_set_error(cmd, result); + return; + } + + VkWriteDescriptorSet desc_writes[2]; + + VkImageView src_view; + const VkImageViewUsageCreateInfo src_view_usage = { + .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_USAGE_CREATE_INFO, + .usage = VK_IMAGE_USAGE_SAMPLED_BIT, + }; + const VkImageViewCreateInfo src_view_info = { + .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO, + .flags = VK_IMAGE_VIEW_CREATE_INTERNAL_MESA, + .pNext = &src_view_usage, + .image = info->srcImage, + .viewType = VK_IMAGE_VIEW_TYPE_2D_ARRAY, + .format = canonical, + .subresourceRange = + { + .aspectMask = + region->srcSubresource.aspectMask & (1 << aspect), + .baseMipLevel = region->srcSubresource.mipLevel, + .baseArrayLayer = + MAX2(region->srcOffset.z, + region->srcSubresource.baseArrayLayer) + + layer_offs, + .layerCount = per_layer ? 1 : layers, + .levelCount = 1, + }, + }; + + result = + vk_meta_create_image_view(cmd, meta, &src_view_info, &src_view); + if (unlikely(result != VK_SUCCESS)) { + vk_command_buffer_set_error(cmd, result); + return; + } + + VkDescriptorImageInfo src_info = { + .imageLayout = info->srcImageLayout, + .imageView = src_view, + }; + + VkImageView dst_view; + const VkImageViewUsageCreateInfo dst_view_usage = { + .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_USAGE_CREATE_INFO, + .usage = VK_IMAGE_USAGE_STORAGE_BIT, + }; + const VkImageViewCreateInfo dst_view_info = { + .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO, + .flags = VK_IMAGE_VIEW_CREATE_INTERNAL_MESA, + .pNext = &dst_view_usage, + .image = info->dstImage, + .viewType = VK_IMAGE_VIEW_TYPE_2D_ARRAY, + .format = canonical, + .subresourceRange = + { + .aspectMask = + vk_format_get_ycbcr_info(dst_image->format) || + vk_format_get_ycbcr_info(src_image->format) + ? region->dstSubresource.aspectMask + : (1 << aspect), + .baseMipLevel = region->dstSubresource.mipLevel, + .baseArrayLayer = + MAX2(region->dstOffset.z, + region->dstSubresource.baseArrayLayer) + + layer_offs, + .layerCount = per_layer ? 
1 : layers, + .levelCount = 1, + }, + }; + + result = + vk_meta_create_image_view(cmd, meta, &dst_view_info, &dst_view); + if (unlikely(result != VK_SUCCESS)) { + vk_command_buffer_set_error(cmd, result); + return; + } + + const VkDescriptorImageInfo dst_info = { + .imageView = dst_view, + .imageLayout = info->dstImageLayout, + }; + + desc_writes[0] = (VkWriteDescriptorSet){ + .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET, + .dstSet = 0, + .dstBinding = BINDING_OUTPUT, + .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, + .descriptorCount = 1, + .pImageInfo = &dst_info, + }; + + desc_writes[1] = (VkWriteDescriptorSet){ + .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET, + .dstSet = 0, + .dstBinding = BINDING_INPUT, + .descriptorType = VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE, + .descriptorCount = 1, + .pImageInfo = &src_info, + }; + + disp->CmdPushDescriptorSetKHR( + vk_command_buffer_to_handle(cmd), VK_PIPELINE_BIND_POINT_COMPUTE, + pipeline_layout, 0, ARRAY_SIZE(desc_writes), desc_writes); + + VkPipeline pipeline; + result = get_image_copy_pipeline(device, meta, &key, + pipeline_layout, &pipeline); + if (unlikely(result != VK_SUCCESS)) { + vk_command_buffer_set_error(cmd, result); + return; + } + + disp->CmdBindPipeline(vk_command_buffer_to_handle(cmd), + VK_PIPELINE_BIND_POINT_COMPUTE, pipeline); + + enum pipe_format p_src_fmt = + vk_format_to_pipe_format(src_image->format); + enum pipe_format p_dst_fmt = + vk_format_to_pipe_format(dst_image->format); + enum pipe_format p_format = vk_format_to_pipe_format(aspect_fmt); + + struct vk_meta_push_data push = { + .src_offset_el[0] = + util_format_get_nblocksx(p_src_fmt, region->srcOffset.x), + .src_offset_el[1] = + util_format_get_nblocksy(p_src_fmt, region->srcOffset.y), + + .dst_offset_el[0] = + util_format_get_nblocksx(p_dst_fmt, region->dstOffset.x), + .dst_offset_el[1] = + util_format_get_nblocksy(p_dst_fmt, region->dstOffset.y), + + .grid_el[0] = + util_format_get_nblocksx(p_format, region->extent.width), + .grid_el[1] = + util_format_get_nblocksy(p_format, region->extent.height), + .grid_el[2] = per_layer ? 
1 : layers, + }; + + disp->CmdPushConstants(vk_command_buffer_to_handle(cmd), + pipeline_layout, VK_SHADER_STAGE_COMPUTE_BIT, + 0, sizeof(push), &push); + + disp->CmdDispatch(vk_command_buffer_to_handle(cmd), + DIV_ROUND_UP(push.grid_el[0], 32), + DIV_ROUND_UP(push.grid_el[1], 32), + push.grid_el[2]); + } + } + } +} + +struct vk_meta_image_to_buffer_push_data { + uint32_t dest_offset_el; +}; + +#define get_image_push(b, name) \ + nir_load_push_constant( \ + b, 1, sizeof(((struct vk_meta_image_to_buffer_push_data *)0)->name) * 8, \ + nir_imm_int(b, \ + offsetof(struct vk_meta_image_to_buffer_push_data, name))) + +enum copy_source { + COPY_SOURCE_PATTERN, + COPY_SOURCE_BUFFER, +}; + +struct vk_meta_buffer_copy_key { + enum vk_meta_object_key_type key_type; + enum copy_source source; + + /* Power-of-two block size for the transfer, range [1, 16] */ + uint8_t blocksize; + uint8_t pad[3]; +}; +static_assert(sizeof(struct vk_meta_buffer_copy_key) == 12, "packed"); + +/* XXX: TODO: move to common */ +/* Copyright © Microsoft Corporation */ +static nir_def * +dzn_nir_create_bo_desc(nir_builder *b, nir_variable_mode mode, + uint32_t desc_set, uint32_t binding, const char *name, + unsigned access, const struct glsl_type *dummy_type) +{ + nir_variable *var = nir_variable_create(b->shader, mode, dummy_type, name); + var->data.descriptor_set = desc_set; + var->data.binding = binding; + var->data.access = access; + + assert(mode == nir_var_mem_ubo || mode == nir_var_mem_ssbo); + if (mode == nir_var_mem_ubo) + b->shader->info.num_ubos++; + else + b->shader->info.num_ssbos++; + + VkDescriptorType desc_type = var->data.mode == nir_var_mem_ubo + ? VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER + : VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; + nir_address_format addr_format = + nir_address_format_64bit_global_32bit_offset; /* XXX */ + nir_def *index = nir_vulkan_resource_index( + b, nir_address_format_num_components(addr_format), + nir_address_format_bit_size(addr_format), nir_imm_int(b, 0), + .desc_set = desc_set, .binding = binding, .desc_type = desc_type); + + nir_def *desc = nir_load_vulkan_descriptor( + b, nir_address_format_num_components(addr_format), + nir_address_format_bit_size(addr_format), index, .desc_type = desc_type); + + return desc; +} + +static const struct glsl_type * +type_for_blocksize(uint8_t blocksize) +{ + assert(util_is_power_of_two_nonzero(blocksize) && blocksize <= 16); + + if (blocksize > 4) + return glsl_vector_type(GLSL_TYPE_UINT, blocksize / 4); + else + return glsl_uintN_t_type(8 * blocksize); +} + +static nir_shader * +build_buffer_copy_shader(const struct vk_meta_buffer_copy_key *key) +{ + nir_builder build = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, NULL, + "vk-meta-copy-to-buffer"); + nir_builder *b = &build; + + const struct glsl_type *type = + glsl_array_type(type_for_blocksize(key->blocksize), 0, key->blocksize); + + nir_def *index = nir_channel(b, nir_load_global_invocation_id(b, 32), 0); + nir_def *value; + + if (key->source == COPY_SOURCE_BUFFER) { + nir_def *ubo = + dzn_nir_create_bo_desc(b, nir_var_mem_ubo, 0, BINDING_INPUT, "source", + ACCESS_NON_WRITEABLE, type); + nir_deref_instr *ubo_deref = + nir_build_deref_cast(b, ubo, nir_var_mem_ubo, type, key->blocksize); + + nir_deref_instr *element_deref = nir_build_deref_array( + b, ubo_deref, nir_u2uN(b, index, ubo_deref->def.bit_size)); + + value = nir_load_deref(b, element_deref); + } else { + nir_def *pattern = nir_load_push_constant(b, 1, 32, nir_imm_int(b, 0)); + + assert(key->blocksize >= 4 && "fills at least 32-bit"); + 
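+      /* Splat the 32-bit fill pattern across all (blocksize / 4) components. */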
value = nir_replicate(b, pattern, key->blocksize / 4); + } + + /* Write out raw bytes to SSBO */ + nir_def *ssbo = + dzn_nir_create_bo_desc(b, nir_var_mem_ssbo, 0, BINDING_OUTPUT, + "destination", ACCESS_NON_READABLE, type); + + nir_deref_instr *ssbo_deref = + nir_build_deref_cast(b, ssbo, nir_var_mem_ssbo, type, key->blocksize); + + nir_deref_instr *element_deref = nir_build_deref_array( + b, ssbo_deref, nir_u2uN(b, index, ssbo_deref->def.bit_size)); + + nir_store_deref(b, element_deref, value, + nir_component_mask(value->num_components)); + + return b->shader; +} + +static VkResult +get_buffer_copy_descriptor_set_layout(struct vk_device *device, + struct vk_meta_device *meta, + VkDescriptorSetLayout *layout_out, + enum copy_source source) +{ + const char buffer_key[] = "vk-meta-buffer-copy-descriptor-set-layout"; + const char fill_key[] = "vk-meta-fill__-copy-descriptor-set-layout"; + + static_assert(sizeof(buffer_key) == sizeof(fill_key)); + const char *key = source == COPY_SOURCE_BUFFER ? buffer_key : fill_key; + + VkDescriptorSetLayout from_cache = + vk_meta_lookup_descriptor_set_layout(meta, key, sizeof(buffer_key)); + if (from_cache != VK_NULL_HANDLE) { + *layout_out = from_cache; + return VK_SUCCESS; + } + + const VkDescriptorSetLayoutBinding bindings[] = { + { + .binding = BINDING_OUTPUT, + .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, + .descriptorCount = 1, + .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, + }, + { + .binding = BINDING_INPUT, + .descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, + .descriptorCount = 1, + .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, + }, + }; + + const VkDescriptorSetLayoutCreateInfo info = { + .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO, + .bindingCount = ARRAY_SIZE(bindings), + .pBindings = bindings, + }; + + return vk_meta_create_descriptor_set_layout(device, meta, &info, key, + sizeof(key), layout_out); +} + +static VkResult +get_buffer_copy_pipeline_layout(struct vk_device *device, + struct vk_meta_device *meta, + struct vk_meta_buffer_copy_key *key, + VkDescriptorSetLayout set_layout, + VkPipelineLayout *layout_out) +{ + const char copy_key[] = "vk-meta-buffer-copy-pipeline-layout"; + const char fill_key[] = "vk-meta-buffer-fill-pipeline-layout"; + const char cimg_key[] = "vk-meta-buffer-cimg-pipeline-layout"; + + STATIC_ASSERT(sizeof(copy_key) == sizeof(fill_key)); + STATIC_ASSERT(sizeof(copy_key) == sizeof(cimg_key)); + const char *pipeline_key = + key->source == COPY_SOURCE_BUFFER ? 
copy_key : fill_key; + + VkPipelineLayout from_cache = + vk_meta_lookup_pipeline_layout(meta, pipeline_key, sizeof(copy_key)); + if (from_cache != VK_NULL_HANDLE) { + *layout_out = from_cache; + return VK_SUCCESS; + } + + VkPipelineLayoutCreateInfo info = { + .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO, + .setLayoutCount = 1, + .pSetLayouts = &set_layout, + }; + + size_t push_size = 0; + if (key->source == COPY_SOURCE_PATTERN) + push_size = sizeof(uint32_t); + + const VkPushConstantRange push_range = { + .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, + .offset = 0, + .size = push_size, + }; + + if (push_size) { + info.pushConstantRangeCount = 1; + info.pPushConstantRanges = &push_range; + } + + return vk_meta_create_pipeline_layout(device, meta, &info, pipeline_key, + sizeof(copy_key), layout_out); +} + +static VkResult +get_buffer_copy_pipeline(struct vk_device *device, struct vk_meta_device *meta, + const struct vk_meta_buffer_copy_key *key, + VkPipelineLayout layout, VkPipeline *pipeline_out) +{ + VkPipeline from_cache = vk_meta_lookup_pipeline(meta, key, sizeof(*key)); + if (from_cache != VK_NULL_HANDLE) { + *pipeline_out = from_cache; + return VK_SUCCESS; + } + + const VkPipelineShaderStageNirCreateInfoMESA nir_info = { + .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_NIR_CREATE_INFO_MESA, + .nir = build_buffer_copy_shader(key), + }; + const VkPipelineShaderStageCreateInfo cs_info = { + .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, + .pNext = &nir_info, + .stage = VK_SHADER_STAGE_COMPUTE_BIT, + .pName = "main", + }; + + const VkComputePipelineCreateInfo info = { + .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO, + .stage = cs_info, + .layout = layout, + }; + + VkResult result = vk_meta_create_compute_pipeline( + device, meta, &info, key, sizeof(*key), pipeline_out); + ralloc_free(nir_info.nir); + + return result; +} + +static unsigned +alignment_of(unsigned x) +{ + return 1 << MIN2(__builtin_ctz(x), 31); +} + +struct copy_desc { + enum copy_source source; + + union { + uint32_t pattern; + + struct { + struct vk_buffer *source; + VkDeviceSize srcOffset; + } buffer; + + struct { + struct vk_image *image; + VkDescriptorImageInfo *info; + VkFormat format; + struct vk_meta_image_to_buffer_push_data push; + } image; + }; +}; + +static void +do_copy(struct vk_command_buffer *cmd, struct vk_meta_device *meta, size_t size, + struct vk_buffer *dest, VkDeviceSize dstOffset, struct copy_desc *desc) +{ + struct vk_device *device = cmd->base.device; + const struct vk_device_dispatch_table *disp = &device->dispatch_table; + VkResult result; + + /* The "alignment" of the copy is the maximum alignment that all accesses + * within the copy will satsify. 
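+    * For example, dstOffset = 8 with size = 20 yields an alignment of 4 (a
+    * buffer source's srcOffset is folded in the same way), so the dispatch
+    * below uses a 4-byte block size.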
+ */ + unsigned alignment = MIN2(alignment_of(dstOffset), alignment_of(size)); + + if (desc->source == COPY_SOURCE_BUFFER) + alignment = MIN2(alignment, alignment_of(desc->buffer.srcOffset)); + + struct vk_meta_buffer_copy_key key = { + .key_type = VK_META_OBJECT_KEY_FILL_PIPELINE, + .source = desc->source, + .blocksize = MIN2(alignment, 16), + }; + + VkDescriptorSetLayout set_layout; + result = get_buffer_copy_descriptor_set_layout(device, meta, &set_layout, + desc->source); + if (unlikely(result != VK_SUCCESS)) { + vk_command_buffer_set_error(cmd, result); + return; + } + + VkPipelineLayout pipeline_layout; + result = get_buffer_copy_pipeline_layout(device, meta, &key, set_layout, + &pipeline_layout); + if (unlikely(result != VK_SUCCESS)) { + vk_command_buffer_set_error(cmd, result); + return; + } + + VkDescriptorBufferInfo buffer_infos[2]; + VkWriteDescriptorSet desc_writes[2]; + + for (unsigned i = 0; i < 2; ++i) { + bool is_dest = (i == BINDING_OUTPUT); + + if (!is_dest && desc->source != COPY_SOURCE_BUFFER) + continue; + + buffer_infos[i] = (VkDescriptorBufferInfo){ + .buffer = vk_buffer_to_handle(is_dest ? dest : desc->buffer.source), + .offset = is_dest ? dstOffset : desc->buffer.srcOffset, + .range = size, + }; + + desc_writes[i] = (VkWriteDescriptorSet){ + .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET, + .dstSet = 0, + .dstBinding = i, + .descriptorType = is_dest ? VK_DESCRIPTOR_TYPE_STORAGE_BUFFER + : VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, + .descriptorCount = 1, + .pBufferInfo = &buffer_infos[i], + }; + } + + unsigned desc_count = desc->source == COPY_SOURCE_PATTERN ? 1 : 2; + disp->CmdPushDescriptorSetKHR(vk_command_buffer_to_handle(cmd), + VK_PIPELINE_BIND_POINT_COMPUTE, + pipeline_layout, 0, desc_count, desc_writes); + + VkPipeline pipeline; + result = + get_buffer_copy_pipeline(device, meta, &key, pipeline_layout, &pipeline); + if (unlikely(result != VK_SUCCESS)) { + vk_command_buffer_set_error(cmd, result); + return; + } + + disp->CmdBindPipeline(vk_command_buffer_to_handle(cmd), + VK_PIPELINE_BIND_POINT_COMPUTE, pipeline); + + if (desc->source == COPY_SOURCE_PATTERN) { + disp->CmdPushConstants(vk_command_buffer_to_handle(cmd), pipeline_layout, + VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(uint32_t), + &desc->pattern); + } + + disp->CmdDispatch(vk_command_buffer_to_handle(cmd), size / key.blocksize, 1, + 1); +} + +static void +hk_meta_fill_buffer(struct vk_command_buffer *cmd, struct vk_meta_device *meta, + struct vk_buffer *dest, VkDeviceSize dstOffset, + VkDeviceSize dstRange, uint32_t data) +{ + size_t size = ROUND_DOWN_TO(vk_buffer_range(dest, dstOffset, dstRange), 4); + dstOffset = ROUND_DOWN_TO(dstOffset, 4); + + do_copy(cmd, meta, size, dest, dstOffset, + &(struct copy_desc){ + .source = COPY_SOURCE_PATTERN, + .pattern = data, + }); +} + +static void +hk_meta_update_buffer(struct vk_command_buffer *cmd, + struct vk_meta_device *meta, struct vk_buffer *dest, + VkDeviceSize dstOffset, VkDeviceSize dstRange, + const void *data) +{ + /* Create a buffer to hold the data */ + const VkBufferCreateInfo info = { + .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, + .size = vk_buffer_range(dest, dstOffset, dstRange), + .usage = VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT, + .queueFamilyIndexCount = 1, + .pQueueFamilyIndices = &cmd->pool->queue_family_index, + }; + + VkBuffer buffer; + VkResult result = vk_meta_create_buffer(cmd, meta, &info, &buffer); + if (unlikely(result != VK_SUCCESS)) + return; + + /* Map the buffer for CPU access */ + void *map; + result = meta->cmd_bind_map_buffer(cmd, 
meta, buffer, &map); + if (unlikely(result != VK_SUCCESS)) + return; + + /* Copy from the CPU input to the staging buffer */ + memcpy(map, data, info.size); + + /* Copy between the buffers on the GPU */ + VK_FROM_HANDLE(vk_buffer, buffer_, buffer); + size_t size = ROUND_DOWN_TO(vk_buffer_range(dest, dstOffset, dstRange), 4); + dstOffset = ROUND_DOWN_TO(dstOffset, 4); + + do_copy(cmd, meta, size, dest, dstOffset, + &(struct copy_desc){ + .source = COPY_SOURCE_BUFFER, + .buffer.source = buffer_, + }); +} + +static void +hk_meta_copy_buffer2(struct vk_command_buffer *cmd, struct vk_meta_device *meta, + const VkCopyBufferInfo2 *pCopyBufferInfo) +{ + VK_FROM_HANDLE(vk_buffer, dst, pCopyBufferInfo->dstBuffer); + VK_FROM_HANDLE(vk_buffer, src, pCopyBufferInfo->srcBuffer); + + for (unsigned i = 0; i < pCopyBufferInfo->regionCount; ++i) { + const VkBufferCopy2 *copy = &pCopyBufferInfo->pRegions[i]; + + do_copy(cmd, meta, copy->size, dst, copy->dstOffset, + &(struct copy_desc){ + .source = COPY_SOURCE_BUFFER, + .buffer.source = src, + .buffer.srcOffset = copy->srcOffset, + }); + } +} + +VKAPI_ATTR void VKAPI_CALL +hk_CmdBlitImage2(VkCommandBuffer commandBuffer, + const VkBlitImageInfo2 *pBlitImageInfo) +{ + VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer); + struct hk_device *dev = hk_cmd_buffer_device(cmd); + + struct hk_meta_save save; + hk_meta_begin(cmd, &save, VK_PIPELINE_BIND_POINT_GRAPHICS); + vk_meta_blit_image2(&cmd->vk, &dev->meta, pBlitImageInfo); + hk_meta_end(cmd, &save, VK_PIPELINE_BIND_POINT_GRAPHICS); +} + +VKAPI_ATTR void VKAPI_CALL +hk_CmdResolveImage2(VkCommandBuffer commandBuffer, + const VkResolveImageInfo2 *pResolveImageInfo) +{ + VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer); + struct hk_device *dev = hk_cmd_buffer_device(cmd); + + struct hk_meta_save save; + hk_meta_begin(cmd, &save, VK_PIPELINE_BIND_POINT_GRAPHICS); + vk_meta_resolve_image2(&cmd->vk, &dev->meta, pResolveImageInfo); + hk_meta_end(cmd, &save, VK_PIPELINE_BIND_POINT_GRAPHICS); +} + +void +hk_meta_resolve_rendering(struct hk_cmd_buffer *cmd, + const VkRenderingInfo *pRenderingInfo) +{ + struct hk_device *dev = hk_cmd_buffer_device(cmd); + + struct hk_meta_save save; + hk_meta_begin(cmd, &save, VK_PIPELINE_BIND_POINT_GRAPHICS); + vk_meta_resolve_rendering(&cmd->vk, &dev->meta, pRenderingInfo); + hk_meta_end(cmd, &save, VK_PIPELINE_BIND_POINT_GRAPHICS); +} + +VKAPI_ATTR void VKAPI_CALL +hk_CmdCopyBuffer2(VkCommandBuffer commandBuffer, + const VkCopyBufferInfo2 *pCopyBufferInfo) +{ + VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer); + struct hk_device *dev = hk_cmd_buffer_device(cmd); + + struct hk_meta_save save; + hk_meta_begin(cmd, &save, VK_PIPELINE_BIND_POINT_COMPUTE); + hk_meta_copy_buffer2(&cmd->vk, &dev->meta, pCopyBufferInfo); + hk_meta_end(cmd, &save, VK_PIPELINE_BIND_POINT_COMPUTE); +} + +VKAPI_ATTR void VKAPI_CALL +hk_CmdCopyBufferToImage2(VkCommandBuffer commandBuffer, + const VkCopyBufferToImageInfo2 *pCopyBufferToImageInfo) +{ + VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer); + struct hk_device *dev = hk_cmd_buffer_device(cmd); + + struct hk_meta_save save; + hk_meta_begin(cmd, &save, VK_PIPELINE_BIND_POINT_COMPUTE); + hk_meta_copy_buffer_to_image2(&cmd->vk, &dev->meta, pCopyBufferToImageInfo); + hk_meta_end(cmd, &save, VK_PIPELINE_BIND_POINT_COMPUTE); +} + +VKAPI_ATTR void VKAPI_CALL +hk_CmdCopyImageToBuffer2(VkCommandBuffer commandBuffer, + const VkCopyImageToBufferInfo2 *pCopyImageToBufferInfo) +{ + VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer); + struct hk_device *dev = 
hk_cmd_buffer_device(cmd); + + struct hk_meta_save save; + hk_meta_begin(cmd, &save, VK_PIPELINE_BIND_POINT_COMPUTE); + hk_meta_copy_image_to_buffer2(&cmd->vk, &dev->meta, pCopyImageToBufferInfo); + hk_meta_end(cmd, &save, VK_PIPELINE_BIND_POINT_COMPUTE); +} + +VKAPI_ATTR void VKAPI_CALL +hk_CmdCopyImage2(VkCommandBuffer commandBuffer, + const VkCopyImageInfo2 *pCopyImageInfo) +{ + VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer); + struct hk_device *dev = hk_cmd_buffer_device(cmd); + + struct hk_meta_save save; + hk_meta_begin(cmd, &save, VK_PIPELINE_BIND_POINT_COMPUTE); + hk_meta_copy_image2(&cmd->vk, &dev->meta, pCopyImageInfo); + hk_meta_end(cmd, &save, VK_PIPELINE_BIND_POINT_COMPUTE); +} + +VKAPI_ATTR void VKAPI_CALL +hk_CmdFillBuffer(VkCommandBuffer commandBuffer, VkBuffer dstBuffer, + VkDeviceSize dstOffset, VkDeviceSize dstRange, uint32_t data) +{ + VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer); + VK_FROM_HANDLE(vk_buffer, buffer, dstBuffer); + struct hk_device *dev = hk_cmd_buffer_device(cmd); + + struct hk_meta_save save; + hk_meta_begin(cmd, &save, VK_PIPELINE_BIND_POINT_COMPUTE); + hk_meta_fill_buffer(&cmd->vk, &dev->meta, buffer, dstOffset, dstRange, data); + hk_meta_end(cmd, &save, VK_PIPELINE_BIND_POINT_COMPUTE); +} + +VKAPI_ATTR void VKAPI_CALL +hk_CmdUpdateBuffer(VkCommandBuffer commandBuffer, VkBuffer dstBuffer, + VkDeviceSize dstOffset, VkDeviceSize dstRange, + const void *pData) +{ + VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer); + VK_FROM_HANDLE(vk_buffer, buffer, dstBuffer); + struct hk_device *dev = hk_cmd_buffer_device(cmd); + + struct hk_meta_save save; + hk_meta_begin(cmd, &save, VK_PIPELINE_BIND_POINT_COMPUTE); + hk_meta_update_buffer(&cmd->vk, &dev->meta, buffer, dstOffset, dstRange, + pData); + hk_meta_end(cmd, &save, VK_PIPELINE_BIND_POINT_COMPUTE); +} + +VKAPI_ATTR void VKAPI_CALL +hk_CmdClearAttachments(VkCommandBuffer commandBuffer, uint32_t attachmentCount, + const VkClearAttachment *pAttachments, + uint32_t rectCount, const VkClearRect *pRects) +{ + VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer); + struct hk_device *dev = hk_cmd_buffer_device(cmd); + + struct vk_meta_rendering_info render_info; + hk_meta_init_render(cmd, &render_info); + + struct hk_meta_save save; + hk_meta_begin(cmd, &save, VK_PIPELINE_BIND_POINT_GRAPHICS); + vk_meta_clear_attachments(&cmd->vk, &dev->meta, &render_info, + attachmentCount, pAttachments, rectCount, pRects); + hk_meta_end(cmd, &save, VK_PIPELINE_BIND_POINT_GRAPHICS); +} diff --git a/src/asahi/vulkan/hk_cmd_pool.c b/src/asahi/vulkan/hk_cmd_pool.c new file mode 100644 index 00000000000..a3f2a85468a --- /dev/null +++ b/src/asahi/vulkan/hk_cmd_pool.c @@ -0,0 +1,146 @@ +/* + * Copyright 2024 Valve Corporation + * Copyright 2024 Alyssa Rosenzweig + * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc. + * SPDX-License-Identifier: MIT + */ +#include "hk_cmd_pool.h" +#include "asahi/lib/agx_bo.h" + +#include "hk_device.h" +#include "hk_entrypoints.h" +#include "hk_physical_device.h" + +static VkResult +hk_cmd_bo_create(struct hk_cmd_pool *pool, bool usc, struct hk_cmd_bo **bo_out) +{ + struct hk_device *dev = hk_cmd_pool_device(pool); + struct hk_cmd_bo *bo; + + bo = vk_zalloc(&pool->vk.alloc, sizeof(*bo), 8, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + if (bo == NULL) + return vk_error(pool, VK_ERROR_OUT_OF_HOST_MEMORY); + + bo->bo = agx_bo_create(&dev->dev, HK_CMD_BO_SIZE, usc ? 
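+                          /* assumption: shader-visible (USC) allocations are
+                           * placed in the low VA range because shader
+                           * pointers are narrower than 64 bits */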
AGX_BO_LOW_VA : 0, + "Command pool"); + if (bo->bo == NULL) { + vk_free(&pool->vk.alloc, bo); + return vk_error(pool, VK_ERROR_OUT_OF_DEVICE_MEMORY); + } + + bo->map = bo->bo->ptr.cpu; + + *bo_out = bo; + return VK_SUCCESS; +} + +static void +hk_cmd_bo_destroy(struct hk_cmd_pool *pool, struct hk_cmd_bo *bo) +{ + agx_bo_unreference(bo->bo); + vk_free(&pool->vk.alloc, bo); +} + +VKAPI_ATTR VkResult VKAPI_CALL +hk_CreateCommandPool(VkDevice _device, + const VkCommandPoolCreateInfo *pCreateInfo, + const VkAllocationCallbacks *pAllocator, + VkCommandPool *pCmdPool) +{ + VK_FROM_HANDLE(hk_device, device, _device); + struct hk_cmd_pool *pool; + + pool = vk_alloc2(&device->vk.alloc, pAllocator, sizeof(*pool), 8, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + if (pool == NULL) + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + + VkResult result = + vk_command_pool_init(&device->vk, &pool->vk, pCreateInfo, pAllocator); + if (result != VK_SUCCESS) { + vk_free2(&device->vk.alloc, pAllocator, pool); + return result; + } + + list_inithead(&pool->free_bos); + list_inithead(&pool->free_usc_bos); + + *pCmdPool = hk_cmd_pool_to_handle(pool); + + return VK_SUCCESS; +} + +static void +hk_cmd_pool_destroy_bos(struct hk_cmd_pool *pool) +{ + list_for_each_entry_safe(struct hk_cmd_bo, bo, &pool->free_bos, link) + hk_cmd_bo_destroy(pool, bo); + + list_inithead(&pool->free_bos); + + list_for_each_entry_safe(struct hk_cmd_bo, bo, &pool->free_usc_bos, link) + hk_cmd_bo_destroy(pool, bo); + + list_inithead(&pool->free_usc_bos); +} + +VkResult +hk_cmd_pool_alloc_bo(struct hk_cmd_pool *pool, bool usc, + struct hk_cmd_bo **bo_out) +{ + struct hk_cmd_bo *bo = NULL; + if (usc) { + if (!list_is_empty(&pool->free_usc_bos)) + bo = list_first_entry(&pool->free_usc_bos, struct hk_cmd_bo, link); + } else { + if (!list_is_empty(&pool->free_bos)) + bo = list_first_entry(&pool->free_bos, struct hk_cmd_bo, link); + } + if (bo) { + list_del(&bo->link); + *bo_out = bo; + return VK_SUCCESS; + } + + return hk_cmd_bo_create(pool, usc, bo_out); +} + +void +hk_cmd_pool_free_bo_list(struct hk_cmd_pool *pool, struct list_head *bos) +{ + list_splicetail(bos, &pool->free_bos); + list_inithead(bos); +} + +void +hk_cmd_pool_free_usc_bo_list(struct hk_cmd_pool *pool, struct list_head *bos) +{ + list_splicetail(bos, &pool->free_usc_bos); + list_inithead(bos); +} + +VKAPI_ATTR void VKAPI_CALL +hk_DestroyCommandPool(VkDevice _device, VkCommandPool commandPool, + const VkAllocationCallbacks *pAllocator) +{ + VK_FROM_HANDLE(hk_device, device, _device); + VK_FROM_HANDLE(hk_cmd_pool, pool, commandPool); + + if (!pool) + return; + + vk_command_pool_finish(&pool->vk); + hk_cmd_pool_destroy_bos(pool); + vk_free2(&device->vk.alloc, pAllocator, pool); +} + +VKAPI_ATTR void VKAPI_CALL +hk_TrimCommandPool(VkDevice device, VkCommandPool commandPool, + VkCommandPoolTrimFlags flags) +{ + VK_FROM_HANDLE(hk_cmd_pool, pool, commandPool); + + vk_command_pool_trim(&pool->vk, flags); + hk_cmd_pool_destroy_bos(pool); +} diff --git a/src/asahi/vulkan/hk_cmd_pool.h b/src/asahi/vulkan/hk_cmd_pool.h new file mode 100644 index 00000000000..dbac305f833 --- /dev/null +++ b/src/asahi/vulkan/hk_cmd_pool.h @@ -0,0 +1,49 @@ +/* + * Copyright 2024 Valve Corporation + * Copyright 2024 Alyssa Rosenzweig + * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc. 
+ * SPDX-License-Identifier: MIT + */ + +#pragma once + +#include "hk_private.h" + +#include "vk_command_pool.h" + +/* XXX: FIXME */ +#define HK_CMD_BO_SIZE 1024 * 1024 + +/* Recyclable command buffer BO, used for both push buffers and upload */ +struct hk_cmd_bo { + struct agx_bo *bo; + + void *map; + + /** Link in hk_cmd_pool::free_bos or hk_cmd_buffer::bos */ + struct list_head link; +}; + +struct hk_cmd_pool { + struct vk_command_pool vk; + + /** List of hk_cmd_bo */ + struct list_head free_bos; + struct list_head free_usc_bos; +}; + +VK_DEFINE_NONDISP_HANDLE_CASTS(hk_cmd_pool, vk.base, VkCommandPool, + VK_OBJECT_TYPE_COMMAND_POOL) + +static inline struct hk_device * +hk_cmd_pool_device(struct hk_cmd_pool *pool) +{ + return (struct hk_device *)pool->vk.base.device; +} + +VkResult hk_cmd_pool_alloc_bo(struct hk_cmd_pool *pool, bool force_usc, + struct hk_cmd_bo **bo_out); + +void hk_cmd_pool_free_bo_list(struct hk_cmd_pool *pool, struct list_head *bos); +void hk_cmd_pool_free_usc_bo_list(struct hk_cmd_pool *pool, + struct list_head *bos); diff --git a/src/asahi/vulkan/hk_descriptor_set.c b/src/asahi/vulkan/hk_descriptor_set.c new file mode 100644 index 00000000000..b59a9ac4b57 --- /dev/null +++ b/src/asahi/vulkan/hk_descriptor_set.c @@ -0,0 +1,794 @@ +/* + * Copyright 2024 Valve Corporation + * Copyright 2024 Alyssa Rosenzweig + * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc. + * SPDX-License-Identifier: MIT + */ +#include "hk_descriptor_set.h" +#include "asahi/lib/agx_bo.h" +#include "vulkan/vulkan_core.h" + +#include "hk_buffer.h" +#include "hk_buffer_view.h" +#include "hk_descriptor_set_layout.h" +#include "hk_device.h" +#include "hk_entrypoints.h" +#include "hk_image_view.h" +#include "hk_physical_device.h" +#include "hk_sampler.h" + +static inline uint32_t +align_u32(uint32_t v, uint32_t a) +{ + assert(a != 0 && a == (a & -a)); + return (v + a - 1) & ~(a - 1); +} + +static inline void * +desc_ubo_data(struct hk_descriptor_set *set, uint32_t binding, uint32_t elem, + uint32_t *size_out) +{ + const struct hk_descriptor_set_binding_layout *binding_layout = + &set->layout->binding[binding]; + + uint32_t offset = binding_layout->offset + elem * binding_layout->stride; + assert(offset < set->size); + + if (size_out != NULL) + *size_out = set->size - offset; + + return (char *)set->mapped_ptr + offset; +} + +static void +write_desc(struct hk_descriptor_set *set, uint32_t binding, uint32_t elem, + const void *desc_data, size_t desc_size) +{ + ASSERTED uint32_t dst_size; + void *dst = desc_ubo_data(set, binding, elem, &dst_size); + assert(desc_size <= dst_size); + memcpy(dst, desc_data, desc_size); +} + +static void +write_sampled_image_view_desc(struct hk_descriptor_set *set, + const VkDescriptorImageInfo *const info, + uint32_t binding, uint32_t elem, + VkDescriptorType descriptor_type) +{ + struct hk_sampled_image_descriptor desc[3] = {}; + assert(HK_NULL_TEX_OFFSET == 0 && "zero initialized so null descs implicit"); + + uint8_t plane_count = 1; + bool ia = (descriptor_type == VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT); + + if (descriptor_type != VK_DESCRIPTOR_TYPE_SAMPLER && info && + info->imageView != VK_NULL_HANDLE) { + VK_FROM_HANDLE(hk_image_view, view, info->imageView); + + plane_count = view->plane_count; + for (uint8_t plane = 0; plane < plane_count; plane++) { + unsigned index = ia ? 
view->planes[plane].ia_desc_index + : view->planes[plane].sampled_desc_index; + + assert(index < (1 << 20)); + desc[plane].image_offset = index * HK_IMAGE_STRIDE; + } + } + + if (descriptor_type == VK_DESCRIPTOR_TYPE_SAMPLER || + descriptor_type == VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER) { + const struct hk_descriptor_set_binding_layout *binding_layout = + &set->layout->binding[binding]; + + struct hk_sampler *sampler; + if (binding_layout->immutable_samplers) { + sampler = binding_layout->immutable_samplers[elem]; + } else { + sampler = hk_sampler_from_handle(info->sampler); + } + + if (sampler->has_border) + assert(plane_count == 1); + else + plane_count = MAX2(plane_count, sampler->plane_count); + + for (uint8_t plane = 0; plane < plane_count; plane++) { + /* We need to replicate the last sampler plane out to all image + * planes due to sampler table entry limitations. See + * hk_CreateSampler in hk_sampler.c for more details. + */ + uint8_t sampler_plane = MIN2(plane, sampler->plane_count - 1); + assert(sampler->planes[sampler_plane].hw->index < (1 << 12)); + + /* All bindless samplers are indexed from 28 in hardware, add here so + * we don't have to care in the shader. + */ + desc[plane].sampler_index = + sampler->planes[sampler_plane].hw->index + 28; + desc[plane].lod_bias_fp16 = sampler->lod_bias_fp16; + desc[plane].has_border = sampler->has_border; + } + + if (sampler->has_border) { + assert(sampler->plane_count == 2); + desc[0].clamp_0_sampler_index = sampler->planes[1].hw->index + 28; + + static_assert(sizeof(desc[0].border) == sizeof(sampler->custom_border), + "fixed format"); + + memcpy(desc[0].border, sampler->custom_border.uint32, + sizeof(sampler->custom_border)); + } + } + write_desc(set, binding, elem, desc, sizeof(desc[0]) * plane_count); +} + +static void +write_storage_image_view_desc(struct hk_descriptor_set *set, + const VkDescriptorImageInfo *const info, + uint32_t binding, uint32_t elem) +{ + struct hk_storage_image_descriptor desc = {}; + + if (info && info->imageView != VK_NULL_HANDLE) { + VK_FROM_HANDLE(hk_image_view, view, info->imageView); + + /* Storage images are always single plane */ + assert(view->plane_count == 1); + uint8_t plane = 0; + + desc.tex_offset = + view->planes[plane].ro_storage_desc_index * HK_IMAGE_STRIDE; + + desc.pbe_offset = + view->planes[plane].storage_desc_index * HK_IMAGE_STRIDE; + } else { + desc.tex_offset = HK_NULL_TEX_OFFSET; + desc.pbe_offset = HK_NULL_PBE_OFFSET; + } + + write_desc(set, binding, elem, &desc, sizeof(desc)); +} + +static void +write_buffer_desc(struct hk_descriptor_set *set, + const VkDescriptorBufferInfo *const info, uint32_t binding, + uint32_t elem) +{ + VK_FROM_HANDLE(hk_buffer, buffer, info->buffer); + + const struct hk_addr_range addr_range = + hk_buffer_addr_range(buffer, info->offset, info->range); + assert(addr_range.range <= UINT32_MAX); + + const struct hk_buffer_address desc = { + .base_addr = addr_range.addr, + .size = addr_range.range, + }; + write_desc(set, binding, elem, &desc, sizeof(desc)); +} + +static void +write_dynamic_buffer_desc(struct hk_descriptor_set *set, + const VkDescriptorBufferInfo *const info, + uint32_t binding, uint32_t elem) +{ + VK_FROM_HANDLE(hk_buffer, buffer, info->buffer); + const struct hk_descriptor_set_binding_layout *binding_layout = + &set->layout->binding[binding]; + + const struct hk_addr_range addr_range = + hk_buffer_addr_range(buffer, info->offset, info->range); + assert(addr_range.range <= UINT32_MAX); + + struct hk_buffer_address *desc = + 
&set->dynamic_buffers[binding_layout->dynamic_buffer_index + elem]; + *desc = (struct hk_buffer_address){ + .base_addr = addr_range.addr, + .size = addr_range.range, + }; +} + +static void +write_buffer_view_desc(struct hk_descriptor_set *set, + const VkBufferView bufferView, uint32_t binding, + uint32_t elem) +{ + struct hk_buffer_view_descriptor desc = {}; + if (bufferView != VK_NULL_HANDLE) { + VK_FROM_HANDLE(hk_buffer_view, view, bufferView); + + assert(view->tex_desc_index < (1 << 20)); + assert(view->pbe_desc_index < (1 << 20)); + + desc.tex_offset = view->tex_desc_index * HK_IMAGE_STRIDE; + desc.pbe_offset = view->pbe_desc_index * HK_IMAGE_STRIDE; + } else { + desc.tex_offset = HK_NULL_TEX_OFFSET; + desc.pbe_offset = HK_NULL_PBE_OFFSET; + } + + write_desc(set, binding, elem, &desc, sizeof(desc)); +} + +static void +write_inline_uniform_data(struct hk_descriptor_set *set, + const VkWriteDescriptorSetInlineUniformBlock *info, + uint32_t binding, uint32_t offset) +{ + assert(set->layout->binding[binding].stride == 1); + write_desc(set, binding, offset, info->pData, info->dataSize); +} + +VKAPI_ATTR void VKAPI_CALL +hk_UpdateDescriptorSets(VkDevice device, uint32_t descriptorWriteCount, + const VkWriteDescriptorSet *pDescriptorWrites, + uint32_t descriptorCopyCount, + const VkCopyDescriptorSet *pDescriptorCopies) +{ + for (uint32_t w = 0; w < descriptorWriteCount; w++) { + const VkWriteDescriptorSet *write = &pDescriptorWrites[w]; + VK_FROM_HANDLE(hk_descriptor_set, set, write->dstSet); + + switch (write->descriptorType) { + case VK_DESCRIPTOR_TYPE_SAMPLER: + case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER: + case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE: + case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT: + for (uint32_t j = 0; j < write->descriptorCount; j++) { + write_sampled_image_view_desc( + set, write->pImageInfo + j, write->dstBinding, + write->dstArrayElement + j, write->descriptorType); + } + break; + + case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE: + for (uint32_t j = 0; j < write->descriptorCount; j++) { + write_storage_image_view_desc(set, write->pImageInfo + j, + write->dstBinding, + write->dstArrayElement + j); + } + break; + + case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER: + case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER: + for (uint32_t j = 0; j < write->descriptorCount; j++) { + write_buffer_view_desc(set, write->pTexelBufferView[j], + write->dstBinding, + write->dstArrayElement + j); + } + break; + + case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER: + case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER: + for (uint32_t j = 0; j < write->descriptorCount; j++) { + write_buffer_desc(set, write->pBufferInfo + j, write->dstBinding, + write->dstArrayElement + j); + } + break; + + case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC: + case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: + for (uint32_t j = 0; j < write->descriptorCount; j++) { + write_dynamic_buffer_desc(set, write->pBufferInfo + j, + write->dstBinding, + write->dstArrayElement + j); + } + break; + + case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK: { + const VkWriteDescriptorSetInlineUniformBlock *write_inline = + vk_find_struct_const(write->pNext, + WRITE_DESCRIPTOR_SET_INLINE_UNIFORM_BLOCK); + assert(write_inline->dataSize == write->descriptorCount); + write_inline_uniform_data(set, write_inline, write->dstBinding, + write->dstArrayElement); + break; + } + + default: + break; + } + } + + for (uint32_t i = 0; i < descriptorCopyCount; i++) { + const VkCopyDescriptorSet *copy = &pDescriptorCopies[i]; + VK_FROM_HANDLE(hk_descriptor_set, src, copy->srcSet); + 
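+      /* Copies are raw memcpys of the descriptor-buffer data; dynamic buffer
+       * bindings have no buffer-backed storage and are copied separately in
+       * the switch below.
+       */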
VK_FROM_HANDLE(hk_descriptor_set, dst, copy->dstSet); + + const struct hk_descriptor_set_binding_layout *src_binding_layout = + &src->layout->binding[copy->srcBinding]; + const struct hk_descriptor_set_binding_layout *dst_binding_layout = + &dst->layout->binding[copy->dstBinding]; + + if (dst_binding_layout->stride > 0 && src_binding_layout->stride > 0) { + for (uint32_t j = 0; j < copy->descriptorCount; j++) { + ASSERTED uint32_t dst_max_size, src_max_size; + void *dst_map = desc_ubo_data( + dst, copy->dstBinding, copy->dstArrayElement + j, &dst_max_size); + const void *src_map = desc_ubo_data( + src, copy->srcBinding, copy->srcArrayElement + j, &src_max_size); + const uint32_t copy_size = + MIN2(dst_binding_layout->stride, src_binding_layout->stride); + assert(copy_size <= dst_max_size && copy_size <= src_max_size); + memcpy(dst_map, src_map, copy_size); + } + } + + switch (src_binding_layout->type) { + case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC: + case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: { + const uint32_t dst_dyn_start = + dst_binding_layout->dynamic_buffer_index + copy->dstArrayElement; + const uint32_t src_dyn_start = + src_binding_layout->dynamic_buffer_index + copy->srcArrayElement; + typed_memcpy(&dst->dynamic_buffers[dst_dyn_start], + &src->dynamic_buffers[src_dyn_start], + copy->descriptorCount); + break; + } + default: + break; + } + } +} + +void +hk_push_descriptor_set_update(struct hk_push_descriptor_set *push_set, + struct hk_descriptor_set_layout *layout, + uint32_t write_count, + const VkWriteDescriptorSet *writes) +{ + assert(layout->non_variable_descriptor_buffer_size < sizeof(push_set->data)); + struct hk_descriptor_set set = { + .layout = layout, + .size = sizeof(push_set->data), + .mapped_ptr = push_set->data, + }; + + for (uint32_t w = 0; w < write_count; w++) { + const VkWriteDescriptorSet *write = &writes[w]; + assert(write->dstSet == VK_NULL_HANDLE); + + switch (write->descriptorType) { + case VK_DESCRIPTOR_TYPE_SAMPLER: + case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER: + case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE: + case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT: + for (uint32_t j = 0; j < write->descriptorCount; j++) { + write_sampled_image_view_desc( + &set, write->pImageInfo + j, write->dstBinding, + write->dstArrayElement + j, write->descriptorType); + } + break; + + case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE: + for (uint32_t j = 0; j < write->descriptorCount; j++) { + write_storage_image_view_desc(&set, write->pImageInfo + j, + write->dstBinding, + write->dstArrayElement + j); + } + break; + + case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER: + case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER: + for (uint32_t j = 0; j < write->descriptorCount; j++) { + write_buffer_view_desc(&set, write->pTexelBufferView[j], + write->dstBinding, + write->dstArrayElement + j); + } + break; + + case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER: + case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER: + for (uint32_t j = 0; j < write->descriptorCount; j++) { + write_buffer_desc(&set, write->pBufferInfo + j, write->dstBinding, + write->dstArrayElement + j); + } + break; + + default: + break; + } + } +} + +static void hk_descriptor_pool_free(struct hk_descriptor_pool *pool, + uint64_t addr, uint64_t size); + +static void +hk_descriptor_set_destroy(struct hk_device *dev, + struct hk_descriptor_pool *pool, + struct hk_descriptor_set *set) +{ + list_del(&set->link); + if (set->size > 0) + hk_descriptor_pool_free(pool, set->addr, set->size); + vk_descriptor_set_layout_unref(&dev->vk, &set->layout->vk); + + 
vk_object_free(&dev->vk, NULL, set); +} + +static void +hk_destroy_descriptor_pool(struct hk_device *dev, + const VkAllocationCallbacks *pAllocator, + struct hk_descriptor_pool *pool) +{ + list_for_each_entry_safe(struct hk_descriptor_set, set, &pool->sets, link) + hk_descriptor_set_destroy(dev, pool, set); + + util_vma_heap_finish(&pool->heap); + agx_bo_unreference(pool->bo); + + vk_object_free(&dev->vk, pAllocator, pool); +} + +VKAPI_ATTR VkResult VKAPI_CALL +hk_CreateDescriptorPool(VkDevice _device, + const VkDescriptorPoolCreateInfo *pCreateInfo, + const VkAllocationCallbacks *pAllocator, + VkDescriptorPool *pDescriptorPool) +{ + VK_FROM_HANDLE(hk_device, dev, _device); + struct hk_physical_device *pdev = hk_device_physical(dev); + struct hk_descriptor_pool *pool; + + pool = vk_object_zalloc(&dev->vk, pAllocator, sizeof(*pool), + VK_OBJECT_TYPE_DESCRIPTOR_POOL); + if (!pool) + return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY); + + list_inithead(&pool->sets); + + const VkMutableDescriptorTypeCreateInfoEXT *mutable_info = + vk_find_struct_const(pCreateInfo->pNext, + MUTABLE_DESCRIPTOR_TYPE_CREATE_INFO_EXT); + + uint32_t max_align = 0; + for (unsigned i = 0; i < pCreateInfo->poolSizeCount; ++i) { + const VkMutableDescriptorTypeListEXT *type_list = NULL; + if (pCreateInfo->pPoolSizes[i].type == VK_DESCRIPTOR_TYPE_MUTABLE_EXT && + mutable_info && i < mutable_info->mutableDescriptorTypeListCount) + type_list = &mutable_info->pMutableDescriptorTypeLists[i]; + + uint32_t stride, alignment; + hk_descriptor_stride_align_for_type(pdev, pCreateInfo->pPoolSizes[i].type, + type_list, &stride, &alignment); + max_align = MAX2(max_align, alignment); + } + + uint64_t bo_size = 0; + for (unsigned i = 0; i < pCreateInfo->poolSizeCount; ++i) { + const VkMutableDescriptorTypeListEXT *type_list = NULL; + if (pCreateInfo->pPoolSizes[i].type == VK_DESCRIPTOR_TYPE_MUTABLE_EXT && + mutable_info && i < mutable_info->mutableDescriptorTypeListCount) + type_list = &mutable_info->pMutableDescriptorTypeLists[i]; + + uint32_t stride, alignment; + hk_descriptor_stride_align_for_type(pdev, pCreateInfo->pPoolSizes[i].type, + type_list, &stride, &alignment); + bo_size += + MAX2(stride, max_align) * pCreateInfo->pPoolSizes[i].descriptorCount; + } + + /* Individual descriptor sets are aligned to the min UBO alignment to + * ensure that we don't end up with unaligned data access in any shaders. + * This means that each descriptor buffer allocated may burn up to 16B of + * extra space to get the right alignment. (Technically, it's at most 28B + * because we're always going to start at least 4B aligned but we're being + * conservative here.) Allocate enough extra space that we can chop it + * into maxSets pieces and align each one of them to 32B. + */ + bo_size += HK_MIN_UBO_ALIGNMENT * pCreateInfo->maxSets; + + if (bo_size) { + pool->bo = agx_bo_create(&dev->dev, bo_size, 0, "Descriptor pool"); + if (!pool->bo) { + hk_destroy_descriptor_pool(dev, pAllocator, pool); + return vk_error(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY); + } + + pool->mapped_ptr = pool->bo->ptr.cpu; + + /* The BO may be larger thanks to GPU page alignment. We may as well + * make that extra space available to the client. 
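+       * That is why util_vma_heap_init() below spans bo->size rather than
+       * the requested bo_size.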
+ */ + assert(pool->bo->size >= bo_size); + util_vma_heap_init(&pool->heap, pool->bo->ptr.gpu, pool->bo->size); + } else { + util_vma_heap_init(&pool->heap, 0, 0); + } + + *pDescriptorPool = hk_descriptor_pool_to_handle(pool); + return VK_SUCCESS; +} + +static VkResult +hk_descriptor_pool_alloc(struct hk_descriptor_pool *pool, uint64_t size, + uint64_t alignment, uint64_t *addr_out, void **map_out) +{ + assert(size > 0); + uint64_t addr = util_vma_heap_alloc(&pool->heap, size, alignment); + if (addr == 0) + return VK_ERROR_OUT_OF_POOL_MEMORY; + + assert(addr >= pool->bo->ptr.gpu); + assert(addr + size <= pool->bo->ptr.gpu + pool->bo->size); + uint64_t offset = addr - pool->bo->ptr.gpu; + + *addr_out = addr; + *map_out = pool->mapped_ptr + offset; + + return VK_SUCCESS; +} + +static void +hk_descriptor_pool_free(struct hk_descriptor_pool *pool, uint64_t addr, + uint64_t size) +{ + assert(size > 0); + assert(addr >= pool->bo->ptr.gpu); + assert(addr + size <= pool->bo->ptr.gpu + pool->bo->size); + util_vma_heap_free(&pool->heap, addr, size); +} + +static VkResult +hk_descriptor_set_create(struct hk_device *dev, struct hk_descriptor_pool *pool, + struct hk_descriptor_set_layout *layout, + uint32_t variable_count, + struct hk_descriptor_set **out_set) +{ + struct hk_descriptor_set *set; + VkResult result; + + uint32_t mem_size = + sizeof(struct hk_descriptor_set) + + layout->dynamic_buffer_count * sizeof(struct hk_buffer_address); + + set = + vk_object_zalloc(&dev->vk, NULL, mem_size, VK_OBJECT_TYPE_DESCRIPTOR_SET); + if (!set) + return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY); + + set->size = layout->non_variable_descriptor_buffer_size; + + if (layout->binding_count > 0 && + (layout->binding[layout->binding_count - 1].flags & + VK_DESCRIPTOR_BINDING_VARIABLE_DESCRIPTOR_COUNT_BIT)) { + uint32_t stride = layout->binding[layout->binding_count - 1].stride; + set->size += stride * variable_count; + } + + set->size = align64(set->size, HK_MIN_UBO_ALIGNMENT); + + if (set->size > 0) { + result = hk_descriptor_pool_alloc(pool, set->size, HK_MIN_UBO_ALIGNMENT, + &set->addr, &set->mapped_ptr); + if (result != VK_SUCCESS) { + vk_object_free(&dev->vk, NULL, set); + return result; + } + } + + vk_descriptor_set_layout_ref(&layout->vk); + set->layout = layout; + + for (uint32_t b = 0; b < layout->binding_count; b++) { + if (layout->binding[b].type != VK_DESCRIPTOR_TYPE_SAMPLER && + layout->binding[b].type != VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER) + continue; + + if (layout->binding[b].immutable_samplers == NULL) + continue; + + uint32_t array_size = layout->binding[b].array_size; + if (layout->binding[b].flags & + VK_DESCRIPTOR_BINDING_VARIABLE_DESCRIPTOR_COUNT_BIT) + array_size = variable_count; + + for (uint32_t j = 0; j < array_size; j++) { + write_sampled_image_view_desc(set, NULL, b, j, + layout->binding[b].type); + } + } + + list_addtail(&set->link, &pool->sets); + *out_set = set; + + return VK_SUCCESS; +} + +VKAPI_ATTR VkResult VKAPI_CALL +hk_AllocateDescriptorSets(VkDevice device, + const VkDescriptorSetAllocateInfo *pAllocateInfo, + VkDescriptorSet *pDescriptorSets) +{ + VK_FROM_HANDLE(hk_device, dev, device); + VK_FROM_HANDLE(hk_descriptor_pool, pool, pAllocateInfo->descriptorPool); + + VkResult result = VK_SUCCESS; + uint32_t i; + + struct hk_descriptor_set *set = NULL; + + const VkDescriptorSetVariableDescriptorCountAllocateInfo *var_desc_count = + vk_find_struct_const( + pAllocateInfo->pNext, + DESCRIPTOR_SET_VARIABLE_DESCRIPTOR_COUNT_ALLOCATE_INFO); + + /* allocate a set of buffers for 
each shader to contain descriptors */ + for (i = 0; i < pAllocateInfo->descriptorSetCount; i++) { + VK_FROM_HANDLE(hk_descriptor_set_layout, layout, + pAllocateInfo->pSetLayouts[i]); + /* If descriptorSetCount is zero or this structure is not included in + * the pNext chain, then the variable lengths are considered to be zero. + */ + const uint32_t variable_count = + var_desc_count && var_desc_count->descriptorSetCount > 0 + ? var_desc_count->pDescriptorCounts[i] + : 0; + + result = + hk_descriptor_set_create(dev, pool, layout, variable_count, &set); + if (result != VK_SUCCESS) + break; + + pDescriptorSets[i] = hk_descriptor_set_to_handle(set); + } + + if (result != VK_SUCCESS) { + hk_FreeDescriptorSets(device, pAllocateInfo->descriptorPool, i, + pDescriptorSets); + for (i = 0; i < pAllocateInfo->descriptorSetCount; i++) { + pDescriptorSets[i] = VK_NULL_HANDLE; + } + } + return result; +} + +VKAPI_ATTR VkResult VKAPI_CALL +hk_FreeDescriptorSets(VkDevice device, VkDescriptorPool descriptorPool, + uint32_t descriptorSetCount, + const VkDescriptorSet *pDescriptorSets) +{ + VK_FROM_HANDLE(hk_device, dev, device); + VK_FROM_HANDLE(hk_descriptor_pool, pool, descriptorPool); + + for (uint32_t i = 0; i < descriptorSetCount; i++) { + VK_FROM_HANDLE(hk_descriptor_set, set, pDescriptorSets[i]); + + if (set) + hk_descriptor_set_destroy(dev, pool, set); + } + return VK_SUCCESS; +} + +VKAPI_ATTR void VKAPI_CALL +hk_DestroyDescriptorPool(VkDevice device, VkDescriptorPool _pool, + const VkAllocationCallbacks *pAllocator) +{ + VK_FROM_HANDLE(hk_device, dev, device); + VK_FROM_HANDLE(hk_descriptor_pool, pool, _pool); + + if (!_pool) + return; + + hk_destroy_descriptor_pool(dev, pAllocator, pool); +} + +VKAPI_ATTR VkResult VKAPI_CALL +hk_ResetDescriptorPool(VkDevice device, VkDescriptorPool descriptorPool, + VkDescriptorPoolResetFlags flags) +{ + VK_FROM_HANDLE(hk_device, dev, device); + VK_FROM_HANDLE(hk_descriptor_pool, pool, descriptorPool); + + list_for_each_entry_safe(struct hk_descriptor_set, set, &pool->sets, link) + hk_descriptor_set_destroy(dev, pool, set); + + return VK_SUCCESS; +} + +static void +hk_descriptor_set_write_template( + struct hk_descriptor_set *set, + const struct vk_descriptor_update_template *template, const void *data) +{ + for (uint32_t i = 0; i < template->entry_count; i++) { + const struct vk_descriptor_template_entry *entry = &template->entries[i]; + + switch (entry->type) { + case VK_DESCRIPTOR_TYPE_SAMPLER: + case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER: + case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE: + case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT: + for (uint32_t j = 0; j < entry->array_count; j++) { + const VkDescriptorImageInfo *info = + data + entry->offset + j * entry->stride; + + write_sampled_image_view_desc(set, info, entry->binding, + entry->array_element + j, + entry->type); + } + break; + + case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE: + for (uint32_t j = 0; j < entry->array_count; j++) { + const VkDescriptorImageInfo *info = + data + entry->offset + j * entry->stride; + + write_storage_image_view_desc(set, info, entry->binding, + entry->array_element + j); + } + break; + + case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER: + case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER: + for (uint32_t j = 0; j < entry->array_count; j++) { + const VkBufferView *bview = + data + entry->offset + j * entry->stride; + + write_buffer_view_desc(set, *bview, entry->binding, + entry->array_element + j); + } + break; + + case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER: + case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER: + for 
(uint32_t j = 0; j < entry->array_count; j++) { + const VkDescriptorBufferInfo *info = + data + entry->offset + j * entry->stride; + + write_buffer_desc(set, info, entry->binding, + entry->array_element + j); + } + break; + + case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC: + case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: + for (uint32_t j = 0; j < entry->array_count; j++) { + const VkDescriptorBufferInfo *info = + data + entry->offset + j * entry->stride; + + write_dynamic_buffer_desc(set, info, entry->binding, + entry->array_element + j); + } + break; + + case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK: + write_desc(set, entry->binding, entry->array_element, + data + entry->offset, entry->array_count); + break; + + default: + break; + } + } +} + +VKAPI_ATTR void VKAPI_CALL +hk_UpdateDescriptorSetWithTemplate( + VkDevice device, VkDescriptorSet descriptorSet, + VkDescriptorUpdateTemplate descriptorUpdateTemplate, const void *pData) +{ + VK_FROM_HANDLE(hk_descriptor_set, set, descriptorSet); + VK_FROM_HANDLE(vk_descriptor_update_template, template, + descriptorUpdateTemplate); + + hk_descriptor_set_write_template(set, template, pData); +} + +void +hk_push_descriptor_set_update_template( + struct hk_push_descriptor_set *push_set, + struct hk_descriptor_set_layout *layout, + const struct vk_descriptor_update_template *template, const void *data) +{ + struct hk_descriptor_set tmp_set = { + .layout = layout, + .size = sizeof(push_set->data), + .mapped_ptr = push_set->data, + }; + hk_descriptor_set_write_template(&tmp_set, template, data); +} diff --git a/src/asahi/vulkan/hk_descriptor_set.h b/src/asahi/vulkan/hk_descriptor_set.h new file mode 100644 index 00000000000..88606654df2 --- /dev/null +++ b/src/asahi/vulkan/hk_descriptor_set.h @@ -0,0 +1,107 @@ +/* + * Copyright 2024 Valve Corporation + * Copyright 2024 Alyssa Rosenzweig + * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc. + * SPDX-License-Identifier: MIT + */ + +#pragma once + +#include "hk_private.h" + +#include "hk_device.h" +#include "vk_descriptor_update_template.h" +#include "vk_object.h" + +#include "util/list.h" +#include "util/vma.h" + +/* Stride of the image heap, equal to the size of a texture/PBE descriptor */ +#define HK_IMAGE_STRIDE (24) + +struct hk_descriptor_set_layout; + +struct hk_sampled_image_descriptor { + uint32_t image_offset; + uint16_t sampler_index; + uint16_t lod_bias_fp16; + /* TODO: This should probably be a heap! */ + uint32_t border[4]; + /* XXX: Single bit! Tuck it in somewhere else */ + uint32_t has_border; + uint16_t clamp_0_sampler_index; + uint16_t pad_0; +}; +static_assert(sizeof(struct hk_sampled_image_descriptor) == 32, + "hk_sampled_image_descriptor has no holes"); + +struct hk_storage_image_descriptor { + uint32_t tex_offset; + uint32_t pbe_offset; +}; +static_assert(sizeof(struct hk_storage_image_descriptor) == 8, + "hk_storage_image_descriptor has no holes"); + +struct hk_buffer_view_descriptor { + uint32_t tex_offset; + uint32_t pbe_offset; +}; +static_assert(sizeof(struct hk_buffer_view_descriptor) == 8, + "hk_buffer_view_descriptor has no holes"); + +/* This has to match nir_address_format_64bit_bounded_global */ +struct hk_buffer_address { + uint64_t base_addr; + uint32_t size; + uint32_t zero; /* Must be zero! 
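+                        (it lines up with the offset component of
+                        nir_address_format_64bit_bounded_global, so the
+                        descriptor is written with a zero starting offset)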
*/ +}; + +struct hk_descriptor_pool { + struct vk_object_base base; + + struct list_head sets; + + struct agx_bo *bo; + uint8_t *mapped_ptr; + struct util_vma_heap heap; +}; + +VK_DEFINE_NONDISP_HANDLE_CASTS(hk_descriptor_pool, base, VkDescriptorPool, + VK_OBJECT_TYPE_DESCRIPTOR_POOL) + +struct hk_descriptor_set { + struct vk_object_base base; + + /* Link in hk_descriptor_pool::sets */ + struct list_head link; + + struct hk_descriptor_set_layout *layout; + void *mapped_ptr; + uint64_t addr; + uint32_t size; + + struct hk_buffer_address dynamic_buffers[]; +}; + +VK_DEFINE_NONDISP_HANDLE_CASTS(hk_descriptor_set, base, VkDescriptorSet, + VK_OBJECT_TYPE_DESCRIPTOR_SET) + +static inline uint64_t +hk_descriptor_set_addr(const struct hk_descriptor_set *set) +{ + return set->addr; +} + +struct hk_push_descriptor_set { + uint8_t data[HK_PUSH_DESCRIPTOR_SET_SIZE]; +}; + +void hk_push_descriptor_set_update(struct hk_push_descriptor_set *push_set, + struct hk_descriptor_set_layout *layout, + uint32_t write_count, + const VkWriteDescriptorSet *writes); + +void hk_push_descriptor_set_update_template( + struct hk_push_descriptor_set *push_set, + struct hk_descriptor_set_layout *layout, + const struct vk_descriptor_update_template *template, const void *data); diff --git a/src/asahi/vulkan/hk_descriptor_set_layout.c b/src/asahi/vulkan/hk_descriptor_set_layout.c new file mode 100644 index 00000000000..7efe2e127a6 --- /dev/null +++ b/src/asahi/vulkan/hk_descriptor_set_layout.c @@ -0,0 +1,423 @@ +/* + * Copyright 2024 Valve Corporation + * Copyright 2024 Alyssa Rosenzweig + * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc. + * SPDX-License-Identifier: MIT + */ +#include "hk_descriptor_set_layout.h" + +#include "hk_descriptor_set.h" +#include "hk_device.h" +#include "hk_entrypoints.h" +#include "hk_physical_device.h" +#include "hk_sampler.h" + +#include "vk_pipeline_layout.h" + +static bool +binding_has_immutable_samplers(const VkDescriptorSetLayoutBinding *binding) +{ + switch (binding->descriptorType) { + case VK_DESCRIPTOR_TYPE_SAMPLER: + case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER: + return binding->pImmutableSamplers != NULL; + + default: + return false; + } +} + +void +hk_descriptor_stride_align_for_type( + const struct hk_physical_device *pdev, VkDescriptorType type, + const VkMutableDescriptorTypeListEXT *type_list, uint32_t *stride, + uint32_t *alignment) +{ + switch (type) { + case VK_DESCRIPTOR_TYPE_SAMPLER: + case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER: + /* TODO: How do samplers work? 
*/ + case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE: + case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT: + *stride = *alignment = sizeof(struct hk_sampled_image_descriptor); + break; + + case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE: + *stride = *alignment = sizeof(struct hk_storage_image_descriptor); + break; + + case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER: + case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER: + *stride = *alignment = sizeof(struct hk_buffer_view_descriptor); + break; + + case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER: + case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER: + *stride = *alignment = sizeof(struct hk_buffer_address); + break; + + case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC: + case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: + *stride = *alignment = 0; /* These don't take up buffer space */ + break; + + case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK: + *stride = 1; /* Array size is bytes */ + *alignment = HK_MIN_UBO_ALIGNMENT; + break; + + case VK_DESCRIPTOR_TYPE_MUTABLE_EXT: + *stride = *alignment = 0; + if (type_list == NULL) + *stride = *alignment = HK_MAX_DESCRIPTOR_SIZE; + for (unsigned i = 0; type_list && i < type_list->descriptorTypeCount; + i++) { + /* This shouldn't recurse */ + assert(type_list->pDescriptorTypes[i] != + VK_DESCRIPTOR_TYPE_MUTABLE_EXT); + uint32_t desc_stride, desc_align; + hk_descriptor_stride_align_for_type(pdev, + type_list->pDescriptorTypes[i], + NULL, &desc_stride, &desc_align); + *stride = MAX2(*stride, desc_stride); + *alignment = MAX2(*alignment, desc_align); + } + *stride = ALIGN(*stride, *alignment); + break; + + default: + unreachable("Invalid descriptor type"); + } + + assert(*stride <= HK_MAX_DESCRIPTOR_SIZE); +} + +static const VkMutableDescriptorTypeListEXT * +hk_descriptor_get_type_list(VkDescriptorType type, + const VkMutableDescriptorTypeCreateInfoEXT *info, + const uint32_t info_idx) +{ + const VkMutableDescriptorTypeListEXT *type_list = NULL; + if (type == VK_DESCRIPTOR_TYPE_MUTABLE_EXT) { + assert(info != NULL); + assert(info_idx < info->mutableDescriptorTypeListCount); + type_list = &info->pMutableDescriptorTypeLists[info_idx]; + } + return type_list; +} + +VKAPI_ATTR VkResult VKAPI_CALL +hk_CreateDescriptorSetLayout(VkDevice device, + const VkDescriptorSetLayoutCreateInfo *pCreateInfo, + const VkAllocationCallbacks *pAllocator, + VkDescriptorSetLayout *pSetLayout) +{ + VK_FROM_HANDLE(hk_device, dev, device); + struct hk_physical_device *pdev = hk_device_physical(dev); + + uint32_t num_bindings = 0; + uint32_t immutable_sampler_count = 0; + for (uint32_t j = 0; j < pCreateInfo->bindingCount; j++) { + const VkDescriptorSetLayoutBinding *binding = &pCreateInfo->pBindings[j]; + num_bindings = MAX2(num_bindings, binding->binding + 1); + + /* From the Vulkan 1.1.97 spec for VkDescriptorSetLayoutBinding: + * + * "If descriptorType specifies a VK_DESCRIPTOR_TYPE_SAMPLER or + * VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER type descriptor, then + * pImmutableSamplers can be used to initialize a set of immutable + * samplers. [...] If descriptorType is not one of these descriptor + * types, then pImmutableSamplers is ignored. + * + * We need to be careful here and only parse pImmutableSamplers if we + * have one of the right descriptor types. 
+ */ + if (binding_has_immutable_samplers(binding)) + immutable_sampler_count += binding->descriptorCount; + } + + VK_MULTIALLOC(ma); + VK_MULTIALLOC_DECL(&ma, struct hk_descriptor_set_layout, layout, 1); + VK_MULTIALLOC_DECL(&ma, struct hk_descriptor_set_binding_layout, bindings, + num_bindings); + VK_MULTIALLOC_DECL(&ma, struct hk_sampler *, samplers, + immutable_sampler_count); + + if (!vk_descriptor_set_layout_multizalloc(&dev->vk, &ma)) + return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY); + + layout->binding_count = num_bindings; + + for (uint32_t j = 0; j < pCreateInfo->bindingCount; j++) { + const VkDescriptorSetLayoutBinding *binding = &pCreateInfo->pBindings[j]; + uint32_t b = binding->binding; + /* We temporarily store pCreateInfo->pBindings[] index (plus one) in the + * immutable_samplers pointer. This provides us with a quick-and-dirty + * way to sort the bindings by binding number. + */ + layout->binding[b].immutable_samplers = (void *)(uintptr_t)(j + 1); + } + + const VkDescriptorSetLayoutBindingFlagsCreateInfo *binding_flags_info = + vk_find_struct_const(pCreateInfo->pNext, + DESCRIPTOR_SET_LAYOUT_BINDING_FLAGS_CREATE_INFO); + const VkMutableDescriptorTypeCreateInfoEXT *mutable_info = + vk_find_struct_const(pCreateInfo->pNext, + MUTABLE_DESCRIPTOR_TYPE_CREATE_INFO_EXT); + + uint32_t buffer_size = 0; + uint8_t dynamic_buffer_count = 0; + for (uint32_t b = 0; b < num_bindings; b++) { + /* We stashed the pCreateInfo->pBindings[] index (plus one) in the + * immutable_samplers pointer. Check for NULL (empty binding) and then + * reset it and compute the index. + */ + if (layout->binding[b].immutable_samplers == NULL) + continue; + const uint32_t info_idx = + (uintptr_t)(void *)layout->binding[b].immutable_samplers - 1; + layout->binding[b].immutable_samplers = NULL; + + const VkDescriptorSetLayoutBinding *binding = + &pCreateInfo->pBindings[info_idx]; + + if (binding->descriptorCount == 0) + continue; + + layout->binding[b].type = binding->descriptorType; + + if (binding_flags_info && binding_flags_info->bindingCount > 0) { + assert(binding_flags_info->bindingCount == pCreateInfo->bindingCount); + layout->binding[b].flags = binding_flags_info->pBindingFlags[info_idx]; + } + + layout->binding[b].array_size = binding->descriptorCount; + + switch (binding->descriptorType) { + case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC: + case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: + layout->binding[b].dynamic_buffer_index = dynamic_buffer_count; + dynamic_buffer_count += binding->descriptorCount; + break; + default: + break; + } + + const VkMutableDescriptorTypeListEXT *type_list = + hk_descriptor_get_type_list(binding->descriptorType, mutable_info, + info_idx); + + uint32_t stride, alignment; + hk_descriptor_stride_align_for_type(pdev, binding->descriptorType, + type_list, &stride, &alignment); + + uint8_t max_plane_count = 1; + + if (binding_has_immutable_samplers(binding)) { + layout->binding[b].immutable_samplers = samplers; + samplers += binding->descriptorCount; + for (uint32_t i = 0; i < binding->descriptorCount; i++) { + VK_FROM_HANDLE(hk_sampler, sampler, binding->pImmutableSamplers[i]); + layout->binding[b].immutable_samplers[i] = sampler; + const uint8_t sampler_plane_count = + sampler->vk.ycbcr_conversion + ? 
vk_format_get_plane_count( + sampler->vk.ycbcr_conversion->state.format) + : 1; + if (max_plane_count < sampler_plane_count) + max_plane_count = sampler_plane_count; + } + } + + stride *= max_plane_count; + + if (stride > 0) { + assert(stride <= UINT8_MAX); + assert(util_is_power_of_two_nonzero(alignment)); + + buffer_size = align64(buffer_size, alignment); + layout->binding[b].offset = buffer_size; + layout->binding[b].stride = stride; + + if (layout->binding[b].flags & + VK_DESCRIPTOR_BINDING_VARIABLE_DESCRIPTOR_COUNT_BIT) { + /* From the Vulkan 1.3.256 spec: + * + * VUID-VkDescriptorSetLayoutBindingFlagsCreateInfo-pBindingFlags-03004 + * "If an element of pBindingFlags includes + * VK_DESCRIPTOR_BINDING_VARIABLE_DESCRIPTOR_COUNT_BIT, then + * all other elements of + * VkDescriptorSetLayoutCreateInfo::pBindings must have a + * smaller value of binding" + * + * In other words, it has to be the last binding. + */ + assert(b == num_bindings - 1); + } else { + /* the allocation size will be computed at descriptor allocation, + * but the buffer size will be already aligned as this binding will + * be the last + */ + buffer_size += stride * binding->descriptorCount; + } + } + } + + layout->non_variable_descriptor_buffer_size = buffer_size; + layout->dynamic_buffer_count = dynamic_buffer_count; + + struct mesa_blake3 blake3_ctx; + _mesa_blake3_init(&blake3_ctx); + +#define BLAKE3_UPDATE_VALUE(x) \ + _mesa_blake3_update(&blake3_ctx, &(x), sizeof(x)); + BLAKE3_UPDATE_VALUE(layout->non_variable_descriptor_buffer_size); + BLAKE3_UPDATE_VALUE(layout->dynamic_buffer_count); + BLAKE3_UPDATE_VALUE(layout->binding_count); + + for (uint32_t b = 0; b < num_bindings; b++) { + BLAKE3_UPDATE_VALUE(layout->binding[b].type); + BLAKE3_UPDATE_VALUE(layout->binding[b].flags); + BLAKE3_UPDATE_VALUE(layout->binding[b].array_size); + BLAKE3_UPDATE_VALUE(layout->binding[b].offset); + BLAKE3_UPDATE_VALUE(layout->binding[b].stride); + BLAKE3_UPDATE_VALUE(layout->binding[b].dynamic_buffer_index); + + if (layout->binding[b].immutable_samplers != NULL) { + for (uint32_t i = 0; i < layout->binding[b].array_size; i++) { + const struct hk_sampler *sampler = + layout->binding[b].immutable_samplers[i]; + + /* We zalloc the object, so it's safe to hash the whole thing */ + if (sampler != NULL && sampler->vk.ycbcr_conversion != NULL) + BLAKE3_UPDATE_VALUE(sampler->vk.ycbcr_conversion->state); + } + } + } +#undef BLAKE3_UPDATE_VALUE + + _mesa_blake3_final(&blake3_ctx, layout->vk.blake3); + + *pSetLayout = hk_descriptor_set_layout_to_handle(layout); + + return VK_SUCCESS; +} + +VKAPI_ATTR void VKAPI_CALL +hk_GetDescriptorSetLayoutSupport( + VkDevice device, const VkDescriptorSetLayoutCreateInfo *pCreateInfo, + VkDescriptorSetLayoutSupport *pSupport) +{ + VK_FROM_HANDLE(hk_device, dev, device); + struct hk_physical_device *pdev = hk_device_physical(dev); + + const VkMutableDescriptorTypeCreateInfoEXT *mutable_info = + vk_find_struct_const(pCreateInfo->pNext, + MUTABLE_DESCRIPTOR_TYPE_CREATE_INFO_EXT); + const VkDescriptorSetLayoutBindingFlagsCreateInfo *binding_flags = + vk_find_struct_const(pCreateInfo->pNext, + DESCRIPTOR_SET_LAYOUT_BINDING_FLAGS_CREATE_INFO); + + /* Figure out the maximum alignment up-front. Otherwise, we need to sort + * the list of descriptors by binding number in order to get the size + * accumulation right. 
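+    * Padding every binding to max_align instead lets the loop below
+    * accumulate sizes in pCreateInfo order and still reach a safe upper
+    * bound for the support check.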
+ */ + uint32_t max_align = 0; + for (uint32_t i = 0; i < pCreateInfo->bindingCount; i++) { + const VkDescriptorSetLayoutBinding *binding = &pCreateInfo->pBindings[i]; + const VkMutableDescriptorTypeListEXT *type_list = + hk_descriptor_get_type_list(binding->descriptorType, mutable_info, i); + + uint32_t stride, alignment; + hk_descriptor_stride_align_for_type(pdev, binding->descriptorType, + type_list, &stride, &alignment); + max_align = MAX2(max_align, alignment); + } + + uint64_t non_variable_size = 0; + uint32_t variable_stride = 0; + uint32_t variable_count = 0; + uint8_t dynamic_buffer_count = 0; + + for (uint32_t i = 0; i < pCreateInfo->bindingCount; i++) { + const VkDescriptorSetLayoutBinding *binding = &pCreateInfo->pBindings[i]; + + VkDescriptorBindingFlags flags = 0; + if (binding_flags != NULL && binding_flags->bindingCount > 0) + flags = binding_flags->pBindingFlags[i]; + + switch (binding->descriptorType) { + case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC: + case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: + dynamic_buffer_count += binding->descriptorCount; + break; + default: + break; + } + + const VkMutableDescriptorTypeListEXT *type_list = + hk_descriptor_get_type_list(binding->descriptorType, mutable_info, i); + + uint32_t stride, alignment; + hk_descriptor_stride_align_for_type(pdev, binding->descriptorType, + type_list, &stride, &alignment); + + if (stride > 0) { + assert(stride <= UINT8_MAX); + assert(util_is_power_of_two_nonzero(alignment)); + + if (flags & VK_DESCRIPTOR_BINDING_VARIABLE_DESCRIPTOR_COUNT_BIT) { + /* From the Vulkan 1.3.256 spec: + * + * "For the purposes of this command, a variable-sized + * descriptor binding with a descriptorCount of zero is treated + * as if the descriptorCount is one" + */ + variable_count = MAX2(1, binding->descriptorCount); + variable_stride = stride; + } else { + /* Since we're aligning to the maximum and since this is just a + * check for whether or not the max buffer size is big enough, we + * keep non_variable_size aligned to max_align. + */ + non_variable_size += stride * binding->descriptorCount; + non_variable_size = align64(non_variable_size, max_align); + } + } + } + + uint64_t buffer_size = non_variable_size; + if (variable_stride > 0) { + buffer_size += variable_stride * variable_count; + buffer_size = align64(buffer_size, max_align); + } + + uint32_t max_buffer_size; + if (pCreateInfo->flags & + VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR) + max_buffer_size = HK_PUSH_DESCRIPTOR_SET_SIZE; + else + max_buffer_size = HK_MAX_DESCRIPTOR_SET_SIZE; + + pSupport->supported = dynamic_buffer_count <= HK_MAX_DYNAMIC_BUFFERS && + buffer_size <= max_buffer_size; + + vk_foreach_struct(ext, pSupport->pNext) { + switch (ext->sType) { + case VK_STRUCTURE_TYPE_DESCRIPTOR_SET_VARIABLE_DESCRIPTOR_COUNT_LAYOUT_SUPPORT: { + VkDescriptorSetVariableDescriptorCountLayoutSupport *vs = (void *)ext; + if (variable_stride > 0) { + vs->maxVariableDescriptorCount = + (max_buffer_size - non_variable_size) / variable_stride; + } else { + vs->maxVariableDescriptorCount = 0; + } + break; + } + + default: + vk_debug_ignored_stype(ext->sType); + break; + } + } +} diff --git a/src/asahi/vulkan/hk_descriptor_set_layout.h b/src/asahi/vulkan/hk_descriptor_set_layout.h new file mode 100644 index 00000000000..a21a885a918 --- /dev/null +++ b/src/asahi/vulkan/hk_descriptor_set_layout.h @@ -0,0 +1,75 @@ +/* + * Copyright 2024 Valve Corporation + * Copyright 2024 Alyssa Rosenzweig + * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc. 
+ * SPDX-License-Identifier: MIT + */ + +#pragma once + +#include "hk_private.h" + +#include "vk_descriptor_set_layout.h" +#include "vk_object.h" + +struct hk_device; +struct hk_physical_device; +struct hk_sampler; +struct vk_pipeline_layout; + +struct hk_descriptor_set_binding_layout { + /* The type of the descriptors in this binding */ + VkDescriptorType type; + + /* Flags provided when this binding was created */ + VkDescriptorBindingFlags flags; + + /* Number of array elements in this binding (or size in bytes for inline + * uniform data) + */ + uint32_t array_size; + + /* Offset into the descriptor buffer where this descriptor lives */ + uint32_t offset; + + /* Stride between array elements in the descriptor buffer */ + uint8_t stride; + + /* Index into the dynamic buffer binding array */ + uint8_t dynamic_buffer_index; + + /* Immutable samplers (or NULL if no immutable samplers) */ + struct hk_sampler **immutable_samplers; +}; + +struct hk_descriptor_set_layout { + struct vk_descriptor_set_layout vk; + + /* Size of the descriptor buffer for this descriptor set */ + /* Does not contain the size needed for variable count descriptors */ + uint32_t non_variable_descriptor_buffer_size; + + /* Number of dynamic UBO bindings in this set */ + uint8_t dynamic_buffer_count; + + /* Number of bindings in this descriptor set */ + uint32_t binding_count; + + /* Bindings in this descriptor set */ + struct hk_descriptor_set_binding_layout binding[0]; +}; + +VK_DEFINE_NONDISP_HANDLE_CASTS(hk_descriptor_set_layout, vk.base, + VkDescriptorSetLayout, + VK_OBJECT_TYPE_DESCRIPTOR_SET_LAYOUT) + +void hk_descriptor_stride_align_for_type( + const struct hk_physical_device *pdev, VkDescriptorType type, + const VkMutableDescriptorTypeListEXT *type_list, uint32_t *stride, + uint32_t *alignment); + +static inline struct hk_descriptor_set_layout * +vk_to_hk_descriptor_set_layout(struct vk_descriptor_set_layout *layout) +{ + return container_of(layout, struct hk_descriptor_set_layout, vk); +} diff --git a/src/asahi/vulkan/hk_descriptor_table.c b/src/asahi/vulkan/hk_descriptor_table.c new file mode 100644 index 00000000000..6d07ac6f384 --- /dev/null +++ b/src/asahi/vulkan/hk_descriptor_table.c @@ -0,0 +1,179 @@ +/* + * Copyright 2024 Valve Corporation + * Copyright 2024 Alyssa Rosenzweig + * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc. 
+ * SPDX-License-Identifier: MIT + */ +#include "hk_descriptor_table.h" + +#include "hk_device.h" +#include "hk_physical_device.h" + +#include "asahi/lib/agx_bo.h" +#include + +static VkResult +hk_descriptor_table_grow_locked(struct hk_device *dev, + struct hk_descriptor_table *table, + uint32_t new_alloc) +{ + struct agx_bo *new_bo; + uint32_t *new_free_table; + + assert(new_alloc > table->alloc && new_alloc <= table->max_alloc); + + const uint32_t new_bo_size = new_alloc * table->desc_size; + new_bo = agx_bo_create(&dev->dev, new_bo_size, 0, "Descriptor table"); + + if (new_bo == NULL) { + return vk_errorf(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY, + "Failed to allocate the descriptor table"); + } + + void *new_map = new_bo->ptr.cpu; + + assert(table->bo == NULL && "not yet implemented sparse binding"); + table->bo = new_bo; + table->map = new_map; + + const size_t new_free_table_size = new_alloc * sizeof(uint32_t); + new_free_table = + vk_realloc(&dev->vk.alloc, table->free_table, new_free_table_size, 4, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + if (new_free_table == NULL) { + return vk_errorf(dev, VK_ERROR_OUT_OF_HOST_MEMORY, + "Failed to allocate image descriptor free table"); + } + table->free_table = new_free_table; + + table->alloc = new_alloc; + + return VK_SUCCESS; +} + +VkResult +hk_descriptor_table_init(struct hk_device *dev, + struct hk_descriptor_table *table, + uint32_t descriptor_size, + uint32_t min_descriptor_count, + uint32_t max_descriptor_count) +{ + memset(table, 0, sizeof(*table)); + VkResult result; + + simple_mtx_init(&table->mutex, mtx_plain); + + assert(util_is_power_of_two_nonzero(min_descriptor_count)); + assert(util_is_power_of_two_nonzero(max_descriptor_count)); + + /* TODO: sparse binding for stable gpu va */ + min_descriptor_count = max_descriptor_count; + + table->desc_size = descriptor_size; + table->alloc = 0; + table->max_alloc = max_descriptor_count; + table->next_desc = 0; + table->free_count = 0; + + result = hk_descriptor_table_grow_locked(dev, table, min_descriptor_count); + if (result != VK_SUCCESS) { + hk_descriptor_table_finish(dev, table); + return result; + } + + return VK_SUCCESS; +} + +void +hk_descriptor_table_finish(struct hk_device *dev, + struct hk_descriptor_table *table) +{ + agx_bo_unreference(table->bo); + vk_free(&dev->vk.alloc, table->free_table); + simple_mtx_destroy(&table->mutex); +} + +#define HK_IMAGE_DESC_INVALID + +static VkResult +hk_descriptor_table_alloc_locked(struct hk_device *dev, + struct hk_descriptor_table *table, + uint32_t *index_out) +{ + VkResult result; + + if (table->free_count > 0) { + *index_out = table->free_table[--table->free_count]; + return VK_SUCCESS; + } + + if (table->next_desc < table->alloc) { + *index_out = table->next_desc++; + return VK_SUCCESS; + } + + if (table->next_desc >= table->max_alloc) { + return vk_errorf(dev, VK_ERROR_OUT_OF_HOST_MEMORY, + "Descriptor table not large enough"); + } + + result = hk_descriptor_table_grow_locked(dev, table, table->alloc * 2); + if (result != VK_SUCCESS) + return result; + + assert(table->next_desc < table->alloc); + *index_out = table->next_desc++; + + return VK_SUCCESS; +} + +static VkResult +hk_descriptor_table_add_locked(struct hk_device *dev, + struct hk_descriptor_table *table, + const void *desc_data, size_t desc_size, + uint32_t *index_out) +{ + VkResult result = hk_descriptor_table_alloc_locked(dev, table, index_out); + if (result != VK_SUCCESS) + return result; + + void *map = (char *)table->map + (*index_out * table->desc_size); + + assert(desc_size == 
table->desc_size); + memcpy(map, desc_data, table->desc_size); + + return VK_SUCCESS; +} + +VkResult +hk_descriptor_table_add(struct hk_device *dev, + struct hk_descriptor_table *table, + const void *desc_data, size_t desc_size, + uint32_t *index_out) +{ + simple_mtx_lock(&table->mutex); + VkResult result = hk_descriptor_table_add_locked(dev, table, desc_data, + desc_size, index_out); + simple_mtx_unlock(&table->mutex); + + return result; +} + +void +hk_descriptor_table_remove(struct hk_device *dev, + struct hk_descriptor_table *table, uint32_t index) +{ + simple_mtx_lock(&table->mutex); + + void *map = (char *)table->map + (index * table->desc_size); + memset(map, 0, table->desc_size); + + /* Sanity check for double-free */ + assert(table->free_count < table->alloc); + for (uint32_t i = 0; i < table->free_count; i++) + assert(table->free_table[i] != index); + + table->free_table[table->free_count++] = index; + + simple_mtx_unlock(&table->mutex); +} diff --git a/src/asahi/vulkan/hk_descriptor_table.h b/src/asahi/vulkan/hk_descriptor_table.h new file mode 100644 index 00000000000..759bcf8a4b5 --- /dev/null +++ b/src/asahi/vulkan/hk_descriptor_table.h @@ -0,0 +1,49 @@ +/* + * Copyright 2024 Valve Corporation + * Copyright 2024 Alyssa Rosenzweig + * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc. + * SPDX-License-Identifier: MIT + */ + +#pragma once + +#include "hk_private.h" + +#include "asahi/lib/agx_bo.h" +#include "util/simple_mtx.h" + +struct hk_device; + +struct hk_descriptor_table { + simple_mtx_t mutex; + + uint32_t desc_size; /**< Size of a descriptor */ + uint32_t alloc; /**< Number of descriptors allocated */ + uint32_t max_alloc; /**< Maximum possible number of descriptors */ + uint32_t next_desc; /**< Next unallocated descriptor */ + uint32_t free_count; /**< Size of free_table */ + + struct agx_bo *bo; + void *map; + + /* Stack for free descriptor elements */ + uint32_t *free_table; +}; + +VkResult hk_descriptor_table_init(struct hk_device *dev, + struct hk_descriptor_table *table, + uint32_t descriptor_size, + uint32_t min_descriptor_count, + uint32_t max_descriptor_count); + +void hk_descriptor_table_finish(struct hk_device *dev, + struct hk_descriptor_table *table); + +VkResult hk_descriptor_table_add(struct hk_device *dev, + struct hk_descriptor_table *table, + const void *desc_data, size_t desc_size, + uint32_t *index_out); + +void hk_descriptor_table_remove(struct hk_device *dev, + struct hk_descriptor_table *table, + uint32_t index); diff --git a/src/asahi/vulkan/hk_device.c b/src/asahi/vulkan/hk_device.c new file mode 100644 index 00000000000..f5c4535aca2 --- /dev/null +++ b/src/asahi/vulkan/hk_device.c @@ -0,0 +1,548 @@ +/* + * Copyright 2024 Valve Corporation + * Copyright 2024 Alyssa Rosenzweig + * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc. 
+ * SPDX-License-Identifier: MIT + */ +#include "hk_device.h" + +#include "agx_bg_eot.h" +#include "agx_opcodes.h" +#include "agx_scratch.h" +#include "hk_cmd_buffer.h" +#include "hk_descriptor_table.h" +#include "hk_entrypoints.h" +#include "hk_instance.h" +#include "hk_physical_device.h" +#include "hk_shader.h" + +#include "asahi/genxml/agx_pack.h" +#include "asahi/lib/agx_bo.h" +#include "asahi/lib/agx_device.h" +#include "asahi/lib/shaders/geometry.h" +#include "util/hash_table.h" +#include "util/os_file.h" +#include "util/ralloc.h" +#include "util/simple_mtx.h" +#include "vulkan/vulkan_core.h" +#include "vulkan/wsi/wsi_common.h" +#include "vk_cmd_enqueue_entrypoints.h" +#include "vk_common_entrypoints.h" +#include "vk_pipeline_cache.h" + +#include +#include + +/* + * We preupload some constants so we can cheaply reference later without extra + * allocation and copying. + * + * TODO: This is small, don't waste a whole BO. + */ +static VkResult +hk_upload_rodata(struct hk_device *dev) +{ + dev->rodata.bo = + agx_bo_create(&dev->dev, AGX_SAMPLER_LENGTH, 0, "Read only data"); + + if (!dev->rodata.bo) + return VK_ERROR_OUT_OF_HOST_MEMORY; + + uint8_t *map = dev->rodata.bo->ptr.cpu; + uint32_t offs = 0; + + offs = align(offs, 8); + agx_pack(&dev->rodata.txf_sampler, USC_SAMPLER, cfg) { + cfg.start = 0; + cfg.count = 1; + cfg.buffer = dev->rodata.bo->ptr.gpu + offs; + } + + agx_pack(map + offs, SAMPLER, cfg) { + /* Allow mipmapping. This is respected by txf, weirdly. */ + cfg.mip_filter = AGX_MIP_FILTER_NEAREST; + + /* Out-of-bounds reads must return 0 */ + cfg.wrap_s = AGX_WRAP_CLAMP_TO_BORDER; + cfg.wrap_t = AGX_WRAP_CLAMP_TO_BORDER; + cfg.wrap_r = AGX_WRAP_CLAMP_TO_BORDER; + cfg.border_colour = AGX_BORDER_COLOUR_TRANSPARENT_BLACK; + } + offs += AGX_SAMPLER_LENGTH; + + /* The image heap is allocated on the device prior to the rodata. The heap + * lives as long as the device does and has a stable address (requiring + * sparse binding to grow dynamically). That means its address is effectively + * rodata and can be uploaded now. agx_usc_uniform requires an indirection to + * push the heap address, so this takes care of that indirection up front to + * cut an alloc/upload at draw time. + */ + offs = align(offs, sizeof(uint64_t)); + agx_pack(&dev->rodata.image_heap, USC_UNIFORM, cfg) { + cfg.start_halfs = HK_IMAGE_HEAP_UNIFORM; + cfg.size_halfs = 4; + cfg.buffer = dev->rodata.bo->ptr.gpu + offs; + } + + uint64_t *image_heap_ptr = dev->rodata.bo->ptr.cpu + offs; + *image_heap_ptr = dev->images.bo->ptr.gpu; + offs += sizeof(uint64_t); + + /* The geometry state buffer isn't strictly readonly data, but we only have a + * single instance of it device-wide and -- after initializing at heap + * allocate time -- it is read-only from the CPU perspective. The GPU uses it + * for scratch, but is required to reset it after use to ensure resubmitting + * the same command buffer works. + * + * So, we allocate it here for convenience. + */ + offs = align(offs, sizeof(uint64_t)); + dev->rodata.geometry_state = dev->rodata.bo->ptr.gpu + offs; + offs += sizeof(struct agx_geometry_state); + + /* For null readonly buffers, we need to allocate 16 bytes of zeroes for + * robustness2 semantics on read. + */ + offs = align(offs, 16); + dev->rodata.zero_sink = dev->rodata.bo->ptr.gpu + offs; + memset(dev->rodata.bo->ptr.cpu + offs, 0, 16); + offs += 16; + + /* For null storage descriptors, we need to reserve 16 bytes to catch writes. 
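+    * This is kept separate from zero_sink so that stray writes through null
+    * storage descriptors cannot clobber the zeroes that null read-only
+    * buffers rely on.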
+ * No particular content is required; we cannot get robustness2 semantics + * without more work. + */ + offs = align(offs, 16); + dev->rodata.null_sink = dev->rodata.bo->ptr.gpu + offs; + offs += 16; + + return VK_SUCCESS; +} + +static uint32_t +internal_key_hash(const void *key_) +{ + const struct hk_internal_key *key = key_; + + return _mesa_hash_data(key, sizeof(struct hk_internal_key) + key->key_size); +} + +static bool +internal_key_equal(const void *a_, const void *b_) +{ + const struct hk_internal_key *a = a_; + const struct hk_internal_key *b = b_; + + return a->builder == b->builder && a->key_size == b->key_size && + memcmp(a->key, b->key, a->key_size) == 0; +} + +static VkResult +hk_init_internal_shaders(struct hk_internal_shaders *s) +{ + s->ht = _mesa_hash_table_create(NULL, internal_key_hash, internal_key_equal); + if (!s->ht) + return VK_ERROR_OUT_OF_HOST_MEMORY; + + simple_mtx_init(&s->lock, mtx_plain); + return VK_SUCCESS; +} + +static void +hk_destroy_internal_shaders(struct hk_device *dev, + struct hk_internal_shaders *s, bool part) +{ + hash_table_foreach(s->ht, ent) { + if (part) { + struct agx_shader_part *part = ent->data; + free(part->binary); + + /* The agx_shader_part itself is ralloc'd against the hash table so + * will be freed. + */ + } else { + struct hk_api_shader *obj = ent->data; + hk_api_shader_destroy(&dev->vk, &obj->vk, NULL); + } + } + + _mesa_hash_table_destroy(s->ht, NULL); + simple_mtx_destroy(&s->lock); +} + +DERIVE_HASH_TABLE(agx_sampler_packed); + +static VkResult +hk_init_sampler_heap(struct hk_device *dev, struct hk_sampler_heap *h) +{ + h->ht = agx_sampler_packed_table_create(NULL); + if (!h->ht) + return VK_ERROR_OUT_OF_HOST_MEMORY; + + VkResult result = + hk_descriptor_table_init(dev, &h->table, AGX_SAMPLER_LENGTH, 1024, 1024); + + if (result != VK_SUCCESS) { + ralloc_free(h->ht); + return result; + } + + simple_mtx_init(&h->lock, mtx_plain); + return VK_SUCCESS; +} + +static void +hk_destroy_sampler_heap(struct hk_device *dev, struct hk_sampler_heap *h) +{ + hk_descriptor_table_finish(dev, &h->table); + ralloc_free(h->ht); + simple_mtx_destroy(&h->lock); +} + +static VkResult +hk_sampler_heap_add_locked(struct hk_device *dev, struct hk_sampler_heap *h, + struct agx_sampler_packed desc, + struct hk_rc_sampler **out) +{ + struct hash_entry *ent = _mesa_hash_table_search(h->ht, &desc); + if (ent != NULL) { + *out = ent->data; + + assert((*out)->refcount != 0); + (*out)->refcount++; + + return VK_SUCCESS; + } + + struct hk_rc_sampler *rc = ralloc(h->ht, struct hk_rc_sampler); + if (!rc) + return VK_ERROR_OUT_OF_HOST_MEMORY; + + uint32_t index; + VkResult result = + hk_descriptor_table_add(dev, &h->table, &desc, sizeof(desc), &index); + if (result != VK_SUCCESS) { + ralloc_free(rc); + return result; + } + + *rc = (struct hk_rc_sampler){ + .key = desc, + .refcount = 1, + .index = index, + }; + + _mesa_hash_table_insert(h->ht, &rc->key, rc); + *out = rc; + + return VK_SUCCESS; +} + +VkResult +hk_sampler_heap_add(struct hk_device *dev, struct agx_sampler_packed desc, + struct hk_rc_sampler **out) +{ + struct hk_sampler_heap *h = &dev->samplers; + + simple_mtx_lock(&h->lock); + VkResult result = hk_sampler_heap_add_locked(dev, h, desc, out); + simple_mtx_unlock(&h->lock); + + return result; +} + +static void +hk_sampler_heap_remove_locked(struct hk_device *dev, struct hk_sampler_heap *h, + struct hk_rc_sampler *rc) +{ + assert(rc->refcount != 0); + rc->refcount--; + + if (rc->refcount == 0) { + hk_descriptor_table_remove(dev, &h->table, rc->index); + 
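+      /* The hash table entry is keyed on rc->key, which lives inside rc
+       * itself, so drop the entry before freeing rc below.
+       */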
_mesa_hash_table_remove_key(h->ht, &rc->key); + ralloc_free(rc); + } +} + +void +hk_sampler_heap_remove(struct hk_device *dev, struct hk_rc_sampler *rc) +{ + struct hk_sampler_heap *h = &dev->samplers; + + simple_mtx_lock(&h->lock); + hk_sampler_heap_remove_locked(dev, h, rc); + simple_mtx_unlock(&h->lock); +} + +/* + * To implement nullDescriptor, the descriptor set code will reference + * preuploaded null descriptors at fixed offsets in the image heap. Here we + * upload those descriptors, initializing the image heap. + */ +static void +hk_upload_null_descriptors(struct hk_device *dev) +{ + struct agx_texture_packed null_tex; + struct agx_pbe_packed null_pbe; + uint32_t offset_tex, offset_pbe; + + agx_set_null_texture(&null_tex, dev->rodata.null_sink); + agx_set_null_pbe(&null_pbe, dev->rodata.null_sink); + + hk_descriptor_table_add(dev, &dev->images, &null_tex, sizeof(null_tex), + &offset_tex); + + hk_descriptor_table_add(dev, &dev->images, &null_pbe, sizeof(null_pbe), + &offset_pbe); + + assert((offset_tex * HK_IMAGE_STRIDE) == HK_NULL_TEX_OFFSET && "static"); + assert((offset_pbe * HK_IMAGE_STRIDE) == HK_NULL_PBE_OFFSET && "static"); +} + +VKAPI_ATTR VkResult VKAPI_CALL +hk_CreateDevice(VkPhysicalDevice physicalDevice, + const VkDeviceCreateInfo *pCreateInfo, + const VkAllocationCallbacks *pAllocator, VkDevice *pDevice) +{ + VK_FROM_HANDLE(hk_physical_device, pdev, physicalDevice); + VkResult result = VK_ERROR_OUT_OF_HOST_MEMORY; + struct hk_device *dev; + + dev = vk_zalloc2(&pdev->vk.instance->alloc, pAllocator, sizeof(*dev), 8, + VK_SYSTEM_ALLOCATION_SCOPE_DEVICE); + if (!dev) + return vk_error(pdev, VK_ERROR_OUT_OF_HOST_MEMORY); + + struct vk_device_dispatch_table dispatch_table; + + /* For secondary command buffer support, overwrite any command entrypoints + * in the main device-level dispatch table with + * vk_cmd_enqueue_unless_primary_Cmd*. 
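+    * Secondary command buffers are then recorded as a list of vk_cmd_queue
+    * commands and replayed through dev->cmd_dispatch when they are executed
+    * on a primary command buffer.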
+ */ + vk_device_dispatch_table_from_entrypoints( + &dispatch_table, &vk_cmd_enqueue_unless_primary_device_entrypoints, true); + + vk_device_dispatch_table_from_entrypoints(&dispatch_table, + &hk_device_entrypoints, false); + vk_device_dispatch_table_from_entrypoints(&dispatch_table, + &wsi_device_entrypoints, false); + + /* Populate primary cmd_dispatch table */ + vk_device_dispatch_table_from_entrypoints(&dev->cmd_dispatch, + &hk_device_entrypoints, true); + vk_device_dispatch_table_from_entrypoints(&dev->cmd_dispatch, + &wsi_device_entrypoints, false); + vk_device_dispatch_table_from_entrypoints( + &dev->cmd_dispatch, &vk_common_device_entrypoints, false); + + result = vk_device_init(&dev->vk, &pdev->vk, &dispatch_table, pCreateInfo, + pAllocator); + if (result != VK_SUCCESS) + goto fail_alloc; + + dev->vk.shader_ops = &hk_device_shader_ops; + dev->vk.command_dispatch_table = &dev->cmd_dispatch; + + drmDevicePtr drm_device = NULL; + int ret = drmGetDeviceFromDevId(pdev->render_dev, 0, &drm_device); + if (ret != 0) { + result = vk_errorf(dev, VK_ERROR_INITIALIZATION_FAILED, + "Failed to get DRM device: %m"); + goto fail_init; + } + + const char *path = drm_device->nodes[DRM_NODE_RENDER]; + dev->dev.fd = open(path, O_RDWR | O_CLOEXEC); + if (dev->dev.fd < 0) { + drmFreeDevice(&drm_device); + result = vk_errorf(dev, VK_ERROR_INITIALIZATION_FAILED, + "failed to open device %s", path); + goto fail_init; + } + + bool succ = agx_open_device(NULL, &dev->dev); + drmFreeDevice(&drm_device); + if (!succ) { + result = vk_errorf(dev, VK_ERROR_INITIALIZATION_FAILED, + "Failed to get DRM device: %m"); + goto fail_fd; + } + + vk_device_set_drm_fd(&dev->vk, dev->dev.fd); + dev->vk.command_buffer_ops = &hk_cmd_buffer_ops; + + result = hk_descriptor_table_init(dev, &dev->images, AGX_TEXTURE_LENGTH, + 1024, 1024 * 1024); + if (result != VK_SUCCESS) + goto fail_dev; + + result = hk_init_sampler_heap(dev, &dev->samplers); + if (result != VK_SUCCESS) + goto fail_images; + + result = hk_descriptor_table_init( + dev, &dev->occlusion_queries, sizeof(uint64_t), AGX_MAX_OCCLUSION_QUERIES, + AGX_MAX_OCCLUSION_QUERIES); + if (result != VK_SUCCESS) + goto fail_samplers; + + result = hk_upload_rodata(dev); + if (result != VK_SUCCESS) + goto fail_queries; + + /* Depends on rodata */ + hk_upload_null_descriptors(dev); + + /* XXX: error handling, and should this even go on the device? 
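+    * (bg_eot caches the background and end-of-tile programs that load and
+    * store the tilebuffer at the start and end of each tile pass.)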
*/ + agx_bg_eot_init(&dev->bg_eot, &dev->dev); + if (!dev->bg_eot.ht) { + result = VK_ERROR_OUT_OF_HOST_MEMORY; + goto fail_rodata; + } + + result = hk_init_internal_shaders(&dev->prolog_epilog); + if (result != VK_SUCCESS) + goto fail_bg_eot; + + result = hk_init_internal_shaders(&dev->kernels); + if (result != VK_SUCCESS) + goto fail_internal_shaders; + + result = + hk_queue_init(dev, &dev->queue, &pCreateInfo->pQueueCreateInfos[0], 0); + if (result != VK_SUCCESS) + goto fail_internal_shaders_2; + + struct vk_pipeline_cache_create_info cache_info = { + .weak_ref = true, + }; + dev->mem_cache = vk_pipeline_cache_create(&dev->vk, &cache_info, NULL); + if (dev->mem_cache == NULL) { + result = VK_ERROR_OUT_OF_HOST_MEMORY; + goto fail_queue; + } + + result = hk_device_init_meta(dev); + if (result != VK_SUCCESS) + goto fail_mem_cache; + + *pDevice = hk_device_to_handle(dev); + + agx_scratch_init(&dev->dev, &dev->scratch.vs); + agx_scratch_init(&dev->dev, &dev->scratch.fs); + agx_scratch_init(&dev->dev, &dev->scratch.cs); + + return VK_SUCCESS; + +fail_mem_cache: + vk_pipeline_cache_destroy(dev->mem_cache, NULL); +fail_queue: + hk_queue_finish(dev, &dev->queue); +fail_rodata: + agx_bo_unreference(dev->rodata.bo); +fail_bg_eot: + agx_bg_eot_cleanup(&dev->bg_eot); +fail_internal_shaders_2: + hk_destroy_internal_shaders(dev, &dev->kernels, false); +fail_internal_shaders: + hk_destroy_internal_shaders(dev, &dev->prolog_epilog, true); +fail_queries: + hk_descriptor_table_finish(dev, &dev->occlusion_queries); +fail_samplers: + hk_destroy_sampler_heap(dev, &dev->samplers); +fail_images: + hk_descriptor_table_finish(dev, &dev->images); +fail_dev: + agx_close_device(&dev->dev); +fail_fd: + close(dev->dev.fd); +fail_init: + vk_device_finish(&dev->vk); +fail_alloc: + vk_free(&dev->vk.alloc, dev); + return result; +} + +VKAPI_ATTR void VKAPI_CALL +hk_DestroyDevice(VkDevice _device, const VkAllocationCallbacks *pAllocator) +{ + VK_FROM_HANDLE(hk_device, dev, _device); + + if (!dev) + return; + + hk_device_finish_meta(dev); + hk_destroy_internal_shaders(dev, &dev->kernels, false); + hk_destroy_internal_shaders(dev, &dev->prolog_epilog, true); + + vk_pipeline_cache_destroy(dev->mem_cache, NULL); + hk_queue_finish(dev, &dev->queue); + vk_device_finish(&dev->vk); + + agx_scratch_fini(&dev->scratch.vs); + agx_scratch_fini(&dev->scratch.fs); + agx_scratch_fini(&dev->scratch.cs); + + hk_destroy_sampler_heap(dev, &dev->samplers); + hk_descriptor_table_finish(dev, &dev->images); + hk_descriptor_table_finish(dev, &dev->occlusion_queries); + agx_bo_unreference(dev->rodata.bo); + agx_bo_unreference(dev->heap); + agx_bg_eot_cleanup(&dev->bg_eot); + agx_close_device(&dev->dev); + vk_free(&dev->vk.alloc, dev); +} + +VKAPI_ATTR VkResult VKAPI_CALL +hk_GetCalibratedTimestampsKHR( + VkDevice _device, uint32_t timestampCount, + const VkCalibratedTimestampInfoKHR *pTimestampInfos, uint64_t *pTimestamps, + uint64_t *pMaxDeviation) +{ + // VK_FROM_HANDLE(hk_device, dev, _device); + // struct hk_physical_device *pdev = hk_device_physical(dev); + uint64_t max_clock_period = 0; + uint64_t begin, end; + int d; + +#ifdef CLOCK_MONOTONIC_RAW + begin = vk_clock_gettime(CLOCK_MONOTONIC_RAW); +#else + begin = vk_clock_gettime(CLOCK_MONOTONIC); +#endif + + for (d = 0; d < timestampCount; d++) { + switch (pTimestampInfos[d].timeDomain) { + case VK_TIME_DOMAIN_DEVICE_KHR: + unreachable("todo"); + // pTimestamps[d] = agx_get_gpu_timestamp(&pdev->dev); + max_clock_period = MAX2( + max_clock_period, 1); /* FIXME: Is timestamp period actually 
1? */ + break; + case VK_TIME_DOMAIN_CLOCK_MONOTONIC_KHR: + pTimestamps[d] = vk_clock_gettime(CLOCK_MONOTONIC); + max_clock_period = MAX2(max_clock_period, 1); + break; + +#ifdef CLOCK_MONOTONIC_RAW + case VK_TIME_DOMAIN_CLOCK_MONOTONIC_RAW_KHR: + pTimestamps[d] = begin; + break; +#endif + default: + pTimestamps[d] = 0; + break; + } + } + +#ifdef CLOCK_MONOTONIC_RAW + end = vk_clock_gettime(CLOCK_MONOTONIC_RAW); +#else + end = vk_clock_gettime(CLOCK_MONOTONIC); +#endif + + *pMaxDeviation = vk_time_max_deviation(begin, end, max_clock_period); + + return VK_SUCCESS; +} diff --git a/src/asahi/vulkan/hk_device.h b/src/asahi/vulkan/hk_device.h new file mode 100644 index 00000000000..b6c57315390 --- /dev/null +++ b/src/asahi/vulkan/hk_device.h @@ -0,0 +1,123 @@ +/* + * Copyright 2024 Valve Corporation + * Copyright 2024 Alyssa Rosenzweig + * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc. + * SPDX-License-Identifier: MIT + */ + +#pragma once + +#include "asahi/lib/agx_device.h" +#include "agx_bg_eot.h" +#include "agx_pack.h" +#include "agx_scratch.h" +#include "decode.h" +#include "vk_cmd_queue.h" +#include "vk_dispatch_table.h" + +#include "hk_private.h" + +#include "hk_descriptor_table.h" +#include "hk_queue.h" +#include "vk_device.h" +#include "vk_meta.h" +#include "vk_queue.h" + +struct hk_physical_device; +struct vk_pipeline_cache; + +/* Fixed offsets for reserved null image descriptors */ +#define HK_NULL_TEX_OFFSET (0) +#define HK_NULL_PBE_OFFSET (24) + +typedef void (*hk_internal_builder_t)(struct nir_builder *b, const void *key); + +struct hk_internal_key { + hk_internal_builder_t builder; + size_t key_size; + uint8_t key[]; +}; + +struct hk_internal_shaders { + simple_mtx_t lock; + struct hash_table *ht; +}; + +struct hk_rc_sampler { + struct agx_sampler_packed key; + + /* Reference count for this hardware sampler, protected by the heap mutex */ + uint16_t refcount; + + /* Index of this hardware sampler in the hardware sampler heap */ + uint16_t index; +}; + +struct hk_sampler_heap { + simple_mtx_t lock; + + struct hk_descriptor_table table; + + /* Map of agx_sampler_packed to hk_rc_sampler */ + struct hash_table *ht; +}; + +struct hk_device { + struct vk_device vk; + struct agx_device dev; + struct agxdecode_ctx *decode_ctx; + + struct hk_descriptor_table images; + struct hk_descriptor_table occlusion_queries; + struct hk_sampler_heap samplers; + + struct hk_queue queue; + + struct vk_pipeline_cache *mem_cache; + + struct vk_meta_device meta; + struct agx_bg_eot_cache bg_eot; + + struct { + struct agx_bo *bo; + struct agx_usc_sampler_packed txf_sampler; + struct agx_usc_uniform_packed image_heap; + uint64_t null_sink, zero_sink; + uint64_t geometry_state; + } rodata; + + struct hk_internal_shaders prolog_epilog; + struct hk_internal_shaders kernels; + struct hk_api_shader *write_shader; + + /* Indirected for common secondary emulation */ + struct vk_device_dispatch_table cmd_dispatch; + + /* Heap used for GPU-side memory allocation for geometry/tessellation. + * + * Control streams accessing the heap must be serialized. This is not + * expected to be a legitimate problem. If it is, we can rework later. 
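+    * Allocated on demand rather than at device creation time.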
+ */ + struct agx_bo *heap; + + struct { + struct agx_scratch vs, fs, cs; + } scratch; +}; + +VK_DEFINE_HANDLE_CASTS(hk_device, vk.base, VkDevice, VK_OBJECT_TYPE_DEVICE) + +static inline struct hk_physical_device * +hk_device_physical(struct hk_device *dev) +{ + return (struct hk_physical_device *)dev->vk.physical; +} + +VkResult hk_device_init_meta(struct hk_device *dev); +void hk_device_finish_meta(struct hk_device *dev); + +VkResult hk_sampler_heap_add(struct hk_device *dev, + struct agx_sampler_packed desc, + struct hk_rc_sampler **out); + +void hk_sampler_heap_remove(struct hk_device *dev, struct hk_rc_sampler *rc); diff --git a/src/asahi/vulkan/hk_device_memory.c b/src/asahi/vulkan/hk_device_memory.c new file mode 100644 index 00000000000..0d10a55f5df --- /dev/null +++ b/src/asahi/vulkan/hk_device_memory.c @@ -0,0 +1,330 @@ +/* + * Copyright 2024 Valve Corporation + * Copyright 2024 Alyssa Rosenzweig + * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc. + * SPDX-License-Identifier: MIT + */ +#include "hk_device_memory.h" + +#include "hk_device.h" +#include "hk_entrypoints.h" +#include "hk_image.h" +#include "hk_physical_device.h" + +#include "asahi/lib/agx_bo.h" +#include "util/u_atomic.h" + +#include +#include + +/* Supports opaque fd only */ +const VkExternalMemoryProperties hk_opaque_fd_mem_props = { + .externalMemoryFeatures = VK_EXTERNAL_MEMORY_FEATURE_EXPORTABLE_BIT | + VK_EXTERNAL_MEMORY_FEATURE_IMPORTABLE_BIT, + .exportFromImportedHandleTypes = + VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT, + .compatibleHandleTypes = VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT, +}; + +/* Supports opaque fd and dma_buf. */ +const VkExternalMemoryProperties hk_dma_buf_mem_props = { + .externalMemoryFeatures = VK_EXTERNAL_MEMORY_FEATURE_EXPORTABLE_BIT | + VK_EXTERNAL_MEMORY_FEATURE_IMPORTABLE_BIT, + .exportFromImportedHandleTypes = + VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT | + VK_EXTERNAL_MEMORY_HANDLE_TYPE_DMA_BUF_BIT_EXT, + .compatibleHandleTypes = VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT | + VK_EXTERNAL_MEMORY_HANDLE_TYPE_DMA_BUF_BIT_EXT, +}; + +static enum agx_bo_flags +hk_memory_type_flags(const VkMemoryType *type, + VkExternalMemoryHandleTypeFlagBits handle_types) +{ + unsigned flags = 0; + + if (handle_types) + flags |= AGX_BO_SHARED | AGX_BO_SHAREABLE; + + return flags; +} + +VKAPI_ATTR VkResult VKAPI_CALL +hk_GetMemoryFdPropertiesKHR(VkDevice device, + VkExternalMemoryHandleTypeFlagBits handleType, + int fd, + VkMemoryFdPropertiesKHR *pMemoryFdProperties) +{ + VK_FROM_HANDLE(hk_device, dev, device); + struct hk_physical_device *pdev = hk_device_physical(dev); + struct agx_bo *bo; + + switch (handleType) { + case VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT: + case VK_EXTERNAL_MEMORY_HANDLE_TYPE_DMA_BUF_BIT_EXT: + bo = agx_bo_import(&dev->dev, fd); + if (bo == NULL) + return vk_error(dev, VK_ERROR_INVALID_EXTERNAL_HANDLE); + break; + default: + return vk_error(dev, VK_ERROR_INVALID_EXTERNAL_HANDLE); + } + + uint32_t type_bits = 0; + for (unsigned t = 0; t < ARRAY_SIZE(pdev->mem_types); t++) { + const unsigned flags = + hk_memory_type_flags(&pdev->mem_types[t], handleType); + if (!(flags & ~bo->flags)) + type_bits |= (1 << t); + } + + pMemoryFdProperties->memoryTypeBits = type_bits; + + agx_bo_unreference(bo); + + return VK_SUCCESS; +} + +VKAPI_ATTR VkResult VKAPI_CALL +hk_AllocateMemory(VkDevice device, const VkMemoryAllocateInfo *pAllocateInfo, + const VkAllocationCallbacks *pAllocator, VkDeviceMemory *pMem) +{ + VK_FROM_HANDLE(hk_device, dev, device); + struct 
hk_physical_device *pdev = hk_device_physical(dev); + struct hk_device_memory *mem; + VkResult result = VK_SUCCESS; + + const VkImportMemoryFdInfoKHR *fd_info = + vk_find_struct_const(pAllocateInfo->pNext, IMPORT_MEMORY_FD_INFO_KHR); + const VkExportMemoryAllocateInfo *export_info = + vk_find_struct_const(pAllocateInfo->pNext, EXPORT_MEMORY_ALLOCATE_INFO); + const VkMemoryType *type = &pdev->mem_types[pAllocateInfo->memoryTypeIndex]; + + VkExternalMemoryHandleTypeFlagBits handle_types = 0; + if (export_info != NULL) + handle_types |= export_info->handleTypes; + if (fd_info != NULL) + handle_types |= fd_info->handleType; + + const unsigned flags = hk_memory_type_flags(type, handle_types); + + uint32_t alignment = 16384; /* Apple page size */ + + struct hk_memory_heap *heap = &pdev->mem_heaps[type->heapIndex]; + if (p_atomic_read(&heap->used) > heap->size) + return vk_error(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY); + + const uint64_t aligned_size = + align64(pAllocateInfo->allocationSize, alignment); + + mem = vk_device_memory_create(&dev->vk, pAllocateInfo, pAllocator, + sizeof(*mem)); + if (!mem) + return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY); + + mem->map = NULL; + if (fd_info && fd_info->handleType) { + assert( + fd_info->handleType == VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT || + fd_info->handleType == VK_EXTERNAL_MEMORY_HANDLE_TYPE_DMA_BUF_BIT_EXT); + + mem->bo = agx_bo_import(&dev->dev, fd_info->fd); + if (mem->bo == NULL) { + result = vk_error(dev, VK_ERROR_INVALID_EXTERNAL_HANDLE); + goto fail_alloc; + } + assert(!(flags & ~mem->bo->flags)); + } else { + enum agx_bo_flags flags = 0; + if (handle_types) + flags |= AGX_BO_SHAREABLE; + + mem->bo = agx_bo_create(&dev->dev, aligned_size, flags, "App memory"); + if (!mem->bo) { + result = vk_error(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY); + goto fail_alloc; + } + } + + if (fd_info && fd_info->handleType) { + /* From the Vulkan spec: + * + * "Importing memory from a file descriptor transfers ownership of + * the file descriptor from the application to the Vulkan + * implementation. The application must not perform any operations on + * the file descriptor after a successful import." + * + * If the import fails, we leave the file descriptor open. 
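+       * On success, the import takes its own reference to the underlying
+       * memory, so the fd itself is no longer needed and is closed here.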
+ */ + close(fd_info->fd); + } + + uint64_t heap_used = p_atomic_add_return(&heap->used, mem->bo->size); + if (heap_used > heap->size) { + hk_FreeMemory(device, hk_device_memory_to_handle(mem), pAllocator); + return vk_errorf(device, VK_ERROR_OUT_OF_DEVICE_MEMORY, + "Out of heap memory"); + } + + *pMem = hk_device_memory_to_handle(mem); + + return VK_SUCCESS; + +fail_alloc: + vk_device_memory_destroy(&dev->vk, pAllocator, &mem->vk); + return result; +} + +VKAPI_ATTR void VKAPI_CALL +hk_FreeMemory(VkDevice device, VkDeviceMemory _mem, + const VkAllocationCallbacks *pAllocator) +{ + VK_FROM_HANDLE(hk_device, dev, device); + VK_FROM_HANDLE(hk_device_memory, mem, _mem); + struct hk_physical_device *pdev = hk_device_physical(dev); + + if (!mem) + return; + + const VkMemoryType *type = &pdev->mem_types[mem->vk.memory_type_index]; + struct hk_memory_heap *heap = &pdev->mem_heaps[type->heapIndex]; + p_atomic_add(&heap->used, -((int64_t)mem->bo->size)); + + agx_bo_unreference(mem->bo); + + vk_device_memory_destroy(&dev->vk, pAllocator, &mem->vk); +} + +VKAPI_ATTR VkResult VKAPI_CALL +hk_MapMemory2KHR(VkDevice device, const VkMemoryMapInfoKHR *pMemoryMapInfo, + void **ppData) +{ + VK_FROM_HANDLE(hk_device, dev, device); + VK_FROM_HANDLE(hk_device_memory, mem, pMemoryMapInfo->memory); + + if (mem == NULL) { + *ppData = NULL; + return VK_SUCCESS; + } + + const VkDeviceSize offset = pMemoryMapInfo->offset; + const VkDeviceSize size = vk_device_memory_range( + &mem->vk, pMemoryMapInfo->offset, pMemoryMapInfo->size); + + UNUSED void *fixed_addr = NULL; + if (pMemoryMapInfo->flags & VK_MEMORY_MAP_PLACED_BIT_EXT) { + const VkMemoryMapPlacedInfoEXT *placed_info = vk_find_struct_const( + pMemoryMapInfo->pNext, MEMORY_MAP_PLACED_INFO_EXT); + fixed_addr = placed_info->pPlacedAddress; + } + + /* From the Vulkan spec version 1.0.32 docs for MapMemory: + * + * * If size is not equal to VK_WHOLE_SIZE, size must be greater than 0 + * assert(size != 0); + * * If size is not equal to VK_WHOLE_SIZE, size must be less than or + * equal to the size of the memory minus offset + */ + assert(size > 0); + assert(offset + size <= mem->bo->size); + + if (size != (size_t)size) { + return vk_errorf(dev, VK_ERROR_MEMORY_MAP_FAILED, + "requested size 0x%" PRIx64 " does not fit in %u bits", + size, (unsigned)(sizeof(size_t) * 8)); + } + + /* From the Vulkan 1.2.194 spec: + * + * "memory must not be currently host mapped" + */ + if (mem->map != NULL) { + return vk_errorf(dev, VK_ERROR_MEMORY_MAP_FAILED, + "Memory object already mapped."); + } + + mem->map = mem->bo->ptr.cpu; + *ppData = mem->map + offset; + + return VK_SUCCESS; +} + +VKAPI_ATTR VkResult VKAPI_CALL +hk_UnmapMemory2KHR(VkDevice device, + const VkMemoryUnmapInfoKHR *pMemoryUnmapInfo) +{ + VK_FROM_HANDLE(hk_device_memory, mem, pMemoryUnmapInfo->memory); + + if (mem == NULL) + return VK_SUCCESS; + + if (pMemoryUnmapInfo->flags & VK_MEMORY_UNMAP_RESERVE_BIT_EXT) { + unreachable("todo"); +#if 0 + VK_FROM_HANDLE(hk_device, dev, device); + + int err = agx_bo_overmap(mem->bo, mem->map); + if (err) { + return vk_errorf(dev, VK_ERROR_MEMORY_MAP_FAILED, + "Failed to map over original mapping"); + } +#endif + } else { + /* TODO */ + //// agx_bo_unmap(mem->bo, mem->map); + } + + mem->map = NULL; + + return VK_SUCCESS; +} + +VKAPI_ATTR VkResult VKAPI_CALL +hk_FlushMappedMemoryRanges(VkDevice device, uint32_t memoryRangeCount, + const VkMappedMemoryRange *pMemoryRanges) +{ + return VK_SUCCESS; +} + +VKAPI_ATTR VkResult VKAPI_CALL +hk_InvalidateMappedMemoryRanges(VkDevice device, 
uint32_t memoryRangeCount, + const VkMappedMemoryRange *pMemoryRanges) +{ + return VK_SUCCESS; +} + +VKAPI_ATTR void VKAPI_CALL +hk_GetDeviceMemoryCommitment(VkDevice device, VkDeviceMemory _mem, + VkDeviceSize *pCommittedMemoryInBytes) +{ + VK_FROM_HANDLE(hk_device_memory, mem, _mem); + + *pCommittedMemoryInBytes = mem->bo->size; +} + +VKAPI_ATTR VkResult VKAPI_CALL +hk_GetMemoryFdKHR(VkDevice device, const VkMemoryGetFdInfoKHR *pGetFdInfo, + int *pFD) +{ + VK_FROM_HANDLE(hk_device, dev, device); + VK_FROM_HANDLE(hk_device_memory, memory, pGetFdInfo->memory); + + switch (pGetFdInfo->handleType) { + case VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT: + case VK_EXTERNAL_MEMORY_HANDLE_TYPE_DMA_BUF_BIT_EXT: + *pFD = agx_bo_export(memory->bo); + return VK_SUCCESS; + default: + assert(!"unsupported handle type"); + return vk_error(dev, VK_ERROR_FEATURE_NOT_PRESENT); + } +} + +VKAPI_ATTR uint64_t VKAPI_CALL +hk_GetDeviceMemoryOpaqueCaptureAddress( + UNUSED VkDevice device, const VkDeviceMemoryOpaqueCaptureAddressInfo *pInfo) +{ + VK_FROM_HANDLE(hk_device_memory, mem, pInfo->memory); + + return mem->bo->ptr.gpu; +} diff --git a/src/asahi/vulkan/hk_device_memory.h b/src/asahi/vulkan/hk_device_memory.h new file mode 100644 index 00000000000..29d3651972a --- /dev/null +++ b/src/asahi/vulkan/hk_device_memory.h @@ -0,0 +1,31 @@ +/* + * Copyright 2024 Valve Corporation + * Copyright 2024 Alyssa Rosenzweig + * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc. + * SPDX-License-Identifier: MIT + */ + +#pragma once + +#include "hk_private.h" + +#include "vk_device_memory.h" + +#include "util/list.h" + +struct hk_device; +struct hk_image_plane; + +struct hk_device_memory { + struct vk_device_memory vk; + + struct agx_bo *bo; + + void *map; +}; + +VK_DEFINE_NONDISP_HANDLE_CASTS(hk_device_memory, vk.base, VkDeviceMemory, + VK_OBJECT_TYPE_DEVICE_MEMORY) + +extern const VkExternalMemoryProperties hk_opaque_fd_mem_props; +extern const VkExternalMemoryProperties hk_dma_buf_mem_props; diff --git a/src/asahi/vulkan/hk_event.c b/src/asahi/vulkan/hk_event.c new file mode 100644 index 00000000000..aadbb272e76 --- /dev/null +++ b/src/asahi/vulkan/hk_event.c @@ -0,0 +1,113 @@ +/* + * Copyright 2024 Valve Corporation + * Copyright 2024 Alyssa Rosenzweig + * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc. + * SPDX-License-Identifier: MIT + */ +#include "hk_event.h" +#include "vulkan/vulkan_core.h" + +#include "agx_bo.h" +#include "hk_cmd_buffer.h" +#include "hk_device.h" +#include "hk_entrypoints.h" + +#define HK_EVENT_MEM_SIZE sizeof(VkResult) + +VKAPI_ATTR VkResult VKAPI_CALL +hk_CreateEvent(VkDevice device, const VkEventCreateInfo *pCreateInfo, + const VkAllocationCallbacks *pAllocator, VkEvent *pEvent) +{ + VK_FROM_HANDLE(hk_device, dev, device); + struct hk_event *event; + + event = vk_object_zalloc(&dev->vk, pAllocator, sizeof(*event), + VK_OBJECT_TYPE_EVENT); + if (!event) + return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY); + + /* TODO: this is really wasteful, bring back the NVK heap! 
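+    * Each VkEvent currently burns an entire BO for sizeof(VkResult) bytes
+    * of state.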
+ * + * XXX + */ + event->bo = + agx_bo_create(&dev->dev, HK_EVENT_MEM_SIZE, AGX_BO_WRITEBACK, "Event"); + event->status = event->bo->ptr.cpu; + event->addr = event->bo->ptr.gpu; + + *event->status = VK_EVENT_RESET; + + *pEvent = hk_event_to_handle(event); + + return VK_SUCCESS; +} + +VKAPI_ATTR void VKAPI_CALL +hk_DestroyEvent(VkDevice device, VkEvent _event, + const VkAllocationCallbacks *pAllocator) +{ + VK_FROM_HANDLE(hk_device, dev, device); + VK_FROM_HANDLE(hk_event, event, _event); + + if (!event) + return; + + agx_bo_unreference(event->bo); + vk_object_free(&dev->vk, pAllocator, event); +} + +VKAPI_ATTR VkResult VKAPI_CALL +hk_GetEventStatus(VkDevice device, VkEvent _event) +{ + VK_FROM_HANDLE(hk_event, event, _event); + + return *event->status; +} + +VKAPI_ATTR VkResult VKAPI_CALL +hk_SetEvent(VkDevice device, VkEvent _event) +{ + VK_FROM_HANDLE(hk_event, event, _event); + + *event->status = VK_EVENT_SET; + + return VK_SUCCESS; +} + +VKAPI_ATTR VkResult VKAPI_CALL +hk_ResetEvent(VkDevice device, VkEvent _event) +{ + VK_FROM_HANDLE(hk_event, event, _event); + + *event->status = VK_EVENT_RESET; + + return VK_SUCCESS; +} + +VKAPI_ATTR void VKAPI_CALL +hk_CmdSetEvent2(VkCommandBuffer commandBuffer, VkEvent _event, + const VkDependencyInfo *pDependencyInfo) +{ + VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer); + VK_FROM_HANDLE(hk_event, event, _event); + + hk_queue_write(cmd, event->bo->ptr.gpu, VK_EVENT_SET, false); +} + +VKAPI_ATTR void VKAPI_CALL +hk_CmdResetEvent2(VkCommandBuffer commandBuffer, VkEvent _event, + VkPipelineStageFlags2 stageMask) +{ + VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer); + VK_FROM_HANDLE(hk_event, event, _event); + + hk_queue_write(cmd, event->bo->ptr.gpu, VK_EVENT_RESET, false); +} + +VKAPI_ATTR void VKAPI_CALL +hk_CmdWaitEvents2(VkCommandBuffer commandBuffer, uint32_t eventCount, + const VkEvent *pEvents, + const VkDependencyInfo *pDependencyInfos) +{ + /* Currently we barrier everything, so this is a no-op. */ +} diff --git a/src/asahi/vulkan/hk_event.h b/src/asahi/vulkan/hk_event.h new file mode 100644 index 00000000000..c675ceada8a --- /dev/null +++ b/src/asahi/vulkan/hk_event.h @@ -0,0 +1,22 @@ +/* + * Copyright 2024 Valve Corporation + * Copyright 2024 Alyssa Rosenzweig + * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc. + * SPDX-License-Identifier: MIT + */ + +#pragma once + +#include "hk_private.h" + +#include "vk_object.h" + +struct hk_event { + struct vk_object_base base; + struct agx_bo *bo; + + uint64_t addr; + VkResult *status; +}; + +VK_DEFINE_NONDISP_HANDLE_CASTS(hk_event, base, VkEvent, VK_OBJECT_TYPE_EVENT) diff --git a/src/asahi/vulkan/hk_format.c b/src/asahi/vulkan/hk_format.c new file mode 100644 index 00000000000..b0fa8ae5c99 --- /dev/null +++ b/src/asahi/vulkan/hk_format.c @@ -0,0 +1,140 @@ +/* + * Copyright 2024 Valve Corporation + * Copyright 2024 Alyssa Rosenzweig + * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc. + * Copyright 2024 Valve Corporation + * Copyright 2024 Alyssa Rosenzweig + * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc. 
+ * SPDX-License-Identifier: MIT + */ +#include "drm-uapi/drm_fourcc.h" + +#include "hk_buffer_view.h" +#include "hk_entrypoints.h" +#include "hk_image.h" +#include "hk_physical_device.h" + +#include "vk_enum_defines.h" +#include "vk_format.h" + +uint64_t agx_best_modifiers[] = { + // DRM_FORMAT_MOD_APPLE_TWIDDLED_COMPRESSED, + DRM_FORMAT_MOD_APPLE_TWIDDLED, + DRM_FORMAT_MOD_LINEAR, +}; + +static VkFormatFeatureFlags2 +hk_modifier_features(uint64_t mod, VkFormat vk_format, + const VkFormatProperties *props) +{ + if (mod == DRM_FORMAT_MOD_LINEAR) + return props->linearTilingFeatures; + + if (mod == DRM_FORMAT_MOD_APPLE_TWIDDLED_COMPRESSED /* TODO */) + return 0; + + return props->optimalTilingFeatures; +} + +static void +get_drm_format_modifier_properties_list( + const struct hk_physical_device *physical_device, VkFormat vk_format, + VkDrmFormatModifierPropertiesListEXT *list, const VkFormatProperties *props) +{ + VK_OUTARRAY_MAKE_TYPED(VkDrmFormatModifierPropertiesEXT, out, + list->pDrmFormatModifierProperties, + &list->drmFormatModifierCount); + + for (unsigned i = 0; i < ARRAY_SIZE(agx_best_modifiers); ++i) { + uint64_t mod = agx_best_modifiers[i]; + VkFormatFeatureFlags2 flags = hk_modifier_features(mod, vk_format, props); + + if (!flags) + continue; + + vk_outarray_append_typed(VkDrmFormatModifierPropertiesEXT, &out, + out_props) + { + *out_props = (VkDrmFormatModifierPropertiesEXT){ + .drmFormatModifier = mod, + .drmFormatModifierPlaneCount = 1 /* no planar mods */, + .drmFormatModifierTilingFeatures = flags, + }; + }; + } +} + +static void +get_drm_format_modifier_properties_list_2( + const struct hk_physical_device *physical_device, VkFormat vk_format, + VkDrmFormatModifierPropertiesList2EXT *list, const VkFormatProperties *props) +{ + VK_OUTARRAY_MAKE_TYPED(VkDrmFormatModifierProperties2EXT, out, + list->pDrmFormatModifierProperties, + &list->drmFormatModifierCount); + + for (unsigned i = 0; i < ARRAY_SIZE(agx_best_modifiers); ++i) { + uint64_t mod = agx_best_modifiers[i]; + VkFormatFeatureFlags2 flags = hk_modifier_features(mod, vk_format, props); + + if (!flags) + continue; + + vk_outarray_append_typed(VkDrmFormatModifierProperties2EXT, &out, + out_props) + { + *out_props = (VkDrmFormatModifierProperties2EXT){ + .drmFormatModifier = mod, + .drmFormatModifierPlaneCount = 1, /* no planar mods */ + .drmFormatModifierTilingFeatures = flags, + }; + }; + } +} + +VKAPI_ATTR void VKAPI_CALL +hk_GetPhysicalDeviceFormatProperties2(VkPhysicalDevice physicalDevice, + VkFormat format, + VkFormatProperties2 *pFormatProperties) +{ + VK_FROM_HANDLE(hk_physical_device, pdevice, physicalDevice); + + VkFormatFeatureFlags2 linear2, optimal2, buffer2; + linear2 = + hk_get_image_format_features(pdevice, format, VK_IMAGE_TILING_LINEAR); + optimal2 = + hk_get_image_format_features(pdevice, format, VK_IMAGE_TILING_OPTIMAL); + buffer2 = hk_get_buffer_format_features(pdevice, format); + + pFormatProperties->formatProperties = (VkFormatProperties){ + .linearTilingFeatures = vk_format_features2_to_features(linear2), + .optimalTilingFeatures = vk_format_features2_to_features(optimal2), + .bufferFeatures = vk_format_features2_to_features(buffer2), + }; + + vk_foreach_struct(ext, pFormatProperties->pNext) { + switch (ext->sType) { + case VK_STRUCTURE_TYPE_FORMAT_PROPERTIES_3: { + VkFormatProperties3 *p = (void *)ext; + p->linearTilingFeatures = linear2; + p->optimalTilingFeatures = optimal2; + p->bufferFeatures = buffer2; + break; + } + + case VK_STRUCTURE_TYPE_DRM_FORMAT_MODIFIER_PROPERTIES_LIST_EXT: + 
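+         /* Only modifiers with non-zero features for this format are
+          * reported.
+          */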
get_drm_format_modifier_properties_list( + pdevice, format, (void *)ext, &pFormatProperties->formatProperties); + break; + + case VK_STRUCTURE_TYPE_DRM_FORMAT_MODIFIER_PROPERTIES_LIST_2_EXT: + get_drm_format_modifier_properties_list_2( + pdevice, format, (void *)ext, &pFormatProperties->formatProperties); + break; + + default: + vk_debug_ignored_stype(ext->sType); + break; + } + } +} diff --git a/src/asahi/vulkan/hk_image.c b/src/asahi/vulkan/hk_image.c new file mode 100644 index 00000000000..6187eff40a8 --- /dev/null +++ b/src/asahi/vulkan/hk_image.c @@ -0,0 +1,1536 @@ +/* + * Copyright 2024 Valve Corporation + * Copyright 2024 Alyssa Rosenzweig + * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc. + * SPDX-License-Identifier: MIT + */ +#include "hk_image.h" +#include "asahi/layout/layout.h" +#include "asahi/lib/agx_formats.h" +#include "drm-uapi/drm_fourcc.h" +#include "util/bitscan.h" +#include "util/format/u_format.h" +#include "util/format/u_formats.h" +#include "util/macros.h" +#include "util/u_math.h" +#include "vulkan/vulkan_core.h" + +#include "hk_device.h" +#include "hk_device_memory.h" +#include "hk_entrypoints.h" +#include "hk_physical_device.h" + +#include "vk_format.h" + +/* Minimum alignment encodable for our descriptors. The hardware texture/PBE + * descriptors require 16-byte alignment. Our software PBE atomic descriptor + * requires 128-byte alignment, but we could relax that one if we wanted. + */ +#define HK_PLANE_ALIGN_B 128 + +static VkFormatFeatureFlags2 +hk_get_image_plane_format_features(struct hk_physical_device *pdev, + VkFormat vk_format, VkImageTiling tiling) +{ + VkFormatFeatureFlags2 features = 0; + + /* Conformance fails with these optional formats. Just drop them for now. + * TODO: Investigate later if we have a use case. + */ + switch (vk_format) { + case VK_FORMAT_A1B5G5R5_UNORM_PACK16_KHR: + case VK_FORMAT_A8_UNORM_KHR: + return 0; + default: + break; + } + + enum pipe_format p_format = vk_format_to_pipe_format(vk_format); + if (p_format == PIPE_FORMAT_NONE) + return 0; + + /* NPOT formats only supported for texel buffers */ + if (!util_is_power_of_two_nonzero(util_format_get_blocksize(p_format))) + return 0; + + if (util_format_is_compressed(p_format)) { + /* Linear block-compressed images are all sorts of problematic, not sure + * if AGX even supports them. Don't try. + */ + if (tiling != VK_IMAGE_TILING_OPTIMAL) + return 0; + + /* XXX: Conformance fails, e.g.: + * dEQP-VK.pipeline.monolithic.sampler.view_type.2d.format.etc2_r8g8b8a1_unorm_block.mipmap.linear.lod.select_bias_3_7 + * + * I suspect ail bug with mipmapping of compressed :-/ + */ + switch (util_format_description(p_format)->layout) { + case UTIL_FORMAT_LAYOUT_ETC: + case UTIL_FORMAT_LAYOUT_ASTC: + return 0; + default: + break; + } + } + + if (agx_pixel_format[p_format].texturable) { + features |= VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_BIT; + features |= VK_FORMAT_FEATURE_2_BLIT_SRC_BIT; + + /* We can sample integer formats but it doesn't make sense to linearly + * filter them. + */ + if (!util_format_is_pure_integer(p_format)) { + features |= VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_FILTER_LINEAR_BIT; + } + + if (vk_format_has_depth(vk_format)) { + features |= VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_DEPTH_COMPARISON_BIT; + } + } + + if (agx_pixel_format[p_format].renderable) { + /* For now, disable snorm rendering due to nir_lower_blend bugs. + * + * TODO: revisit. 
+ */ + if (!util_format_is_snorm(p_format)) { + features |= VK_FORMAT_FEATURE_2_COLOR_ATTACHMENT_BIT; + features |= VK_FORMAT_FEATURE_2_COLOR_ATTACHMENT_BLEND_BIT; + } + + features |= VK_FORMAT_FEATURE_2_BLIT_DST_BIT; + features |= VK_FORMAT_FEATURE_2_STORAGE_IMAGE_BIT | + VK_FORMAT_FEATURE_2_STORAGE_WRITE_WITHOUT_FORMAT_BIT | + VK_FORMAT_FEATURE_2_STORAGE_READ_WITHOUT_FORMAT_BIT; + } + + if (vk_format_is_depth_or_stencil(vk_format)) { + if (!(p_format == PIPE_FORMAT_Z32_FLOAT || + p_format == PIPE_FORMAT_S8_UINT || + p_format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT || + p_format == PIPE_FORMAT_Z16_UNORM) || + tiling == VK_IMAGE_TILING_LINEAR) + return 0; + + features |= VK_FORMAT_FEATURE_2_DEPTH_STENCIL_ATTACHMENT_BIT; + } + + /* Our image atomic lowering doesn't bother to handle linear */ + if ((p_format == PIPE_FORMAT_R32_UINT || p_format == PIPE_FORMAT_R32_SINT) && + tiling == VK_IMAGE_TILING_OPTIMAL) { + + features |= VK_FORMAT_FEATURE_2_STORAGE_IMAGE_ATOMIC_BIT; + } + + if (features != 0) { + features |= VK_FORMAT_FEATURE_2_TRANSFER_SRC_BIT; + features |= VK_FORMAT_FEATURE_2_TRANSFER_DST_BIT; + features |= VK_FORMAT_FEATURE_2_HOST_IMAGE_TRANSFER_BIT_EXT; + } + + return features; +} + +VkFormatFeatureFlags2 +hk_get_image_format_features(struct hk_physical_device *pdev, + VkFormat vk_format, VkImageTiling tiling) +{ + const struct vk_format_ycbcr_info *ycbcr_info = + vk_format_get_ycbcr_info(vk_format); + if (ycbcr_info == NULL) + return hk_get_image_plane_format_features(pdev, vk_format, tiling); + + /* For multi-plane, we get the feature flags of each plane separately, + * then take their intersection as the overall format feature flags + */ + VkFormatFeatureFlags2 features = ~0ull; + bool cosited_chroma = false; + for (uint8_t plane = 0; plane < ycbcr_info->n_planes; plane++) { + const struct vk_format_ycbcr_plane *plane_info = + &ycbcr_info->planes[plane]; + features &= + hk_get_image_plane_format_features(pdev, plane_info->format, tiling); + if (plane_info->denominator_scales[0] > 1 || + plane_info->denominator_scales[1] > 1) + cosited_chroma = true; + } + if (features == 0) + return 0; + + /* Uh... We really should be able to sample from YCbCr */ + assert(features & VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_BIT); + assert(features & VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_FILTER_LINEAR_BIT); + + /* These aren't allowed for YCbCr formats */ + features &= + ~(VK_FORMAT_FEATURE_2_BLIT_SRC_BIT | VK_FORMAT_FEATURE_2_BLIT_DST_BIT | + VK_FORMAT_FEATURE_2_COLOR_ATTACHMENT_BIT | + VK_FORMAT_FEATURE_2_COLOR_ATTACHMENT_BLEND_BIT | + VK_FORMAT_FEATURE_2_STORAGE_IMAGE_BIT); + + /* This is supported on all YCbCr formats */ + features |= + VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_YCBCR_CONVERSION_LINEAR_FILTER_BIT; + + if (ycbcr_info->n_planes > 1) { + /* DISJOINT_BIT implies that each plane has its own separate binding, + * while SEPARATE_RECONSTRUCTION_FILTER_BIT implies that luma and chroma + * each have their own, separate filters, so these two bits make sense + * for multi-planar formats only. + * + * For MIDPOINT_CHROMA_SAMPLES_BIT, NVIDIA HW on single-plane interleaved + * YCbCr defaults to COSITED_EVEN, which is inaccurate and fails tests. + * This can be fixed with a NIR tweak but for now, we only enable this bit + * for multi-plane formats. See Issue #9525 on the mesa/main tracker. 
+ */ + features |= + VK_FORMAT_FEATURE_DISJOINT_BIT | + VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_YCBCR_CONVERSION_SEPARATE_RECONSTRUCTION_FILTER_BIT | + VK_FORMAT_FEATURE_2_MIDPOINT_CHROMA_SAMPLES_BIT; + } + + if (cosited_chroma) + features |= VK_FORMAT_FEATURE_COSITED_CHROMA_SAMPLES_BIT; + + return features; +} + +static VkFormatFeatureFlags2 +vk_image_usage_to_format_features(VkImageUsageFlagBits usage_flag) +{ + assert(util_bitcount(usage_flag) == 1); + switch (usage_flag) { + case VK_IMAGE_USAGE_TRANSFER_SRC_BIT: + return VK_FORMAT_FEATURE_2_TRANSFER_SRC_BIT | + VK_FORMAT_FEATURE_BLIT_SRC_BIT; + case VK_IMAGE_USAGE_TRANSFER_DST_BIT: + return VK_FORMAT_FEATURE_2_TRANSFER_DST_BIT | + VK_FORMAT_FEATURE_BLIT_DST_BIT; + case VK_IMAGE_USAGE_SAMPLED_BIT: + return VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_BIT; + case VK_IMAGE_USAGE_STORAGE_BIT: + return VK_FORMAT_FEATURE_2_STORAGE_IMAGE_BIT; + case VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT: + return VK_FORMAT_FEATURE_2_COLOR_ATTACHMENT_BIT; + case VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT: + return VK_FORMAT_FEATURE_2_DEPTH_STENCIL_ATTACHMENT_BIT; + default: + return 0; + } +} + +VKAPI_ATTR VkResult VKAPI_CALL +hk_GetPhysicalDeviceImageFormatProperties2( + VkPhysicalDevice physicalDevice, + const VkPhysicalDeviceImageFormatInfo2 *pImageFormatInfo, + VkImageFormatProperties2 *pImageFormatProperties) +{ + VK_FROM_HANDLE(hk_physical_device, pdev, physicalDevice); + + const VkPhysicalDeviceExternalImageFormatInfo *external_info = + vk_find_struct_const(pImageFormatInfo->pNext, + PHYSICAL_DEVICE_EXTERNAL_IMAGE_FORMAT_INFO); + + /* Initialize to zero in case we return VK_ERROR_FORMAT_NOT_SUPPORTED */ + memset(&pImageFormatProperties->imageFormatProperties, 0, + sizeof(pImageFormatProperties->imageFormatProperties)); + + const struct vk_format_ycbcr_info *ycbcr_info = + vk_format_get_ycbcr_info(pImageFormatInfo->format); + + /* For the purposes of these checks, we don't care about all the extra + * YCbCr features and we just want the accumulation of features available + * to all planes of the given format. 
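+    * ("Accumulation" here means the intersection: a feature only counts if
+    * every plane supports it.)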
+ */ + VkFormatFeatureFlags2 features; + if (ycbcr_info == NULL) { + features = hk_get_image_plane_format_features( + pdev, pImageFormatInfo->format, pImageFormatInfo->tiling); + } else { + features = ~0ull; + assert(ycbcr_info->n_planes > 0); + for (uint8_t plane = 0; plane < ycbcr_info->n_planes; plane++) { + const VkFormat plane_format = ycbcr_info->planes[plane].format; + features &= hk_get_image_plane_format_features( + pdev, plane_format, pImageFormatInfo->tiling); + } + } + if (features == 0) + return VK_ERROR_FORMAT_NOT_SUPPORTED; + + if (pImageFormatInfo->tiling == VK_IMAGE_TILING_LINEAR && + pImageFormatInfo->type != VK_IMAGE_TYPE_2D) + return VK_ERROR_FORMAT_NOT_SUPPORTED; + + if (ycbcr_info && pImageFormatInfo->type != VK_IMAGE_TYPE_2D) + return VK_ERROR_FORMAT_NOT_SUPPORTED; + + /* From the Vulkan 1.3.279 spec: + * + * VUID-VkImageCreateInfo-tiling-04121 + * + * "If tiling is VK_IMAGE_TILING_LINEAR, flags must not contain + * VK_IMAGE_CREATE_SPARSE_RESIDENCY_BIT" + * + * VUID-VkImageCreateInfo-imageType-00970 + * + * "If imageType is VK_IMAGE_TYPE_1D, flags must not contain + * VK_IMAGE_CREATE_SPARSE_RESIDENCY_BIT" + */ + if (pImageFormatInfo->flags & VK_IMAGE_CREATE_SPARSE_RESIDENCY_BIT && + (pImageFormatInfo->type == VK_IMAGE_TYPE_1D || + pImageFormatInfo->tiling == VK_IMAGE_TILING_LINEAR)) + return VK_ERROR_FORMAT_NOT_SUPPORTED; + + /* From the Vulkan 1.3.279 spec: + * + * VUID-VkImageCreateInfo-flags-09403 + * + * "If flags contains VK_IMAGE_CREATE_2D_ARRAY_COMPATIBLE_BIT, flags + * must not include VK_IMAGE_CREATE_SPARSE_ALIASED_BIT, + * VK_IMAGE_CREATE_SPARSE_BINDING_BIT, or + * VK_IMAGE_CREATE_SPARSE_RESIDENCY_BIT" + */ + if ((pImageFormatInfo->flags & VK_IMAGE_CREATE_2D_ARRAY_COMPATIBLE_BIT) && + (pImageFormatInfo->flags & (VK_IMAGE_CREATE_SPARSE_ALIASED_BIT | + VK_IMAGE_CREATE_SPARSE_BINDING_BIT | + VK_IMAGE_CREATE_SPARSE_RESIDENCY_BIT))) + return VK_ERROR_FORMAT_NOT_SUPPORTED; + + /* We don't yet support sparse, but it shouldn't be too hard */ + if (pImageFormatInfo->flags & (VK_IMAGE_CREATE_SPARSE_ALIASED_BIT | + VK_IMAGE_CREATE_SPARSE_BINDING_BIT | + VK_IMAGE_CREATE_SPARSE_RESIDENCY_BIT)) + return VK_ERROR_FORMAT_NOT_SUPPORTED; + + const uint32_t max_dim = 16384; + VkExtent3D maxExtent; + uint32_t maxArraySize; + switch (pImageFormatInfo->type) { + case VK_IMAGE_TYPE_1D: + maxExtent = (VkExtent3D){max_dim, 1, 1}; + maxArraySize = 2048; + break; + case VK_IMAGE_TYPE_2D: + maxExtent = (VkExtent3D){max_dim, max_dim, 1}; + maxArraySize = 2048; + break; + case VK_IMAGE_TYPE_3D: + maxExtent = (VkExtent3D){max_dim, max_dim, max_dim}; + maxArraySize = 1; + break; + default: + unreachable("Invalid image type"); + } + if (pImageFormatInfo->tiling == VK_IMAGE_TILING_LINEAR) + maxArraySize = 1; + + assert(util_is_power_of_two_nonzero(max_dim)); + uint32_t maxMipLevels = util_logbase2(max_dim) + 1; + if (ycbcr_info != NULL || pImageFormatInfo->tiling == VK_IMAGE_TILING_LINEAR) + maxMipLevels = 1; + + VkSampleCountFlags sampleCounts = VK_SAMPLE_COUNT_1_BIT; + if (pImageFormatInfo->tiling == VK_IMAGE_TILING_OPTIMAL && + pImageFormatInfo->type == VK_IMAGE_TYPE_2D && ycbcr_info == NULL && + (features & (VK_FORMAT_FEATURE_2_COLOR_ATTACHMENT_BIT | + VK_FORMAT_FEATURE_2_DEPTH_STENCIL_ATTACHMENT_BIT)) && + !(pImageFormatInfo->flags & VK_IMAGE_CREATE_CUBE_COMPATIBLE_BIT)) { + + sampleCounts = + VK_SAMPLE_COUNT_1_BIT | VK_SAMPLE_COUNT_2_BIT | VK_SAMPLE_COUNT_4_BIT; + } + + /* From the Vulkan 1.2.199 spec: + * + * "VK_IMAGE_CREATE_EXTENDED_USAGE_BIT specifies that the image can be + * 
created with usage flags that are not supported for the format the + * image is created with but are supported for at least one format a + * VkImageView created from the image can have." + * + * If VK_IMAGE_CREATE_EXTENDED_USAGE_BIT is set, views can be created with + * different usage than the image so we can't always filter on usage. + * There is one exception to this below for storage. + */ + const VkImageUsageFlags image_usage = pImageFormatInfo->usage; + VkImageUsageFlags view_usage = image_usage; + if (pImageFormatInfo->flags & VK_IMAGE_CREATE_EXTENDED_USAGE_BIT) + view_usage = 0; + + if (view_usage & VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT) { + if (!(features & (VK_FORMAT_FEATURE_COLOR_ATTACHMENT_BIT | + VK_FORMAT_FEATURE_DEPTH_STENCIL_ATTACHMENT_BIT))) { + return VK_ERROR_FORMAT_NOT_SUPPORTED; + } + } + + u_foreach_bit(b, view_usage) { + VkFormatFeatureFlags2 usage_features = + vk_image_usage_to_format_features(1 << b); + if (usage_features && !(features & usage_features)) + return VK_ERROR_FORMAT_NOT_SUPPORTED; + } + + const VkExternalMemoryProperties *ext_mem_props = NULL; + if (external_info != NULL && external_info->handleType != 0) { + bool tiling_has_explicit_layout; + switch (pImageFormatInfo->tiling) { + case VK_IMAGE_TILING_LINEAR: + case VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT: + tiling_has_explicit_layout = true; + break; + case VK_IMAGE_TILING_OPTIMAL: + tiling_has_explicit_layout = false; + break; + default: + unreachable("Unsupported VkImageTiling"); + } + + switch (external_info->handleType) { + case VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT: + /* No special restrictions */ + if (tiling_has_explicit_layout) { + /* With an explicit memory layout, we don't care which type of + * fd the image belongs too. Both OPAQUE_FD and DMA_BUF are + * interchangeable here. + */ + ext_mem_props = &hk_dma_buf_mem_props; + } else { + ext_mem_props = &hk_opaque_fd_mem_props; + } + break; + + case VK_EXTERNAL_MEMORY_HANDLE_TYPE_DMA_BUF_BIT_EXT: + if (!tiling_has_explicit_layout) { + return vk_errorf(pdev, VK_ERROR_FORMAT_NOT_SUPPORTED, + "VK_EXTERNAL_MEMORY_HANDLE_TYPE_DMA_BUF_BIT_EXT " + "requires VK_IMAGE_TILING_LINEAR or " + "VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT"); + } + ext_mem_props = &hk_dma_buf_mem_props; + break; + + default: + /* From the Vulkan 1.3.256 spec: + * + * "If handleType is not compatible with the [parameters] in + * VkPhysicalDeviceImageFormatInfo2, then + * vkGetPhysicalDeviceImageFormatProperties2 returns + * VK_ERROR_FORMAT_NOT_SUPPORTED." + */ + return vk_errorf(pdev, VK_ERROR_FORMAT_NOT_SUPPORTED, + "unsupported VkExternalMemoryTypeFlagBits 0x%x", + external_info->handleType); + } + } + + const unsigned plane_count = + vk_format_get_plane_count(pImageFormatInfo->format); + + /* From the Vulkan 1.3.259 spec, VkImageCreateInfo: + * + * VUID-VkImageCreateInfo-imageCreateFormatFeatures-02260 + * + * "If format is a multi-planar format, and if imageCreateFormatFeatures + * (as defined in Image Creation Limits) does not contain + * VK_FORMAT_FEATURE_DISJOINT_BIT, then flags must not contain + * VK_IMAGE_CREATE_DISJOINT_BIT" + * + * This is satisfied trivially because we support DISJOINT on all + * multi-plane formats. 
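+ * (plane_count above comes from vk_format_get_plane_count(), so VK_FORMAT_D32_SFLOAT_S8_UINT still counts as a single plane for the check below even though hk_image_init stores it as two driver-internal planes.)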
Also, + * + * VUID-VkImageCreateInfo-format-01577 + * + * "If format is not a multi-planar format, and flags does not include + * VK_IMAGE_CREATE_ALIAS_BIT, flags must not contain + * VK_IMAGE_CREATE_DISJOINT_BIT" + */ + if (plane_count == 1 && + !(pImageFormatInfo->flags & VK_IMAGE_CREATE_ALIAS_BIT) && + (pImageFormatInfo->flags & VK_IMAGE_CREATE_DISJOINT_BIT)) + return VK_ERROR_FORMAT_NOT_SUPPORTED; + + if (ycbcr_info && + ((pImageFormatInfo->flags & VK_IMAGE_CREATE_SPARSE_BINDING_BIT) || + (pImageFormatInfo->flags & VK_IMAGE_CREATE_SPARSE_RESIDENCY_BIT))) + return VK_ERROR_FORMAT_NOT_SUPPORTED; + + pImageFormatProperties->imageFormatProperties = (VkImageFormatProperties){ + .maxExtent = maxExtent, + .maxMipLevels = maxMipLevels, + .maxArrayLayers = maxArraySize, + .sampleCounts = sampleCounts, + .maxResourceSize = UINT32_MAX, /* TODO */ + }; + + vk_foreach_struct(s, pImageFormatProperties->pNext) { + switch (s->sType) { + case VK_STRUCTURE_TYPE_EXTERNAL_IMAGE_FORMAT_PROPERTIES: { + VkExternalImageFormatProperties *p = (void *)s; + /* From the Vulkan 1.3.256 spec: + * + * "If handleType is 0, vkGetPhysicalDeviceImageFormatProperties2 + * will behave as if VkPhysicalDeviceExternalImageFormatInfo was + * not present, and VkExternalImageFormatProperties will be + * ignored." + * + * This is true if and only if ext_mem_props == NULL + */ + if (ext_mem_props != NULL) + p->externalMemoryProperties = *ext_mem_props; + break; + } + case VK_STRUCTURE_TYPE_SAMPLER_YCBCR_CONVERSION_IMAGE_FORMAT_PROPERTIES: { + VkSamplerYcbcrConversionImageFormatProperties *ycbcr_props = (void *)s; + ycbcr_props->combinedImageSamplerDescriptorCount = plane_count; + break; + } + case VK_STRUCTURE_TYPE_HOST_IMAGE_COPY_DEVICE_PERFORMANCE_QUERY_EXT: { + VkHostImageCopyDevicePerformanceQueryEXT *hic_props = (void *)s; + + /* TODO: Check compressability */ + hic_props->optimalDeviceAccess = hic_props->identicalMemoryLayout = + true; + break; + } + default: + vk_debug_ignored_stype(s->sType); + break; + } + } + + return VK_SUCCESS; +} + +static VkSparseImageFormatProperties +hk_fill_sparse_image_fmt_props(VkImageAspectFlags aspects) +{ + /* TODO */ + return (VkSparseImageFormatProperties){ + .aspectMask = aspects, + .flags = VK_SPARSE_IMAGE_FORMAT_SINGLE_MIPTAIL_BIT, + .imageGranularity = + { + .width = 1, + .height = 1, + .depth = 1, + }, + }; +} + +VKAPI_ATTR void VKAPI_CALL +hk_GetPhysicalDeviceSparseImageFormatProperties2( + VkPhysicalDevice physicalDevice, + const VkPhysicalDeviceSparseImageFormatInfo2 *pFormatInfo, + uint32_t *pPropertyCount, VkSparseImageFormatProperties2 *pProperties) +{ + VkResult result; + + /* Check if the given format info is valid first before returning sparse + * props. 
The easiest way to do this is to just call + * hk_GetPhysicalDeviceImageFormatProperties2() + */ + const VkPhysicalDeviceImageFormatInfo2 img_fmt_info = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_IMAGE_FORMAT_INFO_2, + .format = pFormatInfo->format, + .type = pFormatInfo->type, + .tiling = pFormatInfo->tiling, + .usage = pFormatInfo->usage, + .flags = VK_IMAGE_CREATE_SPARSE_BINDING_BIT | + VK_IMAGE_CREATE_SPARSE_RESIDENCY_BIT, + }; + + VkImageFormatProperties2 img_fmt_props2 = { + .sType = VK_STRUCTURE_TYPE_IMAGE_FORMAT_PROPERTIES_2, + .pNext = NULL, + }; + + result = hk_GetPhysicalDeviceImageFormatProperties2( + physicalDevice, &img_fmt_info, &img_fmt_props2); + if (result != VK_SUCCESS) { + *pPropertyCount = 0; + return; + } + + const VkImageFormatProperties *props = &img_fmt_props2.imageFormatProperties; + if (!(pFormatInfo->samples & props->sampleCounts)) { + *pPropertyCount = 0; + return; + } + + VK_OUTARRAY_MAKE_TYPED(VkSparseImageFormatProperties2, out, pProperties, + pPropertyCount); + + VkImageAspectFlags aspects = vk_format_aspects(pFormatInfo->format); + + vk_outarray_append_typed(VkSparseImageFormatProperties2, &out, props) + { + props->properties = hk_fill_sparse_image_fmt_props(aspects); + } +} + +static enum ail_tiling +hk_map_tiling(const VkImageCreateInfo *info, unsigned plane) +{ + switch (info->tiling) { + case VK_IMAGE_TILING_LINEAR: + return AIL_TILING_LINEAR; + + case VK_IMAGE_TILING_OPTIMAL: { + const struct vk_format_ycbcr_info *ycbcr_info = + vk_format_get_ycbcr_info(info->format); + VkFormat format = + ycbcr_info ? ycbcr_info->planes[plane].format : info->format; + + if (format == VK_FORMAT_D32_SFLOAT_S8_UINT) { + format = (plane == 0) ? VK_FORMAT_D32_SFLOAT : VK_FORMAT_S8_UINT; + } + + const uint8_t width_scale = + ycbcr_info ? ycbcr_info->planes[plane].denominator_scales[0] : 1; + const uint8_t height_scale = + ycbcr_info ? 
ycbcr_info->planes[plane].denominator_scales[1] : 1; + + if ((info->extent.width / width_scale) < 16 || + (info->extent.height / height_scale) < 16) + return AIL_TILING_TWIDDLED; + + // TODO: lots of bugs to fix first + // return AIL_TILING_TWIDDLED_COMPRESSED; + return AIL_TILING_TWIDDLED; + } + + case VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT: + /* TODO */ + return AIL_TILING_TWIDDLED; + default: + unreachable("invalid tiling"); + } +} + +static uint32_t +modifier_get_score(uint64_t mod) +{ + switch (mod) { + case DRM_FORMAT_MOD_APPLE_TWIDDLED_COMPRESSED: + return 10; + + case DRM_FORMAT_MOD_APPLE_TWIDDLED: + return 5; + + case DRM_FORMAT_MOD_LINEAR: + return 1; + + default: + return 0; + } +} + +static uint64_t +choose_drm_format_mod(uint32_t modifier_count, const uint64_t *modifiers) +{ + uint64_t best_mod = UINT64_MAX; + uint32_t best_score = 0; + + for (uint32_t i = 0; i < modifier_count; ++i) { + uint32_t score = modifier_get_score(modifiers[i]); + if (score > best_score) { + best_mod = modifiers[i]; + best_score = score; + } + } + + if (best_score > 0) + return best_mod; + else + return DRM_FORMAT_MOD_INVALID; +} + +static VkResult +hk_image_init(struct hk_device *dev, struct hk_image *image, + const VkImageCreateInfo *pCreateInfo) +{ + vk_image_init(&dev->vk, &image->vk, pCreateInfo); + + if ((image->vk.usage & (VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT | + VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT)) && + image->vk.samples > 1) { + image->vk.usage |= VK_IMAGE_USAGE_SAMPLED_BIT; + image->vk.stencil_usage |= VK_IMAGE_USAGE_SAMPLED_BIT; + } + + if (image->vk.usage & VK_IMAGE_USAGE_TRANSFER_SRC_BIT) + image->vk.usage |= VK_IMAGE_USAGE_SAMPLED_BIT; + if (image->vk.usage & VK_IMAGE_USAGE_TRANSFER_DST_BIT) + image->vk.usage |= VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT; + + image->plane_count = vk_format_get_plane_count(pCreateInfo->format); + image->disjoint = image->plane_count > 1 && + (pCreateInfo->flags & VK_IMAGE_CREATE_DISJOINT_BIT); + + /* We do not support interleaved depth/stencil. Instead, we decompose to + * a depth plane and a stencil plane. + */ + if (image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT) { + image->plane_count = 2; + } + + if (image->vk.create_flags & VK_IMAGE_CREATE_SPARSE_RESIDENCY_BIT) { + /* Sparse multiplane is not supported. Sparse depth/stencil not supported + * on G13 so we're fine there too. + */ + assert(image->plane_count == 1); + } + + const struct VkImageDrmFormatModifierExplicitCreateInfoEXT + *mod_explicit_info = NULL; + + if (pCreateInfo->tiling == VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT) { + assert(!image->vk.wsi_legacy_scanout); + mod_explicit_info = vk_find_struct_const( + pCreateInfo->pNext, + IMAGE_DRM_FORMAT_MODIFIER_EXPLICIT_CREATE_INFO_EXT); + + uint64_t modifier = DRM_FORMAT_MOD_INVALID; + + if (mod_explicit_info) { + modifier = mod_explicit_info->drmFormatModifier; + } else { + const struct VkImageDrmFormatModifierListCreateInfoEXT *mod_list_info = + vk_find_struct_const( + pCreateInfo->pNext, + IMAGE_DRM_FORMAT_MODIFIER_LIST_CREATE_INFO_EXT); + + modifier = choose_drm_format_mod(mod_list_info->drmFormatModifierCount, + mod_list_info->pDrmFormatModifiers); + } + + assert(modifier != DRM_FORMAT_MOD_INVALID); + assert(image->vk.drm_format_mod == DRM_FORMAT_MOD_INVALID); + image->vk.drm_format_mod = modifier; + } + + const struct vk_format_ycbcr_info *ycbcr_info = + vk_format_get_ycbcr_info(pCreateInfo->format); + for (uint8_t plane = 0; plane < image->plane_count; plane++) { + VkFormat format = + ycbcr_info ? 
ycbcr_info->planes[plane].format : pCreateInfo->format; + + if (format == VK_FORMAT_D32_SFLOAT_S8_UINT) { + format = (plane == 0) ? VK_FORMAT_D32_SFLOAT : VK_FORMAT_S8_UINT; + } + + const uint8_t width_scale = + ycbcr_info ? ycbcr_info->planes[plane].denominator_scales[0] : 1; + const uint8_t height_scale = + ycbcr_info ? ycbcr_info->planes[plane].denominator_scales[1] : 1; + + enum ail_tiling tiling = hk_map_tiling(pCreateInfo, plane); + + image->planes[plane].layout = (struct ail_layout){ + .tiling = tiling, + .mipmapped_z = pCreateInfo->imageType == VK_IMAGE_TYPE_3D, + .format = vk_format_to_pipe_format(format), + + .width_px = pCreateInfo->extent.width / width_scale, + .height_px = pCreateInfo->extent.height / height_scale, + .depth_px = MAX2(pCreateInfo->extent.depth, pCreateInfo->arrayLayers), + + .levels = pCreateInfo->mipLevels, + .sample_count_sa = pCreateInfo->samples, + .writeable_image = tiling != AIL_TILING_TWIDDLED_COMPRESSED, + + /* TODO: Maybe optimize this, our GL driver doesn't bother though */ + .renderable = true, + }; + + ail_make_miptree(&image->planes[plane].layout); + } + + return VK_SUCCESS; +} + +static VkResult +hk_image_plane_alloc_vma(struct hk_device *dev, struct hk_image_plane *plane, + VkImageCreateFlags create_flags) +{ + const bool sparse_bound = create_flags & VK_IMAGE_CREATE_SPARSE_BINDING_BIT; + const bool sparse_resident = + create_flags & VK_IMAGE_CREATE_SPARSE_RESIDENCY_BIT; + assert(sparse_bound || !sparse_resident); + + if (sparse_bound) { + plane->vma_size_B = plane->layout.size_B; +#if 0 + plane->addr = nouveau_ws_alloc_vma(dev->ws_dev, 0, plane->vma_size_B, + plane->layout.align_B, + false, sparse_resident); +#endif + if (plane->addr == 0) { + return vk_errorf(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY, + "Sparse VMA allocation failed"); + } + } + + return VK_SUCCESS; +} + +static void +hk_image_plane_finish(struct hk_device *dev, struct hk_image_plane *plane, + VkImageCreateFlags create_flags, + const VkAllocationCallbacks *pAllocator) +{ + if (plane->vma_size_B) { +#if 0 + const bool sparse_resident = + create_flags & VK_IMAGE_CREATE_SPARSE_RESIDENCY_BIT; + + agx_bo_unbind_vma(dev->ws_dev, plane->addr, plane->vma_size_B); + nouveau_ws_free_vma(dev->ws_dev, plane->addr, plane->vma_size_B, + false, sparse_resident); +#endif + } +} + +static void +hk_image_finish(struct hk_device *dev, struct hk_image *image, + const VkAllocationCallbacks *pAllocator) +{ + for (uint8_t plane = 0; plane < image->plane_count; plane++) { + hk_image_plane_finish(dev, &image->planes[plane], image->vk.create_flags, + pAllocator); + } + + vk_image_finish(&image->vk); +} + +VKAPI_ATTR VkResult VKAPI_CALL +hk_CreateImage(VkDevice _device, const VkImageCreateInfo *pCreateInfo, + const VkAllocationCallbacks *pAllocator, VkImage *pImage) +{ + VK_FROM_HANDLE(hk_device, dev, _device); + struct hk_physical_device *pdev = hk_device_physical(dev); + struct hk_image *image; + VkResult result; + +#ifdef HK_USE_WSI_PLATFORM + /* Ignore swapchain creation info on Android. Since we don't have an + * implementation in Mesa, we're guaranteed to access an Android object + * incorrectly. 
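+ * When HK_USE_WSI_PLATFORM is not defined this block compiles out entirely and swapchain-backed images are created like any other image below.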
+ */ + const VkImageSwapchainCreateInfoKHR *swapchain_info = + vk_find_struct_const(pCreateInfo->pNext, IMAGE_SWAPCHAIN_CREATE_INFO_KHR); + if (swapchain_info && swapchain_info->swapchain != VK_NULL_HANDLE) { + return wsi_common_create_swapchain_image( + &pdev->wsi_device, pCreateInfo, swapchain_info->swapchain, pImage); + } +#endif + + image = vk_zalloc2(&dev->vk.alloc, pAllocator, sizeof(*image), 8, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + if (!image) + return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY); + + result = hk_image_init(dev, image, pCreateInfo); + if (result != VK_SUCCESS) { + vk_free2(&dev->vk.alloc, pAllocator, image); + return result; + } + + for (uint8_t plane = 0; plane < image->plane_count; plane++) { + result = hk_image_plane_alloc_vma(dev, &image->planes[plane], + image->vk.create_flags); + if (result != VK_SUCCESS) { + hk_image_finish(dev, image, pAllocator); + vk_free2(&dev->vk.alloc, pAllocator, image); + return result; + } + } + + *pImage = hk_image_to_handle(image); + + return VK_SUCCESS; +} + +VKAPI_ATTR void VKAPI_CALL +hk_DestroyImage(VkDevice device, VkImage _image, + const VkAllocationCallbacks *pAllocator) +{ + VK_FROM_HANDLE(hk_device, dev, device); + VK_FROM_HANDLE(hk_image, image, _image); + + if (!image) + return; + + hk_image_finish(dev, image, pAllocator); + vk_free2(&dev->vk.alloc, pAllocator, image); +} + +static void +hk_image_plane_add_req(struct hk_image_plane *plane, uint64_t *size_B, + uint32_t *align_B) +{ + assert(util_is_power_of_two_or_zero64(*align_B)); + assert(util_is_power_of_two_or_zero64(HK_PLANE_ALIGN_B)); + + *align_B = MAX2(*align_B, HK_PLANE_ALIGN_B); + *size_B = align64(*size_B, HK_PLANE_ALIGN_B); + *size_B += plane->layout.size_B; +} + +static void +hk_get_image_memory_requirements(struct hk_device *dev, struct hk_image *image, + VkImageAspectFlags aspects, + VkMemoryRequirements2 *pMemoryRequirements) +{ + struct hk_physical_device *pdev = hk_device_physical(dev); + uint32_t memory_types = (1 << pdev->mem_type_count) - 1; + + // TODO hope for the best? + + uint64_t size_B = 0; + uint32_t align_B = 0; + if (image->disjoint) { + uint8_t plane = hk_image_aspects_to_plane(image, aspects); + hk_image_plane_add_req(&image->planes[plane], &size_B, &align_B); + } else { + for (unsigned plane = 0; plane < image->plane_count; plane++) + hk_image_plane_add_req(&image->planes[plane], &size_B, &align_B); + } + + pMemoryRequirements->memoryRequirements.memoryTypeBits = memory_types; + pMemoryRequirements->memoryRequirements.alignment = align_B; + pMemoryRequirements->memoryRequirements.size = size_B; + + vk_foreach_struct_const(ext, pMemoryRequirements->pNext) { + switch (ext->sType) { + case VK_STRUCTURE_TYPE_MEMORY_DEDICATED_REQUIREMENTS: { + VkMemoryDedicatedRequirements *dedicated = (void *)ext; + dedicated->prefersDedicatedAllocation = false; + dedicated->requiresDedicatedAllocation = false; + break; + } + default: + vk_debug_ignored_stype(ext->sType); + break; + } + } +} + +VKAPI_ATTR void VKAPI_CALL +hk_GetImageMemoryRequirements2(VkDevice device, + const VkImageMemoryRequirementsInfo2 *pInfo, + VkMemoryRequirements2 *pMemoryRequirements) +{ + VK_FROM_HANDLE(hk_device, dev, device); + VK_FROM_HANDLE(hk_image, image, pInfo->image); + + const VkImagePlaneMemoryRequirementsInfo *plane_info = + vk_find_struct_const(pInfo->pNext, IMAGE_PLANE_MEMORY_REQUIREMENTS_INFO); + const VkImageAspectFlags aspects = + image->disjoint ? 
plane_info->planeAspect : image->vk.aspects; + + hk_get_image_memory_requirements(dev, image, aspects, pMemoryRequirements); +} + +VKAPI_ATTR void VKAPI_CALL +hk_GetDeviceImageMemoryRequirements(VkDevice device, + const VkDeviceImageMemoryRequirements *pInfo, + VkMemoryRequirements2 *pMemoryRequirements) +{ + VK_FROM_HANDLE(hk_device, dev, device); + ASSERTED VkResult result; + struct hk_image image = {0}; + + result = hk_image_init(dev, &image, pInfo->pCreateInfo); + assert(result == VK_SUCCESS); + + const VkImageAspectFlags aspects = + image.disjoint ? pInfo->planeAspect : image.vk.aspects; + + hk_get_image_memory_requirements(dev, &image, aspects, pMemoryRequirements); + + hk_image_finish(dev, &image, NULL); +} + +static VkSparseImageMemoryRequirements +hk_fill_sparse_image_memory_reqs(const struct ail_layout *layout, + VkImageAspectFlags aspects) +{ + VkSparseImageFormatProperties sparse_format_props = + hk_fill_sparse_image_fmt_props(aspects); + + // assert(layout->mip_tail_first_lod <= layout->num_levels); + VkSparseImageMemoryRequirements sparse_memory_reqs = { + .formatProperties = sparse_format_props, + .imageMipTailFirstLod = 0, // layout->mip_tail_first_lod, + .imageMipTailStride = 0, + }; + + sparse_memory_reqs.imageMipTailSize = layout->size_B; + sparse_memory_reqs.imageMipTailOffset = 0; + return sparse_memory_reqs; +} + +static void +hk_get_image_sparse_memory_requirements( + struct hk_device *dev, struct hk_image *image, VkImageAspectFlags aspects, + uint32_t *pSparseMemoryRequirementCount, + VkSparseImageMemoryRequirements2 *pSparseMemoryRequirements) +{ + VK_OUTARRAY_MAKE_TYPED(VkSparseImageMemoryRequirements2, out, + pSparseMemoryRequirements, + pSparseMemoryRequirementCount); + + /* From the Vulkan 1.3.279 spec: + * + * "The sparse image must have been created using the + * VK_IMAGE_CREATE_SPARSE_RESIDENCY_BIT flag to retrieve valid sparse + * image memory requirements." + */ + if (!(image->vk.create_flags & VK_IMAGE_CREATE_SPARSE_RESIDENCY_BIT)) + return; + + /* We don't support multiplane sparse for now */ + if (image->plane_count > 1) + return; + + vk_outarray_append_typed(VkSparseImageMemoryRequirements2, &out, reqs) + { + reqs->memoryRequirements = + hk_fill_sparse_image_memory_reqs(&image->planes[0].layout, aspects); + }; +} + +VKAPI_ATTR void VKAPI_CALL +hk_GetImageSparseMemoryRequirements2( + VkDevice device, const VkImageSparseMemoryRequirementsInfo2 *pInfo, + uint32_t *pSparseMemoryRequirementCount, + VkSparseImageMemoryRequirements2 *pSparseMemoryRequirements) +{ + VK_FROM_HANDLE(hk_device, dev, device); + VK_FROM_HANDLE(hk_image, image, pInfo->image); + + const VkImageAspectFlags aspects = image->vk.aspects; + + hk_get_image_sparse_memory_requirements(dev, image, aspects, + pSparseMemoryRequirementCount, + pSparseMemoryRequirements); +} + +VKAPI_ATTR void VKAPI_CALL +hk_GetDeviceImageSparseMemoryRequirements( + VkDevice device, const VkDeviceImageMemoryRequirements *pInfo, + uint32_t *pSparseMemoryRequirementCount, + VkSparseImageMemoryRequirements2 *pSparseMemoryRequirements) +{ + VK_FROM_HANDLE(hk_device, dev, device); + ASSERTED VkResult result; + struct hk_image image = {0}; + + result = hk_image_init(dev, &image, pInfo->pCreateInfo); + assert(result == VK_SUCCESS); + + const VkImageAspectFlags aspects = + image.disjoint ? 
pInfo->planeAspect : image.vk.aspects; + + hk_get_image_sparse_memory_requirements(dev, &image, aspects, + pSparseMemoryRequirementCount, + pSparseMemoryRequirements); + + hk_image_finish(dev, &image, NULL); +} + +static void +hk_get_image_subresource_layout(UNUSED struct hk_device *dev, + struct hk_image *image, + const VkImageSubresource2KHR *pSubresource, + VkSubresourceLayout2KHR *pLayout) +{ + const VkImageSubresource *isr = &pSubresource->imageSubresource; + + const uint8_t p = hk_image_aspects_to_plane(image, isr->aspectMask); + const struct hk_image_plane *plane = &image->planes[p]; + + uint64_t offset_B = 0; + if (!image->disjoint) { + uint32_t align_B = 0; + for (unsigned plane = 0; plane < p; plane++) + hk_image_plane_add_req(&image->planes[plane], &offset_B, &align_B); + } + offset_B += + ail_get_layer_level_B(&plane->layout, isr->arrayLayer, isr->mipLevel); + + bool is_3d = image->vk.image_type == VK_IMAGE_TYPE_3D; + + pLayout->subresourceLayout = (VkSubresourceLayout){ + .offset = offset_B, + .size = ail_get_level_size_B(&plane->layout, isr->mipLevel), + + /* From the spec: + * + * It is legal to call vkGetImageSubresourceLayout2KHR with a image + * created with tiling equal to VK_IMAGE_TILING_OPTIMAL, but the + * members of VkSubresourceLayout2KHR::subresourceLayout will have + * undefined values in this case. + * + * So don't collapse with mips. + */ + .rowPitch = isr->mipLevel + ? 0 + : ail_get_wsi_stride_B(&plane->layout, isr->mipLevel), + .arrayPitch = is_3d ? 0 : plane->layout.layer_stride_B, + .depthPitch = is_3d ? plane->layout.layer_stride_B : 0, + }; + + VkSubresourceHostMemcpySizeEXT *memcpy_size = + vk_find_struct(pLayout, SUBRESOURCE_HOST_MEMCPY_SIZE_EXT); + if (memcpy_size) { + memcpy_size->size = pLayout->subresourceLayout.size; + } +} + +VKAPI_ATTR void VKAPI_CALL +hk_GetImageSubresourceLayout2KHR(VkDevice device, VkImage _image, + const VkImageSubresource2KHR *pSubresource, + VkSubresourceLayout2KHR *pLayout) +{ + VK_FROM_HANDLE(hk_device, dev, device); + VK_FROM_HANDLE(hk_image, image, _image); + + hk_get_image_subresource_layout(dev, image, pSubresource, pLayout); +} + +VKAPI_ATTR void VKAPI_CALL +hk_GetDeviceImageSubresourceLayoutKHR( + VkDevice device, const VkDeviceImageSubresourceInfoKHR *pInfo, + VkSubresourceLayout2KHR *pLayout) +{ + VK_FROM_HANDLE(hk_device, dev, device); + ASSERTED VkResult result; + struct hk_image image = {0}; + + result = hk_image_init(dev, &image, pInfo->pCreateInfo); + assert(result == VK_SUCCESS); + + hk_get_image_subresource_layout(dev, &image, pInfo->pSubresource, pLayout); + + hk_image_finish(dev, &image, NULL); +} + +static void +hk_image_plane_bind(struct hk_device *dev, struct hk_image_plane *plane, + struct hk_device_memory *mem, uint64_t *offset_B) +{ + *offset_B = align64(*offset_B, HK_PLANE_ALIGN_B); + + if (plane->vma_size_B) { +#if 0 + agx_bo_bind_vma(dev->ws_dev, + mem->bo, + plane->addr, + plane->vma_size_B, + *offset_B, + plane->nil.pte_kind); +#endif + unreachable("todo"); + } else { + plane->addr = mem->bo->ptr.gpu + *offset_B; + plane->map = mem->bo->ptr.cpu + *offset_B; + plane->rem = mem->bo->size - (*offset_B); + } + + *offset_B += plane->layout.size_B; +} + +VKAPI_ATTR VkResult VKAPI_CALL +hk_BindImageMemory2(VkDevice device, uint32_t bindInfoCount, + const VkBindImageMemoryInfo *pBindInfos) +{ + VK_FROM_HANDLE(hk_device, dev, device); + for (uint32_t i = 0; i < bindInfoCount; ++i) { + VK_FROM_HANDLE(hk_device_memory, mem, pBindInfos[i].memory); + VK_FROM_HANDLE(hk_image, image, pBindInfos[i].image); + + 
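/* For disjoint images, each bind covers the single plane named by VkBindImagePlaneMemoryInfo; otherwise the planes are packed back-to-back from memoryOffset, each aligned to HK_PLANE_ALIGN_B by hk_image_plane_bind(). Swapchain-backed images instead alias the WSI image's address. */ +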
/* Ignore this struct on Android, we cannot access swapchain structures + * there. */ +#ifdef HK_USE_WSI_PLATFORM + const VkBindImageMemorySwapchainInfoKHR *swapchain_info = + vk_find_struct_const(pBindInfos[i].pNext, + BIND_IMAGE_MEMORY_SWAPCHAIN_INFO_KHR); + + if (swapchain_info && swapchain_info->swapchain != VK_NULL_HANDLE) { + VkImage _wsi_image = wsi_common_get_image(swapchain_info->swapchain, + swapchain_info->imageIndex); + VK_FROM_HANDLE(hk_image, wsi_img, _wsi_image); + + assert(image->plane_count == 1); + assert(wsi_img->plane_count == 1); + + struct hk_image_plane *plane = &image->planes[0]; + struct hk_image_plane *swapchain_plane = &wsi_img->planes[0]; + + /* Copy memory binding information from swapchain image to the current + * image's plane. */ + plane->addr = swapchain_plane->addr; + continue; + } +#endif + + uint64_t offset_B = pBindInfos[i].memoryOffset; + if (image->disjoint) { + const VkBindImagePlaneMemoryInfo *plane_info = vk_find_struct_const( + pBindInfos[i].pNext, BIND_IMAGE_PLANE_MEMORY_INFO); + uint8_t plane = + hk_image_aspects_to_plane(image, plane_info->planeAspect); + hk_image_plane_bind(dev, &image->planes[plane], mem, &offset_B); + } else { + for (unsigned plane = 0; plane < image->plane_count; plane++) { + hk_image_plane_bind(dev, &image->planes[plane], mem, &offset_B); + } + } + + const VkBindMemoryStatusKHR *status = + vk_find_struct_const(pBindInfos[i].pNext, BIND_MEMORY_STATUS_KHR); + if (status != NULL && status->pResult != NULL) + *status->pResult = VK_SUCCESS; + } + + return VK_SUCCESS; +} + +static uint32_t +hk_plane_index(VkFormat format, VkImageAspectFlags aspect_mask) +{ + switch (aspect_mask) { + default: + assert(aspect_mask != VK_IMAGE_ASPECT_MEMORY_PLANE_3_BIT_EXT); + return 0; + case VK_IMAGE_ASPECT_PLANE_1_BIT: + case VK_IMAGE_ASPECT_MEMORY_PLANE_1_BIT_EXT: + return 1; + case VK_IMAGE_ASPECT_PLANE_2_BIT: + case VK_IMAGE_ASPECT_MEMORY_PLANE_2_BIT_EXT: + return 2; + case VK_IMAGE_ASPECT_STENCIL_BIT: + return format == VK_FORMAT_D32_SFLOAT_S8_UINT; + } +} + +static void +hk_copy_memory_to_image(struct hk_device *device, struct hk_image *dst_image, + const VkMemoryToImageCopyEXT *info, bool copy_memcpy) +{ + unsigned plane = + hk_plane_index(dst_image->vk.format, info->imageSubresource.aspectMask); + const struct ail_layout *layout = &dst_image->planes[plane].layout; + + VkOffset3D offset = info->imageOffset; + VkExtent3D extent = info->imageExtent; + uint32_t src_width = info->memoryRowLength ?: extent.width; + uint32_t src_height = info->memoryImageHeight ?: extent.height; + + uint32_t blocksize_B = util_format_get_blocksize(layout->format); + uint32_t src_pitch = src_width * blocksize_B; + + unsigned start_layer = (dst_image->vk.image_type == VK_IMAGE_TYPE_3D) + ? offset.z + : info->imageSubresource.baseArrayLayer; + uint32_t layers = + MAX2(extent.depth, vk_image_subresource_layer_count( + &dst_image->vk, &info->imageSubresource)); + + unsigned level = info->imageSubresource.mipLevel; + uint32_t image_offset = ail_get_layer_level_B(layout, start_layer, level); + uint32_t dst_layer_stride = layout->layer_stride_B; + uint32_t src_layer_stride = copy_memcpy + ? 
ail_get_level_size_B(layout, level) + : (src_width * src_height * blocksize_B); + bool tiled = ail_is_level_twiddled_uncompressed( + layout, info->imageSubresource.mipLevel); + + const char *src = + (const char *)info->pHostPointer + start_layer * dst_layer_stride; + char *dst = (char *)dst_image->planes[plane].map + image_offset; + for (unsigned layer = 0; layer < layers; + layer++, src += src_layer_stride, dst += dst_layer_stride) { + if (copy_memcpy) { + memcpy(dst, src, ail_get_level_size_B(layout, level)); + } else if (!tiled) { + uint32_t dst_pitch = ail_get_linear_stride_B(layout, level); + /*TODO:comp*/ + for (unsigned y = 0; y < extent.height; y++) { + memcpy(dst + dst_pitch * (y + offset.y) + offset.x * blocksize_B, + src + src_pitch * y, extent.width * blocksize_B); + } + } else { + ail_tile(dst, (void *)src, layout, level, src_pitch, offset.x, + offset.y, extent.width, extent.height); + } + } +} + +VKAPI_ATTR VkResult VKAPI_CALL +hk_CopyMemoryToImageEXT(VkDevice _device, + const VkCopyMemoryToImageInfoEXT *info) +{ + VK_FROM_HANDLE(hk_device, device, _device); + VK_FROM_HANDLE(hk_image, dst_image, info->dstImage); + + for (unsigned i = 0; i < info->regionCount; i++) { + hk_copy_memory_to_image(device, dst_image, &info->pRegions[i], + info->flags & VK_HOST_IMAGE_COPY_MEMCPY_EXT); + } + + return VK_SUCCESS; +} + +static void +hk_copy_image_to_memory(struct hk_device *device, struct hk_image *src_image, + const VkImageToMemoryCopyEXT *info, bool copy_memcpy) +{ + unsigned plane = + hk_plane_index(src_image->vk.format, info->imageSubresource.aspectMask); + const struct ail_layout *layout = &src_image->planes[plane].layout; + + VkOffset3D offset = info->imageOffset; + VkExtent3D extent = info->imageExtent; + uint32_t dst_width = info->memoryRowLength ?: extent.width; + uint32_t dst_height = info->memoryImageHeight ?: extent.height; + +#if 0 + copy_compressed(src_image->vk.format, &offset, &extent, &dst_width, + &dst_height); +#endif + + uint32_t blocksize_B = util_format_get_blocksize(layout->format); + uint32_t dst_pitch = dst_width * blocksize_B; + + unsigned start_layer = (src_image->vk.image_type == VK_IMAGE_TYPE_3D) + ? offset.z + : info->imageSubresource.baseArrayLayer; + uint32_t layers = + MAX2(extent.depth, vk_image_subresource_layer_count( + &src_image->vk, &info->imageSubresource)); + unsigned level = info->imageSubresource.mipLevel; + + uint32_t image_offset = ail_get_layer_level_B(layout, start_layer, level); + uint32_t src_layer_stride = layout->layer_stride_B; + uint32_t dst_layer_stride = copy_memcpy + ? 
ail_get_level_size_B(layout, level) + : (dst_width * dst_height * blocksize_B); + + bool tiled = ail_is_level_twiddled_uncompressed( + layout, info->imageSubresource.mipLevel); + + const char *src = (const char *)src_image->planes[plane].map + image_offset; + char *dst = (char *)info->pHostPointer + start_layer * dst_layer_stride; + for (unsigned layer = 0; layer < layers; + layer++, src += src_layer_stride, dst += dst_layer_stride) { + + if (copy_memcpy) { + memcpy(dst, src, dst_layer_stride); + } else if (!tiled) { + /* TODO: comp */ + uint32_t src_pitch = ail_get_linear_stride_B(layout, level); + for (unsigned y = 0; y < extent.height; y++) { + memcpy(dst + dst_pitch * y, + src + src_pitch * (y + offset.y) + offset.x * blocksize_B, + extent.width * blocksize_B); + } + } else { + ail_detile((void *)src, dst, layout, info->imageSubresource.mipLevel, + dst_pitch, offset.x, offset.y, extent.width, extent.height); + } + } +} + +VKAPI_ATTR VkResult VKAPI_CALL +hk_CopyImageToMemoryEXT(VkDevice _device, + const VkCopyImageToMemoryInfoEXT *info) +{ + VK_FROM_HANDLE(hk_device, device, _device); + VK_FROM_HANDLE(hk_image, image, info->srcImage); + + for (unsigned i = 0; i < info->regionCount; i++) { + hk_copy_image_to_memory(device, image, &info->pRegions[i], + info->flags & VK_HOST_IMAGE_COPY_MEMCPY_EXT); + } + + return VK_SUCCESS; +} + +static void +hk_copy_image_to_image_cpu(struct hk_device *device, struct hk_image *src_image, + struct hk_image *dst_image, const VkImageCopy2 *info, + bool copy_memcpy) +{ + unsigned src_plane = + hk_plane_index(src_image->vk.format, info->srcSubresource.aspectMask); + unsigned dst_plane = + hk_plane_index(dst_image->vk.format, info->dstSubresource.aspectMask); + + const struct ail_layout *src_layout = &src_image->planes[src_plane].layout; + const struct ail_layout *dst_layout = &dst_image->planes[dst_plane].layout; + + VkOffset3D src_offset = info->srcOffset; + VkOffset3D dst_offset = info->dstOffset; + VkExtent3D extent = info->extent; + uint32_t layers_to_copy = MAX2( + info->extent.depth, + vk_image_subresource_layer_count(&src_image->vk, &info->srcSubresource)); + + /* See comment above. */ +#if 0 + copy_compressed(src_image->vk.format, &src_offset, &extent, NULL, NULL); + copy_compressed(dst_image->vk.format, &dst_offset, NULL, NULL, NULL); +#endif + + unsigned src_start_layer = (src_image->vk.image_type == VK_IMAGE_TYPE_3D) + ? src_offset.z + : info->srcSubresource.baseArrayLayer; + unsigned dst_start_layer = (dst_image->vk.image_type == VK_IMAGE_TYPE_3D) + ? 
dst_offset.z + : info->dstSubresource.baseArrayLayer; + + uint32_t src_layer_stride = src_layout->layer_stride_B; + uint32_t dst_layer_stride = dst_layout->layer_stride_B; + + uint32_t dst_block_B = util_format_get_blocksize(dst_layout->format); + uint32_t src_block_B = util_format_get_blocksize(src_layout->format); + + uint32_t src_image_offset = ail_get_layer_level_B( + src_layout, src_start_layer, info->srcSubresource.mipLevel); + uint32_t dst_image_offset = ail_get_layer_level_B( + dst_layout, dst_start_layer, info->dstSubresource.mipLevel); + + bool src_tiled = ail_is_level_twiddled_uncompressed( + src_layout, info->srcSubresource.mipLevel); + bool dst_tiled = ail_is_level_twiddled_uncompressed( + dst_layout, info->dstSubresource.mipLevel); + + const char *src = + (const char *)src_image->planes[src_plane].map + src_image_offset; + char *dst = (char *)dst_image->planes[dst_plane].map + dst_image_offset; + for (unsigned layer = 0; layer < layers_to_copy; + layer++, src += src_layer_stride, dst += dst_layer_stride) { + + if (copy_memcpy) { + uint32_t src_size = + ail_get_level_size_B(src_layout, info->srcSubresource.mipLevel); + uint32_t dst_size = + ail_get_level_size_B(dst_layout, info->dstSubresource.mipLevel); + + assert(src_size == dst_size); + memcpy(dst, src, src_size); + } else if (!src_tiled && !dst_tiled) { + /* TODO comp */ + uint32_t src_pitch = + ail_get_linear_stride_B(src_layout, info->srcSubresource.mipLevel); + + uint32_t dst_pitch = + ail_get_linear_stride_B(dst_layout, info->dstSubresource.mipLevel); + + for (unsigned y = 0; y < extent.height; y++) { + memcpy(dst + dst_pitch * (y + dst_offset.y) + + dst_offset.x * dst_block_B, + src + src_pitch * (y + src_offset.y) + + src_offset.x * src_block_B, + extent.width * src_block_B); + } + } else if (!src_tiled) { + unreachable("todo"); +#if 0 + fdl6_memcpy_linear_to_tiled( + dst_offset.x, dst_offset.y, extent.width, extent.height, dst, + src + src_pitch * src_offset.y + src_offset.x * src_layout->cpp, + dst_layout, info->dstSubresource.mipLevel, src_pitch, + &device->physical_device->ubwc_config); +#endif + } else if (!dst_tiled) { + unreachable("todo"); +#if 0 + fdl6_memcpy_tiled_to_linear( + src_offset.x, src_offset.y, extent.width, extent.height, + dst + dst_pitch * dst_offset.y + dst_offset.x * dst_layout->cpp, + src, src_layout, info->dstSubresource.mipLevel, dst_pitch, + &device->physical_device->ubwc_config); +#endif + } else { + /* Work tile-by-tile, holding the unswizzled tile in a temporary + * buffer. 
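+ * Each source tile that overlaps the copy rectangle is detiled into temp_tile and then retiled into the destination, with the width and height clipped to the region actually being copied.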
+ */ + char temp_tile[16384]; + + unsigned src_level = info->srcSubresource.mipLevel; + unsigned dst_level = info->dstSubresource.mipLevel; + uint32_t block_width = src_layout->tilesize_el[src_level].width_el; + uint32_t block_height = src_layout->tilesize_el[src_level].height_el; + uint32_t temp_pitch = block_width * src_block_B; + ; + + for (unsigned by = src_offset.y / block_height; + by * block_height < src_offset.y + extent.height; by++) { + uint32_t src_y_start = MAX2(src_offset.y, by * block_height); + uint32_t dst_y_start = src_y_start - src_offset.y + dst_offset.y; + uint32_t height = + MIN2((by + 1) * block_height, src_offset.y + extent.height) - + src_y_start; + for (unsigned bx = src_offset.x / block_width; + bx * block_width < src_offset.x + extent.width; bx++) { + uint32_t src_x_start = MAX2(src_offset.x, bx * block_width); + uint32_t dst_x_start = src_x_start - src_offset.x + dst_offset.x; + uint32_t width = + MIN2((bx + 1) * block_width, src_offset.x + extent.width) - + src_x_start; + + ail_detile((void *)src, temp_tile, src_layout, src_level, + temp_pitch, src_x_start, src_y_start, width, height); + ail_tile(dst, temp_tile, dst_layout, dst_level, temp_pitch, + dst_x_start, dst_y_start, width, height); + } + } + } + } +} + +VKAPI_ATTR VkResult VKAPI_CALL +hk_CopyImageToImageEXT(VkDevice _device, + const VkCopyImageToImageInfoEXT *pCopyImageToImageInfo) +{ + VK_FROM_HANDLE(hk_device, device, _device); + VK_FROM_HANDLE(hk_image, src_image, pCopyImageToImageInfo->srcImage); + VK_FROM_HANDLE(hk_image, dst_image, pCopyImageToImageInfo->dstImage); + bool copy_memcpy = + pCopyImageToImageInfo->flags & VK_HOST_IMAGE_COPY_MEMCPY_EXT; + + for (uint32_t i = 0; i < pCopyImageToImageInfo->regionCount; ++i) { + if (src_image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT) { + VkImageCopy2 info = pCopyImageToImageInfo->pRegions[i]; + u_foreach_bit(b, info.dstSubresource.aspectMask) { + info.srcSubresource.aspectMask = BITFIELD_BIT(b); + info.dstSubresource.aspectMask = BITFIELD_BIT(b); + hk_copy_image_to_image_cpu(device, src_image, dst_image, &info, + copy_memcpy); + } + continue; + } + + hk_copy_image_to_image_cpu(device, src_image, dst_image, + pCopyImageToImageInfo->pRegions + i, + copy_memcpy); + } + + return VK_SUCCESS; +} + +VKAPI_ATTR VkResult VKAPI_CALL +hk_TransitionImageLayoutEXT( + VkDevice device, uint32_t transitionCount, + const VkHostImageLayoutTransitionInfoEXT *transitions) +{ + /* We don't do anything with layouts so this should be a no-op */ + return VK_SUCCESS; +} diff --git a/src/asahi/vulkan/hk_image.h b/src/asahi/vulkan/hk_image.h new file mode 100644 index 00000000000..a15129032aa --- /dev/null +++ b/src/asahi/vulkan/hk_image.h @@ -0,0 +1,115 @@ +/* + * Copyright 2024 Valve Corporation + * Copyright 2024 Alyssa Rosenzweig + * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc. + * SPDX-License-Identifier: MIT + */ + +#pragma once + +#include "asahi/layout/layout.h" +#include "vulkan/vulkan_core.h" + +#include "hk_private.h" + +#include "vk_image.h" + +/* Because small images can end up with an array_stride_B that is less than + * the sparse block size (in bytes), we have to set SINGLE_MIPTAIL_BIT when + * advertising sparse properties to the client. This means that we get one + * single memory range for the miptail of the image. For large images with + * mipTailStartLod > 0, we have to deal with the array stride ourselves. + * + * We do this by returning HK_MIP_TAIL_START_OFFSET as the image's + * imageMipTailOffset. 
We can then detect anything with that address as + * being part of the miptail and re-map it accordingly. The Vulkan spec + * explicitly allows for this. + * + * From the Vulkan 1.3.279 spec: + * + * "When VK_SPARSE_MEMORY_BIND_METADATA_BIT is present, the resourceOffset + * must have been derived explicitly from the imageMipTailOffset in the + * sparse resource properties returned for the metadata aspect. By + * manipulating the value returned for imageMipTailOffset, the + * resourceOffset does not have to correlate directly to a device virtual + * address offset, and may instead be whatever value makes it easiest for + * the implementation to derive the correct device virtual address." + */ +#define HK_MIP_TAIL_START_OFFSET 0x6d74000000000000UL + +struct hk_device_memory; +struct hk_physical_device; + +static VkFormatFeatureFlags2 +hk_get_image_plane_format_features(struct hk_physical_device *pdev, + VkFormat vk_format, VkImageTiling tiling); + +VkFormatFeatureFlags2 +hk_get_image_format_features(struct hk_physical_device *pdevice, + VkFormat format, VkImageTiling tiling); + +struct hk_image_plane { + struct ail_layout layout; + uint64_t addr; + + /** Size of the reserved VMA range for sparse images, zero otherwise. */ + uint64_t vma_size_B; + + /* For host image copy */ + void *map; + uint32_t rem; +}; + +struct hk_image { + struct vk_image vk; + + /** True if the planes are bound separately + * + * This is set based on VK_IMAGE_CREATE_DISJOINT_BIT + */ + bool disjoint; + + uint8_t plane_count; + struct hk_image_plane planes[3]; +}; + +VK_DEFINE_NONDISP_HANDLE_CASTS(hk_image, vk.base, VkImage, VK_OBJECT_TYPE_IMAGE) + +static inline uint64_t +hk_image_plane_base_address(const struct hk_image_plane *plane) +{ + return plane->addr; +} + +static inline uint64_t +hk_image_base_address(const struct hk_image *image, uint8_t plane) +{ + return hk_image_plane_base_address(&image->planes[plane]); +} + +static inline uint8_t +hk_image_aspects_to_plane(const struct hk_image *image, + VkImageAspectFlags aspectMask) +{ + /* Must only be one aspect unless it's depth/stencil */ + assert(aspectMask == + (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT) || + util_bitcount(aspectMask) == 1); + + switch (aspectMask) { + default: + assert(aspectMask != VK_IMAGE_ASPECT_MEMORY_PLANE_3_BIT_EXT); + return 0; + + case VK_IMAGE_ASPECT_STENCIL_BIT: + return image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT; + + case VK_IMAGE_ASPECT_PLANE_1_BIT: + case VK_IMAGE_ASPECT_MEMORY_PLANE_1_BIT_EXT: + return 1; + + case VK_IMAGE_ASPECT_PLANE_2_BIT: + case VK_IMAGE_ASPECT_MEMORY_PLANE_2_BIT_EXT: + return 2; + } +} diff --git a/src/asahi/vulkan/hk_image_view.c b/src/asahi/vulkan/hk_image_view.c new file mode 100644 index 00000000000..5a78224a4fd --- /dev/null +++ b/src/asahi/vulkan/hk_image_view.c @@ -0,0 +1,653 @@ +/* + * Copyright 2024 Valve Corporation + * Copyright 2024 Alyssa Rosenzweig + * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc. 
+ * SPDX-License-Identifier: MIT + */ +#include "hk_image_view.h" +#include "util/format/u_format.h" +#include "vulkan/vulkan_core.h" + +#include "agx_helpers.h" +#include "agx_nir_passes.h" +#include "agx_pack.h" +#include "hk_device.h" +#include "hk_entrypoints.h" +#include "hk_image.h" +#include "hk_physical_device.h" + +#include "layout.h" +#include "vk_format.h" + +enum hk_desc_usage { + HK_DESC_USAGE_SAMPLED, + HK_DESC_USAGE_STORAGE, + HK_DESC_USAGE_INPUT, + HK_DESC_USAGE_BG_EOT, + HK_DESC_USAGE_LAYERED_BG_EOT, + HK_DESC_USAGE_EMRT, +}; + +static bool +hk_image_view_type_is_array(VkImageViewType view_type) +{ + switch (view_type) { + case VK_IMAGE_VIEW_TYPE_1D: + case VK_IMAGE_VIEW_TYPE_2D: + case VK_IMAGE_VIEW_TYPE_3D: + case VK_IMAGE_VIEW_TYPE_CUBE: + return false; + + case VK_IMAGE_VIEW_TYPE_1D_ARRAY: + case VK_IMAGE_VIEW_TYPE_2D_ARRAY: + case VK_IMAGE_VIEW_TYPE_CUBE_ARRAY: + return true; + + default: + unreachable("Invalid image view type"); + } +} + +static enum agx_texture_dimension +translate_image_view_type(VkImageViewType view_type, bool msaa, bool layered, + enum hk_desc_usage usage) +{ + if (usage == HK_DESC_USAGE_EMRT || usage == HK_DESC_USAGE_INPUT || + (usage == HK_DESC_USAGE_LAYERED_BG_EOT && layered)) { + return msaa ? AGX_TEXTURE_DIMENSION_2D_ARRAY_MULTISAMPLED + : AGX_TEXTURE_DIMENSION_2D_ARRAY; + } + + /* For background/EOT, we ignore the application-provided view type */ + if (usage == HK_DESC_USAGE_BG_EOT || usage == HK_DESC_USAGE_LAYERED_BG_EOT) { + return msaa ? AGX_TEXTURE_DIMENSION_2D_MULTISAMPLED + : AGX_TEXTURE_DIMENSION_2D; + } + + bool cubes_to_2d = usage != HK_DESC_USAGE_SAMPLED; + + switch (view_type) { + case VK_IMAGE_VIEW_TYPE_1D: + case VK_IMAGE_VIEW_TYPE_2D: + return msaa ? AGX_TEXTURE_DIMENSION_2D_MULTISAMPLED + : AGX_TEXTURE_DIMENSION_2D; + + case VK_IMAGE_VIEW_TYPE_1D_ARRAY: + case VK_IMAGE_VIEW_TYPE_2D_ARRAY: + return msaa ? AGX_TEXTURE_DIMENSION_2D_ARRAY_MULTISAMPLED + : AGX_TEXTURE_DIMENSION_2D_ARRAY; + + case VK_IMAGE_VIEW_TYPE_3D: + assert(!msaa); + return AGX_TEXTURE_DIMENSION_3D; + + case VK_IMAGE_VIEW_TYPE_CUBE: + assert(!msaa); + return cubes_to_2d ? AGX_TEXTURE_DIMENSION_2D_ARRAY + : AGX_TEXTURE_DIMENSION_CUBE; + + case VK_IMAGE_VIEW_TYPE_CUBE_ARRAY: + assert(!msaa); + return cubes_to_2d ? 
AGX_TEXTURE_DIMENSION_2D_ARRAY + : AGX_TEXTURE_DIMENSION_CUBE_ARRAY; + + default: + unreachable("Invalid image view type"); + } +} + +static enum pipe_swizzle +vk_swizzle_to_pipe(VkComponentSwizzle swizzle) +{ + switch (swizzle) { + case VK_COMPONENT_SWIZZLE_R: + return PIPE_SWIZZLE_X; + case VK_COMPONENT_SWIZZLE_G: + return PIPE_SWIZZLE_Y; + case VK_COMPONENT_SWIZZLE_B: + return PIPE_SWIZZLE_Z; + case VK_COMPONENT_SWIZZLE_A: + return PIPE_SWIZZLE_W; + case VK_COMPONENT_SWIZZLE_ONE: + return PIPE_SWIZZLE_1; + case VK_COMPONENT_SWIZZLE_ZERO: + return PIPE_SWIZZLE_0; + default: + unreachable("Invalid component swizzle"); + } +} + +static enum pipe_format +get_stencil_format(enum pipe_format format) +{ + switch (format) { + case PIPE_FORMAT_S8_UINT: + return PIPE_FORMAT_S8_UINT; + case PIPE_FORMAT_Z24_UNORM_S8_UINT: + return PIPE_FORMAT_X24S8_UINT; + case PIPE_FORMAT_S8_UINT_Z24_UNORM: + return PIPE_FORMAT_S8X24_UINT; + case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT: + return PIPE_FORMAT_X32_S8X24_UINT; + default: + unreachable("Unsupported depth/stencil format"); + } +} + +struct hk_3d { + unsigned x, y, z; +}; + +static struct hk_3d +view_denominator(struct hk_image_view *view) +{ + enum pipe_format view_format = vk_format_to_pipe_format(view->vk.format); + enum pipe_format img_format = + vk_format_to_pipe_format(view->vk.image->format); + + if (util_format_is_compressed(view_format)) { + /* + * We can do an uncompressed view of a compressed image but not the other + * way around. + */ + assert(util_format_is_compressed(img_format)); + assert(util_format_get_blockwidth(img_format) == + util_format_get_blockwidth(view_format)); + assert(util_format_get_blockheight(img_format) == + util_format_get_blockheight(view_format)); + assert(util_format_get_blockdepth(img_format) == + util_format_get_blockdepth(view_format)); + + return (struct hk_3d){1, 1, 1}; + } + + if (!util_format_is_compressed(img_format)) { + /* Both formats uncompressed */ + return (struct hk_3d){1, 1, 1}; + } + + /* Else, img is compressed but view is not */ + return (struct hk_3d){ + util_format_get_blockwidth(img_format), + util_format_get_blockheight(img_format), + util_format_get_blockdepth(img_format), + }; +} + +static enum pipe_format +format_for_plane(struct hk_image_view *view, unsigned view_plane) +{ + const struct vk_format_ycbcr_info *ycbcr_info = + vk_format_get_ycbcr_info(view->vk.format); + + assert(ycbcr_info || view_plane == 0); + VkFormat plane_format = + ycbcr_info ? 
ycbcr_info->planes[view_plane].format : view->vk.format; + + enum pipe_format p_format = vk_format_to_pipe_format(plane_format); + if (view->vk.aspects == VK_IMAGE_ASPECT_STENCIL_BIT) + p_format = get_stencil_format(p_format); + + return p_format; +} + +static void +pack_texture(struct hk_image_view *view, unsigned view_plane, + enum hk_desc_usage usage, struct agx_texture_packed *out) +{ + struct hk_image *image = container_of(view->vk.image, struct hk_image, vk); + const uint8_t image_plane = view->planes[view_plane].image_plane; + struct ail_layout *layout = &image->planes[image_plane].layout; + uint64_t base_addr = hk_image_base_address(image, image_plane); + + bool cubes_to_2d = usage != HK_DESC_USAGE_SAMPLED; + + unsigned level = view->vk.base_mip_level; + unsigned layer = view->vk.base_array_layer; + + enum pipe_format p_format = format_for_plane(view, view_plane); + const struct util_format_description *desc = + util_format_description(p_format); + + struct hk_3d denom = view_denominator(view); + + uint8_t format_swizzle[4] = { + desc->swizzle[0], + desc->swizzle[1], + desc->swizzle[2], + desc->swizzle[3], + }; + + /* Different APIs have different depth/stencil swizzle rules. Vulkan expects + * R001 behaviour, override here because Mesa's format table is not that. + */ + if (util_format_is_depth_or_stencil(p_format)) { + format_swizzle[0] = PIPE_SWIZZLE_X; + format_swizzle[1] = PIPE_SWIZZLE_0; + format_swizzle[2] = PIPE_SWIZZLE_0; + format_swizzle[3] = PIPE_SWIZZLE_1; + } + + /* We only have a single swizzle for the user swizzle and the format + * fixup, so compose them now. + */ + uint8_t out_swizzle[4]; + uint8_t view_swizzle[4] = { + vk_swizzle_to_pipe(view->vk.swizzle.r), + vk_swizzle_to_pipe(view->vk.swizzle.g), + vk_swizzle_to_pipe(view->vk.swizzle.b), + vk_swizzle_to_pipe(view->vk.swizzle.a), + }; + + unsigned layers = view->vk.layer_count; + if (view->vk.view_type == VK_IMAGE_VIEW_TYPE_3D) { + layers = DIV_ROUND_UP(layout->depth_px, denom.z); + } else if (!cubes_to_2d && + (view->vk.view_type == VK_IMAGE_VIEW_TYPE_CUBE || + view->vk.view_type == VK_IMAGE_VIEW_TYPE_CUBE_ARRAY)) { + + layers /= 6; + } + + util_format_compose_swizzles(format_swizzle, view_swizzle, out_swizzle); + + agx_pack(out, TEXTURE, cfg) { + cfg.dimension = translate_image_view_type( + view->vk.view_type, view->vk.image->samples > 1, layers > 1, usage); + cfg.layout = agx_translate_layout(layout->tiling); + cfg.channels = agx_pixel_format[p_format].channels; + cfg.type = agx_pixel_format[p_format].type; + cfg.srgb = util_format_is_srgb(p_format); + + cfg.swizzle_r = agx_channel_from_pipe(out_swizzle[0]); + cfg.swizzle_g = agx_channel_from_pipe(out_swizzle[1]); + cfg.swizzle_b = agx_channel_from_pipe(out_swizzle[2]); + cfg.swizzle_a = agx_channel_from_pipe(out_swizzle[3]); + + if (denom.x > 1) { + assert(view->vk.level_count == 1); + assert(view->vk.layer_count == 1); + + cfg.address = base_addr + ail_get_layer_level_B(layout, layer, level); + cfg.width = DIV_ROUND_UP(u_minify(layout->width_px, level), denom.x); + cfg.height = DIV_ROUND_UP(u_minify(layout->height_px, level), denom.y); + cfg.first_level = 0; + cfg.last_level = 1; + } else { + cfg.address = base_addr + ail_get_layer_offset_B(layout, layer); + cfg.width = layout->width_px; + cfg.height = layout->height_px; + cfg.first_level = level; + cfg.last_level = level + view->vk.level_count - 1; + } + + cfg.srgb = (desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB); + cfg.unk_mipmapped = layout->levels > 1; + cfg.srgb_2_channel = cfg.srgb && 
util_format_colormask(desc) == 0x3; + + if (ail_is_compressed(layout)) { + cfg.compressed_1 = true; + cfg.extended = true; + } + + if (ail_is_compressed(layout)) { + cfg.acceleration_buffer = base_addr + layout->metadata_offset_B + + (layer * layout->compression_layer_stride_B); + } + + if (layout->tiling == AIL_TILING_LINEAR && + (hk_image_view_type_is_array(view->vk.view_type))) { + + cfg.depth_linear = layers; + cfg.layer_stride_linear = layout->layer_stride_B - 0x80; + cfg.extended = true; + } else { + assert((layout->tiling != AIL_TILING_LINEAR) || (layers == 1)); + cfg.depth = layers; + } + + if (view->vk.image->samples > 1) { + cfg.samples = agx_translate_sample_count(view->vk.image->samples); + } + + if (layout->tiling == AIL_TILING_LINEAR) { + cfg.stride = ail_get_linear_stride_B(layout, 0) - 16; + } else { + assert(layout->tiling == AIL_TILING_TWIDDLED || + layout->tiling == AIL_TILING_TWIDDLED_COMPRESSED); + + cfg.page_aligned_layers = layout->page_aligned_layers; + } + } +} + +static void +pack_pbe(struct hk_device *dev, struct hk_image_view *view, unsigned view_plane, + enum hk_desc_usage usage, struct agx_pbe_packed *out) +{ + struct hk_image *image = container_of(view->vk.image, struct hk_image, vk); + const uint8_t image_plane = view->planes[view_plane].image_plane; + struct ail_layout *layout = &image->planes[image_plane].layout; + uint64_t base_addr = hk_image_base_address(image, image_plane); + + unsigned level = view->vk.base_mip_level; + unsigned layer = view->vk.base_array_layer; + + enum pipe_format p_format = format_for_plane(view, view_plane); + const struct util_format_description *desc = + util_format_description(p_format); + + bool eot = + usage == HK_DESC_USAGE_BG_EOT || usage == HK_DESC_USAGE_LAYERED_BG_EOT; + + /* The tilebuffer is already in sRGB space if needed. Do not convert for + * end-of-tile descriptors. + */ + if (eot) + p_format = util_format_linear(p_format); + + bool msaa = view->vk.image->samples > 1; + struct hk_3d denom = view_denominator(view); + + unsigned layers = view->vk.view_type == VK_IMAGE_VIEW_TYPE_3D + ? image->vk.extent.depth + : view->vk.layer_count; + + agx_pack(out, PBE, cfg) { + cfg.dimension = + translate_image_view_type(view->vk.view_type, msaa, layers > 1, usage); + cfg.layout = agx_translate_layout(layout->tiling); + cfg.channels = agx_pixel_format[p_format].channels; + cfg.type = agx_pixel_format[p_format].type; + cfg.srgb = util_format_is_srgb(p_format); + + assert(desc->nr_channels >= 1 && desc->nr_channels <= 4); + + for (unsigned i = 0; i < desc->nr_channels; ++i) { + if (desc->swizzle[i] == 0) + cfg.swizzle_r = i; + else if (desc->swizzle[i] == 1) + cfg.swizzle_g = i; + else if (desc->swizzle[i] == 2) + cfg.swizzle_b = i; + else if (desc->swizzle[i] == 3) + cfg.swizzle_a = i; + } + + cfg.buffer = base_addr + ail_get_layer_offset_B(layout, layer); + cfg.unk_mipmapped = layout->levels > 1; + + if (msaa & !eot) { + /* Multisampled images are bound like buffer textures, with + * addressing arithmetic to determine the texel to write. + * + * Note that the end-of-tile program uses real multisample images + * with image_write_block instructions. 
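+ * The descriptor below is therefore a linear 2D view over the layer's raw texels: the width is fixed at AGX_TEXTURE_BUFFER_WIDTH and the height is the remaining texel count divided by that width, rounded up.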
+ */ + unsigned blocksize_B = util_format_get_blocksize(p_format); + unsigned size_px = + (layout->size_B - layout->layer_stride_B * layer) / blocksize_B; + + cfg.dimension = AGX_TEXTURE_DIMENSION_2D; + cfg.layout = AGX_LAYOUT_LINEAR; + cfg.width = AGX_TEXTURE_BUFFER_WIDTH; + cfg.height = DIV_ROUND_UP(size_px, cfg.width); + cfg.stride = (cfg.width * blocksize_B) - 4; + cfg.layers = 1; + cfg.levels = 1; + + cfg.buffer += layout->level_offsets_B[level]; + cfg.level = 0; + } else { + if (denom.x > 1) { + assert(denom.z == 1 && "todo how to handle?"); + assert(view->vk.level_count == 1); + assert(view->vk.layer_count == 1); + + cfg.buffer = + base_addr + ail_get_layer_level_B(layout, layer, level); + cfg.width = + DIV_ROUND_UP(u_minify(layout->width_px, level), denom.x); + cfg.height = + DIV_ROUND_UP(u_minify(layout->height_px, level), denom.y); + cfg.level = 0; + } else { + cfg.buffer = base_addr + ail_get_layer_offset_B(layout, layer); + cfg.width = layout->width_px; + cfg.height = layout->height_px; + cfg.level = level; + } + + if (layout->tiling == AIL_TILING_LINEAR && + (hk_image_view_type_is_array(view->vk.view_type))) { + + cfg.depth_linear = layers; + cfg.layer_stride_linear = (layout->layer_stride_B - 0x80); + cfg.extended = true; + } else { + assert((layout->tiling != AIL_TILING_LINEAR) || (layers == 1)); + cfg.layers = layers; + } + + cfg.levels = image->vk.mip_levels; + + if (layout->tiling == AIL_TILING_LINEAR) { + cfg.stride = ail_get_linear_stride_B(layout, level) - 4; + assert(cfg.levels == 1); + } else { + cfg.page_aligned_layers = layout->page_aligned_layers; + } + + if (image->vk.samples > 1) + cfg.samples = agx_translate_sample_count(image->vk.samples); + } + + if (ail_is_compressed(layout)) { + cfg.compressed_1 = true; + cfg.extended = true; + + cfg.acceleration_buffer = base_addr + layout->metadata_offset_B + + (layer * layout->compression_layer_stride_B); + } + + /* When the descriptor isn't extended architecturally, we use + * the last 8 bytes as a sideband to accelerate image atomics. 
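+ * The *_sw fields packed below (tile dimensions, level offset or MSAA-aligned width, sample count, layer stride) are that sideband; the image-atomic lowering reads them to compute texel addresses in software.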
+ */ + if (!cfg.extended && layout->writeable_image) { + if (msaa) { + assert(denom.x == 1 && "no MSAA of block-compressed"); + + cfg.aligned_width_msaa_sw = + align(u_minify(layout->width_px, level), + layout->tilesize_el[level].width_el); + } else { + cfg.level_offset_sw = ail_get_level_offset_B(layout, cfg.level); + } + + cfg.sample_count_log2_sw = util_logbase2(image->vk.samples); + + if (layout->tiling == AIL_TILING_TWIDDLED) { + struct ail_tile tile_size = layout->tilesize_el[level]; + cfg.tile_width_sw = tile_size.width_el; + cfg.tile_height_sw = tile_size.height_el; + + cfg.layer_stride_sw = layout->layer_stride_B; + } + } + }; +} + +static VkResult +add_descriptor(struct hk_device *dev, struct hk_image_view *view, + struct agx_texture_packed *desc, + struct agx_texture_packed *cached, uint32_t *index) +{ + /* First, look for a descriptor we already uploaded */ + for (unsigned i = 0; i < view->descriptor_count; ++i) { + if (memcmp(&cached[i], desc, sizeof *desc) == 0) { + *index = view->descriptor_index[i]; + return VK_SUCCESS; + } + } + + /* Else, add a new descriptor */ + VkResult result = + hk_descriptor_table_add(dev, &dev->images, desc, sizeof *desc, index); + if (result != VK_SUCCESS) + return result; + + uint32_t local_index = view->descriptor_count++; + assert(local_index < HK_MAX_IMAGE_DESCS); + + cached[local_index] = *desc; + view->descriptor_index[local_index] = *index; + return VK_SUCCESS; +} + +static VkResult +hk_image_view_init(struct hk_device *dev, struct hk_image_view *view, + bool driver_internal, + const VkImageViewCreateInfo *pCreateInfo) +{ + VK_FROM_HANDLE(hk_image, image, pCreateInfo->image); + VkResult result; + + memset(view, 0, sizeof(*view)); + + vk_image_view_init(&dev->vk, &view->vk, driver_internal, pCreateInfo); + + /* First, figure out which image planes we need. For depth/stencil, we only + * have one aspect viewed at a time. 
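+ * hk_image_aspects_to_plane() maps that single aspect to the correct image plane (e.g. plane 1 for the stencil aspect of VK_FORMAT_D32_SFLOAT_S8_UINT).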
+ */ + if (image->vk.aspects & + (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) { + + view->plane_count = 1; + view->planes[0].image_plane = + hk_image_aspects_to_plane(image, view->vk.aspects); + } else { + /* For other formats, retrieve the plane count from the aspect mask + * and then walk through the aspect mask to map each image plane + * to its corresponding view plane + */ + assert(util_bitcount(view->vk.aspects) == + vk_format_get_plane_count(view->vk.format)); + view->plane_count = 0; + u_foreach_bit(aspect_bit, view->vk.aspects) { + uint8_t image_plane = + hk_image_aspects_to_plane(image, 1u << aspect_bit); + view->planes[view->plane_count++].image_plane = image_plane; + } + } + + struct agx_texture_packed cached[HK_MAX_IMAGE_DESCS]; + + /* Finally, fill in each view plane separately */ + for (unsigned view_plane = 0; view_plane < view->plane_count; view_plane++) { + const struct { + VkImageUsageFlagBits flag; + enum hk_desc_usage usage; + uint32_t *tex; + uint32_t *pbe; + } descriptors[] = { + {VK_IMAGE_USAGE_SAMPLED_BIT, HK_DESC_USAGE_SAMPLED, + &view->planes[view_plane].sampled_desc_index}, + + {VK_IMAGE_USAGE_STORAGE_BIT, HK_DESC_USAGE_STORAGE, + &view->planes[view_plane].ro_storage_desc_index, + &view->planes[view_plane].storage_desc_index}, + + {VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT, HK_DESC_USAGE_INPUT, + &view->planes[view_plane].ia_desc_index}, + + {VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT, HK_DESC_USAGE_BG_EOT, + &view->planes[view_plane].background_desc_index, + &view->planes[view_plane].eot_pbe_desc_index}, + + {VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT, HK_DESC_USAGE_LAYERED_BG_EOT, + &view->planes[view_plane].layered_background_desc_index, + &view->planes[view_plane].layered_eot_pbe_desc_index}, + }; + + for (unsigned i = 0; i < ARRAY_SIZE(descriptors); ++i) { + if (!(view->vk.usage & descriptors[i].flag)) + continue; + + for (unsigned is_pbe = 0; is_pbe < 2; ++is_pbe) { + struct agx_texture_packed desc; + uint32_t *out = is_pbe ? 
descriptors[i].pbe : descriptors[i].tex; + + if (!out) + continue; + + if (is_pbe) { + static_assert(sizeof(struct agx_pbe_packed) == + sizeof(struct agx_texture_packed)); + + pack_pbe(dev, view, view_plane, descriptors[i].usage, + (struct agx_pbe_packed *)&desc); + } else { + pack_texture(view, view_plane, descriptors[i].usage, &desc); + } + + result = add_descriptor(dev, view, &desc, cached, out); + if (result != VK_SUCCESS) + return result; + } + } + + if (view->vk.usage & VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT) { + pack_texture(view, view_plane, HK_DESC_USAGE_EMRT, + &view->planes[view_plane].emrt_texture); + + pack_pbe(dev, view, view_plane, HK_DESC_USAGE_EMRT, + &view->planes[view_plane].emrt_pbe); + } + } + + return VK_SUCCESS; +} + +VKAPI_ATTR void VKAPI_CALL +hk_DestroyImageView(VkDevice _device, VkImageView imageView, + const VkAllocationCallbacks *pAllocator) +{ + VK_FROM_HANDLE(hk_device, dev, _device); + VK_FROM_HANDLE(hk_image_view, view, imageView); + + if (!view) + return; + + for (uint8_t d = 0; d < view->descriptor_count; ++d) { + hk_descriptor_table_remove(dev, &dev->images, view->descriptor_index[d]); + } + + vk_image_view_finish(&view->vk); + vk_free2(&dev->vk.alloc, pAllocator, view); +} + +VKAPI_ATTR VkResult VKAPI_CALL +hk_CreateImageView(VkDevice _device, const VkImageViewCreateInfo *pCreateInfo, + const VkAllocationCallbacks *pAllocator, VkImageView *pView) +{ + VK_FROM_HANDLE(hk_device, dev, _device); + struct hk_image_view *view; + VkResult result; + + view = vk_alloc2(&dev->vk.alloc, pAllocator, sizeof(*view), 8, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + if (!view) + return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY); + + result = hk_image_view_init( + dev, view, pCreateInfo->flags & VK_IMAGE_VIEW_CREATE_INTERNAL_MESA, + pCreateInfo); + if (result != VK_SUCCESS) { + hk_DestroyImageView(_device, hk_image_view_to_handle(view), pAllocator); + return result; + } + + *pView = hk_image_view_to_handle(view); + + return VK_SUCCESS; +} diff --git a/src/asahi/vulkan/hk_image_view.h b/src/asahi/vulkan/hk_image_view.h new file mode 100644 index 00000000000..4a5c7c79fb7 --- /dev/null +++ b/src/asahi/vulkan/hk_image_view.h @@ -0,0 +1,66 @@ +/* + * Copyright 2024 Valve Corporation + * Copyright 2024 Alyssa Rosenzweig + * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc. + * SPDX-License-Identifier: MIT + */ + +#pragma once + +#include "agx_pack.h" +#include "hk_private.h" +#include "vk_image.h" + +struct hk_device; + +#define HK_MAX_PLANES 3 +#define HK_MAX_IMAGE_DESCS (10 * HK_MAX_PLANES) + +struct hk_image_view { + struct vk_image_view vk; + + uint32_t descriptor_index[HK_MAX_IMAGE_DESCS]; + uint8_t descriptor_count; + + uint8_t plane_count; + struct { + uint8_t image_plane; + + /** Descriptors used for eMRT. We delay upload since we want them + * contiguous in memory, although this could be reworked if we wanted. + */ + struct agx_texture_packed emrt_texture; + struct agx_pbe_packed emrt_pbe; + + /** Index in the image descriptor table for the sampled image descriptor */ + uint32_t sampled_desc_index; + + /** Index in the image descriptor table for the storage image descriptor */ + uint32_t storage_desc_index; + + /** Index in the image descriptor table for the readonly storage image + * descriptor. + */ + uint32_t ro_storage_desc_index; + + /** Index in the image descriptor table for the texture descriptor used + * for background programs. 
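+ * Separate indices are kept for the single-layer and layered variants.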
+ */ + uint32_t background_desc_index; + uint32_t layered_background_desc_index; + + /** Index in the image descriptor table for the texture descriptor used + * for input attachments. + */ + uint32_t ia_desc_index; + + /** Index in the image descriptor table for the PBE descriptor used for + * end-of-tile programs. + */ + uint32_t eot_pbe_desc_index; + uint32_t layered_eot_pbe_desc_index; + } planes[3]; +}; + +VK_DEFINE_NONDISP_HANDLE_CASTS(hk_image_view, vk.base, VkImageView, + VK_OBJECT_TYPE_IMAGE_VIEW) diff --git a/src/asahi/vulkan/hk_instance.c b/src/asahi/vulkan/hk_instance.c new file mode 100644 index 00000000000..fdf113f0edf --- /dev/null +++ b/src/asahi/vulkan/hk_instance.c @@ -0,0 +1,196 @@ +/* + * Copyright 2024 Valve Corporation + * Copyright 2024 Alyssa Rosenzweig + * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc. + * SPDX-License-Identifier: MIT + */ +#include "hk_instance.h" + +#include "hk_entrypoints.h" +#include "hk_physical_device.h" + +#include "vulkan/wsi/wsi_common.h" + +#include "util/build_id.h" +#include "util/driconf.h" +#include "util/mesa-sha1.h" + +VKAPI_ATTR VkResult VKAPI_CALL +hk_EnumerateInstanceVersion(uint32_t *pApiVersion) +{ + uint32_t version_override = vk_get_version_override(); + *pApiVersion = version_override ? version_override + : VK_MAKE_VERSION(1, 3, VK_HEADER_VERSION); + + return VK_SUCCESS; +} + +static const struct vk_instance_extension_table instance_extensions = { +#ifdef HK_USE_WSI_PLATFORM + .KHR_get_surface_capabilities2 = true, + .KHR_surface = true, + .KHR_surface_protected_capabilities = true, + .EXT_surface_maintenance1 = true, + .EXT_swapchain_colorspace = true, +#endif +#ifdef VK_USE_PLATFORM_WAYLAND_KHR + .KHR_wayland_surface = true, +#endif +#ifdef VK_USE_PLATFORM_XCB_KHR + .KHR_xcb_surface = true, +#endif +#ifdef VK_USE_PLATFORM_XLIB_KHR + .KHR_xlib_surface = true, +#endif +#ifdef VK_USE_PLATFORM_XLIB_XRANDR_EXT + .EXT_acquire_xlib_display = true, +#endif +#ifdef VK_USE_PLATFORM_DISPLAY_KHR + .KHR_display = true, + .KHR_get_display_properties2 = true, + .EXT_direct_mode_display = true, + .EXT_display_surface_counter = true, + .EXT_acquire_drm_display = true, +#endif +#ifndef VK_USE_PLATFORM_WIN32_KHR + .EXT_headless_surface = true, +#endif + .KHR_device_group_creation = true, + .KHR_external_fence_capabilities = true, + .KHR_external_memory_capabilities = true, + .KHR_external_semaphore_capabilities = true, + .KHR_get_physical_device_properties2 = true, + .EXT_debug_report = true, + .EXT_debug_utils = true, +}; + +VKAPI_ATTR VkResult VKAPI_CALL +hk_EnumerateInstanceExtensionProperties(const char *pLayerName, + uint32_t *pPropertyCount, + VkExtensionProperties *pProperties) +{ + if (pLayerName) + return vk_error(NULL, VK_ERROR_LAYER_NOT_PRESENT); + + return vk_enumerate_instance_extension_properties( + &instance_extensions, pPropertyCount, pProperties); +} + +static const driOptionDescription hk_dri_options[] = { + DRI_CONF_SECTION_PERFORMANCE DRI_CONF_ADAPTIVE_SYNC(true) + DRI_CONF_VK_X11_OVERRIDE_MIN_IMAGE_COUNT(0) + DRI_CONF_VK_X11_STRICT_IMAGE_COUNT(false) + DRI_CONF_VK_X11_ENSURE_MIN_IMAGE_COUNT(false) + DRI_CONF_VK_KHR_PRESENT_WAIT(false) + DRI_CONF_VK_XWAYLAND_WAIT_READY(false) DRI_CONF_SECTION_END + + DRI_CONF_SECTION_DEBUG DRI_CONF_FORCE_VK_VENDOR() + DRI_CONF_VK_WSI_FORCE_SWAPCHAIN_TO_CURRENT_EXTENT(false) + DRI_CONF_VK_X11_IGNORE_SUBOPTIMAL(false) + DRI_CONF_SECTION_END}; + +static void +hk_init_dri_options(struct hk_instance *instance) +{ + driParseOptionInfo(&instance->available_dri_options, hk_dri_options, + 
ARRAY_SIZE(hk_dri_options)); + driParseConfigFiles( + &instance->dri_options, &instance->available_dri_options, 0, "hk", NULL, + NULL, instance->vk.app_info.app_name, instance->vk.app_info.app_version, + instance->vk.app_info.engine_name, instance->vk.app_info.engine_version); + + instance->force_vk_vendor = + driQueryOptioni(&instance->dri_options, "force_vk_vendor"); +} + +VKAPI_ATTR VkResult VKAPI_CALL +hk_CreateInstance(const VkInstanceCreateInfo *pCreateInfo, + const VkAllocationCallbacks *pAllocator, + VkInstance *pInstance) +{ + struct hk_instance *instance; + VkResult result; + + if (pAllocator == NULL) + pAllocator = vk_default_allocator(); + + instance = vk_alloc(pAllocator, sizeof(*instance), 8, + VK_SYSTEM_ALLOCATION_SCOPE_INSTANCE); + if (!instance) + return vk_error(NULL, VK_ERROR_OUT_OF_HOST_MEMORY); + + struct vk_instance_dispatch_table dispatch_table; + vk_instance_dispatch_table_from_entrypoints(&dispatch_table, + &hk_instance_entrypoints, true); + vk_instance_dispatch_table_from_entrypoints( + &dispatch_table, &wsi_instance_entrypoints, false); + + result = vk_instance_init(&instance->vk, &instance_extensions, + &dispatch_table, pCreateInfo, pAllocator); + if (result != VK_SUCCESS) + goto fail_alloc; + + hk_init_dri_options(instance); + + instance->vk.physical_devices.try_create_for_drm = + hk_create_drm_physical_device; + instance->vk.physical_devices.destroy = hk_physical_device_destroy; + + const struct build_id_note *note = + build_id_find_nhdr_for_addr(hk_CreateInstance); + if (!note) { + result = vk_errorf(NULL, VK_ERROR_INITIALIZATION_FAILED, + "Failed to find build-id"); + goto fail_init; + } + + unsigned build_id_len = build_id_length(note); + if (build_id_len < SHA1_DIGEST_LENGTH) { + result = vk_errorf(NULL, VK_ERROR_INITIALIZATION_FAILED, + "build-id too short. It needs to be a SHA"); + goto fail_init; + } + + static_assert(sizeof(instance->driver_build_sha) == SHA1_DIGEST_LENGTH); + memcpy(instance->driver_build_sha, build_id_data(note), SHA1_DIGEST_LENGTH); + + *pInstance = hk_instance_to_handle(instance); + return VK_SUCCESS; + +fail_init: + vk_instance_finish(&instance->vk); +fail_alloc: + vk_free(pAllocator, instance); + + return result; +} + +VKAPI_ATTR void VKAPI_CALL +hk_DestroyInstance(VkInstance _instance, + const VkAllocationCallbacks *pAllocator) +{ + VK_FROM_HANDLE(hk_instance, instance, _instance); + + if (!instance) + return; + + driDestroyOptionCache(&instance->dri_options); + driDestroyOptionInfo(&instance->available_dri_options); + + vk_instance_finish(&instance->vk); + vk_free(&instance->vk.alloc, instance); +} + +VKAPI_ATTR PFN_vkVoidFunction VKAPI_CALL +hk_GetInstanceProcAddr(VkInstance _instance, const char *pName) +{ + VK_FROM_HANDLE(hk_instance, instance, _instance); + return vk_instance_get_proc_addr(&instance->vk, &hk_instance_entrypoints, + pName); +} + +PUBLIC VKAPI_ATTR PFN_vkVoidFunction VKAPI_CALL +vk_icdGetInstanceProcAddr(VkInstance instance, const char *pName) +{ + return hk_GetInstanceProcAddr(instance, pName); +} diff --git a/src/asahi/vulkan/hk_instance.h b/src/asahi/vulkan/hk_instance.h new file mode 100644 index 00000000000..d0c0397b02a --- /dev/null +++ b/src/asahi/vulkan/hk_instance.h @@ -0,0 +1,25 @@ +/* + * Copyright 2024 Valve Corporation + * Copyright 2024 Alyssa Rosenzweig + * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc. 
+ * SPDX-License-Identifier: MIT + */ + +#pragma once + +#include "util/xmlconfig.h" +#include "hk_private.h" +#include "vk_instance.h" + +struct hk_instance { + struct vk_instance vk; + + struct driOptionCache dri_options; + struct driOptionCache available_dri_options; + + uint8_t driver_build_sha[20]; + uint32_t force_vk_vendor; +}; + +VK_DEFINE_HANDLE_CASTS(hk_instance, vk.base, VkInstance, + VK_OBJECT_TYPE_INSTANCE) diff --git a/src/asahi/vulkan/hk_nir_lower_descriptors.c b/src/asahi/vulkan/hk_nir_lower_descriptors.c new file mode 100644 index 00000000000..802e184ae5e --- /dev/null +++ b/src/asahi/vulkan/hk_nir_lower_descriptors.c @@ -0,0 +1,867 @@ +/* + * Copyright 2024 Valve Corporation + * Copyright 2024 Alyssa Rosenzweig + * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc. + * SPDX-License-Identifier: MIT + */ +#include "pipe/p_defines.h" +#include "vulkan/vulkan_core.h" +#include "agx_nir_passes.h" +#include "agx_pack.h" +#include "hk_cmd_buffer.h" +#include "hk_descriptor_set.h" +#include "hk_descriptor_set_layout.h" +#include "hk_shader.h" + +#include "nir.h" +#include "nir_builder.h" +#include "nir_builder_opcodes.h" +#include "nir_deref.h" +#include "nir_intrinsics.h" +#include "nir_intrinsics_indices.h" +#include "shader_enums.h" +#include "vk_pipeline.h" + +struct lower_descriptors_ctx { + const struct hk_descriptor_set_layout *set_layouts[HK_MAX_SETS]; + + bool clamp_desc_array_bounds; + nir_address_format ubo_addr_format; + nir_address_format ssbo_addr_format; +}; + +static const struct hk_descriptor_set_binding_layout * +get_binding_layout(uint32_t set, uint32_t binding, + const struct lower_descriptors_ctx *ctx) +{ + assert(set < HK_MAX_SETS); + assert(ctx->set_layouts[set] != NULL); + + const struct hk_descriptor_set_layout *set_layout = ctx->set_layouts[set]; + + assert(binding < set_layout->binding_count); + return &set_layout->binding[binding]; +} + +static nir_def * +load_speculatable(nir_builder *b, unsigned num_components, unsigned bit_size, + nir_def *addr, unsigned align) +{ + return nir_build_load_global_constant(b, num_components, bit_size, addr, + .align_mul = align, + .access = ACCESS_CAN_SPECULATE); +} + +static nir_def * +load_root(nir_builder *b, unsigned num_components, unsigned bit_size, + nir_def *offset, unsigned align) +{ + nir_def *root = nir_load_preamble(b, 1, 64, .base = HK_ROOT_UNIFORM); + + /* We've bound the address of the root descriptor, index in. 
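+ * The root descriptor's address itself comes from the HK_ROOT_UNIFORM
+ * preamble slot loaded above.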
*/ + nir_def *addr = nir_iadd(b, root, nir_u2u64(b, offset)); + + return load_speculatable(b, num_components, bit_size, addr, align); +} + +static bool +lower_load_constant(nir_builder *b, nir_intrinsic_instr *load, + const struct lower_descriptors_ctx *ctx) +{ + assert(load->intrinsic == nir_intrinsic_load_constant); + unreachable("todo: stick an address in the root descriptor or something"); + + uint32_t base = nir_intrinsic_base(load); + uint32_t range = nir_intrinsic_range(load); + + b->cursor = nir_before_instr(&load->instr); + + nir_def *offset = nir_iadd_imm(b, load->src[0].ssa, base); + nir_def *data = nir_load_ubo( + b, load->def.num_components, load->def.bit_size, nir_imm_int(b, 0), + offset, .align_mul = nir_intrinsic_align_mul(load), + .align_offset = nir_intrinsic_align_offset(load), .range_base = base, + .range = range); + + nir_def_rewrite_uses(&load->def, data); + + return true; +} + +static nir_def * +load_descriptor_set_addr(nir_builder *b, uint32_t set, + UNUSED const struct lower_descriptors_ctx *ctx) +{ + uint32_t set_addr_offset = + hk_root_descriptor_offset(sets) + set * sizeof(uint64_t); + + return load_root(b, 1, 64, nir_imm_int(b, set_addr_offset), 8); +} + +static nir_def * +load_dynamic_buffer_start(nir_builder *b, uint32_t set, + const struct lower_descriptors_ctx *ctx) +{ + int dynamic_buffer_start_imm = 0; + for (uint32_t s = 0; s < set; s++) { + if (ctx->set_layouts[s] == NULL) { + dynamic_buffer_start_imm = -1; + break; + } + + dynamic_buffer_start_imm += ctx->set_layouts[s]->dynamic_buffer_count; + } + + if (dynamic_buffer_start_imm >= 0) { + return nir_imm_int(b, dynamic_buffer_start_imm); + } else { + uint32_t root_offset = + hk_root_descriptor_offset(set_dynamic_buffer_start) + set; + + return nir_u2u32(b, load_root(b, 1, 8, nir_imm_int(b, root_offset), 1)); + } +} + +static nir_def * +load_descriptor(nir_builder *b, unsigned num_components, unsigned bit_size, + uint32_t set, uint32_t binding, nir_def *index, + unsigned offset_B, const struct lower_descriptors_ctx *ctx) +{ + const struct hk_descriptor_set_binding_layout *binding_layout = + get_binding_layout(set, binding, ctx); + + if (ctx->clamp_desc_array_bounds) + index = + nir_umin(b, index, nir_imm_int(b, binding_layout->array_size - 1)); + + switch (binding_layout->type) { + case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC: + case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: { + /* Get the index in the root descriptor table dynamic_buffers array. 
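+ * That index is the set's dynamic-buffer start plus the binding's
+ * dynamic_buffer_index plus the caller's array index.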
*/
+ nir_def *dynamic_buffer_start = load_dynamic_buffer_start(b, set, ctx);
+
+ index = nir_iadd(b, index,
+ nir_iadd_imm(b, dynamic_buffer_start,
+ binding_layout->dynamic_buffer_index));
+
+ nir_def *root_desc_offset = nir_iadd_imm(
+ b, nir_imul_imm(b, index, sizeof(struct hk_buffer_address)),
+ hk_root_descriptor_offset(dynamic_buffers));
+
+ assert(num_components == 4 && bit_size == 32);
+ nir_def *desc = load_root(b, 4, 32, root_desc_offset, 16);
+
+ /* We know a priori that the .w component (offset) is zero */
+ return nir_vector_insert_imm(b, desc, nir_imm_int(b, 0), 3);
+ }
+
+ case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK: {
+ nir_def *base_addr = nir_iadd_imm(
+ b, load_descriptor_set_addr(b, set, ctx), binding_layout->offset);
+
+ assert(binding_layout->stride == 1);
+ const uint32_t binding_size = binding_layout->array_size;
+
+ /* Convert it to nir_address_format_64bit_bounded_global */
+ assert(num_components == 4 && bit_size == 32);
+ return nir_vec4(b, nir_unpack_64_2x32_split_x(b, base_addr),
+ nir_unpack_64_2x32_split_y(b, base_addr),
+ nir_imm_int(b, binding_size), nir_imm_int(b, 0));
+ }
+
+ default: {
+ assert(binding_layout->stride > 0);
+ nir_def *desc_ubo_offset =
+ nir_iadd_imm(b, nir_imul_imm(b, index, binding_layout->stride),
+ binding_layout->offset + offset_B);
+
+ unsigned desc_align_mul = (1 << (ffs(binding_layout->stride) - 1));
+ desc_align_mul = MIN2(desc_align_mul, 16);
+ unsigned desc_align_offset = binding_layout->offset + offset_B;
+ desc_align_offset %= desc_align_mul;
+
+ nir_def *desc;
+ nir_def *set_addr = load_descriptor_set_addr(b, set, ctx);
+ desc = nir_load_global_constant_offset(
+ b, num_components, bit_size, set_addr, desc_ubo_offset,
+ .align_mul = desc_align_mul, .align_offset = desc_align_offset,
+ .access = ACCESS_CAN_SPECULATE);
+
+ if (binding_layout->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER ||
+ binding_layout->type == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER) {
+ /* We know a priori that the .w component (offset) is zero */
+ assert(num_components == 4 && bit_size == 32);
+ desc = nir_vector_insert_imm(b, desc, nir_imm_int(b, 0), 3);
+ }
+ return desc;
+ }
+ }
+}
+
+static bool
+is_idx_intrin(nir_intrinsic_instr *intrin)
+{
+ while (intrin->intrinsic == nir_intrinsic_vulkan_resource_reindex) {
+ intrin = nir_src_as_intrinsic(intrin->src[0]);
+ if (intrin == NULL)
+ return false;
+ }
+
+ return intrin->intrinsic == nir_intrinsic_vulkan_resource_index;
+}
+
+static nir_def *
+load_descriptor_for_idx_intrin(nir_builder *b, nir_intrinsic_instr *intrin,
+ const struct lower_descriptors_ctx *ctx)
+{
+ nir_def *index = nir_imm_int(b, 0);
+
+ while (intrin->intrinsic == nir_intrinsic_vulkan_resource_reindex) {
+ index = nir_iadd(b, index, intrin->src[1].ssa);
+ intrin = nir_src_as_intrinsic(intrin->src[0]);
+ }
+
+ assert(intrin->intrinsic == nir_intrinsic_vulkan_resource_index);
+ uint32_t set = nir_intrinsic_desc_set(intrin);
+ uint32_t binding = nir_intrinsic_binding(intrin);
+ index = nir_iadd(b, index, intrin->src[0].ssa);
+
+ return load_descriptor(b, 4, 32, set, binding, index, 0, ctx);
+}
+
+static bool
+try_lower_load_vulkan_descriptor(nir_builder *b, nir_intrinsic_instr *intrin,
+ const struct lower_descriptors_ctx *ctx)
+{
+ ASSERTED const VkDescriptorType desc_type = nir_intrinsic_desc_type(intrin);
+ b->cursor = nir_before_instr(&intrin->instr);
+
+ nir_intrinsic_instr *idx_intrin = nir_src_as_intrinsic(intrin->src[0]);
+ if (idx_intrin == NULL || !is_idx_intrin(idx_intrin)) {
+ assert(desc_type == 
VK_DESCRIPTOR_TYPE_STORAGE_BUFFER || + desc_type == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC); + return false; + } + + nir_def *desc = load_descriptor_for_idx_intrin(b, idx_intrin, ctx); + + nir_def_rewrite_uses(&intrin->def, desc); + + return true; +} + +static bool +_lower_sysval_to_root_table(nir_builder *b, nir_intrinsic_instr *intrin, + uint32_t root_table_offset) +{ + b->cursor = nir_instr_remove(&intrin->instr); + assert((root_table_offset & 3) == 0 && "aligned"); + + nir_def *val = load_root(b, intrin->def.num_components, intrin->def.bit_size, + nir_imm_int(b, root_table_offset), 4); + + nir_def_rewrite_uses(&intrin->def, val); + + return true; +} + +#define lower_sysval_to_root_table(b, intrin, member) \ + _lower_sysval_to_root_table(b, intrin, hk_root_descriptor_offset(member)) + +static bool +lower_load_push_constant(nir_builder *b, nir_intrinsic_instr *load, + const struct lower_descriptors_ctx *ctx) +{ + const uint32_t push_region_offset = hk_root_descriptor_offset(push); + const uint32_t base = nir_intrinsic_base(load); + + b->cursor = nir_before_instr(&load->instr); + + nir_def *offset = + nir_iadd_imm(b, load->src[0].ssa, push_region_offset + base); + + nir_def *val = load_root(b, load->def.num_components, load->def.bit_size, + offset, load->def.bit_size / 8); + + nir_def_rewrite_uses(&load->def, val); + + return true; +} + +static void +get_resource_deref_binding(nir_builder *b, nir_deref_instr *deref, + uint32_t *set, uint32_t *binding, nir_def **index) +{ + if (deref->deref_type == nir_deref_type_array) { + *index = deref->arr.index.ssa; + deref = nir_deref_instr_parent(deref); + } else { + *index = nir_imm_int(b, 0); + } + + assert(deref->deref_type == nir_deref_type_var); + nir_variable *var = deref->var; + + *set = var->data.descriptor_set; + *binding = var->data.binding; +} + +static nir_def * +load_resource_deref_desc(nir_builder *b, unsigned num_components, + unsigned bit_size, nir_deref_instr *deref, + unsigned offset_B, + const struct lower_descriptors_ctx *ctx) +{ + uint32_t set, binding; + nir_def *index; + get_resource_deref_binding(b, deref, &set, &binding, &index); + return load_descriptor(b, num_components, bit_size, set, binding, index, + offset_B, ctx); +} + +/* + * Returns an AGX bindless handle to access an indexed image within the global + * image heap. + */ +static nir_def * +image_heap_handle(nir_builder *b, nir_def *offset) +{ + return nir_vec2(b, nir_imm_int(b, HK_IMAGE_HEAP_UNIFORM), offset); +} + +static bool +lower_image_intrin(nir_builder *b, nir_intrinsic_instr *intr, + const struct lower_descriptors_ctx *ctx) +{ + b->cursor = nir_before_instr(&intr->instr); + nir_deref_instr *deref = nir_src_as_deref(intr->src[0]); + + /* Reads and queries use the texture descriptor; writes and atomics PBE. 
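+ * Pick the matching heap offset out of the storage image descriptor.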
*/ + unsigned offs; + if (intr->intrinsic != nir_intrinsic_image_deref_load && + intr->intrinsic != nir_intrinsic_image_deref_size && + intr->intrinsic != nir_intrinsic_image_deref_samples) { + + offs = offsetof(struct hk_storage_image_descriptor, pbe_offset); + } else { + offs = offsetof(struct hk_storage_image_descriptor, tex_offset); + } + + nir_def *offset = load_resource_deref_desc(b, 1, 32, deref, offs, ctx); + nir_rewrite_image_intrinsic(intr, image_heap_handle(b, offset), true); + + return true; +} + +static VkQueryPipelineStatisticFlagBits +translate_pipeline_stat_bit(enum pipe_statistics_query_index pipe) +{ + switch (pipe) { + case PIPE_STAT_QUERY_IA_VERTICES: + return VK_QUERY_PIPELINE_STATISTIC_INPUT_ASSEMBLY_VERTICES_BIT; + case PIPE_STAT_QUERY_IA_PRIMITIVES: + return VK_QUERY_PIPELINE_STATISTIC_INPUT_ASSEMBLY_PRIMITIVES_BIT; + case PIPE_STAT_QUERY_VS_INVOCATIONS: + return VK_QUERY_PIPELINE_STATISTIC_VERTEX_SHADER_INVOCATIONS_BIT; + case PIPE_STAT_QUERY_GS_INVOCATIONS: + return VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_INVOCATIONS_BIT; + case PIPE_STAT_QUERY_GS_PRIMITIVES: + return VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT; + case PIPE_STAT_QUERY_C_INVOCATIONS: + return VK_QUERY_PIPELINE_STATISTIC_CLIPPING_INVOCATIONS_BIT; + case PIPE_STAT_QUERY_C_PRIMITIVES: + return VK_QUERY_PIPELINE_STATISTIC_CLIPPING_PRIMITIVES_BIT; + case PIPE_STAT_QUERY_PS_INVOCATIONS: + return VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT; + case PIPE_STAT_QUERY_HS_INVOCATIONS: + return VK_QUERY_PIPELINE_STATISTIC_TESSELLATION_CONTROL_SHADER_PATCHES_BIT; + case PIPE_STAT_QUERY_DS_INVOCATIONS: + return VK_QUERY_PIPELINE_STATISTIC_TESSELLATION_EVALUATION_SHADER_INVOCATIONS_BIT; + case PIPE_STAT_QUERY_CS_INVOCATIONS: + return VK_QUERY_PIPELINE_STATISTIC_COMPUTE_SHADER_INVOCATIONS_BIT; + case PIPE_STAT_QUERY_TS_INVOCATIONS: + return VK_QUERY_PIPELINE_STATISTIC_TASK_SHADER_INVOCATIONS_BIT_EXT; + case PIPE_STAT_QUERY_MS_INVOCATIONS: + return VK_QUERY_PIPELINE_STATISTIC_MESH_SHADER_INVOCATIONS_BIT_EXT; + } + + unreachable("invalid statistic"); +} + +static bool +lower_uvs_index(nir_builder *b, nir_intrinsic_instr *intrin, void *data) +{ + unsigned *vs_uniform_base = data; + + switch (intrin->intrinsic) { + case nir_intrinsic_load_uvs_index_agx: { + gl_varying_slot slot = nir_intrinsic_io_semantics(intrin).location; + unsigned offset = hk_root_descriptor_offset(draw.uvs_index[slot]); + b->cursor = nir_instr_remove(&intrin->instr); + + nir_def *val = load_root(b, 1, 8, nir_imm_int(b, offset), 1); + nir_def_rewrite_uses(&intrin->def, nir_u2u16(b, val)); + return true; + } + + case nir_intrinsic_load_shader_part_tests_zs_agx: + return lower_sysval_to_root_table(b, intrin, draw.no_epilog_discard); + + case nir_intrinsic_load_api_sample_mask_agx: + return lower_sysval_to_root_table(b, intrin, draw.api_sample_mask); + + case nir_intrinsic_load_sample_positions_agx: + return lower_sysval_to_root_table(b, intrin, draw.ppp_multisamplectl); + + case nir_intrinsic_load_depth_never_agx: + return lower_sysval_to_root_table(b, intrin, draw.force_never_in_shader); + + case nir_intrinsic_load_geometry_param_buffer_agx: + return lower_sysval_to_root_table(b, intrin, draw.geometry_params); + + case nir_intrinsic_load_vs_output_buffer_agx: + return lower_sysval_to_root_table(b, intrin, draw.vertex_output_buffer); + + case nir_intrinsic_load_vs_outputs_agx: + return lower_sysval_to_root_table(b, intrin, draw.vertex_outputs); + + case nir_intrinsic_load_tess_param_buffer_agx: + return 
lower_sysval_to_root_table(b, intrin, draw.tess_params); + + case nir_intrinsic_load_is_first_fan_agx: { + unsigned offset = hk_root_descriptor_offset(draw.provoking); + b->cursor = nir_instr_remove(&intrin->instr); + nir_def *val = load_root(b, 1, 16, nir_imm_int(b, offset), 2); + nir_def_rewrite_uses(&intrin->def, nir_ieq_imm(b, val, 1)); + return true; + } + + case nir_intrinsic_load_provoking_last: { + unsigned offset = hk_root_descriptor_offset(draw.provoking); + b->cursor = nir_instr_remove(&intrin->instr); + nir_def *val = load_root(b, 1, 16, nir_imm_int(b, offset), 2); + nir_def_rewrite_uses(&intrin->def, nir_b2b32(b, nir_ieq_imm(b, val, 2))); + return true; + } + + case nir_intrinsic_load_base_vertex: + case nir_intrinsic_load_first_vertex: + case nir_intrinsic_load_base_instance: + case nir_intrinsic_load_draw_id: + case nir_intrinsic_load_input_assembly_buffer_agx: { + b->cursor = nir_instr_remove(&intrin->instr); + + unsigned base = *vs_uniform_base; + unsigned size = 32; + + if (intrin->intrinsic == nir_intrinsic_load_base_instance) { + base += 2; + } else if (intrin->intrinsic == nir_intrinsic_load_draw_id) { + base += 4; + size = 16; + } else if (intrin->intrinsic == + nir_intrinsic_load_input_assembly_buffer_agx) { + base += 8; + size = 64; + } + + nir_def *val = nir_load_preamble(b, 1, size, .base = base); + nir_def_rewrite_uses(&intrin->def, + nir_u2uN(b, val, intrin->def.bit_size)); + return true; + } + + case nir_intrinsic_load_stat_query_address_agx: { + b->cursor = nir_instr_remove(&intrin->instr); + + unsigned off1 = hk_root_descriptor_offset(draw.pipeline_stats); + unsigned off2 = hk_root_descriptor_offset(draw.pipeline_stats_flags); + + nir_def *base = load_root(b, 1, 64, nir_imm_int(b, off1), 8); + nir_def *flags = load_root(b, 1, 16, nir_imm_int(b, off2), 2); + + unsigned query = nir_intrinsic_base(intrin); + VkQueryPipelineStatisticFlagBits bit = translate_pipeline_stat_bit(query); + + /* Prefix sum to find the compacted offset */ + nir_def *idx = nir_bit_count(b, nir_iand_imm(b, flags, bit - 1)); + nir_def *addr = nir_iadd( + b, base, nir_imul_imm(b, nir_u2u64(b, idx), sizeof(uint64_t))); + + /* The above returns garbage if the query isn't actually enabled, handle + * that case. + * + * TODO: Optimize case where we *know* the query is present? 
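+ * For now, fold the address to NULL whenever the statistic's flag bit is
+ * clear.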
+ */ + nir_def *present = nir_ine_imm(b, nir_iand_imm(b, flags, bit), 0); + addr = nir_bcsel(b, present, addr, nir_imm_int64(b, 0)); + + nir_def_rewrite_uses(&intrin->def, addr); + return true; + } + + default: + return false; + } +} + +bool +hk_lower_uvs_index(nir_shader *s, unsigned vs_uniform_base) +{ + return nir_shader_intrinsics_pass( + s, lower_uvs_index, nir_metadata_control_flow, &vs_uniform_base); +} + +static bool +try_lower_intrin(nir_builder *b, nir_intrinsic_instr *intrin, + const struct lower_descriptors_ctx *ctx) +{ + switch (intrin->intrinsic) { + case nir_intrinsic_load_constant: + return lower_load_constant(b, intrin, ctx); + + case nir_intrinsic_load_vulkan_descriptor: + return try_lower_load_vulkan_descriptor(b, intrin, ctx); + + case nir_intrinsic_load_workgroup_size: + unreachable("Should have been lowered by nir_lower_cs_intrinsics()"); + + case nir_intrinsic_load_base_workgroup_id: + return lower_sysval_to_root_table(b, intrin, cs.base_group); + + case nir_intrinsic_load_push_constant: + return lower_load_push_constant(b, intrin, ctx); + + case nir_intrinsic_load_view_index: + return lower_sysval_to_root_table(b, intrin, draw.view_index); + + case nir_intrinsic_image_deref_load: + case nir_intrinsic_image_deref_sparse_load: + case nir_intrinsic_image_deref_store: + case nir_intrinsic_image_deref_atomic: + case nir_intrinsic_image_deref_atomic_swap: + case nir_intrinsic_image_deref_size: + case nir_intrinsic_image_deref_samples: + return lower_image_intrin(b, intrin, ctx); + + case nir_intrinsic_load_num_workgroups: { + b->cursor = nir_instr_remove(&intrin->instr); + + unsigned offset = hk_root_descriptor_offset(cs.group_count_addr); + nir_def *ptr = load_root(b, 1, 64, nir_imm_int(b, offset), 4); + nir_def *val = load_speculatable(b, 3, 32, ptr, 4); + + nir_def_rewrite_uses(&intrin->def, val); + return true; + } + + default: + return false; + } +} + +static bool +lower_tex(nir_builder *b, nir_tex_instr *tex, + const struct lower_descriptors_ctx *ctx) +{ + b->cursor = nir_before_instr(&tex->instr); + + nir_def *texture = nir_steal_tex_src(tex, nir_tex_src_texture_deref); + nir_def *sampler = nir_steal_tex_src(tex, nir_tex_src_sampler_deref); + if (!texture) { + assert(!sampler); + return false; + } + + nir_def *plane_ssa = nir_steal_tex_src(tex, nir_tex_src_plane); + const uint32_t plane = + plane_ssa ? nir_src_as_uint(nir_src_for_ssa(plane_ssa)) : 0; + const uint64_t plane_offset_B = + plane * sizeof(struct hk_sampled_image_descriptor); + + /* LOD bias is passed in the descriptor set, rather than embedded into + * the sampler descriptor. There's no spot in the hardware descriptor, + * plus this saves on precious sampler heap spots. 
+ */ + if (tex->op == nir_texop_lod_bias_agx) { + unsigned offs = + offsetof(struct hk_sampled_image_descriptor, lod_bias_fp16); + + nir_def *bias = load_resource_deref_desc( + b, 1, 16, nir_src_as_deref(nir_src_for_ssa(sampler)), + plane_offset_B + offs, ctx); + + nir_def_replace(&tex->def, bias); + return true; + } + + if (tex->op == nir_texop_has_custom_border_color_agx) { + unsigned offs = offsetof(struct hk_sampled_image_descriptor, has_border); + + nir_def *res = load_resource_deref_desc( + b, 1, 16, nir_src_as_deref(nir_src_for_ssa(sampler)), + plane_offset_B + offs, ctx); + + nir_def_replace(&tex->def, nir_ine_imm(b, res, 0)); + return true; + } + + if (tex->op == nir_texop_custom_border_color_agx) { + unsigned offs = offsetof(struct hk_sampled_image_descriptor, border); + + nir_def *border = load_resource_deref_desc( + b, 4, 32, nir_src_as_deref(nir_src_for_ssa(sampler)), + plane_offset_B + offs, ctx); + + nir_alu_type T = nir_alu_type_get_base_type(tex->dest_type); + border = nir_convert_to_bit_size(b, border, T, tex->def.bit_size); + + nir_def_replace(&tex->def, border); + return true; + } + + { + unsigned offs = + offsetof(struct hk_sampled_image_descriptor, image_offset); + + nir_def *offset = load_resource_deref_desc( + b, 1, 32, nir_src_as_deref(nir_src_for_ssa(texture)), + plane_offset_B + offs, ctx); + + nir_def *handle = image_heap_handle(b, offset); + nir_tex_instr_add_src(tex, nir_tex_src_texture_handle, handle); + } + + if (sampler != NULL) { + unsigned offs = + offsetof(struct hk_sampled_image_descriptor, sampler_index); + + if (tex->backend_flags & AGX_TEXTURE_FLAG_CLAMP_TO_0) { + offs = + offsetof(struct hk_sampled_image_descriptor, clamp_0_sampler_index); + } + + nir_def *index = load_resource_deref_desc( + b, 1, 16, nir_src_as_deref(nir_src_for_ssa(sampler)), + plane_offset_B + offs, ctx); + + nir_tex_instr_add_src(tex, nir_tex_src_sampler_handle, index); + } + + return true; +} + +static bool +try_lower_descriptors_instr(nir_builder *b, nir_instr *instr, void *_data) +{ + const struct lower_descriptors_ctx *ctx = _data; + + switch (instr->type) { + case nir_instr_type_tex: + return lower_tex(b, nir_instr_as_tex(instr), ctx); + case nir_instr_type_intrinsic: + return try_lower_intrin(b, nir_instr_as_intrinsic(instr), ctx); + default: + return false; + } +} + +static bool +lower_ssbo_resource_index(nir_builder *b, nir_intrinsic_instr *intrin, + const struct lower_descriptors_ctx *ctx) +{ + const VkDescriptorType desc_type = nir_intrinsic_desc_type(intrin); + if (desc_type != VK_DESCRIPTOR_TYPE_STORAGE_BUFFER && + desc_type != VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC) + return false; + + b->cursor = nir_instr_remove(&intrin->instr); + + uint32_t set = nir_intrinsic_desc_set(intrin); + uint32_t binding = nir_intrinsic_binding(intrin); + nir_def *index = intrin->src[0].ssa; + + const struct hk_descriptor_set_binding_layout *binding_layout = + get_binding_layout(set, binding, ctx); + + nir_def *binding_addr; + uint8_t binding_stride; + switch (binding_layout->type) { + case VK_DESCRIPTOR_TYPE_MUTABLE_EXT: + case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER: { + nir_def *set_addr = load_descriptor_set_addr(b, set, ctx); + binding_addr = nir_iadd_imm(b, set_addr, binding_layout->offset); + binding_stride = binding_layout->stride; + break; + } + + case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: { + const uint32_t root_desc_addr_offset = + hk_root_descriptor_offset(root_desc_addr); + + nir_def *root_desc_addr = + load_root(b, 1, 64, nir_imm_int(b, root_desc_addr_offset), 8); + + 
nir_def *dynamic_buffer_start = + nir_iadd_imm(b, load_dynamic_buffer_start(b, set, ctx), + binding_layout->dynamic_buffer_index); + + nir_def *dynamic_binding_offset = + nir_iadd_imm(b, + nir_imul_imm(b, dynamic_buffer_start, + sizeof(struct hk_buffer_address)), + hk_root_descriptor_offset(dynamic_buffers)); + + binding_addr = + nir_iadd(b, root_desc_addr, nir_u2u64(b, dynamic_binding_offset)); + binding_stride = sizeof(struct hk_buffer_address); + break; + } + + default: + unreachable("Not an SSBO descriptor"); + } + + /* Tuck the stride in the top 8 bits of the binding address */ + binding_addr = nir_ior_imm(b, binding_addr, (uint64_t)binding_stride << 56); + + const uint32_t binding_size = binding_layout->array_size * binding_stride; + nir_def *offset_in_binding = nir_imul_imm(b, index, binding_stride); + + nir_def *addr = nir_vec4(b, nir_unpack_64_2x32_split_x(b, binding_addr), + nir_unpack_64_2x32_split_y(b, binding_addr), + nir_imm_int(b, binding_size), offset_in_binding); + + nir_def_rewrite_uses(&intrin->def, addr); + + return true; +} + +static bool +lower_ssbo_resource_reindex(nir_builder *b, nir_intrinsic_instr *intrin, + const struct lower_descriptors_ctx *ctx) +{ + const VkDescriptorType desc_type = nir_intrinsic_desc_type(intrin); + if (desc_type != VK_DESCRIPTOR_TYPE_STORAGE_BUFFER && + desc_type != VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC) + return false; + + b->cursor = nir_instr_remove(&intrin->instr); + + nir_def *addr = intrin->src[0].ssa; + nir_def *index = intrin->src[1].ssa; + + nir_def *addr_high32 = nir_channel(b, addr, 1); + nir_def *stride = nir_ushr_imm(b, addr_high32, 24); + nir_def *offset = nir_imul(b, index, stride); + + addr = nir_build_addr_iadd(b, addr, ctx->ssbo_addr_format, nir_var_mem_ssbo, + offset); + nir_def_rewrite_uses(&intrin->def, addr); + + return true; +} + +static bool +lower_load_ssbo_descriptor(nir_builder *b, nir_intrinsic_instr *intrin, + const struct lower_descriptors_ctx *ctx) +{ + const VkDescriptorType desc_type = nir_intrinsic_desc_type(intrin); + if (desc_type != VK_DESCRIPTOR_TYPE_STORAGE_BUFFER && + desc_type != VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC) + return false; + + b->cursor = nir_instr_remove(&intrin->instr); + + nir_def *addr = intrin->src[0].ssa; + + nir_def *desc; + switch (ctx->ssbo_addr_format) { + case nir_address_format_64bit_global_32bit_offset: { + nir_def *base = nir_pack_64_2x32(b, nir_trim_vector(b, addr, 2)); + nir_def *offset = nir_channel(b, addr, 3); + /* Mask off the binding stride */ + base = nir_iand_imm(b, base, BITFIELD64_MASK(56)); + desc = nir_load_global_constant_offset(b, 4, 32, base, offset, + .align_mul = 16, .align_offset = 0, + .access = ACCESS_CAN_SPECULATE); + break; + } + + case nir_address_format_64bit_bounded_global: { + nir_def *base = nir_pack_64_2x32(b, nir_trim_vector(b, addr, 2)); + nir_def *size = nir_channel(b, addr, 2); + nir_def *offset = nir_channel(b, addr, 3); + /* Mask off the binding stride */ + base = nir_iand_imm(b, base, BITFIELD64_MASK(56)); + desc = nir_load_global_constant_bounded( + b, 4, 32, base, offset, size, .align_mul = 16, .align_offset = 0, + .access = ACCESS_CAN_SPECULATE); + break; + } + + default: + unreachable("Unknown address mode"); + } + + nir_def_rewrite_uses(&intrin->def, desc); + + return true; +} + +static bool +lower_ssbo_descriptor(nir_builder *b, nir_intrinsic_instr *intr, void *_data) +{ + const struct lower_descriptors_ctx *ctx = _data; + + switch (intr->intrinsic) { + case nir_intrinsic_vulkan_resource_index: + return 
lower_ssbo_resource_index(b, intr, ctx); + case nir_intrinsic_vulkan_resource_reindex: + return lower_ssbo_resource_reindex(b, intr, ctx); + case nir_intrinsic_load_vulkan_descriptor: + return lower_load_ssbo_descriptor(b, intr, ctx); + default: + return false; + } +} + +bool +hk_nir_lower_descriptors(nir_shader *nir, + const struct vk_pipeline_robustness_state *rs, + uint32_t set_layout_count, + struct vk_descriptor_set_layout *const *set_layouts) +{ + struct lower_descriptors_ctx ctx = { + .clamp_desc_array_bounds = + rs->storage_buffers != + VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_DISABLED_EXT || + + rs->uniform_buffers != + VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_DISABLED_EXT || + + rs->images != VK_PIPELINE_ROBUSTNESS_IMAGE_BEHAVIOR_DISABLED_EXT, + + .ssbo_addr_format = hk_buffer_addr_format(rs->storage_buffers), + .ubo_addr_format = hk_buffer_addr_format(rs->uniform_buffers), + }; + + assert(set_layout_count <= HK_MAX_SETS); + for (uint32_t s = 0; s < set_layout_count; s++) { + if (set_layouts[s] != NULL) + ctx.set_layouts[s] = vk_to_hk_descriptor_set_layout(set_layouts[s]); + } + + /* First lower everything but complex SSBOs, then lower complex SSBOs. + * + * TODO: See if we can unify this, not sure if the fast path matters on + * Apple. This is inherited from NVK. + */ + bool pass_lower_descriptors = nir_shader_instructions_pass( + nir, try_lower_descriptors_instr, nir_metadata_control_flow, &ctx); + + bool pass_lower_ssbo = nir_shader_intrinsics_pass( + nir, lower_ssbo_descriptor, nir_metadata_control_flow, &ctx); + + return pass_lower_descriptors || pass_lower_ssbo; +} diff --git a/src/asahi/vulkan/hk_nir_passthrough_gs.c b/src/asahi/vulkan/hk_nir_passthrough_gs.c new file mode 100644 index 00000000000..536b10c6b96 --- /dev/null +++ b/src/asahi/vulkan/hk_nir_passthrough_gs.c @@ -0,0 +1,112 @@ +/* + * Copyright 2024 Valve Corporation + * Copyright 2024 Alyssa Rosenzweig + * Copyright 2022 Collabora Ltd. 
+ * SPDX-License-Identifier: MIT + */ + +#include "util/bitscan.h" +#include "hk_shader.h" +#include "nir.h" +#include "nir_builder.h" +#include "nir_xfb_info.h" +#include "shader_enums.h" + +void +hk_nir_passthrough_gs(nir_builder *b, const void *key_) +{ + nir_shader *s = b->shader; + const struct hk_passthrough_gs_key *key = key_; + assert(key->prim == u_decomposed_prim(key->prim)); + assert(key->prim != MESA_PRIM_PATCHES && "tessellation consumes patches"); + + enum mesa_prim out; + if (key->prim == MESA_PRIM_POINTS) + out = MESA_PRIM_POINTS; + else if (u_reduced_prim(key->prim) == MESA_PRIM_LINES) + out = MESA_PRIM_LINE_STRIP; + else + out = MESA_PRIM_TRIANGLE_STRIP; + +#if 0 + assert((key->outputs & + (VARYING_BIT_BOUNDING_BOX0 | VARYING_BIT_BOUNDING_BOX1)) == 0 && + "cull distance lowering not run yet"); +#endif + /* XXX: need rework of preprocess_nir */ + uint64_t outputs = + key->outputs & ~(VARYING_BIT_BOUNDING_BOX0 | VARYING_BIT_BOUNDING_BOX1); + + s->info.outputs_written = s->info.inputs_read = outputs; + s->info.clip_distance_array_size = key->clip_distance_array_size; + s->info.cull_distance_array_size = key->cull_distance_array_size; + s->info.stage = MESA_SHADER_GEOMETRY; + s->info.gs.input_primitive = key->prim; + s->info.gs.output_primitive = out; + s->info.gs.vertices_in = mesa_vertices_per_prim(key->prim); + s->info.gs.vertices_out = mesa_vertices_per_prim(out); + s->info.gs.invocations = 1; + s->info.gs.active_stream_mask = 1; + + if (key->xfb_info.output_count) { + size_t size = nir_xfb_info_size(key->xfb_info.output_count); + s->xfb_info = ralloc_memdup(s, &key->xfb_info, size); + s->info.has_transform_feedback_varyings = true; + memcpy(s->info.xfb_stride, key->xfb_stride, sizeof(key->xfb_stride)); + } + + unsigned int start_vert = key->prim == MESA_PRIM_LINES_ADJACENCY ? 1 : 0; + unsigned int step = key->prim == MESA_PRIM_TRIANGLES_ADJACENCY ? 2 : 1; + + nir_def *zero = nir_imm_int(b, 0); + nir_def *one = nir_imm_int(b, 1); + + for (unsigned i = 0; i < s->info.gs.vertices_out; ++i) { + nir_def *vertex = nir_imm_int(b, start_vert + (i * step)); + + /* Copy inputs to outputs. */ + u_foreach_bit64(loc, outputs) { + unsigned adjusted_loc = loc; + nir_def *offset = zero; + unsigned num_slots = 1; + + bool scalar = loc == VARYING_SLOT_LAYER || + loc == VARYING_SLOT_VIEW_INDEX || + loc == VARYING_SLOT_VIEWPORT || loc == VARYING_SLOT_PSIZ; + unsigned comps = scalar ? 
1 : 4; + + /* We use combined, compact clip/cull */ + if (loc == VARYING_SLOT_CLIP_DIST1 || loc == VARYING_SLOT_CULL_DIST1) { + adjusted_loc--; + offset = one; + } + + if (adjusted_loc == VARYING_SLOT_CLIP_DIST0 || + adjusted_loc == VARYING_SLOT_CULL_DIST0) { + num_slots = + key->cull_distance_array_size + key->clip_distance_array_size; + + if (loc > adjusted_loc) + comps = num_slots - 4; + else + comps = MIN2(num_slots, 4); + } + + nir_io_semantics sem = { + .location = adjusted_loc, + .num_slots = num_slots, + }; + + nir_def *val = nir_load_per_vertex_input(b, comps, 32, vertex, offset, + .io_semantics = sem); + + for (unsigned c = 0; c < comps; ++c) { + nir_store_output(b, nir_channel(b, val, c), offset, + .io_semantics = sem, .src_type = nir_type_uint32, + .component = c); + } + } + + nir_emit_vertex(b, 0); + } +} diff --git a/src/asahi/vulkan/hk_physical_device.c b/src/asahi/vulkan/hk_physical_device.c new file mode 100644 index 00000000000..304cc7c938d --- /dev/null +++ b/src/asahi/vulkan/hk_physical_device.c @@ -0,0 +1,1417 @@ +/* + * Copyright 2024 Valve Corporation + * Copyright 2024 Alyssa Rosenzweig + * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc. + * SPDX-License-Identifier: MIT + */ +#include "hk_physical_device.h" + +#include "asahi/lib/agx_device.h" +#include "asahi/lib/agx_nir_lower_vbo.h" +#include "asahi/lib/agx_nir_passes.h" +#include "util/disk_cache.h" +#include "util/mesa-sha1.h" +#include "git_sha1.h" +#include "hk_buffer.h" +#include "hk_entrypoints.h" +#include "hk_image.h" +#include "hk_instance.h" +#include "hk_private.h" +#include "hk_shader.h" +#include "hk_wsi.h" + +#include "util/u_debug.h" +#include "vulkan/vulkan_core.h" +#include "vulkan/wsi/wsi_common.h" +#include "vk_device.h" +#include "vk_drm_syncobj.h" +#include "vk_shader_module.h" + +#include +#include +#include +#include +#include + +static uint32_t +hk_get_vk_version() +{ + /* Version override takes priority */ + const uint32_t version_override = vk_get_version_override(); + if (version_override) + return version_override; + + return VK_MAKE_VERSION(1, 3, VK_HEADER_VERSION); +} + +static void +hk_get_device_extensions(const struct hk_instance *instance, + struct vk_device_extension_table *ext) +{ + *ext = (struct vk_device_extension_table){ + .KHR_8bit_storage = true, + .KHR_16bit_storage = true, + .KHR_bind_memory2 = true, + .KHR_buffer_device_address = true, + .KHR_calibrated_timestamps = false, + .KHR_copy_commands2 = true, + .KHR_create_renderpass2 = true, + .KHR_dedicated_allocation = true, + .KHR_depth_stencil_resolve = true, + .KHR_descriptor_update_template = true, + .KHR_device_group = true, + .KHR_draw_indirect_count = false, + .KHR_driver_properties = true, + .KHR_dynamic_rendering = true, + // TODO + .KHR_dynamic_rendering_local_read = false, + .KHR_external_fence = true, + .KHR_external_fence_fd = true, + .KHR_external_memory = true, + .KHR_external_memory_fd = true, + /* XXX: External timeline semaphores maybe broken in kernel, see + * dEQP-VK.synchronization.signal_order.shared_timeline_semaphore.write_copy_buffer_to_image_read_image_compute.image_128_r32_uint_opaque_fd + */ + .KHR_external_semaphore = false, + .KHR_external_semaphore_fd = false, + .KHR_format_feature_flags2 = true, + .KHR_fragment_shader_barycentric = false, + .KHR_get_memory_requirements2 = true, + .KHR_global_priority = true, + .KHR_image_format_list = true, + .KHR_imageless_framebuffer = true, +#ifdef HK_USE_WSI_PLATFORM + .KHR_incremental_present = true, +#endif + .KHR_index_type_uint8 = true, + 
.KHR_line_rasterization = true, + .KHR_load_store_op_none = true, + .KHR_maintenance1 = true, + .KHR_maintenance2 = true, + .KHR_maintenance3 = true, + .KHR_maintenance4 = true, + .KHR_maintenance5 = true, + .KHR_maintenance6 = true, + .KHR_map_memory2 = true, + .KHR_multiview = true, + .KHR_pipeline_executable_properties = true, + .KHR_pipeline_library = true, + .KHR_push_descriptor = true, + .KHR_relaxed_block_layout = true, + .KHR_sampler_mirror_clamp_to_edge = true, + .KHR_sampler_ycbcr_conversion = false, + .KHR_separate_depth_stencil_layouts = true, + .KHR_shader_atomic_int64 = false, + .KHR_shader_clock = false, + .KHR_shader_draw_parameters = true, + .KHR_shader_expect_assume = true, + .KHR_shader_float_controls = true, + // TODO: wait for nvk + .KHR_shader_float_controls2 = true, + .KHR_shader_float16_int8 = true, + .KHR_shader_integer_dot_product = true, + .KHR_shader_maximal_reconvergence = true, + .KHR_shader_non_semantic_info = true, + .KHR_shader_subgroup_extended_types = true, + .KHR_shader_subgroup_rotate = true, + .KHR_shader_subgroup_uniform_control_flow = true, + .KHR_shader_terminate_invocation = true, + .KHR_spirv_1_4 = true, + .KHR_storage_buffer_storage_class = true, + .KHR_timeline_semaphore = true, +#ifdef HK_USE_WSI_PLATFORM + .KHR_swapchain = true, + .KHR_swapchain_mutable_format = true, +#endif + .KHR_synchronization2 = true, + .KHR_uniform_buffer_standard_layout = true, + .KHR_variable_pointers = true, + .KHR_vertex_attribute_divisor = true, + .KHR_vulkan_memory_model = true, + .KHR_workgroup_memory_explicit_layout = true, + .KHR_zero_initialize_workgroup_memory = true, + .EXT_4444_formats = true, + .EXT_attachment_feedback_loop_layout = true, + .EXT_border_color_swizzle = true, + .EXT_buffer_device_address = true, + .EXT_calibrated_timestamps = false, + .EXT_conditional_rendering = false, + .EXT_color_write_enable = true, + .EXT_custom_border_color = true, + .EXT_depth_bias_control = false, + .EXT_depth_clip_control = false, + .EXT_depth_clip_enable = true, + .EXT_descriptor_indexing = true, +#ifdef VK_USE_PLATFORM_DISPLAY_KHR + .EXT_display_control = false, +#endif + .EXT_dynamic_rendering_unused_attachments = true, + .EXT_extended_dynamic_state = true, + .EXT_extended_dynamic_state2 = true, + .EXT_extended_dynamic_state3 = true, + .EXT_external_memory_dma_buf = true, + // TODO + .EXT_global_priority = false, + // TODO + .EXT_global_priority_query = false, + .EXT_graphics_pipeline_library = true, + .EXT_host_query_reset = true, + .EXT_host_image_copy = true, + .EXT_image_2d_view_of_3d = true, + .EXT_image_robustness = true, + .EXT_image_sliced_view_of_3d = false, + .EXT_image_view_min_lod = false, + .EXT_index_type_uint8 = true, + .EXT_inline_uniform_block = true, + .EXT_line_rasterization = true, + .EXT_load_store_op_none = true, + .EXT_map_memory_placed = false, + .EXT_memory_budget = false, + .EXT_multi_draw = true, + .EXT_mutable_descriptor_type = true, + .EXT_non_seamless_cube_map = true, + .EXT_pipeline_creation_cache_control = true, + .EXT_pipeline_creation_feedback = true, + .EXT_pipeline_protected_access = true, + .EXT_pipeline_robustness = true, + .EXT_physical_device_drm = true, + .EXT_primitive_topology_list_restart = true, + .EXT_private_data = true, + .EXT_primitives_generated_query = false, + .EXT_provoking_vertex = true, + .EXT_robustness2 = true, + .EXT_sample_locations = true, + .EXT_sampler_filter_minmax = false, + .EXT_scalar_block_layout = true, + .EXT_separate_stencil_usage = true, + .EXT_shader_image_atomic_int64 = false, + 
.EXT_shader_demote_to_helper_invocation = true, + .EXT_shader_module_identifier = true, + .EXT_shader_object = true, + .EXT_shader_replicated_composites = true, + .EXT_shader_stencil_export = true, + .EXT_shader_subgroup_ballot = true, + .EXT_shader_subgroup_vote = true, + .EXT_shader_viewport_index_layer = true, + .EXT_subgroup_size_control = true, +#ifdef HK_USE_WSI_PLATFORM + .EXT_swapchain_maintenance1 = true, +#endif + .EXT_texel_buffer_alignment = true, + .EXT_tooling_info = true, + .EXT_transform_feedback = true, + .EXT_vertex_attribute_divisor = true, + .EXT_vertex_input_dynamic_state = true, + .EXT_ycbcr_2plane_444_formats = false, + .EXT_ycbcr_image_arrays = false, + .GOOGLE_decorate_string = true, + .GOOGLE_hlsl_functionality1 = true, + .GOOGLE_user_type = true, + .VALVE_mutable_descriptor_type = true, + }; +} + +static void +hk_get_device_features( + const struct vk_device_extension_table *supported_extensions, + struct vk_features *features) +{ + *features = (struct vk_features){ + /* Vulkan 1.0 */ + .robustBufferAccess = true, + .fullDrawIndexUint32 = true, + .imageCubeArray = true, + .independentBlend = true, + .geometryShader = true, + .tessellationShader = true, + .sampleRateShading = true, + .dualSrcBlend = true, + .logicOp = true, + .multiDrawIndirect = true, + .drawIndirectFirstInstance = true, + .depthClamp = true, + .depthBiasClamp = true, + .fillModeNonSolid = true, + .depthBounds = false, + .wideLines = true, + .largePoints = true, + .alphaToOne = true, + .multiViewport = true, + .samplerAnisotropy = true, + .textureCompressionETC2 = false, + .textureCompressionBC = true, + .textureCompressionASTC_LDR = false, + .occlusionQueryPrecise = true, + .pipelineStatisticsQuery = true, + .vertexPipelineStoresAndAtomics = true, + .fragmentStoresAndAtomics = true, + .shaderTessellationAndGeometryPointSize = true, + .shaderImageGatherExtended = true, + .shaderStorageImageExtendedFormats = true, + /* TODO: hitting the vertex shader timeout in CTS, but should work */ + .shaderStorageImageMultisample = false, + .shaderStorageImageReadWithoutFormat = true, + .shaderStorageImageWriteWithoutFormat = true, + .shaderUniformBufferArrayDynamicIndexing = true, + .shaderSampledImageArrayDynamicIndexing = true, + .shaderStorageBufferArrayDynamicIndexing = true, + .shaderStorageImageArrayDynamicIndexing = true, + .shaderClipDistance = true, + .shaderCullDistance = true, + .shaderFloat64 = false, + .shaderInt64 = true, + .shaderInt16 = true, + .shaderResourceResidency = false, + .shaderResourceMinLod = false, + .sparseBinding = false, + .sparseResidency2Samples = false, + .sparseResidency4Samples = false, + .sparseResidency8Samples = false, + .sparseResidencyAliased = false, + .sparseResidencyBuffer = false, + .sparseResidencyImage2D = false, + .sparseResidencyImage3D = false, + .variableMultisampleRate = false, + .inheritedQueries = true, + + /* Vulkan 1.1 */ + .storageBuffer16BitAccess = true, + .uniformAndStorageBuffer16BitAccess = true, + .storagePushConstant16 = true, + .storageInputOutput16 = false, + .multiview = true, + .multiviewGeometryShader = false, + .multiviewTessellationShader = false, + .variablePointersStorageBuffer = true, + .variablePointers = true, + .shaderDrawParameters = true, + .samplerYcbcrConversion = true, + + /* Vulkan 1.2 */ + .samplerMirrorClampToEdge = true, + .drawIndirectCount = false, + .storageBuffer8BitAccess = true, + .uniformAndStorageBuffer8BitAccess = true, + .storagePushConstant8 = true, + .shaderBufferInt64Atomics = false, + .shaderSharedInt64Atomics 
= false, + .shaderFloat16 = true, + .shaderInt8 = true, + .descriptorIndexing = true, + .shaderInputAttachmentArrayDynamicIndexing = true, + .shaderUniformTexelBufferArrayDynamicIndexing = true, + .shaderStorageTexelBufferArrayDynamicIndexing = true, + .shaderUniformBufferArrayNonUniformIndexing = true, + .shaderSampledImageArrayNonUniformIndexing = true, + .shaderStorageBufferArrayNonUniformIndexing = true, + .shaderStorageImageArrayNonUniformIndexing = true, + .shaderInputAttachmentArrayNonUniformIndexing = true, + .shaderUniformTexelBufferArrayNonUniformIndexing = true, + .shaderStorageTexelBufferArrayNonUniformIndexing = true, + .descriptorBindingUniformBufferUpdateAfterBind = true, + .descriptorBindingSampledImageUpdateAfterBind = true, + .descriptorBindingStorageImageUpdateAfterBind = true, + .descriptorBindingStorageBufferUpdateAfterBind = true, + .descriptorBindingUniformTexelBufferUpdateAfterBind = true, + .descriptorBindingStorageTexelBufferUpdateAfterBind = true, + .descriptorBindingUpdateUnusedWhilePending = true, + .descriptorBindingPartiallyBound = true, + .descriptorBindingVariableDescriptorCount = true, + .runtimeDescriptorArray = true, + .samplerFilterMinmax = false, + .scalarBlockLayout = true, + .imagelessFramebuffer = true, + .uniformBufferStandardLayout = true, + .shaderSubgroupExtendedTypes = true, + .separateDepthStencilLayouts = true, + .hostQueryReset = true, + .timelineSemaphore = true, + .bufferDeviceAddress = true, + .bufferDeviceAddressCaptureReplay = false, + .bufferDeviceAddressMultiDevice = false, + .vulkanMemoryModel = true, + .vulkanMemoryModelDeviceScope = true, + .vulkanMemoryModelAvailabilityVisibilityChains = false, + .shaderOutputViewportIndex = true, + .shaderOutputLayer = true, + .subgroupBroadcastDynamicId = true, + + /* Vulkan 1.3 */ + .robustImageAccess = true, + .inlineUniformBlock = true, + .descriptorBindingInlineUniformBlockUpdateAfterBind = true, + .pipelineCreationCacheControl = true, + .privateData = true, + .shaderDemoteToHelperInvocation = true, + .shaderTerminateInvocation = true, + .subgroupSizeControl = true, + .computeFullSubgroups = true, + .synchronization2 = true, + .shaderZeroInitializeWorkgroupMemory = true, + .dynamicRendering = true, + .shaderIntegerDotProduct = true, + .maintenance4 = true, + + /* VK_KHR_dynamic_rendering_local_read */ + .dynamicRenderingLocalRead = true, + + /* VK_KHR_fragment_shader_barycentric */ + .fragmentShaderBarycentric = false, + + /* VK_KHR_global_priority */ + .globalPriorityQuery = true, + + /* VK_KHR_index_type_uint8 */ + .indexTypeUint8 = true, + + /* VK_KHR_line_rasterization */ + .rectangularLines = false, + .bresenhamLines = true, + .smoothLines = false, + .stippledRectangularLines = false, + .stippledBresenhamLines = false, + .stippledSmoothLines = false, + + /* VK_KHR_maintenance5 */ + .maintenance5 = true, + + /* VK_KHR_maintenance6 */ + .maintenance6 = true, + + /* VK_KHR_pipeline_executable_properties */ + .pipelineExecutableInfo = true, + + /* VK_KHR_present_id */ + .presentId = false, + + /* VK_KHR_present_wait */ + .presentWait = false, + + /* VK_KHR_shader_clock */ + .shaderSubgroupClock = false, + .shaderDeviceClock = false, + + /* VK_KHR_shader_expect_assume */ + .shaderExpectAssume = true, + + /* VK_KHR_shader_float_controls2 */ + .shaderFloatControls2 = true, + + /* VK_KHR_shader_maximal_reconvergence */ + .shaderMaximalReconvergence = true, + + /* VK_KHR_shader_subgroup_rotate */ + .shaderSubgroupRotate = true, + .shaderSubgroupRotateClustered = true, + + /* 
VK_KHR_vertex_attribute_divisor */ + .vertexAttributeInstanceRateDivisor = true, + .vertexAttributeInstanceRateZeroDivisor = true, + + /* VK_KHR_workgroup_memory_explicit_layout */ + .workgroupMemoryExplicitLayout = true, + .workgroupMemoryExplicitLayoutScalarBlockLayout = true, + .workgroupMemoryExplicitLayout8BitAccess = true, + .workgroupMemoryExplicitLayout16BitAccess = true, + + /* VK_EXT_4444_formats */ + .formatA4R4G4B4 = true, + .formatA4B4G4R4 = true, + + /* VK_EXT_attachment_feedback_loop_layout */ + .attachmentFeedbackLoopLayout = true, + + /* VK_EXT_border_color_swizzle */ + .borderColorSwizzle = true, + .borderColorSwizzleFromImage = false, + + /* VK_EXT_buffer_device_address */ + .bufferDeviceAddressCaptureReplayEXT = false, + + /* VK_EXT_color_write_enable */ + .colorWriteEnable = true, + + /* VK_EXT_conditional_rendering */ + .conditionalRendering = false, + .inheritedConditionalRendering = false, + + /* VK_EXT_custom_border_color */ + .customBorderColors = true, + .customBorderColorWithoutFormat = true, + + /* VK_EXT_depth_bias_control */ + .depthBiasControl = false, + .leastRepresentableValueForceUnormRepresentation = false, + .floatRepresentation = false, + .depthBiasExact = false, + + /* VK_EXT_depth_clip_control */ + .depthClipControl = false, + + /* VK_EXT_depth_clip_enable */ + .depthClipEnable = true, + + /* VK_EXT_dynamic_rendering_unused_attachments */ + .dynamicRenderingUnusedAttachments = true, + + /* VK_EXT_extended_dynamic_state */ + .extendedDynamicState = true, + + /* VK_EXT_extended_dynamic_state2 */ + .extendedDynamicState2 = true, + .extendedDynamicState2LogicOp = true, + .extendedDynamicState2PatchControlPoints = false, + + /* VK_EXT_extended_dynamic_state3 */ + .extendedDynamicState3TessellationDomainOrigin = false, + .extendedDynamicState3DepthClampEnable = true, + .extendedDynamicState3PolygonMode = true, + .extendedDynamicState3RasterizationSamples = true, + .extendedDynamicState3SampleMask = true, + .extendedDynamicState3AlphaToCoverageEnable = true, + .extendedDynamicState3AlphaToOneEnable = true, + .extendedDynamicState3LogicOpEnable = true, + .extendedDynamicState3ColorBlendEnable = true, + .extendedDynamicState3ColorBlendEquation = true, + .extendedDynamicState3ColorWriteMask = true, + .extendedDynamicState3RasterizationStream = false, + .extendedDynamicState3ConservativeRasterizationMode = false, + .extendedDynamicState3ExtraPrimitiveOverestimationSize = false, + .extendedDynamicState3DepthClipEnable = true, + .extendedDynamicState3SampleLocationsEnable = false, + .extendedDynamicState3ColorBlendAdvanced = false, + .extendedDynamicState3ProvokingVertexMode = true, + .extendedDynamicState3LineRasterizationMode = true, + .extendedDynamicState3LineStippleEnable = false, + .extendedDynamicState3DepthClipNegativeOneToOne = false, + .extendedDynamicState3ViewportWScalingEnable = false, + .extendedDynamicState3ViewportSwizzle = false, + .extendedDynamicState3CoverageToColorEnable = false, + .extendedDynamicState3CoverageToColorLocation = false, + .extendedDynamicState3CoverageModulationMode = false, + .extendedDynamicState3CoverageModulationTableEnable = false, + .extendedDynamicState3CoverageModulationTable = false, + .extendedDynamicState3CoverageReductionMode = false, + .extendedDynamicState3RepresentativeFragmentTestEnable = false, + .extendedDynamicState3ShadingRateImageEnable = false, + + /* VK_EXT_graphics_pipeline_library */ + .graphicsPipelineLibrary = true, + + /* VK_EXT_host_image_copy */ + .hostImageCopy = true, + + /* 
VK_EXT_image_2d_view_of_3d */ + .image2DViewOf3D = true, + .sampler2DViewOf3D = true, + + /* VK_EXT_image_sliced_view_of_3d */ + .imageSlicedViewOf3D = false, + +#ifdef HK_USE_WSI_PLATFORM + /* VK_EXT_swapchain_maintenance1 */ + .swapchainMaintenance1 = false, +#endif + + /* VK_EXT_image_view_min_lod */ + .minLod = false, + + /* VK_EXT_map_memory_placed */ + .memoryMapPlaced = false, + .memoryMapRangePlaced = false, + .memoryUnmapReserve = false, + + /* VK_EXT_multi_draw */ + .multiDraw = true, + + /* VK_EXT_mutable_descriptor_type */ + .mutableDescriptorType = true, + + /* VK_EXT_non_seamless_cube_map */ + .nonSeamlessCubeMap = true, + + /* VK_EXT_pipeline_protected_access */ + .pipelineProtectedAccess = true, + + /* VK_EXT_pipeline_robustness */ + .pipelineRobustness = true, + + /* VK_EXT_primitive_topology_list_restart */ + .primitiveTopologyListRestart = true, + .primitiveTopologyPatchListRestart = false, + + /* VK_EXT_primitives_generated_query */ + .primitivesGeneratedQuery = false, + .primitivesGeneratedQueryWithNonZeroStreams = false, + .primitivesGeneratedQueryWithRasterizerDiscard = false, + + /* VK_EXT_provoking_vertex */ + .provokingVertexLast = true, + .transformFeedbackPreservesProvokingVertex = true, + + /* VK_EXT_robustness2 */ + .robustBufferAccess2 = true, + .robustImageAccess2 = true, + .nullDescriptor = true, + + /* VK_EXT_shader_image_atomic_int64 */ + .shaderImageInt64Atomics = false, + .sparseImageInt64Atomics = false, + + /* VK_EXT_shader_module_identifier */ + .shaderModuleIdentifier = true, + + /* VK_EXT_shader_object */ + .shaderObject = true, + + /* VK_EXT_shader_replicated_composites */ + .shaderReplicatedComposites = true, + + /* VK_KHR_shader_subgroup_uniform_control_flow */ + .shaderSubgroupUniformControlFlow = true, + + /* VK_EXT_texel_buffer_alignment */ + .texelBufferAlignment = true, + + /* VK_EXT_transform_feedback */ + .transformFeedback = true, + .geometryStreams = true, + + /* VK_EXT_vertex_input_dynamic_state */ + .vertexInputDynamicState = true, + + /* VK_EXT_ycbcr_2plane_444_formats */ + .ycbcr2plane444Formats = false, + + /* VK_EXT_ycbcr_image_arrays */ + .ycbcrImageArrays = false, + }; +} + +static void +hk_get_device_properties(const struct agx_device *dev, + const struct hk_instance *instance, + struct vk_properties *properties) +{ + const VkSampleCountFlagBits sample_counts = + VK_SAMPLE_COUNT_1_BIT | VK_SAMPLE_COUNT_2_BIT | VK_SAMPLE_COUNT_4_BIT; + + uint64_t os_page_size = 16384; + os_get_page_size(&os_page_size); + + *properties = (struct vk_properties){ + .apiVersion = hk_get_vk_version(), + .driverVersion = vk_get_driver_version(), + .vendorID = instance->force_vk_vendor ?: VK_VENDOR_ID_MESA, + .deviceID = 0, + .deviceType = VK_PHYSICAL_DEVICE_TYPE_INTEGRATED_GPU, + + /* Vulkan 1.0 limits */ + .maxImageDimension1D = 16384, + .maxImageDimension2D = 16384, + .maxImageDimension3D = 16384, + .maxImageDimensionCube = 16384, + .maxImageArrayLayers = 2048, + .maxTexelBufferElements = AGX_TEXTURE_BUFFER_MAX_SIZE, + .maxUniformBufferRange = 65536, + .maxStorageBufferRange = UINT32_MAX, + .maxPushConstantsSize = HK_MAX_PUSH_SIZE, + .maxMemoryAllocationCount = 4096, + .maxSamplerAllocationCount = 4000, + .bufferImageGranularity = 0x400, + .sparseAddressSpaceSize = HK_SPARSE_ADDR_SPACE_SIZE, + .maxBoundDescriptorSets = HK_MAX_SETS, + .maxPerStageDescriptorSamplers = HK_MAX_DESCRIPTORS, + .maxPerStageDescriptorUniformBuffers = HK_MAX_DESCRIPTORS, + .maxPerStageDescriptorStorageBuffers = HK_MAX_DESCRIPTORS, + .maxPerStageDescriptorSampledImages = 
HK_MAX_DESCRIPTORS, + .maxPerStageDescriptorStorageImages = HK_MAX_DESCRIPTORS, + .maxPerStageDescriptorInputAttachments = HK_MAX_DESCRIPTORS, + .maxPerStageResources = UINT32_MAX, + .maxDescriptorSetSamplers = HK_MAX_DESCRIPTORS, + .maxDescriptorSetUniformBuffers = HK_MAX_DESCRIPTORS, + .maxDescriptorSetUniformBuffersDynamic = HK_MAX_DYNAMIC_BUFFERS / 2, + .maxDescriptorSetStorageBuffers = HK_MAX_DESCRIPTORS, + .maxDescriptorSetStorageBuffersDynamic = HK_MAX_DYNAMIC_BUFFERS / 2, + .maxDescriptorSetSampledImages = HK_MAX_DESCRIPTORS, + .maxDescriptorSetStorageImages = HK_MAX_DESCRIPTORS, + .maxDescriptorSetInputAttachments = HK_MAX_DESCRIPTORS, + .maxVertexInputAttributes = AGX_MAX_VBUFS, + .maxVertexInputBindings = AGX_MAX_ATTRIBS, + .maxVertexInputAttributeOffset = 65535, + .maxVertexInputBindingStride = 2048, + .maxVertexOutputComponents = 64, + .maxGeometryShaderInvocations = 32, + .maxGeometryInputComponents = 128, + .maxGeometryOutputComponents = 128, + .maxGeometryOutputVertices = 1024, + .maxGeometryTotalOutputComponents = 1024, + .maxTessellationGenerationLevel = 64, + .maxTessellationPatchSize = 32, + .maxTessellationControlPerVertexInputComponents = 128, + .maxTessellationControlPerVertexOutputComponents = 128, + .maxTessellationControlPerPatchOutputComponents = 120, + .maxTessellationControlTotalOutputComponents = 4216, + .maxTessellationEvaluationInputComponents = 128, + .maxTessellationEvaluationOutputComponents = 128, + .maxFragmentInputComponents = 64, + .maxFragmentOutputAttachments = HK_MAX_RTS, + .maxFragmentDualSrcAttachments = 1, + .maxFragmentCombinedOutputResources = 16, + .maxComputeSharedMemorySize = HK_MAX_SHARED_SIZE, + .maxComputeWorkGroupCount = {0x7fffffff, 65535, 65535}, + .maxComputeWorkGroupInvocations = 1024, + .maxComputeWorkGroupSize = {1024, 1024, 64}, + .subPixelPrecisionBits = 8, + .subTexelPrecisionBits = 8, + .mipmapPrecisionBits = 8, + .maxDrawIndexedIndexValue = UINT32_MAX, + .maxDrawIndirectCount = UINT32_MAX, + .maxSamplerLodBias = 15, + .maxSamplerAnisotropy = 16, + .maxViewports = HK_MAX_VIEWPORTS, + .maxViewportDimensions = {32768, 32768}, + .viewportBoundsRange = {-65536, 65536}, + .viewportSubPixelBits = 8, + .minMemoryMapAlignment = os_page_size, + .minTexelBufferOffsetAlignment = HK_MIN_TEXEL_BUFFER_ALIGNMENT, + .minUniformBufferOffsetAlignment = HK_MIN_UBO_ALIGNMENT, + .minStorageBufferOffsetAlignment = HK_MIN_SSBO_ALIGNMENT, + .minTexelOffset = -8, + .maxTexelOffset = 7, + .minTexelGatherOffset = -8, + .maxTexelGatherOffset = 7, + .minInterpolationOffset = -0.5, + .maxInterpolationOffset = 0.4375, + .subPixelInterpolationOffsetBits = 4, + .maxFramebufferHeight = 16384, + .maxFramebufferWidth = 16384, + .maxFramebufferLayers = 2048, + .framebufferColorSampleCounts = sample_counts, + .framebufferDepthSampleCounts = sample_counts, + .framebufferNoAttachmentsSampleCounts = sample_counts, + .framebufferStencilSampleCounts = sample_counts, + .maxColorAttachments = HK_MAX_RTS, + .sampledImageColorSampleCounts = sample_counts, + .sampledImageIntegerSampleCounts = sample_counts, + .sampledImageDepthSampleCounts = sample_counts, + .sampledImageStencilSampleCounts = sample_counts, + .storageImageSampleCounts = sample_counts, + .maxSampleMaskWords = 1, + .timestampComputeAndGraphics = false, + .timestampPeriod = 1, + .maxClipDistances = 8, + .maxCullDistances = 8, + .maxCombinedClipAndCullDistances = 8, + .discreteQueuePriorities = 2, + .pointSizeRange = {1.0, 512.f - 0.0625f}, + .lineWidthRange = {1.0, 16.0f}, + .pointSizeGranularity = 0.0625, + 
.lineWidthGranularity = 1.0f / 16.0f, + .strictLines = false, + .standardSampleLocations = true, + .optimalBufferCopyOffsetAlignment = 1, + .optimalBufferCopyRowPitchAlignment = 1, + .nonCoherentAtomSize = 64, + + /* Vulkan 1.0 sparse properties */ + .sparseResidencyNonResidentStrict = false, + .sparseResidencyAlignedMipSize = false, + .sparseResidencyStandard2DBlockShape = false, + .sparseResidencyStandard2DMultisampleBlockShape = false, + .sparseResidencyStandard3DBlockShape = false, + + /* Vulkan 1.1 properties */ + .subgroupSize = 32, + .subgroupSupportedStages = VK_SHADER_STAGE_COMPUTE_BIT | + VK_SHADER_STAGE_FRAGMENT_BIT | + VK_SHADER_STAGE_VERTEX_BIT, + .subgroupSupportedOperations = + VK_SUBGROUP_FEATURE_BASIC_BIT | VK_SUBGROUP_FEATURE_BALLOT_BIT | + VK_SUBGROUP_FEATURE_VOTE_BIT | VK_SUBGROUP_FEATURE_QUAD_BIT | + VK_SUBGROUP_FEATURE_SHUFFLE_BIT | + VK_SUBGROUP_FEATURE_SHUFFLE_RELATIVE_BIT | + VK_SUBGROUP_FEATURE_ROTATE_BIT_KHR | + VK_SUBGROUP_FEATURE_ARITHMETIC_BIT | + VK_SUBGROUP_FEATURE_CLUSTERED_BIT | + VK_SUBGROUP_FEATURE_ROTATE_CLUSTERED_BIT_KHR, + .subgroupQuadOperationsInAllStages = true, + .pointClippingBehavior = VK_POINT_CLIPPING_BEHAVIOR_USER_CLIP_PLANES_ONLY, + .maxMultiviewViewCount = HK_MAX_MULTIVIEW_VIEW_COUNT, + .maxMultiviewInstanceIndex = UINT32_MAX, + .maxPerSetDescriptors = UINT32_MAX, + .maxMemoryAllocationSize = (1u << 31), + + /* Vulkan 1.2 properties */ + .supportedDepthResolveModes = + VK_RESOLVE_MODE_SAMPLE_ZERO_BIT | VK_RESOLVE_MODE_AVERAGE_BIT | + VK_RESOLVE_MODE_MIN_BIT | VK_RESOLVE_MODE_MAX_BIT, + .supportedStencilResolveModes = VK_RESOLVE_MODE_SAMPLE_ZERO_BIT | + VK_RESOLVE_MODE_MIN_BIT | + VK_RESOLVE_MODE_MAX_BIT, + .independentResolveNone = true, + .independentResolve = true, + .driverID = VK_DRIVER_ID_MESA_HONEYKRISP, + .conformanceVersion = (VkConformanceVersion){1, 3, 8, 3}, + .denormBehaviorIndependence = VK_SHADER_FLOAT_CONTROLS_INDEPENDENCE_ALL, + .roundingModeIndependence = VK_SHADER_FLOAT_CONTROLS_INDEPENDENCE_ALL, + .shaderSignedZeroInfNanPreserveFloat16 = true, + .shaderSignedZeroInfNanPreserveFloat32 = true, + .shaderSignedZeroInfNanPreserveFloat64 = false, + .shaderDenormPreserveFloat16 = true, + .shaderDenormPreserveFloat32 = false, + .shaderDenormPreserveFloat64 = false, + .shaderDenormFlushToZeroFloat16 = false, + .shaderDenormFlushToZeroFloat32 = true, + .shaderDenormFlushToZeroFloat64 = false, + .shaderRoundingModeRTEFloat16 = true, + .shaderRoundingModeRTEFloat32 = true, + .shaderRoundingModeRTEFloat64 = false, + .shaderRoundingModeRTZFloat16 = false, + .shaderRoundingModeRTZFloat32 = false, + .shaderRoundingModeRTZFloat64 = false, + .maxUpdateAfterBindDescriptorsInAllPools = UINT32_MAX, + .shaderUniformBufferArrayNonUniformIndexingNative = true, + .shaderSampledImageArrayNonUniformIndexingNative = true, + .shaderStorageBufferArrayNonUniformIndexingNative = true, + .shaderStorageImageArrayNonUniformIndexingNative = true, + .shaderInputAttachmentArrayNonUniformIndexingNative = true, + .robustBufferAccessUpdateAfterBind = true, + .quadDivergentImplicitLod = false, + .maxPerStageDescriptorUpdateAfterBindSamplers = HK_MAX_DESCRIPTORS, + .maxPerStageDescriptorUpdateAfterBindUniformBuffers = HK_MAX_DESCRIPTORS, + .maxPerStageDescriptorUpdateAfterBindStorageBuffers = HK_MAX_DESCRIPTORS, + .maxPerStageDescriptorUpdateAfterBindSampledImages = HK_MAX_DESCRIPTORS, + .maxPerStageDescriptorUpdateAfterBindStorageImages = HK_MAX_DESCRIPTORS, + .maxPerStageDescriptorUpdateAfterBindInputAttachments = + HK_MAX_DESCRIPTORS, + 
.maxPerStageUpdateAfterBindResources = UINT32_MAX, + .maxDescriptorSetUpdateAfterBindSamplers = HK_MAX_DESCRIPTORS, + .maxDescriptorSetUpdateAfterBindUniformBuffers = HK_MAX_DESCRIPTORS, + .maxDescriptorSetUpdateAfterBindUniformBuffersDynamic = + HK_MAX_DYNAMIC_BUFFERS / 2, + .maxDescriptorSetUpdateAfterBindStorageBuffers = HK_MAX_DESCRIPTORS, + .maxDescriptorSetUpdateAfterBindStorageBuffersDynamic = + HK_MAX_DYNAMIC_BUFFERS / 2, + .maxDescriptorSetUpdateAfterBindSampledImages = HK_MAX_DESCRIPTORS, + .maxDescriptorSetUpdateAfterBindStorageImages = HK_MAX_DESCRIPTORS, + .maxDescriptorSetUpdateAfterBindInputAttachments = HK_MAX_DESCRIPTORS, + .filterMinmaxSingleComponentFormats = false, + .filterMinmaxImageComponentMapping = false, + .maxTimelineSemaphoreValueDifference = UINT64_MAX, + .framebufferIntegerColorSampleCounts = sample_counts, + + /* Vulkan 1.3 properties */ + .minSubgroupSize = 32, + .maxSubgroupSize = 32, + .maxComputeWorkgroupSubgroups = 1024 / 32, + .requiredSubgroupSizeStages = 0, + .maxInlineUniformBlockSize = 1 << 16, + .maxPerStageDescriptorInlineUniformBlocks = 32, + .maxPerStageDescriptorUpdateAfterBindInlineUniformBlocks = 32, + .maxDescriptorSetInlineUniformBlocks = 6 * 32, + .maxDescriptorSetUpdateAfterBindInlineUniformBlocks = 6 * 32, + .maxInlineUniformTotalSize = 1 << 16, + .integerDotProduct4x8BitPackedUnsignedAccelerated = false, + .integerDotProduct4x8BitPackedSignedAccelerated = false, + .integerDotProduct4x8BitPackedMixedSignednessAccelerated = false, + .storageTexelBufferOffsetAlignmentBytes = HK_MIN_TEXEL_BUFFER_ALIGNMENT, + .storageTexelBufferOffsetSingleTexelAlignment = true, + .uniformTexelBufferOffsetAlignmentBytes = HK_MIN_TEXEL_BUFFER_ALIGNMENT, + .uniformTexelBufferOffsetSingleTexelAlignment = true, + .maxBufferSize = HK_MAX_BUFFER_SIZE, + + /* VK_KHR_push_descriptor */ + .maxPushDescriptors = HK_MAX_PUSH_DESCRIPTORS, + + /* VK_EXT_custom_border_color */ + .maxCustomBorderColorSamplers = 4000, + + /* VK_EXT_extended_dynamic_state3 */ + .dynamicPrimitiveTopologyUnrestricted = true, + + /* VK_EXT_graphics_pipeline_library */ + .graphicsPipelineLibraryFastLinking = true, + .graphicsPipelineLibraryIndependentInterpolationDecoration = true, + + /* VK_EXT_host_image_copy */ + + /* VK_KHR_line_rasterization */ + .lineSubPixelPrecisionBits = 8, + + /* VK_KHR_maintenance5 */ + .earlyFragmentMultisampleCoverageAfterSampleCounting = false, + .earlyFragmentSampleMaskTestBeforeSampleCounting = true, + .depthStencilSwizzleOneSupport = true, + .polygonModePointSize = false, + .nonStrictSinglePixelWideLinesUseParallelogram = false, + .nonStrictWideLinesUseParallelogram = false, + + /* VK_KHR_maintenance6 */ + .blockTexelViewCompatibleMultipleLayers = false, + .maxCombinedImageSamplerDescriptorCount = 3, + .fragmentShadingRateClampCombinerInputs = false, /* TODO */ + + /* VK_EXT_map_memory_placed */ + .minPlacedMemoryMapAlignment = os_page_size, + + /* VK_EXT_multi_draw */ + .maxMultiDrawCount = UINT16_MAX, + + /* VK_EXT_pipeline_robustness */ + .defaultRobustnessStorageBuffers = + VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_DISABLED_EXT, + .defaultRobustnessUniformBuffers = + VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_DISABLED_EXT, + .defaultRobustnessVertexInputs = + VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_DISABLED_EXT, + .defaultRobustnessImages = + VK_PIPELINE_ROBUSTNESS_IMAGE_BEHAVIOR_ROBUST_IMAGE_ACCESS_2_EXT, + + /* VK_EXT_physical_device_drm gets populated later */ + + /* VK_EXT_provoking_vertex */ + .provokingVertexModePerPipeline = true, + 
.transformFeedbackPreservesTriangleFanProvokingVertex = true, + + /* VK_EXT_robustness2 */ + .robustStorageBufferAccessSizeAlignment = HK_SSBO_BOUNDS_CHECK_ALIGNMENT, + .robustUniformBufferAccessSizeAlignment = HK_MIN_UBO_ALIGNMENT, + + /* VK_EXT_sample_locations */ + .sampleLocationSampleCounts = sample_counts, + .maxSampleLocationGridSize = (VkExtent2D){1, 1}, + .sampleLocationCoordinateRange[0] = 0.0f, + .sampleLocationCoordinateRange[1] = 0.9375f, + .sampleLocationSubPixelBits = 4, + .variableSampleLocations = false, + + /* VK_EXT_shader_object */ + .shaderBinaryVersion = 0, + + /* VK_EXT_transform_feedback */ + .maxTransformFeedbackStreams = 4, + .maxTransformFeedbackBuffers = 4, + .maxTransformFeedbackBufferSize = UINT32_MAX, + .maxTransformFeedbackStreamDataSize = 2048, + .maxTransformFeedbackBufferDataSize = 512, + .maxTransformFeedbackBufferDataStride = 2048, + .transformFeedbackQueries = true, + .transformFeedbackStreamsLinesTriangles = false, + .transformFeedbackRasterizationStreamSelect = false, + .transformFeedbackDraw = false, + + /* VK_KHR_vertex_attribute_divisor */ + .maxVertexAttribDivisor = UINT32_MAX, + .supportsNonZeroFirstInstance = true, + + /* VK_KHR_fragment_shader_barycentric */ + .triStripVertexOrderIndependentOfProvokingVertex = false, + }; + + strncpy(properties->deviceName, dev->name, sizeof(properties->deviceName)); + + /* VK_EXT_shader_module_identifier */ + static_assert(sizeof(vk_shaderModuleIdentifierAlgorithmUUID) == + sizeof(properties->shaderModuleIdentifierAlgorithmUUID)); + memcpy(properties->shaderModuleIdentifierAlgorithmUUID, + vk_shaderModuleIdentifierAlgorithmUUID, + sizeof(properties->shaderModuleIdentifierAlgorithmUUID)); + + const struct { + uint16_t vendor_id; + uint16_t device_id; + uint8_t pad[12]; + } dev_uuid = { + .vendor_id = 0, + .device_id = 0, + }; + static_assert(sizeof(dev_uuid) == VK_UUID_SIZE); + memcpy(properties->deviceUUID, &dev_uuid, VK_UUID_SIZE); + static_assert(sizeof(instance->driver_build_sha) >= VK_UUID_SIZE); + memcpy(properties->driverUUID, instance->driver_build_sha, VK_UUID_SIZE); + + strncpy(properties->driverName, "Honeykrisp", VK_MAX_DRIVER_NAME_SIZE); + snprintf(properties->driverInfo, VK_MAX_DRIVER_INFO_SIZE, + "Mesa " PACKAGE_VERSION MESA_GIT_SHA1); + + /* We don't use the layouts ATM so just report all layouts from + * extensions that we support as compatible. 
+ */ + static const VkImageLayout supported_layouts[] = { + VK_IMAGE_LAYOUT_GENERAL, /* required by spec */ + VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL, + VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL, + VK_IMAGE_LAYOUT_DEPTH_STENCIL_READ_ONLY_OPTIMAL, + VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, + VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, + VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, + VK_IMAGE_LAYOUT_PREINITIALIZED, + VK_IMAGE_LAYOUT_DEPTH_READ_ONLY_STENCIL_ATTACHMENT_OPTIMAL, + VK_IMAGE_LAYOUT_DEPTH_ATTACHMENT_STENCIL_READ_ONLY_OPTIMAL, + VK_IMAGE_LAYOUT_DEPTH_ATTACHMENT_OPTIMAL, + VK_IMAGE_LAYOUT_DEPTH_READ_ONLY_OPTIMAL, + VK_IMAGE_LAYOUT_STENCIL_ATTACHMENT_OPTIMAL, + VK_IMAGE_LAYOUT_STENCIL_READ_ONLY_OPTIMAL, + VK_IMAGE_LAYOUT_READ_ONLY_OPTIMAL, + VK_IMAGE_LAYOUT_ATTACHMENT_OPTIMAL, + // VK_IMAGE_LAYOUT_FRAGMENT_DENSITY_MAP_OPTIMAL_EXT, + VK_IMAGE_LAYOUT_ATTACHMENT_FEEDBACK_LOOP_OPTIMAL_EXT, + }; + + properties->pCopySrcLayouts = (VkImageLayout *)supported_layouts; + properties->copySrcLayoutCount = ARRAY_SIZE(supported_layouts); + properties->pCopyDstLayouts = (VkImageLayout *)supported_layouts; + properties->copyDstLayoutCount = ARRAY_SIZE(supported_layouts); + + /* We're a UMA device so we can always map every kind of memory */ + properties->identicalMemoryTypeRequirements = true; + + { + struct mesa_sha1 sha1_ctx; + uint8_t sha1[20]; + + _mesa_sha1_init(&sha1_ctx); + /* Make sure we don't match with other vendors */ + const char *driver = "honeykrisp-v1"; + _mesa_sha1_update(&sha1_ctx, driver, strlen(driver)); + _mesa_sha1_final(&sha1_ctx, sha1); + + memcpy(properties->optimalTilingLayoutUUID, sha1, VK_UUID_SIZE); + } +} + +static void +hk_physical_device_init_pipeline_cache(struct hk_physical_device *pdev) +{ + struct hk_instance *instance = hk_physical_device_instance(pdev); + + struct mesa_sha1 sha_ctx; + _mesa_sha1_init(&sha_ctx); + + _mesa_sha1_update(&sha_ctx, instance->driver_build_sha, + sizeof(instance->driver_build_sha)); + + const uint64_t compiler_flags = hk_physical_device_compiler_flags(pdev); + _mesa_sha1_update(&sha_ctx, &compiler_flags, sizeof(compiler_flags)); + + unsigned char sha[SHA1_DIGEST_LENGTH]; + _mesa_sha1_final(&sha_ctx, sha); + + static_assert(SHA1_DIGEST_LENGTH >= VK_UUID_SIZE); + memcpy(pdev->vk.properties.pipelineCacheUUID, sha, VK_UUID_SIZE); + memcpy(pdev->vk.properties.shaderBinaryUUID, sha, VK_UUID_SIZE); + +#ifdef ENABLE_SHADER_CACHE + char renderer[10]; + ASSERTED int len = snprintf(renderer, sizeof(renderer), "hk_g13g_"); + assert(len == sizeof(renderer) - 2); + + char timestamp[41]; + _mesa_sha1_format(timestamp, instance->driver_build_sha); + + const uint64_t driver_flags = hk_physical_device_compiler_flags(pdev); + pdev->vk.disk_cache = disk_cache_create(renderer, timestamp, driver_flags); +#endif +} + +static void +hk_physical_device_free_disk_cache(struct hk_physical_device *pdev) +{ +#ifdef ENABLE_SHADER_CACHE + if (pdev->vk.disk_cache) { + disk_cache_destroy(pdev->vk.disk_cache); + pdev->vk.disk_cache = NULL; + } +#else + assert(pdev->vk.disk_cache == NULL); +#endif +} + +static uint64_t +hk_get_sysmem_heap_size(void) +{ + uint64_t sysmem_size_B = 0; + if (!os_get_total_physical_memory(&sysmem_size_B)) + return 0; + + /* Use 3/4 of total size to avoid swapping */ + return ROUND_DOWN_TO(sysmem_size_B * 3 / 4, 1 << 20); +} + +static uint64_t +hk_get_sysmem_heap_available(struct hk_physical_device *pdev) +{ + uint64_t sysmem_size_B = 0; + if (!os_get_available_system_memory(&sysmem_size_B)) { + vk_loge(VK_LOG_OBJS(pdev), "Failed to query available system 
memory"); + return 0; + } + + /* Use 3/4 of available to avoid swapping */ + return ROUND_DOWN_TO(sysmem_size_B * 3 / 4, 1 << 20); +} + +VkResult +hk_create_drm_physical_device(struct vk_instance *_instance, + drmDevicePtr drm_device, + struct vk_physical_device **pdev_out) +{ + struct hk_instance *instance = (struct hk_instance *)_instance; + VkResult result; + + /* Blanket refusal to probe due to unstable UAPI. */ + return VK_ERROR_INCOMPATIBLE_DRIVER; + + if (!(drm_device->available_nodes & (1 << DRM_NODE_RENDER)) || + drm_device->bustype != DRM_BUS_PLATFORM) + return VK_ERROR_INCOMPATIBLE_DRIVER; + + const char *path = drm_device->nodes[DRM_NODE_RENDER]; + int fd = open(path, O_RDWR | O_CLOEXEC); + if (fd < 0) { + return vk_errorf(instance, VK_ERROR_INCOMPATIBLE_DRIVER, + "failed to open device %s", path); + } + + drmVersionPtr version = drmGetVersion(fd); + if (!version) { + result = + vk_errorf(instance, VK_ERROR_INCOMPATIBLE_DRIVER, + "failed to query kernel driver version for device %s", path); + goto fail_fd; + } + + bool is_asahi = (strcmp(version->name, "asahi") == 0); + is_asahi |= strcmp(version->name, "virtio_gpu") == 0; + drmFreeVersion(version); + + if (!is_asahi) { + result = + vk_errorf(instance, VK_ERROR_INCOMPATIBLE_DRIVER, + "device %s does not use the asahi kernel driver", path); + goto fail_fd; + } + + struct stat st; + if (stat(drm_device->nodes[DRM_NODE_RENDER], &st)) { + result = vk_errorf(instance, VK_ERROR_INITIALIZATION_FAILED, + "fstat() failed on %s: %m", + drm_device->nodes[DRM_NODE_RENDER]); + goto fail_fd; + } + const dev_t render_dev = st.st_rdev; + + struct hk_physical_device *pdev = + vk_zalloc(&instance->vk.alloc, sizeof(*pdev), 8, + VK_SYSTEM_ALLOCATION_SCOPE_INSTANCE); + + if (pdev == NULL) { + result = vk_error(instance, VK_ERROR_OUT_OF_HOST_MEMORY); + goto fail_fd; + } + + /* TODO: we're render-only, should we be reporting displays anyway in + * KHR_display? 
+ */ + pdev->master_fd = -1; + +#if 0 + if (instance->vk.enabled_extensions.KHR_display) { + int master_fd = + open(drm_device->nodes[DRM_NODE_PRIMARY], O_RDWR | O_CLOEXEC); + + if (master_fd >= 0) { + struct stat st; + if (!stat(drm_device->nodes[DRM_NODE_PRIMARY], &st)) { + pdev->master_fd = master_fd; + properties.drmHasPrimary = true; + properties.drmPrimaryMajor = major(st.st_rdev); + properties.drmPrimaryMinor = minor(st.st_rdev); + } + } + } +#endif + + pdev->render_dev = render_dev; + pdev->dev.fd = fd; + + if (!agx_open_device(NULL, &pdev->dev)) { + result = vk_error(instance, VK_ERROR_UNKNOWN); + goto fail_pdev_alloc; + } + + struct vk_physical_device_dispatch_table dispatch_table; + vk_physical_device_dispatch_table_from_entrypoints( + &dispatch_table, &hk_physical_device_entrypoints, true); + vk_physical_device_dispatch_table_from_entrypoints( + &dispatch_table, &wsi_physical_device_entrypoints, false); + + struct vk_device_extension_table supported_extensions; + hk_get_device_extensions(instance, &supported_extensions); + + struct vk_features supported_features; + hk_get_device_features(&supported_extensions, &supported_features); + + struct vk_properties properties; + hk_get_device_properties(&pdev->dev, instance, &properties); + + properties.drmHasRender = true; + properties.drmRenderMajor = major(render_dev); + properties.drmRenderMinor = minor(render_dev); + + result = vk_physical_device_init(&pdev->vk, &instance->vk, + &supported_extensions, &supported_features, + &properties, &dispatch_table); + if (result != VK_SUCCESS) + goto fail_agx_device; + + hk_physical_device_init_pipeline_cache(pdev); + + uint64_t sysmem_size_B = hk_get_sysmem_heap_size(); + if (sysmem_size_B == 0) { + result = vk_errorf(instance, VK_ERROR_INITIALIZATION_FAILED, + "Failed to query total system memory"); + goto fail_disk_cache; + } + + uint32_t sysmem_heap_idx = pdev->mem_heap_count++; + pdev->mem_heaps[sysmem_heap_idx] = (struct hk_memory_heap){ + .size = sysmem_size_B, + .flags = VK_MEMORY_HEAP_DEVICE_LOCAL_BIT, + .available = hk_get_sysmem_heap_available, + }; + + pdev->mem_types[pdev->mem_type_count++] = (VkMemoryType){ + .propertyFlags = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | + VK_MEMORY_PROPERTY_HOST_COHERENT_BIT | + VK_MEMORY_PROPERTY_HOST_CACHED_BIT | + VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, + .heapIndex = sysmem_heap_idx, + }; + + assert(pdev->mem_heap_count <= ARRAY_SIZE(pdev->mem_heaps)); + assert(pdev->mem_type_count <= ARRAY_SIZE(pdev->mem_types)); + + /* TODO: VK_QUEUE_SPARSE_BINDING_BIT*/ + pdev->queue_families[pdev->queue_family_count++] = (struct hk_queue_family){ + .queue_flags = + VK_QUEUE_GRAPHICS_BIT | VK_QUEUE_COMPUTE_BIT | VK_QUEUE_TRANSFER_BIT, + + .queue_count = 1, + }; + assert(pdev->queue_family_count <= ARRAY_SIZE(pdev->queue_families)); + + unsigned st_idx = 0; + pdev->syncobj_sync_type = vk_drm_syncobj_get_type(fd); + pdev->sync_types[st_idx++] = &pdev->syncobj_sync_type; + pdev->sync_types[st_idx++] = NULL; + assert(st_idx <= ARRAY_SIZE(pdev->sync_types)); + pdev->vk.supported_sync_types = pdev->sync_types; + + result = hk_init_wsi(pdev); + if (result != VK_SUCCESS) + goto fail_disk_cache; + + *pdev_out = &pdev->vk; + + return VK_SUCCESS; + +fail_disk_cache: + hk_physical_device_free_disk_cache(pdev); + vk_physical_device_finish(&pdev->vk); +fail_agx_device: + agx_close_device(&pdev->dev); +fail_pdev_alloc: + if (pdev->master_fd) + close(pdev->master_fd); + + vk_free(&pdev->vk.instance->alloc, pdev); +fail_fd: + close(fd); + return result; +} + +void 
+hk_physical_device_destroy(struct vk_physical_device *vk_pdev) +{ + struct hk_physical_device *pdev = + container_of(vk_pdev, struct hk_physical_device, vk); + + hk_finish_wsi(pdev); + + if (pdev->master_fd >= 0) + close(pdev->master_fd); + + hk_physical_device_free_disk_cache(pdev); + agx_close_device(&pdev->dev); + vk_physical_device_finish(&pdev->vk); + vk_free(&pdev->vk.instance->alloc, pdev); +} + +VKAPI_ATTR void VKAPI_CALL +hk_GetPhysicalDeviceMemoryProperties2( + VkPhysicalDevice physicalDevice, + VkPhysicalDeviceMemoryProperties2 *pMemoryProperties) +{ + VK_FROM_HANDLE(hk_physical_device, pdev, physicalDevice); + + pMemoryProperties->memoryProperties.memoryHeapCount = pdev->mem_heap_count; + for (int i = 0; i < pdev->mem_heap_count; i++) { + pMemoryProperties->memoryProperties.memoryHeaps[i] = (VkMemoryHeap){ + .size = pdev->mem_heaps[i].size, + .flags = pdev->mem_heaps[i].flags, + }; + } + + pMemoryProperties->memoryProperties.memoryTypeCount = pdev->mem_type_count; + for (int i = 0; i < pdev->mem_type_count; i++) { + pMemoryProperties->memoryProperties.memoryTypes[i] = pdev->mem_types[i]; + } + + vk_foreach_struct(ext, pMemoryProperties->pNext) { + switch (ext->sType) { + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MEMORY_BUDGET_PROPERTIES_EXT: { + VkPhysicalDeviceMemoryBudgetPropertiesEXT *p = (void *)ext; + + for (unsigned i = 0; i < pdev->mem_heap_count; i++) { + const struct hk_memory_heap *heap = &pdev->mem_heaps[i]; + uint64_t used = p_atomic_read(&heap->used); + + /* From the Vulkan 1.3.278 spec: + * + * "heapUsage is an array of VK_MAX_MEMORY_HEAPS VkDeviceSize + * values in which memory usages are returned, with one element + * for each memory heap. A heap’s usage is an estimate of how + * much memory the process is currently using in that heap." + * + * TODO: Include internal allocations? + */ + p->heapUsage[i] = used; + + uint64_t available = heap->size; + if (heap->available) + available = heap->available(pdev); + + /* From the Vulkan 1.3.278 spec: + * + * "heapBudget is an array of VK_MAX_MEMORY_HEAPS VkDeviceSize + * values in which memory budgets are returned, with one + * element for each memory heap. A heap’s budget is a rough + * estimate of how much memory the process can allocate from + * that heap before allocations may fail or cause performance + * degradation. The budget includes any currently allocated + * device memory." + * + * and + * + * "The heapBudget value must be less than or equal to + * VkMemoryHeap::size for each heap." + * + * available (queried above) is the total amount free memory + * system-wide and does not include our allocations so we need + * to add that in. + */ + uint64_t budget = MIN2(available + used, heap->size); + + /* Set the budget at 90% of available to avoid thrashing */ + p->heapBudget[i] = ROUND_DOWN_TO(budget * 9 / 10, 1 << 20); + } + + /* From the Vulkan 1.3.278 spec: + * + * "The heapBudget and heapUsage values must be zero for array + * elements greater than or equal to + * VkPhysicalDeviceMemoryProperties::memoryHeapCount. The + * heapBudget value must be non-zero for array elements less than + * VkPhysicalDeviceMemoryProperties::memoryHeapCount." 
+ */ + for (unsigned i = pdev->mem_heap_count; i < VK_MAX_MEMORY_HEAPS; i++) { + p->heapBudget[i] = 0u; + p->heapUsage[i] = 0u; + } + break; + } + default: + vk_debug_ignored_stype(ext->sType); + break; + } + } +} + +VKAPI_ATTR void VKAPI_CALL +hk_GetPhysicalDeviceQueueFamilyProperties2( + VkPhysicalDevice physicalDevice, uint32_t *pQueueFamilyPropertyCount, + VkQueueFamilyProperties2 *pQueueFamilyProperties) +{ + VK_FROM_HANDLE(hk_physical_device, pdev, physicalDevice); + VK_OUTARRAY_MAKE_TYPED(VkQueueFamilyProperties2, out, pQueueFamilyProperties, + pQueueFamilyPropertyCount); + + for (uint8_t i = 0; i < pdev->queue_family_count; i++) { + const struct hk_queue_family *queue_family = &pdev->queue_families[i]; + + vk_outarray_append_typed(VkQueueFamilyProperties2, &out, p) + { + p->queueFamilyProperties.queueFlags = queue_family->queue_flags; + p->queueFamilyProperties.queueCount = queue_family->queue_count; + p->queueFamilyProperties.timestampValidBits = 0; // TODO 64; + p->queueFamilyProperties.minImageTransferGranularity = + (VkExtent3D){1, 1, 1}; + + vk_foreach_struct(ext, p->pNext) { + switch (ext->sType) { + case VK_STRUCTURE_TYPE_QUEUE_FAMILY_GLOBAL_PRIORITY_PROPERTIES_KHR: { + VkQueueFamilyGlobalPriorityPropertiesKHR *props = (void *)ext; + + /* TODO: support multiple priorities */ + props->priorityCount = 1; + props->priorities[0] = VK_QUEUE_GLOBAL_PRIORITY_MEDIUM_EXT; + break; + } + default: + break; + } + } + } + } +} + +static const VkTimeDomainKHR hk_time_domains[] = { + VK_TIME_DOMAIN_DEVICE_KHR, + VK_TIME_DOMAIN_CLOCK_MONOTONIC_KHR, +#ifdef CLOCK_MONOTONIC_RAW + VK_TIME_DOMAIN_CLOCK_MONOTONIC_RAW_KHR, +#endif +}; + +VKAPI_ATTR VkResult VKAPI_CALL +hk_GetPhysicalDeviceCalibrateableTimeDomainsKHR(VkPhysicalDevice physicalDevice, + uint32_t *pTimeDomainCount, + VkTimeDomainKHR *pTimeDomains) +{ + VK_OUTARRAY_MAKE_TYPED(VkTimeDomainKHR, out, pTimeDomains, pTimeDomainCount); + + for (int d = 0; d < ARRAY_SIZE(hk_time_domains); d++) { + vk_outarray_append_typed(VkTimeDomainKHR, &out, i) + { + *i = hk_time_domains[d]; + } + } + + return vk_outarray_status(&out); +} + +VKAPI_ATTR void VKAPI_CALL +hk_GetPhysicalDeviceMultisamplePropertiesEXT( + VkPhysicalDevice physicalDevice, VkSampleCountFlagBits samples, + VkMultisamplePropertiesEXT *pMultisampleProperties) +{ + VK_FROM_HANDLE(hk_physical_device, pdev, physicalDevice); + + if (samples & pdev->vk.properties.sampleLocationSampleCounts) { + pMultisampleProperties->maxSampleLocationGridSize = (VkExtent2D){1, 1}; + } else { + pMultisampleProperties->maxSampleLocationGridSize = (VkExtent2D){0, 0}; + } +} diff --git a/src/asahi/vulkan/hk_physical_device.h b/src/asahi/vulkan/hk_physical_device.h new file mode 100644 index 00000000000..8b8b318d8be --- /dev/null +++ b/src/asahi/vulkan/hk_physical_device.h @@ -0,0 +1,76 @@ +/* + * Copyright 2024 Valve Corporation + * Copyright 2024 Alyssa Rosenzweig + * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc. 
+ * SPDX-License-Identifier: MIT + */ + +#pragma once + +#include "asahi/lib/agx_device.h" +#include +#include "hk_private.h" +#include "vk_physical_device.h" +#include "vk_sync.h" +#include "wsi_common.h" + +struct hk_instance; +struct hk_physical_device; + +struct hk_queue_family { + VkQueueFlags queue_flags; + uint32_t queue_count; +}; + +struct hk_memory_heap { + uint64_t size; + uint64_t used; + VkMemoryHeapFlags flags; + uint64_t (*available)(struct hk_physical_device *pdev); +}; + +struct hk_physical_device { + struct vk_physical_device vk; + dev_t render_dev; + int master_fd; + + /* Only used for VK_EXT_memory_budget */ + struct agx_device dev; + + struct wsi_device wsi_device; + + uint8_t device_uuid[VK_UUID_SIZE]; + + // TODO: add mapable VRAM heap if possible + struct hk_memory_heap mem_heaps[3]; + VkMemoryType mem_types[3]; + uint8_t mem_heap_count; + uint8_t mem_type_count; + + struct hk_queue_family queue_families[3]; + uint8_t queue_family_count; + + struct vk_sync_type syncobj_sync_type; + const struct vk_sync_type *sync_types[2]; +}; + +VK_DEFINE_HANDLE_CASTS(hk_physical_device, vk.base, VkPhysicalDevice, + VK_OBJECT_TYPE_PHYSICAL_DEVICE) + +static inline struct hk_instance * +hk_physical_device_instance(struct hk_physical_device *pdev) +{ + return (struct hk_instance *)pdev->vk.instance; +} + +VkResult hk_create_drm_physical_device(struct vk_instance *vk_instance, + struct _drmDevice *drm_device, + struct vk_physical_device **pdev_out); + +void hk_physical_device_destroy(struct vk_physical_device *vk_device); + +#if defined(VK_USE_PLATFORM_WAYLAND_KHR) || \ + defined(VK_USE_PLATFORM_XCB_KHR) || defined(VK_USE_PLATFORM_XLIB_KHR) || \ + defined(VK_USE_PLATFORM_DISPLAY_KHR) +#define HK_USE_WSI_PLATFORM +#endif diff --git a/src/asahi/vulkan/hk_private.h b/src/asahi/vulkan/hk_private.h new file mode 100644 index 00000000000..bd2b8d68f97 --- /dev/null +++ b/src/asahi/vulkan/hk_private.h @@ -0,0 +1,53 @@ +/* + * Copyright 2024 Valve Corporation + * Copyright 2024 Alyssa Rosenzweig + * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc. + * SPDX-License-Identifier: MIT + */ + +#pragma once + +#include + +#include "vk_log.h" +#include "vk_util.h" + +#define HK_MAX_SETS 8 +#define HK_MAX_PUSH_SIZE 128 +#define HK_MAX_DYNAMIC_BUFFERS 64 +#define HK_MAX_RTS 8 +#define HK_MIN_SSBO_ALIGNMENT 16 +#define HK_MIN_TEXEL_BUFFER_ALIGNMENT 16 +#define HK_MIN_UBO_ALIGNMENT 64 +#define HK_MAX_VIEWPORTS 16 +#define HK_MAX_DESCRIPTOR_SIZE 32 +#define HK_MAX_PUSH_DESCRIPTORS 32 +#define HK_MAX_DESCRIPTOR_SET_SIZE (1u << 30) +#define HK_MAX_DESCRIPTORS (1 << 20) +#define HK_PUSH_DESCRIPTOR_SET_SIZE \ + (HK_MAX_PUSH_DESCRIPTORS * HK_MAX_DESCRIPTOR_SIZE) +#define HK_SSBO_BOUNDS_CHECK_ALIGNMENT 4 +#define HK_MAX_MULTIVIEW_VIEW_COUNT 32 + +#define HK_SPARSE_ADDR_SPACE_SIZE (1ull << 39) +#define HK_MAX_BUFFER_SIZE (1ull << 31) +#define HK_MAX_SHARED_SIZE (32 * 1024) + +struct hk_addr_range { + uint64_t addr; + uint64_t range; +}; + +#define perf_debug(dev, fmt, ...) 
\ + do { \ + if (dev->dev.debug & AGX_DBG_PERF) \ + mesa_log(MESA_LOG_WARN, (MESA_LOG_TAG), (fmt), ##__VA_ARGS__); \ + } while (0) + +/* Fake values, pending UAPI upstreaming */ +#ifndef DRM_FORMAT_MOD_APPLE_TWIDDLED +#define DRM_FORMAT_MOD_APPLE_TWIDDLED (2) +#endif +#ifndef DRM_FORMAT_MOD_APPLE_TWIDDLED_COMPRESSED +#define DRM_FORMAT_MOD_APPLE_TWIDDLED_COMPRESSED (3) +#endif diff --git a/src/asahi/vulkan/hk_query_pool.c b/src/asahi/vulkan/hk_query_pool.c new file mode 100644 index 00000000000..5762c69419c --- /dev/null +++ b/src/asahi/vulkan/hk_query_pool.c @@ -0,0 +1,580 @@ +/* + * Copyright 2024 Valve Corporation + * Copyright 2024 Alyssa Rosenzweig + * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc. + * SPDX-License-Identifier: MIT + */ +#include "hk_query_pool.h" + +#include "agx_compile.h" +#include "agx_pack.h" +#include "hk_buffer.h" +#include "hk_cmd_buffer.h" +#include "hk_device.h" +#include "hk_entrypoints.h" +#include "hk_event.h" +#include "hk_physical_device.h" +#include "hk_shader.h" + +#include "shader_enums.h" +#include "vk_common_entrypoints.h" +#include "vk_meta.h" +#include "vk_pipeline.h" + +#include "asahi/lib/agx_bo.h" +#include "asahi/lib/libagx_shaders.h" +#include "asahi/lib/shaders/query.h" +#include "compiler/nir/nir.h" +#include "compiler/nir/nir_builder.h" + +#include "util/os_time.h" +#include "vulkan/vulkan_core.h" + +struct hk_query_report { + /* TODO: do we want this to be legit u64? */ + uint32_t value; + uint32_t padding; +}; + +static uint16_t * +hk_pool_oq_index_ptr(const struct hk_query_pool *pool) +{ + return (uint16_t *)(pool->bo->ptr.cpu + pool->query_start); +} + +static uint32_t +hk_reports_per_query(struct hk_query_pool *pool) +{ + switch (pool->vk.query_type) { + case VK_QUERY_TYPE_OCCLUSION: + case VK_QUERY_TYPE_TIMESTAMP: + case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT: + return 1; + case VK_QUERY_TYPE_PIPELINE_STATISTICS: + return util_bitcount(pool->vk.pipeline_statistics); + case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: + // Primitives succeeded and primitives needed + return 2; + default: + unreachable("Unsupported query type"); + } +} + +VKAPI_ATTR VkResult VKAPI_CALL +hk_CreateQueryPool(VkDevice device, const VkQueryPoolCreateInfo *pCreateInfo, + const VkAllocationCallbacks *pAllocator, + VkQueryPool *pQueryPool) +{ + VK_FROM_HANDLE(hk_device, dev, device); + struct hk_query_pool *pool; + + bool occlusion = pCreateInfo->queryType == VK_QUERY_TYPE_OCCLUSION; + unsigned occlusion_queries = occlusion ? 
pCreateInfo->queryCount : 0; + + pool = + vk_query_pool_create(&dev->vk, pCreateInfo, pAllocator, sizeof(*pool)); + if (!pool) + return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY); + + /* We place the availability first and then data */ + pool->query_start = align(pool->vk.query_count * sizeof(uint32_t), + sizeof(struct hk_query_report)); + + uint32_t reports_per_query = hk_reports_per_query(pool); + pool->query_stride = reports_per_query * sizeof(struct hk_query_report); + + if (pool->vk.query_count > 0) { + uint32_t bo_size = pool->query_start; + + /* For occlusion queries, we stick the query index remapping here */ + if (occlusion_queries) + bo_size += sizeof(uint16_t) * pool->vk.query_count; + else + bo_size += pool->query_stride * pool->vk.query_count; + + pool->bo = + agx_bo_create(&dev->dev, bo_size, AGX_BO_WRITEBACK, "Query pool"); + if (!pool->bo) { + hk_DestroyQueryPool(device, hk_query_pool_to_handle(pool), pAllocator); + return vk_error(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY); + } + } + + uint16_t *oq_index = hk_pool_oq_index_ptr(pool); + + for (unsigned i = 0; i < occlusion_queries; ++i) { + uint64_t zero = 0; + unsigned index; + + VkResult result = hk_descriptor_table_add( + dev, &dev->occlusion_queries, &zero, sizeof(uint64_t), &index); + + if (result != VK_SUCCESS) { + hk_DestroyQueryPool(device, hk_query_pool_to_handle(pool), pAllocator); + return vk_error(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY); + } + + /* We increment as we go so we can clean up properly if we run out */ + assert(pool->oq_queries < occlusion_queries); + oq_index[pool->oq_queries++] = index; + } + + *pQueryPool = hk_query_pool_to_handle(pool); + + return VK_SUCCESS; +} + +VKAPI_ATTR void VKAPI_CALL +hk_DestroyQueryPool(VkDevice device, VkQueryPool queryPool, + const VkAllocationCallbacks *pAllocator) +{ + VK_FROM_HANDLE(hk_device, dev, device); + VK_FROM_HANDLE(hk_query_pool, pool, queryPool); + + if (!pool) + return; + + uint16_t *oq_index = hk_pool_oq_index_ptr(pool); + + for (unsigned i = 0; i < pool->oq_queries; ++i) { + hk_descriptor_table_remove(dev, &dev->occlusion_queries, oq_index[i]); + } + + agx_bo_unreference(pool->bo); + vk_query_pool_destroy(&dev->vk, pAllocator, &pool->vk); +} + +static uint64_t +hk_query_available_addr(struct hk_query_pool *pool, uint32_t query) +{ + assert(query < pool->vk.query_count); + return pool->bo->ptr.gpu + query * sizeof(uint32_t); +} + +static uint32_t * +hk_query_available_map(struct hk_query_pool *pool, uint32_t query) +{ + assert(query < pool->vk.query_count); + return (uint32_t *)pool->bo->ptr.cpu + query; +} + +static uint64_t +hk_query_offset(struct hk_query_pool *pool, uint32_t query) +{ + assert(query < pool->vk.query_count); + return pool->query_start + query * pool->query_stride; +} + +static uint64_t +hk_query_report_addr(struct hk_device *dev, struct hk_query_pool *pool, + uint32_t query) +{ + if (pool->oq_queries) { + uint16_t *oq_index = hk_pool_oq_index_ptr(pool); + return dev->occlusion_queries.bo->ptr.gpu + + (oq_index[query] * sizeof(uint64_t)); + } else { + return pool->bo->ptr.gpu + hk_query_offset(pool, query); + } +} + +static struct hk_query_report * +hk_query_report_map(struct hk_device *dev, struct hk_query_pool *pool, + uint32_t query) +{ + if (pool->oq_queries) { + uint64_t *queries = (uint64_t *)dev->occlusion_queries.bo->ptr.cpu; + uint16_t *oq_index = hk_pool_oq_index_ptr(pool); + + return (struct hk_query_report *)&queries[oq_index[query]]; + } else { + return (void *)((char *)pool->bo->ptr.cpu + hk_query_offset(pool, query)); + } +} + 
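/*
 * Editor's note (illustrative sketch, not part of the patch): the helpers
 * above assume the following layout for the query pool BO. Availability
 * words come first, padded up to the report alignment; the remainder holds
 * either the occlusion-query index remap table (occlusion pools, whose
 * reports live in the device-wide dev->occlusion_queries heap) or the packed
 * reports themselves.
 *
 *   offset 0                  uint32_t available[vk.query_count]
 *   offset pool->query_start  uint16_t oq_index[vk.query_count]     (occlusion pools)
 *                             struct hk_query_report reports[...]   (all other pools)
 *
 * The hypothetical helper below is not used by the driver; it only mirrors
 * the size calculation in hk_CreateQueryPool to make the layout concrete.
 */
static inline uint32_t
hk_sketch_query_pool_bo_size(uint32_t query_count, uint32_t query_stride,
                             bool occlusion)
{
   /* Availability words, padded so the data that follows is report-aligned */
   uint32_t query_start =
      align(query_count * sizeof(uint32_t), sizeof(struct hk_query_report));

   /* Occlusion pools only store the uint16_t remap table here; every other
    * pool type stores its reports inline after the availability words.
    */
   uint32_t data_size = occlusion ? query_count * sizeof(uint16_t)
                                  : query_count * query_stride;

   return query_start + data_size;
}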
+struct hk_write_params { + uint64_t address; + uint32_t value; +}; + +static void +hk_nir_write_u32(nir_builder *b, UNUSED const void *key) +{ + nir_def *addr = nir_load_preamble( + b, 1, 64, .base = offsetof(struct hk_write_params, address) / 2); + + nir_def *value = nir_load_preamble( + b, 1, 32, .base = offsetof(struct hk_write_params, value) / 2); + + nir_store_global(b, addr, 4, value, nir_component_mask(1)); +} + +void +hk_queue_write(struct hk_cmd_buffer *cmd, uint64_t address, uint32_t value, + bool after_gfx) +{ + struct hk_cs *cs = hk_cmd_buffer_get_cs_general( + cmd, after_gfx ? &cmd->current_cs.post_gfx : &cmd->current_cs.cs, true); + if (!cs) + return; + + hk_ensure_cs_has_space(cmd, cs, 0x2000 /* TODO */); + + /* As soon as we mark a query available, it needs to be available system + * wide, otherwise a CPU-side get result can read stale data. As such, we + * cache flush before and then let coherency work its magic. Without this + * barrier, we get flakes in + * + * dEQP-VK.query_pool.occlusion_query.get_results_conservative_size_64_wait_query_without_availability_draw_triangles_discard + */ + struct hk_device *dev = hk_cmd_buffer_device(cmd); + hk_cdm_cache_flush(dev, cs); + + struct hk_shader *s = hk_meta_kernel(dev, hk_nir_write_u32, NULL, 0); + struct hk_write_params params = {.address = address, .value = value}; + uint32_t usc = hk_upload_usc_words_kernel(cmd, s, &params, sizeof(params)); + + hk_dispatch_with_usc(dev, cs, s, usc, hk_grid(1, 1, 1), hk_grid(1, 1, 1)); +} + +/** + * Goes through a series of consecutive query indices in the given pool, + * setting all element values to 0 and emitting them as available. + */ +static void +emit_zero_queries(struct hk_cmd_buffer *cmd, struct hk_query_pool *pool, + uint32_t first_index, uint32_t num_queries, + bool set_available) +{ + struct hk_device *dev = hk_cmd_buffer_device(cmd); + + for (uint32_t i = 0; i < num_queries; i++) { + uint64_t available = hk_query_available_addr(pool, first_index + i); + uint64_t report = hk_query_report_addr(dev, pool, first_index + i); + hk_queue_write(cmd, available, set_available, false); + + /* XXX: is this supposed to happen on the begin?
*/ + for (unsigned j = 0; j < hk_reports_per_query(pool); ++j) { + hk_queue_write(cmd, report + (j * sizeof(struct hk_query_report)), 0, + false); + } + } +} + +VKAPI_ATTR void VKAPI_CALL +hk_ResetQueryPool(VkDevice device, VkQueryPool queryPool, uint32_t firstQuery, + uint32_t queryCount) +{ + VK_FROM_HANDLE(hk_query_pool, pool, queryPool); + VK_FROM_HANDLE(hk_device, dev, device); + + uint32_t *available = hk_query_available_map(pool, firstQuery); + struct hk_query_report *reports = hk_query_report_map(dev, pool, firstQuery); + + memset(available, 0, queryCount * sizeof(*available)); + memset(reports, 0, queryCount * pool->query_stride); +} + +VKAPI_ATTR void VKAPI_CALL +hk_CmdResetQueryPool(VkCommandBuffer commandBuffer, VkQueryPool queryPool, + uint32_t firstQuery, uint32_t queryCount) +{ + VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer); + VK_FROM_HANDLE(hk_query_pool, pool, queryPool); + + emit_zero_queries(cmd, pool, firstQuery, queryCount, false); +} + +VKAPI_ATTR void VKAPI_CALL +hk_CmdWriteTimestamp2(VkCommandBuffer commandBuffer, + VkPipelineStageFlags2 stage, VkQueryPool queryPool, + uint32_t query) +{ + unreachable("todo"); +#if 0 + VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer); + VK_FROM_HANDLE(hk_query_pool, pool, queryPool); + + struct nv_push *p = hk_cmd_buffer_push(cmd, 10); + + uint64_t report_addr = hk_query_report_addr(pool, query); + P_MTHD(p, NV9097, SET_REPORT_SEMAPHORE_A); + P_NV9097_SET_REPORT_SEMAPHORE_A(p, report_addr >> 32); + P_NV9097_SET_REPORT_SEMAPHORE_B(p, report_addr); + P_NV9097_SET_REPORT_SEMAPHORE_C(p, 0); + P_NV9097_SET_REPORT_SEMAPHORE_D(p, { + .operation = OPERATION_REPORT_ONLY, + .pipeline_location = vk_stage_flags_to_nv9097_pipeline_location(stage), + .structure_size = STRUCTURE_SIZE_FOUR_WORDS, + }); + + uint64_t available_addr = hk_query_available_addr(pool, query); + P_MTHD(p, NV9097, SET_REPORT_SEMAPHORE_A); + P_NV9097_SET_REPORT_SEMAPHORE_A(p, available_addr >> 32); + P_NV9097_SET_REPORT_SEMAPHORE_B(p, available_addr); + P_NV9097_SET_REPORT_SEMAPHORE_C(p, 1); + P_NV9097_SET_REPORT_SEMAPHORE_D(p, { + .operation = OPERATION_RELEASE, + .release = RELEASE_AFTER_ALL_PRECEEDING_WRITES_COMPLETE, + .pipeline_location = PIPELINE_LOCATION_ALL, + .structure_size = STRUCTURE_SIZE_ONE_WORD, + }); + + /* From the Vulkan spec: + * + * "If vkCmdWriteTimestamp2 is called while executing a render pass + * instance that has multiview enabled, the timestamp uses N consecutive + * query indices in the query pool (starting at query) where N is the + * number of bits set in the view mask of the subpass the command is + * executed in. The resulting query values are determined by an + * implementation-dependent choice of one of the following behaviors:" + * + * In our case, only the first query is used, so we emit zeros for the + * remaining queries, as described in the first behavior listed in the + * Vulkan spec: + * + * "The first query is a timestamp value and (if more than one bit is set + * in the view mask) zero is written to the remaining queries." 
+ */ + if (cmd->state.gfx.render.view_mask != 0) { + const uint32_t num_queries = + util_bitcount(cmd->state.gfx.render.view_mask); + if (num_queries > 1) + emit_zero_queries(cmd, pool, query + 1, num_queries - 1, true); + } +#endif +} + +static void +hk_cmd_begin_end_query(struct hk_cmd_buffer *cmd, struct hk_query_pool *pool, + uint32_t query, uint32_t index, + VkQueryControlFlags flags, bool end) +{ + struct hk_device *dev = hk_cmd_buffer_device(cmd); + bool graphics = false; + + switch (pool->vk.query_type) { + case VK_QUERY_TYPE_OCCLUSION: { + assert(query < pool->oq_queries); + + if (end) { + cmd->state.gfx.occlusion.mode = AGX_VISIBILITY_MODE_NONE; + } else { + cmd->state.gfx.occlusion.mode = flags & VK_QUERY_CONTROL_PRECISE_BIT + ? AGX_VISIBILITY_MODE_COUNTING + : AGX_VISIBILITY_MODE_BOOLEAN; + } + + uint16_t *oq_index = hk_pool_oq_index_ptr(pool); + cmd->state.gfx.occlusion.index = oq_index[query]; + cmd->state.gfx.dirty |= HK_DIRTY_OCCLUSION; + break; + } + + case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: { + uint64_t addr = hk_query_report_addr(dev, pool, query); + cmd->state.gfx.xfb_query[index] = end ? 0 : addr; + break; + } + + case VK_QUERY_TYPE_PIPELINE_STATISTICS: { + struct hk_root_descriptor_table *root = &cmd->state.gfx.descriptors.root; + cmd->state.gfx.descriptors.root_dirty = true; + + root->draw.pipeline_stats = hk_query_report_addr(dev, pool, query); + root->draw.pipeline_stats_flags = pool->vk.pipeline_statistics; + + /* XXX: I don't think is correct... when does the query become available + * exactly? + */ + graphics = pool->vk.pipeline_statistics & + ~VK_QUERY_PIPELINE_STATISTIC_COMPUTE_SHADER_INVOCATIONS_BIT; + break; + } + + default: + unreachable("Unsupported query type"); + } + + /* We need to set available=1 after the graphics work finishes. */ + if (end) { + hk_queue_write(cmd, hk_query_available_addr(pool, query), 1, graphics); + } +} + +VKAPI_ATTR void VKAPI_CALL +hk_CmdBeginQueryIndexedEXT(VkCommandBuffer commandBuffer, VkQueryPool queryPool, + uint32_t query, VkQueryControlFlags flags, + uint32_t index) +{ + VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer); + VK_FROM_HANDLE(hk_query_pool, pool, queryPool); + + hk_cmd_begin_end_query(cmd, pool, query, index, flags, false); +} + +VKAPI_ATTR void VKAPI_CALL +hk_CmdEndQueryIndexedEXT(VkCommandBuffer commandBuffer, VkQueryPool queryPool, + uint32_t query, uint32_t index) +{ + VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer); + VK_FROM_HANDLE(hk_query_pool, pool, queryPool); + + hk_cmd_begin_end_query(cmd, pool, query, index, 0, true); + + /* From the Vulkan spec: + * + * "If queries are used while executing a render pass instance that has + * multiview enabled, the query uses N consecutive query indices in + * the query pool (starting at query) where N is the number of bits set + * in the view mask in the subpass the query is used in. How the + * numerical results of the query are distributed among the queries is + * implementation-dependent." + * + * In our case, only the first query is used, so we emit zeros for the + * remaining queries. 
+ */ + if (cmd->state.gfx.render.view_mask != 0) { + const uint32_t num_queries = + util_bitcount(cmd->state.gfx.render.view_mask); + if (num_queries > 1) + emit_zero_queries(cmd, pool, query + 1, num_queries - 1, true); + } +} + +static bool +hk_query_is_available(struct hk_query_pool *pool, uint32_t query) +{ + uint32_t *available = hk_query_available_map(pool, query); + return p_atomic_read(available) != 0; +} + +#define HK_QUERY_TIMEOUT 2000000000ull + +static VkResult +hk_query_wait_for_available(struct hk_device *dev, struct hk_query_pool *pool, + uint32_t query) +{ + uint64_t abs_timeout_ns = os_time_get_absolute_timeout(HK_QUERY_TIMEOUT); + + while (os_time_get_nano() < abs_timeout_ns) { + if (hk_query_is_available(pool, query)) + return VK_SUCCESS; + + VkResult status = vk_device_check_status(&dev->vk); + if (status != VK_SUCCESS) + return status; + } + + return vk_device_set_lost(&dev->vk, "query timeout"); +} + +static void +cpu_write_query_result(void *dst, uint32_t idx, VkQueryResultFlags flags, + uint64_t result) +{ + if (flags & VK_QUERY_RESULT_64_BIT) { + uint64_t *dst64 = dst; + dst64[idx] = result; + } else { + uint32_t *dst32 = dst; + dst32[idx] = result; + } +} + +VKAPI_ATTR VkResult VKAPI_CALL +hk_GetQueryPoolResults(VkDevice device, VkQueryPool queryPool, + uint32_t firstQuery, uint32_t queryCount, + size_t dataSize, void *pData, VkDeviceSize stride, + VkQueryResultFlags flags) +{ + VK_FROM_HANDLE(hk_device, dev, device); + VK_FROM_HANDLE(hk_query_pool, pool, queryPool); + + if (vk_device_is_lost(&dev->vk)) + return VK_ERROR_DEVICE_LOST; + + VkResult status = VK_SUCCESS; + for (uint32_t i = 0; i < queryCount; i++) { + const uint32_t query = firstQuery + i; + + bool available = hk_query_is_available(pool, query); + + if (!available && (flags & VK_QUERY_RESULT_WAIT_BIT)) { + status = hk_query_wait_for_available(dev, pool, query); + if (status != VK_SUCCESS) + return status; + + available = true; + } + + bool write_results = available || (flags & VK_QUERY_RESULT_PARTIAL_BIT); + + const struct hk_query_report *src = hk_query_report_map(dev, pool, query); + assert(i * stride < dataSize); + void *dst = (char *)pData + i * stride; + + uint32_t reports = hk_reports_per_query(pool); + if (write_results) { + for (uint32_t j = 0; j < reports; j++) { + cpu_write_query_result(dst, j, flags, src[j].value); + } + } + + if (!write_results) + status = VK_NOT_READY; + + if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) + cpu_write_query_result(dst, reports, flags, available); + } + + return status; +} + +static void +hk_nir_copy_query(nir_builder *b, UNUSED const void *key) +{ + nir_def *id = nir_channel(b, nir_load_workgroup_id(b), 0); + libagx_copy_query(b, nir_load_preamble(b, 1, 64), id); +} + +VKAPI_ATTR void VKAPI_CALL +hk_CmdCopyQueryPoolResults(VkCommandBuffer commandBuffer, VkQueryPool queryPool, + uint32_t firstQuery, uint32_t queryCount, + VkBuffer dstBuffer, VkDeviceSize dstOffset, + VkDeviceSize stride, VkQueryResultFlags flags) +{ + VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer); + VK_FROM_HANDLE(hk_query_pool, pool, queryPool); + VK_FROM_HANDLE(hk_buffer, dst_buffer, dstBuffer); + + struct hk_device *dev = hk_cmd_buffer_device(cmd); + struct hk_cs *cs = hk_cmd_buffer_get_cs(cmd, true); + if (!cs) + return; + + hk_ensure_cs_has_space(cmd, cs, 0x2000 /* TODO */); + + const struct libagx_copy_query_push info = { + .availability = pool->bo->ptr.gpu, + .results = pool->oq_queries ? 
dev->occlusion_queries.bo->ptr.gpu + : pool->bo->ptr.gpu + pool->query_start, + .oq_index = pool->oq_queries ? pool->bo->ptr.gpu + pool->query_start : 0, + + .first_query = firstQuery, + .dst_addr = hk_buffer_address(dst_buffer, dstOffset), + .dst_stride = stride, + .reports_per_query = hk_reports_per_query(pool), + + .partial = flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT, + ._64 = flags & VK_QUERY_RESULT_64_BIT, + .with_availability = flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT, + }; + + uint64_t push = hk_pool_upload(cmd, &info, sizeof(info), 8); + + struct hk_shader *s = hk_meta_kernel(dev, hk_nir_copy_query, NULL, 0); + uint32_t usc = hk_upload_usc_words_kernel(cmd, s, &push, sizeof(push)); + hk_dispatch_with_usc(dev, cs, s, usc, hk_grid(queryCount, 1, 1), + hk_grid(1, 1, 1)); +} diff --git a/src/asahi/vulkan/hk_query_pool.h b/src/asahi/vulkan/hk_query_pool.h new file mode 100644 index 00000000000..9e235dfed08 --- /dev/null +++ b/src/asahi/vulkan/hk_query_pool.h @@ -0,0 +1,28 @@ +/* + * Copyright 2024 Valve Corporation + * Copyright 2024 Alyssa Rosenzweig + * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc. + * SPDX-License-Identifier: MIT + */ + +#pragma once + +#include "hk_private.h" +#include "vk_query_pool.h" + +struct agx_bo; + +struct hk_query_pool { + struct vk_query_pool vk; + + uint32_t query_start; + uint32_t query_stride; + + struct agx_bo *bo; + void *bo_map; + + unsigned oq_queries; +}; + +VK_DEFINE_NONDISP_HANDLE_CASTS(hk_query_pool, vk.base, VkQueryPool, + VK_OBJECT_TYPE_QUERY_POOL) diff --git a/src/asahi/vulkan/hk_queue.c b/src/asahi/vulkan/hk_queue.c new file mode 100644 index 00000000000..7cc1c8be139 --- /dev/null +++ b/src/asahi/vulkan/hk_queue.c @@ -0,0 +1,599 @@ +/* + * Copyright 2024 Valve Corporation + * Copyright 2024 Alyssa Rosenzweig + * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc. + * Copyright 2024 Valve Corporation + * Copyright 2024 Alyssa Rosenzweig + * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc. + * SPDX-License-Identifier: MIT + */ +#include "hk_queue.h" + +#include "agx_bo.h" +#include "agx_device.h" +#include "agx_pack.h" +#include "decode.h" +#include "hk_cmd_buffer.h" +#include "hk_device.h" +#include "hk_physical_device.h" + +#include +#include "asahi/lib/unstable_asahi_drm.h" +#include "util/list.h" +#include "vulkan/vulkan_core.h" + +#include "vk_drm_syncobj.h" +#include "vk_sync.h" + +/* + * We need to specially handle submits with no control streams. The kernel + * can't accept empty submits, but we can end up here in Vulkan for + * synchronization purposes only. Rather than submit a no-op job (slow), + * we simply tie the fences together. + */ +static VkResult +queue_submit_empty(struct hk_device *dev, struct hk_queue *queue, + struct vk_queue_submit *submit) +{ + int fd = dev->dev.fd; + + /* Transfer the waits into the queue timeline. */ + for (unsigned i = 0; i < submit->wait_count; ++i) { + struct vk_sync_wait *wait = &submit->waits[i]; + + assert(vk_sync_type_is_drm_syncobj(wait->sync->type)); + const struct vk_drm_syncobj *syncobj = vk_sync_as_drm_syncobj(wait->sync); + + drmSyncobjTransfer(fd, queue->drm.syncobj, ++queue->drm.timeline_value, + syncobj->syncobj, wait->wait_value, 0); + } + + /* Transfer the queue timeline into each out fence. They will all be + * signalled when we reach this point. 
+ */ + for (unsigned i = 0; i < submit->signal_count; ++i) { + struct vk_sync_signal *signal = &submit->signals[i]; + + assert(vk_sync_type_is_drm_syncobj(signal->sync->type)); + const struct vk_drm_syncobj *syncobj = + vk_sync_as_drm_syncobj(signal->sync); + + drmSyncobjTransfer(fd, syncobj->syncobj, signal->signal_value, + queue->drm.syncobj, queue->drm.timeline_value, 0); + } + + return VK_SUCCESS; +} + +static void +asahi_fill_cdm_command(struct hk_device *dev, struct hk_cs *cs, + struct drm_asahi_cmd_compute *cmd) +{ + size_t len = cs->stream_linked ? 65536 /* XXX */ : (cs->current - cs->start); + + *cmd = (struct drm_asahi_cmd_compute){ + .encoder_ptr = cs->addr, + .encoder_end = cs->addr + len, + + .sampler_array = dev->samplers.table.bo->ptr.gpu, + .sampler_count = dev->samplers.table.alloc, + .sampler_max = dev->samplers.table.alloc + 1, + + .encoder_id = agx_get_global_id(&dev->dev), + .cmd_id = agx_get_global_id(&dev->dev), + .unk_mask = 0xffffffff, + }; + + if (cs->scratch.cs.main || cs->scratch.cs.preamble) { + cmd->helper_arg = dev->scratch.cs.buf->ptr.gpu; + cmd->helper_cfg = cs->scratch.cs.preamble << 16; + cmd->helper_program = dev->dev.helper->ptr.gpu | 1; + } +} + +static void +asahi_fill_vdm_command(struct hk_device *dev, struct hk_cs *cs, + struct drm_asahi_cmd_render *c) +{ +#if 0 + bool clear_pipeline_textures = + agx_tilebuffer_spills(&batch->tilebuffer_layout); + + for (unsigned i = 0; i < batch->key.nr_cbufs; ++i) { + struct pipe_surface *surf = batch->key.cbufs[i]; + + clear_pipeline_textures |= + surf && surf->texture && !(batch->clear & (PIPE_CLEAR_COLOR0 << i)); + } + +#endif + unsigned cmd_ta_id = agx_get_global_id(&dev->dev); + unsigned cmd_3d_id = agx_get_global_id(&dev->dev); + unsigned encoder_id = agx_get_global_id(&dev->dev); + + memset(c, 0, sizeof(*c)); + + c->encoder_ptr = cs->addr; + c->encoder_id = encoder_id; + c->cmd_3d_id = cmd_3d_id; + c->cmd_ta_id = cmd_ta_id; + c->ppp_ctrl = 0x202; + + c->fb_width = cs->cr.width; + c->fb_height = cs->cr.height; + + c->isp_bgobjdepth = cs->cr.isp_bgobjdepth; + c->isp_bgobjvals = cs->cr.isp_bgobjvals; + + static_assert(sizeof(c->zls_ctrl) == sizeof(cs->cr.zls_control)); + memcpy(&c->zls_ctrl, &cs->cr.zls_control, sizeof(cs->cr.zls_control)); + + c->depth_dimensions = (cs->cr.width - 1) | ((cs->cr.height - 1) << 15); + + c->depth_buffer_load = cs->cr.depth.buffer; + c->depth_buffer_store = cs->cr.depth.buffer; + c->depth_buffer_partial = cs->cr.depth.buffer; + + c->depth_buffer_load_stride = cs->cr.depth.stride; + c->depth_buffer_store_stride = cs->cr.depth.stride; + c->depth_buffer_partial_stride = cs->cr.depth.stride; + + c->depth_meta_buffer_load = cs->cr.depth.meta; + c->depth_meta_buffer_store = cs->cr.depth.meta; + c->depth_meta_buffer_partial = cs->cr.depth.meta; + + c->depth_meta_buffer_load_stride = cs->cr.depth.stride; + c->depth_meta_buffer_store_stride = cs->cr.depth.meta_stride; + c->depth_meta_buffer_partial_stride = cs->cr.depth.meta_stride; + + c->stencil_buffer_load = cs->cr.stencil.buffer; + c->stencil_buffer_store = cs->cr.stencil.buffer; + c->stencil_buffer_partial = cs->cr.stencil.buffer; + + c->stencil_buffer_load_stride = cs->cr.stencil.stride; + c->stencil_buffer_store_stride = cs->cr.stencil.stride; + c->stencil_buffer_partial_stride = cs->cr.stencil.stride; + + c->stencil_meta_buffer_load = cs->cr.stencil.meta; + c->stencil_meta_buffer_store = cs->cr.stencil.meta; + c->stencil_meta_buffer_partial = cs->cr.stencil.meta; + + c->stencil_meta_buffer_load_stride = cs->cr.stencil.stride; + 
c->stencil_meta_buffer_store_stride = cs->cr.stencil.meta_stride; + c->stencil_meta_buffer_partial_stride = cs->cr.stencil.meta_stride; + + c->iogpu_unk_214 = cs->cr.iogpu_unk_214; + +#if 0 + if (clear_pipeline_textures) + c->flags |= ASAHI_RENDER_SET_WHEN_RELOADING_Z_OR_S; + else + c->flags |= ASAHI_RENDER_NO_CLEAR_PIPELINE_TEXTURES; + + if (zres && !(batch->clear & PIPE_CLEAR_DEPTH)) + c->flags |= ASAHI_RENDER_SET_WHEN_RELOADING_Z_OR_S; + + if (sres && !(batch->clear & PIPE_CLEAR_STENCIL)) + c->flags |= ASAHI_RENDER_SET_WHEN_RELOADING_Z_OR_S; +#endif + + if (dev->dev.debug & AGX_DBG_NOCLUSTER) + c->flags |= ASAHI_RENDER_NO_VERTEX_CLUSTERING; + +#if 0 + /* XXX is this for just MSAA+Z+S or MSAA+(Z|S)? */ + if (tib->nr_samples > 1 && framebuffer->zsbuf) + c->flags |= ASAHI_RENDER_MSAA_ZS; +#endif + + c->utile_width = cs->tib.tile_size.width; + c->utile_height = cs->tib.tile_size.height; + + /* Can be 0 for attachmentless rendering with no draws */ + c->samples = MAX2(cs->tib.nr_samples, 1); + c->layers = cs->cr.layers; + + c->ppp_multisamplectl = cs->ppp_multisamplectl; + c->sample_size = cs->tib.sample_size_B; + + /* XXX OR 0x80 with eMRT? */ + c->tib_blocks = ALIGN_POT(agx_tilebuffer_total_size(&cs->tib), 2048) / 2048; + + float tan_60 = 1.732051f; + c->merge_upper_x = fui(tan_60 / cs->cr.width); + c->merge_upper_y = fui(tan_60 / cs->cr.height); + + c->load_pipeline = cs->cr.bg.main.usc | 4; + c->store_pipeline = cs->cr.eot.main.usc | 4; + c->partial_reload_pipeline = cs->cr.bg.partial.usc | 4; + c->partial_store_pipeline = cs->cr.eot.partial.usc | 4; + + memcpy(&c->load_pipeline_bind, &cs->cr.bg.main.counts, + sizeof(struct agx_counts_packed)); + + memcpy(&c->store_pipeline_bind, &cs->cr.eot.main.counts, + sizeof(struct agx_counts_packed)); + + memcpy(&c->partial_reload_pipeline_bind, &cs->cr.bg.partial.counts, + sizeof(struct agx_counts_packed)); + + memcpy(&c->partial_store_pipeline_bind, &cs->cr.eot.partial.counts, + sizeof(struct agx_counts_packed)); + + c->scissor_array = cs->uploaded_scissor; + c->depth_bias_array = cs->uploaded_zbias; + + c->vertex_sampler_array = dev->samplers.table.bo->ptr.gpu; + c->vertex_sampler_count = dev->samplers.table.alloc; + c->vertex_sampler_max = dev->samplers.table.alloc + 1; + + c->fragment_sampler_array = c->vertex_sampler_array; + c->fragment_sampler_count = c->vertex_sampler_count; + c->fragment_sampler_max = c->vertex_sampler_max; + + c->visibility_result_buffer = dev->occlusion_queries.bo->ptr.gpu; + + /* If a tile is empty, we do not want to process it, as the redundant + * roundtrip of memory-->tilebuffer-->memory wastes a tremendous amount of + * memory bandwidth. Any draw marks a tile as non-empty, so we only need to + * process empty tiles if the background+EOT programs have a side effect. + * This is the case exactly when there is an attachment we are clearing (some + * attachment A in clear and in resolve <==> non-empty intersection). + * + * This case matters a LOT for performance in workloads that split batches. 
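+    *
+    * For example, a pass that clears an attachment but only draws to a few
+    * tiles still needs the background/EOT programs to run on every tile so
+    * the clear reaches memory, whereas a pass with no clears and no resolves
+    * can skip its untouched tiles entirely.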
+ */ + if (true /* TODO */) + c->flags |= ASAHI_RENDER_PROCESS_EMPTY_TILES; + + if (cs->scratch.vs.main || cs->scratch.vs.preamble) { + c->flags |= ASAHI_RENDER_VERTEX_SPILLS; + c->vertex_helper_arg = dev->scratch.vs.buf->ptr.gpu; + c->vertex_helper_cfg = cs->scratch.vs.preamble << 16; + c->vertex_helper_program = dev->dev.helper->ptr.gpu | 1; + } + + if (cs->scratch.fs.main || cs->scratch.fs.preamble) { + c->fragment_helper_arg = dev->scratch.fs.buf->ptr.gpu; + c->fragment_helper_cfg = cs->scratch.fs.preamble << 16; + c->fragment_helper_program = dev->dev.helper->ptr.gpu | 1; + } +} + +static void +asahi_fill_sync(struct drm_asahi_sync *sync, struct vk_sync *vk_sync, + uint64_t value) +{ + if (unlikely(!vk_sync_type_is_drm_syncobj(vk_sync->type))) { + unreachable("Unsupported sync type"); + return; + } + + const struct vk_drm_syncobj *syncobj = vk_sync_as_drm_syncobj(vk_sync); + *sync = (struct drm_asahi_sync){.handle = syncobj->syncobj}; + + if (vk_sync->flags & VK_SYNC_IS_TIMELINE) { + sync->sync_type = DRM_ASAHI_SYNC_TIMELINE_SYNCOBJ; + sync->timeline_value = value; + } else { + sync->sync_type = DRM_ASAHI_SYNC_SYNCOBJ; + } +} + +union drm_asahi_cmd { + struct drm_asahi_cmd_compute compute; + struct drm_asahi_cmd_render render; +}; + +/* TODO: I think it's 64. Can we query from the kernel? */ +#define MAX_COMMANDS_PER_SUBMIT (16) + +static VkResult +queue_submit_single(struct agx_device *dev, struct drm_asahi_submit *submit) +{ + int ret = dev->ops.submit(dev, submit, 0); + + /* XXX: don't trap */ + if (ret) { + fprintf(stderr, "DRM_IOCTL_ASAHI_SUBMIT failed: %m\n"); + assert(0); + } + + return VK_SUCCESS; +} + +/* + * The kernel/firmware jointly impose a limit on commands per submit ioctl, but + * we can build up arbitrarily large command buffers. We handle this here by + * looping the ioctl, submitting slices of the command buffers that are within + * bounds. + */ +static VkResult +queue_submit_looped(struct agx_device *dev, struct drm_asahi_submit *submit) +{ + struct drm_asahi_command *cmds = (void *)submit->commands; + unsigned commands_remaining = submit->command_count; + unsigned submitted_vdm = 0, submitted_cdm = 0; + + while (commands_remaining) { + bool first = commands_remaining == submit->command_count; + bool last = commands_remaining <= MAX_COMMANDS_PER_SUBMIT; + + unsigned count = MIN2(commands_remaining, MAX_COMMANDS_PER_SUBMIT); + commands_remaining -= count; + + assert(!last || commands_remaining == 0); + assert(count > 0); + + /* We need to fix up the barriers since barriers are ioctl-relative */ + for (unsigned i = 0; i < count; ++i) { + assert(cmds[i].barriers[0] >= submitted_vdm); + assert(cmds[i].barriers[1] >= submitted_cdm); + + cmds[i].barriers[0] -= submitted_vdm; + cmds[i].barriers[1] -= submitted_cdm; + } + + /* We can't signal the out-syncobjs until all prior work finishes. Since + * only the last ioctl will signal, make sure it waits on prior ioctls. + * + * TODO: there might be a more performant way to do this. + */ + if (last && !first) { + if (cmds[0].barriers[0] == DRM_ASAHI_BARRIER_NONE) + cmds[0].barriers[0] = 0; + + if (cmds[0].barriers[1] == DRM_ASAHI_BARRIER_NONE) + cmds[0].barriers[1] = 0; + } + + struct drm_asahi_submit submit_ioctl = { + .flags = submit->flags, + .queue_id = submit->queue_id, + .result_handle = submit->result_handle, + .commands = (uint64_t)(uintptr_t)(cmds), + .command_count = count, + .in_syncs = first ? submit->in_syncs : 0, + .in_sync_count = first ? submit->in_sync_count : 0, + .out_syncs = last ? 
submit->out_syncs : 0, + .out_sync_count = last ? submit->out_sync_count : 0, + }; + + VkResult result = queue_submit_single(dev, &submit_ioctl); + if (result != VK_SUCCESS) + return result; + + for (unsigned i = 0; i < count; ++i) { + if (cmds[i].cmd_type == DRM_ASAHI_CMD_COMPUTE) + submitted_cdm++; + else if (cmds[i].cmd_type == DRM_ASAHI_CMD_RENDER) + submitted_vdm++; + else + unreachable("unknown subqueue"); + } + + cmds += count; + } + + return VK_SUCCESS; +} + +static VkResult +queue_submit(struct hk_device *dev, struct hk_queue *queue, + struct vk_queue_submit *submit) +{ + unsigned command_count = 0; + + /* Gather the number of individual commands to submit up front */ + for (unsigned i = 0; i < submit->command_buffer_count; ++i) { + struct hk_cmd_buffer *cmdbuf = + (struct hk_cmd_buffer *)submit->command_buffers[i]; + + command_count += list_length(&cmdbuf->control_streams); + } + + if (command_count == 0) + return queue_submit_empty(dev, queue, submit); + + unsigned wait_count = 0; + struct drm_asahi_sync *waits = + alloca(submit->wait_count * sizeof(struct drm_asahi_sync)); + + struct drm_asahi_sync *signals = + alloca((submit->signal_count + 1) * sizeof(struct drm_asahi_sync)); + + for (unsigned i = 0; i < submit->wait_count; ++i) { + /* The kernel rejects the submission if we try to wait on the same + * timeline semaphore at multiple points. + * + * TODO: Can we relax the UAPI? + * + * XXX: This is quadratic time. + */ + bool skip = false; + if (submit->waits[i].sync->flags & VK_SYNC_IS_TIMELINE) { + uint32_t v1 = submit->waits[i].wait_value; + for (unsigned j = 0; j < submit->wait_count; ++j) { + uint32_t v2 = submit->waits[j].wait_value; + if (i != j && submit->waits[i].sync == submit->waits[j].sync && + (v1 < v2 || (v1 == v2 && i < j))) { + skip = true; + break; + } + } + + if (skip) + continue; + } + + asahi_fill_sync(&waits[wait_count++], submit->waits[i].sync, + submit->waits[i].wait_value); + } + + for (unsigned i = 0; i < submit->signal_count; ++i) { + asahi_fill_sync(&signals[i], submit->signals[i].sync, + submit->signals[i].signal_value); + } + + /* Signal progress on the queue itself */ + signals[submit->signal_count] = (struct drm_asahi_sync){ + .sync_type = DRM_ASAHI_SYNC_TIMELINE_SYNCOBJ, + .handle = queue->drm.syncobj, + .timeline_value = ++queue->drm.timeline_value, + }; + + /* Now setup the command structs */ + struct drm_asahi_command *cmds = alloca(sizeof(*cmds) * command_count); + union drm_asahi_cmd *cmds_inner = + alloca(sizeof(*cmds_inner) * command_count); + + unsigned cmd_it = 0; + unsigned nr_vdm = 0, nr_cdm = 0; + + for (unsigned i = 0; i < submit->command_buffer_count; ++i) { + struct hk_cmd_buffer *cmdbuf = + (struct hk_cmd_buffer *)submit->command_buffers[i]; + + list_for_each_entry(struct hk_cs, cs, &cmdbuf->control_streams, node) { + assert(cmd_it < command_count); + + struct drm_asahi_command cmd = { + .cmd_buffer = (uint64_t)(uintptr_t)&cmds_inner[cmd_it], + .result_offset = 0 /* TODO */, + .result_size = 0 /* TODO */, + /* Barrier on previous command */ + .barriers = {nr_vdm, nr_cdm}, + }; + + if (cs->type == HK_CS_CDM) { + cmd.cmd_type = DRM_ASAHI_CMD_COMPUTE; + cmd.cmd_buffer_size = sizeof(struct drm_asahi_cmd_compute); + nr_cdm++; + + asahi_fill_cdm_command(dev, cs, &cmds_inner[cmd_it].compute); + } else { + assert(cs->type == HK_CS_VDM); + cmd.cmd_type = DRM_ASAHI_CMD_RENDER; + cmd.cmd_buffer_size = sizeof(struct drm_asahi_cmd_render); + nr_vdm++; + + asahi_fill_vdm_command(dev, cs, &cmds_inner[cmd_it].render); + } + + cmds[cmd_it++] = 
cmd; + } + } + + assert(cmd_it == command_count); + + if (dev->dev.debug & AGX_DBG_TRACE) { + for (unsigned i = 0; i < command_count; ++i) { + if (cmds[i].cmd_type == DRM_ASAHI_CMD_COMPUTE) { + agxdecode_drm_cmd_compute(dev->dev.agxdecode, &dev->dev.params, + &cmds_inner[i].compute, true); + } else { + assert(cmds[i].cmd_type == DRM_ASAHI_CMD_RENDER); + agxdecode_drm_cmd_render(dev->dev.agxdecode, &dev->dev.params, + &cmds_inner[i].render, true); + } + } + + agxdecode_image_heap(dev->dev.agxdecode, dev->images.bo->ptr.gpu, + dev->images.alloc); + + agxdecode_next_frame(); + } + + struct drm_asahi_submit submit_ioctl = { + .flags = 0, + .queue_id = queue->drm.id, + .result_handle = 0 /* TODO */, + .in_sync_count = wait_count, + .out_sync_count = submit->signal_count + 1, + .command_count = command_count, + .in_syncs = (uint64_t)(uintptr_t)(waits), + .out_syncs = (uint64_t)(uintptr_t)(signals), + .commands = (uint64_t)(uintptr_t)(cmds), + }; + + if (command_count <= MAX_COMMANDS_PER_SUBMIT) + return queue_submit_single(&dev->dev, &submit_ioctl); + else + return queue_submit_looped(&dev->dev, &submit_ioctl); +} + +static VkResult +hk_queue_submit(struct vk_queue *vk_queue, struct vk_queue_submit *submit) +{ + struct hk_queue *queue = container_of(vk_queue, struct hk_queue, vk); + struct hk_device *dev = hk_queue_device(queue); + + if (vk_queue_is_lost(&queue->vk)) + return VK_ERROR_DEVICE_LOST; + + VkResult result = queue_submit(dev, queue, submit); + if (result != VK_SUCCESS) + return vk_queue_set_lost(&queue->vk, "Submit failed"); + + return VK_SUCCESS; +} + +VkResult +hk_queue_init(struct hk_device *dev, struct hk_queue *queue, + const VkDeviceQueueCreateInfo *pCreateInfo, + uint32_t index_in_family) +{ + struct hk_physical_device *pdev = hk_device_physical(dev); + VkResult result; + + assert(pCreateInfo->queueFamilyIndex < pdev->queue_family_count); + + const VkDeviceQueueGlobalPriorityCreateInfoKHR *priority_info = + vk_find_struct_const(pCreateInfo->pNext, + DEVICE_QUEUE_GLOBAL_PRIORITY_CREATE_INFO_KHR); + const enum VkQueueGlobalPriorityKHR global_priority = + priority_info ? 
priority_info->globalPriority + : VK_QUEUE_GLOBAL_PRIORITY_MEDIUM_KHR; + + if (global_priority != VK_QUEUE_GLOBAL_PRIORITY_MEDIUM_KHR) { + return VK_ERROR_INITIALIZATION_FAILED; + } + + result = vk_queue_init(&queue->vk, &dev->vk, pCreateInfo, index_in_family); + if (result != VK_SUCCESS) + return result; + + queue->vk.driver_submit = hk_queue_submit; + + queue->drm.id = agx_create_command_queue(&dev->dev, + DRM_ASAHI_QUEUE_CAP_RENDER | + DRM_ASAHI_QUEUE_CAP_BLIT | + DRM_ASAHI_QUEUE_CAP_COMPUTE, + 2); + + if (drmSyncobjCreate(dev->dev.fd, 0, &queue->drm.syncobj)) { + mesa_loge("drmSyncobjCreate() failed %d\n", errno); + agx_destroy_command_queue(&dev->dev, queue->drm.id); + vk_queue_finish(&queue->vk); + + return vk_errorf(dev, VK_ERROR_OUT_OF_HOST_MEMORY, + "DRM_IOCTL_SYNCOBJ_CREATE failed: %m"); + } + + uint64_t initial_value = 1; + if (drmSyncobjTimelineSignal(dev->dev.fd, &queue->drm.syncobj, + &initial_value, 1)) { + hk_queue_finish(dev, queue); + return vk_errorf(dev, VK_ERROR_OUT_OF_HOST_MEMORY, + "DRM_IOCTL_TIMELINE_SYNCOBJ_SIGNAL failed: %m"); + } + + return VK_SUCCESS; +} + +void +hk_queue_finish(struct hk_device *dev, struct hk_queue *queue) +{ + drmSyncobjDestroy(dev->dev.fd, queue->drm.syncobj); + agx_destroy_command_queue(&dev->dev, queue->drm.id); + vk_queue_finish(&queue->vk); +} diff --git a/src/asahi/vulkan/hk_queue.h b/src/asahi/vulkan/hk_queue.h new file mode 100644 index 00000000000..42e446ba430 --- /dev/null +++ b/src/asahi/vulkan/hk_queue.h @@ -0,0 +1,42 @@ +/* + * Copyright 2024 Valve Corporation + * Copyright 2024 Alyssa Rosenzweig + * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc. + * SPDX-License-Identifier: MIT + */ + +#pragma once + +#include "hk_private.h" +#include "vk_queue.h" + +struct hk_device; + +struct hk_queue { + struct vk_queue vk; + + struct { + /* Asahi kernel queue ID */ + uint32_t id; + + /* Timeline syncobj backing the queue */ + uint32_t syncobj; + + /* Current maximum timeline value for the queue's syncobj. If the + * syncobj's value equals timeline_value, then all work is complete. + */ + uint32_t timeline_value; + } drm; +}; + +static inline struct hk_device * +hk_queue_device(struct hk_queue *queue) +{ + return (struct hk_device *)queue->vk.base.device; +} + +VkResult hk_queue_init(struct hk_device *dev, struct hk_queue *queue, + const VkDeviceQueueCreateInfo *pCreateInfo, + uint32_t index_in_family); + +void hk_queue_finish(struct hk_device *dev, struct hk_queue *queue); diff --git a/src/asahi/vulkan/hk_sampler.c b/src/asahi/vulkan/hk_sampler.c new file mode 100644 index 00000000000..7e936b0cb04 --- /dev/null +++ b/src/asahi/vulkan/hk_sampler.c @@ -0,0 +1,281 @@ +/* + * Copyright 2024 Valve Corporation + * Copyright 2024 Alyssa Rosenzweig + * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc. 
+ * SPDX-License-Identifier: MIT
+ */
+#include "hk_sampler.h"
+
+#include "hk_device.h"
+#include "hk_entrypoints.h"
+#include "hk_physical_device.h"
+
+#include "vk_enum_to_str.h"
+#include "vk_format.h"
+#include "vk_sampler.h"
+
+#include "asahi/genxml/agx_pack.h"
+
+static inline uint32_t
+translate_address_mode(VkSamplerAddressMode addr_mode)
+{
+#define MODE(VK, AGX_) [VK_SAMPLER_ADDRESS_MODE_##VK] = AGX_WRAP_##AGX_
+   static const uint8_t translate[] = {
+      MODE(REPEAT, REPEAT),
+      MODE(MIRRORED_REPEAT, MIRRORED_REPEAT),
+      MODE(CLAMP_TO_EDGE, CLAMP_TO_EDGE),
+      MODE(CLAMP_TO_BORDER, CLAMP_TO_BORDER),
+      MODE(MIRROR_CLAMP_TO_EDGE, MIRRORED_CLAMP_TO_EDGE),
+   };
+#undef MODE
+
+   assert(addr_mode < ARRAY_SIZE(translate));
+   return translate[addr_mode];
+}
+
+static uint32_t
+translate_texsamp_compare_op(VkCompareOp op)
+{
+#define OP(VK, AGX_) [VK_COMPARE_OP_##VK] = AGX_COMPARE_FUNC_##AGX_
+   static const uint8_t translate[] = {
+      OP(NEVER, NEVER),
+      OP(LESS, LESS),
+      OP(EQUAL, EQUAL),
+      OP(LESS_OR_EQUAL, LEQUAL),
+      OP(GREATER, GREATER),
+      OP(NOT_EQUAL, NOT_EQUAL),
+      OP(GREATER_OR_EQUAL, GEQUAL),
+      OP(ALWAYS, ALWAYS),
+   };
+#undef OP
+
+   assert(op < ARRAY_SIZE(translate));
+   return translate[op];
+}
+
+static enum agx_filter
+translate_filter(VkFilter filter)
+{
+   static_assert((enum agx_filter)VK_FILTER_NEAREST == AGX_FILTER_NEAREST);
+   static_assert((enum agx_filter)VK_FILTER_LINEAR == AGX_FILTER_LINEAR);
+
+   return (enum agx_filter)filter;
+}
+
+static enum agx_mip_filter
+translate_mipfilter(VkSamplerMipmapMode mode)
+{
+   switch (mode) {
+   case VK_SAMPLER_MIPMAP_MODE_NEAREST:
+      return AGX_MIP_FILTER_NEAREST;
+
+   case VK_SAMPLER_MIPMAP_MODE_LINEAR:
+      return AGX_MIP_FILTER_LINEAR;
+
+   default:
+      unreachable("Invalid filter");
+   }
+}
+
+static bool
+uses_border(const VkSamplerCreateInfo *info)
+{
+   return info->addressModeU == VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER ||
+          info->addressModeV == VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER ||
+          info->addressModeW == VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER;
+}
+
+static bool
+is_border_color_custom(VkBorderColor color)
+{
+   /* TODO: for now, opaque black is treated as custom due to rgba4 swizzling
+    * issues, could be optimized though.
+    */
+   switch (color) {
+   case VK_BORDER_COLOR_INT_OPAQUE_BLACK:
+   case VK_BORDER_COLOR_FLOAT_OPAQUE_BLACK:
+   case VK_BORDER_COLOR_INT_CUSTOM_EXT:
+   case VK_BORDER_COLOR_FLOAT_CUSTOM_EXT:
+      return true;
+   default:
+      return false;
+   }
+}
+
+/* Translate an American VkBorderColor into a Canadian agx_border_colour */
+static enum agx_border_colour
+translate_border_color(VkBorderColor color, bool custom_to_1)
+{
+   switch (color) {
+   case VK_BORDER_COLOR_FLOAT_TRANSPARENT_BLACK:
+   case VK_BORDER_COLOR_INT_TRANSPARENT_BLACK:
+      return AGX_BORDER_COLOUR_TRANSPARENT_BLACK;
+
+   case VK_BORDER_COLOR_INT_OPAQUE_WHITE:
+   case VK_BORDER_COLOR_FLOAT_OPAQUE_WHITE:
+      return AGX_BORDER_COLOUR_OPAQUE_WHITE;
+
+   default:
+      assert(is_border_color_custom(color));
+      return custom_to_1 ?
AGX_BORDER_COLOUR_OPAQUE_WHITE + : AGX_BORDER_COLOUR_TRANSPARENT_BLACK; + } +} + +static void +pack_sampler(const struct hk_physical_device *pdev, + const struct VkSamplerCreateInfo *info, bool custom_to_1, + struct agx_sampler_packed *out) +{ + agx_pack(out, SAMPLER, cfg) { + cfg.minimum_lod = info->minLod; + cfg.maximum_lod = info->maxLod; + cfg.magnify = translate_filter(info->magFilter); + cfg.minify = translate_filter(info->minFilter); + cfg.mip_filter = translate_mipfilter(info->mipmapMode); + cfg.wrap_s = translate_address_mode(info->addressModeU); + cfg.wrap_t = translate_address_mode(info->addressModeV); + cfg.wrap_r = translate_address_mode(info->addressModeW); + cfg.pixel_coordinates = info->unnormalizedCoordinates; + + cfg.seamful_cube_maps = + info->flags & VK_SAMPLER_CREATE_NON_SEAMLESS_CUBE_MAP_BIT_EXT; + + if (info->compareEnable) { + cfg.compare_func = translate_texsamp_compare_op(info->compareOp); + cfg.compare_enable = true; + } + + if (info->anisotropyEnable) { + cfg.maximum_anisotropy = + util_next_power_of_two(MAX2(info->maxAnisotropy, 1)); + } else { + cfg.maximum_anisotropy = 1; + } + + if (uses_border(info)) { + cfg.border_colour = + translate_border_color(info->borderColor, custom_to_1); + } + } +} + +VKAPI_ATTR VkResult VKAPI_CALL +hk_CreateSampler(VkDevice device, + const VkSamplerCreateInfo *info /* pCreateInfo */, + const VkAllocationCallbacks *pAllocator, VkSampler *pSampler) +{ + VK_FROM_HANDLE(hk_device, dev, device); + struct hk_physical_device *pdev = hk_device_physical(dev); + struct hk_sampler *sampler; + VkResult result; + + sampler = vk_sampler_create(&dev->vk, info, pAllocator, sizeof(*sampler)); + if (!sampler) + return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY); + + struct agx_sampler_packed samp; + pack_sampler(pdev, info, true, &samp); + + /* LOD bias passed in the descriptor set */ + sampler->lod_bias_fp16 = _mesa_float_to_half(info->mipLodBias); + + result = + hk_sampler_heap_add(dev, samp, &sampler->planes[sampler->plane_count].hw); + if (result != VK_SUCCESS) { + hk_DestroySampler(device, hk_sampler_to_handle(sampler), pAllocator); + return result; + } + + sampler->plane_count++; + + /* In order to support CONVERSION_SEPARATE_RECONSTRUCTION_FILTER_BIT, we + * need multiple sampler planes: at minimum we will need one for luminance + * (the default), and one for chroma. Each sampler plane needs its own + * sampler table entry. However, sampler table entries are very rare on + * G13, and each plane would burn one of those. So we make sure to allocate + * only the minimum amount that we actually need (i.e., either 1 or 2), and + * then just copy the last sampler plane out as far as we need to fill the + * number of image planes. 
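+    *
+    * Concretely, with the logic below: a YCbCr conversion whose chroma filter
+    * matches the luma filter keeps plane_count == 1, while a conversion with
+    * a distinct chromaFilter packs a second sampler using that filter, giving
+    * plane_count == 2. Any remaining image planes then reuse the last sampler
+    * plane.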
+ */ + if (sampler->vk.ycbcr_conversion) { + assert(!uses_border(info) && + "consequence of VUID-VkSamplerCreateInfo-addressModeU-01646"); + + const VkFilter chroma_filter = + sampler->vk.ycbcr_conversion->state.chroma_filter; + if (info->magFilter != chroma_filter || + info->minFilter != chroma_filter) { + VkSamplerCreateInfo plane2_info = *info; + plane2_info.magFilter = chroma_filter; + plane2_info.minFilter = chroma_filter; + + pack_sampler(pdev, &plane2_info, false, &samp); + result = hk_sampler_heap_add( + dev, samp, &sampler->planes[sampler->plane_count].hw); + + if (result != VK_SUCCESS) { + hk_DestroySampler(device, hk_sampler_to_handle(sampler), + pAllocator); + return result; + } + + sampler->plane_count++; + } + } else if (uses_border(info)) { + /* If the sampler uses custom border colours, we need both clamp-to-1 + * and clamp-to-0 variants. We treat these as planes. + */ + pack_sampler(pdev, info, false, &samp); + result = hk_sampler_heap_add(dev, samp, + &sampler->planes[sampler->plane_count].hw); + + if (result != VK_SUCCESS) { + hk_DestroySampler(device, hk_sampler_to_handle(sampler), pAllocator); + return result; + } + + sampler->plane_count++; + + /* We also need to record the border. + * + * If there is a border colour component mapping, we need to swizzle with + * it. Otherwise, we can assume there's nothing to do. + */ + VkClearColorValue bc = sampler->vk.border_color_value; + + const VkSamplerBorderColorComponentMappingCreateInfoEXT *swiz_info = + vk_find_struct_const( + info->pNext, + SAMPLER_BORDER_COLOR_COMPONENT_MAPPING_CREATE_INFO_EXT); + + if (swiz_info) { + const bool is_int = vk_border_color_is_int(info->borderColor); + bc = vk_swizzle_color_value(bc, swiz_info->components, is_int); + } + + sampler->custom_border = bc; + sampler->has_border = true; + } + + *pSampler = hk_sampler_to_handle(sampler); + + return VK_SUCCESS; +} + +VKAPI_ATTR void VKAPI_CALL +hk_DestroySampler(VkDevice device, VkSampler _sampler, + const VkAllocationCallbacks *pAllocator) +{ + VK_FROM_HANDLE(hk_device, dev, device); + VK_FROM_HANDLE(hk_sampler, sampler, _sampler); + + if (!sampler) + return; + + for (uint8_t plane = 0; plane < sampler->plane_count; plane++) { + hk_sampler_heap_remove(dev, sampler->planes[plane].hw); + } + + vk_sampler_destroy(&dev->vk, pAllocator, &sampler->vk); +} diff --git a/src/asahi/vulkan/hk_sampler.h b/src/asahi/vulkan/hk_sampler.h new file mode 100644 index 00000000000..444aabc8d65 --- /dev/null +++ b/src/asahi/vulkan/hk_sampler.h @@ -0,0 +1,33 @@ +/* + * Copyright 2024 Valve Corporation + * Copyright 2024 Alyssa Rosenzweig + * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc. + * SPDX-License-Identifier: MIT + */ + +#pragma once + +#include "hk_device.h" +#include "hk_physical_device.h" +#include "hk_private.h" + +#include "vk_sampler.h" +#include "vk_ycbcr_conversion.h" + +#include "vk_format.h" + +struct hk_sampler { + struct vk_sampler vk; + VkClearColorValue custom_border; + bool has_border; + + uint8_t plane_count; + uint16_t lod_bias_fp16; + + struct { + struct hk_rc_sampler *hw; + } planes[2]; +}; + +VK_DEFINE_NONDISP_HANDLE_CASTS(hk_sampler, vk.base, VkSampler, + VK_OBJECT_TYPE_SAMPLER) diff --git a/src/asahi/vulkan/hk_shader.c b/src/asahi/vulkan/hk_shader.c new file mode 100644 index 00000000000..60303963fd7 --- /dev/null +++ b/src/asahi/vulkan/hk_shader.c @@ -0,0 +1,1432 @@ +/* + * Copyright 2024 Valve Corporation + * Copyright 2024 Alyssa Rosenzweig + * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc. 
+ * SPDX-License-Identifier: MIT + */ +#include "hk_shader.h" + +#include "agx_helpers.h" +#include "agx_nir_lower_gs.h" +#include "glsl_types.h" +#include "nir.h" +#include "nir_builder.h" + +#include "agx_bo.h" +#include "hk_cmd_buffer.h" +#include "hk_descriptor_set_layout.h" +#include "hk_device.h" +#include "hk_physical_device.h" +#include "hk_sampler.h" +#include "hk_shader.h" + +#include "nir_builder_opcodes.h" +#include "nir_builtin_builder.h" +#include "nir_intrinsics.h" +#include "nir_intrinsics_indices.h" +#include "nir_xfb_info.h" +#include "shader_enums.h" +#include "vk_nir_convert_ycbcr.h" +#include "vk_pipeline.h" +#include "vk_pipeline_layout.h" +#include "vk_shader_module.h" +#include "vk_ycbcr_conversion.h" + +#include "asahi/compiler/agx_compile.h" +#include "asahi/lib/agx_linker.h" +#include "asahi/lib/agx_nir_passes.h" +#include "asahi/lib/agx_tilebuffer.h" +#include "asahi/lib/agx_uvs.h" +#include "compiler/spirv/nir_spirv.h" + +#include "util/blob.h" +#include "util/hash_table.h" +#include "util/macros.h" +#include "util/mesa-sha1.h" +#include "util/simple_mtx.h" +#include "util/u_debug.h" +#include "vulkan/vulkan_core.h" + +struct hk_fs_key { + bool zs_self_dep; + + /** True if sample shading is forced on via an API knob such as + * VkPipelineMultisampleStateCreateInfo::minSampleShading + */ + bool force_sample_shading; + + uint8_t pad[2]; +}; +static_assert(sizeof(struct hk_fs_key) == 4, "packed"); + +static void +shared_var_info(const struct glsl_type *type, unsigned *size, unsigned *align) +{ + assert(glsl_type_is_vector_or_scalar(type)); + + uint32_t comp_size = + glsl_type_is_boolean(type) ? 4 : glsl_get_bit_size(type) / 8; + unsigned length = glsl_get_vector_elements(type); + *size = comp_size * length, *align = comp_size; +} + +uint64_t +hk_physical_device_compiler_flags(const struct hk_physical_device *pdev) +{ + /* TODO compiler flags */ + return 0; +} + +const nir_shader_compiler_options * +hk_get_nir_options(struct vk_physical_device *vk_pdev, gl_shader_stage stage, + UNUSED const struct vk_pipeline_robustness_state *rs) +{ + return &agx_nir_options; +} + +static struct spirv_to_nir_options +hk_get_spirv_options(struct vk_physical_device *vk_pdev, + UNUSED gl_shader_stage stage, + const struct vk_pipeline_robustness_state *rs) +{ + return (struct spirv_to_nir_options){ + .ssbo_addr_format = hk_buffer_addr_format(rs->storage_buffers), + .phys_ssbo_addr_format = nir_address_format_64bit_global, + .ubo_addr_format = hk_buffer_addr_format(rs->uniform_buffers), + .shared_addr_format = nir_address_format_32bit_offset, + .min_ssbo_alignment = HK_MIN_SSBO_ALIGNMENT, + .min_ubo_alignment = HK_MIN_UBO_ALIGNMENT, + }; +} + +static bool +lower_halt_to_return(nir_builder *b, nir_instr *instr, UNUSED void *_data) +{ + if (instr->type != nir_instr_type_jump) + return false; + + nir_jump_instr *jump = nir_instr_as_jump(instr); + if (jump->type != nir_jump_halt) + return false; + + assert(b->impl == nir_shader_get_entrypoint(b->shader)); + jump->type = nir_jump_return; + return true; +} + +void +hk_preprocess_nir_internal(struct vk_physical_device *vk_pdev, nir_shader *nir) +{ + /* Must lower before io to temps */ + if (nir->info.stage == MESA_SHADER_FRAGMENT) { + NIR_PASS(_, nir, nir_lower_terminate_to_demote); + NIR_PASS(_, nir, nir_shader_instructions_pass, lower_halt_to_return, + nir_metadata_all, NULL); + NIR_PASS(_, nir, nir_lower_returns); + } + + /* Unroll loops before lowering indirects via nir_lower_io_to_temporaries */ + UNUSED bool progress = false; + 
NIR_PASS(_, nir, nir_lower_global_vars_to_local); + + do { + progress = false; + NIR_PASS(progress, nir, nir_lower_vars_to_ssa); + NIR_PASS(progress, nir, nir_copy_prop); + NIR_PASS(progress, nir, nir_opt_dce); + NIR_PASS(progress, nir, nir_opt_constant_folding); + NIR_PASS(progress, nir, nir_opt_loop); + NIR_PASS(progress, nir, nir_opt_loop_unroll); + } while (progress); + + if (nir->info.stage == MESA_SHADER_FRAGMENT) { + struct nir_lower_sysvals_to_varyings_options sysvals_opts = { + .point_coord = true, + }; + + nir_lower_sysvals_to_varyings(nir, &sysvals_opts); + } + + NIR_PASS(_, nir, nir_lower_system_values); + + /* Gather info before preprocess_nir but after some general lowering, so + * inputs_read and system_values_read are accurately set. + */ + nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir)); + + NIR_PASS_V(nir, nir_lower_io_to_temporaries, nir_shader_get_entrypoint(nir), + true, false); + + NIR_PASS(_, nir, nir_lower_global_vars_to_local); + + NIR_PASS(_, nir, nir_split_var_copies); + NIR_PASS(_, nir, nir_split_struct_vars, nir_var_function_temp); + + /* Optimize but allow copies because we haven't lowered them yet */ + agx_preprocess_nir(nir, NULL); + + NIR_PASS(_, nir, nir_lower_load_const_to_scalar); + NIR_PASS(_, nir, nir_lower_var_copies); +} + +static void +hk_preprocess_nir(struct vk_physical_device *vk_pdev, nir_shader *nir) +{ + hk_preprocess_nir_internal(vk_pdev, nir); + nir_lower_compute_system_values_options csv_options = { + .has_base_workgroup_id = true, + }; + NIR_PASS(_, nir, nir_lower_compute_system_values, &csv_options); +} + +static void +hk_populate_fs_key(struct hk_fs_key *key, + const struct vk_graphics_pipeline_state *state) +{ + memset(key, 0, sizeof(*key)); + + if (state == NULL) + return; + + if (state->pipeline_flags & + VK_PIPELINE_CREATE_2_DEPTH_STENCIL_ATTACHMENT_FEEDBACK_LOOP_BIT_EXT) + key->zs_self_dep = true; + + /* We force per-sample interpolation whenever sampleShadingEnable is set + * regardless of minSampleShading or rasterizationSamples. + * + * When sampleShadingEnable is set, few guarantees are made about the + * location of interpolation of the inputs. The only real guarantees are + * that the inputs are interpolated within the pixel and that you get at + * least `rasterizationSamples * minSampleShading` unique positions. + * Importantly, it does not require that when `rasterizationSamples * + * minSampleShading <= 1.0` that those positions are at the fragment + * center. Therefore, it's valid to just always do per-sample all the time. + * + * The one caveat here is that we have to be careful about gl_SampleMaskIn. + * When `hk_fs_key::force_sample_shading = true` we also turn any reads of + * gl_SampleMaskIn into `1 << gl_SampleID` because the hardware sample mask + * is actually per-fragment, not per-pass. We handle this by smashing + * minSampleShading to 1.0 whenever gl_SampleMaskIn is read. 
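+    *
+    * Concrete example of the guarantee: with rasterizationSamples = 4 and
+    * minSampleShading = 0.25, only ceil(4 * 0.25) = 1 unique sample position
+    * is required, and it need not be the pixel centre, so shading per-sample
+    * is a conforming implementation of that case as well.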
+ */ + const struct vk_multisample_state *ms = state->ms; + if (ms != NULL && ms->sample_shading_enable) + key->force_sample_shading = true; +} + +static void +hk_hash_graphics_state(struct vk_physical_device *device, + const struct vk_graphics_pipeline_state *state, + VkShaderStageFlags stages, blake3_hash blake3_out) +{ + struct mesa_blake3 blake3_ctx; + _mesa_blake3_init(&blake3_ctx); + if (stages & VK_SHADER_STAGE_FRAGMENT_BIT) { + struct hk_fs_key key; + hk_populate_fs_key(&key, state); + _mesa_blake3_update(&blake3_ctx, &key, sizeof(key)); + + const bool is_multiview = state->rp->view_mask != 0; + _mesa_blake3_update(&blake3_ctx, &is_multiview, sizeof(is_multiview)); + } + _mesa_blake3_final(&blake3_ctx, blake3_out); +} + +static bool +lower_load_global_constant_offset_instr(nir_builder *b, + nir_intrinsic_instr *intrin, + UNUSED void *_data) +{ + if (intrin->intrinsic != nir_intrinsic_load_global_constant_offset && + intrin->intrinsic != nir_intrinsic_load_global_constant_bounded) + return false; + + b->cursor = nir_before_instr(&intrin->instr); + + nir_def *base_addr = intrin->src[0].ssa; + nir_def *offset = intrin->src[1].ssa; + + nir_def *zero = NULL; + if (intrin->intrinsic == nir_intrinsic_load_global_constant_bounded) { + nir_def *bound = intrin->src[2].ssa; + + unsigned bit_size = intrin->def.bit_size; + assert(bit_size >= 8 && bit_size % 8 == 0); + unsigned byte_size = bit_size / 8; + + zero = nir_imm_zero(b, intrin->num_components, bit_size); + + unsigned load_size = byte_size * intrin->num_components; + + nir_def *sat_offset = + nir_umin(b, offset, nir_imm_int(b, UINT32_MAX - (load_size - 1))); + nir_def *in_bounds = + nir_ilt(b, nir_iadd_imm(b, sat_offset, load_size - 1), bound); + + nir_push_if(b, in_bounds); + } + + nir_def *val = nir_build_load_global_constant( + b, intrin->def.num_components, intrin->def.bit_size, + nir_iadd(b, base_addr, nir_u2u64(b, offset)), + .align_mul = nir_intrinsic_align_mul(intrin), + .align_offset = nir_intrinsic_align_offset(intrin), + .access = nir_intrinsic_access(intrin)); + + if (intrin->intrinsic == nir_intrinsic_load_global_constant_bounded) { + nir_pop_if(b, NULL); + val = nir_if_phi(b, val, zero); + } + + nir_def_rewrite_uses(&intrin->def, val); + + return true; +} + +struct lower_ycbcr_state { + uint32_t set_layout_count; + struct vk_descriptor_set_layout *const *set_layouts; +}; + +static const struct vk_ycbcr_conversion_state * +lookup_ycbcr_conversion(const void *_state, uint32_t set, uint32_t binding, + uint32_t array_index) +{ + const struct lower_ycbcr_state *state = _state; + assert(set < state->set_layout_count); + assert(state->set_layouts[set] != NULL); + const struct hk_descriptor_set_layout *set_layout = + vk_to_hk_descriptor_set_layout(state->set_layouts[set]); + assert(binding < set_layout->binding_count); + + const struct hk_descriptor_set_binding_layout *bind_layout = + &set_layout->binding[binding]; + + if (bind_layout->immutable_samplers == NULL) + return NULL; + + array_index = MIN2(array_index, bind_layout->array_size - 1); + + const struct hk_sampler *sampler = + bind_layout->immutable_samplers[array_index]; + + return sampler && sampler->vk.ycbcr_conversion + ? 
&sampler->vk.ycbcr_conversion->state + : NULL; +} + +static inline bool +nir_has_image_var(nir_shader *nir) +{ + nir_foreach_image_variable(_, nir) + return true; + + return false; +} + +static int +glsl_type_size(const struct glsl_type *type, bool bindless) +{ + return glsl_count_attribute_slots(type, false); +} + +/* + * This is the world's worst multiview implementation. We simply duplicate each + * draw on the CPU side, changing a uniform in between, and then plumb the view + * index into the layer ID here. Whatever, it works. + * + * The "proper" implementation on AGX would use vertex amplification, but a + * MacBook is not a VR headset. + */ +static void +hk_lower_multiview(nir_shader *nir) +{ + /* If there's an existing layer ID write, ignore it. This avoids validation + * splat with vk_meta. + */ + nir_variable *existing = nir_find_variable_with_location( + nir, nir_var_shader_out, VARYING_SLOT_LAYER); + + if (existing) { + existing->data.mode = nir_var_shader_temp; + existing->data.location = 0; + nir_fixup_deref_modes(nir); + } + + /* Now write the view index as the layer */ + nir_builder b = + nir_builder_at(nir_after_impl(nir_shader_get_entrypoint(nir))); + + nir_variable *layer = + nir_variable_create(nir, nir_var_shader_out, glsl_uint_type(), NULL); + + layer->data.location = VARYING_SLOT_LAYER; + + nir_store_var(&b, layer, nir_load_view_index(&b), nir_component_mask(1)); + b.shader->info.outputs_written |= VARYING_BIT_LAYER; +} + +/* + * KHR_maintenance5 requires that points rasterize with a default point size of + * 1.0, while our hardware requires an explicit point size write for this. + * Since topology may be dynamic, we insert an unconditional write if necessary. + */ +static bool +hk_nir_insert_psiz_write(nir_shader *nir) +{ + nir_function_impl *impl = nir_shader_get_entrypoint(nir); + + if (nir->info.outputs_written & VARYING_BIT_PSIZ) { + nir_metadata_preserve(impl, nir_metadata_all); + return false; + } + + nir_builder b = nir_builder_at(nir_after_impl(impl)); + + nir_store_output(&b, nir_imm_float(&b, 1.0), nir_imm_int(&b, 0), + .write_mask = nir_component_mask(1), + .io_semantics.location = VARYING_SLOT_PSIZ, + .io_semantics.num_slots = 1, .src_type = nir_type_float32); + + nir->info.outputs_written |= VARYING_BIT_PSIZ; + nir_metadata_preserve(b.impl, nir_metadata_control_flow); + return true; +} + +static nir_def * +query_custom_border(nir_builder *b, nir_tex_instr *tex) +{ + return nir_build_texture_query(b, tex, nir_texop_custom_border_color_agx, 4, + tex->dest_type, false, false); +} + +static nir_def * +has_custom_border(nir_builder *b, nir_tex_instr *tex) +{ + return nir_build_texture_query(b, tex, nir_texop_has_custom_border_color_agx, + 1, nir_type_bool1, false, false); +} + +static bool +lower(nir_builder *b, nir_instr *instr, UNUSED void *_data) +{ + if (instr->type != nir_instr_type_tex) + return false; + + nir_tex_instr *tex = nir_instr_as_tex(instr); + if (!nir_tex_instr_need_sampler(tex) || nir_tex_instr_is_query(tex)) + return false; + + /* XXX: this is a really weird edge case, is this even well-defined? 
*/ + if (tex->is_shadow) + return false; + + b->cursor = nir_after_instr(&tex->instr); + nir_def *has_custom = has_custom_border(b, tex); + + nir_instr *orig = nir_instr_clone(b->shader, &tex->instr); + nir_builder_instr_insert(b, orig); + nir_def *clamp_to_1 = &nir_instr_as_tex(orig)->def; + + nir_push_if(b, has_custom); + nir_def *replaced = NULL; + { + /* Sample again, this time with clamp-to-0 instead of clamp-to-1 */ + nir_instr *clone_instr = nir_instr_clone(b->shader, &tex->instr); + nir_builder_instr_insert(b, clone_instr); + + nir_tex_instr *tex_0 = nir_instr_as_tex(clone_instr); + nir_def *clamp_to_0 = &tex_0->def; + + tex_0->backend_flags |= AGX_TEXTURE_FLAG_CLAMP_TO_0; + + /* Grab the border colour */ + nir_def *border = query_custom_border(b, tex_0); + + if (tex->op == nir_texop_tg4) { + border = nir_replicate(b, nir_channel(b, border, tex->component), 4); + } + + /* Combine together with the border */ + if (nir_alu_type_get_base_type(tex->dest_type) == nir_type_float && + tex->op != nir_texop_tg4) { + + /* For floats, lerp together: + * + * For border texels: (1 * border) + (0 * border ) = border + * For regular texels: (x * border) + (x * (1 - border)) = x. + * + * Linear filtering is linear (duh), so lerping is compatible. + */ + replaced = nir_flrp(b, clamp_to_0, clamp_to_1, border); + } else { + /* For integers, just select componentwise since there is no linear + * filtering. Gathers also use this path since they are unfiltered in + * each component. + */ + replaced = nir_bcsel(b, nir_ieq(b, clamp_to_0, clamp_to_1), clamp_to_0, + border); + } + } + nir_pop_if(b, NULL); + + /* Put it together with a phi */ + nir_def *phi = nir_if_phi(b, replaced, clamp_to_1); + nir_def_replace(&tex->def, phi); + return true; +} + +static bool +agx_nir_lower_custom_border(nir_shader *nir) +{ + return nir_shader_instructions_pass(nir, lower, nir_metadata_none, NULL); +} + +/* + * In Vulkan, the VIEWPORT should read 0 in the fragment shader if it is not + * written by the vertex shader, but in our implementation, the varying would + * otherwise be undefined. This small pass predicates VIEWPORT reads based on + * whether the hardware vertex shader writes the VIEWPORT (nonzero UVS index). 
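+ *
+ * In effect, each fragment-shader load of VARYING_SLOT_VIEWPORT is rewritten
+ * from roughly "vp = load_input(VIEWPORT)" into
+ * "vp = (load_uvs_index_agx(VIEWPORT) != 0) ? load_input(VIEWPORT) : 0".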
+ */ +static bool +lower_viewport_fs(nir_builder *b, nir_intrinsic_instr *intr, UNUSED void *data) +{ + if (intr->intrinsic != nir_intrinsic_load_input) + return false; + + nir_io_semantics sem = nir_intrinsic_io_semantics(intr); + if (sem.location != VARYING_SLOT_VIEWPORT) + return false; + + b->cursor = nir_after_instr(&intr->instr); + nir_def *orig = &intr->def; + + nir_def *uvs = nir_load_uvs_index_agx(b, .io_semantics = sem); + nir_def *def = nir_bcsel(b, nir_ine_imm(b, uvs, 0), orig, nir_imm_int(b, 0)); + + nir_def_rewrite_uses_after(orig, def, def->parent_instr); + return true; +} + +static bool +lower_subpass_dim(nir_builder *b, nir_instr *instr, UNUSED void *_data) +{ + if (instr->type != nir_instr_type_tex) + return false; + + nir_tex_instr *tex = nir_instr_as_tex(instr); + if (tex->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS) + tex->sampler_dim = GLSL_SAMPLER_DIM_2D; + else if (tex->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS_MS) + tex->sampler_dim = GLSL_SAMPLER_DIM_MS; + else + return false; + + return true; +} + +void +hk_lower_nir(struct hk_device *dev, nir_shader *nir, + const struct vk_pipeline_robustness_state *rs, bool is_multiview, + uint32_t set_layout_count, + struct vk_descriptor_set_layout *const *set_layouts) +{ + if (nir->info.stage == MESA_SHADER_FRAGMENT) { + NIR_PASS(_, nir, nir_lower_input_attachments, + &(nir_input_attachment_options){ + .use_fragcoord_sysval = true, + .use_layer_id_sysval = true, + .use_view_id_for_layer = is_multiview, + }); + + NIR_PASS(_, nir, nir_shader_instructions_pass, lower_subpass_dim, + nir_metadata_all, NULL); + NIR_PASS(_, nir, nir_lower_wpos_center); + } + + /* XXX: should be last geometry stage, how do I get to that? */ + if (nir->info.stage == MESA_SHADER_VERTEX) { + if (is_multiview) + hk_lower_multiview(nir); + } + + if (nir->info.stage == MESA_SHADER_TESS_EVAL) { + NIR_PASS(_, nir, nir_lower_patch_vertices, + nir->info.tess.tcs_vertices_out, NULL); + } + + const struct lower_ycbcr_state ycbcr_state = { + .set_layout_count = set_layout_count, + .set_layouts = set_layouts, + }; + NIR_PASS(_, nir, nir_vk_lower_ycbcr_tex, lookup_ycbcr_conversion, + &ycbcr_state); + + /* Lower push constants before lower_descriptors */ + NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_mem_push_const, + nir_address_format_32bit_offset); + + // NIR_PASS(_, nir, nir_opt_large_constants, NULL, 32); + + /* Images accessed through the texture or PBE hardware are robust, so we + * don't set lower_image. (There are some sticky details around txf but + * they're handled by agx_nir_lower_texture). However, image atomics are + * software so require robustness lowering. + */ + nir_lower_robust_access_options robustness = { + .lower_image_atomic = true, + }; + + NIR_PASS(_, nir, nir_lower_robust_access, &robustness); + + /* We must do early lowering before hk_nir_lower_descriptors, since this will + * create lod_bias_agx instructions. 
+ */ + NIR_PASS(_, nir, agx_nir_lower_texture_early, true /* support_lod_bias */); + NIR_PASS(_, nir, agx_nir_lower_custom_border); + + NIR_PASS(_, nir, hk_nir_lower_descriptors, rs, set_layout_count, + set_layouts); + NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_mem_global, + nir_address_format_64bit_global); + NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_mem_ssbo, + hk_buffer_addr_format(rs->storage_buffers)); + NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_mem_ubo, + hk_buffer_addr_format(rs->uniform_buffers)); + NIR_PASS(_, nir, nir_shader_intrinsics_pass, + lower_load_global_constant_offset_instr, nir_metadata_none, NULL); + + if (!nir->info.shared_memory_explicit_layout) { + /* There may be garbage in shared_size, but it's the job of + * nir_lower_vars_to_explicit_types to allocate it. We have to reset to + * avoid overallocation. + */ + nir->info.shared_size = 0; + + NIR_PASS(_, nir, nir_lower_vars_to_explicit_types, nir_var_mem_shared, + shared_var_info); + } + NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_mem_shared, + nir_address_format_32bit_offset); + + if (nir->info.zero_initialize_shared_memory && nir->info.shared_size > 0) { + /* Align everything up to 16B so we can write whole vec4s. */ + nir->info.shared_size = align(nir->info.shared_size, 16); + NIR_PASS(_, nir, nir_zero_initialize_shared_memory, nir->info.shared_size, + 16); + + /* We need to call lower_compute_system_values again because + * nir_zero_initialize_shared_memory generates load_invocation_id which + * has to be lowered to load_invocation_index. + */ + NIR_PASS(_, nir, nir_lower_compute_system_values, NULL); + } + + /* TODO: we can do indirect VS output */ + nir_variable_mode lower_indirect_modes = 0; + if (nir->info.stage == MESA_SHADER_FRAGMENT) + lower_indirect_modes |= nir_var_shader_out; + else if (nir->info.stage == MESA_SHADER_VERTEX) + lower_indirect_modes |= nir_var_shader_in | nir_var_shader_out; + + NIR_PASS(_, nir, nir_lower_indirect_derefs, lower_indirect_modes, + UINT32_MAX); + + NIR_PASS(_, nir, nir_lower_io, nir_var_shader_in | nir_var_shader_out, + glsl_type_size, nir_lower_io_lower_64bit_to_32); + + if (nir->info.stage == MESA_SHADER_FRAGMENT) { + NIR_PASS(_, nir, nir_shader_intrinsics_pass, lower_viewport_fs, + nir_metadata_control_flow, NULL); + } + + NIR_PASS(_, nir, agx_nir_lower_texture); + NIR_PASS(_, nir, agx_nir_lower_multisampled_image_store); + + agx_preprocess_nir(nir, dev->dev.libagx); + NIR_PASS(_, nir, nir_opt_conditional_discard); + NIR_PASS(_, nir, nir_opt_if, + nir_opt_if_optimize_phi_true_false | nir_opt_if_avoid_64bit_phis); +} + +static void +hk_upload_shader(struct hk_device *dev, struct hk_shader *shader) +{ + if (shader->b.info.has_preamble) { + unsigned offs = shader->b.info.preamble_offset; + assert(offs < shader->b.binary_size); + + size_t size = shader->b.binary_size - offs; + assert(size > 0); + + shader->bo = agx_bo_create(&dev->dev, size, AGX_BO_EXEC | AGX_BO_LOW_VA, + "Preamble"); + memcpy(shader->bo->ptr.cpu, shader->b.binary + offs, size); + shader->preamble_addr = shader->bo->ptr.gpu; + } + + if (!shader->linked.ht) { + /* If we only have a single variant, link now. 
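+       * Vertex and fragment shaders instead keep a hash table of linked
+       * variants (see hk_init_link_ht), since their prologs/epilogs depend
+       * on state only known at draw time; every other stage has exactly one
+       * linked form, so it can be produced eagerly here.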
*/ + shader->only_linked = hk_fast_link(dev, false, shader, NULL, NULL, 0); + } + + if (shader->info.stage == MESA_SHADER_FRAGMENT) { + agx_pack(&shader->frag_face, FRAGMENT_FACE_2, cfg) { + cfg.conservative_depth = + agx_translate_depth_layout(shader->b.info.depth_layout); + } + } + + agx_pack(&shader->counts, COUNTS, cfg) { + cfg.uniform_register_count = shader->b.info.push_count; + cfg.preshader_register_count = shader->b.info.nr_preamble_gprs; + cfg.sampler_state_register_count = agx_translate_sampler_state_count( + shader->b.info.uses_txf ? 1 : 0, false); + } +} + +DERIVE_HASH_TABLE(hk_fast_link_key_vs); +DERIVE_HASH_TABLE(hk_fast_link_key_fs); + +static VkResult +hk_init_link_ht(struct hk_shader *shader, gl_shader_stage sw_stage) +{ + simple_mtx_init(&shader->linked.lock, mtx_plain); + + bool multiple_variants = + sw_stage == MESA_SHADER_VERTEX || sw_stage == MESA_SHADER_FRAGMENT; + + if (!multiple_variants) + return VK_SUCCESS; + + if (sw_stage == MESA_SHADER_VERTEX) + shader->linked.ht = hk_fast_link_key_vs_table_create(NULL); + else + shader->linked.ht = hk_fast_link_key_fs_table_create(NULL); + + return (shader->linked.ht == NULL) ? VK_ERROR_OUT_OF_HOST_MEMORY + : VK_SUCCESS; +} + +static VkResult +hk_compile_nir(struct hk_device *dev, const VkAllocationCallbacks *pAllocator, + nir_shader *nir, VkShaderCreateFlagsEXT shader_flags, + const struct vk_pipeline_robustness_state *rs, + const struct hk_fs_key *fs_key, struct hk_shader *shader, + gl_shader_stage sw_stage, bool hw, nir_xfb_info *xfb_info) +{ + unsigned vs_uniform_base = 0; + + /* For now, only shader objects are supported */ + if (sw_stage == MESA_SHADER_VERTEX) { + vs_uniform_base = + 6 * DIV_ROUND_UP( + BITSET_LAST_BIT(shader->info.vs.attrib_components_read), 4); + } else if (sw_stage == MESA_SHADER_FRAGMENT) { + shader->info.fs.interp = agx_gather_interp_info(nir); + shader->info.fs.writes_memory = nir->info.writes_memory; + + /* Discards must be lowering before lowering MSAA to handle discards */ + NIR_PASS(_, nir, agx_nir_lower_discard_zs_emit); + NIR_PASS(_, nir, agx_nir_lower_fs_output_to_epilog, + &shader->info.fs.epilog_key); + NIR_PASS(_, nir, agx_nir_lower_sample_mask); + + if (nir->info.fs.uses_sample_shading) { + /* Ensure the sample ID is preserved in register */ + nir_builder b = + nir_builder_at(nir_after_impl(nir_shader_get_entrypoint(nir))); + nir_export_agx(&b, nir_load_exported_agx(&b, 1, 16, .base = 1), + .base = 1); + + NIR_PASS(_, nir, agx_nir_lower_to_per_sample); + } + + NIR_PASS(_, nir, agx_nir_lower_fs_active_samples_to_register); + NIR_PASS(_, nir, agx_nir_lower_interpolation); + } else if (sw_stage == MESA_SHADER_TESS_EVAL) { + shader->info.ts.ccw = nir->info.tess.ccw; + shader->info.ts.point_mode = nir->info.tess.point_mode; + shader->info.ts.spacing = nir->info.tess.spacing; + shader->info.ts.mode = nir->info.tess._primitive_mode; + + if (nir->info.tess.point_mode) { + shader->info.ts.out_prim = MESA_PRIM_POINTS; + } else if (nir->info.tess._primitive_mode == TESS_PRIMITIVE_ISOLINES) { + shader->info.ts.out_prim = MESA_PRIM_LINES; + } else { + shader->info.ts.out_prim = MESA_PRIM_TRIANGLES; + } + + /* This destroys info so it needs to happen after the gather */ + NIR_PASS(_, nir, agx_nir_lower_tes, dev->dev.libagx, hw); + } else if (sw_stage == MESA_SHADER_TESS_CTRL) { + shader->info.tcs.output_patch_size = nir->info.tess.tcs_vertices_out; + shader->info.tcs.per_vertex_outputs = agx_tcs_per_vertex_outputs(nir); + shader->info.tcs.nr_patch_outputs = + 
util_last_bit(nir->info.patch_outputs_written); + shader->info.tcs.output_stride = agx_tcs_output_stride(nir); + } + + uint64_t outputs = nir->info.outputs_written; + if (!hw && + (sw_stage == MESA_SHADER_VERTEX || sw_stage == MESA_SHADER_TESS_EVAL)) { + nir->info.stage = MESA_SHADER_COMPUTE; + memset(&nir->info.cs, 0, sizeof(nir->info.cs)); + nir->xfb_info = NULL; + } + + /* XXX: rename */ + NIR_PASS(_, nir, hk_lower_uvs_index, vs_uniform_base); + +#if 0 + /* TODO */ + nir_variable_mode robust2_modes = 0; + if (rs->uniform_buffers == VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_ROBUST_BUFFER_ACCESS_2_EXT) + robust2_modes |= nir_var_mem_ubo; + if (rs->storage_buffers == VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_ROBUST_BUFFER_ACCESS_2_EXT) + robust2_modes |= nir_var_mem_ssbo; +#endif + + struct agx_shader_key backend_key = { + .needs_g13x_coherency = (dev->dev.params.gpu_generation == 13 && + dev->dev.params.num_clusters_total > 1) || + dev->dev.params.num_dies > 1, + .reserved_preamble = 128 /* TODO */, + .libagx = dev->dev.libagx, + .no_stop = nir->info.stage == MESA_SHADER_FRAGMENT, + .has_scratch = true, + }; + + /* For now, sample shading is always dynamic. Indicate that. */ + if (nir->info.stage == MESA_SHADER_FRAGMENT && + nir->info.fs.uses_sample_shading) + backend_key.fs.inside_sample_loop = true; + + agx_compile_shader_nir(nir, &backend_key, NULL, &shader->b); + + shader->code_ptr = shader->b.binary; + shader->code_size = shader->b.binary_size; + + shader->info.stage = sw_stage; + shader->info.clip_distance_array_size = nir->info.clip_distance_array_size; + shader->info.cull_distance_array_size = nir->info.cull_distance_array_size; + shader->b.info.outputs = outputs; + + if (sw_stage == MESA_SHADER_COMPUTE) { + for (unsigned i = 0; i < 3; ++i) + shader->info.cs.local_size[i] = nir->info.workgroup_size[i]; + } + + if (xfb_info) { + assert(xfb_info->output_count < ARRAY_SIZE(shader->info.xfb_outputs)); + + memcpy(&shader->info.xfb_info, xfb_info, + nir_xfb_info_size(xfb_info->output_count)); + + typed_memcpy(shader->info.xfb_stride, nir->info.xfb_stride, 4); + } + + if (nir->constant_data_size > 0) { + uint32_t data_size = align(nir->constant_data_size, HK_MIN_UBO_ALIGNMENT); + + void *data = malloc(data_size); + if (data == NULL) { + ralloc_free(nir); + return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY); + } + + memcpy(data, nir->constant_data, nir->constant_data_size); + + assert(nir->constant_data_size <= data_size); + memset(data + nir->constant_data_size, 0, + data_size - nir->constant_data_size); + + shader->data_ptr = data; + shader->data_size = data_size; + } + + ralloc_free(nir); + + VkResult result = hk_init_link_ht(shader, sw_stage); + if (result != VK_SUCCESS) + return vk_error(dev, result); + + hk_upload_shader(dev, shader); + return VK_SUCCESS; +} + +static const struct vk_shader_ops hk_shader_ops; + +static void +hk_destroy_linked_shader(struct hk_linked_shader *linked) +{ + agx_bo_unreference(linked->b.bo); + ralloc_free(linked); +} + +static void +hk_destroy_linked_shader_ht(struct hash_entry *he) +{ + hk_destroy_linked_shader(he->data); +} + +static void +hk_shader_destroy(struct hk_shader *s) +{ + free((void *)s->code_ptr); + free((void *)s->data_ptr); + agx_bo_unreference(s->bo); + + simple_mtx_destroy(&s->linked.lock); + _mesa_hash_table_destroy(s->linked.ht, hk_destroy_linked_shader_ht); + + if (s->only_linked) + hk_destroy_linked_shader(s->only_linked); +} + +void +hk_api_shader_destroy(struct vk_device *vk_dev, struct vk_shader *vk_shader, + const VkAllocationCallbacks 
*pAllocator) +{ + struct hk_device *dev = container_of(vk_dev, struct hk_device, vk); + struct hk_api_shader *obj = + container_of(vk_shader, struct hk_api_shader, vk); + + hk_foreach_variant(obj, shader) { + hk_shader_destroy(shader); + } + + vk_shader_free(&dev->vk, pAllocator, &obj->vk); +} + +static void +hk_lower_hw_vs(nir_shader *nir, struct hk_shader *shader) +{ + /* Point size must be clamped, excessively large points don't render + * properly on G13. + * + * Must be synced with pointSizeRange. + */ + NIR_PASS(_, nir, nir_lower_point_size, 1.0f, 511.95f); + + /* TODO: Optimize out for monolithic? */ + NIR_PASS(_, nir, hk_nir_insert_psiz_write); + + NIR_PASS(_, nir, nir_lower_io_to_scalar, nir_var_shader_out, NULL, NULL); + NIR_PASS(_, nir, agx_nir_lower_cull_distance_vs); + + NIR_PASS(_, nir, agx_nir_lower_uvs, &shader->info.uvs); + + shader->info.vs.cull_distance_array_size = + nir->info.cull_distance_array_size; +} + +VkResult +hk_compile_shader(struct hk_device *dev, struct vk_shader_compile_info *info, + const struct vk_graphics_pipeline_state *state, + const VkAllocationCallbacks *pAllocator, + struct hk_api_shader **shader_out) +{ + VkResult result; + + /* We consume the NIR, regardless of success or failure */ + nir_shader *nir = info->nir; + + size_t size = sizeof(struct hk_api_shader) + + sizeof(struct hk_shader) * hk_num_variants(info->stage); + struct hk_api_shader *obj = + vk_shader_zalloc(&dev->vk, &hk_shader_ops, info->stage, pAllocator, size); + + if (obj == NULL) { + ralloc_free(nir); + return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY); + } + + /* TODO: Multiview with ESO */ + const bool is_multiview = state && state->rp->view_mask != 0; + + hk_lower_nir(dev, nir, info->robustness, is_multiview, + info->set_layout_count, info->set_layouts); + + gl_shader_stage sw_stage = nir->info.stage; + + struct hk_fs_key fs_key_tmp, *fs_key = NULL; + if (sw_stage == MESA_SHADER_FRAGMENT) { + hk_populate_fs_key(&fs_key_tmp, state); + fs_key = &fs_key_tmp; + + nir->info.fs.uses_sample_shading |= fs_key->force_sample_shading; + + /* Force late-Z for Z/S self-deps. TODO: There's probably a less silly way + * to do this. + */ + if (fs_key->zs_self_dep) { + nir_builder b = + nir_builder_at(nir_before_impl(nir_shader_get_entrypoint(nir))); + nir_discard_if(&b, nir_imm_false(&b)); + nir->info.fs.uses_discard = true; + } + + NIR_PASS(_, nir, agx_nir_lower_sample_intrinsics, false); + } else if (sw_stage == MESA_SHADER_TESS_CTRL) { + NIR_PASS_V(nir, agx_nir_lower_tcs, dev->dev.libagx); + } + + /* Compile all variants up front */ + if (sw_stage == MESA_SHADER_GEOMETRY) { + for (unsigned rast_disc = 0; rast_disc < 2; ++rast_disc) { + struct hk_shader *count_variant = hk_count_gs_variant(obj, rast_disc); + nir_shader *clone = nir_shader_clone(NULL, nir); + + enum mesa_prim out_prim = MESA_PRIM_MAX; + nir_shader *count = NULL, *rast = NULL, *pre_gs = NULL; + + NIR_PASS(_, clone, agx_nir_lower_gs, dev->dev.libagx, rast_disc, + &count, &rast, &pre_gs, &out_prim, + &count_variant->info.gs.count_words); + + if (!rast_disc) { + struct hk_shader *shader = &obj->variants[HK_GS_VARIANT_RAST]; + + hk_lower_hw_vs(rast, shader); + shader->info.gs.out_prim = out_prim; + } + + struct { + nir_shader *in; + struct hk_shader *out; + } variants[] = { + {clone, hk_main_gs_variant(obj, rast_disc)}, + {pre_gs, hk_pre_gs_variant(obj, rast_disc)}, + {count, count_variant}, + {rast_disc ? 
NULL : rast, &obj->variants[HK_GS_VARIANT_RAST]}, + }; + + for (unsigned v = 0; v < ARRAY_SIZE(variants); ++v) { + if (variants[v].in) { + result = hk_compile_nir(dev, pAllocator, variants[v].in, + info->flags, info->robustness, NULL, + variants[v].out, sw_stage, true, NULL); + if (result != VK_SUCCESS) { + hk_api_shader_destroy(&dev->vk, &obj->vk, pAllocator); + ralloc_free(nir); + return result; + } + } + } + } + } else if (sw_stage == MESA_SHADER_VERTEX || + sw_stage == MESA_SHADER_TESS_EVAL) { + + if (sw_stage == MESA_SHADER_VERTEX) { + assert( + !(nir->info.inputs_read & BITFIELD64_MASK(VERT_ATTRIB_GENERIC0)) && + "Fixed-function attributes not used in Vulkan"); + + NIR_PASS(_, nir, nir_recompute_io_bases, nir_var_shader_in); + } + + /* the shader_out portion of this is load-bearing even for tess eval */ + NIR_PASS(_, nir, nir_io_add_const_offset_to_base, + nir_var_shader_in | nir_var_shader_out); + + for (enum hk_vs_variant v = 0; v < HK_VS_VARIANTS; ++v) { + struct hk_shader *shader = &obj->variants[v]; + bool hw = v == HK_VS_VARIANT_HW; + + /* TODO: Optimize single variant when we know nextStage */ + nir_shader *clone = nir_shader_clone(NULL, nir); + + if (sw_stage == MESA_SHADER_VERTEX) { + NIR_PASS(_, clone, agx_nir_lower_vs_input_to_prolog, + shader->info.vs.attrib_components_read); + + shader->info.vs.attribs_read = + nir->info.inputs_read >> VERT_ATTRIB_GENERIC0; + } + + if (hw) { + hk_lower_hw_vs(clone, shader); + } else { + NIR_PASS(_, clone, agx_nir_lower_vs_before_gs, dev->dev.libagx); + } + + result = hk_compile_nir(dev, pAllocator, clone, info->flags, + info->robustness, fs_key, shader, sw_stage, hw, + nir->xfb_info); + if (result != VK_SUCCESS) { + hk_api_shader_destroy(&dev->vk, &obj->vk, pAllocator); + ralloc_free(nir); + return result; + } + } + } else { + struct hk_shader *shader = hk_only_variant(obj); + + /* hk_compile_nir takes ownership of nir */ + result = + hk_compile_nir(dev, pAllocator, nir, info->flags, info->robustness, + fs_key, shader, sw_stage, true, NULL); + if (result != VK_SUCCESS) { + hk_api_shader_destroy(&dev->vk, &obj->vk, pAllocator); + return result; + } + } + + *shader_out = obj; + return VK_SUCCESS; +} + +static VkResult +hk_compile_shaders(struct vk_device *vk_dev, uint32_t shader_count, + struct vk_shader_compile_info *infos, + const struct vk_graphics_pipeline_state *state, + const VkAllocationCallbacks *pAllocator, + struct vk_shader **shaders_out) +{ + struct hk_device *dev = container_of(vk_dev, struct hk_device, vk); + + for (uint32_t i = 0; i < shader_count; i++) { + VkResult result = + hk_compile_shader(dev, &infos[i], state, pAllocator, + (struct hk_api_shader **)&shaders_out[i]); + if (result != VK_SUCCESS) { + /* Clean up all the shaders before this point */ + for (uint32_t j = 0; j < i; j++) + hk_api_shader_destroy(&dev->vk, shaders_out[j], pAllocator); + + /* Clean up all the NIR after this point */ + for (uint32_t j = i + 1; j < shader_count; j++) + ralloc_free(infos[j].nir); + + /* Memset the output array */ + memset(shaders_out, 0, shader_count * sizeof(*shaders_out)); + + return result; + } + } + + return VK_SUCCESS; +} + +static VkResult +hk_deserialize_shader(struct hk_device *dev, struct blob_reader *blob, + struct hk_shader *shader) +{ + struct hk_shader_info info; + blob_copy_bytes(blob, &info, sizeof(info)); + + struct agx_shader_info b_info; + blob_copy_bytes(blob, &b_info, sizeof(b_info)); + + const uint32_t code_size = blob_read_uint32(blob); + const uint32_t data_size = blob_read_uint32(blob); + if (blob->overrun) 
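/*
 * Annotation: hk_compile_shaders above has a strict failure contract: the
 * per-shader compile consumes its NIR even on error, so on failure it must
 * destroy the shaders already built, free only the NIR that was never handed
 * to a compile, and clear the output array.  A simplified standalone model of
 * that contract (all names hypothetical):
 */
#include <string.h>

struct example_shader;

static int
example_compile_all(void *inputs[], unsigned count,
                    struct example_shader *out[],
                    int (*compile)(void *in, struct example_shader **out),
                    void (*destroy)(struct example_shader *),
                    void (*free_input)(void *))
{
   for (unsigned i = 0; i < count; i++) {
      int err = compile(inputs[i], &out[i]);      /* consumes inputs[i] */
      if (err) {
         for (unsigned j = 0; j < i; j++)         /* built so far */
            destroy(out[j]);
         for (unsigned j = i + 1; j < count; j++) /* never consumed */
            free_input(inputs[j]);
         memset(out, 0, count * sizeof(*out));
         return err;
      }
   }
   return 0;
}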
+ return vk_error(dev, VK_ERROR_INCOMPATIBLE_SHADER_BINARY_EXT); + + VkResult result = hk_init_link_ht(shader, info.stage); + if (result != VK_SUCCESS) + return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY); + + simple_mtx_init(&shader->linked.lock, mtx_plain); + + shader->b.info = b_info; + shader->info = info; + shader->code_size = code_size; + shader->data_size = data_size; + shader->b.binary_size = code_size; + + shader->code_ptr = malloc(code_size); + if (shader->code_ptr == NULL) + return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY); + + shader->data_ptr = malloc(data_size); + if (shader->data_ptr == NULL) + return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY); + + blob_copy_bytes(blob, (void *)shader->code_ptr, shader->code_size); + blob_copy_bytes(blob, (void *)shader->data_ptr, shader->data_size); + if (blob->overrun) + return vk_error(dev, VK_ERROR_INCOMPATIBLE_SHADER_BINARY_EXT); + + shader->b.binary = (void *)shader->code_ptr; + hk_upload_shader(dev, shader); + return VK_SUCCESS; +} + +static VkResult +hk_deserialize_api_shader(struct vk_device *vk_dev, struct blob_reader *blob, + uint32_t binary_version, + const VkAllocationCallbacks *pAllocator, + struct vk_shader **shader_out) +{ + struct hk_device *dev = container_of(vk_dev, struct hk_device, vk); + + gl_shader_stage stage = blob_read_uint8(blob); + if (blob->overrun) + return vk_error(dev, VK_ERROR_INCOMPATIBLE_SHADER_BINARY_EXT); + + size_t size = sizeof(struct hk_api_shader) + + sizeof(struct hk_shader) * hk_num_variants(stage); + + struct hk_api_shader *obj = + vk_shader_zalloc(&dev->vk, &hk_shader_ops, stage, pAllocator, size); + + if (obj == NULL) + return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY); + + hk_foreach_variant(obj, shader) { + VkResult result = hk_deserialize_shader(dev, blob, shader); + + if (result != VK_SUCCESS) { + hk_api_shader_destroy(&dev->vk, &obj->vk, pAllocator); + return result; + } + } + + *shader_out = &obj->vk; + return VK_SUCCESS; +} + +static void +hk_shader_serialize(struct vk_device *vk_dev, const struct hk_shader *shader, + struct blob *blob) +{ + blob_write_bytes(blob, &shader->info, sizeof(shader->info)); + blob_write_bytes(blob, &shader->b.info, sizeof(shader->b.info)); + + blob_write_uint32(blob, shader->code_size); + blob_write_uint32(blob, shader->data_size); + blob_write_bytes(blob, shader->code_ptr, shader->code_size); + blob_write_bytes(blob, shader->data_ptr, shader->data_size); +} + +static bool +hk_api_shader_serialize(struct vk_device *vk_dev, + const struct vk_shader *vk_shader, struct blob *blob) +{ + struct hk_api_shader *obj = + container_of(vk_shader, struct hk_api_shader, vk); + + blob_write_uint8(blob, vk_shader->stage); + + hk_foreach_variant(obj, shader) { + hk_shader_serialize(vk_dev, shader, blob); + } + + return !blob->out_of_memory; +} + +#define WRITE_STR(field, ...) 
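/*
 * Annotation: per variant, the binary layout serialized above is the
 * hk_shader_info block, the agx_shader_info block, a u32 code size, a u32
 * data size, then the raw code and data bytes; an hk_api_shader is a u8
 * stage followed by that sequence for every variant.  The driver uses
 * util/blob for this; a hypothetical cursor with the same
 * check-before-trust behaviour would look like:
 */
#include <stdbool.h>
#include <stdint.h>
#include <string.h>

struct example_cursor {
   const uint8_t *ptr, *end;
   bool overrun;
};

static bool
example_copy(struct example_cursor *c, void *dst, size_t size)
{
   if ((size_t)(c->end - c->ptr) < size) {
      c->overrun = true;      /* mirrors the blob_reader overrun flag */
      return false;
   }
   memcpy(dst, c->ptr, size);
   c->ptr += size;
   return true;
}

static bool
example_read_u32(struct example_cursor *c, uint32_t *out)
{
   return example_copy(c, out, sizeof(*out));
}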
\ + ({ \ + memset(field, 0, sizeof(field)); \ + UNUSED int i = snprintf(field, sizeof(field), __VA_ARGS__); \ + assert(i > 0 && i < sizeof(field)); \ + }) + +static VkResult +hk_shader_get_executable_properties( + UNUSED struct vk_device *device, const struct vk_shader *vk_shader, + uint32_t *executable_count, VkPipelineExecutablePropertiesKHR *properties) +{ + struct hk_api_shader *obj = + container_of(vk_shader, struct hk_api_shader, vk); + + VK_OUTARRAY_MAKE_TYPED(VkPipelineExecutablePropertiesKHR, out, properties, + executable_count); + + vk_outarray_append_typed(VkPipelineExecutablePropertiesKHR, &out, props) + { + props->stages = mesa_to_vk_shader_stage(obj->vk.stage); + props->subgroupSize = 32; + WRITE_STR(props->name, "%s", _mesa_shader_stage_to_string(obj->vk.stage)); + WRITE_STR(props->description, "%s shader", + _mesa_shader_stage_to_string(obj->vk.stage)); + } + + return vk_outarray_status(&out); +} + +static VkResult +hk_shader_get_executable_statistics( + UNUSED struct vk_device *device, const struct vk_shader *vk_shader, + uint32_t executable_index, uint32_t *statistic_count, + VkPipelineExecutableStatisticKHR *statistics) +{ + struct hk_api_shader *obj = + container_of(vk_shader, struct hk_api_shader, vk); + + VK_OUTARRAY_MAKE_TYPED(VkPipelineExecutableStatisticKHR, out, statistics, + statistic_count); + + assert(executable_index == 0); + + /* TODO: find a sane way to report multiple variants and have that play nice + * with zink. + */ + struct hk_shader *shader = hk_any_variant(obj); + + vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) + { + WRITE_STR(stat->name, "Code Size"); + WRITE_STR(stat->description, + "Size of the compiled shader binary, in bytes"); + stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR; + stat->value.u64 = shader->code_size; + } + + vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) + { + WRITE_STR(stat->name, "Number of GPRs"); + WRITE_STR(stat->description, "Number of GPRs used by this pipeline"); + stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR; + stat->value.u64 = shader->b.info.nr_gprs; + } + + return vk_outarray_status(&out); +} + +static bool +write_ir_text(VkPipelineExecutableInternalRepresentationKHR *ir, + const char *data) +{ + ir->isText = VK_TRUE; + + size_t data_len = strlen(data) + 1; + + if (ir->pData == NULL) { + ir->dataSize = data_len; + return true; + } + + strncpy(ir->pData, data, ir->dataSize); + if (ir->dataSize < data_len) + return false; + + ir->dataSize = data_len; + return true; +} + +static VkResult +hk_shader_get_executable_internal_representations( + UNUSED struct vk_device *device, const struct vk_shader *vk_shader, + uint32_t executable_index, uint32_t *internal_representation_count, + VkPipelineExecutableInternalRepresentationKHR *internal_representations) +{ + VK_OUTARRAY_MAKE_TYPED(VkPipelineExecutableInternalRepresentationKHR, out, + internal_representations, + internal_representation_count); + bool incomplete_text = false; + + assert(executable_index == 0); + + /* TODO */ +#if 0 + vk_outarray_append_typed(VkPipelineExecutableInternalRepresentationKHR, &out, ir) { + WRITE_STR(ir->name, "AGX assembly"); + WRITE_STR(ir->description, "AGX assembly"); + if (!write_ir_text(ir, TODO)) + incomplete_text = true; + } +#endif + + return incomplete_text ? 
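/*
 * Annotation: write_ir_text above follows the usual Vulkan two-call
 * convention: with pData == NULL it only reports the required size, otherwise
 * it copies at most dataSize bytes and the caller reports VK_INCOMPLETE if
 * the text was truncated.  Standalone sketch of that convention (hypothetical
 * helper, not the driver's function):
 */
#include <stdbool.h>
#include <stddef.h>
#include <string.h>

static bool
example_query_text(const char *src, char *dst, size_t *size)
{
   size_t needed = strlen(src) + 1;

   if (dst == NULL) {
      *size = needed;              /* first call: size query only */
      return true;
   }

   if (*size == 0)
      return false;

   size_t n = needed < *size ? needed : *size;
   memcpy(dst, src, n);
   dst[n - 1] = '\0';              /* keep the copy NUL-terminated */

   bool complete = *size >= needed;
   *size = n;
   return complete;                /* false => report VK_INCOMPLETE */
}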
VK_INCOMPLETE : vk_outarray_status(&out); +} + +static const struct vk_shader_ops hk_shader_ops = { + .destroy = hk_api_shader_destroy, + .serialize = hk_api_shader_serialize, + .get_executable_properties = hk_shader_get_executable_properties, + .get_executable_statistics = hk_shader_get_executable_statistics, + .get_executable_internal_representations = + hk_shader_get_executable_internal_representations, +}; + +const struct vk_device_shader_ops hk_device_shader_ops = { + .get_nir_options = hk_get_nir_options, + .get_spirv_options = hk_get_spirv_options, + .preprocess_nir = hk_preprocess_nir, + .hash_graphics_state = hk_hash_graphics_state, + .compile = hk_compile_shaders, + .deserialize = hk_deserialize_api_shader, + .cmd_set_dynamic_graphics_state = vk_cmd_set_dynamic_graphics_state, + .cmd_bind_shaders = hk_cmd_bind_shaders, +}; + +struct hk_linked_shader * +hk_fast_link(struct hk_device *dev, bool fragment, struct hk_shader *main, + struct agx_shader_part *prolog, struct agx_shader_part *epilog, + unsigned nr_samples_shaded) +{ + struct hk_linked_shader *s = rzalloc(NULL, struct hk_linked_shader); + agx_fast_link(&s->b, &dev->dev, fragment, &main->b, prolog, epilog, + nr_samples_shaded); + + if (fragment) { + agx_pack(&s->fs_counts, FRAGMENT_SHADER_WORD_0, cfg) { + cfg.cf_binding_count = s->b.cf.nr_bindings; + cfg.uniform_register_count = main->b.info.push_count; + cfg.preshader_register_count = main->b.info.nr_preamble_gprs; + cfg.sampler_state_register_count = + agx_translate_sampler_state_count(s->b.uses_txf ? 1 : 0, false); + } + } + + /* Now that we've linked, bake the USC words to bind this program */ + struct agx_usc_builder b = agx_usc_builder(s->usc.data, sizeof(s->usc.data)); + + if (main && main->b.info.immediate_size_16) { + unreachable("todo"); +#if 0 + /* XXX: do ahead of time */ + uint64_t ptr = agx_pool_upload_aligned( + &cmd->pool, s->b.info.immediates, s->b.info.immediate_size_16 * 2, 64); + + for (unsigned range = 0; range < constant_push_ranges; ++range) { + unsigned offset = 64 * range; + assert(offset < s->b.info.immediate_size_16); + + agx_usc_uniform(&b, s->b.info.immediate_base_uniform + offset, + MIN2(64, s->b.info.immediate_size_16 - offset), + ptr + (offset * 2)); + } +#endif + } + + agx_usc_push_packed(&b, UNIFORM, dev->rodata.image_heap); + + if (s->b.uses_txf) + agx_usc_push_packed(&b, SAMPLER, dev->rodata.txf_sampler); + + if (main && (main->b.info.stage == MESA_SHADER_COMPUTE || + main->b.info.stage == MESA_SHADER_TESS_CTRL)) { + unsigned size = main->b.info.local_size; + + agx_usc_pack(&b, SHARED, cfg) { + cfg.layout = AGX_SHARED_LAYOUT_VERTEX_COMPUTE; + cfg.bytes_per_threadgroup = size > 0 ? size : 65536; + cfg.uses_shared_memory = size > 0; + } + } else if (!fragment) { + agx_usc_shared_none(&b); + } + + agx_usc_push_packed(&b, SHADER, s->b.shader); + agx_usc_push_packed(&b, REGISTERS, s->b.regs); + + if (fragment) + agx_usc_push_packed(&b, FRAGMENT_PROPERTIES, s->b.fragment_props); + + if (main && main->b.info.has_preamble) { + agx_usc_pack(&b, PRESHADER, cfg) { + cfg.code = main->preamble_addr; + } + } else { + agx_usc_pack(&b, NO_PRESHADER, cfg) + ; + } + + s->usc.size = b.head - s->usc.data; + return s; +} diff --git a/src/asahi/vulkan/hk_shader.h b/src/asahi/vulkan/hk_shader.h new file mode 100644 index 00000000000..458266f8365 --- /dev/null +++ b/src/asahi/vulkan/hk_shader.h @@ -0,0 +1,400 @@ +/* + * Copyright 2024 Valve Corporation + * Copyright 2024 Alyssa Rosenzweig + * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc. 
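/*
 * Annotation: hk_fast_link above bakes the USC control words into a small
 * fixed-size buffer inside hk_linked_shader and records how many bytes were
 * written, so binding the program later is just a copy of pre-packed words.
 * Generic sketch of that append-into-fixed-buffer pattern (hypothetical
 * types; the driver uses the agx_usc_* helpers):
 */
#include <assert.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

struct example_word_builder {
   uint8_t *data;
   size_t head, capacity;
};

static void
example_push_words(struct example_word_builder *b, const void *words,
                   size_t size)
{
   /* capacity is a compile-time worst case, like HK_MAX_LINKED_USC_SIZE */
   assert(b->head + size <= b->capacity);
   memcpy(b->data + b->head, words, size);
   b->head += size;
}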
+ * SPDX-License-Identifier: MIT + */ + +#pragma once + +#include "asahi/compiler/agx_compile.h" +#include "util/macros.h" +#include "agx_linker.h" +#include "agx_nir_lower_vbo.h" +#include "agx_pack.h" +#include "agx_usc.h" +#include "agx_uvs.h" + +#include "hk_device.h" +#include "hk_device_memory.h" +#include "hk_private.h" + +#include "nir_xfb_info.h" +#include "shader_enums.h" +#include "vk_pipeline_cache.h" + +#include "nir.h" + +#include "vk_shader.h" + +struct hk_physical_device; +struct hk_pipeline_compilation_ctx; +struct vk_descriptor_set_layout; +struct vk_graphics_pipeline_state; +struct vk_pipeline_cache; +struct vk_pipeline_layout; +struct vk_pipeline_robustness_state; +struct vk_shader_module; + +/* TODO: Make dynamic */ +#define HK_ROOT_UNIFORM 104 +#define HK_IMAGE_HEAP_UNIFORM 108 + +struct hk_shader_info { + union { + struct { + uint32_t attribs_read; + BITSET_DECLARE(attrib_components_read, AGX_MAX_ATTRIBS * 4); + uint8_t cull_distance_array_size; + uint8_t _pad[7]; + } vs; + + struct { + /* Local workgroup size */ + uint16_t local_size[3]; + + uint8_t _pad[26]; + } cs; + + struct { + struct agx_interp_info interp; + struct agx_fs_epilog_link_info epilog_key; + + bool reads_sample_mask; + bool post_depth_coverage; + bool uses_sample_shading; + bool early_fragment_tests; + bool writes_memory; + + uint8_t _pad[7]; + } fs; + + struct { + uint8_t spacing; + uint8_t mode; + enum mesa_prim out_prim; + bool point_mode; + bool ccw; + uint8_t _pad[27]; + } ts; + + struct { + uint64_t per_vertex_outputs; + uint32_t output_stride; + uint8_t output_patch_size; + uint8_t nr_patch_outputs; + uint8_t _pad[18]; + } tcs; + + struct { + unsigned count_words; + enum mesa_prim out_prim; + uint8_t _pad[27]; + } gs; + + /* Used to initialize the union for other stages */ + uint8_t _pad[32]; + }; + + struct agx_unlinked_uvs_layout uvs; + + /* Transform feedback buffer strides */ + uint8_t xfb_stride[MAX_XFB_BUFFERS]; + + gl_shader_stage stage : 8; + uint8_t clip_distance_array_size; + uint8_t cull_distance_array_size; + uint8_t _pad0[1]; + + /* XXX: is there a less goofy way to do this? I really don't want dynamic + * allocation here. + */ + nir_xfb_info xfb_info; + nir_xfb_output_info xfb_outputs[64]; +}; + +/* + * Hash table keys for fast-linked shader variants. These contain the entire + * prolog/epilog key so we only do 1 hash table lookup instead of 2 in the + * general case where the linked shader is already ready. + */ +struct hk_fast_link_key_vs { + struct agx_vs_prolog_key prolog; +}; + +struct hk_fast_link_key_fs { + unsigned nr_samples_shaded; + struct agx_fs_prolog_key prolog; + struct agx_fs_epilog_key epilog; +}; + +struct hk_shader { + struct agx_shader_part b; + + struct hk_shader_info info; + struct agx_fragment_face_2_packed frag_face; + struct agx_counts_packed counts; + + const void *code_ptr; + uint32_t code_size; + + const void *data_ptr; + uint32_t data_size; + + /* BO for any uploaded shader part */ + struct agx_bo *bo; + + /* Cache of fast linked variants */ + struct { + simple_mtx_t lock; + struct hash_table *ht; + } linked; + + /* If there's only a single possibly linked variant, direct pointer. TODO: + * Union with the cache to save some space? + */ + struct hk_linked_shader *only_linked; + + /* Address to the uploaded preamble section. Preambles are uploaded + * separately from fast-linked main shaders. 
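/*
 * Annotation: the explicit _pad[] members above keep every stage's view of
 * the union the same size and, presumably, keep the serialized bytes
 * deterministic, since hk_shader_info is written to and read from the shader
 * binary as a raw byte copy.  A stripped-down illustration with made-up
 * fields:
 */
#include <assert.h>
#include <stdint.h>

struct example_stage_info {
   union {
      struct { uint32_t attribs_read;  uint8_t _pad[28]; } vs;
      struct { uint16_t local_size[3]; uint8_t _pad[26]; } cs;
      uint8_t _pad[32];   /* used to zero-initialize the union for other stages */
   };
};

static_assert(sizeof(struct example_stage_info) == 32,
              "every stage view of the union occupies the same 32 bytes");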
+ */ + uint64_t preamble_addr; + + /* Address of the start of the shader data section */ + uint64_t data_addr; +}; + +enum hk_vs_variant { + /* Hardware vertex shader, when next stage is fragment */ + HK_VS_VARIANT_HW, + + /* Hardware compute shader, when next is geometry/tessellation */ + HK_VS_VARIANT_SW, + + HK_VS_VARIANTS, +}; + +enum hk_gs_variant { + /* Hardware vertex shader used for rasterization */ + HK_GS_VARIANT_RAST, + + /* Main compute shader */ + HK_GS_VARIANT_MAIN, + HK_GS_VARIANT_MAIN_NO_RAST, + + /* Count compute shader */ + HK_GS_VARIANT_COUNT, + HK_GS_VARIANT_COUNT_NO_RAST, + + /* Pre-GS compute shader */ + HK_GS_VARIANT_PRE, + HK_GS_VARIANT_PRE_NO_RAST, + + HK_GS_VARIANTS, +}; + +/* clang-format off */ +static const char *hk_gs_variant_name[] = { + [HK_GS_VARIANT_RAST] = "Rasterization", + [HK_GS_VARIANT_MAIN] = "Main", + [HK_GS_VARIANT_MAIN_NO_RAST] = "Main (rast. discard)", + [HK_GS_VARIANT_COUNT] = "Count", + [HK_GS_VARIANT_COUNT_NO_RAST] = "Count (rast. discard)", + [HK_GS_VARIANT_PRE] = "Pre-GS", + [HK_GS_VARIANT_PRE_NO_RAST] = "Pre-GS (rast. discard)", +}; +/* clang-format on */ + +static inline unsigned +hk_num_variants(gl_shader_stage stage) +{ + switch (stage) { + case MESA_SHADER_VERTEX: + case MESA_SHADER_TESS_EVAL: + return HK_VS_VARIANTS; + + case MESA_SHADER_GEOMETRY: + return HK_GS_VARIANTS; + + default: + return 1; + } +} + +/* + * An hk_api shader maps 1:1 to a VkShader object. An hk_api_shader may contain + * multiple hardware hk_shader's, built at shader compile time. This complexity + * is required to efficiently implement the legacy geometry pipeline. + */ +struct hk_api_shader { + struct vk_shader vk; + + /* Is this an internal passthrough geometry shader? */ + bool is_passthrough; + + struct hk_shader variants[]; +}; + +#define hk_foreach_variant(api_shader, var) \ + for (struct hk_shader *var = api_shader->variants; \ + var < api_shader->variants + hk_num_variants(api_shader->vk.stage); \ + ++var) + +static const char * +hk_variant_name(struct hk_api_shader *obj, struct hk_shader *variant) +{ + unsigned i = variant - obj->variants; + assert(i < hk_num_variants(obj->vk.stage)); + + if (hk_num_variants(obj->vk.stage) == 1) { + return NULL; + } else if (obj->vk.stage == MESA_SHADER_GEOMETRY) { + assert(i < ARRAY_SIZE(hk_gs_variant_name)); + return hk_gs_variant_name[i]; + } else { + assert(i < 2); + return i == HK_VS_VARIANT_SW ? 
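/*
 * Annotation: usage sketch for the trailing variants[] array above, assuming
 * the declarations from this header are in scope.  example_zalloc stands in
 * for vk_shader_zalloc (which also fills in vk.stage) and is not a real API;
 * the point is only that the allocation is sized by hk_num_variants() and
 * walked with hk_foreach_variant().
 */
static struct hk_api_shader *
example_alloc_api_shader(gl_shader_stage stage, void *(*example_zalloc)(size_t))
{
   size_t size = sizeof(struct hk_api_shader) +
                 sizeof(struct hk_shader) * hk_num_variants(stage);

   struct hk_api_shader *obj = example_zalloc(size);   /* zero-initialized */
   if (obj == NULL)
      return NULL;

   obj->vk.stage = stage;

   hk_foreach_variant(obj, variant) {
      /* each trailing hk_shader is compiled and uploaded independently */
      (void)variant;
   }

   return obj;
}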
"Software" : "Hardware"; + } +} + +static struct hk_shader * +hk_only_variant(struct hk_api_shader *obj) +{ + if (!obj) + return NULL; + + assert(hk_num_variants(obj->vk.stage) == 1); + return &obj->variants[0]; +} + +static struct hk_shader * +hk_any_variant(struct hk_api_shader *obj) +{ + if (!obj) + return NULL; + + return &obj->variants[0]; +} + +static struct hk_shader * +hk_main_gs_variant(struct hk_api_shader *obj, bool rast_disc) +{ + return &obj->variants[HK_GS_VARIANT_MAIN + rast_disc]; +} + +static struct hk_shader * +hk_count_gs_variant(struct hk_api_shader *obj, bool rast_disc) +{ + return &obj->variants[HK_GS_VARIANT_COUNT + rast_disc]; +} + +static struct hk_shader * +hk_pre_gs_variant(struct hk_api_shader *obj, bool rast_disc) +{ + return &obj->variants[HK_GS_VARIANT_PRE + rast_disc]; +} + +#define HK_MAX_LINKED_USC_SIZE \ + (AGX_USC_PRESHADER_LENGTH + AGX_USC_FRAGMENT_PROPERTIES_LENGTH + \ + AGX_USC_REGISTERS_LENGTH + AGX_USC_SHADER_LENGTH + AGX_USC_SHARED_LENGTH + \ + AGX_USC_SAMPLER_LENGTH + (AGX_USC_UNIFORM_LENGTH * 9)) + +struct hk_linked_shader { + struct agx_linked_shader b; + + /* Distinct from hk_shader::counts due to addition of cf_binding_count, which + * is delayed since it depends on cull distance. + */ + struct agx_fragment_shader_word_0_packed fs_counts; + + /* Baked USC words to bind this linked shader */ + struct { + uint8_t data[HK_MAX_LINKED_USC_SIZE]; + size_t size; + } usc; +}; + +struct hk_linked_shader *hk_fast_link(struct hk_device *dev, bool fragment, + struct hk_shader *main, + struct agx_shader_part *prolog, + struct agx_shader_part *epilog, + unsigned nr_samples_shaded); + +extern const struct vk_device_shader_ops hk_device_shader_ops; + +uint64_t +hk_physical_device_compiler_flags(const struct hk_physical_device *pdev); + +static inline nir_address_format +hk_buffer_addr_format(VkPipelineRobustnessBufferBehaviorEXT robustness) +{ + switch (robustness) { + case VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_DISABLED_EXT: + return nir_address_format_64bit_global_32bit_offset; + case VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_ROBUST_BUFFER_ACCESS_EXT: + case VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_ROBUST_BUFFER_ACCESS_2_EXT: + return nir_address_format_64bit_bounded_global; + default: + unreachable("Invalid robust buffer access behavior"); + } +} + +bool hk_lower_uvs_index(nir_shader *s, unsigned vs_uniform_base); + +bool +hk_nir_lower_descriptors(nir_shader *nir, + const struct vk_pipeline_robustness_state *rs, + uint32_t set_layout_count, + struct vk_descriptor_set_layout *const *set_layouts); +void hk_lower_nir(struct hk_device *dev, nir_shader *nir, + const struct vk_pipeline_robustness_state *rs, + bool is_multiview, uint32_t set_layout_count, + struct vk_descriptor_set_layout *const *set_layouts); + +VkResult hk_compile_shader(struct hk_device *dev, + struct vk_shader_compile_info *info, + const struct vk_graphics_pipeline_state *state, + const VkAllocationCallbacks *pAllocator, + struct hk_api_shader **shader_out); + +void hk_preprocess_nir_internal(struct vk_physical_device *vk_pdev, + nir_shader *nir); + +void hk_api_shader_destroy(struct vk_device *vk_dev, + struct vk_shader *vk_shader, + const VkAllocationCallbacks *pAllocator); + +const nir_shader_compiler_options * +hk_get_nir_options(struct vk_physical_device *vk_pdev, gl_shader_stage stage, + UNUSED const struct vk_pipeline_robustness_state *rs); + +struct hk_api_shader *hk_meta_shader(struct hk_device *dev, + hk_internal_builder_t builder, void *data, + size_t data_size); + +static inline struct 
hk_shader * +hk_meta_kernel(struct hk_device *dev, hk_internal_builder_t builder, void *data, + size_t data_size) +{ + return hk_only_variant(hk_meta_shader(dev, builder, data, data_size)); +} + +struct hk_passthrough_gs_key { + /* Bit mask of outputs written by the VS/TES, to be passed through */ + uint64_t outputs; + + /* Clip/cull sizes, implies clip/cull written in output */ + uint8_t clip_distance_array_size; + uint8_t cull_distance_array_size; + + /* Transform feedback buffer strides */ + uint8_t xfb_stride[MAX_XFB_BUFFERS]; + + /* Decomposed primitive */ + enum mesa_prim prim; + + /* Transform feedback info. Must add nir_xfb_info_size to get the key size */ + nir_xfb_info xfb_info; +}; + +void hk_nir_passthrough_gs(struct nir_builder *b, const void *key_); diff --git a/src/asahi/vulkan/hk_wsi.c b/src/asahi/vulkan/hk_wsi.c new file mode 100644 index 00000000000..b95d09a7d97 --- /dev/null +++ b/src/asahi/vulkan/hk_wsi.c @@ -0,0 +1,44 @@ +/* + * Copyright 2024 Valve Corporation + * Copyright 2024 Alyssa Rosenzweig + * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc. + * SPDX-License-Identifier: MIT + */ +#include "hk_wsi.h" +#include "hk_instance.h" +#include "wsi_common.h" + +static VKAPI_ATTR PFN_vkVoidFunction VKAPI_CALL +hk_wsi_proc_addr(VkPhysicalDevice physicalDevice, const char *pName) +{ + VK_FROM_HANDLE(hk_physical_device, pdev, physicalDevice); + return vk_instance_get_proc_addr_unchecked(pdev->vk.instance, pName); +} + +VkResult +hk_init_wsi(struct hk_physical_device *pdev) +{ + VkResult result; + + struct wsi_device_options wsi_options = {.sw_device = false}; + result = wsi_device_init( + &pdev->wsi_device, hk_physical_device_to_handle(pdev), hk_wsi_proc_addr, + &pdev->vk.instance->alloc, pdev->master_fd, + &hk_physical_device_instance(pdev)->dri_options, &wsi_options); + if (result != VK_SUCCESS) + return result; + + pdev->wsi_device.supports_scanout = false; + pdev->wsi_device.supports_modifiers = true; + + pdev->vk.wsi_device = &pdev->wsi_device; + + return result; +} + +void +hk_finish_wsi(struct hk_physical_device *pdev) +{ + pdev->vk.wsi_device = NULL; + wsi_device_finish(&pdev->wsi_device, &pdev->vk.instance->alloc); +} diff --git a/src/asahi/vulkan/hk_wsi.h b/src/asahi/vulkan/hk_wsi.h new file mode 100644 index 00000000000..458f0cd1616 --- /dev/null +++ b/src/asahi/vulkan/hk_wsi.h @@ -0,0 +1,13 @@ +/* + * Copyright 2024 Valve Corporation + * Copyright 2024 Alyssa Rosenzweig + * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc. + * SPDX-License-Identifier: MIT + */ + +#pragma once + +#include "hk_physical_device.h" + +VkResult hk_init_wsi(struct hk_physical_device *pdev); +void hk_finish_wsi(struct hk_physical_device *pdev); diff --git a/src/asahi/vulkan/meson.build b/src/asahi/vulkan/meson.build new file mode 100644 index 00000000000..7b66cf2c1f0 --- /dev/null +++ b/src/asahi/vulkan/meson.build @@ -0,0 +1,142 @@ +# Copyright © 2022 Collabora Ltd. and Red Hat Inc. 
+# SPDX-License-Identifier: MIT +hk_files = files( + 'hk_buffer.c', + 'hk_buffer.h', + 'hk_buffer_view.c', + 'hk_buffer_view.h', + 'hk_cmd_buffer.c', + 'hk_cmd_buffer.h', + 'hk_cmd_clear.c', + 'hk_cmd_dispatch.c', + 'hk_cmd_draw.c', + 'hk_cmd_meta.c', + 'hk_cmd_pool.c', + 'hk_cmd_pool.h', + 'hk_descriptor_set.h', + 'hk_descriptor_set.c', + 'hk_descriptor_set_layout.c', + 'hk_descriptor_set_layout.h', + 'hk_descriptor_table.c', + 'hk_descriptor_table.h', + 'hk_device.c', + 'hk_device.h', + 'hk_device_memory.c', + 'hk_device_memory.h', + 'hk_event.c', + 'hk_event.h', + 'hk_format.c', + 'hk_image.c', + 'hk_image.h', + 'hk_image_view.c', + 'hk_image_view.h', + 'hk_instance.c', + 'hk_instance.h', + 'hk_nir_lower_descriptors.c', + 'hk_nir_passthrough_gs.c', + 'hk_physical_device.c', + 'hk_physical_device.h', + 'hk_private.h', + 'hk_query_pool.c', + 'hk_query_pool.h', + 'hk_queue.c', + 'hk_queue.h', + 'hk_sampler.c', + 'hk_sampler.h', + 'hk_shader.c', + 'hk_shader.h', + 'hk_wsi.c', + 'hk_wsi.h' +) + +hk_entrypoints = custom_target( + 'hk_entrypoints', + input : [vk_entrypoints_gen, vk_api_xml], + output : ['hk_entrypoints.h', 'hk_entrypoints.c'], + command : [ + prog_python, '@INPUT0@', '--xml', '@INPUT1@', '--proto', '--weak', + '--out-h', '@OUTPUT0@', '--out-c', '@OUTPUT1@', '--prefix', 'hk', + '--beta', with_vulkan_beta.to_string(), + ], + depend_files : vk_entrypoints_gen_depend_files, +) + +hk_deps = [ + dep_libdrm, + idep_nir, + idep_vulkan_runtime, + idep_vulkan_util, + idep_vulkan_wsi, + idep_vulkan_wsi_headers, + idep_agx_pack, +] + +libhk = static_library( + 'hk', + [ + hk_entrypoints, + hk_files, + libagx_shaders, + sha1_h, + ], + include_directories : [ + inc_gallium, + inc_gallium_aux, + inc_include, + inc_src, + inc_asahi, + ], + link_with : [libasahi_lib, libasahi_layout, libasahi_compiler], + c_args : ['-Wno-c2x-extensions'], + dependencies : [hk_deps], + gnu_symbol_visibility : 'hidden', +) + +libvulkan_asahi = shared_library( + 'vulkan_asahi', + link_whole : [libhk], + link_args: [ld_args_build_id], + gnu_symbol_visibility : 'hidden', + install : true, +) + +icd_lib_path = join_paths(get_option('prefix'), get_option('libdir')) +icd_file_name = 'libvulkan_asahi.so' +if with_platform_windows + icd_lib_path = import('fs').relative_to(get_option('bindir'), with_vulkan_icd_dir) + icd_file_name = 'vulkan_asahi.dll' +endif + +asahi_icd = custom_target( + 'asahi_icd', + input : [vk_icd_gen, vk_api_xml], + output : 'asahi_icd.@0@.json'.format(host_machine.cpu()), + command : [ + prog_python, '@INPUT0@', + '--api-version', '1.3', '--xml', '@INPUT1@', + '--lib-path', join_paths(icd_lib_path, icd_file_name), + '--out', '@OUTPUT@', + ], + build_by_default : true, + install_dir : with_vulkan_icd_dir, + install_tag : 'runtime', + install : true, +) + +_dev_icdname = 'asahi_devenv_icd.@0@.json'.format(host_machine.cpu()) +custom_target( + 'asahi_devenv_icd', + input : [vk_icd_gen, vk_api_xml], + output : _dev_icdname, + command : [ + prog_python, '@INPUT0@', + '--api-version', '1.3', '--xml', '@INPUT1@', + '--lib-path', meson.current_build_dir() / icd_file_name, + '--out', '@OUTPUT@', + ], + build_by_default : true, +) + +devenv.append('VK_DRIVER_FILES', meson.current_build_dir() / _dev_icdname) +# Deprecated: replaced by VK_DRIVER_FILES above +devenv.append('VK_ICD_FILENAMES', meson.current_build_dir() / _dev_icdname)
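/*
 * Annotation (appendix, not part of the patch): a sketch of how the
 * per-hk_shader fast-link cache declared in hk_shader.h above is presumably
 * consulted at draw time: look up the prolog/epilog key under linked.lock and
 * fall back to hk_fast_link() on a miss.  Assumes hk_shader.h and
 * util/hash_table.h are included; the hash/compare callbacks, key ownership
 * and error handling are elided, so this is illustrative only.
 */
static struct hk_linked_shader *
example_get_linked_fs(struct hk_device *dev, struct hk_shader *shader,
                      const struct hk_fast_link_key_fs *key,
                      struct agx_shader_part *prolog,
                      struct agx_shader_part *epilog)
{
   struct hk_linked_shader *linked;

   simple_mtx_lock(&shader->linked.lock);

   struct hash_entry *ent = _mesa_hash_table_search(shader->linked.ht, key);
   if (ent != NULL) {
      linked = ent->data;
   } else {
      linked = hk_fast_link(dev, true /* fragment */, shader, prolog, epilog,
                            key->nr_samples_shaded);
      /* A real implementation would insert a copy of the key that it owns. */
      _mesa_hash_table_insert(shader->linked.ht, key, linked);
   }

   simple_mtx_unlock(&shader->linked.lock);
   return linked;
}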