From 5bc828481630147575348b66677edaade9e891e6 Mon Sep 17 00:00:00 2001
From: Alyssa Rosenzweig
Date: Fri, 26 Jul 2024 10:48:48 -0400
Subject: [PATCH] hk: add Vulkan driver for Apple GPUs

Honeykrisp is a Vulkan 1.3 driver for Apple GPUs. It currently supports
M1 and M2; future hardware support is planned. It passed CTS a few
months ago and, with two exceptions [1], should still pass now.

Compared to the May snapshot that passed conformance [2], this adds a
bunch of new features, most notably:

* Geometry shaders
* Tessellation shaders
* Transform feedback
* Pipeline statistics queries
* Robustness2
* Host image copy

Theoretically, we now support everything DXVK requires for D3D11 with
full FL11_1.

To quote Rob Herring:

   How's performance? Great, because I haven't tested it.

This driver is NOT ready for end users... YET. Stay tuned, it won't be
long now :}

I would like to reiterate: Honeykrisp is not yet ready for end users.
Please read [3]. Regardless, as the kernel UAPI is not yet stable, this
driver will refuse to probe without out-of-tree Mesa patches. This is
the same situation as our GL driver.

On the Mesa side, the biggest todo before the release is improving
performance. Right now, I expect WineD3D with our GL4.6 driver to give
better performance. This isn't fundamental, it just needs time ... our
GL driver is 3 years old and honeykrisp is 3 months old.

On the non-Mesa side, there's still a lot of movement around krun and
FEX packaging before this becomes broadly useful for x86 games.

At any rate, now that I've finished up geometry and tessellation, I'm
hopefully done rewriting the whole driver every 2 weeks. So I think
this is settled enough that it makes sense to upstream this now instead
of building up a gigantic monster commit in a private branch.

[1] Pipeline robustness and pipeline statistics are included in this
tree but need bug fixes in the CTS to pass. This is being handled
internally in Khronos. These features may be disabled to get a
conformant driver.
[2] https://rosenzweig.io/blog/vk13-on-the-m1-in-1-month.html

[3] https://dont-ship.it/

Signed-off-by: Alyssa Rosenzweig
Acked-by: Faith Ekstrand
Part-of:
---
 meson.build                                 |    5 +-
 meson_options.txt                           |    2 +-
 src/.clang-format                           |    2 +
 src/asahi/meson.build                       |    6 +-
 src/asahi/vulkan/hk_buffer.c                |  286 ++
 src/asahi/vulkan/hk_buffer.h                |   45 +
 src/asahi/vulkan/hk_buffer_view.c           |  195 +
 src/asahi/vulkan/hk_buffer_view.h           |   27 +
 src/asahi/vulkan/hk_cmd_buffer.c            |  811 ++++
 src/asahi/vulkan/hk_cmd_buffer.h            |  767 ++++
 src/asahi/vulkan/hk_cmd_clear.c             |  196 +
 src/asahi/vulkan/hk_cmd_dispatch.c          |  249 ++
 src/asahi/vulkan/hk_cmd_draw.c              | 3737 +++++++++++++++++++
 src/asahi/vulkan/hk_cmd_meta.c              | 1692 +++++++++
 src/asahi/vulkan/hk_cmd_pool.c              |  146 +
 src/asahi/vulkan/hk_cmd_pool.h              |   49 +
 src/asahi/vulkan/hk_descriptor_set.c        |  794 ++++
 src/asahi/vulkan/hk_descriptor_set.h        |  107 +
 src/asahi/vulkan/hk_descriptor_set_layout.c |  423 +++
 src/asahi/vulkan/hk_descriptor_set_layout.h |   75 +
 src/asahi/vulkan/hk_descriptor_table.c      |  179 +
 src/asahi/vulkan/hk_descriptor_table.h      |   49 +
 src/asahi/vulkan/hk_device.c                |  548 +++
 src/asahi/vulkan/hk_device.h                |  123 +
 src/asahi/vulkan/hk_device_memory.c         |  330 ++
 src/asahi/vulkan/hk_device_memory.h         |   31 +
 src/asahi/vulkan/hk_event.c                 |  113 +
 src/asahi/vulkan/hk_event.h                 |   22 +
 src/asahi/vulkan/hk_format.c                |  140 +
 src/asahi/vulkan/hk_image.c                 | 1536 ++++++++
 src/asahi/vulkan/hk_image.h                 |  115 +
 src/asahi/vulkan/hk_image_view.c            |  653 ++++
 src/asahi/vulkan/hk_image_view.h            |   66 +
 src/asahi/vulkan/hk_instance.c              |  196 +
 src/asahi/vulkan/hk_instance.h              |   25 +
 src/asahi/vulkan/hk_nir_lower_descriptors.c |  867 +++++
 src/asahi/vulkan/hk_nir_passthrough_gs.c    |  112 +
 src/asahi/vulkan/hk_physical_device.c       | 1417 +++++++
 src/asahi/vulkan/hk_physical_device.h       |   76 +
 src/asahi/vulkan/hk_private.h               |   53 +
 src/asahi/vulkan/hk_query_pool.c            |  580 +++
 src/asahi/vulkan/hk_query_pool.h            |   28 +
 src/asahi/vulkan/hk_queue.c                 |  599 +++
 src/asahi/vulkan/hk_queue.h                 |   42 +
 src/asahi/vulkan/hk_sampler.c               |  281 ++
 src/asahi/vulkan/hk_sampler.h               |   33 +
 src/asahi/vulkan/hk_shader.c                | 1432 +++++++
 src/asahi/vulkan/hk_shader.h                |  400 ++
 src/asahi/vulkan/hk_wsi.c                   |   44 +
 src/asahi/vulkan/hk_wsi.h                   |   13 +
 src/asahi/vulkan/meson.build                |  142 +
 51 files changed, 19855 insertions(+), 4 deletions(-)
 create mode 100644 src/asahi/vulkan/hk_buffer.c
 create mode 100644 src/asahi/vulkan/hk_buffer.h
 create mode 100644 src/asahi/vulkan/hk_buffer_view.c
 create mode 100644 src/asahi/vulkan/hk_buffer_view.h
 create mode 100644 src/asahi/vulkan/hk_cmd_buffer.c
 create mode 100644 src/asahi/vulkan/hk_cmd_buffer.h
 create mode 100644 src/asahi/vulkan/hk_cmd_clear.c
 create mode 100644 src/asahi/vulkan/hk_cmd_dispatch.c
 create mode 100644 src/asahi/vulkan/hk_cmd_draw.c
 create mode 100644 src/asahi/vulkan/hk_cmd_meta.c
 create mode 100644 src/asahi/vulkan/hk_cmd_pool.c
 create mode 100644 src/asahi/vulkan/hk_cmd_pool.h
 create mode 100644 src/asahi/vulkan/hk_descriptor_set.c
 create mode 100644 src/asahi/vulkan/hk_descriptor_set.h
 create mode 100644 src/asahi/vulkan/hk_descriptor_set_layout.c
 create mode 100644 src/asahi/vulkan/hk_descriptor_set_layout.h
 create mode 100644 src/asahi/vulkan/hk_descriptor_table.c
 create mode 100644 src/asahi/vulkan/hk_descriptor_table.h
 create mode 100644 src/asahi/vulkan/hk_device.c
 create mode 100644 src/asahi/vulkan/hk_device.h
 create mode 100644 src/asahi/vulkan/hk_device_memory.c
 create mode 100644 src/asahi/vulkan/hk_device_memory.h
 create mode 100644 src/asahi/vulkan/hk_event.c
 create mode 100644 src/asahi/vulkan/hk_event.h
 create
mode 100644 src/asahi/vulkan/hk_format.c create mode 100644 src/asahi/vulkan/hk_image.c create mode 100644 src/asahi/vulkan/hk_image.h create mode 100644 src/asahi/vulkan/hk_image_view.c create mode 100644 src/asahi/vulkan/hk_image_view.h create mode 100644 src/asahi/vulkan/hk_instance.c create mode 100644 src/asahi/vulkan/hk_instance.h create mode 100644 src/asahi/vulkan/hk_nir_lower_descriptors.c create mode 100644 src/asahi/vulkan/hk_nir_passthrough_gs.c create mode 100644 src/asahi/vulkan/hk_physical_device.c create mode 100644 src/asahi/vulkan/hk_physical_device.h create mode 100644 src/asahi/vulkan/hk_private.h create mode 100644 src/asahi/vulkan/hk_query_pool.c create mode 100644 src/asahi/vulkan/hk_query_pool.h create mode 100644 src/asahi/vulkan/hk_queue.c create mode 100644 src/asahi/vulkan/hk_queue.h create mode 100644 src/asahi/vulkan/hk_sampler.c create mode 100644 src/asahi/vulkan/hk_sampler.h create mode 100644 src/asahi/vulkan/hk_shader.c create mode 100644 src/asahi/vulkan/hk_shader.h create mode 100644 src/asahi/vulkan/hk_wsi.c create mode 100644 src/asahi/vulkan/hk_wsi.h create mode 100644 src/asahi/vulkan/meson.build diff --git a/meson.build b/meson.build index d2eab192618..4afad99f68d 100644 --- a/meson.build +++ b/meson.build @@ -240,7 +240,7 @@ elif _vulkan_drivers.contains('all') _vulkan_drivers = ['amd', 'intel', 'intel_hasvk', 'swrast', 'freedreno', 'panfrost', 'virtio', 'broadcom', 'imagination-experimental', 'microsoft-experimental', - 'nouveau'] + 'nouveau', 'asahi'] endif with_intel_vk = _vulkan_drivers.contains('intel') @@ -255,6 +255,7 @@ with_imagination_vk = _vulkan_drivers.contains('imagination-experimental') with_imagination_srv = get_option('imagination-srv') with_microsoft_vk = _vulkan_drivers.contains('microsoft-experimental') with_nouveau_vk = _vulkan_drivers.contains('nouveau') +with_asahi_vk = _vulkan_drivers.contains('asahi') with_any_vk = _vulkan_drivers.length() != 0 if with_any_vk and host_machine.system() == 'windows' and meson.version().version_compare('< 1.3') @@ -850,7 +851,7 @@ if with_gallium_rusticl endif with_clover_spirv = with_gallium_clover and get_option('opencl-spirv') -with_clc = with_microsoft_clc or with_intel_clc or with_gallium_asahi or with_gallium_rusticl or with_clover_spirv +with_clc = with_microsoft_clc or with_intel_clc or with_gallium_asahi or with_asahi_vk or with_gallium_rusticl or with_clover_spirv dep_clc = null_dep if with_gallium_clover or with_clc diff --git a/meson_options.txt b/meson_options.txt index f8f4ec29513..ff669621267 100644 --- a/meson_options.txt +++ b/meson_options.txt @@ -228,7 +228,7 @@ option( value : ['auto'], choices : ['auto', 'amd', 'broadcom', 'freedreno', 'intel', 'intel_hasvk', 'panfrost', 'swrast', 'virtio', 'imagination-experimental', - 'microsoft-experimental', 'nouveau', 'all'], + 'microsoft-experimental', 'nouveau', 'asahi', 'all'], description : 'List of vulkan drivers to build. 
If this is set to auto ' + 'all drivers applicable to the target OS/architecture ' + 'will be built' diff --git a/src/.clang-format b/src/.clang-format index d13cd051cf4..142700a493c 100644 --- a/src/.clang-format +++ b/src/.clang-format @@ -186,6 +186,8 @@ ForEachMacros: # asahi - foreach_active - foreach_submitted + - hk_foreach_view + - hk_foreach_variant - AGX_BATCH_FOREACH_BO_HANDLE - agx_pack - agx_push diff --git a/src/asahi/meson.build b/src/asahi/meson.build index ac58326a822..c5f08ead519 100644 --- a/src/asahi/meson.build +++ b/src/asahi/meson.build @@ -6,7 +6,7 @@ inc_asahi = include_directories([ '.', 'layout', 'lib', 'genxml', 'compiler' ]) -if with_gallium_asahi +if with_gallium_asahi or with_asahi_vk subdir('layout') subdir('compiler') subdir('clc') @@ -14,6 +14,10 @@ if with_gallium_asahi subdir('lib') endif +if with_asahi_vk + subdir('vulkan') +endif + if with_tools.contains('drm-shim') subdir('drm-shim') endif diff --git a/src/asahi/vulkan/hk_buffer.c b/src/asahi/vulkan/hk_buffer.c new file mode 100644 index 00000000000..63bec5a0f70 --- /dev/null +++ b/src/asahi/vulkan/hk_buffer.c @@ -0,0 +1,286 @@ +/* + * Copyright 2024 Valve Corporation + * Copyright 2024 Alyssa Rosenzweig + * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc. + * SPDX-License-Identifier: MIT + */ +#include "hk_buffer.h" + +#include "hk_device.h" +#include "hk_device_memory.h" +#include "hk_entrypoints.h" +#include "hk_physical_device.h" + +static uint32_t +hk_get_buffer_alignment(const struct hk_physical_device *pdev, + VkBufferUsageFlags2KHR usage_flags, + VkBufferCreateFlags create_flags) +{ + uint32_t alignment = 16; + + if (usage_flags & VK_BUFFER_USAGE_2_UNIFORM_BUFFER_BIT_KHR) + alignment = MAX2(alignment, HK_MIN_UBO_ALIGNMENT); + + if (usage_flags & VK_BUFFER_USAGE_2_STORAGE_BUFFER_BIT_KHR) + alignment = MAX2(alignment, HK_MIN_SSBO_ALIGNMENT); + + if (usage_flags & (VK_BUFFER_USAGE_2_UNIFORM_TEXEL_BUFFER_BIT_KHR | + VK_BUFFER_USAGE_2_STORAGE_TEXEL_BUFFER_BIT_KHR)) + alignment = MAX2(alignment, HK_MIN_TEXEL_BUFFER_ALIGNMENT); + + if (create_flags & (VK_BUFFER_CREATE_SPARSE_BINDING_BIT | + VK_BUFFER_CREATE_DEVICE_ADDRESS_CAPTURE_REPLAY_BIT)) + alignment = MAX2(alignment, 4096); + + return alignment; +} + +static uint64_t +hk_get_bda_replay_addr(const VkBufferCreateInfo *pCreateInfo) +{ + uint64_t addr = 0; + vk_foreach_struct_const(ext, pCreateInfo->pNext) { + switch (ext->sType) { + case VK_STRUCTURE_TYPE_BUFFER_OPAQUE_CAPTURE_ADDRESS_CREATE_INFO: { + const VkBufferOpaqueCaptureAddressCreateInfo *bda = (void *)ext; + if (bda->opaqueCaptureAddress != 0) { +#ifdef NDEBUG + return bda->opaqueCaptureAddress; +#else + assert(addr == 0 || bda->opaqueCaptureAddress == addr); + addr = bda->opaqueCaptureAddress; +#endif + } + break; + } + + case VK_STRUCTURE_TYPE_BUFFER_DEVICE_ADDRESS_CREATE_INFO_EXT: { + const VkBufferDeviceAddressCreateInfoEXT *bda = (void *)ext; + if (bda->deviceAddress != 0) { +#ifdef NDEBUG + return bda->deviceAddress; +#else + assert(addr == 0 || bda->deviceAddress == addr); + addr = bda->deviceAddress; +#endif + } + break; + } + + default: + break; + } + } + + return addr; +} + +VKAPI_ATTR VkResult VKAPI_CALL +hk_CreateBuffer(VkDevice device, const VkBufferCreateInfo *pCreateInfo, + const VkAllocationCallbacks *pAllocator, VkBuffer *pBuffer) +{ + VK_FROM_HANDLE(hk_device, dev, device); + struct hk_buffer *buffer; + + if (pCreateInfo->size > HK_MAX_BUFFER_SIZE) + return vk_error(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY); + + buffer = + vk_buffer_create(&dev->vk, pCreateInfo, pAllocator, 
sizeof(*buffer)); + if (!buffer) + return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY); + + if (buffer->vk.size > 0 && + (buffer->vk.create_flags & + (VK_BUFFER_CREATE_SPARSE_BINDING_BIT | + VK_BUFFER_CREATE_DEVICE_ADDRESS_CAPTURE_REPLAY_BIT))) { + + unreachable("todo"); +#if 0 + const uint32_t alignment = + hk_get_buffer_alignment(hk_device_physical(dev), + buffer->vk.usage, + buffer->vk.create_flags); + assert(alignment >= 4096); + buffer->vma_size_B = align64(buffer->vk.size, alignment); + + const bool sparse_residency = + buffer->vk.create_flags & VK_BUFFER_CREATE_SPARSE_RESIDENCY_BIT; + const bool bda_capture_replay = + buffer->vk.create_flags & VK_BUFFER_CREATE_DEVICE_ADDRESS_CAPTURE_REPLAY_BIT; + + uint64_t bda_replay_addr = 0; + if (bda_capture_replay) + bda_replay_addr = hk_get_bda_replay_addr(pCreateInfo); + + buffer->addr = nouveau_ws_alloc_vma(dev->ws_dev, bda_replay_addr, + buffer->vma_size_B, + alignment, bda_capture_replay, + sparse_residency); +#endif + if (buffer->addr == 0) { + vk_buffer_destroy(&dev->vk, pAllocator, &buffer->vk); + return vk_errorf(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY, + "Sparse VMA allocation failed"); + } + } + + *pBuffer = hk_buffer_to_handle(buffer); + + return VK_SUCCESS; +} + +VKAPI_ATTR void VKAPI_CALL +hk_DestroyBuffer(VkDevice device, VkBuffer _buffer, + const VkAllocationCallbacks *pAllocator) +{ + VK_FROM_HANDLE(hk_device, dev, device); + VK_FROM_HANDLE(hk_buffer, buffer, _buffer); + + if (!buffer) + return; + + if (buffer->vma_size_B > 0) { + unreachable("todo"); +#if 0 + const bool sparse_residency = + buffer->vk.create_flags & VK_BUFFER_CREATE_SPARSE_RESIDENCY_BIT; + const bool bda_capture_replay = + buffer->vk.create_flags & + VK_BUFFER_CREATE_DEVICE_ADDRESS_CAPTURE_REPLAY_BIT; + + agx_bo_unbind_vma(dev->ws_dev, buffer->addr, buffer->vma_size_B); + nouveau_ws_free_vma(dev->ws_dev, buffer->addr, buffer->vma_size_B, + bda_capture_replay, sparse_residency); +#endif + } + + vk_buffer_destroy(&dev->vk, pAllocator, &buffer->vk); +} + +VKAPI_ATTR void VKAPI_CALL +hk_GetDeviceBufferMemoryRequirements( + VkDevice device, const VkDeviceBufferMemoryRequirements *pInfo, + VkMemoryRequirements2 *pMemoryRequirements) +{ + VK_FROM_HANDLE(hk_device, dev, device); + struct hk_physical_device *pdev = hk_device_physical(dev); + + const uint32_t alignment = hk_get_buffer_alignment( + hk_device_physical(dev), pInfo->pCreateInfo->usage, + pInfo->pCreateInfo->flags); + + pMemoryRequirements->memoryRequirements = (VkMemoryRequirements){ + .size = align64(pInfo->pCreateInfo->size, alignment), + .alignment = alignment, + .memoryTypeBits = BITFIELD_MASK(pdev->mem_type_count), + }; + + vk_foreach_struct_const(ext, pMemoryRequirements->pNext) { + switch (ext->sType) { + case VK_STRUCTURE_TYPE_MEMORY_DEDICATED_REQUIREMENTS: { + VkMemoryDedicatedRequirements *dedicated = (void *)ext; + dedicated->prefersDedicatedAllocation = false; + dedicated->requiresDedicatedAllocation = false; + break; + } + default: + vk_debug_ignored_stype(ext->sType); + break; + } + } +} + +VKAPI_ATTR void VKAPI_CALL +hk_GetPhysicalDeviceExternalBufferProperties( + VkPhysicalDevice physicalDevice, + const VkPhysicalDeviceExternalBufferInfo *pExternalBufferInfo, + VkExternalBufferProperties *pExternalBufferProperties) +{ + /* The Vulkan 1.3.256 spec says: + * + * VUID-VkPhysicalDeviceExternalBufferInfo-handleType-parameter + * + * "handleType must be a valid VkExternalMemoryHandleTypeFlagBits value" + * + * This differs from VkPhysicalDeviceExternalImageFormatInfo, which + * surprisingly permits 
handleType == 0. + */ + assert(pExternalBufferInfo->handleType != 0); + + /* All of the current flags are for sparse which we don't support yet. + * Even when we do support it, doing sparse on external memory sounds + * sketchy. Also, just disallowing flags is the safe option. + */ + if (pExternalBufferInfo->flags) + goto unsupported; + + switch (pExternalBufferInfo->handleType) { + case VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT: + case VK_EXTERNAL_MEMORY_HANDLE_TYPE_DMA_BUF_BIT_EXT: + pExternalBufferProperties->externalMemoryProperties = + hk_dma_buf_mem_props; + return; + default: + goto unsupported; + } + +unsupported: + /* From the Vulkan 1.3.256 spec: + * + * compatibleHandleTypes must include at least handleType. + */ + pExternalBufferProperties->externalMemoryProperties = + (VkExternalMemoryProperties){ + .compatibleHandleTypes = pExternalBufferInfo->handleType, + }; +} + +VKAPI_ATTR VkResult VKAPI_CALL +hk_BindBufferMemory2(VkDevice device, uint32_t bindInfoCount, + const VkBindBufferMemoryInfo *pBindInfos) +{ + for (uint32_t i = 0; i < bindInfoCount; ++i) { + VK_FROM_HANDLE(hk_device_memory, mem, pBindInfos[i].memory); + VK_FROM_HANDLE(hk_buffer, buffer, pBindInfos[i].buffer); + + if (buffer->vma_size_B) { + unreachable("todo"); +#if 0 + VK_FROM_HANDLE(hk_device, dev, device); + agx_bo_bind_vma(dev->ws_dev, + mem->bo, + buffer->addr, + buffer->vma_size_B, + pBindInfos[i].memoryOffset, + 0 /* pte_kind */); +#endif + } else { + buffer->addr = mem->bo->ptr.gpu + pBindInfos[i].memoryOffset; + } + + const VkBindMemoryStatusKHR *status = + vk_find_struct_const(pBindInfos[i].pNext, BIND_MEMORY_STATUS_KHR); + if (status != NULL && status->pResult != NULL) + *status->pResult = VK_SUCCESS; + } + return VK_SUCCESS; +} + +VKAPI_ATTR VkDeviceAddress VKAPI_CALL +hk_GetBufferDeviceAddress(UNUSED VkDevice device, + const VkBufferDeviceAddressInfo *pInfo) +{ + VK_FROM_HANDLE(hk_buffer, buffer, pInfo->buffer); + + return hk_buffer_address(buffer, 0); +} + +VKAPI_ATTR uint64_t VKAPI_CALL +hk_GetBufferOpaqueCaptureAddress(UNUSED VkDevice device, + const VkBufferDeviceAddressInfo *pInfo) +{ + VK_FROM_HANDLE(hk_buffer, buffer, pInfo->buffer); + + return hk_buffer_address(buffer, 0); +} diff --git a/src/asahi/vulkan/hk_buffer.h b/src/asahi/vulkan/hk_buffer.h new file mode 100644 index 00000000000..f349a3df0e2 --- /dev/null +++ b/src/asahi/vulkan/hk_buffer.h @@ -0,0 +1,45 @@ +/* + * Copyright 2024 Valve Corporation + * Copyright 2024 Alyssa Rosenzweig + * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc. + * SPDX-License-Identifier: MIT + */ +#pragma once + +#include "hk_device_memory.h" +#include "hk_private.h" + +#include "vk_buffer.h" + +struct hk_device_memory; +struct hk_physical_device; + +struct hk_buffer { + struct vk_buffer vk; + uint64_t addr; + + /** Size of the reserved VMA range for sparse buffers, zero otherwise. 
*/ + uint64_t vma_size_B; +}; + +VK_DEFINE_NONDISP_HANDLE_CASTS(hk_buffer, vk.base, VkBuffer, + VK_OBJECT_TYPE_BUFFER) + +static inline uint64_t +hk_buffer_address(const struct hk_buffer *buffer, uint64_t offset) +{ + return buffer->addr + offset; +} + +static inline struct hk_addr_range +hk_buffer_addr_range(const struct hk_buffer *buffer, uint64_t offset, + uint64_t range) +{ + if (buffer == NULL) + return (struct hk_addr_range){.range = 0}; + + return (struct hk_addr_range){ + .addr = hk_buffer_address(buffer, offset), + .range = vk_buffer_range(&buffer->vk, offset, range), + }; +} diff --git a/src/asahi/vulkan/hk_buffer_view.c b/src/asahi/vulkan/hk_buffer_view.c new file mode 100644 index 00000000000..73d32d945ae --- /dev/null +++ b/src/asahi/vulkan/hk_buffer_view.c @@ -0,0 +1,195 @@ +/* + * Copyright 2024 Valve Corporation + * Copyright 2024 Alyssa Rosenzweig + * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc. + * SPDX-License-Identifier: MIT + */ +#include "hk_buffer_view.h" +#include "asahi/lib/agx_formats.h" +#include "asahi/lib/agx_nir_lower_vbo.h" +#include "util/bitscan.h" +#include "util/format/u_format.h" +#include "util/format/u_formats.h" + +#include "agx_helpers.h" +#include "agx_nir_passes.h" +#include "agx_pack.h" +#include "hk_buffer.h" +#include "hk_device.h" +#include "hk_entrypoints.h" +#include "hk_physical_device.h" + +#include "vk_format.h" + +VkFormatFeatureFlags2 +hk_get_buffer_format_features(struct hk_physical_device *pdev, + VkFormat vk_format) +{ + VkFormatFeatureFlags2 features = 0; + enum pipe_format p_format = vk_format_to_pipe_format(vk_format); + + if (p_format == PIPE_FORMAT_NONE) + return 0; + + if (agx_vbo_supports_format(p_format)) + features |= VK_FORMAT_FEATURE_2_VERTEX_BUFFER_BIT; + + if (agx_pixel_format[p_format].texturable && + !util_format_is_depth_or_stencil(p_format)) { + + features |= VK_FORMAT_FEATURE_2_UNIFORM_TEXEL_BUFFER_BIT; + + /* RGB32 specially supported for uniform texel buffers only. 
*/ + if (util_is_power_of_two_nonzero(util_format_get_blocksize(p_format))) { + features |= VK_FORMAT_FEATURE_2_STORAGE_TEXEL_BUFFER_BIT | + VK_FORMAT_FEATURE_2_STORAGE_WRITE_WITHOUT_FORMAT_BIT; + } + + if (p_format == PIPE_FORMAT_R32_UINT || p_format == PIPE_FORMAT_R32_SINT) + features |= VK_FORMAT_FEATURE_2_STORAGE_TEXEL_BUFFER_ATOMIC_BIT; + } + + return features; +} + +VKAPI_ATTR VkResult VKAPI_CALL +hk_CreateBufferView(VkDevice _device, const VkBufferViewCreateInfo *pCreateInfo, + const VkAllocationCallbacks *pAllocator, + VkBufferView *pBufferView) +{ + VK_FROM_HANDLE(hk_device, device, _device); + VK_FROM_HANDLE(hk_buffer, buffer, pCreateInfo->buffer); + struct hk_buffer_view *view; + VkResult result; + + view = vk_buffer_view_create(&device->vk, pCreateInfo, pAllocator, + sizeof(*view)); + if (!view) + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + + enum pipe_format format = vk_format_to_pipe_format(view->vk.format); + const struct util_format_description *desc = util_format_description(format); + + uint8_t format_swizzle[4] = { + desc->swizzle[0], + desc->swizzle[1], + desc->swizzle[2], + desc->swizzle[3], + }; + + if (util_format_is_depth_or_stencil(format)) { + assert(!util_format_is_depth_and_stencil(format) && + "separate stencil always used"); + + /* Broadcast depth and stencil */ + format_swizzle[0] = 0; + format_swizzle[1] = 0; + format_swizzle[2] = 0; + format_swizzle[3] = 0; + } + + /* Decompose the offset into a multiple of 16-bytes (which we can include in + * the address) and an extra texel-aligned tail offset of up to 15 bytes. + * + * This lets us offset partially in the shader instead, getting + * around alignment restrictions on the base address pointer. + */ + uint64_t base = hk_buffer_address(buffer, 0) + (view->vk.offset & ~0xf); + uint32_t tail_offset_B = view->vk.offset & 0xf; + uint32_t tail_offset_el = tail_offset_B / util_format_get_blocksize(format); + assert(tail_offset_el * util_format_get_blocksize(format) == tail_offset_B && + "must be texel aligned"); + + struct agx_texture_packed tex; + agx_pack(&tex, TEXTURE, cfg) { + cfg.dimension = AGX_TEXTURE_DIMENSION_2D; + cfg.layout = AGX_LAYOUT_LINEAR; + cfg.channels = agx_pixel_format[format].channels; + cfg.type = agx_pixel_format[format].type; + cfg.swizzle_r = agx_channel_from_pipe(format_swizzle[0]); + cfg.swizzle_g = agx_channel_from_pipe(format_swizzle[1]); + cfg.swizzle_b = agx_channel_from_pipe(format_swizzle[2]); + cfg.swizzle_a = agx_channel_from_pipe(format_swizzle[3]); + + cfg.width = AGX_TEXTURE_BUFFER_WIDTH; + cfg.height = DIV_ROUND_UP(view->vk.elements, cfg.width); + cfg.first_level = cfg.last_level = 0; + + cfg.address = base; + cfg.buffer_size_sw = view->vk.elements; + cfg.buffer_offset_sw = tail_offset_el; + + cfg.srgb = (desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB); + cfg.srgb_2_channel = cfg.srgb && util_format_colormask(desc) == 0x3; + + cfg.depth = 1; + cfg.stride = (cfg.width * util_format_get_blocksize(format)) - 16; + } + + struct agx_pbe_packed pbe; + agx_pack(&pbe, PBE, cfg) { + cfg.dimension = AGX_TEXTURE_DIMENSION_2D; + cfg.layout = AGX_LAYOUT_LINEAR; + cfg.channels = agx_pixel_format[format].channels; + cfg.type = agx_pixel_format[format].type; + cfg.srgb = util_format_is_srgb(format); + + assert(desc->nr_channels >= 1 && desc->nr_channels <= 4); + + for (unsigned i = 0; i < desc->nr_channels; ++i) { + if (desc->swizzle[i] == 0) + cfg.swizzle_r = i; + else if (desc->swizzle[i] == 1) + cfg.swizzle_g = i; + else if (desc->swizzle[i] == 2) + cfg.swizzle_b = i; + else 
if (desc->swizzle[i] == 3) + cfg.swizzle_a = i; + } + + cfg.buffer = base; + cfg.buffer_offset_sw = tail_offset_el; + + cfg.width = AGX_TEXTURE_BUFFER_WIDTH; + cfg.height = DIV_ROUND_UP(view->vk.elements, cfg.width); + cfg.level = 0; + cfg.stride = (cfg.width * util_format_get_blocksize(format)) - 4; + cfg.layers = 1; + cfg.levels = 1; + }; + + result = hk_descriptor_table_add(device, &device->images, &tex, sizeof(tex), + &view->tex_desc_index); + if (result != VK_SUCCESS) { + vk_buffer_view_destroy(&device->vk, pAllocator, &view->vk); + return result; + } + + result = hk_descriptor_table_add(device, &device->images, &pbe, sizeof(pbe), + &view->pbe_desc_index); + if (result != VK_SUCCESS) { + hk_descriptor_table_remove(device, &device->images, view->tex_desc_index); + vk_buffer_view_destroy(&device->vk, pAllocator, &view->vk); + return result; + } + + *pBufferView = hk_buffer_view_to_handle(view); + + return VK_SUCCESS; +} + +VKAPI_ATTR void VKAPI_CALL +hk_DestroyBufferView(VkDevice _device, VkBufferView bufferView, + const VkAllocationCallbacks *pAllocator) +{ + VK_FROM_HANDLE(hk_device, device, _device); + VK_FROM_HANDLE(hk_buffer_view, view, bufferView); + + if (!view) + return; + + hk_descriptor_table_remove(device, &device->images, view->tex_desc_index); + hk_descriptor_table_remove(device, &device->images, view->pbe_desc_index); + + vk_buffer_view_destroy(&device->vk, pAllocator, &view->vk); +} diff --git a/src/asahi/vulkan/hk_buffer_view.h b/src/asahi/vulkan/hk_buffer_view.h new file mode 100644 index 00000000000..6b182006f1a --- /dev/null +++ b/src/asahi/vulkan/hk_buffer_view.h @@ -0,0 +1,27 @@ +/* + * Copyright 2024 Valve Corporation + * Copyright 2024 Alyssa Rosenzweig + * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc. + * SPDX-License-Identifier: MIT + */ +#pragma once + +#include "hk_private.h" + +#include "vk_buffer_view.h" + +struct hk_physical_device; + +VkFormatFeatureFlags2 +hk_get_buffer_format_features(struct hk_physical_device *pdevice, + VkFormat format); + +struct hk_buffer_view { + struct vk_buffer_view vk; + + /** Index in the image descriptor table */ + uint32_t tex_desc_index, pbe_desc_index; +}; + +VK_DEFINE_NONDISP_HANDLE_CASTS(hk_buffer_view, vk.base, VkBufferView, + VK_OBJECT_TYPE_BUFFER_VIEW) diff --git a/src/asahi/vulkan/hk_cmd_buffer.c b/src/asahi/vulkan/hk_cmd_buffer.c new file mode 100644 index 00000000000..b3b362bf2b7 --- /dev/null +++ b/src/asahi/vulkan/hk_cmd_buffer.c @@ -0,0 +1,811 @@ +/* + * Copyright 2024 Valve Corporation + * Copyright 2024 Alyssa Rosenzweig + * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc. 
+ * SPDX-License-Identifier: MIT + */ +#include "hk_cmd_buffer.h" + +#include "agx_bo.h" +#include "agx_linker.h" +#include "agx_tilebuffer.h" +#include "agx_usc.h" +#include "hk_buffer.h" +#include "hk_cmd_pool.h" +#include "hk_descriptor_set.h" +#include "hk_descriptor_set_layout.h" +#include "hk_device.h" +#include "hk_device_memory.h" +#include "hk_entrypoints.h" +#include "hk_image_view.h" +#include "hk_physical_device.h" +#include "hk_shader.h" + +#include "pool.h" +#include "shader_enums.h" +#include "vk_pipeline_layout.h" +#include "vk_synchronization.h" + +#include "nouveau/nouveau.h" +#include "util/list.h" +#include "util/macros.h" +#include "util/u_dynarray.h" +#include "vulkan/vulkan_core.h" + +static void +hk_descriptor_state_fini(struct hk_cmd_buffer *cmd, + struct hk_descriptor_state *desc) +{ + struct hk_cmd_pool *pool = hk_cmd_buffer_pool(cmd); + + for (unsigned i = 0; i < HK_MAX_SETS; i++) { + vk_free(&pool->vk.alloc, desc->push[i]); + desc->push[i] = NULL; + } +} + +static void +hk_free_resettable_cmd_buffer(struct hk_cmd_buffer *cmd) +{ + struct hk_cmd_pool *pool = hk_cmd_buffer_pool(cmd); + + hk_descriptor_state_fini(cmd, &cmd->state.gfx.descriptors); + hk_descriptor_state_fini(cmd, &cmd->state.cs.descriptors); + + hk_cmd_pool_free_bo_list(pool, &cmd->uploader.main.bos); + hk_cmd_pool_free_usc_bo_list(pool, &cmd->uploader.usc.bos); + + list_for_each_entry_safe(struct hk_cs, it, &cmd->control_streams, node) { + list_del(&it->node); + hk_cs_destroy(it); + } + + util_dynarray_foreach(&cmd->large_bos, struct agx_bo *, bo) { + agx_bo_unreference(*bo); + } + + util_dynarray_clear(&cmd->large_bos); +} + +static void +hk_destroy_cmd_buffer(struct vk_command_buffer *vk_cmd_buffer) +{ + struct hk_cmd_buffer *cmd = + container_of(vk_cmd_buffer, struct hk_cmd_buffer, vk); + struct hk_cmd_pool *pool = hk_cmd_buffer_pool(cmd); + + hk_free_resettable_cmd_buffer(cmd); + vk_command_buffer_finish(&cmd->vk); + vk_free(&pool->vk.alloc, cmd); +} + +static VkResult +hk_create_cmd_buffer(struct vk_command_pool *vk_pool, + VkCommandBufferLevel level, + struct vk_command_buffer **cmd_buffer_out) +{ + struct hk_cmd_pool *pool = container_of(vk_pool, struct hk_cmd_pool, vk); + struct hk_device *dev = hk_cmd_pool_device(pool); + struct hk_cmd_buffer *cmd; + VkResult result; + + cmd = vk_zalloc(&pool->vk.alloc, sizeof(*cmd), 8, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + if (cmd == NULL) + return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY); + + result = + vk_command_buffer_init(&pool->vk, &cmd->vk, &hk_cmd_buffer_ops, level); + if (result != VK_SUCCESS) { + vk_free(&pool->vk.alloc, cmd); + return result; + } + + util_dynarray_init(&cmd->large_bos, NULL); + + cmd->vk.dynamic_graphics_state.vi = &cmd->state.gfx._dynamic_vi; + cmd->vk.dynamic_graphics_state.ms.sample_locations = + &cmd->state.gfx._dynamic_sl; + + list_inithead(&cmd->uploader.main.bos); + list_inithead(&cmd->uploader.usc.bos); + list_inithead(&cmd->control_streams); + + *cmd_buffer_out = &cmd->vk; + + return VK_SUCCESS; +} + +static void +hk_reset_cmd_buffer(struct vk_command_buffer *vk_cmd_buffer, + UNUSED VkCommandBufferResetFlags flags) +{ + struct hk_cmd_buffer *cmd = + container_of(vk_cmd_buffer, struct hk_cmd_buffer, vk); + + vk_command_buffer_reset(&cmd->vk); + hk_free_resettable_cmd_buffer(cmd); + + cmd->uploader.main.map = NULL; + cmd->uploader.main.base = 0; + cmd->uploader.main.offset = 0; + cmd->uploader.usc.map = NULL; + cmd->uploader.usc.base = 0; + cmd->uploader.usc.offset = 0; + + cmd->current_cs.gfx = NULL; + 
cmd->current_cs.cs = NULL; + cmd->current_cs.post_gfx = NULL; + cmd->current_cs.pre_gfx = NULL; + + /* TODO: clear pool! */ + + memset(&cmd->state, 0, sizeof(cmd->state)); +} + +const struct vk_command_buffer_ops hk_cmd_buffer_ops = { + .create = hk_create_cmd_buffer, + .reset = hk_reset_cmd_buffer, + .destroy = hk_destroy_cmd_buffer, +}; + +static VkResult +hk_cmd_buffer_alloc_bo(struct hk_cmd_buffer *cmd, bool usc, + struct hk_cmd_bo **bo_out) +{ + VkResult result = hk_cmd_pool_alloc_bo(hk_cmd_buffer_pool(cmd), usc, bo_out); + if (result != VK_SUCCESS) + return result; + + if (usc) + list_addtail(&(*bo_out)->link, &cmd->uploader.usc.bos); + else + list_addtail(&(*bo_out)->link, &cmd->uploader.main.bos); + + return VK_SUCCESS; +} + +struct agx_ptr +hk_pool_alloc_internal(struct hk_cmd_buffer *cmd, uint32_t size, + uint32_t alignment, bool usc) +{ + struct hk_device *dev = hk_cmd_buffer_device(cmd); + struct hk_uploader *uploader = + usc ? &cmd->uploader.usc : &cmd->uploader.main; + + /* Specially handle large allocations owned by the command buffer, e.g. used + * for statically allocated vertex output buffers with geometry shaders. + */ + if (size > HK_CMD_BO_SIZE) { + uint32_t flags = usc ? AGX_BO_LOW_VA : 0; + struct agx_bo *bo = + agx_bo_create(&dev->dev, size, flags, "Large pool allocation"); + + util_dynarray_append(&cmd->large_bos, struct agx_bo *, bo); + return bo->ptr; + } + + assert(size <= HK_CMD_BO_SIZE); + assert(alignment > 0); + + uint32_t offset = align(uploader->offset, alignment); + + assert(offset <= HK_CMD_BO_SIZE); + if (uploader->map != NULL && size <= HK_CMD_BO_SIZE - offset) { + uploader->offset = offset + size; + + return (struct agx_ptr){ + .gpu = uploader->base + offset, + .cpu = uploader->map + offset, + }; + } + + struct hk_cmd_bo *bo; + VkResult result = hk_cmd_buffer_alloc_bo(cmd, usc, &bo); + if (unlikely(result != VK_SUCCESS)) { + vk_command_buffer_set_error(&cmd->vk, result); + return (struct agx_ptr){0}; + } + + /* Pick whichever of the current upload BO and the new BO will have more + * room left to be the BO for the next upload. If our upload size is + * bigger than the old offset, we're better off burning the whole new + * upload BO on this one allocation and continuing on the current upload + * BO. 
+ */ + if (uploader->map == NULL || size < uploader->offset) { + uploader->map = bo->bo->ptr.cpu; + uploader->base = bo->bo->ptr.gpu; + uploader->offset = size; + } + + return (struct agx_ptr){ + .gpu = bo->bo->ptr.gpu, + .cpu = bo->map, + }; +} + +uint64_t +hk_pool_upload(struct hk_cmd_buffer *cmd, const void *data, uint32_t size, + uint32_t alignment) +{ + struct agx_ptr T = hk_pool_alloc(cmd, size, alignment); + if (unlikely(T.cpu == NULL)) + return 0; + + memcpy(T.cpu, data, size); + return T.gpu; +} + +VKAPI_ATTR VkResult VKAPI_CALL +hk_BeginCommandBuffer(VkCommandBuffer commandBuffer, + const VkCommandBufferBeginInfo *pBeginInfo) +{ + VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer); + + hk_reset_cmd_buffer(&cmd->vk, 0); + + hk_cmd_buffer_begin_compute(cmd, pBeginInfo); + hk_cmd_buffer_begin_graphics(cmd, pBeginInfo); + + return VK_SUCCESS; +} + +VKAPI_ATTR VkResult VKAPI_CALL +hk_EndCommandBuffer(VkCommandBuffer commandBuffer) +{ + VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer); + + assert(cmd->current_cs.gfx == NULL && cmd->current_cs.pre_gfx == NULL && + "must end rendering before ending the command buffer"); + + hk_cmd_buffer_end_compute(cmd); + hk_cmd_buffer_end_compute_internal(&cmd->current_cs.post_gfx); + + return vk_command_buffer_get_record_result(&cmd->vk); +} + +VKAPI_ATTR void VKAPI_CALL +hk_CmdPipelineBarrier2(VkCommandBuffer commandBuffer, + const VkDependencyInfo *pDependencyInfo) +{ + VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer); + + /* The big hammer. We end both compute and graphics batches. Ending compute + * here is necessary to properly handle graphics->compute dependencies. + * + * XXX: perf. */ + hk_cmd_buffer_end_compute(cmd); + hk_cmd_buffer_end_graphics(cmd); +} + +void +hk_cmd_bind_shaders(struct vk_command_buffer *vk_cmd, uint32_t stage_count, + const gl_shader_stage *stages, + struct vk_shader **const shaders) +{ + struct hk_cmd_buffer *cmd = container_of(vk_cmd, struct hk_cmd_buffer, vk); + + for (uint32_t i = 0; i < stage_count; i++) { + struct hk_api_shader *shader = + container_of(shaders[i], struct hk_api_shader, vk); + + if (stages[i] == MESA_SHADER_COMPUTE || stages[i] == MESA_SHADER_KERNEL) + hk_cmd_bind_compute_shader(cmd, shader); + else + hk_cmd_bind_graphics_shader(cmd, stages[i], shader); + } +} + +static void +hk_bind_descriptor_sets(UNUSED struct hk_cmd_buffer *cmd, + struct hk_descriptor_state *desc, + const VkBindDescriptorSetsInfoKHR *info) +{ + VK_FROM_HANDLE(vk_pipeline_layout, pipeline_layout, info->layout); + + /* Fro the Vulkan 1.3.275 spec: + * + * "When binding a descriptor set (see Descriptor Set Binding) to + * set number N... + * + * If, additionally, the previously bound descriptor set for set + * N was bound using a pipeline layout not compatible for set N, + * then all bindings in sets numbered greater than N are + * disturbed." + * + * This means that, if some earlier set gets bound in such a way that + * it changes set_dynamic_buffer_start[s], this binding is implicitly + * invalidated. Therefore, we can always look at the current value + * of set_dynamic_buffer_start[s] as the base of our dynamic buffer + * range and it's only our responsibility to adjust all + * set_dynamic_buffer_start[p] for p > s as needed. 
+ */ + uint8_t dyn_buffer_start = + desc->root.set_dynamic_buffer_start[info->firstSet]; + + uint32_t next_dyn_offset = 0; + for (uint32_t i = 0; i < info->descriptorSetCount; ++i) { + unsigned s = i + info->firstSet; + VK_FROM_HANDLE(hk_descriptor_set, set, info->pDescriptorSets[i]); + + if (desc->sets[s] != set) { + if (set != NULL) { + desc->root.sets[s] = hk_descriptor_set_addr(set); + desc->set_sizes[s] = set->size; + } else { + desc->root.sets[s] = 0; + desc->set_sizes[s] = 0; + } + desc->sets[s] = set; + desc->sets_dirty |= BITFIELD_BIT(s); + + /* Binding descriptors invalidates push descriptors */ + desc->push_dirty &= ~BITFIELD_BIT(s); + } + + desc->root.set_dynamic_buffer_start[s] = dyn_buffer_start; + + if (pipeline_layout->set_layouts[s] != NULL) { + const struct hk_descriptor_set_layout *set_layout = + vk_to_hk_descriptor_set_layout(pipeline_layout->set_layouts[s]); + + if (set != NULL && set_layout->dynamic_buffer_count > 0) { + for (uint32_t j = 0; j < set_layout->dynamic_buffer_count; j++) { + struct hk_buffer_address addr = set->dynamic_buffers[j]; + addr.base_addr += info->pDynamicOffsets[next_dyn_offset + j]; + desc->root.dynamic_buffers[dyn_buffer_start + j] = addr; + } + next_dyn_offset += set->layout->dynamic_buffer_count; + } + + dyn_buffer_start += set_layout->dynamic_buffer_count; + } else { + assert(set == NULL); + } + } + assert(dyn_buffer_start <= HK_MAX_DYNAMIC_BUFFERS); + assert(next_dyn_offset <= info->dynamicOffsetCount); + + for (uint32_t s = info->firstSet + info->descriptorSetCount; s < HK_MAX_SETS; + s++) + desc->root.set_dynamic_buffer_start[s] = dyn_buffer_start; + + desc->root_dirty = true; +} + +VKAPI_ATTR void VKAPI_CALL +hk_CmdBindDescriptorSets2KHR( + VkCommandBuffer commandBuffer, + const VkBindDescriptorSetsInfoKHR *pBindDescriptorSetsInfo) +{ + VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer); + + if (pBindDescriptorSetsInfo->stageFlags & VK_SHADER_STAGE_ALL_GRAPHICS) { + hk_bind_descriptor_sets(cmd, &cmd->state.gfx.descriptors, + pBindDescriptorSetsInfo); + } + + if (pBindDescriptorSetsInfo->stageFlags & VK_SHADER_STAGE_COMPUTE_BIT) { + hk_bind_descriptor_sets(cmd, &cmd->state.cs.descriptors, + pBindDescriptorSetsInfo); + } +} + +static void +hk_push_constants(UNUSED struct hk_cmd_buffer *cmd, + struct hk_descriptor_state *desc, + const VkPushConstantsInfoKHR *info) +{ + memcpy(desc->root.push + info->offset, info->pValues, info->size); + desc->root_dirty = true; +} + +VKAPI_ATTR void VKAPI_CALL +hk_CmdPushConstants2KHR(VkCommandBuffer commandBuffer, + const VkPushConstantsInfoKHR *pPushConstantsInfo) +{ + VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer); + + if (pPushConstantsInfo->stageFlags & VK_SHADER_STAGE_ALL_GRAPHICS) + hk_push_constants(cmd, &cmd->state.gfx.descriptors, pPushConstantsInfo); + + if (pPushConstantsInfo->stageFlags & VK_SHADER_STAGE_COMPUTE_BIT) + hk_push_constants(cmd, &cmd->state.cs.descriptors, pPushConstantsInfo); +} + +static struct hk_push_descriptor_set * +hk_cmd_push_descriptors(struct hk_cmd_buffer *cmd, + struct hk_descriptor_state *desc, uint32_t set) +{ + assert(set < HK_MAX_SETS); + if (unlikely(desc->push[set] == NULL)) { + desc->push[set] = + vk_zalloc(&cmd->vk.pool->alloc, sizeof(*desc->push[set]), 8, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + if (unlikely(desc->push[set] == NULL)) { + vk_command_buffer_set_error(&cmd->vk, VK_ERROR_OUT_OF_HOST_MEMORY); + return NULL; + } + } + + /* Pushing descriptors replaces whatever sets are bound */ + desc->sets[set] = NULL; + desc->push_dirty |= BITFIELD_BIT(set); + + 
return desc->push[set]; +} + +static void +hk_push_descriptor_set(struct hk_cmd_buffer *cmd, + struct hk_descriptor_state *desc, + const VkPushDescriptorSetInfoKHR *info) +{ + VK_FROM_HANDLE(vk_pipeline_layout, pipeline_layout, info->layout); + + struct hk_push_descriptor_set *push_set = + hk_cmd_push_descriptors(cmd, desc, info->set); + if (unlikely(push_set == NULL)) + return; + + struct hk_descriptor_set_layout *set_layout = + vk_to_hk_descriptor_set_layout(pipeline_layout->set_layouts[info->set]); + + hk_push_descriptor_set_update(push_set, set_layout, + info->descriptorWriteCount, + info->pDescriptorWrites); +} + +VKAPI_ATTR void VKAPI_CALL +hk_CmdPushDescriptorSet2KHR( + VkCommandBuffer commandBuffer, + const VkPushDescriptorSetInfoKHR *pPushDescriptorSetInfo) +{ + VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer); + + if (pPushDescriptorSetInfo->stageFlags & VK_SHADER_STAGE_ALL_GRAPHICS) { + hk_push_descriptor_set(cmd, &cmd->state.gfx.descriptors, + pPushDescriptorSetInfo); + } + + if (pPushDescriptorSetInfo->stageFlags & VK_SHADER_STAGE_COMPUTE_BIT) { + hk_push_descriptor_set(cmd, &cmd->state.cs.descriptors, + pPushDescriptorSetInfo); + } +} + +void +hk_cmd_buffer_flush_push_descriptors(struct hk_cmd_buffer *cmd, + struct hk_descriptor_state *desc) +{ + u_foreach_bit(set_idx, desc->push_dirty) { + struct hk_push_descriptor_set *push_set = desc->push[set_idx]; + uint64_t push_set_addr = hk_pool_upload( + cmd, push_set->data, sizeof(push_set->data), HK_MIN_UBO_ALIGNMENT); + + desc->root.sets[set_idx] = push_set_addr; + desc->set_sizes[set_idx] = sizeof(push_set->data); + } + + desc->root_dirty = true; + desc->push_dirty = 0; +} + +VKAPI_ATTR void VKAPI_CALL +hk_CmdPushDescriptorSetWithTemplate2KHR( + VkCommandBuffer commandBuffer, const VkPushDescriptorSetWithTemplateInfoKHR + *pPushDescriptorSetWithTemplateInfo) +{ + VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer); + VK_FROM_HANDLE(vk_descriptor_update_template, template, + pPushDescriptorSetWithTemplateInfo->descriptorUpdateTemplate); + VK_FROM_HANDLE(vk_pipeline_layout, pipeline_layout, + pPushDescriptorSetWithTemplateInfo->layout); + + struct hk_descriptor_state *desc = + hk_get_descriptors_state(cmd, template->bind_point); + struct hk_push_descriptor_set *push_set = hk_cmd_push_descriptors( + cmd, desc, pPushDescriptorSetWithTemplateInfo->set); + if (unlikely(push_set == NULL)) + return; + + struct hk_descriptor_set_layout *set_layout = vk_to_hk_descriptor_set_layout( + pipeline_layout->set_layouts[pPushDescriptorSetWithTemplateInfo->set]); + + hk_push_descriptor_set_update_template( + push_set, set_layout, template, + pPushDescriptorSetWithTemplateInfo->pData); +} + +uint64_t +hk_cmd_buffer_upload_root(struct hk_cmd_buffer *cmd, + VkPipelineBindPoint bind_point) +{ + struct hk_descriptor_state *desc = hk_get_descriptors_state(cmd, bind_point); + struct hk_root_descriptor_table *root = &desc->root; + + struct agx_ptr root_ptr = hk_pool_alloc(cmd, sizeof(*root), 8); + if (!root_ptr.cpu) + return 0; + + root->root_desc_addr = root_ptr.gpu; + + memcpy(root_ptr.cpu, root, sizeof(*root)); + return root_ptr.gpu; +} + +void +hk_usc_upload_spilled_rt_descs(struct agx_usc_builder *b, + struct hk_cmd_buffer *cmd) +{ + struct hk_rendering_state *render = &cmd->state.gfx.render; + + /* Upload texture/PBE descriptors for each render target so we can clear + * spilled render targets. 
+ */ + struct agx_ptr descs = + hk_pool_alloc(cmd, AGX_TEXTURE_LENGTH * 2 * render->color_att_count, 64); + struct agx_texture_packed *desc = descs.cpu; + if (!desc) + return; + + for (unsigned i = 0; i < render->color_att_count; ++i) { + struct hk_image_view *iview = render->color_att[i].iview; + if (!iview) { + /* XXX: probably should emit a null descriptor here...? */ + continue; + } + + memcpy(&desc[(i * 2) + 0], &iview->planes[0].emrt_texture, sizeof(*desc)); + memcpy(&desc[(i * 2) + 1], &iview->planes[0].emrt_pbe, sizeof(*desc)); + } + + desc = descs.cpu; + + /* Bind the base as u0_u1 for bindless access */ + agx_usc_uniform(b, 0, 4, hk_pool_upload(cmd, &descs.gpu, 8, 8)); +} + +void +hk_reserve_scratch(struct hk_cmd_buffer *cmd, struct hk_cs *cs, + struct hk_shader *s) +{ + struct hk_device *dev = hk_cmd_buffer_device(cmd); + uint32_t max_scratch_size = + MAX2(s->b.info.scratch_size, s->b.info.preamble_scratch_size); + + if (max_scratch_size == 0) + return; + + unsigned preamble_size = (s->b.info.preamble_scratch_size > 0) ? 1 : 0; + + /* XXX: need to lock around agx_scratch_alloc... */ + /* Note: this uses the hardware stage, not the software stage */ + switch (s->b.info.stage) { + case PIPE_SHADER_FRAGMENT: + agx_scratch_alloc(&dev->scratch.fs, max_scratch_size, 0); + cs->scratch.fs.main = true; + cs->scratch.fs.preamble = MAX2(cs->scratch.fs.preamble, preamble_size); + break; + case PIPE_SHADER_VERTEX: + agx_scratch_alloc(&dev->scratch.vs, max_scratch_size, 0); + cs->scratch.vs.main = true; + cs->scratch.vs.preamble = MAX2(cs->scratch.vs.preamble, preamble_size); + break; + default: + agx_scratch_alloc(&dev->scratch.cs, max_scratch_size, 0); + cs->scratch.cs.main = true; + cs->scratch.cs.preamble = MAX2(cs->scratch.cs.preamble, preamble_size); + break; + } +} + +uint32_t +hk_upload_usc_words(struct hk_cmd_buffer *cmd, struct hk_shader *s, + struct hk_linked_shader *linked) +{ + enum pipe_shader_type sw_stage = s->info.stage; + enum pipe_shader_type hw_stage = s->b.info.stage; + + unsigned constant_push_ranges = + DIV_ROUND_UP(s->b.info.immediate_size_16, 64); + unsigned push_ranges = 2; + unsigned stage_ranges = 3; + + size_t usc_size = + agx_usc_size(constant_push_ranges + push_ranges + stage_ranges + 4); + struct agx_ptr t = hk_pool_usc_alloc(cmd, usc_size, 64); + if (!t.cpu) + return 0; + + struct agx_usc_builder b = agx_usc_builder(t.cpu, usc_size); + + uint64_t root_ptr; + + if (sw_stage == PIPE_SHADER_COMPUTE) + root_ptr = hk_cmd_buffer_upload_root(cmd, VK_PIPELINE_BIND_POINT_COMPUTE); + else + root_ptr = cmd->state.gfx.root; + + static_assert(offsetof(struct hk_root_descriptor_table, root_desc_addr) == 0, + "self-reflective"); + + agx_usc_uniform(&b, HK_ROOT_UNIFORM, 4, root_ptr); + + if (sw_stage == MESA_SHADER_VERTEX) { + unsigned count = + DIV_ROUND_UP(BITSET_LAST_BIT(s->info.vs.attrib_components_read), 4); + + if (count) { + agx_usc_uniform( + &b, 0, 4 * count, + root_ptr + hk_root_descriptor_offset(draw.attrib_base)); + + agx_usc_uniform( + &b, 4 * count, 2 * count, + root_ptr + hk_root_descriptor_offset(draw.attrib_clamps)); + } + + if (cmd->state.gfx.draw_params) + agx_usc_uniform(&b, 6 * count, 4, cmd->state.gfx.draw_params); + + if (cmd->state.gfx.draw_id_ptr) + agx_usc_uniform(&b, (6 * count) + 4, 1, cmd->state.gfx.draw_id_ptr); + + if (hw_stage == MESA_SHADER_COMPUTE) { + agx_usc_uniform( + &b, (6 * count) + 8, 4, + root_ptr + hk_root_descriptor_offset(draw.input_assembly)); + } + } else if (sw_stage == MESA_SHADER_FRAGMENT) { + if 
(agx_tilebuffer_spills(&cmd->state.gfx.render.tilebuffer)) { + hk_usc_upload_spilled_rt_descs(&b, cmd); + } + + agx_usc_uniform( + &b, 4, 8, root_ptr + hk_root_descriptor_offset(draw.blend_constant)); + + /* The SHARED state is baked into linked->usc for non-fragment shaders. We + * don't pass around the information to bake the tilebuffer layout. + * + * TODO: We probably could with some refactor. + */ + agx_usc_push_packed(&b, SHARED, &cmd->state.gfx.render.tilebuffer.usc); + } + + agx_usc_push_blob(&b, linked->usc.data, linked->usc.size); + return t.gpu; +} + +/* Specialized variant of hk_upload_usc_words for internal dispatches that do + * not use any state except for some directly mapped uniforms. + */ +uint32_t +hk_upload_usc_words_kernel(struct hk_cmd_buffer *cmd, struct hk_shader *s, + void *data, size_t data_size) +{ + assert(s->info.stage == MESA_SHADER_COMPUTE); + assert(s->b.info.scratch_size == 0 && "you shouldn't be spilling!"); + assert(s->b.info.preamble_scratch_size == 0 && "you shouldn't be spilling!"); + + unsigned constant_push_ranges = + DIV_ROUND_UP(s->b.info.immediate_size_16, 64); + + size_t usc_size = agx_usc_size(constant_push_ranges + 7); + struct agx_ptr t = hk_pool_usc_alloc(cmd, usc_size, 64); + if (!t.cpu) + return 0; + + struct agx_usc_builder b = agx_usc_builder(t.cpu, usc_size); + + /* Map the data directly as uniforms starting at u0 */ + agx_usc_uniform(&b, 0, DIV_ROUND_UP(data_size, 2), + hk_pool_upload(cmd, data, data_size, 4)); + + agx_usc_push_blob(&b, s->only_linked->usc.data, s->only_linked->usc.size); + return t.gpu; +} + +void +hk_cs_init_graphics(struct hk_cmd_buffer *cmd, struct hk_cs *cs) +{ + struct hk_rendering_state *render = &cmd->state.gfx.render; + uint8_t *map = cs->current; + + cs->tib = render->tilebuffer; + + /* Assume this is not the first control stream of the render pass, so + * initially use the partial background program and ZLS control. + * hk_BeginRendering will override. + */ + cs->cr = render->cr; + cs->cr.bg.main = render->cr.bg.partial; + cs->cr.zls_control = render->cr.zls_control_partial; + + /* Barrier to enforce GPU-CPU coherency, in case this batch is back to back + * with another that caused stale data to be cached and the CPU wrote to it + * in the meantime. + */ + agx_push(map, VDM_BARRIER, cfg) { + cfg.usc_cache_inval = true; + } + + struct AGX_PPP_HEADER present = { + .w_clamp = true, + .occlusion_query_2 = true, + .output_unknown = true, + .varying_word_2 = true, + .viewport_count = 1, /* irrelevant */ + }; + + size_t size = agx_ppp_update_size(&present); + struct agx_ptr T = hk_pool_alloc(cmd, size, 64); + if (!T.cpu) + return; + + struct agx_ppp_update ppp = agx_new_ppp_update(T, size, &present); + + /* clang-format off */ + agx_ppp_push(&ppp, W_CLAMP, cfg) cfg.w_clamp = 1e-10; + agx_ppp_push(&ppp, FRAGMENT_OCCLUSION_QUERY_2, cfg); + agx_ppp_push(&ppp, OUTPUT_UNKNOWN, cfg); + agx_ppp_push(&ppp, VARYING_2, cfg); + /* clang-format on */ + + agx_ppp_fini(&map, &ppp); + cs->current = map; + + util_dynarray_init(&cs->scissor, NULL); + util_dynarray_init(&cs->depth_bias, NULL); + + /* All graphics state must be reemited in each control stream */ + hk_cmd_buffer_dirty_all(cmd); +} + +void +hk_ensure_cs_has_space(struct hk_cmd_buffer *cmd, struct hk_cs *cs, + size_t space) +{ + bool vdm = cs->type == HK_CS_VDM; + + size_t link_length = + vdm ? 
AGX_VDM_STREAM_LINK_LENGTH : AGX_CDM_STREAM_LINK_LENGTH; + + /* Assert that we have space for a link tag */ + assert((cs->current + link_length) <= cs->end && "Encoder overflowed"); + + /* Always leave room for a link tag, in case we run out of space later, + * plus padding because VDM apparently overreads? + * + * 0x200 is not enough. 0x400 seems to work. 0x800 for safety. + */ + space += link_length + 0x800; + + /* If there is room in the command buffer, we're done */ + if (likely((cs->end - cs->current) >= space)) + return; + + /* Otherwise, we need to allocate a new command buffer. We use memory owned + * by the batch to simplify lifetime management for the BO. + */ + size_t size = 65536; + struct agx_ptr T = hk_pool_alloc(cmd, size, 256); + + /* Jump from the old control stream to the new control stream */ + if (vdm) { + agx_pack(cs->current, VDM_STREAM_LINK, cfg) { + cfg.target_lo = T.gpu & BITFIELD_MASK(32); + cfg.target_hi = T.gpu >> 32; + } + } else { + agx_pack(cs->current, CDM_STREAM_LINK, cfg) { + cfg.target_lo = T.gpu & BITFIELD_MASK(32); + cfg.target_hi = T.gpu >> 32; + } + } + + /* Swap out the control stream */ + cs->current = T.cpu; + cs->end = cs->current + size; + cs->stream_linked = true; +} diff --git a/src/asahi/vulkan/hk_cmd_buffer.h b/src/asahi/vulkan/hk_cmd_buffer.h new file mode 100644 index 00000000000..0b93f0a924f --- /dev/null +++ b/src/asahi/vulkan/hk_cmd_buffer.h @@ -0,0 +1,767 @@ +/* + * Copyright 2024 Valve Corporation + * Copyright 2024 Alyssa Rosenzweig + * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc. + * SPDX-License-Identifier: MIT + */ + +#pragma once + +#include "util/macros.h" + +#include "util/list.h" +#include "agx_helpers.h" +#include "agx_linker.h" +#include "agx_pack.h" +#include "agx_tilebuffer.h" +#include "agx_uvs.h" +#include "pool.h" +#include "shader_enums.h" + +#include "hk_private.h" +#include "hk_shader.h" + +#include "hk_cmd_pool.h" +#include "hk_descriptor_set.h" + +#include "asahi/lib/agx_nir_lower_vbo.h" +#include "util/u_dynarray.h" +#include "vulkan/vulkan_core.h" + +#include "vk_command_buffer.h" + +#include + +struct hk_buffer; +struct hk_cmd_bo; +struct hk_cmd_pool; +struct hk_image_view; +struct hk_push_descriptor_set; +struct hk_shader; +struct hk_linked_shader; +struct agx_usc_builder; +struct vk_shader; + +/** Root descriptor table. */ +struct hk_root_descriptor_table { + uint64_t root_desc_addr; + + union { + struct { + uint32_t view_index; + uint32_t ppp_multisamplectl; + + /* Vertex input state */ + uint64_t attrib_base[AGX_MAX_VBUFS]; + uint32_t attrib_clamps[AGX_MAX_VBUFS]; + + /* Pointer to the VS->TCS, VS->GS, or TES->GS buffer. */ + uint64_t vertex_output_buffer; + + /* Mask of outputs flowing VS->TCS, VS->GS, or TES->GS . */ + uint64_t vertex_outputs; + + /* Address of input assembly buffer if geom/tess is used, else 0 */ + uint64_t input_assembly; + + /* Address of tessellation param buffer if tessellation used, else 0 */ + uint64_t tess_params; + + /* Address of geometry param buffer if GS is used, else 0 */ + uint64_t geometry_params; + + /* Pipeline statistics queries. This is a base address with flags. */ + uint64_t pipeline_stats; + VkQueryPipelineStatisticFlags pipeline_stats_flags; + + float blend_constant[4]; + uint16_t no_epilog_discard; + uint16_t _pad1; + uint16_t api_sample_mask; + uint16_t _pad2; + uint16_t force_never_in_shader; + uint16_t _pad3; + uint16_t provoking; + uint16_t _pad4; + + /* Mapping from varying slots written by the last vertex stage to UVS + * indices. 
This mapping must be compatible with the fragment shader. + */ + uint8_t uvs_index[VARYING_SLOT_MAX]; + } draw; + struct { + uint64_t group_count_addr; + uint32_t base_group[3]; + } cs; + }; + + /* Client push constants */ + uint8_t push[HK_MAX_PUSH_SIZE]; + + /* Descriptor set base addresses */ + uint64_t sets[HK_MAX_SETS]; + + /* Dynamic buffer bindings */ + struct hk_buffer_address dynamic_buffers[HK_MAX_DYNAMIC_BUFFERS]; + + /* Start index in dynamic_buffers where each set starts */ + uint8_t set_dynamic_buffer_start[HK_MAX_SETS]; +}; + +/* helper macro for computing root descriptor byte offsets */ +#define hk_root_descriptor_offset(member) \ + offsetof(struct hk_root_descriptor_table, member) + +struct hk_descriptor_state { + bool root_dirty; + struct hk_root_descriptor_table root; + + uint32_t set_sizes[HK_MAX_SETS]; + struct hk_descriptor_set *sets[HK_MAX_SETS]; + uint32_t sets_dirty; + + struct hk_push_descriptor_set *push[HK_MAX_SETS]; + uint32_t push_dirty; +}; + +struct hk_attachment { + VkFormat vk_format; + struct hk_image_view *iview; + + VkResolveModeFlagBits resolve_mode; + struct hk_image_view *resolve_iview; +}; + +struct hk_bg_eot { + uint64_t usc; + struct agx_counts_packed counts; +}; + +struct hk_render_registers { + uint32_t width, height, layers; + uint32_t isp_bgobjdepth; + uint32_t isp_bgobjvals; + struct agx_zls_control_packed zls_control, zls_control_partial; + uint32_t iogpu_unk_214; + uint32_t depth_dimensions; + + struct { + uint32_t dimensions; + uint64_t buffer, meta; + uint32_t stride, meta_stride; + } depth; + + struct { + uint64_t buffer, meta; + uint32_t stride, meta_stride; + } stencil; + + struct { + struct hk_bg_eot main; + struct hk_bg_eot partial; + } bg; + + struct { + struct hk_bg_eot main; + struct hk_bg_eot partial; + } eot; +}; + +struct hk_rendering_state { + VkRenderingFlagBits flags; + + VkRect2D area; + uint32_t layer_count; + uint32_t view_mask; + + uint32_t color_att_count; + struct hk_attachment color_att[HK_MAX_RTS]; + struct hk_attachment depth_att; + struct hk_attachment stencil_att; + + struct agx_tilebuffer_layout tilebuffer; + struct hk_render_registers cr; +}; + +struct hk_index_buffer_state { + struct hk_addr_range buffer; + enum agx_index_size size; + uint32_t restart; +}; + +/* Dirty tracking bits for state not tracked by vk_dynamic_graphics_state or + * shaders_dirty. + */ +enum hk_dirty { + HK_DIRTY_INDEX = BITFIELD_BIT(0), + HK_DIRTY_VB = BITFIELD_BIT(1), + HK_DIRTY_OCCLUSION = BITFIELD_BIT(2), + HK_DIRTY_PROVOKING = BITFIELD_BIT(3), + HK_DIRTY_VARYINGS = BITFIELD_BIT(4), +}; + +struct hk_graphics_state { + struct hk_rendering_state render; + struct hk_descriptor_state descriptors; + + enum hk_dirty dirty; + + uint64_t root; + uint64_t draw_params; + uint64_t draw_id_ptr; + + uint32_t shaders_dirty; + struct hk_api_shader *shaders[MESA_SHADER_MESH + 1]; + + /* Vertex buffers */ + struct hk_addr_range vb[AGX_MAX_VBUFS]; + + /* Transform feedback buffers */ + struct hk_addr_range xfb[4]; + + /* Is transform feedback enabled? */ + bool xfb_enabled; + + /* Internal transform feedback offset vec4. + * + * TODO: Strictly could be global. + */ + uint64_t xfb_offsets; + + /* Pointer to the GPU memory backing active transform feedback queries, + * per-stream. Zero if no query is bound. + */ + uint64_t xfb_query[4]; + + struct hk_index_buffer_state index; + enum agx_primitive topology; + enum agx_object_type object_type; + + /* Provoking vertex 0, 1, or 2. Usually 0 or 2 for FIRST/LAST. 1 can only be + * set for tri fans. 
+ */ + uint8_t provoking; + + struct { + enum agx_visibility_mode mode; + + /* If enabled, index of the current occlusion query in the occlusion heap. + * There can only be one active at a time (hardware contraint). + */ + uint16_t index; + } occlusion; + + /* Fast linked shader data structures */ + uint64_t varyings; + struct agx_varyings_vs linked_varyings; + + uint32_t linked_dirty; + struct hk_linked_shader *linked[PIPE_SHADER_TYPES]; + bool generate_primitive_id; + + /* Tessellation state */ + uint64_t tess_out_draws; + + /* Needed by vk_command_buffer::dynamic_graphics_state */ + struct vk_vertex_input_state _dynamic_vi; + struct vk_sample_locations_state _dynamic_sl; +}; + +struct hk_compute_state { + struct hk_descriptor_state descriptors; + struct hk_api_shader *shader; +}; + +struct hk_cmd_push { + void *map; + uint64_t addr; + uint32_t range; + bool no_prefetch; +}; + +struct hk_scratch_req { + bool main; + bool preamble; +}; + +/* + * hk_cs represents a single control stream, to be enqueued either to the + * CDM or VDM for compute/3D respectively. + */ +enum hk_cs_type { + HK_CS_CDM, + HK_CS_VDM, +}; + +struct hk_cs { + struct list_head node; + + /* Data master */ + enum hk_cs_type type; + + /* Address of the root control stream for the job */ + uint64_t addr; + + /* Start pointer of the root control stream */ + void *start; + + /* Current pointer within the control stream */ + void *current; + + /* End pointer of the current chunk of the control stream */ + void *end; + + /* Whether there is more than just the root chunk */ + bool stream_linked; + + /* Scratch requirements */ + struct { + union { + struct hk_scratch_req vs; + struct hk_scratch_req cs; + }; + + struct hk_scratch_req fs; + } scratch; + + /* Remaining state is for graphics only, ignored for compute */ + struct agx_tilebuffer_layout tib; + + struct util_dynarray scissor, depth_bias; + uint64_t uploaded_scissor, uploaded_zbias; + + /* We can only set ppp_multisamplectl once per batch. has_sample_locations + * tracks if we've committed to a set of sample locations yet. vk_meta + * operations do not set has_sample_locations since they don't care and it + * would interfere with the app-provided samples. + * + */ + bool has_sample_locations; + uint32_t ppp_multisamplectl; + + struct hk_render_registers cr; +}; + +struct hk_uploader { + /** List of hk_cmd_bo */ + struct list_head bos; + + /* Current addresses */ + uint8_t *map; + uint64_t base; + uint32_t offset; +}; + +struct hk_cmd_buffer { + struct vk_command_buffer vk; + + struct { + struct hk_graphics_state gfx; + struct hk_compute_state cs; + } state; + + struct { + struct hk_uploader main, usc; + } uploader; + + /* List of all recorded control streams */ + struct list_head control_streams; + + /* Current recorded control stream */ + struct { + /* VDM stream for 3D */ + struct hk_cs *gfx; + + /* CDM stream for compute */ + struct hk_cs *cs; + + /* CDM stream that executes immediately before the current graphics + * control stream. Used for geometry shading, tessellation, etc. + */ + struct hk_cs *pre_gfx; + + /* CDM stream that will execute after the current graphics control stream + * finishes. Used for queries. + */ + struct hk_cs *post_gfx; + } current_cs; + + /* Are we currently inside a vk_meta operation? This alters sample location + * behaviour. + */ + bool in_meta; + + /* XXX: move me? + * + * Indirect draw generated by the pre-GS for the geometry shader. + */ + uint64_t geom_indirect; + + /* Does the command buffer use the geometry heap? 
*/ + bool uses_heap; + + /* Owned large BOs */ + struct util_dynarray large_bos; +}; + +VK_DEFINE_HANDLE_CASTS(hk_cmd_buffer, vk.base, VkCommandBuffer, + VK_OBJECT_TYPE_COMMAND_BUFFER) + +extern const struct vk_command_buffer_ops hk_cmd_buffer_ops; + +static inline struct hk_device * +hk_cmd_buffer_device(struct hk_cmd_buffer *cmd) +{ + return (struct hk_device *)cmd->vk.base.device; +} + +static inline struct hk_cmd_pool * +hk_cmd_buffer_pool(struct hk_cmd_buffer *cmd) +{ + return (struct hk_cmd_pool *)cmd->vk.pool; +} + +/* + * The hardware vertex shader is supplied by the last geometry stage. The + * geometry pipeline is vertex->tess->geometry so we search backwards. + */ +static inline struct hk_shader * +hk_bound_hw_vs(struct hk_graphics_state *gfx) +{ + struct hk_api_shader *vs = gfx->shaders[MESA_SHADER_VERTEX]; + struct hk_api_shader *tes = gfx->shaders[MESA_SHADER_TESS_EVAL]; + struct hk_api_shader *gs = gfx->shaders[MESA_SHADER_GEOMETRY]; + + if (gs) + return &gs->variants[HK_GS_VARIANT_RAST]; + else if (tes) + return &tes->variants[HK_VS_VARIANT_HW]; + else + return &vs->variants[HK_VS_VARIANT_HW]; +} + +static inline struct hk_shader * +hk_bound_sw_vs(struct hk_graphics_state *gfx) +{ + struct hk_api_shader *vs = gfx->shaders[MESA_SHADER_VERTEX]; + struct hk_shader *hw_vs = hk_bound_hw_vs(gfx); + + if (hw_vs == &vs->variants[HK_VS_VARIANT_HW]) + return hw_vs; + else + return &vs->variants[HK_VS_VARIANT_SW]; +} + +static inline struct hk_shader * +hk_bound_sw_vs_before_gs(struct hk_graphics_state *gfx) +{ + struct hk_api_shader *vs = gfx->shaders[MESA_SHADER_VERTEX]; + struct hk_api_shader *tes = gfx->shaders[MESA_SHADER_TESS_EVAL]; + struct hk_api_shader *api = tes ?: vs; + + return &api->variants[HK_VS_VARIANT_SW]; +} + +struct agx_ptr hk_pool_alloc_internal(struct hk_cmd_buffer *cmd, uint32_t size, + uint32_t alignment, bool usc); + +uint64_t hk_pool_upload(struct hk_cmd_buffer *cmd, const void *data, + uint32_t size, uint32_t alignment); + +static inline struct agx_ptr +hk_pool_alloc(struct hk_cmd_buffer *cmd, uint32_t size, uint32_t alignment) +{ + return hk_pool_alloc_internal(cmd, size, alignment, false); +} + +static inline struct agx_ptr +hk_pool_usc_alloc(struct hk_cmd_buffer *cmd, uint32_t size, uint32_t alignment) +{ + return hk_pool_alloc_internal(cmd, size, alignment, true); +} + +void hk_cs_init_graphics(struct hk_cmd_buffer *cmd, struct hk_cs *cs); +uint32_t hk_default_sample_positions(unsigned nr_samples); + +static inline struct hk_cs * +hk_cmd_buffer_get_cs_general(struct hk_cmd_buffer *cmd, struct hk_cs **ptr, + bool compute) +{ + if ((*ptr) == NULL) { + /* Allocate root control stream */ + size_t initial_size = 65536; + struct agx_ptr root = hk_pool_alloc(cmd, initial_size, 1024); + if (!root.cpu) + return NULL; + + /* Allocate hk_cs for the new stream */ + struct hk_cs *cs = malloc(sizeof(*cs)); + *cs = (struct hk_cs){ + .type = compute ? HK_CS_CDM : HK_CS_VDM, + .addr = root.gpu, + .start = root.cpu, + .current = root.cpu, + .end = root.cpu + initial_size, + }; + + list_inithead(&cs->node); + + bool before_gfx = (ptr == &cmd->current_cs.pre_gfx); + + /* Insert into the command buffer. We usually append to the end of the + * command buffer, except for pre-graphics streams which go right before + * the graphics workload. (This implies a level of out-of-order processing + * that's allowed by Vulkan and required for efficient + * geometry/tessellation shaders.) 
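/*
 * Illustrative sketch (standalone, not driver code): the "last geometry stage
 * wins" selection performed by hk_bound_hw_vs() above. The enum values are
 * hypothetical; the real code indexes the per-stage variant arrays shown
 * above instead of returning a tag.
 */
enum toy_hw_vs_source { TOY_FROM_GS, TOY_FROM_TES, TOY_FROM_VS };

static enum toy_hw_vs_source
toy_pick_hw_vs(int has_gs, int has_tes)
{
   if (has_gs)
      return TOY_FROM_GS;  /* rasterization variant of the GS */
   else if (has_tes)
      return TOY_FROM_TES; /* hardware variant of the TES */
   else
      return TOY_FROM_VS;  /* hardware variant of the API VS */
}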
+ */ + if (before_gfx && cmd->current_cs.gfx) { + list_addtail(&cs->node, &cmd->current_cs.gfx->node); + } else { + list_addtail(&cs->node, &cmd->control_streams); + } + + *ptr = cs; + + if (!compute) + hk_cs_init_graphics(cmd, cs); + } + + assert(*ptr != NULL); + return *ptr; +} + +static inline struct hk_cs * +hk_cmd_buffer_get_cs(struct hk_cmd_buffer *cmd, bool compute) +{ + struct hk_cs **ptr = compute ? &cmd->current_cs.cs : &cmd->current_cs.gfx; + return hk_cmd_buffer_get_cs_general(cmd, ptr, compute); +} + +void hk_ensure_cs_has_space(struct hk_cmd_buffer *cmd, struct hk_cs *cs, + size_t space); + +static void +hk_cmd_buffer_dirty_all(struct hk_cmd_buffer *cmd) +{ + struct vk_dynamic_graphics_state *dyn = &cmd->vk.dynamic_graphics_state; + struct hk_graphics_state *gfx = &cmd->state.gfx; + + vk_dynamic_graphics_state_dirty_all(dyn); + gfx->dirty = ~0; + gfx->shaders_dirty = ~0; + gfx->linked_dirty = ~0; + gfx->descriptors.root_dirty = true; +} + +static inline void +hk_cs_destroy(struct hk_cs *cs) +{ + if (cs->type == HK_CS_VDM) { + util_dynarray_fini(&cs->scissor); + util_dynarray_fini(&cs->depth_bias); + } + + free(cs); +} + +static void +hk_cmd_buffer_end_compute_internal(struct hk_cs **ptr) +{ + if (*ptr) { + struct hk_cs *cs = *ptr; + void *map = cs->current; + agx_push(map, CDM_STREAM_TERMINATE, _) + ; + + cs->current = map; + } + + *ptr = NULL; +} + +static void +hk_cmd_buffer_end_compute(struct hk_cmd_buffer *cmd) +{ + hk_cmd_buffer_end_compute_internal(&cmd->current_cs.cs); +} + +static void +hk_cmd_buffer_end_graphics(struct hk_cmd_buffer *cmd) +{ + struct hk_cs *cs = cmd->current_cs.gfx; + + if (cs) { + void *map = cs->current; + agx_push(map, VDM_STREAM_TERMINATE, _) + ; + + /* Scissor and depth bias arrays are staged to dynamic arrays on the CPU. + * When we end the control stream, they're done growing and are ready for + * upload. + */ + cs->uploaded_scissor = + hk_pool_upload(cmd, cs->scissor.data, cs->scissor.size, 64); + + cs->uploaded_zbias = + hk_pool_upload(cmd, cs->depth_bias.data, cs->depth_bias.size, 64); + + /* TODO: maybe free scissor/depth_bias now? */ + + cmd->current_cs.gfx->current = map; + cmd->current_cs.gfx = NULL; + hk_cmd_buffer_end_compute_internal(&cmd->current_cs.pre_gfx); + hk_cmd_buffer_end_compute_internal(&cmd->current_cs.post_gfx); + } + + assert(cmd->current_cs.gfx == NULL); + + /* We just flushed out the heap use. If we want to use it again, we'll need + * to queue a free for it again. 
+ */ + cmd->uses_heap = false; +} + +static inline uint64_t +hk_pipeline_stat_addr(struct hk_cmd_buffer *cmd, + VkQueryPipelineStatisticFlagBits stat) +{ + struct hk_root_descriptor_table *root = &cmd->state.gfx.descriptors.root; + VkQueryPipelineStatisticFlags flags = root->draw.pipeline_stats_flags; + + if (flags & stat) { + assert(!cmd->in_meta && "queries paused for meta"); + assert(util_bitcount(stat) == 1 && "by construction"); + + /* Prefix sum to determine the compacted index in the query pool */ + uint32_t index = util_bitcount(flags & (stat - 1)); + + return root->draw.pipeline_stats + (sizeof(uint64_t) * index); + } else { + /* Query disabled */ + return 0; + } +} + +void hk_cmd_buffer_begin_graphics(struct hk_cmd_buffer *cmd, + const VkCommandBufferBeginInfo *pBeginInfo); +void hk_cmd_buffer_begin_compute(struct hk_cmd_buffer *cmd, + const VkCommandBufferBeginInfo *pBeginInfo); + +void hk_cmd_invalidate_graphics_state(struct hk_cmd_buffer *cmd); +void hk_cmd_invalidate_compute_state(struct hk_cmd_buffer *cmd); + +void hk_cmd_bind_shaders(struct vk_command_buffer *vk_cmd, uint32_t stage_count, + const gl_shader_stage *stages, + struct vk_shader **const shaders); + +void hk_cmd_bind_graphics_shader(struct hk_cmd_buffer *cmd, + const gl_shader_stage stage, + struct hk_api_shader *shader); + +void hk_cmd_bind_compute_shader(struct hk_cmd_buffer *cmd, + struct hk_api_shader *shader); + +void hk_cmd_bind_vertex_buffer(struct hk_cmd_buffer *cmd, uint32_t vb_idx, + struct hk_addr_range addr_range); + +static inline struct hk_descriptor_state * +hk_get_descriptors_state(struct hk_cmd_buffer *cmd, + VkPipelineBindPoint bind_point) +{ + switch (bind_point) { + case VK_PIPELINE_BIND_POINT_GRAPHICS: + return &cmd->state.gfx.descriptors; + case VK_PIPELINE_BIND_POINT_COMPUTE: + return &cmd->state.cs.descriptors; + default: + unreachable("Unhandled bind point"); + } +}; + +void hk_cmd_flush_wait_dep(struct hk_cmd_buffer *cmd, + const VkDependencyInfo *dep, bool wait); + +void hk_cmd_invalidate_deps(struct hk_cmd_buffer *cmd, uint32_t dep_count, + const VkDependencyInfo *deps); + +void hk_cmd_buffer_flush_push_descriptors(struct hk_cmd_buffer *cmd, + struct hk_descriptor_state *desc); + +void hk_meta_resolve_rendering(struct hk_cmd_buffer *cmd, + const VkRenderingInfo *pRenderingInfo); + +uint64_t hk_cmd_buffer_upload_root(struct hk_cmd_buffer *cmd, + VkPipelineBindPoint bind_point); + +void hk_reserve_scratch(struct hk_cmd_buffer *cmd, struct hk_cs *cs, + struct hk_shader *s); + +uint32_t hk_upload_usc_words(struct hk_cmd_buffer *cmd, struct hk_shader *s, + struct hk_linked_shader *linked); + +uint32_t hk_upload_usc_words_kernel(struct hk_cmd_buffer *cmd, + struct hk_shader *s, void *data, + size_t data_size); + +void hk_usc_upload_spilled_rt_descs(struct agx_usc_builder *b, + struct hk_cmd_buffer *cmd); + +void hk_cdm_cache_flush(struct hk_device *dev, struct hk_cs *cs); + +struct hk_grid { + bool indirect; + union { + uint32_t count[3]; + uint64_t ptr; + }; +}; + +static struct hk_grid +hk_grid(uint32_t x, uint32_t y, uint32_t z) +{ + return (struct hk_grid){.indirect = false, .count = {x, y, z}}; +} + +static struct hk_grid +hk_grid_indirect(uint64_t ptr) +{ + return (struct hk_grid){.indirect = true, .ptr = ptr}; +} + +void hk_dispatch_with_usc(struct hk_device *dev, struct hk_cs *cs, + struct hk_shader *s, uint32_t usc, + struct hk_grid grid, struct hk_grid local_size); + +static inline void +hk_dispatch_with_local_size(struct hk_cmd_buffer *cmd, struct hk_cs *cs, + struct hk_shader *s, 
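/*
 * Illustrative sketch (standalone, not driver code): the prefix-sum trick
 * used by hk_pipeline_stat_addr() above to locate one pipeline statistic in
 * a compacted array of 64-bit counters. Only the statistics enabled in
 * `flags` get a slot, in bit order, so the slot index of `stat` is the
 * number of enabled bits below it. __builtin_popcount stands in for
 * util_bitcount.
 */
#include <stdint.h>

static uint64_t
toy_stat_addr(uint64_t base, uint32_t flags, uint32_t stat)
{
   if (!(flags & stat))
      return 0; /* statistic not enabled for this query */

   uint32_t index = (uint32_t)__builtin_popcount(flags & (stat - 1));
   return base + sizeof(uint64_t) * index;
}

/* Example: flags = 0b1011 and stat = 0b1000 give index 2, i.e. the third
 * counter in the compacted array (the bit-0 and bit-1 statistics precede it).
 */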
struct hk_grid grid, + struct hk_grid local_size) +{ + struct hk_device *dev = hk_cmd_buffer_device(cmd); + uint32_t usc = hk_upload_usc_words(cmd, s, s->only_linked); + + hk_reserve_scratch(cmd, cs, s); + hk_dispatch_with_usc(dev, cs, s, usc, grid, local_size); +} + +static inline void +hk_dispatch(struct hk_cmd_buffer *cmd, struct hk_cs *cs, struct hk_shader *s, + struct hk_grid grid) +{ + assert(s->info.stage == MESA_SHADER_COMPUTE); + + struct hk_grid local_size = + hk_grid(s->info.cs.local_size[0], s->info.cs.local_size[1], + s->info.cs.local_size[2]); + + if (!grid.indirect) { + grid.count[0] *= local_size.count[0]; + grid.count[1] *= local_size.count[1]; + grid.count[2] *= local_size.count[2]; + } + + hk_dispatch_with_local_size(cmd, cs, s, grid, local_size); +} + +void hk_queue_write(struct hk_cmd_buffer *cmd, uint64_t address, uint32_t value, + bool after_gfx); diff --git a/src/asahi/vulkan/hk_cmd_clear.c b/src/asahi/vulkan/hk_cmd_clear.c new file mode 100644 index 00000000000..427c5fed2a1 --- /dev/null +++ b/src/asahi/vulkan/hk_cmd_clear.c @@ -0,0 +1,196 @@ +/* + * Copyright 2024 Valve Corporation + * Copyright 2024 Alyssa Rosenzweig + * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc. + * SPDX-License-Identifier: MIT + */ +#include "agx_formats.h" +#include "hk_cmd_buffer.h" + +#include "hk_device.h" +#include "hk_entrypoints.h" +#include "hk_image.h" +#include "hk_image_view.h" +#include "hk_physical_device.h" + +#include "vk_format.h" +#include "vk_meta.h" + +static VkImageViewType +render_view_type(VkImageType image_type, unsigned layer_count) +{ + switch (image_type) { + case VK_IMAGE_TYPE_1D: + return layer_count == 1 ? VK_IMAGE_VIEW_TYPE_1D + : VK_IMAGE_VIEW_TYPE_1D_ARRAY; + case VK_IMAGE_TYPE_2D: + return layer_count == 1 ? VK_IMAGE_VIEW_TYPE_2D + : VK_IMAGE_VIEW_TYPE_2D_ARRAY; + case VK_IMAGE_TYPE_3D: + return VK_IMAGE_VIEW_TYPE_3D; + default: + unreachable("Invalid image type"); + } +} + +static void +clear_image(struct hk_cmd_buffer *cmd, struct hk_image *image, + VkImageLayout image_layout, VkFormat format, + const VkClearValue *clear_value, uint32_t range_count, + const VkImageSubresourceRange *ranges) +{ + struct hk_device *dev = hk_cmd_buffer_device(cmd); + ASSERTED VkResult result; + + for (uint32_t r = 0; r < range_count; r++) { + const uint32_t level_count = + vk_image_subresource_level_count(&image->vk, &ranges[r]); + + for (uint32_t l = 0; l < level_count; l++) { + const uint32_t level = ranges[r].baseMipLevel + l; + + const VkExtent3D level_extent = + vk_image_mip_level_extent(&image->vk, level); + + uint32_t base_array_layer, layer_count; + if (image->vk.image_type == VK_IMAGE_TYPE_3D) { + base_array_layer = 0; + layer_count = level_extent.depth; + } else { + base_array_layer = ranges[r].baseArrayLayer; + layer_count = + vk_image_subresource_layer_count(&image->vk, &ranges[r]); + } + + const VkImageViewUsageCreateInfo view_usage_info = { + .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_USAGE_CREATE_INFO, + .usage = (ranges[r].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) + ? 
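/*
 * Illustrative sketch (standalone, not driver code): the grid conversion
 * performed by hk_dispatch() above. For direct dispatches the grid appears
 * to be expressed in threads, so the API workgroup counts are scaled by the
 * local size; indirect grids are left untouched for the GPU to interpret.
 */
#include <stdint.h>

struct toy_grid {
   int indirect;
   uint32_t count[3]; /* workgroups in, threads out (direct only) */
};

static struct toy_grid
toy_grid_to_threads(struct toy_grid grid, const uint32_t local_size[3])
{
   if (!grid.indirect) {
      for (unsigned i = 0; i < 3; ++i)
         grid.count[i] *= local_size[i];
   }
   return grid;
}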
VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT + : VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT, + }; + const VkImageViewCreateInfo view_info = { + .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO, + .flags = VK_IMAGE_VIEW_CREATE_INTERNAL_MESA, + .pNext = &view_usage_info, + .image = hk_image_to_handle(image), + .viewType = render_view_type(image->vk.image_type, layer_count), + .format = format, + .subresourceRange = + { + .aspectMask = image->vk.aspects, + .baseMipLevel = level, + .levelCount = 1, + .baseArrayLayer = base_array_layer, + .layerCount = layer_count, + }, + }; + + /* We use vk_meta_create_image_view here for lifetime managemnt */ + VkImageView view; + result = + vk_meta_create_image_view(&cmd->vk, &dev->meta, &view_info, &view); + assert(result == VK_SUCCESS); + + VkRenderingInfo render = { + .sType = VK_STRUCTURE_TYPE_RENDERING_INFO, + .renderArea = + { + .offset = {0, 0}, + .extent = {level_extent.width, level_extent.height}, + }, + .layerCount = layer_count, + }; + + VkRenderingAttachmentInfo vk_att = { + .sType = VK_STRUCTURE_TYPE_RENDERING_ATTACHMENT_INFO, + .imageView = view, + .imageLayout = image_layout, + .loadOp = VK_ATTACHMENT_LOAD_OP_CLEAR, + .storeOp = VK_ATTACHMENT_STORE_OP_STORE, + .clearValue = *clear_value, + }; + + if (ranges[r].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) { + render.colorAttachmentCount = 1; + render.pColorAttachments = &vk_att; + } + if (ranges[r].aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) + render.pDepthAttachment = &vk_att; + if (ranges[r].aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT) + render.pStencilAttachment = &vk_att; + + hk_CmdBeginRendering(hk_cmd_buffer_to_handle(cmd), &render); + hk_CmdEndRendering(hk_cmd_buffer_to_handle(cmd)); + } + } +} + +static VkFormat +vk_packed_int_format_for_size(unsigned size_B) +{ + switch (size_B) { + case 1: + return VK_FORMAT_R8_UINT; + case 2: + return VK_FORMAT_R16_UINT; + case 4: + return VK_FORMAT_R32_UINT; + case 8: + return VK_FORMAT_R32G32_UINT; + case 16: + return VK_FORMAT_R32G32B32A32_UINT; + default: + unreachable("Invalid image format size"); + } +} + +VKAPI_ATTR void VKAPI_CALL +hk_CmdClearColorImage(VkCommandBuffer commandBuffer, VkImage _image, + VkImageLayout imageLayout, + const VkClearColorValue *pColor, uint32_t rangeCount, + const VkImageSubresourceRange *pRanges) +{ + VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer); + VK_FROM_HANDLE(hk_image, image, _image); + + VkClearValue clear_value = { + .color = *pColor, + }; + + VkFormat vk_format = image->vk.format; + if (vk_format == VK_FORMAT_R64_UINT || vk_format == VK_FORMAT_R64_SINT) + vk_format = VK_FORMAT_R32G32_UINT; + + enum pipe_format p_format = vk_format_to_pipe_format(vk_format); + assert(p_format != PIPE_FORMAT_NONE); + + if (!agx_pixel_format[p_format].renderable) { + memset(&clear_value, 0, sizeof(clear_value)); + util_format_pack_rgba(p_format, clear_value.color.uint32, pColor->uint32, + 1); + + unsigned bpp = util_format_get_blocksize(p_format); + vk_format = vk_packed_int_format_for_size(bpp); + } + + clear_image(cmd, image, imageLayout, vk_format, &clear_value, rangeCount, + pRanges); +} + +VKAPI_ATTR void VKAPI_CALL +hk_CmdClearDepthStencilImage(VkCommandBuffer commandBuffer, VkImage _image, + VkImageLayout imageLayout, + const VkClearDepthStencilValue *pDepthStencil, + uint32_t rangeCount, + const VkImageSubresourceRange *pRanges) +{ + VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer); + VK_FROM_HANDLE(hk_image, image, _image); + + const VkClearValue clear_value = { + .depthStencil = *pDepthStencil, + }; + + clear_image(cmd, image, 
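/*
 * Illustrative sketch (standalone, not driver code): why a non-renderable
 * color format can still be cleared through a packed UINT view of the same
 * block size, as hk_CmdClearColorImage() above does. Here a 5:6:5 packed
 * 16-bit color is packed by hand purely for illustration; the driver uses
 * util_format_pack_rgba for the general case and then clears through a
 * 2-byte R16_UINT view, since both formats share the block size.
 */
#include <stdint.h>

static uint16_t
toy_pack_565(float r, float g, float b)
{
   uint16_t ri = (uint16_t)(r * 31.0f + 0.5f);
   uint16_t gi = (uint16_t)(g * 63.0f + 0.5f);
   uint16_t bi = (uint16_t)(b * 31.0f + 0.5f);
   return (uint16_t)((ri << 11) | (gi << 5) | bi);
}

/* A clear to (1.0, 0.0, 1.0) becomes the single integer 0xF81F, which is then
 * written through the packed-integer rendering view.
 */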
imageLayout, image->vk.format, &clear_value, + rangeCount, pRanges); +} diff --git a/src/asahi/vulkan/hk_cmd_dispatch.c b/src/asahi/vulkan/hk_cmd_dispatch.c new file mode 100644 index 00000000000..54c1a454992 --- /dev/null +++ b/src/asahi/vulkan/hk_cmd_dispatch.c @@ -0,0 +1,249 @@ +/* + * Copyright 2024 Valve Corporation + * Copyright 2024 Alyssa Rosenzweig + * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc. + * SPDX-License-Identifier: MIT + */ +#include "shaders/query.h" +#include "vulkan/vulkan_core.h" +#include "agx_helpers.h" +#include "agx_linker.h" +#include "agx_nir_lower_gs.h" +#include "agx_pack.h" +#include "agx_scratch.h" +#include "agx_tilebuffer.h" +#include "hk_buffer.h" +#include "hk_cmd_buffer.h" +#include "hk_descriptor_set.h" +#include "hk_device.h" +#include "hk_entrypoints.h" +#include "hk_physical_device.h" +#include "hk_shader.h" +#include "pool.h" + +void +hk_cmd_buffer_begin_compute(struct hk_cmd_buffer *cmd, + const VkCommandBufferBeginInfo *pBeginInfo) +{ +} + +void +hk_cmd_invalidate_compute_state(struct hk_cmd_buffer *cmd) +{ + memset(&cmd->state.cs, 0, sizeof(cmd->state.cs)); +} + +void +hk_cmd_bind_compute_shader(struct hk_cmd_buffer *cmd, + struct hk_api_shader *shader) +{ + cmd->state.cs.shader = shader; +} + +void +hk_cdm_cache_flush(struct hk_device *dev, struct hk_cs *cs) +{ + assert(cs->type == HK_CS_CDM); + assert(cs->current + AGX_CDM_BARRIER_LENGTH < cs->end && + "caller must ensure space"); + + uint8_t *out = cs->current; + + agx_push(out, CDM_BARRIER, cfg) { + cfg.unk_5 = true; + cfg.unk_6 = true; + cfg.unk_8 = true; + // cfg.unk_11 = true; + // cfg.unk_20 = true; + if (dev->dev.params.num_clusters_total > 1) { + // cfg.unk_24 = true; + if (dev->dev.params.gpu_generation == 13) { + cfg.unk_4 = true; + // cfg.unk_26 = true; + } + } + + /* With multiple launches in the same CDM stream, we can get cache + * coherency (? or sync?) issues. We hit this with blits, which need - in + * between dispatches - need the PBE cache to be flushed and the texture + * cache to be invalidated. Until we know what bits mean what exactly, + * let's just set these after every launch to be safe. We can revisit in + * the future when we figure out what the bits mean. + */ + cfg.unk_0 = true; + cfg.unk_1 = true; + cfg.unk_2 = true; + cfg.usc_cache_inval = true; + cfg.unk_4 = true; + cfg.unk_5 = true; + cfg.unk_6 = true; + cfg.unk_7 = true; + cfg.unk_8 = true; + cfg.unk_9 = true; + cfg.unk_10 = true; + cfg.unk_11 = true; + cfg.unk_12 = true; + cfg.unk_13 = true; + cfg.unk_14 = true; + cfg.unk_15 = true; + cfg.unk_16 = true; + cfg.unk_17 = true; + cfg.unk_18 = true; + cfg.unk_19 = true; + } + + cs->current = out; +} + +/* + * Enqueue workgroups to a given CDM control stream with a given prepared USC + * words. This does not interact with any global state, so it is suitable for + * internal dispatches that do not save/restore state. That can be simpler / + * lower overhead than vk_meta for special operations that logically operate + * as graphics. 
+ */ +void +hk_dispatch_with_usc(struct hk_device *dev, struct hk_cs *cs, + struct hk_shader *s, uint32_t usc, struct hk_grid grid, + struct hk_grid local_size) +{ + assert(cs->current + 0x2000 < cs->end && "should have ensured space"); + uint8_t *out = cs->current; + + agx_push(out, CDM_LAUNCH_WORD_0, cfg) { + if (grid.indirect) + cfg.mode = AGX_CDM_MODE_INDIRECT_GLOBAL; + else + cfg.mode = AGX_CDM_MODE_DIRECT; + + /* For now, always bind the txf sampler and nothing else */ + cfg.sampler_state_register_count = 1; + + cfg.uniform_register_count = s->b.info.push_count; + cfg.preshader_register_count = s->b.info.nr_preamble_gprs; + } + + agx_push(out, CDM_LAUNCH_WORD_1, cfg) { + cfg.pipeline = usc; + } + + /* Added in G14X */ + if (dev->dev.params.gpu_generation >= 14 && + dev->dev.params.num_clusters_total > 1) { + + agx_push(out, CDM_UNK_G14X, cfg) + ; + } + + assert(!local_size.indirect); + + if (grid.indirect) { + agx_push(out, CDM_INDIRECT, cfg) { + cfg.address_hi = grid.ptr >> 32; + cfg.address_lo = grid.ptr & BITFIELD64_MASK(32); + } + } else { + agx_push(out, CDM_GLOBAL_SIZE, cfg) { + cfg.x = grid.count[0]; + cfg.y = grid.count[1]; + cfg.z = grid.count[2]; + } + } + + agx_push(out, CDM_LOCAL_SIZE, cfg) { + cfg.x = local_size.count[0]; + cfg.y = local_size.count[1]; + cfg.z = local_size.count[2]; + } + + cs->current = out; + hk_cdm_cache_flush(dev, cs); +} + +static void +dispatch(struct hk_cmd_buffer *cmd, struct hk_grid grid) +{ + struct hk_device *dev = hk_cmd_buffer_device(cmd); + struct hk_shader *s = hk_only_variant(cmd->state.cs.shader); + struct hk_cs *cs = hk_cmd_buffer_get_cs(cmd, true /* compute */); + if (!cs) + return; + + uint64_t stat = hk_pipeline_stat_addr( + cmd, VK_QUERY_PIPELINE_STATISTIC_COMPUTE_SHADER_INVOCATIONS_BIT); + + if (stat) { + uint32_t local_size_threads = s->info.cs.local_size[0] * + s->info.cs.local_size[1] * + s->info.cs.local_size[2]; + + struct libagx_cs_invocation_params p = { + .grid = cmd->state.cs.descriptors.root.cs.group_count_addr, + .local_size_threads = local_size_threads, + .statistic = stat, + }; + + struct hk_shader *s = + hk_meta_kernel(dev, agx_nir_increment_cs_invocations, NULL, 0); + + uint64_t params = hk_pool_upload(cmd, &p, sizeof(p), 8); + uint32_t usc = + hk_upload_usc_words_kernel(cmd, s, ¶ms, sizeof(params)); + + hk_dispatch_with_usc(dev, cs, s, usc, hk_grid(1, 1, 1), hk_grid(1, 1, 1)); + } + + hk_ensure_cs_has_space(cmd, cs, 0x2000 /* TODO */); + hk_dispatch(cmd, cs, s, grid); +} + +VKAPI_ATTR void VKAPI_CALL +hk_CmdDispatchBase(VkCommandBuffer commandBuffer, uint32_t baseGroupX, + uint32_t baseGroupY, uint32_t baseGroupZ, + uint32_t groupCountX, uint32_t groupCountY, + uint32_t groupCountZ) +{ + VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer); + struct hk_descriptor_state *desc = &cmd->state.cs.descriptors; + if (desc->push_dirty) + hk_cmd_buffer_flush_push_descriptors(cmd, desc); + + desc->root.cs.base_group[0] = baseGroupX; + desc->root.cs.base_group[1] = baseGroupY; + desc->root.cs.base_group[2] = baseGroupZ; + + /* We don't want to key the shader to whether we're indirectly dispatching, + * so treat everything as indirect. 
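/*
 * Illustrative sketch (standalone, not driver code): the arithmetic behind
 * the COMPUTE_SHADER_INVOCATIONS statistic updated in dispatch() above. A
 * small helper kernel reads the (possibly indirect) group count and adds
 * groups * threads-per-group to the query slot; this CPU version mirrors
 * that computation for a direct dispatch.
 */
#include <stdint.h>

static uint64_t
toy_cs_invocations(const uint32_t groups[3], uint32_t local_size_threads)
{
   return (uint64_t)groups[0] * groups[1] * groups[2] * local_size_threads;
}

/* e.g. a (4, 2, 1) dispatch of 8x8x1 workgroups adds 4 * 2 * 1 * 64 = 512. */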
+ */ + VkDispatchIndirectCommand group_count = { + .x = groupCountX, + .y = groupCountY, + .z = groupCountZ, + }; + + desc->root.cs.group_count_addr = + hk_pool_upload(cmd, &group_count, sizeof(group_count), 8); + + dispatch(cmd, hk_grid(groupCountX, groupCountY, groupCountZ)); +} + +VKAPI_ATTR void VKAPI_CALL +hk_CmdDispatchIndirect(VkCommandBuffer commandBuffer, VkBuffer _buffer, + VkDeviceSize offset) +{ + VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer); + VK_FROM_HANDLE(hk_buffer, buffer, _buffer); + struct hk_descriptor_state *desc = &cmd->state.cs.descriptors; + if (desc->push_dirty) + hk_cmd_buffer_flush_push_descriptors(cmd, desc); + + desc->root.cs.base_group[0] = 0; + desc->root.cs.base_group[1] = 0; + desc->root.cs.base_group[2] = 0; + + uint64_t dispatch_addr = hk_buffer_address(buffer, offset); + assert(dispatch_addr != 0); + + desc->root.cs.group_count_addr = dispatch_addr; + dispatch(cmd, hk_grid_indirect(dispatch_addr)); +} diff --git a/src/asahi/vulkan/hk_cmd_draw.c b/src/asahi/vulkan/hk_cmd_draw.c new file mode 100644 index 00000000000..78a7a922d15 --- /dev/null +++ b/src/asahi/vulkan/hk_cmd_draw.c @@ -0,0 +1,3737 @@ +/* + * Copyright 2024 Valve Corporation + * Copyright 2024 Alyssa Rosenzweig + * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc. + * SPDX-License-Identifier: MIT + */ +#include +#include "agx_bg_eot.h" +#include "agx_bo.h" +#include "agx_compile.h" +#include "agx_compiler.h" +#include "agx_device.h" +#include "agx_helpers.h" +#include "agx_linker.h" +#include "agx_nir_lower_gs.h" +#include "agx_nir_lower_vbo.h" +#include "agx_ppp.h" +#include "agx_tilebuffer.h" +#include "agx_usc.h" +#include "agx_uvs.h" +#include "hk_buffer.h" +#include "hk_cmd_buffer.h" +#include "hk_device.h" +#include "hk_entrypoints.h" +#include "hk_image.h" +#include "hk_image_view.h" +#include "hk_physical_device.h" +#include "hk_private.h" +#include "hk_shader.h" + +#include "asahi/genxml/agx_pack.h" +#include "asahi/lib/libagx_shaders.h" +#include "asahi/lib/shaders/geometry.h" +#include "shaders/query.h" +#include "shaders/tessellator.h" +#include "util/bitpack_helpers.h" +#include "util/blend.h" +#include "util/format/format_utils.h" +#include "util/format/u_formats.h" +#include "util/macros.h" +#include "util/ralloc.h" +#include "vulkan/vulkan_core.h" +#include "layout.h" +#include "nir.h" +#include "nir_builder.h" +#include "nir_lower_blend.h" +#include "nir_xfb_info.h" +#include "pool.h" +#include "shader_enums.h" +#include "vk_blend.h" +#include "vk_enum_to_str.h" +#include "vk_format.h" +#include "vk_graphics_state.h" +#include "vk_pipeline.h" +#include "vk_render_pass.h" +#include "vk_standard_sample_locations.h" +#include "vk_util.h" + +#define IS_DIRTY(bit) BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_##bit) + +#define IS_SHADER_DIRTY(bit) \ + (cmd->state.gfx.shaders_dirty & BITFIELD_BIT(MESA_SHADER_##bit)) + +#define IS_LINKED_DIRTY(bit) \ + (cmd->state.gfx.linked_dirty & BITFIELD_BIT(MESA_SHADER_##bit)) + +struct hk_draw { + struct hk_grid b; + struct hk_addr_range index; + bool indexed; + uint32_t start; + uint32_t index_bias; + uint32_t start_instance; + + /* Indicates that the indirect draw consists of raw VDM commands and should + * be stream linked to. Used to accelerate tessellation. 
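/*
 * Illustrative sketch (standalone, not driver code): the "treat everything as
 * indirect" convention used by hk_CmdDispatchBase() above. A direct dispatch
 * uploads its group counts to GPU memory so shaders always read them through
 * a pointer, keeping a single shader key for both paths. The upload callback
 * here is a hypothetical stand-in for hk_pool_upload.
 */
#include <stdint.h>

struct toy_dispatch_cmd {
   uint32_t x, y, z;
};

typedef uint64_t (*toy_upload_fn)(const void *data, uint32_t size);

static uint64_t
toy_group_count_addr(int is_indirect, uint64_t indirect_addr,
                     const struct toy_dispatch_cmd *direct,
                     toy_upload_fn upload)
{
   /* Indirect: the app's buffer already holds the x/y/z group counts. */
   if (is_indirect)
      return indirect_addr;

   /* Direct: stage an equivalent struct and use its GPU address instead. */
   return upload(direct, sizeof(*direct));
}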
+ */ + bool raw; + + /* Set within hk_draw() but here so geometry/tessellation can override */ + bool restart; + enum agx_index_size index_size; +}; + +static struct hk_draw +hk_draw_indirect(uint64_t ptr) +{ + return (struct hk_draw){.b = hk_grid_indirect(ptr)}; +} + +static struct hk_draw +hk_draw_indexed_indirect(uint64_t ptr, struct hk_addr_range index, + enum agx_index_size index_size, bool restart) +{ + return (struct hk_draw){ + .b = hk_grid_indirect(ptr), + .index = index, + .indexed = true, + .index_size = index_size, + .restart = restart, + }; +} + +/* XXX: deduplicate */ +static inline enum mesa_prim +vk_conv_topology(VkPrimitiveTopology topology) +{ + switch (topology) { + case VK_PRIMITIVE_TOPOLOGY_POINT_LIST: + return MESA_PRIM_POINTS; + case VK_PRIMITIVE_TOPOLOGY_LINE_LIST: + return MESA_PRIM_LINES; + case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP: + return MESA_PRIM_LINE_STRIP; + case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST: +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wswitch" + case VK_PRIMITIVE_TOPOLOGY_META_RECT_LIST_MESA: +#pragma GCC diagnostic pop + return MESA_PRIM_TRIANGLES; + case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP: + return MESA_PRIM_TRIANGLE_STRIP; + case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN: + return MESA_PRIM_TRIANGLE_FAN; + case VK_PRIMITIVE_TOPOLOGY_LINE_LIST_WITH_ADJACENCY: + return MESA_PRIM_LINES_ADJACENCY; + case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP_WITH_ADJACENCY: + return MESA_PRIM_LINE_STRIP_ADJACENCY; + case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY: + return MESA_PRIM_TRIANGLES_ADJACENCY; + case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP_WITH_ADJACENCY: + return MESA_PRIM_TRIANGLE_STRIP_ADJACENCY; + case VK_PRIMITIVE_TOPOLOGY_PATCH_LIST: + return MESA_PRIM_PATCHES; + default: + unreachable("invalid"); + } +} + +static void +hk_cmd_buffer_dirty_render_pass(struct hk_cmd_buffer *cmd) +{ + struct vk_dynamic_graphics_state *dyn = &cmd->vk.dynamic_graphics_state; + + /* These depend on color attachment count */ + BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_CB_COLOR_WRITE_ENABLES); + BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_CB_BLEND_ENABLES); + BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_CB_BLEND_EQUATIONS); + BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_CB_WRITE_MASKS); + + /* These depend on the depth/stencil format */ + BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_TEST_ENABLE); + BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_WRITE_ENABLE); + BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_BOUNDS_TEST_ENABLE); + BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_TEST_ENABLE); + BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_RS_DEPTH_BIAS_FACTORS); + + /* This may depend on render targets for ESO */ + BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_MS_RASTERIZATION_SAMPLES); +} + +void +hk_cmd_buffer_begin_graphics(struct hk_cmd_buffer *cmd, + const VkCommandBufferBeginInfo *pBeginInfo) +{ + if (cmd->vk.level != VK_COMMAND_BUFFER_LEVEL_PRIMARY && + (pBeginInfo->flags & VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT)) { + char gcbiar_data[VK_GCBIARR_DATA_SIZE(HK_MAX_RTS)]; + const VkRenderingInfo *resume_info = + vk_get_command_buffer_inheritance_as_rendering_resume( + cmd->vk.level, pBeginInfo, gcbiar_data); + if (resume_info) { + hk_CmdBeginRendering(hk_cmd_buffer_to_handle(cmd), resume_info); + } else { + const VkCommandBufferInheritanceRenderingInfo *inheritance_info = + vk_get_command_buffer_inheritance_rendering_info(cmd->vk.level, + pBeginInfo); + assert(inheritance_info); + + struct hk_rendering_state *render = &cmd->state.gfx.render; + render->flags = 
inheritance_info->flags; + render->area = (VkRect2D){}; + render->layer_count = 0; + render->view_mask = inheritance_info->viewMask; + render->tilebuffer.nr_samples = inheritance_info->rasterizationSamples; + + render->color_att_count = inheritance_info->colorAttachmentCount; + for (uint32_t i = 0; i < render->color_att_count; i++) { + render->color_att[i].vk_format = + inheritance_info->pColorAttachmentFormats[i]; + } + render->depth_att.vk_format = inheritance_info->depthAttachmentFormat; + render->stencil_att.vk_format = + inheritance_info->stencilAttachmentFormat; + + hk_cmd_buffer_dirty_render_pass(cmd); + } + } + + hk_cmd_buffer_dirty_all(cmd); + + /* If multiview is disabled, always read 0. If multiview is enabled, + * hk_set_view_index will dirty the root each draw. + */ + cmd->state.gfx.descriptors.root.draw.view_index = 0; + cmd->state.gfx.descriptors.root_dirty = true; +} + +void +hk_cmd_invalidate_graphics_state(struct hk_cmd_buffer *cmd) +{ + hk_cmd_buffer_dirty_all(cmd); + + /* From the Vulkan 1.3.275 spec: + * + * "...There is one exception to this rule - if the primary command + * buffer is inside a render pass instance, then the render pass and + * subpass state is not disturbed by executing secondary command + * buffers." + * + * We need to reset everything EXCEPT the render pass state. + */ + struct hk_rendering_state render_save = cmd->state.gfx.render; + memset(&cmd->state.gfx, 0, sizeof(cmd->state.gfx)); + cmd->state.gfx.render = render_save; +} + +static void +hk_attachment_init(struct hk_attachment *att, + const VkRenderingAttachmentInfo *info) +{ + if (info == NULL || info->imageView == VK_NULL_HANDLE) { + *att = (struct hk_attachment){ + .iview = NULL, + }; + return; + } + + VK_FROM_HANDLE(hk_image_view, iview, info->imageView); + *att = (struct hk_attachment){ + .vk_format = iview->vk.format, + .iview = iview, + }; + + if (info->resolveMode != VK_RESOLVE_MODE_NONE) { + VK_FROM_HANDLE(hk_image_view, res_iview, info->resolveImageView); + att->resolve_mode = info->resolveMode; + att->resolve_iview = res_iview; + } +} + +VKAPI_ATTR void VKAPI_CALL +hk_GetRenderingAreaGranularityKHR( + VkDevice device, const VkRenderingAreaInfoKHR *pRenderingAreaInfo, + VkExtent2D *pGranularity) +{ + *pGranularity = (VkExtent2D){.width = 1, .height = 1}; +} + +static struct hk_bg_eot +hk_build_bg_eot(struct hk_cmd_buffer *cmd, const VkRenderingInfo *info, + bool store, bool partial_render, bool incomplete_render_area) +{ + struct hk_device *dev = hk_cmd_buffer_device(cmd); + struct hk_rendering_state *render = &cmd->state.gfx.render; + + /* Construct the key */ + struct agx_bg_eot_key key = {.tib = render->tilebuffer}; + static_assert(AGX_BG_EOT_NONE == 0, "default initializer"); + + key.tib.layered = (render->cr.layers > 1); + + bool needs_textures_for_spilled_rts = + agx_tilebuffer_spills(&render->tilebuffer) && !partial_render && !store; + + for (unsigned i = 0; i < info->colorAttachmentCount; ++i) { + const VkRenderingAttachmentInfo *att_info = &info->pColorAttachments[i]; + if (att_info->imageView == VK_NULL_HANDLE) + continue; + + /* Partial render programs exist only to store/load the tilebuffer to + * main memory. When render targets are already spilled to main memory, + * there's nothing to do. + */ + if (key.tib.spilled[i] && (partial_render || store)) + continue; + + if (store) { + bool store = att_info->storeOp == VK_ATTACHMENT_STORE_OP_STORE; + + /* When resolving, we store the intermediate multisampled image as the + * resolve is a separate control stream. 
This could be optimized. + */ + store |= att_info->resolveMode != VK_RESOLVE_MODE_NONE; + + /* Partial renders always need to flush to memory. */ + store |= partial_render; + + key.op[i] = store ? AGX_EOT_STORE : AGX_BG_EOT_NONE; + } else { + bool load = att_info->loadOp == VK_ATTACHMENT_LOAD_OP_LOAD; + bool clear = att_info->loadOp == VK_ATTACHMENT_LOAD_OP_CLEAR; + + /* The background program used for partial renders must always load + * whatever was stored in the mid-frame end-of-tile program. + */ + load |= partial_render; + + /* With an incomplete render area, we're forced to load back tiles and + * then use the 3D pipe for the clear. + */ + load |= incomplete_render_area; + + /* Don't read back spilled render targets, they're already in memory */ + load &= !key.tib.spilled[i]; + + key.op[i] = load ? AGX_BG_LOAD + : clear ? AGX_BG_CLEAR + : AGX_BG_EOT_NONE; + } + } + + /* Begin building the pipeline */ + size_t usc_size = agx_usc_size(3 + HK_MAX_RTS); + struct agx_ptr t = hk_pool_usc_alloc(cmd, usc_size, 64); + if (!t.cpu) + return (struct hk_bg_eot){.usc = t.gpu}; + + struct agx_usc_builder b = agx_usc_builder(t.cpu, usc_size); + + bool uses_txf = false; + unsigned uniforms = 0; + unsigned nr_tex = 0; + + for (unsigned rt = 0; rt < HK_MAX_RTS; ++rt) { + const VkRenderingAttachmentInfo *att_info = &info->pColorAttachments[rt]; + struct hk_image_view *iview = render->color_att[rt].iview; + + if (key.op[rt] == AGX_BG_LOAD) { + uses_txf = true; + + uint32_t index = key.tib.layered + ? iview->planes[0].layered_background_desc_index + : iview->planes[0].background_desc_index; + + agx_usc_pack(&b, TEXTURE, cfg) { + /* Shifted to match eMRT indexing, could be optimized */ + cfg.start = rt * 2; + cfg.count = 1; + cfg.buffer = dev->images.bo->ptr.gpu + index * AGX_TEXTURE_LENGTH; + } + + nr_tex = (rt * 2) + 1; + } else if (key.op[rt] == AGX_BG_CLEAR) { + static_assert(sizeof(att_info->clearValue.color) == 16, "fixed ABI"); + uint64_t colour = + hk_pool_upload(cmd, &att_info->clearValue.color, 16, 16); + + agx_usc_uniform(&b, 4 + (8 * rt), 8, colour); + uniforms = MAX2(uniforms, 4 + (8 * rt) + 8); + } else if (key.op[rt] == AGX_EOT_STORE) { + uint32_t index = key.tib.layered + ? iview->planes[0].layered_eot_pbe_desc_index + : iview->planes[0].eot_pbe_desc_index; + + agx_usc_pack(&b, TEXTURE, cfg) { + cfg.start = rt; + cfg.count = 1; + cfg.buffer = dev->images.bo->ptr.gpu + index * AGX_TEXTURE_LENGTH; + } + + nr_tex = rt + 1; + } + } + + if (needs_textures_for_spilled_rts) { + hk_usc_upload_spilled_rt_descs(&b, cmd); + uniforms = MAX2(uniforms, 4); + } + + if (uses_txf) { + agx_usc_push_packed(&b, SAMPLER, dev->rodata.txf_sampler); + } + + /* For attachmentless rendering, we don't know the sample count until + * draw-time. But we have trivial bg/eot programs in that case too. + */ + if (key.tib.nr_samples >= 1) { + agx_usc_push_packed(&b, SHARED, &key.tib.usc); + } else { + assert(key.tib.sample_size_B == 0); + agx_usc_shared_none(&b); + + key.tib.nr_samples = 1; + } + + /* Get the shader */ + key.reserved_preamble = uniforms; + /* XXX: locking? 
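/*
 * Illustrative sketch (standalone, not driver code): a distilled version of
 * the per-render-target decision table in hk_build_bg_eot() above. The enum
 * values are hypothetical stand-ins for the AGX_BG_* / AGX_EOT_* ops.
 */
enum toy_bg_op { TOY_NONE, TOY_LOAD, TOY_CLEAR, TOY_STORE };

static enum toy_bg_op
toy_pick_bg_op(int building_store, int load_op_load, int load_op_clear,
               int store_op_store, int resolves, int partial_render,
               int incomplete_area, int spilled)
{
   /* Targets already spilled to memory need no tilebuffer store/reload. */
   if (spilled && (partial_render || building_store))
      return TOY_NONE;

   if (building_store) {
      /* End-of-tile: flush when the app asked, when a resolve will read the
       * intermediate image, or when this is a mid-frame partial flush.
       */
      int store = store_op_store || resolves || partial_render;
      return store ? TOY_STORE : TOY_NONE;
   }

   /* Background: reload when the app asked, when resuming a partial render,
    * or when the render area forces the clear through the 3D pipe; spilled
    * targets are never reloaded into the tilebuffer.
    */
   int load = (load_op_load || partial_render || incomplete_area) && !spilled;
   if (load)
      return TOY_LOAD;
   return load_op_clear ? TOY_CLEAR : TOY_NONE;
}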
*/ + struct agx_bg_eot_shader *shader = agx_get_bg_eot_shader(&dev->bg_eot, &key); + + agx_usc_pack(&b, SHADER, cfg) { + cfg.code = shader->ptr; + cfg.unk_2 = 0; + } + + agx_usc_pack(&b, REGISTERS, cfg) + cfg.register_count = shader->info.nr_gprs; + + if (shader->info.has_preamble) { + agx_usc_pack(&b, PRESHADER, cfg) { + cfg.code = shader->ptr + shader->info.preamble_offset; + } + } else { + agx_usc_pack(&b, NO_PRESHADER, cfg) + ; + } + + struct hk_bg_eot ret = {.usc = t.gpu}; + + agx_pack(&ret.counts, COUNTS, cfg) { + cfg.uniform_register_count = shader->info.push_count; + cfg.preshader_register_count = shader->info.nr_preamble_gprs; + cfg.texture_state_register_count = nr_tex; + cfg.sampler_state_register_count = + agx_translate_sampler_state_count(uses_txf ? 1 : 0, false); + } + + return ret; +} + +static bool +is_aligned(unsigned x, unsigned pot_alignment) +{ + assert(util_is_power_of_two_nonzero(pot_alignment)); + return (x & (pot_alignment - 1)) == 0; +} + +static void +hk_merge_render_iview(struct hk_rendering_state *render, + struct hk_image_view *iview) +{ + if (iview) { + unsigned samples = iview->vk.image->samples; + /* TODO: is this right for ycbcr? */ + unsigned level = iview->vk.base_mip_level; + unsigned width = u_minify(iview->vk.image->extent.width, level); + unsigned height = u_minify(iview->vk.image->extent.height, level); + + assert(render->tilebuffer.nr_samples == 0 || + render->tilebuffer.nr_samples == samples); + render->tilebuffer.nr_samples = samples; + + /* TODO: Is this merging logic sound? Not sure how this is supposed to + * work conceptually. + */ + render->cr.width = MAX2(render->cr.width, width); + render->cr.height = MAX2(render->cr.height, height); + } +} + +static void +hk_pack_zls_control(struct agx_zls_control_packed *packed, + struct ail_layout *z_layout, struct ail_layout *s_layout, + const VkRenderingAttachmentInfo *attach_z, + const VkRenderingAttachmentInfo *attach_s, + bool incomplete_render_area, bool partial_render) +{ + agx_pack(packed, ZLS_CONTROL, zls_control) { + if (z_layout) { + zls_control.z_store_enable = + attach_z->storeOp == VK_ATTACHMENT_STORE_OP_STORE || + attach_z->resolveMode != VK_RESOLVE_MODE_NONE || partial_render; + + zls_control.z_load_enable = + attach_z->loadOp == VK_ATTACHMENT_LOAD_OP_LOAD || partial_render || + incomplete_render_area; + + if (ail_is_compressed(z_layout)) { + zls_control.z_compress_1 = true; + zls_control.z_compress_2 = true; + } + + if (z_layout->format == PIPE_FORMAT_Z16_UNORM) { + zls_control.z_format = AGX_ZLS_FORMAT_16; + } else { + zls_control.z_format = AGX_ZLS_FORMAT_32F; + } + } + + if (s_layout) { + /* TODO: + * Fail + * dEQP-VK.renderpass.dedicated_allocation.formats.d32_sfloat_s8_uint.input.dont_care.store.self_dep_clear_draw_use_input_aspect + * without the force + * .. maybe a VkRenderPass emulation bug. 
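/*
 * Illustrative sketch (standalone, not driver code): the power-of-two
 * alignment check used by is_aligned() above, plus the kind of invariant it
 * guards in the depth/stencil setup. 16384 is a hypothetical page-size
 * stand-in, not necessarily the real AIL_PAGESIZE value.
 */
#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

static bool
toy_is_aligned(uint64_t x, uint64_t pot_alignment)
{
   /* Only valid for power-of-two alignments: the mask trick relies on the
    * alignment having exactly one set bit.
    */
   assert(pot_alignment && (pot_alignment & (pot_alignment - 1)) == 0);
   return (x & (pot_alignment - 1)) == 0;
}

/* Example invariant: every depth layer must start on a page boundary so the
 * layer stride can be encoded in whole pages:
 *
 *    assert(toy_is_aligned(layer_stride_B, 16384));
 */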
+ */ + zls_control.s_store_enable = + attach_s->storeOp == VK_ATTACHMENT_STORE_OP_STORE || + attach_s->resolveMode != VK_RESOLVE_MODE_NONE || partial_render || + true; + + zls_control.s_load_enable = + attach_s->loadOp == VK_ATTACHMENT_LOAD_OP_LOAD || partial_render || + incomplete_render_area; + + if (ail_is_compressed(s_layout)) { + zls_control.s_compress_1 = true; + zls_control.s_compress_2 = true; + } + } + } +} + +VKAPI_ATTR void VKAPI_CALL +hk_CmdBeginRendering(VkCommandBuffer commandBuffer, + const VkRenderingInfo *pRenderingInfo) +{ + VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer); + struct hk_rendering_state *render = &cmd->state.gfx.render; + + memset(render, 0, sizeof(*render)); + + render->flags = pRenderingInfo->flags; + render->area = pRenderingInfo->renderArea; + render->view_mask = pRenderingInfo->viewMask; + render->layer_count = pRenderingInfo->layerCount; + render->tilebuffer.nr_samples = 0; + + const uint32_t layer_count = render->view_mask + ? util_last_bit(render->view_mask) + : render->layer_count; + + render->color_att_count = pRenderingInfo->colorAttachmentCount; + for (uint32_t i = 0; i < render->color_att_count; i++) { + hk_attachment_init(&render->color_att[i], + &pRenderingInfo->pColorAttachments[i]); + } + + hk_attachment_init(&render->depth_att, pRenderingInfo->pDepthAttachment); + hk_attachment_init(&render->stencil_att, pRenderingInfo->pStencilAttachment); + + for (uint32_t i = 0; i < render->color_att_count; i++) { + hk_merge_render_iview(render, render->color_att[i].iview); + } + + hk_merge_render_iview(render, + render->depth_att.iview ?: render->stencil_att.iview); + + /* Infer for attachmentless. samples is inferred at draw-time. */ + render->cr.width = + MAX2(render->cr.width, render->area.offset.x + render->area.extent.width); + + render->cr.height = MAX2(render->cr.height, + render->area.offset.y + render->area.extent.height); + + render->cr.layers = layer_count; + + /* Choose a tilebuffer layout given the framebuffer key */ + enum pipe_format formats[HK_MAX_RTS] = {0}; + for (unsigned i = 0; i < render->color_att_count; ++i) { + formats[i] = vk_format_to_pipe_format(render->color_att[i].vk_format); + } + + /* For now, we force layered=true since it makes compatibility problems way + * easier. + */ + render->tilebuffer = agx_build_tilebuffer_layout( + formats, render->color_att_count, render->tilebuffer.nr_samples, true); + + hk_cmd_buffer_dirty_render_pass(cmd); + + /* Determine whether the render area is complete, enabling us to use a + * fast-clear. + * + * TODO: If it is incomplete but tile aligned, it should be possibly to fast + * clear with the appropriate settings. This is critical for performance. 
+ */ + bool incomplete_render_area = + render->area.offset.x > 0 || render->area.offset.y > 0 || + render->area.extent.width < render->cr.width || + render->area.extent.height < render->cr.height || + (render->view_mask && + render->view_mask != BITFIELD64_MASK(render->cr.layers)); + + render->cr.bg.main = hk_build_bg_eot(cmd, pRenderingInfo, false, false, + incomplete_render_area); + render->cr.bg.partial = + hk_build_bg_eot(cmd, pRenderingInfo, false, true, incomplete_render_area); + + render->cr.eot.main = + hk_build_bg_eot(cmd, pRenderingInfo, true, false, incomplete_render_area); + render->cr.eot.partial = render->cr.eot.main; + + render->cr.isp_bgobjvals = 0x300; + + const VkRenderingAttachmentInfo *attach_z = pRenderingInfo->pDepthAttachment; + const VkRenderingAttachmentInfo *attach_s = + pRenderingInfo->pStencilAttachment; + + render->cr.iogpu_unk_214 = 0xc000; + + struct ail_layout *z_layout = NULL, *s_layout = NULL; + + if (attach_z != NULL && attach_z != VK_NULL_HANDLE && attach_z->imageView) { + struct hk_image_view *view = render->depth_att.iview; + struct hk_image *image = + container_of(view->vk.image, struct hk_image, vk); + + z_layout = &image->planes[0].layout; + + unsigned level = view->vk.base_mip_level; + unsigned first_layer = view->vk.base_array_layer; + + const struct util_format_description *desc = + util_format_description(vk_format_to_pipe_format(view->vk.format)); + + assert(desc->format == PIPE_FORMAT_Z32_FLOAT || + desc->format == PIPE_FORMAT_Z16_UNORM || + desc->format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT); + + render->cr.depth.buffer = + hk_image_base_address(image, 0) + + ail_get_layer_level_B(z_layout, first_layer, level); + + /* Main stride in pages */ + assert((z_layout->depth_px == 1 || + is_aligned(z_layout->layer_stride_B, AIL_PAGESIZE)) && + "Page aligned Z layers"); + + unsigned stride_pages = z_layout->layer_stride_B / AIL_PAGESIZE; + render->cr.depth.stride = ((stride_pages - 1) << 14) | 1; + + assert(z_layout->tiling != AIL_TILING_LINEAR && "must tile"); + + if (ail_is_compressed(z_layout)) { + render->cr.depth.meta = + hk_image_base_address(image, 0) + z_layout->metadata_offset_B + + (first_layer * z_layout->compression_layer_stride_B) + + z_layout->level_offsets_compressed_B[level]; + + /* Meta stride in cache lines */ + assert( + is_aligned(z_layout->compression_layer_stride_B, AIL_CACHELINE) && + "Cacheline aligned Z meta layers"); + + unsigned stride_lines = + z_layout->compression_layer_stride_B / AIL_CACHELINE; + render->cr.depth.meta_stride = (stride_lines - 1) << 14; + } + + float clear_depth = attach_z->clearValue.depthStencil.depth; + + if (z_layout->format == PIPE_FORMAT_Z16_UNORM) { + render->cr.isp_bgobjdepth = _mesa_float_to_unorm(clear_depth, 16); + render->cr.iogpu_unk_214 |= 0x40000; + } else { + render->cr.isp_bgobjdepth = fui(clear_depth); + } + } + + if (attach_s != NULL && attach_s != VK_NULL_HANDLE && attach_s->imageView) { + struct hk_image_view *view = render->stencil_att.iview; + struct hk_image *image = + container_of(view->vk.image, struct hk_image, vk); + + /* Stencil is always the last plane (possibly the only plane) */ + unsigned plane = image->plane_count - 1; + s_layout = &image->planes[plane].layout; + assert(s_layout->format == PIPE_FORMAT_S8_UINT); + + unsigned level = view->vk.base_mip_level; + unsigned first_layer = view->vk.base_array_layer; + + render->cr.stencil.buffer = + hk_image_base_address(image, plane) + + ail_get_layer_level_B(s_layout, first_layer, level); + + /* Main stride in pages */ + 
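/*
 * Illustrative sketch (standalone, not driver code): the register encodings
 * used above for the depth/stencil strides. The layer stride is stored as
 * "whole pages minus one" shifted into the high bits with a low enable bit,
 * and the compression-metadata stride as "cache lines minus one". The page
 * and cache-line sizes are hypothetical stand-ins for AIL_PAGESIZE and
 * AIL_CACHELINE.
 */
#include <stdint.h>

#define TOY_PAGESIZE  16384u
#define TOY_CACHELINE 64u

static uint32_t
toy_encode_zs_stride(uint32_t layer_stride_B)
{
   uint32_t stride_pages = layer_stride_B / TOY_PAGESIZE;
   return ((stride_pages - 1) << 14) | 1;
}

static uint32_t
toy_encode_meta_stride(uint32_t compression_layer_stride_B)
{
   uint32_t stride_lines = compression_layer_stride_B / TOY_CACHELINE;
   return (stride_lines - 1) << 14;
}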
assert((s_layout->depth_px == 1 || + is_aligned(s_layout->layer_stride_B, AIL_PAGESIZE)) && + "Page aligned S layers"); + unsigned stride_pages = s_layout->layer_stride_B / AIL_PAGESIZE; + render->cr.stencil.stride = ((stride_pages - 1) << 14) | 1; + + if (ail_is_compressed(s_layout)) { + render->cr.stencil.meta = + hk_image_base_address(image, plane) + s_layout->metadata_offset_B + + (first_layer * s_layout->compression_layer_stride_B) + + s_layout->level_offsets_compressed_B[level]; + + /* Meta stride in cache lines */ + assert( + is_aligned(s_layout->compression_layer_stride_B, AIL_CACHELINE) && + "Cacheline aligned S meta layers"); + + unsigned stride_lines = + s_layout->compression_layer_stride_B / AIL_CACHELINE; + + render->cr.stencil.meta_stride = (stride_lines - 1) << 14; + } + + render->cr.isp_bgobjvals |= attach_s->clearValue.depthStencil.stencil; + } + + hk_pack_zls_control(&render->cr.zls_control, z_layout, s_layout, attach_z, + attach_s, incomplete_render_area, false); + + hk_pack_zls_control(&render->cr.zls_control_partial, z_layout, s_layout, + attach_z, attach_s, incomplete_render_area, true); + + /* If multiview is disabled, always read 0. If multiview is enabled, + * hk_set_view_index will dirty the root each draw. + */ + cmd->state.gfx.descriptors.root.draw.view_index = 0; + cmd->state.gfx.descriptors.root_dirty = true; + + if (render->flags & VK_RENDERING_RESUMING_BIT) + return; + + /* The first control stream of the render pass is special since it gets + * the clears. Create it and swap in the clear. + */ + assert(!cmd->current_cs.gfx && "not already in a render pass"); + struct hk_cs *cs = hk_cmd_buffer_get_cs(cmd, false /* compute */); + if (!cs) + return; + + cs->cr.bg.main = render->cr.bg.main; + cs->cr.zls_control = render->cr.zls_control; + + /* Reordering barrier for post-gfx, in case we had any. */ + hk_cmd_buffer_end_compute_internal(&cmd->current_cs.post_gfx); + + /* Don't reorder compute across render passes. + * + * TODO: Check if this is necessary if the proper PipelineBarriers are + * handled... there may be CTS bugs... + */ + hk_cmd_buffer_end_compute(cmd); + + if (incomplete_render_area) { + uint32_t clear_count = 0; + VkClearAttachment clear_att[HK_MAX_RTS + 1]; + for (uint32_t i = 0; i < pRenderingInfo->colorAttachmentCount; i++) { + const VkRenderingAttachmentInfo *att_info = + &pRenderingInfo->pColorAttachments[i]; + if (att_info->imageView == VK_NULL_HANDLE || + att_info->loadOp != VK_ATTACHMENT_LOAD_OP_CLEAR) + continue; + + clear_att[clear_count++] = (VkClearAttachment){ + .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT, + .colorAttachment = i, + .clearValue = att_info->clearValue, + }; + } + + clear_att[clear_count] = (VkClearAttachment){ + .aspectMask = 0, + }; + + if (attach_z && attach_z->imageView != VK_NULL_HANDLE && + attach_z->loadOp == VK_ATTACHMENT_LOAD_OP_CLEAR) { + clear_att[clear_count].aspectMask |= VK_IMAGE_ASPECT_DEPTH_BIT; + clear_att[clear_count].clearValue.depthStencil.depth = + attach_z->clearValue.depthStencil.depth; + } + + if (attach_s != NULL && attach_s->imageView != VK_NULL_HANDLE && + attach_s->loadOp == VK_ATTACHMENT_LOAD_OP_CLEAR) { + clear_att[clear_count].aspectMask |= VK_IMAGE_ASPECT_STENCIL_BIT; + clear_att[clear_count].clearValue.depthStencil.stencil = + attach_s->clearValue.depthStencil.stencil; + } + + if (clear_att[clear_count].aspectMask != 0) + clear_count++; + + if (clear_count > 0) { + const VkClearRect clear_rect = { + .rect = render->area, + .baseArrayLayer = 0, + .layerCount = render->view_mask ? 
1 : render->layer_count, + }; + + hk_CmdClearAttachments(hk_cmd_buffer_to_handle(cmd), clear_count, + clear_att, 1, &clear_rect); + } + } +} + +VKAPI_ATTR void VKAPI_CALL +hk_CmdEndRendering(VkCommandBuffer commandBuffer) +{ + VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer); + struct hk_rendering_state *render = &cmd->state.gfx.render; + + hk_cmd_buffer_end_graphics(cmd); + + bool need_resolve = false; + + /* Translate render state back to VK for meta */ + VkRenderingAttachmentInfo vk_color_att[HK_MAX_RTS]; + for (uint32_t i = 0; i < render->color_att_count; i++) { + if (render->color_att[i].resolve_mode != VK_RESOLVE_MODE_NONE) + need_resolve = true; + + vk_color_att[i] = (VkRenderingAttachmentInfo){ + .sType = VK_STRUCTURE_TYPE_RENDERING_ATTACHMENT_INFO, + .imageView = hk_image_view_to_handle(render->color_att[i].iview), + .imageLayout = VK_IMAGE_LAYOUT_GENERAL, + .resolveMode = render->color_att[i].resolve_mode, + .resolveImageView = + hk_image_view_to_handle(render->color_att[i].resolve_iview), + .resolveImageLayout = VK_IMAGE_LAYOUT_GENERAL, + }; + } + + const VkRenderingAttachmentInfo vk_depth_att = { + .sType = VK_STRUCTURE_TYPE_RENDERING_ATTACHMENT_INFO, + .imageView = hk_image_view_to_handle(render->depth_att.iview), + .imageLayout = VK_IMAGE_LAYOUT_GENERAL, + .resolveMode = render->depth_att.resolve_mode, + .resolveImageView = + hk_image_view_to_handle(render->depth_att.resolve_iview), + .resolveImageLayout = VK_IMAGE_LAYOUT_GENERAL, + }; + if (render->depth_att.resolve_mode != VK_RESOLVE_MODE_NONE) + need_resolve = true; + + const VkRenderingAttachmentInfo vk_stencil_att = { + .sType = VK_STRUCTURE_TYPE_RENDERING_ATTACHMENT_INFO, + .imageView = hk_image_view_to_handle(render->stencil_att.iview), + .imageLayout = VK_IMAGE_LAYOUT_GENERAL, + .resolveMode = render->stencil_att.resolve_mode, + .resolveImageView = + hk_image_view_to_handle(render->stencil_att.resolve_iview), + .resolveImageLayout = VK_IMAGE_LAYOUT_GENERAL, + }; + if (render->stencil_att.resolve_mode != VK_RESOLVE_MODE_NONE) + need_resolve = true; + + const VkRenderingInfo vk_render = { + .sType = VK_STRUCTURE_TYPE_RENDERING_INFO, + .renderArea = render->area, + .layerCount = render->layer_count, + .viewMask = render->view_mask, + .colorAttachmentCount = render->color_att_count, + .pColorAttachments = vk_color_att, + .pDepthAttachment = &vk_depth_att, + .pStencilAttachment = &vk_stencil_att, + }; + + if (render->flags & VK_RENDERING_SUSPENDING_BIT) + need_resolve = false; + + memset(render, 0, sizeof(*render)); + + if (need_resolve) { + hk_meta_resolve_rendering(cmd, &vk_render); + } +} + +static uint64_t +hk_geometry_state(struct hk_cmd_buffer *cmd) +{ + struct hk_device *dev = hk_cmd_buffer_device(cmd); + + /* We tie heap allocation to geometry state allocation, so allocate now. */ + if (unlikely(!dev->heap)) { + size_t size = 128 * 1024 * 1024; + dev->heap = agx_bo_create(&dev->dev, size, 0, "Geometry heap"); + + /* The geometry state buffer is initialized here and then is treated by + * the CPU as rodata, even though the GPU uses it for scratch internally. 
+ */ + off_t off = dev->rodata.geometry_state - dev->rodata.bo->ptr.gpu; + struct agx_geometry_state *map = dev->rodata.bo->ptr.cpu + off; + + *map = (struct agx_geometry_state){ + .heap = dev->heap->ptr.gpu, + .heap_size = size, + }; + } + + /* We need to free all allocations after each command buffer execution */ + if (!cmd->uses_heap) { + uint64_t addr = dev->rodata.geometry_state; + + /* Zeroing the allocated index frees everything */ + hk_queue_write(cmd, + addr + offsetof(struct agx_geometry_state, heap_bottom), 0, + true /* after gfx */); + + cmd->uses_heap = true; + } + + return dev->rodata.geometry_state; +} + +static uint64_t +hk_upload_gsi_params(struct hk_cmd_buffer *cmd, struct hk_draw draw) +{ + struct hk_device *dev = hk_cmd_buffer_device(cmd); + struct hk_descriptor_state *desc = &cmd->state.gfx.descriptors; + struct hk_graphics_state *gfx = &cmd->state.gfx; + struct hk_shader *vs = hk_bound_sw_vs_before_gs(gfx); + + unsigned index_size_B = + draw.indexed ? agx_index_size_to_B(draw.index_size) : 0; + + uint64_t vb; + if (cmd->state.gfx.shaders[MESA_SHADER_TESS_EVAL]) { + assert(index_size_B == 4); + + vb = desc->root.draw.tess_params + + offsetof(struct libagx_tess_args, tes_buffer); + } else { + vb = desc->root.root_desc_addr + + offsetof(struct hk_root_descriptor_table, draw.vertex_output_buffer); + } + + struct agx_gs_setup_indirect_params gsi = { + .index_buffer = draw.index.addr, + .index_size_B = index_size_B, + .index_buffer_range_el = draw.index.range / index_size_B, + .zero_sink = dev->rodata.zero_sink, + .draw = draw.b.ptr, + .vertex_buffer = vb, + .ia = desc->root.draw.input_assembly, + .geom = desc->root.draw.geometry_params, + .vs_outputs = vs->b.info.outputs, + }; + + return hk_pool_upload(cmd, &gsi, sizeof(gsi), 8); +} + +static uint64_t +hk_upload_ia_params(struct hk_cmd_buffer *cmd, struct hk_draw draw) +{ + struct hk_device *dev = hk_cmd_buffer_device(cmd); + assert(!draw.b.indirect && "indirect params written by GPU"); + + struct agx_ia_state ia = {.verts_per_instance = draw.b.count[0]}; + + if (draw.indexed) { + unsigned index_size_B = agx_index_size_to_B(draw.index_size); + unsigned range_el = draw.index.range / index_size_B; + + ia.index_buffer = + libagx_index_buffer(draw.index.addr, range_el, draw.start, + index_size_B, dev->rodata.zero_sink); + + ia.index_buffer_range_el = + libagx_index_buffer_range_el(range_el, draw.start); + } + + return hk_pool_upload(cmd, &ia, sizeof(ia), 8); +} + +static enum mesa_prim +hk_gs_in_prim(struct hk_cmd_buffer *cmd) +{ + struct vk_dynamic_graphics_state *dyn = &cmd->vk.dynamic_graphics_state; + struct hk_graphics_state *gfx = &cmd->state.gfx; + struct hk_api_shader *tes = gfx->shaders[MESA_SHADER_TESS_EVAL]; + + if (tes != NULL) + return tes->variants[HK_GS_VARIANT_RAST].info.ts.out_prim; + else + return vk_conv_topology(dyn->ia.primitive_topology); +} + +static enum mesa_prim +hk_rast_prim(struct hk_cmd_buffer *cmd) +{ + struct hk_graphics_state *gfx = &cmd->state.gfx; + struct hk_api_shader *gs = gfx->shaders[MESA_SHADER_GEOMETRY]; + + if (gs != NULL) + return gs->variants[HK_GS_VARIANT_RAST].info.gs.out_prim; + else + return hk_gs_in_prim(cmd); +} + +static uint64_t +hk_upload_geometry_params(struct hk_cmd_buffer *cmd, struct hk_draw draw) +{ + struct hk_descriptor_state *desc = &cmd->state.gfx.descriptors; + struct vk_dynamic_graphics_state *dyn = &cmd->vk.dynamic_graphics_state; + struct hk_graphics_state *gfx = &cmd->state.gfx; + struct hk_api_shader *gs = gfx->shaders[MESA_SHADER_GEOMETRY]; + struct hk_shader 
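/*
 * Illustrative sketch (standalone, not driver code): why zeroing a single
 * "heap_bottom" word, as hk_geometry_state() queues above, frees every
 * geometry allocation. The heap is a bump allocator, so the bottom index is
 * the only persistent state; this struct is a simplified stand-in for
 * agx_geometry_state.
 */
#include <stdint.h>

struct toy_geom_heap {
   uint64_t base;        /* GPU address of the backing storage */
   uint32_t size;        /* total bytes */
   uint32_t heap_bottom; /* bytes already handed out */
};

static uint64_t
toy_geom_alloc(struct toy_geom_heap *h, uint32_t bytes)
{
   uint64_t addr = h->base + h->heap_bottom;
   h->heap_bottom += bytes; /* a real allocator would check against size */
   return addr;
}

static void
toy_geom_free_all(struct toy_geom_heap *h)
{
   /* Equivalent to the queued write of 0 to heap_bottom above. */
   h->heap_bottom = 0;
}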
*fs = hk_only_variant(gfx->shaders[MESA_SHADER_FRAGMENT]); + + bool rast_disc = dyn->rs.rasterizer_discard_enable; + struct hk_shader *count = hk_count_gs_variant(gs, rast_disc); + + /* XXX: We should deduplicate this logic */ + bool restart = (draw.indexed && draw.restart); + bool indirect = + draw.b.indirect || gfx->shaders[MESA_SHADER_TESS_EVAL] || restart; + enum mesa_prim mode = hk_gs_in_prim(cmd); + + if (restart) { + mode = u_decomposed_prim(mode); + } + + struct agx_geometry_params params = { + .state = hk_geometry_state(cmd), + .indirect_desc = cmd->geom_indirect, + .flat_outputs = fs ? fs->info.fs.interp.flat : 0, + .input_topology = mode, + + /* Overriden by the indirect setup kernel. As tess->GS is always indirect, + * we can assume here that we're VS->GS. + */ + .input_buffer = desc->root.draw.vertex_output_buffer, + .input_mask = desc->root.draw.vertex_outputs, + }; + + if (gfx->xfb_enabled) { + for (unsigned i = 0; i < ARRAY_SIZE(gfx->xfb); ++i) { + params.xfb_base_original[i] = gfx->xfb[i].addr; + params.xfb_size[i] = gfx->xfb[i].range; + params.xfb_offs_ptrs[i] = gfx->xfb_offsets + i * sizeof(uint32_t); + } + } + + for (unsigned i = 0; i < ARRAY_SIZE(gfx->xfb_query); ++i) { + uint64_t q = gfx->xfb_query[i]; + + if (q) { + params.xfb_prims_generated_counter[i] = q; + params.prims_generated_counter[i] = q + sizeof(uint64_t); + } + } + + /* Calculate input primitive count for direct draws, and allocate the vertex + * & count buffers. GPU calculates and allocates for indirect draws. + */ + unsigned count_buffer_stride = count->info.gs.count_words * 4; + + if (indirect) { + params.count_buffer_stride = count_buffer_stride; + params.vs_grid[2] = params.gs_grid[2] = 1; + } else { + uint32_t verts = draw.b.count[0], instances = draw.b.count[1]; + + params.vs_grid[0] = verts; + params.gs_grid[0] = u_decomposed_prims_for_vertices(mode, verts); + + params.primitives_log2 = util_logbase2_ceil(params.gs_grid[0]); + params.input_primitives = params.gs_grid[0] * instances; + + unsigned size = params.input_primitives * count_buffer_stride; + if (size) { + params.count_buffer = hk_pool_alloc(cmd, size, 4).gpu; + } + } + + desc->root_dirty = true; + return hk_pool_upload(cmd, ¶ms, sizeof(params), 8); +} + +/* + * Tessellation has a fast path where the tessellator generates a VDM Index List + * command per patch, as well as a slow path using prefix sums to generate a + * single combined API draw. We need the latter if tessellation is fed into + * another software stage (geometry shading), or if we need accurate primitive + * IDs in the linked fragment shader (since that would require a prefix sum + * anyway). 
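/*
 * Illustrative sketch (standalone, not driver code): the direct-draw sizing
 * math from hk_upload_geometry_params() above, specialized to a TRIANGLES
 * input list (the real code uses u_decomposed_prims_for_vertices to cover
 * every topology). count_words is the per-primitive count-word total
 * reported by the count-variant geometry shader.
 */
#include <stdint.h>

static uint32_t
toy_gs_count_buffer_size(uint32_t verts, uint32_t instances,
                         uint32_t count_words)
{
   uint32_t prims_per_instance = verts / 3;                 /* TRIANGLES */
   uint32_t input_primitives = prims_per_instance * instances;
   uint32_t count_buffer_stride = count_words * 4;
   return input_primitives * count_buffer_stride;
}

/* e.g. 300 vertices, 4 instances, 2 count words: 100 prims per instance,
 * 400 input primitives, 8-byte stride, 3200-byte count buffer; the
 * primitives_log2 analogue would be ceil(log2(100)) = 7.
 */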
+ */ +static bool +hk_tess_needs_prefix_sum(struct hk_cmd_buffer *cmd) +{ + struct hk_graphics_state *gfx = &cmd->state.gfx; + + return gfx->shaders[MESA_SHADER_GEOMETRY] || gfx->generate_primitive_id; +} + +static uint64_t +hk_upload_tess_params(struct hk_cmd_buffer *cmd, struct hk_draw draw) +{ + struct hk_device *dev = hk_cmd_buffer_device(cmd); + struct vk_dynamic_graphics_state *dyn = &cmd->vk.dynamic_graphics_state; + struct hk_graphics_state *gfx = &cmd->state.gfx; + struct hk_shader *tcs = hk_only_variant(gfx->shaders[MESA_SHADER_TESS_CTRL]); + struct hk_shader *tes = hk_any_variant(gfx->shaders[MESA_SHADER_TESS_EVAL]); + + struct libagx_tess_args args = { + .heap = hk_geometry_state(cmd), + .tcs_stride_el = tcs->info.tcs.output_stride / 4, + .statistic = hk_pipeline_stat_addr( + cmd, + VK_QUERY_PIPELINE_STATISTIC_TESSELLATION_EVALUATION_SHADER_INVOCATIONS_BIT), + + .input_patch_size = dyn->ts.patch_control_points, + .output_patch_size = tcs->info.tcs.output_patch_size, + .tcs_patch_constants = tcs->info.tcs.nr_patch_outputs, + .tcs_per_vertex_outputs = tcs->info.tcs.per_vertex_outputs, + }; + + bool with_counts = hk_tess_needs_prefix_sum(cmd); + + /* This assumes !with_counts, if we have counts it's only one draw */ + uint32_t draw_stride_el = tes->info.ts.point_mode ? 4 : 6; + size_t draw_stride_B = draw_stride_el * sizeof(uint32_t); + + /* heap is allocated by hk_geometry_state */ + args.patch_coord_buffer = dev->heap->ptr.gpu; + + if (!draw.b.indirect) { + unsigned in_patches = draw.b.count[0] / args.input_patch_size; + if (in_patches == 0) + unreachable("todo: drop the draw?"); + + unsigned unrolled_patches = in_patches * draw.b.count[1]; + + uint32_t alloc = 0; + uint32_t tcs_out_offs = alloc; + alloc += unrolled_patches * args.tcs_stride_el * 4 * 32; + + uint32_t patch_coord_offs = alloc; + alloc += unrolled_patches * 4 * 32; + + uint32_t count_offs = alloc; + if (with_counts) + alloc += unrolled_patches * sizeof(uint32_t) * 32; + + uint32_t draw_offs = alloc; + + if (with_counts) { + /* Single API draw */ + alloc += 5 * sizeof(uint32_t); + } else { + /* Padding added because VDM overreads */ + alloc += (draw_stride_B * unrolled_patches) + + (AGX_VDM_BARRIER_LENGTH + 0x800); + } + + struct agx_ptr blob = hk_pool_alloc(cmd, alloc, 4); + args.tcs_buffer = blob.gpu + tcs_out_offs; + args.patches_per_instance = in_patches; + args.coord_allocs = blob.gpu + patch_coord_offs; + args.nr_patches = unrolled_patches; + args.out_draws = blob.gpu + draw_offs; + + gfx->tess_out_draws = args.out_draws; + + if (with_counts) { + args.counts = blob.gpu + count_offs; + } else { + /* Arrange so we return after all generated draws */ + uint8_t *ret = (uint8_t *)blob.cpu + draw_offs + + (draw_stride_B * unrolled_patches); + + agx_pack(ret, VDM_BARRIER, cfg) { + cfg.returns = true; + } + } + } else { + unreachable("todo: indirect with tess"); +#if 0 + args.tcs_statistic = agx_get_query_address( + batch, ctx->pipeline_statistics[PIPE_STAT_QUERY_HS_INVOCATIONS]); + + args.indirect = agx_indirect_buffer_ptr(batch, indirect); + + /* Allocate 3x indirect global+local grids for VS/TCS/tess */ + uint32_t grid_stride = sizeof(uint32_t) * 6; + args.grids = agx_pool_alloc_aligned(&batch->pool, grid_stride * 3, 4).gpu; + + vs_grid = agx_grid_indirect_local(args.grids + 0 * grid_stride); + tcs_grid = agx_grid_indirect_local(args.grids + 1 * grid_stride); + tess_grid = agx_grid_indirect_local(args.grids + 2 * grid_stride); + + args.vertex_outputs = ctx->vs->b.info.outputs; + args.vertex_output_buffer_ptr = + 
agx_pool_alloc_aligned(&batch->pool, 8, 8).gpu; + + batch->uniforms.vertex_output_buffer_ptr = args.vertex_output_buffer_ptr; + + if (with_counts) { + args.out_draws = agx_pool_alloc_aligned_with_bo( + &batch->pool, draw_stride, 4, &draw_bo) + .gpu; + } else { + unreachable("need an extra indirection..."); + } +#endif + } + + return hk_pool_upload(cmd, &args, sizeof(args), 8); +} + +static struct hk_api_shader * +hk_build_meta_shader_locked(struct hk_device *dev, struct hk_internal_key *key, + hk_internal_builder_t builder) +{ + /* Try to get the cached shader */ + struct hash_entry *ent = _mesa_hash_table_search(dev->kernels.ht, key); + if (ent) + return ent->data; + + nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, + &agx_nir_options, NULL); + builder(&b, key->key); + + const struct vk_pipeline_robustness_state rs = { + .images = VK_PIPELINE_ROBUSTNESS_IMAGE_BEHAVIOR_DISABLED_EXT, + .storage_buffers = VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_DISABLED_EXT, + .uniform_buffers = VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_DISABLED_EXT, + .vertex_inputs = VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_DISABLED_EXT, + }; + + struct vk_shader_compile_info info = { + .stage = b.shader->info.stage, + .nir = b.shader, + .robustness = &rs, + }; + + /* We need to link libagx and assign shared before preprocessing, matching + * what the driver would otherwise produce. + */ + agx_link_libagx(b.shader, dev->dev.libagx); + + if (info.stage == MESA_SHADER_COMPUTE) { + NIR_PASS(_, b.shader, nir_lower_vars_to_explicit_types, + nir_var_mem_shared, glsl_get_cl_type_size_align); + + /* Commit to the layout so we don't clobber later */ + b.shader->info.shared_memory_explicit_layout = true; + + NIR_PASS(_, b.shader, nir_lower_explicit_io, nir_var_mem_shared, + nir_address_format_62bit_generic); + } + + hk_preprocess_nir_internal(dev->vk.physical, b.shader); + + struct hk_api_shader *s; + if (hk_compile_shader(dev, &info, NULL, NULL, &s) != VK_SUCCESS) + return NULL; + + /* ..and cache it before we return. The key is on the stack right now, so + * clone it before using it as a hash table key. The clone is logically owned + * by the hash table. + */ + size_t total_key_size = sizeof(*key) + key->key_size; + void *cloned_key = ralloc_memdup(dev->kernels.ht, key, total_key_size); + + _mesa_hash_table_insert(dev->kernels.ht, cloned_key, s); + return s; +} + +struct hk_api_shader * +hk_meta_shader(struct hk_device *dev, hk_internal_builder_t builder, void *data, + size_t data_size) +{ + size_t total_key_size = sizeof(struct hk_internal_key) + data_size; + + struct hk_internal_key *key = alloca(total_key_size); + key->builder = builder; + key->key_size = data_size; + + if (data_size) + memcpy(key->key, data, data_size); + + simple_mtx_lock(&dev->kernels.lock); + struct hk_api_shader *s = hk_build_meta_shader_locked(dev, key, builder); + simple_mtx_unlock(&dev->kernels.lock); + + return s; +} + +static struct hk_draw +hk_draw_without_restart(struct hk_cmd_buffer *cmd, struct hk_cs *cs, + struct hk_draw draw, uint32_t draw_count) +{ + struct hk_device *dev = hk_cmd_buffer_device(cmd); + struct hk_graphics_state *gfx = &cmd->state.gfx; + struct vk_dynamic_graphics_state *dyn = &cmd->vk.dynamic_graphics_state; + + perf_debug(dev, "Unrolling primitive restart due to GS/XFB"); + + /* The unroll kernel assumes an indirect draw. 
Synthesize one if needed */ + if (!draw.b.indirect) { + uint32_t desc[5] = {draw.b.count[0], draw.b.count[1], draw.start, + draw.index_bias, draw.start_instance}; + + draw = + hk_draw_indexed_indirect(hk_pool_upload(cmd, desc, sizeof(desc), 4), + draw.index, draw.index_size, true); + } + + /* Next, we unroll the index buffer used by the indirect draw */ + struct agx_unroll_restart_key key = { + .prim = vk_conv_topology(dyn->ia.primitive_topology), + .index_size_B = agx_index_size_to_B(draw.index_size), + }; + + struct agx_restart_unroll_params ia = { + .heap = hk_geometry_state(cmd), + .index_buffer = draw.index.addr, + .count = hk_pool_upload(cmd, &draw_count, sizeof(uint32_t), 4), + .draws = draw.b.ptr, + .out_draws = hk_pool_alloc(cmd, 5 * sizeof(uint32_t) * draw_count, 4).gpu, + .max_draws = 1 /* TODO: MDI */, + .restart_index = gfx->index.restart, + .index_buffer_size_el = draw.index.range / key.index_size_B, + .flatshade_first = + dyn->rs.provoking_vertex == VK_PROVOKING_VERTEX_MODE_FIRST_VERTEX_EXT, + .zero_sink = dev->rodata.zero_sink, + }; + + struct hk_shader *s = + hk_meta_kernel(dev, agx_nir_unroll_restart, &key, sizeof(key)); + + uint64_t params = hk_pool_upload(cmd, &ia, sizeof(ia), 8); + uint32_t usc = hk_upload_usc_words_kernel(cmd, s, &params, sizeof(params)); + hk_dispatch_with_usc(dev, cs, s, usc, hk_grid(1024 * draw_count, 1, 1), + hk_grid(1024, 1, 1)); + + struct hk_addr_range out_index = { + .addr = dev->heap->ptr.gpu, + .range = dev->heap->size, + }; + + return hk_draw_indexed_indirect(ia.out_draws, out_index, draw.index_size, + false /* restart */); +} + +static struct hk_draw +hk_launch_gs_prerast(struct hk_cmd_buffer *cmd, struct hk_cs *cs, + struct hk_draw draw) +{ + struct hk_device *dev = hk_cmd_buffer_device(cmd); + struct hk_graphics_state *gfx = &cmd->state.gfx; + struct hk_descriptor_state *desc = &cmd->state.gfx.descriptors; + struct hk_api_shader *gs = gfx->shaders[MESA_SHADER_GEOMETRY]; + struct hk_grid grid_vs, grid_gs; + + struct vk_dynamic_graphics_state *dyn = &cmd->vk.dynamic_graphics_state; + bool rast_disc = dyn->rs.rasterizer_discard_enable; + + hk_ensure_cs_has_space(cmd, cs, 0x2000 /*XXX*/); + + struct hk_shader *vs = hk_bound_sw_vs_before_gs(gfx); + struct hk_shader *main = hk_main_gs_variant(gs, rast_disc); + struct hk_shader *count = hk_count_gs_variant(gs, rast_disc); + struct hk_shader *pre_gs = hk_pre_gs_variant(gs, rast_disc); + + unsigned count_words = count->info.gs.count_words; + + if (false /* TODO */) + perf_debug(dev, "Transform feedback"); + else if (count_words) + perf_debug(dev, "Geometry shader with counts"); + else + perf_debug(dev, "Geometry shader without counts"); + + enum mesa_prim mode = hk_gs_in_prim(cmd); + + if (draw.indexed && draw.restart) { + draw = hk_draw_without_restart(cmd, cs, draw, 1); + mode = u_decomposed_prim(mode); + } + + /* Setup grids */ + if (draw.b.indirect) { + struct agx_gs_setup_indirect_key key = {.prim = mode}; + + struct hk_shader *gsi = + hk_meta_kernel(dev, agx_nir_gs_setup_indirect, &key, sizeof(key)); + + uint64_t push = hk_upload_gsi_params(cmd, draw); + uint32_t usc = hk_upload_usc_words_kernel(cmd, gsi, &push, sizeof(push)); + + hk_dispatch_with_usc(dev, cs, gsi, usc, hk_grid(1, 1, 1), + hk_grid(1, 1, 1)); + + uint64_t geometry_params = desc->root.draw.geometry_params; + grid_vs = hk_grid_indirect(geometry_params + + offsetof(struct agx_geometry_params, vs_grid)); + + grid_gs = hk_grid_indirect(geometry_params + + offsetof(struct agx_geometry_params, gs_grid)); + } else { + grid_vs = grid_gs =
draw.b; + grid_gs.count[0] = u_decomposed_prims_for_vertices(mode, draw.b.count[0]); + } + + /* Launch the vertex shader first */ + hk_reserve_scratch(cmd, cs, vs); + hk_dispatch_with_usc(dev, cs, vs, + hk_upload_usc_words(cmd, vs, + vs->info.stage == MESA_SHADER_VERTEX + ? gfx->linked[MESA_SHADER_VERTEX] + : vs->only_linked), + grid_vs, hk_grid(1, 1, 1)); + + /* If we need counts, launch the count shader and prefix sum the results. */ + if (count_words) { + hk_dispatch_with_local_size(cmd, cs, count, grid_gs, hk_grid(1, 1, 1)); + + struct hk_api_shader *prefix_sum = hk_meta_shader( + dev, agx_nir_prefix_sum_gs, &count_words, sizeof(count_words)); + + /* XXX: hack */ + hk_only_variant(prefix_sum)->info.stage = MESA_SHADER_GEOMETRY; + + hk_dispatch_with_local_size(cmd, cs, hk_only_variant(prefix_sum), + hk_grid(1024 * count_words, 1, 1), + hk_grid(1024, 1, 1)); + } + + /* Pre-GS shader */ + hk_dispatch_with_local_size(cmd, cs, pre_gs, hk_grid(1, 1, 1), + hk_grid(1, 1, 1)); + + /* Pre-rast geometry shader */ + hk_dispatch_with_local_size(cmd, cs, main, grid_gs, hk_grid(1, 1, 1)); + + struct hk_addr_range range = (struct hk_addr_range){ + .addr = dev->heap->ptr.gpu, + .range = dev->heap->size, + }; + + bool restart = cmd->state.gfx.topology != AGX_PRIMITIVE_POINTS; + return hk_draw_indexed_indirect(cmd->geom_indirect, range, + AGX_INDEX_SIZE_U32, restart); +} + +static struct hk_draw +hk_launch_tess(struct hk_cmd_buffer *cmd, struct hk_cs *cs, struct hk_draw draw) +{ + struct hk_device *dev = hk_cmd_buffer_device(cmd); + struct hk_graphics_state *gfx = &cmd->state.gfx; + struct hk_grid grid_vs, grid_tcs, grid_tess; + + struct hk_shader *vs = hk_bound_sw_vs(gfx); + struct hk_shader *tcs = hk_only_variant(gfx->shaders[MESA_SHADER_TESS_CTRL]); + struct hk_shader *tes = hk_any_variant(gfx->shaders[MESA_SHADER_TESS_EVAL]); + + struct vk_dynamic_graphics_state *dyn = &cmd->vk.dynamic_graphics_state; + uint32_t input_patch_size = dyn->ts.patch_control_points; + + hk_ensure_cs_has_space(cmd, cs, 0x2000 /*XXX*/); + + perf_debug(dev, "Tessellation"); + + uint64_t tcs_stat = hk_pipeline_stat_addr( + cmd, VK_QUERY_PIPELINE_STATISTIC_TESSELLATION_CONTROL_SHADER_PATCHES_BIT); + + /* Setup grids */ + if (draw.b.indirect) { + unreachable("todo: indirect tess"); +#if 0 + struct agx_gs_setup_indirect_key key = {.prim = mode}; + + struct hk_shader *gsi = + hk_meta_kernel(dev, agx_nir_gs_setup_indirect, &key, sizeof(key)); + + uint64_t push = hk_upload_gsi_params(cmd, draw); + uint32_t usc = hk_upload_usc_words_kernel(cmd, gsi, &push, sizeof(push)); + + hk_dispatch_with_usc(dev, cs, gsi, usc, hk_grid(1, 1, 1), + hk_grid(1, 1, 1)); + + uint64_t geometry_params = desc->root.draw.geometry_params; + grid_vs = hk_grid_indirect(geometry_params + + offsetof(struct agx_geometry_params, vs_grid)); + + grid_gs = hk_grid_indirect(geometry_params + + offsetof(struct agx_geometry_params, gs_grid)); +#endif + } else { + uint32_t patches = draw.b.count[0] / input_patch_size; + grid_vs = grid_tcs = draw.b; + + grid_tcs.count[0] = patches * tcs->info.tcs.output_patch_size; + grid_tess = hk_grid(patches * draw.b.count[1], 1, 1); + + /* TCS invocation counter increments once per-patch */ + if (tcs_stat) { + perf_debug(dev, "Direct TCS statistic"); + + struct libagx_increment_params args = { + .statistic = tcs_stat, + .delta = patches, + }; + + struct hk_shader *s = + hk_meta_kernel(dev, agx_nir_increment_statistic, NULL, 0); + + uint64_t push = hk_pool_upload(cmd, &args, sizeof(args), 8); + uint32_t usc = 
hk_upload_usc_words_kernel(cmd, s, &push, sizeof(push)); + + hk_dispatch_with_usc(dev, cs, s, usc, hk_grid(1, 1, 1), + hk_grid(1, 1, 1)); + } + } + + /* First launch the VS and TCS */ + hk_reserve_scratch(cmd, cs, vs); + hk_reserve_scratch(cmd, cs, tcs); + + /* XXX perf: grid size */ + hk_dispatch_with_usc( + dev, cs, vs, + hk_upload_usc_words(cmd, vs, gfx->linked[MESA_SHADER_VERTEX]), grid_vs, + hk_grid(64, 1, 1)); + + hk_dispatch_with_usc( + dev, cs, tcs, hk_upload_usc_words(cmd, tcs, tcs->only_linked), grid_tcs, + hk_grid(tcs->info.tcs.output_patch_size, 1, 1)); + + /* TODO indirect */ + + bool with_counts = hk_tess_needs_prefix_sum(cmd); + uint64_t state = gfx->descriptors.root.draw.tess_params; + + /* If the domain is flipped, we need to flip the winding order */ + bool ccw = tes->info.ts.ccw; + ccw ^= dyn->ts.domain_origin == VK_TESSELLATION_DOMAIN_ORIGIN_LOWER_LEFT; + + enum libagx_tess_partitioning partitioning = + tes->info.ts.spacing == TESS_SPACING_EQUAL + ? LIBAGX_TESS_PARTITIONING_INTEGER + : tes->info.ts.spacing == TESS_SPACING_FRACTIONAL_ODD + ? LIBAGX_TESS_PARTITIONING_FRACTIONAL_ODD + : LIBAGX_TESS_PARTITIONING_FRACTIONAL_EVEN; + + enum libagx_tess_output_primitive prim = + tes->info.ts.point_mode ? LIBAGX_TESS_OUTPUT_POINT + : ccw ? LIBAGX_TESS_OUTPUT_TRIANGLE_CCW + : LIBAGX_TESS_OUTPUT_TRIANGLE_CW; + + struct agx_tessellator_key key = { + .prim = tes->info.ts.mode, + .output_primitive = prim, + .partitioning = partitioning, + }; + + if (with_counts) { + perf_debug(dev, "Tessellation with counts"); + + /* Generate counts */ + key.mode = LIBAGX_TESS_MODE_COUNT; + { + struct hk_shader *tess = + hk_meta_kernel(dev, agx_nir_tessellate, &key, sizeof(key)); + + hk_dispatch_with_usc( + dev, cs, tess, + hk_upload_usc_words_kernel(cmd, tess, &state, sizeof(state)), + grid_tess, hk_grid(64, 1, 1)); + } + + /* Prefix sum counts, allocating index buffer space. 
*/ + { + struct hk_shader *sum = + hk_meta_kernel(dev, agx_nir_prefix_sum_tess, NULL, 0); + + hk_dispatch_with_usc( + dev, cs, sum, + hk_upload_usc_words_kernel(cmd, sum, &state, sizeof(state)), + hk_grid(1024, 1, 1), hk_grid(1024, 1, 1)); + } + + key.mode = LIBAGX_TESS_MODE_WITH_COUNTS; + } else { + key.mode = LIBAGX_TESS_MODE_VDM; + } + + /* Now we can tessellate */ + { + struct hk_shader *tess = + hk_meta_kernel(dev, agx_nir_tessellate, &key, sizeof(key)); + + hk_dispatch_with_usc( + dev, cs, tess, + hk_upload_usc_words_kernel(cmd, tess, &state, sizeof(state)), + grid_tess, hk_grid(64, 1, 1)); + } + + struct hk_addr_range range = (struct hk_addr_range){ + .addr = dev->heap->ptr.gpu, + .range = dev->heap->size, + }; + + struct hk_draw out = hk_draw_indexed_indirect(gfx->tess_out_draws, range, + AGX_INDEX_SIZE_U32, false); + out.raw = !with_counts; + return out; +} + +void +hk_cmd_bind_graphics_shader(struct hk_cmd_buffer *cmd, + const gl_shader_stage stage, + struct hk_api_shader *shader) +{ + struct vk_dynamic_graphics_state *dyn = &cmd->vk.dynamic_graphics_state; + + assert(stage < ARRAY_SIZE(cmd->state.gfx.shaders)); + if (cmd->state.gfx.shaders[stage] == shader) + return; + + cmd->state.gfx.shaders[stage] = shader; + cmd->state.gfx.shaders_dirty |= BITFIELD_BIT(stage); + + if (stage == MESA_SHADER_FRAGMENT) { + BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_MS_RASTERIZATION_SAMPLES); + } +} + +static uint32_t +hk_pipeline_bind_group(gl_shader_stage stage) +{ + return stage; +} + +static void +hk_flush_shaders(struct hk_cmd_buffer *cmd) +{ + if (cmd->state.gfx.shaders_dirty == 0) + return; + + /* Map shader types to shaders */ + struct hk_api_shader *type_shader[6] = { + NULL, + }; + uint32_t types_dirty = 0; + + const uint32_t gfx_stages = + BITFIELD_BIT(MESA_SHADER_VERTEX) | BITFIELD_BIT(MESA_SHADER_TESS_CTRL) | + BITFIELD_BIT(MESA_SHADER_TESS_EVAL) | BITFIELD_BIT(MESA_SHADER_GEOMETRY) | + BITFIELD_BIT(MESA_SHADER_FRAGMENT); + + /* Geometry shading overrides the restart index, reemit on rebind */ + if (IS_SHADER_DIRTY(GEOMETRY)) { + cmd->state.gfx.dirty |= HK_DIRTY_INDEX; + } + + u_foreach_bit(stage, cmd->state.gfx.shaders_dirty & gfx_stages) { + /* TODO: compact? */ + uint32_t type = stage; + types_dirty |= BITFIELD_BIT(type); + + /* Only copy non-NULL shaders because mesh/task alias with vertex and + * tessellation stages. + */ + if (cmd->state.gfx.shaders[stage] != NULL) { + assert(type < ARRAY_SIZE(type_shader)); + assert(type_shader[type] == NULL); + type_shader[type] = cmd->state.gfx.shaders[stage]; + } + } + + u_foreach_bit(type, types_dirty) { + struct hk_api_shader *shader = type_shader[type]; + + /* We always map index == type */ + // const uint32_t idx = type; + + if (shader == NULL) + continue; + + /* TODO */ + } + + struct hk_graphics_state *gfx = &cmd->state.gfx; + struct hk_shader *hw_vs = hk_bound_hw_vs(gfx); + struct hk_api_shader *fs = gfx->shaders[MESA_SHADER_FRAGMENT]; + + /* If we have a new VS/FS pair, UVS locations may have changed so need to + * relink. We do this here because there's no dependence on the fast linked + * shaders. + */ + agx_assign_uvs(&gfx->linked_varyings, &hw_vs->info.uvs, + fs ? hk_only_variant(fs)->info.fs.interp.flat : 0, + fs ? 
hk_only_variant(fs)->info.fs.interp.linear : 0); + + struct hk_descriptor_state *desc = &cmd->state.gfx.descriptors; + desc->root_dirty = true; + + for (unsigned i = 0; i < VARYING_SLOT_MAX; ++i) { + desc->root.draw.uvs_index[i] = gfx->linked_varyings.slots[i]; + } +} + +static struct agx_shader_part * +hk_get_prolog_epilog_locked(struct hk_device *dev, struct hk_internal_key *key, + hk_internal_builder_t builder, bool preprocess_nir, + bool stop, unsigned cf_base) +{ + /* Try to get the cached shader */ + struct hash_entry *ent = _mesa_hash_table_search(dev->prolog_epilog.ht, key); + if (ent) + return ent->data; + + nir_builder b = nir_builder_init_simple_shader(0, &agx_nir_options, NULL); + builder(&b, key->key); + + if (preprocess_nir) + agx_preprocess_nir(b.shader, dev->dev.libagx); + + struct agx_shader_key backend_key = { + .needs_g13x_coherency = (dev->dev.params.gpu_generation == 13 && + dev->dev.params.num_clusters_total > 1) || + dev->dev.params.num_dies > 1, + .libagx = dev->dev.libagx, + .secondary = true, + .no_stop = !stop, + }; + + /* We always use dynamic sample shading in the GL driver. Indicate that. */ + if (b.shader->info.stage == MESA_SHADER_FRAGMENT) { + backend_key.fs.cf_base = cf_base; + + if (b.shader->info.fs.uses_sample_shading) + backend_key.fs.inside_sample_loop = true; + } + + struct agx_shader_part *part = + rzalloc(dev->prolog_epilog.ht, struct agx_shader_part); + + agx_compile_shader_nir(b.shader, &backend_key, NULL, part); + + ralloc_free(b.shader); + + /* ..and cache it before we return. The key is on the stack right now, so + * clone it before using it as a hash table key. The clone is logically owned + * by the hash table. + */ + size_t total_key_size = sizeof(*key) + key->key_size; + void *cloned_key = ralloc_memdup(dev->prolog_epilog.ht, key, total_key_size); + + _mesa_hash_table_insert(dev->prolog_epilog.ht, cloned_key, part); + return part; +} + +static struct agx_shader_part * +hk_get_prolog_epilog(struct hk_device *dev, void *data, size_t data_size, + hk_internal_builder_t builder, bool preprocess_nir, + bool stop, unsigned cf_base) +{ + /* Build the meta shader key */ + size_t total_key_size = sizeof(struct hk_internal_key) + data_size; + + struct hk_internal_key *key = alloca(total_key_size); + key->builder = builder; + key->key_size = data_size; + + if (data_size) + memcpy(key->key, data, data_size); + + simple_mtx_lock(&dev->prolog_epilog.lock); + + struct agx_shader_part *part = hk_get_prolog_epilog_locked( + dev, key, builder, preprocess_nir, stop, cf_base); + + simple_mtx_unlock(&dev->prolog_epilog.lock); + return part; +} + +static struct hk_linked_shader * +hk_get_fast_linked_locked_vs(struct hk_device *dev, struct hk_shader *shader, + struct hk_fast_link_key_vs *key) +{ + struct agx_shader_part *prolog = + hk_get_prolog_epilog(dev, &key->prolog, sizeof(key->prolog), + agx_nir_vs_prolog, false, false, 0); + + struct hk_linked_shader *linked = + hk_fast_link(dev, false, shader, prolog, NULL, 0); + + struct hk_fast_link_key *key_clone = + ralloc_memdup(shader->linked.ht, key, sizeof(*key)); + + /* XXX: Fix this higher up the stack */ + linked->b.uses_base_param |= !key->prolog.hw; + + _mesa_hash_table_insert(shader->linked.ht, key_clone, linked); + return linked; +} + +static void +build_fs_prolog(nir_builder *b, const void *key) +{ + agx_nir_fs_prolog(b, key); + + /* Lower load_stat_query_address_agx, needed for FS statistics */ + NIR_PASS(_, b->shader, hk_lower_uvs_index, 0); +} + +static struct hk_linked_shader * 
+hk_get_fast_linked_locked_fs(struct hk_device *dev, struct hk_shader *shader, + struct hk_fast_link_key_fs *key) +{ + /* TODO: prolog without fs needs to work too... */ + bool needs_prolog = key->prolog.statistics || + key->prolog.cull_distance_size || + key->prolog.api_sample_mask != 0xff; + + struct agx_shader_part *prolog = NULL; + if (needs_prolog) { + prolog = hk_get_prolog_epilog(dev, &key->prolog, sizeof(key->prolog), + build_fs_prolog, false, false, + key->prolog.cf_base); + } + + /* If sample shading is used, don't stop at the epilog, there's a + * footer that the fast linker will insert to stop. + */ + bool epilog_stop = (key->nr_samples_shaded == 0); + + struct agx_shader_part *epilog = + hk_get_prolog_epilog(dev, &key->epilog, sizeof(key->epilog), + agx_nir_fs_epilog, true, epilog_stop, 0); + + struct hk_linked_shader *linked = + hk_fast_link(dev, true, shader, prolog, epilog, key->nr_samples_shaded); + + struct hk_fast_link_key *key_clone = + ralloc_memdup(shader->linked.ht, key, sizeof(*key)); + + _mesa_hash_table_insert(shader->linked.ht, key_clone, linked); + return linked; +} + +/* + * First, look for a fully linked variant. Else, build the required shader + * parts and link. + */ +static struct hk_linked_shader * +hk_get_fast_linked(struct hk_device *dev, struct hk_shader *shader, void *key) +{ + struct hk_linked_shader *linked; + simple_mtx_lock(&shader->linked.lock); + + struct hash_entry *ent = _mesa_hash_table_search(shader->linked.ht, key); + + if (ent) + linked = ent->data; + else if (shader->info.stage == MESA_SHADER_VERTEX) + linked = hk_get_fast_linked_locked_vs(dev, shader, key); + else if (shader->info.stage == MESA_SHADER_FRAGMENT) + linked = hk_get_fast_linked_locked_fs(dev, shader, key); + else + unreachable("invalid stage"); + + simple_mtx_unlock(&shader->linked.lock); + return linked; +} + +static void +hk_update_fast_linked(struct hk_cmd_buffer *cmd, struct hk_shader *shader, + void *key) +{ + struct hk_device *dev = hk_cmd_buffer_device(cmd); + struct hk_linked_shader *new = hk_get_fast_linked(dev, shader, key); + gl_shader_stage stage = shader->info.stage; + + if (cmd->state.gfx.linked[stage] != new) { + cmd->state.gfx.linked[stage] = new; + cmd->state.gfx.linked_dirty |= BITFIELD_BIT(stage); + } +} + +static enum agx_polygon_mode +translate_polygon_mode(VkPolygonMode vk_mode) +{ + static_assert((enum agx_polygon_mode)VK_POLYGON_MODE_FILL == + AGX_POLYGON_MODE_FILL); + static_assert((enum agx_polygon_mode)VK_POLYGON_MODE_LINE == + AGX_POLYGON_MODE_LINE); + static_assert((enum agx_polygon_mode)VK_POLYGON_MODE_POINT == + AGX_POLYGON_MODE_POINT); + + assert(vk_mode <= VK_POLYGON_MODE_POINT); + return (enum agx_polygon_mode)vk_mode; +} + +static enum agx_zs_func +translate_compare_op(VkCompareOp vk_mode) +{ + static_assert((enum agx_zs_func)VK_COMPARE_OP_NEVER == AGX_ZS_FUNC_NEVER); + static_assert((enum agx_zs_func)VK_COMPARE_OP_LESS == AGX_ZS_FUNC_LESS); + static_assert((enum agx_zs_func)VK_COMPARE_OP_EQUAL == AGX_ZS_FUNC_EQUAL); + static_assert((enum agx_zs_func)VK_COMPARE_OP_LESS_OR_EQUAL == + AGX_ZS_FUNC_LEQUAL); + static_assert((enum agx_zs_func)VK_COMPARE_OP_GREATER == + AGX_ZS_FUNC_GREATER); + static_assert((enum agx_zs_func)VK_COMPARE_OP_NOT_EQUAL == + AGX_ZS_FUNC_NOT_EQUAL); + static_assert((enum agx_zs_func)VK_COMPARE_OP_GREATER_OR_EQUAL == + AGX_ZS_FUNC_GEQUAL); + static_assert((enum agx_zs_func)VK_COMPARE_OP_ALWAYS == AGX_ZS_FUNC_ALWAYS); + + assert(vk_mode <= VK_COMPARE_OP_ALWAYS); + return (enum agx_zs_func)vk_mode; +} + +static enum 
agx_stencil_op +translate_stencil_op(VkStencilOp vk_op) +{ + static_assert((enum agx_stencil_op)VK_STENCIL_OP_KEEP == + AGX_STENCIL_OP_KEEP); + static_assert((enum agx_stencil_op)VK_STENCIL_OP_ZERO == + AGX_STENCIL_OP_ZERO); + static_assert((enum agx_stencil_op)VK_STENCIL_OP_REPLACE == + AGX_STENCIL_OP_REPLACE); + static_assert((enum agx_stencil_op)VK_STENCIL_OP_INCREMENT_AND_CLAMP == + AGX_STENCIL_OP_INCR_SAT); + static_assert((enum agx_stencil_op)VK_STENCIL_OP_DECREMENT_AND_CLAMP == + AGX_STENCIL_OP_DECR_SAT); + static_assert((enum agx_stencil_op)VK_STENCIL_OP_INVERT == + AGX_STENCIL_OP_INVERT); + static_assert((enum agx_stencil_op)VK_STENCIL_OP_INCREMENT_AND_WRAP == + AGX_STENCIL_OP_INCR_WRAP); + static_assert((enum agx_stencil_op)VK_STENCIL_OP_DECREMENT_AND_WRAP == + AGX_STENCIL_OP_DECR_WRAP); + + return (enum agx_stencil_op)vk_op; +} + +static void +hk_ppp_push_stencil_face(struct agx_ppp_update *ppp, + struct vk_stencil_test_face_state s, bool enabled) +{ + if (enabled) { + agx_ppp_push(ppp, FRAGMENT_STENCIL, cfg) { + cfg.compare = translate_compare_op(s.op.compare); + cfg.write_mask = s.write_mask; + cfg.read_mask = s.compare_mask; + + cfg.depth_pass = translate_stencil_op(s.op.pass); + cfg.depth_fail = translate_stencil_op(s.op.depth_fail); + cfg.stencil_fail = translate_stencil_op(s.op.fail); + } + } else { + agx_ppp_push(ppp, FRAGMENT_STENCIL, cfg) { + cfg.compare = AGX_ZS_FUNC_ALWAYS; + cfg.write_mask = 0xFF; + cfg.read_mask = 0xFF; + + cfg.depth_pass = AGX_STENCIL_OP_KEEP; + cfg.depth_fail = AGX_STENCIL_OP_KEEP; + cfg.stencil_fail = AGX_STENCIL_OP_KEEP; + } + } +} + +static bool +hk_stencil_test_enabled(struct hk_cmd_buffer *cmd) +{ + const struct hk_rendering_state *render = &cmd->state.gfx.render; + struct vk_dynamic_graphics_state *dyn = &cmd->vk.dynamic_graphics_state; + + return dyn->ds.stencil.test_enable && + render->stencil_att.vk_format != VK_FORMAT_UNDEFINED; +} + +static void +hk_flush_vp_state(struct hk_cmd_buffer *cmd, struct hk_cs *cs, uint8_t **out) +{ + const struct vk_dynamic_graphics_state *dyn = + &cmd->vk.dynamic_graphics_state; + + /* We always need at least 1 viewport for the hardware. With rasterizer + * discard the app may not supply any, but we can just program garbage. + */ + unsigned count = MAX2(dyn->vp.viewport_count, 1); + + unsigned minx[HK_MAX_VIEWPORTS] = {0}, miny[HK_MAX_VIEWPORTS] = {0}; + unsigned maxx[HK_MAX_VIEWPORTS] = {0}, maxy[HK_MAX_VIEWPORTS] = {0}; + + /* We implicitly scissor to the viewport. We need to do a min/max dance to + * handle inverted viewports. + */ + for (uint32_t i = 0; i < dyn->vp.viewport_count; i++) { + const VkViewport *vp = &dyn->vp.viewports[i]; + + minx[i] = MIN2(vp->x, vp->x + vp->width); + miny[i] = MIN2(vp->y, vp->y + vp->height); + maxx[i] = MAX2(vp->x, vp->x + vp->width); + maxy[i] = MAX2(vp->y, vp->y + vp->height); + } + + /* Additionally clamp to the framebuffer so we don't rasterize off-screen + * pixels, which is software-visible and can cause faults with + * eMRT when the framebuffer is not a multiple of the tile size. + * + * TODO: Is this necessary? The GL driver does this but it might be + * cargoculted at this point.
+ */ + for (unsigned i = 0; i < count; ++i) { + minx[i] = MIN2(minx[i], cmd->state.gfx.render.cr.width); + maxx[i] = MIN2(maxx[i], cmd->state.gfx.render.cr.width); + miny[i] = MIN2(miny[i], cmd->state.gfx.render.cr.height); + maxy[i] = MIN2(maxy[i], cmd->state.gfx.render.cr.height); + } + + /* We additionally apply any API scissors */ + for (unsigned i = 0; i < dyn->vp.scissor_count; ++i) { + const VkRect2D *s = &dyn->vp.scissors[i]; + + minx[i] = MAX2(minx[i], s->offset.x); + miny[i] = MAX2(miny[i], s->offset.y); + maxx[i] = MIN2(maxx[i], s->offset.x + s->extent.width); + maxy[i] = MIN2(maxy[i], s->offset.y + s->extent.height); + } + + /* Upload a hardware scissor for each viewport, whether there's a + * corresponding API scissor or not. + */ + unsigned index = cs->scissor.size / AGX_SCISSOR_LENGTH; + struct agx_scissor_packed *scissors = + util_dynarray_grow_bytes(&cs->scissor, count, AGX_SCISSOR_LENGTH); + + for (unsigned i = 0; i < count; ++i) { + const VkViewport *vp = &dyn->vp.viewports[i]; + + agx_pack(scissors + i, SCISSOR, cfg) { + cfg.min_x = minx[i]; + cfg.min_y = miny[i]; + cfg.max_x = maxx[i]; + cfg.max_y = maxy[i]; + + /* These settings in conjunction with the PPP control depth clip/clamp + * settings implement depth clip/clamping. Properly setting them + * together is required for conformant depth clip enable. + * + * TODO: Reverse-engineer the finer interactions here. + */ + if (dyn->rs.depth_clamp_enable) { + cfg.min_z = MIN2(vp->minDepth, vp->maxDepth); + cfg.max_z = MAX2(vp->minDepth, vp->maxDepth); + } else { + cfg.min_z = 0.0; + cfg.max_z = 1.0; + } + } + } + + /* Upload state */ + struct AGX_PPP_HEADER present = { + .depth_bias_scissor = true, + .region_clip = true, + .viewport = true, + .viewport_count = count, + }; + + size_t size = agx_ppp_update_size(&present); + struct agx_ptr T = hk_pool_alloc(cmd, size, 64); + if (!T.cpu) + return; + + struct agx_ppp_update ppp = agx_new_ppp_update(T, size, &present); + + agx_ppp_push(&ppp, DEPTH_BIAS_SCISSOR, cfg) { + cfg.scissor = index; + + /* Use the current depth bias, we allocate linearly */ + unsigned count = cs->depth_bias.size / AGX_DEPTH_BIAS_LENGTH; + cfg.depth_bias = count ? 
count - 1 : 0; + }; + + for (unsigned i = 0; i < count; ++i) { + agx_ppp_push(&ppp, REGION_CLIP, cfg) { + cfg.enable = true; + cfg.min_x = minx[i] / 32; + cfg.min_y = miny[i] / 32; + cfg.max_x = DIV_ROUND_UP(MAX2(maxx[i], 1), 32); + cfg.max_y = DIV_ROUND_UP(MAX2(maxy[i], 1), 32); + } + } + + agx_ppp_push(&ppp, VIEWPORT_CONTROL, cfg) + ; + + /* Upload viewports */ + for (unsigned i = 0; i < count; ++i) { + const VkViewport *vp = &dyn->vp.viewports[i]; + + agx_ppp_push(&ppp, VIEWPORT, cfg) { + cfg.translate_x = vp->x + 0.5f * vp->width; + cfg.translate_y = vp->y + 0.5f * vp->height; + cfg.translate_z = vp->minDepth; + + cfg.scale_x = vp->width * 0.5f; + cfg.scale_y = vp->height * 0.5f; + cfg.scale_z = vp->maxDepth - vp->minDepth; + } + } + + agx_ppp_fini(out, &ppp); +} + +static enum agx_object_type +translate_object_type(enum mesa_prim topology) +{ + static_assert(MESA_PRIM_LINES < MESA_PRIM_LINE_STRIP); + static_assert(MESA_PRIM_TRIANGLES >= MESA_PRIM_LINE_STRIP); + + if (topology == MESA_PRIM_POINTS) + return AGX_OBJECT_TYPE_POINT_SPRITE_UV01; + else if (topology <= MESA_PRIM_LINE_STRIP) + return AGX_OBJECT_TYPE_LINE; + else + return AGX_OBJECT_TYPE_TRIANGLE; +} + +static enum agx_primitive +translate_hw_primitive_topology(enum mesa_prim prim) +{ + switch (prim) { + case MESA_PRIM_POINTS: + return AGX_PRIMITIVE_POINTS; + case MESA_PRIM_LINES: + return AGX_PRIMITIVE_LINES; + case MESA_PRIM_LINE_STRIP: + return AGX_PRIMITIVE_LINE_STRIP; + case MESA_PRIM_TRIANGLES: + return AGX_PRIMITIVE_TRIANGLES; + case MESA_PRIM_TRIANGLE_STRIP: + return AGX_PRIMITIVE_TRIANGLE_STRIP; + case MESA_PRIM_TRIANGLE_FAN: + return AGX_PRIMITIVE_TRIANGLE_FAN; + default: + unreachable("Invalid hardware primitive topology"); + } +} + +static inline enum agx_vdm_vertex +translate_vdm_vertex(unsigned vtx) +{ + static_assert(AGX_VDM_VERTEX_0 == 0); + static_assert(AGX_VDM_VERTEX_1 == 1); + static_assert(AGX_VDM_VERTEX_2 == 2); + + assert(vtx <= 2); + return vtx; +} + +static inline enum agx_ppp_vertex +translate_ppp_vertex(unsigned vtx) +{ + static_assert(AGX_PPP_VERTEX_0 == 0 + 1); + static_assert(AGX_PPP_VERTEX_1 == 1 + 1); + static_assert(AGX_PPP_VERTEX_2 == 2 + 1); + + assert(vtx <= 2); + return vtx + 1; +} + +static void +hk_flush_index(struct hk_cmd_buffer *cmd, struct hk_cs *cs) +{ + uint8_t *out = cs->current; + agx_push(out, VDM_STATE, cfg) { + cfg.restart_index_present = true; + } + + agx_push(out, VDM_STATE_RESTART_INDEX, cfg) { + if (cmd->state.gfx.shaders[MESA_SHADER_GEOMETRY]) + cfg.value = BITFIELD_MASK(32); + else + cfg.value = cmd->state.gfx.index.restart; + } + + cs->current = out; +} + +/* + * Return the given sample positions, packed into a 32-bit word with fixed + * point nibbles for each x/y component of the (at most 4) samples. This is + * suitable for programming the PPP_MULTISAMPLECTL control register. + */ +static uint32_t +hk_pack_ppp_multisamplectrl(const struct vk_sample_locations_state *sl) +{ + uint32_t ctrl = 0; + + for (int32_t i = sl->per_pixel - 1; i >= 0; i--) { + VkSampleLocationEXT loc = sl->locations[i]; + + uint32_t x = CLAMP(loc.x, 0.0f, 0.9375f) * 16.0; + uint32_t y = CLAMP(loc.y, 0.0f, 0.9375f) * 16.0; + + assert(x <= 15); + assert(y <= 15); + + /* Push bytes in reverse order so we can use constant shifts. */ + ctrl = (ctrl << 8) | (y << 4) | x; + } + + return ctrl; +} + +/* + * Return the standard sample positions, prepacked as above for efficiency. 
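+ *
+ * For example, the standard Vulkan 4x pattern (0.375, 0.125), (0.875, 0.375),
+ * (0.125, 0.625), (0.625, 0.875) becomes (6, 2), (14, 6), (2, 10), (10, 14)
+ * in 1/16 units; sample i packs into byte i with x in the low nibble, giving
+ * the 0xeaa26e26 word returned below for 4 samples.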
+ */ +uint32_t +hk_default_sample_positions(unsigned nr_samples) +{ + switch (nr_samples) { + case 0: + case 1: + return 0x88; + case 2: + return 0x44cc; + case 4: + return 0xeaa26e26; + default: + unreachable("Invalid sample count"); + } +} + +static void +hk_flush_ppp_state(struct hk_cmd_buffer *cmd, struct hk_cs *cs, uint8_t **out) +{ + const struct hk_rendering_state *render = &cmd->state.gfx.render; + struct vk_dynamic_graphics_state *dyn = &cmd->vk.dynamic_graphics_state; + + struct hk_graphics_state *gfx = &cmd->state.gfx; + struct hk_shader *hw_vs = hk_bound_hw_vs(gfx); + struct hk_shader *fs = hk_only_variant(gfx->shaders[MESA_SHADER_FRAGMENT]); + + bool hw_vs_dirty = IS_SHADER_DIRTY(VERTEX) || IS_SHADER_DIRTY(TESS_EVAL) || + IS_SHADER_DIRTY(GEOMETRY); + bool fs_dirty = IS_SHADER_DIRTY(FRAGMENT); + + struct hk_linked_shader *linked_fs = gfx->linked[MESA_SHADER_FRAGMENT]; + bool linked_fs_dirty = IS_LINKED_DIRTY(FRAGMENT); + + bool varyings_dirty = gfx->dirty & HK_DIRTY_VARYINGS; + + bool face_dirty = + IS_DIRTY(DS_DEPTH_TEST_ENABLE) || IS_DIRTY(DS_DEPTH_WRITE_ENABLE) || + IS_DIRTY(DS_DEPTH_COMPARE_OP) || IS_DIRTY(DS_STENCIL_REFERENCE) || + IS_DIRTY(RS_LINE_WIDTH) || IS_DIRTY(RS_POLYGON_MODE) || fs_dirty; + + bool stencil_face_dirty = + IS_DIRTY(DS_STENCIL_OP) || IS_DIRTY(DS_STENCIL_COMPARE_MASK) || + IS_DIRTY(DS_STENCIL_WRITE_MASK) || IS_DIRTY(DS_STENCIL_TEST_ENABLE); + + struct AGX_PPP_HEADER dirty = { + .fragment_control = + IS_DIRTY(DS_STENCIL_TEST_ENABLE) || IS_DIRTY(IA_PRIMITIVE_TOPOLOGY) || + IS_DIRTY(RS_DEPTH_BIAS_ENABLE) || gfx->dirty & HK_DIRTY_OCCLUSION, + + .fragment_control_2 = + IS_DIRTY(RS_RASTERIZER_DISCARD_ENABLE) || linked_fs_dirty, + + .fragment_front_face = face_dirty, + .fragment_front_face_2 = fs_dirty || IS_DIRTY(IA_PRIMITIVE_TOPOLOGY), + .fragment_front_stencil = stencil_face_dirty, + .fragment_back_face = face_dirty, + .fragment_back_face_2 = fs_dirty || IS_DIRTY(IA_PRIMITIVE_TOPOLOGY), + .fragment_back_stencil = stencil_face_dirty, + .output_select = hw_vs_dirty || linked_fs_dirty || varyings_dirty, + .varying_counts_32 = varyings_dirty, + .varying_counts_16 = varyings_dirty, + .cull = + IS_DIRTY(RS_CULL_MODE) || IS_DIRTY(RS_RASTERIZER_DISCARD_ENABLE) || + IS_DIRTY(RS_FRONT_FACE) || IS_DIRTY(RS_DEPTH_CLIP_ENABLE) || + IS_DIRTY(RS_DEPTH_CLAMP_ENABLE) || IS_DIRTY(RS_LINE_MODE) || + IS_DIRTY(IA_PRIMITIVE_TOPOLOGY) || (gfx->dirty & HK_DIRTY_PROVOKING), + .cull_2 = varyings_dirty, + + /* With a null FS, the fragment shader PPP word is ignored and doesn't + * need to be present. + */ + .fragment_shader = fs && (fs_dirty || linked_fs_dirty || varyings_dirty || + gfx->descriptors.root_dirty), + + .occlusion_query = gfx->dirty & HK_DIRTY_OCCLUSION, + .output_size = hw_vs_dirty, + .viewport_count = 1, /* irrelevant */ + }; + + /* Calculate the update size. If it equals the header, there is nothing to + * update so early-exit. + */ + size_t size = agx_ppp_update_size(&dirty); + if (size == AGX_PPP_HEADER_LENGTH) + return; + + /* Otherwise, allocate enough space for the update and push it. */ + assert(size > AGX_PPP_HEADER_LENGTH); + + struct agx_ptr T = hk_pool_alloc(cmd, size, 64); + if (!T.cpu) + return; + + struct agx_ppp_update ppp = agx_new_ppp_update(T, size, &dirty); + + if (dirty.fragment_control) { + agx_ppp_push(&ppp, FRAGMENT_CONTROL, cfg) { + cfg.visibility_mode = gfx->occlusion.mode; + cfg.stencil_test_enable = hk_stencil_test_enabled(cmd); + + /* TODO: Consider optimizing this? 
*/ + cfg.two_sided_stencil = cfg.stencil_test_enable; + + cfg.depth_bias_enable = dyn->rs.depth_bias.enable && + gfx->object_type == AGX_OBJECT_TYPE_TRIANGLE; + + /* Always enable scissoring so we may scissor to the viewport (TODO: + * optimize this out if the viewport is the default and the app does + * not use the scissor test) + */ + cfg.scissor_enable = true; + + /* This avoids broken derivatives along primitive edges */ + cfg.disable_tri_merging = gfx->object_type != AGX_OBJECT_TYPE_TRIANGLE; + } + } + + if (dirty.fragment_control_2) { + if (linked_fs) { + /* Annoying, rasterizer_discard seems to be ignored (sometimes?) in the + * main fragment control word and has to be combined into the secondary + * word for reliable behaviour. + */ + agx_ppp_push_merged(&ppp, FRAGMENT_CONTROL, cfg, + linked_fs->b.fragment_control) { + + cfg.tag_write_disable = dyn->rs.rasterizer_discard_enable; + } + } else { + /* If there is no fragment shader, we must disable tag writes to avoid + * executing the missing shader. This optimizes depth-only passes. + */ + agx_ppp_push(&ppp, FRAGMENT_CONTROL, cfg) { + cfg.tag_write_disable = true; + cfg.pass_type = AGX_PASS_TYPE_OPAQUE; + } + } + } + + struct agx_fragment_face_packed fragment_face; + struct agx_fragment_face_2_packed fragment_face_2; + + if (dirty.fragment_front_face) { + bool has_z = render->depth_att.vk_format != VK_FORMAT_UNDEFINED; + bool z_test = has_z && dyn->ds.depth.test_enable; + + agx_pack(&fragment_face, FRAGMENT_FACE, cfg) { + cfg.line_width = agx_pack_line_width(dyn->rs.line.width); + cfg.polygon_mode = translate_polygon_mode(dyn->rs.polygon_mode); + cfg.disable_depth_write = !(z_test && dyn->ds.depth.write_enable); + + if (z_test && !gfx->descriptors.root.draw.force_never_in_shader) + cfg.depth_function = translate_compare_op(dyn->ds.depth.compare_op); + else + cfg.depth_function = AGX_ZS_FUNC_ALWAYS; + }; + + agx_ppp_push_merged(&ppp, FRAGMENT_FACE, cfg, fragment_face) { + cfg.stencil_reference = dyn->ds.stencil.front.reference; + } + } + + if (dirty.fragment_front_face_2) { + agx_pack(&fragment_face_2, FRAGMENT_FACE_2, cfg) { + cfg.object_type = gfx->object_type; + + /* TODO: flip the default? 
*/ + if (fs) + cfg.conservative_depth = 0; + } + + if (fs) + agx_merge(fragment_face_2, fs->frag_face, FRAGMENT_FACE_2); + + agx_ppp_push_packed(&ppp, &fragment_face_2, FRAGMENT_FACE_2); + } + + if (dirty.fragment_front_stencil) { + hk_ppp_push_stencil_face(&ppp, dyn->ds.stencil.front, + hk_stencil_test_enabled(cmd)); + } + + if (dirty.fragment_back_face) { + assert(dirty.fragment_front_face); + + agx_ppp_push_merged(&ppp, FRAGMENT_FACE, cfg, fragment_face) { + cfg.stencil_reference = dyn->ds.stencil.back.reference; + } + } + + if (dirty.fragment_back_face_2) { + assert(dirty.fragment_front_face_2); + + agx_ppp_push_packed(&ppp, &fragment_face_2, FRAGMENT_FACE_2); + } + + if (dirty.fragment_back_stencil) { + hk_ppp_push_stencil_face(&ppp, dyn->ds.stencil.back, + hk_stencil_test_enabled(cmd)); + } + + if (dirty.output_select) { + struct agx_output_select_packed osel = hw_vs->info.uvs.osel; + + if (linked_fs) { + agx_ppp_push_merged_blobs(&ppp, AGX_OUTPUT_SELECT_LENGTH, &osel, + &linked_fs->b.osel); + } else { + agx_ppp_push_packed(&ppp, &osel, OUTPUT_SELECT); + } + } + + assert(dirty.varying_counts_32 == dirty.varying_counts_16); + + if (dirty.varying_counts_32) { + agx_ppp_push_packed(&ppp, &gfx->linked_varyings.counts_32, + VARYING_COUNTS); + + agx_ppp_push_packed(&ppp, &gfx->linked_varyings.counts_16, + VARYING_COUNTS); + } + + if (dirty.cull) { + agx_ppp_push(&ppp, CULL, cfg) { + cfg.cull_front = dyn->rs.cull_mode & VK_CULL_MODE_FRONT_BIT; + cfg.cull_back = dyn->rs.cull_mode & VK_CULL_MODE_BACK_BIT; + cfg.front_face_ccw = dyn->rs.front_face != VK_FRONT_FACE_CLOCKWISE; + cfg.flat_shading_vertex = translate_ppp_vertex(gfx->provoking); + cfg.rasterizer_discard = dyn->rs.rasterizer_discard_enable; + + /* We do not support unrestricted depth, so clamping is inverted from + * clipping. This implementation seems to pass CTS without unrestricted + * depth support. + * + * TODO: Make sure this is right with gl_FragDepth. + */ + cfg.depth_clip = vk_rasterization_state_depth_clip_enable(&dyn->rs); + cfg.depth_clamp = !cfg.depth_clip; + + cfg.primitive_msaa = + gfx->object_type == AGX_OBJECT_TYPE_LINE && + dyn->rs.line.mode == VK_LINE_RASTERIZATION_MODE_BRESENHAM_KHR; + } + } + + if (dirty.cull_2) { + agx_ppp_push(&ppp, CULL_2, cfg) { + cfg.needs_primitive_id = gfx->generate_primitive_id; + } + } + + if (dirty.fragment_shader) { + /* TODO: Do less often? 
*/ + hk_reserve_scratch(cmd, cs, fs); + + agx_ppp_push_packed(&ppp, &linked_fs->fs_counts, FRAGMENT_SHADER_WORD_0); + + agx_ppp_push(&ppp, FRAGMENT_SHADER_WORD_1, cfg) { + cfg.pipeline = hk_upload_usc_words(cmd, fs, linked_fs); + } + + agx_ppp_push(&ppp, FRAGMENT_SHADER_WORD_2, cfg) { + cfg.cf_bindings = gfx->varyings; + } + + agx_ppp_push(&ppp, FRAGMENT_SHADER_WORD_3, cfg) + ; + } + + if (dirty.occlusion_query) { + agx_ppp_push(&ppp, FRAGMENT_OCCLUSION_QUERY, cfg) { + cfg.index = gfx->occlusion.index; + } + } + + if (dirty.output_size) { + agx_ppp_push(&ppp, OUTPUT_SIZE, cfg) { + cfg.count = hw_vs->info.uvs.size; + } + } + + agx_ppp_fini(out, &ppp); +} + +static void +hk_flush_dynamic_state(struct hk_cmd_buffer *cmd, struct hk_cs *cs, + uint32_t draw_id, struct hk_draw draw) +{ + struct hk_device *dev = hk_cmd_buffer_device(cmd); + const struct hk_rendering_state *render = &cmd->state.gfx.render; + struct vk_dynamic_graphics_state *dyn = &cmd->vk.dynamic_graphics_state; + + struct hk_graphics_state *gfx = &cmd->state.gfx; + + struct hk_shader *hw_vs = hk_bound_hw_vs(gfx); + struct hk_shader *sw_vs = hk_bound_sw_vs(gfx); + + if (!vk_dynamic_graphics_state_any_dirty(dyn) && + !(gfx->dirty & ~HK_DIRTY_INDEX) && !gfx->descriptors.root_dirty && + !gfx->shaders_dirty && !sw_vs->b.info.uses_draw_id && + !sw_vs->b.info.uses_base_param && + !(gfx->linked[MESA_SHADER_VERTEX] && + gfx->linked[MESA_SHADER_VERTEX]->b.uses_base_param)) + return; + + struct hk_descriptor_state *desc = &cmd->state.gfx.descriptors; + + assert(cs->current + 0x1000 < cs->end && "already ensured space"); + uint8_t *out = cs->current; + + struct hk_shader *fs = hk_only_variant(gfx->shaders[MESA_SHADER_FRAGMENT]); + + bool gt_dirty = IS_SHADER_DIRTY(TESS_CTRL) || IS_SHADER_DIRTY(TESS_EVAL) || + IS_SHADER_DIRTY(GEOMETRY); + bool vgt_dirty = IS_SHADER_DIRTY(VERTEX) || gt_dirty; + bool fs_dirty = IS_SHADER_DIRTY(FRAGMENT); + + if (IS_DIRTY(CB_BLEND_CONSTANTS)) { + static_assert(sizeof(desc->root.draw.blend_constant) == + sizeof(dyn->cb.blend_constants) && + "common size"); + + memcpy(desc->root.draw.blend_constant, dyn->cb.blend_constants, + sizeof(dyn->cb.blend_constants)); + desc->root_dirty = true; + } + + if (IS_DIRTY(MS_SAMPLE_MASK)) { + desc->root.draw.api_sample_mask = dyn->ms.sample_mask; + desc->root_dirty = true; + } + + if (fs_dirty || IS_DIRTY(DS_DEPTH_TEST_ENABLE) || + IS_DIRTY(DS_DEPTH_COMPARE_OP)) { + + const struct hk_rendering_state *render = &cmd->state.gfx.render; + bool has_z = render->depth_att.vk_format != VK_FORMAT_UNDEFINED; + bool z_test = has_z && dyn->ds.depth.test_enable; + + desc->root.draw.force_never_in_shader = + z_test && dyn->ds.depth.compare_op == VK_COMPARE_OP_NEVER && fs && + fs->info.fs.writes_memory; + + desc->root_dirty = true; + } + + /* The main shader must not run tests if the epilog will. */ + bool nontrivial_force_early = + fs && (fs->b.info.early_fragment_tests && + (fs->b.info.writes_sample_mask || fs->info.fs.writes_memory)); + + bool epilog_discards = dyn->ms.alpha_to_coverage_enable || + (fs && (fs->info.fs.epilog_key.write_z || + fs->info.fs.epilog_key.write_s)); + epilog_discards &= !nontrivial_force_early; + + if (fs_dirty || IS_DIRTY(MS_ALPHA_TO_COVERAGE_ENABLE)) { + desc->root.draw.no_epilog_discard = !epilog_discards ? 
~0 : 0; + desc->root_dirty = true; + } + + if (IS_DIRTY(VI) || IS_DIRTY(VI_BINDINGS_VALID) || + IS_DIRTY(VI_BINDING_STRIDES) || vgt_dirty || true /* TODO */) { + + struct hk_fast_link_key_vs key = { + .prolog.hw = (sw_vs == hw_vs), + + /* FIXME: handle pipeline robustness "properly" */ + .prolog.robustness.level = + (dev->vk.enabled_features.robustBufferAccess2 || + dev->vk.enabled_features.pipelineRobustness) + ? AGX_ROBUSTNESS_D3D + : AGX_ROBUSTNESS_GL, + + .prolog.robustness.soft_fault = false /*TODO*/, + }; + + if (!key.prolog.hw) { + key.prolog.sw_index_size_B = + draw.indexed ? agx_index_size_to_B(draw.index_size) : 0; + } + + static_assert(sizeof(key.prolog.component_mask) == + sizeof(sw_vs->info.vs.attrib_components_read)); + BITSET_COPY(key.prolog.component_mask, + sw_vs->info.vs.attrib_components_read); + + u_foreach_bit(a, dyn->vi->attributes_valid) { + struct vk_vertex_attribute_state attr = dyn->vi->attributes[a]; + + assert(dyn->vi->bindings_valid & BITFIELD_BIT(attr.binding)); + struct vk_vertex_binding_state binding = + dyn->vi->bindings[attr.binding]; + + /* nir_assign_io_var_locations compacts vertex inputs, eliminating + * unused inputs. We need to do the same here to match the locations. + */ + unsigned slot = + util_bitcount64(sw_vs->info.vs.attribs_read & BITFIELD_MASK(a)); + + key.prolog.attribs[slot] = (struct agx_velem_key){ + .format = vk_format_to_pipe_format(attr.format), + .stride = dyn->vi_binding_strides[attr.binding], + .divisor = binding.divisor, + .instanced = binding.input_rate == VK_VERTEX_INPUT_RATE_INSTANCE, + }; + } + + hk_update_fast_linked(cmd, sw_vs, &key); + } + + if (IS_DIRTY(VI) || IS_DIRTY(VI_BINDINGS_VALID) || vgt_dirty || + (gfx->dirty & HK_DIRTY_VB)) { + + uint64_t sink = dev->rodata.zero_sink; + + unsigned slot = 0; + u_foreach_bit(a, sw_vs->info.vs.attribs_read) { + if (dyn->vi->attributes_valid & BITFIELD_BIT(a)) { + struct vk_vertex_attribute_state attr = dyn->vi->attributes[a]; + struct hk_addr_range vb = gfx->vb[attr.binding]; + + desc->root.draw.attrib_clamps[slot] = agx_calculate_vbo_clamp( + vb.addr, sink, vk_format_to_pipe_format(attr.format), vb.range, + dyn->vi_binding_strides[attr.binding], attr.offset, + &desc->root.draw.attrib_base[slot]); + } else { + desc->root.draw.attrib_base[slot] = sink; + desc->root.draw.attrib_clamps[slot] = 0; + } + + ++slot; + } + + desc->root_dirty = true; + } + + if (vgt_dirty || IS_SHADER_DIRTY(FRAGMENT) || + IS_DIRTY(MS_RASTERIZATION_SAMPLES) || IS_DIRTY(MS_SAMPLE_MASK) || + IS_DIRTY(MS_ALPHA_TO_COVERAGE_ENABLE) || + IS_DIRTY(MS_ALPHA_TO_ONE_ENABLE) || IS_DIRTY(CB_LOGIC_OP) || + IS_DIRTY(CB_LOGIC_OP_ENABLE) || IS_DIRTY(CB_WRITE_MASKS) || + IS_DIRTY(CB_COLOR_WRITE_ENABLES) || IS_DIRTY(CB_ATTACHMENT_COUNT) || + IS_DIRTY(CB_BLEND_ENABLES) || IS_DIRTY(CB_BLEND_EQUATIONS) || + IS_DIRTY(CB_BLEND_CONSTANTS) || + desc->root_dirty /* for pipeline stats */ || true) { + + if (fs) { + unsigned samples_shaded = 0; + if (fs->info.fs.epilog_key.sample_shading) + samples_shaded = dyn->ms.rasterization_samples; + + unsigned tib_sample_mask = + BITFIELD_MASK(dyn->ms.rasterization_samples); + unsigned api_sample_mask = dyn->ms.sample_mask & tib_sample_mask; + bool has_sample_mask = api_sample_mask != tib_sample_mask; + + struct hk_fast_link_key_fs key = { + .prolog.statistics = hk_pipeline_stat_addr( + cmd, + VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT), + + .prolog.cull_distance_size = + hw_vs->info.vs.cull_distance_array_size, + .prolog.api_sample_mask = has_sample_mask ? 
api_sample_mask : 0xff, + .nr_samples_shaded = samples_shaded, + }; + + bool prolog_discards = + has_sample_mask || key.prolog.cull_distance_size; + + bool needs_prolog = key.prolog.statistics || prolog_discards; + + if (needs_prolog) { + /* With late main shader tests, the prolog runs tests if neither the + * main shader nor epilog will. + * + * With (nontrivial) early main shader tests, the prolog does not + * run tests, the tests will run at the start of the main shader. + * This ensures tests are after API sample mask and cull distance + * discards. + */ + key.prolog.run_zs_tests = !nontrivial_force_early && + !fs->b.info.writes_sample_mask && + !epilog_discards && prolog_discards; + + if (key.prolog.cull_distance_size) { + key.prolog.cf_base = fs->b.info.varyings.fs.nr_cf; + } + } + + key.epilog = (struct agx_fs_epilog_key){ + .link = fs->info.fs.epilog_key, + .nr_samples = MAX2(dyn->ms.rasterization_samples, 1), + .blend.alpha_to_coverage = dyn->ms.alpha_to_coverage_enable, + .blend.alpha_to_one = dyn->ms.alpha_to_one_enable, + .blend.logicop_func = dyn->cb.logic_op_enable + ? vk_logic_op_to_pipe(dyn->cb.logic_op) + : PIPE_LOGICOP_COPY, + }; + + key.epilog.link.already_ran_zs |= nontrivial_force_early; + + struct hk_rendering_state *render = &cmd->state.gfx.render; + for (uint32_t i = 0; i < render->color_att_count; i++) { + key.epilog.rt_formats[i] = + vk_format_to_pipe_format(render->color_att[i].vk_format); + + const struct vk_color_blend_attachment_state *cb = + &dyn->cb.attachments[i]; + + bool write_enable = dyn->cb.color_write_enables & BITFIELD_BIT(i); + unsigned write_mask = write_enable ? cb->write_mask : 0; + + /* nir_lower_blend always blends, so use a default blend state when + * blending is disabled at an API level. + */ + if (!dyn->cb.attachments[i].blend_enable) { + key.epilog.blend.rt[i] = (struct agx_blend_rt_key){ + .colormask = write_mask, + .rgb_func = PIPE_BLEND_ADD, + .alpha_func = PIPE_BLEND_ADD, + .rgb_src_factor = PIPE_BLENDFACTOR_ONE, + .alpha_src_factor = PIPE_BLENDFACTOR_ONE, + .rgb_dst_factor = PIPE_BLENDFACTOR_ZERO, + .alpha_dst_factor = PIPE_BLENDFACTOR_ZERO, + }; + } else { + key.epilog.blend.rt[i] = (struct agx_blend_rt_key){ + .colormask = write_mask, + + .rgb_src_factor = + vk_blend_factor_to_pipe(cb->src_color_blend_factor), + + .rgb_dst_factor = + vk_blend_factor_to_pipe(cb->dst_color_blend_factor), + + .rgb_func = vk_blend_op_to_pipe(cb->color_blend_op), + + .alpha_src_factor = + vk_blend_factor_to_pipe(cb->src_alpha_blend_factor), + + .alpha_dst_factor = + vk_blend_factor_to_pipe(cb->dst_alpha_blend_factor), + + .alpha_func = vk_blend_op_to_pipe(cb->alpha_blend_op), + }; + } + } + + hk_update_fast_linked(cmd, fs, &key); + } else { + /* TODO: prolog without fs needs to work too... */ + if (cmd->state.gfx.linked[MESA_SHADER_FRAGMENT] != NULL) { + cmd->state.gfx.linked_dirty |= BITFIELD_BIT(MESA_SHADER_FRAGMENT); + cmd->state.gfx.linked[MESA_SHADER_FRAGMENT] = NULL; + } + } + } + + /* If the vertex shader uses draw parameters, vertex uniforms are dirty every + * draw. Fragment uniforms are unaffected. + * + * For a direct draw, we upload the draw parameters as-if indirect to + * avoid keying to indirectness. 
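+ *
+ * Concretely: a direct draw uploads the pair {vertexOffset or firstVertex,
+ * firstInstance}, while an indirect draw instead points draw_params at the
+ * matching tail of VkDrawIndirectCommand / VkDrawIndexedIndirectCommand in
+ * the indirect buffer, so base vertex/instance is read the same way in both
+ * cases.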
+ */ + if (gfx->linked[MESA_SHADER_VERTEX]->b.uses_base_param) { + if (draw.b.indirect) { + gfx->draw_params = draw.b.ptr; + + if (draw.indexed) { + gfx->draw_params += + offsetof(VkDrawIndexedIndirectCommand, vertexOffset); + } else { + gfx->draw_params += offsetof(VkDrawIndirectCommand, firstVertex); + } + } else { + uint32_t params[] = { + draw.indexed ? draw.index_bias : draw.start, + draw.start_instance, + }; + + gfx->draw_params = hk_pool_upload(cmd, params, sizeof(params), 4); + } + } else { + gfx->draw_params = 0; + } + + if (sw_vs->b.info.uses_draw_id) { + /* TODO: rodata? */ + gfx->draw_id_ptr = hk_pool_upload(cmd, &draw_id, 2, 4); + } else { + gfx->draw_id_ptr = 0; + } + + if (IS_DIRTY(IA_PRIMITIVE_TOPOLOGY) || gt_dirty) { + enum mesa_prim prim = hk_rast_prim(cmd); + + gfx->topology = translate_hw_primitive_topology(prim); + gfx->object_type = translate_object_type(prim); + } + + if (IS_DIRTY(IA_PRIMITIVE_TOPOLOGY) || IS_DIRTY(RS_PROVOKING_VERTEX)) { + unsigned provoking; + if (dyn->rs.provoking_vertex == VK_PROVOKING_VERTEX_MODE_LAST_VERTEX_EXT) + provoking = 2; + else if (gfx->topology == AGX_PRIMITIVE_TRIANGLE_FAN) + provoking = 1; + else + provoking = 0; + + if (provoking != gfx->provoking) { + gfx->provoking = provoking; + gfx->dirty |= HK_DIRTY_PROVOKING; + + gfx->descriptors.root.draw.provoking = provoking; + gfx->descriptors.root_dirty = true; + } + } + + /* With attachmentless rendering, we don't know the sample count until draw + * time, so we do a late tilebuffer fix up. But with rasterizer discard, + * rasterization_samples might be 0. + */ + if (dyn->ms.rasterization_samples && + gfx->render.tilebuffer.nr_samples != dyn->ms.rasterization_samples) { + + assert(gfx->render.tilebuffer.nr_samples == 0); + + unsigned nr_samples = MAX2(dyn->ms.rasterization_samples, 1); + gfx->render.tilebuffer.nr_samples = nr_samples; + agx_tilebuffer_pack_usc(&gfx->render.tilebuffer); + cs->tib = gfx->render.tilebuffer; + } + + if (IS_DIRTY(MS_SAMPLE_LOCATIONS) || IS_DIRTY(MS_SAMPLE_LOCATIONS_ENABLE) || + IS_DIRTY(MS_RASTERIZATION_SAMPLES)) { + + uint32_t ctrl; + if (dyn->ms.sample_locations_enable) { + ctrl = hk_pack_ppp_multisamplectrl(dyn->ms.sample_locations); + } else { + ctrl = hk_default_sample_positions(dyn->ms.rasterization_samples); + } + + bool dont_commit = cmd->in_meta || dyn->ms.rasterization_samples == 0; + + if (!cs->has_sample_locations) { + cs->ppp_multisamplectl = ctrl; + + /* If we're in vk_meta, do not commit to the sample locations yet. + * vk_meta doesn't care, but the app will! + */ + cs->has_sample_locations |= !dont_commit; + } else { + assert(dont_commit || cs->ppp_multisamplectl == ctrl); + } + + gfx->descriptors.root.draw.ppp_multisamplectl = ctrl; + gfx->descriptors.root_dirty = true; + } + + /* Link varyings before uploading tessellation state, because the + * gfx->generate_primitive_id boolean needs to be plumbed. + */ + struct hk_linked_shader *linked_vs = gfx->linked[MESA_SHADER_VERTEX]; + struct hk_linked_shader *linked_fs = gfx->linked[MESA_SHADER_FRAGMENT]; + bool linked_vs_dirty = IS_LINKED_DIRTY(VERTEX); + bool linked_fs_dirty = IS_LINKED_DIRTY(FRAGMENT); + + if ((gfx->dirty & HK_DIRTY_PROVOKING) || vgt_dirty || linked_fs_dirty) { + unsigned bindings = linked_fs ?
linked_fs->b.cf.nr_bindings : 0; + if (bindings) { + size_t linkage_size = + AGX_CF_BINDING_HEADER_LENGTH + (bindings * AGX_CF_BINDING_LENGTH); + + struct agx_ptr t = hk_pool_usc_alloc(cmd, linkage_size, 16); + if (!t.cpu) + return; + + agx_link_varyings_vs_fs( + t.cpu, &gfx->linked_varyings, hw_vs->info.uvs.user_size, + &linked_fs->b.cf, gfx->provoking, 0, &gfx->generate_primitive_id); + + gfx->varyings = t.gpu; + } else { + gfx->varyings = 0; + } + + gfx->dirty |= HK_DIRTY_VARYINGS; + } + + if (gfx->shaders[MESA_SHADER_TESS_EVAL] || + gfx->shaders[MESA_SHADER_GEOMETRY]) { + + struct hk_shader *vs = hk_bound_sw_vs(gfx); + desc->root.draw.vertex_outputs = vs->b.info.outputs; + + /* XXX: We should deduplicate this logic */ + bool restart = (draw.indexed && draw.restart); + bool indirect = draw.b.indirect || restart; + + desc->root.draw.input_assembly = + indirect ? hk_pool_alloc(cmd, sizeof(struct agx_ia_state), 4).gpu + : hk_upload_ia_params(cmd, draw); + + if (!indirect) { + uint32_t verts = draw.b.count[0], instances = draw.b.count[1]; + unsigned vb_size = + libagx_tcs_in_size(verts * instances, vs->b.info.outputs); + + /* Allocate if there are any outputs, or use the null sink to trap + * reads if there aren't. Those reads are undefined but should not + * fault. Affects: + * + * dEQP-VK.pipeline.monolithic.no_position.explicit_declarations.basic.single_view.v0_g1 + */ + desc->root.draw.vertex_output_buffer = + vb_size ? hk_pool_alloc(cmd, vb_size, 4).gpu + : dev->rodata.null_sink; + } + } + + if (gfx->shaders[MESA_SHADER_TESS_EVAL]) { + gfx->descriptors.root.draw.tess_params = hk_upload_tess_params(cmd, draw); + gfx->descriptors.root_dirty = true; + } + + if (gfx->shaders[MESA_SHADER_GEOMETRY]) { + /* TODO: size */ + cmd->geom_indirect = hk_pool_alloc(cmd, 64, 4).gpu; + + gfx->descriptors.root.draw.geometry_params = + hk_upload_geometry_params(cmd, draw); + + gfx->descriptors.root_dirty = true; + } + + /* Root must be uploaded after the above, which touch the root */ + if (gfx->descriptors.root_dirty) { + gfx->root = + hk_cmd_buffer_upload_root(cmd, VK_PIPELINE_BIND_POINT_GRAPHICS); + } + + /* Hardware dynamic state must be deferred until after the root and fast + * linking, since it will use the root address and the linked shaders. + */ + if ((gfx->dirty & (HK_DIRTY_PROVOKING | HK_DIRTY_VARYINGS)) || + IS_DIRTY(RS_RASTERIZER_DISCARD_ENABLE) || linked_vs_dirty || vgt_dirty || + gfx->descriptors.root_dirty || gfx->draw_id_ptr || gfx->draw_params) { + + /* TODO: Do less often? */ + hk_reserve_scratch(cmd, cs, hw_vs); + + agx_push(out, VDM_STATE, cfg) { + cfg.vertex_shader_word_0_present = true; + cfg.vertex_shader_word_1_present = true; + cfg.vertex_outputs_present = true; + cfg.vertex_unknown_present = true; + } + + agx_push_packed(out, hw_vs->counts, VDM_STATE_VERTEX_SHADER_WORD_0); + + struct hk_linked_shader *linked_hw_vs = + (hw_vs == sw_vs) ? 
linked_vs : hw_vs->only_linked; + + agx_push(out, VDM_STATE_VERTEX_SHADER_WORD_1, cfg) { + cfg.pipeline = hk_upload_usc_words(cmd, hw_vs, linked_hw_vs); + } + + agx_push_packed(out, hw_vs->info.uvs.vdm, VDM_STATE_VERTEX_OUTPUTS); + + agx_push(out, VDM_STATE_VERTEX_UNKNOWN, cfg) { + cfg.flat_shading_control = translate_vdm_vertex(gfx->provoking); + cfg.unknown_4 = cfg.unknown_5 = dyn->rs.rasterizer_discard_enable; + cfg.generate_primitive_id = gfx->generate_primitive_id; + } + + /* Pad up to a multiple of 8 bytes */ + memset(out, 0, 4); + out += 4; + } + + if (IS_DIRTY(RS_DEPTH_BIAS_FACTORS)) { + void *ptr = + util_dynarray_grow_bytes(&cs->depth_bias, 1, AGX_DEPTH_BIAS_LENGTH); + + agx_pack(ptr, DEPTH_BIAS, cfg) { + cfg.depth_bias = dyn->rs.depth_bias.constant; + cfg.slope_scale = dyn->rs.depth_bias.slope; + cfg.clamp = dyn->rs.depth_bias.clamp; + + /* Value from the PowerVR driver. */ + if (render->depth_att.vk_format == VK_FORMAT_D16_UNORM) { + cfg.depth_bias /= (1 << 15); + } + } + } + + /* Hardware viewport/scissor state is entangled with depth bias. */ + if (IS_DIRTY(RS_DEPTH_BIAS_FACTORS) || IS_DIRTY(VP_SCISSORS) || + IS_DIRTY(VP_SCISSOR_COUNT) || IS_DIRTY(VP_VIEWPORTS) || + IS_DIRTY(VP_VIEWPORT_COUNT) || + IS_DIRTY(VP_DEPTH_CLIP_NEGATIVE_ONE_TO_ONE) || + IS_DIRTY(RS_DEPTH_CLIP_ENABLE) || IS_DIRTY(RS_DEPTH_CLAMP_ENABLE)) { + + hk_flush_vp_state(cmd, cs, &out); + } + + hk_flush_ppp_state(cmd, cs, &out); + cs->current = out; + + vk_dynamic_graphics_state_clear_dirty(dyn); + gfx->shaders_dirty = 0; + gfx->linked_dirty = 0; + gfx->dirty = 0; + gfx->descriptors.root_dirty = false; +} + +static bool +hk_needs_index_robustness(struct hk_cmd_buffer *cmd, struct hk_draw draw) +{ + struct hk_graphics_state *gfx = &cmd->state.gfx; + struct hk_device *dev = hk_cmd_buffer_device(cmd); + + if (!draw.indexed) + return false; + + /* If tessellation is used, we'll go through the robust path anyway, don't + * end up with a tess+geom doom combo. + */ + if (gfx->shaders[MESA_SHADER_TESS_EVAL]) + return false; + + /* Allowed with maint6 without robustness features enabled */ + if (draw.index.range == 0) + return true; + + if (!(dev->vk.enabled_features.robustBufferAccess || + dev->vk.enabled_features.robustBufferAccess2 || + dev->vk.enabled_features.pipelineRobustness)) + return false; + + if (draw.b.indirect) { + return true; + } else { + uint32_t range_B = + (draw.start + draw.b.count[0]) * agx_index_size_to_B(draw.index_size); + + return range_B > draw.index.range; + } +} + +static void +hk_handle_passthrough_gs(struct hk_cmd_buffer *cmd, struct hk_draw draw) +{ + struct hk_graphics_state *gfx = &cmd->state.gfx; + struct hk_api_shader *gs = gfx->shaders[MESA_SHADER_GEOMETRY]; + + /* If there's an application geometry shader, there's nothing to un/bind */ + if (gs && !gs->is_passthrough) + return; + + /* Determine if we need a geometry shader to emulate XFB or adjacency */ + struct vk_dynamic_graphics_state *dyn = &cmd->vk.dynamic_graphics_state; + struct hk_shader *last_sw = hk_bound_sw_vs_before_gs(gfx); + uint32_t xfb_outputs = last_sw->info.xfb_info.output_count; + + VkPrimitiveTopology topology = dyn->ia.primitive_topology; + bool adjacency = + (topology == VK_PRIMITIVE_TOPOLOGY_LINE_LIST_WITH_ADJACENCY) || + (topology == VK_PRIMITIVE_TOPOLOGY_LINE_STRIP_WITH_ADJACENCY) || + (topology == VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY) || + (topology == VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP_WITH_ADJACENCY); + + /* TODO: Don't use a whole GS just for index robustness. 
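+ * Going through the passthrough GS reuses the geometry path's software
+ * index fetch, which already handles out-of-bounds indices, but it pays
+ * the full geometry pipeline overhead just for that.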
*/ + bool index_robustness = hk_needs_index_robustness(cmd, draw); + + bool needs_gs = xfb_outputs || adjacency || index_robustness; + + /* Various pipeline statistics are implemented in the pre-GS shader. TODO: + * This could easily be optimized. + */ + VkQueryPipelineStatisticFlagBits ia_statistics[] = { + VK_QUERY_PIPELINE_STATISTIC_INPUT_ASSEMBLY_PRIMITIVES_BIT, + VK_QUERY_PIPELINE_STATISTIC_CLIPPING_PRIMITIVES_BIT, + VK_QUERY_PIPELINE_STATISTIC_CLIPPING_INVOCATIONS_BIT, + }; + + bool ia_stats = false; + + for (unsigned i = 0; i < ARRAY_SIZE(ia_statistics); ++i) { + ia_stats |= hk_pipeline_stat_addr(cmd, ia_statistics[i]) != 0; + } + + needs_gs |= ia_stats; + + /* If we already have a matching GS configuration, we're done */ + if ((gs != NULL) == needs_gs) + return; + + /* If we don't need a GS but we do have a passthrough, unbind it */ + if (gs) { + assert(!needs_gs && gs->is_passthrough); + hk_cmd_bind_graphics_shader(cmd, MESA_SHADER_GEOMETRY, NULL); + return; + } + + /* Else, we need to bind a passthrough GS */ + size_t key_size = + sizeof(struct hk_passthrough_gs_key) + nir_xfb_info_size(xfb_outputs); + struct hk_passthrough_gs_key *key = alloca(key_size); + + *key = (struct hk_passthrough_gs_key){ + .prim = u_decomposed_prim(hk_gs_in_prim(cmd)), + .outputs = last_sw->b.info.outputs, + .clip_distance_array_size = last_sw->info.clip_distance_array_size, + .cull_distance_array_size = last_sw->info.cull_distance_array_size, + }; + + if (xfb_outputs) { + typed_memcpy(key->xfb_stride, last_sw->info.xfb_stride, + ARRAY_SIZE(key->xfb_stride)); + + memcpy(&key->xfb_info, &last_sw->info.xfb_info, + nir_xfb_info_size(xfb_outputs)); + } + + struct hk_device *dev = hk_cmd_buffer_device(cmd); + perf_debug(dev, "Binding passthrough GS for%s%s%s%s\n", + xfb_outputs ? " XFB" : "", adjacency ? " adjacency" : "", + index_robustness ? " robustness" : "", + ia_stats ? 
" statistics" : ""); + + gs = hk_meta_shader(dev, hk_nir_passthrough_gs, key, key_size); + gs->is_passthrough = true; + hk_cmd_bind_graphics_shader(cmd, MESA_SHADER_GEOMETRY, gs); +} + +static struct hk_cs * +hk_flush_gfx_state(struct hk_cmd_buffer *cmd, uint32_t draw_id, + struct hk_draw draw) +{ + struct hk_cs *cs = hk_cmd_buffer_get_cs(cmd, false /* compute */); + if (!cs) + return NULL; + + hk_ensure_cs_has_space(cmd, cs, 0x2000 /* TODO */); + + struct hk_graphics_state *gfx = &cmd->state.gfx; + struct hk_descriptor_state *desc = &gfx->descriptors; + struct hk_device *dev = hk_cmd_buffer_device(cmd); + +#ifndef NDEBUG + if (unlikely(dev->dev.debug & AGX_DBG_DIRTY)) { + hk_cmd_buffer_dirty_all(cmd); + } +#endif + + /* TODO: Try to reduce draw overhead of this */ + hk_handle_passthrough_gs(cmd, draw); + + hk_flush_shaders(cmd); + + if (desc->push_dirty) + hk_cmd_buffer_flush_push_descriptors(cmd, desc); + + if ((gfx->dirty & HK_DIRTY_INDEX) && + (gfx->index.restart || gfx->shaders[MESA_SHADER_GEOMETRY])) + hk_flush_index(cmd, cs); + + hk_flush_dynamic_state(cmd, cs, draw_id, draw); + return cs; +} + +VKAPI_ATTR void VKAPI_CALL +hk_CmdBindIndexBuffer2KHR(VkCommandBuffer commandBuffer, VkBuffer _buffer, + VkDeviceSize offset, VkDeviceSize size, + VkIndexType indexType) +{ + VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer); + VK_FROM_HANDLE(hk_buffer, buffer, _buffer); + + cmd->state.gfx.index = (struct hk_index_buffer_state){ + .buffer = hk_buffer_addr_range(buffer, offset, size), + .size = agx_translate_index_size(vk_index_type_to_bytes(indexType)), + .restart = vk_index_to_restart(indexType), + }; + + /* TODO: check if necessary, blob does this */ + cmd->state.gfx.index.buffer.range = + align(cmd->state.gfx.index.buffer.range, 4); + + cmd->state.gfx.dirty |= HK_DIRTY_INDEX; +} + +void +hk_cmd_bind_vertex_buffer(struct hk_cmd_buffer *cmd, uint32_t vb_idx, + struct hk_addr_range addr_range) +{ + cmd->state.gfx.vb[vb_idx] = addr_range; + cmd->state.gfx.dirty |= HK_DIRTY_VB; +} + +VKAPI_ATTR void VKAPI_CALL +hk_CmdBindVertexBuffers2(VkCommandBuffer commandBuffer, uint32_t firstBinding, + uint32_t bindingCount, const VkBuffer *pBuffers, + const VkDeviceSize *pOffsets, + const VkDeviceSize *pSizes, + const VkDeviceSize *pStrides) +{ + VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer); + + if (pStrides) { + vk_cmd_set_vertex_binding_strides(&cmd->vk, firstBinding, bindingCount, + pStrides); + } + + for (uint32_t i = 0; i < bindingCount; i++) { + VK_FROM_HANDLE(hk_buffer, buffer, pBuffers[i]); + uint32_t idx = firstBinding + i; + + uint64_t size = pSizes ? pSizes[i] : VK_WHOLE_SIZE; + const struct hk_addr_range addr_range = + hk_buffer_addr_range(buffer, pOffsets[i], size); + + hk_cmd_bind_vertex_buffer(cmd, idx, addr_range); + } +} + +static bool +hk_set_view_index(struct hk_cmd_buffer *cmd, uint32_t view_idx) +{ + if (cmd->state.gfx.render.view_mask) { + cmd->state.gfx.descriptors.root.draw.view_index = view_idx; + cmd->state.gfx.descriptors.root_dirty = true; + } + + return true; +} + +/* + * Iterator macro to duplicate a draw for each enabled view (when multiview is + * enabled, else always view 0). Along with hk_lower_multiview, this forms the + * world's worst multiview lowering. 
+ */ +#define hk_foreach_view(cmd) \ + u_foreach_bit(view_idx, cmd->state.gfx.render.view_mask ?: 1) \ + if (hk_set_view_index(cmd, view_idx)) + +static void +hk_ia_update(struct hk_cmd_buffer *cmd, struct hk_cs *cs, struct hk_draw draw, + uint64_t ia_vertices, uint64_t vs_invocations) +{ + /* XXX: stream link needed? */ + struct hk_device *dev = hk_cmd_buffer_device(cmd); + perf_debug(dev, "Input assembly counters"); + + struct agx_increment_ia_counters_key key = { + .index_size_B = draw.restart ? agx_index_size_to_B(draw.index_size) : 0, + }; + + uint64_t draw_ptr; + if (draw.b.indirect) { + draw_ptr = draw.b.ptr; + } else { + uint32_t desc[] = {draw.b.count[0], draw.b.count[1], 0}; + draw_ptr = hk_pool_upload(cmd, &desc, sizeof(desc), 4); + } + + struct libagx_increment_ia_counters args = { + .ia_vertices = ia_vertices, + .vs_invocations = vs_invocations, + .restart_index = cmd->state.gfx.index.restart, + .draw = draw_ptr, + .index_buffer = draw.index.addr, + .index_buffer_range_el = + key.index_size_B ? (draw.index.range / key.index_size_B) : 0, + }; + + uint64_t wg_size = key.index_size_B ? 1024 : 1; + + struct hk_shader *s = + hk_meta_kernel(dev, agx_nir_increment_ia_counters, &key, sizeof(key)); + + uint64_t push = hk_pool_upload(cmd, &args, sizeof(args), 8); + uint32_t usc = hk_upload_usc_words_kernel(cmd, s, &push, sizeof(push)); + + hk_dispatch_with_usc(dev, cs, s, usc, hk_grid(wg_size, 1, 1), + hk_grid(wg_size, 1, 1)); +} + +static void +hk_draw(struct hk_cmd_buffer *cmd, uint16_t draw_id, struct hk_draw draw_) +{ + const struct vk_dynamic_graphics_state *dyn = + &cmd->vk.dynamic_graphics_state; + + /* Filter trivial draws so we don't need to worry about null index buffers */ + if (!draw_.b.indirect && (draw_.b.count[0] == 0 || draw_.b.count[1] == 0)) + return; + + draw_.restart = dyn->ia.primitive_restart_enable; + draw_.index_size = cmd->state.gfx.index.size; + + uint64_t stat_ia_verts = hk_pipeline_stat_addr( + cmd, VK_QUERY_PIPELINE_STATISTIC_INPUT_ASSEMBLY_VERTICES_BIT); + + uint64_t stat_vs_inv = hk_pipeline_stat_addr( + cmd, VK_QUERY_PIPELINE_STATISTIC_VERTEX_SHADER_INVOCATIONS_BIT); + + bool ia_stats = stat_ia_verts || stat_vs_inv; + + hk_foreach_view(cmd) { + struct hk_draw draw = draw_; + struct hk_cs *cs = hk_flush_gfx_state(cmd, draw_id, draw); + /* If we failed to allocate a control stream, we've already lost the + * device. Just drop the draw so we don't crash. + */ + if (!cs) + return; + + bool geom = cmd->state.gfx.shaders[MESA_SHADER_GEOMETRY]; + bool tess = cmd->state.gfx.shaders[MESA_SHADER_TESS_EVAL]; + struct hk_cs *ccs = NULL; + uint8_t *out = cs->current; + assert(cs->current + 0x1000 < cs->end); + + if (geom || tess || ia_stats) { + ccs = + hk_cmd_buffer_get_cs_general(cmd, &cmd->current_cs.pre_gfx, true); + if (!ccs) + return; + } + + if (ia_stats) { + hk_ia_update(cmd, ccs, draw, stat_ia_verts, stat_vs_inv); + } + + if (tess) { + draw = hk_launch_tess(cmd, ccs, draw); + + if (draw.raw) { + assert(!geom); + assert(draw.b.indirect); + + agx_push(out, VDM_STREAM_LINK, cfg) { + cfg.target_lo = draw.b.ptr & BITFIELD_MASK(32); + cfg.target_hi = draw.b.ptr >> 32; + cfg.with_return = true; + } + + cs->current = out; + continue; + } + } + + if (geom) { + draw = hk_launch_gs_prerast(cmd, ccs, draw); + + /* We must not draw if the app specified rasterizer discard. This is + * required for both performance (it is pointless to rasterize and + * there are no side effects), but also correctness (no indirect draw + * descriptor will be filled out). 
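+ * Only the rasterization draw below is skipped; the pre-rasterization GS
+ * dispatch recorded above still runs, so its side effects are kept.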
+ */ + if (dyn->rs.rasterizer_discard_enable) + continue; + } + + uint64_t ib = draw.index.addr; + if (draw.indexed && !draw.b.indirect) + ib += (draw.start << draw.index_size); + + agx_push(out, INDEX_LIST, cfg) { + cfg.primitive = cmd->state.gfx.topology; + + if (draw.b.indirect) { + cfg.indirect_buffer_present = true; + } else { + cfg.instance_count_present = true; + cfg.index_count_present = true; + cfg.start_present = true; + } + + if (draw.indexed) { + cfg.restart_enable = draw.restart; + cfg.index_buffer_hi = ib >> 32; + cfg.index_size = draw.index_size; + + cfg.index_buffer_present = true; + cfg.index_buffer_size_present = true; + } + } + + if (draw.indexed) { + agx_push(out, INDEX_LIST_BUFFER_LO, cfg) { + cfg.buffer_lo = ib; + } + } + + if (draw.b.indirect) { + agx_push(out, INDEX_LIST_INDIRECT_BUFFER, cfg) { + cfg.address_hi = draw.b.ptr >> 32; + cfg.address_lo = draw.b.ptr & BITFIELD_MASK(32); + } + } else { + agx_push(out, INDEX_LIST_COUNT, cfg) { + cfg.count = draw.b.count[0]; + } + + agx_push(out, INDEX_LIST_INSTANCES, cfg) { + cfg.count = draw.b.count[1]; + } + + agx_push(out, INDEX_LIST_START, cfg) { + cfg.start = draw.indexed ? draw.index_bias : draw.start; + } + } + + if (draw.indexed) { + agx_push(out, INDEX_LIST_BUFFER_SIZE, cfg) { + cfg.size = draw.index.range; + } + } + + cs->current = out; + } +} + +VKAPI_ATTR void VKAPI_CALL +hk_CmdDraw(VkCommandBuffer commandBuffer, uint32_t vertexCount, + uint32_t instanceCount, uint32_t firstVertex, uint32_t firstInstance) +{ + VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer); + + struct hk_draw draw = { + .b = hk_grid(vertexCount, instanceCount, 1), + .start = firstVertex, + .start_instance = firstInstance, + }; + + hk_draw(cmd, 0, draw); +} + +VKAPI_ATTR void VKAPI_CALL +hk_CmdDrawMultiEXT(VkCommandBuffer commandBuffer, uint32_t drawCount, + const VkMultiDrawInfoEXT *pVertexInfo, + uint32_t instanceCount, uint32_t firstInstance, + uint32_t stride) +{ + VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer); + + for (unsigned i = 0; i < drawCount; ++i) { + struct hk_draw draw = { + .b = hk_grid(pVertexInfo->vertexCount, instanceCount, 1), + .start = pVertexInfo->firstVertex, + .start_instance = firstInstance, + }; + + hk_draw(cmd, i, draw); + pVertexInfo = ((void *)pVertexInfo) + stride; + } +} + +static void +hk_draw_indexed(VkCommandBuffer commandBuffer, uint16_t draw_id, + uint32_t indexCount, uint32_t instanceCount, + uint32_t firstIndex, int32_t vertexOffset, + uint32_t firstInstance) +{ + VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer); + + struct hk_draw draw = { + .b = hk_grid(indexCount, instanceCount, 1), + .indexed = true, + .index = cmd->state.gfx.index.buffer, + .start = firstIndex, + .index_bias = vertexOffset, + .start_instance = firstInstance, + }; + + hk_draw(cmd, draw_id, draw); +} + +VKAPI_ATTR void VKAPI_CALL +hk_CmdDrawIndexed(VkCommandBuffer commandBuffer, uint32_t indexCount, + uint32_t instanceCount, uint32_t firstIndex, + int32_t vertexOffset, uint32_t firstInstance) +{ + hk_draw_indexed(commandBuffer, 0, indexCount, instanceCount, firstIndex, + vertexOffset, firstInstance); +} + +VKAPI_ATTR void VKAPI_CALL +hk_CmdDrawMultiIndexedEXT(VkCommandBuffer commandBuffer, uint32_t drawCount, + const VkMultiDrawIndexedInfoEXT *pIndexInfo, + uint32_t instanceCount, uint32_t firstInstance, + uint32_t stride, const int32_t *pVertexOffset) +{ + for (unsigned i = 0; i < drawCount; ++i) { + const uint32_t vertex_offset = + pVertexOffset != NULL ? 
*pVertexOffset : pIndexInfo->vertexOffset; + + hk_draw_indexed(commandBuffer, i, pIndexInfo->indexCount, instanceCount, + pIndexInfo->firstIndex, vertex_offset, firstInstance); + + pIndexInfo = ((void *)pIndexInfo) + stride; + } +} + +VKAPI_ATTR void VKAPI_CALL +hk_CmdDrawIndirect(VkCommandBuffer commandBuffer, VkBuffer _buffer, + VkDeviceSize offset, uint32_t drawCount, uint32_t stride) +{ + VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer); + VK_FROM_HANDLE(hk_buffer, buffer, _buffer); + + /* From the Vulkan 1.3.238 spec: + * + * VUID-vkCmdDrawIndirect-drawCount-00476 + * + * "If drawCount is greater than 1, stride must be a multiple of 4 and + * must be greater than or equal to sizeof(VkDrawIndirectCommand)" + * + * and + * + * "If drawCount is less than or equal to one, stride is ignored." + */ + if (drawCount > 1) { + assert(stride % 4 == 0); + assert(stride >= sizeof(VkDrawIndirectCommand)); + } + + for (unsigned draw_id = 0; draw_id < drawCount; ++draw_id) { + uint64_t addr = hk_buffer_address(buffer, offset) + stride * draw_id; + hk_draw(cmd, draw_id, hk_draw_indirect(addr)); + } +} + +VKAPI_ATTR void VKAPI_CALL +hk_CmdDrawIndexedIndirect(VkCommandBuffer commandBuffer, VkBuffer _buffer, + VkDeviceSize offset, uint32_t drawCount, + uint32_t stride) +{ + VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer); + VK_FROM_HANDLE(hk_buffer, buffer, _buffer); + + /* From the Vulkan 1.3.238 spec: + * + * VUID-vkCmdDrawIndexedIndirect-drawCount-00528 + * + * "If drawCount is greater than 1, stride must be a multiple of 4 and + * must be greater than or equal to + * sizeof(VkDrawIndexedIndirectCommand)" + * + * and + * + * "If drawCount is less than or equal to one, stride is ignored." + */ + if (drawCount > 1) { + assert(stride % 4 == 0); + assert(stride >= sizeof(VkDrawIndexedIndirectCommand)); + } + + for (unsigned draw_id = 0; draw_id < drawCount; ++draw_id) { + uint64_t addr = hk_buffer_address(buffer, offset) + stride * draw_id; + + hk_draw( + cmd, draw_id, + hk_draw_indexed_indirect(addr, cmd->state.gfx.index.buffer, 0, 0)); + } +} + +VKAPI_ATTR void VKAPI_CALL +hk_CmdDrawIndirectCount(VkCommandBuffer commandBuffer, VkBuffer _buffer, + VkDeviceSize offset, VkBuffer countBuffer, + VkDeviceSize countBufferOffset, uint32_t maxDrawCount, + uint32_t stride) +{ + unreachable("TODO"); +} + +VKAPI_ATTR void VKAPI_CALL +hk_CmdDrawIndexedIndirectCount(VkCommandBuffer commandBuffer, VkBuffer _buffer, + VkDeviceSize offset, VkBuffer countBuffer, + VkDeviceSize countBufferOffset, + uint32_t maxDrawCount, uint32_t stride) +{ + unreachable("TODO"); +} + +VKAPI_ATTR void VKAPI_CALL +hk_CmdDrawIndirectByteCountEXT(VkCommandBuffer commandBuffer, + uint32_t instanceCount, uint32_t firstInstance, + VkBuffer counterBuffer, + VkDeviceSize counterBufferOffset, + uint32_t counterOffset, uint32_t vertexStride) +{ + unreachable("TODO"); +} + +VKAPI_ATTR void VKAPI_CALL +hk_CmdBindTransformFeedbackBuffersEXT(VkCommandBuffer commandBuffer, + uint32_t firstBinding, + uint32_t bindingCount, + const VkBuffer *pBuffers, + const VkDeviceSize *pOffsets, + const VkDeviceSize *pSizes) +{ + VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer); + struct hk_graphics_state *gfx = &cmd->state.gfx; + + for (uint32_t i = 0; i < bindingCount; i++) { + VK_FROM_HANDLE(hk_buffer, buffer, pBuffers[i]); + uint32_t idx = firstBinding + i; + uint64_t size = pSizes ? 
pSizes[i] : VK_WHOLE_SIZE; + + gfx->xfb[idx] = hk_buffer_addr_range(buffer, pOffsets[i], size); + } +} + +static void +hk_libagx_copy_xfb_counters(nir_builder *b, const void *key) +{ + b->shader->info.workgroup_size_variable = true; + + libagx_copy_xfb_counters(b, nir_load_preamble(b, 1, 64)); +} + +static void +hk_begin_end_xfb(VkCommandBuffer commandBuffer, uint32_t firstCounterBuffer, + uint32_t counterBufferCount, const VkBuffer *pCounterBuffers, + const VkDeviceSize *pCounterBufferOffsets, bool begin) + +{ + VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer); + struct hk_device *dev = hk_cmd_buffer_device(cmd); + struct hk_graphics_state *gfx = &cmd->state.gfx; + + gfx->xfb_enabled = begin; + + /* If we haven't reserved XFB offsets yet for the command buffer, do so. */ + if (!gfx->xfb_offsets) { + gfx->xfb_offsets = hk_pool_alloc(cmd, 4 * sizeof(uint32_t), 4).gpu; + } + + struct hk_cs *cs = + hk_cmd_buffer_get_cs_general(cmd, &cmd->current_cs.pre_gfx, true); + if (!cs) + return; + hk_ensure_cs_has_space(cmd, cs, 0x2000 /* TODO */); + + struct libagx_xfb_counter_copy params = {}; + unsigned copies = 0; + + /* For CmdBeginTransformFeedbackEXT, we need to initialize everything */ + if (begin) { + for (copies = 0; copies < 4; ++copies) { + params.dest[copies] = gfx->xfb_offsets + copies * sizeof(uint32_t); + } + } + + for (unsigned i = 0; i < counterBufferCount; ++i) { + if (pCounterBuffers[i] == VK_NULL_HANDLE) + continue; + + VK_FROM_HANDLE(hk_buffer, buffer, pCounterBuffers[i]); + + uint64_t offset = pCounterBufferOffsets ? pCounterBufferOffsets[i] : 0; + uint64_t cb_addr = hk_buffer_address(buffer, offset); + uint32_t cmd_idx = firstCounterBuffer + i; + + if (begin) { + params.src[cmd_idx] = cb_addr; + } else { + params.dest[copies] = cb_addr; + params.src[copies] = gfx->xfb_offsets + cmd_idx * sizeof(uint32_t); + ++copies; + } + } + + if (begin) + copies = 4; + + if (copies > 0) { + perf_debug(dev, "XFB counter copy"); + + struct hk_shader *s = + hk_meta_kernel(dev, hk_libagx_copy_xfb_counters, NULL, 0); + + uint64_t push = hk_pool_upload(cmd, ¶ms, sizeof(params), 8); + uint32_t usc = hk_upload_usc_words_kernel(cmd, s, &push, sizeof(push)); + + hk_dispatch_with_usc(dev, cs, s, usc, hk_grid(copies, 1, 1), + hk_grid(copies, 1, 1)); + } +} + +VKAPI_ATTR void VKAPI_CALL +hk_CmdBeginTransformFeedbackEXT(VkCommandBuffer commandBuffer, + uint32_t firstCounterBuffer, + uint32_t counterBufferCount, + const VkBuffer *pCounterBuffers, + const VkDeviceSize *pCounterBufferOffsets) +{ + hk_begin_end_xfb(commandBuffer, firstCounterBuffer, counterBufferCount, + pCounterBuffers, pCounterBufferOffsets, true); +} + +VKAPI_ATTR void VKAPI_CALL +hk_CmdEndTransformFeedbackEXT(VkCommandBuffer commandBuffer, + uint32_t firstCounterBuffer, + uint32_t counterBufferCount, + const VkBuffer *pCounterBuffers, + const VkDeviceSize *pCounterBufferOffsets) +{ + hk_begin_end_xfb(commandBuffer, firstCounterBuffer, counterBufferCount, + pCounterBuffers, pCounterBufferOffsets, false); +} + +VKAPI_ATTR void VKAPI_CALL +hk_CmdBeginConditionalRenderingEXT( + VkCommandBuffer commandBuffer, + const VkConditionalRenderingBeginInfoEXT *pConditionalRenderingBegin) +{ + unreachable("stub"); +} + +VKAPI_ATTR void VKAPI_CALL +hk_CmdEndConditionalRenderingEXT(VkCommandBuffer commandBuffer) +{ + unreachable("stub"); +} diff --git a/src/asahi/vulkan/hk_cmd_meta.c b/src/asahi/vulkan/hk_cmd_meta.c new file mode 100644 index 00000000000..ee70d9d0d3c --- /dev/null +++ b/src/asahi/vulkan/hk_cmd_meta.c @@ -0,0 +1,1692 @@ +/* + * Copyright 
2024 Valve Corporation + * Copyright 2024 Alyssa Rosenzweig + * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc. + * SPDX-License-Identifier: MIT + */ +#include "vulkan/vulkan_core.h" +#include "agx_pack.h" +#include "hk_buffer.h" +#include "hk_cmd_buffer.h" +#include "hk_device.h" +#include "hk_entrypoints.h" +#include "hk_image.h" +#include "hk_physical_device.h" + +#include "nir_builder.h" +#include "shader_enums.h" +#include "vk_format.h" +#include "vk_meta.h" +#include "vk_pipeline.h" + +static VkResult +hk_cmd_bind_map_buffer(struct vk_command_buffer *vk_cmd, + struct vk_meta_device *meta, VkBuffer _buffer, + void **map_out) +{ + struct hk_cmd_buffer *cmd = container_of(vk_cmd, struct hk_cmd_buffer, vk); + VK_FROM_HANDLE(hk_buffer, buffer, _buffer); + + assert(buffer->vk.size < UINT_MAX); + struct agx_ptr T = hk_pool_alloc(cmd, buffer->vk.size, 16); + if (unlikely(T.cpu == NULL)) + return VK_ERROR_OUT_OF_POOL_MEMORY; + + buffer->addr = T.gpu; + *map_out = T.cpu; + return VK_SUCCESS; +} + +VkResult +hk_device_init_meta(struct hk_device *dev) +{ + VkResult result = vk_meta_device_init(&dev->vk, &dev->meta); + if (result != VK_SUCCESS) + return result; + + dev->meta.use_gs_for_layer = false; + dev->meta.use_stencil_export = true; + dev->meta.cmd_bind_map_buffer = hk_cmd_bind_map_buffer; + dev->meta.max_bind_map_buffer_size_B = 64 * 1024; + + return VK_SUCCESS; +} + +void +hk_device_finish_meta(struct hk_device *dev) +{ + vk_meta_device_finish(&dev->vk, &dev->meta); +} + +struct hk_meta_save { + struct vk_vertex_input_state _dynamic_vi; + struct vk_sample_locations_state _dynamic_sl; + struct vk_dynamic_graphics_state dynamic; + struct hk_api_shader *shaders[MESA_SHADER_MESH + 1]; + struct hk_addr_range vb0; + struct hk_descriptor_set *desc0; + bool has_push_desc0; + enum agx_visibility_mode occlusion; + struct hk_push_descriptor_set push_desc0; + VkQueryPipelineStatisticFlags pipeline_stats_flags; + uint8_t push[128]; +}; + +static void +hk_meta_begin(struct hk_cmd_buffer *cmd, struct hk_meta_save *save, + VkPipelineBindPoint bind_point) +{ + struct hk_descriptor_state *desc = hk_get_descriptors_state(cmd, bind_point); + + if (bind_point == VK_PIPELINE_BIND_POINT_GRAPHICS) { + save->dynamic = cmd->vk.dynamic_graphics_state; + save->_dynamic_vi = cmd->state.gfx._dynamic_vi; + save->_dynamic_sl = cmd->state.gfx._dynamic_sl; + + static_assert(sizeof(cmd->state.gfx.shaders) == sizeof(save->shaders)); + memcpy(save->shaders, cmd->state.gfx.shaders, sizeof(save->shaders)); + + /* Pause queries */ + save->occlusion = cmd->state.gfx.occlusion.mode; + cmd->state.gfx.occlusion.mode = AGX_VISIBILITY_MODE_NONE; + cmd->state.gfx.dirty |= HK_DIRTY_OCCLUSION; + + save->pipeline_stats_flags = desc->root.draw.pipeline_stats_flags; + desc->root.draw.pipeline_stats_flags = 0; + desc->root_dirty = true; + } else { + save->shaders[MESA_SHADER_COMPUTE] = cmd->state.cs.shader; + } + + save->vb0 = cmd->state.gfx.vb[0]; + + save->desc0 = desc->sets[0]; + save->has_push_desc0 = desc->push[0]; + if (save->has_push_desc0) + save->push_desc0 = *desc->push[0]; + + static_assert(sizeof(save->push) == sizeof(desc->root.push)); + memcpy(save->push, desc->root.push, sizeof(save->push)); + + cmd->in_meta = true; +} + +static void +hk_meta_init_render(struct hk_cmd_buffer *cmd, + struct vk_meta_rendering_info *info) +{ + const struct hk_rendering_state *render = &cmd->state.gfx.render; + + *info = (struct vk_meta_rendering_info){ + .samples = MAX2(render->tilebuffer.nr_samples, 1), + .view_mask = render->view_mask, + 
.color_attachment_count = render->color_att_count, + .depth_attachment_format = render->depth_att.vk_format, + .stencil_attachment_format = render->stencil_att.vk_format, + }; + for (uint32_t a = 0; a < render->color_att_count; a++) + info->color_attachment_formats[a] = render->color_att[a].vk_format; +} + +static void +hk_meta_end(struct hk_cmd_buffer *cmd, struct hk_meta_save *save, + VkPipelineBindPoint bind_point) +{ + struct hk_descriptor_state *desc = hk_get_descriptors_state(cmd, bind_point); + desc->root_dirty = true; + + if (save->desc0) { + desc->sets[0] = save->desc0; + desc->root.sets[0] = hk_descriptor_set_addr(save->desc0); + desc->sets_dirty |= BITFIELD_BIT(0); + desc->push_dirty &= ~BITFIELD_BIT(0); + } else if (save->has_push_desc0) { + *desc->push[0] = save->push_desc0; + desc->push_dirty |= BITFIELD_BIT(0); + } + + if (bind_point == VK_PIPELINE_BIND_POINT_GRAPHICS) { + /* Restore the dynamic state */ + assert(save->dynamic.vi == &cmd->state.gfx._dynamic_vi); + assert(save->dynamic.ms.sample_locations == &cmd->state.gfx._dynamic_sl); + cmd->vk.dynamic_graphics_state = save->dynamic; + cmd->state.gfx._dynamic_vi = save->_dynamic_vi; + cmd->state.gfx._dynamic_sl = save->_dynamic_sl; + memcpy(cmd->vk.dynamic_graphics_state.dirty, + cmd->vk.dynamic_graphics_state.set, + sizeof(cmd->vk.dynamic_graphics_state.set)); + + for (uint32_t stage = 0; stage < ARRAY_SIZE(save->shaders); stage++) { + hk_cmd_bind_graphics_shader(cmd, stage, save->shaders[stage]); + } + + hk_cmd_bind_vertex_buffer(cmd, 0, save->vb0); + + /* Restore queries */ + cmd->state.gfx.occlusion.mode = save->occlusion; + cmd->state.gfx.dirty |= HK_DIRTY_OCCLUSION; + + desc->root.draw.pipeline_stats_flags = save->pipeline_stats_flags; + desc->root_dirty = true; + } else { + hk_cmd_bind_compute_shader(cmd, save->shaders[MESA_SHADER_COMPUTE]); + } + + memcpy(desc->root.push, save->push, sizeof(save->push)); + cmd->in_meta = false; +} + +#define VK_META_OBJECT_KEY_COPY_IMAGE_TO_BUFFER_PIPELINE (0xcafe0000) +#define VK_META_OBJECT_KEY_FILL_PIPELINE (0xcafe0001) + +#define BINDING_OUTPUT 0 +#define BINDING_INPUT 1 + +static VkFormat +aspect_format(VkFormat fmt, VkImageAspectFlags aspect) +{ + bool depth = (aspect & VK_IMAGE_ASPECT_DEPTH_BIT); + bool stencil = (aspect & VK_IMAGE_ASPECT_STENCIL_BIT); + + enum pipe_format p_format = vk_format_to_pipe_format(fmt); + + if (util_format_is_depth_or_stencil(p_format)) { + assert(depth ^ stencil); + if (depth) { + switch (fmt) { + case VK_FORMAT_D32_SFLOAT: + case VK_FORMAT_D32_SFLOAT_S8_UINT: + return VK_FORMAT_D32_SFLOAT; + case VK_FORMAT_D16_UNORM: + case VK_FORMAT_D16_UNORM_S8_UINT: + return VK_FORMAT_D16_UNORM; + default: + unreachable("invalid depth"); + } + } else { + switch (fmt) { + case VK_FORMAT_S8_UINT: + case VK_FORMAT_D32_SFLOAT_S8_UINT: + case VK_FORMAT_D16_UNORM_S8_UINT: + return VK_FORMAT_S8_UINT; + default: + unreachable("invalid stencil"); + } + } + } + + assert(!depth && !stencil); + + const struct vk_format_ycbcr_info *ycbcr_info = + vk_format_get_ycbcr_info(fmt); + + if (ycbcr_info) { + switch (aspect) { + case VK_IMAGE_ASPECT_PLANE_0_BIT: + return ycbcr_info->planes[0].format; + case VK_IMAGE_ASPECT_PLANE_1_BIT: + return ycbcr_info->planes[1].format; + case VK_IMAGE_ASPECT_PLANE_2_BIT: + return ycbcr_info->planes[2].format; + default: + unreachable("invalid ycbcr aspect"); + } + } + + return fmt; +} + +static VkFormat +canonical_format(VkFormat fmt) +{ + enum pipe_format p_format = vk_format_to_pipe_format(fmt); + + if 
(util_format_is_depth_or_stencil(p_format)) + return fmt; + + switch (util_format_get_blocksize(p_format)) { + case 1: + return VK_FORMAT_R8_UINT; + case 2: + return VK_FORMAT_R16_UINT; + case 4: + return VK_FORMAT_R32_UINT; + case 8: + return VK_FORMAT_R32G32_UINT; + case 16: + return VK_FORMAT_R32G32B32A32_UINT; + default: + unreachable("invalid bpp"); + } +} + +enum copy_type { + BUF2IMG, + IMG2BUF, + IMG2IMG, +}; + +struct vk_meta_push_data { + uint32_t buffer_offset; + uint32_t row_extent; + uint32_t slice_or_layer_extent; + + int32_t src_offset_el[4]; + int32_t dst_offset_el[4]; + uint32_t grid_el[3]; +} PACKED; + +#define get_push(b, name) \ + nir_load_push_constant( \ + b, 1, sizeof(((struct vk_meta_push_data *)0)->name) * 8, \ + nir_imm_int(b, offsetof(struct vk_meta_push_data, name))) + +struct vk_meta_image_copy_key { + enum vk_meta_object_key_type key_type; + enum copy_type type; + unsigned block_size; + unsigned nr_samples; +}; + +static nir_def * +linearize_coords(nir_builder *b, nir_def *coord, + const struct vk_meta_image_copy_key *key) +{ + assert(key->nr_samples == 1 && "buffer<-->image copies not multisampled"); + + nir_def *row_extent = get_push(b, row_extent); + nir_def *slice_or_layer_extent = get_push(b, slice_or_layer_extent); + nir_def *x = nir_channel(b, coord, 0); + nir_def *y = nir_channel(b, coord, 1); + nir_def *z_or_layer = nir_channel(b, coord, 2); + + nir_def *v = get_push(b, buffer_offset); + + v = nir_iadd(b, v, nir_imul_imm(b, x, key->block_size)); + v = nir_iadd(b, v, nir_imul(b, y, row_extent)); + v = nir_iadd(b, v, nir_imul(b, z_or_layer, slice_or_layer_extent)); + + return nir_udiv_imm(b, v, key->block_size); +} + +static nir_shader * +build_image_copy_shader(const struct vk_meta_image_copy_key *key) +{ + nir_builder build = + nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, NULL, "vk-meta-copy"); + + nir_builder *b = &build; + b->shader->info.workgroup_size[0] = 32; + b->shader->info.workgroup_size[1] = 32; + + bool src_is_buf = key->type == BUF2IMG; + bool dst_is_buf = key->type == IMG2BUF; + + bool msaa = key->nr_samples > 1; + enum glsl_sampler_dim dim_2d = + msaa ? GLSL_SAMPLER_DIM_MS : GLSL_SAMPLER_DIM_2D; + enum glsl_sampler_dim dim_src = src_is_buf ? GLSL_SAMPLER_DIM_BUF : dim_2d; + enum glsl_sampler_dim dim_dst = dst_is_buf ? 
GLSL_SAMPLER_DIM_BUF : dim_2d; + + const struct glsl_type *texture_type = + glsl_sampler_type(dim_src, false, !src_is_buf, GLSL_TYPE_UINT); + + const struct glsl_type *image_type = + glsl_image_type(dim_dst, !dst_is_buf, GLSL_TYPE_UINT); + + nir_variable *texture = + nir_variable_create(b->shader, nir_var_uniform, texture_type, "source"); + nir_variable *image = + nir_variable_create(b->shader, nir_var_image, image_type, "dest"); + + image->data.descriptor_set = 0; + image->data.binding = BINDING_OUTPUT; + image->data.access = ACCESS_NON_READABLE; + + texture->data.descriptor_set = 0; + texture->data.binding = BINDING_INPUT; + + /* Grab the offset vectors */ + nir_def *src_offset_el = nir_load_push_constant( + b, 3, 32, + nir_imm_int(b, offsetof(struct vk_meta_push_data, src_offset_el))); + + nir_def *dst_offset_el = nir_load_push_constant( + b, 3, 32, + nir_imm_int(b, offsetof(struct vk_meta_push_data, dst_offset_el))); + + nir_def *grid_el = nir_load_push_constant( + b, 3, 32, nir_imm_int(b, offsetof(struct vk_meta_push_data, grid_el))); + + /* We're done setting up variables, do the copy */ + nir_def *coord = nir_load_global_invocation_id(b, 32); + + nir_push_if(b, + nir_ball(b, nir_trim_vector(b, nir_ult(b, coord, grid_el), 2))); + { + nir_def *src_coord = nir_iadd(b, coord, src_offset_el); + nir_def *dst_coord = nir_iadd(b, coord, dst_offset_el); + + /* Special case handle buffer indexing */ + if (dst_is_buf) { + dst_coord = linearize_coords(b, coord, key); + } else if (src_is_buf) { + src_coord = linearize_coords(b, coord, key); + } + + /* Copy formatted texel from texture to storage image */ + for (unsigned s = 0; s < key->nr_samples; ++s) { + nir_deref_instr *deref = nir_build_deref_var(b, texture); + nir_def *ms_index = nir_imm_int(b, s); + + nir_def *value = msaa ? nir_txf_ms_deref(b, deref, src_coord, ms_index) + : nir_txf_deref(b, deref, src_coord, NULL); + + nir_image_deref_store(b, &nir_build_deref_var(b, image)->def, + nir_pad_vec4(b, dst_coord), ms_index, value, + nir_imm_int(b, 0), .image_dim = dim_dst, + .image_array = !dst_is_buf); + } + } + nir_pop_if(b, NULL); + return b->shader; +} + +static VkResult +get_image_copy_descriptor_set_layout(struct vk_device *device, + struct vk_meta_device *meta, + VkDescriptorSetLayout *layout_out, + enum copy_type type) +{ + const char *keys[] = { + [IMG2BUF] = "vk-meta-copy-image-to-buffer-descriptor-set-layout", + [BUF2IMG] = "vk-meta-copy-buffer-to-image-descriptor-set-layout", + [IMG2IMG] = "vk-meta-copy-image-to-image-descriptor-set-layout", + }; + + VkDescriptorSetLayout from_cache = vk_meta_lookup_descriptor_set_layout( + meta, keys[type], strlen(keys[type])); + if (from_cache != VK_NULL_HANDLE) { + *layout_out = from_cache; + return VK_SUCCESS; + } + + const VkDescriptorSetLayoutBinding bindings[] = { + { + .binding = BINDING_OUTPUT, + .descriptorType = type != IMG2BUF + ? VK_DESCRIPTOR_TYPE_STORAGE_IMAGE + : VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER, + .descriptorCount = 1, + .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, + }, + { + .binding = BINDING_INPUT, + .descriptorType = type == BUF2IMG + ? 
VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER + : VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE, + .descriptorCount = 1, + .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, + }, + }; + + const VkDescriptorSetLayoutCreateInfo info = { + .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO, + .bindingCount = ARRAY_SIZE(bindings), + .pBindings = bindings, + }; + + return vk_meta_create_descriptor_set_layout(device, meta, &info, keys[type], + strlen(keys[type]), layout_out); +} + +static VkResult +get_image_copy_pipeline_layout(struct vk_device *device, + struct vk_meta_device *meta, + struct vk_meta_image_copy_key *key, + VkDescriptorSetLayout set_layout, + VkPipelineLayout *layout_out, + enum copy_type type) +{ + const char *keys[] = { + [IMG2BUF] = "vk-meta-copy-image-to-buffer-pipeline-layout", + [BUF2IMG] = "vk-meta-copy-buffer-to-image-pipeline-layout", + [IMG2IMG] = "vk-meta-copy-image-to-image-pipeline-layout", + }; + + VkPipelineLayout from_cache = + vk_meta_lookup_pipeline_layout(meta, keys[type], strlen(keys[type])); + if (from_cache != VK_NULL_HANDLE) { + *layout_out = from_cache; + return VK_SUCCESS; + } + + VkPipelineLayoutCreateInfo info = { + .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO, + .setLayoutCount = 1, + .pSetLayouts = &set_layout, + }; + + const VkPushConstantRange push_range = { + .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, + .offset = 0, + .size = sizeof(struct vk_meta_push_data), + }; + + info.pushConstantRangeCount = 1; + info.pPushConstantRanges = &push_range; + + return vk_meta_create_pipeline_layout(device, meta, &info, keys[type], + strlen(keys[type]), layout_out); +} + +static VkResult +get_image_copy_pipeline(struct vk_device *device, struct vk_meta_device *meta, + const struct vk_meta_image_copy_key *key, + VkPipelineLayout layout, VkPipeline *pipeline_out) +{ + VkPipeline from_cache = vk_meta_lookup_pipeline(meta, key, sizeof(*key)); + if (from_cache != VK_NULL_HANDLE) { + *pipeline_out = from_cache; + return VK_SUCCESS; + } + + const VkPipelineShaderStageNirCreateInfoMESA nir_info = { + .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_NIR_CREATE_INFO_MESA, + .nir = build_image_copy_shader(key), + }; + const VkPipelineShaderStageCreateInfo cs_info = { + .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, + .pNext = &nir_info, + .stage = VK_SHADER_STAGE_COMPUTE_BIT, + .pName = "main", + }; + + const VkComputePipelineCreateInfo info = { + .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO, + .stage = cs_info, + .layout = layout, + }; + + VkResult result = vk_meta_create_compute_pipeline( + device, meta, &info, key, sizeof(*key), pipeline_out); + ralloc_free(nir_info.nir); + + return result; +} + +static void +hk_meta_copy_image_to_buffer2(struct vk_command_buffer *cmd, + struct vk_meta_device *meta, + const VkCopyImageToBufferInfo2 *pCopyBufferInfo) +{ + VK_FROM_HANDLE(vk_image, image, pCopyBufferInfo->srcImage); + VK_FROM_HANDLE(vk_image, src_image, pCopyBufferInfo->srcImage); + + struct vk_device *device = cmd->base.device; + const struct vk_device_dispatch_table *disp = &device->dispatch_table; + + VkResult result; + + VkDescriptorSetLayout set_layout; + result = + get_image_copy_descriptor_set_layout(device, meta, &set_layout, IMG2BUF); + if (unlikely(result != VK_SUCCESS)) { + vk_command_buffer_set_error(cmd, result); + return; + } + + bool per_layer = + util_format_is_compressed(vk_format_to_pipe_format(image->format)); + + for (unsigned i = 0; i < pCopyBufferInfo->regionCount; ++i) { + const VkBufferImageCopy2 *region = 
&pCopyBufferInfo->pRegions[i]; + + unsigned layers = MAX2(region->imageExtent.depth, + vk_image_subresource_layer_count( + src_image, ®ion->imageSubresource)); + unsigned layer_iters = per_layer ? layers : 1; + + for (unsigned layer_offs = 0; layer_offs < layer_iters; ++layer_offs) { + + VkImageAspectFlags aspect = region->imageSubresource.aspectMask; + VkFormat aspect_fmt = aspect_format(image->format, aspect); + VkFormat canonical = canonical_format(aspect_fmt); + + uint32_t blocksize_B = + util_format_get_blocksize(vk_format_to_pipe_format(canonical)); + + enum pipe_format p_format = vk_format_to_pipe_format(image->format); + + unsigned row_extent = util_format_get_nblocksx( + p_format, MAX2(region->bufferRowLength, + region->imageExtent.width)) * + blocksize_B; + unsigned slice_extent = + util_format_get_nblocksy( + p_format, + MAX2(region->bufferImageHeight, region->imageExtent.height)) * + row_extent; + unsigned layer_extent = + util_format_get_nblocksz(p_format, region->imageExtent.depth) * + slice_extent; + + bool is_3d = region->imageExtent.depth > 1; + + struct vk_meta_image_copy_key key = { + .key_type = VK_META_OBJECT_KEY_COPY_IMAGE_TO_BUFFER_PIPELINE, + .type = IMG2BUF, + .block_size = blocksize_B, + .nr_samples = image->samples, + }; + + VkPipelineLayout pipeline_layout; + result = get_image_copy_pipeline_layout(device, meta, &key, set_layout, + &pipeline_layout, false); + if (unlikely(result != VK_SUCCESS)) { + vk_command_buffer_set_error(cmd, result); + return; + } + + VkImageView src_view; + const VkImageViewUsageCreateInfo src_view_usage = { + .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_USAGE_CREATE_INFO, + .usage = VK_IMAGE_USAGE_SAMPLED_BIT, + }; + const VkImageViewCreateInfo src_view_info = { + .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO, + .flags = VK_IMAGE_VIEW_CREATE_INTERNAL_MESA, + .pNext = &src_view_usage, + .image = pCopyBufferInfo->srcImage, + .viewType = VK_IMAGE_VIEW_TYPE_2D_ARRAY, + .format = canonical, + .subresourceRange = + { + .aspectMask = region->imageSubresource.aspectMask, + .baseMipLevel = region->imageSubresource.mipLevel, + .baseArrayLayer = + MAX2(region->imageOffset.z, + region->imageSubresource.baseArrayLayer) + + layer_offs, + .layerCount = per_layer ? 1 : layers, + .levelCount = 1, + }, + }; + + result = + vk_meta_create_image_view(cmd, meta, &src_view_info, &src_view); + if (unlikely(result != VK_SUCCESS)) { + vk_command_buffer_set_error(cmd, result); + return; + } + + VkDescriptorImageInfo src_info = { + .imageLayout = pCopyBufferInfo->srcImageLayout, + .imageView = src_view, + }; + + VkWriteDescriptorSet desc_writes[2]; + + const VkBufferViewCreateInfo dst_view_info = { + .sType = VK_STRUCTURE_TYPE_BUFFER_VIEW_CREATE_INFO, + .buffer = pCopyBufferInfo->dstBuffer, + .format = canonical, + + /* Ideally, this would be region->bufferOffset, but that might not + * be aligned to minTexelBufferOffsetAlignment. Instead, we use a 0 + * offset (which is definitely aligned) and add the offset ourselves + * in the shader. 
+ */ + .offset = 0, + .range = VK_WHOLE_SIZE, + }; + + VkBufferView dst_view; + VkResult result = + vk_meta_create_buffer_view(cmd, meta, &dst_view_info, &dst_view); + if (unlikely(result != VK_SUCCESS)) { + vk_command_buffer_set_error(cmd, result); + return; + } + + desc_writes[0] = (VkWriteDescriptorSet){ + .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET, + .dstSet = 0, + .dstBinding = BINDING_OUTPUT, + .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER, + .descriptorCount = 1, + .pTexelBufferView = &dst_view, + }; + + desc_writes[1] = (VkWriteDescriptorSet){ + .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET, + .dstSet = 0, + .dstBinding = BINDING_INPUT, + .descriptorType = VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE, + .descriptorCount = 1, + .pImageInfo = &src_info, + }; + + disp->CmdPushDescriptorSetKHR( + vk_command_buffer_to_handle(cmd), VK_PIPELINE_BIND_POINT_COMPUTE, + pipeline_layout, 0, ARRAY_SIZE(desc_writes), desc_writes); + + VkPipeline pipeline; + result = get_image_copy_pipeline(device, meta, &key, pipeline_layout, + &pipeline); + if (unlikely(result != VK_SUCCESS)) { + vk_command_buffer_set_error(cmd, result); + return; + } + + disp->CmdBindPipeline(vk_command_buffer_to_handle(cmd), + VK_PIPELINE_BIND_POINT_COMPUTE, pipeline); + + enum pipe_format p_src_fmt = + vk_format_to_pipe_format(src_image->format); + + struct vk_meta_push_data push = { + .buffer_offset = region->bufferOffset, + .row_extent = row_extent, + .slice_or_layer_extent = is_3d ? slice_extent : layer_extent, + + .src_offset_el[0] = + util_format_get_nblocksx(p_src_fmt, region->imageOffset.x), + .src_offset_el[1] = + util_format_get_nblocksy(p_src_fmt, region->imageOffset.y), + + .grid_el[0] = + util_format_get_nblocksx(p_format, region->imageExtent.width), + .grid_el[1] = + util_format_get_nblocksy(p_format, region->imageExtent.height), + .grid_el[2] = per_layer ? 1 : layers, + }; + + push.buffer_offset += push.slice_or_layer_extent * layer_offs; + + disp->CmdPushConstants(vk_command_buffer_to_handle(cmd), + pipeline_layout, VK_SHADER_STAGE_COMPUTE_BIT, 0, + sizeof(push), &push); + + disp->CmdDispatch(vk_command_buffer_to_handle(cmd), + DIV_ROUND_UP(push.grid_el[0], 32), + DIV_ROUND_UP(push.grid_el[1], 32), push.grid_el[2]); + } + } +} + +static void +hk_meta_copy_buffer_to_image2(struct vk_command_buffer *cmd, + struct vk_meta_device *meta, + const struct VkCopyBufferToImageInfo2 *info) +{ + VK_FROM_HANDLE(vk_image, image, info->dstImage); + + struct vk_device *device = cmd->base.device; + const struct vk_device_dispatch_table *disp = &device->dispatch_table; + + VkDescriptorSetLayout set_layout; + VkResult result = + get_image_copy_descriptor_set_layout(device, meta, &set_layout, BUF2IMG); + if (unlikely(result != VK_SUCCESS)) { + vk_command_buffer_set_error(cmd, result); + return; + } + + bool per_layer = + util_format_is_compressed(vk_format_to_pipe_format(image->format)); + + for (unsigned r = 0; r < info->regionCount; ++r) { + const VkBufferImageCopy2 *region = &info->pRegions[r]; + + unsigned layers = MAX2( + region->imageExtent.depth, + vk_image_subresource_layer_count(image, ®ion->imageSubresource)); + unsigned layer_iters = per_layer ? 
layers : 1; + + for (unsigned layer_offs = 0; layer_offs < layer_iters; ++layer_offs) { + VkImageAspectFlags aspect = region->imageSubresource.aspectMask; + VkFormat aspect_fmt = aspect_format(image->format, aspect); + VkFormat canonical = canonical_format(aspect_fmt); + enum pipe_format p_format = vk_format_to_pipe_format(aspect_fmt); + uint32_t blocksize_B = util_format_get_blocksize(p_format); + bool is_3d = region->imageExtent.depth > 1; + + struct vk_meta_image_copy_key key = { + .key_type = VK_META_OBJECT_KEY_COPY_IMAGE_TO_BUFFER_PIPELINE, + .type = BUF2IMG, + .block_size = blocksize_B, + .nr_samples = image->samples, + }; + + VkPipelineLayout pipeline_layout; + result = get_image_copy_pipeline_layout(device, meta, &key, set_layout, + &pipeline_layout, true); + if (unlikely(result != VK_SUCCESS)) { + vk_command_buffer_set_error(cmd, result); + return; + } + + VkWriteDescriptorSet desc_writes[2]; + + unsigned row_extent = util_format_get_nblocksx( + p_format, MAX2(region->bufferRowLength, + region->imageExtent.width)) * + blocksize_B; + unsigned slice_extent = + util_format_get_nblocksy( + p_format, + MAX2(region->bufferImageHeight, region->imageExtent.height)) * + row_extent; + unsigned layer_extent = + util_format_get_nblocksz(p_format, region->imageExtent.depth) * + slice_extent; + + /* Create a view into the source buffer as a texel buffer */ + const VkBufferViewCreateInfo src_view_info = { + .sType = VK_STRUCTURE_TYPE_BUFFER_VIEW_CREATE_INFO, + .buffer = info->srcBuffer, + .format = canonical, + + /* Ideally, this would be region->bufferOffset, but that might not + * be aligned to minTexelBufferOffsetAlignment. Instead, we use a 0 + * offset (which is definitely aligned) and add the offset ourselves + * in the shader. + */ + .offset = 0, + .range = VK_WHOLE_SIZE, + }; + + assert((region->bufferOffset % blocksize_B) == 0 && "must be aligned"); + + VkBufferView src_view; + result = + vk_meta_create_buffer_view(cmd, meta, &src_view_info, &src_view); + if (unlikely(result != VK_SUCCESS)) { + vk_command_buffer_set_error(cmd, result); + return; + } + + VkImageView dst_view; + const VkImageViewUsageCreateInfo dst_view_usage = { + .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_USAGE_CREATE_INFO, + .usage = VK_IMAGE_USAGE_STORAGE_BIT, + }; + const VkImageViewCreateInfo dst_view_info = { + .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO, + .flags = VK_IMAGE_VIEW_CREATE_INTERNAL_MESA, + .pNext = &dst_view_usage, + .image = info->dstImage, + .viewType = VK_IMAGE_VIEW_TYPE_2D_ARRAY, + .format = canonical, + .subresourceRange = + { + .aspectMask = region->imageSubresource.aspectMask, + .baseMipLevel = region->imageSubresource.mipLevel, + .baseArrayLayer = + MAX2(region->imageOffset.z, + region->imageSubresource.baseArrayLayer) + + layer_offs, + .layerCount = per_layer ? 
1 : layers, + .levelCount = 1, + }, + }; + + result = + vk_meta_create_image_view(cmd, meta, &dst_view_info, &dst_view); + if (unlikely(result != VK_SUCCESS)) { + vk_command_buffer_set_error(cmd, result); + return; + } + + const VkDescriptorImageInfo dst_info = { + .imageView = dst_view, + .imageLayout = info->dstImageLayout, + }; + + desc_writes[0] = (VkWriteDescriptorSet){ + .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET, + .dstSet = 0, + .dstBinding = BINDING_OUTPUT, + .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, + .descriptorCount = 1, + .pImageInfo = &dst_info, + }; + + desc_writes[1] = (VkWriteDescriptorSet){ + .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET, + .dstSet = 0, + .dstBinding = BINDING_INPUT, + .descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER, + .descriptorCount = 1, + .pTexelBufferView = &src_view, + }; + + disp->CmdPushDescriptorSetKHR( + vk_command_buffer_to_handle(cmd), VK_PIPELINE_BIND_POINT_COMPUTE, + pipeline_layout, 0, ARRAY_SIZE(desc_writes), desc_writes); + + VkPipeline pipeline; + result = get_image_copy_pipeline(device, meta, &key, pipeline_layout, + &pipeline); + if (unlikely(result != VK_SUCCESS)) { + vk_command_buffer_set_error(cmd, result); + return; + } + + disp->CmdBindPipeline(vk_command_buffer_to_handle(cmd), + VK_PIPELINE_BIND_POINT_COMPUTE, pipeline); + + struct vk_meta_push_data push = { + .buffer_offset = region->bufferOffset, + .row_extent = row_extent, + .slice_or_layer_extent = is_3d ? slice_extent : layer_extent, + + .dst_offset_el[0] = + util_format_get_nblocksx(p_format, region->imageOffset.x), + .dst_offset_el[1] = + util_format_get_nblocksy(p_format, region->imageOffset.y), + + .grid_el[0] = + util_format_get_nblocksx(p_format, region->imageExtent.width), + .grid_el[1] = + util_format_get_nblocksy(p_format, region->imageExtent.height), + .grid_el[2] = per_layer ? 1 : layers, + }; + + push.buffer_offset += push.slice_or_layer_extent * layer_offs; + + disp->CmdPushConstants(vk_command_buffer_to_handle(cmd), + pipeline_layout, VK_SHADER_STAGE_COMPUTE_BIT, 0, + sizeof(push), &push); + + disp->CmdDispatch(vk_command_buffer_to_handle(cmd), + DIV_ROUND_UP(push.grid_el[0], 32), + DIV_ROUND_UP(push.grid_el[1], 32), push.grid_el[2]); + } + } +} + +static void +hk_meta_copy_image2(struct vk_command_buffer *cmd, struct vk_meta_device *meta, + const struct VkCopyImageInfo2 *info) +{ + VK_FROM_HANDLE(vk_image, src_image, info->srcImage); + VK_FROM_HANDLE(vk_image, dst_image, info->dstImage); + + struct vk_device *device = cmd->base.device; + const struct vk_device_dispatch_table *disp = &device->dispatch_table; + + VkDescriptorSetLayout set_layout; + VkResult result = + get_image_copy_descriptor_set_layout(device, meta, &set_layout, BUF2IMG); + if (unlikely(result != VK_SUCCESS)) { + vk_command_buffer_set_error(cmd, result); + return; + } + + bool per_layer = + util_format_is_compressed(vk_format_to_pipe_format(src_image->format)) || + util_format_is_compressed(vk_format_to_pipe_format(dst_image->format)); + + for (unsigned r = 0; r < info->regionCount; ++r) { + const VkImageCopy2 *region = &info->pRegions[r]; + + unsigned layers = MAX2( + vk_image_subresource_layer_count(src_image, ®ion->srcSubresource), + region->extent.depth); + unsigned layer_iters = per_layer ? 
layers : 1; + + for (unsigned layer_offs = 0; layer_offs < layer_iters; ++layer_offs) { + u_foreach_bit(aspect, region->srcSubresource.aspectMask) { + /* We use the source format throughout for consistent scaling with + * compressed<-->uncompressed copies, where the extents are defined + * to follow the source. + */ + VkFormat aspect_fmt = aspect_format(src_image->format, 1 << aspect); + VkFormat canonical = canonical_format(aspect_fmt); + uint32_t blocksize_B = + util_format_get_blocksize(vk_format_to_pipe_format(canonical)); + + struct vk_meta_image_copy_key key = { + .key_type = VK_META_OBJECT_KEY_COPY_IMAGE_TO_BUFFER_PIPELINE, + .type = IMG2IMG, + .block_size = blocksize_B, + .nr_samples = dst_image->samples, + }; + + assert(key.nr_samples == src_image->samples); + + VkPipelineLayout pipeline_layout; + result = get_image_copy_pipeline_layout( + device, meta, &key, set_layout, &pipeline_layout, true); + if (unlikely(result != VK_SUCCESS)) { + vk_command_buffer_set_error(cmd, result); + return; + } + + VkWriteDescriptorSet desc_writes[2]; + + VkImageView src_view; + const VkImageViewUsageCreateInfo src_view_usage = { + .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_USAGE_CREATE_INFO, + .usage = VK_IMAGE_USAGE_SAMPLED_BIT, + }; + const VkImageViewCreateInfo src_view_info = { + .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO, + .flags = VK_IMAGE_VIEW_CREATE_INTERNAL_MESA, + .pNext = &src_view_usage, + .image = info->srcImage, + .viewType = VK_IMAGE_VIEW_TYPE_2D_ARRAY, + .format = canonical, + .subresourceRange = + { + .aspectMask = + region->srcSubresource.aspectMask & (1 << aspect), + .baseMipLevel = region->srcSubresource.mipLevel, + .baseArrayLayer = + MAX2(region->srcOffset.z, + region->srcSubresource.baseArrayLayer) + + layer_offs, + .layerCount = per_layer ? 1 : layers, + .levelCount = 1, + }, + }; + + result = + vk_meta_create_image_view(cmd, meta, &src_view_info, &src_view); + if (unlikely(result != VK_SUCCESS)) { + vk_command_buffer_set_error(cmd, result); + return; + } + + VkDescriptorImageInfo src_info = { + .imageLayout = info->srcImageLayout, + .imageView = src_view, + }; + + VkImageView dst_view; + const VkImageViewUsageCreateInfo dst_view_usage = { + .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_USAGE_CREATE_INFO, + .usage = VK_IMAGE_USAGE_STORAGE_BIT, + }; + const VkImageViewCreateInfo dst_view_info = { + .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO, + .flags = VK_IMAGE_VIEW_CREATE_INTERNAL_MESA, + .pNext = &dst_view_usage, + .image = info->dstImage, + .viewType = VK_IMAGE_VIEW_TYPE_2D_ARRAY, + .format = canonical, + .subresourceRange = + { + .aspectMask = + vk_format_get_ycbcr_info(dst_image->format) || + vk_format_get_ycbcr_info(src_image->format) + ? region->dstSubresource.aspectMask + : (1 << aspect), + .baseMipLevel = region->dstSubresource.mipLevel, + .baseArrayLayer = + MAX2(region->dstOffset.z, + region->dstSubresource.baseArrayLayer) + + layer_offs, + .layerCount = per_layer ? 
1 : layers, + .levelCount = 1, + }, + }; + + result = + vk_meta_create_image_view(cmd, meta, &dst_view_info, &dst_view); + if (unlikely(result != VK_SUCCESS)) { + vk_command_buffer_set_error(cmd, result); + return; + } + + const VkDescriptorImageInfo dst_info = { + .imageView = dst_view, + .imageLayout = info->dstImageLayout, + }; + + desc_writes[0] = (VkWriteDescriptorSet){ + .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET, + .dstSet = 0, + .dstBinding = BINDING_OUTPUT, + .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, + .descriptorCount = 1, + .pImageInfo = &dst_info, + }; + + desc_writes[1] = (VkWriteDescriptorSet){ + .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET, + .dstSet = 0, + .dstBinding = BINDING_INPUT, + .descriptorType = VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE, + .descriptorCount = 1, + .pImageInfo = &src_info, + }; + + disp->CmdPushDescriptorSetKHR( + vk_command_buffer_to_handle(cmd), VK_PIPELINE_BIND_POINT_COMPUTE, + pipeline_layout, 0, ARRAY_SIZE(desc_writes), desc_writes); + + VkPipeline pipeline; + result = get_image_copy_pipeline(device, meta, &key, + pipeline_layout, &pipeline); + if (unlikely(result != VK_SUCCESS)) { + vk_command_buffer_set_error(cmd, result); + return; + } + + disp->CmdBindPipeline(vk_command_buffer_to_handle(cmd), + VK_PIPELINE_BIND_POINT_COMPUTE, pipeline); + + enum pipe_format p_src_fmt = + vk_format_to_pipe_format(src_image->format); + enum pipe_format p_dst_fmt = + vk_format_to_pipe_format(dst_image->format); + enum pipe_format p_format = vk_format_to_pipe_format(aspect_fmt); + + struct vk_meta_push_data push = { + .src_offset_el[0] = + util_format_get_nblocksx(p_src_fmt, region->srcOffset.x), + .src_offset_el[1] = + util_format_get_nblocksy(p_src_fmt, region->srcOffset.y), + + .dst_offset_el[0] = + util_format_get_nblocksx(p_dst_fmt, region->dstOffset.x), + .dst_offset_el[1] = + util_format_get_nblocksy(p_dst_fmt, region->dstOffset.y), + + .grid_el[0] = + util_format_get_nblocksx(p_format, region->extent.width), + .grid_el[1] = + util_format_get_nblocksy(p_format, region->extent.height), + .grid_el[2] = per_layer ? 
1 : layers, + }; + + disp->CmdPushConstants(vk_command_buffer_to_handle(cmd), + pipeline_layout, VK_SHADER_STAGE_COMPUTE_BIT, + 0, sizeof(push), &push); + + disp->CmdDispatch(vk_command_buffer_to_handle(cmd), + DIV_ROUND_UP(push.grid_el[0], 32), + DIV_ROUND_UP(push.grid_el[1], 32), + push.grid_el[2]); + } + } + } +} + +struct vk_meta_image_to_buffer_push_data { + uint32_t dest_offset_el; +}; + +#define get_image_push(b, name) \ + nir_load_push_constant( \ + b, 1, sizeof(((struct vk_meta_image_to_buffer_push_data *)0)->name) * 8, \ + nir_imm_int(b, \ + offsetof(struct vk_meta_image_to_buffer_push_data, name))) + +enum copy_source { + COPY_SOURCE_PATTERN, + COPY_SOURCE_BUFFER, +}; + +struct vk_meta_buffer_copy_key { + enum vk_meta_object_key_type key_type; + enum copy_source source; + + /* Power-of-two block size for the transfer, range [1, 16] */ + uint8_t blocksize; + uint8_t pad[3]; +}; +static_assert(sizeof(struct vk_meta_buffer_copy_key) == 12, "packed"); + +/* XXX: TODO: move to common */ +/* Copyright © Microsoft Corporation */ +static nir_def * +dzn_nir_create_bo_desc(nir_builder *b, nir_variable_mode mode, + uint32_t desc_set, uint32_t binding, const char *name, + unsigned access, const struct glsl_type *dummy_type) +{ + nir_variable *var = nir_variable_create(b->shader, mode, dummy_type, name); + var->data.descriptor_set = desc_set; + var->data.binding = binding; + var->data.access = access; + + assert(mode == nir_var_mem_ubo || mode == nir_var_mem_ssbo); + if (mode == nir_var_mem_ubo) + b->shader->info.num_ubos++; + else + b->shader->info.num_ssbos++; + + VkDescriptorType desc_type = var->data.mode == nir_var_mem_ubo + ? VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER + : VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; + nir_address_format addr_format = + nir_address_format_64bit_global_32bit_offset; /* XXX */ + nir_def *index = nir_vulkan_resource_index( + b, nir_address_format_num_components(addr_format), + nir_address_format_bit_size(addr_format), nir_imm_int(b, 0), + .desc_set = desc_set, .binding = binding, .desc_type = desc_type); + + nir_def *desc = nir_load_vulkan_descriptor( + b, nir_address_format_num_components(addr_format), + nir_address_format_bit_size(addr_format), index, .desc_type = desc_type); + + return desc; +} + +static const struct glsl_type * +type_for_blocksize(uint8_t blocksize) +{ + assert(util_is_power_of_two_nonzero(blocksize) && blocksize <= 16); + + if (blocksize > 4) + return glsl_vector_type(GLSL_TYPE_UINT, blocksize / 4); + else + return glsl_uintN_t_type(8 * blocksize); +} + +static nir_shader * +build_buffer_copy_shader(const struct vk_meta_buffer_copy_key *key) +{ + nir_builder build = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, NULL, + "vk-meta-copy-to-buffer"); + nir_builder *b = &build; + + const struct glsl_type *type = + glsl_array_type(type_for_blocksize(key->blocksize), 0, key->blocksize); + + nir_def *index = nir_channel(b, nir_load_global_invocation_id(b, 32), 0); + nir_def *value; + + if (key->source == COPY_SOURCE_BUFFER) { + nir_def *ubo = + dzn_nir_create_bo_desc(b, nir_var_mem_ubo, 0, BINDING_INPUT, "source", + ACCESS_NON_WRITEABLE, type); + nir_deref_instr *ubo_deref = + nir_build_deref_cast(b, ubo, nir_var_mem_ubo, type, key->blocksize); + + nir_deref_instr *element_deref = nir_build_deref_array( + b, ubo_deref, nir_u2uN(b, index, ubo_deref->def.bit_size)); + + value = nir_load_deref(b, element_deref); + } else { + nir_def *pattern = nir_load_push_constant(b, 1, 32, nir_imm_int(b, 0)); + + assert(key->blocksize >= 4 && "fills at least 32-bit"); + 
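/* Replicate the 32-bit fill word across the block (blocksize / 4 dwords). */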
value = nir_replicate(b, pattern, key->blocksize / 4); + } + + /* Write out raw bytes to SSBO */ + nir_def *ssbo = + dzn_nir_create_bo_desc(b, nir_var_mem_ssbo, 0, BINDING_OUTPUT, + "destination", ACCESS_NON_READABLE, type); + + nir_deref_instr *ssbo_deref = + nir_build_deref_cast(b, ssbo, nir_var_mem_ssbo, type, key->blocksize); + + nir_deref_instr *element_deref = nir_build_deref_array( + b, ssbo_deref, nir_u2uN(b, index, ssbo_deref->def.bit_size)); + + nir_store_deref(b, element_deref, value, + nir_component_mask(value->num_components)); + + return b->shader; +} + +static VkResult +get_buffer_copy_descriptor_set_layout(struct vk_device *device, + struct vk_meta_device *meta, + VkDescriptorSetLayout *layout_out, + enum copy_source source) +{ + const char buffer_key[] = "vk-meta-buffer-copy-descriptor-set-layout"; + const char fill_key[] = "vk-meta-fill__-copy-descriptor-set-layout"; + + static_assert(sizeof(buffer_key) == sizeof(fill_key)); + const char *key = source == COPY_SOURCE_BUFFER ? buffer_key : fill_key; + + VkDescriptorSetLayout from_cache = + vk_meta_lookup_descriptor_set_layout(meta, key, sizeof(buffer_key)); + if (from_cache != VK_NULL_HANDLE) { + *layout_out = from_cache; + return VK_SUCCESS; + } + + const VkDescriptorSetLayoutBinding bindings[] = { + { + .binding = BINDING_OUTPUT, + .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, + .descriptorCount = 1, + .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, + }, + { + .binding = BINDING_INPUT, + .descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, + .descriptorCount = 1, + .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, + }, + }; + + const VkDescriptorSetLayoutCreateInfo info = { + .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO, + .bindingCount = ARRAY_SIZE(bindings), + .pBindings = bindings, + }; + + return vk_meta_create_descriptor_set_layout(device, meta, &info, key, + sizeof(buffer_key), layout_out); +} + +static VkResult +get_buffer_copy_pipeline_layout(struct vk_device *device, + struct vk_meta_device *meta, + struct vk_meta_buffer_copy_key *key, + VkDescriptorSetLayout set_layout, + VkPipelineLayout *layout_out) +{ + const char copy_key[] = "vk-meta-buffer-copy-pipeline-layout"; + const char fill_key[] = "vk-meta-buffer-fill-pipeline-layout"; + const char cimg_key[] = "vk-meta-buffer-cimg-pipeline-layout"; + + STATIC_ASSERT(sizeof(copy_key) == sizeof(fill_key)); + STATIC_ASSERT(sizeof(copy_key) == sizeof(cimg_key)); + const char *pipeline_key = + key->source == COPY_SOURCE_BUFFER ?
copy_key : fill_key; + + VkPipelineLayout from_cache = + vk_meta_lookup_pipeline_layout(meta, pipeline_key, sizeof(copy_key)); + if (from_cache != VK_NULL_HANDLE) { + *layout_out = from_cache; + return VK_SUCCESS; + } + + VkPipelineLayoutCreateInfo info = { + .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO, + .setLayoutCount = 1, + .pSetLayouts = &set_layout, + }; + + size_t push_size = 0; + if (key->source == COPY_SOURCE_PATTERN) + push_size = sizeof(uint32_t); + + const VkPushConstantRange push_range = { + .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, + .offset = 0, + .size = push_size, + }; + + if (push_size) { + info.pushConstantRangeCount = 1; + info.pPushConstantRanges = &push_range; + } + + return vk_meta_create_pipeline_layout(device, meta, &info, pipeline_key, + sizeof(copy_key), layout_out); +} + +static VkResult +get_buffer_copy_pipeline(struct vk_device *device, struct vk_meta_device *meta, + const struct vk_meta_buffer_copy_key *key, + VkPipelineLayout layout, VkPipeline *pipeline_out) +{ + VkPipeline from_cache = vk_meta_lookup_pipeline(meta, key, sizeof(*key)); + if (from_cache != VK_NULL_HANDLE) { + *pipeline_out = from_cache; + return VK_SUCCESS; + } + + const VkPipelineShaderStageNirCreateInfoMESA nir_info = { + .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_NIR_CREATE_INFO_MESA, + .nir = build_buffer_copy_shader(key), + }; + const VkPipelineShaderStageCreateInfo cs_info = { + .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, + .pNext = &nir_info, + .stage = VK_SHADER_STAGE_COMPUTE_BIT, + .pName = "main", + }; + + const VkComputePipelineCreateInfo info = { + .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO, + .stage = cs_info, + .layout = layout, + }; + + VkResult result = vk_meta_create_compute_pipeline( + device, meta, &info, key, sizeof(*key), pipeline_out); + ralloc_free(nir_info.nir); + + return result; +} + +static unsigned +alignment_of(unsigned x) +{ + return 1 << MIN2(__builtin_ctz(x), 31); +} + +struct copy_desc { + enum copy_source source; + + union { + uint32_t pattern; + + struct { + struct vk_buffer *source; + VkDeviceSize srcOffset; + } buffer; + + struct { + struct vk_image *image; + VkDescriptorImageInfo *info; + VkFormat format; + struct vk_meta_image_to_buffer_push_data push; + } image; + }; +}; + +static void +do_copy(struct vk_command_buffer *cmd, struct vk_meta_device *meta, size_t size, + struct vk_buffer *dest, VkDeviceSize dstOffset, struct copy_desc *desc) +{ + struct vk_device *device = cmd->base.device; + const struct vk_device_dispatch_table *disp = &device->dispatch_table; + VkResult result; + + /* The "alignment" of the copy is the maximum alignment that all accesses + * within the copy will satisfy.
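+ *
+ * For illustration: dstOffset = 12 and size = 8 give alignment_of(12) = 4
+ * and alignment_of(8) = 8, so (ignoring srcOffset) the copy alignment is 4
+ * and the dispatch below works in 4-byte blocks.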
+ */ + unsigned alignment = MIN2(alignment_of(dstOffset), alignment_of(size)); + + if (desc->source == COPY_SOURCE_BUFFER) + alignment = MIN2(alignment, alignment_of(desc->buffer.srcOffset)); + + struct vk_meta_buffer_copy_key key = { + .key_type = VK_META_OBJECT_KEY_FILL_PIPELINE, + .source = desc->source, + .blocksize = MIN2(alignment, 16), + }; + + VkDescriptorSetLayout set_layout; + result = get_buffer_copy_descriptor_set_layout(device, meta, &set_layout, + desc->source); + if (unlikely(result != VK_SUCCESS)) { + vk_command_buffer_set_error(cmd, result); + return; + } + + VkPipelineLayout pipeline_layout; + result = get_buffer_copy_pipeline_layout(device, meta, &key, set_layout, + &pipeline_layout); + if (unlikely(result != VK_SUCCESS)) { + vk_command_buffer_set_error(cmd, result); + return; + } + + VkDescriptorBufferInfo buffer_infos[2]; + VkWriteDescriptorSet desc_writes[2]; + + for (unsigned i = 0; i < 2; ++i) { + bool is_dest = (i == BINDING_OUTPUT); + + if (!is_dest && desc->source != COPY_SOURCE_BUFFER) + continue; + + buffer_infos[i] = (VkDescriptorBufferInfo){ + .buffer = vk_buffer_to_handle(is_dest ? dest : desc->buffer.source), + .offset = is_dest ? dstOffset : desc->buffer.srcOffset, + .range = size, + }; + + desc_writes[i] = (VkWriteDescriptorSet){ + .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET, + .dstSet = 0, + .dstBinding = i, + .descriptorType = is_dest ? VK_DESCRIPTOR_TYPE_STORAGE_BUFFER + : VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, + .descriptorCount = 1, + .pBufferInfo = &buffer_infos[i], + }; + } + + unsigned desc_count = desc->source == COPY_SOURCE_PATTERN ? 1 : 2; + disp->CmdPushDescriptorSetKHR(vk_command_buffer_to_handle(cmd), + VK_PIPELINE_BIND_POINT_COMPUTE, + pipeline_layout, 0, desc_count, desc_writes); + + VkPipeline pipeline; + result = + get_buffer_copy_pipeline(device, meta, &key, pipeline_layout, &pipeline); + if (unlikely(result != VK_SUCCESS)) { + vk_command_buffer_set_error(cmd, result); + return; + } + + disp->CmdBindPipeline(vk_command_buffer_to_handle(cmd), + VK_PIPELINE_BIND_POINT_COMPUTE, pipeline); + + if (desc->source == COPY_SOURCE_PATTERN) { + disp->CmdPushConstants(vk_command_buffer_to_handle(cmd), pipeline_layout, + VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(uint32_t), + &desc->pattern); + } + + disp->CmdDispatch(vk_command_buffer_to_handle(cmd), size / key.blocksize, 1, + 1); +} + +static void +hk_meta_fill_buffer(struct vk_command_buffer *cmd, struct vk_meta_device *meta, + struct vk_buffer *dest, VkDeviceSize dstOffset, + VkDeviceSize dstRange, uint32_t data) +{ + size_t size = ROUND_DOWN_TO(vk_buffer_range(dest, dstOffset, dstRange), 4); + dstOffset = ROUND_DOWN_TO(dstOffset, 4); + + do_copy(cmd, meta, size, dest, dstOffset, + &(struct copy_desc){ + .source = COPY_SOURCE_PATTERN, + .pattern = data, + }); +} + +static void +hk_meta_update_buffer(struct vk_command_buffer *cmd, + struct vk_meta_device *meta, struct vk_buffer *dest, + VkDeviceSize dstOffset, VkDeviceSize dstRange, + const void *data) +{ + /* Create a buffer to hold the data */ + const VkBufferCreateInfo info = { + .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, + .size = vk_buffer_range(dest, dstOffset, dstRange), + .usage = VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT, + .queueFamilyIndexCount = 1, + .pQueueFamilyIndices = &cmd->pool->queue_family_index, + }; + + VkBuffer buffer; + VkResult result = vk_meta_create_buffer(cmd, meta, &info, &buffer); + if (unlikely(result != VK_SUCCESS)) + return; + + /* Map the buffer for CPU access */ + void *map; + result = meta->cmd_bind_map_buffer(cmd, 
meta, buffer, &map); + if (unlikely(result != VK_SUCCESS)) + return; + + /* Copy from the CPU input to the staging buffer */ + memcpy(map, data, info.size); + + /* Copy between the buffers on the GPU */ + VK_FROM_HANDLE(vk_buffer, buffer_, buffer); + size_t size = ROUND_DOWN_TO(vk_buffer_range(dest, dstOffset, dstRange), 4); + dstOffset = ROUND_DOWN_TO(dstOffset, 4); + + do_copy(cmd, meta, size, dest, dstOffset, + &(struct copy_desc){ + .source = COPY_SOURCE_BUFFER, + .buffer.source = buffer_, + }); +} + +static void +hk_meta_copy_buffer2(struct vk_command_buffer *cmd, struct vk_meta_device *meta, + const VkCopyBufferInfo2 *pCopyBufferInfo) +{ + VK_FROM_HANDLE(vk_buffer, dst, pCopyBufferInfo->dstBuffer); + VK_FROM_HANDLE(vk_buffer, src, pCopyBufferInfo->srcBuffer); + + for (unsigned i = 0; i < pCopyBufferInfo->regionCount; ++i) { + const VkBufferCopy2 *copy = &pCopyBufferInfo->pRegions[i]; + + do_copy(cmd, meta, copy->size, dst, copy->dstOffset, + &(struct copy_desc){ + .source = COPY_SOURCE_BUFFER, + .buffer.source = src, + .buffer.srcOffset = copy->srcOffset, + }); + } +} + +VKAPI_ATTR void VKAPI_CALL +hk_CmdBlitImage2(VkCommandBuffer commandBuffer, + const VkBlitImageInfo2 *pBlitImageInfo) +{ + VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer); + struct hk_device *dev = hk_cmd_buffer_device(cmd); + + struct hk_meta_save save; + hk_meta_begin(cmd, &save, VK_PIPELINE_BIND_POINT_GRAPHICS); + vk_meta_blit_image2(&cmd->vk, &dev->meta, pBlitImageInfo); + hk_meta_end(cmd, &save, VK_PIPELINE_BIND_POINT_GRAPHICS); +} + +VKAPI_ATTR void VKAPI_CALL +hk_CmdResolveImage2(VkCommandBuffer commandBuffer, + const VkResolveImageInfo2 *pResolveImageInfo) +{ + VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer); + struct hk_device *dev = hk_cmd_buffer_device(cmd); + + struct hk_meta_save save; + hk_meta_begin(cmd, &save, VK_PIPELINE_BIND_POINT_GRAPHICS); + vk_meta_resolve_image2(&cmd->vk, &dev->meta, pResolveImageInfo); + hk_meta_end(cmd, &save, VK_PIPELINE_BIND_POINT_GRAPHICS); +} + +void +hk_meta_resolve_rendering(struct hk_cmd_buffer *cmd, + const VkRenderingInfo *pRenderingInfo) +{ + struct hk_device *dev = hk_cmd_buffer_device(cmd); + + struct hk_meta_save save; + hk_meta_begin(cmd, &save, VK_PIPELINE_BIND_POINT_GRAPHICS); + vk_meta_resolve_rendering(&cmd->vk, &dev->meta, pRenderingInfo); + hk_meta_end(cmd, &save, VK_PIPELINE_BIND_POINT_GRAPHICS); +} + +VKAPI_ATTR void VKAPI_CALL +hk_CmdCopyBuffer2(VkCommandBuffer commandBuffer, + const VkCopyBufferInfo2 *pCopyBufferInfo) +{ + VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer); + struct hk_device *dev = hk_cmd_buffer_device(cmd); + + struct hk_meta_save save; + hk_meta_begin(cmd, &save, VK_PIPELINE_BIND_POINT_COMPUTE); + hk_meta_copy_buffer2(&cmd->vk, &dev->meta, pCopyBufferInfo); + hk_meta_end(cmd, &save, VK_PIPELINE_BIND_POINT_COMPUTE); +} + +VKAPI_ATTR void VKAPI_CALL +hk_CmdCopyBufferToImage2(VkCommandBuffer commandBuffer, + const VkCopyBufferToImageInfo2 *pCopyBufferToImageInfo) +{ + VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer); + struct hk_device *dev = hk_cmd_buffer_device(cmd); + + struct hk_meta_save save; + hk_meta_begin(cmd, &save, VK_PIPELINE_BIND_POINT_COMPUTE); + hk_meta_copy_buffer_to_image2(&cmd->vk, &dev->meta, pCopyBufferToImageInfo); + hk_meta_end(cmd, &save, VK_PIPELINE_BIND_POINT_COMPUTE); +} + +VKAPI_ATTR void VKAPI_CALL +hk_CmdCopyImageToBuffer2(VkCommandBuffer commandBuffer, + const VkCopyImageToBufferInfo2 *pCopyImageToBufferInfo) +{ + VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer); + struct hk_device *dev = 
hk_cmd_buffer_device(cmd); + + struct hk_meta_save save; + hk_meta_begin(cmd, &save, VK_PIPELINE_BIND_POINT_COMPUTE); + hk_meta_copy_image_to_buffer2(&cmd->vk, &dev->meta, pCopyImageToBufferInfo); + hk_meta_end(cmd, &save, VK_PIPELINE_BIND_POINT_COMPUTE); +} + +VKAPI_ATTR void VKAPI_CALL +hk_CmdCopyImage2(VkCommandBuffer commandBuffer, + const VkCopyImageInfo2 *pCopyImageInfo) +{ + VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer); + struct hk_device *dev = hk_cmd_buffer_device(cmd); + + struct hk_meta_save save; + hk_meta_begin(cmd, &save, VK_PIPELINE_BIND_POINT_COMPUTE); + hk_meta_copy_image2(&cmd->vk, &dev->meta, pCopyImageInfo); + hk_meta_end(cmd, &save, VK_PIPELINE_BIND_POINT_COMPUTE); +} + +VKAPI_ATTR void VKAPI_CALL +hk_CmdFillBuffer(VkCommandBuffer commandBuffer, VkBuffer dstBuffer, + VkDeviceSize dstOffset, VkDeviceSize dstRange, uint32_t data) +{ + VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer); + VK_FROM_HANDLE(vk_buffer, buffer, dstBuffer); + struct hk_device *dev = hk_cmd_buffer_device(cmd); + + struct hk_meta_save save; + hk_meta_begin(cmd, &save, VK_PIPELINE_BIND_POINT_COMPUTE); + hk_meta_fill_buffer(&cmd->vk, &dev->meta, buffer, dstOffset, dstRange, data); + hk_meta_end(cmd, &save, VK_PIPELINE_BIND_POINT_COMPUTE); +} + +VKAPI_ATTR void VKAPI_CALL +hk_CmdUpdateBuffer(VkCommandBuffer commandBuffer, VkBuffer dstBuffer, + VkDeviceSize dstOffset, VkDeviceSize dstRange, + const void *pData) +{ + VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer); + VK_FROM_HANDLE(vk_buffer, buffer, dstBuffer); + struct hk_device *dev = hk_cmd_buffer_device(cmd); + + struct hk_meta_save save; + hk_meta_begin(cmd, &save, VK_PIPELINE_BIND_POINT_COMPUTE); + hk_meta_update_buffer(&cmd->vk, &dev->meta, buffer, dstOffset, dstRange, + pData); + hk_meta_end(cmd, &save, VK_PIPELINE_BIND_POINT_COMPUTE); +} + +VKAPI_ATTR void VKAPI_CALL +hk_CmdClearAttachments(VkCommandBuffer commandBuffer, uint32_t attachmentCount, + const VkClearAttachment *pAttachments, + uint32_t rectCount, const VkClearRect *pRects) +{ + VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer); + struct hk_device *dev = hk_cmd_buffer_device(cmd); + + struct vk_meta_rendering_info render_info; + hk_meta_init_render(cmd, &render_info); + + struct hk_meta_save save; + hk_meta_begin(cmd, &save, VK_PIPELINE_BIND_POINT_GRAPHICS); + vk_meta_clear_attachments(&cmd->vk, &dev->meta, &render_info, + attachmentCount, pAttachments, rectCount, pRects); + hk_meta_end(cmd, &save, VK_PIPELINE_BIND_POINT_GRAPHICS); +} diff --git a/src/asahi/vulkan/hk_cmd_pool.c b/src/asahi/vulkan/hk_cmd_pool.c new file mode 100644 index 00000000000..a3f2a85468a --- /dev/null +++ b/src/asahi/vulkan/hk_cmd_pool.c @@ -0,0 +1,146 @@ +/* + * Copyright 2024 Valve Corporation + * Copyright 2024 Alyssa Rosenzweig + * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc. + * SPDX-License-Identifier: MIT + */ +#include "hk_cmd_pool.h" +#include "asahi/lib/agx_bo.h" + +#include "hk_device.h" +#include "hk_entrypoints.h" +#include "hk_physical_device.h" + +static VkResult +hk_cmd_bo_create(struct hk_cmd_pool *pool, bool usc, struct hk_cmd_bo **bo_out) +{ + struct hk_device *dev = hk_cmd_pool_device(pool); + struct hk_cmd_bo *bo; + + bo = vk_zalloc(&pool->vk.alloc, sizeof(*bo), 8, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + if (bo == NULL) + return vk_error(pool, VK_ERROR_OUT_OF_HOST_MEMORY); + + bo->bo = agx_bo_create(&dev->dev, HK_CMD_BO_SIZE, usc ? 
AGX_BO_LOW_VA : 0, + "Command pool"); + if (bo->bo == NULL) { + vk_free(&pool->vk.alloc, bo); + return vk_error(pool, VK_ERROR_OUT_OF_DEVICE_MEMORY); + } + + bo->map = bo->bo->ptr.cpu; + + *bo_out = bo; + return VK_SUCCESS; +} + +static void +hk_cmd_bo_destroy(struct hk_cmd_pool *pool, struct hk_cmd_bo *bo) +{ + agx_bo_unreference(bo->bo); + vk_free(&pool->vk.alloc, bo); +} + +VKAPI_ATTR VkResult VKAPI_CALL +hk_CreateCommandPool(VkDevice _device, + const VkCommandPoolCreateInfo *pCreateInfo, + const VkAllocationCallbacks *pAllocator, + VkCommandPool *pCmdPool) +{ + VK_FROM_HANDLE(hk_device, device, _device); + struct hk_cmd_pool *pool; + + pool = vk_alloc2(&device->vk.alloc, pAllocator, sizeof(*pool), 8, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + if (pool == NULL) + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + + VkResult result = + vk_command_pool_init(&device->vk, &pool->vk, pCreateInfo, pAllocator); + if (result != VK_SUCCESS) { + vk_free2(&device->vk.alloc, pAllocator, pool); + return result; + } + + list_inithead(&pool->free_bos); + list_inithead(&pool->free_usc_bos); + + *pCmdPool = hk_cmd_pool_to_handle(pool); + + return VK_SUCCESS; +} + +static void +hk_cmd_pool_destroy_bos(struct hk_cmd_pool *pool) +{ + list_for_each_entry_safe(struct hk_cmd_bo, bo, &pool->free_bos, link) + hk_cmd_bo_destroy(pool, bo); + + list_inithead(&pool->free_bos); + + list_for_each_entry_safe(struct hk_cmd_bo, bo, &pool->free_usc_bos, link) + hk_cmd_bo_destroy(pool, bo); + + list_inithead(&pool->free_usc_bos); +} + +VkResult +hk_cmd_pool_alloc_bo(struct hk_cmd_pool *pool, bool usc, + struct hk_cmd_bo **bo_out) +{ + struct hk_cmd_bo *bo = NULL; + if (usc) { + if (!list_is_empty(&pool->free_usc_bos)) + bo = list_first_entry(&pool->free_usc_bos, struct hk_cmd_bo, link); + } else { + if (!list_is_empty(&pool->free_bos)) + bo = list_first_entry(&pool->free_bos, struct hk_cmd_bo, link); + } + if (bo) { + list_del(&bo->link); + *bo_out = bo; + return VK_SUCCESS; + } + + return hk_cmd_bo_create(pool, usc, bo_out); +} + +void +hk_cmd_pool_free_bo_list(struct hk_cmd_pool *pool, struct list_head *bos) +{ + list_splicetail(bos, &pool->free_bos); + list_inithead(bos); +} + +void +hk_cmd_pool_free_usc_bo_list(struct hk_cmd_pool *pool, struct list_head *bos) +{ + list_splicetail(bos, &pool->free_usc_bos); + list_inithead(bos); +} + +VKAPI_ATTR void VKAPI_CALL +hk_DestroyCommandPool(VkDevice _device, VkCommandPool commandPool, + const VkAllocationCallbacks *pAllocator) +{ + VK_FROM_HANDLE(hk_device, device, _device); + VK_FROM_HANDLE(hk_cmd_pool, pool, commandPool); + + if (!pool) + return; + + vk_command_pool_finish(&pool->vk); + hk_cmd_pool_destroy_bos(pool); + vk_free2(&device->vk.alloc, pAllocator, pool); +} + +VKAPI_ATTR void VKAPI_CALL +hk_TrimCommandPool(VkDevice device, VkCommandPool commandPool, + VkCommandPoolTrimFlags flags) +{ + VK_FROM_HANDLE(hk_cmd_pool, pool, commandPool); + + vk_command_pool_trim(&pool->vk, flags); + hk_cmd_pool_destroy_bos(pool); +} diff --git a/src/asahi/vulkan/hk_cmd_pool.h b/src/asahi/vulkan/hk_cmd_pool.h new file mode 100644 index 00000000000..dbac305f833 --- /dev/null +++ b/src/asahi/vulkan/hk_cmd_pool.h @@ -0,0 +1,49 @@ +/* + * Copyright 2024 Valve Corporation + * Copyright 2024 Alyssa Rosenzweig + * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc. 
+ * SPDX-License-Identifier: MIT + */ + +#pragma once + +#include "hk_private.h" + +#include "vk_command_pool.h" + +/* XXX: FIXME */ +#define HK_CMD_BO_SIZE 1024 * 1024 + +/* Recyclable command buffer BO, used for both push buffers and upload */ +struct hk_cmd_bo { + struct agx_bo *bo; + + void *map; + + /** Link in hk_cmd_pool::free_bos or hk_cmd_buffer::bos */ + struct list_head link; +}; + +struct hk_cmd_pool { + struct vk_command_pool vk; + + /** List of hk_cmd_bo */ + struct list_head free_bos; + struct list_head free_usc_bos; +}; + +VK_DEFINE_NONDISP_HANDLE_CASTS(hk_cmd_pool, vk.base, VkCommandPool, + VK_OBJECT_TYPE_COMMAND_POOL) + +static inline struct hk_device * +hk_cmd_pool_device(struct hk_cmd_pool *pool) +{ + return (struct hk_device *)pool->vk.base.device; +} + +VkResult hk_cmd_pool_alloc_bo(struct hk_cmd_pool *pool, bool force_usc, + struct hk_cmd_bo **bo_out); + +void hk_cmd_pool_free_bo_list(struct hk_cmd_pool *pool, struct list_head *bos); +void hk_cmd_pool_free_usc_bo_list(struct hk_cmd_pool *pool, + struct list_head *bos); diff --git a/src/asahi/vulkan/hk_descriptor_set.c b/src/asahi/vulkan/hk_descriptor_set.c new file mode 100644 index 00000000000..b59a9ac4b57 --- /dev/null +++ b/src/asahi/vulkan/hk_descriptor_set.c @@ -0,0 +1,794 @@ +/* + * Copyright 2024 Valve Corporation + * Copyright 2024 Alyssa Rosenzweig + * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc. + * SPDX-License-Identifier: MIT + */ +#include "hk_descriptor_set.h" +#include "asahi/lib/agx_bo.h" +#include "vulkan/vulkan_core.h" + +#include "hk_buffer.h" +#include "hk_buffer_view.h" +#include "hk_descriptor_set_layout.h" +#include "hk_device.h" +#include "hk_entrypoints.h" +#include "hk_image_view.h" +#include "hk_physical_device.h" +#include "hk_sampler.h" + +static inline uint32_t +align_u32(uint32_t v, uint32_t a) +{ + assert(a != 0 && a == (a & -a)); + return (v + a - 1) & ~(a - 1); +} + +static inline void * +desc_ubo_data(struct hk_descriptor_set *set, uint32_t binding, uint32_t elem, + uint32_t *size_out) +{ + const struct hk_descriptor_set_binding_layout *binding_layout = + &set->layout->binding[binding]; + + uint32_t offset = binding_layout->offset + elem * binding_layout->stride; + assert(offset < set->size); + + if (size_out != NULL) + *size_out = set->size - offset; + + return (char *)set->mapped_ptr + offset; +} + +static void +write_desc(struct hk_descriptor_set *set, uint32_t binding, uint32_t elem, + const void *desc_data, size_t desc_size) +{ + ASSERTED uint32_t dst_size; + void *dst = desc_ubo_data(set, binding, elem, &dst_size); + assert(desc_size <= dst_size); + memcpy(dst, desc_data, desc_size); +} + +static void +write_sampled_image_view_desc(struct hk_descriptor_set *set, + const VkDescriptorImageInfo *const info, + uint32_t binding, uint32_t elem, + VkDescriptorType descriptor_type) +{ + struct hk_sampled_image_descriptor desc[3] = {}; + assert(HK_NULL_TEX_OFFSET == 0 && "zero initialized so null descs implicit"); + + uint8_t plane_count = 1; + bool ia = (descriptor_type == VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT); + + if (descriptor_type != VK_DESCRIPTOR_TYPE_SAMPLER && info && + info->imageView != VK_NULL_HANDLE) { + VK_FROM_HANDLE(hk_image_view, view, info->imageView); + + plane_count = view->plane_count; + for (uint8_t plane = 0; plane < plane_count; plane++) { + unsigned index = ia ? 
view->planes[plane].ia_desc_index + : view->planes[plane].sampled_desc_index; + + assert(index < (1 << 20)); + desc[plane].image_offset = index * HK_IMAGE_STRIDE; + } + } + + if (descriptor_type == VK_DESCRIPTOR_TYPE_SAMPLER || + descriptor_type == VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER) { + const struct hk_descriptor_set_binding_layout *binding_layout = + &set->layout->binding[binding]; + + struct hk_sampler *sampler; + if (binding_layout->immutable_samplers) { + sampler = binding_layout->immutable_samplers[elem]; + } else { + sampler = hk_sampler_from_handle(info->sampler); + } + + if (sampler->has_border) + assert(plane_count == 1); + else + plane_count = MAX2(plane_count, sampler->plane_count); + + for (uint8_t plane = 0; plane < plane_count; plane++) { + /* We need to replicate the last sampler plane out to all image + * planes due to sampler table entry limitations. See + * hk_CreateSampler in hk_sampler.c for more details. + */ + uint8_t sampler_plane = MIN2(plane, sampler->plane_count - 1); + assert(sampler->planes[sampler_plane].hw->index < (1 << 12)); + + /* All bindless samplers are indexed from 28 in hardware, add here so + * we don't have to care in the shader. + */ + desc[plane].sampler_index = + sampler->planes[sampler_plane].hw->index + 28; + desc[plane].lod_bias_fp16 = sampler->lod_bias_fp16; + desc[plane].has_border = sampler->has_border; + } + + if (sampler->has_border) { + assert(sampler->plane_count == 2); + desc[0].clamp_0_sampler_index = sampler->planes[1].hw->index + 28; + + static_assert(sizeof(desc[0].border) == sizeof(sampler->custom_border), + "fixed format"); + + memcpy(desc[0].border, sampler->custom_border.uint32, + sizeof(sampler->custom_border)); + } + } + write_desc(set, binding, elem, desc, sizeof(desc[0]) * plane_count); +} + +static void +write_storage_image_view_desc(struct hk_descriptor_set *set, + const VkDescriptorImageInfo *const info, + uint32_t binding, uint32_t elem) +{ + struct hk_storage_image_descriptor desc = {}; + + if (info && info->imageView != VK_NULL_HANDLE) { + VK_FROM_HANDLE(hk_image_view, view, info->imageView); + + /* Storage images are always single plane */ + assert(view->plane_count == 1); + uint8_t plane = 0; + + desc.tex_offset = + view->planes[plane].ro_storage_desc_index * HK_IMAGE_STRIDE; + + desc.pbe_offset = + view->planes[plane].storage_desc_index * HK_IMAGE_STRIDE; + } else { + desc.tex_offset = HK_NULL_TEX_OFFSET; + desc.pbe_offset = HK_NULL_PBE_OFFSET; + } + + write_desc(set, binding, elem, &desc, sizeof(desc)); +} + +static void +write_buffer_desc(struct hk_descriptor_set *set, + const VkDescriptorBufferInfo *const info, uint32_t binding, + uint32_t elem) +{ + VK_FROM_HANDLE(hk_buffer, buffer, info->buffer); + + const struct hk_addr_range addr_range = + hk_buffer_addr_range(buffer, info->offset, info->range); + assert(addr_range.range <= UINT32_MAX); + + const struct hk_buffer_address desc = { + .base_addr = addr_range.addr, + .size = addr_range.range, + }; + write_desc(set, binding, elem, &desc, sizeof(desc)); +} + +static void +write_dynamic_buffer_desc(struct hk_descriptor_set *set, + const VkDescriptorBufferInfo *const info, + uint32_t binding, uint32_t elem) +{ + VK_FROM_HANDLE(hk_buffer, buffer, info->buffer); + const struct hk_descriptor_set_binding_layout *binding_layout = + &set->layout->binding[binding]; + + const struct hk_addr_range addr_range = + hk_buffer_addr_range(buffer, info->offset, info->range); + assert(addr_range.range <= UINT32_MAX); + + struct hk_buffer_address *desc = + 
&set->dynamic_buffers[binding_layout->dynamic_buffer_index + elem]; + *desc = (struct hk_buffer_address){ + .base_addr = addr_range.addr, + .size = addr_range.range, + }; +} + +static void +write_buffer_view_desc(struct hk_descriptor_set *set, + const VkBufferView bufferView, uint32_t binding, + uint32_t elem) +{ + struct hk_buffer_view_descriptor desc = {}; + if (bufferView != VK_NULL_HANDLE) { + VK_FROM_HANDLE(hk_buffer_view, view, bufferView); + + assert(view->tex_desc_index < (1 << 20)); + assert(view->pbe_desc_index < (1 << 20)); + + desc.tex_offset = view->tex_desc_index * HK_IMAGE_STRIDE; + desc.pbe_offset = view->pbe_desc_index * HK_IMAGE_STRIDE; + } else { + desc.tex_offset = HK_NULL_TEX_OFFSET; + desc.pbe_offset = HK_NULL_PBE_OFFSET; + } + + write_desc(set, binding, elem, &desc, sizeof(desc)); +} + +static void +write_inline_uniform_data(struct hk_descriptor_set *set, + const VkWriteDescriptorSetInlineUniformBlock *info, + uint32_t binding, uint32_t offset) +{ + assert(set->layout->binding[binding].stride == 1); + write_desc(set, binding, offset, info->pData, info->dataSize); +} + +VKAPI_ATTR void VKAPI_CALL +hk_UpdateDescriptorSets(VkDevice device, uint32_t descriptorWriteCount, + const VkWriteDescriptorSet *pDescriptorWrites, + uint32_t descriptorCopyCount, + const VkCopyDescriptorSet *pDescriptorCopies) +{ + for (uint32_t w = 0; w < descriptorWriteCount; w++) { + const VkWriteDescriptorSet *write = &pDescriptorWrites[w]; + VK_FROM_HANDLE(hk_descriptor_set, set, write->dstSet); + + switch (write->descriptorType) { + case VK_DESCRIPTOR_TYPE_SAMPLER: + case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER: + case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE: + case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT: + for (uint32_t j = 0; j < write->descriptorCount; j++) { + write_sampled_image_view_desc( + set, write->pImageInfo + j, write->dstBinding, + write->dstArrayElement + j, write->descriptorType); + } + break; + + case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE: + for (uint32_t j = 0; j < write->descriptorCount; j++) { + write_storage_image_view_desc(set, write->pImageInfo + j, + write->dstBinding, + write->dstArrayElement + j); + } + break; + + case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER: + case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER: + for (uint32_t j = 0; j < write->descriptorCount; j++) { + write_buffer_view_desc(set, write->pTexelBufferView[j], + write->dstBinding, + write->dstArrayElement + j); + } + break; + + case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER: + case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER: + for (uint32_t j = 0; j < write->descriptorCount; j++) { + write_buffer_desc(set, write->pBufferInfo + j, write->dstBinding, + write->dstArrayElement + j); + } + break; + + case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC: + case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: + for (uint32_t j = 0; j < write->descriptorCount; j++) { + write_dynamic_buffer_desc(set, write->pBufferInfo + j, + write->dstBinding, + write->dstArrayElement + j); + } + break; + + case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK: { + const VkWriteDescriptorSetInlineUniformBlock *write_inline = + vk_find_struct_const(write->pNext, + WRITE_DESCRIPTOR_SET_INLINE_UNIFORM_BLOCK); + assert(write_inline->dataSize == write->descriptorCount); + write_inline_uniform_data(set, write_inline, write->dstBinding, + write->dstArrayElement); + break; + } + + default: + break; + } + } + + for (uint32_t i = 0; i < descriptorCopyCount; i++) { + const VkCopyDescriptorSet *copy = &pDescriptorCopies[i]; + VK_FROM_HANDLE(hk_descriptor_set, src, copy->srcSet); + 
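/* Descriptor payloads are copied below with MIN2(dst stride, src stride),
+ * presumably because the source and destination bindings (e.g. mutable
+ * descriptors) may use different strides.
+ */ +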
VK_FROM_HANDLE(hk_descriptor_set, dst, copy->dstSet); + + const struct hk_descriptor_set_binding_layout *src_binding_layout = + &src->layout->binding[copy->srcBinding]; + const struct hk_descriptor_set_binding_layout *dst_binding_layout = + &dst->layout->binding[copy->dstBinding]; + + if (dst_binding_layout->stride > 0 && src_binding_layout->stride > 0) { + for (uint32_t j = 0; j < copy->descriptorCount; j++) { + ASSERTED uint32_t dst_max_size, src_max_size; + void *dst_map = desc_ubo_data( + dst, copy->dstBinding, copy->dstArrayElement + j, &dst_max_size); + const void *src_map = desc_ubo_data( + src, copy->srcBinding, copy->srcArrayElement + j, &src_max_size); + const uint32_t copy_size = + MIN2(dst_binding_layout->stride, src_binding_layout->stride); + assert(copy_size <= dst_max_size && copy_size <= src_max_size); + memcpy(dst_map, src_map, copy_size); + } + } + + switch (src_binding_layout->type) { + case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC: + case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: { + const uint32_t dst_dyn_start = + dst_binding_layout->dynamic_buffer_index + copy->dstArrayElement; + const uint32_t src_dyn_start = + src_binding_layout->dynamic_buffer_index + copy->srcArrayElement; + typed_memcpy(&dst->dynamic_buffers[dst_dyn_start], + &src->dynamic_buffers[src_dyn_start], + copy->descriptorCount); + break; + } + default: + break; + } + } +} + +void +hk_push_descriptor_set_update(struct hk_push_descriptor_set *push_set, + struct hk_descriptor_set_layout *layout, + uint32_t write_count, + const VkWriteDescriptorSet *writes) +{ + assert(layout->non_variable_descriptor_buffer_size < sizeof(push_set->data)); + struct hk_descriptor_set set = { + .layout = layout, + .size = sizeof(push_set->data), + .mapped_ptr = push_set->data, + }; + + for (uint32_t w = 0; w < write_count; w++) { + const VkWriteDescriptorSet *write = &writes[w]; + assert(write->dstSet == VK_NULL_HANDLE); + + switch (write->descriptorType) { + case VK_DESCRIPTOR_TYPE_SAMPLER: + case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER: + case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE: + case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT: + for (uint32_t j = 0; j < write->descriptorCount; j++) { + write_sampled_image_view_desc( + &set, write->pImageInfo + j, write->dstBinding, + write->dstArrayElement + j, write->descriptorType); + } + break; + + case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE: + for (uint32_t j = 0; j < write->descriptorCount; j++) { + write_storage_image_view_desc(&set, write->pImageInfo + j, + write->dstBinding, + write->dstArrayElement + j); + } + break; + + case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER: + case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER: + for (uint32_t j = 0; j < write->descriptorCount; j++) { + write_buffer_view_desc(&set, write->pTexelBufferView[j], + write->dstBinding, + write->dstArrayElement + j); + } + break; + + case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER: + case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER: + for (uint32_t j = 0; j < write->descriptorCount; j++) { + write_buffer_desc(&set, write->pBufferInfo + j, write->dstBinding, + write->dstArrayElement + j); + } + break; + + default: + break; + } + } +} + +static void hk_descriptor_pool_free(struct hk_descriptor_pool *pool, + uint64_t addr, uint64_t size); + +static void +hk_descriptor_set_destroy(struct hk_device *dev, + struct hk_descriptor_pool *pool, + struct hk_descriptor_set *set) +{ + list_del(&set->link); + if (set->size > 0) + hk_descriptor_pool_free(pool, set->addr, set->size); + vk_descriptor_set_layout_unref(&dev->vk, &set->layout->vk); + + 
vk_object_free(&dev->vk, NULL, set); +} + +static void +hk_destroy_descriptor_pool(struct hk_device *dev, + const VkAllocationCallbacks *pAllocator, + struct hk_descriptor_pool *pool) +{ + list_for_each_entry_safe(struct hk_descriptor_set, set, &pool->sets, link) + hk_descriptor_set_destroy(dev, pool, set); + + util_vma_heap_finish(&pool->heap); + agx_bo_unreference(pool->bo); + + vk_object_free(&dev->vk, pAllocator, pool); +} + +VKAPI_ATTR VkResult VKAPI_CALL +hk_CreateDescriptorPool(VkDevice _device, + const VkDescriptorPoolCreateInfo *pCreateInfo, + const VkAllocationCallbacks *pAllocator, + VkDescriptorPool *pDescriptorPool) +{ + VK_FROM_HANDLE(hk_device, dev, _device); + struct hk_physical_device *pdev = hk_device_physical(dev); + struct hk_descriptor_pool *pool; + + pool = vk_object_zalloc(&dev->vk, pAllocator, sizeof(*pool), + VK_OBJECT_TYPE_DESCRIPTOR_POOL); + if (!pool) + return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY); + + list_inithead(&pool->sets); + + const VkMutableDescriptorTypeCreateInfoEXT *mutable_info = + vk_find_struct_const(pCreateInfo->pNext, + MUTABLE_DESCRIPTOR_TYPE_CREATE_INFO_EXT); + + uint32_t max_align = 0; + for (unsigned i = 0; i < pCreateInfo->poolSizeCount; ++i) { + const VkMutableDescriptorTypeListEXT *type_list = NULL; + if (pCreateInfo->pPoolSizes[i].type == VK_DESCRIPTOR_TYPE_MUTABLE_EXT && + mutable_info && i < mutable_info->mutableDescriptorTypeListCount) + type_list = &mutable_info->pMutableDescriptorTypeLists[i]; + + uint32_t stride, alignment; + hk_descriptor_stride_align_for_type(pdev, pCreateInfo->pPoolSizes[i].type, + type_list, &stride, &alignment); + max_align = MAX2(max_align, alignment); + } + + uint64_t bo_size = 0; + for (unsigned i = 0; i < pCreateInfo->poolSizeCount; ++i) { + const VkMutableDescriptorTypeListEXT *type_list = NULL; + if (pCreateInfo->pPoolSizes[i].type == VK_DESCRIPTOR_TYPE_MUTABLE_EXT && + mutable_info && i < mutable_info->mutableDescriptorTypeListCount) + type_list = &mutable_info->pMutableDescriptorTypeLists[i]; + + uint32_t stride, alignment; + hk_descriptor_stride_align_for_type(pdev, pCreateInfo->pPoolSizes[i].type, + type_list, &stride, &alignment); + bo_size += + MAX2(stride, max_align) * pCreateInfo->pPoolSizes[i].descriptorCount; + } + + /* Individual descriptor sets are aligned to the min UBO alignment to + * ensure that we don't end up with unaligned data access in any shaders. + * This means that each descriptor buffer allocated may burn up to 16B of + * extra space to get the right alignment. (Technically, it's at most 28B + * because we're always going to start at least 4B aligned but we're being + * conservative here.) Allocate enough extra space that we can chop it + * into maxSets pieces and align each one of them to 32B. + */ + bo_size += HK_MIN_UBO_ALIGNMENT * pCreateInfo->maxSets; + + if (bo_size) { + pool->bo = agx_bo_create(&dev->dev, bo_size, 0, "Descriptor pool"); + if (!pool->bo) { + hk_destroy_descriptor_pool(dev, pAllocator, pool); + return vk_error(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY); + } + + pool->mapped_ptr = pool->bo->ptr.cpu; + + /* The BO may be larger thanks to GPU page alignment. We may as well + * make that extra space available to the client. 
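+ * (This is also why the VMA heap below is initialized with pool->bo->size
+ * rather than the requested bo_size.)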
+ */ + assert(pool->bo->size >= bo_size); + util_vma_heap_init(&pool->heap, pool->bo->ptr.gpu, pool->bo->size); + } else { + util_vma_heap_init(&pool->heap, 0, 0); + } + + *pDescriptorPool = hk_descriptor_pool_to_handle(pool); + return VK_SUCCESS; +} + +static VkResult +hk_descriptor_pool_alloc(struct hk_descriptor_pool *pool, uint64_t size, + uint64_t alignment, uint64_t *addr_out, void **map_out) +{ + assert(size > 0); + uint64_t addr = util_vma_heap_alloc(&pool->heap, size, alignment); + if (addr == 0) + return VK_ERROR_OUT_OF_POOL_MEMORY; + + assert(addr >= pool->bo->ptr.gpu); + assert(addr + size <= pool->bo->ptr.gpu + pool->bo->size); + uint64_t offset = addr - pool->bo->ptr.gpu; + + *addr_out = addr; + *map_out = pool->mapped_ptr + offset; + + return VK_SUCCESS; +} + +static void +hk_descriptor_pool_free(struct hk_descriptor_pool *pool, uint64_t addr, + uint64_t size) +{ + assert(size > 0); + assert(addr >= pool->bo->ptr.gpu); + assert(addr + size <= pool->bo->ptr.gpu + pool->bo->size); + util_vma_heap_free(&pool->heap, addr, size); +} + +static VkResult +hk_descriptor_set_create(struct hk_device *dev, struct hk_descriptor_pool *pool, + struct hk_descriptor_set_layout *layout, + uint32_t variable_count, + struct hk_descriptor_set **out_set) +{ + struct hk_descriptor_set *set; + VkResult result; + + uint32_t mem_size = + sizeof(struct hk_descriptor_set) + + layout->dynamic_buffer_count * sizeof(struct hk_buffer_address); + + set = + vk_object_zalloc(&dev->vk, NULL, mem_size, VK_OBJECT_TYPE_DESCRIPTOR_SET); + if (!set) + return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY); + + set->size = layout->non_variable_descriptor_buffer_size; + + if (layout->binding_count > 0 && + (layout->binding[layout->binding_count - 1].flags & + VK_DESCRIPTOR_BINDING_VARIABLE_DESCRIPTOR_COUNT_BIT)) { + uint32_t stride = layout->binding[layout->binding_count - 1].stride; + set->size += stride * variable_count; + } + + set->size = align64(set->size, HK_MIN_UBO_ALIGNMENT); + + if (set->size > 0) { + result = hk_descriptor_pool_alloc(pool, set->size, HK_MIN_UBO_ALIGNMENT, + &set->addr, &set->mapped_ptr); + if (result != VK_SUCCESS) { + vk_object_free(&dev->vk, NULL, set); + return result; + } + } + + vk_descriptor_set_layout_ref(&layout->vk); + set->layout = layout; + + for (uint32_t b = 0; b < layout->binding_count; b++) { + if (layout->binding[b].type != VK_DESCRIPTOR_TYPE_SAMPLER && + layout->binding[b].type != VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER) + continue; + + if (layout->binding[b].immutable_samplers == NULL) + continue; + + uint32_t array_size = layout->binding[b].array_size; + if (layout->binding[b].flags & + VK_DESCRIPTOR_BINDING_VARIABLE_DESCRIPTOR_COUNT_BIT) + array_size = variable_count; + + for (uint32_t j = 0; j < array_size; j++) { + write_sampled_image_view_desc(set, NULL, b, j, + layout->binding[b].type); + } + } + + list_addtail(&set->link, &pool->sets); + *out_set = set; + + return VK_SUCCESS; +} + +VKAPI_ATTR VkResult VKAPI_CALL +hk_AllocateDescriptorSets(VkDevice device, + const VkDescriptorSetAllocateInfo *pAllocateInfo, + VkDescriptorSet *pDescriptorSets) +{ + VK_FROM_HANDLE(hk_device, dev, device); + VK_FROM_HANDLE(hk_descriptor_pool, pool, pAllocateInfo->descriptorPool); + + VkResult result = VK_SUCCESS; + uint32_t i; + + struct hk_descriptor_set *set = NULL; + + const VkDescriptorSetVariableDescriptorCountAllocateInfo *var_desc_count = + vk_find_struct_const( + pAllocateInfo->pNext, + DESCRIPTOR_SET_VARIABLE_DESCRIPTOR_COUNT_ALLOCATE_INFO); + + /* allocate a set of buffers for 
each shader to contain descriptors */ + for (i = 0; i < pAllocateInfo->descriptorSetCount; i++) { + VK_FROM_HANDLE(hk_descriptor_set_layout, layout, + pAllocateInfo->pSetLayouts[i]); + /* If descriptorSetCount is zero or this structure is not included in + * the pNext chain, then the variable lengths are considered to be zero. + */ + const uint32_t variable_count = + var_desc_count && var_desc_count->descriptorSetCount > 0 + ? var_desc_count->pDescriptorCounts[i] + : 0; + + result = + hk_descriptor_set_create(dev, pool, layout, variable_count, &set); + if (result != VK_SUCCESS) + break; + + pDescriptorSets[i] = hk_descriptor_set_to_handle(set); + } + + if (result != VK_SUCCESS) { + hk_FreeDescriptorSets(device, pAllocateInfo->descriptorPool, i, + pDescriptorSets); + for (i = 0; i < pAllocateInfo->descriptorSetCount; i++) { + pDescriptorSets[i] = VK_NULL_HANDLE; + } + } + return result; +} + +VKAPI_ATTR VkResult VKAPI_CALL +hk_FreeDescriptorSets(VkDevice device, VkDescriptorPool descriptorPool, + uint32_t descriptorSetCount, + const VkDescriptorSet *pDescriptorSets) +{ + VK_FROM_HANDLE(hk_device, dev, device); + VK_FROM_HANDLE(hk_descriptor_pool, pool, descriptorPool); + + for (uint32_t i = 0; i < descriptorSetCount; i++) { + VK_FROM_HANDLE(hk_descriptor_set, set, pDescriptorSets[i]); + + if (set) + hk_descriptor_set_destroy(dev, pool, set); + } + return VK_SUCCESS; +} + +VKAPI_ATTR void VKAPI_CALL +hk_DestroyDescriptorPool(VkDevice device, VkDescriptorPool _pool, + const VkAllocationCallbacks *pAllocator) +{ + VK_FROM_HANDLE(hk_device, dev, device); + VK_FROM_HANDLE(hk_descriptor_pool, pool, _pool); + + if (!_pool) + return; + + hk_destroy_descriptor_pool(dev, pAllocator, pool); +} + +VKAPI_ATTR VkResult VKAPI_CALL +hk_ResetDescriptorPool(VkDevice device, VkDescriptorPool descriptorPool, + VkDescriptorPoolResetFlags flags) +{ + VK_FROM_HANDLE(hk_device, dev, device); + VK_FROM_HANDLE(hk_descriptor_pool, pool, descriptorPool); + + list_for_each_entry_safe(struct hk_descriptor_set, set, &pool->sets, link) + hk_descriptor_set_destroy(dev, pool, set); + + return VK_SUCCESS; +} + +static void +hk_descriptor_set_write_template( + struct hk_descriptor_set *set, + const struct vk_descriptor_update_template *template, const void *data) +{ + for (uint32_t i = 0; i < template->entry_count; i++) { + const struct vk_descriptor_template_entry *entry = &template->entries[i]; + + switch (entry->type) { + case VK_DESCRIPTOR_TYPE_SAMPLER: + case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER: + case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE: + case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT: + for (uint32_t j = 0; j < entry->array_count; j++) { + const VkDescriptorImageInfo *info = + data + entry->offset + j * entry->stride; + + write_sampled_image_view_desc(set, info, entry->binding, + entry->array_element + j, + entry->type); + } + break; + + case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE: + for (uint32_t j = 0; j < entry->array_count; j++) { + const VkDescriptorImageInfo *info = + data + entry->offset + j * entry->stride; + + write_storage_image_view_desc(set, info, entry->binding, + entry->array_element + j); + } + break; + + case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER: + case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER: + for (uint32_t j = 0; j < entry->array_count; j++) { + const VkBufferView *bview = + data + entry->offset + j * entry->stride; + + write_buffer_view_desc(set, *bview, entry->binding, + entry->array_element + j); + } + break; + + case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER: + case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER: + for 
(uint32_t j = 0; j < entry->array_count; j++) { + const VkDescriptorBufferInfo *info = + data + entry->offset + j * entry->stride; + + write_buffer_desc(set, info, entry->binding, + entry->array_element + j); + } + break; + + case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC: + case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: + for (uint32_t j = 0; j < entry->array_count; j++) { + const VkDescriptorBufferInfo *info = + data + entry->offset + j * entry->stride; + + write_dynamic_buffer_desc(set, info, entry->binding, + entry->array_element + j); + } + break; + + case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK: + write_desc(set, entry->binding, entry->array_element, + data + entry->offset, entry->array_count); + break; + + default: + break; + } + } +} + +VKAPI_ATTR void VKAPI_CALL +hk_UpdateDescriptorSetWithTemplate( + VkDevice device, VkDescriptorSet descriptorSet, + VkDescriptorUpdateTemplate descriptorUpdateTemplate, const void *pData) +{ + VK_FROM_HANDLE(hk_descriptor_set, set, descriptorSet); + VK_FROM_HANDLE(vk_descriptor_update_template, template, + descriptorUpdateTemplate); + + hk_descriptor_set_write_template(set, template, pData); +} + +void +hk_push_descriptor_set_update_template( + struct hk_push_descriptor_set *push_set, + struct hk_descriptor_set_layout *layout, + const struct vk_descriptor_update_template *template, const void *data) +{ + struct hk_descriptor_set tmp_set = { + .layout = layout, + .size = sizeof(push_set->data), + .mapped_ptr = push_set->data, + }; + hk_descriptor_set_write_template(&tmp_set, template, data); +} diff --git a/src/asahi/vulkan/hk_descriptor_set.h b/src/asahi/vulkan/hk_descriptor_set.h new file mode 100644 index 00000000000..88606654df2 --- /dev/null +++ b/src/asahi/vulkan/hk_descriptor_set.h @@ -0,0 +1,107 @@ +/* + * Copyright 2024 Valve Corporation + * Copyright 2024 Alyssa Rosenzweig + * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc. + * SPDX-License-Identifier: MIT + */ + +#pragma once + +#include "hk_private.h" + +#include "hk_device.h" +#include "vk_descriptor_update_template.h" +#include "vk_object.h" + +#include "util/list.h" +#include "util/vma.h" + +/* Stride of the image heap, equal to the size of a texture/PBE descriptor */ +#define HK_IMAGE_STRIDE (24) + +struct hk_descriptor_set_layout; + +struct hk_sampled_image_descriptor { + uint32_t image_offset; + uint16_t sampler_index; + uint16_t lod_bias_fp16; + /* TODO: This should probably be a heap! */ + uint32_t border[4]; + /* XXX: Single bit! Tuck it in somewhere else */ + uint32_t has_border; + uint16_t clamp_0_sampler_index; + uint16_t pad_0; +}; +static_assert(sizeof(struct hk_sampled_image_descriptor) == 32, + "hk_sampled_image_descriptor has no holes"); + +struct hk_storage_image_descriptor { + uint32_t tex_offset; + uint32_t pbe_offset; +}; +static_assert(sizeof(struct hk_storage_image_descriptor) == 8, + "hk_storage_image_descriptor has no holes"); + +struct hk_buffer_view_descriptor { + uint32_t tex_offset; + uint32_t pbe_offset; +}; +static_assert(sizeof(struct hk_buffer_view_descriptor) == 8, + "hk_buffer_view_descriptor has no holes"); + +/* This has to match nir_address_format_64bit_bounded_global */ +struct hk_buffer_address { + uint64_t base_addr; + uint32_t size; + uint32_t zero; /* Must be zero! 
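+ * (Presumably this is the offset component of the bounded-global vec4
+ * mentioned above, which must start at zero.)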
*/ +}; + +struct hk_descriptor_pool { + struct vk_object_base base; + + struct list_head sets; + + struct agx_bo *bo; + uint8_t *mapped_ptr; + struct util_vma_heap heap; +}; + +VK_DEFINE_NONDISP_HANDLE_CASTS(hk_descriptor_pool, base, VkDescriptorPool, + VK_OBJECT_TYPE_DESCRIPTOR_POOL) + +struct hk_descriptor_set { + struct vk_object_base base; + + /* Link in hk_descriptor_pool::sets */ + struct list_head link; + + struct hk_descriptor_set_layout *layout; + void *mapped_ptr; + uint64_t addr; + uint32_t size; + + struct hk_buffer_address dynamic_buffers[]; +}; + +VK_DEFINE_NONDISP_HANDLE_CASTS(hk_descriptor_set, base, VkDescriptorSet, + VK_OBJECT_TYPE_DESCRIPTOR_SET) + +static inline uint64_t +hk_descriptor_set_addr(const struct hk_descriptor_set *set) +{ + return set->addr; +} + +struct hk_push_descriptor_set { + uint8_t data[HK_PUSH_DESCRIPTOR_SET_SIZE]; +}; + +void hk_push_descriptor_set_update(struct hk_push_descriptor_set *push_set, + struct hk_descriptor_set_layout *layout, + uint32_t write_count, + const VkWriteDescriptorSet *writes); + +void hk_push_descriptor_set_update_template( + struct hk_push_descriptor_set *push_set, + struct hk_descriptor_set_layout *layout, + const struct vk_descriptor_update_template *template, const void *data); diff --git a/src/asahi/vulkan/hk_descriptor_set_layout.c b/src/asahi/vulkan/hk_descriptor_set_layout.c new file mode 100644 index 00000000000..7efe2e127a6 --- /dev/null +++ b/src/asahi/vulkan/hk_descriptor_set_layout.c @@ -0,0 +1,423 @@ +/* + * Copyright 2024 Valve Corporation + * Copyright 2024 Alyssa Rosenzweig + * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc. + * SPDX-License-Identifier: MIT + */ +#include "hk_descriptor_set_layout.h" + +#include "hk_descriptor_set.h" +#include "hk_device.h" +#include "hk_entrypoints.h" +#include "hk_physical_device.h" +#include "hk_sampler.h" + +#include "vk_pipeline_layout.h" + +static bool +binding_has_immutable_samplers(const VkDescriptorSetLayoutBinding *binding) +{ + switch (binding->descriptorType) { + case VK_DESCRIPTOR_TYPE_SAMPLER: + case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER: + return binding->pImmutableSamplers != NULL; + + default: + return false; + } +} + +void +hk_descriptor_stride_align_for_type( + const struct hk_physical_device *pdev, VkDescriptorType type, + const VkMutableDescriptorTypeListEXT *type_list, uint32_t *stride, + uint32_t *alignment) +{ + switch (type) { + case VK_DESCRIPTOR_TYPE_SAMPLER: + case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER: + /* TODO: How do samplers work? 
*/ + case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE: + case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT: + *stride = *alignment = sizeof(struct hk_sampled_image_descriptor); + break; + + case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE: + *stride = *alignment = sizeof(struct hk_storage_image_descriptor); + break; + + case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER: + case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER: + *stride = *alignment = sizeof(struct hk_buffer_view_descriptor); + break; + + case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER: + case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER: + *stride = *alignment = sizeof(struct hk_buffer_address); + break; + + case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC: + case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: + *stride = *alignment = 0; /* These don't take up buffer space */ + break; + + case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK: + *stride = 1; /* Array size is bytes */ + *alignment = HK_MIN_UBO_ALIGNMENT; + break; + + case VK_DESCRIPTOR_TYPE_MUTABLE_EXT: + *stride = *alignment = 0; + if (type_list == NULL) + *stride = *alignment = HK_MAX_DESCRIPTOR_SIZE; + for (unsigned i = 0; type_list && i < type_list->descriptorTypeCount; + i++) { + /* This shouldn't recurse */ + assert(type_list->pDescriptorTypes[i] != + VK_DESCRIPTOR_TYPE_MUTABLE_EXT); + uint32_t desc_stride, desc_align; + hk_descriptor_stride_align_for_type(pdev, + type_list->pDescriptorTypes[i], + NULL, &desc_stride, &desc_align); + *stride = MAX2(*stride, desc_stride); + *alignment = MAX2(*alignment, desc_align); + } + *stride = ALIGN(*stride, *alignment); + break; + + default: + unreachable("Invalid descriptor type"); + } + + assert(*stride <= HK_MAX_DESCRIPTOR_SIZE); +} + +static const VkMutableDescriptorTypeListEXT * +hk_descriptor_get_type_list(VkDescriptorType type, + const VkMutableDescriptorTypeCreateInfoEXT *info, + const uint32_t info_idx) +{ + const VkMutableDescriptorTypeListEXT *type_list = NULL; + if (type == VK_DESCRIPTOR_TYPE_MUTABLE_EXT) { + assert(info != NULL); + assert(info_idx < info->mutableDescriptorTypeListCount); + type_list = &info->pMutableDescriptorTypeLists[info_idx]; + } + return type_list; +} + +VKAPI_ATTR VkResult VKAPI_CALL +hk_CreateDescriptorSetLayout(VkDevice device, + const VkDescriptorSetLayoutCreateInfo *pCreateInfo, + const VkAllocationCallbacks *pAllocator, + VkDescriptorSetLayout *pSetLayout) +{ + VK_FROM_HANDLE(hk_device, dev, device); + struct hk_physical_device *pdev = hk_device_physical(dev); + + uint32_t num_bindings = 0; + uint32_t immutable_sampler_count = 0; + for (uint32_t j = 0; j < pCreateInfo->bindingCount; j++) { + const VkDescriptorSetLayoutBinding *binding = &pCreateInfo->pBindings[j]; + num_bindings = MAX2(num_bindings, binding->binding + 1); + + /* From the Vulkan 1.1.97 spec for VkDescriptorSetLayoutBinding: + * + * "If descriptorType specifies a VK_DESCRIPTOR_TYPE_SAMPLER or + * VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER type descriptor, then + * pImmutableSamplers can be used to initialize a set of immutable + * samplers. [...] If descriptorType is not one of these descriptor + * types, then pImmutableSamplers is ignored. + * + * We need to be careful here and only parse pImmutableSamplers if we + * have one of the right descriptor types. 
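+ *
+ * binding_has_immutable_samplers() above encodes exactly that check.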
+ */ + if (binding_has_immutable_samplers(binding)) + immutable_sampler_count += binding->descriptorCount; + } + + VK_MULTIALLOC(ma); + VK_MULTIALLOC_DECL(&ma, struct hk_descriptor_set_layout, layout, 1); + VK_MULTIALLOC_DECL(&ma, struct hk_descriptor_set_binding_layout, bindings, + num_bindings); + VK_MULTIALLOC_DECL(&ma, struct hk_sampler *, samplers, + immutable_sampler_count); + + if (!vk_descriptor_set_layout_multizalloc(&dev->vk, &ma)) + return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY); + + layout->binding_count = num_bindings; + + for (uint32_t j = 0; j < pCreateInfo->bindingCount; j++) { + const VkDescriptorSetLayoutBinding *binding = &pCreateInfo->pBindings[j]; + uint32_t b = binding->binding; + /* We temporarily store pCreateInfo->pBindings[] index (plus one) in the + * immutable_samplers pointer. This provides us with a quick-and-dirty + * way to sort the bindings by binding number. + */ + layout->binding[b].immutable_samplers = (void *)(uintptr_t)(j + 1); + } + + const VkDescriptorSetLayoutBindingFlagsCreateInfo *binding_flags_info = + vk_find_struct_const(pCreateInfo->pNext, + DESCRIPTOR_SET_LAYOUT_BINDING_FLAGS_CREATE_INFO); + const VkMutableDescriptorTypeCreateInfoEXT *mutable_info = + vk_find_struct_const(pCreateInfo->pNext, + MUTABLE_DESCRIPTOR_TYPE_CREATE_INFO_EXT); + + uint32_t buffer_size = 0; + uint8_t dynamic_buffer_count = 0; + for (uint32_t b = 0; b < num_bindings; b++) { + /* We stashed the pCreateInfo->pBindings[] index (plus one) in the + * immutable_samplers pointer. Check for NULL (empty binding) and then + * reset it and compute the index. + */ + if (layout->binding[b].immutable_samplers == NULL) + continue; + const uint32_t info_idx = + (uintptr_t)(void *)layout->binding[b].immutable_samplers - 1; + layout->binding[b].immutable_samplers = NULL; + + const VkDescriptorSetLayoutBinding *binding = + &pCreateInfo->pBindings[info_idx]; + + if (binding->descriptorCount == 0) + continue; + + layout->binding[b].type = binding->descriptorType; + + if (binding_flags_info && binding_flags_info->bindingCount > 0) { + assert(binding_flags_info->bindingCount == pCreateInfo->bindingCount); + layout->binding[b].flags = binding_flags_info->pBindingFlags[info_idx]; + } + + layout->binding[b].array_size = binding->descriptorCount; + + switch (binding->descriptorType) { + case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC: + case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: + layout->binding[b].dynamic_buffer_index = dynamic_buffer_count; + dynamic_buffer_count += binding->descriptorCount; + break; + default: + break; + } + + const VkMutableDescriptorTypeListEXT *type_list = + hk_descriptor_get_type_list(binding->descriptorType, mutable_info, + info_idx); + + uint32_t stride, alignment; + hk_descriptor_stride_align_for_type(pdev, binding->descriptorType, + type_list, &stride, &alignment); + + uint8_t max_plane_count = 1; + + if (binding_has_immutable_samplers(binding)) { + layout->binding[b].immutable_samplers = samplers; + samplers += binding->descriptorCount; + for (uint32_t i = 0; i < binding->descriptorCount; i++) { + VK_FROM_HANDLE(hk_sampler, sampler, binding->pImmutableSamplers[i]); + layout->binding[b].immutable_samplers[i] = sampler; + const uint8_t sampler_plane_count = + sampler->vk.ycbcr_conversion + ? 
vk_format_get_plane_count( + sampler->vk.ycbcr_conversion->state.format) + : 1; + if (max_plane_count < sampler_plane_count) + max_plane_count = sampler_plane_count; + } + } + + stride *= max_plane_count; + + if (stride > 0) { + assert(stride <= UINT8_MAX); + assert(util_is_power_of_two_nonzero(alignment)); + + buffer_size = align64(buffer_size, alignment); + layout->binding[b].offset = buffer_size; + layout->binding[b].stride = stride; + + if (layout->binding[b].flags & + VK_DESCRIPTOR_BINDING_VARIABLE_DESCRIPTOR_COUNT_BIT) { + /* From the Vulkan 1.3.256 spec: + * + * VUID-VkDescriptorSetLayoutBindingFlagsCreateInfo-pBindingFlags-03004 + * "If an element of pBindingFlags includes + * VK_DESCRIPTOR_BINDING_VARIABLE_DESCRIPTOR_COUNT_BIT, then + * all other elements of + * VkDescriptorSetLayoutCreateInfo::pBindings must have a + * smaller value of binding" + * + * In other words, it has to be the last binding. + */ + assert(b == num_bindings - 1); + } else { + /* the allocation size will be computed at descriptor allocation, + * but the buffer size will be already aligned as this binding will + * be the last + */ + buffer_size += stride * binding->descriptorCount; + } + } + } + + layout->non_variable_descriptor_buffer_size = buffer_size; + layout->dynamic_buffer_count = dynamic_buffer_count; + + struct mesa_blake3 blake3_ctx; + _mesa_blake3_init(&blake3_ctx); + +#define BLAKE3_UPDATE_VALUE(x) \ + _mesa_blake3_update(&blake3_ctx, &(x), sizeof(x)); + BLAKE3_UPDATE_VALUE(layout->non_variable_descriptor_buffer_size); + BLAKE3_UPDATE_VALUE(layout->dynamic_buffer_count); + BLAKE3_UPDATE_VALUE(layout->binding_count); + + for (uint32_t b = 0; b < num_bindings; b++) { + BLAKE3_UPDATE_VALUE(layout->binding[b].type); + BLAKE3_UPDATE_VALUE(layout->binding[b].flags); + BLAKE3_UPDATE_VALUE(layout->binding[b].array_size); + BLAKE3_UPDATE_VALUE(layout->binding[b].offset); + BLAKE3_UPDATE_VALUE(layout->binding[b].stride); + BLAKE3_UPDATE_VALUE(layout->binding[b].dynamic_buffer_index); + + if (layout->binding[b].immutable_samplers != NULL) { + for (uint32_t i = 0; i < layout->binding[b].array_size; i++) { + const struct hk_sampler *sampler = + layout->binding[b].immutable_samplers[i]; + + /* We zalloc the object, so it's safe to hash the whole thing */ + if (sampler != NULL && sampler->vk.ycbcr_conversion != NULL) + BLAKE3_UPDATE_VALUE(sampler->vk.ycbcr_conversion->state); + } + } + } +#undef BLAKE3_UPDATE_VALUE + + _mesa_blake3_final(&blake3_ctx, layout->vk.blake3); + + *pSetLayout = hk_descriptor_set_layout_to_handle(layout); + + return VK_SUCCESS; +} + +VKAPI_ATTR void VKAPI_CALL +hk_GetDescriptorSetLayoutSupport( + VkDevice device, const VkDescriptorSetLayoutCreateInfo *pCreateInfo, + VkDescriptorSetLayoutSupport *pSupport) +{ + VK_FROM_HANDLE(hk_device, dev, device); + struct hk_physical_device *pdev = hk_device_physical(dev); + + const VkMutableDescriptorTypeCreateInfoEXT *mutable_info = + vk_find_struct_const(pCreateInfo->pNext, + MUTABLE_DESCRIPTOR_TYPE_CREATE_INFO_EXT); + const VkDescriptorSetLayoutBindingFlagsCreateInfo *binding_flags = + vk_find_struct_const(pCreateInfo->pNext, + DESCRIPTOR_SET_LAYOUT_BINDING_FLAGS_CREATE_INFO); + + /* Figure out the maximum alignment up-front. Otherwise, we need to sort + * the list of descriptors by binding number in order to get the size + * accumulation right. 
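+ *
+ * For illustration: with max_align = 32, a binding with stride 8 and
+ * descriptorCount 3 adds 24 bytes, which the loop below then rounds up
+ * to 32.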
+ */ + uint32_t max_align = 0; + for (uint32_t i = 0; i < pCreateInfo->bindingCount; i++) { + const VkDescriptorSetLayoutBinding *binding = &pCreateInfo->pBindings[i]; + const VkMutableDescriptorTypeListEXT *type_list = + hk_descriptor_get_type_list(binding->descriptorType, mutable_info, i); + + uint32_t stride, alignment; + hk_descriptor_stride_align_for_type(pdev, binding->descriptorType, + type_list, &stride, &alignment); + max_align = MAX2(max_align, alignment); + } + + uint64_t non_variable_size = 0; + uint32_t variable_stride = 0; + uint32_t variable_count = 0; + uint8_t dynamic_buffer_count = 0; + + for (uint32_t i = 0; i < pCreateInfo->bindingCount; i++) { + const VkDescriptorSetLayoutBinding *binding = &pCreateInfo->pBindings[i]; + + VkDescriptorBindingFlags flags = 0; + if (binding_flags != NULL && binding_flags->bindingCount > 0) + flags = binding_flags->pBindingFlags[i]; + + switch (binding->descriptorType) { + case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC: + case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: + dynamic_buffer_count += binding->descriptorCount; + break; + default: + break; + } + + const VkMutableDescriptorTypeListEXT *type_list = + hk_descriptor_get_type_list(binding->descriptorType, mutable_info, i); + + uint32_t stride, alignment; + hk_descriptor_stride_align_for_type(pdev, binding->descriptorType, + type_list, &stride, &alignment); + + if (stride > 0) { + assert(stride <= UINT8_MAX); + assert(util_is_power_of_two_nonzero(alignment)); + + if (flags & VK_DESCRIPTOR_BINDING_VARIABLE_DESCRIPTOR_COUNT_BIT) { + /* From the Vulkan 1.3.256 spec: + * + * "For the purposes of this command, a variable-sized + * descriptor binding with a descriptorCount of zero is treated + * as if the descriptorCount is one" + */ + variable_count = MAX2(1, binding->descriptorCount); + variable_stride = stride; + } else { + /* Since we're aligning to the maximum and since this is just a + * check for whether or not the max buffer size is big enough, we + * keep non_variable_size aligned to max_align. + */ + non_variable_size += stride * binding->descriptorCount; + non_variable_size = align64(non_variable_size, max_align); + } + } + } + + uint64_t buffer_size = non_variable_size; + if (variable_stride > 0) { + buffer_size += variable_stride * variable_count; + buffer_size = align64(buffer_size, max_align); + } + + uint32_t max_buffer_size; + if (pCreateInfo->flags & + VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR) + max_buffer_size = HK_PUSH_DESCRIPTOR_SET_SIZE; + else + max_buffer_size = HK_MAX_DESCRIPTOR_SET_SIZE; + + pSupport->supported = dynamic_buffer_count <= HK_MAX_DYNAMIC_BUFFERS && + buffer_size <= max_buffer_size; + + vk_foreach_struct(ext, pSupport->pNext) { + switch (ext->sType) { + case VK_STRUCTURE_TYPE_DESCRIPTOR_SET_VARIABLE_DESCRIPTOR_COUNT_LAYOUT_SUPPORT: { + VkDescriptorSetVariableDescriptorCountLayoutSupport *vs = (void *)ext; + if (variable_stride > 0) { + vs->maxVariableDescriptorCount = + (max_buffer_size - non_variable_size) / variable_stride; + } else { + vs->maxVariableDescriptorCount = 0; + } + break; + } + + default: + vk_debug_ignored_stype(ext->sType); + break; + } + } +} diff --git a/src/asahi/vulkan/hk_descriptor_set_layout.h b/src/asahi/vulkan/hk_descriptor_set_layout.h new file mode 100644 index 00000000000..a21a885a918 --- /dev/null +++ b/src/asahi/vulkan/hk_descriptor_set_layout.h @@ -0,0 +1,75 @@ +/* + * Copyright 2024 Valve Corporation + * Copyright 2024 Alyssa Rosenzweig + * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc. 
+ * SPDX-License-Identifier: MIT + */ + +#pragma once + +#include "hk_private.h" + +#include "vk_descriptor_set_layout.h" +#include "vk_object.h" + +struct hk_device; +struct hk_physical_device; +struct hk_sampler; +struct vk_pipeline_layout; + +struct hk_descriptor_set_binding_layout { + /* The type of the descriptors in this binding */ + VkDescriptorType type; + + /* Flags provided when this binding was created */ + VkDescriptorBindingFlags flags; + + /* Number of array elements in this binding (or size in bytes for inline + * uniform data) + */ + uint32_t array_size; + + /* Offset into the descriptor buffer where this descriptor lives */ + uint32_t offset; + + /* Stride between array elements in the descriptor buffer */ + uint8_t stride; + + /* Index into the dynamic buffer binding array */ + uint8_t dynamic_buffer_index; + + /* Immutable samplers (or NULL if no immutable samplers) */ + struct hk_sampler **immutable_samplers; +}; + +struct hk_descriptor_set_layout { + struct vk_descriptor_set_layout vk; + + /* Size of the descriptor buffer for this descriptor set */ + /* Does not contain the size needed for variable count descriptors */ + uint32_t non_variable_descriptor_buffer_size; + + /* Number of dynamic UBO bindings in this set */ + uint8_t dynamic_buffer_count; + + /* Number of bindings in this descriptor set */ + uint32_t binding_count; + + /* Bindings in this descriptor set */ + struct hk_descriptor_set_binding_layout binding[0]; +}; + +VK_DEFINE_NONDISP_HANDLE_CASTS(hk_descriptor_set_layout, vk.base, + VkDescriptorSetLayout, + VK_OBJECT_TYPE_DESCRIPTOR_SET_LAYOUT) + +void hk_descriptor_stride_align_for_type( + const struct hk_physical_device *pdev, VkDescriptorType type, + const VkMutableDescriptorTypeListEXT *type_list, uint32_t *stride, + uint32_t *alignment); + +static inline struct hk_descriptor_set_layout * +vk_to_hk_descriptor_set_layout(struct vk_descriptor_set_layout *layout) +{ + return container_of(layout, struct hk_descriptor_set_layout, vk); +} diff --git a/src/asahi/vulkan/hk_descriptor_table.c b/src/asahi/vulkan/hk_descriptor_table.c new file mode 100644 index 00000000000..6d07ac6f384 --- /dev/null +++ b/src/asahi/vulkan/hk_descriptor_table.c @@ -0,0 +1,179 @@ +/* + * Copyright 2024 Valve Corporation + * Copyright 2024 Alyssa Rosenzweig + * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc. 
+ * SPDX-License-Identifier: MIT + */ +#include "hk_descriptor_table.h" + +#include "hk_device.h" +#include "hk_physical_device.h" + +#include "asahi/lib/agx_bo.h" +#include + +static VkResult +hk_descriptor_table_grow_locked(struct hk_device *dev, + struct hk_descriptor_table *table, + uint32_t new_alloc) +{ + struct agx_bo *new_bo; + uint32_t *new_free_table; + + assert(new_alloc > table->alloc && new_alloc <= table->max_alloc); + + const uint32_t new_bo_size = new_alloc * table->desc_size; + new_bo = agx_bo_create(&dev->dev, new_bo_size, 0, "Descriptor table"); + + if (new_bo == NULL) { + return vk_errorf(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY, + "Failed to allocate the descriptor table"); + } + + void *new_map = new_bo->ptr.cpu; + + assert(table->bo == NULL && "not yet implemented sparse binding"); + table->bo = new_bo; + table->map = new_map; + + const size_t new_free_table_size = new_alloc * sizeof(uint32_t); + new_free_table = + vk_realloc(&dev->vk.alloc, table->free_table, new_free_table_size, 4, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + if (new_free_table == NULL) { + return vk_errorf(dev, VK_ERROR_OUT_OF_HOST_MEMORY, + "Failed to allocate image descriptor free table"); + } + table->free_table = new_free_table; + + table->alloc = new_alloc; + + return VK_SUCCESS; +} + +VkResult +hk_descriptor_table_init(struct hk_device *dev, + struct hk_descriptor_table *table, + uint32_t descriptor_size, + uint32_t min_descriptor_count, + uint32_t max_descriptor_count) +{ + memset(table, 0, sizeof(*table)); + VkResult result; + + simple_mtx_init(&table->mutex, mtx_plain); + + assert(util_is_power_of_two_nonzero(min_descriptor_count)); + assert(util_is_power_of_two_nonzero(max_descriptor_count)); + + /* TODO: sparse binding for stable gpu va */ + min_descriptor_count = max_descriptor_count; + + table->desc_size = descriptor_size; + table->alloc = 0; + table->max_alloc = max_descriptor_count; + table->next_desc = 0; + table->free_count = 0; + + result = hk_descriptor_table_grow_locked(dev, table, min_descriptor_count); + if (result != VK_SUCCESS) { + hk_descriptor_table_finish(dev, table); + return result; + } + + return VK_SUCCESS; +} + +void +hk_descriptor_table_finish(struct hk_device *dev, + struct hk_descriptor_table *table) +{ + agx_bo_unreference(table->bo); + vk_free(&dev->vk.alloc, table->free_table); + simple_mtx_destroy(&table->mutex); +} + +#define HK_IMAGE_DESC_INVALID + +static VkResult +hk_descriptor_table_alloc_locked(struct hk_device *dev, + struct hk_descriptor_table *table, + uint32_t *index_out) +{ + VkResult result; + + if (table->free_count > 0) { + *index_out = table->free_table[--table->free_count]; + return VK_SUCCESS; + } + + if (table->next_desc < table->alloc) { + *index_out = table->next_desc++; + return VK_SUCCESS; + } + + if (table->next_desc >= table->max_alloc) { + return vk_errorf(dev, VK_ERROR_OUT_OF_HOST_MEMORY, + "Descriptor table not large enough"); + } + + result = hk_descriptor_table_grow_locked(dev, table, table->alloc * 2); + if (result != VK_SUCCESS) + return result; + + assert(table->next_desc < table->alloc); + *index_out = table->next_desc++; + + return VK_SUCCESS; +} + +static VkResult +hk_descriptor_table_add_locked(struct hk_device *dev, + struct hk_descriptor_table *table, + const void *desc_data, size_t desc_size, + uint32_t *index_out) +{ + VkResult result = hk_descriptor_table_alloc_locked(dev, table, index_out); + if (result != VK_SUCCESS) + return result; + + void *map = (char *)table->map + (*index_out * table->desc_size); + + assert(desc_size == 
table->desc_size); + memcpy(map, desc_data, table->desc_size); + + return VK_SUCCESS; +} + +VkResult +hk_descriptor_table_add(struct hk_device *dev, + struct hk_descriptor_table *table, + const void *desc_data, size_t desc_size, + uint32_t *index_out) +{ + simple_mtx_lock(&table->mutex); + VkResult result = hk_descriptor_table_add_locked(dev, table, desc_data, + desc_size, index_out); + simple_mtx_unlock(&table->mutex); + + return result; +} + +void +hk_descriptor_table_remove(struct hk_device *dev, + struct hk_descriptor_table *table, uint32_t index) +{ + simple_mtx_lock(&table->mutex); + + void *map = (char *)table->map + (index * table->desc_size); + memset(map, 0, table->desc_size); + + /* Sanity check for double-free */ + assert(table->free_count < table->alloc); + for (uint32_t i = 0; i < table->free_count; i++) + assert(table->free_table[i] != index); + + table->free_table[table->free_count++] = index; + + simple_mtx_unlock(&table->mutex); +} diff --git a/src/asahi/vulkan/hk_descriptor_table.h b/src/asahi/vulkan/hk_descriptor_table.h new file mode 100644 index 00000000000..759bcf8a4b5 --- /dev/null +++ b/src/asahi/vulkan/hk_descriptor_table.h @@ -0,0 +1,49 @@ +/* + * Copyright 2024 Valve Corporation + * Copyright 2024 Alyssa Rosenzweig + * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc. + * SPDX-License-Identifier: MIT + */ + +#pragma once + +#include "hk_private.h" + +#include "asahi/lib/agx_bo.h" +#include "util/simple_mtx.h" + +struct hk_device; + +struct hk_descriptor_table { + simple_mtx_t mutex; + + uint32_t desc_size; /**< Size of a descriptor */ + uint32_t alloc; /**< Number of descriptors allocated */ + uint32_t max_alloc; /**< Maximum possible number of descriptors */ + uint32_t next_desc; /**< Next unallocated descriptor */ + uint32_t free_count; /**< Size of free_table */ + + struct agx_bo *bo; + void *map; + + /* Stack for free descriptor elements */ + uint32_t *free_table; +}; + +VkResult hk_descriptor_table_init(struct hk_device *dev, + struct hk_descriptor_table *table, + uint32_t descriptor_size, + uint32_t min_descriptor_count, + uint32_t max_descriptor_count); + +void hk_descriptor_table_finish(struct hk_device *dev, + struct hk_descriptor_table *table); + +VkResult hk_descriptor_table_add(struct hk_device *dev, + struct hk_descriptor_table *table, + const void *desc_data, size_t desc_size, + uint32_t *index_out); + +void hk_descriptor_table_remove(struct hk_device *dev, + struct hk_descriptor_table *table, + uint32_t index); diff --git a/src/asahi/vulkan/hk_device.c b/src/asahi/vulkan/hk_device.c new file mode 100644 index 00000000000..f5c4535aca2 --- /dev/null +++ b/src/asahi/vulkan/hk_device.c @@ -0,0 +1,548 @@ +/* + * Copyright 2024 Valve Corporation + * Copyright 2024 Alyssa Rosenzweig + * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc. 
+ * SPDX-License-Identifier: MIT + */ +#include "hk_device.h" + +#include "agx_bg_eot.h" +#include "agx_opcodes.h" +#include "agx_scratch.h" +#include "hk_cmd_buffer.h" +#include "hk_descriptor_table.h" +#include "hk_entrypoints.h" +#include "hk_instance.h" +#include "hk_physical_device.h" +#include "hk_shader.h" + +#include "asahi/genxml/agx_pack.h" +#include "asahi/lib/agx_bo.h" +#include "asahi/lib/agx_device.h" +#include "asahi/lib/shaders/geometry.h" +#include "util/hash_table.h" +#include "util/os_file.h" +#include "util/ralloc.h" +#include "util/simple_mtx.h" +#include "vulkan/vulkan_core.h" +#include "vulkan/wsi/wsi_common.h" +#include "vk_cmd_enqueue_entrypoints.h" +#include "vk_common_entrypoints.h" +#include "vk_pipeline_cache.h" + +#include +#include + +/* + * We preupload some constants so we can cheaply reference later without extra + * allocation and copying. + * + * TODO: This is small, don't waste a whole BO. + */ +static VkResult +hk_upload_rodata(struct hk_device *dev) +{ + dev->rodata.bo = + agx_bo_create(&dev->dev, AGX_SAMPLER_LENGTH, 0, "Read only data"); + + if (!dev->rodata.bo) + return VK_ERROR_OUT_OF_HOST_MEMORY; + + uint8_t *map = dev->rodata.bo->ptr.cpu; + uint32_t offs = 0; + + offs = align(offs, 8); + agx_pack(&dev->rodata.txf_sampler, USC_SAMPLER, cfg) { + cfg.start = 0; + cfg.count = 1; + cfg.buffer = dev->rodata.bo->ptr.gpu + offs; + } + + agx_pack(map + offs, SAMPLER, cfg) { + /* Allow mipmapping. This is respected by txf, weirdly. */ + cfg.mip_filter = AGX_MIP_FILTER_NEAREST; + + /* Out-of-bounds reads must return 0 */ + cfg.wrap_s = AGX_WRAP_CLAMP_TO_BORDER; + cfg.wrap_t = AGX_WRAP_CLAMP_TO_BORDER; + cfg.wrap_r = AGX_WRAP_CLAMP_TO_BORDER; + cfg.border_colour = AGX_BORDER_COLOUR_TRANSPARENT_BLACK; + } + offs += AGX_SAMPLER_LENGTH; + + /* The image heap is allocated on the device prior to the rodata. The heap + * lives as long as the device does and has a stable address (requiring + * sparse binding to grow dynamically). That means its address is effectively + * rodata and can be uploaded now. agx_usc_uniform requires an indirection to + * push the heap address, so this takes care of that indirection up front to + * cut an alloc/upload at draw time. + */ + offs = align(offs, sizeof(uint64_t)); + agx_pack(&dev->rodata.image_heap, USC_UNIFORM, cfg) { + cfg.start_halfs = HK_IMAGE_HEAP_UNIFORM; + cfg.size_halfs = 4; + cfg.buffer = dev->rodata.bo->ptr.gpu + offs; + } + + uint64_t *image_heap_ptr = dev->rodata.bo->ptr.cpu + offs; + *image_heap_ptr = dev->images.bo->ptr.gpu; + offs += sizeof(uint64_t); + + /* The geometry state buffer isn't strictly readonly data, but we only have a + * single instance of it device-wide and -- after initializing at heap + * allocate time -- it is read-only from the CPU perspective. The GPU uses it + * for scratch, but is required to reset it after use to ensure resubmitting + * the same command buffer works. + * + * So, we allocate it here for convenience. + */ + offs = align(offs, sizeof(uint64_t)); + dev->rodata.geometry_state = dev->rodata.bo->ptr.gpu + offs; + offs += sizeof(struct agx_geometry_state); + + /* For null readonly buffers, we need to allocate 16 bytes of zeroes for + * robustness2 semantics on read. + */ + offs = align(offs, 16); + dev->rodata.zero_sink = dev->rodata.bo->ptr.gpu + offs; + memset(dev->rodata.bo->ptr.cpu + offs, 0, 16); + offs += 16; + + /* For null storage descriptors, we need to reserve 16 bytes to catch writes. 
+ * No particular content is required; we cannot get robustness2 semantics + * without more work. + */ + offs = align(offs, 16); + dev->rodata.null_sink = dev->rodata.bo->ptr.gpu + offs; + offs += 16; + + return VK_SUCCESS; +} + +static uint32_t +internal_key_hash(const void *key_) +{ + const struct hk_internal_key *key = key_; + + return _mesa_hash_data(key, sizeof(struct hk_internal_key) + key->key_size); +} + +static bool +internal_key_equal(const void *a_, const void *b_) +{ + const struct hk_internal_key *a = a_; + const struct hk_internal_key *b = b_; + + return a->builder == b->builder && a->key_size == b->key_size && + memcmp(a->key, b->key, a->key_size) == 0; +} + +static VkResult +hk_init_internal_shaders(struct hk_internal_shaders *s) +{ + s->ht = _mesa_hash_table_create(NULL, internal_key_hash, internal_key_equal); + if (!s->ht) + return VK_ERROR_OUT_OF_HOST_MEMORY; + + simple_mtx_init(&s->lock, mtx_plain); + return VK_SUCCESS; +} + +static void +hk_destroy_internal_shaders(struct hk_device *dev, + struct hk_internal_shaders *s, bool part) +{ + hash_table_foreach(s->ht, ent) { + if (part) { + struct agx_shader_part *part = ent->data; + free(part->binary); + + /* The agx_shader_part itself is ralloc'd against the hash table so + * will be freed. + */ + } else { + struct hk_api_shader *obj = ent->data; + hk_api_shader_destroy(&dev->vk, &obj->vk, NULL); + } + } + + _mesa_hash_table_destroy(s->ht, NULL); + simple_mtx_destroy(&s->lock); +} + +DERIVE_HASH_TABLE(agx_sampler_packed); + +static VkResult +hk_init_sampler_heap(struct hk_device *dev, struct hk_sampler_heap *h) +{ + h->ht = agx_sampler_packed_table_create(NULL); + if (!h->ht) + return VK_ERROR_OUT_OF_HOST_MEMORY; + + VkResult result = + hk_descriptor_table_init(dev, &h->table, AGX_SAMPLER_LENGTH, 1024, 1024); + + if (result != VK_SUCCESS) { + ralloc_free(h->ht); + return result; + } + + simple_mtx_init(&h->lock, mtx_plain); + return VK_SUCCESS; +} + +static void +hk_destroy_sampler_heap(struct hk_device *dev, struct hk_sampler_heap *h) +{ + hk_descriptor_table_finish(dev, &h->table); + ralloc_free(h->ht); + simple_mtx_destroy(&h->lock); +} + +static VkResult +hk_sampler_heap_add_locked(struct hk_device *dev, struct hk_sampler_heap *h, + struct agx_sampler_packed desc, + struct hk_rc_sampler **out) +{ + struct hash_entry *ent = _mesa_hash_table_search(h->ht, &desc); + if (ent != NULL) { + *out = ent->data; + + assert((*out)->refcount != 0); + (*out)->refcount++; + + return VK_SUCCESS; + } + + struct hk_rc_sampler *rc = ralloc(h->ht, struct hk_rc_sampler); + if (!rc) + return VK_ERROR_OUT_OF_HOST_MEMORY; + + uint32_t index; + VkResult result = + hk_descriptor_table_add(dev, &h->table, &desc, sizeof(desc), &index); + if (result != VK_SUCCESS) { + ralloc_free(rc); + return result; + } + + *rc = (struct hk_rc_sampler){ + .key = desc, + .refcount = 1, + .index = index, + }; + + _mesa_hash_table_insert(h->ht, &rc->key, rc); + *out = rc; + + return VK_SUCCESS; +} + +VkResult +hk_sampler_heap_add(struct hk_device *dev, struct agx_sampler_packed desc, + struct hk_rc_sampler **out) +{ + struct hk_sampler_heap *h = &dev->samplers; + + simple_mtx_lock(&h->lock); + VkResult result = hk_sampler_heap_add_locked(dev, h, desc, out); + simple_mtx_unlock(&h->lock); + + return result; +} + +static void +hk_sampler_heap_remove_locked(struct hk_device *dev, struct hk_sampler_heap *h, + struct hk_rc_sampler *rc) +{ + assert(rc->refcount != 0); + rc->refcount--; + + if (rc->refcount == 0) { + hk_descriptor_table_remove(dev, &h->table, rc->index); + 
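+   /* Drop the deduplication entry as well; a later request for an
+    * identical sampler goes back through hk_descriptor_table_add and may
+    * simply reuse this index from the table's free list.
+    */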
_mesa_hash_table_remove_key(h->ht, &rc->key); + ralloc_free(rc); + } +} + +void +hk_sampler_heap_remove(struct hk_device *dev, struct hk_rc_sampler *rc) +{ + struct hk_sampler_heap *h = &dev->samplers; + + simple_mtx_lock(&h->lock); + hk_sampler_heap_remove_locked(dev, h, rc); + simple_mtx_unlock(&h->lock); +} + +/* + * To implement nullDescriptor, the descriptor set code will reference + * preuploaded null descriptors at fixed offsets in the image heap. Here we + * upload those descriptors, initializing the image heap. + */ +static void +hk_upload_null_descriptors(struct hk_device *dev) +{ + struct agx_texture_packed null_tex; + struct agx_pbe_packed null_pbe; + uint32_t offset_tex, offset_pbe; + + agx_set_null_texture(&null_tex, dev->rodata.null_sink); + agx_set_null_pbe(&null_pbe, dev->rodata.null_sink); + + hk_descriptor_table_add(dev, &dev->images, &null_tex, sizeof(null_tex), + &offset_tex); + + hk_descriptor_table_add(dev, &dev->images, &null_pbe, sizeof(null_pbe), + &offset_pbe); + + assert((offset_tex * HK_IMAGE_STRIDE) == HK_NULL_TEX_OFFSET && "static"); + assert((offset_pbe * HK_IMAGE_STRIDE) == HK_NULL_PBE_OFFSET && "static"); +} + +VKAPI_ATTR VkResult VKAPI_CALL +hk_CreateDevice(VkPhysicalDevice physicalDevice, + const VkDeviceCreateInfo *pCreateInfo, + const VkAllocationCallbacks *pAllocator, VkDevice *pDevice) +{ + VK_FROM_HANDLE(hk_physical_device, pdev, physicalDevice); + VkResult result = VK_ERROR_OUT_OF_HOST_MEMORY; + struct hk_device *dev; + + dev = vk_zalloc2(&pdev->vk.instance->alloc, pAllocator, sizeof(*dev), 8, + VK_SYSTEM_ALLOCATION_SCOPE_DEVICE); + if (!dev) + return vk_error(pdev, VK_ERROR_OUT_OF_HOST_MEMORY); + + struct vk_device_dispatch_table dispatch_table; + + /* For secondary command buffer support, overwrite any command entrypoints + * in the main device-level dispatch table with + * vk_cmd_enqueue_unless_primary_Cmd*. 
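+ *
+ * Those wrappers call straight into dev->cmd_dispatch for primary
+ * command buffers and record into the common vk_cmd_queue for
+ * secondaries; the common runtime then replays the recorded commands
+ * into the primary via cmd_dispatch when vkCmdExecuteCommands runs.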
+ */ + vk_device_dispatch_table_from_entrypoints( + &dispatch_table, &vk_cmd_enqueue_unless_primary_device_entrypoints, true); + + vk_device_dispatch_table_from_entrypoints(&dispatch_table, + &hk_device_entrypoints, false); + vk_device_dispatch_table_from_entrypoints(&dispatch_table, + &wsi_device_entrypoints, false); + + /* Populate primary cmd_dispatch table */ + vk_device_dispatch_table_from_entrypoints(&dev->cmd_dispatch, + &hk_device_entrypoints, true); + vk_device_dispatch_table_from_entrypoints(&dev->cmd_dispatch, + &wsi_device_entrypoints, false); + vk_device_dispatch_table_from_entrypoints( + &dev->cmd_dispatch, &vk_common_device_entrypoints, false); + + result = vk_device_init(&dev->vk, &pdev->vk, &dispatch_table, pCreateInfo, + pAllocator); + if (result != VK_SUCCESS) + goto fail_alloc; + + dev->vk.shader_ops = &hk_device_shader_ops; + dev->vk.command_dispatch_table = &dev->cmd_dispatch; + + drmDevicePtr drm_device = NULL; + int ret = drmGetDeviceFromDevId(pdev->render_dev, 0, &drm_device); + if (ret != 0) { + result = vk_errorf(dev, VK_ERROR_INITIALIZATION_FAILED, + "Failed to get DRM device: %m"); + goto fail_init; + } + + const char *path = drm_device->nodes[DRM_NODE_RENDER]; + dev->dev.fd = open(path, O_RDWR | O_CLOEXEC); + if (dev->dev.fd < 0) { + drmFreeDevice(&drm_device); + result = vk_errorf(dev, VK_ERROR_INITIALIZATION_FAILED, + "failed to open device %s", path); + goto fail_init; + } + + bool succ = agx_open_device(NULL, &dev->dev); + drmFreeDevice(&drm_device); + if (!succ) { + result = vk_errorf(dev, VK_ERROR_INITIALIZATION_FAILED, + "Failed to get DRM device: %m"); + goto fail_fd; + } + + vk_device_set_drm_fd(&dev->vk, dev->dev.fd); + dev->vk.command_buffer_ops = &hk_cmd_buffer_ops; + + result = hk_descriptor_table_init(dev, &dev->images, AGX_TEXTURE_LENGTH, + 1024, 1024 * 1024); + if (result != VK_SUCCESS) + goto fail_dev; + + result = hk_init_sampler_heap(dev, &dev->samplers); + if (result != VK_SUCCESS) + goto fail_images; + + result = hk_descriptor_table_init( + dev, &dev->occlusion_queries, sizeof(uint64_t), AGX_MAX_OCCLUSION_QUERIES, + AGX_MAX_OCCLUSION_QUERIES); + if (result != VK_SUCCESS) + goto fail_samplers; + + result = hk_upload_rodata(dev); + if (result != VK_SUCCESS) + goto fail_queries; + + /* Depends on rodata */ + hk_upload_null_descriptors(dev); + + /* XXX: error handling, and should this even go on the device? 
*/ + agx_bg_eot_init(&dev->bg_eot, &dev->dev); + if (!dev->bg_eot.ht) { + result = VK_ERROR_OUT_OF_HOST_MEMORY; + goto fail_rodata; + } + + result = hk_init_internal_shaders(&dev->prolog_epilog); + if (result != VK_SUCCESS) + goto fail_bg_eot; + + result = hk_init_internal_shaders(&dev->kernels); + if (result != VK_SUCCESS) + goto fail_internal_shaders; + + result = + hk_queue_init(dev, &dev->queue, &pCreateInfo->pQueueCreateInfos[0], 0); + if (result != VK_SUCCESS) + goto fail_internal_shaders_2; + + struct vk_pipeline_cache_create_info cache_info = { + .weak_ref = true, + }; + dev->mem_cache = vk_pipeline_cache_create(&dev->vk, &cache_info, NULL); + if (dev->mem_cache == NULL) { + result = VK_ERROR_OUT_OF_HOST_MEMORY; + goto fail_queue; + } + + result = hk_device_init_meta(dev); + if (result != VK_SUCCESS) + goto fail_mem_cache; + + *pDevice = hk_device_to_handle(dev); + + agx_scratch_init(&dev->dev, &dev->scratch.vs); + agx_scratch_init(&dev->dev, &dev->scratch.fs); + agx_scratch_init(&dev->dev, &dev->scratch.cs); + + return VK_SUCCESS; + +fail_mem_cache: + vk_pipeline_cache_destroy(dev->mem_cache, NULL); +fail_queue: + hk_queue_finish(dev, &dev->queue); +fail_rodata: + agx_bo_unreference(dev->rodata.bo); +fail_bg_eot: + agx_bg_eot_cleanup(&dev->bg_eot); +fail_internal_shaders_2: + hk_destroy_internal_shaders(dev, &dev->kernels, false); +fail_internal_shaders: + hk_destroy_internal_shaders(dev, &dev->prolog_epilog, true); +fail_queries: + hk_descriptor_table_finish(dev, &dev->occlusion_queries); +fail_samplers: + hk_destroy_sampler_heap(dev, &dev->samplers); +fail_images: + hk_descriptor_table_finish(dev, &dev->images); +fail_dev: + agx_close_device(&dev->dev); +fail_fd: + close(dev->dev.fd); +fail_init: + vk_device_finish(&dev->vk); +fail_alloc: + vk_free(&dev->vk.alloc, dev); + return result; +} + +VKAPI_ATTR void VKAPI_CALL +hk_DestroyDevice(VkDevice _device, const VkAllocationCallbacks *pAllocator) +{ + VK_FROM_HANDLE(hk_device, dev, _device); + + if (!dev) + return; + + hk_device_finish_meta(dev); + hk_destroy_internal_shaders(dev, &dev->kernels, false); + hk_destroy_internal_shaders(dev, &dev->prolog_epilog, true); + + vk_pipeline_cache_destroy(dev->mem_cache, NULL); + hk_queue_finish(dev, &dev->queue); + vk_device_finish(&dev->vk); + + agx_scratch_fini(&dev->scratch.vs); + agx_scratch_fini(&dev->scratch.fs); + agx_scratch_fini(&dev->scratch.cs); + + hk_destroy_sampler_heap(dev, &dev->samplers); + hk_descriptor_table_finish(dev, &dev->images); + hk_descriptor_table_finish(dev, &dev->occlusion_queries); + agx_bo_unreference(dev->rodata.bo); + agx_bo_unreference(dev->heap); + agx_bg_eot_cleanup(&dev->bg_eot); + agx_close_device(&dev->dev); + vk_free(&dev->vk.alloc, dev); +} + +VKAPI_ATTR VkResult VKAPI_CALL +hk_GetCalibratedTimestampsKHR( + VkDevice _device, uint32_t timestampCount, + const VkCalibratedTimestampInfoKHR *pTimestampInfos, uint64_t *pTimestamps, + uint64_t *pMaxDeviation) +{ + // VK_FROM_HANDLE(hk_device, dev, _device); + // struct hk_physical_device *pdev = hk_device_physical(dev); + uint64_t max_clock_period = 0; + uint64_t begin, end; + int d; + +#ifdef CLOCK_MONOTONIC_RAW + begin = vk_clock_gettime(CLOCK_MONOTONIC_RAW); +#else + begin = vk_clock_gettime(CLOCK_MONOTONIC); +#endif + + for (d = 0; d < timestampCount; d++) { + switch (pTimestampInfos[d].timeDomain) { + case VK_TIME_DOMAIN_DEVICE_KHR: + unreachable("todo"); + // pTimestamps[d] = agx_get_gpu_timestamp(&pdev->dev); + max_clock_period = MAX2( + max_clock_period, 1); /* FIXME: Is timestamp period actually 
1? */ + break; + case VK_TIME_DOMAIN_CLOCK_MONOTONIC_KHR: + pTimestamps[d] = vk_clock_gettime(CLOCK_MONOTONIC); + max_clock_period = MAX2(max_clock_period, 1); + break; + +#ifdef CLOCK_MONOTONIC_RAW + case VK_TIME_DOMAIN_CLOCK_MONOTONIC_RAW_KHR: + pTimestamps[d] = begin; + break; +#endif + default: + pTimestamps[d] = 0; + break; + } + } + +#ifdef CLOCK_MONOTONIC_RAW + end = vk_clock_gettime(CLOCK_MONOTONIC_RAW); +#else + end = vk_clock_gettime(CLOCK_MONOTONIC); +#endif + + *pMaxDeviation = vk_time_max_deviation(begin, end, max_clock_period); + + return VK_SUCCESS; +} diff --git a/src/asahi/vulkan/hk_device.h b/src/asahi/vulkan/hk_device.h new file mode 100644 index 00000000000..b6c57315390 --- /dev/null +++ b/src/asahi/vulkan/hk_device.h @@ -0,0 +1,123 @@ +/* + * Copyright 2024 Valve Corporation + * Copyright 2024 Alyssa Rosenzweig + * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc. + * SPDX-License-Identifier: MIT + */ + +#pragma once + +#include "asahi/lib/agx_device.h" +#include "agx_bg_eot.h" +#include "agx_pack.h" +#include "agx_scratch.h" +#include "decode.h" +#include "vk_cmd_queue.h" +#include "vk_dispatch_table.h" + +#include "hk_private.h" + +#include "hk_descriptor_table.h" +#include "hk_queue.h" +#include "vk_device.h" +#include "vk_meta.h" +#include "vk_queue.h" + +struct hk_physical_device; +struct vk_pipeline_cache; + +/* Fixed offsets for reserved null image descriptors */ +#define HK_NULL_TEX_OFFSET (0) +#define HK_NULL_PBE_OFFSET (24) + +typedef void (*hk_internal_builder_t)(struct nir_builder *b, const void *key); + +struct hk_internal_key { + hk_internal_builder_t builder; + size_t key_size; + uint8_t key[]; +}; + +struct hk_internal_shaders { + simple_mtx_t lock; + struct hash_table *ht; +}; + +struct hk_rc_sampler { + struct agx_sampler_packed key; + + /* Reference count for this hardware sampler, protected by the heap mutex */ + uint16_t refcount; + + /* Index of this hardware sampler in the hardware sampler heap */ + uint16_t index; +}; + +struct hk_sampler_heap { + simple_mtx_t lock; + + struct hk_descriptor_table table; + + /* Map of agx_sampler_packed to hk_rc_sampler */ + struct hash_table *ht; +}; + +struct hk_device { + struct vk_device vk; + struct agx_device dev; + struct agxdecode_ctx *decode_ctx; + + struct hk_descriptor_table images; + struct hk_descriptor_table occlusion_queries; + struct hk_sampler_heap samplers; + + struct hk_queue queue; + + struct vk_pipeline_cache *mem_cache; + + struct vk_meta_device meta; + struct agx_bg_eot_cache bg_eot; + + struct { + struct agx_bo *bo; + struct agx_usc_sampler_packed txf_sampler; + struct agx_usc_uniform_packed image_heap; + uint64_t null_sink, zero_sink; + uint64_t geometry_state; + } rodata; + + struct hk_internal_shaders prolog_epilog; + struct hk_internal_shaders kernels; + struct hk_api_shader *write_shader; + + /* Indirected for common secondary emulation */ + struct vk_device_dispatch_table cmd_dispatch; + + /* Heap used for GPU-side memory allocation for geometry/tessellation. + * + * Control streams accessing the heap must be serialized. This is not + * expected to be a legitimate problem. If it is, we can rework later. 
+ */ + struct agx_bo *heap; + + struct { + struct agx_scratch vs, fs, cs; + } scratch; +}; + +VK_DEFINE_HANDLE_CASTS(hk_device, vk.base, VkDevice, VK_OBJECT_TYPE_DEVICE) + +static inline struct hk_physical_device * +hk_device_physical(struct hk_device *dev) +{ + return (struct hk_physical_device *)dev->vk.physical; +} + +VkResult hk_device_init_meta(struct hk_device *dev); +void hk_device_finish_meta(struct hk_device *dev); + +VkResult hk_sampler_heap_add(struct hk_device *dev, + struct agx_sampler_packed desc, + struct hk_rc_sampler **out); + +void hk_sampler_heap_remove(struct hk_device *dev, struct hk_rc_sampler *rc); diff --git a/src/asahi/vulkan/hk_device_memory.c b/src/asahi/vulkan/hk_device_memory.c new file mode 100644 index 00000000000..0d10a55f5df --- /dev/null +++ b/src/asahi/vulkan/hk_device_memory.c @@ -0,0 +1,330 @@ +/* + * Copyright 2024 Valve Corporation + * Copyright 2024 Alyssa Rosenzweig + * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc. + * SPDX-License-Identifier: MIT + */ +#include "hk_device_memory.h" + +#include "hk_device.h" +#include "hk_entrypoints.h" +#include "hk_image.h" +#include "hk_physical_device.h" + +#include "asahi/lib/agx_bo.h" +#include "util/u_atomic.h" + +#include +#include + +/* Supports opaque fd only */ +const VkExternalMemoryProperties hk_opaque_fd_mem_props = { + .externalMemoryFeatures = VK_EXTERNAL_MEMORY_FEATURE_EXPORTABLE_BIT | + VK_EXTERNAL_MEMORY_FEATURE_IMPORTABLE_BIT, + .exportFromImportedHandleTypes = + VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT, + .compatibleHandleTypes = VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT, +}; + +/* Supports opaque fd and dma_buf. */ +const VkExternalMemoryProperties hk_dma_buf_mem_props = { + .externalMemoryFeatures = VK_EXTERNAL_MEMORY_FEATURE_EXPORTABLE_BIT | + VK_EXTERNAL_MEMORY_FEATURE_IMPORTABLE_BIT, + .exportFromImportedHandleTypes = + VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT | + VK_EXTERNAL_MEMORY_HANDLE_TYPE_DMA_BUF_BIT_EXT, + .compatibleHandleTypes = VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT | + VK_EXTERNAL_MEMORY_HANDLE_TYPE_DMA_BUF_BIT_EXT, +}; + +static enum agx_bo_flags +hk_memory_type_flags(const VkMemoryType *type, + VkExternalMemoryHandleTypeFlagBits handle_types) +{ + unsigned flags = 0; + + if (handle_types) + flags |= AGX_BO_SHARED | AGX_BO_SHAREABLE; + + return flags; +} + +VKAPI_ATTR VkResult VKAPI_CALL +hk_GetMemoryFdPropertiesKHR(VkDevice device, + VkExternalMemoryHandleTypeFlagBits handleType, + int fd, + VkMemoryFdPropertiesKHR *pMemoryFdProperties) +{ + VK_FROM_HANDLE(hk_device, dev, device); + struct hk_physical_device *pdev = hk_device_physical(dev); + struct agx_bo *bo; + + switch (handleType) { + case VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT: + case VK_EXTERNAL_MEMORY_HANDLE_TYPE_DMA_BUF_BIT_EXT: + bo = agx_bo_import(&dev->dev, fd); + if (bo == NULL) + return vk_error(dev, VK_ERROR_INVALID_EXTERNAL_HANDLE); + break; + default: + return vk_error(dev, VK_ERROR_INVALID_EXTERNAL_HANDLE); + } + + uint32_t type_bits = 0; + for (unsigned t = 0; t < ARRAY_SIZE(pdev->mem_types); t++) { + const unsigned flags = + hk_memory_type_flags(&pdev->mem_types[t], handleType); + if (!(flags & ~bo->flags)) + type_bits |= (1 << t); + } + + pMemoryFdProperties->memoryTypeBits = type_bits; + + agx_bo_unreference(bo); + + return VK_SUCCESS; +} + +VKAPI_ATTR VkResult VKAPI_CALL +hk_AllocateMemory(VkDevice device, const VkMemoryAllocateInfo *pAllocateInfo, + const VkAllocationCallbacks *pAllocator, VkDeviceMemory *pMem) +{ + VK_FROM_HANDLE(hk_device, dev, device); + struct 
hk_physical_device *pdev = hk_device_physical(dev); + struct hk_device_memory *mem; + VkResult result = VK_SUCCESS; + + const VkImportMemoryFdInfoKHR *fd_info = + vk_find_struct_const(pAllocateInfo->pNext, IMPORT_MEMORY_FD_INFO_KHR); + const VkExportMemoryAllocateInfo *export_info = + vk_find_struct_const(pAllocateInfo->pNext, EXPORT_MEMORY_ALLOCATE_INFO); + const VkMemoryType *type = &pdev->mem_types[pAllocateInfo->memoryTypeIndex]; + + VkExternalMemoryHandleTypeFlagBits handle_types = 0; + if (export_info != NULL) + handle_types |= export_info->handleTypes; + if (fd_info != NULL) + handle_types |= fd_info->handleType; + + const unsigned flags = hk_memory_type_flags(type, handle_types); + + uint32_t alignment = 16384; /* Apple page size */ + + struct hk_memory_heap *heap = &pdev->mem_heaps[type->heapIndex]; + if (p_atomic_read(&heap->used) > heap->size) + return vk_error(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY); + + const uint64_t aligned_size = + align64(pAllocateInfo->allocationSize, alignment); + + mem = vk_device_memory_create(&dev->vk, pAllocateInfo, pAllocator, + sizeof(*mem)); + if (!mem) + return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY); + + mem->map = NULL; + if (fd_info && fd_info->handleType) { + assert( + fd_info->handleType == VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT || + fd_info->handleType == VK_EXTERNAL_MEMORY_HANDLE_TYPE_DMA_BUF_BIT_EXT); + + mem->bo = agx_bo_import(&dev->dev, fd_info->fd); + if (mem->bo == NULL) { + result = vk_error(dev, VK_ERROR_INVALID_EXTERNAL_HANDLE); + goto fail_alloc; + } + assert(!(flags & ~mem->bo->flags)); + } else { + enum agx_bo_flags flags = 0; + if (handle_types) + flags |= AGX_BO_SHAREABLE; + + mem->bo = agx_bo_create(&dev->dev, aligned_size, flags, "App memory"); + if (!mem->bo) { + result = vk_error(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY); + goto fail_alloc; + } + } + + if (fd_info && fd_info->handleType) { + /* From the Vulkan spec: + * + * "Importing memory from a file descriptor transfers ownership of + * the file descriptor from the application to the Vulkan + * implementation. The application must not perform any operations on + * the file descriptor after a successful import." + * + * If the import fails, we leave the file descriptor open. 
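+ * The close() below therefore only runs once the import above has
+ * already succeeded.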
+ */ + close(fd_info->fd); + } + + uint64_t heap_used = p_atomic_add_return(&heap->used, mem->bo->size); + if (heap_used > heap->size) { + hk_FreeMemory(device, hk_device_memory_to_handle(mem), pAllocator); + return vk_errorf(device, VK_ERROR_OUT_OF_DEVICE_MEMORY, + "Out of heap memory"); + } + + *pMem = hk_device_memory_to_handle(mem); + + return VK_SUCCESS; + +fail_alloc: + vk_device_memory_destroy(&dev->vk, pAllocator, &mem->vk); + return result; +} + +VKAPI_ATTR void VKAPI_CALL +hk_FreeMemory(VkDevice device, VkDeviceMemory _mem, + const VkAllocationCallbacks *pAllocator) +{ + VK_FROM_HANDLE(hk_device, dev, device); + VK_FROM_HANDLE(hk_device_memory, mem, _mem); + struct hk_physical_device *pdev = hk_device_physical(dev); + + if (!mem) + return; + + const VkMemoryType *type = &pdev->mem_types[mem->vk.memory_type_index]; + struct hk_memory_heap *heap = &pdev->mem_heaps[type->heapIndex]; + p_atomic_add(&heap->used, -((int64_t)mem->bo->size)); + + agx_bo_unreference(mem->bo); + + vk_device_memory_destroy(&dev->vk, pAllocator, &mem->vk); +} + +VKAPI_ATTR VkResult VKAPI_CALL +hk_MapMemory2KHR(VkDevice device, const VkMemoryMapInfoKHR *pMemoryMapInfo, + void **ppData) +{ + VK_FROM_HANDLE(hk_device, dev, device); + VK_FROM_HANDLE(hk_device_memory, mem, pMemoryMapInfo->memory); + + if (mem == NULL) { + *ppData = NULL; + return VK_SUCCESS; + } + + const VkDeviceSize offset = pMemoryMapInfo->offset; + const VkDeviceSize size = vk_device_memory_range( + &mem->vk, pMemoryMapInfo->offset, pMemoryMapInfo->size); + + UNUSED void *fixed_addr = NULL; + if (pMemoryMapInfo->flags & VK_MEMORY_MAP_PLACED_BIT_EXT) { + const VkMemoryMapPlacedInfoEXT *placed_info = vk_find_struct_const( + pMemoryMapInfo->pNext, MEMORY_MAP_PLACED_INFO_EXT); + fixed_addr = placed_info->pPlacedAddress; + } + + /* From the Vulkan spec version 1.0.32 docs for MapMemory: + * + * * If size is not equal to VK_WHOLE_SIZE, size must be greater than 0 + * assert(size != 0); + * * If size is not equal to VK_WHOLE_SIZE, size must be less than or + * equal to the size of the memory minus offset + */ + assert(size > 0); + assert(offset + size <= mem->bo->size); + + if (size != (size_t)size) { + return vk_errorf(dev, VK_ERROR_MEMORY_MAP_FAILED, + "requested size 0x%" PRIx64 " does not fit in %u bits", + size, (unsigned)(sizeof(size_t) * 8)); + } + + /* From the Vulkan 1.2.194 spec: + * + * "memory must not be currently host mapped" + */ + if (mem->map != NULL) { + return vk_errorf(dev, VK_ERROR_MEMORY_MAP_FAILED, + "Memory object already mapped."); + } + + mem->map = mem->bo->ptr.cpu; + *ppData = mem->map + offset; + + return VK_SUCCESS; +} + +VKAPI_ATTR VkResult VKAPI_CALL +hk_UnmapMemory2KHR(VkDevice device, + const VkMemoryUnmapInfoKHR *pMemoryUnmapInfo) +{ + VK_FROM_HANDLE(hk_device_memory, mem, pMemoryUnmapInfo->memory); + + if (mem == NULL) + return VK_SUCCESS; + + if (pMemoryUnmapInfo->flags & VK_MEMORY_UNMAP_RESERVE_BIT_EXT) { + unreachable("todo"); +#if 0 + VK_FROM_HANDLE(hk_device, dev, device); + + int err = agx_bo_overmap(mem->bo, mem->map); + if (err) { + return vk_errorf(dev, VK_ERROR_MEMORY_MAP_FAILED, + "Failed to map over original mapping"); + } +#endif + } else { + /* TODO */ + //// agx_bo_unmap(mem->bo, mem->map); + } + + mem->map = NULL; + + return VK_SUCCESS; +} + +VKAPI_ATTR VkResult VKAPI_CALL +hk_FlushMappedMemoryRanges(VkDevice device, uint32_t memoryRangeCount, + const VkMappedMemoryRange *pMemoryRanges) +{ + return VK_SUCCESS; +} + +VKAPI_ATTR VkResult VKAPI_CALL +hk_InvalidateMappedMemoryRanges(VkDevice device, 
uint32_t memoryRangeCount, + const VkMappedMemoryRange *pMemoryRanges) +{ + return VK_SUCCESS; +} + +VKAPI_ATTR void VKAPI_CALL +hk_GetDeviceMemoryCommitment(VkDevice device, VkDeviceMemory _mem, + VkDeviceSize *pCommittedMemoryInBytes) +{ + VK_FROM_HANDLE(hk_device_memory, mem, _mem); + + *pCommittedMemoryInBytes = mem->bo->size; +} + +VKAPI_ATTR VkResult VKAPI_CALL +hk_GetMemoryFdKHR(VkDevice device, const VkMemoryGetFdInfoKHR *pGetFdInfo, + int *pFD) +{ + VK_FROM_HANDLE(hk_device, dev, device); + VK_FROM_HANDLE(hk_device_memory, memory, pGetFdInfo->memory); + + switch (pGetFdInfo->handleType) { + case VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT: + case VK_EXTERNAL_MEMORY_HANDLE_TYPE_DMA_BUF_BIT_EXT: + *pFD = agx_bo_export(memory->bo); + return VK_SUCCESS; + default: + assert(!"unsupported handle type"); + return vk_error(dev, VK_ERROR_FEATURE_NOT_PRESENT); + } +} + +VKAPI_ATTR uint64_t VKAPI_CALL +hk_GetDeviceMemoryOpaqueCaptureAddress( + UNUSED VkDevice device, const VkDeviceMemoryOpaqueCaptureAddressInfo *pInfo) +{ + VK_FROM_HANDLE(hk_device_memory, mem, pInfo->memory); + + return mem->bo->ptr.gpu; +} diff --git a/src/asahi/vulkan/hk_device_memory.h b/src/asahi/vulkan/hk_device_memory.h new file mode 100644 index 00000000000..29d3651972a --- /dev/null +++ b/src/asahi/vulkan/hk_device_memory.h @@ -0,0 +1,31 @@ +/* + * Copyright 2024 Valve Corporation + * Copyright 2024 Alyssa Rosenzweig + * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc. + * SPDX-License-Identifier: MIT + */ + +#pragma once + +#include "hk_private.h" + +#include "vk_device_memory.h" + +#include "util/list.h" + +struct hk_device; +struct hk_image_plane; + +struct hk_device_memory { + struct vk_device_memory vk; + + struct agx_bo *bo; + + void *map; +}; + +VK_DEFINE_NONDISP_HANDLE_CASTS(hk_device_memory, vk.base, VkDeviceMemory, + VK_OBJECT_TYPE_DEVICE_MEMORY) + +extern const VkExternalMemoryProperties hk_opaque_fd_mem_props; +extern const VkExternalMemoryProperties hk_dma_buf_mem_props; diff --git a/src/asahi/vulkan/hk_event.c b/src/asahi/vulkan/hk_event.c new file mode 100644 index 00000000000..aadbb272e76 --- /dev/null +++ b/src/asahi/vulkan/hk_event.c @@ -0,0 +1,113 @@ +/* + * Copyright 2024 Valve Corporation + * Copyright 2024 Alyssa Rosenzweig + * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc. + * SPDX-License-Identifier: MIT + */ +#include "hk_event.h" +#include "vulkan/vulkan_core.h" + +#include "agx_bo.h" +#include "hk_cmd_buffer.h" +#include "hk_device.h" +#include "hk_entrypoints.h" + +#define HK_EVENT_MEM_SIZE sizeof(VkResult) + +VKAPI_ATTR VkResult VKAPI_CALL +hk_CreateEvent(VkDevice device, const VkEventCreateInfo *pCreateInfo, + const VkAllocationCallbacks *pAllocator, VkEvent *pEvent) +{ + VK_FROM_HANDLE(hk_device, dev, device); + struct hk_event *event; + + event = vk_object_zalloc(&dev->vk, pAllocator, sizeof(*event), + VK_OBJECT_TYPE_EVENT); + if (!event) + return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY); + + /* TODO: this is really wasteful, bring back the NVK heap! 
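+ * (today each event burns a whole agx_bo, at least one 16 KiB page,
+ * on a single sizeof(VkResult) status slot)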
+ * + * XXX + */ + event->bo = + agx_bo_create(&dev->dev, HK_EVENT_MEM_SIZE, AGX_BO_WRITEBACK, "Event"); + event->status = event->bo->ptr.cpu; + event->addr = event->bo->ptr.gpu; + + *event->status = VK_EVENT_RESET; + + *pEvent = hk_event_to_handle(event); + + return VK_SUCCESS; +} + +VKAPI_ATTR void VKAPI_CALL +hk_DestroyEvent(VkDevice device, VkEvent _event, + const VkAllocationCallbacks *pAllocator) +{ + VK_FROM_HANDLE(hk_device, dev, device); + VK_FROM_HANDLE(hk_event, event, _event); + + if (!event) + return; + + agx_bo_unreference(event->bo); + vk_object_free(&dev->vk, pAllocator, event); +} + +VKAPI_ATTR VkResult VKAPI_CALL +hk_GetEventStatus(VkDevice device, VkEvent _event) +{ + VK_FROM_HANDLE(hk_event, event, _event); + + return *event->status; +} + +VKAPI_ATTR VkResult VKAPI_CALL +hk_SetEvent(VkDevice device, VkEvent _event) +{ + VK_FROM_HANDLE(hk_event, event, _event); + + *event->status = VK_EVENT_SET; + + return VK_SUCCESS; +} + +VKAPI_ATTR VkResult VKAPI_CALL +hk_ResetEvent(VkDevice device, VkEvent _event) +{ + VK_FROM_HANDLE(hk_event, event, _event); + + *event->status = VK_EVENT_RESET; + + return VK_SUCCESS; +} + +VKAPI_ATTR void VKAPI_CALL +hk_CmdSetEvent2(VkCommandBuffer commandBuffer, VkEvent _event, + const VkDependencyInfo *pDependencyInfo) +{ + VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer); + VK_FROM_HANDLE(hk_event, event, _event); + + hk_queue_write(cmd, event->bo->ptr.gpu, VK_EVENT_SET, false); +} + +VKAPI_ATTR void VKAPI_CALL +hk_CmdResetEvent2(VkCommandBuffer commandBuffer, VkEvent _event, + VkPipelineStageFlags2 stageMask) +{ + VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer); + VK_FROM_HANDLE(hk_event, event, _event); + + hk_queue_write(cmd, event->bo->ptr.gpu, VK_EVENT_RESET, false); +} + +VKAPI_ATTR void VKAPI_CALL +hk_CmdWaitEvents2(VkCommandBuffer commandBuffer, uint32_t eventCount, + const VkEvent *pEvents, + const VkDependencyInfo *pDependencyInfos) +{ + /* Currently we barrier everything, so this is a no-op. */ +} diff --git a/src/asahi/vulkan/hk_event.h b/src/asahi/vulkan/hk_event.h new file mode 100644 index 00000000000..c675ceada8a --- /dev/null +++ b/src/asahi/vulkan/hk_event.h @@ -0,0 +1,22 @@ +/* + * Copyright 2024 Valve Corporation + * Copyright 2024 Alyssa Rosenzweig + * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc. + * SPDX-License-Identifier: MIT + */ + +#pragma once + +#include "hk_private.h" + +#include "vk_object.h" + +struct hk_event { + struct vk_object_base base; + struct agx_bo *bo; + + uint64_t addr; + VkResult *status; +}; + +VK_DEFINE_NONDISP_HANDLE_CASTS(hk_event, base, VkEvent, VK_OBJECT_TYPE_EVENT) diff --git a/src/asahi/vulkan/hk_format.c b/src/asahi/vulkan/hk_format.c new file mode 100644 index 00000000000..b0fa8ae5c99 --- /dev/null +++ b/src/asahi/vulkan/hk_format.c @@ -0,0 +1,140 @@ +/* + * Copyright 2024 Valve Corporation + * Copyright 2024 Alyssa Rosenzweig + * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc. + * Copyright 2024 Valve Corporation + * Copyright 2024 Alyssa Rosenzweig + * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc. 
+ * SPDX-License-Identifier: MIT + */ +#include "drm-uapi/drm_fourcc.h" + +#include "hk_buffer_view.h" +#include "hk_entrypoints.h" +#include "hk_image.h" +#include "hk_physical_device.h" + +#include "vk_enum_defines.h" +#include "vk_format.h" + +uint64_t agx_best_modifiers[] = { + // DRM_FORMAT_MOD_APPLE_TWIDDLED_COMPRESSED, + DRM_FORMAT_MOD_APPLE_TWIDDLED, + DRM_FORMAT_MOD_LINEAR, +}; + +static VkFormatFeatureFlags2 +hk_modifier_features(uint64_t mod, VkFormat vk_format, + const VkFormatProperties *props) +{ + if (mod == DRM_FORMAT_MOD_LINEAR) + return props->linearTilingFeatures; + + if (mod == DRM_FORMAT_MOD_APPLE_TWIDDLED_COMPRESSED /* TODO */) + return 0; + + return props->optimalTilingFeatures; +} + +static void +get_drm_format_modifier_properties_list( + const struct hk_physical_device *physical_device, VkFormat vk_format, + VkDrmFormatModifierPropertiesListEXT *list, const VkFormatProperties *props) +{ + VK_OUTARRAY_MAKE_TYPED(VkDrmFormatModifierPropertiesEXT, out, + list->pDrmFormatModifierProperties, + &list->drmFormatModifierCount); + + for (unsigned i = 0; i < ARRAY_SIZE(agx_best_modifiers); ++i) { + uint64_t mod = agx_best_modifiers[i]; + VkFormatFeatureFlags2 flags = hk_modifier_features(mod, vk_format, props); + + if (!flags) + continue; + + vk_outarray_append_typed(VkDrmFormatModifierPropertiesEXT, &out, + out_props) + { + *out_props = (VkDrmFormatModifierPropertiesEXT){ + .drmFormatModifier = mod, + .drmFormatModifierPlaneCount = 1 /* no planar mods */, + .drmFormatModifierTilingFeatures = flags, + }; + }; + } +} + +static void +get_drm_format_modifier_properties_list_2( + const struct hk_physical_device *physical_device, VkFormat vk_format, + VkDrmFormatModifierPropertiesList2EXT *list, const VkFormatProperties *props) +{ + VK_OUTARRAY_MAKE_TYPED(VkDrmFormatModifierProperties2EXT, out, + list->pDrmFormatModifierProperties, + &list->drmFormatModifierCount); + + for (unsigned i = 0; i < ARRAY_SIZE(agx_best_modifiers); ++i) { + uint64_t mod = agx_best_modifiers[i]; + VkFormatFeatureFlags2 flags = hk_modifier_features(mod, vk_format, props); + + if (!flags) + continue; + + vk_outarray_append_typed(VkDrmFormatModifierProperties2EXT, &out, + out_props) + { + *out_props = (VkDrmFormatModifierProperties2EXT){ + .drmFormatModifier = mod, + .drmFormatModifierPlaneCount = 1, /* no planar mods */ + .drmFormatModifierTilingFeatures = flags, + }; + }; + } +} + +VKAPI_ATTR void VKAPI_CALL +hk_GetPhysicalDeviceFormatProperties2(VkPhysicalDevice physicalDevice, + VkFormat format, + VkFormatProperties2 *pFormatProperties) +{ + VK_FROM_HANDLE(hk_physical_device, pdevice, physicalDevice); + + VkFormatFeatureFlags2 linear2, optimal2, buffer2; + linear2 = + hk_get_image_format_features(pdevice, format, VK_IMAGE_TILING_LINEAR); + optimal2 = + hk_get_image_format_features(pdevice, format, VK_IMAGE_TILING_OPTIMAL); + buffer2 = hk_get_buffer_format_features(pdevice, format); + + pFormatProperties->formatProperties = (VkFormatProperties){ + .linearTilingFeatures = vk_format_features2_to_features(linear2), + .optimalTilingFeatures = vk_format_features2_to_features(optimal2), + .bufferFeatures = vk_format_features2_to_features(buffer2), + }; + + vk_foreach_struct(ext, pFormatProperties->pNext) { + switch (ext->sType) { + case VK_STRUCTURE_TYPE_FORMAT_PROPERTIES_3: { + VkFormatProperties3 *p = (void *)ext; + p->linearTilingFeatures = linear2; + p->optimalTilingFeatures = optimal2; + p->bufferFeatures = buffer2; + break; + } + + case VK_STRUCTURE_TYPE_DRM_FORMAT_MODIFIER_PROPERTIES_LIST_EXT: + 
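+      /* Only advertise the entries of agx_best_modifiers that have a
+       * non-zero feature set for this format (see hk_modifier_features).
+       */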
get_drm_format_modifier_properties_list( + pdevice, format, (void *)ext, &pFormatProperties->formatProperties); + break; + + case VK_STRUCTURE_TYPE_DRM_FORMAT_MODIFIER_PROPERTIES_LIST_2_EXT: + get_drm_format_modifier_properties_list_2( + pdevice, format, (void *)ext, &pFormatProperties->formatProperties); + break; + + default: + vk_debug_ignored_stype(ext->sType); + break; + } + } +} diff --git a/src/asahi/vulkan/hk_image.c b/src/asahi/vulkan/hk_image.c new file mode 100644 index 00000000000..6187eff40a8 --- /dev/null +++ b/src/asahi/vulkan/hk_image.c @@ -0,0 +1,1536 @@ +/* + * Copyright 2024 Valve Corporation + * Copyright 2024 Alyssa Rosenzweig + * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc. + * SPDX-License-Identifier: MIT + */ +#include "hk_image.h" +#include "asahi/layout/layout.h" +#include "asahi/lib/agx_formats.h" +#include "drm-uapi/drm_fourcc.h" +#include "util/bitscan.h" +#include "util/format/u_format.h" +#include "util/format/u_formats.h" +#include "util/macros.h" +#include "util/u_math.h" +#include "vulkan/vulkan_core.h" + +#include "hk_device.h" +#include "hk_device_memory.h" +#include "hk_entrypoints.h" +#include "hk_physical_device.h" + +#include "vk_format.h" + +/* Minimum alignment encodable for our descriptors. The hardware texture/PBE + * descriptors require 16-byte alignment. Our software PBE atomic descriptor + * requires 128-byte alignment, but we could relax that one if we wanted. + */ +#define HK_PLANE_ALIGN_B 128 + +static VkFormatFeatureFlags2 +hk_get_image_plane_format_features(struct hk_physical_device *pdev, + VkFormat vk_format, VkImageTiling tiling) +{ + VkFormatFeatureFlags2 features = 0; + + /* Conformance fails with these optional formats. Just drop them for now. + * TODO: Investigate later if we have a use case. + */ + switch (vk_format) { + case VK_FORMAT_A1B5G5R5_UNORM_PACK16_KHR: + case VK_FORMAT_A8_UNORM_KHR: + return 0; + default: + break; + } + + enum pipe_format p_format = vk_format_to_pipe_format(vk_format); + if (p_format == PIPE_FORMAT_NONE) + return 0; + + /* NPOT formats only supported for texel buffers */ + if (!util_is_power_of_two_nonzero(util_format_get_blocksize(p_format))) + return 0; + + if (util_format_is_compressed(p_format)) { + /* Linear block-compressed images are all sorts of problematic, not sure + * if AGX even supports them. Don't try. + */ + if (tiling != VK_IMAGE_TILING_OPTIMAL) + return 0; + + /* XXX: Conformance fails, e.g.: + * dEQP-VK.pipeline.monolithic.sampler.view_type.2d.format.etc2_r8g8b8a1_unorm_block.mipmap.linear.lod.select_bias_3_7 + * + * I suspect ail bug with mipmapping of compressed :-/ + */ + switch (util_format_description(p_format)->layout) { + case UTIL_FORMAT_LAYOUT_ETC: + case UTIL_FORMAT_LAYOUT_ASTC: + return 0; + default: + break; + } + } + + if (agx_pixel_format[p_format].texturable) { + features |= VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_BIT; + features |= VK_FORMAT_FEATURE_2_BLIT_SRC_BIT; + + /* We can sample integer formats but it doesn't make sense to linearly + * filter them. + */ + if (!util_format_is_pure_integer(p_format)) { + features |= VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_FILTER_LINEAR_BIT; + } + + if (vk_format_has_depth(vk_format)) { + features |= VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_DEPTH_COMPARISON_BIT; + } + } + + if (agx_pixel_format[p_format].renderable) { + /* For now, disable snorm rendering due to nir_lower_blend bugs. + * + * TODO: revisit. 
+ */ + if (!util_format_is_snorm(p_format)) { + features |= VK_FORMAT_FEATURE_2_COLOR_ATTACHMENT_BIT; + features |= VK_FORMAT_FEATURE_2_COLOR_ATTACHMENT_BLEND_BIT; + } + + features |= VK_FORMAT_FEATURE_2_BLIT_DST_BIT; + features |= VK_FORMAT_FEATURE_2_STORAGE_IMAGE_BIT | + VK_FORMAT_FEATURE_2_STORAGE_WRITE_WITHOUT_FORMAT_BIT | + VK_FORMAT_FEATURE_2_STORAGE_READ_WITHOUT_FORMAT_BIT; + } + + if (vk_format_is_depth_or_stencil(vk_format)) { + if (!(p_format == PIPE_FORMAT_Z32_FLOAT || + p_format == PIPE_FORMAT_S8_UINT || + p_format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT || + p_format == PIPE_FORMAT_Z16_UNORM) || + tiling == VK_IMAGE_TILING_LINEAR) + return 0; + + features |= VK_FORMAT_FEATURE_2_DEPTH_STENCIL_ATTACHMENT_BIT; + } + + /* Our image atomic lowering doesn't bother to handle linear */ + if ((p_format == PIPE_FORMAT_R32_UINT || p_format == PIPE_FORMAT_R32_SINT) && + tiling == VK_IMAGE_TILING_OPTIMAL) { + + features |= VK_FORMAT_FEATURE_2_STORAGE_IMAGE_ATOMIC_BIT; + } + + if (features != 0) { + features |= VK_FORMAT_FEATURE_2_TRANSFER_SRC_BIT; + features |= VK_FORMAT_FEATURE_2_TRANSFER_DST_BIT; + features |= VK_FORMAT_FEATURE_2_HOST_IMAGE_TRANSFER_BIT_EXT; + } + + return features; +} + +VkFormatFeatureFlags2 +hk_get_image_format_features(struct hk_physical_device *pdev, + VkFormat vk_format, VkImageTiling tiling) +{ + const struct vk_format_ycbcr_info *ycbcr_info = + vk_format_get_ycbcr_info(vk_format); + if (ycbcr_info == NULL) + return hk_get_image_plane_format_features(pdev, vk_format, tiling); + + /* For multi-plane, we get the feature flags of each plane separately, + * then take their intersection as the overall format feature flags + */ + VkFormatFeatureFlags2 features = ~0ull; + bool cosited_chroma = false; + for (uint8_t plane = 0; plane < ycbcr_info->n_planes; plane++) { + const struct vk_format_ycbcr_plane *plane_info = + &ycbcr_info->planes[plane]; + features &= + hk_get_image_plane_format_features(pdev, plane_info->format, tiling); + if (plane_info->denominator_scales[0] > 1 || + plane_info->denominator_scales[1] > 1) + cosited_chroma = true; + } + if (features == 0) + return 0; + + /* Uh... We really should be able to sample from YCbCr */ + assert(features & VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_BIT); + assert(features & VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_FILTER_LINEAR_BIT); + + /* These aren't allowed for YCbCr formats */ + features &= + ~(VK_FORMAT_FEATURE_2_BLIT_SRC_BIT | VK_FORMAT_FEATURE_2_BLIT_DST_BIT | + VK_FORMAT_FEATURE_2_COLOR_ATTACHMENT_BIT | + VK_FORMAT_FEATURE_2_COLOR_ATTACHMENT_BLEND_BIT | + VK_FORMAT_FEATURE_2_STORAGE_IMAGE_BIT); + + /* This is supported on all YCbCr formats */ + features |= + VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_YCBCR_CONVERSION_LINEAR_FILTER_BIT; + + if (ycbcr_info->n_planes > 1) { + /* DISJOINT_BIT implies that each plane has its own separate binding, + * while SEPARATE_RECONSTRUCTION_FILTER_BIT implies that luma and chroma + * each have their own, separate filters, so these two bits make sense + * for multi-planar formats only. + * + * For MIDPOINT_CHROMA_SAMPLES_BIT, NVIDIA HW on single-plane interleaved + * YCbCr defaults to COSITED_EVEN, which is inaccurate and fails tests. + * This can be fixed with a NIR tweak but for now, we only enable this bit + * for multi-plane formats. See Issue #9525 on the mesa/main tracker. 
+ */ + features |= + VK_FORMAT_FEATURE_DISJOINT_BIT | + VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_YCBCR_CONVERSION_SEPARATE_RECONSTRUCTION_FILTER_BIT | + VK_FORMAT_FEATURE_2_MIDPOINT_CHROMA_SAMPLES_BIT; + } + + if (cosited_chroma) + features |= VK_FORMAT_FEATURE_COSITED_CHROMA_SAMPLES_BIT; + + return features; +} + +static VkFormatFeatureFlags2 +vk_image_usage_to_format_features(VkImageUsageFlagBits usage_flag) +{ + assert(util_bitcount(usage_flag) == 1); + switch (usage_flag) { + case VK_IMAGE_USAGE_TRANSFER_SRC_BIT: + return VK_FORMAT_FEATURE_2_TRANSFER_SRC_BIT | + VK_FORMAT_FEATURE_BLIT_SRC_BIT; + case VK_IMAGE_USAGE_TRANSFER_DST_BIT: + return VK_FORMAT_FEATURE_2_TRANSFER_DST_BIT | + VK_FORMAT_FEATURE_BLIT_DST_BIT; + case VK_IMAGE_USAGE_SAMPLED_BIT: + return VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_BIT; + case VK_IMAGE_USAGE_STORAGE_BIT: + return VK_FORMAT_FEATURE_2_STORAGE_IMAGE_BIT; + case VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT: + return VK_FORMAT_FEATURE_2_COLOR_ATTACHMENT_BIT; + case VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT: + return VK_FORMAT_FEATURE_2_DEPTH_STENCIL_ATTACHMENT_BIT; + default: + return 0; + } +} + +VKAPI_ATTR VkResult VKAPI_CALL +hk_GetPhysicalDeviceImageFormatProperties2( + VkPhysicalDevice physicalDevice, + const VkPhysicalDeviceImageFormatInfo2 *pImageFormatInfo, + VkImageFormatProperties2 *pImageFormatProperties) +{ + VK_FROM_HANDLE(hk_physical_device, pdev, physicalDevice); + + const VkPhysicalDeviceExternalImageFormatInfo *external_info = + vk_find_struct_const(pImageFormatInfo->pNext, + PHYSICAL_DEVICE_EXTERNAL_IMAGE_FORMAT_INFO); + + /* Initialize to zero in case we return VK_ERROR_FORMAT_NOT_SUPPORTED */ + memset(&pImageFormatProperties->imageFormatProperties, 0, + sizeof(pImageFormatProperties->imageFormatProperties)); + + const struct vk_format_ycbcr_info *ycbcr_info = + vk_format_get_ycbcr_info(pImageFormatInfo->format); + + /* For the purposes of these checks, we don't care about all the extra + * YCbCr features and we just want the accumulation of features available + * to all planes of the given format. 
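+ * (i.e. the intersection of the per-plane feature sets computed below)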
+ */ + VkFormatFeatureFlags2 features; + if (ycbcr_info == NULL) { + features = hk_get_image_plane_format_features( + pdev, pImageFormatInfo->format, pImageFormatInfo->tiling); + } else { + features = ~0ull; + assert(ycbcr_info->n_planes > 0); + for (uint8_t plane = 0; plane < ycbcr_info->n_planes; plane++) { + const VkFormat plane_format = ycbcr_info->planes[plane].format; + features &= hk_get_image_plane_format_features( + pdev, plane_format, pImageFormatInfo->tiling); + } + } + if (features == 0) + return VK_ERROR_FORMAT_NOT_SUPPORTED; + + if (pImageFormatInfo->tiling == VK_IMAGE_TILING_LINEAR && + pImageFormatInfo->type != VK_IMAGE_TYPE_2D) + return VK_ERROR_FORMAT_NOT_SUPPORTED; + + if (ycbcr_info && pImageFormatInfo->type != VK_IMAGE_TYPE_2D) + return VK_ERROR_FORMAT_NOT_SUPPORTED; + + /* From the Vulkan 1.3.279 spec: + * + * VUID-VkImageCreateInfo-tiling-04121 + * + * "If tiling is VK_IMAGE_TILING_LINEAR, flags must not contain + * VK_IMAGE_CREATE_SPARSE_RESIDENCY_BIT" + * + * VUID-VkImageCreateInfo-imageType-00970 + * + * "If imageType is VK_IMAGE_TYPE_1D, flags must not contain + * VK_IMAGE_CREATE_SPARSE_RESIDENCY_BIT" + */ + if (pImageFormatInfo->flags & VK_IMAGE_CREATE_SPARSE_RESIDENCY_BIT && + (pImageFormatInfo->type == VK_IMAGE_TYPE_1D || + pImageFormatInfo->tiling == VK_IMAGE_TILING_LINEAR)) + return VK_ERROR_FORMAT_NOT_SUPPORTED; + + /* From the Vulkan 1.3.279 spec: + * + * VUID-VkImageCreateInfo-flags-09403 + * + * "If flags contains VK_IMAGE_CREATE_2D_ARRAY_COMPATIBLE_BIT, flags + * must not include VK_IMAGE_CREATE_SPARSE_ALIASED_BIT, + * VK_IMAGE_CREATE_SPARSE_BINDING_BIT, or + * VK_IMAGE_CREATE_SPARSE_RESIDENCY_BIT" + */ + if ((pImageFormatInfo->flags & VK_IMAGE_CREATE_2D_ARRAY_COMPATIBLE_BIT) && + (pImageFormatInfo->flags & (VK_IMAGE_CREATE_SPARSE_ALIASED_BIT | + VK_IMAGE_CREATE_SPARSE_BINDING_BIT | + VK_IMAGE_CREATE_SPARSE_RESIDENCY_BIT))) + return VK_ERROR_FORMAT_NOT_SUPPORTED; + + /* We don't yet support sparse, but it shouldn't be too hard */ + if (pImageFormatInfo->flags & (VK_IMAGE_CREATE_SPARSE_ALIASED_BIT | + VK_IMAGE_CREATE_SPARSE_BINDING_BIT | + VK_IMAGE_CREATE_SPARSE_RESIDENCY_BIT)) + return VK_ERROR_FORMAT_NOT_SUPPORTED; + + const uint32_t max_dim = 16384; + VkExtent3D maxExtent; + uint32_t maxArraySize; + switch (pImageFormatInfo->type) { + case VK_IMAGE_TYPE_1D: + maxExtent = (VkExtent3D){max_dim, 1, 1}; + maxArraySize = 2048; + break; + case VK_IMAGE_TYPE_2D: + maxExtent = (VkExtent3D){max_dim, max_dim, 1}; + maxArraySize = 2048; + break; + case VK_IMAGE_TYPE_3D: + maxExtent = (VkExtent3D){max_dim, max_dim, max_dim}; + maxArraySize = 1; + break; + default: + unreachable("Invalid image type"); + } + if (pImageFormatInfo->tiling == VK_IMAGE_TILING_LINEAR) + maxArraySize = 1; + + assert(util_is_power_of_two_nonzero(max_dim)); + uint32_t maxMipLevels = util_logbase2(max_dim) + 1; + if (ycbcr_info != NULL || pImageFormatInfo->tiling == VK_IMAGE_TILING_LINEAR) + maxMipLevels = 1; + + VkSampleCountFlags sampleCounts = VK_SAMPLE_COUNT_1_BIT; + if (pImageFormatInfo->tiling == VK_IMAGE_TILING_OPTIMAL && + pImageFormatInfo->type == VK_IMAGE_TYPE_2D && ycbcr_info == NULL && + (features & (VK_FORMAT_FEATURE_2_COLOR_ATTACHMENT_BIT | + VK_FORMAT_FEATURE_2_DEPTH_STENCIL_ATTACHMENT_BIT)) && + !(pImageFormatInfo->flags & VK_IMAGE_CREATE_CUBE_COMPATIBLE_BIT)) { + + sampleCounts = + VK_SAMPLE_COUNT_1_BIT | VK_SAMPLE_COUNT_2_BIT | VK_SAMPLE_COUNT_4_BIT; + } + + /* From the Vulkan 1.2.199 spec: + * + * "VK_IMAGE_CREATE_EXTENDED_USAGE_BIT specifies that the image can be + * 
created with usage flags that are not supported for the format the
+    * image is created with but are supported for at least one format a
+    * VkImageView created from the image can have."
+    *
+    * If VK_IMAGE_CREATE_EXTENDED_USAGE_BIT is set, views can be created with
+    * different usage than the image so we can't always filter on usage.
+    * There is one exception to this below for storage.
+    */
+   const VkImageUsageFlags image_usage = pImageFormatInfo->usage;
+   VkImageUsageFlags view_usage = image_usage;
+   if (pImageFormatInfo->flags & VK_IMAGE_CREATE_EXTENDED_USAGE_BIT)
+      view_usage = 0;
+
+   if (view_usage & VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT) {
+      if (!(features & (VK_FORMAT_FEATURE_COLOR_ATTACHMENT_BIT |
+                        VK_FORMAT_FEATURE_DEPTH_STENCIL_ATTACHMENT_BIT))) {
+         return VK_ERROR_FORMAT_NOT_SUPPORTED;
+      }
+   }
+
+   u_foreach_bit(b, view_usage) {
+      VkFormatFeatureFlags2 usage_features =
+         vk_image_usage_to_format_features(1 << b);
+      if (usage_features && !(features & usage_features))
+         return VK_ERROR_FORMAT_NOT_SUPPORTED;
+   }
+
+   const VkExternalMemoryProperties *ext_mem_props = NULL;
+   if (external_info != NULL && external_info->handleType != 0) {
+      bool tiling_has_explicit_layout;
+      switch (pImageFormatInfo->tiling) {
+      case VK_IMAGE_TILING_LINEAR:
+      case VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT:
+         tiling_has_explicit_layout = true;
+         break;
+      case VK_IMAGE_TILING_OPTIMAL:
+         tiling_has_explicit_layout = false;
+         break;
+      default:
+         unreachable("Unsupported VkImageTiling");
+      }
+
+      switch (external_info->handleType) {
+      case VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT:
+         /* No special restrictions */
+         if (tiling_has_explicit_layout) {
+            /* With an explicit memory layout, we don't care which type of
+             * fd the image belongs to. Both OPAQUE_FD and DMA_BUF are
+             * interchangeable here.
+             */
+            ext_mem_props = &hk_dma_buf_mem_props;
+         } else {
+            ext_mem_props = &hk_opaque_fd_mem_props;
+         }
+         break;
+
+      case VK_EXTERNAL_MEMORY_HANDLE_TYPE_DMA_BUF_BIT_EXT:
+         if (!tiling_has_explicit_layout) {
+            return vk_errorf(pdev, VK_ERROR_FORMAT_NOT_SUPPORTED,
+                             "VK_EXTERNAL_MEMORY_HANDLE_TYPE_DMA_BUF_BIT_EXT "
+                             "requires VK_IMAGE_TILING_LINEAR or "
+                             "VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT");
+         }
+         ext_mem_props = &hk_dma_buf_mem_props;
+         break;
+
+      default:
+         /* From the Vulkan 1.3.256 spec:
+          *
+          *    "If handleType is not compatible with the [parameters] in
+          *    VkPhysicalDeviceImageFormatInfo2, then
+          *    vkGetPhysicalDeviceImageFormatProperties2 returns
+          *    VK_ERROR_FORMAT_NOT_SUPPORTED."
+          */
+         return vk_errorf(pdev, VK_ERROR_FORMAT_NOT_SUPPORTED,
+                          "unsupported VkExternalMemoryHandleTypeFlagBits 0x%x",
+                          external_info->handleType);
+      }
+   }
+
+   const unsigned plane_count =
+      vk_format_get_plane_count(pImageFormatInfo->format);
+
+   /* From the Vulkan 1.3.259 spec, VkImageCreateInfo:
+    *
+    *    VUID-VkImageCreateInfo-imageCreateFormatFeatures-02260
+    *
+    *    "If format is a multi-planar format, and if imageCreateFormatFeatures
+    *    (as defined in Image Creation Limits) does not contain
+    *    VK_FORMAT_FEATURE_DISJOINT_BIT, then flags must not contain
+    *    VK_IMAGE_CREATE_DISJOINT_BIT"
+    *
+    * This is satisfied trivially because we support DISJOINT on all
+    * multi-plane formats.
Also, + * + * VUID-VkImageCreateInfo-format-01577 + * + * "If format is not a multi-planar format, and flags does not include + * VK_IMAGE_CREATE_ALIAS_BIT, flags must not contain + * VK_IMAGE_CREATE_DISJOINT_BIT" + */ + if (plane_count == 1 && + !(pImageFormatInfo->flags & VK_IMAGE_CREATE_ALIAS_BIT) && + (pImageFormatInfo->flags & VK_IMAGE_CREATE_DISJOINT_BIT)) + return VK_ERROR_FORMAT_NOT_SUPPORTED; + + if (ycbcr_info && + ((pImageFormatInfo->flags & VK_IMAGE_CREATE_SPARSE_BINDING_BIT) || + (pImageFormatInfo->flags & VK_IMAGE_CREATE_SPARSE_RESIDENCY_BIT))) + return VK_ERROR_FORMAT_NOT_SUPPORTED; + + pImageFormatProperties->imageFormatProperties = (VkImageFormatProperties){ + .maxExtent = maxExtent, + .maxMipLevels = maxMipLevels, + .maxArrayLayers = maxArraySize, + .sampleCounts = sampleCounts, + .maxResourceSize = UINT32_MAX, /* TODO */ + }; + + vk_foreach_struct(s, pImageFormatProperties->pNext) { + switch (s->sType) { + case VK_STRUCTURE_TYPE_EXTERNAL_IMAGE_FORMAT_PROPERTIES: { + VkExternalImageFormatProperties *p = (void *)s; + /* From the Vulkan 1.3.256 spec: + * + * "If handleType is 0, vkGetPhysicalDeviceImageFormatProperties2 + * will behave as if VkPhysicalDeviceExternalImageFormatInfo was + * not present, and VkExternalImageFormatProperties will be + * ignored." + * + * This is true if and only if ext_mem_props == NULL + */ + if (ext_mem_props != NULL) + p->externalMemoryProperties = *ext_mem_props; + break; + } + case VK_STRUCTURE_TYPE_SAMPLER_YCBCR_CONVERSION_IMAGE_FORMAT_PROPERTIES: { + VkSamplerYcbcrConversionImageFormatProperties *ycbcr_props = (void *)s; + ycbcr_props->combinedImageSamplerDescriptorCount = plane_count; + break; + } + case VK_STRUCTURE_TYPE_HOST_IMAGE_COPY_DEVICE_PERFORMANCE_QUERY_EXT: { + VkHostImageCopyDevicePerformanceQueryEXT *hic_props = (void *)s; + + /* TODO: Check compressability */ + hic_props->optimalDeviceAccess = hic_props->identicalMemoryLayout = + true; + break; + } + default: + vk_debug_ignored_stype(s->sType); + break; + } + } + + return VK_SUCCESS; +} + +static VkSparseImageFormatProperties +hk_fill_sparse_image_fmt_props(VkImageAspectFlags aspects) +{ + /* TODO */ + return (VkSparseImageFormatProperties){ + .aspectMask = aspects, + .flags = VK_SPARSE_IMAGE_FORMAT_SINGLE_MIPTAIL_BIT, + .imageGranularity = + { + .width = 1, + .height = 1, + .depth = 1, + }, + }; +} + +VKAPI_ATTR void VKAPI_CALL +hk_GetPhysicalDeviceSparseImageFormatProperties2( + VkPhysicalDevice physicalDevice, + const VkPhysicalDeviceSparseImageFormatInfo2 *pFormatInfo, + uint32_t *pPropertyCount, VkSparseImageFormatProperties2 *pProperties) +{ + VkResult result; + + /* Check if the given format info is valid first before returning sparse + * props. 
The easiest way to do this is to just call + * hk_GetPhysicalDeviceImageFormatProperties2() + */ + const VkPhysicalDeviceImageFormatInfo2 img_fmt_info = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_IMAGE_FORMAT_INFO_2, + .format = pFormatInfo->format, + .type = pFormatInfo->type, + .tiling = pFormatInfo->tiling, + .usage = pFormatInfo->usage, + .flags = VK_IMAGE_CREATE_SPARSE_BINDING_BIT | + VK_IMAGE_CREATE_SPARSE_RESIDENCY_BIT, + }; + + VkImageFormatProperties2 img_fmt_props2 = { + .sType = VK_STRUCTURE_TYPE_IMAGE_FORMAT_PROPERTIES_2, + .pNext = NULL, + }; + + result = hk_GetPhysicalDeviceImageFormatProperties2( + physicalDevice, &img_fmt_info, &img_fmt_props2); + if (result != VK_SUCCESS) { + *pPropertyCount = 0; + return; + } + + const VkImageFormatProperties *props = &img_fmt_props2.imageFormatProperties; + if (!(pFormatInfo->samples & props->sampleCounts)) { + *pPropertyCount = 0; + return; + } + + VK_OUTARRAY_MAKE_TYPED(VkSparseImageFormatProperties2, out, pProperties, + pPropertyCount); + + VkImageAspectFlags aspects = vk_format_aspects(pFormatInfo->format); + + vk_outarray_append_typed(VkSparseImageFormatProperties2, &out, props) + { + props->properties = hk_fill_sparse_image_fmt_props(aspects); + } +} + +static enum ail_tiling +hk_map_tiling(const VkImageCreateInfo *info, unsigned plane) +{ + switch (info->tiling) { + case VK_IMAGE_TILING_LINEAR: + return AIL_TILING_LINEAR; + + case VK_IMAGE_TILING_OPTIMAL: { + const struct vk_format_ycbcr_info *ycbcr_info = + vk_format_get_ycbcr_info(info->format); + VkFormat format = + ycbcr_info ? ycbcr_info->planes[plane].format : info->format; + + if (format == VK_FORMAT_D32_SFLOAT_S8_UINT) { + format = (plane == 0) ? VK_FORMAT_D32_SFLOAT : VK_FORMAT_S8_UINT; + } + + const uint8_t width_scale = + ycbcr_info ? ycbcr_info->planes[plane].denominator_scales[0] : 1; + const uint8_t height_scale = + ycbcr_info ? 
ycbcr_info->planes[plane].denominator_scales[1] : 1; + + if ((info->extent.width / width_scale) < 16 || + (info->extent.height / height_scale) < 16) + return AIL_TILING_TWIDDLED; + + // TODO: lots of bugs to fix first + // return AIL_TILING_TWIDDLED_COMPRESSED; + return AIL_TILING_TWIDDLED; + } + + case VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT: + /* TODO */ + return AIL_TILING_TWIDDLED; + default: + unreachable("invalid tiling"); + } +} + +static uint32_t +modifier_get_score(uint64_t mod) +{ + switch (mod) { + case DRM_FORMAT_MOD_APPLE_TWIDDLED_COMPRESSED: + return 10; + + case DRM_FORMAT_MOD_APPLE_TWIDDLED: + return 5; + + case DRM_FORMAT_MOD_LINEAR: + return 1; + + default: + return 0; + } +} + +static uint64_t +choose_drm_format_mod(uint32_t modifier_count, const uint64_t *modifiers) +{ + uint64_t best_mod = UINT64_MAX; + uint32_t best_score = 0; + + for (uint32_t i = 0; i < modifier_count; ++i) { + uint32_t score = modifier_get_score(modifiers[i]); + if (score > best_score) { + best_mod = modifiers[i]; + best_score = score; + } + } + + if (best_score > 0) + return best_mod; + else + return DRM_FORMAT_MOD_INVALID; +} + +static VkResult +hk_image_init(struct hk_device *dev, struct hk_image *image, + const VkImageCreateInfo *pCreateInfo) +{ + vk_image_init(&dev->vk, &image->vk, pCreateInfo); + + if ((image->vk.usage & (VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT | + VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT)) && + image->vk.samples > 1) { + image->vk.usage |= VK_IMAGE_USAGE_SAMPLED_BIT; + image->vk.stencil_usage |= VK_IMAGE_USAGE_SAMPLED_BIT; + } + + if (image->vk.usage & VK_IMAGE_USAGE_TRANSFER_SRC_BIT) + image->vk.usage |= VK_IMAGE_USAGE_SAMPLED_BIT; + if (image->vk.usage & VK_IMAGE_USAGE_TRANSFER_DST_BIT) + image->vk.usage |= VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT; + + image->plane_count = vk_format_get_plane_count(pCreateInfo->format); + image->disjoint = image->plane_count > 1 && + (pCreateInfo->flags & VK_IMAGE_CREATE_DISJOINT_BIT); + + /* We do not support interleaved depth/stencil. Instead, we decompose to + * a depth plane and a stencil plane. + */ + if (image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT) { + image->plane_count = 2; + } + + if (image->vk.create_flags & VK_IMAGE_CREATE_SPARSE_RESIDENCY_BIT) { + /* Sparse multiplane is not supported. Sparse depth/stencil not supported + * on G13 so we're fine there too. + */ + assert(image->plane_count == 1); + } + + const struct VkImageDrmFormatModifierExplicitCreateInfoEXT + *mod_explicit_info = NULL; + + if (pCreateInfo->tiling == VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT) { + assert(!image->vk.wsi_legacy_scanout); + mod_explicit_info = vk_find_struct_const( + pCreateInfo->pNext, + IMAGE_DRM_FORMAT_MODIFIER_EXPLICIT_CREATE_INFO_EXT); + + uint64_t modifier = DRM_FORMAT_MOD_INVALID; + + if (mod_explicit_info) { + modifier = mod_explicit_info->drmFormatModifier; + } else { + const struct VkImageDrmFormatModifierListCreateInfoEXT *mod_list_info = + vk_find_struct_const( + pCreateInfo->pNext, + IMAGE_DRM_FORMAT_MODIFIER_LIST_CREATE_INFO_EXT); + + modifier = choose_drm_format_mod(mod_list_info->drmFormatModifierCount, + mod_list_info->pDrmFormatModifiers); + } + + assert(modifier != DRM_FORMAT_MOD_INVALID); + assert(image->vk.drm_format_mod == DRM_FORMAT_MOD_INVALID); + image->vk.drm_format_mod = modifier; + } + + const struct vk_format_ycbcr_info *ycbcr_info = + vk_format_get_ycbcr_info(pCreateInfo->format); + for (uint8_t plane = 0; plane < image->plane_count; plane++) { + VkFormat format = + ycbcr_info ? 
ycbcr_info->planes[plane].format : pCreateInfo->format; + + if (format == VK_FORMAT_D32_SFLOAT_S8_UINT) { + format = (plane == 0) ? VK_FORMAT_D32_SFLOAT : VK_FORMAT_S8_UINT; + } + + const uint8_t width_scale = + ycbcr_info ? ycbcr_info->planes[plane].denominator_scales[0] : 1; + const uint8_t height_scale = + ycbcr_info ? ycbcr_info->planes[plane].denominator_scales[1] : 1; + + enum ail_tiling tiling = hk_map_tiling(pCreateInfo, plane); + + image->planes[plane].layout = (struct ail_layout){ + .tiling = tiling, + .mipmapped_z = pCreateInfo->imageType == VK_IMAGE_TYPE_3D, + .format = vk_format_to_pipe_format(format), + + .width_px = pCreateInfo->extent.width / width_scale, + .height_px = pCreateInfo->extent.height / height_scale, + .depth_px = MAX2(pCreateInfo->extent.depth, pCreateInfo->arrayLayers), + + .levels = pCreateInfo->mipLevels, + .sample_count_sa = pCreateInfo->samples, + .writeable_image = tiling != AIL_TILING_TWIDDLED_COMPRESSED, + + /* TODO: Maybe optimize this, our GL driver doesn't bother though */ + .renderable = true, + }; + + ail_make_miptree(&image->planes[plane].layout); + } + + return VK_SUCCESS; +} + +static VkResult +hk_image_plane_alloc_vma(struct hk_device *dev, struct hk_image_plane *plane, + VkImageCreateFlags create_flags) +{ + const bool sparse_bound = create_flags & VK_IMAGE_CREATE_SPARSE_BINDING_BIT; + const bool sparse_resident = + create_flags & VK_IMAGE_CREATE_SPARSE_RESIDENCY_BIT; + assert(sparse_bound || !sparse_resident); + + if (sparse_bound) { + plane->vma_size_B = plane->layout.size_B; +#if 0 + plane->addr = nouveau_ws_alloc_vma(dev->ws_dev, 0, plane->vma_size_B, + plane->layout.align_B, + false, sparse_resident); +#endif + if (plane->addr == 0) { + return vk_errorf(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY, + "Sparse VMA allocation failed"); + } + } + + return VK_SUCCESS; +} + +static void +hk_image_plane_finish(struct hk_device *dev, struct hk_image_plane *plane, + VkImageCreateFlags create_flags, + const VkAllocationCallbacks *pAllocator) +{ + if (plane->vma_size_B) { +#if 0 + const bool sparse_resident = + create_flags & VK_IMAGE_CREATE_SPARSE_RESIDENCY_BIT; + + agx_bo_unbind_vma(dev->ws_dev, plane->addr, plane->vma_size_B); + nouveau_ws_free_vma(dev->ws_dev, plane->addr, plane->vma_size_B, + false, sparse_resident); +#endif + } +} + +static void +hk_image_finish(struct hk_device *dev, struct hk_image *image, + const VkAllocationCallbacks *pAllocator) +{ + for (uint8_t plane = 0; plane < image->plane_count; plane++) { + hk_image_plane_finish(dev, &image->planes[plane], image->vk.create_flags, + pAllocator); + } + + vk_image_finish(&image->vk); +} + +VKAPI_ATTR VkResult VKAPI_CALL +hk_CreateImage(VkDevice _device, const VkImageCreateInfo *pCreateInfo, + const VkAllocationCallbacks *pAllocator, VkImage *pImage) +{ + VK_FROM_HANDLE(hk_device, dev, _device); + struct hk_physical_device *pdev = hk_device_physical(dev); + struct hk_image *image; + VkResult result; + +#ifdef HK_USE_WSI_PLATFORM + /* Ignore swapchain creation info on Android. Since we don't have an + * implementation in Mesa, we're guaranteed to access an Android object + * incorrectly. 
+ */ + const VkImageSwapchainCreateInfoKHR *swapchain_info = + vk_find_struct_const(pCreateInfo->pNext, IMAGE_SWAPCHAIN_CREATE_INFO_KHR); + if (swapchain_info && swapchain_info->swapchain != VK_NULL_HANDLE) { + return wsi_common_create_swapchain_image( + &pdev->wsi_device, pCreateInfo, swapchain_info->swapchain, pImage); + } +#endif + + image = vk_zalloc2(&dev->vk.alloc, pAllocator, sizeof(*image), 8, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + if (!image) + return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY); + + result = hk_image_init(dev, image, pCreateInfo); + if (result != VK_SUCCESS) { + vk_free2(&dev->vk.alloc, pAllocator, image); + return result; + } + + for (uint8_t plane = 0; plane < image->plane_count; plane++) { + result = hk_image_plane_alloc_vma(dev, &image->planes[plane], + image->vk.create_flags); + if (result != VK_SUCCESS) { + hk_image_finish(dev, image, pAllocator); + vk_free2(&dev->vk.alloc, pAllocator, image); + return result; + } + } + + *pImage = hk_image_to_handle(image); + + return VK_SUCCESS; +} + +VKAPI_ATTR void VKAPI_CALL +hk_DestroyImage(VkDevice device, VkImage _image, + const VkAllocationCallbacks *pAllocator) +{ + VK_FROM_HANDLE(hk_device, dev, device); + VK_FROM_HANDLE(hk_image, image, _image); + + if (!image) + return; + + hk_image_finish(dev, image, pAllocator); + vk_free2(&dev->vk.alloc, pAllocator, image); +} + +static void +hk_image_plane_add_req(struct hk_image_plane *plane, uint64_t *size_B, + uint32_t *align_B) +{ + assert(util_is_power_of_two_or_zero64(*align_B)); + assert(util_is_power_of_two_or_zero64(HK_PLANE_ALIGN_B)); + + *align_B = MAX2(*align_B, HK_PLANE_ALIGN_B); + *size_B = align64(*size_B, HK_PLANE_ALIGN_B); + *size_B += plane->layout.size_B; +} + +static void +hk_get_image_memory_requirements(struct hk_device *dev, struct hk_image *image, + VkImageAspectFlags aspects, + VkMemoryRequirements2 *pMemoryRequirements) +{ + struct hk_physical_device *pdev = hk_device_physical(dev); + uint32_t memory_types = (1 << pdev->mem_type_count) - 1; + + // TODO hope for the best? + + uint64_t size_B = 0; + uint32_t align_B = 0; + if (image->disjoint) { + uint8_t plane = hk_image_aspects_to_plane(image, aspects); + hk_image_plane_add_req(&image->planes[plane], &size_B, &align_B); + } else { + for (unsigned plane = 0; plane < image->plane_count; plane++) + hk_image_plane_add_req(&image->planes[plane], &size_B, &align_B); + } + + pMemoryRequirements->memoryRequirements.memoryTypeBits = memory_types; + pMemoryRequirements->memoryRequirements.alignment = align_B; + pMemoryRequirements->memoryRequirements.size = size_B; + + vk_foreach_struct_const(ext, pMemoryRequirements->pNext) { + switch (ext->sType) { + case VK_STRUCTURE_TYPE_MEMORY_DEDICATED_REQUIREMENTS: { + VkMemoryDedicatedRequirements *dedicated = (void *)ext; + dedicated->prefersDedicatedAllocation = false; + dedicated->requiresDedicatedAllocation = false; + break; + } + default: + vk_debug_ignored_stype(ext->sType); + break; + } + } +} + +VKAPI_ATTR void VKAPI_CALL +hk_GetImageMemoryRequirements2(VkDevice device, + const VkImageMemoryRequirementsInfo2 *pInfo, + VkMemoryRequirements2 *pMemoryRequirements) +{ + VK_FROM_HANDLE(hk_device, dev, device); + VK_FROM_HANDLE(hk_image, image, pInfo->image); + + const VkImagePlaneMemoryRequirementsInfo *plane_info = + vk_find_struct_const(pInfo->pNext, IMAGE_PLANE_MEMORY_REQUIREMENTS_INFO); + const VkImageAspectFlags aspects = + image->disjoint ? 
plane_info->planeAspect : image->vk.aspects; + + hk_get_image_memory_requirements(dev, image, aspects, pMemoryRequirements); +} + +VKAPI_ATTR void VKAPI_CALL +hk_GetDeviceImageMemoryRequirements(VkDevice device, + const VkDeviceImageMemoryRequirements *pInfo, + VkMemoryRequirements2 *pMemoryRequirements) +{ + VK_FROM_HANDLE(hk_device, dev, device); + ASSERTED VkResult result; + struct hk_image image = {0}; + + result = hk_image_init(dev, &image, pInfo->pCreateInfo); + assert(result == VK_SUCCESS); + + const VkImageAspectFlags aspects = + image.disjoint ? pInfo->planeAspect : image.vk.aspects; + + hk_get_image_memory_requirements(dev, &image, aspects, pMemoryRequirements); + + hk_image_finish(dev, &image, NULL); +} + +static VkSparseImageMemoryRequirements +hk_fill_sparse_image_memory_reqs(const struct ail_layout *layout, + VkImageAspectFlags aspects) +{ + VkSparseImageFormatProperties sparse_format_props = + hk_fill_sparse_image_fmt_props(aspects); + + // assert(layout->mip_tail_first_lod <= layout->num_levels); + VkSparseImageMemoryRequirements sparse_memory_reqs = { + .formatProperties = sparse_format_props, + .imageMipTailFirstLod = 0, // layout->mip_tail_first_lod, + .imageMipTailStride = 0, + }; + + sparse_memory_reqs.imageMipTailSize = layout->size_B; + sparse_memory_reqs.imageMipTailOffset = 0; + return sparse_memory_reqs; +} + +static void +hk_get_image_sparse_memory_requirements( + struct hk_device *dev, struct hk_image *image, VkImageAspectFlags aspects, + uint32_t *pSparseMemoryRequirementCount, + VkSparseImageMemoryRequirements2 *pSparseMemoryRequirements) +{ + VK_OUTARRAY_MAKE_TYPED(VkSparseImageMemoryRequirements2, out, + pSparseMemoryRequirements, + pSparseMemoryRequirementCount); + + /* From the Vulkan 1.3.279 spec: + * + * "The sparse image must have been created using the + * VK_IMAGE_CREATE_SPARSE_RESIDENCY_BIT flag to retrieve valid sparse + * image memory requirements." + */ + if (!(image->vk.create_flags & VK_IMAGE_CREATE_SPARSE_RESIDENCY_BIT)) + return; + + /* We don't support multiplane sparse for now */ + if (image->plane_count > 1) + return; + + vk_outarray_append_typed(VkSparseImageMemoryRequirements2, &out, reqs) + { + reqs->memoryRequirements = + hk_fill_sparse_image_memory_reqs(&image->planes[0].layout, aspects); + }; +} + +VKAPI_ATTR void VKAPI_CALL +hk_GetImageSparseMemoryRequirements2( + VkDevice device, const VkImageSparseMemoryRequirementsInfo2 *pInfo, + uint32_t *pSparseMemoryRequirementCount, + VkSparseImageMemoryRequirements2 *pSparseMemoryRequirements) +{ + VK_FROM_HANDLE(hk_device, dev, device); + VK_FROM_HANDLE(hk_image, image, pInfo->image); + + const VkImageAspectFlags aspects = image->vk.aspects; + + hk_get_image_sparse_memory_requirements(dev, image, aspects, + pSparseMemoryRequirementCount, + pSparseMemoryRequirements); +} + +VKAPI_ATTR void VKAPI_CALL +hk_GetDeviceImageSparseMemoryRequirements( + VkDevice device, const VkDeviceImageMemoryRequirements *pInfo, + uint32_t *pSparseMemoryRequirementCount, + VkSparseImageMemoryRequirements2 *pSparseMemoryRequirements) +{ + VK_FROM_HANDLE(hk_device, dev, device); + ASSERTED VkResult result; + struct hk_image image = {0}; + + result = hk_image_init(dev, &image, pInfo->pCreateInfo); + assert(result == VK_SUCCESS); + + const VkImageAspectFlags aspects = + image.disjoint ? 
pInfo->planeAspect : image.vk.aspects; + + hk_get_image_sparse_memory_requirements(dev, &image, aspects, + pSparseMemoryRequirementCount, + pSparseMemoryRequirements); + + hk_image_finish(dev, &image, NULL); +} + +static void +hk_get_image_subresource_layout(UNUSED struct hk_device *dev, + struct hk_image *image, + const VkImageSubresource2KHR *pSubresource, + VkSubresourceLayout2KHR *pLayout) +{ + const VkImageSubresource *isr = &pSubresource->imageSubresource; + + const uint8_t p = hk_image_aspects_to_plane(image, isr->aspectMask); + const struct hk_image_plane *plane = &image->planes[p]; + + uint64_t offset_B = 0; + if (!image->disjoint) { + uint32_t align_B = 0; + for (unsigned plane = 0; plane < p; plane++) + hk_image_plane_add_req(&image->planes[plane], &offset_B, &align_B); + } + offset_B += + ail_get_layer_level_B(&plane->layout, isr->arrayLayer, isr->mipLevel); + + bool is_3d = image->vk.image_type == VK_IMAGE_TYPE_3D; + + pLayout->subresourceLayout = (VkSubresourceLayout){ + .offset = offset_B, + .size = ail_get_level_size_B(&plane->layout, isr->mipLevel), + + /* From the spec: + * + * It is legal to call vkGetImageSubresourceLayout2KHR with a image + * created with tiling equal to VK_IMAGE_TILING_OPTIMAL, but the + * members of VkSubresourceLayout2KHR::subresourceLayout will have + * undefined values in this case. + * + * So don't collapse with mips. + */ + .rowPitch = isr->mipLevel + ? 0 + : ail_get_wsi_stride_B(&plane->layout, isr->mipLevel), + .arrayPitch = is_3d ? 0 : plane->layout.layer_stride_B, + .depthPitch = is_3d ? plane->layout.layer_stride_B : 0, + }; + + VkSubresourceHostMemcpySizeEXT *memcpy_size = + vk_find_struct(pLayout, SUBRESOURCE_HOST_MEMCPY_SIZE_EXT); + if (memcpy_size) { + memcpy_size->size = pLayout->subresourceLayout.size; + } +} + +VKAPI_ATTR void VKAPI_CALL +hk_GetImageSubresourceLayout2KHR(VkDevice device, VkImage _image, + const VkImageSubresource2KHR *pSubresource, + VkSubresourceLayout2KHR *pLayout) +{ + VK_FROM_HANDLE(hk_device, dev, device); + VK_FROM_HANDLE(hk_image, image, _image); + + hk_get_image_subresource_layout(dev, image, pSubresource, pLayout); +} + +VKAPI_ATTR void VKAPI_CALL +hk_GetDeviceImageSubresourceLayoutKHR( + VkDevice device, const VkDeviceImageSubresourceInfoKHR *pInfo, + VkSubresourceLayout2KHR *pLayout) +{ + VK_FROM_HANDLE(hk_device, dev, device); + ASSERTED VkResult result; + struct hk_image image = {0}; + + result = hk_image_init(dev, &image, pInfo->pCreateInfo); + assert(result == VK_SUCCESS); + + hk_get_image_subresource_layout(dev, &image, pInfo->pSubresource, pLayout); + + hk_image_finish(dev, &image, NULL); +} + +static void +hk_image_plane_bind(struct hk_device *dev, struct hk_image_plane *plane, + struct hk_device_memory *mem, uint64_t *offset_B) +{ + *offset_B = align64(*offset_B, HK_PLANE_ALIGN_B); + + if (plane->vma_size_B) { +#if 0 + agx_bo_bind_vma(dev->ws_dev, + mem->bo, + plane->addr, + plane->vma_size_B, + *offset_B, + plane->nil.pte_kind); +#endif + unreachable("todo"); + } else { + plane->addr = mem->bo->ptr.gpu + *offset_B; + plane->map = mem->bo->ptr.cpu + *offset_B; + plane->rem = mem->bo->size - (*offset_B); + } + + *offset_B += plane->layout.size_B; +} + +VKAPI_ATTR VkResult VKAPI_CALL +hk_BindImageMemory2(VkDevice device, uint32_t bindInfoCount, + const VkBindImageMemoryInfo *pBindInfos) +{ + VK_FROM_HANDLE(hk_device, dev, device); + for (uint32_t i = 0; i < bindInfoCount; ++i) { + VK_FROM_HANDLE(hk_device_memory, mem, pBindInfos[i].memory); + VK_FROM_HANDLE(hk_image, image, pBindInfos[i].image); + + 
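+      /* Three cases below: swapchain-backed images inherit the address of
+       * the swapchain image's plane; disjoint images bind only the plane
+       * selected by VkBindImagePlaneMemoryInfo; otherwise all planes are
+       * packed consecutively into the allocation, each aligned to
+       * HK_PLANE_ALIGN_B by hk_image_plane_bind().
+       */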
/* Ignore this struct on Android, we cannot access swapchain structures + * there. */ +#ifdef HK_USE_WSI_PLATFORM + const VkBindImageMemorySwapchainInfoKHR *swapchain_info = + vk_find_struct_const(pBindInfos[i].pNext, + BIND_IMAGE_MEMORY_SWAPCHAIN_INFO_KHR); + + if (swapchain_info && swapchain_info->swapchain != VK_NULL_HANDLE) { + VkImage _wsi_image = wsi_common_get_image(swapchain_info->swapchain, + swapchain_info->imageIndex); + VK_FROM_HANDLE(hk_image, wsi_img, _wsi_image); + + assert(image->plane_count == 1); + assert(wsi_img->plane_count == 1); + + struct hk_image_plane *plane = &image->planes[0]; + struct hk_image_plane *swapchain_plane = &wsi_img->planes[0]; + + /* Copy memory binding information from swapchain image to the current + * image's plane. */ + plane->addr = swapchain_plane->addr; + continue; + } +#endif + + uint64_t offset_B = pBindInfos[i].memoryOffset; + if (image->disjoint) { + const VkBindImagePlaneMemoryInfo *plane_info = vk_find_struct_const( + pBindInfos[i].pNext, BIND_IMAGE_PLANE_MEMORY_INFO); + uint8_t plane = + hk_image_aspects_to_plane(image, plane_info->planeAspect); + hk_image_plane_bind(dev, &image->planes[plane], mem, &offset_B); + } else { + for (unsigned plane = 0; plane < image->plane_count; plane++) { + hk_image_plane_bind(dev, &image->planes[plane], mem, &offset_B); + } + } + + const VkBindMemoryStatusKHR *status = + vk_find_struct_const(pBindInfos[i].pNext, BIND_MEMORY_STATUS_KHR); + if (status != NULL && status->pResult != NULL) + *status->pResult = VK_SUCCESS; + } + + return VK_SUCCESS; +} + +static uint32_t +hk_plane_index(VkFormat format, VkImageAspectFlags aspect_mask) +{ + switch (aspect_mask) { + default: + assert(aspect_mask != VK_IMAGE_ASPECT_MEMORY_PLANE_3_BIT_EXT); + return 0; + case VK_IMAGE_ASPECT_PLANE_1_BIT: + case VK_IMAGE_ASPECT_MEMORY_PLANE_1_BIT_EXT: + return 1; + case VK_IMAGE_ASPECT_PLANE_2_BIT: + case VK_IMAGE_ASPECT_MEMORY_PLANE_2_BIT_EXT: + return 2; + case VK_IMAGE_ASPECT_STENCIL_BIT: + return format == VK_FORMAT_D32_SFLOAT_S8_UINT; + } +} + +static void +hk_copy_memory_to_image(struct hk_device *device, struct hk_image *dst_image, + const VkMemoryToImageCopyEXT *info, bool copy_memcpy) +{ + unsigned plane = + hk_plane_index(dst_image->vk.format, info->imageSubresource.aspectMask); + const struct ail_layout *layout = &dst_image->planes[plane].layout; + + VkOffset3D offset = info->imageOffset; + VkExtent3D extent = info->imageExtent; + uint32_t src_width = info->memoryRowLength ?: extent.width; + uint32_t src_height = info->memoryImageHeight ?: extent.height; + + uint32_t blocksize_B = util_format_get_blocksize(layout->format); + uint32_t src_pitch = src_width * blocksize_B; + + unsigned start_layer = (dst_image->vk.image_type == VK_IMAGE_TYPE_3D) + ? offset.z + : info->imageSubresource.baseArrayLayer; + uint32_t layers = + MAX2(extent.depth, vk_image_subresource_layer_count( + &dst_image->vk, &info->imageSubresource)); + + unsigned level = info->imageSubresource.mipLevel; + uint32_t image_offset = ail_get_layer_level_B(layout, start_layer, level); + uint32_t dst_layer_stride = layout->layer_stride_B; + uint32_t src_layer_stride = copy_memcpy + ? 
ail_get_level_size_B(layout, level) + : (src_width * src_height * blocksize_B); + bool tiled = ail_is_level_twiddled_uncompressed( + layout, info->imageSubresource.mipLevel); + + const char *src = + (const char *)info->pHostPointer + start_layer * dst_layer_stride; + char *dst = (char *)dst_image->planes[plane].map + image_offset; + for (unsigned layer = 0; layer < layers; + layer++, src += src_layer_stride, dst += dst_layer_stride) { + if (copy_memcpy) { + memcpy(dst, src, ail_get_level_size_B(layout, level)); + } else if (!tiled) { + uint32_t dst_pitch = ail_get_linear_stride_B(layout, level); + /*TODO:comp*/ + for (unsigned y = 0; y < extent.height; y++) { + memcpy(dst + dst_pitch * (y + offset.y) + offset.x * blocksize_B, + src + src_pitch * y, extent.width * blocksize_B); + } + } else { + ail_tile(dst, (void *)src, layout, level, src_pitch, offset.x, + offset.y, extent.width, extent.height); + } + } +} + +VKAPI_ATTR VkResult VKAPI_CALL +hk_CopyMemoryToImageEXT(VkDevice _device, + const VkCopyMemoryToImageInfoEXT *info) +{ + VK_FROM_HANDLE(hk_device, device, _device); + VK_FROM_HANDLE(hk_image, dst_image, info->dstImage); + + for (unsigned i = 0; i < info->regionCount; i++) { + hk_copy_memory_to_image(device, dst_image, &info->pRegions[i], + info->flags & VK_HOST_IMAGE_COPY_MEMCPY_EXT); + } + + return VK_SUCCESS; +} + +static void +hk_copy_image_to_memory(struct hk_device *device, struct hk_image *src_image, + const VkImageToMemoryCopyEXT *info, bool copy_memcpy) +{ + unsigned plane = + hk_plane_index(src_image->vk.format, info->imageSubresource.aspectMask); + const struct ail_layout *layout = &src_image->planes[plane].layout; + + VkOffset3D offset = info->imageOffset; + VkExtent3D extent = info->imageExtent; + uint32_t dst_width = info->memoryRowLength ?: extent.width; + uint32_t dst_height = info->memoryImageHeight ?: extent.height; + +#if 0 + copy_compressed(src_image->vk.format, &offset, &extent, &dst_width, + &dst_height); +#endif + + uint32_t blocksize_B = util_format_get_blocksize(layout->format); + uint32_t dst_pitch = dst_width * blocksize_B; + + unsigned start_layer = (src_image->vk.image_type == VK_IMAGE_TYPE_3D) + ? offset.z + : info->imageSubresource.baseArrayLayer; + uint32_t layers = + MAX2(extent.depth, vk_image_subresource_layer_count( + &src_image->vk, &info->imageSubresource)); + unsigned level = info->imageSubresource.mipLevel; + + uint32_t image_offset = ail_get_layer_level_B(layout, start_layer, level); + uint32_t src_layer_stride = layout->layer_stride_B; + uint32_t dst_layer_stride = copy_memcpy + ? 
ail_get_level_size_B(layout, level) + : (dst_width * dst_height * blocksize_B); + + bool tiled = ail_is_level_twiddled_uncompressed( + layout, info->imageSubresource.mipLevel); + + const char *src = (const char *)src_image->planes[plane].map + image_offset; + char *dst = (char *)info->pHostPointer + start_layer * dst_layer_stride; + for (unsigned layer = 0; layer < layers; + layer++, src += src_layer_stride, dst += dst_layer_stride) { + + if (copy_memcpy) { + memcpy(dst, src, dst_layer_stride); + } else if (!tiled) { + /* TODO: comp */ + uint32_t src_pitch = ail_get_linear_stride_B(layout, level); + for (unsigned y = 0; y < extent.height; y++) { + memcpy(dst + dst_pitch * y, + src + src_pitch * (y + offset.y) + offset.x * blocksize_B, + extent.width * blocksize_B); + } + } else { + ail_detile((void *)src, dst, layout, info->imageSubresource.mipLevel, + dst_pitch, offset.x, offset.y, extent.width, extent.height); + } + } +} + +VKAPI_ATTR VkResult VKAPI_CALL +hk_CopyImageToMemoryEXT(VkDevice _device, + const VkCopyImageToMemoryInfoEXT *info) +{ + VK_FROM_HANDLE(hk_device, device, _device); + VK_FROM_HANDLE(hk_image, image, info->srcImage); + + for (unsigned i = 0; i < info->regionCount; i++) { + hk_copy_image_to_memory(device, image, &info->pRegions[i], + info->flags & VK_HOST_IMAGE_COPY_MEMCPY_EXT); + } + + return VK_SUCCESS; +} + +static void +hk_copy_image_to_image_cpu(struct hk_device *device, struct hk_image *src_image, + struct hk_image *dst_image, const VkImageCopy2 *info, + bool copy_memcpy) +{ + unsigned src_plane = + hk_plane_index(src_image->vk.format, info->srcSubresource.aspectMask); + unsigned dst_plane = + hk_plane_index(dst_image->vk.format, info->dstSubresource.aspectMask); + + const struct ail_layout *src_layout = &src_image->planes[src_plane].layout; + const struct ail_layout *dst_layout = &dst_image->planes[dst_plane].layout; + + VkOffset3D src_offset = info->srcOffset; + VkOffset3D dst_offset = info->dstOffset; + VkExtent3D extent = info->extent; + uint32_t layers_to_copy = MAX2( + info->extent.depth, + vk_image_subresource_layer_count(&src_image->vk, &info->srcSubresource)); + + /* See comment above. */ +#if 0 + copy_compressed(src_image->vk.format, &src_offset, &extent, NULL, NULL); + copy_compressed(dst_image->vk.format, &dst_offset, NULL, NULL, NULL); +#endif + + unsigned src_start_layer = (src_image->vk.image_type == VK_IMAGE_TYPE_3D) + ? src_offset.z + : info->srcSubresource.baseArrayLayer; + unsigned dst_start_layer = (dst_image->vk.image_type == VK_IMAGE_TYPE_3D) + ? 
dst_offset.z + : info->dstSubresource.baseArrayLayer; + + uint32_t src_layer_stride = src_layout->layer_stride_B; + uint32_t dst_layer_stride = dst_layout->layer_stride_B; + + uint32_t dst_block_B = util_format_get_blocksize(dst_layout->format); + uint32_t src_block_B = util_format_get_blocksize(src_layout->format); + + uint32_t src_image_offset = ail_get_layer_level_B( + src_layout, src_start_layer, info->srcSubresource.mipLevel); + uint32_t dst_image_offset = ail_get_layer_level_B( + dst_layout, dst_start_layer, info->dstSubresource.mipLevel); + + bool src_tiled = ail_is_level_twiddled_uncompressed( + src_layout, info->srcSubresource.mipLevel); + bool dst_tiled = ail_is_level_twiddled_uncompressed( + dst_layout, info->dstSubresource.mipLevel); + + const char *src = + (const char *)src_image->planes[src_plane].map + src_image_offset; + char *dst = (char *)dst_image->planes[dst_plane].map + dst_image_offset; + for (unsigned layer = 0; layer < layers_to_copy; + layer++, src += src_layer_stride, dst += dst_layer_stride) { + + if (copy_memcpy) { + uint32_t src_size = + ail_get_level_size_B(src_layout, info->srcSubresource.mipLevel); + uint32_t dst_size = + ail_get_level_size_B(dst_layout, info->dstSubresource.mipLevel); + + assert(src_size == dst_size); + memcpy(dst, src, src_size); + } else if (!src_tiled && !dst_tiled) { + /* TODO comp */ + uint32_t src_pitch = + ail_get_linear_stride_B(src_layout, info->srcSubresource.mipLevel); + + uint32_t dst_pitch = + ail_get_linear_stride_B(dst_layout, info->dstSubresource.mipLevel); + + for (unsigned y = 0; y < extent.height; y++) { + memcpy(dst + dst_pitch * (y + dst_offset.y) + + dst_offset.x * dst_block_B, + src + src_pitch * (y + src_offset.y) + + src_offset.x * src_block_B, + extent.width * src_block_B); + } + } else if (!src_tiled) { + unreachable("todo"); +#if 0 + fdl6_memcpy_linear_to_tiled( + dst_offset.x, dst_offset.y, extent.width, extent.height, dst, + src + src_pitch * src_offset.y + src_offset.x * src_layout->cpp, + dst_layout, info->dstSubresource.mipLevel, src_pitch, + &device->physical_device->ubwc_config); +#endif + } else if (!dst_tiled) { + unreachable("todo"); +#if 0 + fdl6_memcpy_tiled_to_linear( + src_offset.x, src_offset.y, extent.width, extent.height, + dst + dst_pitch * dst_offset.y + dst_offset.x * dst_layout->cpp, + src, src_layout, info->dstSubresource.mipLevel, dst_pitch, + &device->physical_device->ubwc_config); +#endif + } else { + /* Work tile-by-tile, holding the unswizzled tile in a temporary + * buffer. 
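+          *
+          * For each source tile that overlaps the copy region, detile the
+          * covered rectangle into temp_tile at a linear pitch of one tile
+          * width, then re-tile that rectangle into the destination at the
+          * corresponding offset. Widths and heights are clamped so partial
+          * tiles at the region edges are handled.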
+ */ + char temp_tile[16384]; + + unsigned src_level = info->srcSubresource.mipLevel; + unsigned dst_level = info->dstSubresource.mipLevel; + uint32_t block_width = src_layout->tilesize_el[src_level].width_el; + uint32_t block_height = src_layout->tilesize_el[src_level].height_el; + uint32_t temp_pitch = block_width * src_block_B; + ; + + for (unsigned by = src_offset.y / block_height; + by * block_height < src_offset.y + extent.height; by++) { + uint32_t src_y_start = MAX2(src_offset.y, by * block_height); + uint32_t dst_y_start = src_y_start - src_offset.y + dst_offset.y; + uint32_t height = + MIN2((by + 1) * block_height, src_offset.y + extent.height) - + src_y_start; + for (unsigned bx = src_offset.x / block_width; + bx * block_width < src_offset.x + extent.width; bx++) { + uint32_t src_x_start = MAX2(src_offset.x, bx * block_width); + uint32_t dst_x_start = src_x_start - src_offset.x + dst_offset.x; + uint32_t width = + MIN2((bx + 1) * block_width, src_offset.x + extent.width) - + src_x_start; + + ail_detile((void *)src, temp_tile, src_layout, src_level, + temp_pitch, src_x_start, src_y_start, width, height); + ail_tile(dst, temp_tile, dst_layout, dst_level, temp_pitch, + dst_x_start, dst_y_start, width, height); + } + } + } + } +} + +VKAPI_ATTR VkResult VKAPI_CALL +hk_CopyImageToImageEXT(VkDevice _device, + const VkCopyImageToImageInfoEXT *pCopyImageToImageInfo) +{ + VK_FROM_HANDLE(hk_device, device, _device); + VK_FROM_HANDLE(hk_image, src_image, pCopyImageToImageInfo->srcImage); + VK_FROM_HANDLE(hk_image, dst_image, pCopyImageToImageInfo->dstImage); + bool copy_memcpy = + pCopyImageToImageInfo->flags & VK_HOST_IMAGE_COPY_MEMCPY_EXT; + + for (uint32_t i = 0; i < pCopyImageToImageInfo->regionCount; ++i) { + if (src_image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT) { + VkImageCopy2 info = pCopyImageToImageInfo->pRegions[i]; + u_foreach_bit(b, info.dstSubresource.aspectMask) { + info.srcSubresource.aspectMask = BITFIELD_BIT(b); + info.dstSubresource.aspectMask = BITFIELD_BIT(b); + hk_copy_image_to_image_cpu(device, src_image, dst_image, &info, + copy_memcpy); + } + continue; + } + + hk_copy_image_to_image_cpu(device, src_image, dst_image, + pCopyImageToImageInfo->pRegions + i, + copy_memcpy); + } + + return VK_SUCCESS; +} + +VKAPI_ATTR VkResult VKAPI_CALL +hk_TransitionImageLayoutEXT( + VkDevice device, uint32_t transitionCount, + const VkHostImageLayoutTransitionInfoEXT *transitions) +{ + /* We don't do anything with layouts so this should be a no-op */ + return VK_SUCCESS; +} diff --git a/src/asahi/vulkan/hk_image.h b/src/asahi/vulkan/hk_image.h new file mode 100644 index 00000000000..a15129032aa --- /dev/null +++ b/src/asahi/vulkan/hk_image.h @@ -0,0 +1,115 @@ +/* + * Copyright 2024 Valve Corporation + * Copyright 2024 Alyssa Rosenzweig + * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc. + * SPDX-License-Identifier: MIT + */ + +#pragma once + +#include "asahi/layout/layout.h" +#include "vulkan/vulkan_core.h" + +#include "hk_private.h" + +#include "vk_image.h" + +/* Because small images can end up with an array_stride_B that is less than + * the sparse block size (in bytes), we have to set SINGLE_MIPTAIL_BIT when + * advertising sparse properties to the client. This means that we get one + * single memory range for the miptail of the image. For large images with + * mipTailStartLod > 0, we have to deal with the array stride ourselves. + * + * We do this by returning HK_MIP_TAIL_START_OFFSET as the image's + * imageMipTailOffset. 
We can then detect anything with that address as + * being part of the miptail and re-map it accordingly. The Vulkan spec + * explicitly allows for this. + * + * From the Vulkan 1.3.279 spec: + * + * "When VK_SPARSE_MEMORY_BIND_METADATA_BIT is present, the resourceOffset + * must have been derived explicitly from the imageMipTailOffset in the + * sparse resource properties returned for the metadata aspect. By + * manipulating the value returned for imageMipTailOffset, the + * resourceOffset does not have to correlate directly to a device virtual + * address offset, and may instead be whatever value makes it easiest for + * the implementation to derive the correct device virtual address." + */ +#define HK_MIP_TAIL_START_OFFSET 0x6d74000000000000UL + +struct hk_device_memory; +struct hk_physical_device; + +static VkFormatFeatureFlags2 +hk_get_image_plane_format_features(struct hk_physical_device *pdev, + VkFormat vk_format, VkImageTiling tiling); + +VkFormatFeatureFlags2 +hk_get_image_format_features(struct hk_physical_device *pdevice, + VkFormat format, VkImageTiling tiling); + +struct hk_image_plane { + struct ail_layout layout; + uint64_t addr; + + /** Size of the reserved VMA range for sparse images, zero otherwise. */ + uint64_t vma_size_B; + + /* For host image copy */ + void *map; + uint32_t rem; +}; + +struct hk_image { + struct vk_image vk; + + /** True if the planes are bound separately + * + * This is set based on VK_IMAGE_CREATE_DISJOINT_BIT + */ + bool disjoint; + + uint8_t plane_count; + struct hk_image_plane planes[3]; +}; + +VK_DEFINE_NONDISP_HANDLE_CASTS(hk_image, vk.base, VkImage, VK_OBJECT_TYPE_IMAGE) + +static inline uint64_t +hk_image_plane_base_address(const struct hk_image_plane *plane) +{ + return plane->addr; +} + +static inline uint64_t +hk_image_base_address(const struct hk_image *image, uint8_t plane) +{ + return hk_image_plane_base_address(&image->planes[plane]); +} + +static inline uint8_t +hk_image_aspects_to_plane(const struct hk_image *image, + VkImageAspectFlags aspectMask) +{ + /* Must only be one aspect unless it's depth/stencil */ + assert(aspectMask == + (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT) || + util_bitcount(aspectMask) == 1); + + switch (aspectMask) { + default: + assert(aspectMask != VK_IMAGE_ASPECT_MEMORY_PLANE_3_BIT_EXT); + return 0; + + case VK_IMAGE_ASPECT_STENCIL_BIT: + return image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT; + + case VK_IMAGE_ASPECT_PLANE_1_BIT: + case VK_IMAGE_ASPECT_MEMORY_PLANE_1_BIT_EXT: + return 1; + + case VK_IMAGE_ASPECT_PLANE_2_BIT: + case VK_IMAGE_ASPECT_MEMORY_PLANE_2_BIT_EXT: + return 2; + } +} diff --git a/src/asahi/vulkan/hk_image_view.c b/src/asahi/vulkan/hk_image_view.c new file mode 100644 index 00000000000..5a78224a4fd --- /dev/null +++ b/src/asahi/vulkan/hk_image_view.c @@ -0,0 +1,653 @@ +/* + * Copyright 2024 Valve Corporation + * Copyright 2024 Alyssa Rosenzweig + * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc. 
+ * SPDX-License-Identifier: MIT + */ +#include "hk_image_view.h" +#include "util/format/u_format.h" +#include "vulkan/vulkan_core.h" + +#include "agx_helpers.h" +#include "agx_nir_passes.h" +#include "agx_pack.h" +#include "hk_device.h" +#include "hk_entrypoints.h" +#include "hk_image.h" +#include "hk_physical_device.h" + +#include "layout.h" +#include "vk_format.h" + +enum hk_desc_usage { + HK_DESC_USAGE_SAMPLED, + HK_DESC_USAGE_STORAGE, + HK_DESC_USAGE_INPUT, + HK_DESC_USAGE_BG_EOT, + HK_DESC_USAGE_LAYERED_BG_EOT, + HK_DESC_USAGE_EMRT, +}; + +static bool +hk_image_view_type_is_array(VkImageViewType view_type) +{ + switch (view_type) { + case VK_IMAGE_VIEW_TYPE_1D: + case VK_IMAGE_VIEW_TYPE_2D: + case VK_IMAGE_VIEW_TYPE_3D: + case VK_IMAGE_VIEW_TYPE_CUBE: + return false; + + case VK_IMAGE_VIEW_TYPE_1D_ARRAY: + case VK_IMAGE_VIEW_TYPE_2D_ARRAY: + case VK_IMAGE_VIEW_TYPE_CUBE_ARRAY: + return true; + + default: + unreachable("Invalid image view type"); + } +} + +static enum agx_texture_dimension +translate_image_view_type(VkImageViewType view_type, bool msaa, bool layered, + enum hk_desc_usage usage) +{ + if (usage == HK_DESC_USAGE_EMRT || usage == HK_DESC_USAGE_INPUT || + (usage == HK_DESC_USAGE_LAYERED_BG_EOT && layered)) { + return msaa ? AGX_TEXTURE_DIMENSION_2D_ARRAY_MULTISAMPLED + : AGX_TEXTURE_DIMENSION_2D_ARRAY; + } + + /* For background/EOT, we ignore the application-provided view type */ + if (usage == HK_DESC_USAGE_BG_EOT || usage == HK_DESC_USAGE_LAYERED_BG_EOT) { + return msaa ? AGX_TEXTURE_DIMENSION_2D_MULTISAMPLED + : AGX_TEXTURE_DIMENSION_2D; + } + + bool cubes_to_2d = usage != HK_DESC_USAGE_SAMPLED; + + switch (view_type) { + case VK_IMAGE_VIEW_TYPE_1D: + case VK_IMAGE_VIEW_TYPE_2D: + return msaa ? AGX_TEXTURE_DIMENSION_2D_MULTISAMPLED + : AGX_TEXTURE_DIMENSION_2D; + + case VK_IMAGE_VIEW_TYPE_1D_ARRAY: + case VK_IMAGE_VIEW_TYPE_2D_ARRAY: + return msaa ? AGX_TEXTURE_DIMENSION_2D_ARRAY_MULTISAMPLED + : AGX_TEXTURE_DIMENSION_2D_ARRAY; + + case VK_IMAGE_VIEW_TYPE_3D: + assert(!msaa); + return AGX_TEXTURE_DIMENSION_3D; + + case VK_IMAGE_VIEW_TYPE_CUBE: + assert(!msaa); + return cubes_to_2d ? AGX_TEXTURE_DIMENSION_2D_ARRAY + : AGX_TEXTURE_DIMENSION_CUBE; + + case VK_IMAGE_VIEW_TYPE_CUBE_ARRAY: + assert(!msaa); + return cubes_to_2d ? 
AGX_TEXTURE_DIMENSION_2D_ARRAY + : AGX_TEXTURE_DIMENSION_CUBE_ARRAY; + + default: + unreachable("Invalid image view type"); + } +} + +static enum pipe_swizzle +vk_swizzle_to_pipe(VkComponentSwizzle swizzle) +{ + switch (swizzle) { + case VK_COMPONENT_SWIZZLE_R: + return PIPE_SWIZZLE_X; + case VK_COMPONENT_SWIZZLE_G: + return PIPE_SWIZZLE_Y; + case VK_COMPONENT_SWIZZLE_B: + return PIPE_SWIZZLE_Z; + case VK_COMPONENT_SWIZZLE_A: + return PIPE_SWIZZLE_W; + case VK_COMPONENT_SWIZZLE_ONE: + return PIPE_SWIZZLE_1; + case VK_COMPONENT_SWIZZLE_ZERO: + return PIPE_SWIZZLE_0; + default: + unreachable("Invalid component swizzle"); + } +} + +static enum pipe_format +get_stencil_format(enum pipe_format format) +{ + switch (format) { + case PIPE_FORMAT_S8_UINT: + return PIPE_FORMAT_S8_UINT; + case PIPE_FORMAT_Z24_UNORM_S8_UINT: + return PIPE_FORMAT_X24S8_UINT; + case PIPE_FORMAT_S8_UINT_Z24_UNORM: + return PIPE_FORMAT_S8X24_UINT; + case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT: + return PIPE_FORMAT_X32_S8X24_UINT; + default: + unreachable("Unsupported depth/stencil format"); + } +} + +struct hk_3d { + unsigned x, y, z; +}; + +static struct hk_3d +view_denominator(struct hk_image_view *view) +{ + enum pipe_format view_format = vk_format_to_pipe_format(view->vk.format); + enum pipe_format img_format = + vk_format_to_pipe_format(view->vk.image->format); + + if (util_format_is_compressed(view_format)) { + /* + * We can do an uncompressed view of a compressed image but not the other + * way around. + */ + assert(util_format_is_compressed(img_format)); + assert(util_format_get_blockwidth(img_format) == + util_format_get_blockwidth(view_format)); + assert(util_format_get_blockheight(img_format) == + util_format_get_blockheight(view_format)); + assert(util_format_get_blockdepth(img_format) == + util_format_get_blockdepth(view_format)); + + return (struct hk_3d){1, 1, 1}; + } + + if (!util_format_is_compressed(img_format)) { + /* Both formats uncompressed */ + return (struct hk_3d){1, 1, 1}; + } + + /* Else, img is compressed but view is not */ + return (struct hk_3d){ + util_format_get_blockwidth(img_format), + util_format_get_blockheight(img_format), + util_format_get_blockdepth(img_format), + }; +} + +static enum pipe_format +format_for_plane(struct hk_image_view *view, unsigned view_plane) +{ + const struct vk_format_ycbcr_info *ycbcr_info = + vk_format_get_ycbcr_info(view->vk.format); + + assert(ycbcr_info || view_plane == 0); + VkFormat plane_format = + ycbcr_info ? 
ycbcr_info->planes[view_plane].format : view->vk.format; + + enum pipe_format p_format = vk_format_to_pipe_format(plane_format); + if (view->vk.aspects == VK_IMAGE_ASPECT_STENCIL_BIT) + p_format = get_stencil_format(p_format); + + return p_format; +} + +static void +pack_texture(struct hk_image_view *view, unsigned view_plane, + enum hk_desc_usage usage, struct agx_texture_packed *out) +{ + struct hk_image *image = container_of(view->vk.image, struct hk_image, vk); + const uint8_t image_plane = view->planes[view_plane].image_plane; + struct ail_layout *layout = &image->planes[image_plane].layout; + uint64_t base_addr = hk_image_base_address(image, image_plane); + + bool cubes_to_2d = usage != HK_DESC_USAGE_SAMPLED; + + unsigned level = view->vk.base_mip_level; + unsigned layer = view->vk.base_array_layer; + + enum pipe_format p_format = format_for_plane(view, view_plane); + const struct util_format_description *desc = + util_format_description(p_format); + + struct hk_3d denom = view_denominator(view); + + uint8_t format_swizzle[4] = { + desc->swizzle[0], + desc->swizzle[1], + desc->swizzle[2], + desc->swizzle[3], + }; + + /* Different APIs have different depth/stencil swizzle rules. Vulkan expects + * R001 behaviour, override here because Mesa's format table is not that. + */ + if (util_format_is_depth_or_stencil(p_format)) { + format_swizzle[0] = PIPE_SWIZZLE_X; + format_swizzle[1] = PIPE_SWIZZLE_0; + format_swizzle[2] = PIPE_SWIZZLE_0; + format_swizzle[3] = PIPE_SWIZZLE_1; + } + + /* We only have a single swizzle for the user swizzle and the format + * fixup, so compose them now. + */ + uint8_t out_swizzle[4]; + uint8_t view_swizzle[4] = { + vk_swizzle_to_pipe(view->vk.swizzle.r), + vk_swizzle_to_pipe(view->vk.swizzle.g), + vk_swizzle_to_pipe(view->vk.swizzle.b), + vk_swizzle_to_pipe(view->vk.swizzle.a), + }; + + unsigned layers = view->vk.layer_count; + if (view->vk.view_type == VK_IMAGE_VIEW_TYPE_3D) { + layers = DIV_ROUND_UP(layout->depth_px, denom.z); + } else if (!cubes_to_2d && + (view->vk.view_type == VK_IMAGE_VIEW_TYPE_CUBE || + view->vk.view_type == VK_IMAGE_VIEW_TYPE_CUBE_ARRAY)) { + + layers /= 6; + } + + util_format_compose_swizzles(format_swizzle, view_swizzle, out_swizzle); + + agx_pack(out, TEXTURE, cfg) { + cfg.dimension = translate_image_view_type( + view->vk.view_type, view->vk.image->samples > 1, layers > 1, usage); + cfg.layout = agx_translate_layout(layout->tiling); + cfg.channels = agx_pixel_format[p_format].channels; + cfg.type = agx_pixel_format[p_format].type; + cfg.srgb = util_format_is_srgb(p_format); + + cfg.swizzle_r = agx_channel_from_pipe(out_swizzle[0]); + cfg.swizzle_g = agx_channel_from_pipe(out_swizzle[1]); + cfg.swizzle_b = agx_channel_from_pipe(out_swizzle[2]); + cfg.swizzle_a = agx_channel_from_pipe(out_swizzle[3]); + + if (denom.x > 1) { + assert(view->vk.level_count == 1); + assert(view->vk.layer_count == 1); + + cfg.address = base_addr + ail_get_layer_level_B(layout, layer, level); + cfg.width = DIV_ROUND_UP(u_minify(layout->width_px, level), denom.x); + cfg.height = DIV_ROUND_UP(u_minify(layout->height_px, level), denom.y); + cfg.first_level = 0; + cfg.last_level = 1; + } else { + cfg.address = base_addr + ail_get_layer_offset_B(layout, layer); + cfg.width = layout->width_px; + cfg.height = layout->height_px; + cfg.first_level = level; + cfg.last_level = level + view->vk.level_count - 1; + } + + cfg.srgb = (desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB); + cfg.unk_mipmapped = layout->levels > 1; + cfg.srgb_2_channel = cfg.srgb && 
util_format_colormask(desc) == 0x3; + + if (ail_is_compressed(layout)) { + cfg.compressed_1 = true; + cfg.extended = true; + } + + if (ail_is_compressed(layout)) { + cfg.acceleration_buffer = base_addr + layout->metadata_offset_B + + (layer * layout->compression_layer_stride_B); + } + + if (layout->tiling == AIL_TILING_LINEAR && + (hk_image_view_type_is_array(view->vk.view_type))) { + + cfg.depth_linear = layers; + cfg.layer_stride_linear = layout->layer_stride_B - 0x80; + cfg.extended = true; + } else { + assert((layout->tiling != AIL_TILING_LINEAR) || (layers == 1)); + cfg.depth = layers; + } + + if (view->vk.image->samples > 1) { + cfg.samples = agx_translate_sample_count(view->vk.image->samples); + } + + if (layout->tiling == AIL_TILING_LINEAR) { + cfg.stride = ail_get_linear_stride_B(layout, 0) - 16; + } else { + assert(layout->tiling == AIL_TILING_TWIDDLED || + layout->tiling == AIL_TILING_TWIDDLED_COMPRESSED); + + cfg.page_aligned_layers = layout->page_aligned_layers; + } + } +} + +static void +pack_pbe(struct hk_device *dev, struct hk_image_view *view, unsigned view_plane, + enum hk_desc_usage usage, struct agx_pbe_packed *out) +{ + struct hk_image *image = container_of(view->vk.image, struct hk_image, vk); + const uint8_t image_plane = view->planes[view_plane].image_plane; + struct ail_layout *layout = &image->planes[image_plane].layout; + uint64_t base_addr = hk_image_base_address(image, image_plane); + + unsigned level = view->vk.base_mip_level; + unsigned layer = view->vk.base_array_layer; + + enum pipe_format p_format = format_for_plane(view, view_plane); + const struct util_format_description *desc = + util_format_description(p_format); + + bool eot = + usage == HK_DESC_USAGE_BG_EOT || usage == HK_DESC_USAGE_LAYERED_BG_EOT; + + /* The tilebuffer is already in sRGB space if needed. Do not convert for + * end-of-tile descriptors. + */ + if (eot) + p_format = util_format_linear(p_format); + + bool msaa = view->vk.image->samples > 1; + struct hk_3d denom = view_denominator(view); + + unsigned layers = view->vk.view_type == VK_IMAGE_VIEW_TYPE_3D + ? image->vk.extent.depth + : view->vk.layer_count; + + agx_pack(out, PBE, cfg) { + cfg.dimension = + translate_image_view_type(view->vk.view_type, msaa, layers > 1, usage); + cfg.layout = agx_translate_layout(layout->tiling); + cfg.channels = agx_pixel_format[p_format].channels; + cfg.type = agx_pixel_format[p_format].type; + cfg.srgb = util_format_is_srgb(p_format); + + assert(desc->nr_channels >= 1 && desc->nr_channels <= 4); + + for (unsigned i = 0; i < desc->nr_channels; ++i) { + if (desc->swizzle[i] == 0) + cfg.swizzle_r = i; + else if (desc->swizzle[i] == 1) + cfg.swizzle_g = i; + else if (desc->swizzle[i] == 2) + cfg.swizzle_b = i; + else if (desc->swizzle[i] == 3) + cfg.swizzle_a = i; + } + + cfg.buffer = base_addr + ail_get_layer_offset_B(layout, layer); + cfg.unk_mipmapped = layout->levels > 1; + + if (msaa & !eot) { + /* Multisampled images are bound like buffer textures, with + * addressing arithmetic to determine the texel to write. + * + * Note that the end-of-tile program uses real multisample images + * with image_write_block instructions. 
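+          *
+          * Here size_px counts the samples from the start of the bound layer
+          * to the end of the image, so the descriptor describes a linear
+          * image AGX_TEXTURE_BUFFER_WIDTH texels wide and just tall enough
+          * to cover them. E.g., ignoring padding, a single-layer 64x64
+          * 4-sample image holds 64 * 64 * 4 = 16384 samples, i.e. 16 rows
+          * if AGX_TEXTURE_BUFFER_WIDTH is 1024.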
+ */ + unsigned blocksize_B = util_format_get_blocksize(p_format); + unsigned size_px = + (layout->size_B - layout->layer_stride_B * layer) / blocksize_B; + + cfg.dimension = AGX_TEXTURE_DIMENSION_2D; + cfg.layout = AGX_LAYOUT_LINEAR; + cfg.width = AGX_TEXTURE_BUFFER_WIDTH; + cfg.height = DIV_ROUND_UP(size_px, cfg.width); + cfg.stride = (cfg.width * blocksize_B) - 4; + cfg.layers = 1; + cfg.levels = 1; + + cfg.buffer += layout->level_offsets_B[level]; + cfg.level = 0; + } else { + if (denom.x > 1) { + assert(denom.z == 1 && "todo how to handle?"); + assert(view->vk.level_count == 1); + assert(view->vk.layer_count == 1); + + cfg.buffer = + base_addr + ail_get_layer_level_B(layout, layer, level); + cfg.width = + DIV_ROUND_UP(u_minify(layout->width_px, level), denom.x); + cfg.height = + DIV_ROUND_UP(u_minify(layout->height_px, level), denom.y); + cfg.level = 0; + } else { + cfg.buffer = base_addr + ail_get_layer_offset_B(layout, layer); + cfg.width = layout->width_px; + cfg.height = layout->height_px; + cfg.level = level; + } + + if (layout->tiling == AIL_TILING_LINEAR && + (hk_image_view_type_is_array(view->vk.view_type))) { + + cfg.depth_linear = layers; + cfg.layer_stride_linear = (layout->layer_stride_B - 0x80); + cfg.extended = true; + } else { + assert((layout->tiling != AIL_TILING_LINEAR) || (layers == 1)); + cfg.layers = layers; + } + + cfg.levels = image->vk.mip_levels; + + if (layout->tiling == AIL_TILING_LINEAR) { + cfg.stride = ail_get_linear_stride_B(layout, level) - 4; + assert(cfg.levels == 1); + } else { + cfg.page_aligned_layers = layout->page_aligned_layers; + } + + if (image->vk.samples > 1) + cfg.samples = agx_translate_sample_count(image->vk.samples); + } + + if (ail_is_compressed(layout)) { + cfg.compressed_1 = true; + cfg.extended = true; + + cfg.acceleration_buffer = base_addr + layout->metadata_offset_B + + (layer * layout->compression_layer_stride_B); + } + + /* When the descriptor isn't extended architecturally, we use + * the last 8 bytes as a sideband to accelerate image atomics. 
+ */ + if (!cfg.extended && layout->writeable_image) { + if (msaa) { + assert(denom.x == 1 && "no MSAA of block-compressed"); + + cfg.aligned_width_msaa_sw = + align(u_minify(layout->width_px, level), + layout->tilesize_el[level].width_el); + } else { + cfg.level_offset_sw = ail_get_level_offset_B(layout, cfg.level); + } + + cfg.sample_count_log2_sw = util_logbase2(image->vk.samples); + + if (layout->tiling == AIL_TILING_TWIDDLED) { + struct ail_tile tile_size = layout->tilesize_el[level]; + cfg.tile_width_sw = tile_size.width_el; + cfg.tile_height_sw = tile_size.height_el; + + cfg.layer_stride_sw = layout->layer_stride_B; + } + } + }; +} + +static VkResult +add_descriptor(struct hk_device *dev, struct hk_image_view *view, + struct agx_texture_packed *desc, + struct agx_texture_packed *cached, uint32_t *index) +{ + /* First, look for a descriptor we already uploaded */ + for (unsigned i = 0; i < view->descriptor_count; ++i) { + if (memcmp(&cached[i], desc, sizeof *desc) == 0) { + *index = view->descriptor_index[i]; + return VK_SUCCESS; + } + } + + /* Else, add a new descriptor */ + VkResult result = + hk_descriptor_table_add(dev, &dev->images, desc, sizeof *desc, index); + if (result != VK_SUCCESS) + return result; + + uint32_t local_index = view->descriptor_count++; + assert(local_index < HK_MAX_IMAGE_DESCS); + + cached[local_index] = *desc; + view->descriptor_index[local_index] = *index; + return VK_SUCCESS; +} + +static VkResult +hk_image_view_init(struct hk_device *dev, struct hk_image_view *view, + bool driver_internal, + const VkImageViewCreateInfo *pCreateInfo) +{ + VK_FROM_HANDLE(hk_image, image, pCreateInfo->image); + VkResult result; + + memset(view, 0, sizeof(*view)); + + vk_image_view_init(&dev->vk, &view->vk, driver_internal, pCreateInfo); + + /* First, figure out which image planes we need. For depth/stencil, we only + * have one aspect viewed at a time. 
+ */ + if (image->vk.aspects & + (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) { + + view->plane_count = 1; + view->planes[0].image_plane = + hk_image_aspects_to_plane(image, view->vk.aspects); + } else { + /* For other formats, retrieve the plane count from the aspect mask + * and then walk through the aspect mask to map each image plane + * to its corresponding view plane + */ + assert(util_bitcount(view->vk.aspects) == + vk_format_get_plane_count(view->vk.format)); + view->plane_count = 0; + u_foreach_bit(aspect_bit, view->vk.aspects) { + uint8_t image_plane = + hk_image_aspects_to_plane(image, 1u << aspect_bit); + view->planes[view->plane_count++].image_plane = image_plane; + } + } + + struct agx_texture_packed cached[HK_MAX_IMAGE_DESCS]; + + /* Finally, fill in each view plane separately */ + for (unsigned view_plane = 0; view_plane < view->plane_count; view_plane++) { + const struct { + VkImageUsageFlagBits flag; + enum hk_desc_usage usage; + uint32_t *tex; + uint32_t *pbe; + } descriptors[] = { + {VK_IMAGE_USAGE_SAMPLED_BIT, HK_DESC_USAGE_SAMPLED, + &view->planes[view_plane].sampled_desc_index}, + + {VK_IMAGE_USAGE_STORAGE_BIT, HK_DESC_USAGE_STORAGE, + &view->planes[view_plane].ro_storage_desc_index, + &view->planes[view_plane].storage_desc_index}, + + {VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT, HK_DESC_USAGE_INPUT, + &view->planes[view_plane].ia_desc_index}, + + {VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT, HK_DESC_USAGE_BG_EOT, + &view->planes[view_plane].background_desc_index, + &view->planes[view_plane].eot_pbe_desc_index}, + + {VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT, HK_DESC_USAGE_LAYERED_BG_EOT, + &view->planes[view_plane].layered_background_desc_index, + &view->planes[view_plane].layered_eot_pbe_desc_index}, + }; + + for (unsigned i = 0; i < ARRAY_SIZE(descriptors); ++i) { + if (!(view->vk.usage & descriptors[i].flag)) + continue; + + for (unsigned is_pbe = 0; is_pbe < 2; ++is_pbe) { + struct agx_texture_packed desc; + uint32_t *out = is_pbe ? 
descriptors[i].pbe : descriptors[i].tex; + + if (!out) + continue; + + if (is_pbe) { + static_assert(sizeof(struct agx_pbe_packed) == + sizeof(struct agx_texture_packed)); + + pack_pbe(dev, view, view_plane, descriptors[i].usage, + (struct agx_pbe_packed *)&desc); + } else { + pack_texture(view, view_plane, descriptors[i].usage, &desc); + } + + result = add_descriptor(dev, view, &desc, cached, out); + if (result != VK_SUCCESS) + return result; + } + } + + if (view->vk.usage & VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT) { + pack_texture(view, view_plane, HK_DESC_USAGE_EMRT, + &view->planes[view_plane].emrt_texture); + + pack_pbe(dev, view, view_plane, HK_DESC_USAGE_EMRT, + &view->planes[view_plane].emrt_pbe); + } + } + + return VK_SUCCESS; +} + +VKAPI_ATTR void VKAPI_CALL +hk_DestroyImageView(VkDevice _device, VkImageView imageView, + const VkAllocationCallbacks *pAllocator) +{ + VK_FROM_HANDLE(hk_device, dev, _device); + VK_FROM_HANDLE(hk_image_view, view, imageView); + + if (!view) + return; + + for (uint8_t d = 0; d < view->descriptor_count; ++d) { + hk_descriptor_table_remove(dev, &dev->images, view->descriptor_index[d]); + } + + vk_image_view_finish(&view->vk); + vk_free2(&dev->vk.alloc, pAllocator, view); +} + +VKAPI_ATTR VkResult VKAPI_CALL +hk_CreateImageView(VkDevice _device, const VkImageViewCreateInfo *pCreateInfo, + const VkAllocationCallbacks *pAllocator, VkImageView *pView) +{ + VK_FROM_HANDLE(hk_device, dev, _device); + struct hk_image_view *view; + VkResult result; + + view = vk_alloc2(&dev->vk.alloc, pAllocator, sizeof(*view), 8, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + if (!view) + return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY); + + result = hk_image_view_init( + dev, view, pCreateInfo->flags & VK_IMAGE_VIEW_CREATE_INTERNAL_MESA, + pCreateInfo); + if (result != VK_SUCCESS) { + hk_DestroyImageView(_device, hk_image_view_to_handle(view), pAllocator); + return result; + } + + *pView = hk_image_view_to_handle(view); + + return VK_SUCCESS; +} diff --git a/src/asahi/vulkan/hk_image_view.h b/src/asahi/vulkan/hk_image_view.h new file mode 100644 index 00000000000..4a5c7c79fb7 --- /dev/null +++ b/src/asahi/vulkan/hk_image_view.h @@ -0,0 +1,66 @@ +/* + * Copyright 2024 Valve Corporation + * Copyright 2024 Alyssa Rosenzweig + * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc. + * SPDX-License-Identifier: MIT + */ + +#pragma once + +#include "agx_pack.h" +#include "hk_private.h" +#include "vk_image.h" + +struct hk_device; + +#define HK_MAX_PLANES 3 +#define HK_MAX_IMAGE_DESCS (10 * HK_MAX_PLANES) + +struct hk_image_view { + struct vk_image_view vk; + + uint32_t descriptor_index[HK_MAX_IMAGE_DESCS]; + uint8_t descriptor_count; + + uint8_t plane_count; + struct { + uint8_t image_plane; + + /** Descriptors used for eMRT. We delay upload since we want them + * contiguous in memory, although this could be reworked if we wanted. + */ + struct agx_texture_packed emrt_texture; + struct agx_pbe_packed emrt_pbe; + + /** Index in the image descriptor table for the sampled image descriptor */ + uint32_t sampled_desc_index; + + /** Index in the image descriptor table for the storage image descriptor */ + uint32_t storage_desc_index; + + /** Index in the image descriptor table for the readonly storage image + * descriptor. + */ + uint32_t ro_storage_desc_index; + + /** Index in the image descriptor table for the texture descriptor used + * for background programs. 
+ */ + uint32_t background_desc_index; + uint32_t layered_background_desc_index; + + /** Index in the image descriptor table for the texture descriptor used + * for input attachments. + */ + uint32_t ia_desc_index; + + /** Index in the image descriptor table for the PBE descriptor used for + * end-of-tile programs. + */ + uint32_t eot_pbe_desc_index; + uint32_t layered_eot_pbe_desc_index; + } planes[3]; +}; + +VK_DEFINE_NONDISP_HANDLE_CASTS(hk_image_view, vk.base, VkImageView, + VK_OBJECT_TYPE_IMAGE_VIEW) diff --git a/src/asahi/vulkan/hk_instance.c b/src/asahi/vulkan/hk_instance.c new file mode 100644 index 00000000000..fdf113f0edf --- /dev/null +++ b/src/asahi/vulkan/hk_instance.c @@ -0,0 +1,196 @@ +/* + * Copyright 2024 Valve Corporation + * Copyright 2024 Alyssa Rosenzweig + * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc. + * SPDX-License-Identifier: MIT + */ +#include "hk_instance.h" + +#include "hk_entrypoints.h" +#include "hk_physical_device.h" + +#include "vulkan/wsi/wsi_common.h" + +#include "util/build_id.h" +#include "util/driconf.h" +#include "util/mesa-sha1.h" + +VKAPI_ATTR VkResult VKAPI_CALL +hk_EnumerateInstanceVersion(uint32_t *pApiVersion) +{ + uint32_t version_override = vk_get_version_override(); + *pApiVersion = version_override ? version_override + : VK_MAKE_VERSION(1, 3, VK_HEADER_VERSION); + + return VK_SUCCESS; +} + +static const struct vk_instance_extension_table instance_extensions = { +#ifdef HK_USE_WSI_PLATFORM + .KHR_get_surface_capabilities2 = true, + .KHR_surface = true, + .KHR_surface_protected_capabilities = true, + .EXT_surface_maintenance1 = true, + .EXT_swapchain_colorspace = true, +#endif +#ifdef VK_USE_PLATFORM_WAYLAND_KHR + .KHR_wayland_surface = true, +#endif +#ifdef VK_USE_PLATFORM_XCB_KHR + .KHR_xcb_surface = true, +#endif +#ifdef VK_USE_PLATFORM_XLIB_KHR + .KHR_xlib_surface = true, +#endif +#ifdef VK_USE_PLATFORM_XLIB_XRANDR_EXT + .EXT_acquire_xlib_display = true, +#endif +#ifdef VK_USE_PLATFORM_DISPLAY_KHR + .KHR_display = true, + .KHR_get_display_properties2 = true, + .EXT_direct_mode_display = true, + .EXT_display_surface_counter = true, + .EXT_acquire_drm_display = true, +#endif +#ifndef VK_USE_PLATFORM_WIN32_KHR + .EXT_headless_surface = true, +#endif + .KHR_device_group_creation = true, + .KHR_external_fence_capabilities = true, + .KHR_external_memory_capabilities = true, + .KHR_external_semaphore_capabilities = true, + .KHR_get_physical_device_properties2 = true, + .EXT_debug_report = true, + .EXT_debug_utils = true, +}; + +VKAPI_ATTR VkResult VKAPI_CALL +hk_EnumerateInstanceExtensionProperties(const char *pLayerName, + uint32_t *pPropertyCount, + VkExtensionProperties *pProperties) +{ + if (pLayerName) + return vk_error(NULL, VK_ERROR_LAYER_NOT_PRESENT); + + return vk_enumerate_instance_extension_properties( + &instance_extensions, pPropertyCount, pProperties); +} + +static const driOptionDescription hk_dri_options[] = { + DRI_CONF_SECTION_PERFORMANCE DRI_CONF_ADAPTIVE_SYNC(true) + DRI_CONF_VK_X11_OVERRIDE_MIN_IMAGE_COUNT(0) + DRI_CONF_VK_X11_STRICT_IMAGE_COUNT(false) + DRI_CONF_VK_X11_ENSURE_MIN_IMAGE_COUNT(false) + DRI_CONF_VK_KHR_PRESENT_WAIT(false) + DRI_CONF_VK_XWAYLAND_WAIT_READY(false) DRI_CONF_SECTION_END + + DRI_CONF_SECTION_DEBUG DRI_CONF_FORCE_VK_VENDOR() + DRI_CONF_VK_WSI_FORCE_SWAPCHAIN_TO_CURRENT_EXTENT(false) + DRI_CONF_VK_X11_IGNORE_SUBOPTIMAL(false) + DRI_CONF_SECTION_END}; + +static void +hk_init_dri_options(struct hk_instance *instance) +{ + driParseOptionInfo(&instance->available_dri_options, hk_dri_options, + 
ARRAY_SIZE(hk_dri_options)); + driParseConfigFiles( + &instance->dri_options, &instance->available_dri_options, 0, "hk", NULL, + NULL, instance->vk.app_info.app_name, instance->vk.app_info.app_version, + instance->vk.app_info.engine_name, instance->vk.app_info.engine_version); + + instance->force_vk_vendor = + driQueryOptioni(&instance->dri_options, "force_vk_vendor"); +} + +VKAPI_ATTR VkResult VKAPI_CALL +hk_CreateInstance(const VkInstanceCreateInfo *pCreateInfo, + const VkAllocationCallbacks *pAllocator, + VkInstance *pInstance) +{ + struct hk_instance *instance; + VkResult result; + + if (pAllocator == NULL) + pAllocator = vk_default_allocator(); + + instance = vk_alloc(pAllocator, sizeof(*instance), 8, + VK_SYSTEM_ALLOCATION_SCOPE_INSTANCE); + if (!instance) + return vk_error(NULL, VK_ERROR_OUT_OF_HOST_MEMORY); + + struct vk_instance_dispatch_table dispatch_table; + vk_instance_dispatch_table_from_entrypoints(&dispatch_table, + &hk_instance_entrypoints, true); + vk_instance_dispatch_table_from_entrypoints( + &dispatch_table, &wsi_instance_entrypoints, false); + + result = vk_instance_init(&instance->vk, &instance_extensions, + &dispatch_table, pCreateInfo, pAllocator); + if (result != VK_SUCCESS) + goto fail_alloc; + + hk_init_dri_options(instance); + + instance->vk.physical_devices.try_create_for_drm = + hk_create_drm_physical_device; + instance->vk.physical_devices.destroy = hk_physical_device_destroy; + + const struct build_id_note *note = + build_id_find_nhdr_for_addr(hk_CreateInstance); + if (!note) { + result = vk_errorf(NULL, VK_ERROR_INITIALIZATION_FAILED, + "Failed to find build-id"); + goto fail_init; + } + + unsigned build_id_len = build_id_length(note); + if (build_id_len < SHA1_DIGEST_LENGTH) { + result = vk_errorf(NULL, VK_ERROR_INITIALIZATION_FAILED, + "build-id too short. It needs to be a SHA"); + goto fail_init; + } + + static_assert(sizeof(instance->driver_build_sha) == SHA1_DIGEST_LENGTH); + memcpy(instance->driver_build_sha, build_id_data(note), SHA1_DIGEST_LENGTH); + + *pInstance = hk_instance_to_handle(instance); + return VK_SUCCESS; + +fail_init: + vk_instance_finish(&instance->vk); +fail_alloc: + vk_free(pAllocator, instance); + + return result; +} + +VKAPI_ATTR void VKAPI_CALL +hk_DestroyInstance(VkInstance _instance, + const VkAllocationCallbacks *pAllocator) +{ + VK_FROM_HANDLE(hk_instance, instance, _instance); + + if (!instance) + return; + + driDestroyOptionCache(&instance->dri_options); + driDestroyOptionInfo(&instance->available_dri_options); + + vk_instance_finish(&instance->vk); + vk_free(&instance->vk.alloc, instance); +} + +VKAPI_ATTR PFN_vkVoidFunction VKAPI_CALL +hk_GetInstanceProcAddr(VkInstance _instance, const char *pName) +{ + VK_FROM_HANDLE(hk_instance, instance, _instance); + return vk_instance_get_proc_addr(&instance->vk, &hk_instance_entrypoints, + pName); +} + +PUBLIC VKAPI_ATTR PFN_vkVoidFunction VKAPI_CALL +vk_icdGetInstanceProcAddr(VkInstance instance, const char *pName) +{ + return hk_GetInstanceProcAddr(instance, pName); +} diff --git a/src/asahi/vulkan/hk_instance.h b/src/asahi/vulkan/hk_instance.h new file mode 100644 index 00000000000..d0c0397b02a --- /dev/null +++ b/src/asahi/vulkan/hk_instance.h @@ -0,0 +1,25 @@ +/* + * Copyright 2024 Valve Corporation + * Copyright 2024 Alyssa Rosenzweig + * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc. 
+ * SPDX-License-Identifier: MIT + */ + +#pragma once + +#include "util/xmlconfig.h" +#include "hk_private.h" +#include "vk_instance.h" + +struct hk_instance { + struct vk_instance vk; + + struct driOptionCache dri_options; + struct driOptionCache available_dri_options; + + uint8_t driver_build_sha[20]; + uint32_t force_vk_vendor; +}; + +VK_DEFINE_HANDLE_CASTS(hk_instance, vk.base, VkInstance, + VK_OBJECT_TYPE_INSTANCE) diff --git a/src/asahi/vulkan/hk_nir_lower_descriptors.c b/src/asahi/vulkan/hk_nir_lower_descriptors.c new file mode 100644 index 00000000000..802e184ae5e --- /dev/null +++ b/src/asahi/vulkan/hk_nir_lower_descriptors.c @@ -0,0 +1,867 @@ +/* + * Copyright 2024 Valve Corporation + * Copyright 2024 Alyssa Rosenzweig + * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc. + * SPDX-License-Identifier: MIT + */ +#include "pipe/p_defines.h" +#include "vulkan/vulkan_core.h" +#include "agx_nir_passes.h" +#include "agx_pack.h" +#include "hk_cmd_buffer.h" +#include "hk_descriptor_set.h" +#include "hk_descriptor_set_layout.h" +#include "hk_shader.h" + +#include "nir.h" +#include "nir_builder.h" +#include "nir_builder_opcodes.h" +#include "nir_deref.h" +#include "nir_intrinsics.h" +#include "nir_intrinsics_indices.h" +#include "shader_enums.h" +#include "vk_pipeline.h" + +struct lower_descriptors_ctx { + const struct hk_descriptor_set_layout *set_layouts[HK_MAX_SETS]; + + bool clamp_desc_array_bounds; + nir_address_format ubo_addr_format; + nir_address_format ssbo_addr_format; +}; + +static const struct hk_descriptor_set_binding_layout * +get_binding_layout(uint32_t set, uint32_t binding, + const struct lower_descriptors_ctx *ctx) +{ + assert(set < HK_MAX_SETS); + assert(ctx->set_layouts[set] != NULL); + + const struct hk_descriptor_set_layout *set_layout = ctx->set_layouts[set]; + + assert(binding < set_layout->binding_count); + return &set_layout->binding[binding]; +} + +static nir_def * +load_speculatable(nir_builder *b, unsigned num_components, unsigned bit_size, + nir_def *addr, unsigned align) +{ + return nir_build_load_global_constant(b, num_components, bit_size, addr, + .align_mul = align, + .access = ACCESS_CAN_SPECULATE); +} + +static nir_def * +load_root(nir_builder *b, unsigned num_components, unsigned bit_size, + nir_def *offset, unsigned align) +{ + nir_def *root = nir_load_preamble(b, 1, 64, .base = HK_ROOT_UNIFORM); + + /* We've bound the address of the root descriptor, index in. 
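Every root-table access here reduces to a byte-offset load from the root descriptor's GPU address, which the command buffer exposes through the HK_ROOT_UNIFORM preamble slot. A minimal CPU-side sketch of the same arithmetic, with a hypothetical helper name:

static inline const void *
hk_root_member_ptr(const void *root_base, uint32_t offset_B)
{
   /* load_root() is the GPU-side equivalent: the root descriptor's base
    * address plus the byte offset from hk_root_descriptor_offset().
    */
   return (const uint8_t *)root_base + offset_B;
}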
*/ + nir_def *addr = nir_iadd(b, root, nir_u2u64(b, offset)); + + return load_speculatable(b, num_components, bit_size, addr, align); +} + +static bool +lower_load_constant(nir_builder *b, nir_intrinsic_instr *load, + const struct lower_descriptors_ctx *ctx) +{ + assert(load->intrinsic == nir_intrinsic_load_constant); + unreachable("todo: stick an address in the root descriptor or something"); + + uint32_t base = nir_intrinsic_base(load); + uint32_t range = nir_intrinsic_range(load); + + b->cursor = nir_before_instr(&load->instr); + + nir_def *offset = nir_iadd_imm(b, load->src[0].ssa, base); + nir_def *data = nir_load_ubo( + b, load->def.num_components, load->def.bit_size, nir_imm_int(b, 0), + offset, .align_mul = nir_intrinsic_align_mul(load), + .align_offset = nir_intrinsic_align_offset(load), .range_base = base, + .range = range); + + nir_def_rewrite_uses(&load->def, data); + + return true; +} + +static nir_def * +load_descriptor_set_addr(nir_builder *b, uint32_t set, + UNUSED const struct lower_descriptors_ctx *ctx) +{ + uint32_t set_addr_offset = + hk_root_descriptor_offset(sets) + set * sizeof(uint64_t); + + return load_root(b, 1, 64, nir_imm_int(b, set_addr_offset), 8); +} + +static nir_def * +load_dynamic_buffer_start(nir_builder *b, uint32_t set, + const struct lower_descriptors_ctx *ctx) +{ + int dynamic_buffer_start_imm = 0; + for (uint32_t s = 0; s < set; s++) { + if (ctx->set_layouts[s] == NULL) { + dynamic_buffer_start_imm = -1; + break; + } + + dynamic_buffer_start_imm += ctx->set_layouts[s]->dynamic_buffer_count; + } + + if (dynamic_buffer_start_imm >= 0) { + return nir_imm_int(b, dynamic_buffer_start_imm); + } else { + uint32_t root_offset = + hk_root_descriptor_offset(set_dynamic_buffer_start) + set; + + return nir_u2u32(b, load_root(b, 1, 8, nir_imm_int(b, root_offset), 1)); + } +} + +static nir_def * +load_descriptor(nir_builder *b, unsigned num_components, unsigned bit_size, + uint32_t set, uint32_t binding, nir_def *index, + unsigned offset_B, const struct lower_descriptors_ctx *ctx) +{ + const struct hk_descriptor_set_binding_layout *binding_layout = + get_binding_layout(set, binding, ctx); + + if (ctx->clamp_desc_array_bounds) + index = + nir_umin(b, index, nir_imm_int(b, binding_layout->array_size - 1)); + + switch (binding_layout->type) { + case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC: + case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: { + /* Get the index in the root descriptor table dynamic_buffers array. 
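The index computed just below is a running sum over the lower-numbered sets plus the binding's own position within its set. A minimal CPU-side sketch of the same arithmetic, assuming every set layout is known up front (the helper name is hypothetical):

static uint32_t
hk_dynamic_buffer_slot(const struct hk_descriptor_set_layout *const *layouts,
                       uint32_t set, uint32_t dynamic_buffer_index,
                       uint32_t array_index)
{
   /* Dynamic buffers of lower-numbered sets come first... */
   uint32_t start = 0;
   for (uint32_t s = 0; s < set; s++)
      start += layouts[s]->dynamic_buffer_count;

   /* ...followed by this binding's dynamic buffers, indexed by array element */
   return start + dynamic_buffer_index + array_index;
}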
*/ + nir_def *dynamic_buffer_start = load_dynamic_buffer_start(b, set, ctx); + + index = nir_iadd(b, index, + nir_iadd_imm(b, dynamic_buffer_start, + binding_layout->dynamic_buffer_index)); + + nir_def *root_desc_offset = nir_iadd_imm( + b, nir_imul_imm(b, index, sizeof(struct hk_buffer_address)), + hk_root_descriptor_offset(dynamic_buffers)); + + assert(num_components == 4 && bit_size == 32); + nir_def *desc = load_root(b, 4, 32, root_desc_offset, 16); + + /* We know a priori that the .w component (offset) is zero */ + return nir_vector_insert_imm(b, desc, nir_imm_int(b, 0), 3); + } + + case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK: { + nir_def *base_addr = nir_iadd_imm( + b, load_descriptor_set_addr(b, set, ctx), binding_layout->offset); + + assert(binding_layout->stride == 1); + const uint32_t binding_size = binding_layout->array_size; + + /* Convert it to nir_address_format_64bit_bounded_global */ + assert(num_components == 4 && bit_size == 32); + return nir_vec4(b, nir_unpack_64_2x32_split_x(b, base_addr), + nir_unpack_64_2x32_split_y(b, base_addr), + nir_imm_int(b, binding_size), nir_imm_int(b, 0)); + } + + default: { + assert(binding_layout->stride > 0); + nir_def *desc_ubo_offset = + nir_iadd_imm(b, nir_imul_imm(b, index, binding_layout->stride), + binding_layout->offset + offset_B); + + unsigned desc_align_mul = (1 << (ffs(binding_layout->stride) - 1)); + desc_align_mul = MIN2(desc_align_mul, 16); + unsigned desc_align_offset = binding_layout->offset + offset_B; + desc_align_offset %= desc_align_mul; + + nir_def *desc; + nir_def *set_addr = load_descriptor_set_addr(b, set, ctx); + desc = nir_load_global_constant_offset( + b, num_components, bit_size, set_addr, desc_ubo_offset, + .align_mul = desc_align_mul, .align_offset = desc_align_offset, + .access = ACCESS_CAN_SPECULATE); + + if (binding_layout->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER || + binding_layout->type == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER) { + /* We know a priori that the .w component (offset) is zero */ + assert(num_components == 4 && bit_size == 32); + desc = nir_vector_insert_imm(b, desc, nir_imm_int(b, 0), 3); + } + return desc; + } + } +} + +static bool +is_idx_intrin(nir_intrinsic_instr *intrin) +{ + while (intrin->intrinsic == nir_intrinsic_vulkan_resource_reindex) { + intrin = nir_src_as_intrinsic(intrin->src[0]); + if (intrin == NULL) + return false; + } + + return intrin->intrinsic == nir_intrinsic_vulkan_resource_index; +} + +static nir_def * +load_descriptor_for_idx_intrin(nir_builder *b, nir_intrinsic_instr *intrin, + const struct lower_descriptors_ctx *ctx) +{ + nir_def *index = nir_imm_int(b, 0); + + while (intrin->intrinsic == nir_intrinsic_vulkan_resource_reindex) { + index = nir_iadd(b, index, intrin->src[1].ssa); + intrin = nir_src_as_intrinsic(intrin->src[0]); + } + + assert(intrin->intrinsic == nir_intrinsic_vulkan_resource_index); + uint32_t set = nir_intrinsic_desc_set(intrin); + uint32_t binding = nir_intrinsic_binding(intrin); + index = nir_iadd(b, index, intrin->src[0].ssa); + + return load_descriptor(b, 4, 32, set, binding, index, 0, ctx); +} + +static bool +try_lower_load_vulkan_descriptor(nir_builder *b, nir_intrinsic_instr *intrin, + const struct lower_descriptors_ctx *ctx) +{ + ASSERTED const VkDescriptorType desc_type = nir_intrinsic_desc_type(intrin); + b->cursor = nir_before_instr(&intrin->instr); + + nir_intrinsic_instr *idx_intrin = nir_src_as_intrinsic(intrin->src[0]); + if (idx_intrin == NULL || !is_idx_intrin(idx_intrin)) { + assert(desc_type ==
VK_DESCRIPTOR_TYPE_STORAGE_BUFFER || + desc_type == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC); + return false; + } + + nir_def *desc = load_descriptor_for_idx_intrin(b, idx_intrin, ctx); + + nir_def_rewrite_uses(&intrin->def, desc); + + return true; +} + +static bool +_lower_sysval_to_root_table(nir_builder *b, nir_intrinsic_instr *intrin, + uint32_t root_table_offset) +{ + b->cursor = nir_instr_remove(&intrin->instr); + assert((root_table_offset & 3) == 0 && "aligned"); + + nir_def *val = load_root(b, intrin->def.num_components, intrin->def.bit_size, + nir_imm_int(b, root_table_offset), 4); + + nir_def_rewrite_uses(&intrin->def, val); + + return true; +} + +#define lower_sysval_to_root_table(b, intrin, member) \ + _lower_sysval_to_root_table(b, intrin, hk_root_descriptor_offset(member)) + +static bool +lower_load_push_constant(nir_builder *b, nir_intrinsic_instr *load, + const struct lower_descriptors_ctx *ctx) +{ + const uint32_t push_region_offset = hk_root_descriptor_offset(push); + const uint32_t base = nir_intrinsic_base(load); + + b->cursor = nir_before_instr(&load->instr); + + nir_def *offset = + nir_iadd_imm(b, load->src[0].ssa, push_region_offset + base); + + nir_def *val = load_root(b, load->def.num_components, load->def.bit_size, + offset, load->def.bit_size / 8); + + nir_def_rewrite_uses(&load->def, val); + + return true; +} + +static void +get_resource_deref_binding(nir_builder *b, nir_deref_instr *deref, + uint32_t *set, uint32_t *binding, nir_def **index) +{ + if (deref->deref_type == nir_deref_type_array) { + *index = deref->arr.index.ssa; + deref = nir_deref_instr_parent(deref); + } else { + *index = nir_imm_int(b, 0); + } + + assert(deref->deref_type == nir_deref_type_var); + nir_variable *var = deref->var; + + *set = var->data.descriptor_set; + *binding = var->data.binding; +} + +static nir_def * +load_resource_deref_desc(nir_builder *b, unsigned num_components, + unsigned bit_size, nir_deref_instr *deref, + unsigned offset_B, + const struct lower_descriptors_ctx *ctx) +{ + uint32_t set, binding; + nir_def *index; + get_resource_deref_binding(b, deref, &set, &binding, &index); + return load_descriptor(b, num_components, bit_size, set, binding, index, + offset_B, ctx); +} + +/* + * Returns an AGX bindless handle to access an indexed image within the global + * image heap. + */ +static nir_def * +image_heap_handle(nir_builder *b, nir_def *offset) +{ + return nir_vec2(b, nir_imm_int(b, HK_IMAGE_HEAP_UNIFORM), offset); +} + +static bool +lower_image_intrin(nir_builder *b, nir_intrinsic_instr *intr, + const struct lower_descriptors_ctx *ctx) +{ + b->cursor = nir_before_instr(&intr->instr); + nir_deref_instr *deref = nir_src_as_deref(intr->src[0]); + + /* Reads and queries use the texture descriptor; writes and atomics PBE. 
*/ + unsigned offs; + if (intr->intrinsic != nir_intrinsic_image_deref_load && + intr->intrinsic != nir_intrinsic_image_deref_size && + intr->intrinsic != nir_intrinsic_image_deref_samples) { + + offs = offsetof(struct hk_storage_image_descriptor, pbe_offset); + } else { + offs = offsetof(struct hk_storage_image_descriptor, tex_offset); + } + + nir_def *offset = load_resource_deref_desc(b, 1, 32, deref, offs, ctx); + nir_rewrite_image_intrinsic(intr, image_heap_handle(b, offset), true); + + return true; +} + +static VkQueryPipelineStatisticFlagBits +translate_pipeline_stat_bit(enum pipe_statistics_query_index pipe) +{ + switch (pipe) { + case PIPE_STAT_QUERY_IA_VERTICES: + return VK_QUERY_PIPELINE_STATISTIC_INPUT_ASSEMBLY_VERTICES_BIT; + case PIPE_STAT_QUERY_IA_PRIMITIVES: + return VK_QUERY_PIPELINE_STATISTIC_INPUT_ASSEMBLY_PRIMITIVES_BIT; + case PIPE_STAT_QUERY_VS_INVOCATIONS: + return VK_QUERY_PIPELINE_STATISTIC_VERTEX_SHADER_INVOCATIONS_BIT; + case PIPE_STAT_QUERY_GS_INVOCATIONS: + return VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_INVOCATIONS_BIT; + case PIPE_STAT_QUERY_GS_PRIMITIVES: + return VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT; + case PIPE_STAT_QUERY_C_INVOCATIONS: + return VK_QUERY_PIPELINE_STATISTIC_CLIPPING_INVOCATIONS_BIT; + case PIPE_STAT_QUERY_C_PRIMITIVES: + return VK_QUERY_PIPELINE_STATISTIC_CLIPPING_PRIMITIVES_BIT; + case PIPE_STAT_QUERY_PS_INVOCATIONS: + return VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT; + case PIPE_STAT_QUERY_HS_INVOCATIONS: + return VK_QUERY_PIPELINE_STATISTIC_TESSELLATION_CONTROL_SHADER_PATCHES_BIT; + case PIPE_STAT_QUERY_DS_INVOCATIONS: + return VK_QUERY_PIPELINE_STATISTIC_TESSELLATION_EVALUATION_SHADER_INVOCATIONS_BIT; + case PIPE_STAT_QUERY_CS_INVOCATIONS: + return VK_QUERY_PIPELINE_STATISTIC_COMPUTE_SHADER_INVOCATIONS_BIT; + case PIPE_STAT_QUERY_TS_INVOCATIONS: + return VK_QUERY_PIPELINE_STATISTIC_TASK_SHADER_INVOCATIONS_BIT_EXT; + case PIPE_STAT_QUERY_MS_INVOCATIONS: + return VK_QUERY_PIPELINE_STATISTIC_MESH_SHADER_INVOCATIONS_BIT_EXT; + } + + unreachable("invalid statistic"); +} + +static bool +lower_uvs_index(nir_builder *b, nir_intrinsic_instr *intrin, void *data) +{ + unsigned *vs_uniform_base = data; + + switch (intrin->intrinsic) { + case nir_intrinsic_load_uvs_index_agx: { + gl_varying_slot slot = nir_intrinsic_io_semantics(intrin).location; + unsigned offset = hk_root_descriptor_offset(draw.uvs_index[slot]); + b->cursor = nir_instr_remove(&intrin->instr); + + nir_def *val = load_root(b, 1, 8, nir_imm_int(b, offset), 1); + nir_def_rewrite_uses(&intrin->def, nir_u2u16(b, val)); + return true; + } + + case nir_intrinsic_load_shader_part_tests_zs_agx: + return lower_sysval_to_root_table(b, intrin, draw.no_epilog_discard); + + case nir_intrinsic_load_api_sample_mask_agx: + return lower_sysval_to_root_table(b, intrin, draw.api_sample_mask); + + case nir_intrinsic_load_sample_positions_agx: + return lower_sysval_to_root_table(b, intrin, draw.ppp_multisamplectl); + + case nir_intrinsic_load_depth_never_agx: + return lower_sysval_to_root_table(b, intrin, draw.force_never_in_shader); + + case nir_intrinsic_load_geometry_param_buffer_agx: + return lower_sysval_to_root_table(b, intrin, draw.geometry_params); + + case nir_intrinsic_load_vs_output_buffer_agx: + return lower_sysval_to_root_table(b, intrin, draw.vertex_output_buffer); + + case nir_intrinsic_load_vs_outputs_agx: + return lower_sysval_to_root_table(b, intrin, draw.vertex_outputs); + + case nir_intrinsic_load_tess_param_buffer_agx: + return 
lower_sysval_to_root_table(b, intrin, draw.tess_params); + + case nir_intrinsic_load_is_first_fan_agx: { + unsigned offset = hk_root_descriptor_offset(draw.provoking); + b->cursor = nir_instr_remove(&intrin->instr); + nir_def *val = load_root(b, 1, 16, nir_imm_int(b, offset), 2); + nir_def_rewrite_uses(&intrin->def, nir_ieq_imm(b, val, 1)); + return true; + } + + case nir_intrinsic_load_provoking_last: { + unsigned offset = hk_root_descriptor_offset(draw.provoking); + b->cursor = nir_instr_remove(&intrin->instr); + nir_def *val = load_root(b, 1, 16, nir_imm_int(b, offset), 2); + nir_def_rewrite_uses(&intrin->def, nir_b2b32(b, nir_ieq_imm(b, val, 2))); + return true; + } + + case nir_intrinsic_load_base_vertex: + case nir_intrinsic_load_first_vertex: + case nir_intrinsic_load_base_instance: + case nir_intrinsic_load_draw_id: + case nir_intrinsic_load_input_assembly_buffer_agx: { + b->cursor = nir_instr_remove(&intrin->instr); + + unsigned base = *vs_uniform_base; + unsigned size = 32; + + if (intrin->intrinsic == nir_intrinsic_load_base_instance) { + base += 2; + } else if (intrin->intrinsic == nir_intrinsic_load_draw_id) { + base += 4; + size = 16; + } else if (intrin->intrinsic == + nir_intrinsic_load_input_assembly_buffer_agx) { + base += 8; + size = 64; + } + + nir_def *val = nir_load_preamble(b, 1, size, .base = base); + nir_def_rewrite_uses(&intrin->def, + nir_u2uN(b, val, intrin->def.bit_size)); + return true; + } + + case nir_intrinsic_load_stat_query_address_agx: { + b->cursor = nir_instr_remove(&intrin->instr); + + unsigned off1 = hk_root_descriptor_offset(draw.pipeline_stats); + unsigned off2 = hk_root_descriptor_offset(draw.pipeline_stats_flags); + + nir_def *base = load_root(b, 1, 64, nir_imm_int(b, off1), 8); + nir_def *flags = load_root(b, 1, 16, nir_imm_int(b, off2), 2); + + unsigned query = nir_intrinsic_base(intrin); + VkQueryPipelineStatisticFlagBits bit = translate_pipeline_stat_bit(query); + + /* Prefix sum to find the compacted offset */ + nir_def *idx = nir_bit_count(b, nir_iand_imm(b, flags, bit - 1)); + nir_def *addr = nir_iadd( + b, base, nir_imul_imm(b, nir_u2u64(b, idx), sizeof(uint64_t))); + + /* The above returns garbage if the query isn't actually enabled, handle + * that case. + * + * TODO: Optimize case where we *know* the query is present? 
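The prefix sum above is what compacts the enabled statistics into a dense array of 64-bit counters. A minimal CPU-side sketch of the same lookup, using util_bitcount as the popcount (the helper name is hypothetical):

static uint64_t
hk_stat_counter_addr(uint64_t base, uint16_t enabled_flags,
                     VkQueryPipelineStatisticFlagBits bit)
{
   /* Statistic not enabled: no slot was allocated, return a null address */
   if (!(enabled_flags & bit))
      return 0;

   /* Slot index = number of enabled statistics at lower bit positions */
   unsigned idx = util_bitcount(enabled_flags & (bit - 1));
   return base + idx * sizeof(uint64_t);
}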
+ */ + nir_def *present = nir_ine_imm(b, nir_iand_imm(b, flags, bit), 0); + addr = nir_bcsel(b, present, addr, nir_imm_int64(b, 0)); + + nir_def_rewrite_uses(&intrin->def, addr); + return true; + } + + default: + return false; + } +} + +bool +hk_lower_uvs_index(nir_shader *s, unsigned vs_uniform_base) +{ + return nir_shader_intrinsics_pass( + s, lower_uvs_index, nir_metadata_control_flow, &vs_uniform_base); +} + +static bool +try_lower_intrin(nir_builder *b, nir_intrinsic_instr *intrin, + const struct lower_descriptors_ctx *ctx) +{ + switch (intrin->intrinsic) { + case nir_intrinsic_load_constant: + return lower_load_constant(b, intrin, ctx); + + case nir_intrinsic_load_vulkan_descriptor: + return try_lower_load_vulkan_descriptor(b, intrin, ctx); + + case nir_intrinsic_load_workgroup_size: + unreachable("Should have been lowered by nir_lower_cs_intrinsics()"); + + case nir_intrinsic_load_base_workgroup_id: + return lower_sysval_to_root_table(b, intrin, cs.base_group); + + case nir_intrinsic_load_push_constant: + return lower_load_push_constant(b, intrin, ctx); + + case nir_intrinsic_load_view_index: + return lower_sysval_to_root_table(b, intrin, draw.view_index); + + case nir_intrinsic_image_deref_load: + case nir_intrinsic_image_deref_sparse_load: + case nir_intrinsic_image_deref_store: + case nir_intrinsic_image_deref_atomic: + case nir_intrinsic_image_deref_atomic_swap: + case nir_intrinsic_image_deref_size: + case nir_intrinsic_image_deref_samples: + return lower_image_intrin(b, intrin, ctx); + + case nir_intrinsic_load_num_workgroups: { + b->cursor = nir_instr_remove(&intrin->instr); + + unsigned offset = hk_root_descriptor_offset(cs.group_count_addr); + nir_def *ptr = load_root(b, 1, 64, nir_imm_int(b, offset), 4); + nir_def *val = load_speculatable(b, 3, 32, ptr, 4); + + nir_def_rewrite_uses(&intrin->def, val); + return true; + } + + default: + return false; + } +} + +static bool +lower_tex(nir_builder *b, nir_tex_instr *tex, + const struct lower_descriptors_ctx *ctx) +{ + b->cursor = nir_before_instr(&tex->instr); + + nir_def *texture = nir_steal_tex_src(tex, nir_tex_src_texture_deref); + nir_def *sampler = nir_steal_tex_src(tex, nir_tex_src_sampler_deref); + if (!texture) { + assert(!sampler); + return false; + } + + nir_def *plane_ssa = nir_steal_tex_src(tex, nir_tex_src_plane); + const uint32_t plane = + plane_ssa ? nir_src_as_uint(nir_src_for_ssa(plane_ssa)) : 0; + const uint64_t plane_offset_B = + plane * sizeof(struct hk_sampled_image_descriptor); + + /* LOD bias is passed in the descriptor set, rather than embedded into + * the sampler descriptor. There's no spot in the hardware descriptor, + * plus this saves on precious sampler heap spots. 
+ */ + if (tex->op == nir_texop_lod_bias_agx) { + unsigned offs = + offsetof(struct hk_sampled_image_descriptor, lod_bias_fp16); + + nir_def *bias = load_resource_deref_desc( + b, 1, 16, nir_src_as_deref(nir_src_for_ssa(sampler)), + plane_offset_B + offs, ctx); + + nir_def_replace(&tex->def, bias); + return true; + } + + if (tex->op == nir_texop_has_custom_border_color_agx) { + unsigned offs = offsetof(struct hk_sampled_image_descriptor, has_border); + + nir_def *res = load_resource_deref_desc( + b, 1, 16, nir_src_as_deref(nir_src_for_ssa(sampler)), + plane_offset_B + offs, ctx); + + nir_def_replace(&tex->def, nir_ine_imm(b, res, 0)); + return true; + } + + if (tex->op == nir_texop_custom_border_color_agx) { + unsigned offs = offsetof(struct hk_sampled_image_descriptor, border); + + nir_def *border = load_resource_deref_desc( + b, 4, 32, nir_src_as_deref(nir_src_for_ssa(sampler)), + plane_offset_B + offs, ctx); + + nir_alu_type T = nir_alu_type_get_base_type(tex->dest_type); + border = nir_convert_to_bit_size(b, border, T, tex->def.bit_size); + + nir_def_replace(&tex->def, border); + return true; + } + + { + unsigned offs = + offsetof(struct hk_sampled_image_descriptor, image_offset); + + nir_def *offset = load_resource_deref_desc( + b, 1, 32, nir_src_as_deref(nir_src_for_ssa(texture)), + plane_offset_B + offs, ctx); + + nir_def *handle = image_heap_handle(b, offset); + nir_tex_instr_add_src(tex, nir_tex_src_texture_handle, handle); + } + + if (sampler != NULL) { + unsigned offs = + offsetof(struct hk_sampled_image_descriptor, sampler_index); + + if (tex->backend_flags & AGX_TEXTURE_FLAG_CLAMP_TO_0) { + offs = + offsetof(struct hk_sampled_image_descriptor, clamp_0_sampler_index); + } + + nir_def *index = load_resource_deref_desc( + b, 1, 16, nir_src_as_deref(nir_src_for_ssa(sampler)), + plane_offset_B + offs, ctx); + + nir_tex_instr_add_src(tex, nir_tex_src_sampler_handle, index); + } + + return true; +} + +static bool +try_lower_descriptors_instr(nir_builder *b, nir_instr *instr, void *_data) +{ + const struct lower_descriptors_ctx *ctx = _data; + + switch (instr->type) { + case nir_instr_type_tex: + return lower_tex(b, nir_instr_as_tex(instr), ctx); + case nir_instr_type_intrinsic: + return try_lower_intrin(b, nir_instr_as_intrinsic(instr), ctx); + default: + return false; + } +} + +static bool +lower_ssbo_resource_index(nir_builder *b, nir_intrinsic_instr *intrin, + const struct lower_descriptors_ctx *ctx) +{ + const VkDescriptorType desc_type = nir_intrinsic_desc_type(intrin); + if (desc_type != VK_DESCRIPTOR_TYPE_STORAGE_BUFFER && + desc_type != VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC) + return false; + + b->cursor = nir_instr_remove(&intrin->instr); + + uint32_t set = nir_intrinsic_desc_set(intrin); + uint32_t binding = nir_intrinsic_binding(intrin); + nir_def *index = intrin->src[0].ssa; + + const struct hk_descriptor_set_binding_layout *binding_layout = + get_binding_layout(set, binding, ctx); + + nir_def *binding_addr; + uint8_t binding_stride; + switch (binding_layout->type) { + case VK_DESCRIPTOR_TYPE_MUTABLE_EXT: + case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER: { + nir_def *set_addr = load_descriptor_set_addr(b, set, ctx); + binding_addr = nir_iadd_imm(b, set_addr, binding_layout->offset); + binding_stride = binding_layout->stride; + break; + } + + case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: { + const uint32_t root_desc_addr_offset = + hk_root_descriptor_offset(root_desc_addr); + + nir_def *root_desc_addr = + load_root(b, 1, 64, nir_imm_int(b, root_desc_addr_offset), 8); + + 
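The binding "address" assembled in the code that follows carries the per-binding descriptor stride in its otherwise-unused top byte, which the reindex and load paths later peel back off. A minimal sketch of the equivalent pack/unpack, with hypothetical helper names:

static inline uint64_t
hk_pack_binding_addr(uint64_t addr, uint8_t stride)
{
   /* The address fits in 56 bits (see the masking below), so the top byte
    * can carry the binding stride ("Tuck the stride" in the code below).
    */
   assert((addr >> 56) == 0);
   return addr | ((uint64_t)stride << 56);
}

static inline uint8_t
hk_binding_addr_stride(uint64_t packed)
{
   /* Matches nir_ushr_imm(addr_high32, 24): bits [56, 63] of the address */
   return packed >> 56;
}

static inline uint64_t
hk_binding_addr_base(uint64_t packed)
{
   /* Matches the BITFIELD64_MASK(56) masking applied before the descriptor
    * is finally loaded.
    */
   return packed & BITFIELD64_MASK(56);
}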
nir_def *dynamic_buffer_start = + nir_iadd_imm(b, load_dynamic_buffer_start(b, set, ctx), + binding_layout->dynamic_buffer_index); + + nir_def *dynamic_binding_offset = + nir_iadd_imm(b, + nir_imul_imm(b, dynamic_buffer_start, + sizeof(struct hk_buffer_address)), + hk_root_descriptor_offset(dynamic_buffers)); + + binding_addr = + nir_iadd(b, root_desc_addr, nir_u2u64(b, dynamic_binding_offset)); + binding_stride = sizeof(struct hk_buffer_address); + break; + } + + default: + unreachable("Not an SSBO descriptor"); + } + + /* Tuck the stride in the top 8 bits of the binding address */ + binding_addr = nir_ior_imm(b, binding_addr, (uint64_t)binding_stride << 56); + + const uint32_t binding_size = binding_layout->array_size * binding_stride; + nir_def *offset_in_binding = nir_imul_imm(b, index, binding_stride); + + nir_def *addr = nir_vec4(b, nir_unpack_64_2x32_split_x(b, binding_addr), + nir_unpack_64_2x32_split_y(b, binding_addr), + nir_imm_int(b, binding_size), offset_in_binding); + + nir_def_rewrite_uses(&intrin->def, addr); + + return true; +} + +static bool +lower_ssbo_resource_reindex(nir_builder *b, nir_intrinsic_instr *intrin, + const struct lower_descriptors_ctx *ctx) +{ + const VkDescriptorType desc_type = nir_intrinsic_desc_type(intrin); + if (desc_type != VK_DESCRIPTOR_TYPE_STORAGE_BUFFER && + desc_type != VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC) + return false; + + b->cursor = nir_instr_remove(&intrin->instr); + + nir_def *addr = intrin->src[0].ssa; + nir_def *index = intrin->src[1].ssa; + + nir_def *addr_high32 = nir_channel(b, addr, 1); + nir_def *stride = nir_ushr_imm(b, addr_high32, 24); + nir_def *offset = nir_imul(b, index, stride); + + addr = nir_build_addr_iadd(b, addr, ctx->ssbo_addr_format, nir_var_mem_ssbo, + offset); + nir_def_rewrite_uses(&intrin->def, addr); + + return true; +} + +static bool +lower_load_ssbo_descriptor(nir_builder *b, nir_intrinsic_instr *intrin, + const struct lower_descriptors_ctx *ctx) +{ + const VkDescriptorType desc_type = nir_intrinsic_desc_type(intrin); + if (desc_type != VK_DESCRIPTOR_TYPE_STORAGE_BUFFER && + desc_type != VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC) + return false; + + b->cursor = nir_instr_remove(&intrin->instr); + + nir_def *addr = intrin->src[0].ssa; + + nir_def *desc; + switch (ctx->ssbo_addr_format) { + case nir_address_format_64bit_global_32bit_offset: { + nir_def *base = nir_pack_64_2x32(b, nir_trim_vector(b, addr, 2)); + nir_def *offset = nir_channel(b, addr, 3); + /* Mask off the binding stride */ + base = nir_iand_imm(b, base, BITFIELD64_MASK(56)); + desc = nir_load_global_constant_offset(b, 4, 32, base, offset, + .align_mul = 16, .align_offset = 0, + .access = ACCESS_CAN_SPECULATE); + break; + } + + case nir_address_format_64bit_bounded_global: { + nir_def *base = nir_pack_64_2x32(b, nir_trim_vector(b, addr, 2)); + nir_def *size = nir_channel(b, addr, 2); + nir_def *offset = nir_channel(b, addr, 3); + /* Mask off the binding stride */ + base = nir_iand_imm(b, base, BITFIELD64_MASK(56)); + desc = nir_load_global_constant_bounded( + b, 4, 32, base, offset, size, .align_mul = 16, .align_offset = 0, + .access = ACCESS_CAN_SPECULATE); + break; + } + + default: + unreachable("Unknown address mode"); + } + + nir_def_rewrite_uses(&intrin->def, desc); + + return true; +} + +static bool +lower_ssbo_descriptor(nir_builder *b, nir_intrinsic_instr *intr, void *_data) +{ + const struct lower_descriptors_ctx *ctx = _data; + + switch (intr->intrinsic) { + case nir_intrinsic_vulkan_resource_index: + return 
lower_ssbo_resource_index(b, intr, ctx); + case nir_intrinsic_vulkan_resource_reindex: + return lower_ssbo_resource_reindex(b, intr, ctx); + case nir_intrinsic_load_vulkan_descriptor: + return lower_load_ssbo_descriptor(b, intr, ctx); + default: + return false; + } +} + +bool +hk_nir_lower_descriptors(nir_shader *nir, + const struct vk_pipeline_robustness_state *rs, + uint32_t set_layout_count, + struct vk_descriptor_set_layout *const *set_layouts) +{ + struct lower_descriptors_ctx ctx = { + .clamp_desc_array_bounds = + rs->storage_buffers != + VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_DISABLED_EXT || + + rs->uniform_buffers != + VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_DISABLED_EXT || + + rs->images != VK_PIPELINE_ROBUSTNESS_IMAGE_BEHAVIOR_DISABLED_EXT, + + .ssbo_addr_format = hk_buffer_addr_format(rs->storage_buffers), + .ubo_addr_format = hk_buffer_addr_format(rs->uniform_buffers), + }; + + assert(set_layout_count <= HK_MAX_SETS); + for (uint32_t s = 0; s < set_layout_count; s++) { + if (set_layouts[s] != NULL) + ctx.set_layouts[s] = vk_to_hk_descriptor_set_layout(set_layouts[s]); + } + + /* First lower everything but complex SSBOs, then lower complex SSBOs. + * + * TODO: See if we can unify this, not sure if the fast path matters on + * Apple. This is inherited from NVK. + */ + bool pass_lower_descriptors = nir_shader_instructions_pass( + nir, try_lower_descriptors_instr, nir_metadata_control_flow, &ctx); + + bool pass_lower_ssbo = nir_shader_intrinsics_pass( + nir, lower_ssbo_descriptor, nir_metadata_control_flow, &ctx); + + return pass_lower_descriptors || pass_lower_ssbo; +} diff --git a/src/asahi/vulkan/hk_nir_passthrough_gs.c b/src/asahi/vulkan/hk_nir_passthrough_gs.c new file mode 100644 index 00000000000..536b10c6b96 --- /dev/null +++ b/src/asahi/vulkan/hk_nir_passthrough_gs.c @@ -0,0 +1,112 @@ +/* + * Copyright 2024 Valve Corporation + * Copyright 2024 Alyssa Rosenzweig + * Copyright 2022 Collabora Ltd. 
+ * SPDX-License-Identifier: MIT + */ + +#include "util/bitscan.h" +#include "hk_shader.h" +#include "nir.h" +#include "nir_builder.h" +#include "nir_xfb_info.h" +#include "shader_enums.h" + +void +hk_nir_passthrough_gs(nir_builder *b, const void *key_) +{ + nir_shader *s = b->shader; + const struct hk_passthrough_gs_key *key = key_; + assert(key->prim == u_decomposed_prim(key->prim)); + assert(key->prim != MESA_PRIM_PATCHES && "tessellation consumes patches"); + + enum mesa_prim out; + if (key->prim == MESA_PRIM_POINTS) + out = MESA_PRIM_POINTS; + else if (u_reduced_prim(key->prim) == MESA_PRIM_LINES) + out = MESA_PRIM_LINE_STRIP; + else + out = MESA_PRIM_TRIANGLE_STRIP; + +#if 0 + assert((key->outputs & + (VARYING_BIT_BOUNDING_BOX0 | VARYING_BIT_BOUNDING_BOX1)) == 0 && + "cull distance lowering not run yet"); +#endif + /* XXX: need rework of preprocess_nir */ + uint64_t outputs = + key->outputs & ~(VARYING_BIT_BOUNDING_BOX0 | VARYING_BIT_BOUNDING_BOX1); + + s->info.outputs_written = s->info.inputs_read = outputs; + s->info.clip_distance_array_size = key->clip_distance_array_size; + s->info.cull_distance_array_size = key->cull_distance_array_size; + s->info.stage = MESA_SHADER_GEOMETRY; + s->info.gs.input_primitive = key->prim; + s->info.gs.output_primitive = out; + s->info.gs.vertices_in = mesa_vertices_per_prim(key->prim); + s->info.gs.vertices_out = mesa_vertices_per_prim(out); + s->info.gs.invocations = 1; + s->info.gs.active_stream_mask = 1; + + if (key->xfb_info.output_count) { + size_t size = nir_xfb_info_size(key->xfb_info.output_count); + s->xfb_info = ralloc_memdup(s, &key->xfb_info, size); + s->info.has_transform_feedback_varyings = true; + memcpy(s->info.xfb_stride, key->xfb_stride, sizeof(key->xfb_stride)); + } + + unsigned int start_vert = key->prim == MESA_PRIM_LINES_ADJACENCY ? 1 : 0; + unsigned int step = key->prim == MESA_PRIM_TRIANGLES_ADJACENCY ? 2 : 1; + + nir_def *zero = nir_imm_int(b, 0); + nir_def *one = nir_imm_int(b, 1); + + for (unsigned i = 0; i < s->info.gs.vertices_out; ++i) { + nir_def *vertex = nir_imm_int(b, start_vert + (i * step)); + + /* Copy inputs to outputs. */ + u_foreach_bit64(loc, outputs) { + unsigned adjusted_loc = loc; + nir_def *offset = zero; + unsigned num_slots = 1; + + bool scalar = loc == VARYING_SLOT_LAYER || + loc == VARYING_SLOT_VIEW_INDEX || + loc == VARYING_SLOT_VIEWPORT || loc == VARYING_SLOT_PSIZ; + unsigned comps = scalar ? 
1 : 4; + + /* We use combined, compact clip/cull */ + if (loc == VARYING_SLOT_CLIP_DIST1 || loc == VARYING_SLOT_CULL_DIST1) { + adjusted_loc--; + offset = one; + } + + if (adjusted_loc == VARYING_SLOT_CLIP_DIST0 || + adjusted_loc == VARYING_SLOT_CULL_DIST0) { + num_slots = + key->cull_distance_array_size + key->clip_distance_array_size; + + if (loc > adjusted_loc) + comps = num_slots - 4; + else + comps = MIN2(num_slots, 4); + } + + nir_io_semantics sem = { + .location = adjusted_loc, + .num_slots = num_slots, + }; + + nir_def *val = nir_load_per_vertex_input(b, comps, 32, vertex, offset, + .io_semantics = sem); + + for (unsigned c = 0; c < comps; ++c) { + nir_store_output(b, nir_channel(b, val, c), offset, + .io_semantics = sem, .src_type = nir_type_uint32, + .component = c); + } + } + + nir_emit_vertex(b, 0); + } +} diff --git a/src/asahi/vulkan/hk_physical_device.c b/src/asahi/vulkan/hk_physical_device.c new file mode 100644 index 00000000000..304cc7c938d --- /dev/null +++ b/src/asahi/vulkan/hk_physical_device.c @@ -0,0 +1,1417 @@ +/* + * Copyright 2024 Valve Corporation + * Copyright 2024 Alyssa Rosenzweig + * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc. + * SPDX-License-Identifier: MIT + */ +#include "hk_physical_device.h" + +#include "asahi/lib/agx_device.h" +#include "asahi/lib/agx_nir_lower_vbo.h" +#include "asahi/lib/agx_nir_passes.h" +#include "util/disk_cache.h" +#include "util/mesa-sha1.h" +#include "git_sha1.h" +#include "hk_buffer.h" +#include "hk_entrypoints.h" +#include "hk_image.h" +#include "hk_instance.h" +#include "hk_private.h" +#include "hk_shader.h" +#include "hk_wsi.h" + +#include "util/u_debug.h" +#include "vulkan/vulkan_core.h" +#include "vulkan/wsi/wsi_common.h" +#include "vk_device.h" +#include "vk_drm_syncobj.h" +#include "vk_shader_module.h" + +#include +#include +#include +#include +#include + +static uint32_t +hk_get_vk_version() +{ + /* Version override takes priority */ + const uint32_t version_override = vk_get_version_override(); + if (version_override) + return version_override; + + return VK_MAKE_VERSION(1, 3, VK_HEADER_VERSION); +} + +static void +hk_get_device_extensions(const struct hk_instance *instance, + struct vk_device_extension_table *ext) +{ + *ext = (struct vk_device_extension_table){ + .KHR_8bit_storage = true, + .KHR_16bit_storage = true, + .KHR_bind_memory2 = true, + .KHR_buffer_device_address = true, + .KHR_calibrated_timestamps = false, + .KHR_copy_commands2 = true, + .KHR_create_renderpass2 = true, + .KHR_dedicated_allocation = true, + .KHR_depth_stencil_resolve = true, + .KHR_descriptor_update_template = true, + .KHR_device_group = true, + .KHR_draw_indirect_count = false, + .KHR_driver_properties = true, + .KHR_dynamic_rendering = true, + // TODO + .KHR_dynamic_rendering_local_read = false, + .KHR_external_fence = true, + .KHR_external_fence_fd = true, + .KHR_external_memory = true, + .KHR_external_memory_fd = true, + /* XXX: External timeline semaphores maybe broken in kernel, see + * dEQP-VK.synchronization.signal_order.shared_timeline_semaphore.write_copy_buffer_to_image_read_image_compute.image_128_r32_uint_opaque_fd + */ + .KHR_external_semaphore = false, + .KHR_external_semaphore_fd = false, + .KHR_format_feature_flags2 = true, + .KHR_fragment_shader_barycentric = false, + .KHR_get_memory_requirements2 = true, + .KHR_global_priority = true, + .KHR_image_format_list = true, + .KHR_imageless_framebuffer = true, +#ifdef HK_USE_WSI_PLATFORM + .KHR_incremental_present = true, +#endif + .KHR_index_type_uint8 = true, + 
.KHR_line_rasterization = true, + .KHR_load_store_op_none = true, + .KHR_maintenance1 = true, + .KHR_maintenance2 = true, + .KHR_maintenance3 = true, + .KHR_maintenance4 = true, + .KHR_maintenance5 = true, + .KHR_maintenance6 = true, + .KHR_map_memory2 = true, + .KHR_multiview = true, + .KHR_pipeline_executable_properties = true, + .KHR_pipeline_library = true, + .KHR_push_descriptor = true, + .KHR_relaxed_block_layout = true, + .KHR_sampler_mirror_clamp_to_edge = true, + .KHR_sampler_ycbcr_conversion = false, + .KHR_separate_depth_stencil_layouts = true, + .KHR_shader_atomic_int64 = false, + .KHR_shader_clock = false, + .KHR_shader_draw_parameters = true, + .KHR_shader_expect_assume = true, + .KHR_shader_float_controls = true, + // TODO: wait for nvk + .KHR_shader_float_controls2 = true, + .KHR_shader_float16_int8 = true, + .KHR_shader_integer_dot_product = true, + .KHR_shader_maximal_reconvergence = true, + .KHR_shader_non_semantic_info = true, + .KHR_shader_subgroup_extended_types = true, + .KHR_shader_subgroup_rotate = true, + .KHR_shader_subgroup_uniform_control_flow = true, + .KHR_shader_terminate_invocation = true, + .KHR_spirv_1_4 = true, + .KHR_storage_buffer_storage_class = true, + .KHR_timeline_semaphore = true, +#ifdef HK_USE_WSI_PLATFORM + .KHR_swapchain = true, + .KHR_swapchain_mutable_format = true, +#endif + .KHR_synchronization2 = true, + .KHR_uniform_buffer_standard_layout = true, + .KHR_variable_pointers = true, + .KHR_vertex_attribute_divisor = true, + .KHR_vulkan_memory_model = true, + .KHR_workgroup_memory_explicit_layout = true, + .KHR_zero_initialize_workgroup_memory = true, + .EXT_4444_formats = true, + .EXT_attachment_feedback_loop_layout = true, + .EXT_border_color_swizzle = true, + .EXT_buffer_device_address = true, + .EXT_calibrated_timestamps = false, + .EXT_conditional_rendering = false, + .EXT_color_write_enable = true, + .EXT_custom_border_color = true, + .EXT_depth_bias_control = false, + .EXT_depth_clip_control = false, + .EXT_depth_clip_enable = true, + .EXT_descriptor_indexing = true, +#ifdef VK_USE_PLATFORM_DISPLAY_KHR + .EXT_display_control = false, +#endif + .EXT_dynamic_rendering_unused_attachments = true, + .EXT_extended_dynamic_state = true, + .EXT_extended_dynamic_state2 = true, + .EXT_extended_dynamic_state3 = true, + .EXT_external_memory_dma_buf = true, + // TODO + .EXT_global_priority = false, + // TODO + .EXT_global_priority_query = false, + .EXT_graphics_pipeline_library = true, + .EXT_host_query_reset = true, + .EXT_host_image_copy = true, + .EXT_image_2d_view_of_3d = true, + .EXT_image_robustness = true, + .EXT_image_sliced_view_of_3d = false, + .EXT_image_view_min_lod = false, + .EXT_index_type_uint8 = true, + .EXT_inline_uniform_block = true, + .EXT_line_rasterization = true, + .EXT_load_store_op_none = true, + .EXT_map_memory_placed = false, + .EXT_memory_budget = false, + .EXT_multi_draw = true, + .EXT_mutable_descriptor_type = true, + .EXT_non_seamless_cube_map = true, + .EXT_pipeline_creation_cache_control = true, + .EXT_pipeline_creation_feedback = true, + .EXT_pipeline_protected_access = true, + .EXT_pipeline_robustness = true, + .EXT_physical_device_drm = true, + .EXT_primitive_topology_list_restart = true, + .EXT_private_data = true, + .EXT_primitives_generated_query = false, + .EXT_provoking_vertex = true, + .EXT_robustness2 = true, + .EXT_sample_locations = true, + .EXT_sampler_filter_minmax = false, + .EXT_scalar_block_layout = true, + .EXT_separate_stencil_usage = true, + .EXT_shader_image_atomic_int64 = false, + 
.EXT_shader_demote_to_helper_invocation = true, + .EXT_shader_module_identifier = true, + .EXT_shader_object = true, + .EXT_shader_replicated_composites = true, + .EXT_shader_stencil_export = true, + .EXT_shader_subgroup_ballot = true, + .EXT_shader_subgroup_vote = true, + .EXT_shader_viewport_index_layer = true, + .EXT_subgroup_size_control = true, +#ifdef HK_USE_WSI_PLATFORM + .EXT_swapchain_maintenance1 = true, +#endif + .EXT_texel_buffer_alignment = true, + .EXT_tooling_info = true, + .EXT_transform_feedback = true, + .EXT_vertex_attribute_divisor = true, + .EXT_vertex_input_dynamic_state = true, + .EXT_ycbcr_2plane_444_formats = false, + .EXT_ycbcr_image_arrays = false, + .GOOGLE_decorate_string = true, + .GOOGLE_hlsl_functionality1 = true, + .GOOGLE_user_type = true, + .VALVE_mutable_descriptor_type = true, + }; +} + +static void +hk_get_device_features( + const struct vk_device_extension_table *supported_extensions, + struct vk_features *features) +{ + *features = (struct vk_features){ + /* Vulkan 1.0 */ + .robustBufferAccess = true, + .fullDrawIndexUint32 = true, + .imageCubeArray = true, + .independentBlend = true, + .geometryShader = true, + .tessellationShader = true, + .sampleRateShading = true, + .dualSrcBlend = true, + .logicOp = true, + .multiDrawIndirect = true, + .drawIndirectFirstInstance = true, + .depthClamp = true, + .depthBiasClamp = true, + .fillModeNonSolid = true, + .depthBounds = false, + .wideLines = true, + .largePoints = true, + .alphaToOne = true, + .multiViewport = true, + .samplerAnisotropy = true, + .textureCompressionETC2 = false, + .textureCompressionBC = true, + .textureCompressionASTC_LDR = false, + .occlusionQueryPrecise = true, + .pipelineStatisticsQuery = true, + .vertexPipelineStoresAndAtomics = true, + .fragmentStoresAndAtomics = true, + .shaderTessellationAndGeometryPointSize = true, + .shaderImageGatherExtended = true, + .shaderStorageImageExtendedFormats = true, + /* TODO: hitting the vertex shader timeout in CTS, but should work */ + .shaderStorageImageMultisample = false, + .shaderStorageImageReadWithoutFormat = true, + .shaderStorageImageWriteWithoutFormat = true, + .shaderUniformBufferArrayDynamicIndexing = true, + .shaderSampledImageArrayDynamicIndexing = true, + .shaderStorageBufferArrayDynamicIndexing = true, + .shaderStorageImageArrayDynamicIndexing = true, + .shaderClipDistance = true, + .shaderCullDistance = true, + .shaderFloat64 = false, + .shaderInt64 = true, + .shaderInt16 = true, + .shaderResourceResidency = false, + .shaderResourceMinLod = false, + .sparseBinding = false, + .sparseResidency2Samples = false, + .sparseResidency4Samples = false, + .sparseResidency8Samples = false, + .sparseResidencyAliased = false, + .sparseResidencyBuffer = false, + .sparseResidencyImage2D = false, + .sparseResidencyImage3D = false, + .variableMultisampleRate = false, + .inheritedQueries = true, + + /* Vulkan 1.1 */ + .storageBuffer16BitAccess = true, + .uniformAndStorageBuffer16BitAccess = true, + .storagePushConstant16 = true, + .storageInputOutput16 = false, + .multiview = true, + .multiviewGeometryShader = false, + .multiviewTessellationShader = false, + .variablePointersStorageBuffer = true, + .variablePointers = true, + .shaderDrawParameters = true, + .samplerYcbcrConversion = true, + + /* Vulkan 1.2 */ + .samplerMirrorClampToEdge = true, + .drawIndirectCount = false, + .storageBuffer8BitAccess = true, + .uniformAndStorageBuffer8BitAccess = true, + .storagePushConstant8 = true, + .shaderBufferInt64Atomics = false, + .shaderSharedInt64Atomics 
= false, + .shaderFloat16 = true, + .shaderInt8 = true, + .descriptorIndexing = true, + .shaderInputAttachmentArrayDynamicIndexing = true, + .shaderUniformTexelBufferArrayDynamicIndexing = true, + .shaderStorageTexelBufferArrayDynamicIndexing = true, + .shaderUniformBufferArrayNonUniformIndexing = true, + .shaderSampledImageArrayNonUniformIndexing = true, + .shaderStorageBufferArrayNonUniformIndexing = true, + .shaderStorageImageArrayNonUniformIndexing = true, + .shaderInputAttachmentArrayNonUniformIndexing = true, + .shaderUniformTexelBufferArrayNonUniformIndexing = true, + .shaderStorageTexelBufferArrayNonUniformIndexing = true, + .descriptorBindingUniformBufferUpdateAfterBind = true, + .descriptorBindingSampledImageUpdateAfterBind = true, + .descriptorBindingStorageImageUpdateAfterBind = true, + .descriptorBindingStorageBufferUpdateAfterBind = true, + .descriptorBindingUniformTexelBufferUpdateAfterBind = true, + .descriptorBindingStorageTexelBufferUpdateAfterBind = true, + .descriptorBindingUpdateUnusedWhilePending = true, + .descriptorBindingPartiallyBound = true, + .descriptorBindingVariableDescriptorCount = true, + .runtimeDescriptorArray = true, + .samplerFilterMinmax = false, + .scalarBlockLayout = true, + .imagelessFramebuffer = true, + .uniformBufferStandardLayout = true, + .shaderSubgroupExtendedTypes = true, + .separateDepthStencilLayouts = true, + .hostQueryReset = true, + .timelineSemaphore = true, + .bufferDeviceAddress = true, + .bufferDeviceAddressCaptureReplay = false, + .bufferDeviceAddressMultiDevice = false, + .vulkanMemoryModel = true, + .vulkanMemoryModelDeviceScope = true, + .vulkanMemoryModelAvailabilityVisibilityChains = false, + .shaderOutputViewportIndex = true, + .shaderOutputLayer = true, + .subgroupBroadcastDynamicId = true, + + /* Vulkan 1.3 */ + .robustImageAccess = true, + .inlineUniformBlock = true, + .descriptorBindingInlineUniformBlockUpdateAfterBind = true, + .pipelineCreationCacheControl = true, + .privateData = true, + .shaderDemoteToHelperInvocation = true, + .shaderTerminateInvocation = true, + .subgroupSizeControl = true, + .computeFullSubgroups = true, + .synchronization2 = true, + .shaderZeroInitializeWorkgroupMemory = true, + .dynamicRendering = true, + .shaderIntegerDotProduct = true, + .maintenance4 = true, + + /* VK_KHR_dynamic_rendering_local_read */ + .dynamicRenderingLocalRead = true, + + /* VK_KHR_fragment_shader_barycentric */ + .fragmentShaderBarycentric = false, + + /* VK_KHR_global_priority */ + .globalPriorityQuery = true, + + /* VK_KHR_index_type_uint8 */ + .indexTypeUint8 = true, + + /* VK_KHR_line_rasterization */ + .rectangularLines = false, + .bresenhamLines = true, + .smoothLines = false, + .stippledRectangularLines = false, + .stippledBresenhamLines = false, + .stippledSmoothLines = false, + + /* VK_KHR_maintenance5 */ + .maintenance5 = true, + + /* VK_KHR_maintenance6 */ + .maintenance6 = true, + + /* VK_KHR_pipeline_executable_properties */ + .pipelineExecutableInfo = true, + + /* VK_KHR_present_id */ + .presentId = false, + + /* VK_KHR_present_wait */ + .presentWait = false, + + /* VK_KHR_shader_clock */ + .shaderSubgroupClock = false, + .shaderDeviceClock = false, + + /* VK_KHR_shader_expect_assume */ + .shaderExpectAssume = true, + + /* VK_KHR_shader_float_controls2 */ + .shaderFloatControls2 = true, + + /* VK_KHR_shader_maximal_reconvergence */ + .shaderMaximalReconvergence = true, + + /* VK_KHR_shader_subgroup_rotate */ + .shaderSubgroupRotate = true, + .shaderSubgroupRotateClustered = true, + + /* 
VK_KHR_vertex_attribute_divisor */ + .vertexAttributeInstanceRateDivisor = true, + .vertexAttributeInstanceRateZeroDivisor = true, + + /* VK_KHR_workgroup_memory_explicit_layout */ + .workgroupMemoryExplicitLayout = true, + .workgroupMemoryExplicitLayoutScalarBlockLayout = true, + .workgroupMemoryExplicitLayout8BitAccess = true, + .workgroupMemoryExplicitLayout16BitAccess = true, + + /* VK_EXT_4444_formats */ + .formatA4R4G4B4 = true, + .formatA4B4G4R4 = true, + + /* VK_EXT_attachment_feedback_loop_layout */ + .attachmentFeedbackLoopLayout = true, + + /* VK_EXT_border_color_swizzle */ + .borderColorSwizzle = true, + .borderColorSwizzleFromImage = false, + + /* VK_EXT_buffer_device_address */ + .bufferDeviceAddressCaptureReplayEXT = false, + + /* VK_EXT_color_write_enable */ + .colorWriteEnable = true, + + /* VK_EXT_conditional_rendering */ + .conditionalRendering = false, + .inheritedConditionalRendering = false, + + /* VK_EXT_custom_border_color */ + .customBorderColors = true, + .customBorderColorWithoutFormat = true, + + /* VK_EXT_depth_bias_control */ + .depthBiasControl = false, + .leastRepresentableValueForceUnormRepresentation = false, + .floatRepresentation = false, + .depthBiasExact = false, + + /* VK_EXT_depth_clip_control */ + .depthClipControl = false, + + /* VK_EXT_depth_clip_enable */ + .depthClipEnable = true, + + /* VK_EXT_dynamic_rendering_unused_attachments */ + .dynamicRenderingUnusedAttachments = true, + + /* VK_EXT_extended_dynamic_state */ + .extendedDynamicState = true, + + /* VK_EXT_extended_dynamic_state2 */ + .extendedDynamicState2 = true, + .extendedDynamicState2LogicOp = true, + .extendedDynamicState2PatchControlPoints = false, + + /* VK_EXT_extended_dynamic_state3 */ + .extendedDynamicState3TessellationDomainOrigin = false, + .extendedDynamicState3DepthClampEnable = true, + .extendedDynamicState3PolygonMode = true, + .extendedDynamicState3RasterizationSamples = true, + .extendedDynamicState3SampleMask = true, + .extendedDynamicState3AlphaToCoverageEnable = true, + .extendedDynamicState3AlphaToOneEnable = true, + .extendedDynamicState3LogicOpEnable = true, + .extendedDynamicState3ColorBlendEnable = true, + .extendedDynamicState3ColorBlendEquation = true, + .extendedDynamicState3ColorWriteMask = true, + .extendedDynamicState3RasterizationStream = false, + .extendedDynamicState3ConservativeRasterizationMode = false, + .extendedDynamicState3ExtraPrimitiveOverestimationSize = false, + .extendedDynamicState3DepthClipEnable = true, + .extendedDynamicState3SampleLocationsEnable = false, + .extendedDynamicState3ColorBlendAdvanced = false, + .extendedDynamicState3ProvokingVertexMode = true, + .extendedDynamicState3LineRasterizationMode = true, + .extendedDynamicState3LineStippleEnable = false, + .extendedDynamicState3DepthClipNegativeOneToOne = false, + .extendedDynamicState3ViewportWScalingEnable = false, + .extendedDynamicState3ViewportSwizzle = false, + .extendedDynamicState3CoverageToColorEnable = false, + .extendedDynamicState3CoverageToColorLocation = false, + .extendedDynamicState3CoverageModulationMode = false, + .extendedDynamicState3CoverageModulationTableEnable = false, + .extendedDynamicState3CoverageModulationTable = false, + .extendedDynamicState3CoverageReductionMode = false, + .extendedDynamicState3RepresentativeFragmentTestEnable = false, + .extendedDynamicState3ShadingRateImageEnable = false, + + /* VK_EXT_graphics_pipeline_library */ + .graphicsPipelineLibrary = true, + + /* VK_EXT_host_image_copy */ + .hostImageCopy = true, + + /* 
VK_EXT_image_2d_view_of_3d */ + .image2DViewOf3D = true, + .sampler2DViewOf3D = true, + + /* VK_EXT_image_sliced_view_of_3d */ + .imageSlicedViewOf3D = false, + +#ifdef HK_USE_WSI_PLATFORM + /* VK_EXT_swapchain_maintenance1 */ + .swapchainMaintenance1 = false, +#endif + + /* VK_EXT_image_view_min_lod */ + .minLod = false, + + /* VK_EXT_map_memory_placed */ + .memoryMapPlaced = false, + .memoryMapRangePlaced = false, + .memoryUnmapReserve = false, + + /* VK_EXT_multi_draw */ + .multiDraw = true, + + /* VK_EXT_mutable_descriptor_type */ + .mutableDescriptorType = true, + + /* VK_EXT_non_seamless_cube_map */ + .nonSeamlessCubeMap = true, + + /* VK_EXT_pipeline_protected_access */ + .pipelineProtectedAccess = true, + + /* VK_EXT_pipeline_robustness */ + .pipelineRobustness = true, + + /* VK_EXT_primitive_topology_list_restart */ + .primitiveTopologyListRestart = true, + .primitiveTopologyPatchListRestart = false, + + /* VK_EXT_primitives_generated_query */ + .primitivesGeneratedQuery = false, + .primitivesGeneratedQueryWithNonZeroStreams = false, + .primitivesGeneratedQueryWithRasterizerDiscard = false, + + /* VK_EXT_provoking_vertex */ + .provokingVertexLast = true, + .transformFeedbackPreservesProvokingVertex = true, + + /* VK_EXT_robustness2 */ + .robustBufferAccess2 = true, + .robustImageAccess2 = true, + .nullDescriptor = true, + + /* VK_EXT_shader_image_atomic_int64 */ + .shaderImageInt64Atomics = false, + .sparseImageInt64Atomics = false, + + /* VK_EXT_shader_module_identifier */ + .shaderModuleIdentifier = true, + + /* VK_EXT_shader_object */ + .shaderObject = true, + + /* VK_EXT_shader_replicated_composites */ + .shaderReplicatedComposites = true, + + /* VK_KHR_shader_subgroup_uniform_control_flow */ + .shaderSubgroupUniformControlFlow = true, + + /* VK_EXT_texel_buffer_alignment */ + .texelBufferAlignment = true, + + /* VK_EXT_transform_feedback */ + .transformFeedback = true, + .geometryStreams = true, + + /* VK_EXT_vertex_input_dynamic_state */ + .vertexInputDynamicState = true, + + /* VK_EXT_ycbcr_2plane_444_formats */ + .ycbcr2plane444Formats = false, + + /* VK_EXT_ycbcr_image_arrays */ + .ycbcrImageArrays = false, + }; +} + +static void +hk_get_device_properties(const struct agx_device *dev, + const struct hk_instance *instance, + struct vk_properties *properties) +{ + const VkSampleCountFlagBits sample_counts = + VK_SAMPLE_COUNT_1_BIT | VK_SAMPLE_COUNT_2_BIT | VK_SAMPLE_COUNT_4_BIT; + + uint64_t os_page_size = 16384; + os_get_page_size(&os_page_size); + + *properties = (struct vk_properties){ + .apiVersion = hk_get_vk_version(), + .driverVersion = vk_get_driver_version(), + .vendorID = instance->force_vk_vendor ?: VK_VENDOR_ID_MESA, + .deviceID = 0, + .deviceType = VK_PHYSICAL_DEVICE_TYPE_INTEGRATED_GPU, + + /* Vulkan 1.0 limits */ + .maxImageDimension1D = 16384, + .maxImageDimension2D = 16384, + .maxImageDimension3D = 16384, + .maxImageDimensionCube = 16384, + .maxImageArrayLayers = 2048, + .maxTexelBufferElements = AGX_TEXTURE_BUFFER_MAX_SIZE, + .maxUniformBufferRange = 65536, + .maxStorageBufferRange = UINT32_MAX, + .maxPushConstantsSize = HK_MAX_PUSH_SIZE, + .maxMemoryAllocationCount = 4096, + .maxSamplerAllocationCount = 4000, + .bufferImageGranularity = 0x400, + .sparseAddressSpaceSize = HK_SPARSE_ADDR_SPACE_SIZE, + .maxBoundDescriptorSets = HK_MAX_SETS, + .maxPerStageDescriptorSamplers = HK_MAX_DESCRIPTORS, + .maxPerStageDescriptorUniformBuffers = HK_MAX_DESCRIPTORS, + .maxPerStageDescriptorStorageBuffers = HK_MAX_DESCRIPTORS, + .maxPerStageDescriptorSampledImages = 
HK_MAX_DESCRIPTORS, + .maxPerStageDescriptorStorageImages = HK_MAX_DESCRIPTORS, + .maxPerStageDescriptorInputAttachments = HK_MAX_DESCRIPTORS, + .maxPerStageResources = UINT32_MAX, + .maxDescriptorSetSamplers = HK_MAX_DESCRIPTORS, + .maxDescriptorSetUniformBuffers = HK_MAX_DESCRIPTORS, + .maxDescriptorSetUniformBuffersDynamic = HK_MAX_DYNAMIC_BUFFERS / 2, + .maxDescriptorSetStorageBuffers = HK_MAX_DESCRIPTORS, + .maxDescriptorSetStorageBuffersDynamic = HK_MAX_DYNAMIC_BUFFERS / 2, + .maxDescriptorSetSampledImages = HK_MAX_DESCRIPTORS, + .maxDescriptorSetStorageImages = HK_MAX_DESCRIPTORS, + .maxDescriptorSetInputAttachments = HK_MAX_DESCRIPTORS, + .maxVertexInputAttributes = AGX_MAX_VBUFS, + .maxVertexInputBindings = AGX_MAX_ATTRIBS, + .maxVertexInputAttributeOffset = 65535, + .maxVertexInputBindingStride = 2048, + .maxVertexOutputComponents = 64, + .maxGeometryShaderInvocations = 32, + .maxGeometryInputComponents = 128, + .maxGeometryOutputComponents = 128, + .maxGeometryOutputVertices = 1024, + .maxGeometryTotalOutputComponents = 1024, + .maxTessellationGenerationLevel = 64, + .maxTessellationPatchSize = 32, + .maxTessellationControlPerVertexInputComponents = 128, + .maxTessellationControlPerVertexOutputComponents = 128, + .maxTessellationControlPerPatchOutputComponents = 120, + .maxTessellationControlTotalOutputComponents = 4216, + .maxTessellationEvaluationInputComponents = 128, + .maxTessellationEvaluationOutputComponents = 128, + .maxFragmentInputComponents = 64, + .maxFragmentOutputAttachments = HK_MAX_RTS, + .maxFragmentDualSrcAttachments = 1, + .maxFragmentCombinedOutputResources = 16, + .maxComputeSharedMemorySize = HK_MAX_SHARED_SIZE, + .maxComputeWorkGroupCount = {0x7fffffff, 65535, 65535}, + .maxComputeWorkGroupInvocations = 1024, + .maxComputeWorkGroupSize = {1024, 1024, 64}, + .subPixelPrecisionBits = 8, + .subTexelPrecisionBits = 8, + .mipmapPrecisionBits = 8, + .maxDrawIndexedIndexValue = UINT32_MAX, + .maxDrawIndirectCount = UINT32_MAX, + .maxSamplerLodBias = 15, + .maxSamplerAnisotropy = 16, + .maxViewports = HK_MAX_VIEWPORTS, + .maxViewportDimensions = {32768, 32768}, + .viewportBoundsRange = {-65536, 65536}, + .viewportSubPixelBits = 8, + .minMemoryMapAlignment = os_page_size, + .minTexelBufferOffsetAlignment = HK_MIN_TEXEL_BUFFER_ALIGNMENT, + .minUniformBufferOffsetAlignment = HK_MIN_UBO_ALIGNMENT, + .minStorageBufferOffsetAlignment = HK_MIN_SSBO_ALIGNMENT, + .minTexelOffset = -8, + .maxTexelOffset = 7, + .minTexelGatherOffset = -8, + .maxTexelGatherOffset = 7, + .minInterpolationOffset = -0.5, + .maxInterpolationOffset = 0.4375, + .subPixelInterpolationOffsetBits = 4, + .maxFramebufferHeight = 16384, + .maxFramebufferWidth = 16384, + .maxFramebufferLayers = 2048, + .framebufferColorSampleCounts = sample_counts, + .framebufferDepthSampleCounts = sample_counts, + .framebufferNoAttachmentsSampleCounts = sample_counts, + .framebufferStencilSampleCounts = sample_counts, + .maxColorAttachments = HK_MAX_RTS, + .sampledImageColorSampleCounts = sample_counts, + .sampledImageIntegerSampleCounts = sample_counts, + .sampledImageDepthSampleCounts = sample_counts, + .sampledImageStencilSampleCounts = sample_counts, + .storageImageSampleCounts = sample_counts, + .maxSampleMaskWords = 1, + .timestampComputeAndGraphics = false, + .timestampPeriod = 1, + .maxClipDistances = 8, + .maxCullDistances = 8, + .maxCombinedClipAndCullDistances = 8, + .discreteQueuePriorities = 2, + .pointSizeRange = {1.0, 512.f - 0.0625f}, + .lineWidthRange = {1.0, 16.0f}, + .pointSizeGranularity = 0.0625, + 
.lineWidthGranularity = 1.0f / 16.0f, + .strictLines = false, + .standardSampleLocations = true, + .optimalBufferCopyOffsetAlignment = 1, + .optimalBufferCopyRowPitchAlignment = 1, + .nonCoherentAtomSize = 64, + + /* Vulkan 1.0 sparse properties */ + .sparseResidencyNonResidentStrict = false, + .sparseResidencyAlignedMipSize = false, + .sparseResidencyStandard2DBlockShape = false, + .sparseResidencyStandard2DMultisampleBlockShape = false, + .sparseResidencyStandard3DBlockShape = false, + + /* Vulkan 1.1 properties */ + .subgroupSize = 32, + .subgroupSupportedStages = VK_SHADER_STAGE_COMPUTE_BIT | + VK_SHADER_STAGE_FRAGMENT_BIT | + VK_SHADER_STAGE_VERTEX_BIT, + .subgroupSupportedOperations = + VK_SUBGROUP_FEATURE_BASIC_BIT | VK_SUBGROUP_FEATURE_BALLOT_BIT | + VK_SUBGROUP_FEATURE_VOTE_BIT | VK_SUBGROUP_FEATURE_QUAD_BIT | + VK_SUBGROUP_FEATURE_SHUFFLE_BIT | + VK_SUBGROUP_FEATURE_SHUFFLE_RELATIVE_BIT | + VK_SUBGROUP_FEATURE_ROTATE_BIT_KHR | + VK_SUBGROUP_FEATURE_ARITHMETIC_BIT | + VK_SUBGROUP_FEATURE_CLUSTERED_BIT | + VK_SUBGROUP_FEATURE_ROTATE_CLUSTERED_BIT_KHR, + .subgroupQuadOperationsInAllStages = true, + .pointClippingBehavior = VK_POINT_CLIPPING_BEHAVIOR_USER_CLIP_PLANES_ONLY, + .maxMultiviewViewCount = HK_MAX_MULTIVIEW_VIEW_COUNT, + .maxMultiviewInstanceIndex = UINT32_MAX, + .maxPerSetDescriptors = UINT32_MAX, + .maxMemoryAllocationSize = (1u << 31), + + /* Vulkan 1.2 properties */ + .supportedDepthResolveModes = + VK_RESOLVE_MODE_SAMPLE_ZERO_BIT | VK_RESOLVE_MODE_AVERAGE_BIT | + VK_RESOLVE_MODE_MIN_BIT | VK_RESOLVE_MODE_MAX_BIT, + .supportedStencilResolveModes = VK_RESOLVE_MODE_SAMPLE_ZERO_BIT | + VK_RESOLVE_MODE_MIN_BIT | + VK_RESOLVE_MODE_MAX_BIT, + .independentResolveNone = true, + .independentResolve = true, + .driverID = VK_DRIVER_ID_MESA_HONEYKRISP, + .conformanceVersion = (VkConformanceVersion){1, 3, 8, 3}, + .denormBehaviorIndependence = VK_SHADER_FLOAT_CONTROLS_INDEPENDENCE_ALL, + .roundingModeIndependence = VK_SHADER_FLOAT_CONTROLS_INDEPENDENCE_ALL, + .shaderSignedZeroInfNanPreserveFloat16 = true, + .shaderSignedZeroInfNanPreserveFloat32 = true, + .shaderSignedZeroInfNanPreserveFloat64 = false, + .shaderDenormPreserveFloat16 = true, + .shaderDenormPreserveFloat32 = false, + .shaderDenormPreserveFloat64 = false, + .shaderDenormFlushToZeroFloat16 = false, + .shaderDenormFlushToZeroFloat32 = true, + .shaderDenormFlushToZeroFloat64 = false, + .shaderRoundingModeRTEFloat16 = true, + .shaderRoundingModeRTEFloat32 = true, + .shaderRoundingModeRTEFloat64 = false, + .shaderRoundingModeRTZFloat16 = false, + .shaderRoundingModeRTZFloat32 = false, + .shaderRoundingModeRTZFloat64 = false, + .maxUpdateAfterBindDescriptorsInAllPools = UINT32_MAX, + .shaderUniformBufferArrayNonUniformIndexingNative = true, + .shaderSampledImageArrayNonUniformIndexingNative = true, + .shaderStorageBufferArrayNonUniformIndexingNative = true, + .shaderStorageImageArrayNonUniformIndexingNative = true, + .shaderInputAttachmentArrayNonUniformIndexingNative = true, + .robustBufferAccessUpdateAfterBind = true, + .quadDivergentImplicitLod = false, + .maxPerStageDescriptorUpdateAfterBindSamplers = HK_MAX_DESCRIPTORS, + .maxPerStageDescriptorUpdateAfterBindUniformBuffers = HK_MAX_DESCRIPTORS, + .maxPerStageDescriptorUpdateAfterBindStorageBuffers = HK_MAX_DESCRIPTORS, + .maxPerStageDescriptorUpdateAfterBindSampledImages = HK_MAX_DESCRIPTORS, + .maxPerStageDescriptorUpdateAfterBindStorageImages = HK_MAX_DESCRIPTORS, + .maxPerStageDescriptorUpdateAfterBindInputAttachments = + HK_MAX_DESCRIPTORS, + 
.maxPerStageUpdateAfterBindResources = UINT32_MAX, + .maxDescriptorSetUpdateAfterBindSamplers = HK_MAX_DESCRIPTORS, + .maxDescriptorSetUpdateAfterBindUniformBuffers = HK_MAX_DESCRIPTORS, + .maxDescriptorSetUpdateAfterBindUniformBuffersDynamic = + HK_MAX_DYNAMIC_BUFFERS / 2, + .maxDescriptorSetUpdateAfterBindStorageBuffers = HK_MAX_DESCRIPTORS, + .maxDescriptorSetUpdateAfterBindStorageBuffersDynamic = + HK_MAX_DYNAMIC_BUFFERS / 2, + .maxDescriptorSetUpdateAfterBindSampledImages = HK_MAX_DESCRIPTORS, + .maxDescriptorSetUpdateAfterBindStorageImages = HK_MAX_DESCRIPTORS, + .maxDescriptorSetUpdateAfterBindInputAttachments = HK_MAX_DESCRIPTORS, + .filterMinmaxSingleComponentFormats = false, + .filterMinmaxImageComponentMapping = false, + .maxTimelineSemaphoreValueDifference = UINT64_MAX, + .framebufferIntegerColorSampleCounts = sample_counts, + + /* Vulkan 1.3 properties */ + .minSubgroupSize = 32, + .maxSubgroupSize = 32, + .maxComputeWorkgroupSubgroups = 1024 / 32, + .requiredSubgroupSizeStages = 0, + .maxInlineUniformBlockSize = 1 << 16, + .maxPerStageDescriptorInlineUniformBlocks = 32, + .maxPerStageDescriptorUpdateAfterBindInlineUniformBlocks = 32, + .maxDescriptorSetInlineUniformBlocks = 6 * 32, + .maxDescriptorSetUpdateAfterBindInlineUniformBlocks = 6 * 32, + .maxInlineUniformTotalSize = 1 << 16, + .integerDotProduct4x8BitPackedUnsignedAccelerated = false, + .integerDotProduct4x8BitPackedSignedAccelerated = false, + .integerDotProduct4x8BitPackedMixedSignednessAccelerated = false, + .storageTexelBufferOffsetAlignmentBytes = HK_MIN_TEXEL_BUFFER_ALIGNMENT, + .storageTexelBufferOffsetSingleTexelAlignment = true, + .uniformTexelBufferOffsetAlignmentBytes = HK_MIN_TEXEL_BUFFER_ALIGNMENT, + .uniformTexelBufferOffsetSingleTexelAlignment = true, + .maxBufferSize = HK_MAX_BUFFER_SIZE, + + /* VK_KHR_push_descriptor */ + .maxPushDescriptors = HK_MAX_PUSH_DESCRIPTORS, + + /* VK_EXT_custom_border_color */ + .maxCustomBorderColorSamplers = 4000, + + /* VK_EXT_extended_dynamic_state3 */ + .dynamicPrimitiveTopologyUnrestricted = true, + + /* VK_EXT_graphics_pipeline_library */ + .graphicsPipelineLibraryFastLinking = true, + .graphicsPipelineLibraryIndependentInterpolationDecoration = true, + + /* VK_EXT_host_image_copy */ + + /* VK_KHR_line_rasterization */ + .lineSubPixelPrecisionBits = 8, + + /* VK_KHR_maintenance5 */ + .earlyFragmentMultisampleCoverageAfterSampleCounting = false, + .earlyFragmentSampleMaskTestBeforeSampleCounting = true, + .depthStencilSwizzleOneSupport = true, + .polygonModePointSize = false, + .nonStrictSinglePixelWideLinesUseParallelogram = false, + .nonStrictWideLinesUseParallelogram = false, + + /* VK_KHR_maintenance6 */ + .blockTexelViewCompatibleMultipleLayers = false, + .maxCombinedImageSamplerDescriptorCount = 3, + .fragmentShadingRateClampCombinerInputs = false, /* TODO */ + + /* VK_EXT_map_memory_placed */ + .minPlacedMemoryMapAlignment = os_page_size, + + /* VK_EXT_multi_draw */ + .maxMultiDrawCount = UINT16_MAX, + + /* VK_EXT_pipeline_robustness */ + .defaultRobustnessStorageBuffers = + VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_DISABLED_EXT, + .defaultRobustnessUniformBuffers = + VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_DISABLED_EXT, + .defaultRobustnessVertexInputs = + VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_DISABLED_EXT, + .defaultRobustnessImages = + VK_PIPELINE_ROBUSTNESS_IMAGE_BEHAVIOR_ROBUST_IMAGE_ACCESS_2_EXT, + + /* VK_EXT_physical_device_drm gets populated later */ + + /* VK_EXT_provoking_vertex */ + .provokingVertexModePerPipeline = true, + 
.transformFeedbackPreservesTriangleFanProvokingVertex = true, + + /* VK_EXT_robustness2 */ + .robustStorageBufferAccessSizeAlignment = HK_SSBO_BOUNDS_CHECK_ALIGNMENT, + .robustUniformBufferAccessSizeAlignment = HK_MIN_UBO_ALIGNMENT, + + /* VK_EXT_sample_locations */ + .sampleLocationSampleCounts = sample_counts, + .maxSampleLocationGridSize = (VkExtent2D){1, 1}, + .sampleLocationCoordinateRange[0] = 0.0f, + .sampleLocationCoordinateRange[1] = 0.9375f, + .sampleLocationSubPixelBits = 4, + .variableSampleLocations = false, + + /* VK_EXT_shader_object */ + .shaderBinaryVersion = 0, + + /* VK_EXT_transform_feedback */ + .maxTransformFeedbackStreams = 4, + .maxTransformFeedbackBuffers = 4, + .maxTransformFeedbackBufferSize = UINT32_MAX, + .maxTransformFeedbackStreamDataSize = 2048, + .maxTransformFeedbackBufferDataSize = 512, + .maxTransformFeedbackBufferDataStride = 2048, + .transformFeedbackQueries = true, + .transformFeedbackStreamsLinesTriangles = false, + .transformFeedbackRasterizationStreamSelect = false, + .transformFeedbackDraw = false, + + /* VK_KHR_vertex_attribute_divisor */ + .maxVertexAttribDivisor = UINT32_MAX, + .supportsNonZeroFirstInstance = true, + + /* VK_KHR_fragment_shader_barycentric */ + .triStripVertexOrderIndependentOfProvokingVertex = false, + }; + + strncpy(properties->deviceName, dev->name, sizeof(properties->deviceName)); + + /* VK_EXT_shader_module_identifier */ + static_assert(sizeof(vk_shaderModuleIdentifierAlgorithmUUID) == + sizeof(properties->shaderModuleIdentifierAlgorithmUUID)); + memcpy(properties->shaderModuleIdentifierAlgorithmUUID, + vk_shaderModuleIdentifierAlgorithmUUID, + sizeof(properties->shaderModuleIdentifierAlgorithmUUID)); + + const struct { + uint16_t vendor_id; + uint16_t device_id; + uint8_t pad[12]; + } dev_uuid = { + .vendor_id = 0, + .device_id = 0, + }; + static_assert(sizeof(dev_uuid) == VK_UUID_SIZE); + memcpy(properties->deviceUUID, &dev_uuid, VK_UUID_SIZE); + static_assert(sizeof(instance->driver_build_sha) >= VK_UUID_SIZE); + memcpy(properties->driverUUID, instance->driver_build_sha, VK_UUID_SIZE); + + strncpy(properties->driverName, "Honeykrisp", VK_MAX_DRIVER_NAME_SIZE); + snprintf(properties->driverInfo, VK_MAX_DRIVER_INFO_SIZE, + "Mesa " PACKAGE_VERSION MESA_GIT_SHA1); + + /* We don't use the layouts ATM so just report all layouts from + * extensions that we support as compatible. 
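+ * (These arrays back the VK_EXT_host_image_copy pCopySrcLayouts /
+ * pCopyDstLayouts properties filled in right below.)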
+ */ + static const VkImageLayout supported_layouts[] = { + VK_IMAGE_LAYOUT_GENERAL, /* required by spec */ + VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL, + VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL, + VK_IMAGE_LAYOUT_DEPTH_STENCIL_READ_ONLY_OPTIMAL, + VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, + VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, + VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, + VK_IMAGE_LAYOUT_PREINITIALIZED, + VK_IMAGE_LAYOUT_DEPTH_READ_ONLY_STENCIL_ATTACHMENT_OPTIMAL, + VK_IMAGE_LAYOUT_DEPTH_ATTACHMENT_STENCIL_READ_ONLY_OPTIMAL, + VK_IMAGE_LAYOUT_DEPTH_ATTACHMENT_OPTIMAL, + VK_IMAGE_LAYOUT_DEPTH_READ_ONLY_OPTIMAL, + VK_IMAGE_LAYOUT_STENCIL_ATTACHMENT_OPTIMAL, + VK_IMAGE_LAYOUT_STENCIL_READ_ONLY_OPTIMAL, + VK_IMAGE_LAYOUT_READ_ONLY_OPTIMAL, + VK_IMAGE_LAYOUT_ATTACHMENT_OPTIMAL, + // VK_IMAGE_LAYOUT_FRAGMENT_DENSITY_MAP_OPTIMAL_EXT, + VK_IMAGE_LAYOUT_ATTACHMENT_FEEDBACK_LOOP_OPTIMAL_EXT, + }; + + properties->pCopySrcLayouts = (VkImageLayout *)supported_layouts; + properties->copySrcLayoutCount = ARRAY_SIZE(supported_layouts); + properties->pCopyDstLayouts = (VkImageLayout *)supported_layouts; + properties->copyDstLayoutCount = ARRAY_SIZE(supported_layouts); + + /* We're a UMR so we can always map every kind of memory */ + properties->identicalMemoryTypeRequirements = true; + + { + struct mesa_sha1 sha1_ctx; + uint8_t sha1[20]; + + _mesa_sha1_init(&sha1_ctx); + /* Make sure we don't match with other vendors */ + const char *driver = "honeykrisp-v1"; + _mesa_sha1_update(&sha1_ctx, driver, strlen(driver)); + _mesa_sha1_final(&sha1_ctx, sha1); + + memcpy(properties->optimalTilingLayoutUUID, sha1, VK_UUID_SIZE); + } +} + +static void +hk_physical_device_init_pipeline_cache(struct hk_physical_device *pdev) +{ + struct hk_instance *instance = hk_physical_device_instance(pdev); + + struct mesa_sha1 sha_ctx; + _mesa_sha1_init(&sha_ctx); + + _mesa_sha1_update(&sha_ctx, instance->driver_build_sha, + sizeof(instance->driver_build_sha)); + + const uint64_t compiler_flags = hk_physical_device_compiler_flags(pdev); + _mesa_sha1_update(&sha_ctx, &compiler_flags, sizeof(compiler_flags)); + + unsigned char sha[SHA1_DIGEST_LENGTH]; + _mesa_sha1_final(&sha_ctx, sha); + + static_assert(SHA1_DIGEST_LENGTH >= VK_UUID_SIZE); + memcpy(pdev->vk.properties.pipelineCacheUUID, sha, VK_UUID_SIZE); + memcpy(pdev->vk.properties.shaderBinaryUUID, sha, VK_UUID_SIZE); + +#ifdef ENABLE_SHADER_CACHE + char renderer[10]; + ASSERTED int len = snprintf(renderer, sizeof(renderer), "hk_g13g_"); + assert(len == sizeof(renderer) - 2); + + char timestamp[41]; + _mesa_sha1_format(timestamp, instance->driver_build_sha); + + const uint64_t driver_flags = hk_physical_device_compiler_flags(pdev); + pdev->vk.disk_cache = disk_cache_create(renderer, timestamp, driver_flags); +#endif +} + +static void +hk_physical_device_free_disk_cache(struct hk_physical_device *pdev) +{ +#ifdef ENABLE_SHADER_CACHE + if (pdev->vk.disk_cache) { + disk_cache_destroy(pdev->vk.disk_cache); + pdev->vk.disk_cache = NULL; + } +#else + assert(pdev->vk.disk_cache == NULL); +#endif +} + +static uint64_t +hk_get_sysmem_heap_size(void) +{ + uint64_t sysmem_size_B = 0; + if (!os_get_total_physical_memory(&sysmem_size_B)) + return 0; + + /* Use 3/4 of total size to avoid swapping */ + return ROUND_DOWN_TO(sysmem_size_B * 3 / 4, 1 << 20); +} + +static uint64_t +hk_get_sysmem_heap_available(struct hk_physical_device *pdev) +{ + uint64_t sysmem_size_B = 0; + if (!os_get_available_system_memory(&sysmem_size_B)) { + vk_loge(VK_LOG_OBJS(pdev), "Failed to query available system 
memory"); + return 0; + } + + /* Use 3/4 of available to avoid swapping */ + return ROUND_DOWN_TO(sysmem_size_B * 3 / 4, 1 << 20); +} + +VkResult +hk_create_drm_physical_device(struct vk_instance *_instance, + drmDevicePtr drm_device, + struct vk_physical_device **pdev_out) +{ + struct hk_instance *instance = (struct hk_instance *)_instance; + VkResult result; + + /* Blanket refusal to probe due to unstable UAPI. */ + return VK_ERROR_INCOMPATIBLE_DRIVER; + + if (!(drm_device->available_nodes & (1 << DRM_NODE_RENDER)) || + drm_device->bustype != DRM_BUS_PLATFORM) + return VK_ERROR_INCOMPATIBLE_DRIVER; + + const char *path = drm_device->nodes[DRM_NODE_RENDER]; + int fd = open(path, O_RDWR | O_CLOEXEC); + if (fd < 0) { + return vk_errorf(instance, VK_ERROR_INCOMPATIBLE_DRIVER, + "failed to open device %s", path); + } + + drmVersionPtr version = drmGetVersion(fd); + if (!version) { + result = + vk_errorf(instance, VK_ERROR_INCOMPATIBLE_DRIVER, + "failed to query kernel driver version for device %s", path); + goto fail_fd; + } + + bool is_asahi = (strcmp(version->name, "asahi") == 0); + is_asahi |= strcmp(version->name, "virtio_gpu") == 0; + drmFreeVersion(version); + + if (!is_asahi) { + result = + vk_errorf(instance, VK_ERROR_INCOMPATIBLE_DRIVER, + "device %s does not use the asahi kernel driver", path); + goto fail_fd; + } + + struct stat st; + if (stat(drm_device->nodes[DRM_NODE_RENDER], &st)) { + result = vk_errorf(instance, VK_ERROR_INITIALIZATION_FAILED, + "fstat() failed on %s: %m", + drm_device->nodes[DRM_NODE_RENDER]); + goto fail_fd; + } + const dev_t render_dev = st.st_rdev; + + struct hk_physical_device *pdev = + vk_zalloc(&instance->vk.alloc, sizeof(*pdev), 8, + VK_SYSTEM_ALLOCATION_SCOPE_INSTANCE); + + if (pdev == NULL) { + result = vk_error(instance, VK_ERROR_OUT_OF_HOST_MEMORY); + goto fail_fd; + } + + /* TODO: we're render-only, should we be reporting displays anyway in + * KHR_display? 
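+ * For now master_fd stays -1 and the KHR_display probing below is left
+ * under #if 0.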
+ */ + pdev->master_fd = -1; + +#if 0 + if (instance->vk.enabled_extensions.KHR_display) { + int master_fd = + open(drm_device->nodes[DRM_NODE_PRIMARY], O_RDWR | O_CLOEXEC); + + if (master_fd >= 0) { + struct stat st; + if (!stat(drm_device->nodes[DRM_NODE_PRIMARY], &st)) { + pdev->master_fd = master_fd; + properties.drmHasPrimary = true; + properties.drmPrimaryMajor = major(st.st_rdev); + properties.drmPrimaryMinor = minor(st.st_rdev); + } + } + } +#endif + + pdev->render_dev = render_dev; + pdev->dev.fd = fd; + + if (!agx_open_device(NULL, &pdev->dev)) { + result = vk_error(instance, VK_ERROR_UNKNOWN); + goto fail_pdev_alloc; + } + + struct vk_physical_device_dispatch_table dispatch_table; + vk_physical_device_dispatch_table_from_entrypoints( + &dispatch_table, &hk_physical_device_entrypoints, true); + vk_physical_device_dispatch_table_from_entrypoints( + &dispatch_table, &wsi_physical_device_entrypoints, false); + + struct vk_device_extension_table supported_extensions; + hk_get_device_extensions(instance, &supported_extensions); + + struct vk_features supported_features; + hk_get_device_features(&supported_extensions, &supported_features); + + struct vk_properties properties; + hk_get_device_properties(&pdev->dev, instance, &properties); + + properties.drmHasRender = true; + properties.drmRenderMajor = major(render_dev); + properties.drmRenderMinor = minor(render_dev); + + result = vk_physical_device_init(&pdev->vk, &instance->vk, + &supported_extensions, &supported_features, + &properties, &dispatch_table); + if (result != VK_SUCCESS) + goto fail_agx_device; + + hk_physical_device_init_pipeline_cache(pdev); + + uint64_t sysmem_size_B = hk_get_sysmem_heap_size(); + if (sysmem_size_B == 0) { + result = vk_errorf(instance, VK_ERROR_INITIALIZATION_FAILED, + "Failed to query total system memory"); + goto fail_disk_cache; + } + + uint32_t sysmem_heap_idx = pdev->mem_heap_count++; + pdev->mem_heaps[sysmem_heap_idx] = (struct hk_memory_heap){ + .size = sysmem_size_B, + .flags = VK_MEMORY_HEAP_DEVICE_LOCAL_BIT, + .available = hk_get_sysmem_heap_available, + }; + + pdev->mem_types[pdev->mem_type_count++] = (VkMemoryType){ + .propertyFlags = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | + VK_MEMORY_PROPERTY_HOST_COHERENT_BIT | + VK_MEMORY_PROPERTY_HOST_CACHED_BIT | + VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, + .heapIndex = sysmem_heap_idx, + }; + + assert(pdev->mem_heap_count <= ARRAY_SIZE(pdev->mem_heaps)); + assert(pdev->mem_type_count <= ARRAY_SIZE(pdev->mem_types)); + + /* TODO: VK_QUEUE_SPARSE_BINDING_BIT*/ + pdev->queue_families[pdev->queue_family_count++] = (struct hk_queue_family){ + .queue_flags = + VK_QUEUE_GRAPHICS_BIT | VK_QUEUE_COMPUTE_BIT | VK_QUEUE_TRANSFER_BIT, + + .queue_count = 1, + }; + assert(pdev->queue_family_count <= ARRAY_SIZE(pdev->queue_families)); + + unsigned st_idx = 0; + pdev->syncobj_sync_type = vk_drm_syncobj_get_type(fd); + pdev->sync_types[st_idx++] = &pdev->syncobj_sync_type; + pdev->sync_types[st_idx++] = NULL; + assert(st_idx <= ARRAY_SIZE(pdev->sync_types)); + pdev->vk.supported_sync_types = pdev->sync_types; + + result = hk_init_wsi(pdev); + if (result != VK_SUCCESS) + goto fail_disk_cache; + + *pdev_out = &pdev->vk; + + return VK_SUCCESS; + +fail_disk_cache: + hk_physical_device_free_disk_cache(pdev); + vk_physical_device_finish(&pdev->vk); +fail_agx_device: + agx_close_device(&pdev->dev); +fail_pdev_alloc: + if (pdev->master_fd) + close(pdev->master_fd); + + vk_free(&pdev->vk.instance->alloc, pdev); +fail_fd: + close(fd); + return result; +} + +void 
+hk_physical_device_destroy(struct vk_physical_device *vk_pdev) +{ + struct hk_physical_device *pdev = + container_of(vk_pdev, struct hk_physical_device, vk); + + hk_finish_wsi(pdev); + + if (pdev->master_fd >= 0) + close(pdev->master_fd); + + hk_physical_device_free_disk_cache(pdev); + agx_close_device(&pdev->dev); + vk_physical_device_finish(&pdev->vk); + vk_free(&pdev->vk.instance->alloc, pdev); +} + +VKAPI_ATTR void VKAPI_CALL +hk_GetPhysicalDeviceMemoryProperties2( + VkPhysicalDevice physicalDevice, + VkPhysicalDeviceMemoryProperties2 *pMemoryProperties) +{ + VK_FROM_HANDLE(hk_physical_device, pdev, physicalDevice); + + pMemoryProperties->memoryProperties.memoryHeapCount = pdev->mem_heap_count; + for (int i = 0; i < pdev->mem_heap_count; i++) { + pMemoryProperties->memoryProperties.memoryHeaps[i] = (VkMemoryHeap){ + .size = pdev->mem_heaps[i].size, + .flags = pdev->mem_heaps[i].flags, + }; + } + + pMemoryProperties->memoryProperties.memoryTypeCount = pdev->mem_type_count; + for (int i = 0; i < pdev->mem_type_count; i++) { + pMemoryProperties->memoryProperties.memoryTypes[i] = pdev->mem_types[i]; + } + + vk_foreach_struct(ext, pMemoryProperties->pNext) { + switch (ext->sType) { + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MEMORY_BUDGET_PROPERTIES_EXT: { + VkPhysicalDeviceMemoryBudgetPropertiesEXT *p = (void *)ext; + + for (unsigned i = 0; i < pdev->mem_heap_count; i++) { + const struct hk_memory_heap *heap = &pdev->mem_heaps[i]; + uint64_t used = p_atomic_read(&heap->used); + + /* From the Vulkan 1.3.278 spec: + * + * "heapUsage is an array of VK_MAX_MEMORY_HEAPS VkDeviceSize + * values in which memory usages are returned, with one element + * for each memory heap. A heap’s usage is an estimate of how + * much memory the process is currently using in that heap." + * + * TODO: Include internal allocations? + */ + p->heapUsage[i] = used; + + uint64_t available = heap->size; + if (heap->available) + available = heap->available(pdev); + + /* From the Vulkan 1.3.278 spec: + * + * "heapBudget is an array of VK_MAX_MEMORY_HEAPS VkDeviceSize + * values in which memory budgets are returned, with one + * element for each memory heap. A heap’s budget is a rough + * estimate of how much memory the process can allocate from + * that heap before allocations may fail or cause performance + * degradation. The budget includes any currently allocated + * device memory." + * + * and + * + * "The heapBudget value must be less than or equal to + * VkMemoryHeap::size for each heap." + * + * available (queried above) is the total amount free memory + * system-wide and does not include our allocations so we need + * to add that in. + */ + uint64_t budget = MIN2(available + used, heap->size); + + /* Set the budget at 90% of available to avoid thrashing */ + p->heapBudget[i] = ROUND_DOWN_TO(budget * 9 / 10, 1 << 20); + } + + /* From the Vulkan 1.3.278 spec: + * + * "The heapBudget and heapUsage values must be zero for array + * elements greater than or equal to + * VkPhysicalDeviceMemoryProperties::memoryHeapCount. The + * heapBudget value must be non-zero for array elements less than + * VkPhysicalDeviceMemoryProperties::memoryHeapCount." 
+ */ + for (unsigned i = pdev->mem_heap_count; i < VK_MAX_MEMORY_HEAPS; i++) { + p->heapBudget[i] = 0u; + p->heapUsage[i] = 0u; + } + break; + } + default: + vk_debug_ignored_stype(ext->sType); + break; + } + } +} + +VKAPI_ATTR void VKAPI_CALL +hk_GetPhysicalDeviceQueueFamilyProperties2( + VkPhysicalDevice physicalDevice, uint32_t *pQueueFamilyPropertyCount, + VkQueueFamilyProperties2 *pQueueFamilyProperties) +{ + VK_FROM_HANDLE(hk_physical_device, pdev, physicalDevice); + VK_OUTARRAY_MAKE_TYPED(VkQueueFamilyProperties2, out, pQueueFamilyProperties, + pQueueFamilyPropertyCount); + + for (uint8_t i = 0; i < pdev->queue_family_count; i++) { + const struct hk_queue_family *queue_family = &pdev->queue_families[i]; + + vk_outarray_append_typed(VkQueueFamilyProperties2, &out, p) + { + p->queueFamilyProperties.queueFlags = queue_family->queue_flags; + p->queueFamilyProperties.queueCount = queue_family->queue_count; + p->queueFamilyProperties.timestampValidBits = 0; // TODO 64; + p->queueFamilyProperties.minImageTransferGranularity = + (VkExtent3D){1, 1, 1}; + + vk_foreach_struct(ext, p->pNext) { + switch (ext->sType) { + case VK_STRUCTURE_TYPE_QUEUE_FAMILY_GLOBAL_PRIORITY_PROPERTIES_KHR: { + VkQueueFamilyGlobalPriorityPropertiesKHR *props = (void *)ext; + + /* TODO: support multiple priorities */ + props->priorityCount = 1; + props->priorities[0] = VK_QUEUE_GLOBAL_PRIORITY_MEDIUM_EXT; + break; + } + default: + break; + } + } + } + } +} + +static const VkTimeDomainKHR hk_time_domains[] = { + VK_TIME_DOMAIN_DEVICE_KHR, + VK_TIME_DOMAIN_CLOCK_MONOTONIC_KHR, +#ifdef CLOCK_MONOTONIC_RAW + VK_TIME_DOMAIN_CLOCK_MONOTONIC_RAW_KHR, +#endif +}; + +VKAPI_ATTR VkResult VKAPI_CALL +hk_GetPhysicalDeviceCalibrateableTimeDomainsKHR(VkPhysicalDevice physicalDevice, + uint32_t *pTimeDomainCount, + VkTimeDomainKHR *pTimeDomains) +{ + VK_OUTARRAY_MAKE_TYPED(VkTimeDomainKHR, out, pTimeDomains, pTimeDomainCount); + + for (int d = 0; d < ARRAY_SIZE(hk_time_domains); d++) { + vk_outarray_append_typed(VkTimeDomainKHR, &out, i) + { + *i = hk_time_domains[d]; + } + } + + return vk_outarray_status(&out); +} + +VKAPI_ATTR void VKAPI_CALL +hk_GetPhysicalDeviceMultisamplePropertiesEXT( + VkPhysicalDevice physicalDevice, VkSampleCountFlagBits samples, + VkMultisamplePropertiesEXT *pMultisampleProperties) +{ + VK_FROM_HANDLE(hk_physical_device, pdev, physicalDevice); + + if (samples & pdev->vk.properties.sampleLocationSampleCounts) { + pMultisampleProperties->maxSampleLocationGridSize = (VkExtent2D){1, 1}; + } else { + pMultisampleProperties->maxSampleLocationGridSize = (VkExtent2D){0, 0}; + } +} diff --git a/src/asahi/vulkan/hk_physical_device.h b/src/asahi/vulkan/hk_physical_device.h new file mode 100644 index 00000000000..8b8b318d8be --- /dev/null +++ b/src/asahi/vulkan/hk_physical_device.h @@ -0,0 +1,76 @@ +/* + * Copyright 2024 Valve Corporation + * Copyright 2024 Alyssa Rosenzweig + * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc. 
+ * SPDX-License-Identifier: MIT + */ + +#pragma once + +#include "asahi/lib/agx_device.h" +#include +#include "hk_private.h" +#include "vk_physical_device.h" +#include "vk_sync.h" +#include "wsi_common.h" + +struct hk_instance; +struct hk_physical_device; + +struct hk_queue_family { + VkQueueFlags queue_flags; + uint32_t queue_count; +}; + +struct hk_memory_heap { + uint64_t size; + uint64_t used; + VkMemoryHeapFlags flags; + uint64_t (*available)(struct hk_physical_device *pdev); +}; + +struct hk_physical_device { + struct vk_physical_device vk; + dev_t render_dev; + int master_fd; + + /* Only used for VK_EXT_memory_budget */ + struct agx_device dev; + + struct wsi_device wsi_device; + + uint8_t device_uuid[VK_UUID_SIZE]; + + // TODO: add mapable VRAM heap if possible + struct hk_memory_heap mem_heaps[3]; + VkMemoryType mem_types[3]; + uint8_t mem_heap_count; + uint8_t mem_type_count; + + struct hk_queue_family queue_families[3]; + uint8_t queue_family_count; + + struct vk_sync_type syncobj_sync_type; + const struct vk_sync_type *sync_types[2]; +}; + +VK_DEFINE_HANDLE_CASTS(hk_physical_device, vk.base, VkPhysicalDevice, + VK_OBJECT_TYPE_PHYSICAL_DEVICE) + +static inline struct hk_instance * +hk_physical_device_instance(struct hk_physical_device *pdev) +{ + return (struct hk_instance *)pdev->vk.instance; +} + +VkResult hk_create_drm_physical_device(struct vk_instance *vk_instance, + struct _drmDevice *drm_device, + struct vk_physical_device **pdev_out); + +void hk_physical_device_destroy(struct vk_physical_device *vk_device); + +#if defined(VK_USE_PLATFORM_WAYLAND_KHR) || \ + defined(VK_USE_PLATFORM_XCB_KHR) || defined(VK_USE_PLATFORM_XLIB_KHR) || \ + defined(VK_USE_PLATFORM_DISPLAY_KHR) +#define HK_USE_WSI_PLATFORM +#endif diff --git a/src/asahi/vulkan/hk_private.h b/src/asahi/vulkan/hk_private.h new file mode 100644 index 00000000000..bd2b8d68f97 --- /dev/null +++ b/src/asahi/vulkan/hk_private.h @@ -0,0 +1,53 @@ +/* + * Copyright 2024 Valve Corporation + * Copyright 2024 Alyssa Rosenzweig + * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc. + * SPDX-License-Identifier: MIT + */ + +#pragma once + +#include + +#include "vk_log.h" +#include "vk_util.h" + +#define HK_MAX_SETS 8 +#define HK_MAX_PUSH_SIZE 128 +#define HK_MAX_DYNAMIC_BUFFERS 64 +#define HK_MAX_RTS 8 +#define HK_MIN_SSBO_ALIGNMENT 16 +#define HK_MIN_TEXEL_BUFFER_ALIGNMENT 16 +#define HK_MIN_UBO_ALIGNMENT 64 +#define HK_MAX_VIEWPORTS 16 +#define HK_MAX_DESCRIPTOR_SIZE 32 +#define HK_MAX_PUSH_DESCRIPTORS 32 +#define HK_MAX_DESCRIPTOR_SET_SIZE (1u << 30) +#define HK_MAX_DESCRIPTORS (1 << 20) +#define HK_PUSH_DESCRIPTOR_SET_SIZE \ + (HK_MAX_PUSH_DESCRIPTORS * HK_MAX_DESCRIPTOR_SIZE) +#define HK_SSBO_BOUNDS_CHECK_ALIGNMENT 4 +#define HK_MAX_MULTIVIEW_VIEW_COUNT 32 + +#define HK_SPARSE_ADDR_SPACE_SIZE (1ull << 39) +#define HK_MAX_BUFFER_SIZE (1ull << 31) +#define HK_MAX_SHARED_SIZE (32 * 1024) + +struct hk_addr_range { + uint64_t addr; + uint64_t range; +}; + +#define perf_debug(dev, fmt, ...) 
\ + do { \ + if (dev->dev.debug & AGX_DBG_PERF) \ + mesa_log(MESA_LOG_WARN, (MESA_LOG_TAG), (fmt), ##__VA_ARGS__); \ + } while (0) + +/* Fake values, pending UAPI upstreaming */ +#ifndef DRM_FORMAT_MOD_APPLE_TWIDDLED +#define DRM_FORMAT_MOD_APPLE_TWIDDLED (2) +#endif +#ifndef DRM_FORMAT_MOD_APPLE_TWIDDLED_COMPRESSED +#define DRM_FORMAT_MOD_APPLE_TWIDDLED_COMPRESSED (3) +#endif diff --git a/src/asahi/vulkan/hk_query_pool.c b/src/asahi/vulkan/hk_query_pool.c new file mode 100644 index 00000000000..5762c69419c --- /dev/null +++ b/src/asahi/vulkan/hk_query_pool.c @@ -0,0 +1,580 @@ +/* + * Copyright 2024 Valve Corporation + * Copyright 2024 Alyssa Rosenzweig + * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc. + * SPDX-License-Identifier: MIT + */ +#include "hk_query_pool.h" + +#include "agx_compile.h" +#include "agx_pack.h" +#include "hk_buffer.h" +#include "hk_cmd_buffer.h" +#include "hk_device.h" +#include "hk_entrypoints.h" +#include "hk_event.h" +#include "hk_physical_device.h" +#include "hk_shader.h" + +#include "shader_enums.h" +#include "vk_common_entrypoints.h" +#include "vk_meta.h" +#include "vk_pipeline.h" + +#include "asahi/lib/agx_bo.h" +#include "asahi/lib/libagx_shaders.h" +#include "asahi/lib/shaders/query.h" +#include "compiler/nir/nir.h" +#include "compiler/nir/nir_builder.h" + +#include "util/os_time.h" +#include "vulkan/vulkan_core.h" + +struct hk_query_report { + /* TODO: do we want this to be legit u64? */ + uint32_t value; + uint32_t padding; +}; + +static uint16_t * +hk_pool_oq_index_ptr(const struct hk_query_pool *pool) +{ + return (uint16_t *)(pool->bo->ptr.cpu + pool->query_start); +} + +static uint32_t +hk_reports_per_query(struct hk_query_pool *pool) +{ + switch (pool->vk.query_type) { + case VK_QUERY_TYPE_OCCLUSION: + case VK_QUERY_TYPE_TIMESTAMP: + case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT: + return 1; + case VK_QUERY_TYPE_PIPELINE_STATISTICS: + return util_bitcount(pool->vk.pipeline_statistics); + case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: + // Primitives succeeded and primitives needed + return 2; + default: + unreachable("Unsupported query type"); + } +} + +VKAPI_ATTR VkResult VKAPI_CALL +hk_CreateQueryPool(VkDevice device, const VkQueryPoolCreateInfo *pCreateInfo, + const VkAllocationCallbacks *pAllocator, + VkQueryPool *pQueryPool) +{ + VK_FROM_HANDLE(hk_device, dev, device); + struct hk_query_pool *pool; + + bool occlusion = pCreateInfo->queryType == VK_QUERY_TYPE_OCCLUSION; + unsigned occlusion_queries = occlusion ? 
pCreateInfo->queryCount : 0; + + pool = + vk_query_pool_create(&dev->vk, pCreateInfo, pAllocator, sizeof(*pool)); + if (!pool) + return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY); + + /* We place the availability first and then data */ + pool->query_start = align(pool->vk.query_count * sizeof(uint32_t), + sizeof(struct hk_query_report)); + + uint32_t reports_per_query = hk_reports_per_query(pool); + pool->query_stride = reports_per_query * sizeof(struct hk_query_report); + + if (pool->vk.query_count > 0) { + uint32_t bo_size = pool->query_start; + + /* For occlusion queries, we stick the query index remapping here */ + if (occlusion_queries) + bo_size += sizeof(uint16_t) * pool->vk.query_count; + else + bo_size += pool->query_stride * pool->vk.query_count; + + pool->bo = + agx_bo_create(&dev->dev, bo_size, AGX_BO_WRITEBACK, "Query pool"); + if (!pool->bo) { + hk_DestroyQueryPool(device, hk_query_pool_to_handle(pool), pAllocator); + return vk_error(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY); + } + } + + uint16_t *oq_index = hk_pool_oq_index_ptr(pool); + + for (unsigned i = 0; i < occlusion_queries; ++i) { + uint64_t zero = 0; + unsigned index; + + VkResult result = hk_descriptor_table_add( + dev, &dev->occlusion_queries, &zero, sizeof(uint64_t), &index); + + if (result != VK_SUCCESS) { + hk_DestroyQueryPool(device, hk_query_pool_to_handle(pool), pAllocator); + return vk_error(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY); + } + + /* We increment as we go so we can clean up properly if we run out */ + assert(pool->oq_queries < occlusion_queries); + oq_index[pool->oq_queries++] = index; + } + + *pQueryPool = hk_query_pool_to_handle(pool); + + return VK_SUCCESS; +} + +VKAPI_ATTR void VKAPI_CALL +hk_DestroyQueryPool(VkDevice device, VkQueryPool queryPool, + const VkAllocationCallbacks *pAllocator) +{ + VK_FROM_HANDLE(hk_device, dev, device); + VK_FROM_HANDLE(hk_query_pool, pool, queryPool); + + if (!pool) + return; + + uint16_t *oq_index = hk_pool_oq_index_ptr(pool); + + for (unsigned i = 0; i < pool->oq_queries; ++i) { + hk_descriptor_table_remove(dev, &dev->occlusion_queries, oq_index[i]); + } + + agx_bo_unreference(pool->bo); + vk_query_pool_destroy(&dev->vk, pAllocator, &pool->vk); +} + +static uint64_t +hk_query_available_addr(struct hk_query_pool *pool, uint32_t query) +{ + assert(query < pool->vk.query_count); + return pool->bo->ptr.gpu + query * sizeof(uint32_t); +} + +static uint32_t * +hk_query_available_map(struct hk_query_pool *pool, uint32_t query) +{ + assert(query < pool->vk.query_count); + return (uint32_t *)pool->bo->ptr.cpu + query; +} + +static uint64_t +hk_query_offset(struct hk_query_pool *pool, uint32_t query) +{ + assert(query < pool->vk.query_count); + return pool->query_start + query * pool->query_stride; +} + +static uint64_t +hk_query_report_addr(struct hk_device *dev, struct hk_query_pool *pool, + uint32_t query) +{ + if (pool->oq_queries) { + uint16_t *oq_index = hk_pool_oq_index_ptr(pool); + return dev->occlusion_queries.bo->ptr.gpu + + (oq_index[query] * sizeof(uint64_t)); + } else { + return pool->bo->ptr.gpu + hk_query_offset(pool, query); + } +} + +static struct hk_query_report * +hk_query_report_map(struct hk_device *dev, struct hk_query_pool *pool, + uint32_t query) +{ + if (pool->oq_queries) { + uint64_t *queries = (uint64_t *)dev->occlusion_queries.bo->ptr.cpu; + uint16_t *oq_index = hk_pool_oq_index_ptr(pool); + + return (struct hk_query_report *)&queries[oq_index[query]]; + } else { + return (void *)((char *)pool->bo->ptr.cpu + hk_query_offset(pool, query)); + } +} + 
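/*
 * A rough, self-contained sketch added for illustration (not part of the
 * patch): how the pool BO created in hk_CreateQueryPool above is laid out
 * and addressed by the helpers that follow. The query_count and
 * reports_per_query values are invented for the example, and EXAMPLE_ALIGN
 * stands in for the align() helper used above; the sizes follow from the
 * 4-byte availability words and the 8-byte struct hk_query_report defined
 * earlier in this file. Occlusion pools differ: after query_start they
 * store a uint16_t remap table instead, and the 64-bit counters live in
 * the device-wide occlusion heap.
 */
#include <stdint.h>
#include <stdio.h>

/* Round up to a power-of-two boundary. */
#define EXAMPLE_ALIGN(x, a) (((x) + (a) - 1) & ~((a) - 1))

int
main(void)
{
   /* e.g. a pipeline statistics pool counting 3 statistics */
   uint32_t query_count = 128;
   uint32_t reports_per_query = 3;

   /* Availability words come first, padded to report alignment. */
   uint32_t query_start = EXAMPLE_ALIGN(query_count * 4, 8);
   uint32_t query_stride = reports_per_query * 8;
   uint32_t bo_size = query_start + query_stride * query_count;

   uint32_t q = 17;
   printf("availability word of query %u at offset %u\n", q, q * 4);
   printf("reports of query %u at offset %u (%u bytes)\n", q,
          query_start + q * query_stride, query_stride);
   printf("pool BO size: %u bytes\n", bo_size);
   return 0;
}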
+struct hk_write_params {
+   uint64_t address;
+   uint32_t value;
+};
+
+static void
+hk_nir_write_u32(nir_builder *b, UNUSED const void *key)
+{
+   nir_def *addr = nir_load_preamble(
+      b, 1, 64, .base = offsetof(struct hk_write_params, address) / 2);
+
+   nir_def *value = nir_load_preamble(
+      b, 1, 32, .base = offsetof(struct hk_write_params, value) / 2);
+
+   nir_store_global(b, addr, 4, value, nir_component_mask(1));
+}
+
+void
+hk_queue_write(struct hk_cmd_buffer *cmd, uint64_t address, uint32_t value,
+               bool after_gfx)
+{
+   struct hk_cs *cs = hk_cmd_buffer_get_cs_general(
+      cmd, after_gfx ? &cmd->current_cs.post_gfx : &cmd->current_cs.cs, true);
+   if (!cs)
+      return;
+
+   hk_ensure_cs_has_space(cmd, cs, 0x2000 /* TODO */);
+
+   /* As soon as we mark a query available, it needs to be visible
+    * system-wide, otherwise a CPU-side vkGetQueryPoolResults can read a
+    * stale value. As such, we flush the cache first and then let coherency
+    * work its magic. Without this barrier, we get flakes in
+    *
+    * dEQP-VK.query_pool.occlusion_query.get_results_conservative_size_64_wait_query_without_availability_draw_triangles_discard
+    */
+   struct hk_device *dev = hk_cmd_buffer_device(cmd);
+   hk_cdm_cache_flush(dev, cs);
+
+   struct hk_shader *s = hk_meta_kernel(dev, hk_nir_write_u32, NULL, 0);
+   struct hk_write_params params = {.address = address, .value = value};
+   uint32_t usc = hk_upload_usc_words_kernel(cmd, s, &params, sizeof(params));
+
+   hk_dispatch_with_usc(dev, cs, s, usc, hk_grid(1, 1, 1), hk_grid(1, 1, 1));
+}
+
+/**
+ * Goes through a series of consecutive query indices in the given pool,
+ * setting all report values to 0 and writing the requested availability.
+ */
+static void
+emit_zero_queries(struct hk_cmd_buffer *cmd, struct hk_query_pool *pool,
+                  uint32_t first_index, uint32_t num_queries,
+                  bool set_available)
+{
+   struct hk_device *dev = hk_cmd_buffer_device(cmd);
+
+   for (uint32_t i = 0; i < num_queries; i++) {
+      uint64_t available = hk_query_available_addr(pool, first_index + i);
+      uint64_t report = hk_query_report_addr(dev, pool, first_index + i);
+      hk_queue_write(cmd, available, set_available, false);
+
+      /* XXX: is this supposed to happen on the begin?
*/ + for (unsigned j = 0; j < hk_reports_per_query(pool); ++j) { + hk_queue_write(cmd, report + (j * sizeof(struct hk_query_report)), 0, + false); + } + } +} + +VKAPI_ATTR void VKAPI_CALL +hk_ResetQueryPool(VkDevice device, VkQueryPool queryPool, uint32_t firstQuery, + uint32_t queryCount) +{ + VK_FROM_HANDLE(hk_query_pool, pool, queryPool); + VK_FROM_HANDLE(hk_device, dev, device); + + uint32_t *available = hk_query_available_map(pool, firstQuery); + struct hk_query_report *reports = hk_query_report_map(dev, pool, firstQuery); + + memset(available, 0, queryCount * sizeof(*available)); + memset(reports, 0, queryCount * pool->query_stride); +} + +VKAPI_ATTR void VKAPI_CALL +hk_CmdResetQueryPool(VkCommandBuffer commandBuffer, VkQueryPool queryPool, + uint32_t firstQuery, uint32_t queryCount) +{ + VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer); + VK_FROM_HANDLE(hk_query_pool, pool, queryPool); + + emit_zero_queries(cmd, pool, firstQuery, queryCount, false); +} + +VKAPI_ATTR void VKAPI_CALL +hk_CmdWriteTimestamp2(VkCommandBuffer commandBuffer, + VkPipelineStageFlags2 stage, VkQueryPool queryPool, + uint32_t query) +{ + unreachable("todo"); +#if 0 + VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer); + VK_FROM_HANDLE(hk_query_pool, pool, queryPool); + + struct nv_push *p = hk_cmd_buffer_push(cmd, 10); + + uint64_t report_addr = hk_query_report_addr(pool, query); + P_MTHD(p, NV9097, SET_REPORT_SEMAPHORE_A); + P_NV9097_SET_REPORT_SEMAPHORE_A(p, report_addr >> 32); + P_NV9097_SET_REPORT_SEMAPHORE_B(p, report_addr); + P_NV9097_SET_REPORT_SEMAPHORE_C(p, 0); + P_NV9097_SET_REPORT_SEMAPHORE_D(p, { + .operation = OPERATION_REPORT_ONLY, + .pipeline_location = vk_stage_flags_to_nv9097_pipeline_location(stage), + .structure_size = STRUCTURE_SIZE_FOUR_WORDS, + }); + + uint64_t available_addr = hk_query_available_addr(pool, query); + P_MTHD(p, NV9097, SET_REPORT_SEMAPHORE_A); + P_NV9097_SET_REPORT_SEMAPHORE_A(p, available_addr >> 32); + P_NV9097_SET_REPORT_SEMAPHORE_B(p, available_addr); + P_NV9097_SET_REPORT_SEMAPHORE_C(p, 1); + P_NV9097_SET_REPORT_SEMAPHORE_D(p, { + .operation = OPERATION_RELEASE, + .release = RELEASE_AFTER_ALL_PRECEEDING_WRITES_COMPLETE, + .pipeline_location = PIPELINE_LOCATION_ALL, + .structure_size = STRUCTURE_SIZE_ONE_WORD, + }); + + /* From the Vulkan spec: + * + * "If vkCmdWriteTimestamp2 is called while executing a render pass + * instance that has multiview enabled, the timestamp uses N consecutive + * query indices in the query pool (starting at query) where N is the + * number of bits set in the view mask of the subpass the command is + * executed in. The resulting query values are determined by an + * implementation-dependent choice of one of the following behaviors:" + * + * In our case, only the first query is used, so we emit zeros for the + * remaining queries, as described in the first behavior listed in the + * Vulkan spec: + * + * "The first query is a timestamp value and (if more than one bit is set + * in the view mask) zero is written to the remaining queries." 
+ */ + if (cmd->state.gfx.render.view_mask != 0) { + const uint32_t num_queries = + util_bitcount(cmd->state.gfx.render.view_mask); + if (num_queries > 1) + emit_zero_queries(cmd, pool, query + 1, num_queries - 1, true); + } +#endif +} + +static void +hk_cmd_begin_end_query(struct hk_cmd_buffer *cmd, struct hk_query_pool *pool, + uint32_t query, uint32_t index, + VkQueryControlFlags flags, bool end) +{ + struct hk_device *dev = hk_cmd_buffer_device(cmd); + bool graphics = false; + + switch (pool->vk.query_type) { + case VK_QUERY_TYPE_OCCLUSION: { + assert(query < pool->oq_queries); + + if (end) { + cmd->state.gfx.occlusion.mode = AGX_VISIBILITY_MODE_NONE; + } else { + cmd->state.gfx.occlusion.mode = flags & VK_QUERY_CONTROL_PRECISE_BIT + ? AGX_VISIBILITY_MODE_COUNTING + : AGX_VISIBILITY_MODE_BOOLEAN; + } + + uint16_t *oq_index = hk_pool_oq_index_ptr(pool); + cmd->state.gfx.occlusion.index = oq_index[query]; + cmd->state.gfx.dirty |= HK_DIRTY_OCCLUSION; + break; + } + + case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: { + uint64_t addr = hk_query_report_addr(dev, pool, query); + cmd->state.gfx.xfb_query[index] = end ? 0 : addr; + break; + } + + case VK_QUERY_TYPE_PIPELINE_STATISTICS: { + struct hk_root_descriptor_table *root = &cmd->state.gfx.descriptors.root; + cmd->state.gfx.descriptors.root_dirty = true; + + root->draw.pipeline_stats = hk_query_report_addr(dev, pool, query); + root->draw.pipeline_stats_flags = pool->vk.pipeline_statistics; + + /* XXX: I don't think is correct... when does the query become available + * exactly? + */ + graphics = pool->vk.pipeline_statistics & + ~VK_QUERY_PIPELINE_STATISTIC_COMPUTE_SHADER_INVOCATIONS_BIT; + break; + } + + default: + unreachable("Unsupported query type"); + } + + /* We need to set available=1 after the graphics work finishes. */ + if (end) { + hk_queue_write(cmd, hk_query_available_addr(pool, query), 1, graphics); + } +} + +VKAPI_ATTR void VKAPI_CALL +hk_CmdBeginQueryIndexedEXT(VkCommandBuffer commandBuffer, VkQueryPool queryPool, + uint32_t query, VkQueryControlFlags flags, + uint32_t index) +{ + VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer); + VK_FROM_HANDLE(hk_query_pool, pool, queryPool); + + hk_cmd_begin_end_query(cmd, pool, query, index, flags, false); +} + +VKAPI_ATTR void VKAPI_CALL +hk_CmdEndQueryIndexedEXT(VkCommandBuffer commandBuffer, VkQueryPool queryPool, + uint32_t query, uint32_t index) +{ + VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer); + VK_FROM_HANDLE(hk_query_pool, pool, queryPool); + + hk_cmd_begin_end_query(cmd, pool, query, index, 0, true); + + /* From the Vulkan spec: + * + * "If queries are used while executing a render pass instance that has + * multiview enabled, the query uses N consecutive query indices in + * the query pool (starting at query) where N is the number of bits set + * in the view mask in the subpass the query is used in. How the + * numerical results of the query are distributed among the queries is + * implementation-dependent." + * + * In our case, only the first query is used, so we emit zeros for the + * remaining queries. 
+ */ + if (cmd->state.gfx.render.view_mask != 0) { + const uint32_t num_queries = + util_bitcount(cmd->state.gfx.render.view_mask); + if (num_queries > 1) + emit_zero_queries(cmd, pool, query + 1, num_queries - 1, true); + } +} + +static bool +hk_query_is_available(struct hk_query_pool *pool, uint32_t query) +{ + uint32_t *available = hk_query_available_map(pool, query); + return p_atomic_read(available) != 0; +} + +#define HK_QUERY_TIMEOUT 2000000000ull + +static VkResult +hk_query_wait_for_available(struct hk_device *dev, struct hk_query_pool *pool, + uint32_t query) +{ + uint64_t abs_timeout_ns = os_time_get_absolute_timeout(HK_QUERY_TIMEOUT); + + while (os_time_get_nano() < abs_timeout_ns) { + if (hk_query_is_available(pool, query)) + return VK_SUCCESS; + + VkResult status = vk_device_check_status(&dev->vk); + if (status != VK_SUCCESS) + return status; + } + + return vk_device_set_lost(&dev->vk, "query timeout"); +} + +static void +cpu_write_query_result(void *dst, uint32_t idx, VkQueryResultFlags flags, + uint64_t result) +{ + if (flags & VK_QUERY_RESULT_64_BIT) { + uint64_t *dst64 = dst; + dst64[idx] = result; + } else { + uint32_t *dst32 = dst; + dst32[idx] = result; + } +} + +VKAPI_ATTR VkResult VKAPI_CALL +hk_GetQueryPoolResults(VkDevice device, VkQueryPool queryPool, + uint32_t firstQuery, uint32_t queryCount, + size_t dataSize, void *pData, VkDeviceSize stride, + VkQueryResultFlags flags) +{ + VK_FROM_HANDLE(hk_device, dev, device); + VK_FROM_HANDLE(hk_query_pool, pool, queryPool); + + if (vk_device_is_lost(&dev->vk)) + return VK_ERROR_DEVICE_LOST; + + VkResult status = VK_SUCCESS; + for (uint32_t i = 0; i < queryCount; i++) { + const uint32_t query = firstQuery + i; + + bool available = hk_query_is_available(pool, query); + + if (!available && (flags & VK_QUERY_RESULT_WAIT_BIT)) { + status = hk_query_wait_for_available(dev, pool, query); + if (status != VK_SUCCESS) + return status; + + available = true; + } + + bool write_results = available || (flags & VK_QUERY_RESULT_PARTIAL_BIT); + + const struct hk_query_report *src = hk_query_report_map(dev, pool, query); + assert(i * stride < dataSize); + void *dst = (char *)pData + i * stride; + + uint32_t reports = hk_reports_per_query(pool); + if (write_results) { + for (uint32_t j = 0; j < reports; j++) { + cpu_write_query_result(dst, j, flags, src[j].value); + } + } + + if (!write_results) + status = VK_NOT_READY; + + if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) + cpu_write_query_result(dst, reports, flags, available); + } + + return status; +} + +static void +hk_nir_copy_query(nir_builder *b, UNUSED const void *key) +{ + nir_def *id = nir_channel(b, nir_load_workgroup_id(b), 0); + libagx_copy_query(b, nir_load_preamble(b, 1, 64), id); +} + +VKAPI_ATTR void VKAPI_CALL +hk_CmdCopyQueryPoolResults(VkCommandBuffer commandBuffer, VkQueryPool queryPool, + uint32_t firstQuery, uint32_t queryCount, + VkBuffer dstBuffer, VkDeviceSize dstOffset, + VkDeviceSize stride, VkQueryResultFlags flags) +{ + VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer); + VK_FROM_HANDLE(hk_query_pool, pool, queryPool); + VK_FROM_HANDLE(hk_buffer, dst_buffer, dstBuffer); + + struct hk_device *dev = hk_cmd_buffer_device(cmd); + struct hk_cs *cs = hk_cmd_buffer_get_cs(cmd, true); + if (!cs) + return; + + hk_ensure_cs_has_space(cmd, cs, 0x2000 /* TODO */); + + const struct libagx_copy_query_push info = { + .availability = pool->bo->ptr.gpu, + .results = pool->oq_queries ? 
dev->occlusion_queries.bo->ptr.gpu + : pool->bo->ptr.gpu + pool->query_start, + .oq_index = pool->oq_queries ? pool->bo->ptr.gpu + pool->query_start : 0, + + .first_query = firstQuery, + .dst_addr = hk_buffer_address(dst_buffer, dstOffset), + .dst_stride = stride, + .reports_per_query = hk_reports_per_query(pool), + + .partial = flags & VK_QUERY_RESULT_PARTIAL_BIT, + ._64 = flags & VK_QUERY_RESULT_64_BIT, + .with_availability = flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT, + }; + + uint64_t push = hk_pool_upload(cmd, &info, sizeof(info), 8); + + struct hk_shader *s = hk_meta_kernel(dev, hk_nir_copy_query, NULL, 0); + uint32_t usc = hk_upload_usc_words_kernel(cmd, s, &push, sizeof(push)); + hk_dispatch_with_usc(dev, cs, s, usc, hk_grid(queryCount, 1, 1), + hk_grid(1, 1, 1)); +} diff --git a/src/asahi/vulkan/hk_query_pool.h b/src/asahi/vulkan/hk_query_pool.h new file mode 100644 index 00000000000..9e235dfed08 --- /dev/null +++ b/src/asahi/vulkan/hk_query_pool.h @@ -0,0 +1,28 @@ +/* + * Copyright 2024 Valve Corporation + * Copyright 2024 Alyssa Rosenzweig + * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc. + * SPDX-License-Identifier: MIT + */ + +#pragma once + +#include "hk_private.h" +#include "vk_query_pool.h" + +struct agx_bo; + +struct hk_query_pool { + struct vk_query_pool vk; + + uint32_t query_start; + uint32_t query_stride; + + struct agx_bo *bo; + void *bo_map; + + unsigned oq_queries; +}; + +VK_DEFINE_NONDISP_HANDLE_CASTS(hk_query_pool, vk.base, VkQueryPool, + VK_OBJECT_TYPE_QUERY_POOL) diff --git a/src/asahi/vulkan/hk_queue.c b/src/asahi/vulkan/hk_queue.c new file mode 100644 index 00000000000..7cc1c8be139 --- /dev/null +++ b/src/asahi/vulkan/hk_queue.c @@ -0,0 +1,599 @@ +/* + * Copyright 2024 Valve Corporation + * Copyright 2024 Alyssa Rosenzweig + * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc. + * SPDX-License-Identifier: MIT + */ +#include "hk_queue.h" + +#include "agx_bo.h" +#include "agx_device.h" +#include "agx_pack.h" +#include "decode.h" +#include "hk_cmd_buffer.h" +#include "hk_device.h" +#include "hk_physical_device.h" + +#include <xf86drm.h> +#include "asahi/lib/unstable_asahi_drm.h" +#include "util/list.h" +#include "vulkan/vulkan_core.h" + +#include "vk_drm_syncobj.h" +#include "vk_sync.h" + +/* + * We need to specially handle submits with no control streams. The kernel + * can't accept empty submits, but we can end up here in Vulkan for + * synchronization purposes only. Rather than submit a no-op job (slow), + * we simply tie the fences together. + */ +static VkResult +queue_submit_empty(struct hk_device *dev, struct hk_queue *queue, + struct vk_queue_submit *submit) +{ + int fd = dev->dev.fd; + + /* Transfer the waits into the queue timeline. */ + for (unsigned i = 0; i < submit->wait_count; ++i) { + struct vk_sync_wait *wait = &submit->waits[i]; + + assert(vk_sync_type_is_drm_syncobj(wait->sync->type)); + const struct vk_drm_syncobj *syncobj = vk_sync_as_drm_syncobj(wait->sync); + + drmSyncobjTransfer(fd, queue->drm.syncobj, ++queue->drm.timeline_value, + syncobj->syncobj, wait->wait_value, 0); + } + + /* Transfer the queue timeline into each out fence. They will all be + * signalled when we reach this point.
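+ * + * (drmSyncobjTransfer copies the fence from the source timeline point to the destination point, so the dependency is expressed without submitting any GPU work.)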
+ */ + for (unsigned i = 0; i < submit->signal_count; ++i) { + struct vk_sync_signal *signal = &submit->signals[i]; + + assert(vk_sync_type_is_drm_syncobj(signal->sync->type)); + const struct vk_drm_syncobj *syncobj = + vk_sync_as_drm_syncobj(signal->sync); + + drmSyncobjTransfer(fd, syncobj->syncobj, signal->signal_value, + queue->drm.syncobj, queue->drm.timeline_value, 0); + } + + return VK_SUCCESS; +} + +static void +asahi_fill_cdm_command(struct hk_device *dev, struct hk_cs *cs, + struct drm_asahi_cmd_compute *cmd) +{ + size_t len = cs->stream_linked ? 65536 /* XXX */ : (cs->current - cs->start); + + *cmd = (struct drm_asahi_cmd_compute){ + .encoder_ptr = cs->addr, + .encoder_end = cs->addr + len, + + .sampler_array = dev->samplers.table.bo->ptr.gpu, + .sampler_count = dev->samplers.table.alloc, + .sampler_max = dev->samplers.table.alloc + 1, + + .encoder_id = agx_get_global_id(&dev->dev), + .cmd_id = agx_get_global_id(&dev->dev), + .unk_mask = 0xffffffff, + }; + + if (cs->scratch.cs.main || cs->scratch.cs.preamble) { + cmd->helper_arg = dev->scratch.cs.buf->ptr.gpu; + cmd->helper_cfg = cs->scratch.cs.preamble << 16; + cmd->helper_program = dev->dev.helper->ptr.gpu | 1; + } +} + +static void +asahi_fill_vdm_command(struct hk_device *dev, struct hk_cs *cs, + struct drm_asahi_cmd_render *c) +{ +#if 0 + bool clear_pipeline_textures = + agx_tilebuffer_spills(&batch->tilebuffer_layout); + + for (unsigned i = 0; i < batch->key.nr_cbufs; ++i) { + struct pipe_surface *surf = batch->key.cbufs[i]; + + clear_pipeline_textures |= + surf && surf->texture && !(batch->clear & (PIPE_CLEAR_COLOR0 << i)); + } + +#endif + unsigned cmd_ta_id = agx_get_global_id(&dev->dev); + unsigned cmd_3d_id = agx_get_global_id(&dev->dev); + unsigned encoder_id = agx_get_global_id(&dev->dev); + + memset(c, 0, sizeof(*c)); + + c->encoder_ptr = cs->addr; + c->encoder_id = encoder_id; + c->cmd_3d_id = cmd_3d_id; + c->cmd_ta_id = cmd_ta_id; + c->ppp_ctrl = 0x202; + + c->fb_width = cs->cr.width; + c->fb_height = cs->cr.height; + + c->isp_bgobjdepth = cs->cr.isp_bgobjdepth; + c->isp_bgobjvals = cs->cr.isp_bgobjvals; + + static_assert(sizeof(c->zls_ctrl) == sizeof(cs->cr.zls_control)); + memcpy(&c->zls_ctrl, &cs->cr.zls_control, sizeof(cs->cr.zls_control)); + + c->depth_dimensions = (cs->cr.width - 1) | ((cs->cr.height - 1) << 15); + + c->depth_buffer_load = cs->cr.depth.buffer; + c->depth_buffer_store = cs->cr.depth.buffer; + c->depth_buffer_partial = cs->cr.depth.buffer; + + c->depth_buffer_load_stride = cs->cr.depth.stride; + c->depth_buffer_store_stride = cs->cr.depth.stride; + c->depth_buffer_partial_stride = cs->cr.depth.stride; + + c->depth_meta_buffer_load = cs->cr.depth.meta; + c->depth_meta_buffer_store = cs->cr.depth.meta; + c->depth_meta_buffer_partial = cs->cr.depth.meta; + + c->depth_meta_buffer_load_stride = cs->cr.depth.stride; + c->depth_meta_buffer_store_stride = cs->cr.depth.meta_stride; + c->depth_meta_buffer_partial_stride = cs->cr.depth.meta_stride; + + c->stencil_buffer_load = cs->cr.stencil.buffer; + c->stencil_buffer_store = cs->cr.stencil.buffer; + c->stencil_buffer_partial = cs->cr.stencil.buffer; + + c->stencil_buffer_load_stride = cs->cr.stencil.stride; + c->stencil_buffer_store_stride = cs->cr.stencil.stride; + c->stencil_buffer_partial_stride = cs->cr.stencil.stride; + + c->stencil_meta_buffer_load = cs->cr.stencil.meta; + c->stencil_meta_buffer_store = cs->cr.stencil.meta; + c->stencil_meta_buffer_partial = cs->cr.stencil.meta; + + c->stencil_meta_buffer_load_stride = cs->cr.stencil.stride; + 
c->stencil_meta_buffer_store_stride = cs->cr.stencil.meta_stride; + c->stencil_meta_buffer_partial_stride = cs->cr.stencil.meta_stride; + + c->iogpu_unk_214 = cs->cr.iogpu_unk_214; + +#if 0 + if (clear_pipeline_textures) + c->flags |= ASAHI_RENDER_SET_WHEN_RELOADING_Z_OR_S; + else + c->flags |= ASAHI_RENDER_NO_CLEAR_PIPELINE_TEXTURES; + + if (zres && !(batch->clear & PIPE_CLEAR_DEPTH)) + c->flags |= ASAHI_RENDER_SET_WHEN_RELOADING_Z_OR_S; + + if (sres && !(batch->clear & PIPE_CLEAR_STENCIL)) + c->flags |= ASAHI_RENDER_SET_WHEN_RELOADING_Z_OR_S; +#endif + + if (dev->dev.debug & AGX_DBG_NOCLUSTER) + c->flags |= ASAHI_RENDER_NO_VERTEX_CLUSTERING; + +#if 0 + /* XXX is this for just MSAA+Z+S or MSAA+(Z|S)? */ + if (tib->nr_samples > 1 && framebuffer->zsbuf) + c->flags |= ASAHI_RENDER_MSAA_ZS; +#endif + + c->utile_width = cs->tib.tile_size.width; + c->utile_height = cs->tib.tile_size.height; + + /* Can be 0 for attachmentless rendering with no draws */ + c->samples = MAX2(cs->tib.nr_samples, 1); + c->layers = cs->cr.layers; + + c->ppp_multisamplectl = cs->ppp_multisamplectl; + c->sample_size = cs->tib.sample_size_B; + + /* XXX OR 0x80 with eMRT? */ + c->tib_blocks = ALIGN_POT(agx_tilebuffer_total_size(&cs->tib), 2048) / 2048; + + float tan_60 = 1.732051f; + c->merge_upper_x = fui(tan_60 / cs->cr.width); + c->merge_upper_y = fui(tan_60 / cs->cr.height); + + c->load_pipeline = cs->cr.bg.main.usc | 4; + c->store_pipeline = cs->cr.eot.main.usc | 4; + c->partial_reload_pipeline = cs->cr.bg.partial.usc | 4; + c->partial_store_pipeline = cs->cr.eot.partial.usc | 4; + + memcpy(&c->load_pipeline_bind, &cs->cr.bg.main.counts, + sizeof(struct agx_counts_packed)); + + memcpy(&c->store_pipeline_bind, &cs->cr.eot.main.counts, + sizeof(struct agx_counts_packed)); + + memcpy(&c->partial_reload_pipeline_bind, &cs->cr.bg.partial.counts, + sizeof(struct agx_counts_packed)); + + memcpy(&c->partial_store_pipeline_bind, &cs->cr.eot.partial.counts, + sizeof(struct agx_counts_packed)); + + c->scissor_array = cs->uploaded_scissor; + c->depth_bias_array = cs->uploaded_zbias; + + c->vertex_sampler_array = dev->samplers.table.bo->ptr.gpu; + c->vertex_sampler_count = dev->samplers.table.alloc; + c->vertex_sampler_max = dev->samplers.table.alloc + 1; + + c->fragment_sampler_array = c->vertex_sampler_array; + c->fragment_sampler_count = c->vertex_sampler_count; + c->fragment_sampler_max = c->vertex_sampler_max; + + c->visibility_result_buffer = dev->occlusion_queries.bo->ptr.gpu; + + /* If a tile is empty, we do not want to process it, as the redundant + * roundtrip of memory-->tilebuffer-->memory wastes a tremendous amount of + * memory bandwidth. Any draw marks a tile as non-empty, so we only need to + * process empty tiles if the background+EOT programs have a side effect. + * This is the case exactly when there is an attachment we are clearing (some + * attachment A in clear and in resolve <==> non-empty intersection). + * + * This case matters a LOT for performance in workloads that split batches. 
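+ * + * (For now the flag is simply set unconditionally; computing the clear/resolve intersection is the TODO below.)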
+ */ + if (true /* TODO */) + c->flags |= ASAHI_RENDER_PROCESS_EMPTY_TILES; + + if (cs->scratch.vs.main || cs->scratch.vs.preamble) { + c->flags |= ASAHI_RENDER_VERTEX_SPILLS; + c->vertex_helper_arg = dev->scratch.vs.buf->ptr.gpu; + c->vertex_helper_cfg = cs->scratch.vs.preamble << 16; + c->vertex_helper_program = dev->dev.helper->ptr.gpu | 1; + } + + if (cs->scratch.fs.main || cs->scratch.fs.preamble) { + c->fragment_helper_arg = dev->scratch.fs.buf->ptr.gpu; + c->fragment_helper_cfg = cs->scratch.fs.preamble << 16; + c->fragment_helper_program = dev->dev.helper->ptr.gpu | 1; + } +} + +static void +asahi_fill_sync(struct drm_asahi_sync *sync, struct vk_sync *vk_sync, + uint64_t value) +{ + if (unlikely(!vk_sync_type_is_drm_syncobj(vk_sync->type))) { + unreachable("Unsupported sync type"); + return; + } + + const struct vk_drm_syncobj *syncobj = vk_sync_as_drm_syncobj(vk_sync); + *sync = (struct drm_asahi_sync){.handle = syncobj->syncobj}; + + if (vk_sync->flags & VK_SYNC_IS_TIMELINE) { + sync->sync_type = DRM_ASAHI_SYNC_TIMELINE_SYNCOBJ; + sync->timeline_value = value; + } else { + sync->sync_type = DRM_ASAHI_SYNC_SYNCOBJ; + } +} + +union drm_asahi_cmd { + struct drm_asahi_cmd_compute compute; + struct drm_asahi_cmd_render render; +}; + +/* TODO: I think it's 64. Can we query from the kernel? */ +#define MAX_COMMANDS_PER_SUBMIT (16) + +static VkResult +queue_submit_single(struct agx_device *dev, struct drm_asahi_submit *submit) +{ + int ret = dev->ops.submit(dev, submit, 0); + + /* XXX: don't trap */ + if (ret) { + fprintf(stderr, "DRM_IOCTL_ASAHI_SUBMIT failed: %m\n"); + assert(0); + } + + return VK_SUCCESS; +} + +/* + * The kernel/firmware jointly impose a limit on commands per submit ioctl, but + * we can build up arbitrarily large command buffers. We handle this here by + * looping the ioctl, submitting slices of the command buffers that are within + * bounds. + */ +static VkResult +queue_submit_looped(struct agx_device *dev, struct drm_asahi_submit *submit) +{ + struct drm_asahi_command *cmds = (void *)submit->commands; + unsigned commands_remaining = submit->command_count; + unsigned submitted_vdm = 0, submitted_cdm = 0; + + while (commands_remaining) { + bool first = commands_remaining == submit->command_count; + bool last = commands_remaining <= MAX_COMMANDS_PER_SUBMIT; + + unsigned count = MIN2(commands_remaining, MAX_COMMANDS_PER_SUBMIT); + commands_remaining -= count; + + assert(!last || commands_remaining == 0); + assert(count > 0); + + /* We need to fix up the barriers since barriers are ioctl-relative */ + for (unsigned i = 0; i < count; ++i) { + assert(cmds[i].barriers[0] >= submitted_vdm); + assert(cmds[i].barriers[1] >= submitted_cdm); + + cmds[i].barriers[0] -= submitted_vdm; + cmds[i].barriers[1] -= submitted_cdm; + } + + /* We can't signal the out-syncobjs until all prior work finishes. Since + * only the last ioctl will signal, make sure it waits on prior ioctls. + * + * TODO: there might be a more performant way to do this. + */ + if (last && !first) { + if (cmds[0].barriers[0] == DRM_ASAHI_BARRIER_NONE) + cmds[0].barriers[0] = 0; + + if (cmds[0].barriers[1] == DRM_ASAHI_BARRIER_NONE) + cmds[0].barriers[1] = 0; + } + + struct drm_asahi_submit submit_ioctl = { + .flags = submit->flags, + .queue_id = submit->queue_id, + .result_handle = submit->result_handle, + .commands = (uint64_t)(uintptr_t)(cmds), + .command_count = count, + .in_syncs = first ? submit->in_syncs : 0, + .in_sync_count = first ? submit->in_sync_count : 0, + .out_syncs = last ? 
submit->out_syncs : 0, + .out_sync_count = last ? submit->out_sync_count : 0, + }; + + VkResult result = queue_submit_single(dev, &submit_ioctl); + if (result != VK_SUCCESS) + return result; + + for (unsigned i = 0; i < count; ++i) { + if (cmds[i].cmd_type == DRM_ASAHI_CMD_COMPUTE) + submitted_cdm++; + else if (cmds[i].cmd_type == DRM_ASAHI_CMD_RENDER) + submitted_vdm++; + else + unreachable("unknown subqueue"); + } + + cmds += count; + } + + return VK_SUCCESS; +} + +static VkResult +queue_submit(struct hk_device *dev, struct hk_queue *queue, + struct vk_queue_submit *submit) +{ + unsigned command_count = 0; + + /* Gather the number of individual commands to submit up front */ + for (unsigned i = 0; i < submit->command_buffer_count; ++i) { + struct hk_cmd_buffer *cmdbuf = + (struct hk_cmd_buffer *)submit->command_buffers[i]; + + command_count += list_length(&cmdbuf->control_streams); + } + + if (command_count == 0) + return queue_submit_empty(dev, queue, submit); + + unsigned wait_count = 0; + struct drm_asahi_sync *waits = + alloca(submit->wait_count * sizeof(struct drm_asahi_sync)); + + struct drm_asahi_sync *signals = + alloca((submit->signal_count + 1) * sizeof(struct drm_asahi_sync)); + + for (unsigned i = 0; i < submit->wait_count; ++i) { + /* The kernel rejects the submission if we try to wait on the same + * timeline semaphore at multiple points. + * + * TODO: Can we relax the UAPI? + * + * XXX: This is quadratic time. + */ + bool skip = false; + if (submit->waits[i].sync->flags & VK_SYNC_IS_TIMELINE) { + uint32_t v1 = submit->waits[i].wait_value; + for (unsigned j = 0; j < submit->wait_count; ++j) { + uint32_t v2 = submit->waits[j].wait_value; + if (i != j && submit->waits[i].sync == submit->waits[j].sync && + (v1 < v2 || (v1 == v2 && i < j))) { + skip = true; + break; + } + } + + if (skip) + continue; + } + + asahi_fill_sync(&waits[wait_count++], submit->waits[i].sync, + submit->waits[i].wait_value); + } + + for (unsigned i = 0; i < submit->signal_count; ++i) { + asahi_fill_sync(&signals[i], submit->signals[i].sync, + submit->signals[i].signal_value); + } + + /* Signal progress on the queue itself */ + signals[submit->signal_count] = (struct drm_asahi_sync){ + .sync_type = DRM_ASAHI_SYNC_TIMELINE_SYNCOBJ, + .handle = queue->drm.syncobj, + .timeline_value = ++queue->drm.timeline_value, + }; + + /* Now setup the command structs */ + struct drm_asahi_command *cmds = alloca(sizeof(*cmds) * command_count); + union drm_asahi_cmd *cmds_inner = + alloca(sizeof(*cmds_inner) * command_count); + + unsigned cmd_it = 0; + unsigned nr_vdm = 0, nr_cdm = 0; + + for (unsigned i = 0; i < submit->command_buffer_count; ++i) { + struct hk_cmd_buffer *cmdbuf = + (struct hk_cmd_buffer *)submit->command_buffers[i]; + + list_for_each_entry(struct hk_cs, cs, &cmdbuf->control_streams, node) { + assert(cmd_it < command_count); + + struct drm_asahi_command cmd = { + .cmd_buffer = (uint64_t)(uintptr_t)&cmds_inner[cmd_it], + .result_offset = 0 /* TODO */, + .result_size = 0 /* TODO */, + /* Barrier on previous command */ + .barriers = {nr_vdm, nr_cdm}, + }; + + if (cs->type == HK_CS_CDM) { + cmd.cmd_type = DRM_ASAHI_CMD_COMPUTE; + cmd.cmd_buffer_size = sizeof(struct drm_asahi_cmd_compute); + nr_cdm++; + + asahi_fill_cdm_command(dev, cs, &cmds_inner[cmd_it].compute); + } else { + assert(cs->type == HK_CS_VDM); + cmd.cmd_type = DRM_ASAHI_CMD_RENDER; + cmd.cmd_buffer_size = sizeof(struct drm_asahi_cmd_render); + nr_vdm++; + + asahi_fill_vdm_command(dev, cs, &cmds_inner[cmd_it].render); + } + + cmds[cmd_it++] = 
cmd; + } + } + + assert(cmd_it == command_count); + + if (dev->dev.debug & AGX_DBG_TRACE) { + for (unsigned i = 0; i < command_count; ++i) { + if (cmds[i].cmd_type == DRM_ASAHI_CMD_COMPUTE) { + agxdecode_drm_cmd_compute(dev->dev.agxdecode, &dev->dev.params, + &cmds_inner[i].compute, true); + } else { + assert(cmds[i].cmd_type == DRM_ASAHI_CMD_RENDER); + agxdecode_drm_cmd_render(dev->dev.agxdecode, &dev->dev.params, + &cmds_inner[i].render, true); + } + } + + agxdecode_image_heap(dev->dev.agxdecode, dev->images.bo->ptr.gpu, + dev->images.alloc); + + agxdecode_next_frame(); + } + + struct drm_asahi_submit submit_ioctl = { + .flags = 0, + .queue_id = queue->drm.id, + .result_handle = 0 /* TODO */, + .in_sync_count = wait_count, + .out_sync_count = submit->signal_count + 1, + .command_count = command_count, + .in_syncs = (uint64_t)(uintptr_t)(waits), + .out_syncs = (uint64_t)(uintptr_t)(signals), + .commands = (uint64_t)(uintptr_t)(cmds), + }; + + if (command_count <= MAX_COMMANDS_PER_SUBMIT) + return queue_submit_single(&dev->dev, &submit_ioctl); + else + return queue_submit_looped(&dev->dev, &submit_ioctl); +} + +static VkResult +hk_queue_submit(struct vk_queue *vk_queue, struct vk_queue_submit *submit) +{ + struct hk_queue *queue = container_of(vk_queue, struct hk_queue, vk); + struct hk_device *dev = hk_queue_device(queue); + + if (vk_queue_is_lost(&queue->vk)) + return VK_ERROR_DEVICE_LOST; + + VkResult result = queue_submit(dev, queue, submit); + if (result != VK_SUCCESS) + return vk_queue_set_lost(&queue->vk, "Submit failed"); + + return VK_SUCCESS; +} + +VkResult +hk_queue_init(struct hk_device *dev, struct hk_queue *queue, + const VkDeviceQueueCreateInfo *pCreateInfo, + uint32_t index_in_family) +{ + struct hk_physical_device *pdev = hk_device_physical(dev); + VkResult result; + + assert(pCreateInfo->queueFamilyIndex < pdev->queue_family_count); + + const VkDeviceQueueGlobalPriorityCreateInfoKHR *priority_info = + vk_find_struct_const(pCreateInfo->pNext, + DEVICE_QUEUE_GLOBAL_PRIORITY_CREATE_INFO_KHR); + const enum VkQueueGlobalPriorityKHR global_priority = + priority_info ? 
priority_info->globalPriority + : VK_QUEUE_GLOBAL_PRIORITY_MEDIUM_KHR; + + if (global_priority != VK_QUEUE_GLOBAL_PRIORITY_MEDIUM_KHR) { + return VK_ERROR_INITIALIZATION_FAILED; + } + + result = vk_queue_init(&queue->vk, &dev->vk, pCreateInfo, index_in_family); + if (result != VK_SUCCESS) + return result; + + queue->vk.driver_submit = hk_queue_submit; + + queue->drm.id = agx_create_command_queue(&dev->dev, + DRM_ASAHI_QUEUE_CAP_RENDER | + DRM_ASAHI_QUEUE_CAP_BLIT | + DRM_ASAHI_QUEUE_CAP_COMPUTE, + 2); + + if (drmSyncobjCreate(dev->dev.fd, 0, &queue->drm.syncobj)) { + mesa_loge("drmSyncobjCreate() failed %d\n", errno); + agx_destroy_command_queue(&dev->dev, queue->drm.id); + vk_queue_finish(&queue->vk); + + return vk_errorf(dev, VK_ERROR_OUT_OF_HOST_MEMORY, + "DRM_IOCTL_SYNCOBJ_CREATE failed: %m"); + } + + uint64_t initial_value = 1; + if (drmSyncobjTimelineSignal(dev->dev.fd, &queue->drm.syncobj, + &initial_value, 1)) { + hk_queue_finish(dev, queue); + return vk_errorf(dev, VK_ERROR_OUT_OF_HOST_MEMORY, + "DRM_IOCTL_TIMELINE_SYNCOBJ_SIGNAL failed: %m"); + } + + return VK_SUCCESS; +} + +void +hk_queue_finish(struct hk_device *dev, struct hk_queue *queue) +{ + drmSyncobjDestroy(dev->dev.fd, queue->drm.syncobj); + agx_destroy_command_queue(&dev->dev, queue->drm.id); + vk_queue_finish(&queue->vk); +} diff --git a/src/asahi/vulkan/hk_queue.h b/src/asahi/vulkan/hk_queue.h new file mode 100644 index 00000000000..42e446ba430 --- /dev/null +++ b/src/asahi/vulkan/hk_queue.h @@ -0,0 +1,42 @@ +/* + * Copyright 2024 Valve Corporation + * Copyright 2024 Alyssa Rosenzweig + * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc. + * SPDX-License-Identifier: MIT + */ + +#pragma once + +#include "hk_private.h" +#include "vk_queue.h" + +struct hk_device; + +struct hk_queue { + struct vk_queue vk; + + struct { + /* Asahi kernel queue ID */ + uint32_t id; + + /* Timeline syncobj backing the queue */ + uint32_t syncobj; + + /* Current maximum timeline value for the queue's syncobj. If the + * syncobj's value equals timeline_value, then all work is complete. + */ + uint32_t timeline_value; + } drm; +}; + +static inline struct hk_device * +hk_queue_device(struct hk_queue *queue) +{ + return (struct hk_device *)queue->vk.base.device; +} + +VkResult hk_queue_init(struct hk_device *dev, struct hk_queue *queue, + const VkDeviceQueueCreateInfo *pCreateInfo, + uint32_t index_in_family); + +void hk_queue_finish(struct hk_device *dev, struct hk_queue *queue); diff --git a/src/asahi/vulkan/hk_sampler.c b/src/asahi/vulkan/hk_sampler.c new file mode 100644 index 00000000000..7e936b0cb04 --- /dev/null +++ b/src/asahi/vulkan/hk_sampler.c @@ -0,0 +1,281 @@ +/* + * Copyright 2024 Valve Corporation + * Copyright 2024 Alyssa Rosenzweig + * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc. 
+ * SPDX-License-Identifier: MIT + */ +#include "hk_sampler.h" + +#include "hk_device.h" +#include "hk_entrypoints.h" +#include "hk_physical_device.h" + +#include "vk_enum_to_str.h" +#include "vk_format.h" +#include "vk_sampler.h" + +#include "asahi/genxml/agx_pack.h" + +static inline uint32_t +translate_address_mode(VkSamplerAddressMode addr_mode) +{ +#define MODE(VK, AGX_) [VK_SAMPLER_ADDRESS_MODE_##VK] = AGX_WRAP_##AGX_ + static const uint8_t translate[] = { + MODE(REPEAT, REPEAT), + MODE(MIRRORED_REPEAT, MIRRORED_REPEAT), + MODE(CLAMP_TO_EDGE, CLAMP_TO_EDGE), + MODE(CLAMP_TO_BORDER, CLAMP_TO_BORDER), + MODE(MIRROR_CLAMP_TO_EDGE, MIRRORED_CLAMP_TO_EDGE), + }; +#undef MODE + + assert(addr_mode < ARRAY_SIZE(translate)); + return translate[addr_mode]; +} + +static uint32_t +translate_texsamp_compare_op(VkCompareOp op) +{ +#define OP(VK, AGX_) [VK_COMPARE_OP_##VK] = AGX_COMPARE_FUNC_##AGX_ + static const uint8_t translate[] = { + OP(NEVER, NEVER), + OP(LESS, LESS), + OP(EQUAL, EQUAL), + OP(LESS_OR_EQUAL, LEQUAL), + OP(GREATER, GREATER), + OP(NOT_EQUAL, NOT_EQUAL), + OP(GREATER_OR_EQUAL, GEQUAL), + OP(ALWAYS, ALWAYS), + }; +#undef OP + + assert(op < ARRAY_SIZE(translate)); + return translate[op]; +} + +static enum agx_filter +translate_filter(VkFilter filter) +{ + static_assert((enum agx_filter)VK_FILTER_NEAREST == AGX_FILTER_NEAREST); + static_assert((enum agx_filter)VK_FILTER_LINEAR == AGX_FILTER_LINEAR); + + return (enum agx_filter)filter; +} + +static enum agx_mip_filter +translate_mipfilter(VkSamplerMipmapMode mode) +{ + switch (mode) { + case VK_SAMPLER_MIPMAP_MODE_NEAREST: + return AGX_MIP_FILTER_NEAREST; + + case VK_SAMPLER_MIPMAP_MODE_LINEAR: + return AGX_MIP_FILTER_LINEAR; + + default: + unreachable("Invalid filter"); + } +} + +static bool +uses_border(const VkSamplerCreateInfo *info) +{ + return info->addressModeU == VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER || + info->addressModeV == VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER || + info->addressModeW == VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER; +} + +static enum agx_border_colour +is_border_color_custom(VkBorderColor color) +{ + /* TODO: for now, opaque black is treated as custom due to rgba4 swizzling + * issues, could be optimized though. + */ + switch (color) { + case VK_BORDER_COLOR_INT_OPAQUE_BLACK: + case VK_BORDER_COLOR_FLOAT_OPAQUE_BLACK: + case VK_BORDER_COLOR_INT_CUSTOM_EXT: + case VK_BORDER_COLOR_FLOAT_CUSTOM_EXT: + return true; + default: + return false; + } +} + +/* Translate an American VkBorderColor into a Canadian agx_border_colour */ +static enum agx_border_colour +translate_border_color(VkBorderColor color, bool custom_to_1) +{ + switch (color) { + case VK_BORDER_COLOR_FLOAT_TRANSPARENT_BLACK: + case VK_BORDER_COLOR_INT_TRANSPARENT_BLACK: + return AGX_BORDER_COLOUR_TRANSPARENT_BLACK; + + case VK_BORDER_COLOR_INT_OPAQUE_WHITE: + case VK_BORDER_COLOR_FLOAT_OPAQUE_WHITE: + return AGX_BORDER_COLOUR_OPAQUE_WHITE; + + default: + assert(is_border_color_custom(color)); + return custom_to_1 ? 
AGX_BORDER_COLOUR_OPAQUE_WHITE + : AGX_BORDER_COLOUR_TRANSPARENT_BLACK; + } +} + +static void +pack_sampler(const struct hk_physical_device *pdev, + const struct VkSamplerCreateInfo *info, bool custom_to_1, + struct agx_sampler_packed *out) +{ + agx_pack(out, SAMPLER, cfg) { + cfg.minimum_lod = info->minLod; + cfg.maximum_lod = info->maxLod; + cfg.magnify = translate_filter(info->magFilter); + cfg.minify = translate_filter(info->minFilter); + cfg.mip_filter = translate_mipfilter(info->mipmapMode); + cfg.wrap_s = translate_address_mode(info->addressModeU); + cfg.wrap_t = translate_address_mode(info->addressModeV); + cfg.wrap_r = translate_address_mode(info->addressModeW); + cfg.pixel_coordinates = info->unnormalizedCoordinates; + + cfg.seamful_cube_maps = + info->flags & VK_SAMPLER_CREATE_NON_SEAMLESS_CUBE_MAP_BIT_EXT; + + if (info->compareEnable) { + cfg.compare_func = translate_texsamp_compare_op(info->compareOp); + cfg.compare_enable = true; + } + + if (info->anisotropyEnable) { + cfg.maximum_anisotropy = + util_next_power_of_two(MAX2(info->maxAnisotropy, 1)); + } else { + cfg.maximum_anisotropy = 1; + } + + if (uses_border(info)) { + cfg.border_colour = + translate_border_color(info->borderColor, custom_to_1); + } + } +} + +VKAPI_ATTR VkResult VKAPI_CALL +hk_CreateSampler(VkDevice device, + const VkSamplerCreateInfo *info /* pCreateInfo */, + const VkAllocationCallbacks *pAllocator, VkSampler *pSampler) +{ + VK_FROM_HANDLE(hk_device, dev, device); + struct hk_physical_device *pdev = hk_device_physical(dev); + struct hk_sampler *sampler; + VkResult result; + + sampler = vk_sampler_create(&dev->vk, info, pAllocator, sizeof(*sampler)); + if (!sampler) + return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY); + + struct agx_sampler_packed samp; + pack_sampler(pdev, info, true, &samp); + + /* LOD bias passed in the descriptor set */ + sampler->lod_bias_fp16 = _mesa_float_to_half(info->mipLodBias); + + result = + hk_sampler_heap_add(dev, samp, &sampler->planes[sampler->plane_count].hw); + if (result != VK_SUCCESS) { + hk_DestroySampler(device, hk_sampler_to_handle(sampler), pAllocator); + return result; + } + + sampler->plane_count++; + + /* In order to support CONVERSION_SEPARATE_RECONSTRUCTION_FILTER_BIT, we + * need multiple sampler planes: at minimum we will need one for luminance + * (the default), and one for chroma. Each sampler plane needs its own + * sampler table entry. However, sampler table entries are very rare on + * G13, and each plane would burn one of those. So we make sure to allocate + * only the minimum amount that we actually need (i.e., either 1 or 2), and + * then just copy the last sampler plane out as far as we need to fill the + * number of image planes. 
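+ * + * (The custom border colour path below likewise consumes a second plane, for the clamp-to-0 variant of the sampler.)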
+ */ + if (sampler->vk.ycbcr_conversion) { + assert(!uses_border(info) && + "consequence of VUID-VkSamplerCreateInfo-addressModeU-01646"); + + const VkFilter chroma_filter = + sampler->vk.ycbcr_conversion->state.chroma_filter; + if (info->magFilter != chroma_filter || + info->minFilter != chroma_filter) { + VkSamplerCreateInfo plane2_info = *info; + plane2_info.magFilter = chroma_filter; + plane2_info.minFilter = chroma_filter; + + pack_sampler(pdev, &plane2_info, false, &samp); + result = hk_sampler_heap_add( + dev, samp, &sampler->planes[sampler->plane_count].hw); + + if (result != VK_SUCCESS) { + hk_DestroySampler(device, hk_sampler_to_handle(sampler), + pAllocator); + return result; + } + + sampler->plane_count++; + } + } else if (uses_border(info)) { + /* If the sampler uses custom border colours, we need both clamp-to-1 + * and clamp-to-0 variants. We treat these as planes. + */ + pack_sampler(pdev, info, false, &samp); + result = hk_sampler_heap_add(dev, samp, + &sampler->planes[sampler->plane_count].hw); + + if (result != VK_SUCCESS) { + hk_DestroySampler(device, hk_sampler_to_handle(sampler), pAllocator); + return result; + } + + sampler->plane_count++; + + /* We also need to record the border. + * + * If there is a border colour component mapping, we need to swizzle with + * it. Otherwise, we can assume there's nothing to do. + */ + VkClearColorValue bc = sampler->vk.border_color_value; + + const VkSamplerBorderColorComponentMappingCreateInfoEXT *swiz_info = + vk_find_struct_const( + info->pNext, + SAMPLER_BORDER_COLOR_COMPONENT_MAPPING_CREATE_INFO_EXT); + + if (swiz_info) { + const bool is_int = vk_border_color_is_int(info->borderColor); + bc = vk_swizzle_color_value(bc, swiz_info->components, is_int); + } + + sampler->custom_border = bc; + sampler->has_border = true; + } + + *pSampler = hk_sampler_to_handle(sampler); + + return VK_SUCCESS; +} + +VKAPI_ATTR void VKAPI_CALL +hk_DestroySampler(VkDevice device, VkSampler _sampler, + const VkAllocationCallbacks *pAllocator) +{ + VK_FROM_HANDLE(hk_device, dev, device); + VK_FROM_HANDLE(hk_sampler, sampler, _sampler); + + if (!sampler) + return; + + for (uint8_t plane = 0; plane < sampler->plane_count; plane++) { + hk_sampler_heap_remove(dev, sampler->planes[plane].hw); + } + + vk_sampler_destroy(&dev->vk, pAllocator, &sampler->vk); +} diff --git a/src/asahi/vulkan/hk_sampler.h b/src/asahi/vulkan/hk_sampler.h new file mode 100644 index 00000000000..444aabc8d65 --- /dev/null +++ b/src/asahi/vulkan/hk_sampler.h @@ -0,0 +1,33 @@ +/* + * Copyright 2024 Valve Corporation + * Copyright 2024 Alyssa Rosenzweig + * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc. + * SPDX-License-Identifier: MIT + */ + +#pragma once + +#include "hk_device.h" +#include "hk_physical_device.h" +#include "hk_private.h" + +#include "vk_sampler.h" +#include "vk_ycbcr_conversion.h" + +#include "vk_format.h" + +struct hk_sampler { + struct vk_sampler vk; + VkClearColorValue custom_border; + bool has_border; + + uint8_t plane_count; + uint16_t lod_bias_fp16; + + struct { + struct hk_rc_sampler *hw; + } planes[2]; +}; + +VK_DEFINE_NONDISP_HANDLE_CASTS(hk_sampler, vk.base, VkSampler, + VK_OBJECT_TYPE_SAMPLER) diff --git a/src/asahi/vulkan/hk_shader.c b/src/asahi/vulkan/hk_shader.c new file mode 100644 index 00000000000..60303963fd7 --- /dev/null +++ b/src/asahi/vulkan/hk_shader.c @@ -0,0 +1,1432 @@ +/* + * Copyright 2024 Valve Corporation + * Copyright 2024 Alyssa Rosenzweig + * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc. 
+ * SPDX-License-Identifier: MIT + */ +#include "hk_shader.h" + +#include "agx_helpers.h" +#include "agx_nir_lower_gs.h" +#include "glsl_types.h" +#include "nir.h" +#include "nir_builder.h" + +#include "agx_bo.h" +#include "hk_cmd_buffer.h" +#include "hk_descriptor_set_layout.h" +#include "hk_device.h" +#include "hk_physical_device.h" +#include "hk_sampler.h" +#include "hk_shader.h" + +#include "nir_builder_opcodes.h" +#include "nir_builtin_builder.h" +#include "nir_intrinsics.h" +#include "nir_intrinsics_indices.h" +#include "nir_xfb_info.h" +#include "shader_enums.h" +#include "vk_nir_convert_ycbcr.h" +#include "vk_pipeline.h" +#include "vk_pipeline_layout.h" +#include "vk_shader_module.h" +#include "vk_ycbcr_conversion.h" + +#include "asahi/compiler/agx_compile.h" +#include "asahi/lib/agx_linker.h" +#include "asahi/lib/agx_nir_passes.h" +#include "asahi/lib/agx_tilebuffer.h" +#include "asahi/lib/agx_uvs.h" +#include "compiler/spirv/nir_spirv.h" + +#include "util/blob.h" +#include "util/hash_table.h" +#include "util/macros.h" +#include "util/mesa-sha1.h" +#include "util/simple_mtx.h" +#include "util/u_debug.h" +#include "vulkan/vulkan_core.h" + +struct hk_fs_key { + bool zs_self_dep; + + /** True if sample shading is forced on via an API knob such as + * VkPipelineMultisampleStateCreateInfo::minSampleShading + */ + bool force_sample_shading; + + uint8_t pad[2]; +}; +static_assert(sizeof(struct hk_fs_key) == 4, "packed"); + +static void +shared_var_info(const struct glsl_type *type, unsigned *size, unsigned *align) +{ + assert(glsl_type_is_vector_or_scalar(type)); + + uint32_t comp_size = + glsl_type_is_boolean(type) ? 4 : glsl_get_bit_size(type) / 8; + unsigned length = glsl_get_vector_elements(type); + *size = comp_size * length, *align = comp_size; +} + +uint64_t +hk_physical_device_compiler_flags(const struct hk_physical_device *pdev) +{ + /* TODO compiler flags */ + return 0; +} + +const nir_shader_compiler_options * +hk_get_nir_options(struct vk_physical_device *vk_pdev, gl_shader_stage stage, + UNUSED const struct vk_pipeline_robustness_state *rs) +{ + return &agx_nir_options; +} + +static struct spirv_to_nir_options +hk_get_spirv_options(struct vk_physical_device *vk_pdev, + UNUSED gl_shader_stage stage, + const struct vk_pipeline_robustness_state *rs) +{ + return (struct spirv_to_nir_options){ + .ssbo_addr_format = hk_buffer_addr_format(rs->storage_buffers), + .phys_ssbo_addr_format = nir_address_format_64bit_global, + .ubo_addr_format = hk_buffer_addr_format(rs->uniform_buffers), + .shared_addr_format = nir_address_format_32bit_offset, + .min_ssbo_alignment = HK_MIN_SSBO_ALIGNMENT, + .min_ubo_alignment = HK_MIN_UBO_ALIGNMENT, + }; +} + +static bool +lower_halt_to_return(nir_builder *b, nir_instr *instr, UNUSED void *_data) +{ + if (instr->type != nir_instr_type_jump) + return false; + + nir_jump_instr *jump = nir_instr_as_jump(instr); + if (jump->type != nir_jump_halt) + return false; + + assert(b->impl == nir_shader_get_entrypoint(b->shader)); + jump->type = nir_jump_return; + return true; +} + +void +hk_preprocess_nir_internal(struct vk_physical_device *vk_pdev, nir_shader *nir) +{ + /* Must lower before io to temps */ + if (nir->info.stage == MESA_SHADER_FRAGMENT) { + NIR_PASS(_, nir, nir_lower_terminate_to_demote); + NIR_PASS(_, nir, nir_shader_instructions_pass, lower_halt_to_return, + nir_metadata_all, NULL); + NIR_PASS(_, nir, nir_lower_returns); + } + + /* Unroll loops before lowering indirects via nir_lower_io_to_temporaries */ + UNUSED bool progress = false; + 
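+ /* Run the cleanup passes below to a fixed point so nir_opt_loop_unroll sees fully folded loop bounds. */ +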
NIR_PASS(_, nir, nir_lower_global_vars_to_local); + + do { + progress = false; + NIR_PASS(progress, nir, nir_lower_vars_to_ssa); + NIR_PASS(progress, nir, nir_copy_prop); + NIR_PASS(progress, nir, nir_opt_dce); + NIR_PASS(progress, nir, nir_opt_constant_folding); + NIR_PASS(progress, nir, nir_opt_loop); + NIR_PASS(progress, nir, nir_opt_loop_unroll); + } while (progress); + + if (nir->info.stage == MESA_SHADER_FRAGMENT) { + struct nir_lower_sysvals_to_varyings_options sysvals_opts = { + .point_coord = true, + }; + + nir_lower_sysvals_to_varyings(nir, &sysvals_opts); + } + + NIR_PASS(_, nir, nir_lower_system_values); + + /* Gather info before preprocess_nir but after some general lowering, so + * inputs_read and system_values_read are accurately set. + */ + nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir)); + + NIR_PASS_V(nir, nir_lower_io_to_temporaries, nir_shader_get_entrypoint(nir), + true, false); + + NIR_PASS(_, nir, nir_lower_global_vars_to_local); + + NIR_PASS(_, nir, nir_split_var_copies); + NIR_PASS(_, nir, nir_split_struct_vars, nir_var_function_temp); + + /* Optimize but allow copies because we haven't lowered them yet */ + agx_preprocess_nir(nir, NULL); + + NIR_PASS(_, nir, nir_lower_load_const_to_scalar); + NIR_PASS(_, nir, nir_lower_var_copies); +} + +static void +hk_preprocess_nir(struct vk_physical_device *vk_pdev, nir_shader *nir) +{ + hk_preprocess_nir_internal(vk_pdev, nir); + nir_lower_compute_system_values_options csv_options = { + .has_base_workgroup_id = true, + }; + NIR_PASS(_, nir, nir_lower_compute_system_values, &csv_options); +} + +static void +hk_populate_fs_key(struct hk_fs_key *key, + const struct vk_graphics_pipeline_state *state) +{ + memset(key, 0, sizeof(*key)); + + if (state == NULL) + return; + + if (state->pipeline_flags & + VK_PIPELINE_CREATE_2_DEPTH_STENCIL_ATTACHMENT_FEEDBACK_LOOP_BIT_EXT) + key->zs_self_dep = true; + + /* We force per-sample interpolation whenever sampleShadingEnable is set + * regardless of minSampleShading or rasterizationSamples. + * + * When sampleShadingEnable is set, few guarantees are made about the + * location of interpolation of the inputs. The only real guarantees are + * that the inputs are interpolated within the pixel and that you get at + * least `rasterizationSamples * minSampleShading` unique positions. + * Importantly, it does not require that when `rasterizationSamples * + * minSampleShading <= 1.0` that those positions are at the fragment + * center. Therefore, it's valid to just always do per-sample all the time. + * + * The one caveat here is that we have to be careful about gl_SampleMaskIn. + * When `hk_fs_key::force_sample_shading = true` we also turn any reads of + * gl_SampleMaskIn into `1 << gl_SampleID` because the hardware sample mask + * is actually per-fragment, not per-pass. We handle this by smashing + * minSampleShading to 1.0 whenever gl_SampleMaskIn is read. 
+ */ + const struct vk_multisample_state *ms = state->ms; + if (ms != NULL && ms->sample_shading_enable) + key->force_sample_shading = true; +} + +static void +hk_hash_graphics_state(struct vk_physical_device *device, + const struct vk_graphics_pipeline_state *state, + VkShaderStageFlags stages, blake3_hash blake3_out) +{ + struct mesa_blake3 blake3_ctx; + _mesa_blake3_init(&blake3_ctx); + if (stages & VK_SHADER_STAGE_FRAGMENT_BIT) { + struct hk_fs_key key; + hk_populate_fs_key(&key, state); + _mesa_blake3_update(&blake3_ctx, &key, sizeof(key)); + + const bool is_multiview = state->rp->view_mask != 0; + _mesa_blake3_update(&blake3_ctx, &is_multiview, sizeof(is_multiview)); + } + _mesa_blake3_final(&blake3_ctx, blake3_out); +} + +static bool +lower_load_global_constant_offset_instr(nir_builder *b, + nir_intrinsic_instr *intrin, + UNUSED void *_data) +{ + if (intrin->intrinsic != nir_intrinsic_load_global_constant_offset && + intrin->intrinsic != nir_intrinsic_load_global_constant_bounded) + return false; + + b->cursor = nir_before_instr(&intrin->instr); + + nir_def *base_addr = intrin->src[0].ssa; + nir_def *offset = intrin->src[1].ssa; + + nir_def *zero = NULL; + if (intrin->intrinsic == nir_intrinsic_load_global_constant_bounded) { + nir_def *bound = intrin->src[2].ssa; + + unsigned bit_size = intrin->def.bit_size; + assert(bit_size >= 8 && bit_size % 8 == 0); + unsigned byte_size = bit_size / 8; + + zero = nir_imm_zero(b, intrin->num_components, bit_size); + + unsigned load_size = byte_size * intrin->num_components; + + nir_def *sat_offset = + nir_umin(b, offset, nir_imm_int(b, UINT32_MAX - (load_size - 1))); + nir_def *in_bounds = + nir_ilt(b, nir_iadd_imm(b, sat_offset, load_size - 1), bound); + + nir_push_if(b, in_bounds); + } + + nir_def *val = nir_build_load_global_constant( + b, intrin->def.num_components, intrin->def.bit_size, + nir_iadd(b, base_addr, nir_u2u64(b, offset)), + .align_mul = nir_intrinsic_align_mul(intrin), + .align_offset = nir_intrinsic_align_offset(intrin), + .access = nir_intrinsic_access(intrin)); + + if (intrin->intrinsic == nir_intrinsic_load_global_constant_bounded) { + nir_pop_if(b, NULL); + val = nir_if_phi(b, val, zero); + } + + nir_def_rewrite_uses(&intrin->def, val); + + return true; +} + +struct lower_ycbcr_state { + uint32_t set_layout_count; + struct vk_descriptor_set_layout *const *set_layouts; +}; + +static const struct vk_ycbcr_conversion_state * +lookup_ycbcr_conversion(const void *_state, uint32_t set, uint32_t binding, + uint32_t array_index) +{ + const struct lower_ycbcr_state *state = _state; + assert(set < state->set_layout_count); + assert(state->set_layouts[set] != NULL); + const struct hk_descriptor_set_layout *set_layout = + vk_to_hk_descriptor_set_layout(state->set_layouts[set]); + assert(binding < set_layout->binding_count); + + const struct hk_descriptor_set_binding_layout *bind_layout = + &set_layout->binding[binding]; + + if (bind_layout->immutable_samplers == NULL) + return NULL; + + array_index = MIN2(array_index, bind_layout->array_size - 1); + + const struct hk_sampler *sampler = + bind_layout->immutable_samplers[array_index]; + + return sampler && sampler->vk.ycbcr_conversion + ? 
&sampler->vk.ycbcr_conversion->state + : NULL; +} + +static inline bool +nir_has_image_var(nir_shader *nir) +{ + nir_foreach_image_variable(_, nir) + return true; + + return false; +} + +static int +glsl_type_size(const struct glsl_type *type, bool bindless) +{ + return glsl_count_attribute_slots(type, false); +} + +/* + * This is the world's worst multiview implementation. We simply duplicate each + * draw on the CPU side, changing a uniform in between, and then plumb the view + * index into the layer ID here. Whatever, it works. + * + * The "proper" implementation on AGX would use vertex amplification, but a + * MacBook is not a VR headset. + */ +static void +hk_lower_multiview(nir_shader *nir) +{ + /* If there's an existing layer ID write, ignore it. This avoids validation + * splat with vk_meta. + */ + nir_variable *existing = nir_find_variable_with_location( + nir, nir_var_shader_out, VARYING_SLOT_LAYER); + + if (existing) { + existing->data.mode = nir_var_shader_temp; + existing->data.location = 0; + nir_fixup_deref_modes(nir); + } + + /* Now write the view index as the layer */ + nir_builder b = + nir_builder_at(nir_after_impl(nir_shader_get_entrypoint(nir))); + + nir_variable *layer = + nir_variable_create(nir, nir_var_shader_out, glsl_uint_type(), NULL); + + layer->data.location = VARYING_SLOT_LAYER; + + nir_store_var(&b, layer, nir_load_view_index(&b), nir_component_mask(1)); + b.shader->info.outputs_written |= VARYING_BIT_LAYER; +} + +/* + * KHR_maintenance5 requires that points rasterize with a default point size of + * 1.0, while our hardware requires an explicit point size write for this. + * Since topology may be dynamic, we insert an unconditional write if necessary. + */ +static bool +hk_nir_insert_psiz_write(nir_shader *nir) +{ + nir_function_impl *impl = nir_shader_get_entrypoint(nir); + + if (nir->info.outputs_written & VARYING_BIT_PSIZ) { + nir_metadata_preserve(impl, nir_metadata_all); + return false; + } + + nir_builder b = nir_builder_at(nir_after_impl(impl)); + + nir_store_output(&b, nir_imm_float(&b, 1.0), nir_imm_int(&b, 0), + .write_mask = nir_component_mask(1), + .io_semantics.location = VARYING_SLOT_PSIZ, + .io_semantics.num_slots = 1, .src_type = nir_type_float32); + + nir->info.outputs_written |= VARYING_BIT_PSIZ; + nir_metadata_preserve(b.impl, nir_metadata_control_flow); + return true; +} + +static nir_def * +query_custom_border(nir_builder *b, nir_tex_instr *tex) +{ + return nir_build_texture_query(b, tex, nir_texop_custom_border_color_agx, 4, + tex->dest_type, false, false); +} + +static nir_def * +has_custom_border(nir_builder *b, nir_tex_instr *tex) +{ + return nir_build_texture_query(b, tex, nir_texop_has_custom_border_color_agx, + 1, nir_type_bool1, false, false); +} + +static bool +lower(nir_builder *b, nir_instr *instr, UNUSED void *_data) +{ + if (instr->type != nir_instr_type_tex) + return false; + + nir_tex_instr *tex = nir_instr_as_tex(instr); + if (!nir_tex_instr_need_sampler(tex) || nir_tex_instr_is_query(tex)) + return false; + + /* XXX: this is a really weird edge case, is this even well-defined? 
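+ * We bail below, so shadow compares keep the default clamp-to-1 sampler.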
*/ + if (tex->is_shadow) + return false; + + b->cursor = nir_after_instr(&tex->instr); + nir_def *has_custom = has_custom_border(b, tex); + + nir_instr *orig = nir_instr_clone(b->shader, &tex->instr); + nir_builder_instr_insert(b, orig); + nir_def *clamp_to_1 = &nir_instr_as_tex(orig)->def; + + nir_push_if(b, has_custom); + nir_def *replaced = NULL; + { + /* Sample again, this time with clamp-to-0 instead of clamp-to-1 */ + nir_instr *clone_instr = nir_instr_clone(b->shader, &tex->instr); + nir_builder_instr_insert(b, clone_instr); + + nir_tex_instr *tex_0 = nir_instr_as_tex(clone_instr); + nir_def *clamp_to_0 = &tex_0->def; + + tex_0->backend_flags |= AGX_TEXTURE_FLAG_CLAMP_TO_0; + + /* Grab the border colour */ + nir_def *border = query_custom_border(b, tex_0); + + if (tex->op == nir_texop_tg4) { + border = nir_replicate(b, nir_channel(b, border, tex->component), 4); + } + + /* Combine together with the border */ + if (nir_alu_type_get_base_type(tex->dest_type) == nir_type_float && + tex->op != nir_texop_tg4) { + + /* For floats, lerp together: + * + * For border texels: (1 * border) + (0 * border ) = border + * For regular texels: (x * border) + (x * (1 - border)) = x. + * + * Linear filtering is linear (duh), so lerping is compatible. + */ + replaced = nir_flrp(b, clamp_to_0, clamp_to_1, border); + } else { + /* For integers, just select componentwise since there is no linear + * filtering. Gathers also use this path since they are unfiltered in + * each component. + */ + replaced = nir_bcsel(b, nir_ieq(b, clamp_to_0, clamp_to_1), clamp_to_0, + border); + } + } + nir_pop_if(b, NULL); + + /* Put it together with a phi */ + nir_def *phi = nir_if_phi(b, replaced, clamp_to_1); + nir_def_replace(&tex->def, phi); + return true; +} + +static bool +agx_nir_lower_custom_border(nir_shader *nir) +{ + return nir_shader_instructions_pass(nir, lower, nir_metadata_none, NULL); +} + +/* + * In Vulkan, the VIEWPORT should read 0 in the fragment shader if it is not + * written by the vertex shader, but in our implementation, the varying would + * otherwise be undefined. This small pass predicates VIEWPORT reads based on + * whether the hardware vertex shader writes the VIEWPORT (nonzero UVS index). 
+ */ +static bool +lower_viewport_fs(nir_builder *b, nir_intrinsic_instr *intr, UNUSED void *data) +{ + if (intr->intrinsic != nir_intrinsic_load_input) + return false; + + nir_io_semantics sem = nir_intrinsic_io_semantics(intr); + if (sem.location != VARYING_SLOT_VIEWPORT) + return false; + + b->cursor = nir_after_instr(&intr->instr); + nir_def *orig = &intr->def; + + nir_def *uvs = nir_load_uvs_index_agx(b, .io_semantics = sem); + nir_def *def = nir_bcsel(b, nir_ine_imm(b, uvs, 0), orig, nir_imm_int(b, 0)); + + nir_def_rewrite_uses_after(orig, def, def->parent_instr); + return true; +} + +static bool +lower_subpass_dim(nir_builder *b, nir_instr *instr, UNUSED void *_data) +{ + if (instr->type != nir_instr_type_tex) + return false; + + nir_tex_instr *tex = nir_instr_as_tex(instr); + if (tex->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS) + tex->sampler_dim = GLSL_SAMPLER_DIM_2D; + else if (tex->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS_MS) + tex->sampler_dim = GLSL_SAMPLER_DIM_MS; + else + return false; + + return true; +} + +void +hk_lower_nir(struct hk_device *dev, nir_shader *nir, + const struct vk_pipeline_robustness_state *rs, bool is_multiview, + uint32_t set_layout_count, + struct vk_descriptor_set_layout *const *set_layouts) +{ + if (nir->info.stage == MESA_SHADER_FRAGMENT) { + NIR_PASS(_, nir, nir_lower_input_attachments, + &(nir_input_attachment_options){ + .use_fragcoord_sysval = true, + .use_layer_id_sysval = true, + .use_view_id_for_layer = is_multiview, + }); + + NIR_PASS(_, nir, nir_shader_instructions_pass, lower_subpass_dim, + nir_metadata_all, NULL); + NIR_PASS(_, nir, nir_lower_wpos_center); + } + + /* XXX: should be last geometry stage, how do I get to that? */ + if (nir->info.stage == MESA_SHADER_VERTEX) { + if (is_multiview) + hk_lower_multiview(nir); + } + + if (nir->info.stage == MESA_SHADER_TESS_EVAL) { + NIR_PASS(_, nir, nir_lower_patch_vertices, + nir->info.tess.tcs_vertices_out, NULL); + } + + const struct lower_ycbcr_state ycbcr_state = { + .set_layout_count = set_layout_count, + .set_layouts = set_layouts, + }; + NIR_PASS(_, nir, nir_vk_lower_ycbcr_tex, lookup_ycbcr_conversion, + &ycbcr_state); + + /* Lower push constants before lower_descriptors */ + NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_mem_push_const, + nir_address_format_32bit_offset); + + // NIR_PASS(_, nir, nir_opt_large_constants, NULL, 32); + + /* Images accessed through the texture or PBE hardware are robust, so we + * don't set lower_image. (There are some sticky details around txf but + * they're handled by agx_nir_lower_texture). However, image atomics are + * software so require robustness lowering. + */ + nir_lower_robust_access_options robustness = { + .lower_image_atomic = true, + }; + + NIR_PASS(_, nir, nir_lower_robust_access, &robustness); + + /* We must do early lowering before hk_nir_lower_descriptors, since this will + * create lod_bias_agx instructions. 
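+ * (hk_nir_lower_descriptors is then expected to resolve those against the fp16 LOD bias that hk_CreateSampler stores for the descriptor set.)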
+ */ + NIR_PASS(_, nir, agx_nir_lower_texture_early, true /* support_lod_bias */); + NIR_PASS(_, nir, agx_nir_lower_custom_border); + + NIR_PASS(_, nir, hk_nir_lower_descriptors, rs, set_layout_count, + set_layouts); + NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_mem_global, + nir_address_format_64bit_global); + NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_mem_ssbo, + hk_buffer_addr_format(rs->storage_buffers)); + NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_mem_ubo, + hk_buffer_addr_format(rs->uniform_buffers)); + NIR_PASS(_, nir, nir_shader_intrinsics_pass, + lower_load_global_constant_offset_instr, nir_metadata_none, NULL); + + if (!nir->info.shared_memory_explicit_layout) { + /* There may be garbage in shared_size, but it's the job of + * nir_lower_vars_to_explicit_types to allocate it. We have to reset to + * avoid overallocation. + */ + nir->info.shared_size = 0; + + NIR_PASS(_, nir, nir_lower_vars_to_explicit_types, nir_var_mem_shared, + shared_var_info); + } + NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_mem_shared, + nir_address_format_32bit_offset); + + if (nir->info.zero_initialize_shared_memory && nir->info.shared_size > 0) { + /* Align everything up to 16B so we can write whole vec4s. */ + nir->info.shared_size = align(nir->info.shared_size, 16); + NIR_PASS(_, nir, nir_zero_initialize_shared_memory, nir->info.shared_size, + 16); + + /* We need to call lower_compute_system_values again because + * nir_zero_initialize_shared_memory generates load_invocation_id which + * has to be lowered to load_invocation_index. + */ + NIR_PASS(_, nir, nir_lower_compute_system_values, NULL); + } + + /* TODO: we can do indirect VS output */ + nir_variable_mode lower_indirect_modes = 0; + if (nir->info.stage == MESA_SHADER_FRAGMENT) + lower_indirect_modes |= nir_var_shader_out; + else if (nir->info.stage == MESA_SHADER_VERTEX) + lower_indirect_modes |= nir_var_shader_in | nir_var_shader_out; + + NIR_PASS(_, nir, nir_lower_indirect_derefs, lower_indirect_modes, + UINT32_MAX); + + NIR_PASS(_, nir, nir_lower_io, nir_var_shader_in | nir_var_shader_out, + glsl_type_size, nir_lower_io_lower_64bit_to_32); + + if (nir->info.stage == MESA_SHADER_FRAGMENT) { + NIR_PASS(_, nir, nir_shader_intrinsics_pass, lower_viewport_fs, + nir_metadata_control_flow, NULL); + } + + NIR_PASS(_, nir, agx_nir_lower_texture); + NIR_PASS(_, nir, agx_nir_lower_multisampled_image_store); + + agx_preprocess_nir(nir, dev->dev.libagx); + NIR_PASS(_, nir, nir_opt_conditional_discard); + NIR_PASS(_, nir, nir_opt_if, + nir_opt_if_optimize_phi_true_false | nir_opt_if_avoid_64bit_phis); +} + +static void +hk_upload_shader(struct hk_device *dev, struct hk_shader *shader) +{ + if (shader->b.info.has_preamble) { + unsigned offs = shader->b.info.preamble_offset; + assert(offs < shader->b.binary_size); + + size_t size = shader->b.binary_size - offs; + assert(size > 0); + + shader->bo = agx_bo_create(&dev->dev, size, AGX_BO_EXEC | AGX_BO_LOW_VA, + "Preamble"); + memcpy(shader->bo->ptr.cpu, shader->b.binary + offs, size); + shader->preamble_addr = shader->bo->ptr.gpu; + } + + if (!shader->linked.ht) { + /* If we only have a single variant, link now. 
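+ * Vertex and fragment shaders get a link hash table instead (hk_init_link_ht) and are fast-linked on demand against their prolog/epilog keys.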
*/ + shader->only_linked = hk_fast_link(dev, false, shader, NULL, NULL, 0); + } + + if (shader->info.stage == MESA_SHADER_FRAGMENT) { + agx_pack(&shader->frag_face, FRAGMENT_FACE_2, cfg) { + cfg.conservative_depth = + agx_translate_depth_layout(shader->b.info.depth_layout); + } + } + + agx_pack(&shader->counts, COUNTS, cfg) { + cfg.uniform_register_count = shader->b.info.push_count; + cfg.preshader_register_count = shader->b.info.nr_preamble_gprs; + cfg.sampler_state_register_count = agx_translate_sampler_state_count( + shader->b.info.uses_txf ? 1 : 0, false); + } +} + +DERIVE_HASH_TABLE(hk_fast_link_key_vs); +DERIVE_HASH_TABLE(hk_fast_link_key_fs); + +static VkResult +hk_init_link_ht(struct hk_shader *shader, gl_shader_stage sw_stage) +{ + simple_mtx_init(&shader->linked.lock, mtx_plain); + + bool multiple_variants = + sw_stage == MESA_SHADER_VERTEX || sw_stage == MESA_SHADER_FRAGMENT; + + if (!multiple_variants) + return VK_SUCCESS; + + if (sw_stage == MESA_SHADER_VERTEX) + shader->linked.ht = hk_fast_link_key_vs_table_create(NULL); + else + shader->linked.ht = hk_fast_link_key_fs_table_create(NULL); + + return (shader->linked.ht == NULL) ? VK_ERROR_OUT_OF_HOST_MEMORY + : VK_SUCCESS; +} + +static VkResult +hk_compile_nir(struct hk_device *dev, const VkAllocationCallbacks *pAllocator, + nir_shader *nir, VkShaderCreateFlagsEXT shader_flags, + const struct vk_pipeline_robustness_state *rs, + const struct hk_fs_key *fs_key, struct hk_shader *shader, + gl_shader_stage sw_stage, bool hw, nir_xfb_info *xfb_info) +{ + unsigned vs_uniform_base = 0; + + /* For now, only shader objects are supported */ + if (sw_stage == MESA_SHADER_VERTEX) { + vs_uniform_base = + 6 * DIV_ROUND_UP( + BITSET_LAST_BIT(shader->info.vs.attrib_components_read), 4); + } else if (sw_stage == MESA_SHADER_FRAGMENT) { + shader->info.fs.interp = agx_gather_interp_info(nir); + shader->info.fs.writes_memory = nir->info.writes_memory; + + /* Discards must be lowered before lowering MSAA to handle discards */ + NIR_PASS(_, nir, agx_nir_lower_discard_zs_emit); + NIR_PASS(_, nir, agx_nir_lower_fs_output_to_epilog, + &shader->info.fs.epilog_key); + NIR_PASS(_, nir, agx_nir_lower_sample_mask); + + if (nir->info.fs.uses_sample_shading) { + /* Ensure the sample ID is preserved in register */ + nir_builder b = + nir_builder_at(nir_after_impl(nir_shader_get_entrypoint(nir))); + nir_export_agx(&b, nir_load_exported_agx(&b, 1, 16, .base = 1), + .base = 1); + + NIR_PASS(_, nir, agx_nir_lower_to_per_sample); + } + + NIR_PASS(_, nir, agx_nir_lower_fs_active_samples_to_register); + NIR_PASS(_, nir, agx_nir_lower_interpolation); + } else if (sw_stage == MESA_SHADER_TESS_EVAL) { + shader->info.ts.ccw = nir->info.tess.ccw; + shader->info.ts.point_mode = nir->info.tess.point_mode; + shader->info.ts.spacing = nir->info.tess.spacing; + shader->info.ts.mode = nir->info.tess._primitive_mode; + + if (nir->info.tess.point_mode) { + shader->info.ts.out_prim = MESA_PRIM_POINTS; + } else if (nir->info.tess._primitive_mode == TESS_PRIMITIVE_ISOLINES) { + shader->info.ts.out_prim = MESA_PRIM_LINES; + } else { + shader->info.ts.out_prim = MESA_PRIM_TRIANGLES; + } + + /* This destroys info so it needs to happen after the gather */ + NIR_PASS(_, nir, agx_nir_lower_tes, dev->dev.libagx, hw); + } else if (sw_stage == MESA_SHADER_TESS_CTRL) { + shader->info.tcs.output_patch_size = nir->info.tess.tcs_vertices_out; + shader->info.tcs.per_vertex_outputs = agx_tcs_per_vertex_outputs(nir); + shader->info.tcs.nr_patch_outputs = +
util_last_bit(nir->info.patch_outputs_written); + shader->info.tcs.output_stride = agx_tcs_output_stride(nir); + } + + uint64_t outputs = nir->info.outputs_written; + if (!hw && + (sw_stage == MESA_SHADER_VERTEX || sw_stage == MESA_SHADER_TESS_EVAL)) { + nir->info.stage = MESA_SHADER_COMPUTE; + memset(&nir->info.cs, 0, sizeof(nir->info.cs)); + nir->xfb_info = NULL; + } + + /* XXX: rename */ + NIR_PASS(_, nir, hk_lower_uvs_index, vs_uniform_base); + +#if 0 + /* TODO */ + nir_variable_mode robust2_modes = 0; + if (rs->uniform_buffers == VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_ROBUST_BUFFER_ACCESS_2_EXT) + robust2_modes |= nir_var_mem_ubo; + if (rs->storage_buffers == VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_ROBUST_BUFFER_ACCESS_2_EXT) + robust2_modes |= nir_var_mem_ssbo; +#endif + + struct agx_shader_key backend_key = { + .needs_g13x_coherency = (dev->dev.params.gpu_generation == 13 && + dev->dev.params.num_clusters_total > 1) || + dev->dev.params.num_dies > 1, + .reserved_preamble = 128 /* TODO */, + .libagx = dev->dev.libagx, + .no_stop = nir->info.stage == MESA_SHADER_FRAGMENT, + .has_scratch = true, + }; + + /* For now, sample shading is always dynamic. Indicate that. */ + if (nir->info.stage == MESA_SHADER_FRAGMENT && + nir->info.fs.uses_sample_shading) + backend_key.fs.inside_sample_loop = true; + + agx_compile_shader_nir(nir, &backend_key, NULL, &shader->b); + + shader->code_ptr = shader->b.binary; + shader->code_size = shader->b.binary_size; + + shader->info.stage = sw_stage; + shader->info.clip_distance_array_size = nir->info.clip_distance_array_size; + shader->info.cull_distance_array_size = nir->info.cull_distance_array_size; + shader->b.info.outputs = outputs; + + if (sw_stage == MESA_SHADER_COMPUTE) { + for (unsigned i = 0; i < 3; ++i) + shader->info.cs.local_size[i] = nir->info.workgroup_size[i]; + } + + if (xfb_info) { + assert(xfb_info->output_count < ARRAY_SIZE(shader->info.xfb_outputs)); + + memcpy(&shader->info.xfb_info, xfb_info, + nir_xfb_info_size(xfb_info->output_count)); + + typed_memcpy(shader->info.xfb_stride, nir->info.xfb_stride, 4); + } + + if (nir->constant_data_size > 0) { + uint32_t data_size = align(nir->constant_data_size, HK_MIN_UBO_ALIGNMENT); + + void *data = malloc(data_size); + if (data == NULL) { + ralloc_free(nir); + return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY); + } + + memcpy(data, nir->constant_data, nir->constant_data_size); + + assert(nir->constant_data_size <= data_size); + memset(data + nir->constant_data_size, 0, + data_size - nir->constant_data_size); + + shader->data_ptr = data; + shader->data_size = data_size; + } + + ralloc_free(nir); + + VkResult result = hk_init_link_ht(shader, sw_stage); + if (result != VK_SUCCESS) + return vk_error(dev, result); + + hk_upload_shader(dev, shader); + return VK_SUCCESS; +} + +static const struct vk_shader_ops hk_shader_ops; + +static void +hk_destroy_linked_shader(struct hk_linked_shader *linked) +{ + agx_bo_unreference(linked->b.bo); + ralloc_free(linked); +} + +static void +hk_destroy_linked_shader_ht(struct hash_entry *he) +{ + hk_destroy_linked_shader(he->data); +} + +static void +hk_shader_destroy(struct hk_shader *s) +{ + free((void *)s->code_ptr); + free((void *)s->data_ptr); + agx_bo_unreference(s->bo); + + simple_mtx_destroy(&s->linked.lock); + _mesa_hash_table_destroy(s->linked.ht, hk_destroy_linked_shader_ht); + + if (s->only_linked) + hk_destroy_linked_shader(s->only_linked); +} + +void +hk_api_shader_destroy(struct vk_device *vk_dev, struct vk_shader *vk_shader, + const VkAllocationCallbacks 
*pAllocator) +{ + struct hk_device *dev = container_of(vk_dev, struct hk_device, vk); + struct hk_api_shader *obj = + container_of(vk_shader, struct hk_api_shader, vk); + + hk_foreach_variant(obj, shader) { + hk_shader_destroy(shader); + } + + vk_shader_free(&dev->vk, pAllocator, &obj->vk); +} + +static void +hk_lower_hw_vs(nir_shader *nir, struct hk_shader *shader) +{ + /* Point size must be clamped, excessively large points don't render + * properly on G13. + * + * Must be synced with pointSizeRange. + */ + NIR_PASS(_, nir, nir_lower_point_size, 1.0f, 511.95f); + + /* TODO: Optimize out for monolithic? */ + NIR_PASS(_, nir, hk_nir_insert_psiz_write); + + NIR_PASS(_, nir, nir_lower_io_to_scalar, nir_var_shader_out, NULL, NULL); + NIR_PASS(_, nir, agx_nir_lower_cull_distance_vs); + + NIR_PASS(_, nir, agx_nir_lower_uvs, &shader->info.uvs); + + shader->info.vs.cull_distance_array_size = + nir->info.cull_distance_array_size; +} + +VkResult +hk_compile_shader(struct hk_device *dev, struct vk_shader_compile_info *info, + const struct vk_graphics_pipeline_state *state, + const VkAllocationCallbacks *pAllocator, + struct hk_api_shader **shader_out) +{ + VkResult result; + + /* We consume the NIR, regardless of success or failure */ + nir_shader *nir = info->nir; + + size_t size = sizeof(struct hk_api_shader) + + sizeof(struct hk_shader) * hk_num_variants(info->stage); + struct hk_api_shader *obj = + vk_shader_zalloc(&dev->vk, &hk_shader_ops, info->stage, pAllocator, size); + + if (obj == NULL) { + ralloc_free(nir); + return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY); + } + + /* TODO: Multiview with ESO */ + const bool is_multiview = state && state->rp->view_mask != 0; + + hk_lower_nir(dev, nir, info->robustness, is_multiview, + info->set_layout_count, info->set_layouts); + + gl_shader_stage sw_stage = nir->info.stage; + + struct hk_fs_key fs_key_tmp, *fs_key = NULL; + if (sw_stage == MESA_SHADER_FRAGMENT) { + hk_populate_fs_key(&fs_key_tmp, state); + fs_key = &fs_key_tmp; + + nir->info.fs.uses_sample_shading |= fs_key->force_sample_shading; + + /* Force late-Z for Z/S self-deps. TODO: There's probably a less silly way + * to do this. + */ + if (fs_key->zs_self_dep) { + nir_builder b = + nir_builder_at(nir_before_impl(nir_shader_get_entrypoint(nir))); + nir_discard_if(&b, nir_imm_false(&b)); + nir->info.fs.uses_discard = true; + } + + NIR_PASS(_, nir, agx_nir_lower_sample_intrinsics, false); + } else if (sw_stage == MESA_SHADER_TESS_CTRL) { + NIR_PASS_V(nir, agx_nir_lower_tcs, dev->dev.libagx); + } + + /* Compile all variants up front */ + if (sw_stage == MESA_SHADER_GEOMETRY) { + for (unsigned rast_disc = 0; rast_disc < 2; ++rast_disc) { + struct hk_shader *count_variant = hk_count_gs_variant(obj, rast_disc); + nir_shader *clone = nir_shader_clone(NULL, nir); + + enum mesa_prim out_prim = MESA_PRIM_MAX; + nir_shader *count = NULL, *rast = NULL, *pre_gs = NULL; + + NIR_PASS(_, clone, agx_nir_lower_gs, dev->dev.libagx, rast_disc, + &count, &rast, &pre_gs, &out_prim, + &count_variant->info.gs.count_words); + + if (!rast_disc) { + struct hk_shader *shader = &obj->variants[HK_GS_VARIANT_RAST]; + + hk_lower_hw_vs(rast, shader); + shader->info.gs.out_prim = out_prim; + } + + struct { + nir_shader *in; + struct hk_shader *out; + } variants[] = { + {clone, hk_main_gs_variant(obj, rast_disc)}, + {pre_gs, hk_pre_gs_variant(obj, rast_disc)}, + {count, count_variant}, + {rast_disc ? 
NULL : rast, &obj->variants[HK_GS_VARIANT_RAST]}, + }; + + for (unsigned v = 0; v < ARRAY_SIZE(variants); ++v) { + if (variants[v].in) { + result = hk_compile_nir(dev, pAllocator, variants[v].in, + info->flags, info->robustness, NULL, + variants[v].out, sw_stage, true, NULL); + if (result != VK_SUCCESS) { + hk_api_shader_destroy(&dev->vk, &obj->vk, pAllocator); + ralloc_free(nir); + return result; + } + } + } + } + } else if (sw_stage == MESA_SHADER_VERTEX || + sw_stage == MESA_SHADER_TESS_EVAL) { + + if (sw_stage == MESA_SHADER_VERTEX) { + assert( + !(nir->info.inputs_read & BITFIELD64_MASK(VERT_ATTRIB_GENERIC0)) && + "Fixed-function attributes not used in Vulkan"); + + NIR_PASS(_, nir, nir_recompute_io_bases, nir_var_shader_in); + } + + /* the shader_out portion of this is load-bearing even for tess eval */ + NIR_PASS(_, nir, nir_io_add_const_offset_to_base, + nir_var_shader_in | nir_var_shader_out); + + for (enum hk_vs_variant v = 0; v < HK_VS_VARIANTS; ++v) { + struct hk_shader *shader = &obj->variants[v]; + bool hw = v == HK_VS_VARIANT_HW; + + /* TODO: Optimize single variant when we know nextStage */ + nir_shader *clone = nir_shader_clone(NULL, nir); + + if (sw_stage == MESA_SHADER_VERTEX) { + NIR_PASS(_, clone, agx_nir_lower_vs_input_to_prolog, + shader->info.vs.attrib_components_read); + + shader->info.vs.attribs_read = + nir->info.inputs_read >> VERT_ATTRIB_GENERIC0; + } + + if (hw) { + hk_lower_hw_vs(clone, shader); + } else { + NIR_PASS(_, clone, agx_nir_lower_vs_before_gs, dev->dev.libagx); + } + + result = hk_compile_nir(dev, pAllocator, clone, info->flags, + info->robustness, fs_key, shader, sw_stage, hw, + nir->xfb_info); + if (result != VK_SUCCESS) { + hk_api_shader_destroy(&dev->vk, &obj->vk, pAllocator); + ralloc_free(nir); + return result; + } + } + } else { + struct hk_shader *shader = hk_only_variant(obj); + + /* hk_compile_nir takes ownership of nir */ + result = + hk_compile_nir(dev, pAllocator, nir, info->flags, info->robustness, + fs_key, shader, sw_stage, true, NULL); + if (result != VK_SUCCESS) { + hk_api_shader_destroy(&dev->vk, &obj->vk, pAllocator); + return result; + } + } + + *shader_out = obj; + return VK_SUCCESS; +} + +static VkResult +hk_compile_shaders(struct vk_device *vk_dev, uint32_t shader_count, + struct vk_shader_compile_info *infos, + const struct vk_graphics_pipeline_state *state, + const VkAllocationCallbacks *pAllocator, + struct vk_shader **shaders_out) +{ + struct hk_device *dev = container_of(vk_dev, struct hk_device, vk); + + for (uint32_t i = 0; i < shader_count; i++) { + VkResult result = + hk_compile_shader(dev, &infos[i], state, pAllocator, + (struct hk_api_shader **)&shaders_out[i]); + if (result != VK_SUCCESS) { + /* Clean up all the shaders before this point */ + for (uint32_t j = 0; j < i; j++) + hk_api_shader_destroy(&dev->vk, shaders_out[j], pAllocator); + + /* Clean up all the NIR after this point */ + for (uint32_t j = i + 1; j < shader_count; j++) + ralloc_free(infos[j].nir); + + /* Memset the output array */ + memset(shaders_out, 0, shader_count * sizeof(*shaders_out)); + + return result; + } + } + + return VK_SUCCESS; +} + +static VkResult +hk_deserialize_shader(struct hk_device *dev, struct blob_reader *blob, + struct hk_shader *shader) +{ + struct hk_shader_info info; + blob_copy_bytes(blob, &info, sizeof(info)); + + struct agx_shader_info b_info; + blob_copy_bytes(blob, &b_info, sizeof(b_info)); + + const uint32_t code_size = blob_read_uint32(blob); + const uint32_t data_size = blob_read_uint32(blob); + if (blob->overrun) 
+ return vk_error(dev, VK_ERROR_INCOMPATIBLE_SHADER_BINARY_EXT); + + VkResult result = hk_init_link_ht(shader, info.stage); + if (result != VK_SUCCESS) + return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY); + + simple_mtx_init(&shader->linked.lock, mtx_plain); + + shader->b.info = b_info; + shader->info = info; + shader->code_size = code_size; + shader->data_size = data_size; + shader->b.binary_size = code_size; + + shader->code_ptr = malloc(code_size); + if (shader->code_ptr == NULL) + return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY); + + shader->data_ptr = malloc(data_size); + if (shader->data_ptr == NULL) + return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY); + + blob_copy_bytes(blob, (void *)shader->code_ptr, shader->code_size); + blob_copy_bytes(blob, (void *)shader->data_ptr, shader->data_size); + if (blob->overrun) + return vk_error(dev, VK_ERROR_INCOMPATIBLE_SHADER_BINARY_EXT); + + shader->b.binary = (void *)shader->code_ptr; + hk_upload_shader(dev, shader); + return VK_SUCCESS; +} + +static VkResult +hk_deserialize_api_shader(struct vk_device *vk_dev, struct blob_reader *blob, + uint32_t binary_version, + const VkAllocationCallbacks *pAllocator, + struct vk_shader **shader_out) +{ + struct hk_device *dev = container_of(vk_dev, struct hk_device, vk); + + gl_shader_stage stage = blob_read_uint8(blob); + if (blob->overrun) + return vk_error(dev, VK_ERROR_INCOMPATIBLE_SHADER_BINARY_EXT); + + size_t size = sizeof(struct hk_api_shader) + + sizeof(struct hk_shader) * hk_num_variants(stage); + + struct hk_api_shader *obj = + vk_shader_zalloc(&dev->vk, &hk_shader_ops, stage, pAllocator, size); + + if (obj == NULL) + return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY); + + hk_foreach_variant(obj, shader) { + VkResult result = hk_deserialize_shader(dev, blob, shader); + + if (result != VK_SUCCESS) { + hk_api_shader_destroy(&dev->vk, &obj->vk, pAllocator); + return result; + } + } + + *shader_out = &obj->vk; + return VK_SUCCESS; +} + +static void +hk_shader_serialize(struct vk_device *vk_dev, const struct hk_shader *shader, + struct blob *blob) +{ + blob_write_bytes(blob, &shader->info, sizeof(shader->info)); + blob_write_bytes(blob, &shader->b.info, sizeof(shader->b.info)); + + blob_write_uint32(blob, shader->code_size); + blob_write_uint32(blob, shader->data_size); + blob_write_bytes(blob, shader->code_ptr, shader->code_size); + blob_write_bytes(blob, shader->data_ptr, shader->data_size); +} + +static bool +hk_api_shader_serialize(struct vk_device *vk_dev, + const struct vk_shader *vk_shader, struct blob *blob) +{ + struct hk_api_shader *obj = + container_of(vk_shader, struct hk_api_shader, vk); + + blob_write_uint8(blob, vk_shader->stage); + + hk_foreach_variant(obj, shader) { + hk_shader_serialize(vk_dev, shader, blob); + } + + return !blob->out_of_memory; +} + +#define WRITE_STR(field, ...) 
\ + ({ \ + memset(field, 0, sizeof(field)); \ + UNUSED int i = snprintf(field, sizeof(field), __VA_ARGS__); \ + assert(i > 0 && i < sizeof(field)); \ + }) + +static VkResult +hk_shader_get_executable_properties( + UNUSED struct vk_device *device, const struct vk_shader *vk_shader, + uint32_t *executable_count, VkPipelineExecutablePropertiesKHR *properties) +{ + struct hk_api_shader *obj = + container_of(vk_shader, struct hk_api_shader, vk); + + VK_OUTARRAY_MAKE_TYPED(VkPipelineExecutablePropertiesKHR, out, properties, + executable_count); + + vk_outarray_append_typed(VkPipelineExecutablePropertiesKHR, &out, props) + { + props->stages = mesa_to_vk_shader_stage(obj->vk.stage); + props->subgroupSize = 32; + WRITE_STR(props->name, "%s", _mesa_shader_stage_to_string(obj->vk.stage)); + WRITE_STR(props->description, "%s shader", + _mesa_shader_stage_to_string(obj->vk.stage)); + } + + return vk_outarray_status(&out); +} + +static VkResult +hk_shader_get_executable_statistics( + UNUSED struct vk_device *device, const struct vk_shader *vk_shader, + uint32_t executable_index, uint32_t *statistic_count, + VkPipelineExecutableStatisticKHR *statistics) +{ + struct hk_api_shader *obj = + container_of(vk_shader, struct hk_api_shader, vk); + + VK_OUTARRAY_MAKE_TYPED(VkPipelineExecutableStatisticKHR, out, statistics, + statistic_count); + + assert(executable_index == 0); + + /* TODO: find a sane way to report multiple variants and have that play nice + * with zink. + */ + struct hk_shader *shader = hk_any_variant(obj); + + vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) + { + WRITE_STR(stat->name, "Code Size"); + WRITE_STR(stat->description, + "Size of the compiled shader binary, in bytes"); + stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR; + stat->value.u64 = shader->code_size; + } + + vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) + { + WRITE_STR(stat->name, "Number of GPRs"); + WRITE_STR(stat->description, "Number of GPRs used by this pipeline"); + stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR; + stat->value.u64 = shader->b.info.nr_gprs; + } + + return vk_outarray_status(&out); +} + +static bool +write_ir_text(VkPipelineExecutableInternalRepresentationKHR *ir, + const char *data) +{ + ir->isText = VK_TRUE; + + size_t data_len = strlen(data) + 1; + + if (ir->pData == NULL) { + ir->dataSize = data_len; + return true; + } + + strncpy(ir->pData, data, ir->dataSize); + if (ir->dataSize < data_len) + return false; + + ir->dataSize = data_len; + return true; +} + +static VkResult +hk_shader_get_executable_internal_representations( + UNUSED struct vk_device *device, const struct vk_shader *vk_shader, + uint32_t executable_index, uint32_t *internal_representation_count, + VkPipelineExecutableInternalRepresentationKHR *internal_representations) +{ + VK_OUTARRAY_MAKE_TYPED(VkPipelineExecutableInternalRepresentationKHR, out, + internal_representations, + internal_representation_count); + bool incomplete_text = false; + + assert(executable_index == 0); + + /* TODO */ +#if 0 + vk_outarray_append_typed(VkPipelineExecutableInternalRepresentationKHR, &out, ir) { + WRITE_STR(ir->name, "AGX assembly"); + WRITE_STR(ir->description, "AGX assembly"); + if (!write_ir_text(ir, TODO)) + incomplete_text = true; + } +#endif + + return incomplete_text ? 
VK_INCOMPLETE : vk_outarray_status(&out); +} + +static const struct vk_shader_ops hk_shader_ops = { + .destroy = hk_api_shader_destroy, + .serialize = hk_api_shader_serialize, + .get_executable_properties = hk_shader_get_executable_properties, + .get_executable_statistics = hk_shader_get_executable_statistics, + .get_executable_internal_representations = + hk_shader_get_executable_internal_representations, +}; + +const struct vk_device_shader_ops hk_device_shader_ops = { + .get_nir_options = hk_get_nir_options, + .get_spirv_options = hk_get_spirv_options, + .preprocess_nir = hk_preprocess_nir, + .hash_graphics_state = hk_hash_graphics_state, + .compile = hk_compile_shaders, + .deserialize = hk_deserialize_api_shader, + .cmd_set_dynamic_graphics_state = vk_cmd_set_dynamic_graphics_state, + .cmd_bind_shaders = hk_cmd_bind_shaders, +}; + +struct hk_linked_shader * +hk_fast_link(struct hk_device *dev, bool fragment, struct hk_shader *main, + struct agx_shader_part *prolog, struct agx_shader_part *epilog, + unsigned nr_samples_shaded) +{ + struct hk_linked_shader *s = rzalloc(NULL, struct hk_linked_shader); + agx_fast_link(&s->b, &dev->dev, fragment, &main->b, prolog, epilog, + nr_samples_shaded); + + if (fragment) { + agx_pack(&s->fs_counts, FRAGMENT_SHADER_WORD_0, cfg) { + cfg.cf_binding_count = s->b.cf.nr_bindings; + cfg.uniform_register_count = main->b.info.push_count; + cfg.preshader_register_count = main->b.info.nr_preamble_gprs; + cfg.sampler_state_register_count = + agx_translate_sampler_state_count(s->b.uses_txf ? 1 : 0, false); + } + } + + /* Now that we've linked, bake the USC words to bind this program */ + struct agx_usc_builder b = agx_usc_builder(s->usc.data, sizeof(s->usc.data)); + + if (main && main->b.info.immediate_size_16) { + unreachable("todo"); +#if 0 + /* XXX: do ahead of time */ + uint64_t ptr = agx_pool_upload_aligned( + &cmd->pool, s->b.info.immediates, s->b.info.immediate_size_16 * 2, 64); + + for (unsigned range = 0; range < constant_push_ranges; ++range) { + unsigned offset = 64 * range; + assert(offset < s->b.info.immediate_size_16); + + agx_usc_uniform(&b, s->b.info.immediate_base_uniform + offset, + MIN2(64, s->b.info.immediate_size_16 - offset), + ptr + (offset * 2)); + } +#endif + } + + agx_usc_push_packed(&b, UNIFORM, dev->rodata.image_heap); + + if (s->b.uses_txf) + agx_usc_push_packed(&b, SAMPLER, dev->rodata.txf_sampler); + + if (main && (main->b.info.stage == MESA_SHADER_COMPUTE || + main->b.info.stage == MESA_SHADER_TESS_CTRL)) { + unsigned size = main->b.info.local_size; + + agx_usc_pack(&b, SHARED, cfg) { + cfg.layout = AGX_SHARED_LAYOUT_VERTEX_COMPUTE; + cfg.bytes_per_threadgroup = size > 0 ? size : 65536; + cfg.uses_shared_memory = size > 0; + } + } else if (!fragment) { + agx_usc_shared_none(&b); + } + + agx_usc_push_packed(&b, SHADER, s->b.shader); + agx_usc_push_packed(&b, REGISTERS, s->b.regs); + + if (fragment) + agx_usc_push_packed(&b, FRAGMENT_PROPERTIES, s->b.fragment_props); + + if (main && main->b.info.has_preamble) { + agx_usc_pack(&b, PRESHADER, cfg) { + cfg.code = main->preamble_addr; + } + } else { + agx_usc_pack(&b, NO_PRESHADER, cfg) + ; + } + + s->usc.size = b.head - s->usc.data; + return s; +} diff --git a/src/asahi/vulkan/hk_shader.h b/src/asahi/vulkan/hk_shader.h new file mode 100644 index 00000000000..458266f8365 --- /dev/null +++ b/src/asahi/vulkan/hk_shader.h @@ -0,0 +1,400 @@ +/* + * Copyright 2024 Valve Corporation + * Copyright 2024 Alyssa Rosenzweig + * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc. 
+ * SPDX-License-Identifier: MIT + */ + +#pragma once + +#include "asahi/compiler/agx_compile.h" +#include "util/macros.h" +#include "agx_linker.h" +#include "agx_nir_lower_vbo.h" +#include "agx_pack.h" +#include "agx_usc.h" +#include "agx_uvs.h" + +#include "hk_device.h" +#include "hk_device_memory.h" +#include "hk_private.h" + +#include "nir_xfb_info.h" +#include "shader_enums.h" +#include "vk_pipeline_cache.h" + +#include "nir.h" + +#include "vk_shader.h" + +struct hk_physical_device; +struct hk_pipeline_compilation_ctx; +struct vk_descriptor_set_layout; +struct vk_graphics_pipeline_state; +struct vk_pipeline_cache; +struct vk_pipeline_layout; +struct vk_pipeline_robustness_state; +struct vk_shader_module; + +/* TODO: Make dynamic */ +#define HK_ROOT_UNIFORM 104 +#define HK_IMAGE_HEAP_UNIFORM 108 + +struct hk_shader_info { + union { + struct { + uint32_t attribs_read; + BITSET_DECLARE(attrib_components_read, AGX_MAX_ATTRIBS * 4); + uint8_t cull_distance_array_size; + uint8_t _pad[7]; + } vs; + + struct { + /* Local workgroup size */ + uint16_t local_size[3]; + + uint8_t _pad[26]; + } cs; + + struct { + struct agx_interp_info interp; + struct agx_fs_epilog_link_info epilog_key; + + bool reads_sample_mask; + bool post_depth_coverage; + bool uses_sample_shading; + bool early_fragment_tests; + bool writes_memory; + + uint8_t _pad[7]; + } fs; + + struct { + uint8_t spacing; + uint8_t mode; + enum mesa_prim out_prim; + bool point_mode; + bool ccw; + uint8_t _pad[27]; + } ts; + + struct { + uint64_t per_vertex_outputs; + uint32_t output_stride; + uint8_t output_patch_size; + uint8_t nr_patch_outputs; + uint8_t _pad[18]; + } tcs; + + struct { + unsigned count_words; + enum mesa_prim out_prim; + uint8_t _pad[27]; + } gs; + + /* Used to initialize the union for other stages */ + uint8_t _pad[32]; + }; + + struct agx_unlinked_uvs_layout uvs; + + /* Transform feedback buffer strides */ + uint8_t xfb_stride[MAX_XFB_BUFFERS]; + + gl_shader_stage stage : 8; + uint8_t clip_distance_array_size; + uint8_t cull_distance_array_size; + uint8_t _pad0[1]; + + /* XXX: is there a less goofy way to do this? I really don't want dynamic + * allocation here. + */ + nir_xfb_info xfb_info; + nir_xfb_output_info xfb_outputs[64]; +}; + +/* + * Hash table keys for fast-linked shader variants. These contain the entire + * prolog/epilog key so we only do 1 hash table lookup instead of 2 in the + * general case where the linked shader is already ready. + */ +struct hk_fast_link_key_vs { + struct agx_vs_prolog_key prolog; +}; + +struct hk_fast_link_key_fs { + unsigned nr_samples_shaded; + struct agx_fs_prolog_key prolog; + struct agx_fs_epilog_key epilog; +}; + +struct hk_shader { + struct agx_shader_part b; + + struct hk_shader_info info; + struct agx_fragment_face_2_packed frag_face; + struct agx_counts_packed counts; + + const void *code_ptr; + uint32_t code_size; + + const void *data_ptr; + uint32_t data_size; + + /* BO for any uploaded shader part */ + struct agx_bo *bo; + + /* Cache of fast linked variants */ + struct { + simple_mtx_t lock; + struct hash_table *ht; + } linked; + + /* If there's only a single possibly linked variant, direct pointer. TODO: + * Union with the cache to save some space? + */ + struct hk_linked_shader *only_linked; + + /* Address to the uploaded preamble section. Preambles are uploaded + * separately from fast-linked main shaders. 
+ */ + uint64_t preamble_addr; + + /* Address of the start of the shader data section */ + uint64_t data_addr; +}; + +enum hk_vs_variant { + /* Hardware vertex shader, when next stage is fragment */ + HK_VS_VARIANT_HW, + + /* Hardware compute shader, when next is geometry/tessellation */ + HK_VS_VARIANT_SW, + + HK_VS_VARIANTS, +}; + +enum hk_gs_variant { + /* Hardware vertex shader used for rasterization */ + HK_GS_VARIANT_RAST, + + /* Main compute shader */ + HK_GS_VARIANT_MAIN, + HK_GS_VARIANT_MAIN_NO_RAST, + + /* Count compute shader */ + HK_GS_VARIANT_COUNT, + HK_GS_VARIANT_COUNT_NO_RAST, + + /* Pre-GS compute shader */ + HK_GS_VARIANT_PRE, + HK_GS_VARIANT_PRE_NO_RAST, + + HK_GS_VARIANTS, +}; + +/* clang-format off */ +static const char *hk_gs_variant_name[] = { + [HK_GS_VARIANT_RAST] = "Rasterization", + [HK_GS_VARIANT_MAIN] = "Main", + [HK_GS_VARIANT_MAIN_NO_RAST] = "Main (rast. discard)", + [HK_GS_VARIANT_COUNT] = "Count", + [HK_GS_VARIANT_COUNT_NO_RAST] = "Count (rast. discard)", + [HK_GS_VARIANT_PRE] = "Pre-GS", + [HK_GS_VARIANT_PRE_NO_RAST] = "Pre-GS (rast. discard)", +}; +/* clang-format on */ + +static inline unsigned +hk_num_variants(gl_shader_stage stage) +{ + switch (stage) { + case MESA_SHADER_VERTEX: + case MESA_SHADER_TESS_EVAL: + return HK_VS_VARIANTS; + + case MESA_SHADER_GEOMETRY: + return HK_GS_VARIANTS; + + default: + return 1; + } +} + +/* + * An hk_api shader maps 1:1 to a VkShader object. An hk_api_shader may contain + * multiple hardware hk_shader's, built at shader compile time. This complexity + * is required to efficiently implement the legacy geometry pipeline. + */ +struct hk_api_shader { + struct vk_shader vk; + + /* Is this an internal passthrough geometry shader? */ + bool is_passthrough; + + struct hk_shader variants[]; +}; + +#define hk_foreach_variant(api_shader, var) \ + for (struct hk_shader *var = api_shader->variants; \ + var < api_shader->variants + hk_num_variants(api_shader->vk.stage); \ + ++var) + +static const char * +hk_variant_name(struct hk_api_shader *obj, struct hk_shader *variant) +{ + unsigned i = variant - obj->variants; + assert(i < hk_num_variants(obj->vk.stage)); + + if (hk_num_variants(obj->vk.stage) == 1) { + return NULL; + } else if (obj->vk.stage == MESA_SHADER_GEOMETRY) { + assert(i < ARRAY_SIZE(hk_gs_variant_name)); + return hk_gs_variant_name[i]; + } else { + assert(i < 2); + return i == HK_VS_VARIANT_SW ? 
"Software" : "Hardware"; + } +} + +static struct hk_shader * +hk_only_variant(struct hk_api_shader *obj) +{ + if (!obj) + return NULL; + + assert(hk_num_variants(obj->vk.stage) == 1); + return &obj->variants[0]; +} + +static struct hk_shader * +hk_any_variant(struct hk_api_shader *obj) +{ + if (!obj) + return NULL; + + return &obj->variants[0]; +} + +static struct hk_shader * +hk_main_gs_variant(struct hk_api_shader *obj, bool rast_disc) +{ + return &obj->variants[HK_GS_VARIANT_MAIN + rast_disc]; +} + +static struct hk_shader * +hk_count_gs_variant(struct hk_api_shader *obj, bool rast_disc) +{ + return &obj->variants[HK_GS_VARIANT_COUNT + rast_disc]; +} + +static struct hk_shader * +hk_pre_gs_variant(struct hk_api_shader *obj, bool rast_disc) +{ + return &obj->variants[HK_GS_VARIANT_PRE + rast_disc]; +} + +#define HK_MAX_LINKED_USC_SIZE \ + (AGX_USC_PRESHADER_LENGTH + AGX_USC_FRAGMENT_PROPERTIES_LENGTH + \ + AGX_USC_REGISTERS_LENGTH + AGX_USC_SHADER_LENGTH + AGX_USC_SHARED_LENGTH + \ + AGX_USC_SAMPLER_LENGTH + (AGX_USC_UNIFORM_LENGTH * 9)) + +struct hk_linked_shader { + struct agx_linked_shader b; + + /* Distinct from hk_shader::counts due to addition of cf_binding_count, which + * is delayed since it depends on cull distance. + */ + struct agx_fragment_shader_word_0_packed fs_counts; + + /* Baked USC words to bind this linked shader */ + struct { + uint8_t data[HK_MAX_LINKED_USC_SIZE]; + size_t size; + } usc; +}; + +struct hk_linked_shader *hk_fast_link(struct hk_device *dev, bool fragment, + struct hk_shader *main, + struct agx_shader_part *prolog, + struct agx_shader_part *epilog, + unsigned nr_samples_shaded); + +extern const struct vk_device_shader_ops hk_device_shader_ops; + +uint64_t +hk_physical_device_compiler_flags(const struct hk_physical_device *pdev); + +static inline nir_address_format +hk_buffer_addr_format(VkPipelineRobustnessBufferBehaviorEXT robustness) +{ + switch (robustness) { + case VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_DISABLED_EXT: + return nir_address_format_64bit_global_32bit_offset; + case VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_ROBUST_BUFFER_ACCESS_EXT: + case VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_ROBUST_BUFFER_ACCESS_2_EXT: + return nir_address_format_64bit_bounded_global; + default: + unreachable("Invalid robust buffer access behavior"); + } +} + +bool hk_lower_uvs_index(nir_shader *s, unsigned vs_uniform_base); + +bool +hk_nir_lower_descriptors(nir_shader *nir, + const struct vk_pipeline_robustness_state *rs, + uint32_t set_layout_count, + struct vk_descriptor_set_layout *const *set_layouts); +void hk_lower_nir(struct hk_device *dev, nir_shader *nir, + const struct vk_pipeline_robustness_state *rs, + bool is_multiview, uint32_t set_layout_count, + struct vk_descriptor_set_layout *const *set_layouts); + +VkResult hk_compile_shader(struct hk_device *dev, + struct vk_shader_compile_info *info, + const struct vk_graphics_pipeline_state *state, + const VkAllocationCallbacks *pAllocator, + struct hk_api_shader **shader_out); + +void hk_preprocess_nir_internal(struct vk_physical_device *vk_pdev, + nir_shader *nir); + +void hk_api_shader_destroy(struct vk_device *vk_dev, + struct vk_shader *vk_shader, + const VkAllocationCallbacks *pAllocator); + +const nir_shader_compiler_options * +hk_get_nir_options(struct vk_physical_device *vk_pdev, gl_shader_stage stage, + UNUSED const struct vk_pipeline_robustness_state *rs); + +struct hk_api_shader *hk_meta_shader(struct hk_device *dev, + hk_internal_builder_t builder, void *data, + size_t data_size); + +static inline struct 
hk_shader * +hk_meta_kernel(struct hk_device *dev, hk_internal_builder_t builder, void *data, + size_t data_size) +{ + return hk_only_variant(hk_meta_shader(dev, builder, data, data_size)); +} + +struct hk_passthrough_gs_key { + /* Bit mask of outputs written by the VS/TES, to be passed through */ + uint64_t outputs; + + /* Clip/cull sizes, implies clip/cull written in output */ + uint8_t clip_distance_array_size; + uint8_t cull_distance_array_size; + + /* Transform feedback buffer strides */ + uint8_t xfb_stride[MAX_XFB_BUFFERS]; + + /* Decomposed primitive */ + enum mesa_prim prim; + + /* Transform feedback info. Must add nir_xfb_info_size to get the key size */ + nir_xfb_info xfb_info; +}; + +void hk_nir_passthrough_gs(struct nir_builder *b, const void *key_); diff --git a/src/asahi/vulkan/hk_wsi.c b/src/asahi/vulkan/hk_wsi.c new file mode 100644 index 00000000000..b95d09a7d97 --- /dev/null +++ b/src/asahi/vulkan/hk_wsi.c @@ -0,0 +1,44 @@ +/* + * Copyright 2024 Valve Corporation + * Copyright 2024 Alyssa Rosenzweig + * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc. + * SPDX-License-Identifier: MIT + */ +#include "hk_wsi.h" +#include "hk_instance.h" +#include "wsi_common.h" + +static VKAPI_ATTR PFN_vkVoidFunction VKAPI_CALL +hk_wsi_proc_addr(VkPhysicalDevice physicalDevice, const char *pName) +{ + VK_FROM_HANDLE(hk_physical_device, pdev, physicalDevice); + return vk_instance_get_proc_addr_unchecked(pdev->vk.instance, pName); +} + +VkResult +hk_init_wsi(struct hk_physical_device *pdev) +{ + VkResult result; + + struct wsi_device_options wsi_options = {.sw_device = false}; + result = wsi_device_init( + &pdev->wsi_device, hk_physical_device_to_handle(pdev), hk_wsi_proc_addr, + &pdev->vk.instance->alloc, pdev->master_fd, + &hk_physical_device_instance(pdev)->dri_options, &wsi_options); + if (result != VK_SUCCESS) + return result; + + pdev->wsi_device.supports_scanout = false; + pdev->wsi_device.supports_modifiers = true; + + pdev->vk.wsi_device = &pdev->wsi_device; + + return result; +} + +void +hk_finish_wsi(struct hk_physical_device *pdev) +{ + pdev->vk.wsi_device = NULL; + wsi_device_finish(&pdev->wsi_device, &pdev->vk.instance->alloc); +} diff --git a/src/asahi/vulkan/hk_wsi.h b/src/asahi/vulkan/hk_wsi.h new file mode 100644 index 00000000000..458f0cd1616 --- /dev/null +++ b/src/asahi/vulkan/hk_wsi.h @@ -0,0 +1,13 @@ +/* + * Copyright 2024 Valve Corporation + * Copyright 2024 Alyssa Rosenzweig + * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc. + * SPDX-License-Identifier: MIT + */ + +#pragma once + +#include "hk_physical_device.h" + +VkResult hk_init_wsi(struct hk_physical_device *pdev); +void hk_finish_wsi(struct hk_physical_device *pdev); diff --git a/src/asahi/vulkan/meson.build b/src/asahi/vulkan/meson.build new file mode 100644 index 00000000000..7b66cf2c1f0 --- /dev/null +++ b/src/asahi/vulkan/meson.build @@ -0,0 +1,142 @@ +# Copyright © 2022 Collabora Ltd. and Red Hat Inc. 
+# SPDX-License-Identifier: MIT
+hk_files = files(
+  'hk_buffer.c',
+  'hk_buffer.h',
+  'hk_buffer_view.c',
+  'hk_buffer_view.h',
+  'hk_cmd_buffer.c',
+  'hk_cmd_buffer.h',
+  'hk_cmd_clear.c',
+  'hk_cmd_dispatch.c',
+  'hk_cmd_draw.c',
+  'hk_cmd_meta.c',
+  'hk_cmd_pool.c',
+  'hk_cmd_pool.h',
+  'hk_descriptor_set.h',
+  'hk_descriptor_set.c',
+  'hk_descriptor_set_layout.c',
+  'hk_descriptor_set_layout.h',
+  'hk_descriptor_table.c',
+  'hk_descriptor_table.h',
+  'hk_device.c',
+  'hk_device.h',
+  'hk_device_memory.c',
+  'hk_device_memory.h',
+  'hk_event.c',
+  'hk_event.h',
+  'hk_format.c',
+  'hk_image.c',
+  'hk_image.h',
+  'hk_image_view.c',
+  'hk_image_view.h',
+  'hk_instance.c',
+  'hk_instance.h',
+  'hk_nir_lower_descriptors.c',
+  'hk_nir_passthrough_gs.c',
+  'hk_physical_device.c',
+  'hk_physical_device.h',
+  'hk_private.h',
+  'hk_query_pool.c',
+  'hk_query_pool.h',
+  'hk_queue.c',
+  'hk_queue.h',
+  'hk_sampler.c',
+  'hk_sampler.h',
+  'hk_shader.c',
+  'hk_shader.h',
+  'hk_wsi.c',
+  'hk_wsi.h'
+)
+
+hk_entrypoints = custom_target(
+  'hk_entrypoints',
+  input : [vk_entrypoints_gen, vk_api_xml],
+  output : ['hk_entrypoints.h', 'hk_entrypoints.c'],
+  command : [
+    prog_python, '@INPUT0@', '--xml', '@INPUT1@', '--proto', '--weak',
+    '--out-h', '@OUTPUT0@', '--out-c', '@OUTPUT1@', '--prefix', 'hk',
+    '--beta', with_vulkan_beta.to_string(),
+  ],
+  depend_files : vk_entrypoints_gen_depend_files,
+)
+
+hk_deps = [
+  dep_libdrm,
+  idep_nir,
+  idep_vulkan_runtime,
+  idep_vulkan_util,
+  idep_vulkan_wsi,
+  idep_vulkan_wsi_headers,
+  idep_agx_pack,
+]
+
+libhk = static_library(
+  'hk',
+  [
+    hk_entrypoints,
+    hk_files,
+    libagx_shaders,
+    sha1_h,
+  ],
+  include_directories : [
+    inc_gallium,
+    inc_gallium_aux,
+    inc_include,
+    inc_src,
+    inc_asahi,
+  ],
+  link_with : [libasahi_lib, libasahi_layout, libasahi_compiler],
+  c_args : ['-Wno-c2x-extensions'],
+  dependencies : [hk_deps],
+  gnu_symbol_visibility : 'hidden',
+)
+
+libvulkan_asahi = shared_library(
+  'vulkan_asahi',
+  link_whole : [libhk],
+  link_args: [ld_args_build_id],
+  gnu_symbol_visibility : 'hidden',
+  install : true,
+)
+
+icd_lib_path = join_paths(get_option('prefix'), get_option('libdir'))
+icd_file_name = 'libvulkan_asahi.so'
+if with_platform_windows
+  icd_lib_path = import('fs').relative_to(get_option('bindir'), with_vulkan_icd_dir)
+  icd_file_name = 'vulkan_asahi.dll'
+endif
+
+asahi_icd = custom_target(
+  'asahi_icd',
+  input : [vk_icd_gen, vk_api_xml],
+  output : 'asahi_icd.@0@.json'.format(host_machine.cpu()),
+  command : [
+    prog_python, '@INPUT0@',
+    '--api-version', '1.3', '--xml', '@INPUT1@',
+    '--lib-path', join_paths(icd_lib_path, icd_file_name),
+    '--out', '@OUTPUT@',
+  ],
+  build_by_default : true,
+  install_dir : with_vulkan_icd_dir,
+  install_tag : 'runtime',
+  install : true,
+)
+
+_dev_icdname = 'asahi_devenv_icd.@0@.json'.format(host_machine.cpu())
+custom_target(
+  'asahi_devenv_icd',
+  input : [vk_icd_gen, vk_api_xml],
+  output : _dev_icdname,
+  command : [
+    prog_python, '@INPUT0@',
+    '--api-version', '1.3', '--xml', '@INPUT1@',
+    '--lib-path', meson.current_build_dir() / icd_file_name,
+    '--out', '@OUTPUT@',
+  ],
+  build_by_default : true,
+)
+
+devenv.append('VK_DRIVER_FILES', meson.current_build_dir() / _dev_icdname)
+# Deprecated: replaced by VK_DRIVER_FILES above
+devenv.append('VK_ICD_FILENAMES', meson.current_build_dir() / _dev_icdname)
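
Reviewer note, not part of the change: the devenv ICD manifest above is what lets the Vulkan loader find the uninstalled driver. Under `meson devenv`, VK_DRIVER_FILES already points at the generated asahi_devenv_icd.*.json; outside of it the same variable can be exported by hand. A quick sanity check then needs nothing hk-specific. The sketch below is a minimal program using only core Vulkan 1.3 loader API, and it assumes only that the manifest is visible to the loader.

#include <stdio.h>
#include <vulkan/vulkan.h>

int
main(void)
{
   const VkApplicationInfo app = {
      .sType = VK_STRUCTURE_TYPE_APPLICATION_INFO,
      .apiVersion = VK_API_VERSION_1_3,
   };
   const VkInstanceCreateInfo create_info = {
      .sType = VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO,
      .pApplicationInfo = &app,
   };

   VkInstance instance;
   if (vkCreateInstance(&create_info, NULL, &instance) != VK_SUCCESS) {
      fprintf(stderr, "vkCreateInstance failed\n");
      return 1;
   }

   /* List whatever physical devices the installed ICDs expose */
   uint32_t count = 0;
   vkEnumeratePhysicalDevices(instance, &count, NULL);

   VkPhysicalDevice devices[8];
   count = count < 8 ? count : 8;
   vkEnumeratePhysicalDevices(instance, &count, devices);

   for (uint32_t i = 0; i < count; i++) {
      VkPhysicalDeviceProperties props;
      vkGetPhysicalDeviceProperties(devices[i], &props);
      printf("%s (Vulkan %u.%u)\n", props.deviceName,
             VK_API_VERSION_MAJOR(props.apiVersion),
             VK_API_VERSION_MINOR(props.apiVersion));
   }

   vkDestroyInstance(instance, NULL);
   return 0;
}

If the Apple GPU shows up in that listing, the ICD JSON, the library path baked into it, and the loader plumbing are all wired up; everything past that point goes through the normal entrypoints generated by the hk_entrypoints target above.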