asahi: add broken bits of unstable Linux UAPI
Rebasing around this patch has been a significant burden for development.
Staging patches to asahi/mesa helps somewhat, but 1. it's still really
frustrating to have this much divergence from upstream, and 2. ideally we
wouldn't have to do that. The kernel upstreaming is stalled for various
reasons.

This patch adds compile-only code to speak the unstable Linux UAPI for the
SOLE purpose of reducing my rebase pain... NOT to actually work. It is NOT
for users OR distro maintainers. asahi will refuse to probe on upstream
Mesa to protect against regressions. The UAPI is NOT STABLE and upstream
Mesa CANNOT be used with it. Attempting to bypass this WILL give you a
broken system.

This patch employs several layers of deterrents against system-breaking
enablement, with a lot of warning text at the relevant sites. Hopefully
that is good enough to prevent people from breaking systems. And if people
brazenly ignore all of the above... they get to pick up the pieces. You
have been warned.

---

There is significant prior art for Mesa including downstream kernel UAPI
support in-tree:

* powervr (downstream android driver)
* turnip (downstream kgsl android driver)
* asahi ... ironically (prop macOS kernel driver)
* maybe vc4?

Linux is only special because of distros shipping tagged Mesa releases.
The several layers of guards here guarantee that no tagged Mesa release
would possibly probe, even on an asahi downstream kernel. A distro would
need a significant scary patch to make it probe. If/when it breaks, that's
on them and they pick up the pieces.

I make a stability guarantee ONLY for Fedora Asahi Remix -- where we push
packages for both a downstream kernel and Mesa in tandem, while we
patiently wait for upstreaming -- and that is *it*.

It will be a nice future when this all works upstream, but unfortunately
we're not there yet.

Acked by Dave [1] and Sima [2]

[1] https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/29620#note_2444189
[2] https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/29620#note_2445155

Signed-off-by: Alyssa Rosenzweig <alyssa@rosenzweig.io>
Co-developed-by: Asahi Lina <lina@asahilina.net>
Signed-off-by: Asahi Lina <lina@asahilina.net>
Co-developed-by: Sergio Lopez <slp@sinrega.org>
Signed-off-by: Sergio Lopez <slp@sinrega.org>
Co-developed-by: i509VCB <git@i509.me>
Signed-off-by: i509VCB <git@i509.me>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/29620>
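For context, the probe guard described above reduces to one hard check
against a sentinel UABI version. The snippet below is a condensed,
illustrative recap of the real check in agx_open_device() further down in
this diff; it is not additional code from the commit:

   /* Illustrative sketch only; see agx_open_device() below for the real
    * check and its full warning text. */
   static void
   asahi_refuse_to_probe(const struct drm_asahi_params_global *params)
   {
      /* The in-tree header pins a deliberately unusable sentinel
       * (0xDEADBEEF), so no real kernel ever reports a matching version
       * and no tagged Mesa release can probe. */
      if (params->unstable_uabi_version != DRM_ASAHI_UNSTABLE_UABI_VERSION)
         abort(); /* Fail loudly rather than limp along on a broken UAPI. */
   }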
parent 08984e68fb
commit ece3896d5b
17 changed files with 2542 additions and 121 deletions
src/asahi/drm-shim/asahi_noop.c
@@ -4,12 +4,118 @@
 * SPDX-License-Identifier: MIT
 */

#include <string.h>

#include "../lib/unstable_asahi_drm.h"
#include "drm-shim/drm_shim.h"

bool drm_shim_driver_prefers_first_render_node = true;

static const struct drm_asahi_params_global params = {
   .unstable_uabi_version = DRM_ASAHI_UNSTABLE_UABI_VERSION,
   .gpu_generation = 13,
   .gpu_variant = 'G',
   .gpu_revision = 0,
   .vm_user_start = 0x1000000,
   .vm_user_end = 0x5000000,
   .vm_shader_start = 0x8000000,
   .vm_shader_end = 0x9000000,
   .vm_page_size = 4096,
};

struct asahi_bo {
   struct shim_bo base;
   uint32_t offset;
};

static struct asahi_bo *
asahi_bo(struct shim_bo *bo)
{
   return (struct asahi_bo *)bo;
}

struct asahi_device {
   uint64_t next_offset;
};

static struct asahi_device asahi = {
   .next_offset = 0x1000,
};

static int
asahi_ioctl_noop(int fd, unsigned long request, void *arg)
{
   return 0;
}

static int
asahi_ioctl_submit(int fd, unsigned long request, void *arg)
{
   return 0;
}

static int
asahi_ioctl_gem_create(int fd, unsigned long request, void *arg)
{
   struct shim_fd *shim_fd = drm_shim_fd_lookup(fd);
   struct drm_asahi_gem_create *create = arg;
   struct asahi_bo *bo = calloc(1, sizeof(*bo));

   drm_shim_bo_init(&bo->base, create->size);

   assert(UINT64_MAX - asahi.next_offset > create->size);
   bo->offset = asahi.next_offset;
   asahi.next_offset += create->size;

   create->handle = drm_shim_bo_get_handle(shim_fd, &bo->base);

   drm_shim_bo_put(&bo->base);

   return 0;
}

static int
asahi_ioctl_gem_mmap_offset(int fd, unsigned long request, void *arg)
{
   struct shim_fd *shim_fd = drm_shim_fd_lookup(fd);
   struct drm_asahi_gem_mmap_offset *map = arg;
   struct shim_bo *bo = drm_shim_bo_lookup(shim_fd, map->handle);

   map->offset = drm_shim_bo_get_mmap_offset(shim_fd, bo);

   drm_shim_bo_put(bo);

   return 0;
}

static int
asahi_ioctl_get_param(int fd, unsigned long request, void *arg)
{
   struct drm_asahi_get_params *gp = arg;

   switch (gp->param_group) {
   case 0:
      assert(gp->size == sizeof(struct drm_asahi_params_global));
      memcpy((void *)gp->pointer, &params, gp->size);
      return 0;

   default:
      fprintf(stderr, "Unknown DRM_IOCTL_ASAHI_GET_PARAMS %d\n",
              gp->param_group);
      return -1;
   }
}

static ioctl_fn_t driver_ioctls[] = {
   /* The Asahi Linux UAPI is not yet upstream */
   [DRM_ASAHI_GET_PARAMS] = asahi_ioctl_get_param,
   [DRM_ASAHI_VM_CREATE] = asahi_ioctl_noop,
   [DRM_ASAHI_VM_DESTROY] = asahi_ioctl_noop,
   [DRM_ASAHI_GEM_CREATE] = asahi_ioctl_gem_create,
   [DRM_ASAHI_GEM_MMAP_OFFSET] = asahi_ioctl_gem_mmap_offset,
   [DRM_ASAHI_GEM_BIND] = asahi_ioctl_noop,
   [DRM_ASAHI_QUEUE_CREATE] = asahi_ioctl_noop,
   [DRM_ASAHI_QUEUE_DESTROY] = asahi_ioctl_noop,
   [DRM_ASAHI_SUBMIT] = asahi_ioctl_submit,
};

void
src/asahi/drm-shim/meson.build
@@ -4,8 +4,8 @@
libasahi_noop_drm_shim = shared_library(
  'asahi_noop_drm_shim',
  'asahi_noop.c',
  include_directories: inc_src,
  dependencies: dep_drm_shim,
  include_directories: [inc_include, inc_src],
  dependencies: [dep_drm_shim, dep_valgrind],
  gnu_symbol_visibility : 'hidden',
  install : true,
)
src/asahi/lib/agx_bo.c
@@ -189,7 +189,7 @@ agx_bo_unreference(struct agx_bo *bo)
    * lock, let's make sure it's still not referenced before freeing it.
    */
   if (p_atomic_read(&bo->refcnt) == 0) {
      assert(!p_atomic_read_relaxed(&bo->writer_syncobj));
      assert(!p_atomic_read_relaxed(&bo->writer));

      if (dev->debug & AGX_DBG_TRACE)
         agxdecode_track_free(dev->agxdecode, bo);
@@ -225,12 +225,12 @@ agx_bo_create_aligned(struct agx_device *dev, unsigned size, unsigned align,
    * flush the cache to make space for the new allocation.
    */
   if (!bo)
      bo = agx_bo_alloc(dev, size, align, flags);
      bo = dev->ops.bo_alloc(dev, size, align, flags);
   if (!bo)
      bo = agx_bo_cache_fetch(dev, size, align, flags, false);
   if (!bo) {
      agx_bo_cache_evict_all(dev);
      bo = agx_bo_alloc(dev, size, align, flags);
      bo = dev->ops.bo_alloc(dev, size, align, flags);
   }

   if (!bo) {
src/asahi/lib/agx_bo.h
@@ -80,8 +80,8 @@ struct agx_bo {
   /* DMA-BUF fd clone for adding fences to imports/exports */
   int prime_fd;

   /* Syncobj handle of the current writer, if any */
   uint32_t writer_syncobj;
   /* Current writer, if any (queue in upper 32 bits, syncobj in lower 32 bits) */
   uint64_t writer;

   /* Owner */
   struct agx_device *dev;
@@ -97,8 +97,30 @@ struct agx_bo {

   /* For debugging */
   const char *label;

   /* virtio blob_id */
   uint32_t blob_id;
   uint32_t vbo_res_id;
};

static inline uint32_t
agx_bo_writer_syncobj(uint64_t writer)
{
   return writer;
}

static inline uint32_t
agx_bo_writer_queue(uint64_t writer)
{
   return writer >> 32;
}

static inline uint64_t
agx_bo_writer(uint32_t queue, uint32_t syncobj)
{
   return (((uint64_t)queue) << 32) | syncobj;
}
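/* Illustrative note, not part of this commit: the writer word packs the
 * owning queue ID into the upper 32 bits and the syncobj handle into the
 * lower 32 bits, so a round trip is lossless:
 *
 *    uint64_t w = agx_bo_writer(queue_id, syncobj);
 *    assert(agx_bo_writer_queue(w) == queue_id);
 *    assert(agx_bo_writer_syncobj(w) == syncobj);
 *
 * agx_bo_writer_syncobj() relies on the implicit uint64_t -> uint32_t
 * truncation to return the low half.
 */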

struct agx_bo *agx_bo_create_aligned(struct agx_device *dev, unsigned size,
                                     unsigned align, enum agx_bo_flags flags,
                                     const char *label);

@@ -115,8 +137,6 @@ struct agx_bo *agx_bo_import(struct agx_device *dev, int fd);
int agx_bo_export(struct agx_bo *bo);

void agx_bo_free(struct agx_device *dev, struct agx_bo *bo);
struct agx_bo *agx_bo_alloc(struct agx_device *dev, size_t size, size_t align,
                            enum agx_bo_flags flags);
struct agx_bo *agx_bo_cache_fetch(struct agx_device *dev, size_t size,
                                  size_t align, uint32_t flags,
                                  const bool dontwait);
src/asahi/lib/agx_device.c
@@ -1,6 +1,7 @@
/*
 * Copyright 2021 Alyssa Rosenzweig
 * Copyright 2019 Collabora, Ltd.
 * Copyright 2020 Igalia S.L.
 * SPDX-License-Identifier: MIT
 */

@@ -10,6 +11,7 @@
#include "util/timespec.h"
#include "agx_bo.h"
#include "agx_compile.h"
#include "agx_device_virtio.h"
#include "agx_scratch.h"
#include "decode.h"
#include "glsl_types.h"

@@ -20,16 +22,25 @@
#include "drm-uapi/dma-buf.h"
#include "util/blob.h"
#include "util/log.h"
#include "util/mesa-sha1.h"
#include "util/os_file.h"
#include "util/os_mman.h"
#include "util/os_time.h"
#include "util/simple_mtx.h"
#include "git_sha1.h"
#include "nir_serialize.h"
#include "unstable_asahi_drm.h"
#include "vdrm.h"

/* TODO: Linux UAPI. Dummy defines to get some things to compile. */
#define ASAHI_BIND_READ  0
#define ASAHI_BIND_WRITE 0
static inline int
asahi_simple_ioctl(struct agx_device *dev, unsigned cmd, void *req)
{
   if (dev->is_virtio) {
      return agx_virtio_simple_ioctl(dev, cmd, req);
   } else {
      return drmIoctl(dev->fd, cmd, req);
   }
}

/* clang-format off */
static const struct debug_named_value agx_debug_options[] = {

@@ -101,10 +112,26 @@ static int
agx_bo_bind(struct agx_device *dev, struct agx_bo *bo, uint64_t addr,
            uint32_t flags)
{
   unreachable("Linux UAPI not yet upstream");
   struct drm_asahi_gem_bind gem_bind = {
      .op = ASAHI_BIND_OP_BIND,
      .flags = flags,
      .handle = bo->handle,
      .vm_id = dev->vm_id,
      .offset = 0,
      .range = bo->size,
      .addr = addr,
   };

   int ret = drmIoctl(dev->fd, DRM_IOCTL_ASAHI_GEM_BIND, &gem_bind);
   if (ret) {
      fprintf(stderr, "DRM_IOCTL_ASAHI_GEM_BIND failed: %m (handle=%d)\n",
              bo->handle);
   }

   return ret;
}

struct agx_bo *
static struct agx_bo *
agx_bo_alloc(struct agx_device *dev, size_t size, size_t align,
             enum agx_bo_flags flags)
{

@@ -117,7 +144,23 @@ agx_bo_alloc(struct agx_device *dev, size_t size, size_t align,
   /* executable implies low va */
   assert(!(flags & AGX_BO_EXEC) || (flags & AGX_BO_LOW_VA));

   unreachable("Linux UAPI not yet upstream");
   struct drm_asahi_gem_create gem_create = {.size = size};

   if (flags & AGX_BO_WRITEBACK)
      gem_create.flags |= ASAHI_GEM_WRITEBACK;

   if (!(flags & (AGX_BO_SHARED | AGX_BO_SHAREABLE))) {
      gem_create.flags |= ASAHI_GEM_VM_PRIVATE;
      gem_create.vm_id = dev->vm_id;
   }

   int ret = drmIoctl(dev->fd, DRM_IOCTL_ASAHI_GEM_CREATE, &gem_create);
   if (ret) {
      fprintf(stderr, "DRM_IOCTL_ASAHI_GEM_CREATE failed: %m\n");
      return NULL;
   }

   handle = gem_create.handle;

   pthread_mutex_lock(&dev->bo_map_lock);
   bo = agx_lookup_bo(dev, handle);

@@ -128,7 +171,7 @@ agx_bo_alloc(struct agx_device *dev, size_t size, size_t align,
   assert(!memcmp(bo, &((struct agx_bo){}), sizeof(*bo)));

   bo->type = AGX_ALLOC_REGULAR;
   bo->size = size; /* TODO: gem_create.size */
   bo->size = gem_create.size;
   bo->align = MAX2(dev->params.vm_page_size, align);
   bo->flags = flags;
   bo->dev = dev;

@@ -157,13 +200,13 @@ agx_bo_alloc(struct agx_device *dev, size_t size, size_t align,
      bind |= ASAHI_BIND_WRITE;
   }

   int ret = agx_bo_bind(dev, bo, bo->ptr.gpu, bind);
   ret = dev->ops.bo_bind(dev, bo, bo->ptr.gpu, bind);
   if (ret) {
      agx_bo_free(dev, bo);
      return NULL;
   }

   agx_bo_mmap(bo);
   dev->ops.bo_mmap(bo);

   if (flags & AGX_BO_LOW_VA)
      bo->ptr.gpu -= dev->shader_base;

@@ -173,10 +216,31 @@ agx_bo_alloc(struct agx_device *dev, size_t size, size_t align,
   return bo;
}

void
static void
agx_bo_mmap(struct agx_bo *bo)
{
   unreachable("Linux UAPI not yet upstream");
   struct drm_asahi_gem_mmap_offset gem_mmap_offset = {.handle = bo->handle};
   int ret;

   if (bo->ptr.cpu)
      return;

   ret =
      drmIoctl(bo->dev->fd, DRM_IOCTL_ASAHI_GEM_MMAP_OFFSET, &gem_mmap_offset);
   if (ret) {
      fprintf(stderr, "DRM_IOCTL_ASAHI_MMAP_BO failed: %m\n");
      assert(0);
   }

   bo->ptr.cpu = os_mmap(NULL, bo->size, PROT_READ | PROT_WRITE, MAP_SHARED,
                         bo->dev->fd, gem_mmap_offset.offset);
   if (bo->ptr.cpu == MAP_FAILED) {
      bo->ptr.cpu = NULL;
      fprintf(stderr,
              "mmap failed: result=%p size=0x%llx fd=%i offset=0x%llx %m\n",
              bo->ptr.cpu, (long long)bo->size, bo->dev->fd,
              (long long)gem_mmap_offset.offset);
   }
}

struct agx_bo *

@@ -239,8 +303,12 @@ agx_bo_import(struct agx_device *dev, int fd)
      abort();
   }

   ret =
      agx_bo_bind(dev, bo, bo->ptr.gpu, ASAHI_BIND_READ | ASAHI_BIND_WRITE);
   if (dev->is_virtio) {
      bo->vbo_res_id = vdrm_handle_to_res_id(dev->vdrm, bo->handle);
   }

   ret = dev->ops.bo_bind(dev, bo, bo->ptr.gpu,
                          ASAHI_BIND_READ | ASAHI_BIND_WRITE);
   if (ret) {
      fprintf(stderr, "import failed: Could not bind BO at 0x%llx\n",
              (long long)bo->ptr.gpu);

@@ -293,11 +361,11 @@ agx_bo_export(struct agx_bo *bo)
   /* If there is a pending writer to this BO, import it into the buffer
    * for implicit sync.
    */
   uint32_t writer_syncobj = p_atomic_read_relaxed(&bo->writer_syncobj);
   if (writer_syncobj) {
   uint64_t writer = p_atomic_read_relaxed(&bo->writer);
   if (writer) {
      int out_sync_fd = -1;
      int ret =
         drmSyncobjExportSyncFile(bo->dev->fd, writer_syncobj, &out_sync_fd);
      int ret = drmSyncobjExportSyncFile(
         bo->dev->fd, agx_bo_writer_syncobj(writer), &out_sync_fd);
      assert(ret >= 0);
      assert(out_sync_fd >= 0);

@@ -331,10 +399,38 @@ agx_get_global_id(struct agx_device *dev)
static ssize_t
agx_get_params(struct agx_device *dev, void *buf, size_t size)
{
   /* TODO: Linux UAPI */
   unreachable("Linux UAPI not yet upstream");
   struct drm_asahi_get_params get_param = {
      .param_group = 0,
      .pointer = (uint64_t)(uintptr_t)buf,
      .size = size,
   };

   memset(buf, 0, size);

   int ret = drmIoctl(dev->fd, DRM_IOCTL_ASAHI_GET_PARAMS, &get_param);
   if (ret) {
      fprintf(stderr, "DRM_IOCTL_ASAHI_GET_PARAMS failed: %m\n");
      return -EINVAL;
   }

   return get_param.size;
}

static int
agx_submit(struct agx_device *dev, struct drm_asahi_submit *submit,
           uint32_t vbo_res_id)
{
   return drmIoctl(dev->fd, DRM_IOCTL_ASAHI_SUBMIT, submit);
}

const agx_device_ops_t agx_device_drm_ops = {
   .bo_alloc = agx_bo_alloc,
   .bo_bind = agx_bo_bind,
   .bo_mmap = agx_bo_mmap,
   .get_params = agx_get_params,
   .submit = agx_submit,
};

bool
agx_open_device(void *memctx, struct agx_device *dev)
{

@@ -342,21 +438,119 @@ agx_open_device(void *memctx, struct agx_device *dev)
      debug_get_flags_option("ASAHI_MESA_DEBUG", agx_debug_options, 0);

   dev->agxdecode = agxdecode_new_context();
   dev->ops = agx_device_drm_ops;

   ssize_t params_size = -1;
   drmVersionPtr version;

   /* TODO: Linux UAPI */
   return false;
   version = drmGetVersion(dev->fd);
   if (!version) {
      fprintf(stderr, "cannot get version: %s", strerror(errno));
      return NULL;
   }

   params_size = agx_get_params(dev, &dev->params, sizeof(dev->params));
   if (!strcmp(version->name, "asahi")) {
      dev->is_virtio = false;
      dev->ops = agx_device_drm_ops;
   } else if (!strcmp(version->name, "virtio_gpu")) {
      dev->is_virtio = true;
      if (!agx_virtio_open_device(dev)) {
         fprintf(stderr,
                 "Error opening virtio-gpu device for Asahi native context\n");
         return false;
      }
   } else {
      return false;
   }

   params_size = dev->ops.get_params(dev, &dev->params, sizeof(dev->params));
   if (params_size <= 0) {
      assert(0);
      return false;
   }
   assert(params_size >= sizeof(dev->params));

   /* TODO: Linux UAPI: Params */
   unreachable("Linux UAPI not yet upstream");
   /* Refuse to probe. */
   if (dev->params.unstable_uabi_version != DRM_ASAHI_UNSTABLE_UABI_VERSION) {
      fprintf(
         stderr,
         "You are attempting to use upstream Mesa with a downstream kernel!\n"
         "This WILL NOT work.\n"
         "The Asahi UABI is unstable and NOT SUPPORTED in upstream Mesa.\n"
         "UABI related code in upstream Mesa is not for use!\n"
         "\n"
         "Do NOT attempt to patch out checks, you WILL break your system.\n"
         "Do NOT report bugs.\n"
         "Do NOT ask Mesa developers for support.\n"
         "Do NOT write guides about how to patch out these checks.\n"
         "Do NOT package patches to Mesa to bypass this.\n"
         "\n"
         "~~~\n"
         "This is not a place of honor.\n"
         "No highly esteemed deed is commemorated here.\n"
         "Nothing valued is here.\n"
         "\n"
         "What is here was dangerous and repulsive to us.\n"
         "This message is a warning about danger.\n"
         "\n"
         "The danger is still present, in your time, as it was in ours.\n"
         "The danger is unleashed only if you substantially disturb this place physically.\n"
         "This place is best shunned and left uninhabited.\n"
         "~~~\n"
         "\n"
         "THIS IS NOT A BUG. THIS IS YOU DOING SOMETHING BROKEN!\n");
      abort();
   }

   uint64_t incompat =
      dev->params.feat_incompat & (~AGX_SUPPORTED_INCOMPAT_FEATURES);
   if (incompat) {
      fprintf(stderr, "Missing GPU incompat features: 0x%" PRIx64 "\n",
              incompat);
      assert(0);
      return false;
   }

   if (dev->params.gpu_generation >= 13 && dev->params.gpu_variant != 'P') {
      const char *variant = " Unknown";
      switch (dev->params.gpu_variant) {
      case 'G':
         variant = "";
         break;
      case 'S':
         variant = " Pro";
         break;
      case 'C':
         variant = " Max";
         break;
      case 'D':
         variant = " Ultra";
         break;
      }
      snprintf(dev->name, sizeof(dev->name), "Apple M%d%s (G%d%c %02X)",
               dev->params.gpu_generation - 12, variant,
               dev->params.gpu_generation, dev->params.gpu_variant,
               dev->params.gpu_revision + 0xA0);
   } else {
      // Note: untested, theoretically this is the logic for at least a few
      // generations back.
      const char *variant = " Unknown";
      switch (dev->params.gpu_variant) {
      case 'P':
         variant = "";
         break;
      case 'G':
         variant = "X";
         break;
      }
      snprintf(dev->name, sizeof(dev->name), "Apple A%d%s (G%d%c %02X)",
               dev->params.gpu_generation + 1, variant,
               dev->params.gpu_generation, dev->params.gpu_variant,
               dev->params.gpu_revision + 0xA0);
   }

   dev->guard_size = dev->params.vm_page_size;
   dev->shader_base = dev->params.vm_shader_start;

   util_sparse_array_init(&dev->bo_map, sizeof(struct agx_bo), 512);
   pthread_mutex_init(&dev->bo_map_lock, NULL);

@@ -367,7 +561,14 @@ agx_open_device(void *memctx, struct agx_device *dev)
   for (unsigned i = 0; i < ARRAY_SIZE(dev->bo_cache.buckets); ++i)
      list_inithead(&dev->bo_cache.buckets[i]);

   /* TODO: Linux UAPI: Create VM */
   struct drm_asahi_vm_create vm_create = {};

   int ret = asahi_simple_ioctl(dev, DRM_IOCTL_ASAHI_VM_CREATE, &vm_create);
   if (ret) {
      fprintf(stderr, "DRM_IOCTL_ASAHI_VM_CREATE failed: %m\n");
      assert(0);
      return false;
   }

   simple_mtx_init(&dev->vma_lock, mtx_plain);
   util_vma_heap_init(&dev->main_heap, dev->params.vm_user_start,

@@ -376,6 +577,8 @@ agx_open_device(void *memctx, struct agx_device *dev)
                      &dev->usc_heap, dev->params.vm_shader_start,
                      dev->params.vm_shader_end - dev->params.vm_shader_start + 1);

   dev->vm_id = vm_create.vm_id;

   agx_get_global_ids(dev);

   glsl_type_singleton_init_or_ref();

@@ -406,9 +609,34 @@ agx_close_device(struct agx_device *dev)
}

uint32_t
agx_create_command_queue(struct agx_device *dev, uint32_t caps)
agx_create_command_queue(struct agx_device *dev, uint32_t caps,
                         uint32_t priority)
{
   unreachable("Linux UAPI not yet upstream");
   struct drm_asahi_queue_create queue_create = {
      .vm_id = dev->vm_id,
      .queue_caps = caps,
      .priority = priority,
      .flags = 0,
   };

   int ret =
      asahi_simple_ioctl(dev, DRM_IOCTL_ASAHI_QUEUE_CREATE, &queue_create);
   if (ret) {
      fprintf(stderr, "DRM_IOCTL_ASAHI_QUEUE_CREATE failed: %m\n");
      assert(0);
   }

   return queue_create.queue_id;
}

int
agx_destroy_command_queue(struct agx_device *dev, uint32_t queue_id)
{
   struct drm_asahi_queue_destroy queue_destroy = {
      .queue_id = queue_id,
   };

   return drmIoctl(dev->fd, DRM_IOCTL_ASAHI_QUEUE_DESTROY, &queue_destroy);
}

int

@@ -507,3 +735,56 @@ agx_get_gpu_timestamp(struct agx_device *dev)
#error "invalid architecture for asahi"
#endif
}

/* (Re)define UUID_SIZE to avoid including vulkan.h (or p_defines.h) here. */
#define UUID_SIZE 16

void
agx_get_device_uuid(const struct agx_device *dev, void *uuid)
{
   struct mesa_sha1 sha1_ctx;
   _mesa_sha1_init(&sha1_ctx);

   /* The device UUID uniquely identifies the given device within the machine.
    * Since we never have more than one device, this doesn't need to be a real
    * UUID, so we use SHA1("agx" + gpu_generation + gpu_variant + gpu_revision).
    */
   static const char *device_name = "agx";
   _mesa_sha1_update(&sha1_ctx, device_name, strlen(device_name));

   _mesa_sha1_update(&sha1_ctx, &dev->params.gpu_generation,
                     sizeof(dev->params.gpu_generation));
   _mesa_sha1_update(&sha1_ctx, &dev->params.gpu_variant,
                     sizeof(dev->params.gpu_variant));
   _mesa_sha1_update(&sha1_ctx, &dev->params.gpu_revision,
                     sizeof(dev->params.gpu_revision));

   uint8_t sha1[SHA1_DIGEST_LENGTH];
   _mesa_sha1_final(&sha1_ctx, sha1);

   assert(SHA1_DIGEST_LENGTH >= UUID_SIZE);
   memcpy(uuid, sha1, UUID_SIZE);
}

void
agx_get_driver_uuid(void *uuid)
{
   const char *driver_id = PACKAGE_VERSION MESA_GIT_SHA1;

   /* The driver UUID is used for determining sharability of images and memory
    * between two Vulkan instances in separate processes, but also to
    * determining memory objects and sharability between Vulkan and OpenGL
    * driver. People who want to share memory need to also check the device
    * UUID.
    */
   struct mesa_sha1 sha1_ctx;
   _mesa_sha1_init(&sha1_ctx);

   _mesa_sha1_update(&sha1_ctx, driver_id, strlen(driver_id));

   uint8_t sha1[SHA1_DIGEST_LENGTH];
   _mesa_sha1_final(&sha1_ctx, sha1);

   assert(SHA1_DIGEST_LENGTH >= UUID_SIZE);
   memcpy(uuid, sha1, UUID_SIZE);
}
src/asahi/lib/agx_device.h
@@ -5,6 +5,7 @@

#pragma once

#include <xf86drm.h>
#include "util/simple_mtx.h"
#include "util/sparse_array.h"
#include "util/timespec.h"

@@ -12,6 +13,11 @@
#include "agx_bo.h"
#include "agx_formats.h"
#include "decode.h"
#include "unstable_asahi_drm.h"

// TODO: this is a lie right now
static const uint64_t AGX_SUPPORTED_INCOMPAT_FEATURES =
   DRM_ASAHI_FEAT_MANDATORY_ZS_COMPRESSION;

enum agx_dbg {
   AGX_DBG_TRACE = BITFIELD_BIT(0),

@@ -37,29 +43,6 @@ enum agx_dbg {
   AGX_DBG_FEEDBACK = BITFIELD_BIT(20),
};

/* Dummy partial declarations, pending real UAPI */
enum drm_asahi_cmd_type { DRM_ASAHI_CMD_TYPE_PLACEHOLDER_FOR_DOWNSTREAM_UAPI };
enum drm_asahi_sync_type { DRM_ASAHI_SYNC_SYNCOBJ };
struct drm_asahi_sync {
   uint32_t sync_type;
   uint32_t handle;
};
struct drm_asahi_params_global {
   uint64_t vm_page_size;
   uint64_t vm_user_start;
   uint64_t vm_user_end;
   uint64_t vm_shader_start;
   uint64_t vm_shader_end;
   uint32_t chip_id;
   uint32_t num_clusters_total;
   uint32_t gpu_generation;
   uint32_t gpu_variant;
   uint32_t num_dies;
   uint32_t timer_frequency_hz;
   uint32_t num_cores_per_cluster;
   uint64_t core_masks[32];
};

/* How many power-of-two levels in the BO cache do we want? 2^14 minimum chosen
 * as it is the page size that all allocations are rounded to
 */

@@ -72,6 +55,20 @@ struct drm_asahi_params_global {
/* Forward decl only, do not pull in all of NIR */
struct nir_shader;

#define BARRIER_RENDER  (1 << DRM_ASAHI_SUBQUEUE_RENDER)
#define BARRIER_COMPUTE (1 << DRM_ASAHI_SUBQUEUE_COMPUTE)

typedef struct {
   struct agx_bo *(*bo_alloc)(struct agx_device *dev, size_t size, size_t align,
                              enum agx_bo_flags flags);
   int (*bo_bind)(struct agx_device *dev, struct agx_bo *bo, uint64_t addr,
                  uint32_t flags);
   void (*bo_mmap)(struct agx_bo *bo);
   ssize_t (*get_params)(struct agx_device *dev, void *buf, size_t size);
   int (*submit)(struct agx_device *dev, struct drm_asahi_submit *submit,
                 uint32_t vbo_res_id);
} agx_device_ops_t;
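/* Illustrative note, not part of this commit: agx_device_ops_t is the seam
 * between the two backends added here. agx_open_device() installs either
 * agx_device_drm_ops (native "asahi" DRM node) or agx_virtio_device_ops
 * (virtio-gpu native context), and common code then calls through it, e.g.:
 *
 *    struct agx_bo *bo = dev->ops.bo_alloc(dev, size, align, flags);
 *    dev->ops.bo_bind(dev, bo, bo->ptr.gpu, ASAHI_BIND_READ);
 *
 * so the BO cache and submission paths stay backend-agnostic.
 */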

struct agx_device {
   uint32_t debug;

@@ -81,6 +78,12 @@ struct agx_device {
   char name[64];
   struct drm_asahi_params_global params;
   uint64_t next_global_id, last_global_id;
   bool is_virtio;
   agx_device_ops_t ops;

   /* vdrm device */
   struct vdrm_device *vdrm;
   uint32_t next_blob_id;

   /* Device handle */
   int fd;

@@ -136,11 +139,11 @@ agx_lookup_bo(struct agx_device *dev, uint32_t handle)
   return util_sparse_array_get(&dev->bo_map, handle);
}

void agx_bo_mmap(struct agx_bo *bo);

uint64_t agx_get_global_id(struct agx_device *dev);

uint32_t agx_create_command_queue(struct agx_device *dev, uint32_t caps);
uint32_t agx_create_command_queue(struct agx_device *dev, uint32_t caps,
                                  uint32_t priority);
int agx_destroy_command_queue(struct agx_device *dev, uint32_t queue_id);

int agx_import_sync_file(struct agx_device *dev, struct agx_bo *bo, int fd);
int agx_export_sync_file(struct agx_device *dev, struct agx_bo *bo);

@@ -154,3 +157,6 @@ agx_gpu_time_to_ns(struct agx_device *dev, uint64_t gpu_time)
{
   return (gpu_time * NSEC_PER_SEC) / dev->params.timer_frequency_hz;
}
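/* Illustrative note, not part of this commit: the conversion rescales GPU
 * timer ticks to nanoseconds. For example, assuming a hypothetical
 * timer_frequency_hz of 24000000 (24 MHz), one tick is about
 * 1e9 / 24e6 = 41.67 ns, so 48000000 ticks map to
 * (48000000 * 1000000000) / 24000000 = 2000000000 ns = 2 s. The multiply
 * happens first in 64 bits, which can overflow for very large tick counts.
 */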

void agx_get_device_uuid(const struct agx_device *dev, void *uuid);
void agx_get_driver_uuid(void *uuid);
src/asahi/lib/agx_device_virtio.c (new file, 326 lines)
@@ -0,0 +1,326 @@
/*
 * Copyright 2024 Sergio Lopez
 * SPDX-License-Identifier: MIT
 */

#include "agx_device_virtio.h"

#include <inttypes.h>
#include <sys/mman.h>

#include "drm-uapi/virtgpu_drm.h"

#define VIRGL_RENDERER_UNSTABLE_APIS 1
#include "vdrm.h"
#include "virglrenderer_hw.h"

#include "asahi_proto.h"

/**
 * Helper for simple pass-thru ioctls
 */
int
agx_virtio_simple_ioctl(struct agx_device *dev, unsigned cmd, void *_req)
{
   struct vdrm_device *vdrm = dev->vdrm;
   unsigned req_len = sizeof(struct asahi_ccmd_ioctl_simple_req);
   unsigned rsp_len = sizeof(struct asahi_ccmd_ioctl_simple_rsp);

   req_len += _IOC_SIZE(cmd);
   if (cmd & IOC_OUT)
      rsp_len += _IOC_SIZE(cmd);

   uint8_t buf[req_len];
   struct asahi_ccmd_ioctl_simple_req *req = (void *)buf;
   struct asahi_ccmd_ioctl_simple_rsp *rsp;

   req->hdr = ASAHI_CCMD(IOCTL_SIMPLE, req_len);
   req->cmd = cmd;
   memcpy(req->payload, _req, _IOC_SIZE(cmd));

   rsp = vdrm_alloc_rsp(vdrm, &req->hdr, rsp_len);

   int ret = vdrm_send_req(vdrm, &req->hdr, true);
   if (ret) {
      fprintf(stderr, "simple_ioctl: vdrm_send_req failed\n");
      return ret;
   }

   if (cmd & IOC_OUT)
      memcpy(_req, rsp->payload, _IOC_SIZE(cmd));

   return rsp->ret;
}

static struct agx_bo *
agx_virtio_bo_alloc(struct agx_device *dev, size_t size, size_t align,
                    enum agx_bo_flags flags)
{
   struct agx_bo *bo;
   unsigned handle = 0;
   uint64_t ptr_gpu;

   size = ALIGN_POT(size, dev->params.vm_page_size);

   /* executable implies low va */
   assert(!(flags & AGX_BO_EXEC) || (flags & AGX_BO_LOW_VA));

   struct asahi_ccmd_gem_new_req req = {
      .hdr = ASAHI_CCMD(GEM_NEW, sizeof(req)),
      .size = size,
   };

   if (flags & AGX_BO_WRITEBACK)
      req.flags |= ASAHI_GEM_WRITEBACK;

   uint32_t blob_flags =
      VIRTGPU_BLOB_FLAG_USE_MAPPABLE | VIRTGPU_BLOB_FLAG_USE_SHAREABLE;

   req.bind_flags = ASAHI_BIND_READ;
   if (!(flags & AGX_BO_READONLY)) {
      req.bind_flags |= ASAHI_BIND_WRITE;
   }

   uint32_t blob_id = p_atomic_inc_return(&dev->next_blob_id);

   ASSERTED bool lo = (flags & AGX_BO_LOW_VA);

   struct util_vma_heap *heap;
   if (lo)
      heap = &dev->usc_heap;
   else
      heap = &dev->main_heap;

   simple_mtx_lock(&dev->vma_lock);
   ptr_gpu = util_vma_heap_alloc(heap, size + dev->guard_size,
                                 dev->params.vm_page_size);
   simple_mtx_unlock(&dev->vma_lock);
   if (!ptr_gpu) {
      fprintf(stderr, "Failed to allocate BO VMA\n");
      return NULL;
   }

   req.addr = ptr_gpu;
   req.blob_id = blob_id;
   req.vm_id = dev->vm_id;

   handle = vdrm_bo_create(dev->vdrm, size, blob_flags, blob_id, &req.hdr);
   if (!handle) {
      fprintf(stderr, "vdrm_bo_created failed\n");
      return NULL;
   }

   pthread_mutex_lock(&dev->bo_map_lock);
   bo = agx_lookup_bo(dev, handle);
   dev->max_handle = MAX2(dev->max_handle, handle);
   pthread_mutex_unlock(&dev->bo_map_lock);

   /* Fresh handle */
   assert(!memcmp(bo, &((struct agx_bo){}), sizeof(*bo)));

   bo->type = AGX_ALLOC_REGULAR;
   bo->size = size;
   bo->align = MAX2(dev->params.vm_page_size, align);
   bo->flags = flags;
   bo->dev = dev;
   bo->handle = handle;
   bo->prime_fd = -1;
   bo->blob_id = blob_id;
   bo->ptr.gpu = ptr_gpu;
   bo->vbo_res_id = vdrm_handle_to_res_id(dev->vdrm, handle);

   dev->ops.bo_mmap(bo);

   if (flags & AGX_BO_LOW_VA)
      bo->ptr.gpu -= dev->shader_base;

   assert(bo->ptr.gpu < (1ull << (lo ? 32 : 40)));

   return bo;
}

static int
agx_virtio_bo_bind(struct agx_device *dev, struct agx_bo *bo, uint64_t addr,
                   uint32_t flags)
{
   struct asahi_ccmd_gem_bind_req req = {
      .op = ASAHI_BIND_OP_BIND,
      .flags = flags,
      .vm_id = dev->vm_id,
      .res_id = bo->vbo_res_id,
      .size = bo->size,
      .addr = addr,
      .hdr.cmd = ASAHI_CCMD_GEM_BIND,
      .hdr.len = sizeof(struct asahi_ccmd_gem_bind_req),
   };

   int ret = vdrm_send_req(dev->vdrm, &req.hdr, false);
   if (ret) {
      fprintf(stderr, "DRM_IOCTL_ASAHI_GEM_BIND failed: %d (handle=%d)\n", ret,
              bo->handle);
   }

   return ret;
}

static void
agx_virtio_bo_mmap(struct agx_bo *bo)
{
   if (bo->ptr.cpu) {
      return;
   }

   bo->ptr.cpu = vdrm_bo_map(bo->dev->vdrm, bo->handle, bo->size, NULL);
   if (bo->ptr.cpu == MAP_FAILED) {
      bo->ptr.cpu = NULL;
      fprintf(stderr, "mmap failed: result=%p size=0x%llx fd=%i\n", bo->ptr.cpu,
              (long long)bo->size, bo->dev->fd);
   }
}

static ssize_t
agx_virtio_get_params(struct agx_device *dev, void *buf, size_t size)
{
   struct vdrm_device *vdrm = dev->vdrm;
   struct asahi_ccmd_get_params_req req = {
      .params.size = size,
      .hdr.cmd = ASAHI_CCMD_GET_PARAMS,
      .hdr.len = sizeof(struct asahi_ccmd_get_params_req),
   };
   struct asahi_ccmd_get_params_rsp *rsp;

   rsp =
      vdrm_alloc_rsp(vdrm, &req.hdr, sizeof(struct asahi_ccmd_get_params_rsp));

   int ret = vdrm_send_req(vdrm, &req.hdr, true);
   if (ret)
      goto out;

   ret = rsp->ret;
   if (!ret) {
      memcpy(buf, &rsp->params, size);
      return size;
   }

out:
   return ret;
}

static int
agx_virtio_submit(struct agx_device *dev, struct drm_asahi_submit *submit,
                  uint32_t vbo_res_id)
{
   struct drm_asahi_command *commands =
      (struct drm_asahi_command *)submit->commands;
   struct drm_asahi_sync *in_syncs = (struct drm_asahi_sync *)submit->in_syncs;
   struct drm_asahi_sync *out_syncs =
      (struct drm_asahi_sync *)submit->out_syncs;
   size_t req_len = sizeof(struct asahi_ccmd_submit_req);

   for (int i = 0; i < submit->command_count; i++) {
      switch (commands[i].cmd_type) {
      case DRM_ASAHI_CMD_COMPUTE: {
         req_len += sizeof(struct drm_asahi_command) +
                    sizeof(struct drm_asahi_cmd_compute);
         break;
      }

      case DRM_ASAHI_CMD_RENDER: {
         struct drm_asahi_cmd_render *render =
            (struct drm_asahi_cmd_render *)commands[i].cmd_buffer;
         req_len += sizeof(struct drm_asahi_command) +
                    sizeof(struct drm_asahi_cmd_render);
         req_len += render->fragment_attachment_count *
                    sizeof(struct drm_asahi_attachment);
         break;
      }

      default:
         return EINVAL;
      }
   }

   struct asahi_ccmd_submit_req *req =
      (struct asahi_ccmd_submit_req *)calloc(1, req_len);

   req->queue_id = submit->queue_id;
   req->result_res_id = vbo_res_id;
   req->command_count = submit->command_count;

   char *ptr = (char *)&req->payload;

   for (int i = 0; i < submit->command_count; i++) {
      memcpy(ptr, &commands[i], sizeof(struct drm_asahi_command));
      ptr += sizeof(struct drm_asahi_command);

      memcpy(ptr, (char *)commands[i].cmd_buffer, commands[i].cmd_buffer_size);
      ptr += commands[i].cmd_buffer_size;

      if (commands[i].cmd_type == DRM_ASAHI_CMD_RENDER) {
         struct drm_asahi_cmd_render *render =
            (struct drm_asahi_cmd_render *)commands[i].cmd_buffer;
         size_t fragments_size = sizeof(struct drm_asahi_attachment) *
                                 render->fragment_attachment_count;
         memcpy(ptr, (char *)render->fragment_attachments, fragments_size);
         ptr += fragments_size;
      }
   }

   req->hdr.cmd = ASAHI_CCMD_SUBMIT;
   req->hdr.len = req_len;

   struct drm_virtgpu_execbuffer_syncobj *vdrm_in_syncs = calloc(
      submit->in_sync_count, sizeof(struct drm_virtgpu_execbuffer_syncobj));
   for (int i = 0; i < submit->in_sync_count; i++) {
      vdrm_in_syncs[i].handle = in_syncs[i].handle;
      vdrm_in_syncs[i].point = in_syncs[i].timeline_value;
   }

   struct drm_virtgpu_execbuffer_syncobj *vdrm_out_syncs = calloc(
      submit->out_sync_count, sizeof(struct drm_virtgpu_execbuffer_syncobj));
   for (int i = 0; i < submit->out_sync_count; i++) {
      vdrm_out_syncs[i].handle = out_syncs[i].handle;
      vdrm_out_syncs[i].point = out_syncs[i].timeline_value;
   }

   struct vdrm_execbuf_params p = {
      /* Signal the host we want to wait for the command to complete */
      .ring_idx = 1,
      .req = &req->hdr,
      .num_in_syncobjs = submit->in_sync_count,
      .in_syncobjs = vdrm_in_syncs,
      .num_out_syncobjs = submit->out_sync_count,
      .out_syncobjs = vdrm_out_syncs,
   };

   int ret = vdrm_execbuf(dev->vdrm, &p);

   free(vdrm_out_syncs);
   free(vdrm_in_syncs);
   free(req);
   return ret;
}

const agx_device_ops_t agx_virtio_device_ops = {
   .bo_alloc = agx_virtio_bo_alloc,
   .bo_bind = agx_virtio_bo_bind,
   .bo_mmap = agx_virtio_bo_mmap,
   .get_params = agx_virtio_get_params,
   .submit = agx_virtio_submit,
};

bool
agx_virtio_open_device(struct agx_device *dev)
{
   struct vdrm_device *vdrm;

   vdrm = vdrm_device_connect(dev->fd, 2);
   if (!vdrm) {
      fprintf(stderr, "could not connect vdrm\n");
      return false;
   }

   dev->vdrm = vdrm;
   dev->ops = agx_virtio_device_ops;
   return true;
}
src/asahi/lib/agx_device_virtio.h (new file, 13 lines)
@@ -0,0 +1,13 @@
/*
 * Copyright 2024 Sergio Lopez
 * SPDX-License-Identifier: MIT
 */

#pragma once

#include <stdbool.h>
#include "agx_device.h"

int agx_virtio_simple_ioctl(struct agx_device *dev, unsigned cmd, void *_req);

bool agx_virtio_open_device(struct agx_device *dev);
src/asahi/lib/asahi_proto.h (new file, 133 lines)
@@ -0,0 +1,133 @@
/*
 * Copyright 2024 Sergio Lopez
 * Copyright 2022 Google LLC
 * SPDX-License-Identifier: MIT
 */

#ifndef ASAHI_PROTO_H_
#define ASAHI_PROTO_H_

/**
 * Defines the layout of shmem buffer used for host->guest communication.
 */
struct asahi_shmem {
   struct vdrm_shmem base;

   /**
    * Counter that is incremented on asynchronous errors, like SUBMIT
    * or GEM_NEW failures. The guest should treat errors as context-
    * lost.
    */
   uint32_t async_error;

   /**
    * Counter that is incremented on global fault (see MSM_PARAM_FAULTS)
    */
   uint32_t global_faults;
};
DEFINE_CAST(vdrm_shmem, asahi_shmem)

/*
 * Possible cmd types for "command stream", ie. payload of EXECBUF ioctl:
 */
enum asahi_ccmd {
   ASAHI_CCMD_NOP = 1, /* No payload, can be used to sync with host */
   ASAHI_CCMD_IOCTL_SIMPLE,
   ASAHI_CCMD_GET_PARAMS,
   ASAHI_CCMD_GEM_NEW,
   ASAHI_CCMD_GEM_BIND,
   ASAHI_CCMD_SUBMIT,
};

#define ASAHI_CCMD(_cmd, _len)                                                 \
   (struct vdrm_ccmd_req)                                                      \
   {                                                                           \
      .cmd = ASAHI_CCMD_##_cmd, .len = (_len),                                 \
   }
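/* Illustrative usage, not part of this commit: the macro token-pastes the
 * short command name and fills in the request header, so
 *
 *    req->hdr = ASAHI_CCMD(IOCTL_SIMPLE, req_len);
 *
 * yields a vdrm_ccmd_req with .cmd = ASAHI_CCMD_IOCTL_SIMPLE and
 * .len = req_len (this is how agx_virtio_simple_ioctl() builds its
 * requests in agx_device_virtio.c above).
 */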

/*
 * ASAHI_CCMD_NOP
 */
struct asahi_ccmd_nop_req {
   struct vdrm_ccmd_req hdr;
};

/*
 * ASAHI_CCMD_IOCTL_SIMPLE
 *
 * Forward simple/flat IOC_RW or IOC_W ioctls. Limited ioctls are supported.
 */
struct asahi_ccmd_ioctl_simple_req {
   struct vdrm_ccmd_req hdr;

   uint32_t cmd;
   uint8_t payload[];
};
DEFINE_CAST(vdrm_ccmd_req, asahi_ccmd_ioctl_simple_req)

struct asahi_ccmd_ioctl_simple_rsp {
   struct vdrm_ccmd_rsp hdr;

   /* ioctl return value, interrupted syscalls are handled on the host without
    * returning to the guest.
    */
   int32_t ret;

   /* The output payload for IOC_RW ioctls, the payload is the same size as
    * asahi_context_cmd_ioctl_simple_req.
    *
    * For IOC_W ioctls (userspace writes, kernel reads) this is zero length.
    */
   uint8_t payload[];
};

struct asahi_ccmd_get_params_req {
   struct vdrm_ccmd_req hdr;
   struct drm_asahi_get_params params;
};
DEFINE_CAST(vdrm_ccmd_req, asahi_ccmd_get_params_req)

struct asahi_ccmd_get_params_rsp {
   struct vdrm_ccmd_rsp hdr;
   int32_t ret;
   struct drm_asahi_params_global params;
};

struct asahi_ccmd_gem_new_req {
   struct vdrm_ccmd_req hdr;
   uint32_t flags;
   uint32_t bind_flags;
   uint32_t vm_id;
   uint32_t blob_id;
   uint64_t size;
   uint64_t addr;
};
DEFINE_CAST(vdrm_ccmd_req, asahi_ccmd_gem_new_req)

struct asahi_ccmd_gem_bind_req {
   struct vdrm_ccmd_req hdr;
   uint32_t op;
   uint32_t flags;
   uint32_t vm_id;
   uint32_t res_id;
   uint64_t size;
   uint64_t addr;
};
DEFINE_CAST(vdrm_ccmd_req, asahi_ccmd_gem_bind_req)

struct asahi_ccmd_gem_bind_rsp {
   struct vdrm_ccmd_rsp hdr;
   int32_t ret;
};

struct asahi_ccmd_submit_req {
   struct vdrm_ccmd_req hdr;
   uint32_t queue_id;
   uint32_t result_res_id;
   uint32_t command_count;

   uint8_t payload[];
};
DEFINE_CAST(vdrm_ccmd_req, asahi_ccmd_submit_req)

#endif // ASAHI_PROTO_H_
src/asahi/lib/decode.c
@@ -18,18 +18,11 @@

#include "util/u_hexdump.h"
#include "decode.h"
#include "unstable_asahi_drm.h"
#ifdef __APPLE__
#include "agx_iokit.h"
#endif

/* Pending UAPI */
struct drm_asahi_params_global {
   int gpu_generation;
   int gpu_variant;
   int chip_id;
   int num_clusters_total;
};

struct libagxdecode_config lib_config;

UNUSED static const char *agx_alloc_types[AGX_NUM_ALLOC] = {"mem", "map",

@@ -283,6 +276,11 @@ agxdecode_map_read_write(struct agxdecode_ctx *ctx)
      DUMP_UNPACKED(T, temp, str "\n");                                        \
   }

#define DUMP_FIELD(struct, fmt, field)                                         \
   {                                                                           \
      fprintf(agxdecode_dump_stream, #field " = " fmt "\n", struct->field);    \
   }

#define agxdecode_log(str) fputs(str, agxdecode_dump_stream)
#define agxdecode_msg(str) fprintf(agxdecode_dump_stream, "// %s", str)

@@ -980,6 +978,116 @@ agxdecode_image_heap(struct agxdecode_ctx *ctx, uint64_t heap,
   agxdecode_map_read_write(ctx);
}

void
agxdecode_drm_cmd_render(struct agxdecode_ctx *ctx,
                         struct drm_asahi_params_global *params,
                         struct drm_asahi_cmd_render *c, bool verbose)
{
   agxdecode_dump_file_open();

   DUMP_FIELD(c, "%llx", flags);
   DUMP_FIELD(c, "0x%llx", encoder_ptr);
   agxdecode_stateful(ctx, c->encoder_ptr, "Encoder", agxdecode_vdm, verbose,
                      params, NULL);
   DUMP_FIELD(c, "0x%x", encoder_id);
   DUMP_FIELD(c, "0x%x", cmd_ta_id);
   DUMP_FIELD(c, "0x%x", cmd_3d_id);
   DUMP_FIELD(c, "0x%x", ppp_ctrl);
   DUMP_FIELD(c, "0x%llx", ppp_multisamplectl);
   DUMP_CL(ZLS_CONTROL, &c->zls_ctrl, "ZLS Control");
   DUMP_FIELD(c, "0x%llx", depth_buffer_load);
   DUMP_FIELD(c, "0x%llx", depth_buffer_store);
   DUMP_FIELD(c, "0x%llx", depth_buffer_partial);
   DUMP_FIELD(c, "0x%llx", stencil_buffer_load);
   DUMP_FIELD(c, "0x%llx", stencil_buffer_store);
   DUMP_FIELD(c, "0x%llx", stencil_buffer_partial);
   DUMP_FIELD(c, "0x%llx", scissor_array);
   DUMP_FIELD(c, "0x%llx", depth_bias_array);
   DUMP_FIELD(c, "%d", fb_width);
   DUMP_FIELD(c, "%d", fb_height);
   DUMP_FIELD(c, "%d", layers);
   DUMP_FIELD(c, "%d", samples);
   DUMP_FIELD(c, "%d", sample_size);
   DUMP_FIELD(c, "%d", tib_blocks);
   DUMP_FIELD(c, "%d", utile_width);
   DUMP_FIELD(c, "%d", utile_height);
   DUMP_FIELD(c, "0x%x", load_pipeline);
   DUMP_FIELD(c, "0x%x", load_pipeline_bind);
   agxdecode_stateful(ctx, c->load_pipeline & ~0x7, "Load pipeline",
                      agxdecode_usc, verbose, params, NULL);
   DUMP_FIELD(c, "0x%x", store_pipeline);
   DUMP_FIELD(c, "0x%x", store_pipeline_bind);
   agxdecode_stateful(ctx, c->store_pipeline & ~0x7, "Store pipeline",
                      agxdecode_usc, verbose, params, NULL);
   DUMP_FIELD(c, "0x%x", partial_reload_pipeline);
   DUMP_FIELD(c, "0x%x", partial_reload_pipeline_bind);
   agxdecode_stateful(ctx, c->partial_reload_pipeline & ~0x7,
                      "Partial reload pipeline", agxdecode_usc, verbose, params,
                      NULL);
   DUMP_FIELD(c, "0x%x", partial_store_pipeline);
   DUMP_FIELD(c, "0x%x", partial_store_pipeline_bind);
   agxdecode_stateful(ctx, c->partial_store_pipeline & ~0x7,
                      "Partial store pipeline", agxdecode_usc, verbose, params,
                      NULL);

   DUMP_FIELD(c, "0x%x", depth_dimensions);
   DUMP_FIELD(c, "0x%x", isp_bgobjdepth);
   DUMP_FIELD(c, "0x%x", isp_bgobjvals);

   agxdecode_sampler_heap(ctx, c->vertex_sampler_array,
                          c->vertex_sampler_count);

   /* Linux driver doesn't use this, at least for now */
   assert(c->fragment_sampler_array == c->vertex_sampler_array);
   assert(c->fragment_sampler_count == c->vertex_sampler_count);

   DUMP_FIELD(c, "%d", vertex_attachment_count);
   struct drm_asahi_attachment *vertex_attachments =
      (void *)c->vertex_attachments;
   for (unsigned i = 0; i < c->vertex_attachment_count; i++) {
      DUMP_FIELD((&vertex_attachments[i]), "0x%x", order);
      DUMP_FIELD((&vertex_attachments[i]), "0x%llx", size);
      DUMP_FIELD((&vertex_attachments[i]), "0x%llx", pointer);
   }
   DUMP_FIELD(c, "%d", fragment_attachment_count);
   struct drm_asahi_attachment *fragment_attachments =
      (void *)c->fragment_attachments;
   for (unsigned i = 0; i < c->fragment_attachment_count; i++) {
      DUMP_FIELD((&fragment_attachments[i]), "0x%x", order);
      DUMP_FIELD((&fragment_attachments[i]), "0x%llx", size);
      DUMP_FIELD((&fragment_attachments[i]), "0x%llx", pointer);
   }

   agxdecode_map_read_write(ctx);
}

void
agxdecode_drm_cmd_compute(struct agxdecode_ctx *ctx,
                          struct drm_asahi_params_global *params,
                          struct drm_asahi_cmd_compute *c, bool verbose)
{
   agxdecode_dump_file_open();

   DUMP_FIELD(c, "%llx", flags);
   DUMP_FIELD(c, "0x%llx", encoder_ptr);
   agxdecode_stateful(ctx, c->encoder_ptr, "Encoder", agxdecode_cdm, verbose,
                      params, NULL);
   DUMP_FIELD(c, "0x%x", encoder_id);
   DUMP_FIELD(c, "0x%x", cmd_id);

   agxdecode_sampler_heap(ctx, c->sampler_array, c->sampler_count);

   agxdecode_map_read_write(ctx);

   if (c->helper_program & 1) {
      fprintf(agxdecode_dump_stream, "Helper program:\n");
      uint8_t buf[1024];
      agx_disassemble(
         buf, agxdecode_fetch_gpu_array(ctx, c->helper_program & ~1, buf),
         agxdecode_dump_stream);
   }
}

static void
chip_id_to_params(decoder_params *params, uint32_t chip_id)
{
src/asahi/lib/decode.h
@@ -10,6 +10,8 @@
#include <sys/types.h>
#include "agx_bo.h"

#include "unstable_asahi_drm.h"

struct agxdecode_ctx;

struct agxdecode_ctx *agxdecode_new_context(void);

@@ -26,6 +28,16 @@ void agxdecode_cmdstream(struct agxdecode_ctx *ctx, unsigned cmdbuf_index,
void agxdecode_image_heap(struct agxdecode_ctx *ctx, uint64_t heap,
                          unsigned nr_entries);

void agxdecode_drm_cmd_render(struct agxdecode_ctx *ctx,
                              struct drm_asahi_params_global *params,
                              struct drm_asahi_cmd_render *cmdbuf,
                              bool verbose);

void agxdecode_drm_cmd_compute(struct agxdecode_ctx *ctx,
                               struct drm_asahi_params_global *params,
                               struct drm_asahi_cmd_compute *cmdbuf,
                               bool verbose);

void agxdecode_dump_file_open(void);

void agxdecode_track_alloc(struct agxdecode_ctx *ctx, struct agx_bo *alloc);
src/asahi/lib/meson.build
@@ -9,6 +9,7 @@ libasahi_lib_files = files(
  'agx_bo.c',
  'agx_border.c',
  'agx_device.c',
  'agx_device_virtio.c',
  'agx_formats.c',
  'agx_linker.c',
  'agx_bg_eot.c',

@@ -86,10 +87,10 @@ libagx_shaders = custom_target(
libasahi_lib = static_library(
  'asahi_lib',
  [libasahi_lib_files, libagx_shaders, agx_pack],
  include_directories : inc_asahi,
  include_directories : [inc_asahi, inc_virtio_gpu, inc_virtio_vdrm],
  c_args : [no_override_init_args],
  gnu_symbol_visibility : 'hidden',
  link_with: [libasahi_decode],
  link_with: [libasahi_decode, libvdrm],
  dependencies: [dep_libdrm, dep_valgrind, idep_nir],
  build_by_default : false,
)
src/asahi/lib/unstable_asahi_drm.h (new file, 666 lines)
@@ -0,0 +1,666 @@
|
|||
/* SPDX-License-Identifier: MIT */
|
||||
/*
|
||||
* Copyright (C) The Asahi Linux Contributors
|
||||
*
|
||||
* Based on asahi_drm.h which is
|
||||
*
|
||||
* Copyright © 2014-2018 Broadcom
|
||||
* Copyright © 2019 Collabora ltd.
|
||||
*/
|
||||
#ifndef _ASAHI_DRM_H_
|
||||
#define _ASAHI_DRM_H_
|
||||
|
||||
#include "drm-uapi/drm.h"
|
||||
|
||||
#if defined(__cplusplus)
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
/*
|
||||
* The UAPI defined in this file MUST NOT BE USED. End users, DO NOT attempt to
|
||||
* use upstream Mesa with asahi kernels, it will blow up. Distro packagers, DO
|
||||
* NOT patch upstream Mesa to do the same.
|
||||
*/
|
||||
#define DRM_ASAHI_UNSTABLE_UABI_VERSION (0xDEADBEEF)
|
||||
|
||||
#define DRM_ASAHI_GET_PARAMS 0x00
|
||||
#define DRM_ASAHI_VM_CREATE 0x01
|
||||
#define DRM_ASAHI_VM_DESTROY 0x02
|
||||
#define DRM_ASAHI_GEM_CREATE 0x03
|
||||
#define DRM_ASAHI_GEM_MMAP_OFFSET 0x04
|
||||
#define DRM_ASAHI_GEM_BIND 0x05
|
||||
#define DRM_ASAHI_QUEUE_CREATE 0x06
|
||||
#define DRM_ASAHI_QUEUE_DESTROY 0x07
|
||||
#define DRM_ASAHI_SUBMIT 0x08
|
||||
#define DRM_ASAHI_GET_TIME 0x09
|
||||
|
||||
#define DRM_ASAHI_MAX_CLUSTERS 32
|
||||
|
||||
struct drm_asahi_params_global {
|
||||
__u32 unstable_uabi_version;
|
||||
__u32 pad0;
|
||||
|
||||
__u64 feat_compat;
|
||||
__u64 feat_incompat;
|
||||
|
||||
__u32 gpu_generation;
|
||||
__u32 gpu_variant;
|
||||
__u32 gpu_revision;
|
||||
__u32 chip_id;
|
||||
|
||||
__u32 num_dies;
|
||||
__u32 num_clusters_total;
|
||||
__u32 num_cores_per_cluster;
|
||||
__u32 num_frags_per_cluster;
|
||||
__u32 num_gps_per_cluster;
|
||||
__u32 num_cores_total_active;
|
||||
__u64 core_masks[DRM_ASAHI_MAX_CLUSTERS];
|
||||
|
||||
__u32 vm_page_size;
|
||||
__u32 pad1;
|
||||
__u64 vm_user_start;
|
||||
__u64 vm_user_end;
|
||||
__u64 vm_shader_start;
|
||||
__u64 vm_shader_end;
|
||||
|
||||
__u32 max_syncs_per_submission;
|
||||
__u32 max_commands_per_submission;
|
||||
__u32 max_commands_in_flight;
|
||||
__u32 max_attachments;
|
||||
|
||||
__u32 timer_frequency_hz;
|
||||
__u32 min_frequency_khz;
|
||||
__u32 max_frequency_khz;
|
||||
__u32 max_power_mw;
|
||||
|
||||
__u32 result_render_size;
|
||||
__u32 result_compute_size;
|
||||
|
||||
__u32 firmware_version[4];
|
||||
};
|
||||
|
||||
/*
|
||||
enum drm_asahi_feat_compat {
|
||||
};
|
||||
*/
|
||||
|
||||
enum drm_asahi_feat_incompat {
|
||||
DRM_ASAHI_FEAT_MANDATORY_ZS_COMPRESSION = (1UL) << 0,
|
||||
};
|
||||
|
||||
struct drm_asahi_get_params {
|
||||
/** @extensions: Pointer to the first extension struct, if any */
|
||||
__u64 extensions;
|
||||
|
||||
/** @param: Parameter group to fetch (MBZ) */
|
||||
__u32 param_group;
|
||||
|
||||
/** @pad: MBZ */
|
||||
__u32 pad;
|
||||
|
||||
/** @value: User pointer to write parameter struct */
|
||||
__u64 pointer;
|
||||
|
||||
/** @value: Size of user buffer, max size supported on return */
|
||||
__u64 size;
|
||||
};
|
||||
|
||||
struct drm_asahi_vm_create {
|
||||
/** @extensions: Pointer to the first extension struct, if any */
|
||||
__u64 extensions;
|
||||
|
||||
/** @value: Returned VM ID */
|
||||
__u32 vm_id;
|
||||
|
||||
/** @pad: MBZ */
|
||||
__u32 pad;
|
||||
};
|
||||
|
||||
struct drm_asahi_vm_destroy {
|
||||
/** @extensions: Pointer to the first extension struct, if any */
|
||||
__u64 extensions;
|
||||
|
||||
/** @value: VM ID to be destroyed */
|
||||
__u32 vm_id;
|
||||
|
||||
/** @pad: MBZ */
|
||||
__u32 pad;
|
||||
};
|
||||
|
||||
#define ASAHI_GEM_WRITEBACK (1L << 0)
|
||||
#define ASAHI_GEM_VM_PRIVATE (1L << 1)
|
||||
|
||||
struct drm_asahi_gem_create {
|
||||
/** @extensions: Pointer to the first extension struct, if any */
|
||||
__u64 extensions;
|
||||
|
||||
/** @size: Size of the BO */
|
||||
__u64 size;
|
||||
|
||||
/** @flags: BO creation flags */
|
||||
__u32 flags;
|
||||
|
||||
/** @handle: VM ID to assign to the BO, if ASAHI_GEM_VM_PRIVATE is set. */
|
||||
__u32 vm_id;
|
||||
|
||||
/** @handle: Returned GEM handle for the BO */
|
||||
__u32 handle;
|
||||
};
|
||||
|
||||
struct drm_asahi_gem_mmap_offset {
|
||||
/** @extensions: Pointer to the first extension struct, if any */
|
||||
__u64 extensions;
|
||||
|
||||
/** @handle: Handle for the object being mapped. */
|
||||
__u32 handle;
|
||||
|
||||
/** @flags: Must be zero */
|
||||
__u32 flags;
|
||||
|
||||
/** @offset: The fake offset to use for subsequent mmap call */
|
||||
__u64 offset;
|
||||
};
|
||||
|
||||
enum drm_asahi_bind_op {
|
||||
ASAHI_BIND_OP_BIND = 0,
|
||||
ASAHI_BIND_OP_UNBIND = 1,
|
||||
ASAHI_BIND_OP_UNBIND_ALL = 2,
|
||||
};
|
||||
|
||||
#define ASAHI_BIND_READ (1L << 0)
|
||||
#define ASAHI_BIND_WRITE (1L << 1)
|
||||
|
||||
struct drm_asahi_gem_bind {
|
||||
/** @extensions: Pointer to the first extension struct, if any */
|
||||
__u64 extensions;
|
||||
|
||||
/** @obj: Bind operation */
|
||||
__u32 op;
|
||||
|
||||
/** @flags: One or more of ASAHI_BIND_* */
|
||||
__u32 flags;
|
||||
|
||||
/** @obj: GEM object to bind */
|
||||
__u32 handle;
|
||||
|
||||
/** @vm_id: The ID of the VM to bind to */
|
||||
__u32 vm_id;
|
||||
|
||||
/** @offset: Offset into the object */
|
||||
__u64 offset;
|
||||
|
||||
/** @range: Number of bytes from the object to bind to addr */
|
||||
__u64 range;
|
||||
|
||||
/** @addr: Address to bind to */
|
||||
__u64 addr;
|
||||
};

enum drm_asahi_cmd_type {
   DRM_ASAHI_CMD_RENDER = 0,
   DRM_ASAHI_CMD_BLIT = 1,
   DRM_ASAHI_CMD_COMPUTE = 2,
};

/* Note: this is an enum so that it can be resolved by Rust bindgen. */
enum drm_asahi_queue_cap {
   DRM_ASAHI_QUEUE_CAP_RENDER = (1UL << DRM_ASAHI_CMD_RENDER),
   DRM_ASAHI_QUEUE_CAP_BLIT = (1UL << DRM_ASAHI_CMD_BLIT),
   DRM_ASAHI_QUEUE_CAP_COMPUTE = (1UL << DRM_ASAHI_CMD_COMPUTE),
};

struct drm_asahi_queue_create {
   /** @extensions: Pointer to the first extension struct, if any */
   __u64 extensions;

   /** @flags: MBZ */
   __u32 flags;

   /** @vm_id: The ID of the VM this queue is bound to */
   __u32 vm_id;

   /** @queue_caps: Bitmask of DRM_ASAHI_QUEUE_CAP_* */
   __u32 queue_caps;

   /** @priority: Queue priority, 0-3 */
   __u32 priority;

   /** @queue_id: The returned queue ID */
   __u32 queue_id;
};
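Queues are created against a VM. A minimal sketch, assuming the priority convention the Gallium driver uses later in this patch (lower numbers appear to be higher priority: 1 = high, 2 = medium, 3 = low):

struct drm_asahi_vm_create vm = {0};
drmIoctl(fd, DRM_IOCTL_ASAHI_VM_CREATE, &vm);

struct drm_asahi_queue_create q = {
   .vm_id = vm.vm_id,
   .queue_caps = DRM_ASAHI_QUEUE_CAP_RENDER | DRM_ASAHI_QUEUE_CAP_BLIT |
                 DRM_ASAHI_QUEUE_CAP_COMPUTE,
   .priority = 2,         /* assumed "medium", matching the driver below */
};
drmIoctl(fd, DRM_IOCTL_ASAHI_QUEUE_CREATE, &q);
/* q.queue_id names this queue in drm_asahi_submit below. */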

struct drm_asahi_queue_destroy {
   /** @extensions: Pointer to the first extension struct, if any */
   __u64 extensions;

   /** @queue_id: The queue ID to be destroyed */
   __u32 queue_id;
};

enum drm_asahi_sync_type {
   DRM_ASAHI_SYNC_SYNCOBJ = 0,
   DRM_ASAHI_SYNC_TIMELINE_SYNCOBJ = 1,
};

struct drm_asahi_sync {
   /** @extensions: Pointer to the first extension struct, if any */
   __u64 extensions;

   /** @sync_type: One of drm_asahi_sync_type */
   __u32 sync_type;

   /** @handle: The sync object handle */
   __u32 handle;

   /** @timeline_value: Timeline value for timeline sync objects */
   __u64 timeline_value;
};

enum drm_asahi_subqueue {
   DRM_ASAHI_SUBQUEUE_RENDER = 0, /* Also blit */
   DRM_ASAHI_SUBQUEUE_COMPUTE = 1,
   DRM_ASAHI_SUBQUEUE_COUNT = 2,
};

#define DRM_ASAHI_BARRIER_NONE ~(0U)

struct drm_asahi_command {
   /** @extensions: Pointer to the first extension struct, if any */
   __u64 extensions;

   /** @cmd_type: One of drm_asahi_cmd_type */
   __u32 cmd_type;

   /** @flags: Flags for command submission */
   __u32 flags;

   /** @cmd_buffer: Pointer to the appropriate command buffer structure */
   __u64 cmd_buffer;

   /** @cmd_buffer_size: Size of the command buffer structure */
   __u64 cmd_buffer_size;

   /** @result_offset: Offset into the result BO to return information about this command */
   __u64 result_offset;

   /** @result_size: Size of the result data structure */
   __u64 result_size;

   /** @barriers: Array of command indices per subqueue to wait on */
   __u32 barriers[DRM_ASAHI_SUBQUEUE_COUNT];
};

struct drm_asahi_submit {
   /** @extensions: Pointer to the first extension struct, if any */
   __u64 extensions;

   /** @in_syncs: An optional array of drm_asahi_sync to wait on before starting this job. */
   __u64 in_syncs;

   /** @out_syncs: An optional array of drm_asahi_sync objects to signal upon completion. */
   __u64 out_syncs;

   /** @commands: Pointer to the drm_asahi_command array of commands to submit. */
   __u64 commands;

   /** @flags: Flags for command submission (MBZ) */
   __u32 flags;

   /** @queue_id: The queue ID to be submitted to */
   __u32 queue_id;

   /** @result_handle: An optional BO handle to place result data in */
   __u32 result_handle;

   /** @in_sync_count: Number of sync objects to wait on before starting this job. */
   __u32 in_sync_count;

   /** @out_sync_count: Number of sync objects to signal upon completion of this job. */
   __u32 out_sync_count;

   /** @command_count: Number of commands to be submitted */
   __u32 command_count;
};
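Putting the pieces together, a single render submission that signals a syncobj on completion might look like the following sketch. render_cmd is assumed to be a filled-in drm_asahi_cmd_render and syncobj to come from drmSyncobjCreate(); DRM_ASAHI_BARRIER_NONE is assumed to mean "no barrier on that subqueue", matching how the driver code later in this patch uses it.

struct drm_asahi_sync out_sync = {
   .sync_type = DRM_ASAHI_SYNC_SYNCOBJ,
   .handle = syncobj,
};
struct drm_asahi_command cmd = {
   .cmd_type = DRM_ASAHI_CMD_RENDER,
   .cmd_buffer = (__u64)(uintptr_t)&render_cmd,
   .cmd_buffer_size = sizeof(render_cmd),
   .barriers = {DRM_ASAHI_BARRIER_NONE, DRM_ASAHI_BARRIER_NONE},
};
struct drm_asahi_submit submit = {
   .queue_id = q.queue_id,
   .commands = (__u64)(uintptr_t)&cmd,
   .command_count = 1,
   .out_syncs = (__u64)(uintptr_t)&out_sync,
   .out_sync_count = 1,
};
drmIoctl(fd, DRM_IOCTL_ASAHI_SUBMIT, &submit);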

struct drm_asahi_attachment {
   /** @pointer: Base address of the attachment */
   __u64 pointer;
   /** @size: Size of the attachment in bytes */
   __u64 size;
   /** @order: Power of 2 exponent related to attachment size (?) */
   __u32 order;
   /** @flags: MBZ */
   __u32 flags;
};

#define ASAHI_RENDER_NO_CLEAR_PIPELINE_TEXTURES (1UL << 0)
#define ASAHI_RENDER_SET_WHEN_RELOADING_Z_OR_S (1UL << 1)
#define ASAHI_RENDER_VERTEX_SPILLS (1UL << 2)
#define ASAHI_RENDER_PROCESS_EMPTY_TILES (1UL << 3)
#define ASAHI_RENDER_NO_VERTEX_CLUSTERING (1UL << 4)
#define ASAHI_RENDER_MSAA_ZS (1UL << 5)
/* XXX check */
#define ASAHI_RENDER_NO_PREEMPTION (1UL << 6)

struct drm_asahi_cmd_render {
   /** @extensions: Pointer to the first extension struct, if any */
   __u64 extensions;

   __u64 flags;

   __u64 encoder_ptr;

   __u64 vertex_attachments;
   __u64 fragment_attachments;
   __u32 vertex_attachment_count;
   __u32 fragment_attachment_count;

   __u32 vertex_helper_program;
   __u32 fragment_helper_program;
   __u32 vertex_helper_cfg;
   __u32 fragment_helper_cfg;
   __u64 vertex_helper_arg;
   __u64 fragment_helper_arg;

   __u64 depth_buffer_load;
   __u64 depth_buffer_load_stride;
   __u64 depth_buffer_store;
   __u64 depth_buffer_store_stride;
   __u64 depth_buffer_partial;
   __u64 depth_buffer_partial_stride;
   __u64 depth_meta_buffer_load;
   __u64 depth_meta_buffer_load_stride;
   __u64 depth_meta_buffer_store;
   __u64 depth_meta_buffer_store_stride;
   __u64 depth_meta_buffer_partial;
   __u64 depth_meta_buffer_partial_stride;

   __u64 stencil_buffer_load;
   __u64 stencil_buffer_load_stride;
   __u64 stencil_buffer_store;
   __u64 stencil_buffer_store_stride;
   __u64 stencil_buffer_partial;
   __u64 stencil_buffer_partial_stride;
   __u64 stencil_meta_buffer_load;
   __u64 stencil_meta_buffer_load_stride;
   __u64 stencil_meta_buffer_store;
   __u64 stencil_meta_buffer_store_stride;
   __u64 stencil_meta_buffer_partial;
   __u64 stencil_meta_buffer_partial_stride;

   __u64 scissor_array;
   __u64 depth_bias_array;
   __u64 visibility_result_buffer;

   __u64 vertex_sampler_array;
   __u32 vertex_sampler_count;
   __u32 vertex_sampler_max;

   __u64 fragment_sampler_array;
   __u32 fragment_sampler_count;
   __u32 fragment_sampler_max;

   __u64 zls_ctrl;
   __u64 ppp_multisamplectl;
   __u32 ppp_ctrl;

   __u32 fb_width;
   __u32 fb_height;

   __u32 utile_width;
   __u32 utile_height;

   __u32 samples;
   __u32 layers;

   __u32 encoder_id;
   __u32 cmd_ta_id;
   __u32 cmd_3d_id;

   __u32 sample_size;
   __u32 tib_blocks;
   __u32 iogpu_unk_214;

   __u32 merge_upper_x;
   __u32 merge_upper_y;

   __u32 load_pipeline;
   __u32 load_pipeline_bind;

   __u32 store_pipeline;
   __u32 store_pipeline_bind;

   __u32 partial_reload_pipeline;
   __u32 partial_reload_pipeline_bind;

   __u32 partial_store_pipeline;
   __u32 partial_store_pipeline_bind;

   __u32 depth_dimensions;
   __u32 isp_bgobjdepth;
   __u32 isp_bgobjvals;
};

#define ASAHI_RENDER_UNK_UNK1 (1UL << 0)
#define ASAHI_RENDER_UNK_SET_TILE_CONFIG (1UL << 1)
#define ASAHI_RENDER_UNK_SET_UTILE_CONFIG (1UL << 2)
#define ASAHI_RENDER_UNK_SET_AUX_FB_UNK (1UL << 3)
#define ASAHI_RENDER_UNK_SET_G14_UNK (1UL << 4)

#define ASAHI_RENDER_UNK_SET_FRG_UNK_140 (1UL << 20)
#define ASAHI_RENDER_UNK_SET_FRG_UNK_158 (1UL << 21)
#define ASAHI_RENDER_UNK_SET_FRG_TILECFG (1UL << 22)
#define ASAHI_RENDER_UNK_SET_LOAD_BGOBJVALS (1UL << 23)
#define ASAHI_RENDER_UNK_SET_FRG_UNK_38 (1UL << 24)
#define ASAHI_RENDER_UNK_SET_FRG_UNK_3C (1UL << 25)

#define ASAHI_RENDER_UNK_SET_RELOAD_ZLSCTRL (1UL << 27)
#define ASAHI_RENDER_UNK_SET_UNK_BUF_10 (1UL << 28)
#define ASAHI_RENDER_UNK_SET_FRG_UNK_MASK (1UL << 29)

#define ASAHI_RENDER_UNK_SET_IOGPU_UNK54 (1UL << 40)
#define ASAHI_RENDER_UNK_SET_IOGPU_UNK56 (1UL << 41)
#define ASAHI_RENDER_UNK_SET_TILING_CONTROL (1UL << 42)
#define ASAHI_RENDER_UNK_SET_TILING_CONTROL_2 (1UL << 43)
#define ASAHI_RENDER_UNK_SET_VTX_UNK_F0 (1UL << 44)
#define ASAHI_RENDER_UNK_SET_VTX_UNK_F8 (1UL << 45)
#define ASAHI_RENDER_UNK_SET_VTX_UNK_118 (1UL << 46)
#define ASAHI_RENDER_UNK_SET_VTX_UNK_MASK (1UL << 47)

#define ASAHI_RENDER_EXT_UNKNOWNS 0xff00

/* XXX: Do not upstream this struct */
struct drm_asahi_cmd_render_unknowns {
   /** @type: Type ID of this extension */
   __u32 type;
   __u32 pad;
   /** @next: Pointer to the next extension struct, if any */
   __u64 next;

   __u64 flags;

   __u64 tile_config;
   __u64 utile_config;

   __u64 aux_fb_unk;
   __u64 g14_unk;
   __u64 frg_unk_140;
   __u64 frg_unk_158;
   __u64 frg_tilecfg;
   __u64 load_bgobjvals;
   __u64 frg_unk_38;
   __u64 frg_unk_3c;
   __u64 reload_zlsctrl;
   __u64 unk_buf_10;
   __u64 frg_unk_mask;

   __u64 iogpu_unk54;
   __u64 iogpu_unk56;
   __u64 tiling_control;
   __u64 tiling_control_2;
   __u64 vtx_unk_f0;
   __u64 vtx_unk_f8;
   __u64 vtx_unk_118;
   __u64 vtx_unk_mask;
};

/* XXX check */
#define ASAHI_COMPUTE_NO_PREEMPTION (1UL << 0)

struct drm_asahi_cmd_compute {
   __u64 flags;

   __u64 encoder_ptr;
   __u64 encoder_end;

   __u64 attachments;
   __u32 attachment_count;
   __u32 pad;

   __u32 helper_program;
   __u32 helper_cfg;
   __u64 helper_arg;

   __u32 encoder_id;
   __u32 cmd_id;

   __u64 sampler_array;
   __u32 sampler_count;
   __u32 sampler_max;

   __u32 iogpu_unk_40;
   __u32 unk_mask;
};

enum drm_asahi_status {
   DRM_ASAHI_STATUS_PENDING = 0,
   DRM_ASAHI_STATUS_COMPLETE,
   DRM_ASAHI_STATUS_UNKNOWN_ERROR,
   DRM_ASAHI_STATUS_TIMEOUT,
   DRM_ASAHI_STATUS_FAULT,
   DRM_ASAHI_STATUS_KILLED,
   DRM_ASAHI_STATUS_NO_DEVICE,
};

enum drm_asahi_fault {
   DRM_ASAHI_FAULT_NONE = 0,
   DRM_ASAHI_FAULT_UNKNOWN,
   DRM_ASAHI_FAULT_UNMAPPED,
   DRM_ASAHI_FAULT_AF_FAULT,
   DRM_ASAHI_FAULT_WRITE_ONLY,
   DRM_ASAHI_FAULT_READ_ONLY,
   DRM_ASAHI_FAULT_NO_ACCESS,
};

struct drm_asahi_result_info {
   /** @status: One of enum drm_asahi_status */
   __u32 status;

   /** @fault_type: One of enum drm_asahi_fault */
   __u32 fault_type;

   /** @unit: Unit number, hardware dependent */
   __u32 unit;

   /** @sideband: Sideband information, hardware dependent */
   __u32 sideband;

   /** @level: Page table level at which the fault occurred, hardware dependent */
   __u8 level;

   /** @is_read: Fault was a read */
   __u8 is_read;

   /** @pad: MBZ */
   __u16 pad;

   /** @extra: Extra bits, hardware dependent */
   __u32 extra;

   /** @address: Fault address, cache line aligned */
   __u64 address;
};

#define DRM_ASAHI_RESULT_RENDER_TVB_GROW_OVF (1UL << 0)
#define DRM_ASAHI_RESULT_RENDER_TVB_GROW_MIN (1UL << 1)
#define DRM_ASAHI_RESULT_RENDER_TVB_OVERFLOWED (1UL << 2)

struct drm_asahi_result_render {
   /** @info: Common result information */
   struct drm_asahi_result_info info;

   /** @flags: Zero or more of DRM_ASAHI_RESULT_RENDER_* */
   __u64 flags;

   /** @vertex_ts_start: Timestamp of the start of vertex processing */
   __u64 vertex_ts_start;

   /** @vertex_ts_end: Timestamp of the end of vertex processing */
   __u64 vertex_ts_end;

   /** @fragment_ts_start: Timestamp of the start of fragment processing */
   __u64 fragment_ts_start;

   /** @fragment_ts_end: Timestamp of the end of fragment processing */
   __u64 fragment_ts_end;

   /** @tvb_size_bytes: TVB size at the start of this render */
   __u64 tvb_size_bytes;

   /** @tvb_usage_bytes: Total TVB usage in bytes for this render */
   __u64 tvb_usage_bytes;

   /** @num_tvb_overflows: Number of TVB overflows that occurred for this render */
   __u32 num_tvb_overflows;
};

struct drm_asahi_result_compute {
   /** @info: Common result information */
   struct drm_asahi_result_info info;

   /** @flags: Zero or more of DRM_ASAHI_RESULT_COMPUTE_* */
   __u64 flags;

   /** @ts_start: Timestamp of the start of this compute command */
   __u64 ts_start;

   /** @ts_end: Timestamp of the end of this compute command */
   __u64 ts_end;
};

struct drm_asahi_get_time {
   /** @extensions: Pointer to the first extension struct, if any */
   __u64 extensions;

   /** @flags: MBZ. */
   __u64 flags;

   /** @tv_sec: On return, seconds part of a point in time */
   __s64 tv_sec;

   /** @tv_nsec: On return, nanoseconds part of a point in time */
   __s64 tv_nsec;

   /** @gpu_timestamp: On return, the GPU timestamp at that point in time */
   __u64 gpu_timestamp;
};
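The driver converts the timestamp deltas in the result structs to seconds by dividing by timer_frequency_hz; GET_TIME exists so that a CPU timepoint and the GPU counter can be sampled together. A hedged sketch (the CPU clock domain of tv_sec/tv_nsec is not spelled out by this header):

struct drm_asahi_get_time t = {0};
if (!drmIoctl(fd, DRM_IOCTL_ASAHI_GET_TIME, &t)) {
   /* t.tv_sec/t.tv_nsec and t.gpu_timestamp form one correlated sample;
    * two such samples bracket a GPU workload in CPU time. */
}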

/* Note: this is an enum so that it can be resolved by Rust bindgen. */
enum {
   DRM_IOCTL_ASAHI_GET_PARAMS = DRM_IOWR(DRM_COMMAND_BASE + DRM_ASAHI_GET_PARAMS, struct drm_asahi_get_params),
   DRM_IOCTL_ASAHI_VM_CREATE = DRM_IOWR(DRM_COMMAND_BASE + DRM_ASAHI_VM_CREATE, struct drm_asahi_vm_create),
   DRM_IOCTL_ASAHI_VM_DESTROY = DRM_IOW(DRM_COMMAND_BASE + DRM_ASAHI_VM_DESTROY, struct drm_asahi_vm_destroy),
   DRM_IOCTL_ASAHI_GEM_CREATE = DRM_IOWR(DRM_COMMAND_BASE + DRM_ASAHI_GEM_CREATE, struct drm_asahi_gem_create),
   DRM_IOCTL_ASAHI_GEM_MMAP_OFFSET = DRM_IOWR(DRM_COMMAND_BASE + DRM_ASAHI_GEM_MMAP_OFFSET, struct drm_asahi_gem_mmap_offset),
   DRM_IOCTL_ASAHI_GEM_BIND = DRM_IOW(DRM_COMMAND_BASE + DRM_ASAHI_GEM_BIND, struct drm_asahi_gem_bind),
   DRM_IOCTL_ASAHI_QUEUE_CREATE = DRM_IOWR(DRM_COMMAND_BASE + DRM_ASAHI_QUEUE_CREATE, struct drm_asahi_queue_create),
   DRM_IOCTL_ASAHI_QUEUE_DESTROY = DRM_IOW(DRM_COMMAND_BASE + DRM_ASAHI_QUEUE_DESTROY, struct drm_asahi_queue_destroy),
   DRM_IOCTL_ASAHI_SUBMIT = DRM_IOW(DRM_COMMAND_BASE + DRM_ASAHI_SUBMIT, struct drm_asahi_submit),
   DRM_IOCTL_ASAHI_GET_TIME = DRM_IOWR(DRM_COMMAND_BASE + DRM_ASAHI_GET_TIME, struct drm_asahi_get_time),
};

#if defined(__cplusplus)
}
#endif

#endif /* _ASAHI_DRM_H_ */
@@ -5,11 +5,13 @@
 */

#include <xf86drm.h>
#include "asahi/lib/agx_device_virtio.h"
#include "asahi/lib/decode.h"
#include "util/bitset.h"
#include "util/u_dynarray.h"
#include "util/u_range.h"
#include "agx_state.h"
#include "vdrm.h"

#define foreach_active(ctx, idx) \
   BITSET_FOREACH_SET(idx, ctx->batches.active, AGX_MAX_BATCHES)
@@ -156,13 +158,162 @@ agx_batch_init(struct agx_context *ctx,
      assert(!ret && batch->syncobj);
   }

   batch->result_off =
      (2 * sizeof(union agx_batch_result)) * agx_batch_idx(batch);
   batch->result =
      (void *)(((uint8_t *)ctx->result_buf->ptr.cpu) + batch->result_off);
   memset(batch->result, 0, sizeof(union agx_batch_result) * 2);

   agx_batch_mark_active(batch);
}

const char *status_str[] = {
   [DRM_ASAHI_STATUS_PENDING] = "(pending)",
   [DRM_ASAHI_STATUS_COMPLETE] = "Complete",
   [DRM_ASAHI_STATUS_UNKNOWN_ERROR] = "UNKNOWN ERROR",
   [DRM_ASAHI_STATUS_TIMEOUT] = "TIMEOUT",
   [DRM_ASAHI_STATUS_FAULT] = "FAULT",
   [DRM_ASAHI_STATUS_KILLED] = "KILLED",
   [DRM_ASAHI_STATUS_NO_DEVICE] = "NO DEVICE",
};

const char *fault_type_str[] = {
   [DRM_ASAHI_FAULT_NONE] = "(none)",
   [DRM_ASAHI_FAULT_UNKNOWN] = "Unknown",
   [DRM_ASAHI_FAULT_UNMAPPED] = "Unmapped",
   [DRM_ASAHI_FAULT_AF_FAULT] = "AF Fault",
   [DRM_ASAHI_FAULT_WRITE_ONLY] = "Write Only",
   [DRM_ASAHI_FAULT_READ_ONLY] = "Read Only",
   [DRM_ASAHI_FAULT_NO_ACCESS] = "No Access",
};

const char *low_unit_str[16] = {
   "DCMP", "UL1C", "CMP", "GSL1", "IAP", "VCE", "TE", "RAS",
   "VDM", "PPP", "IPF", "IPF_CPF", "VF", "VF_CPF", "ZLS", "UNK",
};

const char *mid_unit_str[16] = {
   "UNK", "dPM", "dCDM_KS0", "dCDM_KS1", "dCDM_KS2", "dIPP",
   "dIPP_CS", "dVDM_CSD", "dVDM_SSD", "dVDM_ILF", "dVDM_ILD", "dRDE0",
   "dRDE1", "FC", "GSL2", "UNK",
};

const char *high_unit_str[16] = {
   "gPM_SP", "gVDM_CSD_SP", "gVDM_SSD_SP", "gVDM_ILF_SP",
   "gVDM_TFP_SP", "gVDM_MMB_SP", "gCDM_CS_KS0_SP", "gCDM_CS_KS1_SP",
   "gCDM_CS_KS2_SP", "gCDM_KS0_SP", "gCDM_KS1_SP", "gCDM_KS2_SP",
   "gIPP_SP", "gIPP_CS_SP", "gRDE0_SP", "gRDE1_SP",
};

static void
agx_print_result(struct agx_device *dev, struct agx_context *ctx,
                 struct drm_asahi_result_info *info, unsigned batch_idx,
                 bool is_compute)
{
   if (unlikely(info->status != DRM_ASAHI_STATUS_COMPLETE)) {
      ctx->any_faults = true;
   }

   if (likely(info->status == DRM_ASAHI_STATUS_COMPLETE &&
              !((dev)->debug & AGX_DBG_STATS)))
      return;

   if (is_compute) {
      struct drm_asahi_result_compute *r = (void *)info;
      float time = (r->ts_end - r->ts_start) /
                   (float)dev->params.timer_frequency_hz;

      mesa_logw(
         "[Batch %d] Compute %s: %.06f\n", batch_idx,
         info->status < ARRAY_SIZE(status_str) ? status_str[info->status] : "?",
         time);
   } else {
      struct drm_asahi_result_render *r = (void *)info;
      float time_vtx = (r->vertex_ts_end - r->vertex_ts_start) /
                       (float)dev->params.timer_frequency_hz;
      float time_frag = (r->fragment_ts_end - r->fragment_ts_start) /
                        (float)dev->params.timer_frequency_hz;
      mesa_logw(
         "[Batch %d] Render %s: TVB %9ld/%9ld bytes (%d ovf) %c%c%c | vtx %.06f frag %.06f\n",
         batch_idx,
         info->status < ARRAY_SIZE(status_str) ? status_str[info->status] : "?",
         (long)r->tvb_usage_bytes, (long)r->tvb_size_bytes,
         (int)r->num_tvb_overflows,
         r->flags & DRM_ASAHI_RESULT_RENDER_TVB_GROW_OVF ? 'G' : ' ',
         r->flags & DRM_ASAHI_RESULT_RENDER_TVB_GROW_MIN ? 'M' : ' ',
         r->flags & DRM_ASAHI_RESULT_RENDER_TVB_OVERFLOWED ? 'O' : ' ',
         time_vtx, time_frag);
   }

   if (info->fault_type != DRM_ASAHI_FAULT_NONE) {
      const char *unit_name;
      int unit_index;

      switch (info->unit) {
      case 0x00 ... 0x9f:
         unit_name = low_unit_str[info->unit & 0xf];
         unit_index = info->unit >> 4;
         break;
      case 0xa0 ... 0xaf:
         unit_name = mid_unit_str[info->unit & 0xf];
         unit_index = 0;
         break;
      case 0xb0 ... 0xb7:
         unit_name = "GL2CC_META";
         unit_index = info->unit & 0x7;
         break;
      case 0xb8:
         unit_name = "GL2CC_MB";
         unit_index = 0;
         break;
      case 0xe0 ... 0xff:
         unit_name = high_unit_str[info->unit & 0xf];
         unit_index = (info->unit >> 4) & 1;
         break;
      default:
         unit_name = "UNK";
         unit_index = 0;
         break;
      }

      mesa_logw(
         "[Batch %d] Fault: %s : Addr 0x%llx %c Unit %02x (%s/%d) SB 0x%02x L%d Extra 0x%x\n",
         batch_idx,
         info->fault_type < ARRAY_SIZE(fault_type_str)
            ? fault_type_str[info->fault_type]
            : "?",
         (long long)info->address, info->is_read ? 'r' : 'W', info->unit,
         unit_name, unit_index, info->sideband, info->level, info->extra);

      agx_debug_fault(dev, info->address);
   }

   /* Obscurely, we need to tolerate faults to pass the robustness parts of the
    * CTS, so we can't assert that we don't fault. But it's helpful for any sort
    * of debugging to crash on fault.
    */
   if (dev->debug) {
      assert(info->status == DRM_ASAHI_STATUS_COMPLETE ||
             info->status == DRM_ASAHI_STATUS_KILLED);
   }
}

static void
agx_batch_print_stats(struct agx_device *dev, struct agx_batch *batch)
{
   unreachable("Linux UAPI not yet upstream");
   unsigned batch_idx = agx_batch_idx(batch);

   if (!batch->result)
      return;

   if (batch->cdm.bo) {
      agx_print_result(dev, batch->ctx, &batch->result[0].compute.info,
                       batch_idx, true);
   }

   if (batch->vdm.bo) {
      agx_print_result(dev, batch->ctx, &batch->result[1].render.info,
                       batch_idx, false);
   }
}

static void
@@ -175,7 +326,18 @@ agx_batch_cleanup(struct agx_context *ctx, struct agx_batch *batch, bool reset)
   assert(ctx->batch != batch);

   uint64_t begin_ts = ~0, end_ts = 0;
   /* TODO: UAPI pending */
   if (batch->result) {
      if (batch->cdm.bo) {
         begin_ts = MIN2(begin_ts, batch->result[0].compute.ts_start);
         end_ts = MAX2(end_ts, batch->result[0].compute.ts_end);
      }

      if (batch->vdm.bo) {
         begin_ts = MIN2(begin_ts, batch->result[1].render.vertex_ts_start);
         end_ts = MAX2(end_ts, batch->result[1].render.fragment_ts_end);
      }
   }

   agx_finish_batch_queries(batch, begin_ts, end_ts);

   if (reset) {
@@ -197,7 +359,8 @@ agx_batch_cleanup(struct agx_context *ctx, struct agx_batch *batch, bool reset)
      if (writer == batch)
         agx_writer_remove(ctx, handle);

      p_atomic_cmpxchg(&bo->writer_syncobj, batch->syncobj, 0);
      p_atomic_cmpxchg(&bo->writer,
                       agx_bo_writer(ctx->queue_id, batch->syncobj), 0);

      agx_bo_unreference(agx_lookup_bo(dev, handle));
   }
@@ -215,6 +378,9 @@ agx_batch_cleanup(struct agx_context *ctx, struct agx_batch *batch, bool reset)
   if (!(dev->debug & (AGX_DBG_TRACE | AGX_DBG_SYNC))) {
      agx_batch_print_stats(dev, batch);
   }

   util_unreference_framebuffer_state(&batch->key);
   agx_batch_mark_complete(batch);
}

int
@@ -566,8 +732,8 @@ agx_add_sync(struct drm_asahi_sync *syncs, unsigned *count, uint32_t handle)

void
agx_batch_submit(struct agx_context *ctx, struct agx_batch *batch,
                 uint32_t barriers, enum drm_asahi_cmd_type cmd_type,
                 void *cmdbuf)
                 struct drm_asahi_cmd_compute *compute,
                 struct drm_asahi_cmd_render *render)
{
   struct agx_device *dev = agx_device(ctx->base.screen);
   struct agx_screen *screen = agx_screen(ctx->base.screen);
@@ -579,6 +745,9 @@ agx_batch_submit(struct agx_context *ctx, struct agx_batch *batch,
   feedback = true;
#endif

   /* Timer queries use the feedback timestamping */
   feedback |= (batch->timestamps.size > 0);

   if (!feedback)
      batch->result = NULL;

@@ -597,6 +766,29 @@ agx_batch_submit(struct agx_context *ctx, struct agx_batch *batch,
      .handle = batch->syncobj,
   };

   /* This lock protects against a subtle race scenario:
    * - Context 1 submits and registers itself as writer for a BO
    * - Context 2 runs the below loop, and finds the writer syncobj
    * - Context 1 is destroyed,
    *   - flushing all batches, unregistering itself as a writer, and
    *   - Destroying syncobjs for all batches
    * - Context 2 submits, with a now invalid syncobj ID
    *
    * Since batch syncobjs are only destroyed on context destruction, we can
    * protect against this scenario with a screen-wide rwlock to ensure that
    * the syncobj destroy code cannot run concurrently with any other
    * submission. If a submit runs before the wrlock is taken, the syncobjs
    * must still exist (even if the batch was flushed and no longer a writer).
    * If it runs after the wrlock is released, then by definition the
    * just-destroyed syncobjs cannot be writers for any BO at that point.
    *
    * A screen-wide (not device-wide) rwlock is sufficient because by definition
    * resources can only be implicitly shared within a screen. Any shared
    * resources across screens must have been imported and will go through the
    * AGX_BO_SHARED path instead, which has no race (but is slower).
    */
   u_rwlock_rdlock(&screen->destroy_lock);

   int handle;
   AGX_BATCH_FOREACH_BO_HANDLE(batch, handle) {
      struct agx_bo *bo = agx_lookup_bo(dev, handle);
@@ -624,6 +816,29 @@ agx_batch_submit(struct agx_context *ctx, struct agx_batch *batch,

         /* And keep track of the BO for cloning the out_sync */
         shared_bos[shared_bo_count++] = bo;
      } else {
         /* Deal with BOs which are not externally shared, but which have been
          * written from another context within the same screen. We also need to
          * wait on these using their syncobj.
          */
         uint64_t writer = p_atomic_read_relaxed(&bo->writer);
         if (writer && agx_bo_writer_queue(writer) != ctx->queue_id) {
            batch_debug(batch, "Waits on inter-context BO @ 0x%" PRIx64,
                        bo->ptr.gpu);

            agx_add_sync(in_syncs, &in_sync_count,
                         agx_bo_writer_syncobj(writer));
            shared_bos[shared_bo_count++] = NULL;
         }
      }
   }

   if (dev->debug & AGX_DBG_SCRATCH) {
      if (compute)
         agx_scratch_debug_pre(&ctx->scratch_cs);
      if (render) {
         agx_scratch_debug_pre(&ctx->scratch_vs);
         agx_scratch_debug_pre(&ctx->scratch_fs);
      }
   }

@@ -631,9 +846,71 @@ agx_batch_submit(struct agx_context *ctx, struct agx_batch *batch,
      agx_add_sync(in_syncs, &in_sync_count, agx_get_in_sync(ctx));

   /* Submit! */
   /* TODO: UAPI */
   (void)screen;
   (void)out_sync;
   struct drm_asahi_command commands[2];
   unsigned command_count = 0;

   if (compute) {
      commands[command_count++] = (struct drm_asahi_command){
         .cmd_type = DRM_ASAHI_CMD_COMPUTE,
         .flags = 0,
         .cmd_buffer = (uint64_t)(uintptr_t)compute,
         .cmd_buffer_size = sizeof(struct drm_asahi_cmd_compute),
         .result_offset = feedback ? batch->result_off : 0,
         .result_size = feedback ? sizeof(union agx_batch_result) : 0,
         /* Barrier on previous submission */
         .barriers = {0, 0},
      };
   }

   if (render) {
      commands[command_count++] = (struct drm_asahi_command){
         .cmd_type = DRM_ASAHI_CMD_RENDER,
         .flags = 0,
         .cmd_buffer = (uint64_t)(uintptr_t)render,
         .cmd_buffer_size = sizeof(struct drm_asahi_cmd_render),
         .result_offset =
            feedback ? (batch->result_off + sizeof(union agx_batch_result)) : 0,
         .result_size = feedback ? sizeof(union agx_batch_result) : 0,
         /* Barrier on previous submission */
         .barriers = {compute ? DRM_ASAHI_BARRIER_NONE : 0, compute ? 1 : 0},
      };
   }

   struct drm_asahi_submit submit = {
      .flags = 0,
      .queue_id = ctx->queue_id,
      .result_handle = feedback ? ctx->result_buf->handle : 0,
      .in_sync_count = in_sync_count,
      .out_sync_count = 1,
      .command_count = command_count,
      .in_syncs = (uint64_t)(uintptr_t)(in_syncs),
      .out_syncs = (uint64_t)(uintptr_t)(&out_sync),
      .commands = (uint64_t)(uintptr_t)(&commands[0]),
   };

   int ret = dev->ops.submit(dev, &submit, ctx->result_buf->vbo_res_id);

   u_rwlock_rdunlock(&screen->destroy_lock);

   if (ret) {
      if (compute) {
         fprintf(stderr, "DRM_IOCTL_ASAHI_SUBMIT compute failed: %m\n");
      }

      if (render) {
         struct drm_asahi_cmd_render *c = render;
         fprintf(
            stderr,
            "DRM_IOCTL_ASAHI_SUBMIT render failed: %m (%dx%d tile %dx%d layers %d samples %d)\n",
            c->fb_width, c->fb_height, c->utile_width, c->utile_height,
            c->layers, c->samples);
      }

      assert(0);
   }

   if (ret == ENODEV)
      abort();

   /* Now stash our batch fence into any shared BOs. */
   if (shared_bo_count) {
@@ -644,6 +921,9 @@ agx_batch_submit(struct agx_context *ctx, struct agx_batch *batch,
      assert(out_sync_fd >= 0);

      for (unsigned i = 0; i < shared_bo_count; i++) {
         if (!shared_bos[i])
            continue;

         batch_debug(batch, "Signals shared BO @ 0x%" PRIx64,
                     shared_bos[i]->ptr.gpu);

@@ -674,7 +954,7 @@ agx_batch_submit(struct agx_context *ctx, struct agx_batch *batch,

      /* But any BOs written by active batches are ours */
      assert(writer == batch && "exclusive writer");
      p_atomic_set(&bo->writer_syncobj, batch->syncobj);
      p_atomic_set(&bo->writer, agx_bo_writer(ctx->queue_id, batch->syncobj));
   }

   free(in_syncs);
@@ -682,11 +962,16 @@ agx_batch_submit(struct agx_context *ctx, struct agx_batch *batch,

   if (dev->debug & (AGX_DBG_TRACE | AGX_DBG_SYNC | AGX_DBG_SCRATCH)) {
      if (dev->debug & AGX_DBG_TRACE) {
         /* agxdecode DRM commands */
         switch (cmd_type) {
         default:
            unreachable("Linux UAPI not yet upstream");
         if (compute) {
            agxdecode_drm_cmd_compute(dev->agxdecode, &dev->params, compute,
                                      true);
         }

         if (render) {
            agxdecode_drm_cmd_render(dev->agxdecode, &dev->params, render,
                                     true);
         }

         agxdecode_next_frame();
      }

@@ -695,6 +980,19 @@ agx_batch_submit(struct agx_context *ctx, struct agx_batch *batch,
      assert(!ret);

      agx_batch_print_stats(dev, batch);

      if (dev->debug & AGX_DBG_SCRATCH) {
         if (compute) {
            fprintf(stderr, "CS scratch:\n");
            agx_scratch_debug_post(&ctx->scratch_cs);
         }
         if (render) {
            fprintf(stderr, "VS scratch:\n");
            agx_scratch_debug_post(&ctx->scratch_vs);
            fprintf(stderr, "FS scratch:\n");
            agx_scratch_debug_post(&ctx->scratch_fs);
         }
      }
   }

   agx_batch_mark_submitted(batch);

@@ -767,6 +1065,9 @@ agx_batch_reset(struct agx_context *ctx, struct agx_batch *batch)
   if (ctx->batch == batch)
      ctx->batch = NULL;

   /* Elide printing stats */
   batch->result = NULL;

   agx_batch_cleanup(ctx, batch, true);
}

@@ -12,6 +12,7 @@
#include "asahi/layout/layout.h"
#include "asahi/lib/agx_formats.h"
#include "asahi/lib/decode.h"
#include "asahi/lib/unstable_asahi_drm.h"
#include "drm-uapi/drm_fourcc.h"
#include "frontend/winsys_handle.h"
#include "gallium/auxiliary/renderonly/renderonly.h"
@@ -25,6 +26,7 @@
#include "pipe/p_defines.h"
#include "pipe/p_screen.h"
#include "pipe/p_state.h"
#include "util/bitscan.h"
#include "util/format/u_format.h"
#include "util/half_float.h"
#include "util/macros.h"
@@ -933,6 +935,7 @@ agx_transfer_map(struct pipe_context *pctx, struct pipe_resource *resource,
{
   struct agx_context *ctx = agx_context(pctx);
   struct agx_resource *rsrc = agx_resource(resource);
   struct agx_device *dev = agx_device(ctx->base.screen);

   /* Can't map tiled/compressed directly */
   if ((usage & PIPE_MAP_DIRECTLY) && rsrc->modifier != DRM_FORMAT_MOD_LINEAR)
@@ -996,11 +999,11 @@ agx_transfer_map(struct pipe_context *pctx, struct pipe_resource *resource,
         agx_sync_writer(ctx, staging, "GPU read staging blit");
      }

      agx_bo_mmap(staging->bo);
      dev->ops.bo_mmap(staging->bo);
      return staging->bo->ptr.cpu;
   }

   agx_bo_mmap(rsrc->bo);
   dev->ops.bo_mmap(rsrc->bo);

   if (ail_is_level_twiddled_uncompressed(&rsrc->layout, level)) {
      /* Should never happen for buffers, and it's not safe */
@@ -1226,6 +1229,323 @@ agx_flush_resource(struct pipe_context *pctx, struct pipe_resource *pres)
   }
}

#define MAX_ATTACHMENTS 16

struct attachments {
   struct drm_asahi_attachment list[MAX_ATTACHMENTS];
   size_t count;
};

static void
asahi_add_attachment(struct attachments *att, struct agx_resource *rsrc,
                     struct pipe_surface *surf)
{
   assert(att->count < MAX_ATTACHMENTS);
   int idx = att->count++;

   att->list[idx].size = rsrc->layout.size_B;
   att->list[idx].pointer = rsrc->bo->ptr.gpu;
   att->list[idx].order = 1; // TODO: What does this do?
   att->list[idx].flags = 0;
}

static bool
is_aligned(unsigned x, unsigned pot_alignment)
{
   assert(util_is_power_of_two_nonzero(pot_alignment));
   return (x & (pot_alignment - 1)) == 0;
}

static void
agx_cmdbuf(struct agx_device *dev, struct drm_asahi_cmd_render *c,
           struct attachments *att, struct agx_pool *pool,
           struct agx_batch *batch, struct pipe_framebuffer_state *framebuffer,
           uint64_t encoder_ptr, uint64_t encoder_id, uint64_t cmd_ta_id,
           uint64_t cmd_3d_id, uint64_t scissor_ptr, uint64_t depth_bias_ptr,
           uint64_t visibility_result_ptr, struct asahi_bg_eot pipeline_clear,
           struct asahi_bg_eot pipeline_load,
           struct asahi_bg_eot pipeline_store, bool clear_pipeline_textures,
           double clear_depth, unsigned clear_stencil,
           struct agx_tilebuffer_layout *tib)
{
   memset(c, 0, sizeof(*c));

   c->encoder_ptr = encoder_ptr;
   c->encoder_id = encoder_id;
   c->cmd_3d_id = cmd_3d_id;
   c->cmd_ta_id = cmd_ta_id;

   /* bit 0 specifies OpenGL clip behaviour. Since ARB_clip_control is
    * advertised, we don't set it and lower in the vertex shader.
    */
   c->ppp_ctrl = 0x202;

   c->fb_width = framebuffer->width;
   c->fb_height = framebuffer->height;

   c->iogpu_unk_214 = 0xc000;

   c->isp_bgobjvals = 0x300;

   struct agx_resource *zres = NULL, *sres = NULL;

   agx_pack(&c->zls_ctrl, ZLS_CONTROL, zls_control) {

      if (framebuffer->zsbuf) {
         struct pipe_surface *zsbuf = framebuffer->zsbuf;
         struct agx_resource *zsres = agx_resource(zsbuf->texture);

         unsigned level = zsbuf->u.tex.level;
         unsigned first_layer = zsbuf->u.tex.first_layer;

         const struct util_format_description *desc = util_format_description(
            agx_resource(zsbuf->texture)->layout.format);

         assert(desc->format == PIPE_FORMAT_Z32_FLOAT ||
                desc->format == PIPE_FORMAT_Z16_UNORM ||
                desc->format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT ||
                desc->format == PIPE_FORMAT_S8_UINT);

         c->depth_dimensions =
            (framebuffer->width - 1) | ((framebuffer->height - 1) << 15);

         if (util_format_has_depth(desc))
            zres = zsres;
         else
            sres = zsres;

         if (zsres->separate_stencil)
            sres = zsres->separate_stencil;

         if (zres) {
            bool clear = (batch->clear & PIPE_CLEAR_DEPTH);
            bool load = (batch->load & PIPE_CLEAR_DEPTH);

            zls_control.z_store_enable = (batch->resolve & PIPE_CLEAR_DEPTH);
            zls_control.z_load_enable = !clear && load;

            c->depth_buffer_load = agx_map_texture_gpu(zres, first_layer) +
                                   ail_get_level_offset_B(&zres->layout, level);

            c->depth_buffer_store = c->depth_buffer_load;
            c->depth_buffer_partial = c->depth_buffer_load;

            /* Main stride in pages */
            assert((zres->layout.depth_px == 1 ||
                    is_aligned(zres->layout.layer_stride_B, AIL_PAGESIZE)) &&
                   "Page aligned Z layers");

            unsigned stride_pages = zres->layout.layer_stride_B / AIL_PAGESIZE;
            c->depth_buffer_load_stride = ((stride_pages - 1) << 14) | 1;
            c->depth_buffer_store_stride = c->depth_buffer_load_stride;
            c->depth_buffer_partial_stride = c->depth_buffer_load_stride;

            assert(zres->layout.tiling != AIL_TILING_LINEAR && "must tile");

            if (ail_is_compressed(&zres->layout)) {
               c->depth_meta_buffer_load =
                  agx_map_texture_gpu(zres, 0) +
                  zres->layout.metadata_offset_B +
                  (first_layer * zres->layout.compression_layer_stride_B) +
                  zres->layout.level_offsets_compressed_B[level];

               /* Meta stride in cache lines */
               assert(is_aligned(zres->layout.compression_layer_stride_B,
                                 AIL_CACHELINE) &&
                      "Cacheline aligned Z meta layers");
               unsigned stride_lines =
                  zres->layout.compression_layer_stride_B / AIL_CACHELINE;
               c->depth_meta_buffer_load_stride = (stride_lines - 1) << 14;

               c->depth_meta_buffer_store = c->depth_meta_buffer_load;
               c->depth_meta_buffer_store_stride =
                  c->depth_meta_buffer_load_stride;
               c->depth_meta_buffer_partial = c->depth_meta_buffer_load;
               c->depth_meta_buffer_partial_stride =
                  c->depth_meta_buffer_load_stride;

               zls_control.z_compress_1 = true;
               zls_control.z_compress_2 = true;
            }

            if (zres->base.format == PIPE_FORMAT_Z16_UNORM) {
               const float scale = 0xffff;
               c->isp_bgobjdepth =
                  (uint16_t)(SATURATE(clear_depth) * scale + 0.5f);
               zls_control.z_format = AGX_ZLS_FORMAT_16;
               c->iogpu_unk_214 |= 0x40000;
            } else {
               c->isp_bgobjdepth = fui(clear_depth);
               zls_control.z_format = AGX_ZLS_FORMAT_32F;
            }
         }

         if (sres) {
            bool clear = (batch->clear & PIPE_CLEAR_STENCIL);
            bool load = (batch->load & PIPE_CLEAR_STENCIL);

            zls_control.s_store_enable = (batch->resolve & PIPE_CLEAR_STENCIL);
            zls_control.s_load_enable = !clear && load;

            c->stencil_buffer_load =
               agx_map_texture_gpu(sres, first_layer) +
               ail_get_level_offset_B(&sres->layout, level);

            c->stencil_buffer_store = c->stencil_buffer_load;
            c->stencil_buffer_partial = c->stencil_buffer_load;

            /* Main stride in pages */
            assert((sres->layout.depth_px == 1 ||
                    is_aligned(sres->layout.layer_stride_B, AIL_PAGESIZE)) &&
                   "Page aligned S layers");
            unsigned stride_pages = sres->layout.layer_stride_B / AIL_PAGESIZE;
            c->stencil_buffer_load_stride = ((stride_pages - 1) << 14) | 1;
            c->stencil_buffer_store_stride = c->stencil_buffer_load_stride;
            c->stencil_buffer_partial_stride = c->stencil_buffer_load_stride;

            if (ail_is_compressed(&sres->layout)) {
               c->stencil_meta_buffer_load =
                  agx_map_texture_gpu(sres, 0) +
                  sres->layout.metadata_offset_B +
                  (first_layer * sres->layout.compression_layer_stride_B) +
                  sres->layout.level_offsets_compressed_B[level];

               /* Meta stride in cache lines */
               assert(is_aligned(sres->layout.compression_layer_stride_B,
                                 AIL_CACHELINE) &&
                      "Cacheline aligned S meta layers");
               unsigned stride_lines =
                  sres->layout.compression_layer_stride_B / AIL_CACHELINE;
               c->stencil_meta_buffer_load_stride = (stride_lines - 1) << 14;

               c->stencil_meta_buffer_store = c->stencil_meta_buffer_load;
               c->stencil_meta_buffer_store_stride =
                  c->stencil_meta_buffer_load_stride;
               c->stencil_meta_buffer_partial = c->stencil_meta_buffer_load;
               c->stencil_meta_buffer_partial_stride =
                  c->stencil_meta_buffer_load_stride;

               zls_control.s_compress_1 = true;
               zls_control.s_compress_2 = true;
            }

            c->isp_bgobjvals |= clear_stencil;
         }
      }
   }

   if (clear_pipeline_textures)
      c->flags |= ASAHI_RENDER_SET_WHEN_RELOADING_Z_OR_S;
   else
      c->flags |= ASAHI_RENDER_NO_CLEAR_PIPELINE_TEXTURES;

   if (zres && !(batch->clear & PIPE_CLEAR_DEPTH))
      c->flags |= ASAHI_RENDER_SET_WHEN_RELOADING_Z_OR_S;

   if (sres && !(batch->clear & PIPE_CLEAR_STENCIL))
      c->flags |= ASAHI_RENDER_SET_WHEN_RELOADING_Z_OR_S;

   if (dev->debug & AGX_DBG_NOCLUSTER)
      c->flags |= ASAHI_RENDER_NO_VERTEX_CLUSTERING;

   /* XXX is this for just MSAA+Z+S or MSAA+(Z|S)? */
   if (tib->nr_samples > 1 && framebuffer->zsbuf)
      c->flags |= ASAHI_RENDER_MSAA_ZS;

   memcpy(&c->load_pipeline_bind, &pipeline_clear.counts,
          sizeof(struct agx_counts_packed));

   memcpy(&c->store_pipeline_bind, &pipeline_store.counts,
          sizeof(struct agx_counts_packed));

   memcpy(&c->partial_reload_pipeline_bind, &pipeline_load.counts,
          sizeof(struct agx_counts_packed));

   memcpy(&c->partial_store_pipeline_bind, &pipeline_store.counts,
          sizeof(struct agx_counts_packed));

   /* XXX is this correct? */
   c->load_pipeline = pipeline_clear.usc | (framebuffer->nr_cbufs >= 4 ? 8 : 4);
   c->store_pipeline = pipeline_store.usc | 4;
   c->partial_reload_pipeline = pipeline_load.usc | 4;
   c->partial_store_pipeline = pipeline_store.usc | 4;

   c->utile_width = tib->tile_size.width;
   c->utile_height = tib->tile_size.height;

   c->samples = tib->nr_samples;
   c->layers = MAX2(util_framebuffer_get_num_layers(framebuffer), 1);

   c->ppp_multisamplectl = batch->uniforms.ppp_multisamplectl;
   c->sample_size = tib->sample_size_B;

   /* XXX OR 0x80 with eMRT? */
   c->tib_blocks = ALIGN_POT(agx_tilebuffer_total_size(tib), 2048) / 2048;

   float tan_60 = 1.732051f;
   c->merge_upper_x = fui(tan_60 / framebuffer->width);
   c->merge_upper_y = fui(tan_60 / framebuffer->height);

   c->scissor_array = scissor_ptr;
   c->depth_bias_array = depth_bias_ptr;
   c->visibility_result_buffer = visibility_result_ptr;

   c->vertex_sampler_array =
      batch->sampler_heap.bo ? batch->sampler_heap.bo->ptr.gpu : 0;
   c->vertex_sampler_count = batch->sampler_heap.count;
   c->vertex_sampler_max = batch->sampler_heap.count + 1;

   /* In the future we could split the heaps if useful */
   c->fragment_sampler_array = c->vertex_sampler_array;
   c->fragment_sampler_count = c->vertex_sampler_count;
   c->fragment_sampler_max = c->vertex_sampler_max;

   /* If a tile is empty, we do not want to process it, as the redundant
    * roundtrip of memory-->tilebuffer-->memory wastes a tremendous amount of
    * memory bandwidth. Any draw marks a tile as non-empty, so we only need to
    * process empty tiles if the background+EOT programs have a side effect.
    * This is the case exactly when there is an attachment we are clearing (some
    * attachment A in clear and in resolve <==> non-empty intersection).
    *
    * This case matters a LOT for performance in workloads that split batches.
    */
   if (batch->clear & batch->resolve)
      c->flags |= ASAHI_RENDER_PROCESS_EMPTY_TILES;

   for (unsigned i = 0; i < framebuffer->nr_cbufs; ++i) {
      if (!framebuffer->cbufs[i])
         continue;

      asahi_add_attachment(att, agx_resource(framebuffer->cbufs[i]->texture),
                           framebuffer->cbufs[i]);
   }

   if (framebuffer->zsbuf) {
      struct agx_resource *rsrc = agx_resource(framebuffer->zsbuf->texture);

      asahi_add_attachment(att, rsrc, framebuffer->zsbuf);

      if (rsrc->separate_stencil) {
         asahi_add_attachment(att, rsrc->separate_stencil, framebuffer->zsbuf);
      }
   }

   c->fragment_attachments = (uint64_t)(uintptr_t)&att->list[0];
   c->fragment_attachment_count = att->count;

   if (batch->vs_scratch) {
      c->flags |= ASAHI_RENDER_VERTEX_SPILLS;
      c->vertex_helper_arg = batch->ctx->scratch_vs.buf->ptr.gpu;
      c->vertex_helper_cfg = batch->vs_preamble_scratch << 16;
      c->vertex_helper_program = dev->helper->ptr.gpu | 1;
   }
   if (batch->fs_scratch) {
      c->fragment_helper_arg = batch->ctx->scratch_fs.buf->ptr.gpu;
      c->fragment_helper_cfg = batch->fs_preamble_scratch << 16;
      c->fragment_helper_program = dev->helper->ptr.gpu | 1;
   }
}

/*
 * context
 */

@@ -1255,23 +1575,66 @@ agx_flush(struct pipe_context *pctx, struct pipe_fence_handle **fence,
   }
}

void
agx_flush_batch(struct agx_context *ctx, struct agx_batch *batch)
static void
agx_flush_compute(struct agx_context *ctx, struct agx_batch *batch,
                  struct drm_asahi_cmd_compute *cmdbuf)
{
   struct agx_device *dev = agx_device(ctx->base.screen);

   assert(agx_batch_is_active(batch));
   assert(!agx_batch_is_submitted(batch));
   /* Finalize the encoder */
   agx_pack(batch->cdm.current, CDM_STREAM_TERMINATE, _)
      ;

   /* Make sure there's something to submit. */
   if (!batch->clear) {
      agx_batch_reset(ctx, batch);
      return;
   }
   agx_batch_add_bo(batch, batch->cdm.bo);

   if (batch->cs_scratch)
      agx_batch_add_bo(batch, ctx->scratch_cs.buf);

   unsigned cmdbuf_id = agx_get_global_id(dev);
   unsigned encoder_id = agx_get_global_id(dev);

   *cmdbuf = (struct drm_asahi_cmd_compute){
      .flags = 0,
      .encoder_ptr = batch->cdm.bo->ptr.gpu,
      .encoder_end = batch->cdm.bo->ptr.gpu +
                     (batch->cdm.current - (uint8_t *)batch->cdm.bo->ptr.cpu),
      .helper_arg = 0,
      .helper_cfg = 0,
      .helper_program = 0,
      .iogpu_unk_40 = 0,
      .sampler_array =
         batch->sampler_heap.bo ? batch->sampler_heap.bo->ptr.gpu : 0,
      .sampler_count = batch->sampler_heap.count,
      .sampler_max = batch->sampler_heap.count + 1,
      .encoder_id = encoder_id,
      .cmd_id = cmdbuf_id,
      .unk_mask = 0xffffffff,
   };

   if (batch->cs_scratch) {
      // The commented out lines *may* be related to subgroup-level preemption,
      // which we can't support without implementing threadgroup memory in the
      // helper. Disable them for now.

      // cmdbuf->iogpu_unk_40 = 0x1c;
      cmdbuf->helper_arg = ctx->scratch_cs.buf->ptr.gpu;
      cmdbuf->helper_cfg = batch->cs_preamble_scratch << 16;
      // cmdbuf->helper_cfg |= 0x40;
      cmdbuf->helper_program = dev->helper->ptr.gpu | 1;
   }
}

static void
agx_flush_render(struct agx_context *ctx, struct agx_batch *batch,
                 struct drm_asahi_cmd_render *cmdbuf, struct attachments *att)
{
   struct agx_device *dev = agx_device(ctx->base.screen);

   if (batch->vs_scratch)
      agx_batch_add_bo(batch, ctx->scratch_vs.buf);
   if (batch->fs_scratch)
      agx_batch_add_bo(batch, ctx->scratch_fs.buf);

   assert(batch->initialized);

   /* Finalize the encoder */
@@ -1313,22 +1676,46 @@ agx_flush_batch(struct agx_context *ctx, struct agx_batch *batch)
    */
   agx_batch_add_bo(batch, batch->vdm.bo);

   if (batch->vs_scratch)
      agx_batch_add_bo(batch, ctx->scratch_vs.buf);
   if (batch->fs_scratch)
      agx_batch_add_bo(batch, ctx->scratch_fs.buf);
   unsigned cmd_ta_id = agx_get_global_id(dev);
   unsigned cmd_3d_id = agx_get_global_id(dev);
   unsigned encoder_id = agx_get_global_id(dev);

   /* TODO: Linux UAPI submission */
   (void)dev;
   (void)zbias;
   (void)scissor;
   (void)clear_pipeline_textures;
   (void)pipeline_store;
   (void)pipeline_background;
   (void)pipeline_background_partial;
   agx_cmdbuf(dev, cmdbuf, att, &batch->pool, batch, &batch->key,
              batch->vdm.bo->ptr.gpu, encoder_id, cmd_ta_id, cmd_3d_id, scissor,
              zbias, agx_get_occlusion_heap(batch), pipeline_background,
              pipeline_background_partial, pipeline_store,
              clear_pipeline_textures, batch->clear_depth, batch->clear_stencil,
              &batch->tilebuffer_layout);
}

   unreachable("Linux UAPI not yet upstream");
   agx_batch_submit(ctx, batch, 0, 0, NULL);
void
agx_flush_batch(struct agx_context *ctx, struct agx_batch *batch)
{
   assert(agx_batch_is_active(batch));
   assert(!agx_batch_is_submitted(batch));

   struct attachments att = {.count = 0};
   struct drm_asahi_cmd_render render;
   struct drm_asahi_cmd_compute compute;
   bool has_vdm = false, has_cdm = false;

   if (batch->cdm.bo) {
      agx_flush_compute(ctx, batch, &compute);
      has_cdm = true;
   }

   if (batch->vdm.bo && (batch->clear || batch->initialized)) {
      agx_flush_render(ctx, batch, &render, &att);
      has_vdm = true;
   }

   if (!has_cdm && !has_vdm) {
      agx_batch_reset(ctx, batch);
      return;
   }

   agx_batch_submit(ctx, batch, has_cdm ? &compute : NULL,
                    has_vdm ? &render : NULL);
}

static void
@@ -1336,6 +1723,7 @@ agx_destroy_context(struct pipe_context *pctx)
{
   struct agx_device *dev = agx_device(pctx->screen);
   struct agx_context *ctx = agx_context(pctx);
   struct agx_screen *screen = agx_screen(pctx->screen);

   /* Batch state needs to be freed on completion, and we don't want to yank
    * buffers out from in-progress GPU jobs to avoid faults, so just wait until
@@ -1357,6 +1745,11 @@ agx_destroy_context(struct pipe_context *pctx)

   agx_bo_unreference(ctx->result_buf);

   /* Lock around the syncobj destruction, to avoid racing
    * command submission in another context.
    */
   u_rwlock_wrlock(&screen->destroy_lock);

   drmSyncobjDestroy(dev->fd, ctx->in_sync_obj);
   drmSyncobjDestroy(dev->fd, ctx->dummy_syncobj);
   if (ctx->in_sync_fd != -1)
@@ -1367,12 +1760,16 @@ agx_destroy_context(struct pipe_context *pctx)
      drmSyncobjDestroy(dev->fd, ctx->batches.slots[i].syncobj);
   }

   u_rwlock_wrunlock(&screen->destroy_lock);

   pipe_resource_reference(&ctx->heap, NULL);

   agx_scratch_fini(&ctx->scratch_vs);
   agx_scratch_fini(&ctx->scratch_fs);
   agx_scratch_fini(&ctx->scratch_cs);

   agx_destroy_command_queue(dev, ctx->queue_id);

   ralloc_free(ctx);
}

@@ -1426,6 +1823,20 @@ agx_create_context(struct pipe_screen *screen, void *priv, unsigned flags)
   }
   pctx->const_uploader = pctx->stream_uploader;

   uint32_t priority = 2;
   if (flags & PIPE_CONTEXT_PRIORITY_LOW)
      priority = 3;
   else if (flags & PIPE_CONTEXT_PRIORITY_MEDIUM)
      priority = 2;
   else if (flags & PIPE_CONTEXT_PRIORITY_HIGH)
      priority = 1;

   ctx->queue_id = agx_create_command_queue(agx_device(screen),
                                            DRM_ASAHI_QUEUE_CAP_RENDER |
                                               DRM_ASAHI_QUEUE_CAP_BLIT |
                                               DRM_ASAHI_QUEUE_CAP_COMPUTE,
                                            priority);

   pctx->destroy = agx_destroy_context;
   pctx->flush = agx_flush;
   pctx->clear = agx_clear;
@@ -1461,9 +1872,10 @@ agx_create_context(struct pipe_screen *screen, void *priv, unsigned flags)

   ctx->blitter = util_blitter_create(pctx);

   ctx->result_buf = agx_bo_create(
      agx_device(screen), sizeof(union agx_batch_result) * AGX_MAX_BATCHES,
      AGX_BO_WRITEBACK, "Batch result buffer");
   ctx->result_buf =
      agx_bo_create(agx_device(screen),
                    (2 * sizeof(union agx_batch_result)) * AGX_MAX_BATCHES,
                    AGX_BO_WRITEBACK, "Batch result buffer");
   assert(ctx->result_buf);

   /* Sync object/FD used for NATIVE_FENCE_FD. */
@@ -1764,6 +2176,10 @@ agx_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
   case PIPE_CAP_TES_LAYER_VIEWPORT:
      return true;

   case PIPE_CAP_CONTEXT_PRIORITY_MASK:
      return PIPE_CONTEXT_PRIORITY_LOW | PIPE_CONTEXT_PRIORITY_MEDIUM |
             PIPE_CONTEXT_PRIORITY_HIGH;

   default:
      return u_pipe_screen_get_param_defaults(pscreen, param);
   }
@@ -2179,6 +2595,18 @@ agx_get_timestamp(struct pipe_screen *pscreen)
   return agx_gpu_time_to_ns(dev, agx_get_gpu_timestamp(dev));
}

static void
agx_screen_get_device_uuid(struct pipe_screen *pscreen, char *uuid)
{
   agx_get_device_uuid(agx_device(pscreen), uuid);
}

static void
agx_screen_get_driver_uuid(struct pipe_screen *pscreen, char *uuid)
{
   agx_get_driver_uuid(uuid);
}

struct pipe_screen *
agx_screen_create(int fd, struct renderonly *ro,
                  const struct pipe_screen_config *config)
@@ -2186,6 +2614,13 @@ agx_screen_create(int fd, struct renderonly *ro,
   struct agx_screen *agx_screen;
   struct pipe_screen *screen;

   /* Refuse to probe. There is no stable UAPI yet. Upstream Mesa cannot be used
    * yet with Asahi. Do not try. Do not patch out this check. Do not teach
    * others about patching this check. Do not distribute upstream Mesa with
    * this check patched out.
    */
   return NULL;

   agx_screen = rzalloc(NULL, struct agx_screen);
   if (!agx_screen)
      return NULL;
@@ -2202,6 +2637,7 @@ agx_screen_create(int fd, struct renderonly *ro,

   agx_screen->dev.fd = fd;
   agx_screen->dev.ro = ro;
   u_rwlock_init(&agx_screen->destroy_lock);

   /* Try to open an AGX device */
   if (!agx_open_device(agx_screen, &agx_screen->dev)) {
@@ -2209,8 +2645,6 @@ agx_screen_create(int fd, struct renderonly *ro,
      return NULL;
   }

   agx_screen->queue_id = agx_create_command_queue(&agx_screen->dev, 0);

   screen->destroy = agx_destroy_screen;
   screen->get_screen_fd = agx_screen_get_fd;
   screen->get_name = agx_get_name;
@@ -2220,6 +2654,8 @@ agx_screen_create(int fd, struct renderonly *ro,
   screen->get_shader_param = agx_get_shader_param;
   screen->get_compute_param = agx_get_compute_param;
   screen->get_paramf = agx_get_paramf;
   screen->get_device_uuid = agx_screen_get_device_uuid;
   screen->get_driver_uuid = agx_screen_get_driver_uuid;
   screen->is_format_supported = agx_is_format_supported;
   screen->query_dmabuf_modifiers = agx_query_dmabuf_modifiers;
   screen->query_memory_info = agx_query_memory_info;

@@ -6,6 +6,7 @@

#pragma once

#include <xf86drm.h>
#include "asahi/compiler/agx_compile.h"
#include "asahi/genxml/agx_pack.h"
#include "asahi/layout/layout.h"
@@ -18,6 +19,7 @@
#include "asahi/lib/agx_uvs.h"
#include "asahi/lib/pool.h"
#include "asahi/lib/shaders/geometry.h"
#include "asahi/lib/unstable_asahi_drm.h"
#include "compiler/nir/nir_lower_blend.h"
#include "compiler/shader_enums.h"
#include "gallium/auxiliary/util/u_blitter.h"
@@ -28,6 +30,7 @@
#include "util/bitset.h"
#include "util/disk_cache.h"
#include "util/hash_table.h"
#include "util/rwlock.h"
#include "util/u_range.h"
#include "agx_bg_eot.h"
#include "agx_helpers.h"
@@ -357,6 +360,8 @@ struct agx_stage {
};

union agx_batch_result {
   struct drm_asahi_result_render render;
   struct drm_asahi_result_compute compute;
};

/* This is a firmware limit. It should be possible to raise to 2048 in the
@@ -632,6 +637,9 @@ struct agx_context {
      uint64_t generation[AGX_MAX_BATCHES];
   } batches;

   /* Queue handle */
   uint32_t queue_id;

   struct agx_batch *batch;
   struct agx_bo *result_buf;

@@ -872,8 +880,9 @@ struct agx_screen {
   struct pipe_screen pscreen;
   struct agx_device dev;
   struct disk_cache *disk_cache;
   /* Queue handle */
   uint32_t queue_id;

   /* Lock to protect syncobj usage vs. destruction in context destroy */
   struct u_rwlock destroy_lock;
};

static inline struct agx_screen *
@@ -1053,9 +1062,12 @@ agx_batch_add_bo(struct agx_batch *batch, struct agx_bo *bo)
#define AGX_BATCH_FOREACH_BO_HANDLE(batch, handle) \
   BITSET_FOREACH_SET(handle, (batch)->bo_list.set, batch->bo_list.bit_count)

struct drm_asahi_cmd_compute;
struct drm_asahi_cmd_render;

void agx_batch_submit(struct agx_context *ctx, struct agx_batch *batch,
                      uint32_t barriers, enum drm_asahi_cmd_type cmd_type,
                      void *cmdbuf);
                      struct drm_asahi_cmd_compute *compute,
                      struct drm_asahi_cmd_render *render);

void agx_flush_batch(struct agx_context *ctx, struct agx_batch *batch);
void agx_flush_batch_for_reason(struct agx_context *ctx,

@@ -20,7 +20,7 @@ files_asahi = files(
libasahi = static_library(
  'asahi',
  [files_asahi],
  include_directories : [inc_gallium_aux, inc_gallium, inc_include, inc_src, inc_asahi],
  include_directories : [inc_gallium_aux, inc_gallium, inc_include, inc_src, inc_asahi, inc_virtio_gpu, inc_virtio_vdrm],
  c_args : [c_msvc_compat_args],
  gnu_symbol_visibility : 'hidden',
  dependencies : [idep_nir, idep_mesautil, idep_agx_pack, dep_libdrm, idep_mesaclc],