diff --git a/src/asahi/drm-shim/asahi_noop.c b/src/asahi/drm-shim/asahi_noop.c index deebdda8b03..9407b408734 100644 --- a/src/asahi/drm-shim/asahi_noop.c +++ b/src/asahi/drm-shim/asahi_noop.c @@ -4,12 +4,118 @@ * SPDX-License-Identifier: MIT */ +#include + +#include "../lib/unstable_asahi_drm.h" #include "drm-shim/drm_shim.h" bool drm_shim_driver_prefers_first_render_node = true; +static const struct drm_asahi_params_global params = { + .unstable_uabi_version = DRM_ASAHI_UNSTABLE_UABI_VERSION, + .gpu_generation = 13, + .gpu_variant = 'G', + .gpu_revision = 0, + .vm_user_start = 0x1000000, + .vm_user_end = 0x5000000, + .vm_shader_start = 0x8000000, + .vm_shader_end = 0x9000000, + .vm_page_size = 4096, +}; + +struct asahi_bo { + struct shim_bo base; + uint32_t offset; +}; + +static struct asahi_bo * +asahi_bo(struct shim_bo *bo) +{ + return (struct asahi_bo *)bo; +} + +struct asahi_device { + uint64_t next_offset; +}; + +static struct asahi_device asahi = { + .next_offset = 0x1000, +}; + +static int +asahi_ioctl_noop(int fd, unsigned long request, void *arg) +{ + return 0; +} + +static int +asahi_ioctl_submit(int fd, unsigned long request, void *arg) +{ + return 0; +} + +static int +asahi_ioctl_gem_create(int fd, unsigned long request, void *arg) +{ + struct shim_fd *shim_fd = drm_shim_fd_lookup(fd); + struct drm_asahi_gem_create *create = arg; + struct asahi_bo *bo = calloc(1, sizeof(*bo)); + + drm_shim_bo_init(&bo->base, create->size); + + assert(UINT64_MAX - asahi.next_offset > create->size); + bo->offset = asahi.next_offset; + asahi.next_offset += create->size; + + create->handle = drm_shim_bo_get_handle(shim_fd, &bo->base); + + drm_shim_bo_put(&bo->base); + + return 0; +} + +static int +asahi_ioctl_gem_mmap_offset(int fd, unsigned long request, void *arg) +{ + struct shim_fd *shim_fd = drm_shim_fd_lookup(fd); + struct drm_asahi_gem_mmap_offset *map = arg; + struct shim_bo *bo = drm_shim_bo_lookup(shim_fd, map->handle); + + map->offset = drm_shim_bo_get_mmap_offset(shim_fd, bo); + + drm_shim_bo_put(bo); + + return 0; +} + +static int +asahi_ioctl_get_param(int fd, unsigned long request, void *arg) +{ + struct drm_asahi_get_params *gp = arg; + + switch (gp->param_group) { + case 0: + assert(gp->size == sizeof(struct drm_asahi_params_global)); + memcpy((void *)gp->pointer, ¶ms, gp->size); + return 0; + + default: + fprintf(stderr, "Unknown DRM_IOCTL_ASAHI_GET_PARAMS %d\n", + gp->param_group); + return -1; + } +} + static ioctl_fn_t driver_ioctls[] = { - /* The Asahi Linux UAPI is not yet upstream */ + [DRM_ASAHI_GET_PARAMS] = asahi_ioctl_get_param, + [DRM_ASAHI_VM_CREATE] = asahi_ioctl_noop, + [DRM_ASAHI_VM_DESTROY] = asahi_ioctl_noop, + [DRM_ASAHI_GEM_CREATE] = asahi_ioctl_gem_create, + [DRM_ASAHI_GEM_MMAP_OFFSET] = asahi_ioctl_gem_mmap_offset, + [DRM_ASAHI_GEM_BIND] = asahi_ioctl_noop, + [DRM_ASAHI_QUEUE_CREATE] = asahi_ioctl_noop, + [DRM_ASAHI_QUEUE_DESTROY] = asahi_ioctl_noop, + [DRM_ASAHI_SUBMIT] = asahi_ioctl_submit, }; void diff --git a/src/asahi/drm-shim/meson.build b/src/asahi/drm-shim/meson.build index ec0781d8754..cfc2541604f 100644 --- a/src/asahi/drm-shim/meson.build +++ b/src/asahi/drm-shim/meson.build @@ -4,8 +4,8 @@ libasahi_noop_drm_shim = shared_library( 'asahi_noop_drm_shim', 'asahi_noop.c', - include_directories: inc_src, - dependencies: dep_drm_shim, + include_directories: [inc_include, inc_src], + dependencies: [dep_drm_shim, dep_valgrind], gnu_symbol_visibility : 'hidden', install : true, ) diff --git a/src/asahi/lib/agx_bo.c b/src/asahi/lib/agx_bo.c index 
c51319efcab..a306d824f97 100644 --- a/src/asahi/lib/agx_bo.c +++ b/src/asahi/lib/agx_bo.c @@ -189,7 +189,7 @@ agx_bo_unreference(struct agx_bo *bo) * lock, let's make sure it's still not referenced before freeing it. */ if (p_atomic_read(&bo->refcnt) == 0) { - assert(!p_atomic_read_relaxed(&bo->writer_syncobj)); + assert(!p_atomic_read_relaxed(&bo->writer)); if (dev->debug & AGX_DBG_TRACE) agxdecode_track_free(dev->agxdecode, bo); @@ -225,12 +225,12 @@ agx_bo_create_aligned(struct agx_device *dev, unsigned size, unsigned align, * flush the cache to make space for the new allocation. */ if (!bo) - bo = agx_bo_alloc(dev, size, align, flags); + bo = dev->ops.bo_alloc(dev, size, align, flags); if (!bo) bo = agx_bo_cache_fetch(dev, size, align, flags, false); if (!bo) { agx_bo_cache_evict_all(dev); - bo = agx_bo_alloc(dev, size, align, flags); + bo = dev->ops.bo_alloc(dev, size, align, flags); } if (!bo) { diff --git a/src/asahi/lib/agx_bo.h b/src/asahi/lib/agx_bo.h index d92d74d94c8..665e1d68549 100644 --- a/src/asahi/lib/agx_bo.h +++ b/src/asahi/lib/agx_bo.h @@ -80,8 +80,8 @@ struct agx_bo { /* DMA-BUF fd clone for adding fences to imports/exports */ int prime_fd; - /* Syncobj handle of the current writer, if any */ - uint32_t writer_syncobj; + /* Current writer, if any (queue in upper 32 bits, syncobj in lower 32 bits) */ + uint64_t writer; /* Owner */ struct agx_device *dev; @@ -97,8 +97,30 @@ struct agx_bo { /* For debugging */ const char *label; + + /* virtio blob_id */ + uint32_t blob_id; + uint32_t vbo_res_id; }; +static inline uint32_t +agx_bo_writer_syncobj(uint64_t writer) +{ + return writer; +} + +static inline uint32_t +agx_bo_writer_queue(uint64_t writer) +{ + return writer >> 32; +} + +static inline uint64_t +agx_bo_writer(uint32_t queue, uint32_t syncobj) +{ + return (((uint64_t)queue) << 32) | syncobj; +} + struct agx_bo *agx_bo_create_aligned(struct agx_device *dev, unsigned size, unsigned align, enum agx_bo_flags flags, const char *label); @@ -115,8 +137,6 @@ struct agx_bo *agx_bo_import(struct agx_device *dev, int fd); int agx_bo_export(struct agx_bo *bo); void agx_bo_free(struct agx_device *dev, struct agx_bo *bo); -struct agx_bo *agx_bo_alloc(struct agx_device *dev, size_t size, size_t align, - enum agx_bo_flags flags); struct agx_bo *agx_bo_cache_fetch(struct agx_device *dev, size_t size, size_t align, uint32_t flags, const bool dontwait); diff --git a/src/asahi/lib/agx_device.c b/src/asahi/lib/agx_device.c index efacc89a18c..193cc44d17d 100644 --- a/src/asahi/lib/agx_device.c +++ b/src/asahi/lib/agx_device.c @@ -1,6 +1,7 @@ /* * Copyright 2021 Alyssa Rosenzweig * Copyright 2019 Collabora, Ltd. + * Copyright 2020 Igalia S.L. * SPDX-License-Identifier: MIT */ @@ -10,6 +11,7 @@ #include "util/timespec.h" #include "agx_bo.h" #include "agx_compile.h" +#include "agx_device_virtio.h" #include "agx_scratch.h" #include "decode.h" #include "glsl_types.h" @@ -20,16 +22,25 @@ #include "drm-uapi/dma-buf.h" #include "util/blob.h" #include "util/log.h" +#include "util/mesa-sha1.h" #include "util/os_file.h" #include "util/os_mman.h" #include "util/os_time.h" #include "util/simple_mtx.h" #include "git_sha1.h" #include "nir_serialize.h" +#include "unstable_asahi_drm.h" +#include "vdrm.h" -/* TODO: Linux UAPI. Dummy defines to get some things to compile. 
*/ -#define ASAHI_BIND_READ 0 -#define ASAHI_BIND_WRITE 0 +static inline int +asahi_simple_ioctl(struct agx_device *dev, unsigned cmd, void *req) +{ + if (dev->is_virtio) { + return agx_virtio_simple_ioctl(dev, cmd, req); + } else { + return drmIoctl(dev->fd, cmd, req); + } +} /* clang-format off */ static const struct debug_named_value agx_debug_options[] = { @@ -101,10 +112,26 @@ static int agx_bo_bind(struct agx_device *dev, struct agx_bo *bo, uint64_t addr, uint32_t flags) { - unreachable("Linux UAPI not yet upstream"); + struct drm_asahi_gem_bind gem_bind = { + .op = ASAHI_BIND_OP_BIND, + .flags = flags, + .handle = bo->handle, + .vm_id = dev->vm_id, + .offset = 0, + .range = bo->size, + .addr = addr, + }; + + int ret = drmIoctl(dev->fd, DRM_IOCTL_ASAHI_GEM_BIND, &gem_bind); + if (ret) { + fprintf(stderr, "DRM_IOCTL_ASAHI_GEM_BIND failed: %m (handle=%d)\n", + bo->handle); + } + + return ret; } -struct agx_bo * +static struct agx_bo * agx_bo_alloc(struct agx_device *dev, size_t size, size_t align, enum agx_bo_flags flags) { @@ -117,7 +144,23 @@ agx_bo_alloc(struct agx_device *dev, size_t size, size_t align, /* executable implies low va */ assert(!(flags & AGX_BO_EXEC) || (flags & AGX_BO_LOW_VA)); - unreachable("Linux UAPI not yet upstream"); + struct drm_asahi_gem_create gem_create = {.size = size}; + + if (flags & AGX_BO_WRITEBACK) + gem_create.flags |= ASAHI_GEM_WRITEBACK; + + if (!(flags & (AGX_BO_SHARED | AGX_BO_SHAREABLE))) { + gem_create.flags |= ASAHI_GEM_VM_PRIVATE; + gem_create.vm_id = dev->vm_id; + } + + int ret = drmIoctl(dev->fd, DRM_IOCTL_ASAHI_GEM_CREATE, &gem_create); + if (ret) { + fprintf(stderr, "DRM_IOCTL_ASAHI_GEM_CREATE failed: %m\n"); + return NULL; + } + + handle = gem_create.handle; pthread_mutex_lock(&dev->bo_map_lock); bo = agx_lookup_bo(dev, handle); @@ -128,7 +171,7 @@ agx_bo_alloc(struct agx_device *dev, size_t size, size_t align, assert(!memcmp(bo, &((struct agx_bo){}), sizeof(*bo))); bo->type = AGX_ALLOC_REGULAR; - bo->size = size; /* TODO: gem_create.size */ + bo->size = gem_create.size; bo->align = MAX2(dev->params.vm_page_size, align); bo->flags = flags; bo->dev = dev; @@ -157,13 +200,13 @@ agx_bo_alloc(struct agx_device *dev, size_t size, size_t align, bind |= ASAHI_BIND_WRITE; } - int ret = agx_bo_bind(dev, bo, bo->ptr.gpu, bind); + ret = dev->ops.bo_bind(dev, bo, bo->ptr.gpu, bind); if (ret) { agx_bo_free(dev, bo); return NULL; } - agx_bo_mmap(bo); + dev->ops.bo_mmap(bo); if (flags & AGX_BO_LOW_VA) bo->ptr.gpu -= dev->shader_base; @@ -173,10 +216,31 @@ agx_bo_alloc(struct agx_device *dev, size_t size, size_t align, return bo; } -void +static void agx_bo_mmap(struct agx_bo *bo) { - unreachable("Linux UAPI not yet upstream"); + struct drm_asahi_gem_mmap_offset gem_mmap_offset = {.handle = bo->handle}; + int ret; + + if (bo->ptr.cpu) + return; + + ret = + drmIoctl(bo->dev->fd, DRM_IOCTL_ASAHI_GEM_MMAP_OFFSET, &gem_mmap_offset); + if (ret) { + fprintf(stderr, "DRM_IOCTL_ASAHI_MMAP_BO failed: %m\n"); + assert(0); + } + + bo->ptr.cpu = os_mmap(NULL, bo->size, PROT_READ | PROT_WRITE, MAP_SHARED, + bo->dev->fd, gem_mmap_offset.offset); + if (bo->ptr.cpu == MAP_FAILED) { + bo->ptr.cpu = NULL; + fprintf(stderr, + "mmap failed: result=%p size=0x%llx fd=%i offset=0x%llx %m\n", + bo->ptr.cpu, (long long)bo->size, bo->dev->fd, + (long long)gem_mmap_offset.offset); + } } struct agx_bo * @@ -239,8 +303,12 @@ agx_bo_import(struct agx_device *dev, int fd) abort(); } - ret = - agx_bo_bind(dev, bo, bo->ptr.gpu, ASAHI_BIND_READ | ASAHI_BIND_WRITE); + if (dev->is_virtio) 
{ + bo->vbo_res_id = vdrm_handle_to_res_id(dev->vdrm, bo->handle); + } + + ret = dev->ops.bo_bind(dev, bo, bo->ptr.gpu, + ASAHI_BIND_READ | ASAHI_BIND_WRITE); if (ret) { fprintf(stderr, "import failed: Could not bind BO at 0x%llx\n", (long long)bo->ptr.gpu); @@ -293,11 +361,11 @@ agx_bo_export(struct agx_bo *bo) /* If there is a pending writer to this BO, import it into the buffer * for implicit sync. */ - uint32_t writer_syncobj = p_atomic_read_relaxed(&bo->writer_syncobj); - if (writer_syncobj) { + uint64_t writer = p_atomic_read_relaxed(&bo->writer); + if (writer) { int out_sync_fd = -1; - int ret = - drmSyncobjExportSyncFile(bo->dev->fd, writer_syncobj, &out_sync_fd); + int ret = drmSyncobjExportSyncFile( + bo->dev->fd, agx_bo_writer_syncobj(writer), &out_sync_fd); assert(ret >= 0); assert(out_sync_fd >= 0); @@ -331,10 +399,38 @@ agx_get_global_id(struct agx_device *dev) static ssize_t agx_get_params(struct agx_device *dev, void *buf, size_t size) { - /* TODO: Linux UAPI */ - unreachable("Linux UAPI not yet upstream"); + struct drm_asahi_get_params get_param = { + .param_group = 0, + .pointer = (uint64_t)(uintptr_t)buf, + .size = size, + }; + + memset(buf, 0, size); + + int ret = drmIoctl(dev->fd, DRM_IOCTL_ASAHI_GET_PARAMS, &get_param); + if (ret) { + fprintf(stderr, "DRM_IOCTL_ASAHI_GET_PARAMS failed: %m\n"); + return -EINVAL; + } + + return get_param.size; } +static int +agx_submit(struct agx_device *dev, struct drm_asahi_submit *submit, + uint32_t vbo_res_id) +{ + return drmIoctl(dev->fd, DRM_IOCTL_ASAHI_SUBMIT, submit); +} + +const agx_device_ops_t agx_device_drm_ops = { + .bo_alloc = agx_bo_alloc, + .bo_bind = agx_bo_bind, + .bo_mmap = agx_bo_mmap, + .get_params = agx_get_params, + .submit = agx_submit, +}; + bool agx_open_device(void *memctx, struct agx_device *dev) { @@ -342,21 +438,119 @@ agx_open_device(void *memctx, struct agx_device *dev) debug_get_flags_option("ASAHI_MESA_DEBUG", agx_debug_options, 0); dev->agxdecode = agxdecode_new_context(); + dev->ops = agx_device_drm_ops; ssize_t params_size = -1; + drmVersionPtr version; - /* TODO: Linux UAPI */ - return false; + version = drmGetVersion(dev->fd); + if (!version) { + fprintf(stderr, "cannot get version: %s", strerror(errno)); + return NULL; + } - params_size = agx_get_params(dev, &dev->params, sizeof(dev->params)); + if (!strcmp(version->name, "asahi")) { + dev->is_virtio = false; + dev->ops = agx_device_drm_ops; + } else if (!strcmp(version->name, "virtio_gpu")) { + dev->is_virtio = true; + if (!agx_virtio_open_device(dev)) { + fprintf(stderr, + "Error opening virtio-gpu device for Asahi native context\n"); + return false; + } + } else { + return false; + } + + params_size = dev->ops.get_params(dev, &dev->params, sizeof(dev->params)); if (params_size <= 0) { assert(0); return false; } assert(params_size >= sizeof(dev->params)); - /* TODO: Linux UAPI: Params */ - unreachable("Linux UAPI not yet upstream"); + /* Refuse to probe. 
*/ + if (dev->params.unstable_uabi_version != DRM_ASAHI_UNSTABLE_UABI_VERSION) { + fprintf( + stderr, + "You are attempting to use upstream Mesa with a downstream kernel!\n" + "This WILL NOT work.\n" + "The Asahi UABI is unstable and NOT SUPPORTED in upstream Mesa.\n" + "UABI related code in upstream Mesa is not for use!\n" + "\n" + "Do NOT attempt to patch out checks, you WILL break your system.\n" + "Do NOT report bugs.\n" + "Do NOT ask Mesa developers for support.\n" + "Do NOT write guides about how to patch out these checks.\n" + "Do NOT package patches to Mesa to bypass this.\n" + "\n" + "~~~\n" + "This is not a place of honor.\n" + "No highly esteemed deed is commemorated here.\n" + "Nothing valued is here.\n" + "\n" + "What is here was dangerous and repulsive to us.\n" + "This message is a warning about danger.\n" + "\n" + "The danger is still present, in your time, as it was in ours.\n" + "The danger is unleashed only if you substantially disturb this place physically.\n" + "This place is best shunned and left uninhabited.\n" + "~~~\n" + "\n" + "THIS IS NOT A BUG. THIS IS YOU DOING SOMETHING BROKEN!\n"); + abort(); + } + + uint64_t incompat = + dev->params.feat_incompat & (~AGX_SUPPORTED_INCOMPAT_FEATURES); + if (incompat) { + fprintf(stderr, "Missing GPU incompat features: 0x%" PRIx64 "\n", + incompat); + assert(0); + return false; + } + + if (dev->params.gpu_generation >= 13 && dev->params.gpu_variant != 'P') { + const char *variant = " Unknown"; + switch (dev->params.gpu_variant) { + case 'G': + variant = ""; + break; + case 'S': + variant = " Pro"; + break; + case 'C': + variant = " Max"; + break; + case 'D': + variant = " Ultra"; + break; + } + snprintf(dev->name, sizeof(dev->name), "Apple M%d%s (G%d%c %02X)", + dev->params.gpu_generation - 12, variant, + dev->params.gpu_generation, dev->params.gpu_variant, + dev->params.gpu_revision + 0xA0); + } else { + // Note: untested, theoretically this is the logic for at least a few + // generations back. 
+ const char *variant = " Unknown"; + switch (dev->params.gpu_variant) { + case 'P': + variant = ""; + break; + case 'G': + variant = "X"; + break; + } + snprintf(dev->name, sizeof(dev->name), "Apple A%d%s (G%d%c %02X)", + dev->params.gpu_generation + 1, variant, + dev->params.gpu_generation, dev->params.gpu_variant, + dev->params.gpu_revision + 0xA0); + } + + dev->guard_size = dev->params.vm_page_size; + dev->shader_base = dev->params.vm_shader_start; util_sparse_array_init(&dev->bo_map, sizeof(struct agx_bo), 512); pthread_mutex_init(&dev->bo_map_lock, NULL); @@ -367,7 +561,14 @@ agx_open_device(void *memctx, struct agx_device *dev) for (unsigned i = 0; i < ARRAY_SIZE(dev->bo_cache.buckets); ++i) list_inithead(&dev->bo_cache.buckets[i]); - /* TODO: Linux UAPI: Create VM */ + struct drm_asahi_vm_create vm_create = {}; + + int ret = asahi_simple_ioctl(dev, DRM_IOCTL_ASAHI_VM_CREATE, &vm_create); + if (ret) { + fprintf(stderr, "DRM_IOCTL_ASAHI_VM_CREATE failed: %m\n"); + assert(0); + return false; + } simple_mtx_init(&dev->vma_lock, mtx_plain); util_vma_heap_init(&dev->main_heap, dev->params.vm_user_start, @@ -376,6 +577,8 @@ agx_open_device(void *memctx, struct agx_device *dev) &dev->usc_heap, dev->params.vm_shader_start, dev->params.vm_shader_end - dev->params.vm_shader_start + 1); + dev->vm_id = vm_create.vm_id; + agx_get_global_ids(dev); glsl_type_singleton_init_or_ref(); @@ -406,9 +609,34 @@ agx_close_device(struct agx_device *dev) } uint32_t -agx_create_command_queue(struct agx_device *dev, uint32_t caps) +agx_create_command_queue(struct agx_device *dev, uint32_t caps, + uint32_t priority) { - unreachable("Linux UAPI not yet upstream"); + struct drm_asahi_queue_create queue_create = { + .vm_id = dev->vm_id, + .queue_caps = caps, + .priority = priority, + .flags = 0, + }; + + int ret = + asahi_simple_ioctl(dev, DRM_IOCTL_ASAHI_QUEUE_CREATE, &queue_create); + if (ret) { + fprintf(stderr, "DRM_IOCTL_ASAHI_QUEUE_CREATE failed: %m\n"); + assert(0); + } + + return queue_create.queue_id; +} + +int +agx_destroy_command_queue(struct agx_device *dev, uint32_t queue_id) +{ + struct drm_asahi_queue_destroy queue_destroy = { + .queue_id = queue_id, + }; + + return drmIoctl(dev->fd, DRM_IOCTL_ASAHI_QUEUE_DESTROY, &queue_destroy); } int @@ -507,3 +735,56 @@ agx_get_gpu_timestamp(struct agx_device *dev) #error "invalid architecture for asahi" #endif } + +/* (Re)define UUID_SIZE to avoid including vulkan.h (or p_defines.h) here. */ +#define UUID_SIZE 16 + +void +agx_get_device_uuid(const struct agx_device *dev, void *uuid) +{ + struct mesa_sha1 sha1_ctx; + _mesa_sha1_init(&sha1_ctx); + + /* The device UUID uniquely identifies the given device within the machine. + * Since we never have more than one device, this doesn't need to be a real + * UUID, so we use SHA1("agx" + gpu_generation + gpu_variant + gpu_revision). 
+ */ + static const char *device_name = "agx"; + _mesa_sha1_update(&sha1_ctx, device_name, strlen(device_name)); + + _mesa_sha1_update(&sha1_ctx, &dev->params.gpu_generation, + sizeof(dev->params.gpu_generation)); + _mesa_sha1_update(&sha1_ctx, &dev->params.gpu_variant, + sizeof(dev->params.gpu_variant)); + _mesa_sha1_update(&sha1_ctx, &dev->params.gpu_revision, + sizeof(dev->params.gpu_revision)); + + uint8_t sha1[SHA1_DIGEST_LENGTH]; + _mesa_sha1_final(&sha1_ctx, sha1); + + assert(SHA1_DIGEST_LENGTH >= UUID_SIZE); + memcpy(uuid, sha1, UUID_SIZE); +} + +void +agx_get_driver_uuid(void *uuid) +{ + const char *driver_id = PACKAGE_VERSION MESA_GIT_SHA1; + + /* The driver UUID is used for determining sharability of images and memory + * between two Vulkan instances in separate processes, but also to + * determining memory objects and sharability between Vulkan and OpenGL + * driver. People who want to share memory need to also check the device + * UUID. + */ + struct mesa_sha1 sha1_ctx; + _mesa_sha1_init(&sha1_ctx); + + _mesa_sha1_update(&sha1_ctx, driver_id, strlen(driver_id)); + + uint8_t sha1[SHA1_DIGEST_LENGTH]; + _mesa_sha1_final(&sha1_ctx, sha1); + + assert(SHA1_DIGEST_LENGTH >= UUID_SIZE); + memcpy(uuid, sha1, UUID_SIZE); +} diff --git a/src/asahi/lib/agx_device.h b/src/asahi/lib/agx_device.h index 65cb918a793..c061f938647 100644 --- a/src/asahi/lib/agx_device.h +++ b/src/asahi/lib/agx_device.h @@ -5,6 +5,7 @@ #pragma once +#include #include "util/simple_mtx.h" #include "util/sparse_array.h" #include "util/timespec.h" @@ -12,6 +13,11 @@ #include "agx_bo.h" #include "agx_formats.h" #include "decode.h" +#include "unstable_asahi_drm.h" + +// TODO: this is a lie right now +static const uint64_t AGX_SUPPORTED_INCOMPAT_FEATURES = + DRM_ASAHI_FEAT_MANDATORY_ZS_COMPRESSION; enum agx_dbg { AGX_DBG_TRACE = BITFIELD_BIT(0), @@ -37,29 +43,6 @@ enum agx_dbg { AGX_DBG_FEEDBACK = BITFIELD_BIT(20), }; -/* Dummy partial declarations, pending real UAPI */ -enum drm_asahi_cmd_type { DRM_ASAHI_CMD_TYPE_PLACEHOLDER_FOR_DOWNSTREAM_UAPI }; -enum drm_asahi_sync_type { DRM_ASAHI_SYNC_SYNCOBJ }; -struct drm_asahi_sync { - uint32_t sync_type; - uint32_t handle; -}; -struct drm_asahi_params_global { - uint64_t vm_page_size; - uint64_t vm_user_start; - uint64_t vm_user_end; - uint64_t vm_shader_start; - uint64_t vm_shader_end; - uint32_t chip_id; - uint32_t num_clusters_total; - uint32_t gpu_generation; - uint32_t gpu_variant; - uint32_t num_dies; - uint32_t timer_frequency_hz; - uint32_t num_cores_per_cluster; - uint64_t core_masks[32]; -}; - /* How many power-of-two levels in the BO cache do we want? 
2^14 minimum chosen * as it is the page size that all allocations are rounded to */ @@ -72,6 +55,20 @@ struct drm_asahi_params_global { /* Forward decl only, do not pull in all of NIR */ struct nir_shader; +#define BARRIER_RENDER (1 << DRM_ASAHI_SUBQUEUE_RENDER) +#define BARRIER_COMPUTE (1 << DRM_ASAHI_SUBQUEUE_COMPUTE) + +typedef struct { + struct agx_bo *(*bo_alloc)(struct agx_device *dev, size_t size, size_t align, + enum agx_bo_flags flags); + int (*bo_bind)(struct agx_device *dev, struct agx_bo *bo, uint64_t addr, + uint32_t flags); + void (*bo_mmap)(struct agx_bo *bo); + ssize_t (*get_params)(struct agx_device *dev, void *buf, size_t size); + int (*submit)(struct agx_device *dev, struct drm_asahi_submit *submit, + uint32_t vbo_res_id); +} agx_device_ops_t; + struct agx_device { uint32_t debug; @@ -81,6 +78,12 @@ struct agx_device { char name[64]; struct drm_asahi_params_global params; uint64_t next_global_id, last_global_id; + bool is_virtio; + agx_device_ops_t ops; + + /* vdrm device */ + struct vdrm_device *vdrm; + uint32_t next_blob_id; /* Device handle */ int fd; @@ -136,11 +139,11 @@ agx_lookup_bo(struct agx_device *dev, uint32_t handle) return util_sparse_array_get(&dev->bo_map, handle); } -void agx_bo_mmap(struct agx_bo *bo); - uint64_t agx_get_global_id(struct agx_device *dev); -uint32_t agx_create_command_queue(struct agx_device *dev, uint32_t caps); +uint32_t agx_create_command_queue(struct agx_device *dev, uint32_t caps, + uint32_t priority); +int agx_destroy_command_queue(struct agx_device *dev, uint32_t queue_id); int agx_import_sync_file(struct agx_device *dev, struct agx_bo *bo, int fd); int agx_export_sync_file(struct agx_device *dev, struct agx_bo *bo); @@ -154,3 +157,6 @@ agx_gpu_time_to_ns(struct agx_device *dev, uint64_t gpu_time) { return (gpu_time * NSEC_PER_SEC) / dev->params.timer_frequency_hz; } + +void agx_get_device_uuid(const struct agx_device *dev, void *uuid); +void agx_get_driver_uuid(void *uuid); diff --git a/src/asahi/lib/agx_device_virtio.c b/src/asahi/lib/agx_device_virtio.c new file mode 100644 index 00000000000..8a31b8bb383 --- /dev/null +++ b/src/asahi/lib/agx_device_virtio.c @@ -0,0 +1,326 @@ +/* + * Copyright 2024 Sergio Lopez + * SPDX-License-Identifier: MIT + */ + +#include "agx_device_virtio.h" + +#include +#include + +#include "drm-uapi/virtgpu_drm.h" + +#define VIRGL_RENDERER_UNSTABLE_APIS 1 +#include "vdrm.h" +#include "virglrenderer_hw.h" + +#include "asahi_proto.h" + +/** + * Helper for simple pass-thru ioctls + */ +int +agx_virtio_simple_ioctl(struct agx_device *dev, unsigned cmd, void *_req) +{ + struct vdrm_device *vdrm = dev->vdrm; + unsigned req_len = sizeof(struct asahi_ccmd_ioctl_simple_req); + unsigned rsp_len = sizeof(struct asahi_ccmd_ioctl_simple_rsp); + + req_len += _IOC_SIZE(cmd); + if (cmd & IOC_OUT) + rsp_len += _IOC_SIZE(cmd); + + uint8_t buf[req_len]; + struct asahi_ccmd_ioctl_simple_req *req = (void *)buf; + struct asahi_ccmd_ioctl_simple_rsp *rsp; + + req->hdr = ASAHI_CCMD(IOCTL_SIMPLE, req_len); + req->cmd = cmd; + memcpy(req->payload, _req, _IOC_SIZE(cmd)); + + rsp = vdrm_alloc_rsp(vdrm, &req->hdr, rsp_len); + + int ret = vdrm_send_req(vdrm, &req->hdr, true); + if (ret) { + fprintf(stderr, "simple_ioctl: vdrm_send_req failed\n"); + return ret; + } + + if (cmd & IOC_OUT) + memcpy(_req, rsp->payload, _IOC_SIZE(cmd)); + + return rsp->ret; +} + +static struct agx_bo * +agx_virtio_bo_alloc(struct agx_device *dev, size_t size, size_t align, + enum agx_bo_flags flags) +{ + struct agx_bo *bo; + unsigned handle = 0; + 
uint64_t ptr_gpu; + + size = ALIGN_POT(size, dev->params.vm_page_size); + + /* executable implies low va */ + assert(!(flags & AGX_BO_EXEC) || (flags & AGX_BO_LOW_VA)); + + struct asahi_ccmd_gem_new_req req = { + .hdr = ASAHI_CCMD(GEM_NEW, sizeof(req)), + .size = size, + }; + + if (flags & AGX_BO_WRITEBACK) + req.flags |= ASAHI_GEM_WRITEBACK; + + uint32_t blob_flags = + VIRTGPU_BLOB_FLAG_USE_MAPPABLE | VIRTGPU_BLOB_FLAG_USE_SHAREABLE; + + req.bind_flags = ASAHI_BIND_READ; + if (!(flags & AGX_BO_READONLY)) { + req.bind_flags |= ASAHI_BIND_WRITE; + } + + uint32_t blob_id = p_atomic_inc_return(&dev->next_blob_id); + + ASSERTED bool lo = (flags & AGX_BO_LOW_VA); + + struct util_vma_heap *heap; + if (lo) + heap = &dev->usc_heap; + else + heap = &dev->main_heap; + + simple_mtx_lock(&dev->vma_lock); + ptr_gpu = util_vma_heap_alloc(heap, size + dev->guard_size, + dev->params.vm_page_size); + simple_mtx_unlock(&dev->vma_lock); + if (!ptr_gpu) { + fprintf(stderr, "Failed to allocate BO VMA\n"); + return NULL; + } + + req.addr = ptr_gpu; + req.blob_id = blob_id; + req.vm_id = dev->vm_id; + + handle = vdrm_bo_create(dev->vdrm, size, blob_flags, blob_id, &req.hdr); + if (!handle) { + fprintf(stderr, "vdrm_bo_create failed\n"); + return NULL; + } + + pthread_mutex_lock(&dev->bo_map_lock); + bo = agx_lookup_bo(dev, handle); + dev->max_handle = MAX2(dev->max_handle, handle); + pthread_mutex_unlock(&dev->bo_map_lock); + + /* Fresh handle */ + assert(!memcmp(bo, &((struct agx_bo){}), sizeof(*bo))); + + bo->type = AGX_ALLOC_REGULAR; + bo->size = size; + bo->align = MAX2(dev->params.vm_page_size, align); + bo->flags = flags; + bo->dev = dev; + bo->handle = handle; + bo->prime_fd = -1; + bo->blob_id = blob_id; + bo->ptr.gpu = ptr_gpu; + bo->vbo_res_id = vdrm_handle_to_res_id(dev->vdrm, handle); + + dev->ops.bo_mmap(bo); + + if (flags & AGX_BO_LOW_VA) + bo->ptr.gpu -= dev->shader_base; + + assert(bo->ptr.gpu < (1ull << (lo ? 
32 : 40))); + + return bo; +} + +static int +agx_virtio_bo_bind(struct agx_device *dev, struct agx_bo *bo, uint64_t addr, + uint32_t flags) +{ + struct asahi_ccmd_gem_bind_req req = { + .op = ASAHI_BIND_OP_BIND, + .flags = flags, + .vm_id = dev->vm_id, + .res_id = bo->vbo_res_id, + .size = bo->size, + .addr = addr, + .hdr.cmd = ASAHI_CCMD_GEM_BIND, + .hdr.len = sizeof(struct asahi_ccmd_gem_bind_req), + }; + + int ret = vdrm_send_req(dev->vdrm, &req.hdr, false); + if (ret) { + fprintf(stderr, "DRM_IOCTL_ASAHI_GEM_BIND failed: %d (handle=%d)\n", ret, + bo->handle); + } + + return ret; +} + +static void +agx_virtio_bo_mmap(struct agx_bo *bo) +{ + if (bo->ptr.cpu) { + return; + } + + bo->ptr.cpu = vdrm_bo_map(bo->dev->vdrm, bo->handle, bo->size, NULL); + if (bo->ptr.cpu == MAP_FAILED) { + bo->ptr.cpu = NULL; + fprintf(stderr, "mmap failed: result=%p size=0x%llx fd=%i\n", bo->ptr.cpu, + (long long)bo->size, bo->dev->fd); + } +} + +static ssize_t +agx_virtio_get_params(struct agx_device *dev, void *buf, size_t size) +{ + struct vdrm_device *vdrm = dev->vdrm; + struct asahi_ccmd_get_params_req req = { + .params.size = size, + .hdr.cmd = ASAHI_CCMD_GET_PARAMS, + .hdr.len = sizeof(struct asahi_ccmd_get_params_req), + }; + struct asahi_ccmd_get_params_rsp *rsp; + + rsp = + vdrm_alloc_rsp(vdrm, &req.hdr, sizeof(struct asahi_ccmd_get_params_rsp)); + + int ret = vdrm_send_req(vdrm, &req.hdr, true); + if (ret) + goto out; + + ret = rsp->ret; + if (!ret) { + memcpy(buf, &rsp->params, size); + return size; + } + +out: + return ret; +} + +static int +agx_virtio_submit(struct agx_device *dev, struct drm_asahi_submit *submit, + uint32_t vbo_res_id) +{ + struct drm_asahi_command *commands = + (struct drm_asahi_command *)submit->commands; + struct drm_asahi_sync *in_syncs = (struct drm_asahi_sync *)submit->in_syncs; + struct drm_asahi_sync *out_syncs = + (struct drm_asahi_sync *)submit->out_syncs; + size_t req_len = sizeof(struct asahi_ccmd_submit_req); + + for (int i = 0; i < submit->command_count; i++) { + switch (commands[i].cmd_type) { + case DRM_ASAHI_CMD_COMPUTE: { + req_len += sizeof(struct drm_asahi_command) + + sizeof(struct drm_asahi_cmd_compute); + break; + } + + case DRM_ASAHI_CMD_RENDER: { + struct drm_asahi_cmd_render *render = + (struct drm_asahi_cmd_render *)commands[i].cmd_buffer; + req_len += sizeof(struct drm_asahi_command) + + sizeof(struct drm_asahi_cmd_render); + req_len += render->fragment_attachment_count * + sizeof(struct drm_asahi_attachment); + break; + } + + default: + return EINVAL; + } + } + + struct asahi_ccmd_submit_req *req = + (struct asahi_ccmd_submit_req *)calloc(1, req_len); + + req->queue_id = submit->queue_id; + req->result_res_id = vbo_res_id; + req->command_count = submit->command_count; + + char *ptr = (char *)&req->payload; + + for (int i = 0; i < submit->command_count; i++) { + memcpy(ptr, &commands[i], sizeof(struct drm_asahi_command)); + ptr += sizeof(struct drm_asahi_command); + + memcpy(ptr, (char *)commands[i].cmd_buffer, commands[i].cmd_buffer_size); + ptr += commands[i].cmd_buffer_size; + + if (commands[i].cmd_type == DRM_ASAHI_CMD_RENDER) { + struct drm_asahi_cmd_render *render = + (struct drm_asahi_cmd_render *)commands[i].cmd_buffer; + size_t fragments_size = sizeof(struct drm_asahi_attachment) * + render->fragment_attachment_count; + memcpy(ptr, (char *)render->fragment_attachments, fragments_size); + ptr += fragments_size; + } + } + + req->hdr.cmd = ASAHI_CCMD_SUBMIT; + req->hdr.len = req_len; + + struct drm_virtgpu_execbuffer_syncobj *vdrm_in_syncs = 
calloc( + submit->in_sync_count, sizeof(struct drm_virtgpu_execbuffer_syncobj)); + for (int i = 0; i < submit->in_sync_count; i++) { + vdrm_in_syncs[i].handle = in_syncs[i].handle; + vdrm_in_syncs[i].point = in_syncs[i].timeline_value; + } + + struct drm_virtgpu_execbuffer_syncobj *vdrm_out_syncs = calloc( + submit->out_sync_count, sizeof(struct drm_virtgpu_execbuffer_syncobj)); + for (int i = 0; i < submit->out_sync_count; i++) { + vdrm_out_syncs[i].handle = out_syncs[i].handle; + vdrm_out_syncs[i].point = out_syncs[i].timeline_value; + } + + struct vdrm_execbuf_params p = { + /* Signal the host we want to wait for the command to complete */ + .ring_idx = 1, + .req = &req->hdr, + .num_in_syncobjs = submit->in_sync_count, + .in_syncobjs = vdrm_in_syncs, + .num_out_syncobjs = submit->out_sync_count, + .out_syncobjs = vdrm_out_syncs, + }; + + int ret = vdrm_execbuf(dev->vdrm, &p); + + free(vdrm_out_syncs); + free(vdrm_in_syncs); + free(req); + return ret; +} + +const agx_device_ops_t agx_virtio_device_ops = { + .bo_alloc = agx_virtio_bo_alloc, + .bo_bind = agx_virtio_bo_bind, + .bo_mmap = agx_virtio_bo_mmap, + .get_params = agx_virtio_get_params, + .submit = agx_virtio_submit, +}; + +bool +agx_virtio_open_device(struct agx_device *dev) +{ + struct vdrm_device *vdrm; + + vdrm = vdrm_device_connect(dev->fd, 2); + if (!vdrm) { + fprintf(stderr, "could not connect vdrm\n"); + return false; + } + + dev->vdrm = vdrm; + dev->ops = agx_virtio_device_ops; + return true; +} diff --git a/src/asahi/lib/agx_device_virtio.h b/src/asahi/lib/agx_device_virtio.h new file mode 100644 index 00000000000..895fa311e1e --- /dev/null +++ b/src/asahi/lib/agx_device_virtio.h @@ -0,0 +1,13 @@ +/* + * Copyright 2024 Sergio Lopez + * SPDX-License-Identifier: MIT + */ + +#pragma once + +#include +#include "agx_device.h" + +int agx_virtio_simple_ioctl(struct agx_device *dev, unsigned cmd, void *_req); + +bool agx_virtio_open_device(struct agx_device *dev); diff --git a/src/asahi/lib/asahi_proto.h b/src/asahi/lib/asahi_proto.h new file mode 100644 index 00000000000..8820b46a5e7 --- /dev/null +++ b/src/asahi/lib/asahi_proto.h @@ -0,0 +1,133 @@ +/* + * Copyright 2024 Sergio Lopez + * Copyright 2022 Google LLC + * SPDX-License-Identifier: MIT + */ + +#ifndef ASAHI_PROTO_H_ +#define ASAHI_PROTO_H_ + +/** + * Defines the layout of shmem buffer used for host->guest communication. + */ +struct asahi_shmem { + struct vdrm_shmem base; + + /** + * Counter that is incremented on asynchronous errors, like SUBMIT + * or GEM_NEW failures. The guest should treat errors as context- + * lost. + */ + uint32_t async_error; + + /** + * Counter that is incremented on global fault (see MSM_PARAM_FAULTS) + */ + uint32_t global_faults; +}; +DEFINE_CAST(vdrm_shmem, asahi_shmem) + +/* + * Possible cmd types for "command stream", ie. payload of EXECBUF ioctl: + */ +enum asahi_ccmd { + ASAHI_CCMD_NOP = 1, /* No payload, can be used to sync with host */ + ASAHI_CCMD_IOCTL_SIMPLE, + ASAHI_CCMD_GET_PARAMS, + ASAHI_CCMD_GEM_NEW, + ASAHI_CCMD_GEM_BIND, + ASAHI_CCMD_SUBMIT, +}; + +#define ASAHI_CCMD(_cmd, _len) \ + (struct vdrm_ccmd_req) \ + { \ + .cmd = ASAHI_CCMD_##_cmd, .len = (_len), \ + } + +/* + * ASAHI_CCMD_NOP + */ +struct asahi_ccmd_nop_req { + struct vdrm_ccmd_req hdr; +}; + +/* + * ASAHI_CCMD_IOCTL_SIMPLE + * + * Forward simple/flat IOC_RW or IOC_W ioctls. Limited ioctls are supported. 
+ */ +struct asahi_ccmd_ioctl_simple_req { + struct vdrm_ccmd_req hdr; + + uint32_t cmd; + uint8_t payload[]; +}; +DEFINE_CAST(vdrm_ccmd_req, asahi_ccmd_ioctl_simple_req) + +struct asahi_ccmd_ioctl_simple_rsp { + struct vdrm_ccmd_rsp hdr; + + /* ioctl return value, interrupted syscalls are handled on the host without + * returning to the guest. + */ + int32_t ret; + + /* The output payload for IOC_RW ioctls, the payload is the same size as + * asahi_context_cmd_ioctl_simple_req. + * + * For IOC_W ioctls (userspace writes, kernel reads) this is zero length. + */ + uint8_t payload[]; +}; + +struct asahi_ccmd_get_params_req { + struct vdrm_ccmd_req hdr; + struct drm_asahi_get_params params; +}; +DEFINE_CAST(vdrm_ccmd_req, asahi_ccmd_get_params_req) + +struct asahi_ccmd_get_params_rsp { + struct vdrm_ccmd_rsp hdr; + int32_t ret; + struct drm_asahi_params_global params; +}; + +struct asahi_ccmd_gem_new_req { + struct vdrm_ccmd_req hdr; + uint32_t flags; + uint32_t bind_flags; + uint32_t vm_id; + uint32_t blob_id; + uint64_t size; + uint64_t addr; +}; +DEFINE_CAST(vdrm_ccmd_req, asahi_ccmd_gem_new_req) + +struct asahi_ccmd_gem_bind_req { + struct vdrm_ccmd_req hdr; + uint32_t op; + uint32_t flags; + uint32_t vm_id; + uint32_t res_id; + uint64_t size; + uint64_t addr; +}; +DEFINE_CAST(vdrm_ccmd_req, asahi_ccmd_gem_bind_req) + +struct asahi_ccmd_gem_bind_rsp { + struct vdrm_ccmd_rsp hdr; + int32_t ret; +}; + +struct asahi_ccmd_submit_req { + struct vdrm_ccmd_req hdr; + uint32_t queue_id; + uint32_t result_res_id; + uint32_t command_count; + + uint8_t payload[]; +}; +DEFINE_CAST(vdrm_ccmd_req, asahi_ccmd_submit_req) + +#endif // ASAHI_PROTO_H_ diff --git a/src/asahi/lib/decode.c b/src/asahi/lib/decode.c index f5d32155bfc..e981e6ed4de 100644 --- a/src/asahi/lib/decode.c +++ b/src/asahi/lib/decode.c @@ -18,18 +18,11 @@ #include "util/u_hexdump.h" #include "decode.h" +#include "unstable_asahi_drm.h" #ifdef __APPLE__ #include "agx_iokit.h" #endif -/* Pending UAPI */ -struct drm_asahi_params_global { - int gpu_generation; - int gpu_variant; - int chip_id; - int num_clusters_total; -}; - struct libagxdecode_config lib_config; UNUSED static const char *agx_alloc_types[AGX_NUM_ALLOC] = {"mem", "map", @@ -283,6 +276,11 @@ agxdecode_map_read_write(struct agxdecode_ctx *ctx) DUMP_UNPACKED(T, temp, str "\n"); \ } +#define DUMP_FIELD(struct, fmt, field) \ + { \ + fprintf(agxdecode_dump_stream, #field " = " fmt "\n", struct->field); \ + } + #define agxdecode_log(str) fputs(str, agxdecode_dump_stream) #define agxdecode_msg(str) fprintf(agxdecode_dump_stream, "// %s", str) @@ -980,6 +978,116 @@ agxdecode_image_heap(struct agxdecode_ctx *ctx, uint64_t heap, agxdecode_map_read_write(ctx); } +void +agxdecode_drm_cmd_render(struct agxdecode_ctx *ctx, + struct drm_asahi_params_global *params, + struct drm_asahi_cmd_render *c, bool verbose) +{ + agxdecode_dump_file_open(); + + DUMP_FIELD(c, "%llx", flags); + DUMP_FIELD(c, "0x%llx", encoder_ptr); + agxdecode_stateful(ctx, c->encoder_ptr, "Encoder", agxdecode_vdm, verbose, + params, NULL); + DUMP_FIELD(c, "0x%x", encoder_id); + DUMP_FIELD(c, "0x%x", cmd_ta_id); + DUMP_FIELD(c, "0x%x", cmd_3d_id); + DUMP_FIELD(c, "0x%x", ppp_ctrl); + DUMP_FIELD(c, "0x%llx", ppp_multisamplectl); + DUMP_CL(ZLS_CONTROL, &c->zls_ctrl, "ZLS Control"); + DUMP_FIELD(c, "0x%llx", depth_buffer_load); + DUMP_FIELD(c, "0x%llx", depth_buffer_store); + DUMP_FIELD(c, "0x%llx", depth_buffer_partial); + DUMP_FIELD(c, "0x%llx", stencil_buffer_load); + DUMP_FIELD(c, "0x%llx", stencil_buffer_store); + 
DUMP_FIELD(c, "0x%llx", stencil_buffer_partial); + DUMP_FIELD(c, "0x%llx", scissor_array); + DUMP_FIELD(c, "0x%llx", depth_bias_array); + DUMP_FIELD(c, "%d", fb_width); + DUMP_FIELD(c, "%d", fb_height); + DUMP_FIELD(c, "%d", layers); + DUMP_FIELD(c, "%d", samples); + DUMP_FIELD(c, "%d", sample_size); + DUMP_FIELD(c, "%d", tib_blocks); + DUMP_FIELD(c, "%d", utile_width); + DUMP_FIELD(c, "%d", utile_height); + DUMP_FIELD(c, "0x%x", load_pipeline); + DUMP_FIELD(c, "0x%x", load_pipeline_bind); + agxdecode_stateful(ctx, c->load_pipeline & ~0x7, "Load pipeline", + agxdecode_usc, verbose, params, NULL); + DUMP_FIELD(c, "0x%x", store_pipeline); + DUMP_FIELD(c, "0x%x", store_pipeline_bind); + agxdecode_stateful(ctx, c->store_pipeline & ~0x7, "Store pipeline", + agxdecode_usc, verbose, params, NULL); + DUMP_FIELD(c, "0x%x", partial_reload_pipeline); + DUMP_FIELD(c, "0x%x", partial_reload_pipeline_bind); + agxdecode_stateful(ctx, c->partial_reload_pipeline & ~0x7, + "Partial reload pipeline", agxdecode_usc, verbose, params, + NULL); + DUMP_FIELD(c, "0x%x", partial_store_pipeline); + DUMP_FIELD(c, "0x%x", partial_store_pipeline_bind); + agxdecode_stateful(ctx, c->partial_store_pipeline & ~0x7, + "Partial store pipeline", agxdecode_usc, verbose, params, + NULL); + + DUMP_FIELD(c, "0x%x", depth_dimensions); + DUMP_FIELD(c, "0x%x", isp_bgobjdepth); + DUMP_FIELD(c, "0x%x", isp_bgobjvals); + + agxdecode_sampler_heap(ctx, c->vertex_sampler_array, + c->vertex_sampler_count); + + /* Linux driver doesn't use this, at least for now */ + assert(c->fragment_sampler_array == c->vertex_sampler_array); + assert(c->fragment_sampler_count == c->vertex_sampler_count); + + DUMP_FIELD(c, "%d", vertex_attachment_count); + struct drm_asahi_attachment *vertex_attachments = + (void *)c->vertex_attachments; + for (unsigned i = 0; i < c->vertex_attachment_count; i++) { + DUMP_FIELD((&vertex_attachments[i]), "0x%x", order); + DUMP_FIELD((&vertex_attachments[i]), "0x%llx", size); + DUMP_FIELD((&vertex_attachments[i]), "0x%llx", pointer); + } + DUMP_FIELD(c, "%d", fragment_attachment_count); + struct drm_asahi_attachment *fragment_attachments = + (void *)c->fragment_attachments; + for (unsigned i = 0; i < c->fragment_attachment_count; i++) { + DUMP_FIELD((&fragment_attachments[i]), "0x%x", order); + DUMP_FIELD((&fragment_attachments[i]), "0x%llx", size); + DUMP_FIELD((&fragment_attachments[i]), "0x%llx", pointer); + } + + agxdecode_map_read_write(ctx); +} + +void +agxdecode_drm_cmd_compute(struct agxdecode_ctx *ctx, + struct drm_asahi_params_global *params, + struct drm_asahi_cmd_compute *c, bool verbose) +{ + agxdecode_dump_file_open(); + + DUMP_FIELD(c, "%llx", flags); + DUMP_FIELD(c, "0x%llx", encoder_ptr); + agxdecode_stateful(ctx, c->encoder_ptr, "Encoder", agxdecode_cdm, verbose, + params, NULL); + DUMP_FIELD(c, "0x%x", encoder_id); + DUMP_FIELD(c, "0x%x", cmd_id); + + agxdecode_sampler_heap(ctx, c->sampler_array, c->sampler_count); + + agxdecode_map_read_write(ctx); + + if (c->helper_program & 1) { + fprintf(agxdecode_dump_stream, "Helper program:\n"); + uint8_t buf[1024]; + agx_disassemble( + buf, agxdecode_fetch_gpu_array(ctx, c->helper_program & ~1, buf), + agxdecode_dump_stream); + } +} + static void chip_id_to_params(decoder_params *params, uint32_t chip_id) { diff --git a/src/asahi/lib/decode.h b/src/asahi/lib/decode.h index 6ae6d8ec5d3..0e91be03376 100644 --- a/src/asahi/lib/decode.h +++ b/src/asahi/lib/decode.h @@ -10,6 +10,8 @@ #include #include "agx_bo.h" +#include "unstable_asahi_drm.h" + struct agxdecode_ctx; 
struct agxdecode_ctx *agxdecode_new_context(void); @@ -26,6 +28,16 @@ void agxdecode_cmdstream(struct agxdecode_ctx *ctx, unsigned cmdbuf_index, void agxdecode_image_heap(struct agxdecode_ctx *ctx, uint64_t heap, unsigned nr_entries); +void agxdecode_drm_cmd_render(struct agxdecode_ctx *ctx, + struct drm_asahi_params_global *params, + struct drm_asahi_cmd_render *cmdbuf, + bool verbose); + +void agxdecode_drm_cmd_compute(struct agxdecode_ctx *ctx, + struct drm_asahi_params_global *params, + struct drm_asahi_cmd_compute *cmdbuf, + bool verbose); + void agxdecode_dump_file_open(void); void agxdecode_track_alloc(struct agxdecode_ctx *ctx, struct agx_bo *alloc); diff --git a/src/asahi/lib/meson.build b/src/asahi/lib/meson.build index fefdbafcf10..7a99b95c689 100644 --- a/src/asahi/lib/meson.build +++ b/src/asahi/lib/meson.build @@ -9,6 +9,7 @@ libasahi_lib_files = files( 'agx_bo.c', 'agx_border.c', 'agx_device.c', + 'agx_device_virtio.c', 'agx_formats.c', 'agx_linker.c', 'agx_bg_eot.c', @@ -86,10 +87,10 @@ libagx_shaders = custom_target( libasahi_lib = static_library( 'asahi_lib', [libasahi_lib_files, libagx_shaders, agx_pack], - include_directories : inc_asahi, + include_directories : [inc_asahi, inc_virtio_gpu, inc_virtio_vdrm], c_args : [no_override_init_args], gnu_symbol_visibility : 'hidden', - link_with: [libasahi_decode], + link_with: [libasahi_decode, libvdrm], dependencies: [dep_libdrm, dep_valgrind, idep_nir], build_by_default : false, ) diff --git a/src/asahi/lib/unstable_asahi_drm.h b/src/asahi/lib/unstable_asahi_drm.h new file mode 100644 index 00000000000..09914724fb3 --- /dev/null +++ b/src/asahi/lib/unstable_asahi_drm.h @@ -0,0 +1,666 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright (C) The Asahi Linux Contributors + * + * Based on asahi_drm.h which is + * + * Copyright © 2014-2018 Broadcom + * Copyright © 2019 Collabora ltd. + */ +#ifndef _ASAHI_DRM_H_ +#define _ASAHI_DRM_H_ + +#include "drm-uapi/drm.h" + +#if defined(__cplusplus) +extern "C" { +#endif + +/* + * The UAPI defined in this file MUST NOT BE USED. End users, DO NOT attempt to + * use upstream Mesa with asahi kernels, it will blow up. Distro packagers, DO + * NOT patch upstream Mesa to do the same. 
+ */ +#define DRM_ASAHI_UNSTABLE_UABI_VERSION (0xDEADBEEF) + +#define DRM_ASAHI_GET_PARAMS 0x00 +#define DRM_ASAHI_VM_CREATE 0x01 +#define DRM_ASAHI_VM_DESTROY 0x02 +#define DRM_ASAHI_GEM_CREATE 0x03 +#define DRM_ASAHI_GEM_MMAP_OFFSET 0x04 +#define DRM_ASAHI_GEM_BIND 0x05 +#define DRM_ASAHI_QUEUE_CREATE 0x06 +#define DRM_ASAHI_QUEUE_DESTROY 0x07 +#define DRM_ASAHI_SUBMIT 0x08 +#define DRM_ASAHI_GET_TIME 0x09 + +#define DRM_ASAHI_MAX_CLUSTERS 32 + +struct drm_asahi_params_global { + __u32 unstable_uabi_version; + __u32 pad0; + + __u64 feat_compat; + __u64 feat_incompat; + + __u32 gpu_generation; + __u32 gpu_variant; + __u32 gpu_revision; + __u32 chip_id; + + __u32 num_dies; + __u32 num_clusters_total; + __u32 num_cores_per_cluster; + __u32 num_frags_per_cluster; + __u32 num_gps_per_cluster; + __u32 num_cores_total_active; + __u64 core_masks[DRM_ASAHI_MAX_CLUSTERS]; + + __u32 vm_page_size; + __u32 pad1; + __u64 vm_user_start; + __u64 vm_user_end; + __u64 vm_shader_start; + __u64 vm_shader_end; + + __u32 max_syncs_per_submission; + __u32 max_commands_per_submission; + __u32 max_commands_in_flight; + __u32 max_attachments; + + __u32 timer_frequency_hz; + __u32 min_frequency_khz; + __u32 max_frequency_khz; + __u32 max_power_mw; + + __u32 result_render_size; + __u32 result_compute_size; + + __u32 firmware_version[4]; +}; + +/* +enum drm_asahi_feat_compat { +}; +*/ + +enum drm_asahi_feat_incompat { + DRM_ASAHI_FEAT_MANDATORY_ZS_COMPRESSION = (1UL) << 0, +}; + +struct drm_asahi_get_params { + /** @extensions: Pointer to the first extension struct, if any */ + __u64 extensions; + + /** @param: Parameter group to fetch (MBZ) */ + __u32 param_group; + + /** @pad: MBZ */ + __u32 pad; + + /** @value: User pointer to write parameter struct */ + __u64 pointer; + + /** @value: Size of user buffer, max size supported on return */ + __u64 size; +}; + +struct drm_asahi_vm_create { + /** @extensions: Pointer to the first extension struct, if any */ + __u64 extensions; + + /** @value: Returned VM ID */ + __u32 vm_id; + + /** @pad: MBZ */ + __u32 pad; +}; + +struct drm_asahi_vm_destroy { + /** @extensions: Pointer to the first extension struct, if any */ + __u64 extensions; + + /** @value: VM ID to be destroyed */ + __u32 vm_id; + + /** @pad: MBZ */ + __u32 pad; +}; + +#define ASAHI_GEM_WRITEBACK (1L << 0) +#define ASAHI_GEM_VM_PRIVATE (1L << 1) + +struct drm_asahi_gem_create { + /** @extensions: Pointer to the first extension struct, if any */ + __u64 extensions; + + /** @size: Size of the BO */ + __u64 size; + + /** @flags: BO creation flags */ + __u32 flags; + + /** @handle: VM ID to assign to the BO, if ASAHI_GEM_VM_PRIVATE is set. */ + __u32 vm_id; + + /** @handle: Returned GEM handle for the BO */ + __u32 handle; +}; + +struct drm_asahi_gem_mmap_offset { + /** @extensions: Pointer to the first extension struct, if any */ + __u64 extensions; + + /** @handle: Handle for the object being mapped. 
*/ + __u32 handle; + + /** @flags: Must be zero */ + __u32 flags; + + /** @offset: The fake offset to use for subsequent mmap call */ + __u64 offset; +}; + +enum drm_asahi_bind_op { + ASAHI_BIND_OP_BIND = 0, + ASAHI_BIND_OP_UNBIND = 1, + ASAHI_BIND_OP_UNBIND_ALL = 2, +}; + +#define ASAHI_BIND_READ (1L << 0) +#define ASAHI_BIND_WRITE (1L << 1) + +struct drm_asahi_gem_bind { + /** @extensions: Pointer to the first extension struct, if any */ + __u64 extensions; + + /** @op: Bind operation */ + __u32 op; + + /** @flags: One or more of ASAHI_BIND_* */ + __u32 flags; + + /** @handle: GEM object to bind */ + __u32 handle; + + /** @vm_id: The ID of the VM to bind to */ + __u32 vm_id; + + /** @offset: Offset into the object */ + __u64 offset; + + /** @range: Number of bytes from the object to bind to addr */ + __u64 range; + + /** @addr: Address to bind to */ + __u64 addr; +}; + +enum drm_asahi_cmd_type { + DRM_ASAHI_CMD_RENDER = 0, + DRM_ASAHI_CMD_BLIT = 1, + DRM_ASAHI_CMD_COMPUTE = 2, +}; + +/* Note: this is an enum so that it can be resolved by Rust bindgen. */ +enum drm_asahi_queue_cap { + DRM_ASAHI_QUEUE_CAP_RENDER = (1UL << DRM_ASAHI_CMD_RENDER), + DRM_ASAHI_QUEUE_CAP_BLIT = (1UL << DRM_ASAHI_CMD_BLIT), + DRM_ASAHI_QUEUE_CAP_COMPUTE = (1UL << DRM_ASAHI_CMD_COMPUTE), +}; + +struct drm_asahi_queue_create { + /** @extensions: Pointer to the first extension struct, if any */ + __u64 extensions; + + /** @flags: MBZ */ + __u32 flags; + + /** @vm_id: The ID of the VM this queue is bound to */ + __u32 vm_id; + + /** @queue_caps: Bitmask of DRM_ASAHI_QUEUE_CAP_* */ + __u32 queue_caps; + + /** @priority: Queue priority, 0-3 */ + __u32 priority; + + /** @queue_id: The returned queue ID */ + __u32 queue_id; +}; + +struct drm_asahi_queue_destroy { + /** @extensions: Pointer to the first extension struct, if any */ + __u64 extensions; + + /** @queue_id: The queue ID to be destroyed */ + __u32 queue_id; +}; + +enum drm_asahi_sync_type { + DRM_ASAHI_SYNC_SYNCOBJ = 0, + DRM_ASAHI_SYNC_TIMELINE_SYNCOBJ = 1, +}; + +struct drm_asahi_sync { + /** @extensions: Pointer to the first extension struct, if any */ + __u64 extensions; + + /** @sync_type: One of drm_asahi_sync_type */ + __u32 sync_type; + + /** @handle: The sync object handle */ + __u32 handle; + + /** @timeline_value: Timeline value for timeline sync objects */ + __u64 timeline_value; +}; + +enum drm_asahi_subqueue { + DRM_ASAHI_SUBQUEUE_RENDER = 0, /* Also blit */ + DRM_ASAHI_SUBQUEUE_COMPUTE = 1, + DRM_ASAHI_SUBQUEUE_COUNT = 2, +}; + +#define DRM_ASAHI_BARRIER_NONE ~(0U) + +struct drm_asahi_command { + /** @extensions: Pointer to the first extension struct, if any */ + __u64 extensions; + + /** @cmd_type: One of drm_asahi_cmd_type */ + __u32 cmd_type; + + /** @flags: Flags for command submission */ + __u32 flags; + + /** @cmd_buffer: Pointer to the appropriate command buffer structure */ + __u64 cmd_buffer; + + /** @cmd_buffer_size: Size of the command buffer structure */ + __u64 cmd_buffer_size; + + /** @result_offset: Offset into the result BO to return information about this command */ + __u64 result_offset; + + /** @result_size: Size of the result data structure */ + __u64 result_size; + + /** @barriers: Array of command indices per subqueue to wait on */ + __u32 barriers[DRM_ASAHI_SUBQUEUE_COUNT]; +}; + +struct drm_asahi_submit { + /** @extensions: Pointer to the first extension struct, if any */ + __u64 extensions; + + /** @in_syncs: An optional array of drm_asahi_sync to wait on before starting this job. 
*/ + __u64 in_syncs; + + /** @in_syncs: An optional array of drm_asahi_sync objects to signal upon completion. */ + __u64 out_syncs; + + /** @commands: Pointer to the drm_asahi_command array of commands to submit. */ + __u64 commands; + + /** @flags: Flags for command submission (MBZ) */ + __u32 flags; + + /** @queue_id: The queue ID to be submitted to */ + __u32 queue_id; + + /** @result_handle: An optional BO handle to place result data in */ + __u32 result_handle; + + /** @in_sync_count: Number of sync objects to wait on before starting this job. */ + __u32 in_sync_count; + + /** @in_sync_count: Number of sync objects to signal upon completion of this job. */ + __u32 out_sync_count; + + /** @pad: Number of commands to be submitted */ + __u32 command_count; +}; + +struct drm_asahi_attachment { + /** @pointer: Base address of the attachment */ + __u64 pointer; + /** @size: Size of the attachment in bytes */ + __u64 size; + /** @order: Power of 2 exponent related to attachment size (?) */ + __u32 order; + /** @flags: MBZ */ + __u32 flags; +}; + +#define ASAHI_RENDER_NO_CLEAR_PIPELINE_TEXTURES (1UL << 0) +#define ASAHI_RENDER_SET_WHEN_RELOADING_Z_OR_S (1UL << 1) +#define ASAHI_RENDER_VERTEX_SPILLS (1UL << 2) +#define ASAHI_RENDER_PROCESS_EMPTY_TILES (1UL << 3) +#define ASAHI_RENDER_NO_VERTEX_CLUSTERING (1UL << 4) +#define ASAHI_RENDER_MSAA_ZS (1UL << 5) +/* XXX check */ +#define ASAHI_RENDER_NO_PREEMPTION (1UL << 6) + +struct drm_asahi_cmd_render { + /** @extensions: Pointer to the first extension struct, if any */ + __u64 extensions; + + __u64 flags; + + __u64 encoder_ptr; + + __u64 vertex_attachments; + __u64 fragment_attachments; + __u32 vertex_attachment_count; + __u32 fragment_attachment_count; + + __u32 vertex_helper_program; + __u32 fragment_helper_program; + __u32 vertex_helper_cfg; + __u32 fragment_helper_cfg; + __u64 vertex_helper_arg; + __u64 fragment_helper_arg; + + __u64 depth_buffer_load; + __u64 depth_buffer_load_stride; + __u64 depth_buffer_store; + __u64 depth_buffer_store_stride; + __u64 depth_buffer_partial; + __u64 depth_buffer_partial_stride; + __u64 depth_meta_buffer_load; + __u64 depth_meta_buffer_load_stride; + __u64 depth_meta_buffer_store; + __u64 depth_meta_buffer_store_stride; + __u64 depth_meta_buffer_partial; + __u64 depth_meta_buffer_partial_stride; + + __u64 stencil_buffer_load; + __u64 stencil_buffer_load_stride; + __u64 stencil_buffer_store; + __u64 stencil_buffer_store_stride; + __u64 stencil_buffer_partial; + __u64 stencil_buffer_partial_stride; + __u64 stencil_meta_buffer_load; + __u64 stencil_meta_buffer_load_stride; + __u64 stencil_meta_buffer_store; + __u64 stencil_meta_buffer_store_stride; + __u64 stencil_meta_buffer_partial; + __u64 stencil_meta_buffer_partial_stride; + + __u64 scissor_array; + __u64 depth_bias_array; + __u64 visibility_result_buffer; + + __u64 vertex_sampler_array; + __u32 vertex_sampler_count; + __u32 vertex_sampler_max; + + __u64 fragment_sampler_array; + __u32 fragment_sampler_count; + __u32 fragment_sampler_max; + + __u64 zls_ctrl; + __u64 ppp_multisamplectl; + __u32 ppp_ctrl; + + __u32 fb_width; + __u32 fb_height; + + __u32 utile_width; + __u32 utile_height; + + __u32 samples; + __u32 layers; + + __u32 encoder_id; + __u32 cmd_ta_id; + __u32 cmd_3d_id; + + __u32 sample_size; + __u32 tib_blocks; + __u32 iogpu_unk_214; + + __u32 merge_upper_x; + __u32 merge_upper_y; + + __u32 load_pipeline; + __u32 load_pipeline_bind; + + __u32 store_pipeline; + __u32 store_pipeline_bind; + + __u32 partial_reload_pipeline; + __u32 
partial_reload_pipeline_bind; + + __u32 partial_store_pipeline; + __u32 partial_store_pipeline_bind; + + __u32 depth_dimensions; + __u32 isp_bgobjdepth; + __u32 isp_bgobjvals; +}; + +#define ASAHI_RENDER_UNK_UNK1 (1UL << 0) +#define ASAHI_RENDER_UNK_SET_TILE_CONFIG (1UL << 1) +#define ASAHI_RENDER_UNK_SET_UTILE_CONFIG (1UL << 2) +#define ASAHI_RENDER_UNK_SET_AUX_FB_UNK (1UL << 3) +#define ASAHI_RENDER_UNK_SET_G14_UNK (1UL << 4) + +#define ASAHI_RENDER_UNK_SET_FRG_UNK_140 (1UL << 20) +#define ASAHI_RENDER_UNK_SET_FRG_UNK_158 (1UL << 21) +#define ASAHI_RENDER_UNK_SET_FRG_TILECFG (1UL << 22) +#define ASAHI_RENDER_UNK_SET_LOAD_BGOBJVALS (1UL << 23) +#define ASAHI_RENDER_UNK_SET_FRG_UNK_38 (1UL << 24) +#define ASAHI_RENDER_UNK_SET_FRG_UNK_3C (1UL << 25) + +#define ASAHI_RENDER_UNK_SET_RELOAD_ZLSCTRL (1UL << 27) +#define ASAHI_RENDER_UNK_SET_UNK_BUF_10 (1UL << 28) +#define ASAHI_RENDER_UNK_SET_FRG_UNK_MASK (1UL << 29) + +#define ASAHI_RENDER_UNK_SET_IOGPU_UNK54 (1UL << 40) +#define ASAHI_RENDER_UNK_SET_IOGPU_UNK56 (1UL << 41) +#define ASAHI_RENDER_UNK_SET_TILING_CONTROL (1UL << 42) +#define ASAHI_RENDER_UNK_SET_TILING_CONTROL_2 (1UL << 43) +#define ASAHI_RENDER_UNK_SET_VTX_UNK_F0 (1UL << 44) +#define ASAHI_RENDER_UNK_SET_VTX_UNK_F8 (1UL << 45) +#define ASAHI_RENDER_UNK_SET_VTX_UNK_118 (1UL << 46) +#define ASAHI_RENDER_UNK_SET_VTX_UNK_MASK (1UL << 47) + +#define ASAHI_RENDER_EXT_UNKNOWNS 0xff00 + +/* XXX: Do not upstream this struct */ +struct drm_asahi_cmd_render_unknowns { + /** @type: Type ID of this extension */ + __u32 type; + __u32 pad; + /** @next: Pointer to the next extension struct, if any */ + __u64 next; + + __u64 flags; + + __u64 tile_config; + __u64 utile_config; + + __u64 aux_fb_unk; + __u64 g14_unk; + __u64 frg_unk_140; + __u64 frg_unk_158; + __u64 frg_tilecfg; + __u64 load_bgobjvals; + __u64 frg_unk_38; + __u64 frg_unk_3c; + __u64 reload_zlsctrl; + __u64 unk_buf_10; + __u64 frg_unk_mask; + + __u64 iogpu_unk54; + __u64 iogpu_unk56; + __u64 tiling_control; + __u64 tiling_control_2; + __u64 vtx_unk_f0; + __u64 vtx_unk_f8; + __u64 vtx_unk_118; + __u64 vtx_unk_mask; +}; + +/* XXX check */ +#define ASAHI_COMPUTE_NO_PREEMPTION (1UL << 0) + +struct drm_asahi_cmd_compute { + __u64 flags; + + __u64 encoder_ptr; + __u64 encoder_end; + + __u64 attachments; + __u32 attachment_count; + __u32 pad; + + __u32 helper_program; + __u32 helper_cfg; + __u64 helper_arg; + + __u32 encoder_id; + __u32 cmd_id; + + __u64 sampler_array; + __u32 sampler_count; + __u32 sampler_max; + + __u32 iogpu_unk_40; + __u32 unk_mask; +}; + +enum drm_asahi_status { + DRM_ASAHI_STATUS_PENDING = 0, + DRM_ASAHI_STATUS_COMPLETE, + DRM_ASAHI_STATUS_UNKNOWN_ERROR, + DRM_ASAHI_STATUS_TIMEOUT, + DRM_ASAHI_STATUS_FAULT, + DRM_ASAHI_STATUS_KILLED, + DRM_ASAHI_STATUS_NO_DEVICE, +}; + +enum drm_asahi_fault { + DRM_ASAHI_FAULT_NONE = 0, + DRM_ASAHI_FAULT_UNKNOWN, + DRM_ASAHI_FAULT_UNMAPPED, + DRM_ASAHI_FAULT_AF_FAULT, + DRM_ASAHI_FAULT_WRITE_ONLY, + DRM_ASAHI_FAULT_READ_ONLY, + DRM_ASAHI_FAULT_NO_ACCESS, +}; + +struct drm_asahi_result_info { + /** @status: One of enum drm_asahi_status */ + __u32 status; + + /** @reason: One of drm_asahi_fault_type */ + __u32 fault_type; + + /** @unit: Unit number, hardware dependent */ + __u32 unit; + + /** @sideband: Sideband information, hardware dependent */ + __u32 sideband; + + /** @level: Page table level at which the fault occurred, hardware dependent */ + __u8 level; + + /** @read: Fault was a read */ + __u8 is_read; + + /** @pad: MBZ */ + __u16 pad; + + /** @unk_5: Extra bits, hardware 
dependent */ + __u32 extra; + + /** @address: Fault address, cache line aligned */ + __u64 address; +}; + +#define DRM_ASAHI_RESULT_RENDER_TVB_GROW_OVF (1UL << 0) +#define DRM_ASAHI_RESULT_RENDER_TVB_GROW_MIN (1UL << 1) +#define DRM_ASAHI_RESULT_RENDER_TVB_OVERFLOWED (1UL << 2) + +struct drm_asahi_result_render { + /** @info: Common result information */ + struct drm_asahi_result_info info; + + /** @flags: Zero or more of DRM_ASAHI_RESULT_RENDER_* */ + __u64 flags; + + /** @vertex_ts_start: Timestamp of the start of vertex processing */ + __u64 vertex_ts_start; + + /** @vertex_ts_end: Timestamp of the end of vertex processing */ + __u64 vertex_ts_end; + + /** @fragment_ts_start: Timestamp of the start of fragment processing */ + __u64 fragment_ts_start; + + /** @fragment_ts_end: Timestamp of the end of fragment processing */ + __u64 fragment_ts_end; + + /** @tvb_size_bytes: TVB size at the start of this render */ + __u64 tvb_size_bytes; + + /** @tvb_usage_bytes: Total TVB usage in bytes for this render */ + __u64 tvb_usage_bytes; + + /** @num_tvb_overflows: Number of TVB overflows that occurred for this render */ + __u32 num_tvb_overflows; +}; + +struct drm_asahi_result_compute { + /** @info: Common result information */ + struct drm_asahi_result_info info; + + /** @flags: Zero or more of DRM_ASAHI_RESULT_COMPUTE_* */ + __u64 flags; + + /** @ts_start: Timestamp of the start of this compute command */ + __u64 ts_start; + + /** @ts_end: Timestamp of the end of this compute command */ + __u64 ts_end; +}; + +struct drm_asahi_get_time { + /** @extensions: Pointer to the first extension struct, if any */ + __u64 extensions; + + /** @flags: MBZ. */ + __u64 flags; + + /** @tv_sec: On return, seconds part of a point in time */ + __s64 tv_sec; + + /** @tv_nsec: On return, nanoseconds part of a point in time */ + __s64 tv_nsec; + + /** @gpu_timestamp: On return, the GPU timestamp at that point in time */ + __u64 gpu_timestamp; +}; + +/* Note: this is an enum so that it can be resolved by Rust bindgen. 
*/ +enum { + DRM_IOCTL_ASAHI_GET_PARAMS = DRM_IOWR(DRM_COMMAND_BASE + DRM_ASAHI_GET_PARAMS, struct drm_asahi_get_params), + DRM_IOCTL_ASAHI_VM_CREATE = DRM_IOWR(DRM_COMMAND_BASE + DRM_ASAHI_VM_CREATE, struct drm_asahi_vm_create), + DRM_IOCTL_ASAHI_VM_DESTROY = DRM_IOW(DRM_COMMAND_BASE + DRM_ASAHI_VM_DESTROY, struct drm_asahi_vm_destroy), + DRM_IOCTL_ASAHI_GEM_CREATE = DRM_IOWR(DRM_COMMAND_BASE + DRM_ASAHI_GEM_CREATE, struct drm_asahi_gem_create), + DRM_IOCTL_ASAHI_GEM_MMAP_OFFSET = DRM_IOWR(DRM_COMMAND_BASE + DRM_ASAHI_GEM_MMAP_OFFSET, struct drm_asahi_gem_mmap_offset), + DRM_IOCTL_ASAHI_GEM_BIND = DRM_IOW(DRM_COMMAND_BASE + DRM_ASAHI_GEM_BIND, struct drm_asahi_gem_bind), + DRM_IOCTL_ASAHI_QUEUE_CREATE = DRM_IOWR(DRM_COMMAND_BASE + DRM_ASAHI_QUEUE_CREATE, struct drm_asahi_queue_create), + DRM_IOCTL_ASAHI_QUEUE_DESTROY = DRM_IOW(DRM_COMMAND_BASE + DRM_ASAHI_QUEUE_DESTROY, struct drm_asahi_queue_destroy), + DRM_IOCTL_ASAHI_SUBMIT = DRM_IOW(DRM_COMMAND_BASE + DRM_ASAHI_SUBMIT, struct drm_asahi_submit), + DRM_IOCTL_ASAHI_GET_TIME = DRM_IOWR(DRM_COMMAND_BASE + DRM_ASAHI_GET_TIME, struct drm_asahi_get_time), +}; + +#if defined(__cplusplus) +} +#endif + +#endif /* _ASAHI_DRM_H_ */ diff --git a/src/gallium/drivers/asahi/agx_batch.c b/src/gallium/drivers/asahi/agx_batch.c index ccf11c7fb59..6143ea028fe 100644 --- a/src/gallium/drivers/asahi/agx_batch.c +++ b/src/gallium/drivers/asahi/agx_batch.c @@ -5,11 +5,13 @@ */ #include +#include "asahi/lib/agx_device_virtio.h" #include "asahi/lib/decode.h" #include "util/bitset.h" #include "util/u_dynarray.h" #include "util/u_range.h" #include "agx_state.h" +#include "vdrm.h" #define foreach_active(ctx, idx) \ BITSET_FOREACH_SET(idx, ctx->batches.active, AGX_MAX_BATCHES) @@ -156,13 +158,162 @@ agx_batch_init(struct agx_context *ctx, assert(!ret && batch->syncobj); } + batch->result_off = + (2 * sizeof(union agx_batch_result)) * agx_batch_idx(batch); + batch->result = + (void *)(((uint8_t *)ctx->result_buf->ptr.cpu) + batch->result_off); + memset(batch->result, 0, sizeof(union agx_batch_result) * 2); + agx_batch_mark_active(batch); } +const char *status_str[] = { + [DRM_ASAHI_STATUS_PENDING] = "(pending)", + [DRM_ASAHI_STATUS_COMPLETE] = "Complete", + [DRM_ASAHI_STATUS_UNKNOWN_ERROR] = "UNKNOWN ERROR", + [DRM_ASAHI_STATUS_TIMEOUT] = "TIMEOUT", + [DRM_ASAHI_STATUS_FAULT] = "FAULT", + [DRM_ASAHI_STATUS_KILLED] = "KILLED", + [DRM_ASAHI_STATUS_NO_DEVICE] = "NO DEVICE", +}; + +const char *fault_type_str[] = { + [DRM_ASAHI_FAULT_NONE] = "(none)", + [DRM_ASAHI_FAULT_UNKNOWN] = "Unknown", + [DRM_ASAHI_FAULT_UNMAPPED] = "Unmapped", + [DRM_ASAHI_FAULT_AF_FAULT] = "AF Fault", + [DRM_ASAHI_FAULT_WRITE_ONLY] = "Write Only", + [DRM_ASAHI_FAULT_READ_ONLY] = "Read Only", + [DRM_ASAHI_FAULT_NO_ACCESS] = "No Access", +}; + +const char *low_unit_str[16] = { + "DCMP", "UL1C", "CMP", "GSL1", "IAP", "VCE", "TE", "RAS", + "VDM", "PPP", "IPF", "IPF_CPF", "VF", "VF_CPF", "ZLS", "UNK", +}; + +const char *mid_unit_str[16] = { + "UNK", "dPM", "dCDM_KS0", "dCDM_KS1", "dCDM_KS2", "dIPP", + "dIPP_CS", "dVDM_CSD", "dVDM_SSD", "dVDM_ILF", "dVDM_ILD", "dRDE0", + "dRDE1", "FC", "GSL2", "UNK", +}; + +const char *high_unit_str[16] = { + "gPM_SP", "gVDM_CSD_SP", "gVDM_SSD_SP", "gVDM_ILF_SP", + "gVDM_TFP_SP", "gVDM_MMB_SP", "gCDM_CS_KS0_SP", "gCDM_CS_KS1_SP", + "gCDM_CS_KS2_SP", "gCDM_KS0_SP", "gCDM_KS1_SP", "gCDM_KS2_SP", + "gIPP_SP", "gIPP_CS_SP", "gRDE0_SP", "gRDE1_SP", +}; + +static void +agx_print_result(struct agx_device *dev, struct agx_context *ctx, + struct drm_asahi_result_info *info, 
unsigned batch_idx, + bool is_compute) +{ + if (unlikely(info->status != DRM_ASAHI_STATUS_COMPLETE)) { + ctx->any_faults = true; + } + + if (likely(info->status == DRM_ASAHI_STATUS_COMPLETE && + !((dev)->debug & AGX_DBG_STATS))) + return; + + if (is_compute) { + struct drm_asahi_result_compute *r = (void *)info; + float time = (r->ts_end - r->ts_start) / + (float)dev->params.timer_frequency_hz; + + mesa_logw( + "[Batch %d] Compute %s: %.06f\n", batch_idx, + info->status < ARRAY_SIZE(status_str) ? status_str[info->status] : "?", + time); + } else { + struct drm_asahi_result_render *r = (void *)info; + float time_vtx = (r->vertex_ts_end - r->vertex_ts_start) / + (float)dev->params.timer_frequency_hz; + float time_frag = (r->fragment_ts_end - r->fragment_ts_start) / + (float)dev->params.timer_frequency_hz; + mesa_logw( + "[Batch %d] Render %s: TVB %9ld/%9ld bytes (%d ovf) %c%c%c | vtx %.06f frag %.06f\n", + batch_idx, + info->status < ARRAY_SIZE(status_str) ? status_str[info->status] : "?", + (long)r->tvb_usage_bytes, (long)r->tvb_size_bytes, + (int)r->num_tvb_overflows, + r->flags & DRM_ASAHI_RESULT_RENDER_TVB_GROW_OVF ? 'G' : ' ', + r->flags & DRM_ASAHI_RESULT_RENDER_TVB_GROW_MIN ? 'M' : ' ', + r->flags & DRM_ASAHI_RESULT_RENDER_TVB_OVERFLOWED ? 'O' : ' ', + time_vtx, time_frag); + } + + if (info->fault_type != DRM_ASAHI_FAULT_NONE) { + const char *unit_name; + int unit_index; + + switch (info->unit) { + case 0x00 ... 0x9f: + unit_name = low_unit_str[info->unit & 0xf]; + unit_index = info->unit >> 4; + break; + case 0xa0 ... 0xaf: + unit_name = mid_unit_str[info->unit & 0xf]; + unit_index = 0; + break; + case 0xb0 ... 0xb7: + unit_name = "GL2CC_META"; + unit_index = info->unit & 0x7; + break; + case 0xb8: + unit_name = "GL2CC_MB"; + unit_index = 0; + break; + case 0xe0 ... 0xff: + unit_name = high_unit_str[info->unit & 0xf]; + unit_index = (info->unit >> 4) & 1; + break; + default: + unit_name = "UNK"; + unit_index = 0; + break; + } + + mesa_logw( + "[Batch %d] Fault: %s : Addr 0x%llx %c Unit %02x (%s/%d) SB 0x%02x L%d Extra 0x%x\n", + batch_idx, + info->fault_type < ARRAY_SIZE(fault_type_str) + ? fault_type_str[info->fault_type] + : "?", + (long long)info->address, info->is_read ? 'r' : 'W', info->unit, + unit_name, unit_index, info->sideband, info->level, info->extra); + + agx_debug_fault(dev, info->address); + } + + /* Obscurely, we need to tolerate faults to pass the robustness parts of the + * CTS, so we can't assert that we don't fault. But it's helpful for any sort + * of debugging to crash on fault.
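Stepping back to the ioctl numbers defined earlier in this header: from userspace they are driven through drmIoctl(). A minimal sketch of querying the global parameter block, assuming drm_asahi_get_params carries the param_group/size/pointer fields the driver relies on (error handling elided):

```c
#include <stdint.h>
#include <string.h>
#include <xf86drm.h>
#include "asahi/lib/unstable_asahi_drm.h"

/* Sketch: fetch struct drm_asahi_params_global from the kernel driver. */
static int
query_global_params(int fd, struct drm_asahi_params_global *out)
{
   struct drm_asahi_get_params get = {0};

   get.param_group = 0;                    /* group 0: global parameter block */
   get.size = sizeof(*out);
   get.pointer = (uint64_t)(uintptr_t)out; /* userspace destination buffer */

   memset(out, 0, sizeof(*out));
   return drmIoctl(fd, DRM_IOCTL_ASAHI_GET_PARAMS, &get);
}
```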
+ */ + if (dev->debug) { + assert(info->status == DRM_ASAHI_STATUS_COMPLETE || + info->status == DRM_ASAHI_STATUS_KILLED); + } +} + static void agx_batch_print_stats(struct agx_device *dev, struct agx_batch *batch) { - unreachable("Linux UAPI not yet upstream"); + unsigned batch_idx = agx_batch_idx(batch); + + if (!batch->result) + return; + + if (batch->cdm.bo) { + agx_print_result(dev, batch->ctx, &batch->result[0].compute.info, + batch_idx, true); + } + + if (batch->vdm.bo) { + agx_print_result(dev, batch->ctx, &batch->result[1].render.info, + batch_idx, false); + } } static void @@ -175,7 +326,18 @@ agx_batch_cleanup(struct agx_context *ctx, struct agx_batch *batch, bool reset) assert(ctx->batch != batch); uint64_t begin_ts = ~0, end_ts = 0; - /* TODO: UAPI pending */ + if (batch->result) { + if (batch->cdm.bo) { + begin_ts = MIN2(begin_ts, batch->result[0].compute.ts_start); + end_ts = MAX2(end_ts, batch->result[0].compute.ts_end); + } + + if (batch->vdm.bo) { + begin_ts = MIN2(begin_ts, batch->result[1].render.vertex_ts_start); + end_ts = MAX2(end_ts, batch->result[1].render.fragment_ts_end); + } + } + agx_finish_batch_queries(batch, begin_ts, end_ts); if (reset) { @@ -197,7 +359,8 @@ agx_batch_cleanup(struct agx_context *ctx, struct agx_batch *batch, bool reset) if (writer == batch) agx_writer_remove(ctx, handle); - p_atomic_cmpxchg(&bo->writer_syncobj, batch->syncobj, 0); + p_atomic_cmpxchg(&bo->writer, + agx_bo_writer(ctx->queue_id, batch->syncobj), 0); agx_bo_unreference(agx_lookup_bo(dev, handle)); } @@ -215,6 +378,9 @@ agx_batch_cleanup(struct agx_context *ctx, struct agx_batch *batch, bool reset) if (!(dev->debug & (AGX_DBG_TRACE | AGX_DBG_SYNC))) { agx_batch_print_stats(dev, batch); } + + util_unreference_framebuffer_state(&batch->key); + agx_batch_mark_complete(batch); } int @@ -566,8 +732,8 @@ agx_add_sync(struct drm_asahi_sync *syncs, unsigned *count, uint32_t handle) void agx_batch_submit(struct agx_context *ctx, struct agx_batch *batch, - uint32_t barriers, enum drm_asahi_cmd_type cmd_type, - void *cmdbuf) + struct drm_asahi_cmd_compute *compute, + struct drm_asahi_cmd_render *render) { struct agx_device *dev = agx_device(ctx->base.screen); struct agx_screen *screen = agx_screen(ctx->base.screen); @@ -579,6 +745,9 @@ agx_batch_submit(struct agx_context *ctx, struct agx_batch *batch, feedback = true; #endif + /* Timer queries use the feedback timestamping */ + feedback |= (batch->timestamps.size > 0); + if (!feedback) batch->result = NULL; @@ -597,6 +766,29 @@ agx_batch_submit(struct agx_context *ctx, struct agx_batch *batch, .handle = batch->syncobj, }; + /* This lock protects against a subtle race scenario: + * - Context 1 submits and registers itself as writer for a BO + * - Context 2 runs the below loop, and finds the writer syncobj + * - Context 1 is destroyed, + * - flushing all batches, unregistering itself as a writer, and + * - Destroying syncobjs for all batches + * - Context 2 submits, with a now invalid syncobj ID + * + * Since batch syncobjs are only destroyed on context destruction, we can + * protect against this scenario with a screen-wide rwlock to ensure that + * the syncobj destroy code cannot run concurrently with any other + * submission. If a submit runs before the wrlock is taken, the syncobjs + * must still exist (even if the batch was flushed and no longer a writer). + * If it runs after the wrlock is released, then by definition the + * just-destroyed syncobjs cannot be writers for any BO at that point. 
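In short, the rule being described is: every submission path holds the screen's destroy lock for reading, and only context teardown takes it for writing around syncobj destruction. A condensed sketch of that pairing, using the same util/rwlock.h primitives the driver uses; the surrounding details are elided:

```c
#include "util/rwlock.h"

struct u_rwlock destroy_lock; /* one per screen, initialized at screen creation */

static void
submit_path(void)
{
   u_rwlock_rdlock(&destroy_lock);
   /* ... look up writer syncobjs of other contexts and submit; any syncobj
    * observed here is guaranteed to still exist ...
    */
   u_rwlock_rdunlock(&destroy_lock);
}

static void
context_destroy_path(void)
{
   u_rwlock_wrlock(&destroy_lock);
   /* ... destroy each batch syncobj; no submit can be racing here ... */
   u_rwlock_wrunlock(&destroy_lock);
}
```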
+ * + * A screen-wide (not device-wide) rwlock is sufficient because by definition + * resources can only be implicitly shared within a screen. Any shared + * resources across screens must have been imported and will go through the + * AGX_BO_SHARED path instead, which has no race (but is slower). + */ + u_rwlock_rdlock(&screen->destroy_lock); + int handle; AGX_BATCH_FOREACH_BO_HANDLE(batch, handle) { struct agx_bo *bo = agx_lookup_bo(dev, handle); @@ -624,6 +816,29 @@ agx_batch_submit(struct agx_context *ctx, struct agx_batch *batch, /* And keep track of the BO for cloning the out_sync */ shared_bos[shared_bo_count++] = bo; + } else { + /* Deal with BOs which are not externally shared, but which have been + * written from another context within the same screen. We also need to + * wait on these using their syncobj. + */ + uint64_t writer = p_atomic_read_relaxed(&bo->writer); + if (writer && agx_bo_writer_queue(writer) != ctx->queue_id) { + batch_debug(batch, "Waits on inter-context BO @ 0x%" PRIx64, + bo->ptr.gpu); + + agx_add_sync(in_syncs, &in_sync_count, + agx_bo_writer_syncobj(writer)); + shared_bos[shared_bo_count++] = NULL; + } + } + } + + if (dev->debug & AGX_DBG_SCRATCH) { + if (compute) + agx_scratch_debug_pre(&ctx->scratch_cs); + if (render) { + agx_scratch_debug_pre(&ctx->scratch_vs); + agx_scratch_debug_pre(&ctx->scratch_fs); } } @@ -631,9 +846,71 @@ agx_batch_submit(struct agx_context *ctx, struct agx_batch *batch, agx_add_sync(in_syncs, &in_sync_count, agx_get_in_sync(ctx)); /* Submit! */ - /* TODO: UAPI */ - (void)screen; - (void)out_sync; + struct drm_asahi_command commands[2]; + unsigned command_count = 0; + + if (compute) { + commands[command_count++] = (struct drm_asahi_command){ + .cmd_type = DRM_ASAHI_CMD_COMPUTE, + .flags = 0, + .cmd_buffer = (uint64_t)(uintptr_t)compute, + .cmd_buffer_size = sizeof(struct drm_asahi_cmd_compute), + .result_offset = feedback ? batch->result_off : 0, + .result_size = feedback ? sizeof(union agx_batch_result) : 0, + /* Barrier on previous submission */ + .barriers = {0, 0}, + }; + } + + if (render) { + commands[command_count++] = (struct drm_asahi_command){ + .cmd_type = DRM_ASAHI_CMD_RENDER, + .flags = 0, + .cmd_buffer = (uint64_t)(uintptr_t)render, + .cmd_buffer_size = sizeof(struct drm_asahi_cmd_render), + .result_offset = + feedback ? (batch->result_off + sizeof(union agx_batch_result)) : 0, + .result_size = feedback ? sizeof(union agx_batch_result) : 0, + /* Barrier on previous submission */ + .barriers = {compute ? DRM_ASAHI_BARRIER_NONE : 0, compute ? 1 : 0}, + }; + } + + struct drm_asahi_submit submit = { + .flags = 0, + .queue_id = ctx->queue_id, + .result_handle = feedback ? ctx->result_buf->handle : 0, + .in_sync_count = in_sync_count, + .out_sync_count = 1, + .command_count = command_count, + .in_syncs = (uint64_t)(uintptr_t)(in_syncs), + .out_syncs = (uint64_t)(uintptr_t)(&out_sync), + .commands = (uint64_t)(uintptr_t)(&commands[0]), + }; + + int ret = dev->ops.submit(dev, &submit, ctx->result_buf->vbo_res_id); + + u_rwlock_rdunlock(&screen->destroy_lock); + + if (ret) { + if (compute) { + fprintf(stderr, "DRM_IOCTL_ASAHI_SUBMIT compute failed: %m\n"); + } + + if (render) { + struct drm_asahi_cmd_render *c = render; + fprintf( + stderr, + "DRM_IOCTL_ASAHI_SUBMIT render failed: %m (%dx%d tile %dx%d layers %d samples %d)\n", + c->fb_width, c->fb_height, c->utile_width, c->utile_height, + c->layers, c->samples); + } + + assert(0); + } + + if (ret == ENODEV) + abort(); /* Now stash our batch fence into any shared BOs. 
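The result_offset/result_size values in the commands above index into the per-context result buffer, which reserves two agx_batch_result slots per batch: the first for the compute command, the second for the render command. A small sketch of that addressing, mirroring how batch->result_off is computed; it assumes the driver's union agx_batch_result type is in scope:

```c
/* Sketch: offset of a batch's result slot inside ctx->result_buf, which is
 * allocated as 2 * sizeof(union agx_batch_result) * AGX_MAX_BATCHES.
 */
static inline uint32_t
batch_result_offset(unsigned batch_idx, bool render)
{
   uint32_t base = (2 * sizeof(union agx_batch_result)) * batch_idx;

   /* Slot 0 holds the compute result, slot 1 the render result. */
   return base + (render ? sizeof(union agx_batch_result) : 0);
}
```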
*/ if (shared_bo_count) { @@ -644,6 +921,9 @@ agx_batch_submit(struct agx_context *ctx, struct agx_batch *batch, assert(out_sync_fd >= 0); for (unsigned i = 0; i < shared_bo_count; i++) { + if (!shared_bos[i]) + continue; + batch_debug(batch, "Signals shared BO @ 0x%" PRIx64, shared_bos[i]->ptr.gpu); @@ -674,7 +954,7 @@ agx_batch_submit(struct agx_context *ctx, struct agx_batch *batch, /* But any BOs written by active batches are ours */ assert(writer == batch && "exclusive writer"); - p_atomic_set(&bo->writer_syncobj, batch->syncobj); + p_atomic_set(&bo->writer, agx_bo_writer(ctx->queue_id, batch->syncobj)); } free(in_syncs); @@ -682,11 +962,16 @@ agx_batch_submit(struct agx_context *ctx, struct agx_batch *batch, if (dev->debug & (AGX_DBG_TRACE | AGX_DBG_SYNC | AGX_DBG_SCRATCH)) { if (dev->debug & AGX_DBG_TRACE) { - /* agxdecode DRM commands */ - switch (cmd_type) { - default: - unreachable("Linux UAPI not yet upstream"); + if (compute) { + agxdecode_drm_cmd_compute(dev->agxdecode, &dev->params, compute, + true); } + + if (render) { + agxdecode_drm_cmd_render(dev->agxdecode, &dev->params, render, + true); + } + agxdecode_next_frame(); } @@ -695,6 +980,19 @@ agx_batch_submit(struct agx_context *ctx, struct agx_batch *batch, assert(!ret); agx_batch_print_stats(dev, batch); + + if (dev->debug & AGX_DBG_SCRATCH) { + if (compute) { + fprintf(stderr, "CS scratch:\n"); + agx_scratch_debug_post(&ctx->scratch_cs); + } + if (render) { + fprintf(stderr, "VS scratch:\n"); + agx_scratch_debug_post(&ctx->scratch_vs); + fprintf(stderr, "FS scratch:\n"); + agx_scratch_debug_post(&ctx->scratch_fs); + } + } } agx_batch_mark_submitted(batch); @@ -767,6 +1065,9 @@ agx_batch_reset(struct agx_context *ctx, struct agx_batch *batch) if (ctx->batch == batch) ctx->batch = NULL; + /* Elide printing stats */ + batch->result = NULL; + agx_batch_cleanup(ctx, batch, true); } diff --git a/src/gallium/drivers/asahi/agx_pipe.c b/src/gallium/drivers/asahi/agx_pipe.c index 5f49e69cfbb..b3b3d29850a 100644 --- a/src/gallium/drivers/asahi/agx_pipe.c +++ b/src/gallium/drivers/asahi/agx_pipe.c @@ -12,6 +12,7 @@ #include "asahi/layout/layout.h" #include "asahi/lib/agx_formats.h" #include "asahi/lib/decode.h" +#include "asahi/lib/unstable_asahi_drm.h" #include "drm-uapi/drm_fourcc.h" #include "frontend/winsys_handle.h" #include "gallium/auxiliary/renderonly/renderonly.h" @@ -25,6 +26,7 @@ #include "pipe/p_defines.h" #include "pipe/p_screen.h" #include "pipe/p_state.h" +#include "util/bitscan.h" #include "util/format/u_format.h" #include "util/half_float.h" #include "util/macros.h" @@ -933,6 +935,7 @@ agx_transfer_map(struct pipe_context *pctx, struct pipe_resource *resource, { struct agx_context *ctx = agx_context(pctx); struct agx_resource *rsrc = agx_resource(resource); + struct agx_device *dev = agx_device(ctx->base.screen); /* Can't map tiled/compressed directly */ if ((usage & PIPE_MAP_DIRECTLY) && rsrc->modifier != DRM_FORMAT_MOD_LINEAR) @@ -996,11 +999,11 @@ agx_transfer_map(struct pipe_context *pctx, struct pipe_resource *resource, agx_sync_writer(ctx, staging, "GPU read staging blit"); } - agx_bo_mmap(staging->bo); + dev->ops.bo_mmap(staging->bo); return staging->bo->ptr.cpu; } - agx_bo_mmap(rsrc->bo); + dev->ops.bo_mmap(rsrc->bo); if (ail_is_level_twiddled_uncompressed(&rsrc->layout, level)) { /* Should never happen for buffers, and it's not safe */ @@ -1226,6 +1229,323 @@ agx_flush_resource(struct pipe_context *pctx, struct pipe_resource *pres) } } +#define MAX_ATTACHMENTS 16 + +struct attachments { + struct 
drm_asahi_attachment list[MAX_ATTACHMENTS]; + size_t count; +}; + +static void +asahi_add_attachment(struct attachments *att, struct agx_resource *rsrc, + struct pipe_surface *surf) +{ + assert(att->count < MAX_ATTACHMENTS); + int idx = att->count++; + + att->list[idx].size = rsrc->layout.size_B; + att->list[idx].pointer = rsrc->bo->ptr.gpu; + att->list[idx].order = 1; // TODO: What does this do? + att->list[idx].flags = 0; +} + +static bool +is_aligned(unsigned x, unsigned pot_alignment) +{ + assert(util_is_power_of_two_nonzero(pot_alignment)); + return (x & (pot_alignment - 1)) == 0; +} + +static void +agx_cmdbuf(struct agx_device *dev, struct drm_asahi_cmd_render *c, + struct attachments *att, struct agx_pool *pool, + struct agx_batch *batch, struct pipe_framebuffer_state *framebuffer, + uint64_t encoder_ptr, uint64_t encoder_id, uint64_t cmd_ta_id, + uint64_t cmd_3d_id, uint64_t scissor_ptr, uint64_t depth_bias_ptr, + uint64_t visibility_result_ptr, struct asahi_bg_eot pipeline_clear, + struct asahi_bg_eot pipeline_load, + struct asahi_bg_eot pipeline_store, bool clear_pipeline_textures, + double clear_depth, unsigned clear_stencil, + struct agx_tilebuffer_layout *tib) +{ + memset(c, 0, sizeof(*c)); + + c->encoder_ptr = encoder_ptr; + c->encoder_id = encoder_id; + c->cmd_3d_id = cmd_3d_id; + c->cmd_ta_id = cmd_ta_id; + + /* bit 0 specifies OpenGL clip behaviour. Since ARB_clip_control is + * advertised, we don't set it and lower in the vertex shader. + */ + c->ppp_ctrl = 0x202; + + c->fb_width = framebuffer->width; + c->fb_height = framebuffer->height; + + c->iogpu_unk_214 = 0xc000; + + c->isp_bgobjvals = 0x300; + + struct agx_resource *zres = NULL, *sres = NULL; + + agx_pack(&c->zls_ctrl, ZLS_CONTROL, zls_control) { + + if (framebuffer->zsbuf) { + struct pipe_surface *zsbuf = framebuffer->zsbuf; + struct agx_resource *zsres = agx_resource(zsbuf->texture); + + unsigned level = zsbuf->u.tex.level; + unsigned first_layer = zsbuf->u.tex.first_layer; + + const struct util_format_description *desc = util_format_description( + agx_resource(zsbuf->texture)->layout.format); + + assert(desc->format == PIPE_FORMAT_Z32_FLOAT || + desc->format == PIPE_FORMAT_Z16_UNORM || + desc->format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT || + desc->format == PIPE_FORMAT_S8_UINT); + + c->depth_dimensions = + (framebuffer->width - 1) | ((framebuffer->height - 1) << 15); + + if (util_format_has_depth(desc)) + zres = zsres; + else + sres = zsres; + + if (zsres->separate_stencil) + sres = zsres->separate_stencil; + + if (zres) { + bool clear = (batch->clear & PIPE_CLEAR_DEPTH); + bool load = (batch->load & PIPE_CLEAR_DEPTH); + + zls_control.z_store_enable = (batch->resolve & PIPE_CLEAR_DEPTH); + zls_control.z_load_enable = !clear && load; + + c->depth_buffer_load = agx_map_texture_gpu(zres, first_layer) + + ail_get_level_offset_B(&zres->layout, level); + + c->depth_buffer_store = c->depth_buffer_load; + c->depth_buffer_partial = c->depth_buffer_load; + + /* Main stride in pages */ + assert((zres->layout.depth_px == 1 || + is_aligned(zres->layout.layer_stride_B, AIL_PAGESIZE)) && + "Page aligned Z layers"); + + unsigned stride_pages = zres->layout.layer_stride_B / AIL_PAGESIZE; + c->depth_buffer_load_stride = ((stride_pages - 1) << 14) | 1; + c->depth_buffer_store_stride = c->depth_buffer_load_stride; + c->depth_buffer_partial_stride = c->depth_buffer_load_stride; + + assert(zres->layout.tiling != AIL_TILING_LINEAR && "must tile"); + + if (ail_is_compressed(&zres->layout)) { + c->depth_meta_buffer_load = + 
agx_map_texture_gpu(zres, 0) + + zres->layout.metadata_offset_B + + (first_layer * zres->layout.compression_layer_stride_B) + + zres->layout.level_offsets_compressed_B[level]; + + /* Meta stride in cache lines */ + assert(is_aligned(zres->layout.compression_layer_stride_B, + AIL_CACHELINE) && + "Cacheline aligned Z meta layers"); + unsigned stride_lines = + zres->layout.compression_layer_stride_B / AIL_CACHELINE; + c->depth_meta_buffer_load_stride = (stride_lines - 1) << 14; + + c->depth_meta_buffer_store = c->depth_meta_buffer_load; + c->depth_meta_buffer_store_stride = + c->depth_meta_buffer_load_stride; + c->depth_meta_buffer_partial = c->depth_meta_buffer_load; + c->depth_meta_buffer_partial_stride = + c->depth_meta_buffer_load_stride; + + zls_control.z_compress_1 = true; + zls_control.z_compress_2 = true; + } + + if (zres->base.format == PIPE_FORMAT_Z16_UNORM) { + const float scale = 0xffff; + c->isp_bgobjdepth = + (uint16_t)(SATURATE(clear_depth) * scale + 0.5f); + zls_control.z_format = AGX_ZLS_FORMAT_16; + c->iogpu_unk_214 |= 0x40000; + } else { + c->isp_bgobjdepth = fui(clear_depth); + zls_control.z_format = AGX_ZLS_FORMAT_32F; + } + } + + if (sres) { + bool clear = (batch->clear & PIPE_CLEAR_STENCIL); + bool load = (batch->load & PIPE_CLEAR_STENCIL); + + zls_control.s_store_enable = (batch->resolve & PIPE_CLEAR_STENCIL); + zls_control.s_load_enable = !clear && load; + + c->stencil_buffer_load = + agx_map_texture_gpu(sres, first_layer) + + ail_get_level_offset_B(&sres->layout, level); + + c->stencil_buffer_store = c->stencil_buffer_load; + c->stencil_buffer_partial = c->stencil_buffer_load; + + /* Main stride in pages */ + assert((sres->layout.depth_px == 1 || + is_aligned(sres->layout.layer_stride_B, AIL_PAGESIZE)) && + "Page aligned S layers"); + unsigned stride_pages = sres->layout.layer_stride_B / AIL_PAGESIZE; + c->stencil_buffer_load_stride = ((stride_pages - 1) << 14) | 1; + c->stencil_buffer_store_stride = c->stencil_buffer_load_stride; + c->stencil_buffer_partial_stride = c->stencil_buffer_load_stride; + + if (ail_is_compressed(&sres->layout)) { + c->stencil_meta_buffer_load = + agx_map_texture_gpu(sres, 0) + + sres->layout.metadata_offset_B + + (first_layer * sres->layout.compression_layer_stride_B) + + sres->layout.level_offsets_compressed_B[level]; + + /* Meta stride in cache lines */ + assert(is_aligned(sres->layout.compression_layer_stride_B, + AIL_CACHELINE) && + "Cacheline aligned S meta layers"); + unsigned stride_lines = + sres->layout.compression_layer_stride_B / AIL_CACHELINE; + c->stencil_meta_buffer_load_stride = (stride_lines - 1) << 14; + + c->stencil_meta_buffer_store = c->stencil_meta_buffer_load; + c->stencil_meta_buffer_store_stride = + c->stencil_meta_buffer_load_stride; + c->stencil_meta_buffer_partial = c->stencil_meta_buffer_load; + c->stencil_meta_buffer_partial_stride = + c->stencil_meta_buffer_load_stride; + + zls_control.s_compress_1 = true; + zls_control.s_compress_2 = true; + } + + c->isp_bgobjvals |= clear_stencil; + } + } + } + + if (clear_pipeline_textures) + c->flags |= ASAHI_RENDER_SET_WHEN_RELOADING_Z_OR_S; + else + c->flags |= ASAHI_RENDER_NO_CLEAR_PIPELINE_TEXTURES; + + if (zres && !(batch->clear & PIPE_CLEAR_DEPTH)) + c->flags |= ASAHI_RENDER_SET_WHEN_RELOADING_Z_OR_S; + + if (sres && !(batch->clear & PIPE_CLEAR_STENCIL)) + c->flags |= ASAHI_RENDER_SET_WHEN_RELOADING_Z_OR_S; + + if (dev->debug & AGX_DBG_NOCLUSTER) + c->flags |= ASAHI_RENDER_NO_VERTEX_CLUSTERING; + + /* XXX is this for just MSAA+Z+S or MSAA+(Z|S)? 
*/ + if (tib->nr_samples > 1 && framebuffer->zsbuf) + c->flags |= ASAHI_RENDER_MSAA_ZS; + + memcpy(&c->load_pipeline_bind, &pipeline_clear.counts, + sizeof(struct agx_counts_packed)); + + memcpy(&c->store_pipeline_bind, &pipeline_store.counts, + sizeof(struct agx_counts_packed)); + + memcpy(&c->partial_reload_pipeline_bind, &pipeline_load.counts, + sizeof(struct agx_counts_packed)); + + memcpy(&c->partial_store_pipeline_bind, &pipeline_store.counts, + sizeof(struct agx_counts_packed)); + + /* XXX is this correct? */ + c->load_pipeline = pipeline_clear.usc | (framebuffer->nr_cbufs >= 4 ? 8 : 4); + c->store_pipeline = pipeline_store.usc | 4; + c->partial_reload_pipeline = pipeline_load.usc | 4; + c->partial_store_pipeline = pipeline_store.usc | 4; + + c->utile_width = tib->tile_size.width; + c->utile_height = tib->tile_size.height; + + c->samples = tib->nr_samples; + c->layers = MAX2(util_framebuffer_get_num_layers(framebuffer), 1); + + c->ppp_multisamplectl = batch->uniforms.ppp_multisamplectl; + c->sample_size = tib->sample_size_B; + + /* XXX OR 0x80 with eMRT? */ + c->tib_blocks = ALIGN_POT(agx_tilebuffer_total_size(tib), 2048) / 2048; + + float tan_60 = 1.732051f; + c->merge_upper_x = fui(tan_60 / framebuffer->width); + c->merge_upper_y = fui(tan_60 / framebuffer->height); + + c->scissor_array = scissor_ptr; + c->depth_bias_array = depth_bias_ptr; + c->visibility_result_buffer = visibility_result_ptr; + + c->vertex_sampler_array = + batch->sampler_heap.bo ? batch->sampler_heap.bo->ptr.gpu : 0; + c->vertex_sampler_count = batch->sampler_heap.count; + c->vertex_sampler_max = batch->sampler_heap.count + 1; + + /* In the future we could split the heaps if useful */ + c->fragment_sampler_array = c->vertex_sampler_array; + c->fragment_sampler_count = c->vertex_sampler_count; + c->fragment_sampler_max = c->vertex_sampler_max; + + /* If a tile is empty, we do not want to process it, as the redundant + * roundtrip of memory-->tilebuffer-->memory wastes a tremendous amount of + * memory bandwidth. Any draw marks a tile as non-empty, so we only need to + * process empty tiles if the background+EOT programs have a side effect. + * This is the case exactly when there is an attachment we are clearing (some + * attachment A in clear and in resolve <==> non-empty intersection). + * + * This case matters a LOT for performance in workloads that split batches. 
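A concrete instance of the rule in the comment above, spelled out with the PIPE_CLEAR_* masks the code tests; the particular values are purely illustrative:

```c
#include <stdbool.h>
#include "pipe/p_defines.h"

/* Worked example of the clear/resolve intersection rule. */
static bool
must_process_empty_tiles_example(void)
{
   unsigned clear   = PIPE_CLEAR_COLOR0 | PIPE_CLEAR_DEPTH; /* cleared this batch */
   unsigned resolve = PIPE_CLEAR_COLOR0;                    /* written out at end of batch */

   /* COLOR0 is both cleared and resolved, so even tiles with no draws must run
    * the background/EOT programs to produce the clear color in memory.
    */
   return (clear & resolve) != 0; /* true for this combination */
}
```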
+ */ + if (batch->clear & batch->resolve) + c->flags |= ASAHI_RENDER_PROCESS_EMPTY_TILES; + + for (unsigned i = 0; i < framebuffer->nr_cbufs; ++i) { + if (!framebuffer->cbufs[i]) + continue; + + asahi_add_attachment(att, agx_resource(framebuffer->cbufs[i]->texture), + framebuffer->cbufs[i]); + } + + if (framebuffer->zsbuf) { + struct agx_resource *rsrc = agx_resource(framebuffer->zsbuf->texture); + + asahi_add_attachment(att, rsrc, framebuffer->zsbuf); + + if (rsrc->separate_stencil) { + asahi_add_attachment(att, rsrc->separate_stencil, framebuffer->zsbuf); + } + } + + c->fragment_attachments = (uint64_t)(uintptr_t)&att->list[0]; + c->fragment_attachment_count = att->count; + + if (batch->vs_scratch) { + c->flags |= ASAHI_RENDER_VERTEX_SPILLS; + c->vertex_helper_arg = batch->ctx->scratch_vs.buf->ptr.gpu; + c->vertex_helper_cfg = batch->vs_preamble_scratch << 16; + c->vertex_helper_program = dev->helper->ptr.gpu | 1; + } + if (batch->fs_scratch) { + c->fragment_helper_arg = batch->ctx->scratch_fs.buf->ptr.gpu; + c->fragment_helper_cfg = batch->fs_preamble_scratch << 16; + c->fragment_helper_program = dev->helper->ptr.gpu | 1; + } +} + /* * context */ @@ -1255,23 +1575,66 @@ agx_flush(struct pipe_context *pctx, struct pipe_fence_handle **fence, } } -void -agx_flush_batch(struct agx_context *ctx, struct agx_batch *batch) +static void +agx_flush_compute(struct agx_context *ctx, struct agx_batch *batch, + struct drm_asahi_cmd_compute *cmdbuf) { struct agx_device *dev = agx_device(ctx->base.screen); - assert(agx_batch_is_active(batch)); - assert(!agx_batch_is_submitted(batch)); + /* Finalize the encoder */ + agx_pack(batch->cdm.current, CDM_STREAM_TERMINATE, _) + ; - /* Make sure there's something to submit. */ - if (!batch->clear) { - agx_batch_reset(ctx, batch); - return; - } + agx_batch_add_bo(batch, batch->cdm.bo); if (batch->cs_scratch) agx_batch_add_bo(batch, ctx->scratch_cs.buf); + unsigned cmdbuf_id = agx_get_global_id(dev); + unsigned encoder_id = agx_get_global_id(dev); + + *cmdbuf = (struct drm_asahi_cmd_compute){ + .flags = 0, + .encoder_ptr = batch->cdm.bo->ptr.gpu, + .encoder_end = batch->cdm.bo->ptr.gpu + + (batch->cdm.current - (uint8_t *)batch->cdm.bo->ptr.cpu), + .helper_arg = 0, + .helper_cfg = 0, + .helper_program = 0, + .iogpu_unk_40 = 0, + .sampler_array = + batch->sampler_heap.bo ? batch->sampler_heap.bo->ptr.gpu : 0, + .sampler_count = batch->sampler_heap.count, + .sampler_max = batch->sampler_heap.count + 1, + .encoder_id = encoder_id, + .cmd_id = cmdbuf_id, + .unk_mask = 0xffffffff, + }; + + if (batch->cs_scratch) { + // The commented out lines *may* be related to subgroup-level preemption, + // which we can't support without implementing threadgroup memory in the + // helper. Disable them for now. 
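The compute path below configures the scratch helper the same way the vertex and fragment paths did earlier in this file: the argument is the scratch heap's GPU address, the config word carries the preamble scratch value in its upper half, and the program pointer is the helper shader address with bit 0 set (set exactly as the driver does; its meaning is not interpreted here). A consolidated sketch of that shared pattern, with an illustrative function name:

```c
#include <stdint.h>
#include "asahi/lib/agx_bo.h"

/* Illustrative consolidation of the helper-program setup shared by the
 * render (vertex/fragment) and compute paths. Field meanings mirror the
 * assignments in the surrounding code.
 */
static void
configure_scratch_helper(uint64_t *arg, uint32_t *cfg, uint64_t *program,
                         struct agx_bo *scratch_buf, uint32_t preamble_scratch,
                         struct agx_bo *helper_code)
{
   *arg = scratch_buf->ptr.gpu;   /* scratch heap base passed to the helper */
   *cfg = preamble_scratch << 16; /* preamble scratch in the upper 16 bits */
   *program = helper_code->ptr.gpu | 1;
}
```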
+ + // cmdbuf->iogpu_unk_40 = 0x1c; + cmdbuf->helper_arg = ctx->scratch_cs.buf->ptr.gpu; + cmdbuf->helper_cfg = batch->cs_preamble_scratch << 16; + // cmdbuf->helper_cfg |= 0x40; + cmdbuf->helper_program = dev->helper->ptr.gpu | 1; + } +} + +static void +agx_flush_render(struct agx_context *ctx, struct agx_batch *batch, + struct drm_asahi_cmd_render *cmdbuf, struct attachments *att) +{ + struct agx_device *dev = agx_device(ctx->base.screen); + + if (batch->vs_scratch) + agx_batch_add_bo(batch, ctx->scratch_vs.buf); + if (batch->fs_scratch) + agx_batch_add_bo(batch, ctx->scratch_fs.buf); + assert(batch->initialized); /* Finalize the encoder */ @@ -1313,22 +1676,46 @@ agx_flush_batch(struct agx_context *ctx, struct agx_batch *batch) */ agx_batch_add_bo(batch, batch->vdm.bo); - if (batch->vs_scratch) - agx_batch_add_bo(batch, ctx->scratch_vs.buf); - if (batch->fs_scratch) - agx_batch_add_bo(batch, ctx->scratch_fs.buf); + unsigned cmd_ta_id = agx_get_global_id(dev); + unsigned cmd_3d_id = agx_get_global_id(dev); + unsigned encoder_id = agx_get_global_id(dev); - /* TODO: Linux UAPI submission */ - (void)dev; - (void)zbias; - (void)scissor; - (void)clear_pipeline_textures; - (void)pipeline_store; - (void)pipeline_background; - (void)pipeline_background_partial; + agx_cmdbuf(dev, cmdbuf, att, &batch->pool, batch, &batch->key, + batch->vdm.bo->ptr.gpu, encoder_id, cmd_ta_id, cmd_3d_id, scissor, + zbias, agx_get_occlusion_heap(batch), pipeline_background, + pipeline_background_partial, pipeline_store, + clear_pipeline_textures, batch->clear_depth, batch->clear_stencil, + &batch->tilebuffer_layout); +} - unreachable("Linux UAPI not yet upstream"); - agx_batch_submit(ctx, batch, 0, 0, NULL); +void +agx_flush_batch(struct agx_context *ctx, struct agx_batch *batch) +{ + assert(agx_batch_is_active(batch)); + assert(!agx_batch_is_submitted(batch)); + + struct attachments att = {.count = 0}; + struct drm_asahi_cmd_render render; + struct drm_asahi_cmd_compute compute; + bool has_vdm = false, has_cdm = false; + + if (batch->cdm.bo) { + agx_flush_compute(ctx, batch, &compute); + has_cdm = true; + } + + if (batch->vdm.bo && (batch->clear || batch->initialized)) { + agx_flush_render(ctx, batch, &render, &att); + has_vdm = true; + } + + if (!has_cdm && !has_vdm) { + agx_batch_reset(ctx, batch); + return; + } + + agx_batch_submit(ctx, batch, has_cdm ? &compute : NULL, + has_vdm ? &render : NULL); } static void @@ -1336,6 +1723,7 @@ agx_destroy_context(struct pipe_context *pctx) { struct agx_device *dev = agx_device(pctx->screen); struct agx_context *ctx = agx_context(pctx); + struct agx_screen *screen = agx_screen(pctx->screen); /* Batch state needs to be freed on completion, and we don't want to yank * buffers out from in-progress GPU jobs to avoid faults, so just wait until @@ -1357,6 +1745,11 @@ agx_destroy_context(struct pipe_context *pctx) agx_bo_unreference(ctx->result_buf); + /* Lock around the syncobj destruction, to avoid racing + * command submission in another context. 
+ **/ + u_rwlock_wrlock(&screen->destroy_lock); + drmSyncobjDestroy(dev->fd, ctx->in_sync_obj); drmSyncobjDestroy(dev->fd, ctx->dummy_syncobj); if (ctx->in_sync_fd != -1) @@ -1367,12 +1760,16 @@ agx_destroy_context(struct pipe_context *pctx) drmSyncobjDestroy(dev->fd, ctx->batches.slots[i].syncobj); } + u_rwlock_wrunlock(&screen->destroy_lock); + pipe_resource_reference(&ctx->heap, NULL); agx_scratch_fini(&ctx->scratch_vs); agx_scratch_fini(&ctx->scratch_fs); agx_scratch_fini(&ctx->scratch_cs); + agx_destroy_command_queue(dev, ctx->queue_id); + ralloc_free(ctx); } @@ -1426,6 +1823,20 @@ agx_create_context(struct pipe_screen *screen, void *priv, unsigned flags) } pctx->const_uploader = pctx->stream_uploader; + uint32_t priority = 2; + if (flags & PIPE_CONTEXT_PRIORITY_LOW) + priority = 3; + else if (flags & PIPE_CONTEXT_PRIORITY_MEDIUM) + priority = 2; + else if (flags & PIPE_CONTEXT_PRIORITY_HIGH) + priority = 1; + + ctx->queue_id = agx_create_command_queue(agx_device(screen), + DRM_ASAHI_QUEUE_CAP_RENDER | + DRM_ASAHI_QUEUE_CAP_BLIT | + DRM_ASAHI_QUEUE_CAP_COMPUTE, + priority); + pctx->destroy = agx_destroy_context; pctx->flush = agx_flush; pctx->clear = agx_clear; @@ -1461,9 +1872,10 @@ agx_create_context(struct pipe_screen *screen, void *priv, unsigned flags) ctx->blitter = util_blitter_create(pctx); - ctx->result_buf = agx_bo_create( - agx_device(screen), sizeof(union agx_batch_result) * AGX_MAX_BATCHES, - AGX_BO_WRITEBACK, "Batch result buffer"); + ctx->result_buf = + agx_bo_create(agx_device(screen), + (2 * sizeof(union agx_batch_result)) * AGX_MAX_BATCHES, + AGX_BO_WRITEBACK, "Batch result buffer"); assert(ctx->result_buf); /* Sync object/FD used for NATIVE_FENCE_FD. */ @@ -1764,6 +2176,10 @@ agx_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_TES_LAYER_VIEWPORT: return true; + case PIPE_CAP_CONTEXT_PRIORITY_MASK: + return PIPE_CONTEXT_PRIORITY_LOW | PIPE_CONTEXT_PRIORITY_MEDIUM | + PIPE_CONTEXT_PRIORITY_HIGH; + default: return u_pipe_screen_get_param_defaults(pscreen, param); } @@ -2179,6 +2595,18 @@ agx_get_timestamp(struct pipe_screen *pscreen) return agx_gpu_time_to_ns(dev, agx_get_gpu_timestamp(dev)); } +static void +agx_screen_get_device_uuid(struct pipe_screen *pscreen, char *uuid) +{ + agx_get_device_uuid(agx_device(pscreen), uuid); +} + +static void +agx_screen_get_driver_uuid(struct pipe_screen *pscreen, char *uuid) +{ + agx_get_driver_uuid(uuid); +} + struct pipe_screen * agx_screen_create(int fd, struct renderonly *ro, const struct pipe_screen_config *config) @@ -2186,6 +2614,13 @@ agx_screen_create(int fd, struct renderonly *ro, struct agx_screen *agx_screen; struct pipe_screen *screen; + /* Refuse to probe. There is no stable UAPI yet. Upstream Mesa cannot be used + * yet with Asahi. Do not try. Do not patch out this check. Do not teach + * others about patching this check. Do not distribute upstream Mesa with + * this check patched out. 
+ */ + return NULL; + agx_screen = rzalloc(NULL, struct agx_screen); if (!agx_screen) return NULL; @@ -2202,6 +2637,7 @@ agx_screen_create(int fd, struct renderonly *ro, agx_screen->dev.fd = fd; agx_screen->dev.ro = ro; + u_rwlock_init(&agx_screen->destroy_lock); /* Try to open an AGX device */ if (!agx_open_device(agx_screen, &agx_screen->dev)) { @@ -2209,8 +2645,6 @@ agx_screen_create(int fd, struct renderonly *ro, return NULL; } - agx_screen->queue_id = agx_create_command_queue(&agx_screen->dev, 0); - screen->destroy = agx_destroy_screen; screen->get_screen_fd = agx_screen_get_fd; screen->get_name = agx_get_name; @@ -2220,6 +2654,8 @@ agx_screen_create(int fd, struct renderonly *ro, screen->get_shader_param = agx_get_shader_param; screen->get_compute_param = agx_get_compute_param; screen->get_paramf = agx_get_paramf; + screen->get_device_uuid = agx_screen_get_device_uuid; + screen->get_driver_uuid = agx_screen_get_driver_uuid; screen->is_format_supported = agx_is_format_supported; screen->query_dmabuf_modifiers = agx_query_dmabuf_modifiers; screen->query_memory_info = agx_query_memory_info; diff --git a/src/gallium/drivers/asahi/agx_state.h b/src/gallium/drivers/asahi/agx_state.h index 0d47fa447a2..3d139a3c71b 100644 --- a/src/gallium/drivers/asahi/agx_state.h +++ b/src/gallium/drivers/asahi/agx_state.h @@ -6,6 +6,7 @@ #pragma once +#include #include "asahi/compiler/agx_compile.h" #include "asahi/genxml/agx_pack.h" #include "asahi/layout/layout.h" @@ -18,6 +19,7 @@ #include "asahi/lib/agx_uvs.h" #include "asahi/lib/pool.h" #include "asahi/lib/shaders/geometry.h" +#include "asahi/lib/unstable_asahi_drm.h" #include "compiler/nir/nir_lower_blend.h" #include "compiler/shader_enums.h" #include "gallium/auxiliary/util/u_blitter.h" @@ -28,6 +30,7 @@ #include "util/bitset.h" #include "util/disk_cache.h" #include "util/hash_table.h" +#include "util/rwlock.h" #include "util/u_range.h" #include "agx_bg_eot.h" #include "agx_helpers.h" @@ -357,6 +360,8 @@ struct agx_stage { }; union agx_batch_result { + struct drm_asahi_result_render render; + struct drm_asahi_result_compute compute; }; /* This is a firmware limit. It should be possible to raise to 2048 in the @@ -632,6 +637,9 @@ struct agx_context { uint64_t generation[AGX_MAX_BATCHES]; } batches; + /* Queue handle */ + uint32_t queue_id; + struct agx_batch *batch; struct agx_bo *result_buf; @@ -872,8 +880,9 @@ struct agx_screen { struct pipe_screen pscreen; struct agx_device dev; struct disk_cache *disk_cache; - /* Queue handle */ - uint32_t queue_id; + + /* Lock to protect syncobj usage vs. 
destruction in context destroy */ + struct u_rwlock destroy_lock; }; static inline struct agx_screen * @@ -1053,9 +1062,12 @@ agx_batch_add_bo(struct agx_batch *batch, struct agx_bo *bo) #define AGX_BATCH_FOREACH_BO_HANDLE(batch, handle) \ BITSET_FOREACH_SET(handle, (batch)->bo_list.set, batch->bo_list.bit_count) +struct drm_asahi_cmd_compute; +struct drm_asahi_cmd_render; + void agx_batch_submit(struct agx_context *ctx, struct agx_batch *batch, - uint32_t barriers, enum drm_asahi_cmd_type cmd_type, - void *cmdbuf); + struct drm_asahi_cmd_compute *compute, + struct drm_asahi_cmd_render *render); void agx_flush_batch(struct agx_context *ctx, struct agx_batch *batch); void agx_flush_batch_for_reason(struct agx_context *ctx, diff --git a/src/gallium/drivers/asahi/meson.build b/src/gallium/drivers/asahi/meson.build index 03793328cf8..f0f845ac615 100644 --- a/src/gallium/drivers/asahi/meson.build +++ b/src/gallium/drivers/asahi/meson.build @@ -20,7 +20,7 @@ files_asahi = files( libasahi = static_library( 'asahi', [files_asahi], - include_directories : [inc_gallium_aux, inc_gallium, inc_include, inc_src, inc_asahi], + include_directories : [inc_gallium_aux, inc_gallium, inc_include, inc_src, inc_asahi, inc_virtio_gpu, inc_virtio_vdrm], c_args : [c_msvc_compat_args], gnu_symbol_visibility : 'hidden', dependencies : [idep_nir, idep_mesautil, idep_agx_pack, dep_libdrm, idep_mesaclc],