diff --git a/src/nouveau/vulkan/nvk_cmd_buffer.c b/src/nouveau/vulkan/nvk_cmd_buffer.c
index 4503d0631a2..5f781fbfef6 100644
--- a/src/nouveau/vulkan/nvk_cmd_buffer.c
+++ b/src/nouveau/vulkan/nvk_cmd_buffer.c
@@ -49,6 +49,7 @@ nvk_destroy_cmd_buffer(struct vk_command_buffer *vk_cmd_buffer)
 
    nvk_cmd_pool_free_mem_list(pool, &cmd->owned_mem);
    nvk_cmd_pool_free_gart_mem_list(pool, &cmd->owned_gart_mem);
+   nvk_cmd_pool_free_qmd_list(pool, &cmd->owned_qmd);
    util_dynarray_fini(&cmd->pushes);
    vk_command_buffer_finish(&cmd->vk);
    vk_free(&pool->vk.alloc, cmd);
@@ -82,6 +83,7 @@ nvk_create_cmd_buffer(struct vk_command_pool *vk_pool,
 
    list_inithead(&cmd->owned_mem);
    list_inithead(&cmd->owned_gart_mem);
+   list_inithead(&cmd->owned_qmd);
    util_dynarray_init(&cmd->pushes, NULL);
 
    *cmd_buffer_out = &cmd->vk;
@@ -104,6 +106,7 @@ nvk_reset_cmd_buffer(struct vk_command_buffer *vk_cmd_buffer,
 
    nvk_cmd_pool_free_mem_list(pool, &cmd->owned_mem);
    nvk_cmd_pool_free_gart_mem_list(pool, &cmd->owned_gart_mem);
+   nvk_cmd_pool_free_qmd_list(pool, &cmd->owned_qmd);
    cmd->upload_mem = NULL;
    cmd->push_mem = NULL;
    cmd->push_mem_limit = NULL;
@@ -290,6 +293,52 @@ nvk_cmd_buffer_cond_render_alloc(struct nvk_cmd_buffer *cmd,
    return VK_SUCCESS;
 }
 
+VkResult
+nvk_cmd_buffer_alloc_qmd(struct nvk_cmd_buffer *cmd,
+                         uint32_t size, uint32_t alignment,
+                         uint64_t *addr, void **ptr)
+{
+   struct nvk_device *dev = nvk_cmd_buffer_device(cmd);
+   const struct nvk_physical_device *pdev = nvk_device_physical(dev);
+
+   /* On Maxwell B and later, we have INVALIDATE_SKED_CACHES so we can just
+    * allocate from wherever we want (the upload stream in this case).
+    */
+   if (pdev->info.cls_compute >= MAXWELL_COMPUTE_B)
+      return nvk_cmd_buffer_upload_alloc(cmd, size, alignment, addr, ptr);
+
+   /* The GPU compute scheduler (SKED) has a cache. Maxwell B added the
+    * INVALIDATE_SKED_CACHES instruction to manage the SKED cache. We call
+    * that at the top of every command buffer so that we always pick up
+    * whatever QMDs we've written from the CPU fresh. On Maxwell A and
+    * earlier, the SKED cache still exists in some form but we have no way to
+    * invalidate it. If a compute shader has been dispatched from a QMD at an
+    * address that's no longer valid, the SKED cache can fault. To work
+    * around this, we have a QMD heap on the device and we allocate QMDs from
+    * that on Maxwell A and earlier.
+    *
+    * Prior to Maxwell B, the GPU doesn't seem to need any sort of SKED cache
+    * invalidation to pick up new writes from the CPU. However, we do still
+    * have to worry about faults that may be caused by the SKED cache
+    * containing a stale address. Just allocating all QMDs from a central
+    * heap which never throws memory away seems to be sufficient for this.
+    */
+   assert(size <= NVK_CMD_QMD_SIZE);
+   assert(alignment <= NVK_CMD_QMD_SIZE);
+
+   struct nvk_cmd_qmd *qmd;
+   VkResult result = nvk_cmd_pool_alloc_qmd(nvk_cmd_buffer_pool(cmd), &qmd);
+   if (unlikely(result != VK_SUCCESS))
+      return result;
+
+   list_addtail(&qmd->link, &cmd->owned_qmd);
+
+   *addr = qmd->addr;
+   *ptr = qmd->map;
+
+   return VK_SUCCESS;
+}
+
 VKAPI_ATTR VkResult VKAPI_CALL
 nvk_BeginCommandBuffer(VkCommandBuffer commandBuffer,
                        const VkCommandBufferBeginInfo *pBeginInfo)
@@ -573,7 +622,7 @@ nvk_cmd_invalidate_deps(struct nvk_cmd_buffer *cmd,
    }
 
    if ((barriers & NVK_BARRIER_INVALIDATE_QMD_DATA) &&
-       pdev->info.cls_eng3d >= MAXWELL_COMPUTE_B)
+       pdev->info.cls_compute >= MAXWELL_COMPUTE_B)
       P_IMMD(p, NVB1C0, INVALIDATE_SKED_CACHES, 0);
 }
 
diff --git a/src/nouveau/vulkan/nvk_cmd_buffer.h b/src/nouveau/vulkan/nvk_cmd_buffer.h
index 03c5120862d..a001f82398a 100644
--- a/src/nouveau/vulkan/nvk_cmd_buffer.h
+++ b/src/nouveau/vulkan/nvk_cmd_buffer.h
@@ -216,6 +216,7 @@ struct nvk_cmd_buffer {
     */
    struct list_head owned_mem;
    struct list_head owned_gart_mem;
+   struct list_head owned_qmd;
 
    struct nvk_cmd_mem *upload_mem;
    uint32_t upload_offset;
@@ -342,6 +343,10 @@ VkResult nvk_cmd_buffer_upload_data(struct nvk_cmd_buffer *cmd,
 
 VkResult nvk_cmd_buffer_cond_render_alloc(struct nvk_cmd_buffer *cmd,
                                           uint64_t *addr);
+VkResult nvk_cmd_buffer_alloc_qmd(struct nvk_cmd_buffer *cmd,
+                                  uint32_t size, uint32_t alignment,
+                                  uint64_t *addr, void **ptr);
+
 void nvk_cmd_flush_wait_dep(struct nvk_cmd_buffer *cmd,
                             const VkDependencyInfo *dep,
                             bool wait);
diff --git a/src/nouveau/vulkan/nvk_cmd_dispatch.c b/src/nouveau/vulkan/nvk_cmd_dispatch.c
index c9dad593d6a..b777638dce3 100644
--- a/src/nouveau/vulkan/nvk_cmd_dispatch.c
+++ b/src/nouveau/vulkan/nvk_cmd_dispatch.c
@@ -190,9 +190,13 @@ nvk_cmd_upload_qmd(struct nvk_cmd_buffer *cmd,
       uint32_t qmd[64];
       nak_fill_qmd(&pdev->info, &shader->info, &qmd_info, qmd, sizeof(qmd));
 
-      result = nvk_cmd_buffer_upload_data(cmd, qmd, sizeof(qmd), 0x100, &qmd_addr);
+      void *qmd_map;
+      result = nvk_cmd_buffer_alloc_qmd(cmd, sizeof(qmd), 0x100,
+                                        &qmd_addr, &qmd_map);
       if (unlikely(result != VK_SUCCESS))
          return result;
+
+      memcpy(qmd_map, qmd, sizeof(qmd));
    }
 
    *qmd_addr_out = qmd_addr;
diff --git a/src/nouveau/vulkan/nvk_cmd_pool.c b/src/nouveau/vulkan/nvk_cmd_pool.c
index e16a02682f6..c652c520861 100644
--- a/src/nouveau/vulkan/nvk_cmd_pool.c
+++ b/src/nouveau/vulkan/nvk_cmd_pool.c
@@ -44,6 +44,39 @@ nvk_cmd_mem_destroy(struct nvk_cmd_pool *pool, struct nvk_cmd_mem *mem)
    vk_free(&pool->vk.alloc, mem);
 }
 
+static VkResult
+nvk_cmd_qmd_create(struct nvk_cmd_pool *pool, struct nvk_cmd_qmd **qmd_out)
+{
+   struct nvk_device *dev = nvk_cmd_pool_device(pool);
+   struct nvk_cmd_qmd *qmd;
+   VkResult result;
+
+   qmd = vk_zalloc(&pool->vk.alloc, sizeof(*qmd), 8,
+                   VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
+   if (qmd == NULL)
+      return vk_error(pool, VK_ERROR_OUT_OF_HOST_MEMORY);
+
+   result = nvk_heap_alloc(dev, &dev->qmd_heap,
+                           NVK_CMD_QMD_SIZE, NVK_CMD_QMD_SIZE,
+                           &qmd->addr, &qmd->map);
+   if (result != VK_SUCCESS) {
+      vk_free(&pool->vk.alloc, qmd);
+      return result;
+   }
+
+   *qmd_out = qmd;
+   return VK_SUCCESS;
+}
+
+static void
+nvk_cmd_qmd_destroy(struct nvk_cmd_pool *pool, struct nvk_cmd_qmd *qmd)
+{
+   struct nvk_device *dev = nvk_cmd_pool_device(pool);
+
+   nvk_heap_free(dev, &dev->qmd_heap, qmd->addr, NVK_CMD_QMD_SIZE);
+   vk_free(&pool->vk.alloc, qmd);
+}
+
 VKAPI_ATTR VkResult VKAPI_CALL
 nvk_CreateCommandPool(VkDevice _device,
                       const VkCommandPoolCreateInfo *pCreateInfo,
@@ -67,6 +100,7 @@ nvk_CreateCommandPool(VkDevice _device,
 
    list_inithead(&pool->free_mem);
    list_inithead(&pool->free_gart_mem);
+   list_inithead(&pool->free_qmd);
 
    *pCmdPool = nvk_cmd_pool_to_handle(pool);
 
@@ -85,6 +119,10 @@ nvk_cmd_pool_destroy_mem(struct nvk_cmd_pool *pool)
       nvk_cmd_mem_destroy(pool, mem);
    list_inithead(&pool->free_gart_mem);
+
+   list_for_each_entry_safe(struct nvk_cmd_qmd, qmd, &pool->free_qmd, link)
+      nvk_cmd_qmd_destroy(pool, qmd);
+   list_inithead(&pool->free_qmd);
 }
 
 VkResult
@@ -108,6 +146,21 @@ nvk_cmd_pool_alloc_mem(struct nvk_cmd_pool *pool, bool force_gart,
    return nvk_cmd_mem_create(pool, force_gart, mem_out);
 }
 
+VkResult
+nvk_cmd_pool_alloc_qmd(struct nvk_cmd_pool *pool,
+                       struct nvk_cmd_qmd **qmd_out)
+{
+   if (!list_is_empty(&pool->free_qmd)) {
+      struct nvk_cmd_qmd *qmd =
+         list_first_entry(&pool->free_qmd, struct nvk_cmd_qmd, link);
+      list_del(&qmd->link);
+      *qmd_out = qmd;
+      return VK_SUCCESS;
+   }
+
+   return nvk_cmd_qmd_create(pool, qmd_out);
+}
+
 void
 nvk_cmd_pool_free_mem_list(struct nvk_cmd_pool *pool,
                            struct list_head *mem_list)
@@ -124,6 +177,14 @@ nvk_cmd_pool_free_gart_mem_list(struct nvk_cmd_pool *pool,
    list_inithead(mem_list);
 }
 
+void
+nvk_cmd_pool_free_qmd_list(struct nvk_cmd_pool *pool,
+                           struct list_head *qmd_list)
+{
+   list_splicetail(qmd_list, &pool->free_qmd);
+   list_inithead(qmd_list);
+}
+
 VKAPI_ATTR void VKAPI_CALL
 nvk_DestroyCommandPool(VkDevice _device,
                        VkCommandPool commandPool,
diff --git a/src/nouveau/vulkan/nvk_cmd_pool.h b/src/nouveau/vulkan/nvk_cmd_pool.h
index 01f14079ffb..6d556ef3952 100644
--- a/src/nouveau/vulkan/nvk_cmd_pool.h
+++ b/src/nouveau/vulkan/nvk_cmd_pool.h
@@ -21,12 +21,23 @@ struct nvk_cmd_mem {
    struct list_head link;
 };
 
+#define NVK_CMD_QMD_SIZE 256
+
+struct nvk_cmd_qmd {
+   uint64_t addr;
+   void *map;
+
+   /** Link in nvk_cmd_pool::free_qmd or nvk_cmd_buffer::owned_qmd */
+   struct list_head link;
+};
+
 struct nvk_cmd_pool {
    struct vk_command_pool vk;
 
    /** List of nvk_cmd_mem */
    struct list_head free_mem;
    struct list_head free_gart_mem;
+   struct list_head free_qmd;
 };
 
 VK_DEFINE_NONDISP_HANDLE_CASTS(nvk_cmd_pool, vk.base, VkCommandPool,
@@ -41,9 +52,13 @@ nvk_cmd_pool_device(struct nvk_cmd_pool *pool)
 
 VkResult nvk_cmd_pool_alloc_mem(struct nvk_cmd_pool *pool, bool force_gart,
                                 struct nvk_cmd_mem **mem_out);
+VkResult nvk_cmd_pool_alloc_qmd(struct nvk_cmd_pool *pool,
+                                struct nvk_cmd_qmd **qmd_out);
 void nvk_cmd_pool_free_mem_list(struct nvk_cmd_pool *pool,
                                 struct list_head *mem_list);
 void nvk_cmd_pool_free_gart_mem_list(struct nvk_cmd_pool *pool,
                                      struct list_head *mem_list);
+void nvk_cmd_pool_free_qmd_list(struct nvk_cmd_pool *pool,
+                                struct list_head *mem_list);
 
 #endif /* NVK_CMD_POOL_H */
diff --git a/src/nouveau/vulkan/nvk_device.c b/src/nouveau/vulkan/nvk_device.c
index 38d4c9e17ca..df05a9f20e4 100644
--- a/src/nouveau/vulkan/nvk_device.c
+++ b/src/nouveau/vulkan/nvk_device.c
@@ -16,6 +16,7 @@
 
 #include "cl9097.h"
 #include "clb097.h"
+#include "clb197.h"
 #include "clc397.h"
 
 static void
@@ -228,6 +229,14 @@ nvk_CreateDevice(VkPhysicalDevice physicalDevice,
    if (result != VK_SUCCESS)
       goto fail_shader_heap;
 
+   if (pdev->info.cls_eng3d < MAXWELL_B) {
+      result = nvk_heap_init(dev, &dev->qmd_heap,
+                             NVKMD_MEM_LOCAL, NVKMD_MEM_MAP_WR,
+                             0 /* overalloc */, false /* contiguous */);
+      if (result != VK_SUCCESS)
+         goto fail_event_heap;
+   }
+
    nvk_slm_area_init(&dev->slm);
 
    if (pdev->info.cls_eng3d >= FERMI_A &&
@@ -271,6 +280,9 @@ fail_vab_memory:
    nvkmd_mem_unref(dev->vab_memory);
 fail_slm:
    nvk_slm_area_finish(&dev->slm);
+   if (pdev->info.cls_eng3d < MAXWELL_B)
+      nvk_heap_finish(dev, &dev->qmd_heap);
+fail_event_heap:
    nvk_heap_finish(dev, &dev->event_heap);
 fail_shader_heap:
    nvk_heap_finish(dev, &dev->shader_heap);
@@ -301,6 +313,8 @@ nvk_DestroyDevice(VkDevice _device, const VkAllocationCallbacks *pAllocator)
    if (!dev)
       return;
 
+   const struct nvk_physical_device *pdev = nvk_device_physical(dev);
+
    if (dev->copy_queries)
       vk_shader_destroy(&dev->vk, &dev->copy_queries->vk, &dev->vk.alloc);
 
@@ -316,6 +330,8 @@ nvk_DestroyDevice(VkDevice _device, const VkAllocationCallbacks *pAllocator)
    nvk_upload_queue_sync(dev, &dev->upload);
 
    nvk_slm_area_finish(&dev->slm);
+   if (pdev->info.cls_eng3d < MAXWELL_B)
+      nvk_heap_finish(dev, &dev->qmd_heap);
    nvk_heap_finish(dev, &dev->event_heap);
    nvk_heap_finish(dev, &dev->shader_heap);
    nvk_edb_bview_cache_finish(dev, &dev->edb_bview_cache);
diff --git a/src/nouveau/vulkan/nvk_device.h b/src/nouveau/vulkan/nvk_device.h
index 4e38adc58f4..30c99ecc4e4 100644
--- a/src/nouveau/vulkan/nvk_device.h
+++ b/src/nouveau/vulkan/nvk_device.h
@@ -46,6 +46,7 @@ struct nvk_device {
    struct nvk_edb_bview_cache edb_bview_cache;
    struct nvk_heap shader_heap;
    struct nvk_heap event_heap;
+   struct nvk_heap qmd_heap;
    struct nvk_slm_area slm;
    struct nvkmd_mem *vab_memory;
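
The pool-level recycling added above mirrors the existing nvk_cmd_mem free-list scheme: command buffers own QMD slots while recording, and on reset or destroy those slots go back onto the pool's free list rather than back to the heap, so an address that may still sit in the SKED cache always stays backed by valid memory. The sketch below illustrates that pattern with hypothetical, simplified types (qmd_slot, qmd_pool, cmd_buf are illustrative stand-ins, not the driver's nvk_cmd_qmd/nvk_cmd_pool structs), assuming a plain singly-linked free list in place of util/list.h.

/* Hypothetical, simplified sketch of the QMD free-list recycling pattern;
 * the real code uses nvk_cmd_pool_alloc_qmd()/nvk_cmd_pool_free_qmd_list()
 * and the device-level qmd_heap. */
#include <stdint.h>
#include <stdlib.h>

struct qmd_slot {
   uint64_t addr;          /* GPU address; never handed back to the heap */
   void *map;              /* CPU mapping used to fill in the QMD */
   struct qmd_slot *next;  /* link in pool->free_slots or cmd->owned_slots */
};

struct qmd_pool { struct qmd_slot *free_slots; };
struct cmd_buf  { struct qmd_slot *owned_slots; };

/* Reuse a recycled slot if one is available; otherwise carve a fresh one
 * out of a heap that never shrinks (stands in for nvk_heap_alloc()). */
static struct qmd_slot *
pool_alloc_slot(struct qmd_pool *pool)
{
   if (pool->free_slots) {
      struct qmd_slot *slot = pool->free_slots;
      pool->free_slots = slot->next;
      return slot;
   }
   return calloc(1, sizeof(struct qmd_slot));
}

/* On command buffer reset/destroy: splice every owned slot back onto the
 * pool's free list.  The GPU addresses stay allocated, so a stale SKED
 * cache entry can never point at freed memory. */
static void
cmd_release_slots(struct qmd_pool *pool, struct cmd_buf *cmd)
{
   while (cmd->owned_slots) {
      struct qmd_slot *slot = cmd->owned_slots;
      cmd->owned_slots = slot->next;
      slot->next = pool->free_slots;
      pool->free_slots = slot;
   }
}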