nvk: Allocate QMDs from a heap on Maxwell A and earlier

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/34127>
Faith Ekstrand 2025-03-12 11:24:12 -05:00 committed by Marge Bot
parent 94787116b1
commit 7939331dde
7 changed files with 153 additions and 2 deletions

View file

@@ -49,6 +49,7 @@ nvk_destroy_cmd_buffer(struct vk_command_buffer *vk_cmd_buffer)
nvk_cmd_pool_free_mem_list(pool, &cmd->owned_mem);
nvk_cmd_pool_free_gart_mem_list(pool, &cmd->owned_gart_mem);
nvk_cmd_pool_free_qmd_list(pool, &cmd->owned_qmd);
util_dynarray_fini(&cmd->pushes);
vk_command_buffer_finish(&cmd->vk);
vk_free(&pool->vk.alloc, cmd);
@@ -82,6 +83,7 @@ nvk_create_cmd_buffer(struct vk_command_pool *vk_pool,
list_inithead(&cmd->owned_mem);
list_inithead(&cmd->owned_gart_mem);
list_inithead(&cmd->owned_qmd);
util_dynarray_init(&cmd->pushes, NULL);
*cmd_buffer_out = &cmd->vk;
@@ -104,6 +106,7 @@ nvk_reset_cmd_buffer(struct vk_command_buffer *vk_cmd_buffer,
nvk_cmd_pool_free_mem_list(pool, &cmd->owned_mem);
nvk_cmd_pool_free_gart_mem_list(pool, &cmd->owned_gart_mem);
nvk_cmd_pool_free_qmd_list(pool, &cmd->owned_qmd);
cmd->upload_mem = NULL;
cmd->push_mem = NULL;
cmd->push_mem_limit = NULL;
@@ -290,6 +293,52 @@ nvk_cmd_buffer_cond_render_alloc(struct nvk_cmd_buffer *cmd,
return VK_SUCCESS;
}
VkResult
nvk_cmd_buffer_alloc_qmd(struct nvk_cmd_buffer *cmd,
uint32_t size, uint32_t alignment,
uint64_t *addr, void **ptr)
{
struct nvk_device *dev = nvk_cmd_buffer_device(cmd);
const struct nvk_physical_device *pdev = nvk_device_physical(dev);
/* On Maxwell B and later, we have INVALIDATE_SKED_CACHES, so we can just
* allocate from wherever we want (the upload stream in this case).
*/
if (pdev->info.cls_compute >= MAXWELL_COMPUTE_B)
return nvk_cmd_buffer_upload_alloc(cmd, size, alignment, addr, ptr);
/* The GPU compute scheduler (SKED) has a cache. Maxwell B added the
* INVALIDATE_SKED_CACHES instruction to manage that cache, and we emit it
* at the top of every command buffer so that we always pick up fresh
* copies of whatever QMDs we've written from the CPU. On Maxwell A and
* earlier, the SKED cache still exists in some form but we have no way to
* invalidate it. If a compute shader has been dispatched from a QMD at an
* address that's no longer valid, the SKED cache can fault. To work
* around this, on Maxwell A and earlier we allocate QMDs from a dedicated
* QMD heap on the device.
*
* Prior to Maxwell B, the GPU doesn't seem to need any sort of SKED cache
* invalidation to pick up new writes from the CPU. However, we still have
* to worry about faults caused by the SKED cache holding a stale address.
* Allocating all QMDs from a central heap that never throws memory away
* seems to be sufficient to avoid those faults.
*/
assert(size <= NVK_CMD_QMD_SIZE);
assert(alignment <= NVK_CMD_QMD_SIZE);
struct nvk_cmd_qmd *qmd;
VkResult result = nvk_cmd_pool_alloc_qmd(nvk_cmd_buffer_pool(cmd), &qmd);
if (unlikely(result != VK_SUCCESS))
return result;
list_addtail(&qmd->link, &cmd->owned_qmd);
*addr = qmd->addr;
*ptr = qmd->map;
return VK_SUCCESS;
}
VKAPI_ATTR VkResult VKAPI_CALL
nvk_BeginCommandBuffer(VkCommandBuffer commandBuffer,
const VkCommandBufferBeginInfo *pBeginInfo)
@@ -573,7 +622,7 @@ nvk_cmd_invalidate_deps(struct nvk_cmd_buffer *cmd,
}
if ((barriers & NVK_BARRIER_INVALIDATE_QMD_DATA) &&
pdev->info.cls_eng3d >= MAXWELL_COMPUTE_B)
pdev->info.cls_compute >= MAXWELL_COMPUTE_B)
P_IMMD(p, NVB1C0, INVALIDATE_SKED_CACHES, 0);
}
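
For reference, the Maxwell B+ half of the comment above boils down to emitting one compute-class method when command buffer recording starts. Here is a minimal sketch of that path; nvk_cmd_buffer_push() and its two-dword count are assumptions used only for illustration, while P_IMMD, NVB1C0, INVALIDATE_SKED_CACHES, and the device/physical-device helpers are taken from the diff itself:

/* Sketch (not part of this patch): invalidate the SKED cache at the top of
 * a command buffer on Maxwell B+ so QMDs written from the CPU are seen
 * fresh.  nvk_cmd_buffer_push() is assumed to return a nv_push context
 * with room for the requested number of dwords. */
static void
sked_invalidate_at_begin(struct nvk_cmd_buffer *cmd)
{
   const struct nvk_physical_device *pdev =
      nvk_device_physical(nvk_cmd_buffer_device(cmd));

   /* There is no INVALIDATE_SKED_CACHES before Maxwell B; those GPUs use
    * the QMD heap instead. */
   if (pdev->info.cls_compute < MAXWELL_COMPUTE_B)
      return;

   struct nv_push *p = nvk_cmd_buffer_push(cmd, 2);
   P_IMMD(p, NVB1C0, INVALIDATE_SKED_CACHES, 0);
}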

View file

@@ -216,6 +216,7 @@ struct nvk_cmd_buffer {
*/
struct list_head owned_mem;
struct list_head owned_gart_mem;
struct list_head owned_qmd;
struct nvk_cmd_mem *upload_mem;
uint32_t upload_offset;
@@ -342,6 +343,10 @@ VkResult nvk_cmd_buffer_upload_data(struct nvk_cmd_buffer *cmd,
VkResult nvk_cmd_buffer_cond_render_alloc(struct nvk_cmd_buffer *cmd,
uint64_t *addr);
VkResult nvk_cmd_buffer_alloc_qmd(struct nvk_cmd_buffer *cmd,
uint32_t size, uint32_t alignment,
uint64_t *addr, void **ptr);
void nvk_cmd_flush_wait_dep(struct nvk_cmd_buffer *cmd,
const VkDependencyInfo *dep,
bool wait);

View file

@@ -190,9 +190,13 @@ nvk_cmd_upload_qmd(struct nvk_cmd_buffer *cmd,
uint32_t qmd[64];
nak_fill_qmd(&pdev->info, &shader->info, &qmd_info, qmd, sizeof(qmd));
result = nvk_cmd_buffer_upload_data(cmd, qmd, sizeof(qmd), 0x100, &qmd_addr);
void *qmd_map;
result = nvk_cmd_buffer_alloc_qmd(cmd, sizeof(qmd), 0x100,
&qmd_addr, &qmd_map);
if (unlikely(result != VK_SUCCESS))
return result;
memcpy(qmd_map, qmd, sizeof(qmd));
}
*qmd_addr_out = qmd_addr;

View file

@@ -44,6 +44,39 @@ nvk_cmd_mem_destroy(struct nvk_cmd_pool *pool, struct nvk_cmd_mem *mem)
vk_free(&pool->vk.alloc, mem);
}
static VkResult
nvk_cmd_qmd_create(struct nvk_cmd_pool *pool, struct nvk_cmd_qmd **qmd_out)
{
struct nvk_device *dev = nvk_cmd_pool_device(pool);
struct nvk_cmd_qmd *qmd;
VkResult result;
qmd = vk_zalloc(&pool->vk.alloc, sizeof(*qmd), 8,
VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
if (qmd == NULL)
return vk_error(pool, VK_ERROR_OUT_OF_HOST_MEMORY);
result = nvk_heap_alloc(dev, &dev->qmd_heap,
NVK_CMD_QMD_SIZE, NVK_CMD_QMD_SIZE,
&qmd->addr, &qmd->map);
if (result != VK_SUCCESS) {
vk_free(&pool->vk.alloc, qmd);
return result;
}
*qmd_out = qmd;
return VK_SUCCESS;
}
static void
nvk_cmd_qmd_destroy(struct nvk_cmd_pool *pool, struct nvk_cmd_qmd *qmd)
{
struct nvk_device *dev = nvk_cmd_pool_device(pool);
nvk_heap_free(dev, &dev->qmd_heap, qmd->addr, NVK_CMD_QMD_SIZE);
vk_free(&pool->vk.alloc, qmd);
}
VKAPI_ATTR VkResult VKAPI_CALL
nvk_CreateCommandPool(VkDevice _device,
const VkCommandPoolCreateInfo *pCreateInfo,
@@ -67,6 +100,7 @@ nvk_CreateCommandPool(VkDevice _device,
list_inithead(&pool->free_mem);
list_inithead(&pool->free_gart_mem);
list_inithead(&pool->free_qmd);
*pCmdPool = nvk_cmd_pool_to_handle(pool);
@@ -85,6 +119,10 @@ nvk_cmd_pool_destroy_mem(struct nvk_cmd_pool *pool)
nvk_cmd_mem_destroy(pool, mem);
list_inithead(&pool->free_gart_mem);
list_for_each_entry_safe(struct nvk_cmd_qmd, qmd, &pool->free_qmd, link)
nvk_cmd_qmd_destroy(pool, qmd);
list_inithead(&pool->free_qmd);
}
VkResult
@@ -108,6 +146,21 @@ nvk_cmd_pool_alloc_mem(struct nvk_cmd_pool *pool, bool force_gart,
return nvk_cmd_mem_create(pool, force_gart, mem_out);
}
VkResult
nvk_cmd_pool_alloc_qmd(struct nvk_cmd_pool *pool,
struct nvk_cmd_qmd **qmd_out)
{
if (!list_is_empty(&pool->free_qmd)) {
struct nvk_cmd_qmd *qmd =
list_first_entry(&pool->free_qmd, struct nvk_cmd_qmd, link);
list_del(&qmd->link);
*qmd_out = qmd;
return VK_SUCCESS;
}
return nvk_cmd_qmd_create(pool, qmd_out);
}
void
nvk_cmd_pool_free_mem_list(struct nvk_cmd_pool *pool,
struct list_head *mem_list)
@@ -124,6 +177,14 @@ nvk_cmd_pool_free_gart_mem_list(struct nvk_cmd_pool *pool,
list_inithead(mem_list);
}
void
nvk_cmd_pool_free_qmd_list(struct nvk_cmd_pool *pool,
struct list_head *qmd_list)
{
list_splicetail(qmd_list, &pool->free_qmd);
list_inithead(qmd_list);
}
VKAPI_ATTR void VKAPI_CALL
nvk_DestroyCommandPool(VkDevice _device,
VkCommandPool commandPool,
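
Taken together, the pool code above is what gives the QMD heap the "never throws memory away" behavior described in nvk_cmd_buffer_alloc_qmd: a QMD slot bounces between a command buffer's owned_qmd list and the pool's free_qmd list, and is only returned to dev->qmd_heap when the pool itself is destroyed. A rough lifecycle sketch, using only functions that appear in this diff:

/* Lifecycle of one QMD slot on Maxwell A and earlier (sketch):
 *
 *   dispatch -> nvk_cmd_upload_qmd()
 *                 -> nvk_cmd_buffer_alloc_qmd()
 *                      -> nvk_cmd_pool_alloc_qmd()   pop pool->free_qmd, or
 *                         nvk_cmd_qmd_create()       carve NVK_CMD_QMD_SIZE
 *                                                    from dev->qmd_heap
 *                 slot is added to cmd->owned_qmd
 *
 *   reset / free of the command buffer
 *                 -> nvk_cmd_pool_free_qmd_list()    splice owned_qmd back
 *                                                    onto pool->free_qmd;
 *                                                    heap memory is kept
 *
 *   destroy of the command pool
 *                 -> nvk_cmd_qmd_destroy()           nvk_heap_free() back to
 *                                                    dev->qmd_heap
 */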

View file

@@ -21,12 +21,23 @@ struct nvk_cmd_mem {
struct list_head link;
};
#define NVK_CMD_QMD_SIZE 256
struct nvk_cmd_qmd {
uint64_t addr;
void *map;
/** Link in nvk_cmd_pool::free_qmd or nvk_cmd_buffer::owned_qmd */
struct list_head link;
};
struct nvk_cmd_pool {
struct vk_command_pool vk;
/** List of nvk_cmd_mem */
struct list_head free_mem;
struct list_head free_gart_mem;
struct list_head free_qmd;
};
VK_DEFINE_NONDISP_HANDLE_CASTS(nvk_cmd_pool, vk.base, VkCommandPool,
@@ -41,9 +52,13 @@ nvk_cmd_pool_device(struct nvk_cmd_pool *pool)
VkResult nvk_cmd_pool_alloc_mem(struct nvk_cmd_pool *pool,
bool force_gart,
struct nvk_cmd_mem **mem_out);
VkResult nvk_cmd_pool_alloc_qmd(struct nvk_cmd_pool *pool,
struct nvk_cmd_qmd **qmd_out);
void nvk_cmd_pool_free_mem_list(struct nvk_cmd_pool *pool,
struct list_head *mem_list);
void nvk_cmd_pool_free_gart_mem_list(struct nvk_cmd_pool *pool,
struct list_head *mem_list);
void nvk_cmd_pool_free_qmd_list(struct nvk_cmd_pool *pool,
struct list_head *qmd_list);
#endif /* NVK_CMD_POOL_H */

View file

@@ -16,6 +16,7 @@
#include "cl9097.h"
#include "clb097.h"
#include "clb197.h"
#include "clc397.h"
static void
@@ -228,6 +229,14 @@ nvk_CreateDevice(VkPhysicalDevice physicalDevice,
if (result != VK_SUCCESS)
goto fail_shader_heap;
if (pdev->info.cls_eng3d < MAXWELL_B) {
result = nvk_heap_init(dev, &dev->qmd_heap,
NVKMD_MEM_LOCAL, NVKMD_MEM_MAP_WR,
0 /* overalloc */, false /* contiguous */);
if (result != VK_SUCCESS)
goto fail_event_heap;
}
nvk_slm_area_init(&dev->slm);
if (pdev->info.cls_eng3d >= FERMI_A &&
@@ -271,6 +280,9 @@ fail_vab_memory:
nvkmd_mem_unref(dev->vab_memory);
fail_slm:
nvk_slm_area_finish(&dev->slm);
if (pdev->info.cls_eng3d < MAXWELL_B)
nvk_heap_finish(dev, &dev->qmd_heap);
fail_event_heap:
nvk_heap_finish(dev, &dev->event_heap);
fail_shader_heap:
nvk_heap_finish(dev, &dev->shader_heap);
@@ -301,6 +313,8 @@ nvk_DestroyDevice(VkDevice _device, const VkAllocationCallbacks *pAllocator)
if (!dev)
return;
const struct nvk_physical_device *pdev = nvk_device_physical(dev);
if (dev->copy_queries)
vk_shader_destroy(&dev->vk, &dev->copy_queries->vk, &dev->vk.alloc);
@@ -316,6 +330,8 @@ nvk_DestroyDevice(VkDevice _device, const VkAllocationCallbacks *pAllocator)
nvk_upload_queue_sync(dev, &dev->upload);
nvk_slm_area_finish(&dev->slm);
if (pdev->info.cls_eng3d < MAXWELL_B)
nvk_heap_finish(dev, &dev->qmd_heap);
nvk_heap_finish(dev, &dev->event_heap);
nvk_heap_finish(dev, &dev->shader_heap);
nvk_edb_bview_cache_finish(dev, &dev->edb_bview_cache);

View file

@@ -46,6 +46,7 @@ struct nvk_device {
struct nvk_edb_bview_cache edb_bview_cache;
struct nvk_heap shader_heap;
struct nvk_heap event_heap;
struct nvk_heap qmd_heap;
struct nvk_slm_area slm;
struct nvkmd_mem *vab_memory;