nvk: Use VM_BIND for contiguous heaps instead of copying
This gets rid of our (fairly sketchy) heap resizing via stall-and-copy and
replaces it with VM_BIND. We couldn't do this on the old nouveau API, but now
that we can assume VM_BIND, it makes everything simpler.

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/27205>
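In miniature, the change looks like this. The following is a condensed sketch assembled from the hunks below, with locking and error handling elided; all function and field names are taken from the diff itself:

    /* Old: a contiguous heap was a single BO.  Growing it meant allocating
     * a larger BO, DMA-copying the old contents over with the NV90B5 copy
     * class, and stalling until the copy landed -- and the heap's base
     * address changed every time. */

    /* New: reserve the heap's maximum VA range once at init... */
    heap->base_addr = nouveau_ws_alloc_vma(dev->ws_dev, 0, NVK_HEAP_MAX_SIZE,
                                           0, false /* bda */,
                                           false /* sparse */);

    /* ...then, on each grow, bind the new BO at the next free offset.  The
     * base address never moves and nothing is ever copied. */
    heap->bos[heap->bo_count].addr = heap->base_addr + heap->total_size;
    nouveau_ws_bo_bind_vma(dev->ws_dev, heap->bos[heap->bo_count].bo,
                           heap->bos[heap->bo_count].addr,
                           new_bo_size, 0, 0);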
parent f0fad6ed17
commit e162c2e78e

7 changed files with 96 additions and 183 deletions
@@ -55,6 +55,15 @@ nvk_push_dispatch_state_init(struct nvk_device *dev, struct nv_push *p)
    if (pdev->info.cls_compute == MAXWELL_COMPUTE_A)
       P_IMMD(p, NVB0C0, SET_SELECT_MAXWELL_TEXTURE_HEADERS, V_TRUE);
 
+   if (pdev->info.cls_eng3d < VOLTA_COMPUTE_A) {
+      uint64_t shader_base_addr =
+         nvk_heap_contiguous_base_address(&dev->shader_heap);
+
+      P_MTHD(p, NVA0C0, SET_PROGRAM_REGION_A);
+      P_NVA0C0_SET_PROGRAM_REGION_A(p, shader_base_addr >> 32);
+      P_NVA0C0_SET_PROGRAM_REGION_B(p, shader_base_addr);
+   }
+
    return VK_SUCCESS;
 }
 
@@ -367,6 +367,15 @@ nvk_push_draw_state_init(struct nvk_device *dev, struct nv_push *p)
 
    P_IMMD(p, NV9097, SET_CT_MRT_ENABLE, V_TRUE);
 
+   if (pdev->info.cls_eng3d < VOLTA_A) {
+      uint64_t shader_base_addr =
+         nvk_heap_contiguous_base_address(&dev->shader_heap);
+
+      P_MTHD(p, NV9097, SET_PROGRAM_REGION_A);
+      P_NV9097_SET_PROGRAM_REGION_A(p, shader_base_addr >> 32);
+      P_NV9097_SET_PROGRAM_REGION_B(p, shader_base_addr);
+   }
+
    for (uint32_t i = 0; i < 6; i++) {
       P_IMMD(p, NV9097, SET_PIPELINE_SHADER(i), {
          .enable = ENABLE_FALSE,
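Both hunks above exist because, prior to Volta, the 3D and compute classes take shader start addresses as offsets relative to the program region base rather than as full 64-bit virtual addresses. That is why the contiguous heap hands out BO-relative offsets (see nvk_heap_alloc_locked below) and why the base must be programmed once via SET_PROGRAM_REGION. A full VA would be recovered as in this hypothetical helper; it is illustrative only, not a function in the tree:

    static inline uint64_t
    shader_full_va(struct nvk_device *dev, uint64_t region_offset)
    {
       /* Safe to read without a lock: with VM_BIND the region base is
        * fixed for the lifetime of the heap. */
       return nvk_heap_contiguous_base_address(&dev->shader_heap) +
              region_offset;
    }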
@@ -26,7 +26,17 @@ nvk_heap_init(struct nvk_device *dev, struct nvk_heap *heap,
       heap->bo_flags |= NOUVEAU_WS_BO_MAP;
    heap->map_flags = map_flags;
    heap->overalloc = overalloc;
-   heap->contiguous = contiguous;
+
+   if (contiguous) {
+      heap->base_addr = nouveau_ws_alloc_vma(dev->ws_dev, 0,
+                                             NVK_HEAP_MAX_SIZE,
+                                             0, false /* bda */,
+                                             false /* sparse */);
+      if (heap->base_addr == 0) {
+         return vk_errorf(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY,
+                          "Failed to allocate VMA for heap");
+      }
+   }
 
    simple_mtx_init(&heap->mutex, mtx_plain);
    util_vma_heap_init(&heap->heap, 0, 0);
@@ -41,12 +51,21 @@ void
 nvk_heap_finish(struct nvk_device *dev, struct nvk_heap *heap)
 {
    for (uint32_t bo_idx = 0; bo_idx < heap->bo_count; bo_idx++) {
+      if (heap->base_addr != 0) {
+         nouveau_ws_bo_unbind_vma(dev->ws_dev, heap->bos[bo_idx].addr,
+                                  heap->bos[bo_idx].bo->size);
+      }
       nouveau_ws_bo_unmap(heap->bos[bo_idx].bo, heap->bos[bo_idx].map);
       nouveau_ws_bo_destroy(heap->bos[bo_idx].bo);
    }
 
    util_vma_heap_finish(&heap->heap);
    simple_mtx_destroy(&heap->mutex);
+
+   if (heap->base_addr != 0) {
+      nouveau_ws_free_vma(dev->ws_dev, heap->base_addr, NVK_HEAP_MAX_SIZE,
+                          false /* bda */, false /* sparse */);
+   }
 }
 
 static uint64_t
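For reference, the four winsys calls introduced across the heap hunks pair up into a single VA lifecycle. This is a summary comment assembled from this diff, not an addition to it:

    /*
     * init:    base_addr = nouveau_ws_alloc_vma(ws, 0, NVK_HEAP_MAX_SIZE,
     *                                           0, false, false);
     * grow:    nouveau_ws_bo_bind_vma(ws, new_bo, base_addr + total_size,
     *                                 new_bo_size, 0, 0);
     * finish:  nouveau_ws_bo_unbind_vma(ws, bos[i].addr, bos[i].bo->size);
     *          nouveau_ws_free_vma(ws, base_addr, NVK_HEAP_MAX_SIZE,
     *                              false, false);
     *
     * Only address space is reserved up front; physical memory is
     * committed one BO at a time as the heap grows.
     */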
@@ -74,115 +93,41 @@ vma_bo_offset(uint64_t offset)
 static VkResult
 nvk_heap_grow_locked(struct nvk_device *dev, struct nvk_heap *heap)
 {
-   VkResult result;
-
-   if (heap->contiguous) {
-      if (heap->total_size >= NVK_HEAP_MAX_SIZE) {
-         return vk_errorf(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY,
-                          "Heap has already hit its maximum size");
-      }
-
-      const uint64_t new_bo_size =
-         MAX2(heap->total_size * 2, NVK_HEAP_MIN_SIZE);
-
-      void *new_bo_map;
-      struct nouveau_ws_bo *new_bo =
-         nouveau_ws_bo_new_mapped(dev->ws_dev,
-                                  new_bo_size + heap->overalloc, 0,
-                                  heap->bo_flags, heap->map_flags,
-                                  &new_bo_map);
-      if (new_bo == NULL) {
-         return vk_errorf(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY,
-                          "Failed to allocate a heap BO: %m");
-      }
-
-      if (heap->bo_count > 0) {
-         assert(heap->bo_count == 1);
-         struct nouveau_ws_bo *old_bo = heap->bos[0].bo;
-
-         assert(util_is_power_of_two_nonzero64(heap->total_size));
-         assert(heap->total_size >= NVK_HEAP_MIN_SIZE);
-         assert(heap->total_size <= old_bo->size);
-         assert(heap->total_size < new_bo_size);
-
-         unsigned line_bytes = MIN2(heap->total_size, 1 << 17);
-         assert(heap->total_size % line_bytes == 0);
-         unsigned line_count = heap->total_size / line_bytes;
-
-         uint32_t push_dw[12];
-         struct nv_push push;
-         nv_push_init(&push, push_dw, ARRAY_SIZE(push_dw));
-         struct nv_push *p = &push;
-
-         P_MTHD(p, NV90B5, OFFSET_IN_UPPER);
-         P_NV90B5_OFFSET_IN_UPPER(p, old_bo->offset >> 32);
-         P_NV90B5_OFFSET_IN_LOWER(p, old_bo->offset & 0xffffffff);
-         P_NV90B5_OFFSET_OUT_UPPER(p, new_bo->offset >> 32);
-         P_NV90B5_OFFSET_OUT_LOWER(p, new_bo->offset & 0xffffffff);
-         P_NV90B5_PITCH_IN(p, line_bytes);
-         P_NV90B5_PITCH_OUT(p, line_bytes);
-         P_NV90B5_LINE_LENGTH_IN(p, line_bytes);
-         P_NV90B5_LINE_COUNT(p, line_count);
-
-         P_IMMD(p, NV90B5, LAUNCH_DMA, {
-            .data_transfer_type = DATA_TRANSFER_TYPE_NON_PIPELINED,
-            .multi_line_enable = MULTI_LINE_ENABLE_TRUE,
-            .flush_enable = FLUSH_ENABLE_TRUE,
-            .src_memory_layout = SRC_MEMORY_LAYOUT_PITCH,
-            .dst_memory_layout = DST_MEMORY_LAYOUT_PITCH,
-         });
-
-         struct nouveau_ws_bo *push_bos[] = { new_bo, old_bo, };
-         result = nvk_queue_submit_simple(&dev->queue,
-                                          nv_push_dw_count(&push), push_dw,
-                                          ARRAY_SIZE(push_bos), push_bos);
-         if (result != VK_SUCCESS) {
-            nouveau_ws_bo_unmap(new_bo, new_bo_map);
-            nouveau_ws_bo_destroy(new_bo);
-            return result;
-         }
-
-         nouveau_ws_bo_unmap(heap->bos[0].bo, heap->bos[0].map);
-         nouveau_ws_bo_destroy(heap->bos[0].bo);
-      }
-
-      uint64_t vma = encode_vma(0, heap->total_size);
-      util_vma_heap_free(&heap->heap, vma, new_bo_size - heap->total_size);
-
-      heap->total_size = new_bo_size;
-      heap->bo_count = 1;
-      heap->bos[0].bo = new_bo;
-      heap->bos[0].map = new_bo_map;
-
-      return VK_SUCCESS;
-   } else {
-      if (heap->bo_count >= NVK_HEAP_MAX_BO_COUNT) {
-         return vk_errorf(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY,
-                          "Heap has already hit its maximum size");
-      }
-
-      /* First two BOs are MIN_SIZE, double after that */
-      const uint64_t new_bo_size =
-         NVK_HEAP_MIN_SIZE << (MAX2(heap->bo_count, 1) - 1);
-
-      heap->bos[heap->bo_count].bo =
-         nouveau_ws_bo_new_mapped(dev->ws_dev,
-                                  new_bo_size + heap->overalloc, 0,
-                                  heap->bo_flags, heap->map_flags,
-                                  &heap->bos[heap->bo_count].map);
-      if (heap->bos[heap->bo_count].bo == NULL) {
-         return vk_errorf(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY,
-                          "Failed to allocate a heap BO: %m");
-      }
-
-      uint64_t vma = encode_vma(heap->bo_count, 0);
-      util_vma_heap_free(&heap->heap, vma, new_bo_size);
-
-      heap->total_size += new_bo_size;
-      heap->bo_count++;
-
-      return VK_SUCCESS;
-   }
+   if (heap->bo_count >= NVK_HEAP_MAX_BO_COUNT) {
+      return vk_errorf(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY,
+                       "Heap has already hit its maximum size");
+   }
+
+   /* First two BOs are MIN_SIZE, double after that */
+   const uint64_t new_bo_size =
+      NVK_HEAP_MIN_SIZE << (MAX2(heap->bo_count, 1) - 1);
+
+   heap->bos[heap->bo_count].bo =
+      nouveau_ws_bo_new_mapped(dev->ws_dev,
+                               new_bo_size + heap->overalloc, 0,
+                               heap->bo_flags, heap->map_flags,
+                               &heap->bos[heap->bo_count].map);
+   if (heap->bos[heap->bo_count].bo == NULL) {
+      return vk_errorf(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY,
+                       "Failed to allocate a heap BO: %m");
+   }
+
+   if (heap->base_addr != 0) {
+      heap->bos[heap->bo_count].addr = heap->base_addr + heap->total_size;
+      nouveau_ws_bo_bind_vma(dev->ws_dev, heap->bos[heap->bo_count].bo,
+                             heap->bos[heap->bo_count].addr,
+                             new_bo_size, 0, 0);
+   } else {
+      heap->bos[heap->bo_count].addr = heap->bos[heap->bo_count].bo->offset;
+   }
+
+   uint64_t vma = encode_vma(heap->bo_count, 0);
+   util_vma_heap_free(&heap->heap, vma, new_bo_size);
+
+   heap->total_size += new_bo_size;
+   heap->bo_count++;
+
+   return VK_SUCCESS;
 }
 
 static VkResult
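Note that the contiguous and non-contiguous paths now share one bookkeeping scheme: free space is handed to the util_vma_heap keyed by a synthetic value from encode_vma(bo_idx, bo_offset), with vma_bo_offset() recovering the offset. The encoding itself is not part of this diff; the following is only an assumed sketch of its shape, not the code in nvk_heap.c:

    /* Hypothetical encoding, for illustration only: BO index in the high
     * bits, byte offset within the BO in the low bits. */
    static uint64_t
    encode_vma_sketch(uint32_t bo_idx, uint64_t bo_offset)
    {
       return ((uint64_t)bo_idx << 40) | bo_offset;
    }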
@@ -201,7 +146,7 @@ nvk_heap_alloc_locked(struct nvk_device *dev, struct nvk_heap *heap,
       assert(bo_offset + size + heap->overalloc <=
              heap->bos[bo_idx].bo->size);
 
-      if (heap->contiguous) {
+      if (heap->base_addr != 0) {
          assert(bo_idx == 0);
          *addr_out = bo_offset;
       } else {
@@ -224,31 +169,21 @@ nvk_heap_free_locked(struct nvk_device *dev, struct nvk_heap *heap,
 {
    assert(addr + size > addr);
 
-   if (heap->contiguous) {
-      assert(heap->bo_count == 1);
-      uint64_t bo_offset = addr;
-
-      assert(bo_offset + size <= heap->bos[0].bo->size);
-      uint64_t vma = encode_vma(0, bo_offset);
-
-      util_vma_heap_free(&heap->heap, vma, size);
-   } else {
-      for (uint32_t bo_idx = 0; bo_idx < heap->bo_count; bo_idx++) {
-         if (addr < heap->bos[bo_idx].bo->offset)
-            continue;
-
-         uint64_t bo_offset = addr - heap->bos[bo_idx].bo->offset;
-         if (bo_offset >= heap->bos[bo_idx].bo->size)
-            continue;
-
-         assert(bo_offset + size <= heap->bos[bo_idx].bo->size);
-         uint64_t vma = encode_vma(bo_idx, bo_offset);
-
-         util_vma_heap_free(&heap->heap, vma, size);
-         return;
-      }
-      assert(!"Failed to find heap BO");
-   }
+   for (uint32_t bo_idx = 0; bo_idx < heap->bo_count; bo_idx++) {
+      if (addr < heap->bos[bo_idx].bo->offset)
+         continue;
+
+      uint64_t bo_offset = addr - heap->bos[bo_idx].bo->offset;
+      if (bo_offset >= heap->bos[bo_idx].bo->size)
+         continue;
+
+      assert(bo_offset + size <= heap->bos[bo_idx].bo->size);
+      uint64_t vma = encode_vma(bo_idx, bo_offset);
+
+      util_vma_heap_free(&heap->heap, vma, size);
+      return;
+   }
+   assert(!"Failed to find heap BO");
 }
 
 VkResult
@@ -256,12 +191,6 @@ nvk_heap_alloc(struct nvk_device *dev, struct nvk_heap *heap,
                uint64_t size, uint32_t alignment,
                uint64_t *addr_out, void **map_out)
 {
-   /* We can't return maps from contiguous heaps because the the map may go
-    * away at any time when the lock isn't taken and we don't want to trust
-    * the caller with racy maps.
-    */
-   assert(!heap->contiguous);
-
    simple_mtx_lock(&heap->mutex);
    VkResult result = nvk_heap_alloc_locked(dev, heap, size, alignment,
                                            addr_out, map_out);
@@ -23,17 +23,20 @@ struct nvk_device;
 struct nvk_heap_bo {
    struct nouveau_ws_bo *bo;
    void *map;
+   uint64_t addr;
 };
 
 struct nvk_heap {
    enum nouveau_ws_bo_flags bo_flags;
    enum nouveau_ws_bo_map_flags map_flags;
    uint32_t overalloc;
-   bool contiguous;
 
    simple_mtx_t mutex;
    struct util_vma_heap heap;
 
+   /* Base address for contiguous heaps, 0 otherwise */
+   uint64_t base_addr;
+
    uint64_t total_size;
 
    uint32_t bo_count;
@@ -58,19 +61,11 @@ VkResult nvk_heap_upload(struct nvk_device *dev, struct nvk_heap *heap,
 void nvk_heap_free(struct nvk_device *dev, struct nvk_heap *heap,
                    uint64_t addr, uint64_t size);
 
-static inline struct nouveau_ws_bo *
-nvk_heap_get_contiguous_bo_ref(struct nvk_heap *heap)
+static inline uint64_t
+nvk_heap_contiguous_base_address(struct nvk_heap *heap)
 {
-   assert(heap->contiguous);
-   assert(heap->bo_count <= 1);
-
-   simple_mtx_lock(&heap->mutex);
-   struct nouveau_ws_bo *bo = heap->bos[0].bo;
-   if (bo)
-      nouveau_ws_bo_ref(bo);
-   simple_mtx_unlock(&heap->mutex);
-
-   return bo;
+   assert(heap->base_addr != 0);
+   return heap->base_addr;
 }
 
 #endif /* define NVK_HEAP_H */
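The accessor's shrinkage here is the payoff: the old helper had to take the heap mutex and hand back a BO reference because a grow could swap the BO out from under the caller, while base_addr is written once in nvk_heap_init and never changes, so a bare read is safe. Condensed from the nvk_queue.c hunks below (the first half is code this commit deletes), the caller-side difference is:

    /* Old: grab a reference under the heap lock, track BO turnover, and
     * re-emit SET_PROGRAM_REGION whenever the heap reallocated. */
    bo = nvk_heap_get_contiguous_bo_ref(&dev->shader_heap);

    /* New: read the immutable base once at queue-state init. */
    uint64_t shader_base_addr =
       nvk_heap_contiguous_base_address(&dev->shader_heap);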
@@ -35,8 +35,6 @@ nvk_queue_state_finish(struct nvk_device *dev,
       nouveau_ws_bo_destroy(qs->images.bo);
    if (qs->samplers.bo)
       nouveau_ws_bo_destroy(qs->samplers.bo);
-   if (qs->shaders.bo)
-      nouveau_ws_bo_destroy(qs->shaders.bo);
    if (qs->slm.bo)
       nouveau_ws_bo_destroy(qs->slm.bo);
    if (qs->push.bo) {
@@ -90,19 +88,6 @@ nvk_queue_state_update(struct nvk_device *dev,
       nouveau_ws_bo_destroy(bo);
    }
 
-   if (dev->shader_heap.contiguous) {
-      bo = nvk_heap_get_contiguous_bo_ref(&dev->shader_heap);
-      if (qs->shaders.bo != bo) {
-         if (qs->shaders.bo)
-            nouveau_ws_bo_destroy(qs->shaders.bo);
-         qs->shaders.bo = bo;
-         dirty = true;
-      } else {
-         if (bo)
-            nouveau_ws_bo_destroy(bo);
-      }
-   }
-
    bo = nvk_slm_area_get_bo_ref(&dev->slm, &bytes_per_warp, &bytes_per_tpc);
    if (qs->slm.bo != bo || qs->slm.bytes_per_warp != bytes_per_warp ||
        qs->slm.bytes_per_tpc != bytes_per_tpc) {
@@ -182,20 +167,6 @@ nvk_queue_state_update(struct nvk_device *dev,
       });
    }
 
-   if (qs->shaders.bo) {
-      /* Compute */
-      assert(dev->pdev->info.cls_compute < VOLTA_COMPUTE_A);
-      P_MTHD(p, NVA0C0, SET_PROGRAM_REGION_A);
-      P_NVA0C0_SET_PROGRAM_REGION_A(p, qs->shaders.bo->offset >> 32);
-      P_NVA0C0_SET_PROGRAM_REGION_B(p, qs->shaders.bo->offset);
-
-      /* 3D */
-      assert(dev->pdev->info.cls_eng3d < VOLTA_A);
-      P_MTHD(p, NV9097, SET_PROGRAM_REGION_A);
-      P_NV9097_SET_PROGRAM_REGION_A(p, qs->shaders.bo->offset >> 32);
-      P_NV9097_SET_PROGRAM_REGION_B(p, qs->shaders.bo->offset);
-   }
-
    if (qs->slm.bo) {
       const uint64_t slm_addr = qs->slm.bo->offset;
       const uint64_t slm_size = qs->slm.bo->size;
@@ -26,10 +26,6 @@ struct nvk_queue_state {
       uint32_t alloc_count;
    } samplers;
 
-   struct {
-      struct nouveau_ws_bo *bo;
-   } shaders;
-
    struct {
       struct nouveau_ws_bo *bo;
       uint32_t bytes_per_warp;
@@ -63,6 +63,10 @@ nouveau_ws_alloc_vma(struct nouveau_ws_device *dev,
 {
    assert(dev->has_vm_bind);
 
+   /* if the caller doesn't care, use the GPU page size */
+   if (align == 0)
+      align = 0x1000;
+
    uint64_t offset;
    simple_mtx_lock(&dev->vma_mutex);
    if (bda_capture_replay) {
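This winsys change supports the new nvk_heap_init call above: an align of 0 now means "no preference" and falls back to 0x1000, the 4 KiB GPU page size, so callers need not know the page size themselves. For example, mirroring the call in nvk_heap_init:

    uint64_t base = nouveau_ws_alloc_vma(dev->ws_dev, 0, NVK_HEAP_MAX_SIZE,
                                         0 /* align: defaults to 4 KiB */,
                                         false /* bda */, false /* sparse */);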