anv: Add VMA allocator for shader binaries

Introduce a VMA-first chunk allocator for shader binaries to eventually
replace the anv_state_pool-based implementation. This allocator works
directly with GPU virtual addresses through util_vma_heap, making the
virtual address space an explicit resource managed by ANV.

No functional change in this commit.

v2(Michael Cheng): Use existing instruction state pool anv_va_range

v3(Lionel): Simplify allocator

Signed-off-by: Michael Cheng <michael.cheng@intel.com>
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/38869>
commit 1fa327ac32 (parent 20f320b7c7)
Author: Michael Cheng <michael.cheng@intel.com>
Date:   2025-12-03 14:15:27 +02:00, committed by Marge Bot

5 changed files with 329 additions and 0 deletions
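
To orient the reader, here is a minimal usage sketch of the new interface,
based on the declarations added below. This is a hypothetical caller, not
part of the commit; kernel_data/kernel_size and the error handling are
assumptions:

   /* Carve a range for a compiled shader out of the heap's VA space. */
   struct anv_shader_alloc alloc =
      anv_shader_heap_alloc(&device->shader_heap, kernel_size,
                            64 /* align */, false /* capture_replay */,
                            0 /* no requested address */);
   if (alloc.alloc_size == 0)
      return vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY);

   /* Copy the binary into the heap's backing BOs (mapped host-coherent). */
   anv_shader_heap_upload(&device->shader_heap, alloc,
                          kernel_data, kernel_size);

   /* The shader's GPU address is the heap base plus the returned offset. */
   uint64_t shader_addr = device->shader_heap.va_range.addr + alloc.offset;

   /* ... and when the shader is destroyed: */
   anv_shader_heap_free(&device->shader_heap, alloc);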

anv_private.h:

@@ -1230,6 +1230,56 @@ struct anv_gfx_state_ptr {
         4 * _cmd_state->len);                   \
   } while (0)

#define ANV_SHADER_HEAP_MAX_BOS (128)

struct anv_shader_heap {
   struct anv_device *device;
   struct anv_va_range va_range;

   uint32_t start_pot_size;
   uint32_t base_pot_size;
   uint64_t start_chunk_size;
   uint64_t base_chunk_size;
   uint32_t small_chunk_count;

   struct {
      uint64_t addr;
      uint64_t size;
      struct anv_bo *bo;
   } bos[ANV_SHADER_HEAP_MAX_BOS];
   BITSET_DECLARE(allocated_bos, ANV_SHADER_HEAP_MAX_BOS);

   struct util_vma_heap vma;

   simple_mtx_t mutex;
};

struct anv_shader_alloc {
   uint64_t offset;
   uint64_t alloc_size;
};

VkResult anv_shader_heap_init(struct anv_shader_heap *heap,
                              struct anv_device *device,
                              struct anv_va_range va_range,
                              uint32_t start_pot_size,
                              uint32_t base_pot_size);
void anv_shader_heap_finish(struct anv_shader_heap *heap);
struct anv_shader_alloc anv_shader_heap_alloc(struct anv_shader_heap *heap,
                                              uint64_t size,
                                              uint64_t align,
                                              bool capture_replay,
                                              uint64_t requested_addr);
void anv_shader_heap_free(struct anv_shader_heap *heap, struct anv_shader_alloc alloc);
void anv_shader_heap_upload(struct anv_shader_heap *heap,
                            struct anv_shader_alloc alloc,
                            const void *data, uint64_t size);

struct anv_shader {
   struct vk_shader vk;
@@ -2463,6 +2513,8 @@ struct anv_device {
   struct util_vma_heap vma_dynamic_visible;
   struct util_vma_heap vma_trtt;

   struct anv_shader_heap shader_heap;

   /** List of all anv_device_memory objects */
   struct list_head memory_objects;

anv_shader_heap.c (new file):

@@ -0,0 +1,209 @@
/* Copyright © 2025 Intel Corporation
 * SPDX-License-Identifier: MIT
 */

#include "anv_private.h"
static inline uint32_t
shader_bo_index(struct anv_shader_heap *heap, uint64_t addr)
{
   uint64_t alloc_offset = addr - heap->va_range.addr;
   unsigned b;

   if (alloc_offset < heap->base_chunk_size) {
      /* Bottom ladder of power-of-two chunks */
      b = alloc_offset < heap->start_chunk_size ? 0 :
          (util_last_bit64(alloc_offset) - heap->start_pot_size);
      assert(b < heap->small_chunk_count);
      return b;
   } else if (alloc_offset >= (heap->va_range.size - heap->base_chunk_size)) {
      /* Mirrored ladder at the top of the range */
      alloc_offset = heap->va_range.size - alloc_offset - 1;
      b = alloc_offset < heap->start_chunk_size ? 0 :
          (util_last_bit64(alloc_offset) - heap->start_pot_size);
      assert(b < heap->small_chunk_count);
      b = heap->small_chunk_count + b;
   } else {
      /* Full-size base chunks in the middle */
      b = 2 * heap->small_chunk_count +
          (alloc_offset / heap->base_chunk_size) - 1;
   }

   assert(addr >= heap->bos[b].addr &&
          addr < (heap->bos[b].addr + heap->bos[b].size));
   return b;
}
VkResult
anv_shader_heap_init(struct anv_shader_heap *heap,
                     struct anv_device *device,
                     struct anv_va_range va_range,
                     uint32_t start_pot_size,
                     uint32_t base_pot_size)
{
   assert((1ull << start_pot_size) >= device->info->mem_alignment);
   assert(base_pot_size >= start_pot_size);
   assert(va_range.size % (1ull << base_pot_size) == 0);
   assert((DIV_ROUND_UP(va_range.size, (1ull << base_pot_size)) -
           (base_pot_size - start_pot_size) - 1) <
          ARRAY_SIZE(heap->bos));

   memset(heap, 0, sizeof(*heap));

   heap->start_pot_size = start_pot_size;
   heap->base_pot_size = base_pot_size;
   heap->start_chunk_size = 1ull << start_pot_size;
   heap->base_chunk_size = 1ull << base_pot_size;
   heap->small_chunk_count = base_pot_size - start_pot_size + 1;
   heap->device = device;
   heap->va_range = va_range;

   /* Power-of-two ladders at the bottom and (mirrored) top of the range */
   for (uint32_t i = 0; i < heap->small_chunk_count; i++) {
      heap->bos[i].size =
         heap->bos[heap->small_chunk_count + i].size =
         1ull << (i == 0 ? start_pot_size : (start_pot_size + i - 1));
      heap->bos[i].addr = heap->va_range.addr +
         (i == 0 ? 0 : (1ull << (start_pot_size + i - 1)));
      heap->bos[heap->small_chunk_count + i].addr =
         heap->va_range.addr + heap->va_range.size -
         (1ull << (start_pot_size + i));
   }

   /* Full-size base chunks covering the middle of the range */
   const uint64_t base_chunks_size =
      heap->va_range.size - 2 * heap->base_chunk_size;
   for (uint32_t i = 0; i < base_chunks_size / heap->base_chunk_size; i++) {
      heap->bos[2 * heap->small_chunk_count + i].addr =
         heap->va_range.addr + heap->base_chunk_size + i * heap->base_chunk_size;
      heap->bos[2 * heap->small_chunk_count + i].size = heap->base_chunk_size;
   }

   simple_mtx_init(&heap->mutex, mtx_plain);
   util_vma_heap_init(&heap->vma, va_range.addr, va_range.size - 64);
   BITSET_ZERO(heap->allocated_bos);

   return VK_SUCCESS;
}
void
anv_shader_heap_finish(struct anv_shader_heap *heap)
{
   for (uint32_t i = 0; i < ARRAY_SIZE(heap->bos); i++) {
      if (heap->bos[i].bo) {
         ANV_DMR_BO_FREE(&heap->device->vk.base, heap->bos[i].bo);
         anv_device_release_bo(heap->device, heap->bos[i].bo);
         heap->bos[i].bo = NULL;
      }
   }
   util_vma_heap_finish(&heap->vma);
   simple_mtx_destroy(&heap->mutex);
}
struct anv_shader_alloc
anv_shader_heap_alloc(struct anv_shader_heap *heap,
                      uint64_t size,
                      uint64_t align,
                      bool capture_replay,
                      uint64_t requested_addr)
{
   assert(align <= heap->base_chunk_size);
   assert(size <= heap->base_chunk_size);

   simple_mtx_lock(&heap->mutex);
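
   /* Round the allocation's size class up to a power of two (floor 2 MiB,
    * the smallest chunk) and forbid util_vma_heap from returning a range
    * that crosses a boundary of that size, so an allocation never straddles
    * a chunk larger than its own size class.
    */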
   heap->vma.nospan_shift = MAX2(21, util_last_bit64(size) - 1);
   if ((1ull << heap->vma.nospan_shift) < size)
      heap->vma.nospan_shift++;

   uint64_t addr = 0;
   if (requested_addr) {
      /* Replay path: the address was captured on a previous run. */
      if (util_vma_heap_alloc_addr(&heap->vma, requested_addr, size))
         addr = requested_addr;
   } else if (capture_replay) {
      /* Allocations that may be replayed at a fixed address are packed at
       * the bottom of the heap; everything else is allocated top-down.
       */
      heap->vma.alloc_high = false;
      addr = util_vma_heap_alloc(&heap->vma, size, align);
   } else {
      heap->vma.alloc_high = true;
      addr = util_vma_heap_alloc(&heap->vma, size, align);
   }
   struct anv_shader_alloc alloc = {};
   if (addr != 0) {
      /* Ensure every chunk touched by [addr, addr + size) has a backing BO,
       * allocating them on demand at their fixed addresses. In the mirrored
       * top ladder chunk indices decrease with address, hence MIN2/MAX2.
       */
      const uint32_t bo_begin_idx = shader_bo_index(heap, addr);
      const uint32_t bo_end_idx = shader_bo_index(heap, addr + size - 1);
      for (uint32_t i = MIN2(bo_begin_idx, bo_end_idx);
           i <= MAX2(bo_begin_idx, bo_end_idx); i++) {
         if (heap->bos[i].bo != NULL)
            continue;

         VkResult result =
            anv_device_alloc_bo(heap->device, "shaders",
                                heap->bos[i].size,
                                ANV_BO_ALLOC_FIXED_ADDRESS |
                                ANV_BO_ALLOC_MAPPED |
                                ANV_BO_ALLOC_HOST_CACHED_COHERENT |
                                ANV_BO_ALLOC_CAPTURE |
                                ANV_BO_ALLOC_INTERNAL,
                                heap->bos[i].addr,
                                &heap->bos[i].bo);
         ANV_DMR_BO_ALLOC(&heap->device->vk.base, heap->bos[i].bo, result);
         if (result == VK_SUCCESS) {
            BITSET_SET(heap->allocated_bos, i);
         } else {
            util_vma_heap_free(&heap->vma, addr, size);
            addr = 0;
            break;
         }
      }

      if (addr != 0) {
         alloc.offset = addr - heap->va_range.addr;
         alloc.alloc_size = size;
      }
   }
   simple_mtx_unlock(&heap->mutex);

   return alloc;
}
void
anv_shader_heap_free(struct anv_shader_heap *heap, struct anv_shader_alloc alloc)
{
   simple_mtx_lock(&heap->mutex);
   util_vma_heap_free(&heap->vma, heap->va_range.addr + alloc.offset,
                      alloc.alloc_size);
   simple_mtx_unlock(&heap->mutex);
}
void
anv_shader_heap_upload(struct anv_shader_heap *heap,
                       struct anv_shader_alloc alloc,
                       const void *data, uint64_t data_size)
{
   const uint32_t bo_begin_idx =
      shader_bo_index(heap, heap->va_range.addr + alloc.offset);
   const uint32_t bo_end_idx =
      shader_bo_index(heap, heap->va_range.addr + alloc.offset + data_size - 1);
   const uint64_t upload_addr = heap->va_range.addr + alloc.offset;

   /* An allocation can straddle several backing BOs; split the copy at
    * each chunk boundary.
    */
   for (uint32_t i = MIN2(bo_begin_idx, bo_end_idx);
        i <= MAX2(bo_begin_idx, bo_end_idx); i++) {
      const uint64_t bo_offset =
         MAX2(upload_addr, heap->bos[i].addr) - heap->bos[i].addr;
      const uint32_t data_offset =
         (heap->bos[i].addr + bo_offset) - upload_addr;
      const uint64_t copy_size =
         MIN2(heap->bos[i].size - bo_offset, data_size - data_offset);
      memcpy(heap->bos[i].bo->map + bo_offset,
             (const char *)data + data_offset, copy_size);
   }
}
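
To make the chunk math concrete, here is a small standalone sketch (an
illustration, not part of the commit) that mirrors shader_bo_index() for the
configuration used in the test below (1 GiB range, start_pot_size=21,
base_pot_size=27):

   #include <inttypes.h>
   #include <stdint.h>
   #include <stdio.h>

   /* 1-based index of the highest set bit, i.e. floor(log2(x)) + 1. */
   static unsigned
   last_bit64(uint64_t x)
   {
      unsigned n = 0;
      while (x) {
         n++;
         x >>= 1;
      }
      return n;
   }

   int main(void)
   {
      const uint64_t heap_size = 1ull << 30;                 /* 1 GiB range */
      const unsigned start_pot = 21, base_pot = 27;
      const uint64_t base_chunk = 1ull << base_pot;          /* 128 MiB */
      const unsigned small_count = base_pot - start_pot + 1; /* 7 */

      /* Offsets into the range, exercising all three cases. */
      const uint64_t offsets[] = {
         0, 3ull << 20, 9ull << 20, 200ull << 20, heap_size - 1,
      };

      for (unsigned i = 0; i < 5; i++) {
         const uint64_t off = offsets[i];
         unsigned b;
         if (off < base_chunk) {
            /* Bottom ladder: 2M, 2M, 4M, 8M, 16M, 32M, 64M. */
            b = off < (1ull << start_pot) ? 0 : last_bit64(off) - start_pot;
         } else if (off >= heap_size - base_chunk) {
            /* Mirrored top ladder. */
            const uint64_t m = heap_size - off - 1;
            b = small_count +
                (m < (1ull << start_pot) ? 0 : last_bit64(m) - start_pot);
         } else {
            /* Middle: full 128 MiB chunks. */
            b = 2 * small_count + off / base_chunk - 1;
         }
         printf("offset 0x%08" PRIx64 " -> chunk %u\n", off, b);
      }
      /* Prints chunks 0, 1, 3, 14 and 7 respectively. */
      return 0;
   }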

meson.build:

@@ -192,6 +192,7 @@ libanv_files = files(
  'anv_sampler.c',
  'anv_shader.c',
  'anv_shader_compile.c',
  'anv_shader_heap.c',
  'anv_slab_bo.c',
  'anv_slab_bo.h',
  'anv_sparse.c',
@@ -328,6 +329,7 @@ if with_tests
      'tests/block_pool_no_free.c',
      'tests/block_pool_grow_first.c',
      'tests/block_pool_max_size.c',
      'tests/shader_heap_small_allocs.c',
    )
test(

tests (gtest driver):

@@ -20,6 +20,9 @@ ANV_C_TEST(BlockPool, NoFree, block_pool_no_free_test);
ANV_C_TEST(BlockPool, GrowFirst, block_pool_grow_first_test);
ANV_C_TEST(BlockPool, MaxSize, block_pool_max_size);

ANV_C_TEST(ShaderHeap, SmallAllocsLow, shader_heap_small_allocs_lo);
ANV_C_TEST(ShaderHeap, SmallAllocsHigh, shader_heap_small_allocs_hi);

extern "C" void FAIL_IN_GTEST(const char *file_path, unsigned line_number, const char *msg) {
   GTEST_FAIL_AT(file_path, line_number) << msg;
}

tests/shader_heap_small_allocs.c (new file):

@@ -0,0 +1,63 @@
/*
 * Copyright © 2025 Intel Corporation
 * SPDX-License-Identifier: MIT
 */

#include "anv_private.h"
#include "test_common.h"

void shader_heap_small_allocs_lo(void);
void shader_heap_small_allocs_hi(void);

static void shader_heap_small_allocs(bool high)
{
   struct anv_physical_device physical_device = {};
   struct anv_device device = {};
   struct anv_shader_heap heap;

   test_device_info_init(&physical_device.info);
   device.vk.base.device = &device.vk;
   anv_device_set_physical(&device, &physical_device);
   device.kmd_backend = anv_kmd_backend_get(INTEL_KMD_TYPE_STUB);
   pthread_mutex_init(&device.mutex, NULL);
   anv_bo_cache_init(&device.bo_cache, &device);

   /* 1 GiB heap at 3 GiB, chunks from 2 MiB (2^21) up to 128 MiB (2^27) */
   anv_shader_heap_init(&heap, &device,
                        (struct anv_va_range) {
                           .addr = 3ull * 1024 * 1024 * 1024,
                           .size = 1ull * 1024 * 1024 * 1024,
                        }, 21, 27);

   uint32_t sizes[] = {
      64,
      3 * 64,
      12 * 64,
      16 * 64,
      233 * 64,
      1025 * 64,
      6 * 4096 + 64,
      2 * 1024 * 1024,
      4 * 1024 * 1024,
      2 * 1024 * 1024 + 2048,
      16 * 1024 * 1024 + 1024,
   };
   struct anv_shader_alloc allocs[ARRAY_SIZE(sizes)];
   for (uint32_t i = 0; i < ARRAY_SIZE(sizes); i++) {
      allocs[i] = anv_shader_heap_alloc(&heap, sizes[i], 64, high, 0);
      assert(allocs[i].alloc_size != 0);
   }

   anv_shader_heap_finish(&heap);
   anv_bo_cache_finish(&device.bo_cache);
   pthread_mutex_destroy(&device.mutex);
}

void shader_heap_small_allocs_hi()
{
   shader_heap_small_allocs(true);
}

void shader_heap_small_allocs_lo()
{
   shader_heap_small_allocs(false);
}
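
For the test's parameters (start_pot_size=21, base_pot_size=27, 1 GiB range),
the heap is carved into a seven-chunk ladder of 2 MiB, 2 MiB, 4 MiB, 8 MiB,
16 MiB, 32 MiB and 64 MiB at the bottom (128 MiB total), the same ladder
mirrored at the top, and six full 128 MiB chunks in between: 20 BOs at most,
comfortably under ANV_SHADER_HEAP_MAX_BOS (128).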