anv: Add VMA allocator for shader binaries

Introduce a VMA-first chunk allocator for shader binaries to eventually
replace the anv_state_pool-based implementation. This allocator works
directly with GPU virtual addresses through util_vma_heap, making the
virtual address space an explicit resource managed by ANV.

No functional change in this commit.

v2(Michael Cheng): Use existing instruction state pool anv_va_range

v3(Lionel): Simplify allocator

Signed-off-by: Michael Cheng <michael.cheng@intel.com>
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/38869>
commit 1fa327ac32 (parent 20f320b7c7)
Author: Michael Cheng <michael.cheng@intel.com>
Date:   2025-12-03 14:15:27 +02:00, committed by Marge Bot

5 changed files with 329 additions and 0 deletions
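
To orient the reader, here is a minimal usage sketch of the new interface,
based on the declarations added below. This is a hypothetical caller, not
part of the commit; kernel_data/kernel_size and the error handling are
assumptions:

   /* Carve a range for a compiled shader out of the heap's VA space. */
   struct anv_shader_alloc alloc =
      anv_shader_heap_alloc(&device->shader_heap, kernel_size,
                            64 /* align */, false /* capture_replay */,
                            0 /* no requested address */);
   if (alloc.alloc_size == 0)
      return vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY);

   /* Copy the binary into the heap's backing BOs (mapped host-coherent). */
   anv_shader_heap_upload(&device->shader_heap, alloc,
                          kernel_data, kernel_size);

   /* The shader's GPU address is the heap base plus the returned offset. */
   uint64_t shader_addr = device->shader_heap.va_range.addr + alloc.offset;

   /* ... and when the shader is destroyed: */
   anv_shader_heap_free(&device->shader_heap, alloc);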

anv_private.h:

@@ -1230,6 +1230,56 @@ struct anv_gfx_state_ptr {
         4 * _cmd_state->len);                   \
   } while (0)

#define ANV_SHADER_HEAP_MAX_BOS (128)

struct anv_shader_heap {
   struct anv_device *device;
   struct anv_va_range va_range;

   uint32_t start_pot_size;
   uint32_t base_pot_size;
   uint64_t start_chunk_size;
   uint64_t base_chunk_size;
   uint32_t small_chunk_count;

   struct {
      uint64_t addr;
      uint64_t size;
      struct anv_bo *bo;
   } bos[ANV_SHADER_HEAP_MAX_BOS];
   BITSET_DECLARE(allocated_bos, ANV_SHADER_HEAP_MAX_BOS);

   struct util_vma_heap vma;

   simple_mtx_t mutex;
};

struct anv_shader_alloc {
   uint64_t offset;
   uint64_t alloc_size;
};

VkResult anv_shader_heap_init(struct anv_shader_heap *heap,
                              struct anv_device *device,
                              struct anv_va_range va_range,
                              uint32_t start_pot_size,
                              uint32_t base_pot_size);
void anv_shader_heap_finish(struct anv_shader_heap *heap);
struct anv_shader_alloc anv_shader_heap_alloc(struct anv_shader_heap *heap,
                                              uint64_t size,
                                              uint64_t align,
                                              bool capture_replay,
                                              uint64_t requested_addr);
void anv_shader_heap_free(struct anv_shader_heap *heap, struct anv_shader_alloc alloc);
void anv_shader_heap_upload(struct anv_shader_heap *heap,
                            struct anv_shader_alloc alloc,
                            const void *data, uint64_t size);

struct anv_shader {
   struct vk_shader vk;
@@ -2463,6 +2513,8 @@ struct anv_device {
   struct util_vma_heap vma_dynamic_visible;
   struct util_vma_heap vma_trtt;

   struct anv_shader_heap shader_heap;

   /** List of all anv_device_memory objects */
   struct list_head memory_objects;

anv_shader_heap.c (new file):

@@ -0,0 +1,209 @@
/* Copyright © 2025 Intel Corporation
 * SPDX-License-Identifier: MIT
 */

#include "anv_private.h"
static inline uint32_t
shader_bo_index(struct anv_shader_heap *heap, uint64_t addr)
{
   uint64_t alloc_offset = addr - heap->va_range.addr;
   unsigned b;

   if (alloc_offset < heap->base_chunk_size) {
      /* Bottom ladder of power-of-two chunks */
      b = alloc_offset < heap->start_chunk_size ? 0 :
          (util_last_bit64(alloc_offset) - heap->start_pot_size);
      assert(b < heap->small_chunk_count);
      return b;
   } else if (alloc_offset >= (heap->va_range.size - heap->base_chunk_size)) {
      /* Mirrored ladder at the top of the range */
      alloc_offset = heap->va_range.size - alloc_offset - 1;
      b = alloc_offset < heap->start_chunk_size ? 0 :
          (util_last_bit64(alloc_offset) - heap->start_pot_size);
      assert(b < heap->small_chunk_count);
      b = heap->small_chunk_count + b;
   } else {
      /* Full-size base chunks in the middle */
      b = 2 * heap->small_chunk_count +
          (alloc_offset / heap->base_chunk_size) - 1;
   }

   assert(addr >= heap->bos[b].addr &&
          addr < (heap->bos[b].addr + heap->bos[b].size));
   return b;
}
VkResult
anv_shader_heap_init(struct anv_shader_heap *heap,
                     struct anv_device *device,
                     struct anv_va_range va_range,
                     uint32_t start_pot_size,
                     uint32_t base_pot_size)
{
   assert((1ull << start_pot_size) >= device->info->mem_alignment);
   assert(base_pot_size >= start_pot_size);
   assert(va_range.size % (1ull << base_pot_size) == 0);
   assert((DIV_ROUND_UP(va_range.size, (1ull << base_pot_size)) -
           (base_pot_size - start_pot_size) - 1) <
          ARRAY_SIZE(heap->bos));

   memset(heap, 0, sizeof(*heap));

   heap->start_pot_size = start_pot_size;
   heap->base_pot_size = base_pot_size;
   heap->start_chunk_size = 1ull << start_pot_size;
   heap->base_chunk_size = 1ull << base_pot_size;
   heap->small_chunk_count = base_pot_size - start_pot_size + 1;
   heap->device = device;
   heap->va_range = va_range;

   /* Power-of-two ladders at the bottom and (mirrored) top of the range */
   for (uint32_t i = 0; i < heap->small_chunk_count; i++) {
      heap->bos[i].size =
         heap->bos[heap->small_chunk_count + i].size =
         1ull << (i == 0 ? start_pot_size : (start_pot_size + i - 1));
      heap->bos[i].addr = heap->va_range.addr +
         (i == 0 ? 0 : (1ull << (start_pot_size + i - 1)));
      heap->bos[heap->small_chunk_count + i].addr =
         heap->va_range.addr + heap->va_range.size -
         (1ull << (start_pot_size + i));
   }

   /* Full-size base chunks covering the middle of the range */
   const uint64_t base_chunks_size =
      heap->va_range.size - 2 * heap->base_chunk_size;
   for (uint32_t i = 0; i < base_chunks_size / heap->base_chunk_size; i++) {
      heap->bos[2 * heap->small_chunk_count + i].addr =
         heap->va_range.addr + heap->base_chunk_size + i * heap->base_chunk_size;
      heap->bos[2 * heap->small_chunk_count + i].size = heap->base_chunk_size;
   }

   simple_mtx_init(&heap->mutex, mtx_plain);
   util_vma_heap_init(&heap->vma, va_range.addr, va_range.size - 64);
   BITSET_ZERO(heap->allocated_bos);

   return VK_SUCCESS;
}
void
anv_shader_heap_finish(struct anv_shader_heap *heap)
{
   for (uint32_t i = 0; i < ARRAY_SIZE(heap->bos); i++) {
      if (heap->bos[i].bo) {
         ANV_DMR_BO_FREE(&heap->device->vk.base, heap->bos[i].bo);
         anv_device_release_bo(heap->device, heap->bos[i].bo);
         heap->bos[i].bo = NULL;
      }
   }
   util_vma_heap_finish(&heap->vma);
   simple_mtx_destroy(&heap->mutex);
}
struct anv_shader_alloc
anv_shader_heap_alloc(struct anv_shader_heap *heap,
                      uint64_t size,
                      uint64_t align,
                      bool capture_replay,
                      uint64_t requested_addr)
{
   assert(align <= heap->base_chunk_size);
   assert(size <= heap->base_chunk_size);

   simple_mtx_lock(&heap->mutex);
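
   /* Round the allocation's size class up to a power of two (floor 2 MiB,
    * the smallest chunk) and forbid util_vma_heap from returning a range
    * that crosses a boundary of that size, so an allocation never straddles
    * a chunk larger than its own size class.
    */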
   heap->vma.nospan_shift = MAX2(21, util_last_bit64(size) - 1);
   if ((1ull << heap->vma.nospan_shift) < size)
      heap->vma.nospan_shift++;

   uint64_t addr = 0;
   if (requested_addr) {
      /* Replay path: the address was captured on a previous run. */
      if (util_vma_heap_alloc_addr(&heap->vma, requested_addr, size))
         addr = requested_addr;
   } else if (capture_replay) {
      /* Allocations that may be replayed at a fixed address are packed at
       * the bottom of the heap; everything else is allocated top-down.
       */
      heap->vma.alloc_high = false;
      addr = util_vma_heap_alloc(&heap->vma, size, align);
   } else {
      heap->vma.alloc_high = true;
      addr = util_vma_heap_alloc(&heap->vma, size, align);
   }
   struct anv_shader_alloc alloc = {};
   if (addr != 0) {
      /* Ensure every chunk touched by [addr, addr + size) has a backing BO,
       * allocating them on demand at their fixed addresses. In the mirrored
       * top ladder chunk indices decrease with address, hence MIN2/MAX2.
       */
      const uint32_t bo_begin_idx = shader_bo_index(heap, addr);
      const uint32_t bo_end_idx = shader_bo_index(heap, addr + size - 1);
      for (uint32_t i = MIN2(bo_begin_idx, bo_end_idx);
           i <= MAX2(bo_begin_idx, bo_end_idx); i++) {
         if (heap->bos[i].bo != NULL)
            continue;

         VkResult result =
            anv_device_alloc_bo(heap->device, "shaders",
                                heap->bos[i].size,
                                ANV_BO_ALLOC_FIXED_ADDRESS |
                                ANV_BO_ALLOC_MAPPED |
                                ANV_BO_ALLOC_HOST_CACHED_COHERENT |
                                ANV_BO_ALLOC_CAPTURE |
                                ANV_BO_ALLOC_INTERNAL,
                                heap->bos[i].addr,
                                &heap->bos[i].bo);
         ANV_DMR_BO_ALLOC(&heap->device->vk.base, heap->bos[i].bo, result);
         if (result == VK_SUCCESS) {
            BITSET_SET(heap->allocated_bos, i);
         } else {
            util_vma_heap_free(&heap->vma, addr, size);
            addr = 0;
            break;
         }
      }

      if (addr != 0) {
         alloc.offset = addr - heap->va_range.addr;
         alloc.alloc_size = size;
      }
   }
   simple_mtx_unlock(&heap->mutex);

   return alloc;
}
void
anv_shader_heap_free(struct anv_shader_heap *heap, struct anv_shader_alloc alloc)
{
   simple_mtx_lock(&heap->mutex);
   util_vma_heap_free(&heap->vma, heap->va_range.addr + alloc.offset,
                      alloc.alloc_size);
   simple_mtx_unlock(&heap->mutex);
}
void
anv_shader_heap_upload(struct anv_shader_heap *heap,
                       struct anv_shader_alloc alloc,
                       const void *data, uint64_t data_size)
{
   const uint32_t bo_begin_idx =
      shader_bo_index(heap, heap->va_range.addr + alloc.offset);
   const uint32_t bo_end_idx =
      shader_bo_index(heap, heap->va_range.addr + alloc.offset + data_size - 1);
   const uint64_t upload_addr = heap->va_range.addr + alloc.offset;

   /* An allocation can straddle several backing BOs; split the copy at
    * each chunk boundary.
    */
   for (uint32_t i = MIN2(bo_begin_idx, bo_end_idx);
        i <= MAX2(bo_begin_idx, bo_end_idx); i++) {
      const uint64_t bo_offset =
         MAX2(upload_addr, heap->bos[i].addr) - heap->bos[i].addr;
      const uint32_t data_offset =
         (heap->bos[i].addr + bo_offset) - upload_addr;
      const uint64_t copy_size =
         MIN2(heap->bos[i].size - bo_offset, data_size - data_offset);
      memcpy(heap->bos[i].bo->map + bo_offset,
             (const char *)data + data_offset, copy_size);
   }
}
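
To make the chunk math concrete, here is a small standalone sketch (an
illustration, not part of the commit) that mirrors shader_bo_index() for the
configuration used in the test below (1 GiB range, start_pot_size=21,
base_pot_size=27):

   #include <inttypes.h>
   #include <stdint.h>
   #include <stdio.h>

   /* 1-based index of the highest set bit, i.e. floor(log2(x)) + 1. */
   static unsigned
   last_bit64(uint64_t x)
   {
      unsigned n = 0;
      while (x) {
         n++;
         x >>= 1;
      }
      return n;
   }

   int main(void)
   {
      const uint64_t heap_size = 1ull << 30;                 /* 1 GiB range */
      const unsigned start_pot = 21, base_pot = 27;
      const uint64_t base_chunk = 1ull << base_pot;          /* 128 MiB */
      const unsigned small_count = base_pot - start_pot + 1; /* 7 */

      /* Offsets into the range, exercising all three cases. */
      const uint64_t offsets[] = {
         0, 3ull << 20, 9ull << 20, 200ull << 20, heap_size - 1,
      };

      for (unsigned i = 0; i < 5; i++) {
         const uint64_t off = offsets[i];
         unsigned b;
         if (off < base_chunk) {
            /* Bottom ladder: 2M, 2M, 4M, 8M, 16M, 32M, 64M. */
            b = off < (1ull << start_pot) ? 0 : last_bit64(off) - start_pot;
         } else if (off >= heap_size - base_chunk) {
            /* Mirrored top ladder. */
            const uint64_t m = heap_size - off - 1;
            b = small_count +
                (m < (1ull << start_pot) ? 0 : last_bit64(m) - start_pot);
         } else {
            /* Middle: full 128 MiB chunks. */
            b = 2 * small_count + off / base_chunk - 1;
         }
         printf("offset 0x%08" PRIx64 " -> chunk %u\n", off, b);
      }
      /* Prints chunks 0, 1, 3, 14 and 7 respectively. */
      return 0;
   }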

meson.build:

@@ -192,6 +192,7 @@ libanv_files = files(
  'anv_sampler.c',
  'anv_shader.c',
  'anv_shader_compile.c',
  'anv_shader_heap.c',
  'anv_slab_bo.c',
  'anv_slab_bo.h',
  'anv_sparse.c',
@@ -328,6 +329,7 @@ if with_tests
      'tests/block_pool_no_free.c',
      'tests/block_pool_grow_first.c',
      'tests/block_pool_max_size.c',
      'tests/shader_heap_small_allocs.c',
    )
test(

tests (gtest driver):

@@ -20,6 +20,9 @@ ANV_C_TEST(BlockPool, NoFree, block_pool_no_free_test);
ANV_C_TEST(BlockPool, GrowFirst, block_pool_grow_first_test);
ANV_C_TEST(BlockPool, MaxSize, block_pool_max_size);

ANV_C_TEST(ShaderHeap, SmallAllocsLow, shader_heap_small_allocs_lo);
ANV_C_TEST(ShaderHeap, SmallAllocsHigh, shader_heap_small_allocs_hi);

extern "C" void FAIL_IN_GTEST(const char *file_path, unsigned line_number, const char *msg) {
   GTEST_FAIL_AT(file_path, line_number) << msg;
}

tests/shader_heap_small_allocs.c (new file):

@@ -0,0 +1,63 @@
/*
 * Copyright © 2025 Intel Corporation
 * SPDX-License-Identifier: MIT
 */

#include "anv_private.h"
#include "test_common.h"

void shader_heap_small_allocs_lo(void);
void shader_heap_small_allocs_hi(void);

static void shader_heap_small_allocs(bool high)
{
   struct anv_physical_device physical_device = {};
   struct anv_device device = {};
   struct anv_shader_heap heap;

   test_device_info_init(&physical_device.info);
   device.vk.base.device = &device.vk;
   anv_device_set_physical(&device, &physical_device);
   device.kmd_backend = anv_kmd_backend_get(INTEL_KMD_TYPE_STUB);
   pthread_mutex_init(&device.mutex, NULL);
   anv_bo_cache_init(&device.bo_cache, &device);

   /* 1 GiB heap at 3 GiB, chunks from 2 MiB (2^21) up to 128 MiB (2^27) */
   anv_shader_heap_init(&heap, &device,
                        (struct anv_va_range) {
                           .addr = 3ull * 1024 * 1024 * 1024,
                           .size = 1ull * 1024 * 1024 * 1024,
                        }, 21, 27);

   uint32_t sizes[] = {
      64,
      3 * 64,
      12 * 64,
      16 * 64,
      233 * 64,
      1025 * 64,
      6 * 4096 + 64,
      2 * 1024 * 1024,
      4 * 1024 * 1024,
      2 * 1024 * 1024 + 2048,
      16 * 1024 * 1024 + 1024,
   };
   struct anv_shader_alloc allocs[ARRAY_SIZE(sizes)];
   for (uint32_t i = 0; i < ARRAY_SIZE(sizes); i++) {
      allocs[i] = anv_shader_heap_alloc(&heap, sizes[i], 64, high, 0);
      assert(allocs[i].alloc_size != 0);
   }

   anv_shader_heap_finish(&heap);
   anv_bo_cache_finish(&device.bo_cache);
   pthread_mutex_destroy(&device.mutex);
}

void shader_heap_small_allocs_hi()
{
   shader_heap_small_allocs(true);
}

void shader_heap_small_allocs_lo()
{
   shader_heap_small_allocs(false);
}
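
For the test's parameters (start_pot_size=21, base_pot_size=27, 1 GiB range),
the heap is carved into a seven-chunk ladder of 2 MiB, 2 MiB, 4 MiB, 8 MiB,
16 MiB, 32 MiB and 64 MiB at the bottom (128 MiB total), the same ladder
mirrored at the top, and six full 128 MiB chunks in between: 20 BOs at most,
comfortably under ANV_SHADER_HEAP_MAX_BOS (128).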