diff --git a/src/intel/vulkan/anv_private.h b/src/intel/vulkan/anv_private.h
index 49472aa7d2e..5015156462d 100644
--- a/src/intel/vulkan/anv_private.h
+++ b/src/intel/vulkan/anv_private.h
@@ -1230,6 +1230,56 @@ struct anv_gfx_state_ptr {
                     4 * _cmd_state->len);                       \
    } while (0)
 
+#define ANV_SHADER_HEAP_MAX_BOS (128)
+
+struct anv_shader_heap {
+   struct anv_device *device;
+
+   struct anv_va_range va_range;
+
+   uint32_t start_pot_size;
+   uint32_t base_pot_size;
+
+   uint64_t start_chunk_size;
+   uint64_t base_chunk_size;
+
+   uint32_t small_chunk_count;
+
+   struct {
+      uint64_t addr;
+      uint64_t size;
+
+      struct anv_bo *bo;
+   } bos[ANV_SHADER_HEAP_MAX_BOS];
+   BITSET_DECLARE(allocated_bos, ANV_SHADER_HEAP_MAX_BOS);
+
+   struct util_vma_heap vma;
+   simple_mtx_t mutex;
+};
+
+struct anv_shader_alloc {
+   uint64_t offset;
+   uint64_t alloc_size;
+};
+
+VkResult anv_shader_heap_init(struct anv_shader_heap *heap,
+                              struct anv_device *device,
+                              struct anv_va_range va_range,
+                              uint32_t start_pot_size,
+                              uint32_t base_pot_size);
+void anv_shader_heap_finish(struct anv_shader_heap *heap);
+
+struct anv_shader_alloc anv_shader_heap_alloc(struct anv_shader_heap *heap,
+                                              uint64_t size,
+                                              uint64_t align,
+                                              bool capture_replay,
+                                              uint64_t requested_addr);
+void anv_shader_heap_free(struct anv_shader_heap *heap, struct anv_shader_alloc alloc);
+
+void anv_shader_heap_upload(struct anv_shader_heap *heap,
+                            struct anv_shader_alloc alloc,
+                            const void *data, uint64_t data_size);
+
 struct anv_shader {
    struct vk_shader vk;
@@ -2463,6 +2513,8 @@ struct anv_device {
    struct util_vma_heap vma_dynamic_visible;
    struct util_vma_heap vma_trtt;
 
+   struct anv_shader_heap shader_heap;
+
    /** List of all anv_device_memory objects */
    struct list_head memory_objects;
diff --git a/src/intel/vulkan/anv_shader_heap.c b/src/intel/vulkan/anv_shader_heap.c
new file mode 100644
index 00000000000..c68a9f5b222
--- /dev/null
+++ b/src/intel/vulkan/anv_shader_heap.c
@@ -0,0 +1,235 @@
+/* Copyright © 2025 Intel Corporation
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "anv_private.h"
+
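+/* The shader heap carves its VA range into chunks, each backed by its own
+ * BO, allocated on demand. Both ends of the range hold a mirrored run of
+ * power-of-two chunks (start_chunk_size, start_chunk_size,
+ * 2 * start_chunk_size, ... up to base_chunk_size / 2), so that small
+ * allocations packed at either end only force the creation of small BOs.
+ * The middle of the range is covered by base_chunk_size chunks.
+ *
+ * With the parameters used by the tests (start_pot_size = 21,
+ * base_pot_size = 27), each end is covered by BOs of 2MB, 2MB, 4MB, 8MB,
+ * 16MB, 32MB and 64MB (128MB total), and the middle by 128MB BOs. Note
+ * that in the high run, chunk indices grow towards lower addresses, hence
+ * the MIN2()/MAX2() in the loops walking a chunk range.
+ *
+ * shader_bo_index() inverts the layout: given a VA, it returns the index
+ * of the chunk covering it, computed from the offset to the nearest end
+ * of the range for the mirrored runs.
+ */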
+static inline uint32_t
+shader_bo_index(struct anv_shader_heap *heap, uint64_t addr)
+{
+   uint64_t alloc_offset = addr - heap->va_range.addr;
+
+   unsigned b;
+   if (alloc_offset < heap->base_chunk_size) {
+      b = alloc_offset < heap->start_chunk_size ? 0 :
+          (util_last_bit64(alloc_offset) - heap->start_pot_size);
+      assert(b < heap->small_chunk_count);
+   } else if (alloc_offset >= (heap->va_range.size - heap->base_chunk_size)) {
+      alloc_offset = heap->va_range.size - alloc_offset - 1;
+      b = alloc_offset < heap->start_chunk_size ? 0 :
+          (util_last_bit64(alloc_offset) - heap->start_pot_size);
+      assert(b < heap->small_chunk_count);
+      b = heap->small_chunk_count + b;
+   } else {
+      b = 2 * heap->small_chunk_count +
+          (alloc_offset / heap->base_chunk_size) - 1;
+   }
+
+   assert(addr >= heap->bos[b].addr &&
+          addr < (heap->bos[b].addr + heap->bos[b].size));
+
+   return b;
+}
+
+VkResult
+anv_shader_heap_init(struct anv_shader_heap *heap,
+                     struct anv_device *device,
+                     struct anv_va_range va_range,
+                     uint32_t start_pot_size,
+                     uint32_t base_pot_size)
+{
+   assert((1ull << start_pot_size) >= device->info->mem_alignment);
+   assert(base_pot_size >= start_pot_size);
+   assert(va_range.size % (1ull << base_pot_size) == 0);
+   /* Two mirrored runs of small chunks plus the base_chunk_size chunks in
+    * between must fit in the bos[] array.
+    */
+   assert((DIV_ROUND_UP(va_range.size, (1ull << base_pot_size)) +
+           2 * (base_pot_size - start_pot_size)) <= ARRAY_SIZE(heap->bos));
+
+   memset(heap, 0, sizeof(*heap));
+
+   heap->start_pot_size = start_pot_size;
+   heap->base_pot_size = base_pot_size;
+   heap->start_chunk_size = 1ull << start_pot_size;
+   heap->base_chunk_size = 1ull << base_pot_size;
+   heap->small_chunk_count = base_pot_size - start_pot_size + 1;
+   heap->device = device;
+   heap->va_range = va_range;
+
+   for (uint32_t i = 0; i < heap->small_chunk_count; i++) {
+      heap->bos[i].size =
+         heap->bos[heap->small_chunk_count + i].size =
+         1ull << (i == 0 ? start_pot_size : (start_pot_size + i - 1));
+
+      heap->bos[i].addr = heap->va_range.addr +
+         (i == 0 ? 0 : (1ull << (start_pot_size + i - 1)));
+      heap->bos[heap->small_chunk_count + i].addr =
+         heap->va_range.addr + heap->va_range.size -
+         (1ull << (start_pot_size + i));
+   }
+
+   const uint64_t base_chunks_size =
+      heap->va_range.size - 2 * heap->base_chunk_size;
+   for (uint32_t i = 0; i < base_chunks_size / heap->base_chunk_size; i++) {
+      heap->bos[2 * heap->small_chunk_count + i].addr =
+         heap->va_range.addr + heap->base_chunk_size + i * heap->base_chunk_size;
+      heap->bos[2 * heap->small_chunk_count + i].size = heap->base_chunk_size;
+   }
+
+   simple_mtx_init(&heap->mutex, mtx_plain);
+   util_vma_heap_init(&heap->vma, va_range.addr, va_range.size - 64);
+
+   BITSET_ZERO(heap->allocated_bos);
+
+   return VK_SUCCESS;
+}
+
+void
+anv_shader_heap_finish(struct anv_shader_heap *heap)
+{
+   for (uint32_t i = 0; i < ARRAY_SIZE(heap->bos); i++) {
+      if (heap->bos[i].bo) {
+         ANV_DMR_BO_FREE(&heap->device->vk.base, heap->bos[i].bo);
+         anv_device_release_bo(heap->device, heap->bos[i].bo);
+         heap->bos[i].bo = NULL;
+      }
+   }
+
+   util_vma_heap_finish(&heap->vma);
+   simple_mtx_destroy(&heap->mutex);
+}
+
+struct anv_shader_alloc
+anv_shader_heap_alloc(struct anv_shader_heap *heap,
+                      uint64_t size,
+                      uint64_t align,
+                      bool capture_replay,
+                      uint64_t requested_addr)
+{
+   assert(align <= heap->base_chunk_size);
+   assert(size <= heap->base_chunk_size);
+
+   simple_mtx_lock(&heap->mutex);
+
+   /* Disallow the allocation from straddling a power-of-two boundary
+    * larger than its size (2MB at least), limiting how many chunk BOs a
+    * single allocation can touch.
+    */
+   heap->vma.nospan_shift = MAX2(21, util_last_bit64(size) - 1);
+   if ((1ull << heap->vma.nospan_shift) < size)
+      heap->vma.nospan_shift++;
+
+   uint64_t addr = 0;
+   if (requested_addr) {
+      if (util_vma_heap_alloc_addr(&heap->vma,
+                                   requested_addr, size)) {
+         addr = requested_addr;
+      }
+   } else {
+      /* Capture/replay allocations go bottom-up and regular allocations
+       * top-down, keeping the two apart so that a replay can reclaim its
+       * original addresses.
+       */
+      heap->vma.alloc_high = !capture_replay;
+      addr = util_vma_heap_alloc(&heap->vma, size, align);
+   }
+
+   struct anv_shader_alloc alloc = {};
+
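+   /* Back the allocated range lazily: create any chunk BO covering
+    * [addr, addr + size) that does not exist yet. On failure, the VMA
+    * range is released; chunk BOs created so far are kept for later
+    * allocations and only destroyed in anv_shader_heap_finish().
+    */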
+   if (addr != 0) {
+      const uint32_t bo_begin_idx = shader_bo_index(heap, addr);
+      const uint32_t bo_end_idx = shader_bo_index(heap, addr + size - 1);
+      for (uint32_t i = MIN2(bo_begin_idx, bo_end_idx);
+           i <= MAX2(bo_begin_idx, bo_end_idx); i++) {
+         if (heap->bos[i].bo != NULL)
+            continue;
+
+         VkResult result =
+            anv_device_alloc_bo(heap->device, "shaders",
+                                heap->bos[i].size,
+                                ANV_BO_ALLOC_FIXED_ADDRESS |
+                                ANV_BO_ALLOC_MAPPED |
+                                ANV_BO_ALLOC_HOST_CACHED_COHERENT |
+                                ANV_BO_ALLOC_CAPTURE |
+                                ANV_BO_ALLOC_INTERNAL,
+                                heap->bos[i].addr,
+                                &heap->bos[i].bo);
+         ANV_DMR_BO_ALLOC(&heap->device->vk.base, heap->bos[i].bo, result);
+         if (result == VK_SUCCESS) {
+            BITSET_SET(heap->allocated_bos, i);
+         } else {
+            util_vma_heap_free(&heap->vma, addr, size);
+            addr = 0;
+            break;
+         }
+      }
+
+      if (addr != 0) {
+         alloc.offset = addr - heap->va_range.addr;
+         alloc.alloc_size = size;
+      }
+   }
+
+   simple_mtx_unlock(&heap->mutex);
+
+   return alloc;
+}
+
+void
+anv_shader_heap_free(struct anv_shader_heap *heap, struct anv_shader_alloc alloc)
+{
+   simple_mtx_lock(&heap->mutex);
+
+   util_vma_heap_free(&heap->vma, heap->va_range.addr + alloc.offset,
+                      alloc.alloc_size);
+
+   simple_mtx_unlock(&heap->mutex);
+}
+
+void
+anv_shader_heap_upload(struct anv_shader_heap *heap,
+                       struct anv_shader_alloc alloc,
+                       const void *data, uint64_t data_size)
+{
+   const uint64_t upload_addr = heap->va_range.addr + alloc.offset;
+   const uint32_t bo_begin_idx = shader_bo_index(heap, upload_addr);
+   const uint32_t bo_end_idx =
+      shader_bo_index(heap, upload_addr + data_size - 1);
+
+   for (uint32_t i = MIN2(bo_begin_idx, bo_end_idx);
+        i <= MAX2(bo_begin_idx, bo_end_idx); i++) {
+      /* Start of the copy within this chunk BO. */
+      const uint64_t bo_offset =
+         MAX2(upload_addr, heap->bos[i].addr) - heap->bos[i].addr;
+      /* Offset into data of the portion covered by this chunk BO. */
+      const uint64_t data_offset =
+         heap->bos[i].addr + bo_offset - upload_addr;
+      const uint64_t copy_size =
+         MIN2(heap->bos[i].size - bo_offset, data_size - data_offset);
+
+      memcpy(heap->bos[i].bo->map + bo_offset, data + data_offset, copy_size);
+   }
+}
diff --git a/src/intel/vulkan/meson.build b/src/intel/vulkan/meson.build
index 15d380d44cb..9538b58272a 100644
--- a/src/intel/vulkan/meson.build
+++ b/src/intel/vulkan/meson.build
@@ -192,6 +192,7 @@ libanv_files = files(
   'anv_sampler.c',
   'anv_shader.c',
   'anv_shader_compile.c',
+  'anv_shader_heap.c',
   'anv_slab_bo.c',
   'anv_slab_bo.h',
   'anv_sparse.c',
@@ -328,6 +329,7 @@ if with_tests
     'tests/block_pool_no_free.c',
     'tests/block_pool_grow_first.c',
     'tests/block_pool_max_size.c',
+    'tests/shader_heap_small_allocs.c',
   )
 
   test(
diff --git a/src/intel/vulkan/tests/anv_tests.cpp b/src/intel/vulkan/tests/anv_tests.cpp
index 09be512f81e..ce5f63ff4af 100644
--- a/src/intel/vulkan/tests/anv_tests.cpp
+++ b/src/intel/vulkan/tests/anv_tests.cpp
@@ -20,6 +20,9 @@
 ANV_C_TEST(BlockPool, NoFree, block_pool_no_free_test);
 ANV_C_TEST(BlockPool, GrowFirst, block_pool_grow_first_test);
 ANV_C_TEST(BlockPool, MaxSize, block_pool_max_size);
+ANV_C_TEST(ShaderHeap, SmallAllocsLow, shader_heap_small_allocs_lo);
+ANV_C_TEST(ShaderHeap, SmallAllocsHigh, shader_heap_small_allocs_hi);
+
 extern "C" void FAIL_IN_GTEST(const char *file_path, unsigned line_number, const char *msg)
 {
    GTEST_FAIL_AT(file_path, line_number) << msg;
 }
diff --git a/src/intel/vulkan/tests/shader_heap_small_allocs.c b/src/intel/vulkan/tests/shader_heap_small_allocs.c
new file mode 100644
index 00000000000..f7d4a54725c
--- /dev/null
+++ b/src/intel/vulkan/tests/shader_heap_small_allocs.c
@@ -0,0 +1,70 @@
+/*
+ * Copyright © 2025 Intel Corporation
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "anv_private.h"
+#include "test_common.h"
+
+void shader_heap_small_allocs_lo(void);
+void shader_heap_small_allocs_hi(void);
+
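+/* Exercise a mix of allocation sizes, from a single 64 byte cacheline up to
+ * allocations larger than the smallest 2MB chunks, in both placement modes
+ * (bottom-up for capture/replay, top-down otherwise).
+ */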
+static void
+shader_heap_small_allocs(bool capture_replay)
+{
+   struct anv_physical_device physical_device = {};
+   struct anv_device device = {};
+   struct anv_shader_heap heap;
+
+   test_device_info_init(&physical_device.info);
+   device.vk.base.device = &device.vk;
+   anv_device_set_physical(&device, &physical_device);
+   device.kmd_backend = anv_kmd_backend_get(INTEL_KMD_TYPE_STUB);
+   pthread_mutex_init(&device.mutex, NULL);
+   anv_bo_cache_init(&device.bo_cache, &device);
+   anv_shader_heap_init(&heap, &device,
+                        (struct anv_va_range) {
+                           .addr = 3ull * 1024 * 1024 * 1024,
+                           .size = 1ull * 1024 * 1024 * 1024,
+                        }, 21, 27);
+
+   uint32_t sizes[] = {
+      64,
+      3 * 64,
+      12 * 64,
+      16 * 64,
+      233 * 64,
+      1025 * 64,
+      6 * 4096 + 64,
+      2 * 1024 * 1024,
+      4 * 1024 * 1024,
+      2 * 1024 * 1024 + 2048,
+      16 * 1024 * 1024 + 1024,
+   };
+   struct anv_shader_alloc allocs[ARRAY_SIZE(sizes)];
+
+   for (uint32_t i = 0; i < ARRAY_SIZE(sizes); i++) {
+      allocs[i] = anv_shader_heap_alloc(&heap, sizes[i], 64, capture_replay, 0);
+      assert(allocs[i].alloc_size != 0);
+   }
+
+   anv_shader_heap_finish(&heap);
+   anv_bo_cache_finish(&device.bo_cache);
+   pthread_mutex_destroy(&device.mutex);
+}
+
+/* Regular allocations are placed top-down in the heap. */
+void shader_heap_small_allocs_hi(void)
+{
+   shader_heap_small_allocs(false);
+}
+
+/* Capture/replay allocations are placed bottom-up. */
+void shader_heap_small_allocs_lo(void)
+{
+   shader_heap_small_allocs(true);
+}