anv: Use vma_heap for descriptor pool host allocation

Pre-patch, anv_descriptor_pool used a free list for host allocations
that never merged adjacent free blocks.  If the pool only allocated
fixed-sized blocks, then this would not be a problem. But the pool
allocations are variable-sized, and this caused over half of the pool's
memory to be consumed by unusable free blocks in some workloads, causing
unnecessary memory footprint.

Replacing the free list with util_vma_heap, which does merge adjacent
free blocks, fixes the memory explosion in the target workload.

Disdavantges of util_vma_heap compared to the free list:
  - The heap calls malloc() when a new hole is created.
  - The heap calls free() when a hole disappears or is merged with an
    adjacent hole.
  - The Vulkan spec expects descriptor set creation/destruction to be
    thread-local lockless in the common case. For workloads that
    create/destroy with high frequency, malloc/free may cause overhead.
    Profiling is needed.

Tested with a ChromeOS internal TensorFlow benchmark, provided by
package 'tensorflow', running with its OpenCL backend on clvk.

  cmdline: benchmark_model --graph=mn2.tflite --use_gpu=true --min_secs=60
  gpu: adl
  memory footprint from start of benchmark:
    before: init=132.691MB max=227.684MB
    after:  init=134.988MB max=134.988MB

Reported-by: Romaric Jodin <rjodin@google.com>
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/20289>
This commit is contained in:
Chad Versace 2022-12-08 11:28:22 -08:00 committed by Marge Bot
parent 94a6384f1b
commit a5f9e59ce3
2 changed files with 37 additions and 49 deletions

View file

@ -806,13 +806,13 @@ void anv_DestroyPipelineLayout(
/*
* Descriptor pools.
*
* These are implemented using a big pool of memory and a free-list for the
* These are implemented using a big pool of memory and a vma heap for the
* host memory allocations and a state_stream and a free list for the buffer
* view surface state. The spec allows us to fail to allocate due to
* fragmentation in all cases but two: 1) after pool reset, allocating up
* until the pool size with no freeing must succeed and 2) allocating and
* freeing only descriptor sets with the same layout. Case 1) is easy enough,
* and the free lists lets us recycle blocks for case 2).
* and the vma heap ensures case 2).
*/
/* The vma heap reserves 0 to mean NULL; we have to offset by some amount to
@ -897,25 +897,26 @@ VkResult anv_CreateDescriptorPool(
const bool host_only =
pCreateInfo->flags & VK_DESCRIPTOR_POOL_CREATE_HOST_ONLY_BIT_EXT;
/* For host_only pools, allocate some memory to hold the written surface
* states of the internal anv_buffer_view. With normal pools, the memory
* holding surface state is allocated from the device surface_state_pool.
*/
const size_t pool_size =
const size_t host_mem_size =
pCreateInfo->maxSets * sizeof(struct anv_descriptor_set) +
descriptor_count * sizeof(struct anv_descriptor) +
buffer_view_count * sizeof(struct anv_buffer_view) +
(host_only ? buffer_view_count * ANV_SURFACE_STATE_SIZE : 0);
const size_t total_size = sizeof(*pool) + pool_size;
pool = vk_object_alloc(&device->vk, pAllocator, total_size,
pool = vk_object_alloc(&device->vk, pAllocator,
sizeof(*pool) + host_mem_size,
VK_OBJECT_TYPE_DESCRIPTOR_POOL);
if (!pool)
return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
pool->size = pool_size;
pool->next = 0;
pool->free_list = EMPTY;
pool->host_mem_size = host_mem_size;
util_vma_heap_init(&pool->host_heap, POOL_HEAP_OFFSET, host_mem_size);
pool->host_only = host_only;
if (descriptor_bo_size > 0) {
@ -990,8 +991,8 @@ VkResult anv_ResetDescriptorPool(
}
list_inithead(&pool->desc_sets);
pool->next = 0;
pool->free_list = EMPTY;
util_vma_heap_finish(&pool->host_heap);
util_vma_heap_init(&pool->host_heap, POOL_HEAP_OFFSET, pool->host_mem_size);
if (pool->bo) {
util_vma_heap_finish(&pool->bo_heap);
@ -1006,57 +1007,37 @@ VkResult anv_ResetDescriptorPool(
return VK_SUCCESS;
}
struct pool_free_list_entry {
uint32_t next;
uint32_t size;
};
static VkResult
anv_descriptor_pool_alloc_set(struct anv_descriptor_pool *pool,
uint32_t size,
struct anv_descriptor_set **set)
{
if (size <= pool->size - pool->next) {
*set = (struct anv_descriptor_set *) (pool->data + pool->next);
(*set)->size = size;
pool->next += size;
return VK_SUCCESS;
} else {
struct pool_free_list_entry *entry;
uint32_t *link = &pool->free_list;
for (uint32_t f = pool->free_list; f != EMPTY; f = entry->next) {
entry = (struct pool_free_list_entry *) (pool->data + f);
if (size <= entry->size) {
*link = entry->next;
*set = (struct anv_descriptor_set *) entry;
(*set)->size = entry->size;
return VK_SUCCESS;
}
link = &entry->next;
}
uint64_t vma_offset = util_vma_heap_alloc(&pool->host_heap, size, 1);
if (pool->free_list != EMPTY) {
if (vma_offset == 0) {
if (size <= pool->host_heap.free_size) {
return VK_ERROR_FRAGMENTED_POOL;
} else {
return VK_ERROR_OUT_OF_POOL_MEMORY;
}
}
assert(vma_offset >= POOL_HEAP_OFFSET);
uint64_t host_mem_offset = vma_offset - POOL_HEAP_OFFSET;
*set = (struct anv_descriptor_set *) (pool->host_mem + host_mem_offset);
(*set)->size = size;
return VK_SUCCESS;
}
static void
anv_descriptor_pool_free_set(struct anv_descriptor_pool *pool,
struct anv_descriptor_set *set)
{
/* Put the descriptor set allocation back on the free list. */
const uint32_t index = (char *) set - pool->data;
if (index + set->size == pool->next) {
pool->next = index;
} else {
struct pool_free_list_entry *entry = (struct pool_free_list_entry *) set;
entry->next = pool->free_list;
entry->size = set->size;
pool->free_list = (char *) entry - pool->data;
}
util_vma_heap_free(&pool->host_heap,
((char *) set - pool->host_mem) + POOL_HEAP_OFFSET,
set->size);
}
struct surface_state_free_list_entry {

View file

@ -1891,21 +1891,28 @@ anv_descriptor_set_address(struct anv_descriptor_set *set)
struct anv_descriptor_pool {
struct vk_object_base base;
uint32_t size;
uint32_t next;
uint32_t free_list;
struct anv_bo *bo;
struct util_vma_heap bo_heap;
struct anv_state_stream surface_state_stream;
void *surface_state_free_list;
/** List of anv_descriptor_set. */
struct list_head desc_sets;
/** Heap over host_mem */
struct util_vma_heap host_heap;
/** Allocated size of host_mem */
uint32_t host_mem_size;
/**
* VK_DESCRIPTOR_POOL_CREATE_HOST_ONLY_BIT_EXT. If set, then
* surface_state_stream is unused.
*/
bool host_only;
char data[0];
char host_mem[0];
};
size_t