diff --git a/src/intel/vulkan/anv_allocator.c b/src/intel/vulkan/anv_allocator.c
index c545eba5739..a017c816628 100644
--- a/src/intel/vulkan/anv_allocator.c
+++ b/src/intel/vulkan/anv_allocator.c
@@ -1613,7 +1613,8 @@ anv_device_alloc_bo(struct anv_device *device,
    /* In platforms with LLC we can promote all bos to cached+coherent for free */
    const enum anv_bo_alloc_flags not_allowed_promotion = ANV_BO_ALLOC_SCANOUT |
                                                          ANV_BO_ALLOC_EXTERNAL |
-                                                         ANV_BO_ALLOC_PROTECTED;
+                                                         ANV_BO_ALLOC_PROTECTED |
+                                                         ANV_BO_ALLOC_SLAB_PARENT;
    if (device->info->has_llc &&
        ((alloc_flags & not_allowed_promotion) == 0))
       alloc_flags |= ANV_BO_ALLOC_HOST_COHERENT;
@@ -1683,7 +1684,8 @@ anv_device_alloc_bo(struct anv_device *device,
       .alloc_flags = alloc_flags,
    };
 
-   if (alloc_flags & ANV_BO_ALLOC_MAPPED) {
+   if ((alloc_flags & ANV_BO_ALLOC_MAPPED) &&
+       ((alloc_flags & ANV_BO_ALLOC_SLAB_PARENT) == 0)) {
       VkResult result = anv_device_map_bo(device, &new_bo, 0, size,
                                           NULL, &new_bo.map);
       if (unlikely(result != VK_SUCCESS)) {
diff --git a/src/intel/vulkan/anv_instance.c b/src/intel/vulkan/anv_instance.c
index d478c2d52cd..bcb436f1250 100644
--- a/src/intel/vulkan/anv_instance.c
+++ b/src/intel/vulkan/anv_instance.c
@@ -79,6 +79,7 @@ static const struct debug_control debug_control[] = {
    { "video-decode", ANV_DEBUG_VIDEO_DECODE},
    { "video-encode", ANV_DEBUG_VIDEO_ENCODE},
    { "shader-hash", ANV_DEBUG_SHADER_HASH},
+   { "no-slab", ANV_DEBUG_NO_SLAB},
    { NULL, 0 }
 };
 
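(Editorial note, not part of the patch: debug_control[] in anv_instance.c is parsed from the existing ANV_DEBUG environment variable, so the new entry should let users turn the slab path off at runtime, e.g. ANV_DEBUG=no-slab; anv_slab_bo_init() below then returns before creating any pb_slabs manager and anv_slab_bo_is_disabled() reports the feature as disabled.)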
diff --git a/src/intel/vulkan/anv_private.h b/src/intel/vulkan/anv_private.h
index 6e6d67851ce..f7ea7b99b40 100644
--- a/src/intel/vulkan/anv_private.h
+++ b/src/intel/vulkan/anv_private.h
@@ -456,6 +456,9 @@ enum anv_bo_alloc_flags {
 
    /** Compressed buffer, only supported in Xe2+ */
    ANV_BO_ALLOC_COMPRESSED = (1 << 21),
+
+   /** Specifies that this bo is a slab parent */
+   ANV_BO_ALLOC_SLAB_PARENT = (1 << 22),
 };
 
 /** Specifies that the BO should be cached and coherent. */
@@ -1312,6 +1315,7 @@ enum anv_debug {
    ANV_DEBUG_VIDEO_DECODE = BITFIELD_BIT(5),
    ANV_DEBUG_VIDEO_ENCODE = BITFIELD_BIT(6),
    ANV_DEBUG_SHADER_HASH = BITFIELD_BIT(7),
+   ANV_DEBUG_NO_SLAB = BITFIELD_BIT(8),
 };
 
 struct anv_instance {
@@ -2420,6 +2424,8 @@ void anv_vma_free(struct anv_device *device,
 static inline bool
 anv_bo_is_small_heap(enum anv_bo_alloc_flags alloc_flags)
 {
+   if (alloc_flags & ANV_BO_ALLOC_SLAB_PARENT)
+      return false;
    return alloc_flags & (ANV_BO_ALLOC_DESCRIPTOR_POOL |
                          ANV_BO_ALLOC_DYNAMIC_VISIBLE_POOL |
                          ANV_BO_ALLOC_32BIT_ADDRESS);
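(Editorial note: the allocator below also relies on per-BO and per-device bookkeeping that this header hunk does not add. struct anv_bo is expected to carry a struct pb_slab_entry slab_entry, a slab_parent pointer and a refcount usable with p_atomic_read(), and struct anv_device is expected to provide the bo_slabs[] array of struct pb_slabs managers; those declarations are presumably introduced elsewhere in the series.)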
diff --git a/src/intel/vulkan/anv_slab_bo.c b/src/intel/vulkan/anv_slab_bo.c
index 19b2959d029..81e12a877f9 100644
--- a/src/intel/vulkan/anv_slab_bo.c
+++ b/src/intel/vulkan/anv_slab_bo.c
@@ -4,25 +4,357 @@
 
 #include "anv_slab_bo.h"
 
+enum anv_bo_slab_heap {
+   ANV_BO_SLAB_HEAP_SMEM_CACHED_COHERENT,
+   ANV_BO_SLAB_HEAP_SMEM_CACHED_INCOHERENT,
+   ANV_BO_SLAB_HEAP_SMEM_COHERENT,
+   ANV_BO_SLAB_HEAP_COMPRESSED, /* used by integrated and discrete GPUs */
+   ANV_BO_SLAB_HEAP_LMEM_SMEM,
+   ANV_BO_SLAB_HEAP_LMEM_ONLY,
+   ANV_BO_SLAB_NOT_SUPPORTED,
+};
+
+struct anv_slab {
+   struct pb_slab base;
+
+   /** The BO representing the entire slab */
+   struct anv_bo *bo;
+
+   /** Array of anv_bo structs representing BOs allocated out of this slab */
+   struct anv_bo *entries;
+};
+
+static enum anv_bo_slab_heap
+anv_bo_alloc_flags_to_slab_heap(struct anv_device *device,
+                                enum anv_bo_alloc_flags alloc_flags)
+{
+   enum anv_bo_alloc_flags not_supported = ANV_BO_ALLOC_32BIT_ADDRESS |
+                                           ANV_BO_ALLOC_EXTERNAL |
+                                           ANV_BO_ALLOC_CAPTURE |
+                                           ANV_BO_ALLOC_FIXED_ADDRESS |
+                                           ANV_BO_ALLOC_CLIENT_VISIBLE_ADDRESS |
+                                           ANV_BO_ALLOC_DESCRIPTOR_POOL |
+                                           ANV_BO_ALLOC_LOCAL_MEM_CPU_VISIBLE |
+                                           ANV_BO_ALLOC_SCANOUT |
+                                           ANV_BO_ALLOC_PROTECTED |
+                                           ANV_BO_ALLOC_DYNAMIC_VISIBLE_POOL |
+                                           ANV_BO_ALLOC_IMPORTED |
+                                           ANV_BO_ALLOC_AUX_CCS |
+                                           ANV_BO_ALLOC_SLAB_PARENT;
+
+   if (device->info->kmd_type == INTEL_KMD_TYPE_I915) {
+      not_supported |= (ANV_BO_ALLOC_IMPLICIT_SYNC |
+                        ANV_BO_ALLOC_IMPLICIT_WRITE);
+   }
+
+   if (alloc_flags & not_supported)
+      return ANV_BO_SLAB_NOT_SUPPORTED;
+
+   if (alloc_flags & ANV_BO_ALLOC_COMPRESSED)
+      return ANV_BO_SLAB_HEAP_COMPRESSED;
+
+   if (anv_physical_device_has_vram(device->physical)) {
+      if (alloc_flags & ANV_BO_ALLOC_NO_LOCAL_MEM)
+         return ANV_BO_SLAB_HEAP_SMEM_CACHED_COHERENT;
+      if (alloc_flags & (ANV_BO_ALLOC_MAPPED | ANV_BO_ALLOC_LOCAL_MEM_CPU_VISIBLE))
+         return ANV_BO_SLAB_HEAP_LMEM_SMEM;
+      return ANV_BO_SLAB_HEAP_LMEM_ONLY;
+   }
+
+   if ((alloc_flags & ANV_BO_ALLOC_HOST_CACHED_COHERENT) == ANV_BO_ALLOC_HOST_CACHED_COHERENT)
+      return ANV_BO_SLAB_HEAP_SMEM_CACHED_COHERENT;
+   if (alloc_flags & ANV_BO_ALLOC_HOST_CACHED)
+      return ANV_BO_SLAB_HEAP_SMEM_CACHED_INCOHERENT;
+   return ANV_BO_SLAB_HEAP_SMEM_COHERENT;
+}
+
+/* Return the power of two size of a slab entry matching the input size. */
+static unsigned
+get_slab_pot_entry_size(struct anv_device *device, unsigned size)
+{
+   unsigned entry_size = util_next_power_of_two(size);
+   unsigned min_entry_size = 1 << device->bo_slabs[0].min_order;
+
+   return MAX2(entry_size, min_entry_size);
+}
+
+static struct pb_slabs *
+get_slabs(struct anv_device *device, uint64_t size)
+{
+   const unsigned num_slab_allocator = ARRAY_SIZE(device->bo_slabs);
+
+   for (unsigned i = 0; i < num_slab_allocator; i++) {
+      struct pb_slabs *slabs = &device->bo_slabs[i];
+
+      if (size <= (1ull << (slabs->min_order + slabs->num_orders - 1)))
+         return slabs;
+   }
+
+   unreachable("should have found a valid slab for this size");
+   return NULL;
+}
+
+static inline bool
+anv_slab_bo_is_disabled(struct anv_device *device)
+{
+   return device->bo_slabs[0].num_heaps == 0;
+}
+
 struct anv_bo *
 anv_slab_bo_alloc(struct anv_device *device, const char *name,
                   uint64_t requested_size, uint32_t alignment,
                   enum anv_bo_alloc_flags alloc_flags)
 {
-   return NULL;
+   if (anv_slab_bo_is_disabled(device))
+      return NULL;
+
+   const enum anv_bo_slab_heap slab_heap = anv_bo_alloc_flags_to_slab_heap(device, alloc_flags);
+   if (slab_heap == ANV_BO_SLAB_NOT_SUPPORTED)
+      return NULL;
+
+   const unsigned num_slab_allocator = ARRAY_SIZE(device->bo_slabs);
+   struct pb_slabs *last_slab = &device->bo_slabs[num_slab_allocator - 1];
+   const uint64_t max_slab_entry_size = 1 << (last_slab->min_order + last_slab->num_orders - 1);
+
+   if (requested_size > max_slab_entry_size)
+      return NULL;
+
+   uint64_t alloc_size = MAX2(alignment, requested_size);
+   alloc_size = get_slab_pot_entry_size(device, alloc_size);
+
+   if (alloc_size > max_slab_entry_size)
+      return NULL;
+
+   struct pb_slabs *slabs = get_slabs(device, alloc_size);
+   struct pb_slab_entry *entry = pb_slab_alloc(slabs, alloc_size, slab_heap);
+   if (!entry) {
+      /* Clean up and try again... */
+      pb_slabs_reclaim(slabs);
+
+      entry = pb_slab_alloc(slabs, alloc_size, slab_heap);
+   }
+   if (!entry)
+      return NULL;
+
+   struct anv_bo *bo = container_of(entry, struct anv_bo, slab_entry);
+   bo->name = name;
+   bo->refcount = 1;
+   bo->size = requested_size;
+   bo->alloc_flags = alloc_flags;
+   bo->flags = device->kmd_backend->bo_alloc_flags_to_bo_flags(device, alloc_flags);
+
+   assert(bo->flags == bo->slab_parent->flags);
+   assert((intel_48b_address(bo->offset) & (alignment - 1)) == 0);
+
+   if (alloc_flags & ANV_BO_ALLOC_MAPPED) {
+      if (anv_device_map_bo(device, bo, 0, bo->size, NULL, &bo->map) != VK_SUCCESS) {
+         anv_slab_bo_free(device, bo);
+         return NULL;
+      }
+   }
+
+   return bo;
 }
 
 void
 anv_slab_bo_free(struct anv_device *device, struct anv_bo *bo)
 {
+   assert(bo->slab_parent);
+
+   if (bo->map) {
+      anv_device_unmap_bo(device, bo, bo->map, bo->size, false /* replace */);
+      bo->map = NULL;
+   }
+
+   bo->refcount = 0;
+   pb_slab_free(get_slabs(device, bo->size), &bo->slab_entry);
+}
+
+static unsigned heap_max_get(struct anv_device *device)
+{
+   unsigned ret;
+
+   if (anv_physical_device_has_vram(device->physical))
+      ret = ANV_BO_SLAB_HEAP_LMEM_ONLY;
+   else
+      ret = device->info->verx10 >= 200 ? ANV_BO_SLAB_HEAP_COMPRESSED :
+                                          ANV_BO_SLAB_HEAP_SMEM_COHERENT;
+
+   return (ret + 1);
+}
+
+static bool
+anv_can_reclaim_slab(void *priv, struct pb_slab_entry *entry)
+{
+   struct anv_bo *bo = container_of(entry, struct anv_bo, slab_entry);
+
+   return p_atomic_read(&bo->refcount) == 0;
+}
+
+static struct pb_slab *
+anv_slab_alloc(void *priv,
+               unsigned heap,
+               unsigned entry_size,
+               unsigned group_index)
+{
+   struct anv_device *device = priv;
+   struct anv_slab *slab = calloc(1, sizeof(struct anv_slab));
+
+   if (!slab)
+      return NULL;
+
+   const enum anv_bo_slab_heap bo_slab_heap = heap;
+   enum anv_bo_alloc_flags alloc_flags = ANV_BO_ALLOC_SLAB_PARENT;
+
+   switch (bo_slab_heap) {
+   case ANV_BO_SLAB_HEAP_SMEM_CACHED_COHERENT:
+      alloc_flags |= ANV_BO_ALLOC_HOST_CACHED_COHERENT |
+                     ANV_BO_ALLOC_NO_LOCAL_MEM;
+      break;
+   case ANV_BO_SLAB_HEAP_SMEM_CACHED_INCOHERENT:
+      alloc_flags |= ANV_BO_ALLOC_HOST_CACHED |
+                     ANV_BO_ALLOC_NO_LOCAL_MEM;
+      break;
+   case ANV_BO_SLAB_HEAP_SMEM_COHERENT:
+      alloc_flags |= ANV_BO_ALLOC_HOST_COHERENT |
+                     ANV_BO_ALLOC_NO_LOCAL_MEM;
+      break;
+   case ANV_BO_SLAB_HEAP_COMPRESSED:
+      alloc_flags |= ANV_BO_ALLOC_COMPRESSED;
+      break;
+   case ANV_BO_SLAB_HEAP_LMEM_SMEM:
+      alloc_flags |= ANV_BO_ALLOC_MAPPED;
+      break;
+   case ANV_BO_SLAB_HEAP_LMEM_ONLY:
+      break;
+   default:
+      unreachable("Missing");
+      return NULL;
+   }
+
+   struct pb_slabs *slabs = get_slabs(device, entry_size);
+
+   entry_size = MAX2(entry_size, 1ULL << slabs->min_order);
+   if (!util_is_power_of_two_nonzero(entry_size))
+      entry_size = util_next_power_of_two(entry_size);
+
+   unsigned slab_parent_size = entry_size * 8;
+   /* Allocate at least a 2MB buffer so the KMD can enable THP for this bo. */
+   slab_parent_size = MAX2(slab_parent_size, 2 * 1024 * 1024);
+
+   VkResult result;
+   result = anv_device_alloc_bo(device, "slab_parent", slab_parent_size, alloc_flags,
+                                0, &slab->bo);
+   if (result != VK_SUCCESS)
+      goto error_alloc_bo;
+
+   slab_parent_size = slab->bo->size = slab->bo->actual_size;
+   slab->base.num_entries = slab_parent_size / entry_size;
+   slab->base.num_free = slab->base.num_entries;
+   slab->base.group_index = group_index;
+   slab->base.entry_size = entry_size;
+   slab->entries = calloc(slab->base.num_entries, sizeof(*slab->entries));
+   if (!slab->entries)
+      goto error_alloc_entries;
+
+   list_inithead(&slab->base.free);
+
+   for (unsigned i = 0; i < slab->base.num_entries; i++) {
+      struct anv_bo *bo = &slab->entries[i];
+      uint64_t offset = intel_48b_address(slab->bo->offset);
+
+      offset += (i * entry_size);
+
+      bo->name = "slab_child";
+      bo->gem_handle = slab->bo->gem_handle;
+      bo->refcount = 0;
+      bo->offset = intel_canonical_address(offset);
+      bo->size = entry_size;
+      bo->actual_size = entry_size;
+      bo->alloc_flags = alloc_flags;
+      bo->vma_heap = slab->bo->vma_heap;
+      bo->slab_parent = slab->bo;
+      bo->slab_entry.slab = &slab->base;
+
+      list_addtail(&bo->slab_entry.head, &slab->base.free);
+   }
+
+   return &slab->base;
+
+error_alloc_entries:
+   anv_device_release_bo(device, slab->bo);
+error_alloc_bo:
+   free(slab);
+
+   return NULL;
+}
+
+static void
+anv_slab_free(void *priv, struct pb_slab *pslab)
+{
+   struct anv_device *device = priv;
+   struct anv_slab *slab = (void *)pslab;
+
+   anv_device_release_bo(device, slab->bo);
+
+   free(slab->entries);
+   free(slab);
 }
 
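(Editorial aside, not part of the patch: a quick worked example of the sizing math in anv_slab_alloc() above; the helper name is hypothetical. Entries are rounded up to a power of two and the parent BO holds at least eight of them, but is never smaller than 2 MiB. The diff continues below.)

#include <stdint.h>

/* Hypothetical helper mirroring the sizing math in anv_slab_alloc(). */
static uint64_t
example_slab_parent_size(uint64_t entry_size, uint64_t *num_entries)
{
   /* Round the entry up to a power of two, e.g. a 24 KiB request
    * becomes a 32 KiB entry. */
   uint64_t pot = 1;
   while (pot < entry_size)
      pot <<= 1;

   /* The parent holds at least 8 entries and is at least 2 MiB so the
    * kernel can back it with a transparent huge page. */
   uint64_t parent = pot * 8;
   if (parent < 2ull * 1024 * 1024)
      parent = 2ull * 1024 * 1024;

   /* For the 32 KiB example: a 2 MiB parent with 64 entries. */
   *num_entries = parent / pot;
   return parent;
}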
 bool
 anv_slab_bo_init(struct anv_device *device)
 {
+   const unsigned num_slab_allocator = ARRAY_SIZE(device->bo_slabs);
+   unsigned min_slab_order = 8; /* 256 bytes */
+   const unsigned max_slab_order = 20; /* 1 MB (slab size = 2 MB) */
+   unsigned num_slab_orders_per_allocator = (max_slab_order - min_slab_order) /
+                                            num_slab_allocator;
+
+   if (unlikely(device->physical->instance->debug & ANV_DEBUG_NO_SLAB))
+      return true;
+
+   /* feature requirement */
+   if (!device->info->has_mmap_offset || !device->info->has_partial_mmap_offset)
+      return true;
+
+   /* Divide the size order range among slab managers. */
+   for (unsigned i = 0; i < num_slab_allocator; i++) {
+      const unsigned min_order = min_slab_order;
+      const unsigned max_order = MIN2(min_order + num_slab_orders_per_allocator,
+                                      max_slab_order);
+
+      if (!pb_slabs_init(&device->bo_slabs[i], min_order, max_order,
+                         heap_max_get(device), false, device,
+                         anv_can_reclaim_slab,
+                         anv_slab_alloc,
+                         anv_slab_free)) {
+         goto error_slabs_init;
+      }
+      min_slab_order = max_order + 1;
+   }
+   return true;
+
+error_slabs_init:
+   for (unsigned i = 0; i < num_slab_allocator; i++) {
+      if (!device->bo_slabs[i].groups)
+         break;
+
+      pb_slabs_deinit(&device->bo_slabs[i]);
+   }
+
+   return false;
 }
 
 void
 anv_slab_bo_deinit(struct anv_device *device)
 {
+   const unsigned num_slab_allocator = ARRAY_SIZE(device->bo_slabs);
+
+   if (anv_slab_bo_is_disabled(device))
+      return;
+
+   for (int i = 0; i < num_slab_allocator; i++) {
+      if (device->bo_slabs[i].groups)
+         pb_slabs_deinit(&device->bo_slabs[i]);
+   }
 }
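(Editorial sketch, not part of this diff: a caller inside the driver would be expected to try the slab path first and fall back to a dedicated BO when anv_slab_bo_alloc() declines the request. The wrapper name below is hypothetical; anv_slab_bo_alloc() and anv_device_alloc_bo() are used with the signatures visible above.)

static struct anv_bo *
example_alloc_bo(struct anv_device *device, const char *name,
                 uint64_t size, enum anv_bo_alloc_flags alloc_flags)
{
   /* anv_slab_bo_alloc() returns NULL when slabs are disabled, when the
    * flags map to ANV_BO_SLAB_NOT_SUPPORTED, or when the rounded size
    * exceeds the largest slab entry order. */
   struct anv_bo *bo = anv_slab_bo_alloc(device, name, size,
                                         4 * 1024 /* alignment */, alloc_flags);
   if (bo != NULL)
      return bo;

   /* Fall back to a dedicated GEM allocation. */
   if (anv_device_alloc_bo(device, name, size, alloc_flags,
                           0 /* explicit_address */, &bo) != VK_SUCCESS)
      return NULL;

   return bo;
}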