winsys/amdgpu: don't layer slabs, use only 1 level of slabs, it improves perf

This increases FPS in VP2020/Catia1 by 10-18%!

I have no rational explanation for this.

In the most extreme case, 8192 256B slab BOs (smallest size) are now
allocated from a single 2MB slab.
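
(For scale: the single slab allocator now covers entry sizes from 256 B to
1 MB, and a slab is twice the size of its largest possible entry, i.e. 2 MB;
2 MB / 256 B = 8192 entries.)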

Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26643>
Marek Olšák, 2023-12-08 20:10:11 -05:00, committed by Marge Bot
parent 4a078e693e
commit cf2dc2d512
3 changed files with 49 additions and 101 deletions

src/gallium/winsys/amdgpu/drm/amdgpu_bo.c

@@ -229,9 +229,7 @@ static void amdgpu_bo_destroy_or_cache(struct radeon_winsys *rws, struct pb_buff
 static void amdgpu_clean_up_buffer_managers(struct amdgpu_winsys *ws)
 {
-   for (unsigned i = 0; i < NUM_SLAB_ALLOCATORS; i++)
-      pb_slabs_reclaim(&ws->bo_slabs[i]);
+   pb_slabs_reclaim(&ws->bo_slabs);

    pb_cache_release_all_buffers(&ws->bo_cache);
 }
@@ -615,25 +613,11 @@ bool amdgpu_bo_can_reclaim_slab(void *priv, struct pb_slab_entry *entry)
    return amdgpu_bo_can_reclaim(priv, &bo->b.base);
 }

-static struct pb_slabs *get_slabs(struct amdgpu_winsys *ws, uint64_t size)
-{
-   /* Find the correct slab allocator for the given size. */
-   for (unsigned i = 0; i < NUM_SLAB_ALLOCATORS; i++) {
-      struct pb_slabs *slabs = &ws->bo_slabs[i];
-
-      if (size <= 1 << (slabs->min_order + slabs->num_orders - 1))
-         return slabs;
-   }
-
-   assert(0);
-   return NULL;
-}
-
 static unsigned get_slab_wasted_size(struct amdgpu_winsys *ws, struct amdgpu_bo_slab_entry *bo)
 {
    assert(bo->b.base.size <= bo->entry.slab->entry_size);
    assert(bo->b.base.size < (1 << bo->b.base.alignment_log2) ||
-          bo->b.base.size < 1 << ws->bo_slabs[0].min_order ||
+          bo->b.base.size < 1 << ws->bo_slabs.min_order ||
           bo->b.base.size > bo->entry.slab->entry_size / 2);

    return bo->entry.slab->entry_size - bo->b.base.size;
 }
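
For readers unfamiliar with pb_slabs: an allocator with min_order and
num_orders serves entry sizes up to 1 << (min_order + num_orders - 1), which
is exactly the range check the removed get_slabs() loop performed per
allocator. A standalone sketch of that check with the post-commit order range
(illustrative values, not Mesa code):

#include <assert.h>
#include <stdint.h>

struct slabs { unsigned min_order, num_orders; };

/* True if this allocator can hold an entry of the given size. */
static int covers(const struct slabs *s, uint64_t size)
{
   return size <= (uint64_t)1 << (s->min_order + s->num_orders - 1);
}

int main(void)
{
   /* The single post-commit allocator: orders 8..20 (256 B..1 MB). */
   struct slabs s = { .min_order = 8, .num_orders = 13 };

   assert(covers(&s, 256));             /* smallest entry */
   assert(covers(&s, 1u << 20));        /* largest entry, 1 MB */
   assert(!covers(&s, (1u << 20) + 1)); /* must go to the non-slab path */
   return 0;
}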
@@ -642,23 +626,20 @@ static void amdgpu_bo_slab_destroy(struct radeon_winsys *rws, struct pb_buffer *
 {
    struct amdgpu_winsys *ws = amdgpu_winsys(rws);
    struct amdgpu_bo_slab_entry *bo = get_slab_entry_bo(amdgpu_winsys_bo(_buf));
-   struct pb_slabs *slabs;
-
-   slabs = get_slabs(ws, bo->b.base.size);

    if (bo->b.base.placement & RADEON_DOMAIN_VRAM)
       ws->slab_wasted_vram -= get_slab_wasted_size(ws, bo);
    else
       ws->slab_wasted_gtt -= get_slab_wasted_size(ws, bo);

-   pb_slab_free(slabs, &bo->entry);
+   pb_slab_free(&ws->bo_slabs, &bo->entry);
 }

 /* Return the power of two size of a slab entry matching the input size. */
 static unsigned get_slab_pot_entry_size(struct amdgpu_winsys *ws, unsigned size)
 {
    unsigned entry_size = util_next_power_of_two(size);
-   unsigned min_entry_size = 1 << ws->bo_slabs[0].min_order;
+   unsigned min_entry_size = 1 << ws->bo_slabs.min_order;

    return MAX2(entry_size, min_entry_size);
 }
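
get_slab_pot_entry_size() rounds a request up to a power of two and clamps it
to the allocator's minimum entry size. A standalone sketch of that rounding,
with a local next_pow2() standing in for util_next_power_of_two()
(illustrative, not Mesa code):

#include <assert.h>

/* Smallest power of two >= x (assumes x >= 1). */
static unsigned next_pow2(unsigned x)
{
   unsigned p = 1;
   while (p < x)
      p <<= 1;
   return p;
}

int main(void)
{
   unsigned min_entry_size = 1u << 8; /* bo_slabs.min_order == 8 */
   unsigned e;

   e = next_pow2(300);     /* 300 B rounds up to 512 B */
   assert(e == 512);

   e = next_pow2(100);     /* 128 B, below the minimum... */
   if (e < min_entry_size)
      e = min_entry_size;  /* ...so clamp to 256 B */
   assert(e == 256);
   return 0;
}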
@@ -682,44 +663,37 @@ struct pb_slab *amdgpu_bo_slab_alloc(void *priv, unsigned heap, unsigned entry_s
    enum radeon_bo_domain domains = radeon_domain_from_heap(heap);
    enum radeon_bo_flag flags = radeon_flags_from_heap(heap);
    uint32_t base_id;
-   unsigned slab_size = 0;

    if (!slab)
       return NULL;

-   /* Determine the slab buffer size. */
-   for (unsigned i = 0; i < NUM_SLAB_ALLOCATORS; i++) {
-      unsigned max_entry_size = 1 << (ws->bo_slabs[i].min_order + ws->bo_slabs[i].num_orders - 1);
-
-      if (entry_size <= max_entry_size) {
-         /* The slab size is twice the size of the largest possible entry. */
-         slab_size = max_entry_size * 2;
-
-         if (!util_is_power_of_two_nonzero(entry_size)) {
-            assert(util_is_power_of_two_nonzero(entry_size * 4 / 3));
-
-            /* If the entry size is 3/4 of a power of two, we would waste space and not gain
-             * anything if we allocated only twice the power of two for the backing buffer:
-             *   2 * 3/4 = 1.5 usable with buffer size 2
-             *
-             * Allocating 5 times the entry size leads us to the next power of two and results
-             * in a much better memory utilization:
-             *   5 * 3/4 = 3.75 usable with buffer size 4
-             */
-            if (entry_size * 5 > slab_size)
-               slab_size = util_next_power_of_two(entry_size * 5);
-         }
-
-         /* The largest slab should have the same size as the PTE fragment
-          * size to get faster address translation.
-          */
-         if (i == NUM_SLAB_ALLOCATORS - 1 &&
-             slab_size < ws->info.pte_fragment_size)
-            slab_size = ws->info.pte_fragment_size;
-         break;
-      }
-   }
-   assert(slab_size != 0);
+   unsigned max_entry_size = 1 << (ws->bo_slabs.min_order + ws->bo_slabs.num_orders - 1);
+   assert(entry_size <= max_entry_size);
+
+   /* The slab size is twice the size of the largest possible entry. */
+   unsigned slab_size = max_entry_size * 2;
+
+   if (!util_is_power_of_two_nonzero(entry_size)) {
+      assert(util_is_power_of_two_nonzero(entry_size * 4 / 3));
+
+      /* If the entry size is 3/4 of a power of two, we would waste space and not gain
+       * anything if we allocated only twice the power of two for the backing buffer:
+       *   2 * 3/4 = 1.5 usable with buffer size 2
+       *
+       * Allocating 5 times the entry size leads us to the next power of two and results
+       * in a much better memory utilization:
+       *   5 * 3/4 = 3.75 usable with buffer size 4
+       */
+      if (entry_size * 5 > slab_size)
+         slab_size = util_next_power_of_two(entry_size * 5);
+   }
+
+   /* The largest slab should have the same size as the PTE fragment
+    * size to get faster address translation.
+    */
+   slab_size = MAX2(slab_size, ws->info.pte_fragment_size);

    slab->buffer = amdgpu_winsys_bo(amdgpu_bo_create(ws,
                                                     slab_size, slab_size,
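
The 3/4-of-a-power-of-two case in the comment above can be checked
numerically. A standalone sketch (not Mesa code) for entry_size = 768 KB, the
worst such case under the 1 MB maximum entry size:

#include <assert.h>

static unsigned next_pow2(unsigned x)
{
   unsigned p = 1;
   while (p < x)
      p <<= 1;
   return p;
}

int main(void)
{
   unsigned max_entry_size = 1u << 20;      /* largest entry: 1 MB */
   unsigned entry_size = 3u << 18;          /* 768 KB = 3/4 of 1 MB */
   unsigned slab_size = max_entry_size * 2; /* default slab: 2 MB */

   /* A 2 MB slab would hold only 2 entries: 1.5 MB usable out of 2 MB. */
   assert(slab_size / entry_size == 2);

   if (entry_size * 5 > slab_size)
      slab_size = next_pow2(entry_size * 5); /* 3.75 MB -> 4 MB */

   /* A 4 MB slab holds 5 entries: 3.75 MB usable, ~94% utilization. */
   assert(slab_size == 4u << 20);
   assert(slab_size / entry_size == 5);
   return 0;
}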
@@ -727,6 +701,7 @@ struct pb_slab *amdgpu_bo_slab_alloc(void *priv, unsigned heap, unsigned entry_s
    if (!slab->buffer)
       goto fail;

+   /* We can get a buffer from pb_cache that is slightly larger. */
    slab_size = slab->buffer->base.size;

    slab->base.num_entries = slab_size / entry_size;
@@ -751,13 +726,9 @@ struct pb_slab *amdgpu_bo_slab_alloc(void *priv, unsigned heap, unsigned entry_s
       bo->b.va = slab->buffer->va + i * entry_size;
       bo->b.unique_id = base_id + i;

-      if (is_real_bo(slab->buffer)) {
-         /* The slab is not suballocated. */
-         bo->real = get_real_bo(slab->buffer);
-      } else {
-         /* The slab is allocated out of a bigger slab. */
-         bo->real = get_slab_entry_bo(slab->buffer)->real;
-      }
+      /* The slab is not suballocated. */
+      assert(is_real_bo(slab->buffer));
+      bo->real = get_real_bo(slab->buffer);

       bo->entry.slab = &slab->base;
       list_addtail(&bo->entry.head, &slab->base.free);
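
Before this change a slab's backing buffer could itself be a slab entry, so
the real BO was found through the parent entry; now it is always a real BO.
A minimal model of the invariant the new assert encodes (hypothetical types,
not Mesa's):

#include <assert.h>
#include <stddef.h>

enum bo_kind { BO_REAL, BO_SLAB_ENTRY };

struct bo {
   enum bo_kind kind;
   struct bo *parent; /* for BO_SLAB_ENTRY: the buffer it lives in */
};

/* With layered slabs, finding the backing real BO could take hops. */
static struct bo *resolve_real(struct bo *b)
{
   while (b->kind == BO_SLAB_ENTRY)
      b = b->parent;
   return b;
}

int main(void)
{
   struct bo real = { BO_REAL, NULL };
   struct bo layered = { BO_SLAB_ENTRY, &real }; /* old: slab in a slab */
   struct bo flat = { BO_REAL, NULL };           /* new: always a real BO */

   assert(resolve_real(&layered) == &real); /* one extra dereference */
   assert(resolve_real(&flat) == &flat);    /* resolved immediately */
   return 0;
}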
@@ -1358,8 +1329,7 @@ amdgpu_bo_create(struct amdgpu_winsys *ws,
       return amdgpu_bo_sparse_create(ws, size, domain, flags);
    }

-   struct pb_slabs *last_slab = &ws->bo_slabs[NUM_SLAB_ALLOCATORS - 1];
-   unsigned max_slab_entry_size = 1 << (last_slab->min_order + last_slab->num_orders - 1);
+   unsigned max_slab_entry_size = 1 << (ws->bo_slabs.min_order + ws->bo_slabs.num_orders - 1);
    int heap = radeon_get_heap_index(domain, flags);

    /* Sub-allocate small buffers from slabs. */
@@ -1387,13 +1357,12 @@ amdgpu_bo_create(struct amdgpu_winsys *ws,
       }
    }

-   struct pb_slabs *slabs = get_slabs(ws, alloc_size);
-   entry = pb_slab_alloc(slabs, alloc_size, heap);
+   entry = pb_slab_alloc(&ws->bo_slabs, alloc_size, heap);
    if (!entry) {
       /* Clean up buffer managers and try again. */
       amdgpu_clean_up_buffer_managers(ws);

-      entry = pb_slab_alloc(slabs, alloc_size, heap);
+      entry = pb_slab_alloc(&ws->bo_slabs, alloc_size, heap);
    }
    if (!entry)
       return NULL;
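
With a single allocator, the slab-or-not decision in amdgpu_bo_create()
reduces to one comparison against the largest slab entry size. A standalone
check of that cutoff, assuming the order range set up in
amdgpu_winsys_create() below:

#include <assert.h>

int main(void)
{
   unsigned min_order = 8, num_orders = 13; /* orders 8..20 */
   unsigned max_slab_entry_size = 1u << (min_order + num_orders - 1);

   assert(max_slab_entry_size == 1u << 20);   /* 1 MB */
   assert(4096 <= max_slab_entry_size);       /* 4 KB BO: suballocated */
   assert((16u << 20) > max_slab_entry_size); /* 16 MB BO: regular path */
   return 0;
}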

src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c

@@ -76,10 +76,8 @@ static void do_winsys_deinit(struct amdgpu_winsys *ws)
    util_queue_destroy(&ws->cs_queue);

    simple_mtx_destroy(&ws->bo_fence_lock);
-   for (unsigned i = 0; i < NUM_SLAB_ALLOCATORS; i++) {
-      if (ws->bo_slabs[i].groups)
-         pb_slabs_deinit(&ws->bo_slabs[i]);
-   }
+   if (ws->bo_slabs.groups)
+      pb_slabs_deinit(&ws->bo_slabs);
    pb_cache_deinit(&ws->bo_cache);
    _mesa_hash_table_destroy(ws->bo_export_table, NULL);
    simple_mtx_destroy(&ws->sws_list_lock);
@@ -454,35 +452,22 @@ amdgpu_winsys_create(int fd, const struct pipe_screen_config *config,
                  * is a struct pointer instead of void*. */
                 (void*)amdgpu_bo_destroy, (void*)amdgpu_bo_can_reclaim);

-   unsigned min_slab_order = 8;  /* 256 bytes */
-   unsigned max_slab_order = 20; /* 1 MB (slab size = 2 MB) */
-   unsigned num_slab_orders_per_allocator = (max_slab_order - min_slab_order) /
-                                            NUM_SLAB_ALLOCATORS;
-
-   /* Divide the size order range among slab managers. */
-   for (unsigned i = 0; i < NUM_SLAB_ALLOCATORS; i++) {
-      unsigned min_order = min_slab_order;
-      unsigned max_order = MIN2(min_order + num_slab_orders_per_allocator,
-                                max_slab_order);
-
-      if (!pb_slabs_init(&aws->bo_slabs[i],
-                         min_order, max_order,
-                         RADEON_NUM_HEAPS, true,
-                         aws,
-                         amdgpu_bo_can_reclaim_slab,
-                         amdgpu_bo_slab_alloc,
-                         /* Cast to void* because one of the function parameters
-                          * is a struct pointer instead of void*. */
-                         (void*)amdgpu_bo_slab_free)) {
-         amdgpu_winsys_destroy(&ws->base);
-         simple_mtx_unlock(&dev_tab_mutex);
-         return NULL;
-      }
-
-      min_slab_order = max_order + 1;
+   if (!pb_slabs_init(&aws->bo_slabs,
+                      8,  /* min slab entry size: 256 bytes */
+                      20, /* max slab entry size: 1 MB (slab size = 2 MB) */
+                      RADEON_NUM_HEAPS, true,
+                      aws,
+                      amdgpu_bo_can_reclaim_slab,
+                      amdgpu_bo_slab_alloc,
+                      /* Cast to void* because one of the function parameters
+                       * is a struct pointer instead of void*. */
+                      (void*)amdgpu_bo_slab_free)) {
+      amdgpu_winsys_destroy(&ws->base);
+      simple_mtx_unlock(&dev_tab_mutex);
+      return NULL;
    }

-   aws->info.min_alloc_size = 1 << aws->bo_slabs[0].min_order;
+   aws->info.min_alloc_size = 1 << aws->bo_slabs.min_order;

    /* init reference */
    pipe_reference_init(&aws->reference, 1);
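
For comparison, the removed loop split the order range 8..20 across
NUM_SLAB_ALLOCATORS == 3 allocators. A standalone sketch reproducing that
split (illustrative, not Mesa code):

#include <stdio.h>

int main(void)
{
   unsigned min_slab_order = 8, max_slab_order = 20;
   unsigned per_allocator = (max_slab_order - min_slab_order) / 3; /* 4 */

   for (unsigned i = 0; i < 3; i++) {
      unsigned min_order = min_slab_order;
      unsigned max_order = min_order + per_allocator;

      if (max_order > max_slab_order)
         max_order = max_slab_order;
      /* Prints 8..12 (256 B..4 KB), 13..17 (8 KB..128 KB),
       * 18..20 (256 KB..1 MB). */
      printf("allocator %u: orders %u..%u\n", i, min_order, max_order);
      min_slab_order = max_order + 1;
   }
   return 0;
}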

src/gallium/winsys/amdgpu/drm/amdgpu_winsys.h

@@ -17,8 +17,6 @@

 struct amdgpu_cs;

-#define NUM_SLAB_ALLOCATORS 3
-
 /* DRM file descriptors, file descriptions and buffer sharing.
  *
  * amdgpu_device_initialize first argument is a file descriptor (fd)
@@ -70,11 +68,7 @@ struct amdgpu_winsys {
    int fd;

    struct pb_cache bo_cache;
-
-   /* Each slab buffer can only contain suballocations of equal sizes, so we
-    * need to layer the allocators, so that we don't waste too much memory.
-    */
-   struct pb_slabs bo_slabs[NUM_SLAB_ALLOCATORS];
+   struct pb_slabs bo_slabs; /* Slab allocator. */

    amdgpu_device_handle dev;