mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-05-04 18:28:12 +02:00
winsys/amdgpu: don't layer slabs, use only 1 level of slabs, it improves perf
This increases FPS in VP2020/Catia1 by 10-18%!!!!!!!!!!!!!!!!!!!!!!! I have no rational explanation for this.

In the most extreme case, 8192 256B slab BOs (smallest size) are now allocated from a single 2MB slab.

Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26643>
This commit is contained in:
parent
4a078e693e
commit
cf2dc2d512
3 changed files with 49 additions and 101 deletions
|
|
@ -229,9 +229,7 @@ static void amdgpu_bo_destroy_or_cache(struct radeon_winsys *rws, struct pb_buff
|
|||
|
||||
static void amdgpu_clean_up_buffer_managers(struct amdgpu_winsys *ws)
|
||||
{
|
||||
for (unsigned i = 0; i < NUM_SLAB_ALLOCATORS; i++)
|
||||
pb_slabs_reclaim(&ws->bo_slabs[i]);
|
||||
|
||||
pb_slabs_reclaim(&ws->bo_slabs);
|
||||
pb_cache_release_all_buffers(&ws->bo_cache);
|
||||
}
|
||||
|
||||
|
|
@ -615,25 +613,11 @@ bool amdgpu_bo_can_reclaim_slab(void *priv, struct pb_slab_entry *entry)
|
|||
return amdgpu_bo_can_reclaim(priv, &bo->b.base);
|
||||
}
|
||||
|
||||
static struct pb_slabs *get_slabs(struct amdgpu_winsys *ws, uint64_t size)
|
||||
{
|
||||
/* Find the correct slab allocator for the given size. */
|
||||
for (unsigned i = 0; i < NUM_SLAB_ALLOCATORS; i++) {
|
||||
struct pb_slabs *slabs = &ws->bo_slabs[i];
|
||||
|
||||
if (size <= 1 << (slabs->min_order + slabs->num_orders - 1))
|
||||
return slabs;
|
||||
}
|
||||
|
||||
assert(0);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static unsigned get_slab_wasted_size(struct amdgpu_winsys *ws, struct amdgpu_bo_slab_entry *bo)
|
||||
{
|
||||
assert(bo->b.base.size <= bo->entry.slab->entry_size);
|
||||
assert(bo->b.base.size < (1 << bo->b.base.alignment_log2) ||
|
||||
bo->b.base.size < 1 << ws->bo_slabs[0].min_order ||
|
||||
bo->b.base.size < 1 << ws->bo_slabs.min_order ||
|
||||
bo->b.base.size > bo->entry.slab->entry_size / 2);
|
||||
return bo->entry.slab->entry_size - bo->b.base.size;
|
||||
}
|
||||
|
|
@ -642,23 +626,20 @@ static void amdgpu_bo_slab_destroy(struct radeon_winsys *rws, struct pb_buffer *
|
|||
{
|
||||
struct amdgpu_winsys *ws = amdgpu_winsys(rws);
|
||||
struct amdgpu_bo_slab_entry *bo = get_slab_entry_bo(amdgpu_winsys_bo(_buf));
|
||||
struct pb_slabs *slabs;
|
||||
|
||||
slabs = get_slabs(ws, bo->b.base.size);
|
||||
|
||||
if (bo->b.base.placement & RADEON_DOMAIN_VRAM)
|
||||
ws->slab_wasted_vram -= get_slab_wasted_size(ws, bo);
|
||||
else
|
||||
ws->slab_wasted_gtt -= get_slab_wasted_size(ws, bo);
|
||||
|
||||
pb_slab_free(slabs, &bo->entry);
|
||||
pb_slab_free(&ws->bo_slabs, &bo->entry);
|
||||
}
|
||||
|
||||
/* Return the power of two size of a slab entry matching the input size. */
|
||||
static unsigned get_slab_pot_entry_size(struct amdgpu_winsys *ws, unsigned size)
|
||||
{
|
||||
unsigned entry_size = util_next_power_of_two(size);
|
||||
unsigned min_entry_size = 1 << ws->bo_slabs[0].min_order;
|
||||
unsigned min_entry_size = 1 << ws->bo_slabs.min_order;
|
||||
|
||||
return MAX2(entry_size, min_entry_size);
|
||||
}
|
||||
|
|
@ -682,44 +663,37 @@ struct pb_slab *amdgpu_bo_slab_alloc(void *priv, unsigned heap, unsigned entry_s
|
|||
enum radeon_bo_domain domains = radeon_domain_from_heap(heap);
|
||||
enum radeon_bo_flag flags = radeon_flags_from_heap(heap);
|
||||
uint32_t base_id;
|
||||
unsigned slab_size = 0;
|
||||
|
||||
if (!slab)
|
||||
return NULL;
|
||||
|
||||
/* Determine the slab buffer size. */
|
||||
for (unsigned i = 0; i < NUM_SLAB_ALLOCATORS; i++) {
|
||||
unsigned max_entry_size = 1 << (ws->bo_slabs[i].min_order + ws->bo_slabs[i].num_orders - 1);
|
||||
unsigned max_entry_size = 1 << (ws->bo_slabs.min_order + ws->bo_slabs.num_orders - 1);
|
||||
|
||||
if (entry_size <= max_entry_size) {
|
||||
/* The slab size is twice the size of the largest possible entry. */
|
||||
slab_size = max_entry_size * 2;
|
||||
assert(entry_size <= max_entry_size);
|
||||
|
||||
if (!util_is_power_of_two_nonzero(entry_size)) {
|
||||
assert(util_is_power_of_two_nonzero(entry_size * 4 / 3));
|
||||
/* The slab size is twice the size of the largest possible entry. */
|
||||
unsigned slab_size = max_entry_size * 2;
|
||||
|
||||
/* If the entry size is 3/4 of a power of two, we would waste space and not gain
|
||||
* anything if we allocated only twice the power of two for the backing buffer:
|
||||
* 2 * 3/4 = 1.5 usable with buffer size 2
|
||||
*
|
||||
* Allocating 5 times the entry size leads us to the next power of two and results
|
||||
* in a much better memory utilization:
|
||||
* 5 * 3/4 = 3.75 usable with buffer size 4
|
||||
*/
|
||||
if (entry_size * 5 > slab_size)
|
||||
slab_size = util_next_power_of_two(entry_size * 5);
|
||||
}
|
||||
if (!util_is_power_of_two_nonzero(entry_size)) {
|
||||
assert(util_is_power_of_two_nonzero(entry_size * 4 / 3));
|
||||
|
||||
/* The largest slab should have the same size as the PTE fragment
|
||||
* size to get faster address translation.
|
||||
*/
|
||||
if (i == NUM_SLAB_ALLOCATORS - 1 &&
|
||||
slab_size < ws->info.pte_fragment_size)
|
||||
slab_size = ws->info.pte_fragment_size;
|
||||
break;
|
||||
}
|
||||
/* If the entry size is 3/4 of a power of two, we would waste space and not gain
|
||||
* anything if we allocated only twice the power of two for the backing buffer:
|
||||
* 2 * 3/4 = 1.5 usable with buffer size 2
|
||||
*
|
||||
* Allocating 5 times the entry size leads us to the next power of two and results
|
||||
* in a much better memory utilization:
|
||||
* 5 * 3/4 = 3.75 usable with buffer size 4
|
||||
*/
|
||||
if (entry_size * 5 > slab_size)
|
||||
slab_size = util_next_power_of_two(entry_size * 5);
|
||||
}
|
||||
assert(slab_size != 0);
|
||||
|
||||
/* The largest slab should have the same size as the PTE fragment
|
||||
* size to get faster address translation.
|
||||
*/
|
||||
slab_size = MAX2(slab_size, ws->info.pte_fragment_size);
|
||||
|
||||
slab->buffer = amdgpu_winsys_bo(amdgpu_bo_create(ws,
|
||||
slab_size, slab_size,
|
||||
|
|
@ -727,6 +701,7 @@ struct pb_slab *amdgpu_bo_slab_alloc(void *priv, unsigned heap, unsigned entry_s
|
|||
if (!slab->buffer)
|
||||
goto fail;
|
||||
|
||||
/* We can get a buffer from pb_cache that is slightly larger. */
|
||||
slab_size = slab->buffer->base.size;
|
||||
|
||||
slab->base.num_entries = slab_size / entry_size;
|
||||
|
|
@ -751,13 +726,9 @@ struct pb_slab *amdgpu_bo_slab_alloc(void *priv, unsigned heap, unsigned entry_s
|
|||
bo->b.va = slab->buffer->va + i * entry_size;
|
||||
bo->b.unique_id = base_id + i;
|
||||
|
||||
if (is_real_bo(slab->buffer)) {
|
||||
/* The slab is not suballocated. */
|
||||
bo->real = get_real_bo(slab->buffer);
|
||||
} else {
|
||||
/* The slab is allocated out of a bigger slab. */
|
||||
bo->real = get_slab_entry_bo(slab->buffer)->real;
|
||||
}
|
||||
/* The slab is not suballocated. */
|
||||
assert(is_real_bo(slab->buffer));
|
||||
bo->real = get_real_bo(slab->buffer);
|
||||
|
||||
bo->entry.slab = &slab->base;
|
||||
list_addtail(&bo->entry.head, &slab->base.free);
|
||||
|
|
@ -1358,8 +1329,7 @@ amdgpu_bo_create(struct amdgpu_winsys *ws,
|
|||
return amdgpu_bo_sparse_create(ws, size, domain, flags);
|
||||
}
|
||||
|
||||
struct pb_slabs *last_slab = &ws->bo_slabs[NUM_SLAB_ALLOCATORS - 1];
|
||||
unsigned max_slab_entry_size = 1 << (last_slab->min_order + last_slab->num_orders - 1);
|
||||
unsigned max_slab_entry_size = 1 << (ws->bo_slabs.min_order + ws->bo_slabs.num_orders - 1);
|
||||
int heap = radeon_get_heap_index(domain, flags);
|
||||
|
||||
/* Sub-allocate small buffers from slabs. */
|
||||
|
|
@ -1387,13 +1357,12 @@ amdgpu_bo_create(struct amdgpu_winsys *ws,
|
|||
}
|
||||
}
|
||||
|
||||
struct pb_slabs *slabs = get_slabs(ws, alloc_size);
|
||||
entry = pb_slab_alloc(slabs, alloc_size, heap);
|
||||
entry = pb_slab_alloc(&ws->bo_slabs, alloc_size, heap);
|
||||
if (!entry) {
|
||||
/* Clean up buffer managers and try again. */
|
||||
amdgpu_clean_up_buffer_managers(ws);
|
||||
|
||||
entry = pb_slab_alloc(slabs, alloc_size, heap);
|
||||
entry = pb_slab_alloc(&ws->bo_slabs, alloc_size, heap);
|
||||
}
|
||||
if (!entry)
|
||||
return NULL;
|
||||
|
|
|
|||
|
|
@ -76,10 +76,8 @@ static void do_winsys_deinit(struct amdgpu_winsys *ws)
|
|||
util_queue_destroy(&ws->cs_queue);
|
||||
|
||||
simple_mtx_destroy(&ws->bo_fence_lock);
|
||||
for (unsigned i = 0; i < NUM_SLAB_ALLOCATORS; i++) {
|
||||
if (ws->bo_slabs[i].groups)
|
||||
pb_slabs_deinit(&ws->bo_slabs[i]);
|
||||
}
|
||||
if (ws->bo_slabs.groups)
|
||||
pb_slabs_deinit(&ws->bo_slabs);
|
||||
pb_cache_deinit(&ws->bo_cache);
|
||||
_mesa_hash_table_destroy(ws->bo_export_table, NULL);
|
||||
simple_mtx_destroy(&ws->sws_list_lock);
|
||||
|
|
@ -454,35 +452,22 @@ amdgpu_winsys_create(int fd, const struct pipe_screen_config *config,
|
|||
* is a struct pointer instead of void*. */
|
||||
(void*)amdgpu_bo_destroy, (void*)amdgpu_bo_can_reclaim);
|
||||
|
||||
unsigned min_slab_order = 8; /* 256 bytes */
|
||||
unsigned max_slab_order = 20; /* 1 MB (slab size = 2 MB) */
|
||||
unsigned num_slab_orders_per_allocator = (max_slab_order - min_slab_order) /
|
||||
NUM_SLAB_ALLOCATORS;
|
||||
|
||||
/* Divide the size order range among slab managers. */
|
||||
for (unsigned i = 0; i < NUM_SLAB_ALLOCATORS; i++) {
|
||||
unsigned min_order = min_slab_order;
|
||||
unsigned max_order = MIN2(min_order + num_slab_orders_per_allocator,
|
||||
max_slab_order);
|
||||
|
||||
if (!pb_slabs_init(&aws->bo_slabs[i],
|
||||
min_order, max_order,
|
||||
RADEON_NUM_HEAPS, true,
|
||||
aws,
|
||||
amdgpu_bo_can_reclaim_slab,
|
||||
amdgpu_bo_slab_alloc,
|
||||
/* Cast to void* because one of the function parameters
|
||||
* is a struct pointer instead of void*. */
|
||||
(void*)amdgpu_bo_slab_free)) {
|
||||
amdgpu_winsys_destroy(&ws->base);
|
||||
simple_mtx_unlock(&dev_tab_mutex);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
min_slab_order = max_order + 1;
|
||||
if (!pb_slabs_init(&aws->bo_slabs,
|
||||
8, /* min slab entry size: 256 bytes */
|
||||
20, /* max slab entry size: 1 MB (slab size = 2 MB) */
|
||||
RADEON_NUM_HEAPS, true,
|
||||
aws,
|
||||
amdgpu_bo_can_reclaim_slab,
|
||||
amdgpu_bo_slab_alloc,
|
||||
/* Cast to void* because one of the function parameters
|
||||
* is a struct pointer instead of void*. */
|
||||
(void*)amdgpu_bo_slab_free)) {
|
||||
amdgpu_winsys_destroy(&ws->base);
|
||||
simple_mtx_unlock(&dev_tab_mutex);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
aws->info.min_alloc_size = 1 << aws->bo_slabs[0].min_order;
|
||||
aws->info.min_alloc_size = 1 << aws->bo_slabs.min_order;
|
||||
|
||||
/* init reference */
|
||||
pipe_reference_init(&aws->reference, 1);
|
||||
|
|
|
|||
|
|
@ -17,8 +17,6 @@
|
|||
|
||||
struct amdgpu_cs;
|
||||
|
||||
#define NUM_SLAB_ALLOCATORS 3
|
||||
|
||||
/* DRM file descriptors, file descriptions and buffer sharing.
|
||||
*
|
||||
* amdgpu_device_initialize first argument is a file descriptor (fd)
|
||||
|
|
@ -70,11 +68,7 @@ struct amdgpu_winsys {
|
|||
int fd;
|
||||
|
||||
struct pb_cache bo_cache;
|
||||
|
||||
/* Each slab buffer can only contain suballocations of equal sizes, so we
|
||||
* need to layer the allocators, so that we don't waste too much memory.
|
||||
*/
|
||||
struct pb_slabs bo_slabs[NUM_SLAB_ALLOCATORS];
|
||||
struct pb_slabs bo_slabs; /* Slab allocator. */
|
||||
|
||||
amdgpu_device_handle dev;
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue