winsys/amdgpu: don't layer slabs, use only 1 level of slabs, it improves perf

This increases FPS in VP2020/Catia1 by 10-18%!

I have no rational explanation for this.

In the most extreme case, 8192 256B slab BOs (smallest size) are now
allocated from a single 2MB slab.
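
(For scale: the single slab allocator now covers entry sizes from 256 B to
1 MB, and a slab is twice the size of its largest possible entry, i.e. 2 MB;
2 MB / 256 B = 8192 entries.)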

Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26643>
Marek Olšák, 2023-12-08 20:10:11 -05:00, committed by Marge Bot
parent 4a078e693e
commit cf2dc2d512
3 changed files with 49 additions and 101 deletions

src/gallium/winsys/amdgpu/drm/amdgpu_bo.c

@@ -229,9 +229,7 @@ static void amdgpu_bo_destroy_or_cache(struct radeon_winsys *rws, struct pb_buff
 static void amdgpu_clean_up_buffer_managers(struct amdgpu_winsys *ws)
 {
-   for (unsigned i = 0; i < NUM_SLAB_ALLOCATORS; i++)
-      pb_slabs_reclaim(&ws->bo_slabs[i]);
+   pb_slabs_reclaim(&ws->bo_slabs);

    pb_cache_release_all_buffers(&ws->bo_cache);
 }
@@ -615,25 +613,11 @@ bool amdgpu_bo_can_reclaim_slab(void *priv, struct pb_slab_entry *entry)
    return amdgpu_bo_can_reclaim(priv, &bo->b.base);
 }

-static struct pb_slabs *get_slabs(struct amdgpu_winsys *ws, uint64_t size)
-{
-   /* Find the correct slab allocator for the given size. */
-   for (unsigned i = 0; i < NUM_SLAB_ALLOCATORS; i++) {
-      struct pb_slabs *slabs = &ws->bo_slabs[i];
-
-      if (size <= 1 << (slabs->min_order + slabs->num_orders - 1))
-         return slabs;
-   }
-
-   assert(0);
-   return NULL;
-}
-
 static unsigned get_slab_wasted_size(struct amdgpu_winsys *ws, struct amdgpu_bo_slab_entry *bo)
 {
    assert(bo->b.base.size <= bo->entry.slab->entry_size);
    assert(bo->b.base.size < (1 << bo->b.base.alignment_log2) ||
-          bo->b.base.size < 1 << ws->bo_slabs[0].min_order ||
+          bo->b.base.size < 1 << ws->bo_slabs.min_order ||
           bo->b.base.size > bo->entry.slab->entry_size / 2);

    return bo->entry.slab->entry_size - bo->b.base.size;
 }
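
For readers unfamiliar with pb_slabs: an allocator with min_order and
num_orders serves entry sizes up to 1 << (min_order + num_orders - 1), which
is exactly the range check the removed get_slabs() loop performed per
allocator. A standalone sketch of that check with the post-commit order range
(illustrative values, not Mesa code):

#include <assert.h>
#include <stdint.h>

struct slabs { unsigned min_order, num_orders; };

/* True if this allocator can hold an entry of the given size. */
static int covers(const struct slabs *s, uint64_t size)
{
   return size <= (uint64_t)1 << (s->min_order + s->num_orders - 1);
}

int main(void)
{
   /* The single post-commit allocator: orders 8..20 (256 B..1 MB). */
   struct slabs s = { .min_order = 8, .num_orders = 13 };

   assert(covers(&s, 256));             /* smallest entry */
   assert(covers(&s, 1u << 20));        /* largest entry, 1 MB */
   assert(!covers(&s, (1u << 20) + 1)); /* must go to the non-slab path */
   return 0;
}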
@@ -642,23 +626,20 @@ static void amdgpu_bo_slab_destroy(struct radeon_winsys *rws, struct pb_buffer *
 {
    struct amdgpu_winsys *ws = amdgpu_winsys(rws);
    struct amdgpu_bo_slab_entry *bo = get_slab_entry_bo(amdgpu_winsys_bo(_buf));
-   struct pb_slabs *slabs;
-
-   slabs = get_slabs(ws, bo->b.base.size);

    if (bo->b.base.placement & RADEON_DOMAIN_VRAM)
       ws->slab_wasted_vram -= get_slab_wasted_size(ws, bo);
    else
       ws->slab_wasted_gtt -= get_slab_wasted_size(ws, bo);

-   pb_slab_free(slabs, &bo->entry);
+   pb_slab_free(&ws->bo_slabs, &bo->entry);
 }

 /* Return the power of two size of a slab entry matching the input size. */
 static unsigned get_slab_pot_entry_size(struct amdgpu_winsys *ws, unsigned size)
 {
    unsigned entry_size = util_next_power_of_two(size);
-   unsigned min_entry_size = 1 << ws->bo_slabs[0].min_order;
+   unsigned min_entry_size = 1 << ws->bo_slabs.min_order;

    return MAX2(entry_size, min_entry_size);
 }
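
get_slab_pot_entry_size() rounds a request up to a power of two and clamps it
to the allocator's minimum entry size. A standalone sketch of that rounding,
with a local next_pow2() standing in for util_next_power_of_two()
(illustrative, not Mesa code):

#include <assert.h>

/* Smallest power of two >= x (assumes x >= 1). */
static unsigned next_pow2(unsigned x)
{
   unsigned p = 1;
   while (p < x)
      p <<= 1;
   return p;
}

int main(void)
{
   unsigned min_entry_size = 1u << 8; /* bo_slabs.min_order == 8 */
   unsigned e;

   e = next_pow2(300);     /* 300 B rounds up to 512 B */
   assert(e == 512);

   e = next_pow2(100);     /* 128 B, below the minimum... */
   if (e < min_entry_size)
      e = min_entry_size;  /* ...so clamp to 256 B */
   assert(e == 256);
   return 0;
}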
@@ -682,44 +663,37 @@ struct pb_slab *amdgpu_bo_slab_alloc(void *priv, unsigned heap, unsigned entry_s
    enum radeon_bo_domain domains = radeon_domain_from_heap(heap);
    enum radeon_bo_flag flags = radeon_flags_from_heap(heap);
    uint32_t base_id;
-   unsigned slab_size = 0;

    if (!slab)
       return NULL;

-   /* Determine the slab buffer size. */
-   for (unsigned i = 0; i < NUM_SLAB_ALLOCATORS; i++) {
-      unsigned max_entry_size = 1 << (ws->bo_slabs[i].min_order + ws->bo_slabs[i].num_orders - 1);
-
-      if (entry_size <= max_entry_size) {
-         /* The slab size is twice the size of the largest possible entry. */
-         slab_size = max_entry_size * 2;
-
-         if (!util_is_power_of_two_nonzero(entry_size)) {
-            assert(util_is_power_of_two_nonzero(entry_size * 4 / 3));
-
-            /* If the entry size is 3/4 of a power of two, we would waste space and not gain
-             * anything if we allocated only twice the power of two for the backing buffer:
-             *   2 * 3/4 = 1.5 usable with buffer size 2
-             *
-             * Allocating 5 times the entry size leads us to the next power of two and results
-             * in a much better memory utilization:
-             *   5 * 3/4 = 3.75 usable with buffer size 4
-             */
-            if (entry_size * 5 > slab_size)
-               slab_size = util_next_power_of_two(entry_size * 5);
-         }
-
-         /* The largest slab should have the same size as the PTE fragment
-          * size to get faster address translation.
-          */
-         if (i == NUM_SLAB_ALLOCATORS - 1 &&
-             slab_size < ws->info.pte_fragment_size)
-            slab_size = ws->info.pte_fragment_size;
-         break;
-      }
-   }
-   assert(slab_size != 0);
+   unsigned max_entry_size = 1 << (ws->bo_slabs.min_order + ws->bo_slabs.num_orders - 1);
+   assert(entry_size <= max_entry_size);
+
+   /* The slab size is twice the size of the largest possible entry. */
+   unsigned slab_size = max_entry_size * 2;
+
+   if (!util_is_power_of_two_nonzero(entry_size)) {
+      assert(util_is_power_of_two_nonzero(entry_size * 4 / 3));
+
+      /* If the entry size is 3/4 of a power of two, we would waste space and not gain
+       * anything if we allocated only twice the power of two for the backing buffer:
+       *   2 * 3/4 = 1.5 usable with buffer size 2
+       *
+       * Allocating 5 times the entry size leads us to the next power of two and results
+       * in a much better memory utilization:
+       *   5 * 3/4 = 3.75 usable with buffer size 4
+       */
+      if (entry_size * 5 > slab_size)
+         slab_size = util_next_power_of_two(entry_size * 5);
+   }
+
+   /* The largest slab should have the same size as the PTE fragment
+    * size to get faster address translation.
+    */
+   slab_size = MAX2(slab_size, ws->info.pte_fragment_size);

    slab->buffer = amdgpu_winsys_bo(amdgpu_bo_create(ws,
                                                     slab_size, slab_size,
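
The 3/4-of-a-power-of-two case in the comment above can be checked
numerically. A standalone sketch (not Mesa code) for entry_size = 768 KB, the
worst such case under the 1 MB maximum entry size:

#include <assert.h>

static unsigned next_pow2(unsigned x)
{
   unsigned p = 1;
   while (p < x)
      p <<= 1;
   return p;
}

int main(void)
{
   unsigned max_entry_size = 1u << 20;      /* largest entry: 1 MB */
   unsigned entry_size = 3u << 18;          /* 768 KB = 3/4 of 1 MB */
   unsigned slab_size = max_entry_size * 2; /* default slab: 2 MB */

   /* A 2 MB slab would hold only 2 entries: 1.5 MB usable out of 2 MB. */
   assert(slab_size / entry_size == 2);

   if (entry_size * 5 > slab_size)
      slab_size = next_pow2(entry_size * 5); /* 3.75 MB -> 4 MB */

   /* A 4 MB slab holds 5 entries: 3.75 MB usable, ~94% utilization. */
   assert(slab_size == 4u << 20);
   assert(slab_size / entry_size == 5);
   return 0;
}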
@@ -727,6 +701,7 @@ struct pb_slab *amdgpu_bo_slab_alloc(void *priv, unsigned heap, unsigned entry_s
    if (!slab->buffer)
       goto fail;

+   /* We can get a buffer from pb_cache that is slightly larger. */
    slab_size = slab->buffer->base.size;

    slab->base.num_entries = slab_size / entry_size;
@@ -751,13 +726,9 @@ struct pb_slab *amdgpu_bo_slab_alloc(void *priv, unsigned heap, unsigned entry_s
       bo->b.va = slab->buffer->va + i * entry_size;
       bo->b.unique_id = base_id + i;

-      if (is_real_bo(slab->buffer)) {
-         /* The slab is not suballocated. */
-         bo->real = get_real_bo(slab->buffer);
-      } else {
-         /* The slab is allocated out of a bigger slab. */
-         bo->real = get_slab_entry_bo(slab->buffer)->real;
-      }
+      /* The slab is not suballocated. */
+      assert(is_real_bo(slab->buffer));
+      bo->real = get_real_bo(slab->buffer);

       bo->entry.slab = &slab->base;
       list_addtail(&bo->entry.head, &slab->base.free);
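
Before this change a slab's backing buffer could itself be a slab entry, so
the real BO was found through the parent entry; now it is always a real BO.
A minimal model of the invariant the new assert encodes (hypothetical types,
not Mesa's):

#include <assert.h>
#include <stddef.h>

enum bo_kind { BO_REAL, BO_SLAB_ENTRY };

struct bo {
   enum bo_kind kind;
   struct bo *parent; /* for BO_SLAB_ENTRY: the buffer it lives in */
};

/* With layered slabs, finding the backing real BO could take hops. */
static struct bo *resolve_real(struct bo *b)
{
   while (b->kind == BO_SLAB_ENTRY)
      b = b->parent;
   return b;
}

int main(void)
{
   struct bo real = { BO_REAL, NULL };
   struct bo layered = { BO_SLAB_ENTRY, &real }; /* old: slab in a slab */
   struct bo flat = { BO_REAL, NULL };           /* new: always a real BO */

   assert(resolve_real(&layered) == &real); /* one extra dereference */
   assert(resolve_real(&flat) == &flat);    /* resolved immediately */
   return 0;
}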
@@ -1358,8 +1329,7 @@ amdgpu_bo_create(struct amdgpu_winsys *ws,
       return amdgpu_bo_sparse_create(ws, size, domain, flags);
    }

-   struct pb_slabs *last_slab = &ws->bo_slabs[NUM_SLAB_ALLOCATORS - 1];
-   unsigned max_slab_entry_size = 1 << (last_slab->min_order + last_slab->num_orders - 1);
+   unsigned max_slab_entry_size = 1 << (ws->bo_slabs.min_order + ws->bo_slabs.num_orders - 1);
    int heap = radeon_get_heap_index(domain, flags);

    /* Sub-allocate small buffers from slabs. */
@@ -1387,13 +1357,12 @@ amdgpu_bo_create(struct amdgpu_winsys *ws,
       }
    }

-   struct pb_slabs *slabs = get_slabs(ws, alloc_size);
-   entry = pb_slab_alloc(slabs, alloc_size, heap);
+   entry = pb_slab_alloc(&ws->bo_slabs, alloc_size, heap);
    if (!entry) {
       /* Clean up buffer managers and try again. */
       amdgpu_clean_up_buffer_managers(ws);

-      entry = pb_slab_alloc(slabs, alloc_size, heap);
+      entry = pb_slab_alloc(&ws->bo_slabs, alloc_size, heap);
    }
    if (!entry)
       return NULL;
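
With a single allocator, the slab-or-not decision in amdgpu_bo_create()
reduces to one comparison against the largest slab entry size. A standalone
check of that cutoff, assuming the order range set up in
amdgpu_winsys_create() below:

#include <assert.h>

int main(void)
{
   unsigned min_order = 8, num_orders = 13; /* orders 8..20 */
   unsigned max_slab_entry_size = 1u << (min_order + num_orders - 1);

   assert(max_slab_entry_size == 1u << 20);   /* 1 MB */
   assert(4096 <= max_slab_entry_size);       /* 4 KB BO: suballocated */
   assert((16u << 20) > max_slab_entry_size); /* 16 MB BO: regular path */
   return 0;
}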

src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c

@@ -76,10 +76,8 @@ static void do_winsys_deinit(struct amdgpu_winsys *ws)
    util_queue_destroy(&ws->cs_queue);

    simple_mtx_destroy(&ws->bo_fence_lock);
-   for (unsigned i = 0; i < NUM_SLAB_ALLOCATORS; i++) {
-      if (ws->bo_slabs[i].groups)
-         pb_slabs_deinit(&ws->bo_slabs[i]);
-   }
+   if (ws->bo_slabs.groups)
+      pb_slabs_deinit(&ws->bo_slabs);
    pb_cache_deinit(&ws->bo_cache);
    _mesa_hash_table_destroy(ws->bo_export_table, NULL);
    simple_mtx_destroy(&ws->sws_list_lock);
@@ -454,35 +452,22 @@ amdgpu_winsys_create(int fd, const struct pipe_screen_config *config,
                  * is a struct pointer instead of void*. */
                 (void*)amdgpu_bo_destroy, (void*)amdgpu_bo_can_reclaim);

-   unsigned min_slab_order = 8;  /* 256 bytes */
-   unsigned max_slab_order = 20; /* 1 MB (slab size = 2 MB) */
-   unsigned num_slab_orders_per_allocator = (max_slab_order - min_slab_order) /
-                                            NUM_SLAB_ALLOCATORS;
-
-   /* Divide the size order range among slab managers. */
-   for (unsigned i = 0; i < NUM_SLAB_ALLOCATORS; i++) {
-      unsigned min_order = min_slab_order;
-      unsigned max_order = MIN2(min_order + num_slab_orders_per_allocator,
-                                max_slab_order);
-
-      if (!pb_slabs_init(&aws->bo_slabs[i],
-                         min_order, max_order,
-                         RADEON_NUM_HEAPS, true,
-                         aws,
-                         amdgpu_bo_can_reclaim_slab,
-                         amdgpu_bo_slab_alloc,
-                         /* Cast to void* because one of the function parameters
-                          * is a struct pointer instead of void*. */
-                         (void*)amdgpu_bo_slab_free)) {
-         amdgpu_winsys_destroy(&ws->base);
-         simple_mtx_unlock(&dev_tab_mutex);
-         return NULL;
-      }
-
-      min_slab_order = max_order + 1;
+   if (!pb_slabs_init(&aws->bo_slabs,
+                      8,  /* min slab entry size: 256 bytes */
+                      20, /* max slab entry size: 1 MB (slab size = 2 MB) */
+                      RADEON_NUM_HEAPS, true,
+                      aws,
+                      amdgpu_bo_can_reclaim_slab,
+                      amdgpu_bo_slab_alloc,
+                      /* Cast to void* because one of the function parameters
+                       * is a struct pointer instead of void*. */
+                      (void*)amdgpu_bo_slab_free)) {
+      amdgpu_winsys_destroy(&ws->base);
+      simple_mtx_unlock(&dev_tab_mutex);
+      return NULL;
    }

-   aws->info.min_alloc_size = 1 << aws->bo_slabs[0].min_order;
+   aws->info.min_alloc_size = 1 << aws->bo_slabs.min_order;

    /* init reference */
    pipe_reference_init(&aws->reference, 1);
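
For comparison, the removed loop split the order range 8..20 across
NUM_SLAB_ALLOCATORS == 3 allocators. A standalone sketch reproducing that
split (illustrative, not Mesa code):

#include <stdio.h>

int main(void)
{
   unsigned min_slab_order = 8, max_slab_order = 20;
   unsigned per_allocator = (max_slab_order - min_slab_order) / 3; /* 4 */

   for (unsigned i = 0; i < 3; i++) {
      unsigned min_order = min_slab_order;
      unsigned max_order = min_order + per_allocator;

      if (max_order > max_slab_order)
         max_order = max_slab_order;
      /* Prints 8..12 (256 B..4 KB), 13..17 (8 KB..128 KB),
       * 18..20 (256 KB..1 MB). */
      printf("allocator %u: orders %u..%u\n", i, min_order, max_order);
      min_slab_order = max_order + 1;
   }
   return 0;
}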

src/gallium/winsys/amdgpu/drm/amdgpu_winsys.h

@@ -17,8 +17,6 @@

 struct amdgpu_cs;

-#define NUM_SLAB_ALLOCATORS 3
-
 /* DRM file descriptors, file descriptions and buffer sharing.
  *
  * amdgpu_device_initialize first argument is a file descriptor (fd)
@@ -70,11 +68,7 @@ struct amdgpu_winsys {
    int fd;

    struct pb_cache bo_cache;
-
-   /* Each slab buffer can only contain suballocations of equal sizes, so we
-    * need to layer the allocators, so that we don't waste too much memory.
-    */
-   struct pb_slabs bo_slabs[NUM_SLAB_ALLOCATORS];
+   struct pb_slabs bo_slabs; /* Slab allocator. */

    amdgpu_device_handle dev;