freedreno: Suballocate our long-lived ring objects.

On drawoverhead -test 9 (8 texture changes), this saves us 172 KB of
memory.  That's only ~1% of the GEM memory while the test is running, but
more importantly it saves us 29% of the GEM BO allocations.

Non-TC drawoverhead -test 9 (8 texture changes): throughput +0.449019% +/-
0.336296% (n=100), but this should get better as suballocation density
improves.

Note that this means that all fd_ringbuffer_new_object calls can now
return data aligned to 64 bytes, instead of 4k.  We may find that we need
to increase it if some of our objects (tex consts, sampler consts, etc.)
require more alignment than that.  But this may also help non-drawoverhead
perf if any of our RB objects have a cache in front of them (indirect
consts?), since we no longer have most of our data landing in the same
cache set.
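
A minimal sketch of the bump-pointer scheme described above (illustrative
names only; suballoc_state and new_bo are not the driver's real identifiers,
the actual implementation is msm_ringbuffer_sp_new_object() below):

    #include <stdint.h>

    #define SUBALLOC_BO_SIZE (32 * 1024)

    struct suballoc_state {
       void *bo;          /* current backing buffer (stand-in for an fd_bo) */
       uint32_t bo_size;  /* size of that buffer in bytes */
       uint32_t offset;   /* next free byte within it */
    };

    /* Returns the offset of a size-byte chunk within state->bo, switching to
     * a fresh backing buffer (via the caller-supplied new_bo) when the
     * current one can't fit the request.  The real code also drops its
     * reference to the exhausted BO at that point. */
    static uint32_t
    suballoc(struct suballoc_state *state, uint32_t size,
             void *(*new_bo)(uint32_t bo_size))
    {
       /* 64 bytes covers the largest known hardware requirement (a6xx
        * TEX_CONST at 16 dwords). */
       uint32_t offset = (state->offset + 63) & ~(uint32_t)63;

       if (!state->bo || offset + size > state->bo_size) {
          state->bo_size = size > SUBALLOC_BO_SIZE ? size : SUBALLOC_BO_SIZE;
          state->bo = new_bo(state->bo_size);
          offset = 0;
       }

       state->offset = offset + size;
       return offset;
    }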

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/11697>
Emma Anholt 2021-07-02 10:51:16 -07:00 committed by Marge Bot
parent eefd93c176
commit 737d4caa83
3 changed files with 26 additions and 3 deletions

@@ -171,6 +171,10 @@ static void
 msm_pipe_destroy(struct fd_pipe *pipe)
 {
    struct msm_pipe *msm_pipe = to_msm_pipe(pipe);
+
+   if (msm_pipe->suballoc_bo)
+      fd_bo_del_locked(msm_pipe->suballoc_bo);
+
    close_submitqueue(pipe, msm_pipe->queue_id);
    msm_pipe_sp_ringpool_fini(pipe);
    free(msm_pipe);

@@ -56,6 +56,10 @@ struct msm_pipe {
    uint32_t queue_id;
    struct slab_parent_pool ring_pool;
 
+   /* BO for suballocating long-lived objects on the pipe. */
+   struct fd_bo *suballoc_bo;
+   uint32_t suballoc_offset;
+
    /**
     * The last fence seqno that was flushed to kernel (doesn't mean that it
     * is complete, just that the kernel knows about it)

@@ -42,6 +42,8 @@
 
 #define INIT_SIZE 0x1000
 
+#define SUBALLOC_SIZE (32 * 1024)
+
 /* In the pipe->flush() path, we don't have a util_queue_fence we can wait on,
  * instead use a condition-variable.  Note that pipe->flush() is not expected
  * to be a common/hot path.
@@ -180,7 +182,7 @@ msm_submit_suballoc_ring_bo(struct fd_submit *submit,
 
    if (!suballoc_bo) {
       // TODO possibly larger size for streaming bo?
-      msm_ring->ring_bo = fd_bo_new_ring(submit->pipe->dev, 0x8000);
+      msm_ring->ring_bo = fd_bo_new_ring(submit->pipe->dev, SUBALLOC_SIZE);
       msm_ring->offset = 0;
    } else {
       msm_ring->ring_bo = fd_bo_ref(suballoc_bo);
@@ -811,12 +813,25 @@ msm_ringbuffer_sp_init(struct msm_ringbuffer_sp *msm_ring, uint32_t size,
 struct fd_ringbuffer *
 msm_ringbuffer_sp_new_object(struct fd_pipe *pipe, uint32_t size)
 {
+   struct msm_pipe *msm_pipe = to_msm_pipe(pipe);
    struct msm_ringbuffer_sp *msm_ring = malloc(sizeof(*msm_ring));
 
+   /* Maximum known alignment requirement is a6xx's TEX_CONST at 16 dwords */
+   msm_ring->offset = align(msm_pipe->suballoc_offset, 64);
+   if (!msm_pipe->suballoc_bo ||
+       msm_ring->offset + size > fd_bo_size(msm_pipe->suballoc_bo)) {
+      if (msm_pipe->suballoc_bo)
+         fd_bo_del(msm_pipe->suballoc_bo);
+      msm_pipe->suballoc_bo =
+         fd_bo_new_ring(pipe->dev, MAX2(SUBALLOC_SIZE, align(size, 4096)));
+      msm_ring->offset = 0;
+   }
+
    msm_ring->u.pipe = pipe;
-   msm_ring->offset = 0;
-   msm_ring->ring_bo = fd_bo_new_ring(pipe->dev, size);
+   msm_ring->ring_bo = fd_bo_ref(msm_pipe->suballoc_bo);
    msm_ring->base.refcnt = 1;
 
+   msm_pipe->suballoc_offset = msm_ring->offset + size;
+
    return msm_ringbuffer_sp_init(msm_ring, size, _FD_RINGBUFFER_OBJECT);
 }
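
For a caller's-eye view (a hypothetical helper, not code from the tree):
each long-lived state object built through fd_ringbuffer_new_object() now
becomes a 64-byte-aligned slice of the pipe's shared suballoc_bo instead of
its own GEM allocation:

    #include "freedreno_ringbuffer.h"

    /* Hypothetical helper: builds n small, long-lived state objects.  With
     * the change above they all share the pipe's 32 KB suballoc_bo rather
     * than each costing a separate GEM BO. */
    static void
    build_tex_state_objects(struct fd_pipe *pipe,
                            struct fd_ringbuffer *objs[], int n)
    {
       for (int i = 0; i < n; i++) {
          /* 16 dwords matches the a6xx TEX_CONST size mentioned in the
           * commit message. */
          objs[i] = fd_ringbuffer_new_object(pipe, 16 * 4);
          /* ... emit the texture state dwords into objs[i] here ... */
       }
    }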