winsys/amdgpu: rewrite BO fence tracking by adding a new queue fence system
This decreases the time spent in amdgpu_cs_submit_ib from 15.4% to 8.3% in VP2020/Catia1, which is a 46% decrease in CPU load for that thread. Overall, it increases performance by a small amount in CPU-bound benchmarks. The biggest improvement I have seen is VP2020/Catia2, where it increases FPS by 12%.

It no longer stores pipe_fence_handle references inside amdgpu_winsys_bo. The idea is to have a global fixed list of queues (only 1 queue per IP for now) where each queue generates its own sequence numbers (generated by the winsys, not the kernel). Each queue also has a ring of fences. The sequence numbers are used as indices into the ring of fences, which is how sequence numbers are converted to fences. With that, each BO only has to keep a list of sequence numbers, 1 for each queue. The maximum number of queues is set to 6.

Since the system handles integer wraparounds of sequence numbers correctly, 16-bit sequence numbers in BOs are enough for accurate busyness tracking. Thus, each BO uses only 12 bytes to represent all its fences for all queues, plus a 1-byte bitmask saying which sequence numbers are initialized.

amdgpu_winsys.h contains the complete description. The system has several limitations that exist to minimize the memory footprint and the cost of updating BO fences.

Acked-by: Yogesh Mohan Marimuthu <yogesh.mohanmarimuthu@amd.com>
Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26643>
commit 4d486888ee
parent b976f8fc1e
6 changed files with 323 additions and 167 deletions
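Before the diff, a minimal standalone sketch of the busy-check the commit message describes. Everything below (RING_SIZE, NUM_QUEUES, struct queue, struct bo_fences, bo_busy_on_queue) is a simplified stand-in for illustration only, not the winsys API; the real structures appear in the amdgpu_winsys.h hunk at the end of the diff.

/* Illustration of the per-queue sequence-number scheme: a BO stores only one
 * 16-bit sequence number per queue plus a validity bitmask, and busyness is
 * derived by looking the number up in the queue's fence ring.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define RING_SIZE  32   /* fences remembered per queue (AMDGPU_FENCE_RING_SIZE) */
#define NUM_QUEUES 6    /* one queue per IP (AMDGPU_MAX_QUEUES) */

typedef uint16_t seq_no_t;

struct queue {
   bool fence_busy[RING_SIZE]; /* stand-in for struct pipe_fence_handle* slots */
   seq_no_t latest_seq_no;
};

struct bo_fences {
   seq_no_t seq_no[NUM_QUEUES]; /* last submission that used the BO, per queue */
   uint8_t valid_fence_mask;    /* which seq_no[] entries are initialized */
};

/* Return whether the BO is still busy on "queue_index". */
static bool bo_busy_on_queue(const struct queue *queues, const struct bo_fences *bo,
                             unsigned queue_index)
{
   if (!(bo->valid_fence_mask & (1u << queue_index)))
      return false; /* never used on this queue */

   seq_no_t buffer_seq_no = bo->seq_no[queue_index];
   seq_no_t latest_seq_no = queues[queue_index].latest_seq_no;

   /* Unsigned subtraction keeps this comparison correct across wraparounds
    * as long as the BO's number isn't more than ~UINT16_MAX submissions old. */
   bool in_ring = (seq_no_t)(latest_seq_no - buffer_seq_no) < RING_SIZE;
   if (!in_ring)
      return false; /* fell out of the ring => the winsys already waited for it */

   return queues[queue_index].fence_busy[buffer_seq_no % RING_SIZE];
}

int main(void)
{
   struct queue queues[NUM_QUEUES] = {0};
   struct bo_fences bo = {0};

   /* Pretend the BO was used by submission #5 on queue 0 and that fence is still busy. */
   queues[0].latest_seq_no = 7;
   queues[0].fence_busy[5 % RING_SIZE] = true;
   bo.seq_no[0] = 5;
   bo.valid_fence_mask = 1u << 0;

   printf("busy on queue 0: %d\n", bo_busy_on_queue(queues, &bo, 0)); /* 1 */
   printf("busy on queue 1: %d\n", bo_busy_on_queue(queues, &bo, 1)); /* 0 */
   return 0;
}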

@@ -65,63 +65,48 @@ static bool amdgpu_bo_wait(struct radeon_winsys *rws,
return !buffer_busy;
}

if (timeout == 0) {
unsigned idle_fences;
bool buffer_idle;
simple_mtx_lock(&ws->bo_fence_lock);

simple_mtx_lock(&ws->bo_fence_lock);
u_foreach_bit(i, bo->fences.valid_fence_mask) {
struct pipe_fence_handle **fence = get_fence_from_ring(ws, &bo->fences, i);

for (idle_fences = 0; idle_fences < bo->num_fences; ++idle_fences) {
if (!amdgpu_fence_wait(bo->fences[idle_fences], 0, false))
break;
}
if (fence) {
if (timeout == 0) {
bool idle = amdgpu_fence_wait(*fence, 0, false);

/* Release the idle fences to avoid checking them again later. */
for (unsigned i = 0; i < idle_fences; ++i)
amdgpu_fence_reference(&bo->fences[i], NULL);
if (!idle) {
simple_mtx_unlock(&ws->bo_fence_lock);
return false; /* busy */
}

memmove(&bo->fences[0], &bo->fences[idle_fences],
(bo->num_fences - idle_fences) * sizeof(*bo->fences));
bo->num_fences -= idle_fences;
/* It's idle. Remove it from the ring to skip checking it again later. */
amdgpu_fence_reference(fence, NULL);
} else {
struct pipe_fence_handle *tmp_fence = NULL;
amdgpu_fence_reference(&tmp_fence, *fence);

buffer_idle = !bo->num_fences;
simple_mtx_unlock(&ws->bo_fence_lock);
/* While waiting, unlock the mutex. */
simple_mtx_unlock(&ws->bo_fence_lock);

return buffer_idle;
} else {
bool buffer_idle = true;
bool idle = amdgpu_fence_wait(tmp_fence, abs_timeout, true);
if (!idle) {
amdgpu_fence_reference(&tmp_fence, NULL);
return false; /* busy */
}

simple_mtx_lock(&ws->bo_fence_lock);
while (bo->num_fences && buffer_idle) {
struct pipe_fence_handle *fence = NULL;
bool fence_idle = false;

amdgpu_fence_reference(&fence, bo->fences[0]);

/* Wait for the fence. */
simple_mtx_unlock(&ws->bo_fence_lock);
if (amdgpu_fence_wait(fence, abs_timeout, true))
fence_idle = true;
else
buffer_idle = false;
simple_mtx_lock(&ws->bo_fence_lock);

/* Release an idle fence to avoid checking it again later, keeping in
* mind that the fence array may have been modified by other threads.
*/
if (fence_idle && bo->num_fences && bo->fences[0] == fence) {
amdgpu_fence_reference(&bo->fences[0], NULL);
memmove(&bo->fences[0], &bo->fences[1],
(bo->num_fences - 1) * sizeof(*bo->fences));
bo->num_fences--;
simple_mtx_lock(&ws->bo_fence_lock);
/* It's idle. Remove it from the ring to skip checking it again later. */
if (tmp_fence == *fence)
amdgpu_fence_reference(fence, NULL);
amdgpu_fence_reference(&tmp_fence, NULL);
}

amdgpu_fence_reference(&fence, NULL);
}
simple_mtx_unlock(&ws->bo_fence_lock);

return buffer_idle;
bo->fences.valid_fence_mask &= ~BITFIELD_BIT(i); /* remove the fence from the BO */
}

simple_mtx_unlock(&ws->bo_fence_lock);
return true; /* idle */
}

static inline unsigned get_slab_entry_offset(struct amdgpu_winsys_bo *bo)

@@ -148,12 +133,7 @@ static enum radeon_bo_flag amdgpu_bo_get_flags(

static void amdgpu_bo_remove_fences(struct amdgpu_winsys_bo *bo)
{
for (unsigned i = 0; i < bo->num_fences; ++i)
amdgpu_fence_reference(&bo->fences[i], NULL);

FREE(bo->fences);
bo->num_fences = 0;
bo->max_fences = 0;
bo->fences.valid_fence_mask = 0;
}

void amdgpu_bo_destroy(struct amdgpu_winsys *ws, struct pb_buffer_lean *_buf)

@@ -937,8 +917,11 @@ sparse_free_backing_buffer(struct amdgpu_winsys *ws, struct amdgpu_bo_sparse *bo
{
bo->num_backing_pages -= backing->bo->b.base.size / RADEON_SPARSE_PAGE_SIZE;

/* Add fences from bo to backing->bo. */
simple_mtx_lock(&ws->bo_fence_lock);
amdgpu_add_fences(&backing->bo->b, bo->b.num_fences, bo->b.fences);
u_foreach_bit(i, bo->b.fences.valid_fence_mask) {
add_seq_no_to_list(ws, &backing->bo->b.fences, i, bo->b.fences.seq_no[i]);
}
simple_mtx_unlock(&ws->bo_fence_lock);

list_del(&backing->list);

@@ -49,7 +49,8 @@ enum amdgpu_bo_type {
/* Base class of the buffer object that other structures inherit. */
struct amdgpu_winsys_bo {
struct pb_buffer_lean base;
enum amdgpu_bo_type type;
enum amdgpu_bo_type type:8;
struct amdgpu_seq_no_fences fences;

/* This is set when a buffer is returned by buffer_create(), not when the memory is allocated
* as part of slab BO.

@@ -59,11 +60,6 @@ struct amdgpu_winsys_bo {
/* how many command streams, which are being emitted in a separate
* thread, is this bo referenced in? */
volatile int num_active_ioctls;

/* Fences for buffer synchronization. */
uint16_t num_fences;
uint16_t max_fences;
struct pipe_fence_handle **fences;
};

/* Real GPU memory allocation managed by the amdgpu kernel driver.

@@ -177,6 +173,66 @@ static struct amdgpu_bo_real *get_slab_entry_real_bo(struct amdgpu_winsys_bo *bo
return &get_bo_from_slab(((struct amdgpu_bo_slab_entry*)bo)->entry.slab)->b.b;
}

/* Given a sequence number "fences->seq_no[queue_index]", return a pointer to a non-NULL fence
* pointer in the queue ring corresponding to that sequence number if the fence is non-NULL.
* If the fence is not present in the ring (= is idle), return NULL. If it returns a non-NULL
* pointer and the caller finds the fence to be idle, it's recommended to use the returned pointer
* to set the fence to NULL in the ring, which is why we return a pointer to a pointer.
*/
static inline struct pipe_fence_handle **
get_fence_from_ring(struct amdgpu_winsys *ws, struct amdgpu_seq_no_fences *fences,
unsigned queue_index)
{
/* The caller should check if the BO has a fence. */
assert(queue_index < AMDGPU_MAX_QUEUES);
assert(fences->valid_fence_mask & BITFIELD_BIT(queue_index));

uint_seq_no buffer_seq_no = fences->seq_no[queue_index];
uint_seq_no latest_seq_no = ws->queues[queue_index].latest_seq_no;
bool fence_present = latest_seq_no - buffer_seq_no < AMDGPU_FENCE_RING_SIZE;

if (fence_present) {
struct pipe_fence_handle **fence =
&ws->queues[queue_index].fences[buffer_seq_no % AMDGPU_FENCE_RING_SIZE];

if (*fence)
return fence;
}

/* If the sequence number references a fence that is not present, it's guaranteed to be idle
* because the winsys always waits for the oldest fence when it removes it from the ring.
*/
fences->valid_fence_mask &= ~BITFIELD_BIT(queue_index);
return NULL;
}

static inline uint_seq_no pick_latest_seq_no(struct amdgpu_winsys *ws, unsigned queue_index,
uint_seq_no n1, uint_seq_no n2)
{
uint_seq_no latest = ws->queues[queue_index].latest_seq_no;

/* Since sequence numbers can wrap around, we need to pick the later number that's logically
* before "latest". The trick is to subtract "latest + 1" to underflow integer such
* that "latest" becomes UINT*_MAX, and then just return the maximum.
*/
uint_seq_no s1 = n1 - latest - 1;
uint_seq_no s2 = n2 - latest - 1;

return s1 >= s2 ? n1 : n2;
}

static inline void add_seq_no_to_list(struct amdgpu_winsys *ws, struct amdgpu_seq_no_fences *fences,
unsigned queue_index, uint_seq_no seq_no)
{
if (fences->valid_fence_mask & BITFIELD_BIT(queue_index)) {
fences->seq_no[queue_index] = pick_latest_seq_no(ws, queue_index, seq_no,
fences->seq_no[queue_index]);
} else {
fences->seq_no[queue_index] = seq_no;
fences->valid_fence_mask |= BITFIELD_BIT(queue_index);
}
}

bool amdgpu_bo_can_reclaim(struct amdgpu_winsys *ws, struct pb_buffer_lean *_buf);
struct pb_buffer_lean *amdgpu_bo_create(struct amdgpu_winsys *ws,
uint64_t size,
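As a side note, here is a small self-contained example of the wraparound trick used by pick_latest_seq_no() above. pick_latest() and the concrete numbers are hypothetical, chosen only to show the arithmetic.

/* Shift both candidates so that "latest" maps to UINT16_MAX; the larger shifted
 * value is then the one assigned more recently, even across a wraparound. */
#include <assert.h>
#include <stdint.h>

typedef uint16_t uint_seq_no;

static uint_seq_no pick_latest(uint_seq_no latest, uint_seq_no n1, uint_seq_no n2)
{
   uint_seq_no s1 = n1 - latest - 1;
   uint_seq_no s2 = n2 - latest - 1;
   return s1 >= s2 ? n1 : n2;
}

int main(void)
{
   /* No wraparound: with latest = 100, 90 was assigned after 80. */
   assert(pick_latest(100, 80, 90) == 90);
   /* Wraparound: with latest = 5, 3 (just assigned) is newer than 65530 (old). */
   assert(pick_latest(5, 65530, 3) == 3);
   return 0;
}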

@@ -51,6 +51,7 @@ amdgpu_fence_import_syncobj(struct radeon_winsys *rws, int fd)
}

util_queue_fence_init(&fence->submitted);
fence->imported = true;

assert(amdgpu_fence_is_syncobj(fence));
return (struct pipe_fence_handle*)fence;

@@ -84,6 +85,7 @@ amdgpu_fence_import_sync_file(struct radeon_winsys *rws, int fd)
}

util_queue_fence_init(&fence->submitted);
fence->imported = true;

return (struct pipe_fence_handle*)fence;
}

@@ -975,6 +977,23 @@ amdgpu_cs_create(struct radeon_cmdbuf *rcs,
cs->has_chaining = ctx->ws->info.gfx_level >= GFX7 &&
(ip_type == AMD_IP_GFX || ip_type == AMD_IP_COMPUTE);

/* Compute the queue index by counting the IPs that have queues. */
assert(ip_type < ARRAY_SIZE(ctx->ws->info.ip));
assert(ctx->ws->info.ip[ip_type].num_queues);
cs->queue_index = 0;

for (unsigned i = 0; i < ARRAY_SIZE(ctx->ws->info.ip); i++) {
if (!ctx->ws->info.ip[i].num_queues)
continue;

if (i == ip_type)
break;

cs->queue_index++;
}

assert(cs->queue_index < AMDGPU_MAX_QUEUES);

struct amdgpu_cs_fence_info fence_info;
fence_info.handle = cs->ctx->user_fence_bo;
fence_info.offset = cs->ip_type * 4;

@@ -1190,27 +1209,6 @@ static void add_fence_to_list(struct amdgpu_fence_list *fences,
amdgpu_fence_reference(&fences->list[idx], (struct pipe_fence_handle*)fence);
}

static bool is_noop_fence_dependency(struct amdgpu_cs *acs,
struct amdgpu_fence *fence)
{
struct amdgpu_cs_context *cs = acs->csc;

/* Detect no-op dependencies only when there is only 1 ring,
* because IBs on one ring are always executed one at a time.
*
* We always want no dependency between back-to-back gfx IBs, because
* we need the parallelism between IBs for good performance.
*/
if ((acs->ip_type == AMD_IP_GFX ||
acs->ws->info.ip[acs->ip_type].num_queues == 1) &&
!amdgpu_fence_is_syncobj(fence) &&
fence->ctx == acs->ctx &&
fence->fence.ip_type == cs->chunk_ib[IB_MAIN].ip_type)
return true;

return amdgpu_fence_wait((void *)fence, 0, false);
}

static void amdgpu_cs_add_fence_dependency(struct radeon_cmdbuf *rws,
struct pipe_fence_handle *pfence,
unsigned dependency_flags)

@@ -1221,7 +1219,8 @@ static void amdgpu_cs_add_fence_dependency(struct radeon_cmdbuf *rws,

util_queue_fence_wait(&fence->submitted);

if (is_noop_fence_dependency(acs, fence))
/* Ignore non-imported idle fences. This will only check the user fence in memory. */
if (!fence->imported && amdgpu_fence_wait((void *)fence, 0, false))
return;

if (amdgpu_fence_is_syncobj(fence))

@@ -1230,94 +1229,30 @@ static void amdgpu_cs_add_fence_dependency(struct radeon_cmdbuf *rws,
add_fence_to_list(&cs->fence_dependencies, fence);
}

static void amdgpu_add_bo_fence_dependencies(struct amdgpu_cs *acs,
struct amdgpu_cs_context *cs,
struct amdgpu_cs_buffer *buffer)
{
struct amdgpu_winsys_bo *bo = buffer->bo;
unsigned new_num_fences = 0;
const unsigned num_fences = bo->num_fences;

for (unsigned j = 0; j < num_fences; ++j) {
struct amdgpu_fence *bo_fence = (void *)bo->fences[j];

if (is_noop_fence_dependency(acs, bo_fence))
continue;

amdgpu_fence_reference(&bo->fences[new_num_fences], bo->fences[j]);
new_num_fences++;

if (!(buffer->usage & RADEON_USAGE_SYNCHRONIZED))
continue;

add_fence_to_list(&cs->fence_dependencies, bo_fence);
}

for (unsigned j = new_num_fences; j < num_fences; ++j)
amdgpu_fence_reference(&bo->fences[j], NULL);

bo->num_fences = new_num_fences;
}

/* Add the given list of fences to the buffer's fence list.
*
* Must be called with the winsys bo_fence_lock held.
*/
void amdgpu_add_fences(struct amdgpu_winsys_bo *bo,
unsigned num_fences,
struct pipe_fence_handle **fences)
{
if (bo->num_fences + num_fences > bo->max_fences) {
unsigned new_max_fences = MAX2(bo->num_fences + num_fences, bo->max_fences * 2);
struct pipe_fence_handle **new_fences =
REALLOC(bo->fences,
bo->num_fences * sizeof(*new_fences),
new_max_fences * sizeof(*new_fences));
if (likely(new_fences && new_max_fences < UINT16_MAX)) {
bo->fences = new_fences;
bo->max_fences = new_max_fences;
} else {
unsigned drop;

fprintf(stderr, new_fences ? "amdgpu_add_fences: too many fences, dropping some\n"
: "amdgpu_add_fences: allocation failure, dropping fence(s)\n");
free(new_fences);

if (!bo->num_fences)
return;

bo->num_fences--; /* prefer to keep the most recent fence if possible */
amdgpu_fence_reference(&bo->fences[bo->num_fences], NULL);

drop = bo->num_fences + num_fences - bo->max_fences;
num_fences -= drop;
fences += drop;
}
}

unsigned bo_num_fences = bo->num_fences;

for (unsigned i = 0; i < num_fences; ++i) {
bo->fences[bo_num_fences] = NULL;
amdgpu_fence_reference(&bo->fences[bo_num_fences], fences[i]);
bo_num_fences++;
}
bo->num_fences = bo_num_fences;
}

static void amdgpu_add_bo_fences_to_dependencies(struct amdgpu_cs *acs,
struct amdgpu_cs_context *cs,
struct pipe_fence_handle *fence,
struct amdgpu_seq_no_fences *dependencies,
uint_seq_no new_queue_seq_no,
struct amdgpu_buffer_list *list)
{
struct amdgpu_winsys *ws = acs->ws;
unsigned queue_index = acs->queue_index;
unsigned num_buffers = list->num_buffers;

for (unsigned i = 0; i < num_buffers; i++) {
struct amdgpu_cs_buffer *buffer = &list->buffers[i];
struct amdgpu_winsys_bo *bo = buffer->bo;

amdgpu_add_bo_fence_dependencies(acs, cs, buffer);
amdgpu_add_fences(bo, 1, &fence);
/* Add BO fences from queues other than 'queue_index' to dependencies. */
if (buffer->usage & RADEON_USAGE_SYNCHRONIZED) {
u_foreach_bit(other_queue_idx, bo->fences.valid_fence_mask & ~BITFIELD_BIT(queue_index)) {
add_seq_no_to_list(ws, dependencies, other_queue_idx,
bo->fences.seq_no[other_queue_idx]);
}
}

/* Also set the fence in the BO. */
bo->fences.seq_no[queue_index] = new_queue_seq_no;
bo->fences.valid_fence_mask |= BITFIELD_BIT(queue_index);
}
}

@@ -1378,11 +1313,112 @@ static void amdgpu_cs_submit_ib(void *job, void *gdata, int thread_index)
bool has_user_fence = amdgpu_cs_has_user_fence(cs);

simple_mtx_lock(&ws->bo_fence_lock);
/* Since the kernel driver doesn't synchronize execution between different
* rings automatically, we have to add fence dependencies manually.
struct amdgpu_queue *queue = &ws->queues[acs->queue_index];
uint_seq_no prev_seq_no = queue->latest_seq_no;

/* Generate a per queue sequence number. The logic is similar to the kernel side amdgpu seqno,
* but the values aren't related.
*/
for (unsigned i = 0; i < ARRAY_SIZE(cs->buffer_lists); i++)
amdgpu_add_bo_fences_to_dependencies(acs, cs, cs->fence, &cs->buffer_lists[i]);
uint_seq_no next_seq_no = prev_seq_no + 1;

/* Wait for the oldest fence to signal. This should always check the user fence, then wait
* via the ioctl. We have to do this because we are going to release the oldest fence and
* replace it with the latest fence in the ring.
*/
struct pipe_fence_handle **oldest_fence =
&queue->fences[next_seq_no % AMDGPU_FENCE_RING_SIZE];

if (*oldest_fence) {
if (!amdgpu_fence_wait(*oldest_fence, 0, false)) {
/* Take the reference because the fence can be released by other threads after we
* unlock the mutex.
*/
struct pipe_fence_handle *tmp_fence = NULL;
amdgpu_fence_reference(&tmp_fence, *oldest_fence);

/* Unlock the mutex before waiting. */
simple_mtx_unlock(&ws->bo_fence_lock);
amdgpu_fence_wait(tmp_fence, OS_TIMEOUT_INFINITE, false);
amdgpu_fence_reference(&tmp_fence, NULL);
simple_mtx_lock(&ws->bo_fence_lock);
}

/* Remove the idle fence from the ring. */
amdgpu_fence_reference(oldest_fence, NULL);
}

/* We'll accumulate sequence numbers in this structure. It automatically keeps only the latest
* sequence number per queue and removes all older ones.
*/
struct amdgpu_seq_no_fences seq_no_dependencies;
seq_no_dependencies.valid_fence_mask = 0;

/* Add a fence dependency on the previous IB if the IP has multiple physical queues to
* make it appear as if it had only 1 queue, or if the previous IB comes from a different
* context. The reasons are:
* - Our BO fence tracking only supports 1 queue per IP.
* - IBs from different contexts must wait for each other and can't execute in a random order.
*/
struct amdgpu_fence *prev_fence =
(struct amdgpu_fence*)queue->fences[prev_seq_no % AMDGPU_FENCE_RING_SIZE];

if (prev_fence && (ws->info.ip[acs->ip_type].num_queues > 1 || prev_fence->ctx != acs->ctx))
add_seq_no_to_list(ws, &seq_no_dependencies, acs->queue_index, prev_seq_no);

/* Since the kernel driver doesn't synchronize execution between different
* rings automatically, we have to add fence dependencies manually. This gathers sequence
* numbers from BOs and sets the next sequence number in the BOs.
*/
for (unsigned i = 0; i < ARRAY_SIZE(cs->buffer_lists); i++) {
amdgpu_add_bo_fences_to_dependencies(acs, &seq_no_dependencies, next_seq_no,
&cs->buffer_lists[i]);
}

#if 0 /* Debug code. */
printf("submit queue=%u, seq_no=%u\n", acs->queue_index, next_seq_no);

/* Wait for all previous fences. This can be used when BO fence tracking doesn't work. */
for (unsigned i = 0; i < AMDGPU_MAX_QUEUES; i++) {
if (i == acs->queue_index)
continue;

struct pipe_fence_handle *fence = queue->fences[ws->queues[i].latest_seq_no % AMDGPU_FENCE_RING_SIZE];
if (!fence) {
if (i <= 1)
printf(" queue %u doesn't have any fence at seq_no %u\n", i, ws->queues[i].latest_seq_no);
continue;
}

bool valid = seq_no_dependencies.valid_fence_mask & BITFIELD_BIT(i);
uint_seq_no old = seq_no_dependencies.seq_no[i];
add_seq_no_to_list(ws, &seq_no_dependencies, i, ws->queues[i].latest_seq_no);
uint_seq_no new = seq_no_dependencies.seq_no[i];

if (!valid)
printf(" missing dependency on queue=%u, seq_no=%u\n", i, new);
else if (old != new)
printf(" too old dependency on queue=%u, old=%u, new=%u\n", i, old, new);
else
printf(" has dependency on queue=%u, seq_no=%u\n", i, old);
}
#endif

/* Convert the sequence numbers we gathered to fence dependencies. */
u_foreach_bit(i, seq_no_dependencies.valid_fence_mask) {
struct pipe_fence_handle **fence = get_fence_from_ring(ws, &seq_no_dependencies, i);

if (fence) {
/* If it's idle, don't add it to the list of dependencies. */
if (amdgpu_fence_wait(*fence, 0, false))
amdgpu_fence_reference(fence, NULL);
else
add_fence_to_list(&cs->fence_dependencies, (struct amdgpu_fence*)*fence);
}
}

/* Finally, add the IB fence into the winsys queue. */
amdgpu_fence_reference(&queue->fences[next_seq_no % AMDGPU_FENCE_RING_SIZE], cs->fence);
queue->latest_seq_no = next_seq_no;
simple_mtx_unlock(&ws->bo_fence_lock);

struct drm_amdgpu_bo_list_entry *bo_list = NULL;
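For context, a simplified sketch of the ring-slot recycling that amdgpu_cs_submit_ib() performs above. struct fence, wait_fence(), release_fence() and queue_push_fence() are made-up stand-ins; the real code uses amdgpu_fence_wait() and amdgpu_fence_reference() under bo_fence_lock and also gathers the BO sequence-number dependencies, which is omitted here.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define RING_SIZE 32

struct fence { int id; };

static void wait_fence(struct fence *f)    { printf("waiting for fence %d\n", f->id); }
static void release_fence(struct fence *f) { free(f); }

struct queue {
   struct fence *fences[RING_SIZE];
   uint16_t latest_seq_no;
};

/* Before storing the new fence, wait for and drop whatever fence currently
 * occupies the ring slot. Waiting here is what guarantees that any sequence
 * number that fell out of the ring refers to an idle fence. */
static uint16_t queue_push_fence(struct queue *q, struct fence *new_fence)
{
   uint16_t next_seq_no = q->latest_seq_no + 1;
   struct fence **slot = &q->fences[next_seq_no % RING_SIZE];

   if (*slot) {
      wait_fence(*slot);
      release_fence(*slot);
   }
   *slot = new_fence;
   q->latest_seq_no = next_seq_no;
   return next_seq_no; /* written into each BO used by this submission */
}

int main(void)
{
   struct queue q = {0};
   for (int i = 0; i < 40; i++) {   /* more submissions than ring slots */
      struct fence *f = malloc(sizeof(*f));
      f->id = i;
      queue_push_fence(&q, f);
   }
   return 0;
}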

@@ -119,6 +119,7 @@ struct amdgpu_cs {
*/
struct drm_amdgpu_cs_chunk_fence fence_chunk;
enum amd_ip_type ip_type;
unsigned queue_index;

/* We flip between these two CS. While one is being consumed
* by the kernel in another thread, the other one is being filled

@@ -166,6 +167,7 @@ struct amdgpu_fence {
struct util_queue_fence submitted;

volatile int signalled; /* bool (int for atomicity) */
bool imported;
};

static inline bool amdgpu_fence_is_syncobj(struct amdgpu_fence *fence)

@@ -242,9 +244,6 @@ amdgpu_bo_is_referenced_by_cs_with_usage(struct amdgpu_cs *cs,

bool amdgpu_fence_wait(struct pipe_fence_handle *fence, uint64_t timeout,
bool absolute);
void amdgpu_add_fences(struct amdgpu_winsys_bo *bo,
unsigned num_fences,
struct pipe_fence_handle **fences);
void amdgpu_cs_sync_flush(struct radeon_cmdbuf *rcs);
void amdgpu_cs_init_functions(struct amdgpu_screen_winsys *ws);

@@ -72,6 +72,11 @@ static void do_winsys_deinit(struct amdgpu_winsys *ws)
if (ws->reserve_vmid)
amdgpu_vm_unreserve_vmid(ws->dev, 0);

for (unsigned i = 0; i < ARRAY_SIZE(ws->queues); i++) {
for (unsigned j = 0; j < ARRAY_SIZE(ws->queues[i].fences); j++)
amdgpu_fence_reference(&ws->queues[i].fences[j], NULL);
}

if (util_queue_is_initialized(&ws->cs_queue))
util_queue_destroy(&ws->cs_queue);

@@ -62,11 +62,88 @@ struct amdgpu_screen_winsys {
struct hash_table *kms_handles;
};

/* Maximum this number of IBs can be busy per queue. When submitting a new IB and the oldest IB
* ("AMDGPU_FENCE_RING_SIZE" IBs ago) is still busy, the CS thread will wait for it and will
* also block all queues from submitting new IBs.
*/
#define AMDGPU_FENCE_RING_SIZE 32

/* The maximum number of queues that can be present. */
#define AMDGPU_MAX_QUEUES 6

/* This can use any integer type because the logic handles integer wraparounds robustly, but
* uint8_t wraps around so quickly that some BOs might never become idle because we don't
* remove idle fences from BOs, so they become "busy" again after a queue sequence number wraps
* around and they may stay "busy" in pb_cache long enough that we run out of memory.
*/
typedef uint16_t uint_seq_no;

struct amdgpu_queue {
/* Ring buffer of fences.
*
* We only remember a certain number of the most recent fences per queue. When we add a new
* fence, we wait for the oldest one, which implies that all older fences not present
* in the ring are idle. This way we don't have to keep track of a million fence references
* for a million BOs.
*
* We only support 1 queue per IP. If an IP has multiple queues, we always add a fence
* dependency on the previous fence to make it behave like there is only 1 queue.
*
* amdgpu_winsys_bo doesn't have a list of fences. It only remembers the last sequence number
* for every queue where it was used. We then use the BO's sequence number to look up a fence
* in this ring.
*/
struct pipe_fence_handle *fences[AMDGPU_FENCE_RING_SIZE];

/* The sequence number of the latest fence.
*
* This sequence number is global per queue per device, shared by all contexts, and generated
* by the winsys, not the kernel.
*
* The latest fence is: fences[latest_seq_no % AMDGPU_FENCE_RING_SIZE]
* The oldest fence is: fences[(latest_seq_no + 1) % AMDGPU_FENCE_RING_SIZE]
* The oldest sequence number in the ring: latest_seq_no - AMDGPU_FENCE_RING_SIZE + 1
*
* The sequence number is in the ring if:
* latest_seq_no - buffer_seq_no < AMDGPU_FENCE_RING_SIZE
* If the sequence number is not in the ring, it's idle.
*
* Integer wraparounds of the sequence number behave as follows:
*
* The comparison above gives the correct answer if buffer_seq_no isn't older than UINT*_MAX.
* If it's older than UINT*_MAX but not older than UINT*_MAX + AMDGPU_FENCE_RING_SIZE, we
* incorrectly pick and wait for one of the fences in the ring. That's only a problem when
* the type is so small (uint8_t) that seq_no wraps around very frequently, causing BOs to
* never become idle in certain very unlucky scenarios and running out of memory.
*/
uint_seq_no latest_seq_no;
};

/* This is part of every BO. */
struct amdgpu_seq_no_fences {
/* A fence sequence number per queue. This number is used to look up the fence from
* struct amdgpu_queue.
*
* This sequence number is global per queue per device, shared by all contexts, and generated
* by the winsys, not the kernel.
*/
uint_seq_no seq_no[AMDGPU_MAX_QUEUES];

/* The mask of queues where seq_no[i] is valid. */
uint8_t valid_fence_mask;
};

/* valid_fence_mask should have 1 bit for each queue. */
static_assert(sizeof(((struct amdgpu_seq_no_fences*)NULL)->valid_fence_mask) * 8 >= AMDGPU_MAX_QUEUES, "");

struct amdgpu_winsys {
struct pipe_reference reference;
/* See comment above */
int fd;

/* Protected by bo_fence_lock. */
struct amdgpu_queue queues[AMDGPU_MAX_QUEUES];

struct pb_cache bo_cache;
struct pb_slabs bo_slabs; /* Slab allocator. */