diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
index 95f7c8641b2..687cfd3a144 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
@@ -65,63 +65,48 @@ static bool amdgpu_bo_wait(struct radeon_winsys *rws,
       return !buffer_busy;
    }
 
-   if (timeout == 0) {
-      unsigned idle_fences;
-      bool buffer_idle;
+   simple_mtx_lock(&ws->bo_fence_lock);
 
-      simple_mtx_lock(&ws->bo_fence_lock);
+   u_foreach_bit(i, bo->fences.valid_fence_mask) {
+      struct pipe_fence_handle **fence = get_fence_from_ring(ws, &bo->fences, i);
 
-      for (idle_fences = 0; idle_fences < bo->num_fences; ++idle_fences) {
-         if (!amdgpu_fence_wait(bo->fences[idle_fences], 0, false))
-            break;
-      }
+      if (fence) {
+         if (timeout == 0) {
+            bool idle = amdgpu_fence_wait(*fence, 0, false);
 
-      /* Release the idle fences to avoid checking them again later. */
-      for (unsigned i = 0; i < idle_fences; ++i)
-         amdgpu_fence_reference(&bo->fences[i], NULL);
+            if (!idle) {
+               simple_mtx_unlock(&ws->bo_fence_lock);
+               return false; /* busy */
+            }
 
-      memmove(&bo->fences[0], &bo->fences[idle_fences],
-              (bo->num_fences - idle_fences) * sizeof(*bo->fences));
-      bo->num_fences -= idle_fences;
+            /* It's idle. Remove it from the ring to skip checking it again later. */
+            amdgpu_fence_reference(fence, NULL);
+         } else {
+            struct pipe_fence_handle *tmp_fence = NULL;
+            amdgpu_fence_reference(&tmp_fence, *fence);
 
-      buffer_idle = !bo->num_fences;
-      simple_mtx_unlock(&ws->bo_fence_lock);
+            /* While waiting, unlock the mutex. */
+            simple_mtx_unlock(&ws->bo_fence_lock);
 
-      return buffer_idle;
-   } else {
-      bool buffer_idle = true;
+            bool idle = amdgpu_fence_wait(tmp_fence, abs_timeout, true);
+            if (!idle) {
+               amdgpu_fence_reference(&tmp_fence, NULL);
+               return false; /* busy */
+            }
 
-      simple_mtx_lock(&ws->bo_fence_lock);
-      while (bo->num_fences && buffer_idle) {
-         struct pipe_fence_handle *fence = NULL;
-         bool fence_idle = false;
-
-         amdgpu_fence_reference(&fence, bo->fences[0]);
-
-         /* Wait for the fence. */
-         simple_mtx_unlock(&ws->bo_fence_lock);
-         if (amdgpu_fence_wait(fence, abs_timeout, true))
-            fence_idle = true;
-         else
-            buffer_idle = false;
-         simple_mtx_lock(&ws->bo_fence_lock);
-
-         /* Release an idle fence to avoid checking it again later, keeping in
-          * mind that the fence array may have been modified by other threads.
-          */
-         if (fence_idle && bo->num_fences && bo->fences[0] == fence) {
-            amdgpu_fence_reference(&bo->fences[0], NULL);
-            memmove(&bo->fences[0], &bo->fences[1],
-                    (bo->num_fences - 1) * sizeof(*bo->fences));
-            bo->num_fences--;
+            simple_mtx_lock(&ws->bo_fence_lock);
+            /* It's idle. Remove it from the ring to skip checking it again later. */
+            if (tmp_fence == *fence)
+               amdgpu_fence_reference(fence, NULL);
+            amdgpu_fence_reference(&tmp_fence, NULL);
          }
-
-         amdgpu_fence_reference(&fence, NULL);
       }
-      simple_mtx_unlock(&ws->bo_fence_lock);
-      return buffer_idle;
+      bo->fences.valid_fence_mask &= ~BITFIELD_BIT(i); /* remove the fence from the BO */
    }
+
+   simple_mtx_unlock(&ws->bo_fence_lock);
+   return true; /* idle */
 }
 
 static inline unsigned get_slab_entry_offset(struct amdgpu_winsys_bo *bo)
@@ -148,12 +133,7 @@ static enum radeon_bo_flag amdgpu_bo_get_flags(
 
 static void amdgpu_bo_remove_fences(struct amdgpu_winsys_bo *bo)
 {
-   for (unsigned i = 0; i < bo->num_fences; ++i)
-      amdgpu_fence_reference(&bo->fences[i], NULL);
-
-   FREE(bo->fences);
-   bo->num_fences = 0;
-   bo->max_fences = 0;
+   bo->fences.valid_fence_mask = 0;
 }
 
 void amdgpu_bo_destroy(struct amdgpu_winsys *ws, struct pb_buffer_lean *_buf)
@@ -937,8 +917,11 @@ sparse_free_backing_buffer(struct amdgpu_winsys *ws, struct amdgpu_bo_sparse *bo
 {
    bo->num_backing_pages -= backing->bo->b.base.size / RADEON_SPARSE_PAGE_SIZE;
 
+   /* Add fences from bo to backing->bo. */
    simple_mtx_lock(&ws->bo_fence_lock);
-   amdgpu_add_fences(&backing->bo->b, bo->b.num_fences, bo->b.fences);
+   u_foreach_bit(i, bo->b.fences.valid_fence_mask) {
+      add_seq_no_to_list(ws, &backing->bo->b.fences, i, bo->b.fences.seq_no[i]);
+   }
   simple_mtx_unlock(&ws->bo_fence_lock);
 
   list_del(&backing->list);
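The rewritten amdgpu_bo_wait() above never sleeps while holding ws->bo_fence_lock: it pins the fence with a reference under the lock, unlocks, waits, re-locks, and only clears the ring slot if it still holds the same fence. A minimal standalone sketch of that pattern, separate from the patch and using toy names (toy_fence, toy_wait_slot) rather than the winsys API:

/* Toy model of the lock-drop wait pattern used in amdgpu_bo_wait(). */
#include <pthread.h>
#include <stdbool.h>
#include <stddef.h>

struct toy_fence { int refcount; bool signalled; };

static void toy_fence_reference(struct toy_fence **dst, struct toy_fence *src)
{
   /* Simplified: the real code frees the fence when its refcount drops to 0. */
   if (src)
      src->refcount++;
   if (*dst)
      (*dst)->refcount--;
   *dst = src;
}

static bool toy_fence_wait(struct toy_fence *f)
{
   return f->signalled; /* stands in for amdgpu_fence_wait() */
}

/* Wait on one ring slot without holding the mutex while waiting. */
static bool toy_wait_slot(pthread_mutex_t *lock, struct toy_fence **slot)
{
   struct toy_fence *tmp = NULL;

   pthread_mutex_lock(lock);
   toy_fence_reference(&tmp, *slot);   /* pin the fence while the lock is held */
   pthread_mutex_unlock(lock);         /* never sleep with the lock held */

   bool idle = tmp ? toy_fence_wait(tmp) : true;

   pthread_mutex_lock(lock);
   /* Another thread may have replaced the slot meanwhile; only clear it if unchanged. */
   if (idle && tmp == *slot)
      toy_fence_reference(slot, NULL);
   toy_fence_reference(&tmp, NULL);
   pthread_mutex_unlock(lock);
   return idle;
}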
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.h b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.h
index 0e4a0ee71f8..f034d148858 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.h
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.h
@@ -49,7 +49,8 @@ enum amdgpu_bo_type {
 
 /* Base class of the buffer object that other structures inherit. */
 struct amdgpu_winsys_bo {
    struct pb_buffer_lean base;
-   enum amdgpu_bo_type type;
+   enum amdgpu_bo_type type:8;
+   struct amdgpu_seq_no_fences fences;
 
    /* This is set when a buffer is returned by buffer_create(), not when the memory is allocated
    * as part of slab BO.
@@ -59,11 +60,6 @@ struct amdgpu_winsys_bo {
    /* how many command streams, which are being emitted in a separate
    * thread, is this bo referenced in? */
    volatile int num_active_ioctls;
-
-   /* Fences for buffer synchronization. */
-   uint16_t num_fences;
-   uint16_t max_fences;
-   struct pipe_fence_handle **fences;
 };
 
 /* Real GPU memory allocation managed by the amdgpu kernel driver.
@@ -177,6 +173,66 @@ static struct amdgpu_bo_real *get_slab_entry_real_bo(struct amdgpu_winsys_bo *bo
    return &get_bo_from_slab(((struct amdgpu_bo_slab_entry*)bo)->entry.slab)->b.b;
 }
 
+/* Given a sequence number "fences->seq_no[queue_index]", return a pointer to a non-NULL fence
+ * pointer in the queue ring corresponding to that sequence number if the fence is non-NULL.
+ * If the fence is not present in the ring (= is idle), return NULL. If it returns a non-NULL
+ * pointer and the caller finds the fence to be idle, it's recommended to use the returned pointer
+ * to set the fence to NULL in the ring, which is why we return a pointer to a pointer.
+ */
+static inline struct pipe_fence_handle **
+get_fence_from_ring(struct amdgpu_winsys *ws, struct amdgpu_seq_no_fences *fences,
+                    unsigned queue_index)
+{
+   /* The caller should check if the BO has a fence. */
+   assert(queue_index < AMDGPU_MAX_QUEUES);
+   assert(fences->valid_fence_mask & BITFIELD_BIT(queue_index));
+
+   uint_seq_no buffer_seq_no = fences->seq_no[queue_index];
+   uint_seq_no latest_seq_no = ws->queues[queue_index].latest_seq_no;
+   bool fence_present = latest_seq_no - buffer_seq_no < AMDGPU_FENCE_RING_SIZE;
+
+   if (fence_present) {
+      struct pipe_fence_handle **fence =
+         &ws->queues[queue_index].fences[buffer_seq_no % AMDGPU_FENCE_RING_SIZE];
+
+      if (*fence)
+         return fence;
+   }
+
+   /* If the sequence number references a fence that is not present, it's guaranteed to be idle
+    * because the winsys always waits for the oldest fence when it removes it from the ring.
+    */
+   fences->valid_fence_mask &= ~BITFIELD_BIT(queue_index);
+   return NULL;
+}
+
+static inline uint_seq_no pick_latest_seq_no(struct amdgpu_winsys *ws, unsigned queue_index,
+                                             uint_seq_no n1, uint_seq_no n2)
+{
+   uint_seq_no latest = ws->queues[queue_index].latest_seq_no;
+
+   /* Since sequence numbers can wrap around, we need to pick the later number that's logically
+    * before "latest". The trick is to subtract "latest + 1" to underflow the integer such
+    * that "latest" becomes UINT*_MAX, and then just return the maximum.
+    */
+   uint_seq_no s1 = n1 - latest - 1;
+   uint_seq_no s2 = n2 - latest - 1;
+
+   return s1 >= s2 ? n1 : n2;
+}
+
+static inline void add_seq_no_to_list(struct amdgpu_winsys *ws, struct amdgpu_seq_no_fences *fences,
+                                      unsigned queue_index, uint_seq_no seq_no)
+{
+   if (fences->valid_fence_mask & BITFIELD_BIT(queue_index)) {
+      fences->seq_no[queue_index] = pick_latest_seq_no(ws, queue_index, seq_no,
+                                                       fences->seq_no[queue_index]);
+   } else {
+      fences->seq_no[queue_index] = seq_no;
+      fences->valid_fence_mask |= BITFIELD_BIT(queue_index);
+   }
+}
+
 bool amdgpu_bo_can_reclaim(struct amdgpu_winsys *ws, struct pb_buffer_lean *_buf);
 struct pb_buffer_lean *amdgpu_bo_create(struct amdgpu_winsys *ws,
                                         uint64_t size,
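The helpers added to amdgpu_bo.h above rely on modular uint16_t arithmetic: a sequence number is still "in the ring" while latest_seq_no - buffer_seq_no stays below the ring size, and pick_latest_seq_no() orders two numbers by shifting them so that "latest" maps to the largest value. A self-contained sketch of the same arithmetic, separate from the patch and using toy names (RING_SIZE, seq_no_in_ring, pick_latest):

#include <stdint.h>
#include <stdio.h>

typedef uint16_t uint_seq_no;
#define RING_SIZE 32   /* stands in for AMDGPU_FENCE_RING_SIZE */

/* True if buffer_seq_no still has a fence slot in the ring. */
static int seq_no_in_ring(uint_seq_no latest, uint_seq_no buffer_seq_no)
{
   return (uint_seq_no)(latest - buffer_seq_no) < RING_SIZE;
}

/* Pick whichever of n1/n2 was submitted later, given the queue's latest seq_no. */
static uint_seq_no pick_latest(uint_seq_no latest, uint_seq_no n1, uint_seq_no n2)
{
   /* Subtracting (latest + 1) maps "latest" to UINT16_MAX, so later numbers map higher. */
   uint_seq_no s1 = n1 - latest - 1;
   uint_seq_no s2 = n2 - latest - 1;
   return s1 >= s2 ? n1 : n2;
}

int main(void)
{
   uint_seq_no latest = 5; /* the counter wrapped past 65535 a few submissions ago */

   printf("65530 in ring: %d\n", seq_no_in_ring(latest, 65530)); /* 1: only 11 submissions old */
   printf("60000 in ring: %d\n", seq_no_in_ring(latest, 60000)); /* 0: fell out of the ring */
   printf("later of 65534 and 3: %u\n", pick_latest(latest, 65534, 3)); /* 3 */
   return 0;
}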
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
index b2c0ac2f5ae..55689fb5215 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
@@ -51,6 +51,7 @@ amdgpu_fence_import_syncobj(struct radeon_winsys *rws, int fd)
    }
 
    util_queue_fence_init(&fence->submitted);
+   fence->imported = true;
 
    assert(amdgpu_fence_is_syncobj(fence));
    return (struct pipe_fence_handle*)fence;
@@ -84,6 +85,7 @@ amdgpu_fence_import_sync_file(struct radeon_winsys *rws, int fd)
    }
 
    util_queue_fence_init(&fence->submitted);
+   fence->imported = true;
 
    return (struct pipe_fence_handle*)fence;
 }
@@ -975,6 +977,23 @@ amdgpu_cs_create(struct radeon_cmdbuf *rcs,
    cs->has_chaining = ctx->ws->info.gfx_level >= GFX7 &&
                       (ip_type == AMD_IP_GFX || ip_type == AMD_IP_COMPUTE);
 
+   /* Compute the queue index by counting the IPs that have queues. */
+   assert(ip_type < ARRAY_SIZE(ctx->ws->info.ip));
+   assert(ctx->ws->info.ip[ip_type].num_queues);
+   cs->queue_index = 0;
+
+   for (unsigned i = 0; i < ARRAY_SIZE(ctx->ws->info.ip); i++) {
+      if (!ctx->ws->info.ip[i].num_queues)
+         continue;
+
+      if (i == ip_type)
+         break;
+
+      cs->queue_index++;
+   }
+
+   assert(cs->queue_index < AMDGPU_MAX_QUEUES);
+
    struct amdgpu_cs_fence_info fence_info;
    fence_info.handle = cs->ctx->user_fence_bo;
    fence_info.offset = cs->ip_type * 4;
@@ -1190,27 +1209,6 @@ static void add_fence_to_list(struct amdgpu_fence_list *fences,
    amdgpu_fence_reference(&fences->list[idx], (struct pipe_fence_handle*)fence);
 }
 
-static bool is_noop_fence_dependency(struct amdgpu_cs *acs,
-                                     struct amdgpu_fence *fence)
-{
-   struct amdgpu_cs_context *cs = acs->csc;
-
-   /* Detect no-op dependencies only when there is only 1 ring,
-    * because IBs on one ring are always executed one at a time.
-    *
-    * We always want no dependency between back-to-back gfx IBs, because
-    * we need the parallelism between IBs for good performance.
-    */
-   if ((acs->ip_type == AMD_IP_GFX ||
-        acs->ws->info.ip[acs->ip_type].num_queues == 1) &&
-       !amdgpu_fence_is_syncobj(fence) &&
-       fence->ctx == acs->ctx &&
-       fence->fence.ip_type == cs->chunk_ib[IB_MAIN].ip_type)
-      return true;
-
-   return amdgpu_fence_wait((void *)fence, 0, false);
-}
-
 static void amdgpu_cs_add_fence_dependency(struct radeon_cmdbuf *rws,
                                            struct pipe_fence_handle *pfence,
                                            unsigned dependency_flags)
@@ -1221,7 +1219,8 @@ static void amdgpu_cs_add_fence_dependency(struct radeon_cmdbuf *rws,
 
    util_queue_fence_wait(&fence->submitted);
 
-   if (is_noop_fence_dependency(acs, fence))
+   /* Ignore non-imported idle fences. This will only check the user fence in memory. */
+   if (!fence->imported && amdgpu_fence_wait((void *)fence, 0, false))
       return;
 
    if (amdgpu_fence_is_syncobj(fence))
@@ -1230,94 +1229,30 @@ static void amdgpu_cs_add_fence_dependency(struct radeon_cmdbuf *rws,
    add_fence_to_list(&cs->fence_dependencies, fence);
 }
 
-static void amdgpu_add_bo_fence_dependencies(struct amdgpu_cs *acs,
-                                             struct amdgpu_cs_context *cs,
-                                             struct amdgpu_cs_buffer *buffer)
-{
-   struct amdgpu_winsys_bo *bo = buffer->bo;
-   unsigned new_num_fences = 0;
-   const unsigned num_fences = bo->num_fences;
-
-   for (unsigned j = 0; j < num_fences; ++j) {
-      struct amdgpu_fence *bo_fence = (void *)bo->fences[j];
-
-      if (is_noop_fence_dependency(acs, bo_fence))
-         continue;
-
-      amdgpu_fence_reference(&bo->fences[new_num_fences], bo->fences[j]);
-      new_num_fences++;
-
-      if (!(buffer->usage & RADEON_USAGE_SYNCHRONIZED))
-         continue;
-
-      add_fence_to_list(&cs->fence_dependencies, bo_fence);
-   }
-
-   for (unsigned j = new_num_fences; j < num_fences; ++j)
-      amdgpu_fence_reference(&bo->fences[j], NULL);
-
-   bo->num_fences = new_num_fences;
-}
-
-/* Add the given list of fences to the buffer's fence list.
- *
- * Must be called with the winsys bo_fence_lock held.
- */
-void amdgpu_add_fences(struct amdgpu_winsys_bo *bo,
-                       unsigned num_fences,
-                       struct pipe_fence_handle **fences)
-{
-   if (bo->num_fences + num_fences > bo->max_fences) {
-      unsigned new_max_fences = MAX2(bo->num_fences + num_fences, bo->max_fences * 2);
-      struct pipe_fence_handle **new_fences =
-         REALLOC(bo->fences,
-                 bo->num_fences * sizeof(*new_fences),
-                 new_max_fences * sizeof(*new_fences));
-      if (likely(new_fences && new_max_fences < UINT16_MAX)) {
-         bo->fences = new_fences;
-         bo->max_fences = new_max_fences;
-      } else {
-         unsigned drop;
-
-         fprintf(stderr, new_fences ? "amdgpu_add_fences: too many fences, dropping some\n" : "amdgpu_add_fences: allocation failure, dropping fence(s)\n");
-         free(new_fences);
-
-         if (!bo->num_fences)
-            return;
-
-         bo->num_fences--; /* prefer to keep the most recent fence if possible */
-         amdgpu_fence_reference(&bo->fences[bo->num_fences], NULL);
-
-         drop = bo->num_fences + num_fences - bo->max_fences;
-         num_fences -= drop;
-         fences += drop;
-      }
-   }
-
-   unsigned bo_num_fences = bo->num_fences;
-
-   for (unsigned i = 0; i < num_fences; ++i) {
-      bo->fences[bo_num_fences] = NULL;
-      amdgpu_fence_reference(&bo->fences[bo_num_fences], fences[i]);
-      bo_num_fences++;
-   }
-   bo->num_fences = bo_num_fences;
-}
-
 static void amdgpu_add_bo_fences_to_dependencies(struct amdgpu_cs *acs,
-                                                 struct amdgpu_cs_context *cs,
-                                                 struct pipe_fence_handle *fence,
+                                                 struct amdgpu_seq_no_fences *dependencies,
+                                                 uint_seq_no new_queue_seq_no,
                                                  struct amdgpu_buffer_list *list)
 {
+   struct amdgpu_winsys *ws = acs->ws;
+   unsigned queue_index = acs->queue_index;
    unsigned num_buffers = list->num_buffers;
 
    for (unsigned i = 0; i < num_buffers; i++) {
       struct amdgpu_cs_buffer *buffer = &list->buffers[i];
       struct amdgpu_winsys_bo *bo = buffer->bo;
 
-      amdgpu_add_bo_fence_dependencies(acs, cs, buffer);
-      amdgpu_add_fences(bo, 1, &fence);
+      /* Add BO fences from queues other than 'queue_index' to dependencies. */
+      if (buffer->usage & RADEON_USAGE_SYNCHRONIZED) {
+         u_foreach_bit(other_queue_idx, bo->fences.valid_fence_mask & ~BITFIELD_BIT(queue_index)) {
+            add_seq_no_to_list(ws, dependencies, other_queue_idx,
                               bo->fences.seq_no[other_queue_idx]);
+         }
+      }
+
+      /* Also set the fence in the BO. */
+      bo->fences.seq_no[queue_index] = new_queue_seq_no;
+      bo->fences.valid_fence_mask |= BITFIELD_BIT(queue_index);
    }
 }
"amdgpu_add_fences: too many fences, dropping some\n" - : "amdgpu_add_fences: allocation failure, dropping fence(s)\n"); - free(new_fences); - - if (!bo->num_fences) - return; - - bo->num_fences--; /* prefer to keep the most recent fence if possible */ - amdgpu_fence_reference(&bo->fences[bo->num_fences], NULL); - - drop = bo->num_fences + num_fences - bo->max_fences; - num_fences -= drop; - fences += drop; - } - } - - unsigned bo_num_fences = bo->num_fences; - - for (unsigned i = 0; i < num_fences; ++i) { - bo->fences[bo_num_fences] = NULL; - amdgpu_fence_reference(&bo->fences[bo_num_fences], fences[i]); - bo_num_fences++; - } - bo->num_fences = bo_num_fences; -} - static void amdgpu_add_bo_fences_to_dependencies(struct amdgpu_cs *acs, - struct amdgpu_cs_context *cs, - struct pipe_fence_handle *fence, + struct amdgpu_seq_no_fences *dependencies, + uint_seq_no new_queue_seq_no, struct amdgpu_buffer_list *list) { + struct amdgpu_winsys *ws = acs->ws; + unsigned queue_index = acs->queue_index; unsigned num_buffers = list->num_buffers; for (unsigned i = 0; i < num_buffers; i++) { struct amdgpu_cs_buffer *buffer = &list->buffers[i]; struct amdgpu_winsys_bo *bo = buffer->bo; - amdgpu_add_bo_fence_dependencies(acs, cs, buffer); - amdgpu_add_fences(bo, 1, &fence); + /* Add BO fences from queues other than 'queue_index' to dependencies. */ + if (buffer->usage & RADEON_USAGE_SYNCHRONIZED) { + u_foreach_bit(other_queue_idx, bo->fences.valid_fence_mask & ~BITFIELD_BIT(queue_index)) { + add_seq_no_to_list(ws, dependencies, other_queue_idx, + bo->fences.seq_no[other_queue_idx]); + } + } + + /* Also set the fence in the BO. */ + bo->fences.seq_no[queue_index] = new_queue_seq_no; + bo->fences.valid_fence_mask |= BITFIELD_BIT(queue_index); } } @@ -1378,11 +1313,112 @@ static void amdgpu_cs_submit_ib(void *job, void *gdata, int thread_index) bool has_user_fence = amdgpu_cs_has_user_fence(cs); simple_mtx_lock(&ws->bo_fence_lock); - /* Since the kernel driver doesn't synchronize execution between different - * rings automatically, we have to add fence dependencies manually. + struct amdgpu_queue *queue = &ws->queues[acs->queue_index]; + uint_seq_no prev_seq_no = queue->latest_seq_no; + + /* Generate a per queue sequence number. The logic is similar to the kernel side amdgpu seqno, + * but the values aren't related. */ - for (unsigned i = 0; i < ARRAY_SIZE(cs->buffer_lists); i++) - amdgpu_add_bo_fences_to_dependencies(acs, cs, cs->fence, &cs->buffer_lists[i]); + uint_seq_no next_seq_no = prev_seq_no + 1; + + /* Wait for the oldest fence to signal. This should always check the user fence, then wait + * via the ioctl. We have to do this because we are going to release the oldest fence and + * replace it with the latest fence in the ring. + */ + struct pipe_fence_handle **oldest_fence = + &queue->fences[next_seq_no % AMDGPU_FENCE_RING_SIZE]; + + if (*oldest_fence) { + if (!amdgpu_fence_wait(*oldest_fence, 0, false)) { + /* Take the reference because the fence can be released by other threads after we + * unlock the mutex. + */ + struct pipe_fence_handle *tmp_fence = NULL; + amdgpu_fence_reference(&tmp_fence, *oldest_fence); + + /* Unlock the mutex before waiting. */ + simple_mtx_unlock(&ws->bo_fence_lock); + amdgpu_fence_wait(tmp_fence, OS_TIMEOUT_INFINITE, false); + amdgpu_fence_reference(&tmp_fence, NULL); + simple_mtx_lock(&ws->bo_fence_lock); + } + + /* Remove the idle fence from the ring. */ + amdgpu_fence_reference(oldest_fence, NULL); + } + + /* We'll accumulate sequence numbers in this structure. 
+   /* We'll accumulate sequence numbers in this structure. It automatically keeps only the latest
+    * sequence number per queue and removes all older ones.
+    */
+   struct amdgpu_seq_no_fences seq_no_dependencies;
+   seq_no_dependencies.valid_fence_mask = 0;
+
+   /* Add a fence dependency on the previous IB if the IP has multiple physical queues to
+    * make it appear as if it had only 1 queue, or if the previous IB comes from a different
+    * context. The reasons are:
+    * - Our BO fence tracking only supports 1 queue per IP.
+    * - IBs from different contexts must wait for each other and can't execute in a random order.
+    */
+   struct amdgpu_fence *prev_fence =
+      (struct amdgpu_fence*)queue->fences[prev_seq_no % AMDGPU_FENCE_RING_SIZE];
+
+   if (prev_fence && (ws->info.ip[acs->ip_type].num_queues > 1 || prev_fence->ctx != acs->ctx))
+      add_seq_no_to_list(ws, &seq_no_dependencies, acs->queue_index, prev_seq_no);
+
+   /* Since the kernel driver doesn't synchronize execution between different
+    * rings automatically, we have to add fence dependencies manually. This gathers sequence
+    * numbers from BOs and sets the next sequence number in the BOs.
+    */
+   for (unsigned i = 0; i < ARRAY_SIZE(cs->buffer_lists); i++) {
+      amdgpu_add_bo_fences_to_dependencies(acs, &seq_no_dependencies, next_seq_no,
+                                           &cs->buffer_lists[i]);
+   }
+
+#if 0 /* Debug code. */
+   printf("submit queue=%u, seq_no=%u\n", acs->queue_index, next_seq_no);
+
+   /* Wait for all previous fences. This can be used when BO fence tracking doesn't work. */
+   for (unsigned i = 0; i < AMDGPU_MAX_QUEUES; i++) {
+      if (i == acs->queue_index)
+         continue;
+
+      struct pipe_fence_handle *fence = queue->fences[ws->queues[i].latest_seq_no % AMDGPU_FENCE_RING_SIZE];
+      if (!fence) {
+         if (i <= 1)
+            printf(" queue %u doesn't have any fence at seq_no %u\n", i, ws->queues[i].latest_seq_no);
+         continue;
+      }
+
+      bool valid = seq_no_dependencies.valid_fence_mask & BITFIELD_BIT(i);
+      uint_seq_no old = seq_no_dependencies.seq_no[i];
+      add_seq_no_to_list(ws, &seq_no_dependencies, i, ws->queues[i].latest_seq_no);
+      uint_seq_no new = seq_no_dependencies.seq_no[i];
+
+      if (!valid)
+         printf(" missing dependency on queue=%u, seq_no=%u\n", i, new);
+      else if (old != new)
+         printf(" too old dependency on queue=%u, old=%u, new=%u\n", i, old, new);
+      else
+         printf(" has dependency on queue=%u, seq_no=%u\n", i, old);
+   }
+#endif
+
+   /* Convert the sequence numbers we gathered to fence dependencies. */
+   u_foreach_bit(i, seq_no_dependencies.valid_fence_mask) {
+      struct pipe_fence_handle **fence = get_fence_from_ring(ws, &seq_no_dependencies, i);
+
+      if (fence) {
+         /* If it's idle, don't add it to the list of dependencies. */
+         if (amdgpu_fence_wait(*fence, 0, false))
+            amdgpu_fence_reference(fence, NULL);
+         else
+            add_fence_to_list(&cs->fence_dependencies, (struct amdgpu_fence*)*fence);
+      }
+   }
+
+   /* Finally, add the IB fence into the winsys queue. */
+   amdgpu_fence_reference(&queue->fences[next_seq_no % AMDGPU_FENCE_RING_SIZE], cs->fence);
+   queue->latest_seq_no = next_seq_no;
    simple_mtx_unlock(&ws->bo_fence_lock);
 
    struct drm_amdgpu_bo_list_entry *bo_list = NULL;
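The submission path above rotates a fixed-size fence ring per queue: before storing the fence for sequence number N, it waits for whatever still occupies slot N % AMDGPU_FENCE_RING_SIZE and releases it, which is what allows any sequence number that fell out of the ring to be treated as idle. A reduced, standalone model of that step, separate from the patch (toy types, a busy-wait instead of the real fence wait, no locking):

#include <stdbool.h>
#include <stdint.h>

#define RING_SIZE 32   /* stands in for AMDGPU_FENCE_RING_SIZE */
typedef uint16_t uint_seq_no;

struct toy_fence { bool signalled; };

struct toy_queue {
   struct toy_fence *fences[RING_SIZE];
   uint_seq_no latest_seq_no;
};

/* Blocks until f signals; stands in for amdgpu_fence_wait(..., OS_TIMEOUT_INFINITE, ...). */
static void toy_fence_wait(struct toy_fence *f)
{
   while (!f->signalled) {
   }
}

static uint_seq_no toy_submit(struct toy_queue *q, struct toy_fence *new_fence)
{
   uint_seq_no next = q->latest_seq_no + 1;
   struct toy_fence **oldest = &q->fences[next % RING_SIZE];

   if (*oldest) {
      toy_fence_wait(*oldest); /* everything older than the ring is now idle */
      *oldest = NULL;          /* the real code drops a fence reference here */
   }

   q->fences[next % RING_SIZE] = new_fence;
   q->latest_seq_no = next;
   return next;               /* the value stored in each BO's fences.seq_no[queue_index] */
}

int main(void)
{
   struct toy_queue q = {0};
   struct toy_fence a = { .signalled = true };
   struct toy_fence b = { .signalled = true };

   toy_submit(&q, &a);                 /* seq_no 1 */
   uint_seq_no n = toy_submit(&q, &b); /* seq_no 2 */
   return n == 2 ? 0 : 1;
}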
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h
index c922efc596f..5b505af00bc 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h
@@ -119,6 +119,7 @@ struct amdgpu_cs {
    */
    struct drm_amdgpu_cs_chunk_fence fence_chunk;
    enum amd_ip_type ip_type;
+   unsigned queue_index;
 
    /* We flip between these two CS.
    * While one is being consumed
    * by the kernel in another thread, the other one is being filled
@@ -166,6 +167,7 @@ struct amdgpu_fence {
    struct util_queue_fence submitted;
    volatile int signalled; /* bool (int for atomicity) */
+   bool imported;
 };
 
 static inline bool amdgpu_fence_is_syncobj(struct amdgpu_fence *fence)
@@ -242,9 +244,6 @@ amdgpu_bo_is_referenced_by_cs_with_usage(struct amdgpu_cs *cs,
 
 bool amdgpu_fence_wait(struct pipe_fence_handle *fence, uint64_t timeout,
                        bool absolute);
-void amdgpu_add_fences(struct amdgpu_winsys_bo *bo,
-                       unsigned num_fences,
-                       struct pipe_fence_handle **fences);
 void amdgpu_cs_sync_flush(struct radeon_cmdbuf *rcs);
 void amdgpu_cs_init_functions(struct amdgpu_screen_winsys *ws);
 
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c b/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c
index efd9c18c32f..8d3ef782a25 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c
@@ -72,6 +72,11 @@ static void do_winsys_deinit(struct amdgpu_winsys *ws)
    if (ws->reserve_vmid)
       amdgpu_vm_unreserve_vmid(ws->dev, 0);
 
+   for (unsigned i = 0; i < ARRAY_SIZE(ws->queues); i++) {
+      for (unsigned j = 0; j < ARRAY_SIZE(ws->queues[i].fences); j++)
+         amdgpu_fence_reference(&ws->queues[i].fences[j], NULL);
+   }
+
    if (util_queue_is_initialized(&ws->cs_queue))
      util_queue_destroy(&ws->cs_queue);
 
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.h b/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.h
index 4c552461bca..70564e41bbc 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.h
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.h
@@ -62,11 +62,88 @@ struct amdgpu_screen_winsys {
    struct hash_table *kms_handles;
 };
 
+/* At most this number of IBs can be busy per queue. When submitting a new IB and the oldest IB
+ * ("AMDGPU_FENCE_RING_SIZE" IBs ago) is still busy, the CS thread will wait for it and will
+ * also block all queues from submitting new IBs.
+ */
+#define AMDGPU_FENCE_RING_SIZE 32
+
+/* The maximum number of queues that can be present. */
+#define AMDGPU_MAX_QUEUES 6
+
+/* This can be any integer type because the logic handles integer wraparound robustly, but
+ * uint8_t wraps around so quickly that some BOs might never become idle: we don't remove idle
+ * fences from BOs, so a BO can look "busy" again after a queue sequence number wraps around,
+ * and it may then stay "busy" in pb_cache long enough that we run out of memory.
+ */
+typedef uint16_t uint_seq_no;
+
+struct amdgpu_queue {
+   /* Ring buffer of fences.
+    *
+    * We only remember a certain number of the most recent fences per queue. When we add a new
+    * fence, we wait for the oldest one, which implies that all older fences not present
+    * in the ring are idle. This way we don't have to keep track of a million fence references
+    * for a million BOs.
+    *
+    * We only support 1 queue per IP. If an IP has multiple queues, we always add a fence
+    * dependency on the previous fence to make it behave like there is only 1 queue.
+    *
+    * amdgpu_winsys_bo doesn't have a list of fences. It only remembers the last sequence number
+    * for every queue where it was used. We then use the BO's sequence number to look up a fence
+    * in this ring.
+    */
+   struct pipe_fence_handle *fences[AMDGPU_FENCE_RING_SIZE];
+
+   /* The sequence number of the latest fence.
+    *
+    * This sequence number is global per queue per device, shared by all contexts, and generated
+    * by the winsys, not the kernel.
+    *
+    * The latest fence is: fences[latest_seq_no % AMDGPU_FENCE_RING_SIZE]
+    * The oldest fence is: fences[(latest_seq_no + 1) % AMDGPU_FENCE_RING_SIZE]
+    * The oldest sequence number in the ring: latest_seq_no - AMDGPU_FENCE_RING_SIZE + 1
+    *
+    * The sequence number is in the ring if:
+    *    latest_seq_no - buffer_seq_no < AMDGPU_FENCE_RING_SIZE
+    * If the sequence number is not in the ring, it's idle.
+    *
+    * Integer wraparounds of the sequence number behave as follows:
+    *
+    * The comparison above gives the correct answer if buffer_seq_no isn't older than UINT*_MAX.
+    * If it's older than UINT*_MAX but not older than UINT*_MAX + AMDGPU_FENCE_RING_SIZE, we
+    * incorrectly pick and wait for one of the fences in the ring. That's only a problem when
+    * the type is so small (uint8_t) that seq_no wraps around very frequently, causing BOs to
+    * never become idle in certain very unlucky scenarios and making us run out of memory.
+    */
+   uint_seq_no latest_seq_no;
+};
+
+/* This is part of every BO. */
+struct amdgpu_seq_no_fences {
+   /* A fence sequence number per queue. This number is used to look up the fence from
+    * struct amdgpu_queue.
+    *
+    * This sequence number is global per queue per device, shared by all contexts, and generated
+    * by the winsys, not the kernel.
+    */
+   uint_seq_no seq_no[AMDGPU_MAX_QUEUES];
+
+   /* The mask of queues where seq_no[i] is valid. */
+   uint8_t valid_fence_mask;
+};
+
+/* valid_fence_mask should have 1 bit for each queue. */
+static_assert(sizeof(((struct amdgpu_seq_no_fences*)NULL)->valid_fence_mask) * 8 >= AMDGPU_MAX_QUEUES, "");
+
 struct amdgpu_winsys {
    struct pipe_reference reference; /* See comment above */
 
    int fd;
 
+   /* Protected by bo_fence_lock. */
+   struct amdgpu_queue queues[AMDGPU_MAX_QUEUES];
+
    struct pb_cache bo_cache;
    struct pb_slabs bo_slabs; /* Slab allocator. */
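The wraparound caveat documented in the latest_seq_no comment can be checked numerically: a stale per-BO sequence number only looks "in the ring" again after the queue counter advances by a full wrap of the integer type, and then only for a window of AMDGPU_FENCE_RING_SIZE submissions. A small standalone calculation, separate from the patch (looks_in_ring mirrors the in-ring test; RING_SIZE stands in for AMDGPU_FENCE_RING_SIZE):

#include <stdint.h>
#include <stdio.h>

#define RING_SIZE 32
typedef uint16_t uint_seq_no;

static int looks_in_ring(uint_seq_no latest, uint_seq_no bo_seq)
{
   return (uint_seq_no)(latest - bo_seq) < RING_SIZE;
}

int main(void)
{
   uint_seq_no bo_seq = 100; /* BO last used at seq_no 100 and never touched again */

   /* Far from a wrap, the stale number is correctly treated as idle. */
   printf("%d\n", looks_in_ring(40100, bo_seq));                           /* 0 */

   /* After 65536 more submissions the counter revisits the same values, so the stale
    * number is mistaken for a live one, but only for RING_SIZE submissions; with uint8_t
    * this window would recur every 256 submissions, which is why uint16_t is used.
    */
   printf("%d\n", looks_in_ring((uint_seq_no)(100 + 65536 + 10), bo_seq)); /* 1 */
   printf("%d\n", looks_in_ring((uint_seq_no)(100 + 65536 + 40), bo_seq)); /* 0 */
   return 0;
}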