From cd38b3ec5478db0f2ef72eaab2d0c8579a5e98c7 Mon Sep 17 00:00:00 2001 From: Rob Clark Date: Sat, 2 Oct 2021 11:16:18 -0700 Subject: [PATCH] Revert "freedreno: Move the batch cache to the context." This reverts commit b2349a46715e870d7f38ce5f3854bef9718a835c. Closes: https://gitlab.freedesktop.org/mesa/mesa/-/issues/5441 Part-of: --- .../drivers/freedreno/a5xx/fd5_blitter.c | 4 + .../drivers/freedreno/a6xx/fd6_blitter.c | 4 + .../drivers/freedreno/freedreno_batch.c | 204 ++++++------ .../drivers/freedreno/freedreno_batch.h | 44 ++- .../drivers/freedreno/freedreno_batch_cache.c | 291 ++++++++++++------ .../drivers/freedreno/freedreno_batch_cache.h | 36 ++- .../drivers/freedreno/freedreno_context.c | 6 +- .../drivers/freedreno/freedreno_context.h | 3 - .../drivers/freedreno/freedreno_draw.c | 12 + .../drivers/freedreno/freedreno_query_acc.c | 2 + .../drivers/freedreno/freedreno_query_hw.c | 2 +- .../drivers/freedreno/freedreno_resource.c | 138 ++++----- .../drivers/freedreno/freedreno_resource.h | 112 ++++--- .../drivers/freedreno/freedreno_screen.c | 3 + .../drivers/freedreno/freedreno_screen.h | 3 +- .../drivers/freedreno/freedreno_state.c | 13 +- 16 files changed, 555 insertions(+), 322 deletions(-) diff --git a/src/gallium/drivers/freedreno/a5xx/fd5_blitter.c b/src/gallium/drivers/freedreno/a5xx/fd5_blitter.c index f67441a63d4..fe2bba5ef2b 100644 --- a/src/gallium/drivers/freedreno/a5xx/fd5_blitter.c +++ b/src/gallium/drivers/freedreno/a5xx/fd5_blitter.c @@ -446,9 +446,13 @@ fd5_blitter_blit(struct fd_context *ctx, batch = fd_bc_alloc_batch(ctx, true); + fd_screen_lock(ctx->screen); + fd_batch_resource_read(batch, src); fd_batch_resource_write(batch, dst); + fd_screen_unlock(ctx->screen); + DBG_BLIT(info, batch); fd_batch_update_queries(batch); diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_blitter.c b/src/gallium/drivers/freedreno/a6xx/fd6_blitter.c index 84b88714089..c7d893c14f0 100644 --- a/src/gallium/drivers/freedreno/a6xx/fd6_blitter.c +++ b/src/gallium/drivers/freedreno/a6xx/fd6_blitter.c @@ -916,9 +916,13 @@ handle_rgba_blit(struct fd_context *ctx, batch = fd_bc_alloc_batch(ctx, true); + fd_screen_lock(ctx->screen); + fd_batch_resource_read(batch, src); fd_batch_resource_write(batch, dst); + fd_screen_unlock(ctx->screen); + ASSERTED bool ret = fd_batch_lock_submit(batch); assert(ret); diff --git a/src/gallium/drivers/freedreno/freedreno_batch.c b/src/gallium/drivers/freedreno/freedreno_batch.c index dc1a850dc2b..6bc1e06969b 100644 --- a/src/gallium/drivers/freedreno/freedreno_batch.c +++ b/src/gallium/drivers/freedreno/freedreno_batch.c @@ -30,7 +30,6 @@ #include "util/u_string.h" #include "freedreno_batch.h" -#include "freedreno_batch_cache.h" #include "freedreno_context.h" #include "freedreno_fence.h" #include "freedreno_query_hw.h" @@ -139,8 +138,6 @@ fd_batch_create(struct fd_context *ctx, bool nondraw) batch->resources = _mesa_set_create(NULL, _mesa_hash_pointer, _mesa_key_pointer_equal); - batch->dependents = - _mesa_set_create(NULL, _mesa_hash_pointer, _mesa_key_pointer_equal); batch_init(batch); @@ -189,8 +186,6 @@ cleanup_submit(struct fd_batch *batch) fd_submit_del(batch->submit); batch->submit = NULL; - - util_copy_framebuffer_state(&batch->framebuffer, NULL); } static void @@ -232,58 +227,46 @@ batch_fini(struct fd_batch *batch) u_trace_fini(&batch->trace); } -/* Flushes any batches that this batch depends on, recursively. 
*/ static void batch_flush_dependencies(struct fd_batch *batch) assert_dt { - set_foreach (batch->dependents, entry) { - struct fd_batch *dep = (void *)entry->key; + struct fd_batch_cache *cache = &batch->ctx->screen->batch_cache; + struct fd_batch *dep; + + foreach_batch (dep, cache, batch->dependents_mask) { fd_batch_flush(dep); fd_batch_reference(&dep, NULL); } - _mesa_set_clear(batch->dependents, NULL); + + batch->dependents_mask = 0; } static void batch_reset_dependencies(struct fd_batch *batch) { - set_foreach (batch->dependents, entry) { - struct fd_batch *dep = (void *)entry->key; + struct fd_batch_cache *cache = &batch->ctx->screen->batch_cache; + struct fd_batch *dep; + + foreach_batch (dep, cache, batch->dependents_mask) { fd_batch_reference(&dep, NULL); } - _mesa_set_clear(batch->dependents, NULL); + + batch->dependents_mask = 0; } static void batch_reset_resources(struct fd_batch *batch) { - struct fd_batch_cache *cache = &batch->ctx->batch_cache; + fd_screen_assert_locked(batch->ctx->screen); set_foreach (batch->resources, entry) { struct fd_resource *rsc = (struct fd_resource *)entry->key; - struct hash_entry *entry = - _mesa_hash_table_search_pre_hashed(cache->written_resources, rsc->hash, rsc); - if (entry) { - struct fd_batch *write_batch = entry->data; - assert(write_batch == batch); - - struct pipe_resource *table_ref = &rsc->b.b; - pipe_resource_reference(&table_ref, NULL); - - fd_batch_reference(&write_batch, NULL); - _mesa_hash_table_remove(cache->written_resources, entry); - } - - ASSERTED int32_t count = p_atomic_dec_return(&rsc->batch_references); - assert(count >= 0); - - struct pipe_resource *table_ref = &rsc->b.b; - pipe_resource_reference(&table_ref, NULL); + _mesa_set_remove(batch->resources, entry); + debug_assert(rsc->track->batch_mask & (1 << batch->idx)); + rsc->track->batch_mask &= ~(1 << batch->idx); + if (rsc->track->write_batch == batch) + fd_batch_reference_locked(&rsc->track->write_batch, NULL); } - /* Clear at the end so if the batch is reused we get a fully empty set - * rather than having any deleted keys. 
- */ - _mesa_set_clear(batch->resources, NULL); } static void @@ -292,7 +275,11 @@ batch_reset(struct fd_batch *batch) assert_dt DBG("%p", batch); batch_reset_dependencies(batch); + + fd_screen_lock(batch->ctx->screen); batch_reset_resources(batch); + fd_screen_unlock(batch->ctx->screen); + batch_fini(batch); batch_init(batch); } @@ -307,23 +294,30 @@ fd_batch_reset(struct fd_batch *batch) void __fd_batch_destroy(struct fd_batch *batch) { + struct fd_context *ctx = batch->ctx; + DBG("%p", batch); - fd_bc_free_key(batch); + fd_screen_assert_locked(batch->ctx->screen); + + fd_bc_invalidate_batch(batch, true); batch_reset_resources(batch); - assert(batch->resources->entries == 0); + debug_assert(batch->resources->entries == 0); _mesa_set_destroy(batch->resources, NULL); + fd_screen_unlock(ctx->screen); batch_reset_dependencies(batch); - assert(batch->dependents->entries == 0); - _mesa_set_destroy(batch->dependents, NULL); + debug_assert(batch->dependents_mask == 0); + util_copy_framebuffer_state(&batch->framebuffer, NULL); batch_fini(batch); simple_mtx_destroy(&batch->submit_lock); + free(batch->key); free(batch); + fd_screen_lock(ctx->screen); } void @@ -359,12 +353,20 @@ batch_flush(struct fd_batch *batch) assert_dt batch_flush_dependencies(batch); + fd_screen_lock(batch->ctx->screen); batch_reset_resources(batch); - fd_bc_free_key(batch); + /* NOTE: remove=false removes the batch from the hashtable, so future + * lookups won't cache-hit a flushed batch, but leaves the weak reference + * to the batch to avoid having multiple batches with same batch->idx, as + * that causes all sorts of hilarity. + */ + fd_bc_invalidate_batch(batch, false); batch->flushed = true; if (batch == batch->ctx->batch) - fd_batch_reference(&batch->ctx->batch, NULL); + fd_batch_reference_locked(&batch->ctx->batch, NULL); + + fd_screen_unlock(batch->ctx->screen); if (batch->fence) fd_fence_ref(&batch->ctx->last_fence, batch->fence); @@ -384,61 +386,78 @@ fd_batch_flush(struct fd_batch *batch) { struct fd_batch *tmp = NULL; - /* NOTE: Many callers pass in a ctx->batch or fd_bc_writer() batches without - * refcounting, which batch_flush will reset, so we need to hold a ref across - * the body of the flush. 
+ /* NOTE: we need to hold an extra ref across the body of flush, + * since the last ref to this batch could be dropped when cleaning + * up used_resources */ fd_batch_reference(&tmp, batch); batch_flush(tmp); fd_batch_reference(&tmp, NULL); } -static uint32_t ASSERTED -dependents_contains(const struct fd_batch *haystack, const struct fd_batch *needle) +/* find a batches dependents mask, including recursive dependencies: */ +static uint32_t +recursive_dependents_mask(struct fd_batch *batch) { - if (haystack == needle) - return true; + struct fd_batch_cache *cache = &batch->ctx->screen->batch_cache; + struct fd_batch *dep; + uint32_t dependents_mask = batch->dependents_mask; - set_foreach (haystack->dependents, entry) { - if (dependents_contains(entry->key, needle)) - return true; - } + foreach_batch (dep, cache, batch->dependents_mask) + dependents_mask |= recursive_dependents_mask(dep); - return false; + return dependents_mask; } void fd_batch_add_dep(struct fd_batch *batch, struct fd_batch *dep) { - if (_mesa_set_search(batch->dependents, dep)) + fd_screen_assert_locked(batch->ctx->screen); + + if (batch->dependents_mask & (1 << dep->idx)) return; /* a loop should not be possible */ - assert(!dependents_contains(dep, batch)); + debug_assert(!((1 << batch->idx) & recursive_dependents_mask(dep))); - struct fd_batch *table_ref = NULL; - fd_batch_reference(&table_ref, dep); - _mesa_set_add(batch->dependents, dep); + struct fd_batch *other = NULL; + fd_batch_reference_locked(&other, dep); + batch->dependents_mask |= (1 << dep->idx); DBG("%p: added dependency on %p", batch, dep); } +static void +flush_write_batch(struct fd_resource *rsc) assert_dt +{ + struct fd_batch *b = NULL; + fd_batch_reference_locked(&b, rsc->track->write_batch); + + fd_screen_unlock(b->ctx->screen); + fd_batch_flush(b); + fd_screen_lock(b->ctx->screen); + + fd_batch_reference_locked(&b, NULL); +} + static void fd_batch_add_resource(struct fd_batch *batch, struct fd_resource *rsc) { - bool found = false; - _mesa_set_search_or_add_pre_hashed(batch->resources, rsc->hash, rsc, &found); - if (!found) { - struct pipe_resource *table_ref = NULL; - pipe_resource_reference(&table_ref, &rsc->b.b); - p_atomic_inc(&rsc->batch_references); + + if (likely(fd_batch_references_resource(batch, rsc))) { + debug_assert(_mesa_set_search_pre_hashed(batch->resources, rsc->hash, rsc)); + return; } + + debug_assert(!_mesa_set_search(batch->resources, rsc)); + + _mesa_set_add_pre_hashed(batch->resources, rsc->hash, rsc); + rsc->track->batch_mask |= (1 << batch->idx); } void fd_batch_resource_write(struct fd_batch *batch, struct fd_resource *rsc) { - struct fd_context *ctx = batch->ctx; - struct fd_batch_cache *cache = &ctx->batch_cache; + fd_screen_assert_locked(batch->ctx->screen); DBG("%p: write %p", batch, rsc); @@ -447,8 +466,7 @@ fd_batch_resource_write(struct fd_batch *batch, struct fd_resource *rsc) */ rsc->valid = true; - struct fd_batch *prev_writer = fd_bc_writer(ctx, rsc); - if (prev_writer == batch) + if (rsc->track->write_batch == batch) return; fd_batch_write_prep(batch, rsc); @@ -456,21 +474,32 @@ fd_batch_resource_write(struct fd_batch *batch, struct fd_resource *rsc) if (rsc->stencil) fd_batch_resource_write(batch, rsc->stencil); - /* Flush any other batches accessing our resource. Similar to - * fd_bc_flush_readers(). + /* note, invalidate write batch, to avoid further writes to rsc + * resulting in a write-after-read hazard. 
*/ - foreach_batch (reader, cache) { - if (reader == batch || !fd_batch_references(reader, rsc)) - continue; - fd_batch_flush(reader); - } + /* if we are pending read or write by any other batch: */ + if (unlikely(rsc->track->batch_mask & ~(1 << batch->idx))) { + struct fd_batch_cache *cache = &batch->ctx->screen->batch_cache; + struct fd_batch *dep; - struct fd_batch *table_ref = NULL; - fd_batch_reference(&table_ref, batch); - struct pipe_resource *table_rsc_ref = NULL; - pipe_resource_reference(&table_rsc_ref, &rsc->b.b); - _mesa_hash_table_insert_pre_hashed(cache->written_resources, rsc->hash, rsc, - batch); + if (rsc->track->write_batch) + flush_write_batch(rsc); + + foreach_batch (dep, cache, rsc->track->batch_mask) { + struct fd_batch *b = NULL; + if (dep == batch) + continue; + /* note that batch_add_dep could flush and unref dep, so + * we need to hold a reference to keep it live for the + * fd_bc_invalidate_batch() + */ + fd_batch_reference(&b, dep); + fd_batch_add_dep(batch, b); + fd_bc_invalidate_batch(b, false); + fd_batch_reference_locked(&b, NULL); + } + } + fd_batch_reference_locked(&rsc->track->write_batch, batch); fd_batch_add_resource(batch, rsc); } @@ -478,16 +507,19 @@ fd_batch_resource_write(struct fd_batch *batch, struct fd_resource *rsc) void fd_batch_resource_read_slowpath(struct fd_batch *batch, struct fd_resource *rsc) { + fd_screen_assert_locked(batch->ctx->screen); + if (rsc->stencil) fd_batch_resource_read(batch, rsc->stencil); DBG("%p: read %p", batch, rsc); - struct fd_context *ctx = batch->ctx; - - struct fd_batch *writer = fd_bc_writer(ctx, rsc); - if (writer && writer != batch) - fd_batch_flush(writer); + /* If reading a resource pending a write, go ahead and flush the + * writer. This avoids situations where we end up having to + * flush the current batch in _resource_used() + */ + if (unlikely(rsc->track->write_batch && rsc->track->write_batch != batch)) + flush_write_batch(rsc); fd_batch_add_resource(batch, rsc); } diff --git a/src/gallium/drivers/freedreno/freedreno_batch.h b/src/gallium/drivers/freedreno/freedreno_batch.h index c6edab46ef1..7b554c3cceb 100644 --- a/src/gallium/drivers/freedreno/freedreno_batch.h +++ b/src/gallium/drivers/freedreno/freedreno_batch.h @@ -28,7 +28,6 @@ #define FREEDRENO_BATCH_H_ #include "util/list.h" -#include "util/set.h" #include "util/simple_mtx.h" #include "util/u_inlines.h" #include "util/u_queue.h" @@ -245,9 +244,8 @@ struct fd_batch { uint32_t query_tile_stride; /*@}*/ - /* Set of references to resources used by currently-unsubmitted batch (read - * or write). Note that this set may not include all BOs referenced by the - * batch due to fd_bc_resource_invalidate(). + /* Set of resources used by currently-unsubmitted batch (read or + * write).. does not hold a reference to the resource. */ struct set *resources; @@ -256,7 +254,7 @@ struct fd_batch { uint32_t hash; /** set of dependent batches.. holds refs to dependent batches: */ - struct set *dependents; + uint32_t dependents_mask; /* Buffer for tessellation engine input */ @@ -291,11 +289,30 @@ struct fd_batch_key *fd_batch_key_clone(void *mem_ctx, void __fd_batch_describe(char *buf, const struct fd_batch *batch) assert_dt; void __fd_batch_destroy(struct fd_batch *batch); +/* + * NOTE the rule is, you need to hold the screen->lock when destroying + * a batch.. so either use fd_batch_reference() (which grabs the lock + * for you) if you don't hold the lock, or fd_batch_reference_locked() + * if you do hold the lock. 
+ * + * WARNING the _locked() version can briefly drop the lock. Without + * recursive mutexes, I'm not sure there is much else we can do (since + * __fd_batch_destroy() needs to unref resources) + * + * WARNING you must acquire the screen->lock and use the _locked() + * version in case that the batch being ref'd can disappear under + * you. + */ + static inline void -fd_batch_reference(struct fd_batch **ptr, struct fd_batch *batch) +fd_batch_reference_locked(struct fd_batch **ptr, struct fd_batch *batch) { struct fd_batch *old_batch = *ptr; + /* only need lock if a reference is dropped: */ + if (old_batch) + fd_screen_assert_locked(old_batch->ctx->screen); + if (pipe_reference_described( &(*ptr)->reference, &batch->reference, (debug_reference_descriptor)__fd_batch_describe)) @@ -304,6 +321,21 @@ fd_batch_reference(struct fd_batch **ptr, struct fd_batch *batch) *ptr = batch; } +static inline void +fd_batch_reference(struct fd_batch **ptr, struct fd_batch *batch) +{ + struct fd_batch *old_batch = *ptr; + struct fd_context *ctx = old_batch ? old_batch->ctx : NULL; + + if (ctx) + fd_screen_lock(ctx->screen); + + fd_batch_reference_locked(ptr, batch); + + if (ctx) + fd_screen_unlock(ctx->screen); +} + static inline void fd_batch_unlock_submit(struct fd_batch *batch) { diff --git a/src/gallium/drivers/freedreno/freedreno_batch_cache.c b/src/gallium/drivers/freedreno/freedreno_batch_cache.c index d0ef1147eb6..5c31475c223 100644 --- a/src/gallium/drivers/freedreno/freedreno_batch_cache.c +++ b/src/gallium/drivers/freedreno/freedreno_batch_cache.c @@ -129,30 +129,39 @@ fd_batch_key_clone(void *mem_ctx, const struct fd_batch_key *key) } void -fd_bc_init(struct fd_context *ctx) +fd_bc_init(struct fd_batch_cache *cache) { - struct fd_batch_cache *cache = &ctx->batch_cache; - cache->ht = _mesa_hash_table_create(NULL, fd_batch_key_hash, fd_batch_key_equals); - cache->written_resources = _mesa_pointer_hash_table_create(NULL); } void -fd_bc_fini(struct fd_context *ctx) assert_dt +fd_bc_fini(struct fd_batch_cache *cache) { - struct fd_batch_cache *cache = &ctx->batch_cache; - - fd_bc_flush(ctx, false); _mesa_hash_table_destroy(cache->ht, NULL); - _mesa_hash_table_destroy(cache->written_resources, NULL); } /* Flushes all batches in the batch cache. Used at glFlush() and similar times. */ void fd_bc_flush(struct fd_context *ctx, bool deferred) assert_dt { - struct fd_batch_cache *cache = &ctx->batch_cache; + struct fd_batch_cache *cache = &ctx->screen->batch_cache; + + /* fd_batch_flush() (and fd_batch_add_dep() which calls it indirectly) + * can cause batches to be unref'd and freed under our feet, so grab + * a reference to all the batches we need up-front. 
+ */ + struct fd_batch *batches[ARRAY_SIZE(cache->batches)] = {0}; + struct fd_batch *batch; + unsigned n = 0; + + fd_screen_lock(ctx->screen); + + foreach_batch (batch, cache, cache->batch_mask) { + if (batch->ctx == ctx) { + fd_batch_reference_locked(&batches[n++], batch); + } + } /* deferred flush doesn't actually flush, but it marks every other * batch associated with the context as dependent on the current @@ -162,170 +171,245 @@ fd_bc_flush(struct fd_context *ctx, bool deferred) assert_dt if (deferred) { struct fd_batch *current_batch = fd_context_batch(ctx); - foreach_batch (batch, cache) { - if (batch != current_batch) - fd_batch_add_dep(current_batch, batch); + for (unsigned i = 0; i < n; i++) { + if (batches[i] && (batches[i]->ctx == ctx) && + (batches[i] != current_batch)) { + fd_batch_add_dep(current_batch, batches[i]); + } } - fd_batch_reference(¤t_batch, NULL); + + fd_batch_reference_locked(¤t_batch, NULL); + + fd_screen_unlock(ctx->screen); } else { - foreach_batch (batch, cache) { - fd_batch_flush(batch); + fd_screen_unlock(ctx->screen); + + for (unsigned i = 0; i < n; i++) { + fd_batch_flush(batches[i]); } } + + for (unsigned i = 0; i < n; i++) { + fd_batch_reference(&batches[i], NULL); + } } /** - * Flushes the batch (if any) writing this resource. + * Flushes the batch (if any) writing this resource. Must not hold the screen + * lock. */ void fd_bc_flush_writer(struct fd_context *ctx, struct fd_resource *rsc) assert_dt { - struct fd_batch_cache *cache = &ctx->batch_cache; - struct hash_entry *entry = - _mesa_hash_table_search_pre_hashed(cache->written_resources, rsc->hash, rsc); - if (entry) { - fd_batch_flush(entry->data); + fd_screen_lock(ctx->screen); + struct fd_batch *write_batch = NULL; + fd_batch_reference_locked(&write_batch, rsc->track->write_batch); + fd_screen_unlock(ctx->screen); + + if (write_batch) { + fd_batch_flush(write_batch); + fd_batch_reference(&write_batch, NULL); } } /** - * Flushes any batches reading this resource. + * Flushes any batches reading this resource. Must not hold the screen lock. */ void fd_bc_flush_readers(struct fd_context *ctx, struct fd_resource *rsc) assert_dt { - foreach_batch (batch, &ctx->batch_cache) { - if (fd_batch_references(batch, rsc)) - fd_batch_flush(batch); - } -} + struct fd_batch *batch, *batches[32] = {}; + uint32_t batch_count = 0; -/** - * Flushes any batches accessing this resource as part of the gmem key. - * - * Used in resource shadowing. - */ -void -fd_bc_flush_gmem_users(struct fd_context *ctx, struct fd_resource *rsc) -{ - foreach_batch (batch, &ctx->batch_cache) { - if (!batch->key) - continue; - for (int i = 0; i < batch->key->num_surfs; i++) { - if (batch->key->surf[i].texture == &rsc->b.b) { - struct fd_batch *tmp = NULL; - fd_batch_reference(&tmp, batch); - fd_batch_flush(batch); - fd_batch_reference(&tmp, NULL); - break; - } - } + /* This is a bit awkward, probably a fd_batch_flush_locked() + * would make things simpler.. but we need to hold the lock + * to iterate the batches which reference this resource. So + * we must first grab references under a lock, then flush. + */ + fd_screen_lock(ctx->screen); + foreach_batch (batch, &ctx->screen->batch_cache, rsc->track->batch_mask) + fd_batch_reference_locked(&batches[batch_count++], batch); + fd_screen_unlock(ctx->screen); + + for (int i = 0; i < batch_count; i++) { + fd_batch_flush(batches[i]); + fd_batch_reference(&batches[i], NULL); } } void fd_bc_dump(struct fd_context *ctx, const char *fmt, ...) 
{ - struct fd_batch_cache *cache = &ctx->batch_cache; + struct fd_batch_cache *cache = &ctx->screen->batch_cache; if (!FD_DBG(MSGS)) return; + fd_screen_lock(ctx->screen); + va_list ap; va_start(ap, fmt); vprintf(fmt, ap); va_end(ap); - foreach_batch (batch, cache) { - printf(" %p<%u>%s\n", batch, batch->seqno, - batch->needs_flush ? ", NEEDS FLUSH" : ""); + for (int i = 0; i < ARRAY_SIZE(cache->batches); i++) { + struct fd_batch *batch = cache->batches[i]; + if (batch) { + printf(" %p<%u>%s\n", batch, batch->seqno, + batch->needs_flush ? ", NEEDS FLUSH" : ""); + } } printf("----\n"); + + fd_screen_unlock(ctx->screen); } -/* Removes a batch from the batch cache (typically after a flush) */ +/** + * Note that when batch is flushed, it needs to remain in the cache so + * that fd_bc_invalidate_resource() can work.. otherwise we can have + * the case where a rsc is destroyed while a batch still has a dangling + * reference to it. + * + * Note that the cmdstream (or, after the SUBMIT ioctl, the kernel) + * would have a reference to the underlying bo, so it is ok for the + * rsc to be destroyed before the batch. + */ void -fd_bc_free_key(struct fd_batch *batch) +fd_bc_invalidate_batch(struct fd_batch *batch, bool remove) { if (!batch) return; - struct fd_context *ctx = batch->ctx; - struct fd_batch_cache *cache = &ctx->batch_cache; + struct fd_batch_cache *cache = &batch->ctx->screen->batch_cache; + struct fd_batch_key *key = batch->key; - if (batch->key) { - _mesa_hash_table_remove_key(cache->ht, batch->key); - free((void *)batch->key); - batch->key = NULL; + fd_screen_assert_locked(batch->ctx->screen); + + if (remove) { + cache->batches[batch->idx] = NULL; + cache->batch_mask &= ~(1 << batch->idx); } -} -/* Called when the resource is has had its underlying storage replaced, so - * previous batch references to it are no longer relevant for flushing access to - * that storage. 
- */ -void -fd_bc_invalidate_resource(struct fd_context *ctx, struct fd_resource *rsc) -{ - foreach_batch (batch, &ctx->batch_cache) { - struct set_entry *entry = _mesa_set_search_pre_hashed(batch->resources, rsc->hash, rsc); - if (entry) { - struct pipe_resource *table_ref = &rsc->b.b; - pipe_resource_reference(&table_ref, NULL); + if (!key) + return; - ASSERTED int32_t count = p_atomic_dec_return(&rsc->batch_references); - assert(count >= 0); - - _mesa_set_remove(batch->resources, entry); - } + DBG("%p: key=%p", batch, batch->key); + for (unsigned idx = 0; idx < key->num_surfs; idx++) { + struct fd_resource *rsc = fd_resource(key->surf[idx].texture); + rsc->track->bc_batch_mask &= ~(1 << batch->idx); } struct hash_entry *entry = - _mesa_hash_table_search(ctx->batch_cache.written_resources, rsc); - if (entry) { - struct fd_batch *batch = entry->data; - struct pipe_resource *table_ref = &rsc->b.b; - pipe_resource_reference(&table_ref, NULL); - _mesa_hash_table_remove(ctx->batch_cache.written_resources, entry); - fd_batch_reference(&batch, NULL); + _mesa_hash_table_search_pre_hashed(cache->ht, batch->hash, key); + _mesa_hash_table_remove(cache->ht, entry); +} + +void +fd_bc_invalidate_resource(struct fd_resource *rsc, bool destroy) +{ + struct fd_screen *screen = fd_screen(rsc->b.b.screen); + struct fd_batch *batch; + + fd_screen_lock(screen); + + if (destroy) { + foreach_batch (batch, &screen->batch_cache, rsc->track->batch_mask) { + struct set_entry *entry = _mesa_set_search_pre_hashed(batch->resources, rsc->hash, rsc); + _mesa_set_remove(batch->resources, entry); + } + rsc->track->batch_mask = 0; + + fd_batch_reference_locked(&rsc->track->write_batch, NULL); } + + foreach_batch (batch, &screen->batch_cache, rsc->track->bc_batch_mask) + fd_bc_invalidate_batch(batch, false); + + rsc->track->bc_batch_mask = 0; + + fd_screen_unlock(screen); } static struct fd_batch * -alloc_batch(struct fd_batch_cache *cache, struct fd_context *ctx, bool nondraw) assert_dt +alloc_batch_locked(struct fd_batch_cache *cache, struct fd_context *ctx, + bool nondraw) assert_dt { struct fd_batch *batch; + uint32_t idx; - if (cache->ht->entries >= 32) { + fd_screen_assert_locked(ctx->screen); + + while ((idx = ffs(~cache->batch_mask)) == 0) { +#if 0 + for (unsigned i = 0; i < ARRAY_SIZE(cache->batches); i++) { + batch = cache->batches[i]; + debug_printf("%d: needs_flush=%d, depends:", batch->idx, batch->needs_flush); + set_foreach (batch->dependencies, entry) { + struct fd_batch *dep = (struct fd_batch *)entry->key; + debug_printf(" %d", dep->idx); + } + debug_printf("\n"); + } +#endif /* TODO: is LRU the better policy? Or perhaps the batch that * depends on the fewest other batches? */ struct fd_batch *flush_batch = NULL; - foreach_batch (batch, cache) { - if (!flush_batch || (batch->seqno < flush_batch->seqno)) - fd_batch_reference(&flush_batch, batch); + for (unsigned i = 0; i < ARRAY_SIZE(cache->batches); i++) { + if (!flush_batch || (cache->batches[i]->seqno < flush_batch->seqno)) + fd_batch_reference_locked(&flush_batch, cache->batches[i]); } + /* we can drop lock temporarily here, since we hold a ref, + * flush_batch won't disappear under us. + */ + fd_screen_unlock(ctx->screen); DBG("%p: too many batches! 
flush forced!", flush_batch); fd_batch_flush(flush_batch); + fd_screen_lock(ctx->screen); - fd_batch_reference(&flush_batch, NULL); + /* While the resources get cleaned up automatically, the flush_batch + * doesn't get removed from the dependencies of other batches, so + * it won't be unref'd and will remain in the table. + * + * TODO maybe keep a bitmask of batches that depend on me, to make + * this easier: + */ + for (unsigned i = 0; i < ARRAY_SIZE(cache->batches); i++) { + struct fd_batch *other = cache->batches[i]; + if (!other) + continue; + if (other->dependents_mask & (1 << flush_batch->idx)) { + other->dependents_mask &= ~(1 << flush_batch->idx); + struct fd_batch *ref = flush_batch; + fd_batch_reference_locked(&ref, NULL); + } + } + + fd_batch_reference_locked(&flush_batch, NULL); } + idx--; /* bit zero returns 1 for ffs() */ + batch = fd_batch_create(ctx, nondraw); if (!batch) return NULL; batch->seqno = cache->cnt++; + batch->idx = idx; + cache->batch_mask |= (1 << idx); + + debug_assert(cache->batches[idx] == NULL); + cache->batches[idx] = batch; return batch; } struct fd_batch * -fd_bc_alloc_batch(struct fd_context *ctx, bool nondraw) assert_dt +fd_bc_alloc_batch(struct fd_context *ctx, bool nondraw) { - struct fd_batch_cache *cache = &ctx->batch_cache; + struct fd_batch_cache *cache = &ctx->screen->batch_cache; struct fd_batch *batch; /* For normal draw batches, pctx->set_framebuffer_state() handles @@ -335,7 +419,9 @@ fd_bc_alloc_batch(struct fd_context *ctx, bool nondraw) assert_dt if (nondraw) fd_context_switch_from(ctx); - batch = alloc_batch(cache, ctx, nondraw); + fd_screen_lock(ctx->screen); + batch = alloc_batch_locked(cache, ctx, nondraw); + fd_screen_unlock(ctx->screen); if (batch && nondraw) fd_context_switch_to(ctx, batch); @@ -346,7 +432,7 @@ fd_bc_alloc_batch(struct fd_context *ctx, bool nondraw) assert_dt static struct fd_batch * batch_from_key(struct fd_context *ctx, struct fd_batch_key *key) assert_dt { - struct fd_batch_cache *cache = &ctx->batch_cache; + struct fd_batch_cache *cache = &ctx->screen->batch_cache; struct fd_batch *batch = NULL; uint32_t hash = fd_batch_key_hash(key); struct hash_entry *entry = @@ -354,12 +440,12 @@ batch_from_key(struct fd_context *ctx, struct fd_batch_key *key) assert_dt if (entry) { free(key); - fd_batch_reference(&batch, (struct fd_batch *)entry->data); + fd_batch_reference_locked(&batch, (struct fd_batch *)entry->data); assert(!batch->flushed); return batch; } - batch = alloc_batch(cache, ctx, false); + batch = alloc_batch_locked(cache, ctx, false); #ifdef DEBUG DBG("%p: hash=0x%08x, %ux%u, %u layers, %u samples", batch, hash, key->width, key->height, key->layers, key->samples); @@ -387,6 +473,11 @@ batch_from_key(struct fd_context *ctx, struct fd_batch_key *key) assert_dt batch->key = key; batch->hash = hash; + for (unsigned idx = 0; idx < key->num_surfs; idx++) { + struct fd_resource *rsc = fd_resource(key->surf[idx].texture); + rsc->track->bc_batch_mask = (1 << batch->idx); + } + return batch; } @@ -423,5 +514,9 @@ fd_batch_from_fb(struct fd_context *ctx, key->num_surfs = idx; - return batch_from_key(ctx, key); + fd_screen_lock(ctx->screen); + struct fd_batch *batch = batch_from_key(ctx, key); + fd_screen_unlock(ctx->screen); + + return batch; } diff --git a/src/gallium/drivers/freedreno/freedreno_batch_cache.h b/src/gallium/drivers/freedreno/freedreno_batch_cache.h index cce2b7aaf9c..4b2737bac80 100644 --- a/src/gallium/drivers/freedreno/freedreno_batch_cache.h +++ 
b/src/gallium/drivers/freedreno/freedreno_batch_cache.h @@ -28,7 +28,6 @@ #define FREEDRENO_BATCH_CACHE_H_ #include "pipe/p_state.h" -#include "util/hash_table.h" #include "freedreno_util.h" @@ -40,32 +39,41 @@ struct fd_screen; struct hash_table; struct fd_batch_cache { - /* Mapping from struct key (basically framebuffer state) to a batch of - * rendering for rendering to that target. - */ struct hash_table *ht; unsigned cnt; - /* Mapping from struct pipe_resource to the batch writing it. Keeps a reference. */ - struct hash_table *written_resources; + /* set of active batches.. there is an upper limit on the number of + * in-flight batches, for two reasons: + * 1) to avoid big spikes in number of batches in edge cases, such as + * game startup (ie, lots of texture uploads, but no usages yet of + * the textures), etc. + * 2) so we can use a simple bitmask in fd_resource to track which + * batches have reference to the resource + */ + struct fd_batch *batches[32]; + uint32_t batch_mask; }; -#define foreach_batch(batch, cache) \ - hash_table_foreach ((cache)->ht, _batch_entry) \ - for (struct fd_batch *batch = _batch_entry->data; batch != NULL; batch = NULL) +/* note: if batches get unref'd in the body of the loop, they are removed + * from the various masks.. but since we copy the mask at the beginning of + * the loop into _m, we need the &= at the end of the loop to make sure + * we don't have stale bits in _m + */ +#define foreach_batch(batch, cache, mask) \ + for (uint32_t _m = (mask); \ + _m && ((batch) = (cache)->batches[u_bit_scan(&_m)]); _m &= (mask)) -void fd_bc_init(struct fd_context *ctx); -void fd_bc_fini(struct fd_context *ctx); +void fd_bc_init(struct fd_batch_cache *cache); +void fd_bc_fini(struct fd_batch_cache *cache); void fd_bc_flush(struct fd_context *ctx, bool deferred) assert_dt; void fd_bc_flush_writer(struct fd_context *ctx, struct fd_resource *rsc) assert_dt; void fd_bc_flush_readers(struct fd_context *ctx, struct fd_resource *rsc) assert_dt; -void fd_bc_flush_gmem_users(struct fd_context *ctx, struct fd_resource *rsc) assert_dt; void fd_bc_dump(struct fd_context *ctx, const char *fmt, ...) _util_printf_format(2, 3); -void fd_bc_free_key(struct fd_batch *batch); -void fd_bc_invalidate_resource(struct fd_context *ctx, struct fd_resource *rsc); +void fd_bc_invalidate_batch(struct fd_batch *batch, bool destroy); +void fd_bc_invalidate_resource(struct fd_resource *rsc, bool destroy); struct fd_batch *fd_bc_alloc_batch(struct fd_context *ctx, bool nondraw) assert_dt; diff --git a/src/gallium/drivers/freedreno/freedreno_context.c b/src/gallium/drivers/freedreno/freedreno_context.c index 7587b35a507..43ab14f8a8c 100644 --- a/src/gallium/drivers/freedreno/freedreno_context.c +++ b/src/gallium/drivers/freedreno/freedreno_context.c @@ -353,7 +353,9 @@ fd_context_destroy(struct pipe_context *pctx) util_copy_framebuffer_state(&ctx->framebuffer, NULL); fd_batch_reference(&ctx->batch, NULL); /* unref current batch */ - fd_bc_fini(ctx); + + /* Make sure nothing in the batch cache references our context any more. 
*/ + fd_bc_flush(ctx, false); fd_prog_fini(pctx); @@ -640,8 +642,6 @@ fd_context_init(struct fd_context *ctx, struct pipe_screen *pscreen, slab_create_child(&ctx->transfer_pool, &screen->transfer_pool); slab_create_child(&ctx->transfer_pool_unsync, &screen->transfer_pool); - fd_bc_init(ctx); - fd_draw_init(pctx); fd_resource_context_init(pctx); fd_query_context_init(pctx); diff --git a/src/gallium/drivers/freedreno/freedreno_context.h b/src/gallium/drivers/freedreno/freedreno_context.h index 67bdba50718..9da86bf27d2 100644 --- a/src/gallium/drivers/freedreno/freedreno_context.h +++ b/src/gallium/drivers/freedreno/freedreno_context.h @@ -37,7 +37,6 @@ #include "util/perf/u_trace.h" #include "freedreno_autotune.h" -#include "freedreno_batch_cache.h" #include "freedreno_gmem.h" #include "freedreno_perfetto.h" #include "freedreno_screen.h" @@ -213,8 +212,6 @@ struct fd_context { */ simple_mtx_t gmem_lock; - struct fd_batch_cache batch_cache; - struct fd_device *dev; struct fd_screen *screen; struct fd_pipe *pipe; diff --git a/src/gallium/drivers/freedreno/freedreno_draw.c b/src/gallium/drivers/freedreno/freedreno_draw.c index e5d430182be..baa5f2e5530 100644 --- a/src/gallium/drivers/freedreno/freedreno_draw.c +++ b/src/gallium/drivers/freedreno/freedreno_draw.c @@ -197,6 +197,8 @@ batch_draw_tracking(struct fd_batch *batch, const struct pipe_draw_info *info, * Figure out the buffers/features we need: */ + fd_screen_lock(ctx->screen); + if (ctx->dirty & FD_DIRTY_RESOURCE) batch_draw_tracking_for_dirty_bits(batch); @@ -218,6 +220,8 @@ batch_draw_tracking(struct fd_batch *batch, const struct pipe_draw_info *info, list_for_each_entry (struct fd_acc_query, aq, &ctx->acc_active_queries, node) resource_written(batch, aq->prsc); + + fd_screen_unlock(ctx->screen); } static void @@ -395,6 +399,8 @@ batch_clear_tracking(struct fd_batch *batch, unsigned buffers) assert_dt batch->resolve |= buffers; + fd_screen_lock(ctx->screen); + if (buffers & PIPE_CLEAR_COLOR) for (unsigned i = 0; i < pfb->nr_cbufs; i++) if (buffers & (PIPE_CLEAR_COLOR0 << i)) @@ -409,6 +415,8 @@ batch_clear_tracking(struct fd_batch *batch, unsigned buffers) assert_dt list_for_each_entry (struct fd_acc_query, aq, &ctx->acc_active_queries, node) resource_written(batch, aq->prsc); + + fd_screen_unlock(ctx->screen); } static void @@ -511,6 +519,8 @@ fd_launch_grid(struct pipe_context *pctx, fd_batch_reference(&ctx->batch, batch); fd_context_all_dirty(ctx); + fd_screen_lock(ctx->screen); + /* Mark SSBOs */ u_foreach_bit (i, so->enabled_mask & so->writable_mask) resource_written(batch, so->sb[i].buffer); @@ -543,6 +553,8 @@ fd_launch_grid(struct pipe_context *pctx, if (info->indirect) resource_read(batch, info->indirect); + fd_screen_unlock(ctx->screen); + DBG("%p: work_dim=%u, block=%ux%ux%u, grid=%ux%ux%u", batch, info->work_dim, info->block[0], info->block[1], info->block[2], diff --git a/src/gallium/drivers/freedreno/freedreno_query_acc.c b/src/gallium/drivers/freedreno/freedreno_query_acc.c index 518f792425a..74da4ce8ab2 100644 --- a/src/gallium/drivers/freedreno/freedreno_query_acc.c +++ b/src/gallium/drivers/freedreno/freedreno_query_acc.c @@ -89,7 +89,9 @@ fd_acc_query_resume(struct fd_acc_query *aq, struct fd_batch *batch) assert_dt fd_batch_needs_flush(aq->batch); p->resume(aq, aq->batch); + fd_screen_lock(batch->ctx->screen); fd_batch_resource_write(batch, fd_resource(aq->prsc)); + fd_screen_unlock(batch->ctx->screen); } static void diff --git a/src/gallium/drivers/freedreno/freedreno_query_hw.c 
b/src/gallium/drivers/freedreno/freedreno_query_hw.c index 5a2ed47a837..560cf1bba39 100644 --- a/src/gallium/drivers/freedreno/freedreno_query_hw.c +++ b/src/gallium/drivers/freedreno/freedreno_query_hw.c @@ -340,7 +340,7 @@ fd_hw_query_prepare(struct fd_batch *batch, uint32_t num_tiles) uint32_t tile_stride = batch->next_sample_offset; if (tile_stride > 0) - fd_resource_resize(batch->ctx, batch->query_buf, tile_stride * num_tiles); + fd_resource_resize(batch->query_buf, tile_stride * num_tiles); batch->query_tile_stride = tile_stride; diff --git a/src/gallium/drivers/freedreno/freedreno_resource.c b/src/gallium/drivers/freedreno/freedreno_resource.c index f09d194ea2f..add830d9c12 100644 --- a/src/gallium/drivers/freedreno/freedreno_resource.c +++ b/src/gallium/drivers/freedreno/freedreno_resource.c @@ -191,7 +191,7 @@ __fd_resource_wait(struct fd_context *ctx, struct fd_resource *rsc, unsigned op, } static void -realloc_bo(struct fd_context *ctx, struct fd_resource *rsc, uint32_t size) +realloc_bo(struct fd_resource *rsc, uint32_t size) { struct pipe_resource *prsc = &rsc->b.b; struct fd_screen *screen = fd_screen(rsc->b.b.screen); @@ -224,8 +224,7 @@ realloc_bo(struct fd_context *ctx, struct fd_resource *rsc, uint32_t size) } util_range_set_empty(&rsc->valid_buffer_range); - if (ctx) - fd_bc_invalidate_resource(ctx, rsc); + fd_bc_invalidate_resource(rsc, true); } static void @@ -269,20 +268,20 @@ fd_replace_buffer_storage(struct pipe_context *pctx, struct pipe_resource *pdst, */ assert(pdst->target == PIPE_BUFFER); assert(psrc->target == PIPE_BUFFER); -#ifndef NDEBUG - foreach_batch (batch, &ctx->batch_cache) { - assert(!fd_batch_references(batch, src)); - } -#endif + assert(dst->track->bc_batch_mask == 0); + assert(src->track->bc_batch_mask == 0); + assert(src->track->batch_mask == 0); + assert(src->track->write_batch == NULL); assert(memcmp(&dst->layout, &src->layout, sizeof(dst->layout)) == 0); - /* get rid of any references that batch-cache might have to us. + /* get rid of any references that batch-cache might have to us (which + * should empty/destroy rsc->batches hashset) * * Note that we aren't actually destroying dst, but we are replacing * it's storage so we want to go thru the same motions of decoupling * it's batch connections. */ - fd_bc_invalidate_resource(ctx, dst); + fd_bc_invalidate_resource(dst, true); rebind_resource(dst); util_idalloc_mt_free(&ctx->screen->buffer_ids, delete_buffer_id); @@ -292,6 +291,7 @@ fd_replace_buffer_storage(struct pipe_context *pctx, struct pipe_resource *pdst, fd_bo_del(dst->bo); dst->bo = fd_bo_ref(src->bo); + fd_resource_tracking_reference(&dst->track, src->track); src->is_replacement = true; dst->seqno = p_atomic_inc_return(&ctx->screen->rsc_seqno); @@ -319,11 +319,7 @@ fd_resource_busy(struct pipe_screen *pscreen, struct pipe_resource *prsc, { struct fd_resource *rsc = fd_resource(prsc); - /* Note that while TC *can* ask us about busy for MAP_READ to promote to an - * unsync map, it's not a case we have ever seen a need to optimize for, and - * being conservative in resource_busy() is safe. 
- */ - if (p_atomic_read(&rsc->batch_references) != 0) + if (pending(rsc, !!(usage & PIPE_MAP_WRITE))) return true; if (resource_busy(rsc, translate_usage(usage))) @@ -362,6 +358,8 @@ fd_try_shadow_resource(struct fd_context *ctx, struct fd_resource *rsc, { struct pipe_context *pctx = &ctx->base; struct pipe_resource *prsc = &rsc->b.b; + struct fd_screen *screen = fd_screen(pctx->screen); + struct fd_batch *batch; bool fallback = false; if (prsc->next) @@ -382,7 +380,9 @@ fd_try_shadow_resource(struct fd_context *ctx, struct fd_resource *rsc, * Note that being in the gmem key doesn't necessarily mean the * batch was considered a writer! */ - fd_bc_flush_gmem_users(ctx, rsc); + foreach_batch (batch, &screen->batch_cache, rsc->track->bc_batch_mask) { + fd_batch_flush(batch); + } /* TODO: somehow munge dimensions and format to copy unsupported * render target format to something that is supported? @@ -415,11 +415,14 @@ fd_try_shadow_resource(struct fd_context *ctx, struct fd_resource *rsc, assert(!ctx->in_shadow); ctx->in_shadow = true; - /* Signal that any HW state pointing at this resource needs to be - * re-emitted. + /* get rid of any references that batch-cache might have to us (which + * should empty/destroy rsc->batches hashset) */ + fd_bc_invalidate_resource(rsc, false); rebind_resource(rsc); + fd_screen_lock(ctx->screen); + /* Swap the backing bo's, so shadow becomes the old buffer, * blit from shadow to new buffer. From here on out, we * cannot fail. @@ -430,8 +433,8 @@ fd_try_shadow_resource(struct fd_context *ctx, struct fd_resource *rsc, */ struct fd_resource *shadow = fd_resource(pshadow); - DBG("shadow: %p (%d) -> %p (%d)", rsc, rsc->b.b.reference.count, - shadow, shadow->b.b.reference.count); + DBG("shadow: %p (%d, %p) -> %p (%d, %p)", rsc, rsc->b.b.reference.count, + rsc->track, shadow, shadow->b.b.reference.count, shadow->track); swap(rsc->bo, shadow->bo); swap(rsc->valid, shadow->valid); @@ -444,27 +447,19 @@ fd_try_shadow_resource(struct fd_context *ctx, struct fd_resource *rsc, swap(rsc->layout, shadow->layout); rsc->seqno = p_atomic_inc_return(&ctx->screen->rsc_seqno); - /* At this point, the newly created shadow buffer is not referenced by any - * batches, but the existing rsc may be as a reader (we flushed writers - * above). We want to remove the old reader references to rsc so that we - * don't cause unnecessary synchronization on the write we're about to emit, - * but we don't need to bother with transferring the references over to - * shadow because right after our blit (which is also doing a read so it only - * cares about the lack of a writer) shadow will be freed. + /* at this point, the newly created shadow buffer is not referenced + * by any batches, but the existing rsc (probably) is. 
We need to + * transfer those references over: */ - assert(!pending(ctx, shadow, true)); - foreach_batch (batch, &ctx->batch_cache) { + debug_assert(shadow->track->batch_mask == 0); + foreach_batch (batch, &ctx->screen->batch_cache, rsc->track->batch_mask) { struct set_entry *entry = _mesa_set_search_pre_hashed(batch->resources, rsc->hash, rsc); - if (entry) { - struct pipe_resource *table_ref = &rsc->b.b; - pipe_resource_reference(&table_ref, NULL); - - ASSERTED int32_t count = p_atomic_dec_return(&rsc->batch_references); - assert(count >= 0); - - _mesa_set_remove(batch->resources, entry); - } + _mesa_set_remove(batch->resources, entry); + _mesa_set_add_pre_hashed(batch->resources, shadow->hash, shadow); } + swap(rsc->track, shadow->track); + + fd_screen_unlock(ctx->screen); struct pipe_blit_info blit = {}; blit.dst.resource = prsc; @@ -731,14 +726,14 @@ fd_resource_transfer_unmap(struct pipe_context *pctx, } static void -invalidate_resource(struct fd_context *ctx, struct fd_resource *rsc, unsigned usage) assert_dt +invalidate_resource(struct fd_resource *rsc, unsigned usage) assert_dt { - bool needs_flush = pending(ctx, rsc, !!(usage & PIPE_MAP_WRITE)); + bool needs_flush = pending(rsc, !!(usage & PIPE_MAP_WRITE)); unsigned op = translate_usage(usage); if (needs_flush || resource_busy(rsc, op)) { rebind_resource(rsc); - realloc_bo(ctx, rsc, fd_bo_size(rsc->bo)); + realloc_bo(rsc, fd_bo_size(rsc->bo)); } else { util_range_set_empty(&rsc->valid_buffer_range); } @@ -830,10 +825,10 @@ resource_transfer_map(struct pipe_context *pctx, struct pipe_resource *prsc, } if (usage & PIPE_MAP_DISCARD_WHOLE_RESOURCE) { - invalidate_resource(ctx, rsc, usage); + invalidate_resource(rsc, usage); } else { unsigned op = translate_usage(usage); - bool needs_flush = pending(ctx, rsc, !!(usage & PIPE_MAP_WRITE)); + bool needs_flush = pending(rsc, !!(usage & PIPE_MAP_WRITE)); /* If the GPU is writing to the resource, or if it is reading from the * resource and we're trying to write to it, flush the renders. @@ -1004,13 +999,8 @@ fd_resource_destroy(struct pipe_screen *pscreen, struct pipe_resource *prsc) struct fd_screen *screen = fd_screen(prsc->screen); struct fd_resource *rsc = fd_resource(prsc); - /* Note that a destroyed resource may be present in an outstanding - * batch->resources (not as a batch_cache->written_resource, since those - * have references to resources). However, all that means is that we might - * miss an opportunity to reorder a batch by spuriously detecting a newly - * created resource as still in use and flush the old reader. - */ - + if (!rsc->is_replacement) + fd_bc_invalidate_resource(rsc, true); if (rsc->bo) fd_bo_del(rsc->bo); if (rsc->lrz) @@ -1025,6 +1015,7 @@ fd_resource_destroy(struct pipe_screen *pscreen, struct pipe_resource *prsc) util_range_destroy(&rsc->valid_buffer_range); simple_mtx_destroy(&rsc->lock); + fd_resource_tracking_reference(&rsc->track, NULL); FREE(rsc); } @@ -1061,7 +1052,7 @@ fd_resource_get_handle(struct pipe_screen *pscreen, struct pipe_context *pctx, /* special case to resize query buf after allocated.. 
*/ void -fd_resource_resize(struct fd_context *ctx, struct pipe_resource *prsc, uint32_t sz) +fd_resource_resize(struct pipe_resource *prsc, uint32_t sz) { struct fd_resource *rsc = fd_resource(prsc); @@ -1070,7 +1061,7 @@ fd_resource_resize(struct fd_context *ctx, struct pipe_resource *prsc, uint32_t debug_assert(prsc->bind == PIPE_BIND_QUERY_BUFFER); prsc->width0 = sz; - realloc_bo(ctx, rsc, fd_screen(prsc->screen)->setup_slices(rsc)); + realloc_bo(rsc, fd_screen(prsc->screen)->setup_slices(rsc)); } static void @@ -1110,6 +1101,14 @@ alloc_resource_struct(struct pipe_screen *pscreen, util_range_init(&rsc->valid_buffer_range); simple_mtx_init(&rsc->lock, mtx_plain); + rsc->track = CALLOC_STRUCT(fd_resource_tracking); + if (!rsc->track) { + free(rsc); + return NULL; + } + + pipe_reference_init(&rsc->track->reference, 1); + threaded_resource_init(prsc); if (tmpl->target == PIPE_BUFFER) @@ -1333,7 +1332,7 @@ fd_resource_create_with_modifiers(struct pipe_screen *pscreen, return NULL; rsc = fd_resource(prsc); - realloc_bo(NULL, rsc, size); + realloc_bo(rsc, size); if (!rsc->bo) goto fail; @@ -1457,25 +1456,24 @@ fd_invalidate_resource(struct pipe_context *pctx, if (prsc->target == PIPE_BUFFER) { /* Handle the glInvalidateBufferData() case: */ - invalidate_resource(ctx, rsc, PIPE_MAP_READ | PIPE_MAP_WRITE); - } else { - struct fd_batch *batch = fd_bc_writer(ctx, rsc); - if (batch) { - /* Handle the glInvalidateFramebuffer() case, telling us that - * we can skip resolve. - */ - struct pipe_framebuffer_state *pfb = &batch->framebuffer; + invalidate_resource(rsc, PIPE_MAP_READ | PIPE_MAP_WRITE); + } else if (rsc->track->write_batch) { + /* Handle the glInvalidateFramebuffer() case, telling us that + * we can skip resolve. + */ - if (pfb->zsbuf && pfb->zsbuf->texture == prsc) { - batch->resolve &= ~(FD_BUFFER_DEPTH | FD_BUFFER_STENCIL); - fd_context_dirty(ctx, FD_DIRTY_ZSA); - } + struct fd_batch *batch = rsc->track->write_batch; + struct pipe_framebuffer_state *pfb = &batch->framebuffer; - for (unsigned i = 0; i < pfb->nr_cbufs; i++) { - if (pfb->cbufs[i] && pfb->cbufs[i]->texture == prsc) { - batch->resolve &= ~(PIPE_CLEAR_COLOR0 << i); - fd_context_dirty(ctx, FD_DIRTY_FRAMEBUFFER); - } + if (pfb->zsbuf && pfb->zsbuf->texture == prsc) { + batch->resolve &= ~(FD_BUFFER_DEPTH | FD_BUFFER_STENCIL); + fd_context_dirty(ctx, FD_DIRTY_ZSA); + } + + for (unsigned i = 0; i < pfb->nr_cbufs; i++) { + if (pfb->cbufs[i] && pfb->cbufs[i]->texture == prsc) { + batch->resolve &= ~(PIPE_CLEAR_COLOR0 << i); + fd_context_dirty(ctx, FD_DIRTY_FRAMEBUFFER); } } } diff --git a/src/gallium/drivers/freedreno/freedreno_resource.h b/src/gallium/drivers/freedreno/freedreno_resource.h index ba2825a742c..93057172746 100644 --- a/src/gallium/drivers/freedreno/freedreno_resource.h +++ b/src/gallium/drivers/freedreno/freedreno_resource.h @@ -28,7 +28,6 @@ #define FREEDRENO_RESOURCE_H_ #include "util/list.h" -#include "util/set.h" #include "util/simple_mtx.h" #include "util/u_dump.h" #include "util/u_range.h" @@ -56,6 +55,62 @@ enum fd_lrz_direction { FD_LRZ_GREATER, }; +/** + * State related to batch/resource tracking. + * + * With threaded_context we need to support replace_buffer_storage, in + * which case we can end up in transfer_map with tres->latest, but other + * pipe_context APIs using the original prsc pointer. This allows TC to + * not have to synchronize the front-end thread with the buffer storage + * replacement called on driver thread. But it complicates the batch/ + * resource tracking. 
+ * + * To handle this, we need to split the tracking out into it's own ref- + * counted structure, so as needed both "versions" of the resource can + * point to the same tracking. + * + * We could *almost* just push this down to fd_bo, except for a3xx/a4xx + * hw queries, where we don't know up-front the size to allocate for + * per-tile query results. + */ +struct fd_resource_tracking { + struct pipe_reference reference; + + /* bitmask of in-flight batches which reference this resource. Note + * that the batch doesn't hold reference to resources (but instead + * the fd_ringbuffer holds refs to the underlying fd_bo), but in case + * the resource is destroyed we need to clean up the batch's weak + * references to us. + */ + uint32_t batch_mask; + + /* reference to batch that writes this resource: */ + struct fd_batch *write_batch; + + /* Set of batches whose batch-cache key references this resource. + * We need to track this to know which batch-cache entries to + * invalidate if, for example, the resource is invalidated or + * shadowed. + */ + uint32_t bc_batch_mask; +}; + +void __fd_resource_tracking_destroy(struct fd_resource_tracking *track); + +static inline void +fd_resource_tracking_reference(struct fd_resource_tracking **ptr, + struct fd_resource_tracking *track) +{ + struct fd_resource_tracking *old_track = *ptr; + + if (pipe_reference(&(*ptr)->reference, &track->reference)) { + assert(!old_track->write_batch); + free(old_track); + } + + *ptr = track; +} + /** * A resource (any buffer/texture/image/etc) */ @@ -64,13 +119,6 @@ struct fd_resource { struct fd_bo *bo; /* use fd_resource_set_bo() to write */ enum pipe_format internal_format; uint32_t hash; /* _mesa_hash_pointer() on this resource's address. */ - - /* Atomic counter of the number of batches referencing this resource in any - * batch cache. Used for fd_resource_busy(), which needs to check for busy - * in the batch cache from the frontend thread. - */ - uint32_t batch_references; - struct fdl_layout layout; /* buffer range that has been initialized */ @@ -82,6 +130,8 @@ struct fd_resource { /* TODO rename to secondary or auxiliary? */ struct fd_resource *stencil; + struct fd_resource_tracking *track; + simple_mtx_t lock; /* bitmask of state this resource could potentially dirty when rebound, @@ -94,6 +144,8 @@ struct fd_resource { /* Is this buffer a replacement created by threaded_context to avoid * a stall in PIPE_MAP_DISCARD_WHOLE_RESOURCE|PIPE_MAP_WRITE case? + * If so, it no longer "owns" it's rsc->track, and so should not + * invalidate when the rsc is destroyed. 
*/ bool is_replacement : 1; @@ -141,35 +193,17 @@ fd_memory_object(struct pipe_memory_object *pmemobj) } static inline bool -fd_batch_references(struct fd_batch *batch, struct fd_resource *rsc) +pending(struct fd_resource *rsc, bool write) { - return _mesa_set_search_pre_hashed(batch->resources, rsc->hash, rsc) != NULL; -} + /* if we have a pending GPU write, we are busy in any case: */ + if (rsc->track->write_batch) + return true; -static inline struct fd_batch * -fd_bc_writer(struct fd_context *ctx, struct fd_resource *rsc) -{ - struct hash_entry *entry = - _mesa_hash_table_search_pre_hashed(ctx->batch_cache.written_resources, rsc->hash, rsc); - if (entry) - return entry->data; - return NULL; -} + /* if CPU wants to write, but we are pending a GPU read, we are busy: */ + if (write && rsc->track->batch_mask) + return true; -static inline bool -pending(struct fd_context *ctx, struct fd_resource *rsc, bool write) assert_dt -{ - if (write) { - foreach_batch (batch, &ctx->batch_cache) { - if (fd_batch_references(batch, rsc)) - return true; - } - } else { - if (fd_bc_writer(ctx, rsc)) - return true; - } - - if (rsc->stencil && pending(ctx, rsc->stencil, write)) + if (rsc->stencil && pending(rsc->stencil, write)) return true; return false; @@ -314,7 +348,7 @@ void fd_resource_screen_init(struct pipe_screen *pscreen); void fd_resource_context_init(struct pipe_context *pctx); uint32_t fd_setup_slices(struct fd_resource *rsc); -void fd_resource_resize(struct fd_context *ctx, struct pipe_resource *prsc, uint32_t sz); +void fd_resource_resize(struct pipe_resource *prsc, uint32_t sz); void fd_replace_buffer_storage(struct pipe_context *ctx, struct pipe_resource *dst, struct pipe_resource *src, @@ -331,6 +365,12 @@ void fd_resource_dump(struct fd_resource *rsc, const char *name); bool fd_render_condition_check(struct pipe_context *pctx) assert_dt; +static inline bool +fd_batch_references_resource(struct fd_batch *batch, struct fd_resource *rsc) +{ + return rsc->track->batch_mask & (1 << batch->idx); +} + static inline void fd_batch_write_prep(struct fd_batch *batch, struct fd_resource *rsc) assert_dt { @@ -348,7 +388,7 @@ fd_batch_resource_read(struct fd_batch *batch, * writing to it (since both _write and _read flush other writers), and * that we've already recursed for stencil. 
*/ - if (unlikely(!fd_batch_references(batch, rsc))) + if (unlikely(!fd_batch_references_resource(batch, rsc))) fd_batch_resource_read_slowpath(batch, rsc); } diff --git a/src/gallium/drivers/freedreno/freedreno_screen.c b/src/gallium/drivers/freedreno/freedreno_screen.c index 080d67918c8..15fb442ef89 100644 --- a/src/gallium/drivers/freedreno/freedreno_screen.c +++ b/src/gallium/drivers/freedreno/freedreno_screen.c @@ -155,6 +155,7 @@ fd_screen_destroy(struct pipe_screen *pscreen) if (screen->ro) screen->ro->destroy(screen->ro); + fd_bc_fini(&screen->batch_cache); fd_gmem_screen_fini(pscreen); slab_destroy_parent(&screen->transfer_pool); @@ -1087,6 +1088,8 @@ fd_screen_create(struct fd_device *dev, struct renderonly *ro, if (fd_device_version(dev) >= FD_VERSION_UNLIMITED_CMDS) screen->reorder = !FD_DBG(INORDER); + fd_bc_init(&screen->batch_cache); + list_inithead(&screen->context_list); util_idalloc_mt_init_tc(&screen->buffer_ids); diff --git a/src/gallium/drivers/freedreno/freedreno_screen.h b/src/gallium/drivers/freedreno/freedreno_screen.h index a9bda74dc30..ca15d3e4a5f 100644 --- a/src/gallium/drivers/freedreno/freedreno_screen.h +++ b/src/gallium/drivers/freedreno/freedreno_screen.h @@ -41,11 +41,11 @@ #include "util/u_memory.h" #include "util/u_queue.h" +#include "freedreno_batch_cache.h" #include "freedreno_gmem.h" #include "freedreno_util.h" struct fd_bo; -struct fd_resource; /* Potential reasons for needing to skip bypass path and use GMEM, the * generation backend can override this with screen->gmem_reason_mask @@ -135,6 +135,7 @@ struct fd_screen { int64_t cpu_gpu_time_delta; + struct fd_batch_cache batch_cache; struct fd_gmem_cache gmem_cache; bool reorder; diff --git a/src/gallium/drivers/freedreno/freedreno_state.c b/src/gallium/drivers/freedreno/freedreno_state.c index df9c803ff51..5d5b942be39 100644 --- a/src/gallium/drivers/freedreno/freedreno_state.c +++ b/src/gallium/drivers/freedreno/freedreno_state.c @@ -276,13 +276,18 @@ fd_set_framebuffer_state(struct pipe_context *pctx, cso->samples = util_framebuffer_get_num_samples(cso); if (ctx->screen->reorder) { - if (ctx->batch) { - fd_batch_finish_queries(ctx->batch); - fd_batch_reference(&ctx->batch, NULL); - } + struct fd_batch *old_batch = NULL; + fd_batch_reference(&old_batch, ctx->batch); + + if (likely(old_batch)) + fd_batch_finish_queries(old_batch); + + fd_batch_reference(&ctx->batch, NULL); fd_context_all_dirty(ctx); ctx->update_active_queries = true; + + fd_batch_reference(&old_batch, NULL); } else if (ctx->batch) { DBG("%d: cbufs[0]=%p, zsbuf=%p", ctx->batch->needs_flush, framebuffer->cbufs[0], framebuffer->zsbuf);
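
Appended note (not part of the patch above): the revert restores the screen-wide batch cache built around a fixed 32-entry batches[] table plus a uint32_t batch_mask, with foreach_batch() scanning set bits to visit live batches and alloc_batch_locked() claiming the lowest free slot via ffs(). The self-contained C sketch below is only an illustration of that bitmask idiom under made-up names (slot_cache, slot_alloc, bit_scan, foreach_slot are hypothetical stand-ins, not freedreno API); it shows in particular why the loop re-ANDs its scratch mask against the live mask each iteration, as the comment on foreach_batch() in freedreno_batch_cache.h describes.

/*
 * Standalone sketch: fixed-size slot table tracked by a 32-bit mask,
 * mirroring the shape of fd_batch_cache restored by this revert.
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <strings.h> /* ffs() */

struct slot_cache {
   void *slots[32];    /* cf. cache->batches[32] */
   uint32_t slot_mask; /* cf. cache->batch_mask */
};

/* Pop the lowest set bit and return its index (like Mesa's u_bit_scan()). */
static int
bit_scan(uint32_t *mask)
{
   int i = ffs(*mask) - 1;
   *mask &= ~(1u << i);
   return i;
}

/* Claim the lowest free slot, as alloc_batch_locked() does to pick batch->idx. */
static int
slot_alloc(struct slot_cache *c, void *item)
{
   int idx = ffs(~c->slot_mask);
   if (idx == 0)
      return -1; /* all 32 slots busy; the driver would flush a batch here */
   idx--;        /* ffs() result is 1-based */
   assert(c->slots[idx] == NULL);
   c->slots[idx] = item;
   c->slot_mask |= (1u << idx);
   return idx;
}

/* Same shape as foreach_batch(batch, cache, mask): copy the mask into _m,
 * scan its bits, and re-AND with the live mask at the end of each pass so
 * entries dropped inside the loop body are not revisited via stale bits.
 */
#define foreach_slot(item, c, mask)                                        \
   for (uint32_t _m = (mask); _m && ((item) = (c)->slots[bit_scan(&_m)]);  \
        _m &= (mask))

int
main(void)
{
   struct slot_cache c = {0};
   int a = 10, b = 20, d = 30;

   slot_alloc(&c, &a);
   slot_alloc(&c, &b);
   slot_alloc(&c, &d);

   int *item;
   foreach_slot (item, &c, c.slot_mask)
      printf("slot entry: %d\n", *item);

   return 0;
}

The fixed 32-bit width is also what lets the restored fd_resource_tracking record its readers with a plain batch_mask (and batch-cache-key users with bc_batch_mask) instead of per-batch hash sets, which is the per-resource state this revert brings back.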