asahi: Add batch tracking logic

We already have the notion of an agx_batch, which encapsulates a render
pass. Extend the logic to allow multiple in-flight batches per context, avoiding
a flush in set_framebuffer_state and improving performance for certain
applications designed for IMRs (immediate-mode renderers) that ping-pong unnecessarily between FBOs. I
don't have such an application immediately in mind, but I wanted to get this
flag-day out of the way while the driver is still small and flexible.

The driver was written from day 1 with batch tracking in mind, so this is a
relatively small change to actually wire it up, but there are lots of little
details to get right.

The code itself is mostly a copy/paste of panfrost, which in turn draws
inspiration from freedreno and v3d.

Signed-off-by: Alyssa Rosenzweig <alyssa@rosenzweig.io>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/19865>
Author: Alyssa Rosenzweig <alyssa@rosenzweig.io>
Date:   2022-11-17 18:10:11 -05:00 (committed by Marge Bot)
Parent: de1eb9400f
Commit: d7511ad784

4 changed files with 327 additions and 150 deletions
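
For orientation before the diff, here is a minimal, self-contained sketch of the scheme this commit wires up: each context owns a small fixed array of batch slots, a bitmask of active slots, and a monotonically increasing seqnum; a draw looks up the batch for the current framebuffer, reuses a matching in-flight batch, takes a free slot, or flushes the least recently used batch to make room. All names here (batch, context, get_batch, flush_batch, fb_id, MAX_BATCHES) are simplified stand-ins for the agx_* code in the diff below, not driver code.

```c
#include <stdint.h>

#define MAX_BATCHES 2

/* Simplified stand-ins for struct agx_batch / struct agx_context. */
struct batch {
   uint64_t seqnum; /* bumped on every (re)use; the smallest value is the LRU */
   int fb_id;       /* stand-in for the framebuffer-state key */
};

struct context {
   uint64_t seqnum;                 /* global counter used for LRU ordering */
   struct batch slots[MAX_BATCHES];
   uint32_t active;                 /* bitmask of in-flight slots */
};

/* Stand-in for agx_flush_batch(): submit the work and free the slot. */
static void flush_batch(struct context *ctx, struct batch *b)
{
   ctx->active &= ~(1u << (unsigned)(b - ctx->slots));
}

/* Simplified analogue of agx_get_batch_for_framebuffer() below. */
static struct batch *get_batch(struct context *ctx, int fb_id)
{
   struct batch *b = NULL;

   /* 1. Reuse an in-flight batch that already targets this framebuffer. */
   for (unsigned i = 0; i < MAX_BATCHES; ++i) {
      if ((ctx->active & (1u << i)) && ctx->slots[i].fb_id == fb_id) {
         ctx->slots[i].seqnum = ++ctx->seqnum; /* refresh its LRU position */
         return &ctx->slots[i];
      }
   }

   /* 2. Otherwise grab a free slot, if any. */
   for (unsigned i = 0; i < MAX_BATCHES; ++i) {
      if (!(ctx->active & (1u << i))) {
         b = &ctx->slots[i];
         break;
      }
   }

   /* 3. All slots busy: flush the least recently used batch to free one. */
   if (!b) {
      for (unsigned i = 0; i < MAX_BATCHES; ++i) {
         if (!b || ctx->slots[i].seqnum < b->seqnum)
            b = &ctx->slots[i];
      }
      flush_batch(ctx, b);
   }

   /* (Re)initialize the slot, analogous to agx_batch_init(). */
   b->fb_id = fb_id;
   b->seqnum = ++ctx->seqnum;
   ctx->active |= 1u << (unsigned)(b - ctx->slots);
   return b;
}
```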


@@ -1,16 +1,202 @@
/*
* Copyright 2022 Alyssa Rosenzweig
* Copyright 2019-2020 Collabora, Ltd.
* SPDX-License-Identifier: MIT
*/
#include "agx_state.h"
void
agx_flush_readers(struct agx_context *ctx, struct agx_resource *rsrc, const char *reason)
#define foreach_batch(ctx, idx) \
BITSET_FOREACH_SET(idx, ctx->batches.active, AGX_MAX_BATCHES)
static unsigned
agx_batch_idx(struct agx_batch *batch)
{
/* TODO: Turn into loop when we support multiple batches */
if (ctx->batch) {
struct agx_batch *batch = ctx->batch;
return batch - batch->ctx->batches.slots;
}
bool
agx_batch_is_active(struct agx_batch *batch)
{
return BITSET_TEST(batch->ctx->batches.active, agx_batch_idx(batch));
}
static void
agx_batch_init(struct agx_context *ctx,
const struct pipe_framebuffer_state *key,
struct agx_batch *batch)
{
struct agx_device *dev = agx_device(ctx->base.screen);
batch->ctx = ctx;
util_copy_framebuffer_state(&batch->key, key);
batch->seqnum = ++ctx->batches.seqnum;
agx_pool_init(&batch->pool, dev, AGX_MEMORY_TYPE_FRAMEBUFFER, true);
agx_pool_init(&batch->pipeline_pool, dev, AGX_MEMORY_TYPE_SHADER, true);
/* These allocations can happen only once and will just be zeroed (not freed)
* during batch clean up. The memory is owned by the context.
*/
if (!batch->bo_list.set) {
batch->bo_list.set = rzalloc_array(ctx, BITSET_WORD, 128);
batch->bo_list.word_count = 128;
} else {
memset(batch->bo_list.set, 0, batch->bo_list.word_count * sizeof(BITSET_WORD));
}
if (!batch->encoder) {
batch->encoder = agx_bo_create(dev, 0x80000, AGX_MEMORY_TYPE_FRAMEBUFFER);
batch->encoder_current = batch->encoder->ptr.cpu;
batch->encoder_end = batch->encoder_current + batch->encoder->size;
} else {
batch->encoder_current = batch->encoder->ptr.cpu;
batch->encoder_end = batch->encoder_current + batch->encoder->size;
}
if (!batch->scissor.bo) {
batch->scissor.bo = agx_bo_create(dev, 0x80000, AGX_MEMORY_TYPE_FRAMEBUFFER);
}
if (!batch->depth_bias.bo) {
batch->depth_bias.bo = agx_bo_create(dev, 0x80000, AGX_MEMORY_TYPE_FRAMEBUFFER);
}
batch->clear = 0;
batch->draw = 0;
batch->load = 0;
batch->clear_depth = 0;
batch->clear_stencil = 0;
batch->scissor.count = 0;
batch->depth_bias.count = 0;
batch->varyings = 0;
/* We need to emit prim state at the start. Max collides with all. */
batch->reduced_prim = PIPE_PRIM_MAX;
if (batch->key.zsbuf) {
agx_batch_writes(batch, agx_resource(key->zsbuf->texture));
}
for (unsigned i = 0; i < key->nr_cbufs; ++i) {
agx_batch_writes(batch, agx_resource(key->cbufs[i]->texture));
}
unsigned batch_idx = agx_batch_idx(batch);
BITSET_SET(ctx->batches.active, batch_idx);
agx_batch_init_state(batch);
}
void
agx_batch_cleanup(struct agx_context *ctx, struct agx_batch *batch)
{
struct agx_device *dev = agx_device(ctx->base.screen);
assert(batch->ctx == ctx);
if (ctx->batch == batch)
ctx->batch = NULL;
/* There is no more writer for anything we wrote recorded on this context */
hash_table_foreach(ctx->writer, ent) {
if (ent->data == batch)
_mesa_hash_table_remove(ctx->writer, ent);
}
int handle;
AGX_BATCH_FOREACH_BO_HANDLE(batch, handle) {
agx_bo_unreference(agx_lookup_bo(dev, handle));
}
agx_pool_cleanup(&batch->pool);
agx_pool_cleanup(&batch->pipeline_pool);
util_unreference_framebuffer_state(&batch->key);
unsigned batch_idx = agx_batch_idx(batch);
BITSET_CLEAR(ctx->batches.active, batch_idx);
}
static struct agx_batch *
agx_get_batch_for_framebuffer(struct agx_context *ctx,
const struct pipe_framebuffer_state *state)
{
/* Look for a matching batch */
unsigned i;
foreach_batch(ctx, i) {
struct agx_batch *candidate = &ctx->batches.slots[i];
if (util_framebuffer_state_equal(&candidate->key, state)) {
/* We found a match, increase the seqnum for the LRU
* eviction logic.
*/
candidate->seqnum = ++ctx->batches.seqnum;
return candidate;
}
}
/* Look for a free batch */
struct agx_batch *batch = NULL;
for (unsigned i = 0; i < AGX_MAX_BATCHES; ++i) {
if (!BITSET_TEST(ctx->batches.active, i)) {
batch = &ctx->batches.slots[i];
break;
}
}
/* Else, evict something */
if (!batch) {
for (unsigned i = 0; i < AGX_MAX_BATCHES; ++i) {
struct agx_batch *candidate = &ctx->batches.slots[i];
if (!batch || batch->seqnum > candidate->seqnum)
batch = candidate;
}
agx_flush_batch(ctx, batch);
}
/* Batch is now free */
agx_batch_init(ctx, state, batch);
return batch;
}
struct agx_batch *
agx_get_batch(struct agx_context *ctx)
{
if (!ctx->batch) {
ctx->batch = agx_get_batch_for_framebuffer(ctx, &ctx->framebuffer);
agx_dirty_all(ctx);
}
assert(util_framebuffer_state_equal(&ctx->framebuffer, &ctx->batch->key));
return ctx->batch;
}
void
agx_flush_all(struct agx_context *ctx, const char *reason)
{
if (reason)
perf_debug_ctx(ctx, "Flushing due to: %s\n", reason);
unsigned idx;
foreach_batch(ctx, idx) {
agx_flush_batch(ctx, &ctx->batches.slots[idx]);
}
}
static void
agx_flush_readers_except(struct agx_context *ctx,
struct agx_resource *rsrc,
struct agx_batch *except,
const char *reason)
{
unsigned idx;
foreach_batch(ctx, idx) {
struct agx_batch *batch = &ctx->batches.slots[idx];
if (batch == except)
continue;
if (agx_batch_uses_bo(batch, rsrc->bo)) {
perf_debug_ctx(ctx, "Flush reader due to: %s\n", reason);
@@ -19,20 +205,38 @@ agx_flush_readers(struct agx_context *ctx, struct agx_resource *rsrc, const char
}
}
void
agx_flush_writer(struct agx_context *ctx, struct agx_resource *rsrc, const char *reason)
static void
agx_flush_writer_except(struct agx_context *ctx,
struct agx_resource *rsrc,
struct agx_batch *except,
const char *reason)
{
struct hash_entry *ent = _mesa_hash_table_search(ctx->writer, rsrc);
if (ent) {
if (ent && ent->data != except) {
perf_debug_ctx(ctx, "Flush writer due to: %s\n", reason);
agx_flush_batch(ctx, ent->data);
}
}
void
agx_flush_readers(struct agx_context *ctx, struct agx_resource *rsrc, const char *reason)
{
agx_flush_readers_except(ctx, rsrc, NULL, reason);
}
void
agx_flush_writer(struct agx_context *ctx, struct agx_resource *rsrc, const char *reason)
{
agx_flush_writer_except(ctx, rsrc, NULL, reason);
}
void
agx_batch_reads(struct agx_batch *batch, struct agx_resource *rsrc)
{
/* Hazard: read-after-write */
agx_flush_writer_except(batch->ctx, rsrc, batch, "Read from another batch");
agx_batch_add_bo(batch, rsrc->bo);
if (rsrc->separate_stencil)
@@ -45,12 +249,15 @@ agx_batch_writes(struct agx_batch *batch, struct agx_resource *rsrc)
struct agx_context *ctx = batch->ctx;
struct hash_entry *ent = _mesa_hash_table_search(ctx->writer, rsrc);
agx_flush_readers_except(ctx, rsrc, batch, "Write from other batch");
/* Nothing to do if we're already writing */
if (ent && ent->data == batch)
return;
/* Flush the old writer if there is one */
agx_flush_writer(ctx, rsrc, "Multiple writers");
/* Hazard: write-after-write, write-after-read */
if (ent)
agx_flush_writer(ctx, rsrc, "Multiple writers");
/* Write is strictly stronger than a read */
agx_batch_reads(batch, rsrc);
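
The hazard handling above boils down to a few rules: a batch that reads a resource must first flush any other batch writing it (read-after-write); a batch that writes a resource must flush every other batch reading it and any previous writer (write-after-read, write-after-write); and a write also counts as a read. Below is a condensed, self-contained model of those rules using made-up stand-in types (resource, batch, batch_read, batch_write); the real code uses the ctx->writer hash table and per-batch BO bitsets shown above.

```c
#include <stdbool.h>

#define MAX_BATCHES 2
#define MAX_READS   16

struct batch;

struct resource {
   struct batch *writer; /* plays the role of the ctx->writer hash table */
};

struct batch {
   bool active;
   struct resource *reads[MAX_READS]; /* stand-in for the per-batch BO bitset */
   unsigned num_reads;
};

static struct batch batches[MAX_BATCHES];

static void flush(struct batch *b)
{
   /* Submitting a batch resolves all of its outstanding reads and writes
    * (the real agx_batch_cleanup() also drops the batch from ctx->writer). */
   b->active = false;
   b->num_reads = 0;
}

static bool reads(struct batch *b, struct resource *r)
{
   for (unsigned i = 0; i < b->num_reads; ++i) {
      if (b->reads[i] == r)
         return true;
   }
   return false;
}

/* Analogue of agx_batch_reads(): read-after-write hazard. */
static void batch_read(struct batch *b, struct resource *r)
{
   if (r->writer && r->writer != b) {
      flush(r->writer);
      r->writer = NULL;
   }
   if (b->num_reads < MAX_READS)
      b->reads[b->num_reads++] = r; /* agx_batch_add_bo() analogue */
}

/* Analogue of agx_batch_writes(): write-after-read and write-after-write. */
static void batch_write(struct batch *b, struct resource *r)
{
   /* No other batch may keep reading the old contents. */
   for (unsigned i = 0; i < MAX_BATCHES; ++i) {
      if (batches[i].active && &batches[i] != b && reads(&batches[i], r))
         flush(&batches[i]);
   }

   if (r->writer == b)
      return; /* already the writer, nothing to do */

   if (r->writer)
      flush(r->writer); /* at most one outstanding writer per resource */

   r->writer = b;

   /* A write is strictly stronger than a read. */
   batch_read(b, r);
}
```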


@@ -659,7 +659,7 @@ agx_clear(struct pipe_context *pctx, unsigned buffers, const struct pipe_scissor
const union pipe_color_union *color, double depth, unsigned stencil)
{
struct agx_context *ctx = agx_context(pctx);
struct agx_batch *batch = ctx->batch;
struct agx_batch *batch = agx_get_batch(ctx);
unsigned fastclear = buffers & ~(batch->draw | batch->load);
unsigned slowclear = buffers & ~fastclear;
@@ -690,11 +690,11 @@ agx_clear(struct pipe_context *pctx, unsigned buffers, const struct pipe_scissor
assert((batch->draw & slowclear) == slowclear);
}
static void
agx_flush_resource(struct pipe_context *ctx,
struct pipe_resource *resource)
{
agx_flush_writer(agx_context(ctx), agx_resource(resource), "flush_resource");
}
/*
@@ -710,7 +710,7 @@ agx_flush(struct pipe_context *pctx,
if (fence)
*fence = NULL;
agx_flush_batch(ctx, ctx->batch);
agx_flush_all(ctx, "Gallium flush");
}
void
@@ -718,9 +718,13 @@ agx_flush_batch(struct agx_context *ctx, struct agx_batch *batch)
{
struct agx_device *dev = agx_device(ctx->base.screen);
assert(agx_batch_is_active(batch));
/* Nothing to do */
if (!(batch->draw | batch->clear))
if (!(batch->draw | batch->clear)) {
agx_batch_cleanup(ctx, batch);
return;
}
/* Finalize the encoder */
uint8_t stop[5 + 64] = { 0x00, 0x00, 0x00, 0xc0, 0x00 };
@@ -761,7 +765,7 @@ agx_flush_batch(struct agx_context *ctx, struct agx_batch *batch)
pipeline_store =
agx_build_store_pipeline(batch,
dev->internal.store,
agx_pool_upload(&batch->pool, ctx->render_target[0], sizeof(ctx->render_target)));
agx_batch_upload_pbe(batch, 0));
}
for (unsigned i = 0; i < batch->key.nr_cbufs; ++i) {
@@ -851,37 +855,7 @@ agx_flush_batch(struct agx_context *ctx, struct agx_batch *batch)
agxdecode_next_frame();
}
AGX_BATCH_FOREACH_BO_HANDLE(batch, handle) {
agx_bo_unreference(agx_lookup_bo(dev, handle));
}
/* There is no more writer for anything we wrote recorded on this context */
hash_table_foreach(ctx->writer, ent) {
if (ent->data == batch)
_mesa_hash_table_remove(ctx->writer, ent);
}
memset(batch->bo_list.set, 0, batch->bo_list.word_count * sizeof(BITSET_WORD));
agx_pool_cleanup(&batch->pool);
agx_pool_cleanup(&batch->pipeline_pool);
agx_pool_init(&batch->pool, dev, AGX_MEMORY_TYPE_FRAMEBUFFER, true);
agx_pool_init(&batch->pipeline_pool, dev, AGX_MEMORY_TYPE_CMDBUF_32, true);
batch->clear = 0;
batch->draw = 0;
batch->load = 0;
batch->encoder_current = batch->encoder->ptr.cpu;
batch->encoder_end = batch->encoder_current + batch->encoder->size;
batch->scissor.count = 0;
agx_dirty_all(ctx);
agx_batch_init_state(batch);
/* After resetting the batch, rebind the framebuffer so we update resource
* tracking logic and the BO lists.
*
* XXX: This is a hack to workaround lack of proper batch tracking.
*/
ctx->base.set_framebuffer_state(&ctx->base, &ctx->framebuffer);
agx_batch_cleanup(ctx, batch);
}
static void
@@ -919,20 +893,6 @@ agx_create_context(struct pipe_screen *screen,
pctx->screen = screen;
pctx->priv = priv;
ctx->batch = rzalloc(ctx, struct agx_batch);
ctx->batch->ctx = ctx;
ctx->batch->bo_list.set = rzalloc_array(ctx->batch, BITSET_WORD, 128);
ctx->batch->bo_list.word_count = 128;
agx_pool_init(&ctx->batch->pool,
agx_device(screen), AGX_MEMORY_TYPE_FRAMEBUFFER, true);
agx_pool_init(&ctx->batch->pipeline_pool,
agx_device(screen), AGX_MEMORY_TYPE_SHADER, true);
ctx->batch->encoder = agx_bo_create(agx_device(screen), 0x80000, AGX_MEMORY_TYPE_FRAMEBUFFER);
ctx->batch->encoder_current = ctx->batch->encoder->ptr.cpu;
ctx->batch->encoder_end = ctx->batch->encoder_current + ctx->batch->encoder->size;
ctx->batch->scissor.bo = agx_bo_create(agx_device(screen), 0x80000, AGX_MEMORY_TYPE_FRAMEBUFFER);
ctx->batch->depth_bias.bo = agx_bo_create(agx_device(screen), 0x80000, AGX_MEMORY_TYPE_FRAMEBUFFER);
ctx->writer = _mesa_pointer_hash_table_create(ctx);
/* Upload fixed shaders (TODO: compile them?) */


@@ -781,63 +781,61 @@ agx_set_framebuffer_state(struct pipe_context *pctx,
if (!state)
return;
/* XXX: eliminate this flush with batch tracking logic */
agx_flush_all(ctx, "Framebuffer switch");
util_copy_framebuffer_state(&ctx->framebuffer, state);
util_copy_framebuffer_state(&ctx->batch->key, state);
ctx->dirty = ~0;
if (state->zsbuf)
agx_batch_writes(ctx->batch, agx_resource(state->zsbuf->texture));
for (unsigned i = 0; i < state->nr_cbufs; ++i) {
struct pipe_surface *surf = state->cbufs[i];
struct agx_resource *tex = agx_resource(surf->texture);
const struct util_format_description *desc =
util_format_description(surf->format);
unsigned level = surf->u.tex.level;
unsigned layer = surf->u.tex.first_layer;
agx_batch_writes(ctx->batch, tex);
assert(surf->u.tex.last_layer == layer);
agx_pack(ctx->render_target[i], RENDER_TARGET, cfg) {
cfg.layout = agx_translate_layout(tex->layout.tiling);
cfg.channels = agx_pixel_format[surf->format].channels;
cfg.type = agx_pixel_format[surf->format].type;
assert(desc->nr_channels >= 1 && desc->nr_channels <= 4);
cfg.swizzle_r = agx_channel_from_pipe(desc->swizzle[0]) & 3;
if (desc->nr_channels >= 2)
cfg.swizzle_g = agx_channel_from_pipe(desc->swizzle[1]) & 3;
if (desc->nr_channels >= 3)
cfg.swizzle_b = agx_channel_from_pipe(desc->swizzle[2]) & 3;
if (desc->nr_channels >= 4)
cfg.swizzle_a = agx_channel_from_pipe(desc->swizzle[3]) & 3;
cfg.width = state->width;
cfg.height = state->height;
cfg.level = surf->u.tex.level;
cfg.buffer = agx_map_texture_gpu(tex, layer);
cfg.unk_mipmapped = tex->mipmapped;
if (tex->layout.tiling == AIL_TILING_LINEAR) {
cfg.stride = ail_get_linear_stride_B(&tex->layout, level) - 4;
cfg.levels = 1;
} else {
cfg.unk_tiled = true;
cfg.levels = tex->base.last_level + 1;
}
};
}
ctx->batch = NULL;
agx_dirty_all(ctx);
}
uint64_t
agx_batch_upload_pbe(struct agx_batch *batch, unsigned rt)
{
struct pipe_surface *surf = batch->key.cbufs[rt];
struct agx_resource *tex = agx_resource(surf->texture);
const struct util_format_description *desc =
util_format_description(surf->format);
unsigned level = surf->u.tex.level;
unsigned layer = surf->u.tex.first_layer;
assert(surf->u.tex.last_layer == layer);
struct agx_ptr T = agx_pool_alloc_aligned(&batch->pool, AGX_RENDER_TARGET_LENGTH, 256);
agx_pack(T.cpu, RENDER_TARGET, cfg) {
cfg.layout = agx_translate_layout(tex->layout.tiling);
cfg.channels = agx_pixel_format[surf->format].channels;
cfg.type = agx_pixel_format[surf->format].type;
assert(desc->nr_channels >= 1 && desc->nr_channels <= 4);
cfg.swizzle_r = agx_channel_from_pipe(desc->swizzle[0]) & 3;
if (desc->nr_channels >= 2)
cfg.swizzle_g = agx_channel_from_pipe(desc->swizzle[1]) & 3;
if (desc->nr_channels >= 3)
cfg.swizzle_b = agx_channel_from_pipe(desc->swizzle[2]) & 3;
if (desc->nr_channels >= 4)
cfg.swizzle_a = agx_channel_from_pipe(desc->swizzle[3]) & 3;
cfg.width = batch->key.width;
cfg.height = batch->key.height;
cfg.level = surf->u.tex.level;
cfg.buffer = agx_map_texture_gpu(tex, layer);
cfg.unk_mipmapped = tex->mipmapped;
if (tex->layout.tiling == AIL_TILING_LINEAR) {
cfg.stride = ail_get_linear_stride_B(&tex->layout, level) - 4;
cfg.levels = 1;
} else {
cfg.unk_tiled = true;
cfg.levels = tex->base.last_level + 1;
}
};
return T.gpu;
}
/* Likewise constant buffers, textures, and samplers are handled in a common
* per-draw path, with dirty tracking to reduce the costs involved.
*/
@@ -1224,18 +1222,20 @@ agx_update_vs(struct agx_context *ctx)
}
static bool
agx_update_fs(struct agx_context *ctx)
agx_update_fs(struct agx_batch *batch)
{
struct agx_context *ctx = batch->ctx;
struct asahi_shader_key key = {
.nr_cbufs = ctx->batch->key.nr_cbufs,
.nr_cbufs = batch->key.nr_cbufs,
.clip_plane_enable = ctx->rast->base.clip_plane_enable,
};
if (ctx->batch->reduced_prim == PIPE_PRIM_POINTS)
if (batch->reduced_prim == PIPE_PRIM_POINTS)
key.sprite_coord_enable = ctx->rast->base.sprite_coord_enable;
for (unsigned i = 0; i < key.nr_cbufs; ++i) {
struct pipe_surface *surf = ctx->batch->key.cbufs[i];
struct pipe_surface *surf = batch->key.cbufs[i];
if (surf) {
enum pipe_format fmt = surf->format;
@@ -1557,9 +1557,6 @@ agx_batch_init_state(struct agx_batch *batch)
agx_ppp_fini(&out, &ppp);
batch->encoder_current = out;
/* We need to emit prim state at the start. Max collides with all. */
batch->reduced_prim = PIPE_PRIM_MAX;
}
static enum agx_object_type
@@ -1586,9 +1583,10 @@ agx_pass_type_for_shader(struct agx_shader_info *info)
#define MAX_PPP_UPDATES 2
static uint8_t *
agx_encode_state(struct agx_context *ctx, uint8_t *out,
agx_encode_state(struct agx_batch *batch, uint8_t *out,
bool is_lines, bool is_points)
{
struct agx_context *ctx = batch->ctx;
struct agx_rasterizer *rast = ctx->rast;
unsigned ppp_updates = 0;
@@ -1613,7 +1611,7 @@ agx_encode_state(struct agx_context *ctx, uint8_t *out,
out += AGX_VDM_STATE_VERTEX_SHADER_WORD_0_LENGTH;
agx_pack(out, VDM_STATE_VERTEX_SHADER_WORD_1, cfg) {
cfg.pipeline = agx_build_pipeline(ctx->batch, ctx->vs, PIPE_SHADER_VERTEX);
cfg.pipeline = agx_build_pipeline(batch, ctx->vs, PIPE_SHADER_VERTEX);
}
out += AGX_VDM_STATE_VERTEX_SHADER_WORD_1_LENGTH;
@@ -1634,17 +1632,17 @@ agx_encode_state(struct agx_context *ctx, uint8_t *out,
out += 4;
}
struct agx_pool *pool = &ctx->batch->pool;
struct agx_pool *pool = &batch->pool;
struct agx_compiled_shader *vs = ctx->vs, *fs = ctx->fs;
unsigned zbias = 0;
if (ctx->rast->base.offset_tri) {
zbias = agx_upload_depth_bias(ctx->batch, &ctx->rast->base);
zbias = agx_upload_depth_bias(batch, &ctx->rast->base);
ctx->dirty |= AGX_DIRTY_SCISSOR_ZBIAS;
}
if (ctx->dirty & (AGX_DIRTY_VIEWPORT | AGX_DIRTY_SCISSOR_ZBIAS)) {
agx_upload_viewport_scissor(pool, ctx->batch, &out, &ctx->viewport,
agx_upload_viewport_scissor(pool, batch, &out, &ctx->viewport,
ctx->rast->base.scissor ? &ctx->scissor : NULL,
zbias);
}
@@ -1652,7 +1650,7 @@ agx_encode_state(struct agx_context *ctx, uint8_t *out,
bool varyings_dirty = false;
if (IS_DIRTY(VS_PROG) || IS_DIRTY(FS_PROG) || IS_DIRTY(RS)) {
ctx->batch->varyings = agx_link_varyings_vs_fs(&ctx->batch->pipeline_pool,
batch->varyings = agx_link_varyings_vs_fs(&batch->pipeline_pool,
&ctx->vs->info.varyings.vs,
&ctx->fs->info.varyings.fs,
ctx->rast->base.flatshade_first);
@@ -1774,13 +1772,13 @@ agx_encode_state(struct agx_context *ctx, uint8_t *out,
if (IS_DIRTY(FS) || varyings_dirty) {
unsigned frag_tex_count = ctx->stage[PIPE_SHADER_FRAGMENT].texture_count;
agx_ppp_push(&ppp, FRAGMENT_SHADER, cfg) {
cfg.pipeline = agx_build_pipeline(ctx->batch, ctx->fs, PIPE_SHADER_FRAGMENT),
cfg.pipeline = agx_build_pipeline(batch, ctx->fs, PIPE_SHADER_FRAGMENT),
cfg.uniform_register_count = ctx->fs->info.push_count;
cfg.preshader_register_count = ctx->fs->info.nr_preamble_gprs;
cfg.texture_state_register_count = frag_tex_count;
cfg.sampler_state_register_count = frag_tex_count;
cfg.cf_binding_count = ctx->fs->info.varyings.fs.nr_bindings;
cfg.cf_bindings = ctx->batch->varyings;
cfg.cf_bindings = batch->varyings;
/* XXX: This is probably wrong */
cfg.unknown_30 = frag_tex_count >= 4;
@@ -1883,18 +1881,12 @@ agx_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info,
}
struct agx_context *ctx = agx_context(pctx);
struct agx_batch *batch = ctx->batch;
struct agx_batch *batch = agx_get_batch(ctx);
if (agx_scissor_culls_everything(ctx))
return;
#ifndef NDEBUG
/* For debugging dirty tracking, mark all state as dirty every draw, forcing
* everything to be re-emitted fresh.
*/
if (unlikely(agx_device(pctx->screen)->debug & AGX_DBG_DIRTY))
agx_dirty_all(ctx);
#endif
agx_dirty_all(ctx);
/* Dirty track the reduced prim: lines vs points vs triangles */
enum pipe_prim_type reduced_prim = u_reduced_prim(info->mode);
@@ -1902,8 +1894,8 @@ agx_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info,
batch->reduced_prim = reduced_prim;
/* TODO: masks */
ctx->batch->draw |= ~0;
ctx->batch->load |= ~0;
batch->draw |= ~0;
batch->load |= ~0;
/* TODO: These are expensive calls, consider finer dirty tracking */
if (agx_update_vs(ctx))
@@ -1911,7 +1903,7 @@ agx_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info,
else if (ctx->stage[PIPE_SHADER_VERTEX].dirty)
ctx->dirty |= AGX_DIRTY_VS;
if (agx_update_fs(ctx))
if (agx_update_fs(batch))
ctx->dirty |= AGX_DIRTY_FS | AGX_DIRTY_FS_PROG;
else if (ctx->stage[PIPE_SHADER_FRAGMENT].dirty)
ctx->dirty |= AGX_DIRTY_FS;
@@ -1939,7 +1931,7 @@ agx_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info,
AGX_INDEX_LIST_START_LENGTH +
AGX_INDEX_LIST_BUFFER_SIZE_LENGTH);
uint8_t *out = agx_encode_state(ctx, batch->encoder_current,
uint8_t *out = agx_encode_state(batch, batch->encoder_current,
reduced_prim == PIPE_PRIM_LINES,
reduced_prim == PIPE_PRIM_POINTS);
@@ -2008,6 +2000,8 @@ agx_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info,
assert(batch->encoder_current <= batch->encoder_end &&
"Failed to reserve sufficient space in encoder");
ctx->dirty = 0;
assert(batch == agx_get_batch(ctx) && "batch should not change under us");
}
void agx_init_state_functions(struct pipe_context *ctx);


@@ -95,6 +95,7 @@ struct agx_array {
struct agx_batch {
struct agx_context *ctx;
struct pipe_framebuffer_state key;
uint64_t seqnum;
/* PIPE_CLEAR_* bitmask */
uint32_t clear, draw, load;
@@ -174,11 +175,24 @@ enum agx_dirty {
AGX_DIRTY_FS_PROG = BITFIELD_BIT(11),
};
#define AGX_MAX_BATCHES (2)
struct agx_context {
struct pipe_context base;
struct agx_compiled_shader *vs, *fs;
uint32_t dirty;
/* Set of batches. When full, the LRU entry (the batch with the smallest
* seqnum) is flushed to free a slot.
*/
struct {
uint64_t seqnum;
struct agx_batch slots[AGX_MAX_BATCHES];
/** Set of active batches for faster traversal */
BITSET_DECLARE(active, AGX_MAX_BATCHES);
} batches;
struct agx_batch *batch;
struct pipe_vertex_buffer vertex_buffers[PIPE_MAX_ATTRIBS];
@@ -204,8 +218,6 @@ struct agx_context {
struct util_debug_callback debug;
bool is_noop;
uint8_t render_target[8][AGX_RENDER_TARGET_LENGTH];
struct blitter_context *blitter;
/* Map of agx_resource to agx_batch that writes that resource */
@@ -350,6 +362,9 @@ uint64_t
agx_push_location(struct agx_batch *batch, struct agx_push push,
enum pipe_shader_type stage);
bool
agx_batch_is_active(struct agx_batch *batch);
uint64_t
agx_build_clear_pipeline(struct agx_batch *batch, uint32_t code, uint64_t clear_buf);
@@ -360,6 +375,9 @@ agx_build_store_pipeline(struct agx_batch *batch, uint32_t code,
uint64_t
agx_build_reload_pipeline(struct agx_batch *batch, uint32_t code, struct pipe_surface *surf);
uint64_t
agx_batch_upload_pbe(struct agx_batch *batch, unsigned rt);
/* Add a BO to a batch. This needs to be amortized O(1) since it's called in
* hot paths. To achieve this we model BO lists by bit sets */
@@ -383,7 +401,7 @@ agx_batch_add_bo(struct agx_batch *batch, struct agx_bo *bo)
{
/* Double the size of the BO list if we run out, this is amortized O(1) */
if (unlikely(bo->handle > agx_batch_bo_list_bits(batch))) {
batch->bo_list.set = rerzalloc(batch, batch->bo_list.set, BITSET_WORD,
batch->bo_list.set = rerzalloc(batch->ctx, batch->bo_list.set, BITSET_WORD,
batch->bo_list.word_count,
batch->bo_list.word_count * 2);
batch->bo_list.word_count *= 2;
@@ -408,6 +426,7 @@ agx_batch_num_bo(struct agx_batch *batch)
BITSET_FOREACH_SET(handle, (batch)->bo_list.set, agx_batch_bo_list_bits(batch))
void agx_flush_batch(struct agx_context *ctx, struct agx_batch *batch);
void agx_flush_all(struct agx_context *ctx, const char *reason);
void agx_flush_readers(struct agx_context *ctx, struct agx_resource *rsrc, const char *reason);
void agx_flush_writer(struct agx_context *ctx, struct agx_resource *rsrc, const char *reason);
@@ -415,6 +434,9 @@ void agx_flush_writer(struct agx_context *ctx, struct agx_resource *rsrc, const
void agx_batch_reads(struct agx_batch *batch, struct agx_resource *rsrc);
void agx_batch_writes(struct agx_batch *batch, struct agx_resource *rsrc);
struct agx_batch *agx_get_batch(struct agx_context *ctx);
void agx_batch_cleanup(struct agx_context *ctx, struct agx_batch *batch);
/* Blit shaders */
void
agx_blitter_save(struct agx_context *ctx, struct blitter_context *blitter,
@@ -426,12 +448,6 @@ void agx_blit(struct pipe_context *pipe,
void agx_internal_shaders(struct agx_device *dev);
/* Batch logic */
static void
agx_flush_all(struct agx_context *ctx, const char *reason)
{
perf_debug_ctx(ctx, "Flushing due to: %s\n", reason);
ctx->base.flush(&ctx->base, NULL, 0);
}
void
agx_batch_init_state(struct agx_batch *batch);
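
Finally, a hedged usage sketch of the public helpers declared in this header, from the point of view of hypothetical callers: cpu_access_example and gpu_access_example are illustrative names only, while the agx_* calls and their signatures are the ones declared above.

```c
#include "agx_state.h" /* assumed include providing the declarations above */

/* CPU access: before the CPU reads a resource, the batch writing it must be
 * flushed; before the CPU writes it, batches still reading it must go too.
 */
static void cpu_access_example(struct agx_context *ctx,
                               struct agx_resource *rsrc, bool write)
{
   if (write)
      agx_flush_readers(ctx, rsrc, "CPU write to resource");

   agx_flush_writer(ctx, rsrc, "CPU access to resource");

   /* ... map rsrc->bo and access it on the CPU ... */
}

/* GPU access from a draw: fetch (or create) the batch for the current
 * framebuffer and record the dependency; the hazard tracking handles
 * flushing other batches as needed.
 */
static void gpu_access_example(struct agx_context *ctx,
                               struct agx_resource *rsrc, bool write)
{
   struct agx_batch *batch = agx_get_batch(ctx);

   if (write)
      agx_batch_writes(batch, rsrc);
   else
      agx_batch_reads(batch, rsrc);
}
```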