From 401b400de3f2d84bfb3436b7fed0fa90cf70b8e4 Mon Sep 17 00:00:00 2001
From: Alyssa Rosenzweig <alyssa@rosenzweig.io>
Date: Fri, 10 Jan 2025 13:19:22 -0500
Subject: [PATCH] nir,asahi,hk: add barrier argument to MESA_DISPATCH_PRECOMP

In the current API, precomp implicitly assumes full barriers both before & after
every dispatch. That's not good for performance. However, dropping the barriers
and requiring users to explicitly call barrier functions before/after would have
bad ergonomics.

So, we add a new parameter to the standard MESA_DISPATCH_PRECOMP signature
representing the barriers required around the dispatch. As usual, the actual
type & semantics are left to drivers to define as makes sense for their
hardware. We just reserve the place for it. (I think most drivers will want
bitflags here, but I don't think the actual flags are worth standardizing. If a
driver wanted to use a struct here, that would work too.)

Since the asahi stack doesn't do anything clever with barriers yet, we
mechanically add an AGX_BARRIER_ALL barrier to all precomp users in-tree. We can
optimize that later; this just gets the flag-day change in with no functional
change.

For JM panfrost, this will provide a convenient place to stash both their "job
barrier" bit and their "suppress prefetch" bit (which is really a sort of
barrier / cache flush, if you think about it).

Signed-off-by: Alyssa Rosenzweig <alyssa@rosenzweig.io>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/32980>
---
 src/asahi/lib/agx_helpers.h           | 13 +++----
 src/asahi/vulkan/hk_cmd_buffer.c      |  3 +-
 src/asahi/vulkan/hk_cmd_buffer.h      |  5 +--
 src/asahi/vulkan/hk_cmd_dispatch.c    |  4 +--
 src/asahi/vulkan/hk_cmd_draw.c        | 49 +++++++++++++++------------
 src/asahi/vulkan/hk_query_pool.c      |  8 ++---
 src/compiler/nir/nir_precompiled.h    | 17 +++++-----
 src/gallium/drivers/asahi/agx_query.c |  3 +-
 src/gallium/drivers/asahi/agx_state.c | 48 ++++++++++++++------------
 src/gallium/drivers/asahi/agx_state.h |  4 +--
 10 files changed, 86 insertions(+), 68 deletions(-)

diff --git a/src/asahi/lib/agx_helpers.h b/src/asahi/lib/agx_helpers.h
index b80208c83c1..d7a711a2b39 100644
--- a/src/asahi/lib/agx_helpers.h
+++ b/src/asahi/lib/agx_helpers.h
@@ -260,20 +260,21 @@ agx_fill_decompress_args(struct ail_layout *layout, unsigned layer,
 }
 
 #undef libagx_decompress
-#define libagx_decompress(context, grid, layout, layer, level, ptr, images)    \
+#define libagx_decompress(context, grid, barrier, layout, layer, level, ptr,   \
+                          images)                                              \
    libagx_decompress_struct(                                                   \
-      context, grid,                                                           \
+      context, grid, barrier,                                                  \
       agx_fill_decompress_args(layout, layer, level, ptr, images),             \
       util_logbase2(layout->sample_count_sa))
 
-#define libagx_tessellate(context, grid, prim, mode, state)                    \
+#define libagx_tessellate(context, grid, barrier, prim, mode, state)           \
    if (prim == TESS_PRIMITIVE_QUADS) {                                         \
-      libagx_tess_quad(context, grid, state, mode);                            \
+      libagx_tess_quad(context, grid, barrier, state, mode);                   \
    } else if (prim == TESS_PRIMITIVE_TRIANGLES) {                              \
-      libagx_tess_tri(context, grid, state, mode);                             \
+      libagx_tess_tri(context, grid, barrier, state, mode);                    \
    } else {                                                                    \
       assert(prim == TESS_PRIMITIVE_ISOLINES);                                 \
-      libagx_tess_isoline(context, grid, state, mode);                         \
+      libagx_tess_isoline(context, grid, barrier, state, mode);                \
    }
 
 struct agx_border_packed;
diff --git a/src/asahi/vulkan/hk_cmd_buffer.c b/src/asahi/vulkan/hk_cmd_buffer.c
index 7f6ef6978d2..0d5862a48f4 100644
--- a/src/asahi/vulkan/hk_cmd_buffer.c
+++ b/src/asahi/vulkan/hk_cmd_buffer.c
@@ -709,7 +709,8 @@ hk_upload_usc_words(struct hk_cmd_buffer *cmd, struct hk_shader *s,
 
 void
 hk_dispatch_precomp(struct hk_cs *cs, struct agx_grid grid,
-                    enum libagx_program idx, void *data, size_t data_size)
+                    enum agx_barrier barrier, enum libagx_program idx,
+                    void *data, size_t data_size)
 {
    struct hk_device *dev = hk_cmd_buffer_device(cs->cmd);
    struct agx_precompiled_shader *prog = agx_get_precompiled(&dev->bg_eot, idx);
diff --git a/src/asahi/vulkan/hk_cmd_buffer.h b/src/asahi/vulkan/hk_cmd_buffer.h
index 6e50279c4d6..87f0298695f 100644
--- a/src/asahi/vulkan/hk_cmd_buffer.h
+++ b/src/asahi/vulkan/hk_cmd_buffer.h
@@ -803,8 +803,9 @@ hk_dispatch_with_local_size(struct hk_cmd_buffer *cmd, struct hk_cs *cs,
    hk_dispatch_with_usc(dev, cs, &s->b.info, usc, grid, local_size);
 }
 
-void hk_dispatch_precomp(struct hk_cs *cs, struct agx_grid gird,
-                         enum libagx_program idx, void *data, size_t data_size);
+void hk_dispatch_precomp(struct hk_cs *cs, struct agx_grid grid,
+                         enum agx_barrier barrier, enum libagx_program idx,
+                         void *data, size_t data_size);
 
 #define MESA_DISPATCH_PRECOMP hk_dispatch_precomp
 
diff --git a/src/asahi/vulkan/hk_cmd_dispatch.c b/src/asahi/vulkan/hk_cmd_dispatch.c
index d1131c99280..2d49def6fa0 100644
--- a/src/asahi/vulkan/hk_cmd_dispatch.c
+++ b/src/asahi/vulkan/hk_cmd_dispatch.c
@@ -103,8 +103,8 @@ dispatch(struct hk_cmd_buffer *cmd, struct agx_grid grid)
       perf_debug(dev, "CS invocation statistic");
       uint64_t grid = cmd->state.cs.descriptors.root.cs.group_count_addr;
 
-      libagx_increment_cs_invocations(cs, agx_1d(1), grid, stat,
-                                      agx_workgroup_threads(local_size));
+      libagx_increment_cs_invocations(cs, agx_1d(1), grid, AGX_BARRIER_ALL,
+                                      stat, agx_workgroup_threads(local_size));
    }
 
    hk_ensure_cs_has_space(cmd, cs, 0x2000 /* TODO */);
diff --git a/src/asahi/vulkan/hk_cmd_draw.c b/src/asahi/vulkan/hk_cmd_draw.c
index 80d48f7ea5d..e98a87e4f81 100644
--- a/src/asahi/vulkan/hk_cmd_draw.c
+++ b/src/asahi/vulkan/hk_cmd_draw.c
@@ -861,7 +861,8 @@ hk_CmdBeginRendering(VkCommandBuffer commandBuffer,
                   agx_3d(ail_metadata_width_tl(layout, level) * 32,
                          ail_metadata_height_tl(layout, level), layer_count);
 
-               libagx_decompress(cs, grid, layout, layer, level, base,
+               libagx_decompress(cs, grid, AGX_BARRIER_ALL, layout, layer,
+                                 level, base,
                                  hk_pool_upload(cmd, &imgs, sizeof(imgs), 64));
             }
          }
@@ -1389,8 +1390,8 @@ hk_draw_without_restart(struct hk_cmd_buffer *cmd, struct hk_cs *cs,
       .zero_sink = dev->rodata.zero_sink,
    };
 
-   libagx_unroll_restart_struct(cs, agx_1d(1024 * draw_count), ia,
-                                draw.index_size, libagx_compact_prim(prim));
+   libagx_unroll_restart_struct(cs, agx_1d(1024 * draw_count), AGX_BARRIER_ALL,
+                                ia, draw.index_size, libagx_compact_prim(prim));
 
    return agx_draw_indexed_indirect(ia.out_draw, dev->heap->va->addr,
                                     dev->heap->size, draw.index_size,
@@ -1460,7 +1461,7 @@ hk_launch_gs_prerast(struct hk_cmd_buffer *cmd, struct hk_cs *cs,
          gsi.index_buffer_range_el = agx_draw_index_range_el(draw);
       }
 
-      libagx_gs_setup_indirect_struct(cs, agx_1d(1), gsi);
+      libagx_gs_setup_indirect_struct(cs, agx_1d(1), AGX_BARRIER_ALL, gsi);
 
       grid_vs = agx_grid_indirect(
          geometry_params + offsetof(struct agx_geometry_params, vs_grid));
@@ -1486,7 +1487,8 @@ hk_launch_gs_prerast(struct hk_cmd_buffer *cmd, struct hk_cs *cs,
       hk_dispatch_with_local_size(cmd, cs, count, grid_gs,
                                   agx_workgroup(1, 1, 1));
 
-      libagx_prefix_sum_geom(cs, agx_1d(1024 * count_words), geometry_params);
+      libagx_prefix_sum_geom(cs, agx_1d(1024 * count_words), AGX_BARRIER_ALL,
+                             geometry_params);
    }
 
    /* Pre-GS shader */
@@ -1549,7 +1551,7 @@ hk_launch_tess(struct hk_cmd_buffer *cmd, struct hk_cs *cs,
          args.in_index_buffer_range_el = agx_draw_index_range_el(draw);
       }
 
-      libagx_tess_setup_indirect_struct(cs, agx_1d(1), args);
+      libagx_tess_setup_indirect_struct(cs, agx_1d(1), AGX_BARRIER_ALL, args);
 
       uint32_t grid_stride = sizeof(uint32_t) * 6;
       grid_vs = agx_grid_indirect_local(gfx->tess.grids + 0 * grid_stride);
@@ -1565,7 +1567,8 @@ hk_launch_tess(struct hk_cmd_buffer *cmd, struct hk_cs *cs,
       /* TCS invocation counter increments once per-patch */
       if (tcs_stat) {
          perf_debug(dev, "Direct TCS statistic");
-         libagx_increment_statistic(cs, agx_1d(1), tcs_stat, patches);
+         libagx_increment_statistic(cs, agx_1d(1), AGX_BARRIER_ALL, tcs_stat,
+                                    patches);
       }
    }
 
@@ -1583,10 +1586,13 @@ hk_launch_tess(struct hk_cmd_buffer *cmd, struct hk_cs *cs,
       grid_tcs, agx_workgroup(tcs->info.tess.tcs_output_patch_size, 1, 1));
 
    /* First generate counts, then prefix sum them, and then tessellate. */
-   libagx_tessellate(cs, grid_tess, info.mode, LIBAGX_TESS_MODE_COUNT, state);
-   libagx_prefix_sum_tess(cs, agx_1d(1024), state);
-   libagx_tessellate(cs, grid_tess, info.mode, LIBAGX_TESS_MODE_WITH_COUNTS,
-                     state);
+   libagx_tessellate(cs, grid_tess, AGX_BARRIER_ALL, info.mode,
+                     LIBAGX_TESS_MODE_COUNT, state);
+
+   libagx_prefix_sum_tess(cs, agx_1d(1024), AGX_BARRIER_ALL, state);
+
+   libagx_tessellate(cs, grid_tess, AGX_BARRIER_ALL, info.mode,
+                     LIBAGX_TESS_MODE_WITH_COUNTS, state);
 
    return agx_draw_indexed_indirect(gfx->tess.out_draws, dev->heap->va->addr,
                                     dev->heap->size, AGX_INDEX_SIZE_U32, false);
@@ -3358,12 +3364,13 @@ hk_ia_update(struct hk_cmd_buffer *cmd, struct hk_cs *cs, struct agx_draw draw,
       uint32_t index_size_B = agx_index_size_to_B(draw.index_size);
 
       libagx_increment_ia_restart(
-         cs, agx_1d(1024), ia_vertices, ia_prims, vs_invocations, c_prims,
-         c_inv, draw_ptr, draw.index_buffer, agx_draw_index_range_el(draw),
-         cmd->state.gfx.index.restart, index_size_B, prim);
+         cs, agx_1d(1024), AGX_BARRIER_ALL, ia_vertices, ia_prims,
+         vs_invocations, c_prims, c_inv, draw_ptr, draw.index_buffer,
+         agx_draw_index_range_el(draw), cmd->state.gfx.index.restart,
+         index_size_B, prim);
    } else {
-      libagx_increment_ia(cs, agx_1d(1), ia_vertices, ia_prims, vs_invocations,
-                          c_prims, c_inv, draw_ptr, prim);
+      libagx_increment_ia(cs, agx_1d(1), AGX_BARRIER_ALL, ia_vertices, ia_prims,
+                          vs_invocations, c_prims, c_inv, draw_ptr, prim);
    }
 }
 
@@ -3476,7 +3483,7 @@ hk_draw(struct hk_cmd_buffer *cmd, uint16_t draw_id, struct agx_draw draw_)
             struct hk_descriptor_state *desc = &cmd->state.gfx.descriptors;
 
             libagx_draw_without_adj(
-               ccs, agx_1d(1), out_draw, draw.b.ptr,
+               ccs, agx_1d(1), AGX_BARRIER_ALL, out_draw, draw.b.ptr,
                desc->root.draw.input_assembly, draw.index_buffer,
                draw.indexed ? agx_draw_index_range_el(draw) : 0,
                draw.indexed ? agx_index_size_to_B(draw.index_size) : 0, prim);
@@ -3503,7 +3510,7 @@ hk_draw(struct hk_cmd_buffer *cmd, uint16_t draw_id, struct agx_draw draw_)
          size_t size_B = libagx_draw_robust_index_vdm_size();
          uint64_t target = hk_cs_alloc_for_indirect(cs, size_B);
 
-         libagx_draw_robust_index(ccs, agx_1d(32), target,
+         libagx_draw_robust_index(ccs, agx_1d(32), AGX_BARRIER_ALL, target,
                                   hk_geometry_state(cmd), draw.b.ptr,
                                   draw.index_buffer, draw.index_buffer_range_B,
                                   draw.restart, topology, draw.index_size);
@@ -3728,8 +3735,8 @@ hk_draw_indirect_count(VkCommandBuffer commandBuffer, VkBuffer _buffer,
    uint64_t in = hk_buffer_address(buffer, offset);
    uint64_t count_addr = hk_buffer_address(count_buffer, countBufferOffset);
 
-   libagx_predicate_indirect(cs, agx_1d(maxDrawCount), patched, in, count_addr,
-                             stride / 4, indexed);
+   libagx_predicate_indirect(cs, agx_1d(maxDrawCount), AGX_BARRIER_ALL, patched,
+                             in, count_addr, stride / 4, indexed);
 
    if (indexed) {
       hk_draw_indexed_indirect_inner(commandBuffer, patched, maxDrawCount,
@@ -3847,7 +3854,7 @@ hk_begin_end_xfb(VkCommandBuffer commandBuffer, uint32_t firstCounterBuffer,
    if (copies > 0) {
       perf_debug(dev, "XFB counter copy");
 
-      libagx_copy_xfb_counters(cs, agx_1d(copies),
+      libagx_copy_xfb_counters(cs, agx_1d(copies), AGX_BARRIER_ALL,
                                hk_pool_upload(cmd, &params, sizeof(params), 8));
    }
 }
diff --git a/src/asahi/vulkan/hk_query_pool.c b/src/asahi/vulkan/hk_query_pool.c
index e0187c7ba7f..7d132ba498c 100644
--- a/src/asahi/vulkan/hk_query_pool.c
+++ b/src/asahi/vulkan/hk_query_pool.c
@@ -268,7 +268,7 @@ hk_dispatch_imm_writes(struct hk_cmd_buffer *cmd, struct hk_cs *cs)
       util_dynarray_num_elements(&cs->imm_writes, struct libagx_imm_write);
    assert(count > 0);
 
-   libagx_write_u32s(cs, agx_1d(count), params);
+   libagx_write_u32s(cs, agx_1d(count), AGX_BARRIER_ALL, params);
 }
 
 void
@@ -305,7 +305,7 @@ hk_queue_write(struct hk_cmd_buffer *cmd, uint64_t address, uint32_t value,
    hk_cdm_cache_flush(dev, cs);
 
    perf_debug(dev, "Queued write");
-   libagx_write_u32(cs, agx_1d(1), address, value);
+   libagx_write_u32(cs, agx_1d(1), AGX_BARRIER_ALL, address, value);
 }
 
 /**
@@ -433,7 +433,7 @@ hk_CmdWriteTimestamp2(VkCommandBuffer commandBuffer,
       if (!after)
          return;
 
-      libagx_copy_timestamp(after, agx_1d(1), report_addr,
+      libagx_copy_timestamp(after, agx_1d(1), AGX_BARRIER_ALL, report_addr,
                             cs->timestamp.end.addr);
    } else {
       cs->timestamp.end = (struct agx_timestamp_req){
@@ -702,5 +702,5 @@ hk_CmdCopyQueryPoolResults(VkCommandBuffer commandBuffer, VkQueryPool queryPool,
       .with_availability = flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT,
    };
 
-   libagx_copy_query_struct(cs, agx_1d(queryCount), info);
+   libagx_copy_query_struct(cs, agx_1d(queryCount), AGX_BARRIER_ALL, info);
 }
diff --git a/src/compiler/nir/nir_precompiled.h b/src/compiler/nir/nir_precompiled.h
index 48d34c81fa5..01eed53fbd3 100644
--- a/src/compiler/nir/nir_precompiled.h
+++ b/src/compiler/nir/nir_precompiled.h
@@ -104,14 +104,15 @@
  * implement that mechanism, a driver must implement the following function
  * signature:
  *
- *    MESA_DISPATCH_PRECOMP(context, grid, kernel index, argument pointer,
- *                          size of arguments)
+ *    MESA_DISPATCH_PRECOMP(context, grid, barrier, kernel index,
+ *                          argument pointer, size of arguments)
  *
  * The exact types used are determined by the driver. context is something like
- * a Vulkan command buffer. grid represents the 3D dispatch size. kernel index
- * is the index of the precompiled kernel (nir_precomp_index). argument pointer
- * is a host pointer to the sized argument structure, which the driver must
- * upload and bind (e.g. as push constants).
+ * a Vulkan command buffer. grid represents the 3D dispatch size. barrier
+ * describes the synchronization and cache flushing required before and after
+ * the dispatch. kernel index is the index of the precompiled kernel
+ * (nir_precomp_index). argument pointer is a host pointer to the sized argument
+ * structure, which the driver must upload and bind (e.g. as push constants).
  *
  * Because the types are ambiguous here, the same mechanism works for both
  * Gallium and Vulkan drivers.
@@ -479,7 +480,7 @@ nir_precomp_print_dispatch_macros(FILE *fp, const struct nir_precomp_opts *opt,
       for (unsigned i = 0; i < 2; ++i) {
          bool is_struct = i == 0;
 
-         fprintf(fp, "#define %s%s(_context, _grid%s", func->name,
+         fprintf(fp, "#define %s%s(_context, _grid, _barrier%s", func->name,
                  is_struct ? "_struct" : "", is_struct ? ", _data" : "");
 
          /* Add the arguments, including variant parameters. For struct macros,
@@ -523,7 +524,7 @@ nir_precomp_print_dispatch_macros(FILE *fp, const struct nir_precomp_opts *opt,
          /* Dispatch via MESA_DISPATCH_PRECOMP, which the driver must #define
           * suitably before #include-ing this file.
           */
-         fprintf(fp, "   MESA_DISPATCH_PRECOMP(_context, _grid, ");
+         fprintf(fp, "   MESA_DISPATCH_PRECOMP(_context, _grid, _barrier, ");
          nir_precomp_print_enum_value(fp, func);
          nir_precomp_print_variant_params(fp, func, false);
          fprintf(fp, ", &_args, sizeof(_args)); \\\n");
diff --git a/src/gallium/drivers/asahi/agx_query.c b/src/gallium/drivers/asahi/agx_query.c
index c15cc6f226c..2a8ff5c1c6b 100644
--- a/src/gallium/drivers/asahi/agx_query.c
+++ b/src/gallium/drivers/asahi/agx_query.c
@@ -15,6 +15,7 @@
 #include "agx_device.h"
 #include "agx_state.h"
 #include "libagx.h"
+#include "libagx_dgc.h"
 #include "libagx_shaders.h"
 
 static bool
@@ -500,7 +501,7 @@ agx_get_query_result_resource_gpu(struct agx_context *ctx,
                         : copy_type == QUERY_COPY_BOOL32 ? 4
                                                          : 0;
 
-   libagx_copy_query_gl(batch, agx_1d(1), query->ptr.gpu,
+   libagx_copy_query_gl(batch, agx_1d(1), AGX_BARRIER_ALL, query->ptr.gpu,
                         rsrc->bo->va->addr + offset, result_type, bool_size);
    return true;
 }
diff --git a/src/gallium/drivers/asahi/agx_state.c b/src/gallium/drivers/asahi/agx_state.c
index 6fa06b66c4c..c0d4f0ccc70 100644
--- a/src/gallium/drivers/asahi/agx_state.c
+++ b/src/gallium/drivers/asahi/agx_state.c
@@ -3103,7 +3103,8 @@ agx_launch_internal(struct agx_batch *batch, struct agx_grid grid,
 
 void
 agx_launch_precomp(struct agx_batch *batch, struct agx_grid grid,
-                   enum libagx_program program, void *args, size_t arg_size)
+                   enum agx_barrier barrier, enum libagx_program program,
+                   void *args, size_t arg_size)
 {
    struct agx_device *dev = agx_device(batch->ctx->base.screen);
    struct agx_precompiled_shader *cs =
@@ -3935,14 +3936,15 @@ agx_ia_update(struct agx_batch *batch, const struct pipe_draw_info *info,
       perf_debug(dev, "Input assembly counters with primitive restart");
 
       libagx_increment_ia_restart(
-         batch, agx_1d(1024), ia_vertices, ia_primitives, vs_invocations,
-         c_prims, c_invs, draw, ib, ib_range_el, info->restart_index,
-         info->index_size, info->mode);
+         batch, agx_1d(1024), AGX_BARRIER_ALL, ia_vertices, ia_primitives,
+         vs_invocations, c_prims, c_invs, draw, ib, ib_range_el,
+         info->restart_index, info->index_size, info->mode);
    } else {
       perf_debug(dev, "Input assembly counters");
 
-      libagx_increment_ia(batch, agx_1d(1), ia_vertices, ia_primitives,
-                          vs_invocations, c_prims, c_invs, draw, info->mode);
+      libagx_increment_ia(batch, agx_1d(1), AGX_BARRIER_ALL, ia_vertices,
+                          ia_primitives, vs_invocations, c_prims, c_invs, draw,
+                          info->mode);
    }
 }
 
@@ -4146,7 +4148,7 @@ agx_launch_gs_prerast(struct agx_batch *batch,
          .prim = info->mode,
       };
 
-      libagx_gs_setup_indirect_struct(batch, agx_1d(1), gsi);
+      libagx_gs_setup_indirect_struct(batch, agx_1d(1), AGX_BARRIER_ALL, gsi);
 
       wg = agx_workgroup(1, 1, 1);
       grid_vs =
@@ -4172,7 +4174,8 @@ agx_launch_gs_prerast(struct agx_batch *batch,
       agx_launch(batch, grid_gs, wg, gs->gs_count, NULL, PIPE_SHADER_GEOMETRY,
                  0);
 
-      libagx_prefix_sum_geom(batch, agx_1d(1024 * gs->gs_count_words), gp);
+      libagx_prefix_sum_geom(batch, agx_1d(1024 * gs->gs_count_words),
+                             AGX_BARRIER_ALL, gp);
    }
 
    /* Pre-GS shader */
@@ -4243,9 +4246,9 @@ agx_draw_without_restart(struct agx_batch *batch,
    };
 
    /* Unroll the index buffer for each draw */
-   libagx_unroll_restart_struct(batch, agx_1d(1024 * indirect->draw_count),
-                                unroll, util_logbase2(info->index_size),
-                                libagx_compact_prim(info->mode));
+   libagx_unroll_restart_struct(
+      batch, agx_1d(1024 * indirect->draw_count), AGX_BARRIER_ALL, unroll,
+      util_logbase2(info->index_size), libagx_compact_prim(info->mode));
 
    /* Now draw the results without restart */
    struct pipe_draw_info new_info = {
@@ -4675,10 +4678,10 @@ agx_draw_patches(struct agx_context *ctx, const struct pipe_draw_info *info,
       uint64_t grids =
          agx_pool_alloc_aligned(&batch->pool, grid_stride * 3, 4).gpu;
 
-      libagx_tess_setup_indirect(batch, agx_1d(1), state, grids,
-                                 0 /* XXX: IA */, indirect_ptr, vertex_out_ptr,
-                                 0, 0, 0 /* XXX: Index buffer */,
-                                 ctx->vs->b.info.outputs, tcs_statistic);
+      libagx_tess_setup_indirect(
+         batch, agx_1d(1), AGX_BARRIER_ALL, state, grids, 0 /* XXX: IA */,
+         indirect_ptr, vertex_out_ptr, 0, 0, 0 /* XXX: Index buffer */,
+         ctx->vs->b.info.outputs, tcs_statistic);
 
       batch->uniforms.vertex_output_buffer_ptr = vertex_out_ptr;
 
@@ -4698,10 +4701,11 @@ agx_draw_patches(struct agx_context *ctx, const struct pipe_draw_info *info,
    batch->uniforms.vertex_output_buffer_ptr = 0;
 
    /* Generate counts, then prefix sum them, then finally tessellate. */
-   libagx_tessellate(batch, tess_grid, mode, LIBAGX_TESS_MODE_COUNT, state);
-   libagx_prefix_sum_tess(batch, agx_1d(1024), state);
-   libagx_tessellate(batch, tess_grid, mode, LIBAGX_TESS_MODE_WITH_COUNTS,
-                     state);
+   libagx_tessellate(batch, tess_grid, AGX_BARRIER_ALL, mode,
+                     LIBAGX_TESS_MODE_COUNT, state);
+   libagx_prefix_sum_tess(batch, agx_1d(1024), AGX_BARRIER_ALL, state);
+   libagx_tessellate(batch, tess_grid, AGX_BARRIER_ALL, mode,
+                     LIBAGX_TESS_MODE_WITH_COUNTS, state);
 
    /* Face culling state needs to be specialized for tess */
    ctx->dirty |= AGX_DIRTY_RS;
@@ -5307,7 +5311,8 @@ agx_launch_grid(struct pipe_context *pipe, const struct pipe_grid_info *info)
       if (indirect) {
          uint64_t addr = agx_get_query_address(batch, statistic);
 
-         libagx_increment_cs_invocations(batch, agx_1d(1), indirect, addr,
+         libagx_increment_cs_invocations(batch, agx_1d(1), AGX_BARRIER_ALL,
+                                         indirect, addr,
                                          agx_workgroup_threads(wg));
       } else {
          agx_query_increment_cpu(ctx, statistic,
@@ -5435,7 +5440,8 @@ agx_decompress_inplace(struct agx_batch *batch, struct pipe_surface *surf,
              ail_metadata_height_tl(layout, level),
              surf->u.tex.last_layer - surf->u.tex.first_layer + 1);
 
-   libagx_decompress(batch, grid, layout, surf->u.tex.first_layer, level,
+   libagx_decompress(batch, grid, AGX_BARRIER_ALL, layout,
+                     surf->u.tex.first_layer, level,
                      agx_map_texture_gpu(rsrc, 0), images.gpu);
 }
 
diff --git a/src/gallium/drivers/asahi/agx_state.h b/src/gallium/drivers/asahi/agx_state.h
index 1a70ab79261..628c0e84e76 100644
--- a/src/gallium/drivers/asahi/agx_state.h
+++ b/src/gallium/drivers/asahi/agx_state.h
@@ -796,8 +796,8 @@ void agx_launch(struct agx_batch *batch, struct agx_grid grid,
                 unsigned variable_shared_mem);
 
 void agx_launch_precomp(struct agx_batch *batch, struct agx_grid grid,
-                        enum libagx_program program, void *args,
-                        size_t arg_size);
+                        enum agx_barrier barrier, enum libagx_program program,
+                        void *args, size_t arg_size);
 
 #define MESA_DISPATCH_PRECOMP agx_launch_precomp