asahi: Unroll GS/XFB primitive restart on the GPU

..and fix bugs versus the CPU unroll while we're at it. CPU-based unrolling is
invalid in Vulkan, but this slow-as-dogs GPU unroll is ok.

Signed-off-by: Alyssa Rosenzweig <alyssa@rosenzweig.io>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26614>
Alyssa Rosenzweig 2023-11-27 11:34:31 -04:00
parent 15957219ad
commit f4a648c607
6 changed files with 293 additions and 44 deletions
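For context on what the kernels below compute: primitive restart lets a single indexed draw pack several strips or fans into one index buffer by reserving a sentinel index that cuts the current primitive. "Unrolling" rewrites that buffer into a restart-free list of complete primitives so the hardware never sees the sentinel. A minimal standalone sketch of the splitting step, in plain C with made-up index data rather than anything from the driver:

#include <stdint.h>
#include <stdio.h>

/* Print the restart-free runs contained in an index stream. Each run can
 * then be decomposed into primitives independently, which is what the GPU
 * unroll kernel added by this commit does per draw. */
static void
print_runs(const uint16_t *indices, unsigned count, uint16_t restart)
{
   unsigned start = 0;
   for (unsigned i = 0; i <= count; ++i) {
      if (i == count || indices[i] == restart) {
         if (i > start) {
            printf("run of %u indices:", i - start);
            for (unsigned j = start; j < i; ++j)
               printf(" %u", indices[j]);
            printf("\n");
         }
         start = i + 1;
      }
   }
}

int
main(void)
{
   /* Two triangle strips packed into one draw via the 0xFFFF sentinel */
   const uint16_t ib[] = {0, 1, 2, 3, 0xFFFF, 4, 5, 6, 7};
   print_runs(ib, sizeof(ib) / sizeof(ib[0]), 0xFFFF);
   return 0;
}

Doing this in a compute kernel, as the commit does, keeps it working for indirect draws, whose index counts are only known on the GPU.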


@@ -1189,3 +1189,28 @@ agx_nir_gs_setup_indirect(const nir_shader *libagx, enum mesa_prim prim,
agx_preprocess_nir(b.shader, libagx, false, &info);
return b.shader;
}
nir_shader *
agx_nir_unroll_restart(const nir_shader *libagx, enum mesa_prim prim,
unsigned index_size_B)
{
nir_builder b = nir_builder_init_simple_shader(
MESA_SHADER_COMPUTE, &agx_nir_options, "Primitive restart unroll");
nir_def *ia = nir_load_input_assembly_buffer_agx(&b);
nir_def *draw = nir_channel(&b, nir_load_workgroup_id(&b), 0);
nir_def *mode = nir_imm_int(&b, prim);
if (index_size_B == 1)
libagx_unroll_restart_u8(&b, ia, mode, draw);
else if (index_size_B == 2)
libagx_unroll_restart_u16(&b, ia, mode, draw);
else if (index_size_B == 4)
libagx_unroll_restart_u32(&b, ia, mode, draw);
else
unreachable("invalid index size");
UNUSED struct agx_uncompiled_shader_info info;
agx_preprocess_nir(b.shader, libagx, false, &info);
return b.shader;
}


@@ -29,4 +29,8 @@ struct nir_shader *agx_nir_gs_setup_indirect(const struct nir_shader *libagx,
enum mesa_prim prim,
bool multidraw);
struct nir_shader *agx_nir_unroll_restart(const struct nir_shader *libagx,
enum mesa_prim prim,
unsigned index_size_B);
#endif


@@ -6,6 +6,12 @@
#include "geometry.h"
static uint
align(uint x, uint y)
{
return (x + y - 1) & ~(y - 1);
}
/* TODO: Primitive restart */
uint
libagx_vertex_id_for_topology(enum mesa_prim mode, bool flatshade_first,
@@ -110,6 +116,101 @@ libagx_vertex_id_for_topology(enum mesa_prim mode, bool flatshade_first,
}
}
/*
* When unrolling the index buffer for a draw, we translate the old indirect
* draws to new indirect draws. This routine allocates the new index buffer and
* sets up most of the new draw descriptor.
*/
static global void *
setup_unroll_for_draw(global struct agx_ia_state *ia, constant uint *in_draw,
uint draw, enum mesa_prim mode, uint index_size_B)
{
/* Determine an upper bound on the memory required for the index buffer.
* Restarts only decrease the unrolled index buffer size, so the maximum size
* is the unrolled size when the input has no restarts.
*/
uint max_prims = u_decomposed_prims_for_vertices(mode, in_draw[0]);
uint max_verts = max_prims * mesa_vertices_per_prim(mode);
uint alloc_size = max_verts * index_size_B;
/* Allocate memory from the heap for the unrolled index buffer. Use an atomic
* since multiple threads may be running to handle multidraw in parallel.
*/
global struct agx_geometry_state *heap = ia->heap;
uint old_heap_bottom = atomic_fetch_add(
(volatile atomic_uint *)(&heap->heap_bottom), align(alloc_size, 4));
/* Regardless of the input stride, we use tightly packed output draws */
global uint *out = &ia->out_draws[5 * draw];
/* Setup most of the descriptor. Count will be determined after unroll. */
out[1] = in_draw[1]; /* instance count */
out[2] = old_heap_bottom / index_size_B; /* index offset */
out[3] = in_draw[3]; /* index bias */
out[4] = in_draw[4]; /* base instance */
/* Return the index buffer we allocated */
return (global uchar *)heap->heap + old_heap_bottom;
}
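
setup_unroll_for_draw() above sizes the allocation for the worst case: restarts can only remove primitives, so a restart-free input with the same index count is an upper bound. A standalone sketch of that sizing, where decomposed_prims_for_vertices() and vertices_per_prim() are simplified stand-ins for Mesa's u_decomposed_prims_for_vertices() and mesa_vertices_per_prim(), covering only a few modes for illustration:

#include <stdint.h>
#include <stdio.h>

enum prim { PRIM_LINE_STRIP, PRIM_TRIANGLES, PRIM_TRIANGLE_STRIP };

static unsigned
decomposed_prims_for_vertices(enum prim mode, unsigned verts)
{
   switch (mode) {
   case PRIM_LINE_STRIP:     return verts >= 2 ? verts - 1 : 0;
   case PRIM_TRIANGLES:      return verts / 3;
   case PRIM_TRIANGLE_STRIP: return verts >= 3 ? verts - 2 : 0;
   }
   return 0;
}

static unsigned
vertices_per_prim(enum prim mode)
{
   return mode == PRIM_LINE_STRIP ? 2 : 3;
}

static unsigned
align_up(unsigned x, unsigned y)
{
   return (x + y - 1) & ~(y - 1);
}

/* Upper bound on the unrolled index buffer size, in bytes. Restarts can only
 * shrink the result, so sizing for a restart-free input is always enough. */
static unsigned
worst_case_unroll_bytes(enum prim mode, unsigned input_indices,
                        unsigned index_size_B)
{
   unsigned max_prims = decomposed_prims_for_vertices(mode, input_indices);
   unsigned max_verts = max_prims * vertices_per_prim(mode);
   return align_up(max_verts * index_size_B, 4);
}

int
main(void)
{
   /* 100 u16 indices drawn as a triangle strip: at most 98 triangles */
   printf("%u bytes\n", worst_case_unroll_bytes(PRIM_TRIANGLE_STRIP, 100, 2));
   return 0;
}
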
#define UNROLL(INDEX, suffix) \
void libagx_unroll_restart_##suffix(global struct agx_ia_state *ia, \
enum mesa_prim mode, uint draw) \
{ \
/* For an indirect multidraw, we are dispatched maxDraws times and \
* terminate trailing invocations. \
*/ \
if (ia->count && draw >= *(ia->count)) \
return; \
\
constant uint *in_draw = \
(constant uint *)(ia->draws + (draw * ia->draw_stride)); \
\
uint count = in_draw[0]; \
constant INDEX *in = (constant INDEX *)ia->index_buffer; \
\
global INDEX *out = \
setup_unroll_for_draw(ia, in_draw, draw, mode, sizeof(INDEX)); \
\
uint out_prims = 0; \
INDEX restart_idx = ia->restart_index; \
bool flatshade_first = ia->flatshade_first; \
uint in_size_el = ia->index_buffer_size_B / sizeof(INDEX); \
\
uint needle = 0; \
uint per_prim = mesa_vertices_per_prim(mode); \
while (needle < count) { \
/* Search for next restart or the end */ \
uint next_restart = needle; \
while ((next_restart < count) && in[next_restart] != restart_idx) \
++next_restart; \
\
/* Emit up to the next restart */ \
uint subcount = next_restart - needle; \
uint subprims = u_decomposed_prims_for_vertices(mode, subcount); \
for (uint i = 0; i < subprims; ++i) { \
for (uint vtx = 0; vtx < per_prim; ++vtx) { \
uint id = libagx_vertex_id_for_topology(mode, flatshade_first, \
i, vtx, subprims); \
uint offset = needle + id; \
\
out[(out_prims * per_prim) + vtx] = \
offset < in_size_el ? in[offset] : 0; \
} \
\
out_prims++; \
} \
\
needle = next_restart + 1; \
} \
\
ia->out_draws[(5 * draw) + 0] = out_prims * per_prim; \
}
UNROLL(uchar, u8)
UNROLL(ushort, u16)
UNROLL(uint, u32)
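
As a reference for the loop inside UNROLL() above, here is a host-side model of the same scan-and-decompose structure, restricted to line strips so that primitive i of a run is simply vertices {i, i+1}; the flatshade_first/libagx_vertex_id_for_topology() details and the robustness clamp against the input buffer size are elided. This is plain C for illustration, not the libagx kernel itself:

#include <stdint.h>
#include <stdio.h>

/* CPU reference of the restart unroll for LINE_STRIP: scan for the next
 * restart (or the end), decompose the restart-free run into independent
 * lines, and append their indices to the output. Returns the number of
 * output indices (what the kernel stores as the indirect draw's count). */
static unsigned
unroll_line_strip(const uint16_t *in, unsigned count, uint16_t restart,
                  uint16_t *out)
{
   unsigned out_prims = 0;
   unsigned needle = 0;

   while (needle < count) {
      /* Search for the next restart or the end of the buffer */
      unsigned next_restart = needle;
      while (next_restart < count && in[next_restart] != restart)
         ++next_restart;

      /* Emit the run [needle, next_restart) as independent lines */
      unsigned subcount = next_restart - needle;
      unsigned subprims = subcount >= 2 ? subcount - 1 : 0;

      for (unsigned i = 0; i < subprims; ++i) {
         out[out_prims * 2 + 0] = in[needle + i];
         out[out_prims * 2 + 1] = in[needle + i + 1];
         out_prims++;
      }

      needle = next_restart + 1;
   }

   return out_prims * 2;
}

int
main(void)
{
   const uint16_t in[] = {0, 1, 2, 0xFFFF, 3, 4};
   uint16_t out[16];
   unsigned n = unroll_line_strip(in, sizeof(in) / sizeof(in[0]), 0xFFFF, out);

   for (unsigned i = 0; i < n; i += 2)
      printf("line %u: %u %u\n", i / 2, out[i], out[i + 1]);
   return 0;
}
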
uintptr_t
libagx_index_buffer(constant struct agx_ia_state *p, uint id,
uint index_size)
@@ -193,12 +294,6 @@ libagx_end_primitive(global int *index_buffer, uint total_verts,
out[verts_in_prim] = -1;
}
static uint
align(uint x, uint y)
{
return (x + y - 1) & ~(y - 1);
}
void
libagx_build_gs_draw(global struct agx_geometry_params *p, bool indexed,
uint vertices, uint primitives, uint output_stride_B)


@@ -41,7 +41,20 @@ struct agx_ia_key {
bool indirect_multidraw;
};
/* Packed geometry state buffer */
struct agx_geometry_state {
/* Heap to allocate from, in either direction. By convention, the top is used
* for intra-draw allocations and the bottom is used for full-batch
* allocations. In the future we could use kernel support to improve this.
*/
GLOBAL(uchar) heap;
uint32_t heap_bottom, heap_top, heap_size, padding;
} PACKED;
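
One plausible standalone reading of the bottom/top convention described above: a single buffer bump-allocated from both ends, the bottom advanced atomically for full-batch allocations (as the unroll kernel does for its output index buffer) and the top for intra-draw ones. An illustrative sketch only, with no out-of-space handling; this is not the driver's allocator:

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

struct heap {
   uint8_t *base;
   uint32_t size;
   _Atomic uint32_t bottom; /* grows upward: full-batch allocations   */
   _Atomic uint32_t top;    /* grows downward: intra-draw allocations */
};

static uint32_t
align_up(uint32_t x, uint32_t y)
{
   return (x + y - 1) & ~(y - 1);
}

/* Bump from the bottom; returns a byte offset into the heap. */
static uint32_t
alloc_bottom(struct heap *h, uint32_t size_B)
{
   return atomic_fetch_add(&h->bottom, align_up(size_B, 4));
}

/* Bump from the top; returns a byte offset into the heap. */
static uint32_t
alloc_top(struct heap *h, uint32_t size_B)
{
   uint32_t aligned = align_up(size_B, 4);
   return atomic_fetch_sub(&h->top, aligned) - aligned;
}

int
main(void)
{
   static uint8_t storage[1024];
   struct heap h = {.base = storage, .size = 1024, .bottom = 0, .top = 1024};

   uint32_t a = alloc_bottom(&h, 100); /* offset 0   */
   uint32_t b = alloc_bottom(&h, 10);  /* offset 100 */
   uint32_t c = alloc_top(&h, 64);     /* offset 960 */

   printf("%u %u %u\n", a, b, c);
   return 0;
}
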
struct agx_ia_state {
/* Heap to allocate from across draws */
GLOBAL(struct agx_geometry_state) heap;
/* Input: index buffer if present. */
CONST(uchar) index_buffer;
@@ -57,21 +70,26 @@ struct agx_ia_state {
*/
GLOBAL(uint) prefix_sums;
/* When unrolling primitive restart, output draw descriptors */
GLOBAL(uint) out_draws;
/* Primitive restart index, if unrolling */
uint32_t restart_index;
/* Input index buffer size in bytes, if unrolling */
uint32_t index_buffer_size_B;
/* Stride for the draw descriptor array */
uint32_t draw_stride;
/* The index size (1, 2, 4) or 0 if drawing without an index buffer. */
uint8_t index_size_B;
} PACKED;
/* Packed geometry state buffer */
struct agx_geometry_state {
/* Heap to allocate from, in either direction. By convention, the top is used
* for intra-draw allocations and the bottom is used for full-batch
* allocations. In the future we could use kernel support to improve this.
/* When unrolling primitive restart, use first vertex as the provoking vertex
* for flat shading. We could stick this in the key, but meh, you're already
* hosed for perf on the unroll path.
*/
GLOBAL(uchar) heap;
uint32_t heap_bottom, heap_top, heap_size, padding;
uint32_t flatshade_first;
/* The index size (1, 2, 4) or 0 if drawing without an index buffer. */
uint32_t index_size_B;
} PACKED;
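
The PACKED/GLOBAL/CONST annotations in this header appear to exist so the host-side driver and the OpenCL-C library agree on a byte-exact layout, with device pointers carried as raw 64-bit addresses. A generic illustration of that convention follows; the struct name and fields are invented for the example (loosely echoing agx_ia_state) rather than the real Mesa definitions:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* On the host, device pointers are just 64-bit GPU virtual addresses;
 * fixed-width fields and an explicit packed layout keep the struct
 * identical to what the device-side code reads. */
typedef uint64_t gpu_ptr;

struct shared_ia_state {
   gpu_ptr  heap;                /* -> geometry state buffer           */
   gpu_ptr  index_buffer;        /* input indices, if indexed          */
   gpu_ptr  out_draws;           /* unrolled indirect draw descriptors */
   uint32_t restart_index;
   uint32_t index_buffer_size_B;
   uint32_t draw_stride;
   uint32_t flatshade_first;
   uint32_t index_size_B;
   uint32_t padding;             /* keep total size a multiple of 8    */
} __attribute__((packed));

/* Catch accidental layout drift at compile time. */
static_assert(sizeof(struct shared_ia_state) == 3 * 8 + 6 * 4,
              "host/device layout mismatch");

int
main(void)
{
   printf("%zu bytes per upload\n", sizeof(struct shared_ia_state));
   return 0;
}
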
struct agx_geometry_params {


@@ -39,6 +39,7 @@
#include "util/macros.h"
#include "util/u_dump.h"
#include "util/u_inlines.h"
#include "util/u_math.h"
#include "util/u_memory.h"
#include "util/u_prim.h"
#include "util/u_resource.h"
@@ -3311,6 +3312,17 @@ agx_index_buffer_direct_ptr(struct agx_batch *batch,
}
}
static uint64_t
agx_index_buffer_ptr(struct agx_batch *batch, const struct pipe_draw_info *info,
const struct pipe_draw_start_count_bias *draw,
size_t *extent)
{
if (draw)
return agx_index_buffer_direct_ptr(batch, draw, info, extent);
else
return agx_index_buffer_rsrc_ptr(batch, info, extent);
}
static bool
agx_scissor_culls_everything(struct agx_context *ctx)
{
@@ -3399,16 +3411,20 @@ agx_batch_geometry_state(struct agx_batch *batch)
return batch->geometry_state;
}
static uint64_t
agx_batch_geometry_params(struct agx_batch *batch, uint64_t input_index_buffer,
const struct pipe_draw_info *info,
const struct pipe_draw_start_count_bias *draw,
const struct pipe_draw_indirect_info *indirect)
static void
agx_upload_ia_params(struct agx_batch *batch, const struct pipe_draw_info *info,
const struct pipe_draw_indirect_info *indirect,
uint64_t input_index_buffer, size_t index_buffer_size_B,
uint64_t unroll_output)
{
/* XXX move me */
struct agx_ia_state ia = {
.heap = agx_batch_geometry_state(batch),
.index_buffer = input_index_buffer,
.index_size_B = info->index_size,
.out_draws = unroll_output,
.restart_index = info->restart_index,
.index_buffer_size_B = index_buffer_size_B,
.flatshade_first = batch->ctx->rast->base.flatshade_first,
};
if (indirect) {
@@ -3425,13 +3441,27 @@ agx_batch_geometry_params(struct agx_batch *batch, uint64_t input_index_buffer,
ia.count = rsrc->bo->ptr.gpu + indirect->indirect_draw_count_offset;
ia.draw_stride = indirect->stride;
size_t max_sum_size = sizeof(uint32_t) * indirect->draw_count;
ia.prefix_sums =
agx_pool_alloc_aligned(&batch->pool, max_sum_size, 4).gpu;
/* MDI requires prefix sums, but not for our current unroll path */
if (!unroll_output) {
size_t max_sum_size = sizeof(uint32_t) * indirect->draw_count;
ia.prefix_sums =
agx_pool_alloc_aligned(&batch->pool, max_sum_size, 4).gpu;
}
}
batch->uniforms.input_assembly =
agx_pool_upload_aligned(&batch->pool, &ia, sizeof(ia), 8);
}
static uint64_t
agx_batch_geometry_params(struct agx_batch *batch, uint64_t input_index_buffer,
size_t index_buffer_size_B,
const struct pipe_draw_info *info,
const struct pipe_draw_start_count_bias *draw,
const struct pipe_draw_indirect_info *indirect)
{
agx_upload_ia_params(batch, info, indirect, input_index_buffer,
index_buffer_size_B, 0);
struct agx_geometry_params params = {
.state = agx_batch_geometry_state(batch),
@@ -3511,10 +3541,7 @@ agx_launch_gs(struct agx_batch *batch, const struct pipe_draw_info *info,
batch->cdm = agx_encoder_allocate(batch, dev);
}
if (info->primitive_restart) {
fprintf(stderr, "Mode: %s\n", util_str_prim_mode(info->mode, true));
unreachable("TODO: Primitive restart with GS");
}
assert(!info->primitive_restart && "should have been lowered");
struct pipe_grid_info grid = {.block = {1, 1, 1}};
struct agx_resource grid_indirect_rsrc = {.bo = batch->geom_params_bo};
@@ -3632,6 +3659,88 @@ agx_launch_gs(struct agx_batch *batch, const struct pipe_draw_info *info,
memcpy(ctx->prims_generated, prim_queries, sizeof(prim_queries));
}
static void
agx_draw_without_restart(struct agx_batch *batch,
const struct pipe_draw_info *info,
unsigned drawid_offset,
const struct pipe_draw_indirect_info *indirect,
const struct pipe_draw_start_count_bias *draw)
{
struct agx_context *ctx = batch->ctx;
struct agx_device *dev = agx_device(ctx->base.screen);
perf_debug(dev, "Unrolling primitive restart due to GS/XFB");
agx_batch_init_state(batch);
size_t ib_extent = 0;
uint64_t ib = agx_index_buffer_ptr(batch, info, draw, &ib_extent);
/* The rest of this function handles only the general case of indirect
* multidraws, so synthesize an indexed indirect draw now if we need one for
* a direct draw (necessarily only one). This unifies the code paths.
*/
struct pipe_draw_indirect_info indirect_synthesized = {.draw_count = 1};
if (!indirect) {
uint32_t desc[5] = {draw->count, info->instance_count, draw->start,
draw->index_bias, info->start_instance};
u_upload_data(ctx->base.const_uploader, 0, sizeof(desc), 4, &desc,
&indirect_synthesized.offset, &indirect_synthesized.buffer);
indirect = &indirect_synthesized;
}
/* Next, we unroll the index buffer used by the indirect draw */
uint8_t log2_idx_size = util_logbase2(info->index_size);
assert(log2_idx_size <= 2);
if (!batch->cdm.bo)
batch->cdm = agx_encoder_allocate(batch, dev);
if (!ctx->gs_unroll_restart[info->mode][log2_idx_size]) {
struct agx_shader_key base_key = {0};
ctx->gs_unroll_restart[info->mode][log2_idx_size] = agx_compile_nir(
dev, agx_nir_unroll_restart(dev->libagx, info->mode, info->index_size),
&base_key, NULL);
}
/* Allocate output indirect draw descriptors. This is exact. */
struct agx_resource out_draws_rsrc = {0};
struct agx_ptr out_draws = agx_pool_alloc_aligned_with_bo(
&batch->pool, 5 * sizeof(uint32_t) * indirect->draw_count, 4,
&out_draws_rsrc.bo);
agx_upload_ia_params(batch, info, indirect, ib, ib_extent, out_draws.gpu);
/* Unroll the index buffer for each draw */
const struct pipe_grid_info grid_setup = {
.block = {1, 1, 1},
.grid = {indirect->draw_count, 1, 1},
};
agx_launch(batch, &grid_setup,
ctx->gs_unroll_restart[info->mode][log2_idx_size],
PIPE_SHADER_COMPUTE);
/* Now draw the results without restart */
struct pipe_draw_info new_info = *info;
new_info.primitive_restart = false;
new_info.mode = u_decomposed_prim(info->mode);
new_info.index.resource = ctx->heap;
new_info.has_user_indices = false;
struct pipe_draw_indirect_info new_indirect = *indirect;
new_indirect.buffer = &out_draws_rsrc.base;
new_indirect.offset = out_draws.gpu - out_draws_rsrc.bo->ptr.gpu;
new_indirect.stride = 5 * sizeof(uint32_t);
ctx->base.draw_vbo(&ctx->base, &new_info, drawid_offset, &new_indirect, draw,
1);
}
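
The five-word desc[] synthesized in agx_draw_without_restart() follows the usual indexed indirect draw layout (index count, instance count, first index, index bias, base instance), which is also what the unroll kernel writes into out_draws at a 20-byte stride; that is what lets direct and indirect draws share one path. A small standalone sketch of that record, assuming nothing beyond plain C:

#include <stdint.h>
#include <stdio.h>

/* Indexed indirect draw record, one per draw, tightly packed (stride 20 B).
 * Both the synthesized direct draw and the GPU unroll kernel produce this
 * layout, so the final draw always takes the indirect path. */
struct indexed_indirect_draw {
   uint32_t index_count;
   uint32_t instance_count;
   uint32_t first_index;    /* in elements, relative to the bound buffer */
   int32_t  index_bias;
   uint32_t base_instance;
};

/* Wrap a direct indexed draw in a single indirect record. */
static struct indexed_indirect_draw
synthesize_indirect(uint32_t count, uint32_t instances, uint32_t start,
                    int32_t bias, uint32_t base_instance)
{
   return (struct indexed_indirect_draw){
      .index_count = count,
      .instance_count = instances,
      .first_index = start,
      .index_bias = bias,
      .base_instance = base_instance,
   };
}

int
main(void)
{
   struct indexed_indirect_draw d = synthesize_indirect(36, 1, 0, 0, 0);
   printf("draw %u indices, %u instance(s), stride %zu\n",
          d.index_count, d.instance_count, sizeof(d));
   return 0;
}
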
static bool
agx_needs_passthrough_gs(struct agx_context *ctx,
const struct pipe_draw_info *info,
@@ -3841,15 +3950,6 @@ agx_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info,
return;
}
bool uses_gs = ctx->stage[PIPE_SHADER_GEOMETRY].shader;
if (uses_gs && info->primitive_restart) {
perf_debug_ctx(ctx, "Emulating primitive restart due to GS");
util_draw_vbo_without_prim_restart(pctx, info, drawid_offset, indirect,
draws);
return;
}
/* Only the rasterization stream counts */
if (ctx->active_queries && ctx->prims_generated[0] &&
!ctx->stage[PIPE_SHADER_GEOMETRY].shader) {
@@ -3859,6 +3959,14 @@ agx_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info,
}
struct agx_batch *batch = agx_get_batch(ctx);
if (ctx->stage[PIPE_SHADER_GEOMETRY].shader && info->primitive_restart &&
info->index_size) {
agx_draw_without_restart(batch, info, drawid_offset, indirect, draws);
return;
}
agx_batch_add_timestamp_query(batch, ctx->time_elapsed);
unsigned idx_size = info->index_size;
@@ -3866,10 +3974,8 @@ agx_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info,
size_t ib_extent = 0;
if (idx_size) {
if (indirect != NULL)
ib = agx_index_buffer_rsrc_ptr(batch, info, &ib_extent);
else
ib = agx_index_buffer_direct_ptr(batch, draws, info, &ib_extent);
ib =
agx_index_buffer_ptr(batch, info, indirect ? NULL : draws, &ib_extent);
}
#ifndef NDEBUG
@@ -3905,7 +4011,7 @@ agx_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info,
.gpu;
batch->uniforms.geometry_params =
agx_batch_geometry_params(batch, ib, info, draws, indirect);
agx_batch_geometry_params(batch, ib, ib_extent, info, draws, indirect);
}
struct agx_compiled_shader *vs = ctx->vs;


@@ -510,6 +510,7 @@ struct agx_context {
struct agx_compiled_shader *gs_prefix_sums[16];
struct agx_compiled_shader *gs_setup_indirect[MESA_PRIM_MAX][2];
struct agx_compiled_shader *gs_unroll_restart[MESA_PRIM_MAX][3];
struct agx_meta_cache meta;
uint32_t syncobj;
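
gs_unroll_restart above is indexed by primitive mode and by log2 of the index size, so the three legal index sizes (1, 2, 4 bytes) land in slots 0 to 2, matching the log2_idx_size computation in agx_draw_without_restart(). A standalone sketch of that lazily-filled variant cache; the types and the compile function here are placeholders for illustration, not driver API:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define NUM_PRIM_MODES 16 /* stand-in for MESA_PRIM_MAX */

/* Placeholder for a compiled kernel handle. */
typedef struct { int dummy; } shader;

static shader *
compile_unroll_variant(unsigned mode, unsigned index_size_B)
{
   static shader pool[NUM_PRIM_MODES][3];
   printf("compiling unroll variant: mode %u, %u-byte indices\n",
          mode, index_size_B);
   return &pool[mode][index_size_B >> 1]; /* 1,2,4 -> 0,1,2 */
}

/* Lazily-populated cache, one slot per (mode, log2(index size)) pair. */
static shader *cache[NUM_PRIM_MODES][3];

static unsigned
log2u(unsigned x)
{
   unsigned l = 0;
   while (x >>= 1)
      ++l;
   return l;
}

static shader *
get_unroll_variant(unsigned mode, unsigned index_size_B)
{
   unsigned slot = log2u(index_size_B);
   assert(slot <= 2 && mode < NUM_PRIM_MODES);

   if (!cache[mode][slot])
      cache[mode][slot] = compile_unroll_variant(mode, index_size_B);

   return cache[mode][slot];
}

int
main(void)
{
   get_unroll_variant(5, 2); /* compiles */
   get_unroll_variant(5, 2); /* cache hit, no recompile */
   get_unroll_variant(5, 4); /* different slot, compiles */
   return 0;
}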