asahi: rip out existing MDI+GS implementation

I don't love deleting working code without a working replacement. But this impl is deeply questionable as-is, and will only balloon in complexity for shader objects. We need to go back to the drawing board with MDI and lower to real draws, possibly with control stream looping. Good news is that this is GL so we can prototype all this. Signed-off-by: Alyssa Rosenzweig <alyssa@rosenzweig.io> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/27616>
2026-05-01 03:48:06 +02:00 · 2024-01-28 19:45:46 -04:00 · 2024-01-28 19:45:46 -04:00 · 784323cf0f
commit 784323cf0f
parent 21403278ac
6 changed files with 18 additions and 260 deletions
--- a/src/asahi/lib/agx_nir_lower_gs.c
+++ b/src/asahi/lib/agx_nir_lower_gs.c
@ -1158,9 +1158,6 @@ agx_nir_lower_gs(nir_shader *gs, nir_shader *vs, const nir_shader *libagx,
      NIR_PASS(progress, gs, nir_opt_loop_unroll);
   } while (progress);

-   if (ia->indirect_multidraw)
-      NIR_PASS(_, gs, agx_nir_lower_multidraw, ia);
-
   NIR_PASS(_, gs, nir_shader_intrinsics_pass, lower_id,
            nir_metadata_block_index | nir_metadata_dominance, NULL);

@ -1300,16 +1297,10 @@ agx_nir_gs_setup_indirect(nir_builder *b, const void *data)
 {
   const struct agx_gs_setup_indirect_key *key = data;

-   if (key->multidraw) {
-      uint32_t subgroup_size = 32;
-      b->shader->info.workgroup_size[0] = subgroup_size;
-   }
-
   libagx_gs_setup_indirect(b, nir_load_geometry_param_buffer_agx(b),
                            nir_load_input_assembly_buffer_agx(b),
                            nir_imm_int(b, key->prim),
-                            nir_channel(b, nir_load_local_invocation_id(b), 0),
-                            nir_imm_bool(b, key->multidraw));
+                            nir_channel(b, nir_load_local_invocation_id(b), 0));
 }

 void
--- a/src/asahi/lib/agx_nir_lower_gs.h
+++ b/src/asahi/lib/agx_nir_lower_gs.h
@ -28,8 +28,6 @@ bool agx_lower_output_to_var(struct nir_builder *b, struct nir_instr *instr,

 bool agx_nir_lower_ia(struct nir_shader *s, struct agx_ia_key *ia);

-bool agx_nir_lower_multidraw(struct nir_shader *s, struct agx_ia_key *key);
-
 bool agx_nir_lower_gs(struct nir_shader *gs, struct nir_shader *vs,
                      const struct nir_shader *libagx, struct agx_ia_key *ia,
                      bool rasterizer_discard, struct nir_shader **gs_count,
@ -40,7 +38,6 @@ void agx_nir_prefix_sum_gs(struct nir_builder *b, const void *data);

 struct agx_gs_setup_indirect_key {
   enum mesa_prim prim;
-   bool multidraw;
 };

 void agx_nir_gs_setup_indirect(struct nir_builder *b, const void *key);
--- a/src/asahi/lib/agx_nir_lower_ia.c
+++ b/src/asahi/lib/agx_nir_lower_ia.c
@ -18,14 +18,6 @@
 * This file implements input assembly in software for geometry/tessellation
 * shaders. load_vertex_id is lowered based on the topology. Most of the logic
 * lives in CL library routines.
- *
- * When geom/tess is used, multidraw indirect is implemented by:
- *
- * 1. Prefix summing the vertex counts across draws.
- * 2. Issuing a single indirect draw for the summed vertices.
- * 3. Binary searching the prefix sum buffer in software index fetch.
- *
- * This multidraw implementation kicks off the prefix sum and lowered draw.
 */

 /*
@ -89,16 +81,6 @@ load_vertex_id(nir_builder *b, struct agx_ia_key *key)
   if (key->index_size) {
      nir_def *ia = nir_load_input_assembly_buffer_agx(b);

-      /*
-       * For multidraw, apply the index buffer offset. For !multidraw, this is
-       * handled ahead-of-time and baked into the index buffer pointer.
-       */
-      if (key->indirect_multidraw) {
-         nir_def *first = libagx_multidraw_param(b, ia, nir_load_draw_id(b),
-                                                 nir_imm_int(b, 2));
-         id = nir_iadd(b, id, first);
-      }
-
      nir_def *address =
         libagx_index_buffer(b, ia, id, nir_imm_int(b, key->index_size));

@ -133,93 +115,3 @@ agx_nir_lower_ia(nir_shader *s, struct agx_ia_key *key)
      s, lower_vertex_id, nir_metadata_block_index | nir_metadata_dominance,
      key);
 }
-
-struct multidraw_state {
-   nir_def *raw_id, *draw, *primitive, *first_vertex, *base_instance;
-   nir_def *num_vertices;
-
-   bool indexed;
-};
-
-static nir_def *
-map_multidraw_param(nir_builder *b, nir_intrinsic_op intrin,
-                    struct multidraw_state *state)
-{
-   switch (intrin) {
-   case nir_intrinsic_load_draw_id:
-      return state->draw;
-
-   case nir_intrinsic_load_primitive_id:
-      return state->primitive;
-
-   case nir_intrinsic_load_base_vertex:
-      return state->indexed ? state->first_vertex : nir_imm_int(b, 0);
-
-   case nir_intrinsic_load_first_vertex:
-      return state->first_vertex;
-
-   case nir_intrinsic_load_base_instance:
-      return state->base_instance;
-
-   case nir_intrinsic_load_num_vertices:
-      return state->num_vertices;
-
-   default:
-      return NULL;
-   }
-}
-
-static bool
-lower_multidraw(nir_builder *b, nir_intrinsic_instr *intr, void *data)
-{
-   b->cursor = nir_before_instr(&intr->instr);
-   nir_def *id = map_multidraw_param(b, intr->intrinsic, data);
-   if (!id)
-      return false;
-
-   nir_instr_remove(&intr->instr);
-   nir_def_rewrite_uses(&intr->def, id);
-   return true;
-}
-
-bool
-agx_nir_lower_multidraw(nir_shader *s, struct agx_ia_key *key)
-{
-   assert(key->indirect_multidraw);
-
-   nir_builder b_ =
-      nir_builder_at(nir_before_impl(nir_shader_get_entrypoint(s)));
-   nir_builder *b = &b_;
-
-   struct multidraw_state state = {
-      /* Filled in at the end to avoid recursion */
-      .raw_id = nir_undef(b, 1, 32),
-      .indexed = key->index_size > 0,
-   };
-
-   nir_def *ia = nir_load_input_assembly_buffer_agx(b);
-   state.draw = libagx_multidraw_draw_id(b, ia, state.raw_id);
-
-   state.primitive = libagx_multidraw_primitive_id(
-      b, ia, state.draw, state.raw_id, nir_imm_int(b, key->mode));
-
-   state.num_vertices =
-      libagx_multidraw_param(b, ia, state.draw, nir_imm_int(b, 0));
-
-   state.first_vertex = libagx_multidraw_param(
-      b, ia, state.draw, nir_imm_int(b, state.indexed ? 3 : 2));
-
-   state.base_instance = libagx_multidraw_param(
-      b, ia, state.draw, nir_imm_int(b, state.indexed ? 4 : 3));
-
-   nir_shader_intrinsics_pass(b->shader, lower_multidraw,
-                              nir_metadata_block_index | nir_metadata_dominance,
-                              &state);
-
-   b->cursor = nir_before_impl(b->impl);
-   nir_def_rewrite_uses(state.raw_id, nir_load_primitive_id(b));
-
-   nir_metadata_preserve(b->impl,
-                         nir_metadata_block_index | nir_metadata_dominance);
-   return true;
-}
--- a/src/asahi/lib/shaders/geometry.cl
+++ b/src/asahi/lib/shaders/geometry.cl
@ -261,40 +261,6 @@ libagx_index_buffer(constant struct agx_ia_state *p, uint id,
   return (uintptr_t)&p->index_buffer[id * index_size];
 }

-uint
-libagx_multidraw_draw_id(constant struct agx_ia_state *p, uint raw_id)
-{
-   global uint *sums = p->prefix_sums;
-
-   /* TODO: replace with binary search or interpolation search */
-   uint i = 0;
-   for (i = 0; raw_id >= sums[i]; ++i)
-      ;
-   return i;
-}
-
-uint
-libagx_multidraw_param(constant struct agx_ia_state *p, uint draw_id, uint word)
-{
-   global uint *draw = (global uint *)(p->draws + (draw_id * p->draw_stride));
-   return draw[word];
-}
-
-uint
-libagx_multidraw_primitive_id(constant struct agx_ia_state *p, uint draw_id,
-                              uint raw_id, enum mesa_prim mode)
-{
-   uint start = draw_id > 0 ? p->prefix_sums[draw_id - 1] : 0;
-   uint raw_offset = raw_id - start;
-
-   /* Note: if we wanted, we could precompute magic divisors in the setup kernel
-    * to avoid the non-constant division here.
-    */
-   uint vertex_count = libagx_multidraw_param(p, draw_id, 0);
-   uint primitive_count = u_decomposed_prims_for_vertices(mode, vertex_count);
-   return raw_offset % primitive_count;
-}
-
 uint
 libagx_setup_xfb_buffer(global struct agx_geometry_params *p, uint i)
 {
@ -376,79 +342,19 @@ libagx_build_gs_draw(global struct agx_geometry_params *p, bool indexed,
   }
 }

-uint2
-process_draw(global uint *draw, enum mesa_prim mode)
-{
-   /* Regardless of indexing being enabled, this holds */
-   uint vertex_count = draw[0];
-   uint instance_count = draw[1];
-
-   uint prim_per_instance = u_decomposed_prims_for_vertices(mode, vertex_count);
-   return (uint2)(prim_per_instance, instance_count);
-}
-
-uint2
-process_multidraw(global struct agx_ia_state *s, uint local_id,
-                  enum mesa_prim mode)
-{
-   uintptr_t draw_ptr = s->draws;
-   uint draw_stride = s->draw_stride;
-
-   /* Determine the number of draws. This is given by the application, but must
-    * be clamped to the minimum provided to the driver, implementing spec text:
-    *
-    *    The actual number of executed draw calls is the minimum of the count
-    *    specified in countBuffer and maxDrawCount.
-    */
-   uint len = min(*(s->count), s->max_draws);
-
-   /* Prefix sum the vertex counts (multiplied by instance counts) across draws.
-    * The number of draws is expected to be small, so this serialization should
-    * be ok in practice. See libagx_prefix_sum for algorithm details.
-    */
-   uint i, count = 0;
-   uint len_remainder = len % 32;
-   uint len_rounded_down = len - len_remainder;
-
-   for (i = local_id; i < len_rounded_down; i += 32) {
-      global uint *draw_ = (global uint *)(draw_ptr + (i * draw_stride));
-      uint2 draw = process_draw(draw_, mode);
-
-      /* Total primitives */
-      uint value = draw.x * draw.y;
-
-      /* TODO: use inclusive once that's wired up */
-      uint value_prefix_sum = sub_group_scan_exclusive_add(value) + value;
-      s->prefix_sums[i] = count + value_prefix_sum;
-      count += sub_group_broadcast(value_prefix_sum, 31);
-   }
-
-   if (local_id < len_remainder) {
-      global uint *draw_ = (global uint *)(draw_ptr + (i * draw_stride));
-      uint2 draw = process_draw(draw_, mode);
-      uint value = draw.x * draw.y;
-
-      /* TODO: use inclusive once that's wired up */
-      s->prefix_sums[i] = count + sub_group_scan_exclusive_add(value) + value;
-   }
-
-   return (uint2)(len > 0 ? s->prefix_sums[len - 1] : 0, 1);
-}
-
 void
 libagx_gs_setup_indirect(global struct agx_geometry_params *p,
                         global struct agx_ia_state *ia, enum mesa_prim mode,
-                         uint local_id, bool multidraw)
+                         uint local_id)
 {
-   /* Determine the (primitives, instances) grid size. For multidraw, this will
-    * be a synthetic grid for the entire collection, but that's ok.
-    */
-   uint2 draw = multidraw ? process_multidraw(ia, local_id, mode)
-                          : process_draw((global uint *)ia->draws, mode);
+   global uint *in_draw = (global uint *)ia->draws;

-   /* Elect a single lane */
-   if (multidraw && local_id != 0)
-      return;
+   /* Determine the (primitives, instances) grid size. */
+   uint vertex_count = in_draw[0];
+   uint instance_count = in_draw[1];
+
+   uint prim_per_instance = u_decomposed_prims_for_vertices(mode, vertex_count);
+   uint2 draw = (uint2)(prim_per_instance, instance_count);

   /* There are primitives*instances primitives total */
   p->input_primitives = draw.x * draw.y;
@ -462,11 +368,8 @@ libagx_gs_setup_indirect(global struct agx_geometry_params *p,
    * in elements. Apply that offset now that we have it. For a hardware
    * indirect draw, the hardware would do this for us, but for software input
    * assembly we need to do it ourselves.
-    *
-    * For multidraw, this happens per-draw in the input assembly instead. We
-    * could do that for non-multidraw too, but it'd be less efficient.
    */
-   if (ia->index_buffer && !multidraw) {
+   if (ia->index_buffer) {
      ia->index_buffer += ((constant uint *)ia->draws)[2] * ia->index_size_B;
   }

--- a/src/asahi/lib/shaders/geometry.h
+++ b/src/asahi/lib/shaders/geometry.h
@ -31,13 +31,6 @@ struct agx_ia_key {

   /* Use first vertex as the provoking vertex for flat shading */
   bool flatshade_first;
-
-   /* Whether we are doing input assembly for an indirect multidraw that is
-    * implemented by a single superdraw with a prefix sum of vertex counts per
-    * draw. This requires lowering lots of sysvals to index into the draw
-    * descriptors according to the associated dynamic multidraw state.
-    */
-   bool indirect_multidraw;
 };

 /* Packed geometry state buffer */
--- a/src/gallium/drivers/asahi/agx_state.c
+++ b/src/gallium/drivers/asahi/agx_state.c
@ -2533,8 +2533,6 @@ agx_update_gs(struct agx_context *ctx, const struct pipe_draw_info *info,
      .ia.mode = translate_ia_mode(info->mode),
      .ia.flatshade_first =
         ia_needs_provoking(info->mode) && ctx->rast->base.flatshade_first,
-      .ia.indirect_multidraw =
-         indirect && indirect->indirect_draw_count != NULL,

      .rasterizer_discard = ctx->rast->base.rasterizer_discard,
   };
@ -4073,22 +4071,6 @@ agx_upload_ia_params(struct agx_batch *batch, const struct pipe_draw_info *info,
      ia.draws = rsrc->bo->ptr.gpu + indirect->offset;
   }

-   if (indirect && indirect->indirect_draw_count) {
-      struct agx_resource *rsrc = agx_resource(indirect->indirect_draw_count);
-      agx_batch_reads(batch, rsrc);
-
-      ia.count = rsrc->bo->ptr.gpu + indirect->indirect_draw_count_offset;
-      ia.max_draws = indirect->draw_count;
-      ia.draw_stride = indirect->stride;
-
-      /* MDI requires prefix sums, but not for our current unroll path */
-      if (!unroll_output) {
-         size_t max_sum_size = sizeof(uint32_t) * indirect->draw_count;
-         ia.prefix_sums =
-            agx_pool_alloc_aligned(&batch->pool, max_sum_size, 4).gpu;
-      }
-   }
-
   batch->uniforms.input_assembly =
      agx_pool_upload_aligned(&batch->pool, &ia, sizeof(ia), 8);
 }
@ -4211,11 +4193,10 @@ agx_launch_gs(struct agx_batch *batch, const struct pipe_draw_info *info,

      struct agx_gs_setup_indirect_key key = {
         .prim = info->mode,
-         .multidraw = (indirect->indirect_draw_count != NULL),
      };

      const struct pipe_grid_info grid_setup = {
-         .block = {key.multidraw ? 32 : 1, 1, 1},
+         .block = {1, 1, 1},
         .grid = {1, 1, 1},
      };

@ -4448,12 +4429,6 @@ agx_needs_passthrough_gs(struct agx_context *ctx,
      return true;
   }

-   /* TODO: also sloppy, we should generate VDM commands from a shader */
-   if (indirect && indirect->indirect_draw_count) {
-      perf_debug_ctx(ctx, "Using passthrough GS due to multidraw indirect");
-      return true;
-   }
-
   /* Edge flags are emulated with a geometry shader */
   if (has_edgeflags(ctx, info->mode)) {
      perf_debug_ctx(ctx, "Using passthrough GS due to edge flags");
@ -4898,6 +4873,13 @@ agx_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info,
      return;
   }

+   /* TODO: stop cheating */
+   if (indirect && indirect->indirect_draw_count) {
+      perf_debug_ctx(ctx, "multi-draw indirect");
+      util_draw_indirect(pctx, info, indirect);
+      return;
+   }
+
   /* TODO: stop cheating */
   if (info->mode == MESA_PRIM_PATCHES && indirect) {
      perf_debug_ctx(ctx, "indirect tessellation");