mirror of https://gitlab.freedesktop.org/mesa/mesa.git
poly,asahi: Pull restart unrolling into libpoly
The interface here intentionally doesn't handle multi-draw. It's intended
that the caller will sort that out in whatever way they want to handle
multi-draw dispatches.

Acked-by: Alyssa Rosenzweig <alyssa.rosenzweig@intel.com>
Reviewed-by: Mary Guillemard <mary@mary.zone>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/38404>
This commit is contained in:
parent ddff3700a4
commit d9f795e6d0

2 changed files with 144 additions and 109 deletions
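The message above leaves multi-draw to the caller, so a driver that supports it has to loop somewhere. Below is a minimal host-side sketch of one way to do that, assuming a hypothetical dispatch_unroll_kernel() wrapper and the five-word indirect draw layout used by the code in this commit; none of these names come from the MR itself.

/* Hypothetical multi-draw handling: one unroll dispatch per sub-draw.
 * Sequential, ordered dispatches keep the non-atomic heap allocation in
 * poly_setup_unroll_for_draw() safe; concurrent dispatches would need the
 * atomic path its TODO mentions. */
#include <stdint.h>

void dispatch_unroll_kernel(uint64_t in_draw_addr, uint64_t out_draw_addr);

#define DRAW_WORDS 5 /* count, instance count, index offset, bias, base instance */

void
unroll_multidraw(uint64_t in_draws, uint64_t out_draws, uint32_t draw_count,
                 uint32_t in_stride_B)
{
   for (uint32_t i = 0; i < draw_count; ++i) {
      dispatch_unroll_kernel(in_draws + (uint64_t)i * in_stride_B,
                             out_draws + (uint64_t)i * DRAW_WORDS *
                                            sizeof(uint32_t));
   }
}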
@@ -5,6 +5,7 @@
  */
 
 #include "compiler/libcl/libcl_vk.h"
+#include "poly/cl/restart.h"
 #include "poly/geometry.h"
 #include "poly/prim.h"
 #include "poly/tessellator.h"
@@ -84,57 +85,6 @@ libagx_increment_ia_restart(global uint32_t *ia_vertices,
    }
 }
 
-/*
- * Return the ID of the first thread in the workgroup where cond is true, or
- * 1024 if cond is false across the workgroup.
- */
-static uint
-first_true_thread_in_workgroup(bool cond, local uint *scratch)
-{
-   barrier(CLK_LOCAL_MEM_FENCE);
-   scratch[get_sub_group_id()] = sub_group_ballot(cond)[0];
-   barrier(CLK_LOCAL_MEM_FENCE);
-
-   uint first_group =
-      ctz(sub_group_ballot(scratch[get_sub_group_local_id()])[0]);
-   uint off = ctz(first_group < 32 ? scratch[first_group] : 0);
-   return (first_group * 32) + off;
-}
-
-/*
- * When unrolling the index buffer for a draw, we translate the old indirect
- * draws to new indirect draws. This routine allocates the new index buffer and
- * sets up most of the new draw descriptor.
- */
-static global void *
-setup_unroll_for_draw(global struct poly_heap *heap, constant uint *in_draw,
-                      global uint *out, enum mesa_prim mode, uint index_size_B)
-{
-   /* Determine an upper bound on the memory required for the index buffer.
-    * Restarts only decrease the unrolled index buffer size, so the maximum size
-    * is the unrolled size when the input has no restarts.
-    */
-   uint max_prims = u_decomposed_prims_for_vertices(mode, in_draw[0]);
-   uint max_verts = max_prims * mesa_vertices_per_prim(mode);
-   uint alloc_size = max_verts * index_size_B;
-
-   /* Allocate unrolled index buffer.
-    *
-    * TODO: For multidraw, should be atomic. But multidraw+unroll isn't
-    * currently wired up in any driver.
-    */
-   uint old_heap_bottom_B = poly_heap_alloc_nonatomic_offs(heap, alloc_size);
-
-   /* Setup most of the descriptor. Count will be determined after unroll. */
-   out[1] = in_draw[1]; /* instance count */
-   out[2] = old_heap_bottom_B / index_size_B; /* index offset */
-   out[3] = in_draw[3]; /* index bias */
-   out[4] = in_draw[4]; /* base instance */
-
-   /* Return the index buffer we allocated */
-   return (global uchar *)heap->base + old_heap_bottom_B;
-}
-
 KERNEL(1024)
 libagx_unroll_restart(global struct poly_heap *heap, uint64_t index_buffer,
                       constant uint *in_draw, global uint32_t *out_draw,
@@ -144,65 +94,11 @@ libagx_unroll_restart(global struct poly_heap *heap, uint64_t index_buffer,
 {
    uint32_t index_size_B = 1 << index_size_log2;
    enum mesa_prim mode = poly_uncompact_prim(mode__11);
-   uint tid = cl_local_id.x;
-   uint count = in_draw[0];
 
-   local uintptr_t out_ptr;
-   if (tid == 0) {
-      out_ptr = (uintptr_t)setup_unroll_for_draw(heap, in_draw, out_draw, mode,
-                                                 index_size_B);
-   }
-
-   barrier(CLK_LOCAL_MEM_FENCE);
-
-   uintptr_t in_ptr = (uintptr_t)(poly_index_buffer(
-      index_buffer, index_buffer_size_el, in_draw[2], index_size_B));
-
-   local uint scratch[32];
-
-   uint out_prims = 0;
-   uint needle = 0;
-   uint per_prim = mesa_vertices_per_prim(mode);
-   while (needle < count) {
-      /* Search for next restart or the end. Lanes load in parallel. */
-      uint next_restart = needle;
-      for (;;) {
-         uint idx = next_restart + tid;
-         bool restart =
-            idx >= count || poly_load_index(in_ptr, index_buffer_size_el, idx,
-                                            index_size_B) == restart_index;
-
-         uint next_offs = first_true_thread_in_workgroup(restart, scratch);
-
-         next_restart += next_offs;
-         if (next_offs < 1024)
-            break;
-      }
-
-      /* Emit up to the next restart. Lanes output in parallel */
-      uint subcount = next_restart - needle;
-      uint subprims = u_decomposed_prims_for_vertices(mode, subcount);
-      uint out_prims_base = out_prims;
-      for (uint i = tid; i < subprims; i += 1024) {
-         for (uint vtx = 0; vtx < per_prim; ++vtx) {
-            uint id =
-               poly_vertex_id_for_topology(mode, flatshade_first, i, vtx, subprims);
-            uint offset = needle + id;
-
-            uint x = ((out_prims_base + i) * per_prim) + vtx;
-            uint y = poly_load_index(in_ptr, index_buffer_size_el, offset,
-                                     index_size_B);
-
-            poly_store_index(out_ptr, index_size_B, x, y);
-         }
-      }
-
-      out_prims += subprims;
-      needle = next_restart + 1;
-   }
-
-   if (tid == 0)
-      out_draw[0] = out_prims * per_prim;
+   POLY_DECL_UNROLL_RESTART_SCRATCH(scratch, 1024);
+   poly_unroll_restart(out_draw, heap, in_draw, index_buffer,
+                       index_buffer_size_el, index_size_B, restart_index,
+                       flatshade_first, mode, scratch);
 }
 
 KERNEL(1)
src/poly/cl/restart.h (new file, +139)

@@ -0,0 +1,139 @@
+/*
+ * Copyright 2023 Alyssa Rosenzweig
+ * Copyright 2023 Valve Corporation
+ * Copyright 2025 Collabora, Ltd.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "compiler/libcl/libcl.h"
+#include "poly/geometry.h"
+#include "poly/prim.h"
+
+#define POLY_DECL_UNROLL_RESTART_SCRATCH(__scratch, __wg_size)                 \
+   local uint __scratch[MAX2(__wg_size / 32, sizeof(void *))]
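The MAX2 here sizes the scratch for two distinct uses: poly_work_group_first_true stores one 32-bit ballot word per 32-wide subgroup (__wg_size / 32 words), while poly_unroll_restart reinterprets the same storage as a uintptr_t to broadcast the output pointer from thread 0. Assuming 64-bit device pointers, a 1024-thread workgroup gets MAX2(1024 / 32, 8) = 32 uints, which covers both uses.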
+
+/*
+ * Return the ID of the first thread in the workgroup where cond is true, or
+ * a value greater than or equal to the workgroup size if cond is false across
+ * the workgroup.
+ */
+static inline uint
+poly_work_group_first_true(bool cond, local uint *scratch)
+{
+   barrier(CLK_LOCAL_MEM_FENCE);
+   scratch[get_sub_group_id()] = sub_group_ballot(cond)[0];
+   barrier(CLK_LOCAL_MEM_FENCE);
+
+   uint first_group =
+      ctz(sub_group_ballot(scratch[get_sub_group_local_id()])[0]);
+   uint off = ctz(first_group < 32 ? scratch[first_group] : 0);
+   return (first_group * 32) + off;
+}
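To see what the two ballots above compute, here is a plain-C simulation of the reduction for a 1024-thread workgroup of 32-wide subgroups. It is an illustration only: first_true, WG_SIZE, and SG_SIZE are invented names, __builtin_ctz is GCC/Clang, and ctz32 pins down the "returns 32 on a zero word" behavior the helper relies on.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define WG_SIZE 1024
#define SG_SIZE 32

/* ctz returning 32 for a zero word, matching the helper's expectations. */
static unsigned ctz32(uint32_t x) { return x ? (unsigned)__builtin_ctz(x) : 32; }

static unsigned
first_true(const bool cond[WG_SIZE])
{
   /* Level 1: each subgroup publishes one ballot word, one bit per lane. */
   uint32_t scratch[WG_SIZE / SG_SIZE] = {0};
   for (unsigned t = 0; t < WG_SIZE; ++t)
      if (cond[t])
         scratch[t / SG_SIZE] |= 1u << (t % SG_SIZE);

   /* Level 2: a ballot over "this subgroup saw any true lane". */
   uint32_t groups = 0;
   for (unsigned g = 0; g < WG_SIZE / SG_SIZE; ++g)
      if (scratch[g])
         groups |= 1u << g;

   unsigned first_group = ctz32(groups);
   unsigned off = ctz32(first_group < 32 ? scratch[first_group] : 0);
   return (first_group * SG_SIZE) + off; /* >= WG_SIZE when all lanes false */
}

int
main(void)
{
   static bool cond[WG_SIZE];
   cond[70] = true;                  /* subgroup 2, lane 6 */
   printf("%u\n", first_true(cond)); /* 70 */
   cond[70] = false;
   printf("%u\n", first_true(cond)); /* 1056, i.e. >= the workgroup size */
   return 0;
}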
+
+/*
+ * When unrolling the index buffer for a draw, we translate the old indirect
+ * draws to new indirect draws. This routine allocates the new index buffer and
+ * sets up most of the new draw descriptor.
+ */
+static inline global void *
+poly_setup_unroll_for_draw(global struct poly_heap *heap,
+                           constant uint *in_draw, global uint *out_draw,
+                           enum mesa_prim mode, uint index_size_B)
+{
+   /* Determine an upper bound on the memory required for the index buffer.
+    * Restarts only decrease the unrolled index buffer size, so the maximum size
+    * is the unrolled size when the input has no restarts.
+    */
+   uint max_prims = u_decomposed_prims_for_vertices(mode, in_draw[0]);
+   uint max_verts = max_prims * mesa_vertices_per_prim(mode);
+   uint alloc_size = max_verts * index_size_B;
+
+   /* Allocate unrolled index buffer.
+    *
+    * TODO: For multidraw, should be atomic. But multidraw+unroll isn't
+    * currently wired up in any driver.
+    */
+   uint old_heap_bottom_B = poly_heap_alloc_nonatomic_offs(heap, alloc_size);
+
+   /* Setup most of the descriptor. Count will be determined after unroll. */
+   out_draw[1] = in_draw[1]; /* instance count */
+   out_draw[2] = old_heap_bottom_B / index_size_B; /* index offset */
+   out_draw[3] = in_draw[3]; /* index bias */
+   out_draw[4] = in_draw[4]; /* base instance */
+
+   /* Return the index buffer we allocated */
+   return (global uchar *)heap->base + old_heap_bottom_B;
+}
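To make the upper bound concrete: a MESA_PRIM_TRIANGLE_STRIP draw with in_draw[0] = 100 indices decomposes into at most 98 triangles, so with 32-bit indices the allocation is 98 * 3 * 4 = 1176 bytes. A restart only splits a strip of a + b vertices into strips of a and b vertices, giving (a - 2) + (b - 2) triangles instead of (a + b) - 2, so the restart-free decomposition really is the maximum.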
+
+static inline void
+poly_unroll_restart(global uint32_t *out_draw,
+                    global struct poly_heap *heap,
+                    constant uint *in_draw,
+                    uint64_t index_buffer,
+                    uint32_t index_buffer_range_el,
+                    uint32_t index_size_B,
+                    uint32_t restart_index,
+                    uint32_t flatshade_first,
+                    enum mesa_prim mode,
+                    local void *scratch)
+{
+   uint tid = cl_local_id.x;
+   uint count = in_draw[0];
+
+   uintptr_t out_ptr;
+   if (tid == 0) {
+      out_ptr = (uintptr_t)poly_setup_unroll_for_draw(heap, in_draw, out_draw,
+                                                      mode, index_size_B);
+      *(uintptr_t *)scratch = out_ptr;
+   }
+
+   barrier(CLK_LOCAL_MEM_FENCE);
+   out_ptr = *(uintptr_t *)scratch;
+
+   uintptr_t in_ptr = (uintptr_t)(poly_index_buffer(
+      index_buffer, index_buffer_range_el, in_draw[2], index_size_B));
+
+   uint out_prims = 0;
+   uint needle = 0;
+   uint per_prim = mesa_vertices_per_prim(mode);
+   while (needle < count) {
+      /* Search for next restart or the end. Lanes load in parallel. */
+      uint next_restart = needle;
+      for (;;) {
+         uint idx = next_restart + tid;
+         bool restart =
+            idx >= count || poly_load_index(in_ptr, index_buffer_range_el, idx,
+                                            index_size_B) == restart_index;
+
+         uint next_offs = poly_work_group_first_true(restart, scratch);
+
+         next_restart += next_offs;
+         if (next_offs < 1024)
+            break;
+      }
+
+      /* Emit up to the next restart. Lanes output in parallel */
+      uint subcount = next_restart - needle;
+      uint subprims = u_decomposed_prims_for_vertices(mode, subcount);
+      uint out_prims_base = out_prims;
+      for (uint i = tid; i < subprims; i += 1024) {
+         for (uint vtx = 0; vtx < per_prim; ++vtx) {
+            uint id =
+               poly_vertex_id_for_topology(mode, flatshade_first, i, vtx, subprims);
+            uint offset = needle + id;
+
+            uint x = ((out_prims_base + i) * per_prim) + vtx;
+            uint y = poly_load_index(in_ptr, index_buffer_range_el, offset,
+                                     index_size_B);
+
+            poly_store_index(out_ptr, index_size_B, x, y);
+         }
+      }
+
+      out_prims += subprims;
+      needle = next_restart + 1;
+   }
+
+   if (tid == 0)
+      out_draw[0] = out_prims * per_prim;
+}