poly,asahi: Pull restart unrolling into libpoly

The interface here intentionally doesn't handle multi-draw. Callers are
expected to handle multi-draw dispatches themselves, in whatever way fits
their needs; one possible shape is sketched below.
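
For illustration only, a hypothetical multi-draw wrapper (not part of this
commit; the kernel name, the 5-uint descriptor stride, and the parameter
plumbing are assumptions):

KERNEL(1024)
hypothetical_unroll_restart_multidraw(global struct poly_heap *heap,
                                      uint64_t index_buffer,
                                      constant uint *in_draws,
                                      global uint32_t *out_draws,
                                      uint32_t index_buffer_range_el,
                                      uint32_t restart_index,
                                      uint32_t index_size_B,
                                      uint32_t flatshade_first,
                                      enum mesa_prim mode,
                                      uint32_t draw_count)
{
   POLY_DECL_UNROLL_RESTART_SCRATCH(scratch, 1024);

   /* One sub-draw per iteration, 5 uints per indirect draw descriptor. A
    * single workgroup performs the heap allocations in loop order, so the
    * nonatomic allocation in poly_setup_unroll_for_draw stays safe.
    */
   for (uint i = 0; i < draw_count; ++i) {
      poly_unroll_restart(out_draws + (i * 5), heap, in_draws + (i * 5),
                          index_buffer, index_buffer_range_el, index_size_B,
                          restart_index, flatshade_first, mode, scratch);
   }
}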

Acked-by: Alyssa Rosenzweig <alyssa.rosenzweig@intel.com>
Reviewed-by: Mary Guillemard <mary@mary.zone>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/38404>
Faith Ekstrand 2025-11-14 14:10:23 -05:00 committed by Marge Bot
parent ddff3700a4
commit d9f795e6d0
2 changed files with 144 additions and 109 deletions


@@ -5,6 +5,7 @@
  */
 
 #include "compiler/libcl/libcl_vk.h"
+#include "poly/cl/restart.h"
 #include "poly/geometry.h"
 #include "poly/prim.h"
 #include "poly/tessellator.h"
@@ -84,57 +85,6 @@ libagx_increment_ia_restart(global uint32_t *ia_vertices,
    }
 }
-
-/*
- * Return the ID of the first thread in the workgroup where cond is true, or
- * 1024 if cond is false across the workgroup.
- */
-static uint
-first_true_thread_in_workgroup(bool cond, local uint *scratch)
-{
-   barrier(CLK_LOCAL_MEM_FENCE);
-   scratch[get_sub_group_id()] = sub_group_ballot(cond)[0];
-   barrier(CLK_LOCAL_MEM_FENCE);
-
-   uint first_group =
-      ctz(sub_group_ballot(scratch[get_sub_group_local_id()])[0]);
-   uint off = ctz(first_group < 32 ? scratch[first_group] : 0);
-   return (first_group * 32) + off;
-}
-
-/*
- * When unrolling the index buffer for a draw, we translate the old indirect
- * draws to new indirect draws. This routine allocates the new index buffer and
- * sets up most of the new draw descriptor.
- */
-static global void *
-setup_unroll_for_draw(global struct poly_heap *heap, constant uint *in_draw,
-                      global uint *out, enum mesa_prim mode, uint index_size_B)
-{
-   /* Determine an upper bound on the memory required for the index buffer.
-    * Restarts only decrease the unrolled index buffer size, so the maximum size
-    * is the unrolled size when the input has no restarts.
-    */
-   uint max_prims = u_decomposed_prims_for_vertices(mode, in_draw[0]);
-   uint max_verts = max_prims * mesa_vertices_per_prim(mode);
-   uint alloc_size = max_verts * index_size_B;
-
-   /* Allocate unrolled index buffer.
-    *
-    * TODO: For multidraw, should be atomic. But multidraw+unroll isn't
-    * currently wired up in any driver.
-    */
-   uint old_heap_bottom_B = poly_heap_alloc_nonatomic_offs(heap, alloc_size);
-
-   /* Setup most of the descriptor. Count will be determined after unroll. */
-   out[1] = in_draw[1]; /* instance count */
-   out[2] = old_heap_bottom_B / index_size_B; /* index offset */
-   out[3] = in_draw[3]; /* index bias */
-   out[4] = in_draw[4]; /* base instance */
-
-   /* Return the index buffer we allocated */
-   return (global uchar *)heap->base + old_heap_bottom_B;
-}
 
 KERNEL(1024)
 libagx_unroll_restart(global struct poly_heap *heap, uint64_t index_buffer,
                       constant uint *in_draw, global uint32_t *out_draw,
@@ -144,65 +94,11 @@ libagx_unroll_restart(global struct poly_heap *heap, uint64_t index_buffer,
 {
    uint32_t index_size_B = 1 << index_size_log2;
    enum mesa_prim mode = poly_uncompact_prim(mode__11);
-   uint tid = cl_local_id.x;
-   uint count = in_draw[0];
-
-   local uintptr_t out_ptr;
-   if (tid == 0) {
-      out_ptr = (uintptr_t)setup_unroll_for_draw(heap, in_draw, out_draw, mode,
-                                                 index_size_B);
-   }
-
-   barrier(CLK_LOCAL_MEM_FENCE);
-
-   uintptr_t in_ptr = (uintptr_t)(poly_index_buffer(
-      index_buffer, index_buffer_size_el, in_draw[2], index_size_B));
-
-   local uint scratch[32];
-
-   uint out_prims = 0;
-   uint needle = 0;
-   uint per_prim = mesa_vertices_per_prim(mode);
-
-   while (needle < count) {
-      /* Search for next restart or the end. Lanes load in parallel. */
-      uint next_restart = needle;
-      for (;;) {
-         uint idx = next_restart + tid;
-         bool restart =
-            idx >= count || poly_load_index(in_ptr, index_buffer_size_el, idx,
-                                            index_size_B) == restart_index;
-
-         uint next_offs = first_true_thread_in_workgroup(restart, scratch);
-
-         next_restart += next_offs;
-         if (next_offs < 1024)
-            break;
-      }
-
-      /* Emit up to the next restart. Lanes output in parallel */
-      uint subcount = next_restart - needle;
-      uint subprims = u_decomposed_prims_for_vertices(mode, subcount);
-      uint out_prims_base = out_prims;
-
-      for (uint i = tid; i < subprims; i += 1024) {
-         for (uint vtx = 0; vtx < per_prim; ++vtx) {
-            uint id =
-               poly_vertex_id_for_topology(mode, flatshade_first, i, vtx, subprims);
-            uint offset = needle + id;
-            uint x = ((out_prims_base + i) * per_prim) + vtx;
-            uint y = poly_load_index(in_ptr, index_buffer_size_el, offset,
-                                     index_size_B);
-
-            poly_store_index(out_ptr, index_size_B, x, y);
-         }
-      }
-
-      out_prims += subprims;
-      needle = next_restart + 1;
-   }
-
-   if (tid == 0)
-      out_draw[0] = out_prims * per_prim;
+
+   POLY_DECL_UNROLL_RESTART_SCRATCH(scratch, 1024);
+   poly_unroll_restart(out_draw, heap, in_draw, index_buffer,
+                       index_buffer_size_el, index_size_B, restart_index,
+                       flatshade_first, mode, scratch);
 }
 
 KERNEL(1)

src/poly/cl/restart.h (new file)

@@ -0,0 +1,139 @@
/*
 * Copyright 2023 Alyssa Rosenzweig
 * Copyright 2023 Valve Corporation
 * Copyright 2025 Collabora, Ltd.
 * SPDX-License-Identifier: MIT
 */

#pragma once

#include "compiler/libcl/libcl.h"
#include "poly/geometry.h"
#include "poly/prim.h"
#define POLY_DECL_UNROLL_RESTART_SCRATCH(__scratch, __wg_size)                 \
   local uint __scratch[MAX2(__wg_size / 32, sizeof(void *))]

/*
 * Return the ID of the first thread in the workgroup where cond is true, or
 * a value greater than or equal to the workgroup size if cond is false across
 * the workgroup.
 */
static inline uint
poly_work_group_first_true(bool cond, local uint *scratch)
{
   barrier(CLK_LOCAL_MEM_FENCE);
   scratch[get_sub_group_id()] = sub_group_ballot(cond)[0];
   barrier(CLK_LOCAL_MEM_FENCE);

   uint first_group =
      ctz(sub_group_ballot(scratch[get_sub_group_local_id()])[0]);
   uint off = ctz(first_group < 32 ? scratch[first_group] : 0);
   return (first_group * 32) + off;
}
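
/* Example (illustrative): with 32-wide subgroups, if cond is true only for
 * thread 70, subgroup 2 publishes the ballot word 0x40 and every other
 * subgroup publishes 0. The second ballot then yields first_group = 2 and
 * off = ctz(0x40) = 6, so the result is (2 * 32) + 6 = 70. If no thread has
 * cond set, both ctz calls see 0 and return 32, giving (32 * 32) + 32.
 */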

/*
 * When unrolling the index buffer for a draw, we translate the old indirect
 * draws to new indirect draws. This routine allocates the new index buffer and
 * sets up most of the new draw descriptor.
 */
static inline global void *
poly_setup_unroll_for_draw(global struct poly_heap *heap,
                           constant uint *in_draw, global uint *out_draw,
                           enum mesa_prim mode, uint index_size_B)
{
   /* Determine an upper bound on the memory required for the index buffer.
    * Restarts only decrease the unrolled index buffer size, so the maximum size
    * is the unrolled size when the input has no restarts.
    */
   uint max_prims = u_decomposed_prims_for_vertices(mode, in_draw[0]);
   uint max_verts = max_prims * mesa_vertices_per_prim(mode);
   uint alloc_size = max_verts * index_size_B;

   /* Allocate unrolled index buffer.
    *
    * TODO: For multidraw, should be atomic. But multidraw+unroll isn't
    * currently wired up in any driver.
    */
   uint old_heap_bottom_B = poly_heap_alloc_nonatomic_offs(heap, alloc_size);

   /* Setup most of the descriptor. Count will be determined after unroll. */
   out_draw[1] = in_draw[1];                       /* instance count */
   out_draw[2] = old_heap_bottom_B / index_size_B; /* index offset */
   out_draw[3] = in_draw[3];                       /* index bias */
   out_draw[4] = in_draw[4];                       /* base instance */

   /* Return the index buffer we allocated */
   return (global uchar *)heap->base + old_heap_bottom_B;
}
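
/* Sizing example (illustrative): a MESA_PRIM_TRIANGLE_STRIP draw with
 * in_draw[0] = 100 indices decomposes into at most 98 triangles, so
 * 98 * 3 = 294 output indices are reserved. A restart can only shorten the
 * strips, so the restart-free size is a valid worst case. The five-word
 * out_draw layout matches a standard indexed indirect draw: count, instance
 * count, first index, index bias, base instance.
 */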

static inline void
poly_unroll_restart(global uint32_t *out_draw,
                    global struct poly_heap *heap,
                    constant uint *in_draw,
                    uint64_t index_buffer,
                    uint32_t index_buffer_range_el,
                    uint32_t index_size_B,
                    uint32_t restart_index,
                    uint32_t flatshade_first,
                    enum mesa_prim mode,
                    local void *scratch)
{
   uint tid = cl_local_id.x;
   uint count = in_draw[0];

   uintptr_t out_ptr;
   if (tid == 0) {
      out_ptr = (uintptr_t)poly_setup_unroll_for_draw(heap, in_draw, out_draw,
                                                      mode, index_size_B);
      *(uintptr_t *)scratch = out_ptr;
   }

   barrier(CLK_LOCAL_MEM_FENCE);
   out_ptr = *(uintptr_t *)scratch;

   uintptr_t in_ptr = (uintptr_t)(poly_index_buffer(
      index_buffer, index_buffer_range_el, in_draw[2], index_size_B));

   uint out_prims = 0;
   uint needle = 0;
   uint per_prim = mesa_vertices_per_prim(mode);
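
   /* Worked example (illustrative): for MESA_PRIM_LINE_STRIP with input
    * indices {0, 1, 2, R, 3, 4} (R = restart_index), the first iteration
    * finds the restart at position 3 and emits lines (0,1) and (1,2); the
    * second consumes {3, 4} and emits (3,4). The output index buffer is
    * {0,1, 1,2, 3,4} and out_draw[0] = 6.
    */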
   while (needle < count) {
      /* Search for next restart or the end. Lanes load in parallel. */
      uint next_restart = needle;
      for (;;) {
         uint idx = next_restart + tid;
         bool restart =
            idx >= count || poly_load_index(in_ptr, index_buffer_range_el, idx,
                                            index_size_B) == restart_index;

         uint next_offs = poly_work_group_first_true(restart, scratch);

         next_restart += next_offs;
         if (next_offs < 1024)
            break;
      }

      /* Emit up to the next restart. Lanes output in parallel */
      uint subcount = next_restart - needle;
      uint subprims = u_decomposed_prims_for_vertices(mode, subcount);
      uint out_prims_base = out_prims;

      for (uint i = tid; i < subprims; i += 1024) {
         for (uint vtx = 0; vtx < per_prim; ++vtx) {
            uint id =
               poly_vertex_id_for_topology(mode, flatshade_first, i, vtx, subprims);
            uint offset = needle + id;
            uint x = ((out_prims_base + i) * per_prim) + vtx;
            uint y = poly_load_index(in_ptr, index_buffer_range_el, offset,
                                     index_size_B);

            poly_store_index(out_ptr, index_size_B, x, y);
         }
      }

      out_prims += subprims;
      needle = next_restart + 1;
   }

   if (tid == 0)
      out_draw[0] = out_prims * per_prim;
}
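
/* Intended usage, mirroring the libagx_unroll_restart kernel in this commit:
 * every thread of the workgroup calls the helper with the same arguments,
 * e.g.
 *
 *    POLY_DECL_UNROLL_RESTART_SCRATCH(scratch, 1024);
 *    poly_unroll_restart(out_draw, heap, in_draw, index_buffer,
 *                        index_buffer_range_el, index_size_B, restart_index,
 *                        flatshade_first, mode, scratch);
 *
 * Note that the internal loop strides are hardcoded for a 1024-thread
 * workgroup, matching the scratch declaration above.
 */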