poly,asahi: Pull restart unrolling into libpoly

The interface here intentionally doesn't handle multi-draw. Callers are
expected to handle multi-draw dispatches themselves, in whatever way fits
their needs; one possible shape is sketched below.
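
For illustration only, a hypothetical multi-draw wrapper (not part of this
commit; the kernel name, the 5-uint descriptor stride, and the parameter
plumbing are assumptions):

KERNEL(1024)
hypothetical_unroll_restart_multidraw(global struct poly_heap *heap,
                                      uint64_t index_buffer,
                                      constant uint *in_draws,
                                      global uint32_t *out_draws,
                                      uint32_t index_buffer_range_el,
                                      uint32_t restart_index,
                                      uint32_t index_size_B,
                                      uint32_t flatshade_first,
                                      enum mesa_prim mode,
                                      uint32_t draw_count)
{
   POLY_DECL_UNROLL_RESTART_SCRATCH(scratch, 1024);

   /* One sub-draw per iteration, 5 uints per indirect draw descriptor. A
    * single workgroup performs the heap allocations in loop order, so the
    * nonatomic allocation in poly_setup_unroll_for_draw stays safe.
    */
   for (uint i = 0; i < draw_count; ++i) {
      poly_unroll_restart(out_draws + (i * 5), heap, in_draws + (i * 5),
                          index_buffer, index_buffer_range_el, index_size_B,
                          restart_index, flatshade_first, mode, scratch);
   }
}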

Acked-by: Alyssa Rosenzweig <alyssa.rosenzweig@intel.com>
Reviewed-by: Mary Guillemard <mary@mary.zone>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/38404>
Faith Ekstrand 2025-11-14 14:10:23 -05:00 committed by Marge Bot
parent ddff3700a4
commit d9f795e6d0
2 changed files with 144 additions and 109 deletions


@@ -5,6 +5,7 @@
  */
 
 #include "compiler/libcl/libcl_vk.h"
+#include "poly/cl/restart.h"
 #include "poly/geometry.h"
 #include "poly/prim.h"
 #include "poly/tessellator.h"
@@ -84,57 +85,6 @@ libagx_increment_ia_restart(global uint32_t *ia_vertices,
    }
 }
-
-/*
- * Return the ID of the first thread in the workgroup where cond is true, or
- * 1024 if cond is false across the workgroup.
- */
-static uint
-first_true_thread_in_workgroup(bool cond, local uint *scratch)
-{
-   barrier(CLK_LOCAL_MEM_FENCE);
-   scratch[get_sub_group_id()] = sub_group_ballot(cond)[0];
-   barrier(CLK_LOCAL_MEM_FENCE);
-
-   uint first_group =
-      ctz(sub_group_ballot(scratch[get_sub_group_local_id()])[0]);
-   uint off = ctz(first_group < 32 ? scratch[first_group] : 0);
-   return (first_group * 32) + off;
-}
-
-/*
- * When unrolling the index buffer for a draw, we translate the old indirect
- * draws to new indirect draws. This routine allocates the new index buffer and
- * sets up most of the new draw descriptor.
- */
-static global void *
-setup_unroll_for_draw(global struct poly_heap *heap, constant uint *in_draw,
-                      global uint *out, enum mesa_prim mode, uint index_size_B)
-{
-   /* Determine an upper bound on the memory required for the index buffer.
-    * Restarts only decrease the unrolled index buffer size, so the maximum size
-    * is the unrolled size when the input has no restarts.
-    */
-   uint max_prims = u_decomposed_prims_for_vertices(mode, in_draw[0]);
-   uint max_verts = max_prims * mesa_vertices_per_prim(mode);
-   uint alloc_size = max_verts * index_size_B;
-
-   /* Allocate unrolled index buffer.
-    *
-    * TODO: For multidraw, should be atomic. But multidraw+unroll isn't
-    * currently wired up in any driver.
-    */
-   uint old_heap_bottom_B = poly_heap_alloc_nonatomic_offs(heap, alloc_size);
-
-   /* Setup most of the descriptor. Count will be determined after unroll. */
-   out[1] = in_draw[1]; /* instance count */
-   out[2] = old_heap_bottom_B / index_size_B; /* index offset */
-   out[3] = in_draw[3]; /* index bias */
-   out[4] = in_draw[4]; /* base instance */
-
-   /* Return the index buffer we allocated */
-   return (global uchar *)heap->base + old_heap_bottom_B;
-}
 
 KERNEL(1024)
 libagx_unroll_restart(global struct poly_heap *heap, uint64_t index_buffer,
                       constant uint *in_draw, global uint32_t *out_draw,
@@ -144,65 +94,11 @@ libagx_unroll_restart(global struct poly_heap *heap, uint64_t index_buffer,
 {
    uint32_t index_size_B = 1 << index_size_log2;
    enum mesa_prim mode = poly_uncompact_prim(mode__11);
-   uint tid = cl_local_id.x;
-   uint count = in_draw[0];
-
-   local uintptr_t out_ptr;
-   if (tid == 0) {
-      out_ptr = (uintptr_t)setup_unroll_for_draw(heap, in_draw, out_draw, mode,
-                                                 index_size_B);
-   }
-
-   barrier(CLK_LOCAL_MEM_FENCE);
-
-   uintptr_t in_ptr = (uintptr_t)(poly_index_buffer(
-      index_buffer, index_buffer_size_el, in_draw[2], index_size_B));
-
-   local uint scratch[32];
-
-   uint out_prims = 0;
-   uint needle = 0;
-   uint per_prim = mesa_vertices_per_prim(mode);
-
-   while (needle < count) {
-      /* Search for next restart or the end. Lanes load in parallel. */
-      uint next_restart = needle;
-      for (;;) {
-         uint idx = next_restart + tid;
-         bool restart =
-            idx >= count || poly_load_index(in_ptr, index_buffer_size_el, idx,
-                                            index_size_B) == restart_index;
-
-         uint next_offs = first_true_thread_in_workgroup(restart, scratch);
-
-         next_restart += next_offs;
-         if (next_offs < 1024)
-            break;
-      }
-
-      /* Emit up to the next restart. Lanes output in parallel */
-      uint subcount = next_restart - needle;
-      uint subprims = u_decomposed_prims_for_vertices(mode, subcount);
-      uint out_prims_base = out_prims;
-
-      for (uint i = tid; i < subprims; i += 1024) {
-         for (uint vtx = 0; vtx < per_prim; ++vtx) {
-            uint id =
-               poly_vertex_id_for_topology(mode, flatshade_first, i, vtx, subprims);
-            uint offset = needle + id;
-            uint x = ((out_prims_base + i) * per_prim) + vtx;
-            uint y = poly_load_index(in_ptr, index_buffer_size_el, offset,
-                                     index_size_B);
-
-            poly_store_index(out_ptr, index_size_B, x, y);
-         }
-      }
-
-      out_prims += subprims;
-      needle = next_restart + 1;
-   }
-
-   if (tid == 0)
-      out_draw[0] = out_prims * per_prim;
+
+   POLY_DECL_UNROLL_RESTART_SCRATCH(scratch, 1024);
+   poly_unroll_restart(out_draw, heap, in_draw, index_buffer,
+                       index_buffer_size_el, index_size_B, restart_index,
+                       flatshade_first, mode, scratch);
 }
 
 KERNEL(1)

src/poly/cl/restart.h (new file)

@@ -0,0 +1,139 @@
/*
 * Copyright 2023 Alyssa Rosenzweig
 * Copyright 2023 Valve Corporation
 * Copyright 2025 Collabora, Ltd.
 * SPDX-License-Identifier: MIT
 */

#pragma once

#include "compiler/libcl/libcl.h"
#include "poly/geometry.h"
#include "poly/prim.h"
#define POLY_DECL_UNROLL_RESTART_SCRATCH(__scratch, __wg_size)                 \
   local uint __scratch[MAX2(__wg_size / 32, sizeof(void *))]

/*
 * Return the ID of the first thread in the workgroup where cond is true, or
 * a value greater than or equal to the workgroup size if cond is false across
 * the workgroup.
 */
static inline uint
poly_work_group_first_true(bool cond, local uint *scratch)
{
   barrier(CLK_LOCAL_MEM_FENCE);
   scratch[get_sub_group_id()] = sub_group_ballot(cond)[0];
   barrier(CLK_LOCAL_MEM_FENCE);

   uint first_group =
      ctz(sub_group_ballot(scratch[get_sub_group_local_id()])[0]);
   uint off = ctz(first_group < 32 ? scratch[first_group] : 0);
   return (first_group * 32) + off;
}
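
/* Example (illustrative): with 32-wide subgroups, if cond is true only for
 * thread 70, subgroup 2 publishes the ballot word 0x40 and every other
 * subgroup publishes 0. The second ballot then yields first_group = 2 and
 * off = ctz(0x40) = 6, so the result is (2 * 32) + 6 = 70. If no thread has
 * cond set, both ctz calls see 0 and return 32, giving (32 * 32) + 32.
 */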

/*
 * When unrolling the index buffer for a draw, we translate the old indirect
 * draws to new indirect draws. This routine allocates the new index buffer and
 * sets up most of the new draw descriptor.
 */
static inline global void *
poly_setup_unroll_for_draw(global struct poly_heap *heap,
                           constant uint *in_draw, global uint *out_draw,
                           enum mesa_prim mode, uint index_size_B)
{
   /* Determine an upper bound on the memory required for the index buffer.
    * Restarts only decrease the unrolled index buffer size, so the maximum size
    * is the unrolled size when the input has no restarts.
    */
   uint max_prims = u_decomposed_prims_for_vertices(mode, in_draw[0]);
   uint max_verts = max_prims * mesa_vertices_per_prim(mode);
   uint alloc_size = max_verts * index_size_B;

   /* Allocate unrolled index buffer.
    *
    * TODO: For multidraw, should be atomic. But multidraw+unroll isn't
    * currently wired up in any driver.
    */
   uint old_heap_bottom_B = poly_heap_alloc_nonatomic_offs(heap, alloc_size);

   /* Setup most of the descriptor. Count will be determined after unroll. */
   out_draw[1] = in_draw[1];                       /* instance count */
   out_draw[2] = old_heap_bottom_B / index_size_B; /* index offset */
   out_draw[3] = in_draw[3];                       /* index bias */
   out_draw[4] = in_draw[4];                       /* base instance */

   /* Return the index buffer we allocated */
   return (global uchar *)heap->base + old_heap_bottom_B;
}
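
/* Sizing example (illustrative): a MESA_PRIM_TRIANGLE_STRIP draw with
 * in_draw[0] = 100 indices decomposes into at most 98 triangles, so
 * 98 * 3 = 294 output indices are reserved. A restart can only shorten the
 * strips, so the restart-free size is a valid worst case. The five-word
 * out_draw layout matches a standard indexed indirect draw: count, instance
 * count, first index, index bias, base instance.
 */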

static inline void
poly_unroll_restart(global uint32_t *out_draw,
                    global struct poly_heap *heap,
                    constant uint *in_draw,
                    uint64_t index_buffer,
                    uint32_t index_buffer_range_el,
                    uint32_t index_size_B,
                    uint32_t restart_index,
                    uint32_t flatshade_first,
                    enum mesa_prim mode,
                    local void *scratch)
{
   uint tid = cl_local_id.x;
   uint count = in_draw[0];

   uintptr_t out_ptr;
   if (tid == 0) {
      out_ptr = (uintptr_t)poly_setup_unroll_for_draw(heap, in_draw, out_draw,
                                                      mode, index_size_B);
      *(uintptr_t *)scratch = out_ptr;
   }

   barrier(CLK_LOCAL_MEM_FENCE);
   out_ptr = *(uintptr_t *)scratch;

   uintptr_t in_ptr = (uintptr_t)(poly_index_buffer(
      index_buffer, index_buffer_range_el, in_draw[2], index_size_B));

   uint out_prims = 0;
   uint needle = 0;
   uint per_prim = mesa_vertices_per_prim(mode);
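
   /* Worked example (illustrative): for MESA_PRIM_LINE_STRIP with input
    * indices {0, 1, 2, R, 3, 4} (R = restart_index), the first iteration
    * finds the restart at position 3 and emits lines (0,1) and (1,2); the
    * second consumes {3, 4} and emits (3,4). The output index buffer is
    * {0,1, 1,2, 3,4} and out_draw[0] = 6.
    */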
   while (needle < count) {
      /* Search for next restart or the end. Lanes load in parallel. */
      uint next_restart = needle;
      for (;;) {
         uint idx = next_restart + tid;
         bool restart =
            idx >= count || poly_load_index(in_ptr, index_buffer_range_el, idx,
                                            index_size_B) == restart_index;

         uint next_offs = poly_work_group_first_true(restart, scratch);

         next_restart += next_offs;
         if (next_offs < 1024)
            break;
      }

      /* Emit up to the next restart. Lanes output in parallel */
      uint subcount = next_restart - needle;
      uint subprims = u_decomposed_prims_for_vertices(mode, subcount);
      uint out_prims_base = out_prims;

      for (uint i = tid; i < subprims; i += 1024) {
         for (uint vtx = 0; vtx < per_prim; ++vtx) {
            uint id =
               poly_vertex_id_for_topology(mode, flatshade_first, i, vtx, subprims);
            uint offset = needle + id;
            uint x = ((out_prims_base + i) * per_prim) + vtx;
            uint y = poly_load_index(in_ptr, index_buffer_range_el, offset,
                                     index_size_B);

            poly_store_index(out_ptr, index_size_B, x, y);
         }
      }

      out_prims += subprims;
      needle = next_restart + 1;
   }

   if (tid == 0)
      out_draw[0] = out_prims * per_prim;
}
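
/* Intended usage, mirroring the libagx_unroll_restart kernel in this commit:
 * every thread of the workgroup calls the helper with the same arguments,
 * e.g.
 *
 *    POLY_DECL_UNROLL_RESTART_SCRATCH(scratch, 1024);
 *    poly_unroll_restart(out_draw, heap, in_draw, index_buffer,
 *                        index_buffer_range_el, index_size_B, restart_index,
 *                        flatshade_first, mode, scratch);
 *
 * Note that the internal loop strides are hardcoded for a 1024-thread
 * workgroup, matching the scratch declaration above.
 */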