asahi: Unroll GS/XFB primitive restart on the GPU

..and fix bugs versus the CPU unroll while we're at it. CPU-based unrolling is
invalid in Vulkan, but this slow-as-dogs GPU unroll is ok.

Signed-off-by: Alyssa Rosenzweig <alyssa@rosenzweig.io>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26614>
Alyssa Rosenzweig 2023-11-27 11:34:31 -04:00
parent 15957219ad
commit f4a648c607
6 changed files with 293 additions and 44 deletions
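For context on what the kernels below compute: primitive restart lets a single indexed draw pack several strips or fans into one index buffer by reserving a sentinel index that cuts the current primitive. "Unrolling" rewrites that buffer into a restart-free list of complete primitives so the hardware never sees the sentinel. A minimal standalone sketch of the splitting step, in plain C with made-up index data rather than anything from the driver:

#include <stdint.h>
#include <stdio.h>

/* Print the restart-free runs contained in an index stream. Each run can
 * then be decomposed into primitives independently, which is what the GPU
 * unroll kernel added by this commit does per draw. */
static void
print_runs(const uint16_t *indices, unsigned count, uint16_t restart)
{
   unsigned start = 0;
   for (unsigned i = 0; i <= count; ++i) {
      if (i == count || indices[i] == restart) {
         if (i > start) {
            printf("run of %u indices:", i - start);
            for (unsigned j = start; j < i; ++j)
               printf(" %u", indices[j]);
            printf("\n");
         }
         start = i + 1;
      }
   }
}

int
main(void)
{
   /* Two triangle strips packed into one draw via the 0xFFFF sentinel */
   const uint16_t ib[] = {0, 1, 2, 3, 0xFFFF, 4, 5, 6, 7};
   print_runs(ib, sizeof(ib) / sizeof(ib[0]), 0xFFFF);
   return 0;
}

Doing this in a compute kernel, as the commit does, keeps it working for indirect draws, whose index counts are only known on the GPU.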


@@ -1189,3 +1189,28 @@ agx_nir_gs_setup_indirect(const nir_shader *libagx, enum mesa_prim prim,
agx_preprocess_nir(b.shader, libagx, false, &info);
return b.shader;
}
nir_shader *
agx_nir_unroll_restart(const nir_shader *libagx, enum mesa_prim prim,
unsigned index_size_B)
{
nir_builder b = nir_builder_init_simple_shader(
MESA_SHADER_COMPUTE, &agx_nir_options, "Primitive restart unroll");
nir_def *ia = nir_load_input_assembly_buffer_agx(&b);
nir_def *draw = nir_channel(&b, nir_load_workgroup_id(&b), 0);
nir_def *mode = nir_imm_int(&b, prim);
if (index_size_B == 1)
libagx_unroll_restart_u8(&b, ia, mode, draw);
else if (index_size_B == 2)
libagx_unroll_restart_u16(&b, ia, mode, draw);
else if (index_size_B == 4)
libagx_unroll_restart_u32(&b, ia, mode, draw);
else
unreachable("invalid index size");
UNUSED struct agx_uncompiled_shader_info info;
agx_preprocess_nir(b.shader, libagx, false, &info);
return b.shader;
}


@@ -29,4 +29,8 @@ struct nir_shader *agx_nir_gs_setup_indirect(const struct nir_shader *libagx,
enum mesa_prim prim,
bool multidraw);
struct nir_shader *agx_nir_unroll_restart(const struct nir_shader *libagx,
enum mesa_prim prim,
unsigned index_size_B);
#endif


@@ -6,6 +6,12 @@
#include "geometry.h"
static uint
align(uint x, uint y)
{
return (x + y - 1) & ~(y - 1);
}
/* TODO: Primitive restart */
uint
libagx_vertex_id_for_topology(enum mesa_prim mode, bool flatshade_first,
@@ -110,6 +116,101 @@ libagx_vertex_id_for_topology(enum mesa_prim mode, bool flatshade_first,
}
}
/*
* When unrolling the index buffer for a draw, we translate the old indirect
* draws to new indirect draws. This routine allocates the new index buffer and
* sets up most of the new draw descriptor.
*/
static global void *
setup_unroll_for_draw(global struct agx_ia_state *ia, constant uint *in_draw,
uint draw, enum mesa_prim mode, uint index_size_B)
{
/* Determine an upper bound on the memory required for the index buffer.
* Restarts only decrease the unrolled index buffer size, so the maximum size
* is the unrolled size when the input has no restarts.
*/
uint max_prims = u_decomposed_prims_for_vertices(mode, in_draw[0]);
uint max_verts = max_prims * mesa_vertices_per_prim(mode);
uint alloc_size = max_verts * index_size_B;
/* Allocate memory from the heap for the unrolled index buffer. Use an atomic
* since multiple threads may be running to handle multidraw in parallel.
*/
global struct agx_geometry_state *heap = ia->heap;
uint old_heap_bottom = atomic_fetch_add(
(volatile atomic_uint *)(&heap->heap_bottom), align(alloc_size, 4));
/* Regardless of the input stride, we use tightly packed output draws */
global uint *out = &ia->out_draws[5 * draw];
/* Setup most of the descriptor. Count will be determined after unroll. */
out[1] = in_draw[1]; /* instance count */
out[2] = old_heap_bottom / index_size_B; /* index offset */
out[3] = in_draw[3]; /* index bias */
out[4] = in_draw[4]; /* base instance */
/* Return the index buffer we allocated */
return (global uchar *)heap->heap + old_heap_bottom;
}
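
setup_unroll_for_draw() above sizes the allocation for the worst case: restarts can only remove primitives, so a restart-free input with the same index count is an upper bound. A standalone sketch of that sizing, where decomposed_prims_for_vertices() and vertices_per_prim() are simplified stand-ins for Mesa's u_decomposed_prims_for_vertices() and mesa_vertices_per_prim(), covering only a few modes for illustration:

#include <stdint.h>
#include <stdio.h>

enum prim { PRIM_LINE_STRIP, PRIM_TRIANGLES, PRIM_TRIANGLE_STRIP };

static unsigned
decomposed_prims_for_vertices(enum prim mode, unsigned verts)
{
   switch (mode) {
   case PRIM_LINE_STRIP:     return verts >= 2 ? verts - 1 : 0;
   case PRIM_TRIANGLES:      return verts / 3;
   case PRIM_TRIANGLE_STRIP: return verts >= 3 ? verts - 2 : 0;
   }
   return 0;
}

static unsigned
vertices_per_prim(enum prim mode)
{
   return mode == PRIM_LINE_STRIP ? 2 : 3;
}

static unsigned
align_up(unsigned x, unsigned y)
{
   return (x + y - 1) & ~(y - 1);
}

/* Upper bound on the unrolled index buffer size, in bytes. Restarts can only
 * shrink the result, so sizing for a restart-free input is always enough. */
static unsigned
worst_case_unroll_bytes(enum prim mode, unsigned input_indices,
                        unsigned index_size_B)
{
   unsigned max_prims = decomposed_prims_for_vertices(mode, input_indices);
   unsigned max_verts = max_prims * vertices_per_prim(mode);
   return align_up(max_verts * index_size_B, 4);
}

int
main(void)
{
   /* 100 u16 indices drawn as a triangle strip: at most 98 triangles */
   printf("%u bytes\n", worst_case_unroll_bytes(PRIM_TRIANGLE_STRIP, 100, 2));
   return 0;
}
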
#define UNROLL(INDEX, suffix) \
void libagx_unroll_restart_##suffix(global struct agx_ia_state *ia, \
enum mesa_prim mode, uint draw) \
{ \
/* For an indirect multidraw, we are dispatched maxDraws times and \
* terminate trailing invocations. \
*/ \
if (ia->count && draw >= *(ia->count)) \
return; \
\
constant uint *in_draw = \
(constant uint *)(ia->draws + (draw * ia->draw_stride)); \
\
uint count = in_draw[0]; \
constant INDEX *in = (constant INDEX *)ia->index_buffer; \
\
global INDEX *out = \
setup_unroll_for_draw(ia, in_draw, draw, mode, sizeof(INDEX)); \
\
uint out_prims = 0; \
INDEX restart_idx = ia->restart_index; \
bool flatshade_first = ia->flatshade_first; \
uint in_size_el = ia->index_buffer_size_B / sizeof(INDEX); \
\
uint needle = 0; \
uint per_prim = mesa_vertices_per_prim(mode); \
while (needle < count) { \
/* Search for next restart or the end */ \
uint next_restart = needle; \
while ((next_restart < count) && in[next_restart] != restart_idx) \
++next_restart; \
\
/* Emit up to the next restart */ \
uint subcount = next_restart - needle; \
uint subprims = u_decomposed_prims_for_vertices(mode, subcount); \
for (uint i = 0; i < subprims; ++i) { \
for (uint vtx = 0; vtx < per_prim; ++vtx) { \
uint id = libagx_vertex_id_for_topology(mode, flatshade_first, \
i, vtx, subprims); \
uint offset = needle + id; \
\
out[(out_prims * per_prim) + vtx] = \
offset < in_size_el ? in[offset] : 0; \
} \
\
out_prims++; \
} \
\
needle = next_restart + 1; \
} \
\
ia->out_draws[(5 * draw) + 0] = out_prims * per_prim; \
}
UNROLL(uchar, u8)
UNROLL(ushort, u16)
UNROLL(uint, u32)
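
As a reference for the loop inside UNROLL() above, here is a host-side model of the same scan-and-decompose structure, restricted to line strips so that primitive i of a run is simply vertices {i, i+1}; the flatshade_first/libagx_vertex_id_for_topology() details and the robustness clamp against the input buffer size are elided. This is plain C for illustration, not the libagx kernel itself:

#include <stdint.h>
#include <stdio.h>

/* CPU reference of the restart unroll for LINE_STRIP: scan for the next
 * restart (or the end), decompose the restart-free run into independent
 * lines, and append their indices to the output. Returns the number of
 * output indices (what the kernel stores as the indirect draw's count). */
static unsigned
unroll_line_strip(const uint16_t *in, unsigned count, uint16_t restart,
                  uint16_t *out)
{
   unsigned out_prims = 0;
   unsigned needle = 0;

   while (needle < count) {
      /* Search for the next restart or the end of the buffer */
      unsigned next_restart = needle;
      while (next_restart < count && in[next_restart] != restart)
         ++next_restart;

      /* Emit the run [needle, next_restart) as independent lines */
      unsigned subcount = next_restart - needle;
      unsigned subprims = subcount >= 2 ? subcount - 1 : 0;

      for (unsigned i = 0; i < subprims; ++i) {
         out[out_prims * 2 + 0] = in[needle + i];
         out[out_prims * 2 + 1] = in[needle + i + 1];
         out_prims++;
      }

      needle = next_restart + 1;
   }

   return out_prims * 2;
}

int
main(void)
{
   const uint16_t in[] = {0, 1, 2, 0xFFFF, 3, 4};
   uint16_t out[16];
   unsigned n = unroll_line_strip(in, sizeof(in) / sizeof(in[0]), 0xFFFF, out);

   for (unsigned i = 0; i < n; i += 2)
      printf("line %u: %u %u\n", i / 2, out[i], out[i + 1]);
   return 0;
}
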
uintptr_t
libagx_index_buffer(constant struct agx_ia_state *p, uint id,
uint index_size)
@@ -193,12 +294,6 @@ libagx_end_primitive(global int *index_buffer, uint total_verts,
out[verts_in_prim] = -1;
}
static uint
align(uint x, uint y)
{
return (x + y - 1) & ~(y - 1);
}
void
libagx_build_gs_draw(global struct agx_geometry_params *p, bool indexed,
uint vertices, uint primitives, uint output_stride_B)


@@ -41,7 +41,20 @@ struct agx_ia_key {
bool indirect_multidraw;
};
/* Packed geometry state buffer */
struct agx_geometry_state {
/* Heap to allocate from, in either direction. By convention, the top is used
* for intra-draw allocations and the bottom is used for full-batch
* allocations. In the future we could use kernel support to improve this.
*/
GLOBAL(uchar) heap;
uint32_t heap_bottom, heap_top, heap_size, padding;
} PACKED;
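
One plausible standalone reading of the bottom/top convention described above: a single buffer bump-allocated from both ends, the bottom advanced atomically for full-batch allocations (as the unroll kernel does for its output index buffer) and the top for intra-draw ones. An illustrative sketch only, with no out-of-space handling; this is not the driver's allocator:

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

struct heap {
   uint8_t *base;
   uint32_t size;
   _Atomic uint32_t bottom; /* grows upward: full-batch allocations   */
   _Atomic uint32_t top;    /* grows downward: intra-draw allocations */
};

static uint32_t
align_up(uint32_t x, uint32_t y)
{
   return (x + y - 1) & ~(y - 1);
}

/* Bump from the bottom; returns a byte offset into the heap. */
static uint32_t
alloc_bottom(struct heap *h, uint32_t size_B)
{
   return atomic_fetch_add(&h->bottom, align_up(size_B, 4));
}

/* Bump from the top; returns a byte offset into the heap. */
static uint32_t
alloc_top(struct heap *h, uint32_t size_B)
{
   uint32_t aligned = align_up(size_B, 4);
   return atomic_fetch_sub(&h->top, aligned) - aligned;
}

int
main(void)
{
   static uint8_t storage[1024];
   struct heap h = {.base = storage, .size = 1024, .bottom = 0, .top = 1024};

   uint32_t a = alloc_bottom(&h, 100); /* offset 0   */
   uint32_t b = alloc_bottom(&h, 10);  /* offset 100 */
   uint32_t c = alloc_top(&h, 64);     /* offset 960 */

   printf("%u %u %u\n", a, b, c);
   return 0;
}
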
struct agx_ia_state {
/* Heap to allocate from across draws */
GLOBAL(struct agx_geometry_state) heap;
/* Input: index buffer if present. */
CONST(uchar) index_buffer;
@@ -57,21 +70,26 @@ struct agx_ia_state {
*/
GLOBAL(uint) prefix_sums;
/* When unrolling primitive restart, output draw descriptors */
GLOBAL(uint) out_draws;
/* Primitive restart index, if unrolling */
uint32_t restart_index;
/* Input index buffer size in bytes, if unrolling */
uint32_t index_buffer_size_B;
/* Stride for the draw descriptor array */
uint32_t draw_stride;
/* The index size (1, 2, 4) or 0 if drawing without an index buffer. */
uint8_t index_size_B;
} PACKED;
/* Packed geometry state buffer */
struct agx_geometry_state {
/* Heap to allocate from, in either direction. By convention, the top is used
* for intra-draw allocations and the bottom is used for full-batch
* allocations. In the future we could use kernel support to improve this.
/* When unrolling primitive restart, use first vertex as the provoking vertex
* for flat shading. We could stick this in the key, but meh, you're already
* hosed for perf on the unroll path.
*/
GLOBAL(uchar) heap;
uint32_t heap_bottom, heap_top, heap_size, padding;
uint32_t flatshade_first;
/* The index size (1, 2, 4) or 0 if drawing without an index buffer. */
uint32_t index_size_B;
} PACKED;
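
The PACKED/GLOBAL/CONST annotations in this header appear to exist so the host-side driver and the OpenCL-C library agree on a byte-exact layout, with device pointers carried as raw 64-bit addresses. A generic illustration of that convention follows; the struct name and fields are invented for the example (loosely echoing agx_ia_state) rather than the real Mesa definitions:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* On the host, device pointers are just 64-bit GPU virtual addresses;
 * fixed-width fields and an explicit packed layout keep the struct
 * identical to what the device-side code reads. */
typedef uint64_t gpu_ptr;

struct shared_ia_state {
   gpu_ptr  heap;                /* -> geometry state buffer           */
   gpu_ptr  index_buffer;        /* input indices, if indexed          */
   gpu_ptr  out_draws;           /* unrolled indirect draw descriptors */
   uint32_t restart_index;
   uint32_t index_buffer_size_B;
   uint32_t draw_stride;
   uint32_t flatshade_first;
   uint32_t index_size_B;
   uint32_t padding;             /* keep total size a multiple of 8    */
} __attribute__((packed));

/* Catch accidental layout drift at compile time. */
static_assert(sizeof(struct shared_ia_state) == 3 * 8 + 6 * 4,
              "host/device layout mismatch");

int
main(void)
{
   printf("%zu bytes per upload\n", sizeof(struct shared_ia_state));
   return 0;
}
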
struct agx_geometry_params {


@@ -39,6 +39,7 @@
#include "util/macros.h"
#include "util/u_dump.h"
#include "util/u_inlines.h"
#include "util/u_math.h"
#include "util/u_memory.h"
#include "util/u_prim.h"
#include "util/u_resource.h"
@@ -3311,6 +3312,17 @@ agx_index_buffer_direct_ptr(struct agx_batch *batch,
}
}
static uint64_t
agx_index_buffer_ptr(struct agx_batch *batch, const struct pipe_draw_info *info,
const struct pipe_draw_start_count_bias *draw,
size_t *extent)
{
if (draw)
return agx_index_buffer_direct_ptr(batch, draw, info, extent);
else
return agx_index_buffer_rsrc_ptr(batch, info, extent);
}
static bool
agx_scissor_culls_everything(struct agx_context *ctx)
{
@@ -3399,16 +3411,20 @@ agx_batch_geometry_state(struct agx_batch *batch)
return batch->geometry_state;
}
static uint64_t
agx_batch_geometry_params(struct agx_batch *batch, uint64_t input_index_buffer,
const struct pipe_draw_info *info,
const struct pipe_draw_start_count_bias *draw,
const struct pipe_draw_indirect_info *indirect)
static void
agx_upload_ia_params(struct agx_batch *batch, const struct pipe_draw_info *info,
const struct pipe_draw_indirect_info *indirect,
uint64_t input_index_buffer, size_t index_buffer_size_B,
uint64_t unroll_output)
{
/* XXX move me */
struct agx_ia_state ia = {
.heap = agx_batch_geometry_state(batch),
.index_buffer = input_index_buffer,
.index_size_B = info->index_size,
.out_draws = unroll_output,
.restart_index = info->restart_index,
.index_buffer_size_B = index_buffer_size_B,
.flatshade_first = batch->ctx->rast->base.flatshade_first,
};
if (indirect) {
@@ -3425,13 +3441,27 @@ agx_batch_geometry_params(struct agx_batch *batch, uint64_t input_index_buffer,
ia.count = rsrc->bo->ptr.gpu + indirect->indirect_draw_count_offset;
ia.draw_stride = indirect->stride;
size_t max_sum_size = sizeof(uint32_t) * indirect->draw_count;
ia.prefix_sums =
agx_pool_alloc_aligned(&batch->pool, max_sum_size, 4).gpu;
/* MDI requires prefix sums, but not for our current unroll path */
if (!unroll_output) {
size_t max_sum_size = sizeof(uint32_t) * indirect->draw_count;
ia.prefix_sums =
agx_pool_alloc_aligned(&batch->pool, max_sum_size, 4).gpu;
}
}
batch->uniforms.input_assembly =
agx_pool_upload_aligned(&batch->pool, &ia, sizeof(ia), 8);
}
static uint64_t
agx_batch_geometry_params(struct agx_batch *batch, uint64_t input_index_buffer,
size_t index_buffer_size_B,
const struct pipe_draw_info *info,
const struct pipe_draw_start_count_bias *draw,
const struct pipe_draw_indirect_info *indirect)
{
agx_upload_ia_params(batch, info, indirect, input_index_buffer,
index_buffer_size_B, 0);
struct agx_geometry_params params = {
.state = agx_batch_geometry_state(batch),
@@ -3511,10 +3541,7 @@ agx_launch_gs(struct agx_batch *batch, const struct pipe_draw_info *info,
batch->cdm = agx_encoder_allocate(batch, dev);
}
if (info->primitive_restart) {
fprintf(stderr, "Mode: %s\n", util_str_prim_mode(info->mode, true));
unreachable("TODO: Primitive restart with GS");
}
assert(!info->primitive_restart && "should have been lowered");
struct pipe_grid_info grid = {.block = {1, 1, 1}};
struct agx_resource grid_indirect_rsrc = {.bo = batch->geom_params_bo};
@@ -3632,6 +3659,88 @@ agx_launch_gs(struct agx_batch *batch, const struct pipe_draw_info *info,
memcpy(ctx->prims_generated, prim_queries, sizeof(prim_queries));
}
static void
agx_draw_without_restart(struct agx_batch *batch,
const struct pipe_draw_info *info,
unsigned drawid_offset,
const struct pipe_draw_indirect_info *indirect,
const struct pipe_draw_start_count_bias *draw)
{
struct agx_context *ctx = batch->ctx;
struct agx_device *dev = agx_device(ctx->base.screen);
perf_debug(dev, "Unrolling primitive restart due to GS/XFB");
agx_batch_init_state(batch);
size_t ib_extent = 0;
uint64_t ib = agx_index_buffer_ptr(batch, info, draw, &ib_extent);
/* The rest of this function handles only the general case of indirect
* multidraws, so synthesize an indexed indirect draw now if we need one for
* a direct draw (necessarily only one). This unifies the code paths.
*/
struct pipe_draw_indirect_info indirect_synthesized = {.draw_count = 1};
if (!indirect) {
uint32_t desc[5] = {draw->count, info->instance_count, draw->start,
draw->index_bias, info->start_instance};
u_upload_data(ctx->base.const_uploader, 0, sizeof(desc), 4, &desc,
&indirect_synthesized.offset, &indirect_synthesized.buffer);
indirect = &indirect_synthesized;
}
/* Next, we unroll the index buffer used by the indirect draw */
uint8_t log2_idx_size = util_logbase2(info->index_size);
assert(log2_idx_size <= 2);
if (!batch->cdm.bo)
batch->cdm = agx_encoder_allocate(batch, dev);
if (!ctx->gs_unroll_restart[info->mode][log2_idx_size]) {
struct agx_shader_key base_key = {0};
ctx->gs_unroll_restart[info->mode][log2_idx_size] = agx_compile_nir(
dev, agx_nir_unroll_restart(dev->libagx, info->mode, info->index_size),
&base_key, NULL);
}
/* Allocate output indirect draw descriptors. This is exact. */
struct agx_resource out_draws_rsrc = {0};
struct agx_ptr out_draws = agx_pool_alloc_aligned_with_bo(
&batch->pool, 5 * sizeof(uint32_t) * indirect->draw_count, 4,
&out_draws_rsrc.bo);
agx_upload_ia_params(batch, info, indirect, ib, ib_extent, out_draws.gpu);
/* Unroll the index buffer for each draw */
const struct pipe_grid_info grid_setup = {
.block = {1, 1, 1},
.grid = {indirect->draw_count, 1, 1},
};
agx_launch(batch, &grid_setup,
ctx->gs_unroll_restart[info->mode][log2_idx_size],
PIPE_SHADER_COMPUTE);
/* Now draw the results without restart */
struct pipe_draw_info new_info = *info;
new_info.primitive_restart = false;
new_info.mode = u_decomposed_prim(info->mode);
new_info.index.resource = ctx->heap;
new_info.has_user_indices = false;
struct pipe_draw_indirect_info new_indirect = *indirect;
new_indirect.buffer = &out_draws_rsrc.base;
new_indirect.offset = out_draws.gpu - out_draws_rsrc.bo->ptr.gpu;
new_indirect.stride = 5 * sizeof(uint32_t);
ctx->base.draw_vbo(&ctx->base, &new_info, drawid_offset, &new_indirect, draw,
1);
}
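
The five-word desc[] synthesized in agx_draw_without_restart() follows the usual indexed indirect draw layout (index count, instance count, first index, index bias, base instance), which is also what the unroll kernel writes into out_draws at a 20-byte stride; that is what lets direct and indirect draws share one path. A small standalone sketch of that record, assuming nothing beyond plain C:

#include <stdint.h>
#include <stdio.h>

/* Indexed indirect draw record, one per draw, tightly packed (stride 20 B).
 * Both the synthesized direct draw and the GPU unroll kernel produce this
 * layout, so the final draw always takes the indirect path. */
struct indexed_indirect_draw {
   uint32_t index_count;
   uint32_t instance_count;
   uint32_t first_index;    /* in elements, relative to the bound buffer */
   int32_t  index_bias;
   uint32_t base_instance;
};

/* Wrap a direct indexed draw in a single indirect record. */
static struct indexed_indirect_draw
synthesize_indirect(uint32_t count, uint32_t instances, uint32_t start,
                    int32_t bias, uint32_t base_instance)
{
   return (struct indexed_indirect_draw){
      .index_count = count,
      .instance_count = instances,
      .first_index = start,
      .index_bias = bias,
      .base_instance = base_instance,
   };
}

int
main(void)
{
   struct indexed_indirect_draw d = synthesize_indirect(36, 1, 0, 0, 0);
   printf("draw %u indices, %u instance(s), stride %zu\n",
          d.index_count, d.instance_count, sizeof(d));
   return 0;
}
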
static bool
agx_needs_passthrough_gs(struct agx_context *ctx,
const struct pipe_draw_info *info,
@@ -3841,15 +3950,6 @@ agx_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info,
return;
}
bool uses_gs = ctx->stage[PIPE_SHADER_GEOMETRY].shader;
if (uses_gs && info->primitive_restart) {
perf_debug_ctx(ctx, "Emulating primitive restart due to GS");
util_draw_vbo_without_prim_restart(pctx, info, drawid_offset, indirect,
draws);
return;
}
/* Only the rasterization stream counts */
if (ctx->active_queries && ctx->prims_generated[0] &&
!ctx->stage[PIPE_SHADER_GEOMETRY].shader) {
@@ -3859,6 +3959,14 @@ agx_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info,
}
struct agx_batch *batch = agx_get_batch(ctx);
if (ctx->stage[PIPE_SHADER_GEOMETRY].shader && info->primitive_restart &&
info->index_size) {
agx_draw_without_restart(batch, info, drawid_offset, indirect, draws);
return;
}
agx_batch_add_timestamp_query(batch, ctx->time_elapsed);
unsigned idx_size = info->index_size;
@@ -3866,10 +3974,8 @@ agx_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info,
size_t ib_extent = 0;
if (idx_size) {
if (indirect != NULL)
ib = agx_index_buffer_rsrc_ptr(batch, info, &ib_extent);
else
ib = agx_index_buffer_direct_ptr(batch, draws, info, &ib_extent);
ib =
agx_index_buffer_ptr(batch, info, indirect ? NULL : draws, &ib_extent);
}
#ifndef NDEBUG
@@ -3905,7 +4011,7 @@ agx_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info,
.gpu;
batch->uniforms.geometry_params =
agx_batch_geometry_params(batch, ib, info, draws, indirect);
agx_batch_geometry_params(batch, ib, ib_extent, info, draws, indirect);
}
struct agx_compiled_shader *vs = ctx->vs;


@@ -510,6 +510,7 @@ struct agx_context {
struct agx_compiled_shader *gs_prefix_sums[16];
struct agx_compiled_shader *gs_setup_indirect[MESA_PRIM_MAX][2];
struct agx_compiled_shader *gs_unroll_restart[MESA_PRIM_MAX][3];
struct agx_meta_cache meta;
uint32_t syncobj;
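
gs_unroll_restart above is indexed by primitive mode and by log2 of the index size, so the three legal index sizes (1, 2, 4 bytes) land in slots 0 to 2, matching the log2_idx_size computation in agx_draw_without_restart(). A standalone sketch of that lazily-filled variant cache; the types and the compile function here are placeholders for illustration, not driver API:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define NUM_PRIM_MODES 16 /* stand-in for MESA_PRIM_MAX */

/* Placeholder for a compiled kernel handle. */
typedef struct { int dummy; } shader;

static shader *
compile_unroll_variant(unsigned mode, unsigned index_size_B)
{
   static shader pool[NUM_PRIM_MODES][3];
   printf("compiling unroll variant: mode %u, %u-byte indices\n",
          mode, index_size_B);
   return &pool[mode][index_size_B >> 1]; /* 1,2,4 -> 0,1,2 */
}

/* Lazily-populated cache, one slot per (mode, log2(index size)) pair. */
static shader *cache[NUM_PRIM_MODES][3];

static unsigned
log2u(unsigned x)
{
   unsigned l = 0;
   while (x >>= 1)
      ++l;
   return l;
}

static shader *
get_unroll_variant(unsigned mode, unsigned index_size_B)
{
   unsigned slot = log2u(index_size_B);
   assert(slot <= 2 && mode < NUM_PRIM_MODES);

   if (!cache[mode][slot])
      cache[mode][slot] = compile_unroll_variant(mode, index_size_B);

   return cache[mode][slot];
}

int
main(void)
{
   get_unroll_variant(5, 2); /* compiles */
   get_unroll_variant(5, 2); /* cache hit, no recompile */
   get_unroll_variant(5, 4); /* different slot, compiles */
   return 0;
}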