diff --git a/src/asahi/lib/agx_nir_lower_gs.c b/src/asahi/lib/agx_nir_lower_gs.c
index 201fb980559..8933f1f426a 100644
--- a/src/asahi/lib/agx_nir_lower_gs.c
+++ b/src/asahi/lib/agx_nir_lower_gs.c
@@ -1189,3 +1189,28 @@ agx_nir_gs_setup_indirect(const nir_shader *libagx, enum mesa_prim prim,
    agx_preprocess_nir(b.shader, libagx, false, &info);
    return b.shader;
 }
+
+nir_shader *
+agx_nir_unroll_restart(const nir_shader *libagx, enum mesa_prim prim,
+                       unsigned index_size_B)
+{
+   nir_builder b = nir_builder_init_simple_shader(
+      MESA_SHADER_COMPUTE, &agx_nir_options, "Primitive restart unroll");
+
+   nir_def *ia = nir_load_input_assembly_buffer_agx(&b);
+   nir_def *draw = nir_channel(&b, nir_load_workgroup_id(&b), 0);
+   nir_def *mode = nir_imm_int(&b, prim);
+
+   if (index_size_B == 1)
+      libagx_unroll_restart_u8(&b, ia, mode, draw);
+   else if (index_size_B == 2)
+      libagx_unroll_restart_u16(&b, ia, mode, draw);
+   else if (index_size_B == 4)
+      libagx_unroll_restart_u32(&b, ia, mode, draw);
+   else
+      unreachable("invalid index size");
+
+   UNUSED struct agx_uncompiled_shader_info info;
+   agx_preprocess_nir(b.shader, libagx, false, &info);
+   return b.shader;
+}
diff --git a/src/asahi/lib/agx_nir_lower_gs.h b/src/asahi/lib/agx_nir_lower_gs.h
index a3b81142797..e628fb1ad0d 100644
--- a/src/asahi/lib/agx_nir_lower_gs.h
+++ b/src/asahi/lib/agx_nir_lower_gs.h
@@ -29,4 +29,8 @@ struct nir_shader *agx_nir_gs_setup_indirect(const struct nir_shader *libagx,
                                              enum mesa_prim prim,
                                              bool multidraw);
 
+struct nir_shader *agx_nir_unroll_restart(const struct nir_shader *libagx,
+                                          enum mesa_prim prim,
+                                          unsigned index_size_B);
+
 #endif
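For reference, the unroll path communicates draws through 5-uint indexed indirect descriptors, both on input (ia->draws) and output (ia->out_draws). The sketch below is illustrative only and not part of the patch; the struct and field names are hypothetical, but the field order matches the out[0..4] stores in setup_unroll_for_draw() below and the desc[5] array synthesized in agx_draw_without_restart():

struct indexed_indirect_draw {
   uint32_t count;          /* out[0]: index count, filled in after unrolling */
   uint32_t instance_count; /* out[1]: copied from the source draw */
   uint32_t first_index;    /* out[2]: element offset into the unrolled index buffer */
   uint32_t index_bias;     /* out[3]: base vertex, copied from the source draw */
   uint32_t base_instance;  /* out[4]: copied from the source draw */
};

Output descriptors are tightly packed, so the re-issued multidraw uses a stride of 5 * sizeof(uint32_t) = 20 bytes regardless of the input draw stride.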
diff --git a/src/asahi/lib/shaders/geometry.cl b/src/asahi/lib/shaders/geometry.cl
index 526b247d89d..962bfcbfbda 100644
--- a/src/asahi/lib/shaders/geometry.cl
+++ b/src/asahi/lib/shaders/geometry.cl
@@ -6,6 +6,12 @@
 
 #include "geometry.h"
 
+static uint
+align(uint x, uint y)
+{
+   return (x + y - 1) & ~(y - 1);
+}
+
 /* TODO: Primitive restart */
 uint
 libagx_vertex_id_for_topology(enum mesa_prim mode, bool flatshade_first,
@@ -110,6 +116,101 @@ libagx_vertex_id_for_topology(enum mesa_prim mode, bool flatshade_first,
    }
 }
 
+/*
+ * When unrolling the index buffer for a draw, we translate the old indirect
+ * draws to new indirect draws. This routine allocates the new index buffer and
+ * sets up most of the new draw descriptor.
+ */
+static global void *
+setup_unroll_for_draw(global struct agx_ia_state *ia, constant uint *in_draw,
+                      uint draw, enum mesa_prim mode, uint index_size_B)
+{
+   /* Determine an upper bound on the memory required for the index buffer.
+    * Restarts only decrease the unrolled index buffer size, so the maximum size
+    * is the unrolled size when the input has no restarts.
+    */
+   uint max_prims = u_decomposed_prims_for_vertices(mode, in_draw[0]);
+   uint max_verts = max_prims * mesa_vertices_per_prim(mode);
+   uint alloc_size = max_verts * index_size_B;
+
+   /* Allocate memory from the heap for the unrolled index buffer. Use an atomic
+    * since multiple threads may be running to handle multidraw in parallel.
+    */
+   global struct agx_geometry_state *heap = ia->heap;
+   uint old_heap_bottom = atomic_fetch_add(
+      (volatile atomic_uint *)(&heap->heap_bottom), align(alloc_size, 4));
+
+   /* Regardless of the input stride, we use tightly packed output draws */
+   global uint *out = &ia->out_draws[5 * draw];
+
+   /* Setup most of the descriptor. Count will be determined after unroll. */
+   out[1] = in_draw[1];                     /* instance count */
+   out[2] = old_heap_bottom / index_size_B; /* index offset */
+   out[3] = in_draw[3];                     /* index bias */
+   out[4] = in_draw[4];                     /* base instance */
+
+   /* Return the index buffer we allocated (old_heap_bottom is a byte offset) */
+   return (global uchar *)heap->heap + old_heap_bottom;
+}
+
+#define UNROLL(INDEX, suffix)                                                  \
+   void libagx_unroll_restart_##suffix(global struct agx_ia_state *ia,         \
+                                       enum mesa_prim mode, uint draw)         \
+   {                                                                           \
+      /* For an indirect multidraw, we are dispatched maxDraws times and       \
+       * terminate trailing invocations.                                       \
+       */                                                                      \
+      if (ia->count && draw >= *(ia->count))                                   \
+         return;                                                               \
+                                                                               \
+      constant uint *in_draw =                                                 \
+         (constant uint *)(ia->draws + (draw * ia->draw_stride));              \
+                                                                               \
+      uint count = in_draw[0];                                                 \
+      constant INDEX *in = (constant INDEX *)ia->index_buffer;                 \
+                                                                               \
+      global INDEX *out =                                                      \
+         setup_unroll_for_draw(ia, in_draw, draw, mode, sizeof(INDEX));        \
+                                                                               \
+      uint out_prims = 0;                                                      \
+      INDEX restart_idx = ia->restart_index;                                   \
+      bool flatshade_first = ia->flatshade_first;                              \
+      uint in_size_el = ia->index_buffer_size_B / sizeof(INDEX);               \
+                                                                               \
+      uint needle = 0;                                                         \
+      uint per_prim = mesa_vertices_per_prim(mode);                            \
+      while (needle < count) {                                                 \
+         /* Search for next restart or the end */                              \
+         uint next_restart = needle;                                           \
+         while ((next_restart < count) && in[next_restart] != restart_idx)     \
+            ++next_restart;                                                    \
+                                                                               \
+         /* Emit up to the next restart */                                     \
+         uint subcount = next_restart - needle;                                \
+         uint subprims = u_decomposed_prims_for_vertices(mode, subcount);      \
+         for (uint i = 0; i < subprims; ++i) {                                 \
+            for (uint vtx = 0; vtx < per_prim; ++vtx) {                        \
+               uint id = libagx_vertex_id_for_topology(mode, flatshade_first,  \
+                                                       i, vtx, subprims);      \
+               uint offset = needle + id;                                      \
+                                                                               \
+               out[(out_prims * per_prim) + vtx] =                             \
+                  offset < in_size_el ? in[offset] : 0;                        \
+            }                                                                  \
+                                                                               \
+            out_prims++;                                                       \
+         }                                                                     \
+                                                                               \
+         needle = next_restart + 1;                                            \
+      }                                                                        \
+                                                                               \
+      ia->out_draws[(5 * draw) + 0] = out_prims * per_prim;                    \
+   }
+
+UNROLL(uchar, u8)
+UNROLL(ushort, u16)
+UNROLL(uint, u32)
+
 uintptr_t
 libagx_index_buffer(constant struct agx_ia_state *p, uint id,
                     uint index_size)
@@ -193,12 +294,6 @@ libagx_end_primitive(global int *index_buffer, uint total_verts,
    out[verts_in_prim] = -1;
 }
 
-static uint
-align(uint x, uint y)
-{
-   return (x + y - 1) & ~(y - 1);
-}
-
 void
 libagx_build_gs_draw(global struct agx_geometry_params *p, bool indexed,
                      uint vertices, uint primitives, uint output_stride_B)
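As a plain-C illustration of what one UNROLL() invocation does per draw, the sketch below unrolls a line-strip index stream (chosen so the per-primitive vertex order is trivial). It is not part of the patch and only mirrors the needle/next_restart loop above:

#include <stdint.h>

/* Illustrative sketch, not part of the patch: split a 16-bit line-strip index
 * stream at the restart index and emit a plain line list, the way the
 * UNROLL() kernel does for a single draw.
 */
static unsigned
unroll_line_strip_u16(const uint16_t *in, unsigned count, uint16_t restart,
                      uint16_t *out)
{
   unsigned out_indices = 0, needle = 0;

   while (needle < count) {
      /* Search for the next restart index (or the end of the stream) */
      unsigned next_restart = needle;
      while (next_restart < count && in[next_restart] != restart)
         next_restart++;

      /* Emit one line per adjacent pair of vertices in this segment */
      for (unsigned i = needle + 1; i < next_restart; i++) {
         out[out_indices++] = in[i - 1];
         out[out_indices++] = in[i];
      }

      needle = next_restart + 1;
   }

   /* The kernel stores the analogous value into ia->out_draws[5 * draw + 0] */
   return out_indices;
}

For example, the stream {0, 1, 2, 0xffff, 3, 4} with restart index 0xffff unrolls to the line list {0, 1, 1, 2, 3, 4}, giving an index count of 6.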
diff --git a/src/asahi/lib/shaders/geometry.h b/src/asahi/lib/shaders/geometry.h
index 2356fd01edb..bbccca6b8ac 100644
--- a/src/asahi/lib/shaders/geometry.h
+++ b/src/asahi/lib/shaders/geometry.h
@@ -41,7 +41,20 @@ struct agx_ia_key {
    bool indirect_multidraw;
 };
 
+/* Packed geometry state buffer */
+struct agx_geometry_state {
+   /* Heap to allocate from, in either direction. By convention, the top is used
+    * for intra-draw allocations and the bottom is used for full-batch
+    * allocations. In the future we could use kernel support to improve this.
+    */
+   GLOBAL(uchar) heap;
+   uint32_t heap_bottom, heap_top, heap_size, padding;
+} PACKED;
+
 struct agx_ia_state {
+   /* Heap to allocate from across draws */
+   GLOBAL(struct agx_geometry_state) heap;
+
    /* Input: index buffer if present. */
    CONST(uchar) index_buffer;
 
@@ -57,21 +70,26 @@ struct agx_ia_state {
     */
    GLOBAL(uint) prefix_sums;
 
+   /* When unrolling primitive restart, output draw descriptors */
+   GLOBAL(uint) out_draws;
+
+   /* Primitive restart index, if unrolling */
+   uint32_t restart_index;
+
+   /* Input index buffer size in bytes, if unrolling */
+   uint32_t index_buffer_size_B;
+
    /* Stride for the draw descrptor array */
    uint32_t draw_stride;
 
-   /* The index size (1, 2, 4) or 0 if drawing without an index buffer. */
-   uint8_t index_size_B;
-} PACKED;
-
-/* Packed geometry state buffer */
-struct agx_geometry_state {
-   /* Heap to allocate from, in either direction. By convention, the top is used
-    * for intra-draw allocations and the bottom is used for full-batch
-    * allocations. In the future we could use kernel support to improve this.
+   /* When unrolling primitive restart, use first vertex as the provoking vertex
+    * for flat shading. We could stick this in the key, but meh, you're already
+    * hosed for perf on the unroll path.
     */
-   GLOBAL(uchar) heap;
-   uint32_t heap_bottom, heap_top, heap_size, padding;
+   uint32_t flatshade_first;
+
+   /* The index size (1, 2, 4) or 0 if drawing without an index buffer. */
+   uint32_t index_size_B;
 } PACKED;
 
 struct agx_geometry_params {
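The agx_geometry_state heap exposed above is consumed by setup_unroll_for_draw() as a simple bump allocator: heap_bottom is advanced atomically so concurrent multidraw invocations receive disjoint index-buffer regions. A minimal host-side sketch of that pattern (assuming C11 atomics; not part of the patch) is:

#include <stdatomic.h>
#include <stdint.h>

/* Illustrative sketch, not part of the patch: bump-allocate `size` bytes from
 * the bottom of a shared heap. The returned value is the old heap_bottom,
 * i.e. the byte offset of the newly reserved region; rounding the increment
 * up keeps later allocations 4-byte aligned.
 */
static uint32_t
heap_alloc_bottom(_Atomic uint32_t *heap_bottom, uint32_t size)
{
   uint32_t aligned = (size + 3) & ~(uint32_t)3; /* align(size, 4) */
   return atomic_fetch_add(heap_bottom, aligned);
}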
diff --git a/src/gallium/drivers/asahi/agx_state.c b/src/gallium/drivers/asahi/agx_state.c
index b648b6afbaa..d172713a216 100644
--- a/src/gallium/drivers/asahi/agx_state.c
+++ b/src/gallium/drivers/asahi/agx_state.c
@@ -39,6 +39,7 @@
 #include "util/macros.h"
 #include "util/u_dump.h"
 #include "util/u_inlines.h"
+#include "util/u_math.h"
 #include "util/u_memory.h"
 #include "util/u_prim.h"
 #include "util/u_resource.h"
@@ -3311,6 +3312,17 @@ agx_index_buffer_direct_ptr(struct agx_batch *batch,
    }
 }
 
+static uint64_t
+agx_index_buffer_ptr(struct agx_batch *batch, const struct pipe_draw_info *info,
+                     const struct pipe_draw_start_count_bias *draw,
+                     size_t *extent)
+{
+   if (draw)
+      return agx_index_buffer_direct_ptr(batch, draw, info, extent);
+   else
+      return agx_index_buffer_rsrc_ptr(batch, info, extent);
+}
+
 static bool
 agx_scissor_culls_everything(struct agx_context *ctx)
 {
@@ -3399,16 +3411,20 @@ agx_batch_geometry_state(struct agx_batch *batch)
    return batch->geometry_state;
 }
 
-static uint64_t
-agx_batch_geometry_params(struct agx_batch *batch, uint64_t input_index_buffer,
-                          const struct pipe_draw_info *info,
-                          const struct pipe_draw_start_count_bias *draw,
-                          const struct pipe_draw_indirect_info *indirect)
+static void
+agx_upload_ia_params(struct agx_batch *batch, const struct pipe_draw_info *info,
+                     const struct pipe_draw_indirect_info *indirect,
+                     uint64_t input_index_buffer, size_t index_buffer_size_B,
+                     uint64_t unroll_output)
 {
-   /* XXX move me */
    struct agx_ia_state ia = {
+      .heap = agx_batch_geometry_state(batch),
       .index_buffer = input_index_buffer,
       .index_size_B = info->index_size,
+      .out_draws = unroll_output,
+      .restart_index = info->restart_index,
+      .index_buffer_size_B = index_buffer_size_B,
+      .flatshade_first = batch->ctx->rast->base.flatshade_first,
    };
 
    if (indirect) {
@@ -3425,13 +3441,27 @@ agx_batch_geometry_params(struct agx_batch *batch, uint64_t input_index_buffer,
       ia.count = rsrc->bo->ptr.gpu + indirect->indirect_draw_count_offset;
       ia.draw_stride = indirect->stride;
 
-      size_t max_sum_size = sizeof(uint32_t) * indirect->draw_count;
-      ia.prefix_sums =
-         agx_pool_alloc_aligned(&batch->pool, max_sum_size, 4).gpu;
+      /* MDI requires prefix sums, but not for our current unroll path */
+      if (!unroll_output) {
+         size_t max_sum_size = sizeof(uint32_t) * indirect->draw_count;
+         ia.prefix_sums =
+            agx_pool_alloc_aligned(&batch->pool, max_sum_size, 4).gpu;
+      }
    }
 
    batch->uniforms.input_assembly =
      agx_pool_upload_aligned(&batch->pool, &ia, sizeof(ia), 8);
+}
+
+static uint64_t
+agx_batch_geometry_params(struct agx_batch *batch, uint64_t input_index_buffer,
+                          size_t index_buffer_size_B,
+                          const struct pipe_draw_info *info,
+                          const struct pipe_draw_start_count_bias *draw,
+                          const struct pipe_draw_indirect_info *indirect)
+{
+   agx_upload_ia_params(batch, info, indirect, input_index_buffer,
+                        index_buffer_size_B, 0);
 
    struct agx_geometry_params params = {
       .state = agx_batch_geometry_state(batch),
@@ -3511,10 +3541,7 @@ agx_launch_gs(struct agx_batch *batch, const struct pipe_draw_info *info,
       batch->cdm = agx_encoder_allocate(batch, dev);
    }
 
-   if (info->primitive_restart) {
-      fprintf(stderr, "Mode: %s\n", util_str_prim_mode(info->mode, true));
-      unreachable("TODO: Primitive restart with GS");
-   }
+   assert(!info->primitive_restart && "should have been lowered");
 
    struct pipe_grid_info grid = {.block = {1, 1, 1}};
   struct agx_resource grid_indirect_rsrc = {.bo = batch->geom_params_bo};
@@ -3632,6 +3659,88 @@ agx_launch_gs(struct agx_batch *batch, const struct pipe_draw_info *info,
    memcpy(ctx->prims_generated, prim_queries, sizeof(prim_queries));
 }
 
+static void
+agx_draw_without_restart(struct agx_batch *batch,
+                         const struct pipe_draw_info *info,
+                         unsigned drawid_offset,
+                         const struct pipe_draw_indirect_info *indirect,
+                         const struct pipe_draw_start_count_bias *draw)
+{
+   struct agx_context *ctx = batch->ctx;
+   struct agx_device *dev = agx_device(ctx->base.screen);
+
+   perf_debug(dev, "Unrolling primitive restart due to GS/XFB");
+
+   agx_batch_init_state(batch);
+
+   size_t ib_extent = 0;
+   uint64_t ib = agx_index_buffer_ptr(batch, info, draw, &ib_extent);
+
+   /* The rest of this function handles only the general case of indirect
+    * multidraws, so synthesize an indexed indirect draw now if we need one for
+    * a direct draw (necessarily only one). This unifies the code paths.
+    */
+   struct pipe_draw_indirect_info indirect_synthesized = {.draw_count = 1};
+
+   if (!indirect) {
+      uint32_t desc[5] = {draw->count, info->instance_count, draw->start,
+                          draw->index_bias, info->start_instance};
+
+      u_upload_data(ctx->base.const_uploader, 0, sizeof(desc), 4, &desc,
+                    &indirect_synthesized.offset, &indirect_synthesized.buffer);
+
+      indirect = &indirect_synthesized;
+   }
+
+   /* Next, we unroll the index buffer used by the indirect draw */
+   uint8_t log2_idx_size = util_logbase2(info->index_size);
+   assert(log2_idx_size <= 2);
+
+   if (!batch->cdm.bo)
+      batch->cdm = agx_encoder_allocate(batch, dev);
+
+   if (!ctx->gs_unroll_restart[info->mode][log2_idx_size]) {
+      struct agx_shader_key base_key = {0};
+
+      ctx->gs_unroll_restart[info->mode][log2_idx_size] = agx_compile_nir(
+         dev, agx_nir_unroll_restart(dev->libagx, info->mode, info->index_size),
+         &base_key, NULL);
+   }
+
+   /* Allocate output indirect draw descriptors. This is exact. */
+   struct agx_resource out_draws_rsrc = {0};
+   struct agx_ptr out_draws = agx_pool_alloc_aligned_with_bo(
+      &batch->pool, 5 * sizeof(uint32_t) * indirect->draw_count, 4,
+      &out_draws_rsrc.bo);
+
+   agx_upload_ia_params(batch, info, indirect, ib, ib_extent, out_draws.gpu);
+
+   /* Unroll the index buffer for each draw */
+   const struct pipe_grid_info grid_setup = {
+      .block = {1, 1, 1},
+      .grid = {indirect->draw_count, 1, 1},
+   };
+
+   agx_launch(batch, &grid_setup,
+              ctx->gs_unroll_restart[info->mode][log2_idx_size],
+              PIPE_SHADER_COMPUTE);
+
+   /* Now draw the results without restart */
+   struct pipe_draw_info new_info = *info;
+   new_info.primitive_restart = false;
+   new_info.mode = u_decomposed_prim(info->mode);
+   new_info.index.resource = ctx->heap;
+   new_info.has_user_indices = false;
+
+   struct pipe_draw_indirect_info new_indirect = *indirect;
+   new_indirect.buffer = &out_draws_rsrc.base;
+   new_indirect.offset = out_draws.gpu - out_draws_rsrc.bo->ptr.gpu;
+   new_indirect.stride = 5 * sizeof(uint32_t);
+
+   ctx->base.draw_vbo(&ctx->base, &new_info, drawid_offset, &new_indirect, draw,
+                      1);
+}
+
 static bool
 agx_needs_passthrough_gs(struct agx_context *ctx,
                          const struct pipe_draw_info *info,
@@ -3841,15 +3950,6 @@ agx_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info,
       return;
    }
 
-   bool uses_gs = ctx->stage[PIPE_SHADER_GEOMETRY].shader;
-
-   if (uses_gs && info->primitive_restart) {
-      perf_debug_ctx(ctx, "Emulating primitive restart due to GS");
-      util_draw_vbo_without_prim_restart(pctx, info, drawid_offset, indirect,
-                                         draws);
-      return;
-   }
-
    /* Only the rasterization stream counts */
    if (ctx->active_queries && ctx->prims_generated[0] &&
       !ctx->stage[PIPE_SHADER_GEOMETRY].shader) {
@@ -3859,6 +3959,14 @@ agx_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info,
    }
 
    struct agx_batch *batch = agx_get_batch(ctx);
+
+   if (ctx->stage[PIPE_SHADER_GEOMETRY].shader && info->primitive_restart &&
+       info->index_size) {
+
+      agx_draw_without_restart(batch, info, drawid_offset, indirect, draws);
+      return;
+   }
+
    agx_batch_add_timestamp_query(batch, ctx->time_elapsed);
 
    unsigned idx_size = info->index_size;
@@ -3866,10 +3974,8 @@ agx_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info,
    size_t ib_extent = 0;
 
    if (idx_size) {
-      if (indirect != NULL)
-         ib = agx_index_buffer_rsrc_ptr(batch, info, &ib_extent);
-      else
-         ib = agx_index_buffer_direct_ptr(batch, draws, info, &ib_extent);
+      ib =
+         agx_index_buffer_ptr(batch, info, indirect ? NULL : draws, &ib_extent);
    }
 
 #ifndef NDEBUG
@@ -3905,7 +4011,7 @@ agx_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info,
          .gpu;
 
       batch->uniforms.geometry_params =
-         agx_batch_geometry_params(batch, ib, info, draws, indirect);
+         agx_batch_geometry_params(batch, ib, ib_extent, info, draws, indirect);
   }
 
    struct agx_compiled_shader *vs = ctx->vs;
diff --git a/src/gallium/drivers/asahi/agx_state.h b/src/gallium/drivers/asahi/agx_state.h
index 440734c60e0..bbb916805aa 100644
--- a/src/gallium/drivers/asahi/agx_state.h
+++ b/src/gallium/drivers/asahi/agx_state.h
@@ -510,6 +510,7 @@ struct agx_context {
 
    struct agx_compiled_shader *gs_prefix_sums[16];
    struct agx_compiled_shader *gs_setup_indirect[MESA_PRIM_MAX][2];
+   struct agx_compiled_shader *gs_unroll_restart[MESA_PRIM_MAX][3];
    struct agx_meta_cache meta;
 
    uint32_t syncobj;