asahi: Unroll GS/XFB primitive restart on the GPU
...and fix bugs versus the CPU unroll while we're at it. CPU-based unrolling is invalid in Vulkan, but this slow-as-dogs GPU unroll is ok.

Signed-off-by: Alyssa Rosenzweig <alyssa@rosenzweig.io>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26614>
parent 15957219ad
commit f4a648c607

6 changed files with 293 additions and 44 deletions
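For context, "unrolling" primitive restart means rewriting an index buffer that contains restart sentinels into a restart-free index buffer of independent primitives, so the draw no longer needs restart support. A minimal host-side sketch of the idea for 16-bit triangle strips, illustrative only (the change itself does this in a compute shader via the libagx kernels below):

#include <stddef.h>
#include <stdint.h>

/* Split a triangle-strip index buffer at each restart index and emit the
 * pieces as an independent (restart-free) triangle list. Returns the number
 * of indices written to out. Illustration only, not driver code. */
static size_t
unroll_tri_strip_u16(const uint16_t *in, size_t count, uint16_t restart,
                     uint16_t *out)
{
   size_t out_verts = 0;
   size_t start = 0;

   while (start < count) {
      /* Find the next restart index (or the end of the buffer) */
      size_t end = start;
      while (end < count && in[end] != restart)
         end++;

      /* Decompose the strip [start, end) into triangles */
      for (size_t i = start; i + 2 < end; i++) {
         size_t odd = (i - start) & 1; /* flip winding on odd triangles */
         out[out_verts++] = in[i];
         out[out_verts++] = in[i + 1 + odd];
         out[out_verts++] = in[i + 2 - odd];
      }

      start = end + 1; /* skip the restart index itself */
   }

   return out_verts;
}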
@@ -1189,3 +1189,28 @@ agx_nir_gs_setup_indirect(const nir_shader *libagx, enum mesa_prim prim,
    agx_preprocess_nir(b.shader, libagx, false, &info);
    return b.shader;
 }
+
+nir_shader *
+agx_nir_unroll_restart(const nir_shader *libagx, enum mesa_prim prim,
+                       unsigned index_size_B)
+{
+   nir_builder b = nir_builder_init_simple_shader(
+      MESA_SHADER_COMPUTE, &agx_nir_options, "Primitive restart unroll");
+
+   nir_def *ia = nir_load_input_assembly_buffer_agx(&b);
+   nir_def *draw = nir_channel(&b, nir_load_workgroup_id(&b), 0);
+   nir_def *mode = nir_imm_int(&b, prim);
+
+   if (index_size_B == 1)
+      libagx_unroll_restart_u8(&b, ia, mode, draw);
+   else if (index_size_B == 2)
+      libagx_unroll_restart_u16(&b, ia, mode, draw);
+   else if (index_size_B == 4)
+      libagx_unroll_restart_u32(&b, ia, mode, draw);
+   else
+      unreachable("invalid index size");
+
+   UNUSED struct agx_uncompiled_shader_info info;
+   agx_preprocess_nir(b.shader, libagx, false, &info);
+   return b.shader;
+}
@@ -29,4 +29,8 @@ struct nir_shader *agx_nir_gs_setup_indirect(const struct nir_shader *libagx,
                                              enum mesa_prim prim,
                                              bool multidraw);
 
+struct nir_shader *agx_nir_unroll_restart(const struct nir_shader *libagx,
+                                          enum mesa_prim prim,
+                                          unsigned index_size_B);
+
 #endif
@@ -6,6 +6,12 @@
 
 #include "geometry.h"
 
+static uint
+align(uint x, uint y)
+{
+   return (x + y - 1) & ~(y - 1);
+}
+
 /* TODO: Primitive restart */
 uint
 libagx_vertex_id_for_topology(enum mesa_prim mode, bool flatshade_first,
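The align() helper added above rounds a byte count up to a multiple of a power-of-two alignment. A couple of worked values as a standalone check, with a hypothetical align_up() mirroring it (illustrative, not part of the diff):

#include <assert.h>

/* Round x up to the next multiple of the power-of-two y */
static unsigned
align_up(unsigned x, unsigned y)
{
   return (x + y - 1) & ~(y - 1);
}

int
main(void)
{
   assert(align_up(0, 4) == 0);
   assert(align_up(5, 4) == 8);
   assert(align_up(8, 4) == 8);
   assert(align_up(13, 4) == 16);
   return 0;
}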
@@ -110,6 +116,101 @@ libagx_vertex_id_for_topology(enum mesa_prim mode, bool flatshade_first,
    }
 }
 
+/*
+ * When unrolling the index buffer for a draw, we translate the old indirect
+ * draws to new indirect draws. This routine allocates the new index buffer and
+ * sets up most of the new draw descriptor.
+ */
+static global void *
+setup_unroll_for_draw(global struct agx_ia_state *ia, constant uint *in_draw,
+                      uint draw, enum mesa_prim mode, uint index_size_B)
+{
+   /* Determine an upper bound on the memory required for the index buffer.
+    * Restarts only decrease the unrolled index buffer size, so the maximum size
+    * is the unrolled size when the input has no restarts.
+    */
+   uint max_prims = u_decomposed_prims_for_vertices(mode, in_draw[0]);
+   uint max_verts = max_prims * mesa_vertices_per_prim(mode);
+   uint alloc_size = max_verts * index_size_B;
+
+   /* Allocate memory from the heap for the unrolled index buffer. Use an atomic
+    * since multiple threads may be running to handle multidraw in parallel.
+    */
+   global struct agx_geometry_state *heap = ia->heap;
+   uint old_heap_bottom = atomic_fetch_add(
+      (volatile atomic_uint *)(&heap->heap_bottom), align(alloc_size, 4));
+
+   /* Regardless of the input stride, we use tightly packed output draws */
+   global uint *out = &ia->out_draws[5 * draw];
+
+   /* Setup most of the descriptor. Count will be determined after unroll. */
+   out[1] = in_draw[1];                     /* instance count */
+   out[2] = old_heap_bottom / index_size_B; /* index offset */
+   out[3] = in_draw[3];                     /* index bias */
+   out[4] = in_draw[4];                     /* base instance */
+
+   /* Return the index buffer we allocated */
+   return (global uchar *)heap->heap + (old_heap_bottom * index_size_B);
+}
+
+#define UNROLL(INDEX, suffix) \
+   void libagx_unroll_restart_##suffix(global struct agx_ia_state *ia, \
+                                       enum mesa_prim mode, uint draw) \
+   { \
+      /* For an indirect multidraw, we are dispatched maxDraws times and \
+       * terminate trailing invocations. \
+       */ \
+      if (ia->count && draw >= *(ia->count)) \
+         return; \
+ \
+      constant uint *in_draw = \
+         (constant uint *)(ia->draws + (draw * ia->draw_stride)); \
+ \
+      uint count = in_draw[0]; \
+      constant INDEX *in = (constant INDEX *)ia->index_buffer; \
+ \
+      global INDEX *out = \
+         setup_unroll_for_draw(ia, in_draw, draw, mode, sizeof(INDEX)); \
+ \
+      uint out_prims = 0; \
+      INDEX restart_idx = ia->restart_index; \
+      bool flatshade_first = ia->flatshade_first; \
+      uint in_size_el = ia->index_buffer_size_B / sizeof(INDEX); \
+ \
+      uint needle = 0; \
+      uint per_prim = mesa_vertices_per_prim(mode); \
+      while (needle < count) { \
+         /* Search for next restart or the end */ \
+         uint next_restart = needle; \
+         while ((next_restart < count) && in[next_restart] != restart_idx) \
+            ++next_restart; \
+ \
+         /* Emit up to the next restart */ \
+         uint subcount = next_restart - needle; \
+         uint subprims = u_decomposed_prims_for_vertices(mode, subcount); \
+         for (uint i = 0; i < subprims; ++i) { \
+            for (uint vtx = 0; vtx < per_prim; ++vtx) { \
+               uint id = libagx_vertex_id_for_topology(mode, flatshade_first, \
+                                                       i, vtx, subprims); \
+               uint offset = needle + id; \
+ \
+               out[(out_prims * per_prim) + vtx] = \
+                  offset < in_size_el ? in[offset] : 0; \
+            } \
+ \
+            out_prims++; \
+         } \
+ \
+         needle = next_restart + 1; \
+      } \
+ \
+      ia->out_draws[(5 * draw) + 0] = out_prims * per_prim; \
+   }
+
+UNROLL(uchar, u8)
+UNROLL(ushort, u16)
+UNROLL(uint, u32)
+
 uintptr_t
 libagx_index_buffer(constant struct agx_ia_state *p, uint id,
                     uint index_size)
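For reference, the in_draw[] words read above and the out_draws[] words written by setup_unroll_for_draw() and UNROLL() follow the usual five-word indexed indirect draw layout, which is also why the driver side below allocates and strides these descriptors at 5 * sizeof(uint32_t). A sketch of that layout; the struct name is illustrative, not something the patch defines:

#include <stdint.h>

/* Five-word indexed indirect draw descriptor, stride 5 * sizeof(uint32_t) */
struct indexed_indirect_draw {
   uint32_t index_count;    /* out[0]: written last, after unrolling      */
   uint32_t instance_count; /* out[1]: copied from the input draw         */
   uint32_t first_index;    /* out[2]: element offset into the new buffer */
   uint32_t index_bias;     /* out[3]: base vertex, copied                */
   uint32_t base_instance;  /* out[4]: copied                             */
};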
@@ -193,12 +294,6 @@ libagx_end_primitive(global int *index_buffer, uint total_verts,
    out[verts_in_prim] = -1;
 }
 
-static uint
-align(uint x, uint y)
-{
-   return (x + y - 1) & ~(y - 1);
-}
-
 void
 libagx_build_gs_draw(global struct agx_geometry_params *p, bool indexed,
                      uint vertices, uint primitives, uint output_stride_B)
@@ -41,7 +41,20 @@ struct agx_ia_key {
    bool indirect_multidraw;
 };
 
+/* Packed geometry state buffer */
+struct agx_geometry_state {
+   /* Heap to allocate from, in either direction. By convention, the top is used
+    * for intra-draw allocations and the bottom is used for full-batch
+    * allocations. In the future we could use kernel support to improve this.
+    */
+   GLOBAL(uchar) heap;
+   uint32_t heap_bottom, heap_top, heap_size, padding;
+} PACKED;
+
 struct agx_ia_state {
+   /* Heap to allocate from across draws */
+   GLOBAL(struct agx_geometry_state) heap;
+
    /* Input: index buffer if present. */
    CONST(uchar) index_buffer;
 
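The heap comment above describes a two-ended bump allocator: heap_bottom grows upward for longer-lived allocations such as the unrolled index buffers, while heap_top grows downward for transient intra-draw data. A minimal single-threaded sketch of that convention, under the assumption that both fields are byte offsets from the heap base (the kernel above uses an atomic on heap_bottom instead):

#include <stdint.h>

struct heap {
   uint8_t *base;
   uint32_t bottom; /* grows up: full-batch allocations   */
   uint32_t top;    /* grows down: intra-draw allocations */
   uint32_t size;
};

static void *
alloc_bottom(struct heap *h, uint32_t bytes)
{
   void *p = h->base + h->bottom;
   h->bottom += bytes; /* caller must keep bottom <= top */
   return p;
}

static void *
alloc_top(struct heap *h, uint32_t bytes)
{
   h->top -= bytes; /* caller must keep top >= bottom */
   return h->base + h->top;
}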
@@ -57,21 +70,26 @@ struct agx_ia_state {
     */
    GLOBAL(uint) prefix_sums;
 
+   /* When unrolling primitive restart, output draw descriptors */
+   GLOBAL(uint) out_draws;
+
+   /* Primitive restart index, if unrolling */
+   uint32_t restart_index;
+
+   /* Input index buffer size in bytes, if unrolling */
+   uint32_t index_buffer_size_B;
+
    /* Stride for the draw descriptor array */
    uint32_t draw_stride;
 
-   /* The index size (1, 2, 4) or 0 if drawing without an index buffer. */
-   uint8_t index_size_B;
-} PACKED;
-
-/* Packed geometry state buffer */
-struct agx_geometry_state {
-   /* Heap to allocate from, in either direction. By convention, the top is used
-    * for intra-draw allocations and the bottom is used for full-batch
-    * allocations. In the future we could use kernel support to improve this.
+   /* When unrolling primitive restart, use first vertex as the provoking vertex
+    * for flat shading. We could stick this in the key, but meh, you're already
+    * hosed for perf on the unroll path.
     */
-   GLOBAL(uchar) heap;
-   uint32_t heap_bottom, heap_top, heap_size, padding;
+   uint32_t flatshade_first;
+
+   /* The index size (1, 2, 4) or 0 if drawing without an index buffer. */
+   uint32_t index_size_B;
 } PACKED;
 
 struct agx_geometry_params {
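These structs are shared between the C driver and the OpenCL C libagx kernels, which is what the GLOBAL()/CONST()/PACKED macros are for: on the GPU side they become address-space-qualified pointers, on the host side plain 64-bit GPU addresses with a matching packed layout. The exact definitions live elsewhere in the tree; the following is only an assumed sketch of the pattern, not the verbatim macros:

/* Assumed sketch of the shared-struct convention; not the libagx definitions */
#ifdef __OPENCL_VERSION__
#define GLOBAL(type_) global type_ *
#define CONST(type_)  constant type_ *
#define PACKED        __attribute__((packed, aligned(4)))
#else
#include <stdint.h>
#define GLOBAL(type_) uint64_t /* GPU virtual address */
#define CONST(type_)  uint64_t
#define PACKED        __attribute__((packed))
#endif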
@@ -39,6 +39,7 @@
 #include "util/macros.h"
 #include "util/u_dump.h"
 #include "util/u_inlines.h"
 #include "util/u_math.h"
 #include "util/u_memory.h"
+#include "util/u_prim.h"
 #include "util/u_resource.h"
@@ -3311,6 +3312,17 @@ agx_index_buffer_direct_ptr(struct agx_batch *batch,
    }
 }
 
+static uint64_t
+agx_index_buffer_ptr(struct agx_batch *batch, const struct pipe_draw_info *info,
+                     const struct pipe_draw_start_count_bias *draw,
+                     size_t *extent)
+{
+   if (draw)
+      return agx_index_buffer_direct_ptr(batch, draw, info, extent);
+   else
+      return agx_index_buffer_rsrc_ptr(batch, info, extent);
+}
+
 static bool
 agx_scissor_culls_everything(struct agx_context *ctx)
 {
@@ -3399,16 +3411,20 @@ agx_batch_geometry_state(struct agx_batch *batch)
    return batch->geometry_state;
 }
 
-static uint64_t
-agx_batch_geometry_params(struct agx_batch *batch, uint64_t input_index_buffer,
-                          const struct pipe_draw_info *info,
-                          const struct pipe_draw_start_count_bias *draw,
-                          const struct pipe_draw_indirect_info *indirect)
+static void
+agx_upload_ia_params(struct agx_batch *batch, const struct pipe_draw_info *info,
+                     const struct pipe_draw_indirect_info *indirect,
+                     uint64_t input_index_buffer, size_t index_buffer_size_B,
+                     uint64_t unroll_output)
 {
-   /* XXX move me */
    struct agx_ia_state ia = {
+      .heap = agx_batch_geometry_state(batch),
       .index_buffer = input_index_buffer,
       .index_size_B = info->index_size,
+      .out_draws = unroll_output,
+      .restart_index = info->restart_index,
+      .index_buffer_size_B = index_buffer_size_B,
+      .flatshade_first = batch->ctx->rast->base.flatshade_first,
    };
 
    if (indirect) {
@@ -3425,13 +3441,27 @@ agx_batch_geometry_params(struct agx_batch *batch, uint64_t input_index_buffer,
       ia.count = rsrc->bo->ptr.gpu + indirect->indirect_draw_count_offset;
       ia.draw_stride = indirect->stride;
 
-      size_t max_sum_size = sizeof(uint32_t) * indirect->draw_count;
-      ia.prefix_sums =
-         agx_pool_alloc_aligned(&batch->pool, max_sum_size, 4).gpu;
+      /* MDI requires prefix sums, but not for our current unroll path */
+      if (!unroll_output) {
+         size_t max_sum_size = sizeof(uint32_t) * indirect->draw_count;
+         ia.prefix_sums =
+            agx_pool_alloc_aligned(&batch->pool, max_sum_size, 4).gpu;
+      }
    }
 
    batch->uniforms.input_assembly =
      agx_pool_upload_aligned(&batch->pool, &ia, sizeof(ia), 8);
+}
+
+static uint64_t
+agx_batch_geometry_params(struct agx_batch *batch, uint64_t input_index_buffer,
+                          size_t index_buffer_size_B,
+                          const struct pipe_draw_info *info,
+                          const struct pipe_draw_start_count_bias *draw,
+                          const struct pipe_draw_indirect_info *indirect)
+{
+   agx_upload_ia_params(batch, info, indirect, input_index_buffer,
+                        index_buffer_size_B, 0);
 
    struct agx_geometry_params params = {
       .state = agx_batch_geometry_state(batch),
@@ -3511,10 +3541,7 @@ agx_launch_gs(struct agx_batch *batch, const struct pipe_draw_info *info,
       batch->cdm = agx_encoder_allocate(batch, dev);
    }
 
-   if (info->primitive_restart) {
-      fprintf(stderr, "Mode: %s\n", util_str_prim_mode(info->mode, true));
-      unreachable("TODO: Primitive restart with GS");
-   }
+   assert(!info->primitive_restart && "should have been lowered");
 
    struct pipe_grid_info grid = {.block = {1, 1, 1}};
    struct agx_resource grid_indirect_rsrc = {.bo = batch->geom_params_bo};
@@ -3632,6 +3659,88 @@ agx_launch_gs(struct agx_batch *batch, const struct pipe_draw_info *info,
    memcpy(ctx->prims_generated, prim_queries, sizeof(prim_queries));
 }
 
+static void
+agx_draw_without_restart(struct agx_batch *batch,
+                         const struct pipe_draw_info *info,
+                         unsigned drawid_offset,
+                         const struct pipe_draw_indirect_info *indirect,
+                         const struct pipe_draw_start_count_bias *draw)
+{
+   struct agx_context *ctx = batch->ctx;
+   struct agx_device *dev = agx_device(ctx->base.screen);
+
+   perf_debug(dev, "Unrolling primitive restart due to GS/XFB");
+
+   agx_batch_init_state(batch);
+
+   size_t ib_extent = 0;
+   uint64_t ib = agx_index_buffer_ptr(batch, info, draw, &ib_extent);
+
+   /* The rest of this function handles only the general case of indirect
+    * multidraws, so synthesize an indexed indirect draw now if we need one for
+    * a direct draw (necessarily only one). This unifies the code paths.
+    */
+   struct pipe_draw_indirect_info indirect_synthesized = {.draw_count = 1};
+
+   if (!indirect) {
+      uint32_t desc[5] = {draw->count, info->instance_count, draw->start,
+                          draw->index_bias, info->start_instance};
+
+      u_upload_data(ctx->base.const_uploader, 0, sizeof(desc), 4, &desc,
+                    &indirect_synthesized.offset, &indirect_synthesized.buffer);
+
+      indirect = &indirect_synthesized;
+   }
+
+   /* Next, we unroll the index buffer used by the indirect draw */
+   uint8_t log2_idx_size = util_logbase2(info->index_size);
+   assert(log2_idx_size <= 2);
+
+   if (!batch->cdm.bo)
+      batch->cdm = agx_encoder_allocate(batch, dev);
+
+   if (!ctx->gs_unroll_restart[info->mode][log2_idx_size]) {
+      struct agx_shader_key base_key = {0};
+
+      ctx->gs_unroll_restart[info->mode][log2_idx_size] = agx_compile_nir(
+         dev, agx_nir_unroll_restart(dev->libagx, info->mode, info->index_size),
+         &base_key, NULL);
+   }
+
+   /* Allocate output indirect draw descriptors. This is exact. */
+   struct agx_resource out_draws_rsrc = {0};
+   struct agx_ptr out_draws = agx_pool_alloc_aligned_with_bo(
+      &batch->pool, 5 * sizeof(uint32_t) * indirect->draw_count, 4,
+      &out_draws_rsrc.bo);
+
+   agx_upload_ia_params(batch, info, indirect, ib, ib_extent, out_draws.gpu);
+
+   /* Unroll the index buffer for each draw */
+   const struct pipe_grid_info grid_setup = {
+      .block = {1, 1, 1},
+      .grid = {indirect->draw_count, 1, 1},
+   };
+
+   agx_launch(batch, &grid_setup,
+              ctx->gs_unroll_restart[info->mode][log2_idx_size],
+              PIPE_SHADER_COMPUTE);
+
+   /* Now draw the results without restart */
+   struct pipe_draw_info new_info = *info;
+   new_info.primitive_restart = false;
+   new_info.mode = u_decomposed_prim(info->mode);
+   new_info.index.resource = ctx->heap;
+   new_info.has_user_indices = false;
+
+   struct pipe_draw_indirect_info new_indirect = *indirect;
+   new_indirect.buffer = &out_draws_rsrc.base;
+   new_indirect.offset = out_draws.gpu - out_draws_rsrc.bo->ptr.gpu;
+   new_indirect.stride = 5 * sizeof(uint32_t);
+
+   ctx->base.draw_vbo(&ctx->base, &new_info, drawid_offset, &new_indirect, draw,
+                      1);
+}
+
 static bool
 agx_needs_passthrough_gs(struct agx_context *ctx,
                          const struct pipe_draw_info *info,
@@ -3841,15 +3950,6 @@ agx_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info,
       return;
    }
 
-   bool uses_gs = ctx->stage[PIPE_SHADER_GEOMETRY].shader;
-
-   if (uses_gs && info->primitive_restart) {
-      perf_debug_ctx(ctx, "Emulating primitive restart due to GS");
-      util_draw_vbo_without_prim_restart(pctx, info, drawid_offset, indirect,
-                                         draws);
-      return;
-   }
-
    /* Only the rasterization stream counts */
    if (ctx->active_queries && ctx->prims_generated[0] &&
       !ctx->stage[PIPE_SHADER_GEOMETRY].shader) {
@@ -3859,6 +3959,14 @@ agx_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info,
    }
 
    struct agx_batch *batch = agx_get_batch(ctx);
+
+   if (ctx->stage[PIPE_SHADER_GEOMETRY].shader && info->primitive_restart &&
+       info->index_size) {
+
+      agx_draw_without_restart(batch, info, drawid_offset, indirect, draws);
+      return;
+   }
+
    agx_batch_add_timestamp_query(batch, ctx->time_elapsed);
 
    unsigned idx_size = info->index_size;
@@ -3866,10 +3974,8 @@ agx_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info,
    size_t ib_extent = 0;
 
    if (idx_size) {
-      if (indirect != NULL)
-         ib = agx_index_buffer_rsrc_ptr(batch, info, &ib_extent);
-      else
-         ib = agx_index_buffer_direct_ptr(batch, draws, info, &ib_extent);
+      ib =
+         agx_index_buffer_ptr(batch, info, indirect ? NULL : draws, &ib_extent);
    }
 
 #ifndef NDEBUG
@@ -3905,7 +4011,7 @@ agx_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info,
          .gpu;
 
       batch->uniforms.geometry_params =
-         agx_batch_geometry_params(batch, ib, info, draws, indirect);
+         agx_batch_geometry_params(batch, ib, ib_extent, info, draws, indirect);
    }
 
    struct agx_compiled_shader *vs = ctx->vs;
@@ -510,6 +510,7 @@ struct agx_context {
 
    struct agx_compiled_shader *gs_prefix_sums[16];
    struct agx_compiled_shader *gs_setup_indirect[MESA_PRIM_MAX][2];
+   struct agx_compiled_shader *gs_unroll_restart[MESA_PRIM_MAX][3];
    struct agx_meta_cache meta;
 
    uint32_t syncobj;