asahi/gs: only prefix sum with XFB

otherwise, an atomic suffices for the count shader.

Signed-off-by: Alyssa Rosenzweig <alyssa@rosenzweig.io>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/33901>
This commit is contained in:
Alyssa Rosenzweig 2025-01-27 15:54:24 -05:00 committed by Marge Bot
parent 184416a5e8
commit 40aa260209
5 changed files with 57 additions and 12 deletions

View file

@ -44,6 +44,7 @@ struct lower_gs_state {
int count_index[MAX_VERTEX_STREAMS];
bool rasterizer_discard;
bool prefix_summing;
};
/* Helpers for loading from the geometry state buffer */
@ -304,11 +305,20 @@ write_xfb_counts(nir_builder *b, nir_intrinsic_instr *intr,
struct lower_gs_state *state)
{
/* Store each required counter */
nir_def *addr = load_xfb_count_address(b, state, calc_unrolled_id(b),
nir_intrinsic_stream_id(intr));
nir_def *id =
state->prefix_summing ? calc_unrolled_id(b) : nir_imm_int(b, 0);
if (addr)
nir_def *addr =
load_xfb_count_address(b, state, id, nir_intrinsic_stream_id(intr));
if (!addr)
return;
if (state->prefix_summing) {
nir_store_global(b, addr, 4, intr->src[2].ssa, nir_component_mask(1));
} else {
nir_global_atomic(b, 32, addr, intr->src[2].ssa,
.atomic_op = nir_atomic_op_iadd);
}
}
static bool
@ -700,7 +710,7 @@ previous_xfb_primitives(nir_builder *b, struct lower_gs_state *state,
* we can calculate the base.
*/
return nir_imul_imm(b, unrolled_id, static_count);
} else {
} else if (state->prefix_summing) {
/* Otherwise, we need to load from the prefix sum buffer. Note that the
* sums are inclusive, so index 0 is nonzero. This requires a little
* fixup here. We use a saturating unsigned subtraction so we don't read
@ -713,6 +723,12 @@ previous_xfb_primitives(nir_builder *b, struct lower_gs_state *state,
return nir_bcsel(b, nir_ieq_imm(b, unrolled_id, 0), nir_imm_int(b, 0),
nir_load_global_constant(b, addr, 4, 1, 32));
} else {
/* If we aren't prefix summing, the count is the only element */
nir_def *addr =
load_xfb_count_address(b, state, nir_imm_int(b, 0), stream);
return nir_load_global_constant(b, addr, 4, 1, 32);
}
}
@ -1282,6 +1298,9 @@ agx_nir_lower_gs(nir_shader *gs, bool rasterizer_discard, nir_shader **gs_count,
gs->info.gs.output_primitive, gs->info.gs.vertices_out,
static_vertices[0], static_primitives[0]);
gs_state.prefix_summing =
gs_state.count_stride_el > 0 && gs->xfb_info != NULL;
bool side_effects_for_rast = false;
*gs_copy = agx_nir_create_gs_rast_shader(gs, &side_effects_for_rast);
@ -1387,6 +1406,7 @@ agx_nir_lower_gs(nir_shader *gs, bool rasterizer_discard, nir_shader **gs_count,
*info = (struct agx_gs_info){
.mode = gs->info.gs.output_primitive,
.count_words = gs_state.count_stride_el,
.prefix_sum = gs_state.prefix_summing,
};
return true;

View file

@ -36,6 +36,9 @@ struct agx_gs_info {
/* Number of words per primitive in the count buffer */
unsigned count_words;
/* Whether a prefix sum is required on the count outputs */
bool prefix_sum;
};
bool agx_nir_lower_gs(struct nir_shader *gs, bool rasterizer_discard,

View file

@ -632,7 +632,8 @@ libagx_gs_setup_indirect(
uint64_t vs_outputs /* Vertex (TES) output mask */,
uint32_t index_size_B /* 0 if no index buffer */,
uint32_t index_buffer_range_el,
uint32_t prim /* Input primitive type, enum mesa_prim */)
uint32_t prim /* Input primitive type, enum mesa_prim */,
int is_prefix_summing)
{
/* Determine the (primitives, instances) grid size. */
uint vertex_count = draw[0];
@ -672,9 +673,11 @@ libagx_gs_setup_indirect(
uint vertex_buffer_size =
libagx_tcs_in_size(vertex_count * instance_count, vs_outputs);
p->count_buffer = (global uint *)(state->heap + state->heap_bottom);
state->heap_bottom +=
align(p->input_primitives * p->count_buffer_stride, 16);
if (is_prefix_summing) {
p->count_buffer = (global uint *)(state->heap + state->heap_bottom);
state->heap_bottom +=
align(p->input_primitives * p->count_buffer_stride, 16);
}
p->input_buffer = (uintptr_t)(state->heap + state->heap_bottom);
*vertex_buffer = p->input_buffer;

View file

@ -1165,6 +1165,12 @@ hk_upload_geometry_params(struct hk_cmd_buffer *cmd, struct agx_draw draw)
*/
params.count_buffer_stride = count->info.gs.count_words * 4;
if (!count->info.gs.prefix_sum && params.count_buffer_stride) {
struct agx_ptr T = hk_pool_alloc(cmd, 16, 4);
memset(T.cpu, 0, 16);
params.count_buffer = T.gpu;
}
if (indirect) {
params.vs_grid[2] = params.gs_grid[2] = 1;
} else {
@ -1177,7 +1183,7 @@ hk_upload_geometry_params(struct hk_cmd_buffer *cmd, struct agx_draw draw)
params.input_primitives = params.gs_grid[0] * instances;
unsigned size = params.input_primitives * params.count_buffer_stride;
if (size) {
if (count->info.gs.prefix_sum && size) {
params.count_buffer = hk_pool_alloc(cmd, size, 4).gpu;
}
}
@ -1433,6 +1439,7 @@ hk_launch_gs_prerast(struct hk_cmd_buffer *cmd, struct hk_cs *cs,
.p = desc->root.draw.geometry_params,
.vs_outputs = vs->b.info.outputs,
.prim = mode,
.is_prefix_summing = count->info.gs.prefix_sum,
};
if (cmd->state.gfx.shaders[MESA_SHADER_TESS_EVAL]) {
@ -1476,8 +1483,10 @@ hk_launch_gs_prerast(struct hk_cmd_buffer *cmd, struct hk_cs *cs,
hk_dispatch_with_local_size(cmd, cs, count, grid_gs,
agx_workgroup(1, 1, 1));
libagx_prefix_sum_geom(cmd, agx_1d(1024 * count_words),
AGX_BARRIER_ALL | AGX_PREGFX, geometry_params);
if (count->info.gs.prefix_sum) {
libagx_prefix_sum_geom(cmd, agx_1d(1024 * count_words),
AGX_BARRIER_ALL | AGX_PREGFX, geometry_params);
}
}
/* Pre-GS shader */

View file

@ -4039,6 +4039,13 @@ agx_batch_geometry_params(struct agx_batch *batch, uint64_t input_index_buffer,
params.input_mask = batch->uniforms.vertex_outputs;
params.count_buffer_stride = batch->ctx->gs->gs.count_words * 4;
bool prefix_sum = batch->ctx->gs->gs.prefix_sum;
if (!prefix_sum && params.count_buffer_stride) {
struct agx_ptr T = agx_pool_alloc_aligned(&batch->pool, 16, 4);
memset(T.cpu, 0, 16);
params.count_buffer = T.gpu;
}
if (indirect) {
batch->uniforms.vertex_output_buffer_ptr =
agx_pool_alloc_aligned(&batch->pool, 8, 8).gpu;
@ -4057,7 +4064,7 @@ agx_batch_geometry_params(struct agx_batch *batch, uint64_t input_index_buffer,
batch->uniforms.vertex_outputs);
unsigned size = params.input_primitives * params.count_buffer_stride;
if (size) {
if (size && prefix_sum) {
params.count_buffer =
agx_pool_alloc_aligned(&batch->pool, size, 4).gpu;
}
@ -4141,6 +4148,7 @@ agx_launch_gs_prerast(struct agx_batch *batch,
.vs_outputs = batch->uniforms.vertex_outputs,
.index_size_B = info->index_size,
.prim = info->mode,
.is_prefix_summing = gs->gs.prefix_sum,
};
libagx_gs_setup_indirect_struct(batch, agx_1d(1), AGX_BARRIER_ALL, gsi);
@ -4168,7 +4176,9 @@ agx_launch_gs_prerast(struct agx_batch *batch,
perf_debug(dev, "Geometry shader count");
agx_launch(batch, grid_gs, wg, gs->gs_count, NULL, PIPE_SHADER_GEOMETRY,
0);
}
if (gs->gs.prefix_sum) {
libagx_prefix_sum_geom(batch, agx_1d(1024 * gs->gs.count_words),
AGX_BARRIER_ALL, gp);
}