agx/nir_lower_gs: rework gs rast shape handling

rather than a bunch of subtle booleans telling the driver how to invoke the GS
rast shader, collect everything into a common enum, and provide (CL-safe)
helpers to do the appropriate calculations instead of duplicating them across
GL/VK/indirects.

this fixes suboptimal handling of instancing with list topologies.
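
as an illustration (a sketch, not code from this commit): instead of branching
on indexed/instanced/dynamic_topology, every consumer can now derive the rast
draw dimensions from the shape alone, e.g.

   unsigned verts = agx_gs_rast_vertices(info->shape, info->max_indices,
                                         input_primitives, instance_count);
   unsigned insts = agx_gs_rast_instances(info->shape, info->max_indices,
                                          input_primitives, instance_count);

where input_primitives and instance_count come from whatever draw the
GL/VK/indirect path is handling (verts/insts are hypothetical local names).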

Signed-off-by: Alyssa Rosenzweig <alyssa@rosenzweig.io>
Reviewed-by: Mary Guillemard <mary.guillemard@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/34661>
Alyssa Rosenzweig 2025-04-22 12:06:38 -04:00 committed by Marge Bot
parent 2a0314250b
commit 5640266eb3
6 changed files with 185 additions and 119 deletions

View file

@@ -615,45 +615,45 @@ agx_nir_create_gs_rast_shader(const nir_shader *gs, bool *side_effects_for_rast,
    if (shader->info.gs.output_primitive != MESA_PRIM_POINTS)
       shader->info.outputs_written &= ~VARYING_BIT_PSIZ;
 
-   nir_def *output_id, *unrolled;
-   if (state->info->instanced) {
-      /* vertex ID = ID within the primitive, instance ID = unrolled prim ID */
-      output_id = nir_load_vertex_id(b);
-      unrolled = nir_load_instance_id(b);
-   } else {
-      /* vertex ID = unrolled (see calc_unrolled_index_id), no instancing */
-      nir_def *raw_id = nir_load_vertex_id(b);
-      unsigned stride = state->info->indexed
-                           ? output_vertex_id_pot_stride(gs)
-                           : MAX2(state->info->max_indices, 1);
+   nir_def *raw_vertex_id = nir_load_vertex_id(b);
+   struct lower_gs_rast_state rs = {.raw_instance_id = nir_load_instance_id(b)};
-      output_id = nir_umod_imm(b, raw_id, stride);
-      unrolled = nir_udiv_imm(b, raw_id, stride);
-   }
+   switch (state->info->shape) {
+   case AGX_GS_SHAPE_DYNAMIC_INDEXED: {
+      unsigned stride = output_vertex_id_pot_stride(gs);
+
       /* If we are indexed, we know indices are sparse and rounded up to powers
        * of two, so we can just shift & mask to pick apart. Otherwise, we fall
        * back on a slower integer division.
        */
-   nir_def *instance_id, *primitive_id;
-   if (state->info->indexed) {
+      nir_def *unrolled = nir_udiv_imm(b, raw_vertex_id, stride);
       nir_def *primitives_log2 = load_geometry_param(b, primitives_log2);
-      instance_id = nir_ushr(b, unrolled, primitives_log2);
-      primitive_id = nir_iand(
-         b, unrolled,
-         nir_iadd_imm(b, nir_ishl(b, nir_imm_int(b, 1), primitives_log2), -1));
-   } else {
-      nir_def *primitives = load_geometry_param(b, gs_grid[0]);
-      instance_id = nir_udiv(b, unrolled, primitives);
-      primitive_id = nir_umod(b, unrolled, primitives);
+      nir_def *bit = nir_ishl(b, nir_imm_int(b, 1), primitives_log2);
+
+      rs.output_id = nir_umod_imm(b, raw_vertex_id, stride);
+      rs.instance_id = nir_ushr(b, unrolled, primitives_log2);
+      rs.primitive_id = nir_iand(b, unrolled, nir_iadd_imm(b, bit, -1));
+      break;
    }
 
-   struct lower_gs_rast_state rast_state = {
-      .raw_instance_id = unrolled,
-      .instance_id = instance_id,
-      .primitive_id = primitive_id,
-      .output_id = output_id,
-   };
+   case AGX_GS_SHAPE_STATIC_INDEXED:
+   case AGX_GS_SHAPE_STATIC_PER_PRIM: {
+      nir_def *stride = load_geometry_param(b, gs_grid[0]);
+
+      rs.output_id = raw_vertex_id;
+      rs.instance_id = nir_udiv(b, rs.raw_instance_id, stride);
+      rs.primitive_id = nir_umod(b, rs.raw_instance_id, stride);
+      break;
+   }
+
+   case AGX_GS_SHAPE_STATIC_PER_INSTANCE: {
+      unsigned stride = MAX2(state->info->max_indices, 1);
+
+      rs.output_id = nir_umod_imm(b, raw_vertex_id, stride);
+      rs.primitive_id = nir_udiv_imm(b, raw_vertex_id, stride);
+      rs.instance_id = rs.raw_instance_id;
+      break;
+   }
+
+   default:
+      unreachable("invalid shape");
+   }
 
    u_foreach_bit64(slot, shader->info.outputs_written) {
       const char *slot_name =
@@ -664,24 +664,24 @@ agx_nir_create_gs_rast_shader(const nir_shader *gs, bool *side_effects_for_rast,
          (slot == VARYING_SLOT_VIEWPORT);
       unsigned comps = scalar ? 1 : 4;
 
-      rast_state.outputs.outputs[slot] = nir_variable_create(
+      rs.outputs.outputs[slot] = nir_variable_create(
          shader, nir_var_shader_temp, glsl_vector_type(GLSL_TYPE_UINT, comps),
         ralloc_asprintf(shader, "%s-temp", slot_name));
 
-      rast_state.selected.outputs[slot] = nir_variable_create(
+      rs.selected.outputs[slot] = nir_variable_create(
          shader, nir_var_shader_temp, glsl_vector_type(GLSL_TYPE_UINT, comps),
         ralloc_asprintf(shader, "%s-selected", slot_name));
    }
 
    nir_shader_intrinsics_pass(shader, lower_to_gs_rast,
-                              nir_metadata_control_flow, &rast_state);
+                              nir_metadata_control_flow, &rs);
 
    b->cursor = nir_after_impl(b->impl);
 
    /* Forward each selected output to the rasterizer */
    u_foreach_bit64(slot, shader->info.outputs_written) {
-      assert(rast_state.selected.outputs[slot] != NULL);
-      nir_def *value = nir_load_var(b, rast_state.selected.outputs[slot]);
+      assert(rs.selected.outputs[slot] != NULL);
+      nir_def *value = nir_load_var(b, rs.selected.outputs[slot]);
 
       /* We set NIR_COMPACT_ARRAYS so clip/cull distance needs to come all in
        * DIST0. Undo the offset if we need to.
@@ -909,7 +909,7 @@ lower_gs_instr(nir_builder *b, nir_intrinsic_instr *intr, void *state)
    switch (intr->intrinsic) {
    case nir_intrinsic_set_vertex_and_primitive_count: {
-      if (!state_->info->dynamic_topology)
+      if (state_->info->shape != AGX_GS_SHAPE_DYNAMIC_INDEXED)
          break;
 
       /* Points write their index buffer here, other primitives write on end. We
@@ -930,8 +930,7 @@ lower_gs_instr(nir_builder *b, nir_intrinsic_instr *intr, void *state)
    }
 
    case nir_intrinsic_end_primitive_with_counter: {
       /* If the topology is static, we use the static index buffer instead. */
-      if (!state_->info->dynamic_topology)
+      if (state_->info->shape != AGX_GS_SHAPE_DYNAMIC_INDEXED)
          break;
 
       unsigned min = nir_verts_in_output_prim(b->shader);
@@ -1242,7 +1241,7 @@ evaluate_topology(nir_builder *b, nir_intrinsic_instr *intr, void *data)
     * other stuff).
     */
    if (intr->instr.block != nir_start_block(b->impl)) {
-      info->dynamic_topology = true;
+      info->shape = AGX_GS_SHAPE_DYNAMIC_INDEXED;
       return false;
    }
@@ -1250,7 +1249,7 @@
    if (!nir_src_is_const(intr->src[0]) || !nir_src_is_const(intr->src[1]) ||
        !nir_src_is_const(intr->src[2])) {
-      info->dynamic_topology = true;
+      info->shape = AGX_GS_SHAPE_DYNAMIC_INDEXED;
       return false;
    }
@@ -1289,7 +1288,7 @@ match_list_topology(struct agx_gs_info *info, uint32_t count)
    }
 
    /* If we match, rewrite the topology and drop indexing */
-   info->indexed = false;
+   info->shape = AGX_GS_SHAPE_STATIC_PER_INSTANCE;
    info->mode = u_decomposed_prim(info->mode);
    info->max_indices = (info->max_indices / count_with_restart) * count;
    return true;
@@ -1327,12 +1326,12 @@ static void
 optimize_static_topology(struct agx_gs_info *info, nir_shader *gs)
 {
    nir_shader_intrinsics_pass(gs, evaluate_topology, nir_metadata_all, info);
-   if (info->dynamic_topology)
+   if (info->shape == AGX_GS_SHAPE_DYNAMIC_INDEXED)
       return;
 
-   /* Points are always lists, we never have restarts/instancing */
+   /* Points are always lists */
    if (gs->info.gs.output_primitive == MESA_PRIM_POINTS) {
-      info->indexed = false;
+      info->shape = AGX_GS_SHAPE_STATIC_PER_INSTANCE;
       return;
    }
@@ -1341,14 +1340,14 @@ optimize_static_topology(struct agx_gs_info *info, nir_shader *gs)
    if (match_list_topology(info, count))
       return;
 
-   /* Because we're instancing, we can always drop the trailing restart index */
-   info->instanced = true;
+   /* Instancing means we can always drop the trailing restart index */
    info->max_indices--;
 
    /* Try to pattern match a strip topology */
    if (is_strip_topology(info->topology, info->max_indices)) {
-      info->indexed = false;
-      return;
+      info->shape = AGX_GS_SHAPE_STATIC_PER_PRIM;
+   } else {
+      info->shape = AGX_GS_SHAPE_STATIC_INDEXED;
    }
 }
@@ -1433,7 +1432,7 @@ agx_nir_lower_gs(nir_shader *gs, bool rasterizer_discard, nir_shader **gs_count,
    *info = (struct agx_gs_info){
       .mode = gs->info.gs.output_primitive,
       .xfb = gs->xfb_info != NULL,
-      .indexed = true,
+      .shape = -1,
    };
 
    int static_vertices[4] = {0}, static_primitives[4] = {0};
@@ -1458,7 +1457,7 @@ agx_nir_lower_gs(nir_shader *gs, bool rasterizer_discard, nir_shader **gs_count,
    if (static_vertices[0] >= 0 && static_primitives[0] >= 0) {
       optimize_static_topology(info, gs);
    } else {
-      info->dynamic_topology = true;
+      info->shape = AGX_GS_SHAPE_DYNAMIC_INDEXED;
    }
 
    bool side_effects_for_rast = false;
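
(illustrative sketch, not part of the diff: the AGX_GS_SHAPE_DYNAMIC_INDEXED
decode built above, written as plain C. decode_dynamic_indexed is a
hypothetical helper; it assumes stride = output_vertex_id_pot_stride(gs) is a
power of two and primitives_log2 is the geometry param loaded above.)

   static inline void
   decode_dynamic_indexed(uint32_t raw_vertex_id, uint32_t stride,
                          uint32_t primitives_log2, uint32_t *output_id,
                          uint32_t *primitive_id, uint32_t *instance_id)
   {
      /* umod/udiv by the power-of-two stride, then shift & mask, mirroring
       * the nir_umod_imm/nir_udiv_imm/nir_ushr/nir_iand chain above */
      uint32_t unrolled = raw_vertex_id / stride;
      *output_id = raw_vertex_id % stride;
      *instance_id = unrolled >> primitives_log2;
      *primitive_id = unrolled & ((1u << primitives_log2) - 1);
   }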

View file

@@ -7,6 +7,7 @@
 #include <stdbool.h>
 #include <stdint.h>
+#include "libagx/geometry.h"
 #include "nir.h"
 #include "shader_enums.h"
@@ -37,16 +38,10 @@ struct agx_gs_info {
    /* Whether a prefix sum is required on the count outputs. Implies xfb */
    bool prefix_sum;
 
-   /* Whether we need to dynamically allocate an index buffer. */
-   bool dynamic_topology;
+   /* Shape of the rasterization draw, named by the instance ID */
+   enum agx_gs_shape shape;
 
-   /* Whether the topology requires an index buffer */
-   bool indexed;
-
-   /* Whether the topology requires hardware instancing */
-   bool instanced;
-
-   /* Static topology used if dynamic_topology is false. */
+   /* Static topology used if shape = AGX_GS_SHAPE_STATIC_INDEXED */
    uint32_t topology[384];
 };

View file

@@ -583,8 +583,7 @@ libagx_gs_setup_indirect(
    uint32_t index_size_B /* 0 if no index buffer */,
    uint32_t index_buffer_range_el,
    uint32_t prim /* Input primitive type, enum mesa_prim */,
-   int is_prefix_summing, uint indices_per_in_prim, int dynamic_topology,
-   int instanced)
+   int is_prefix_summing, uint max_indices, enum agx_gs_shape shape)
 {
    /* Determine the (primitives, instances) grid size. */
    uint vertex_count = draw[0];
@@ -637,21 +636,21 @@
    /* Allocate the index buffer and write the draw consuming it */
    global VkDrawIndexedIndirectCommand *cmd = (global void *)p->indirect_desc;
 
-   uint count = (instanced ? 1 : p->input_primitives) * indices_per_in_prim;
-   uint index_buffer_offset_B = 0;
-
-   if (dynamic_topology) {
-      index_buffer_offset_B = agx_heap_alloc_nonatomic_offs(state, count * 4);
-      p->output_index_buffer =
-         (global uint *)(state->heap + index_buffer_offset_B);
-   }
-
    *cmd = (VkDrawIndexedIndirectCommand){
-      .indexCount = count,
-      .instanceCount = instanced ? p->input_primitives : 1,
-      .firstIndex = index_buffer_offset_B / 4,
+      .indexCount = agx_gs_rast_vertices(shape, max_indices, prim_per_instance,
+                                         instance_count),
+      .instanceCount = agx_gs_rast_instances(shape, max_indices,
+                                             prim_per_instance, instance_count),
    };
+
+   if (shape == AGX_GS_SHAPE_DYNAMIC_INDEXED) {
+      cmd->firstIndex =
+         agx_heap_alloc_nonatomic_offs(state, cmd->indexCount * 4) / 4;
+
+      p->output_index_buffer =
+         (global uint *)(state->heap + (cmd->firstIndex * 4));
+   }
 }
 
 /*
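
(worked example with hypothetical numbers: for shape = AGX_GS_SHAPE_DYNAMIC_INDEXED
with max_indices = 6, prim_per_instance = 8, instance_count = 3, the emitted
command is indexCount = 6 * 8 * 3 = 144 and instanceCount = 1; firstIndex then
points at a fresh 144 * 4 byte heap allocation, which the GS fills through
p->output_index_buffer. For the static shapes, firstIndex stays 0 and no heap
allocation happens.)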

View file

@@ -16,6 +16,81 @@
 #define MAX_SO_BUFFERS 4
 #define MAX_VERTEX_STREAMS 4
 
+enum agx_gs_shape {
+   /* Indexed, where indices are encoded as:
+    *
+    *    round_to_pot(max_indices) * round_to_pot(input_primitives) *
+    *    instance_count
+    *
+    * invoked for max_indices * input_primitives * instance_count indices.
+    *
+    * This is used with any dynamic topology. No hardware instancing is used.
+    */
+   AGX_GS_SHAPE_DYNAMIC_INDEXED,
+
+   /* Indexed with a static index buffer. Indices range up to max_indices.
+    * Hardware instance count = input_primitives * software instance count.
+    */
+   AGX_GS_SHAPE_STATIC_INDEXED,
+
+   /* Non-indexed. Dispatched as:
+    *
+    *    (max_indices, input_primitives * instance_count).
+    */
+   AGX_GS_SHAPE_STATIC_PER_PRIM,
+
+   /* Non-indexed. Dispatched as:
+    *
+    *    (max_indices * input_primitives, instance_count).
+    */
+   AGX_GS_SHAPE_STATIC_PER_INSTANCE,
+};
+
+static inline unsigned
+agx_gs_rast_vertices(enum agx_gs_shape shape, unsigned max_indices,
+                     unsigned input_primitives, unsigned instance_count)
+{
+   switch (shape) {
+   case AGX_GS_SHAPE_DYNAMIC_INDEXED:
+      return max_indices * input_primitives * instance_count;
+
+   case AGX_GS_SHAPE_STATIC_INDEXED:
+   case AGX_GS_SHAPE_STATIC_PER_PRIM:
+      return max_indices;
+
+   case AGX_GS_SHAPE_STATIC_PER_INSTANCE:
+      return max_indices * input_primitives;
+   }
+
+   unreachable("invalid shape");
+}
+
+static inline unsigned
+agx_gs_rast_instances(enum agx_gs_shape shape, unsigned max_indices,
+                      unsigned input_primitives, unsigned instance_count)
+{
+   switch (shape) {
+   case AGX_GS_SHAPE_DYNAMIC_INDEXED:
+      return 1;
+
+   case AGX_GS_SHAPE_STATIC_INDEXED:
+   case AGX_GS_SHAPE_STATIC_PER_PRIM:
+      return input_primitives * instance_count;
+
+   case AGX_GS_SHAPE_STATIC_PER_INSTANCE:
+      return instance_count;
+   }
+
+   unreachable("invalid shape");
+}
+
+static inline bool
+agx_gs_indexed(enum agx_gs_shape shape)
+{
+   return shape == AGX_GS_SHAPE_DYNAMIC_INDEXED ||
+          shape == AGX_GS_SHAPE_STATIC_INDEXED;
+}
+
 /* Packed geometry state buffer */
 struct agx_geometry_state {
    /* Heap to allocate from. */
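
(worked example of the helpers with hypothetical numbers max_indices = 4,
input_primitives = 10, instance_count = 2:

   shape                  agx_gs_rast_vertices   agx_gs_rast_instances
   DYNAMIC_INDEXED        4 * 10 * 2 = 80        1
   STATIC_INDEXED         4                      10 * 2 = 20
   STATIC_PER_PRIM        4                      10 * 2 = 20
   STATIC_PER_INSTANCE    4 * 10 = 40            2
)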

View file

@@ -1156,14 +1156,19 @@ hk_upload_geometry_params(struct hk_cmd_buffer *cmd, struct agx_draw draw)
    params.vs_grid[4] = params.gs_grid[4] = 1;
    params.vs_grid[5] = params.gs_grid[5] = 1;
 
+   struct agx_gs_info *gsi = &count->info.gs;
+
    if (indirect) {
       /* TODO: size */
       cmd->geom_indirect = hk_pool_alloc(cmd, 64, 4).gpu;
       params.indirect_desc = cmd->geom_indirect;
       params.vs_grid[2] = params.gs_grid[2] = 1;
 
-      cmd->geom_index_buffer = dev->heap->va->addr;
-      cmd->geom_index_count = dev->heap->size;
+      if (gsi->shape == AGX_GS_SHAPE_DYNAMIC_INDEXED) {
+         cmd->geom_index_buffer = dev->heap->va->addr;
+         cmd->geom_index_count = dev->heap->size;
+      }
    } else {
       uint32_t verts = draw.b.count[0], instances = draw.b.count[1];
@@ -1178,17 +1183,13 @@ hk_upload_geometry_params(struct hk_cmd_buffer *cmd, struct agx_draw draw)
       params.count_buffer = hk_pool_alloc(cmd, size, 4).gpu;
    }
 
-   if (count->info.gs.instanced) {
-      cmd->geom_index_count = count->info.gs.max_indices;
-      cmd->geom_instance_count = params.input_primitives;
-   } else {
-      cmd->geom_index_count =
-         params.input_primitives * count->info.gs.max_indices;
+   cmd->geom_index_count = agx_gs_rast_vertices(
+      gsi->shape, gsi->max_indices, params.gs_grid[0], instances);
-      cmd->geom_instance_count = 1;
-   }
+   cmd->geom_instance_count = agx_gs_rast_instances(
+      gsi->shape, gsi->max_indices, params.gs_grid[0], instances);
 
-   if (count->info.gs.dynamic_topology) {
+   if (gsi->shape == AGX_GS_SHAPE_DYNAMIC_INDEXED) {
       params.output_index_buffer =
          hk_pool_alloc(cmd, cmd->geom_index_count * 4, 4).gpu;
@@ -1196,9 +1197,9 @@ hk_upload_geometry_params(struct hk_cmd_buffer *cmd, struct agx_draw draw)
       }
    }
 
-   if (count->info.gs.indexed && !count->info.gs.dynamic_topology) {
-      cmd->geom_index_buffer = hk_pool_upload(
-         cmd, count->info.gs.topology, count->info.gs.max_indices * 4, 4);
+   if (gsi->shape == AGX_GS_SHAPE_STATIC_INDEXED) {
+      cmd->geom_index_buffer =
+         hk_pool_upload(cmd, count->info.gs.topology, gsi->max_indices * 4, 4);
    }
 
    desc->root_dirty = true;
@@ -1456,9 +1457,8 @@ hk_launch_gs_prerast(struct hk_cmd_buffer *cmd, struct hk_cs *cs,
       .vs_outputs = vs->b.info.outputs,
       .prim = mode,
       .is_prefix_summing = count->info.gs.prefix_sum,
-      .indices_per_in_prim = count->info.gs.max_indices,
-      .dynamic_topology = count->info.gs.dynamic_topology,
-      .instanced = count->info.gs.instanced,
+      .max_indices = count->info.gs.max_indices,
+      .shape = count->info.gs.shape,
    };
 
    if (cmd->state.gfx.shaders[MESA_SHADER_TESS_EVAL]) {
@@ -1533,7 +1533,7 @@ hk_launch_gs_prerast(struct hk_cmd_buffer *cmd, struct hk_cs *cs,
    hk_dispatch_with_local_size(cmd, cs, main, grid_gs, wg);
 
    if (agx_is_indirect(draw.b)) {
-      if (count->info.gs.indexed) {
+      if (agx_gs_indexed(count->info.gs.shape)) {
         return agx_draw_indexed_indirect(
            cmd->geom_indirect, cmd->geom_index_buffer, cmd->geom_index_count,
            AGX_INDEX_SIZE_U32, true);
@@ -1541,7 +1541,7 @@ hk_launch_gs_prerast(struct hk_cmd_buffer *cmd, struct hk_cs *cs,
         return agx_draw_indirect(cmd->geom_indirect);
      }
   } else {
-      if (count->info.gs.indexed) {
+      if (agx_gs_indexed(count->info.gs.shape)) {
         return agx_draw_indexed(
            cmd->geom_index_count, cmd->geom_instance_count, 0, 0, 0,
            cmd->geom_index_buffer, cmd->geom_index_count * 4,

View file

@@ -60,6 +60,7 @@
 #include "agx_nir_lower_gs.h"
 #include "agx_nir_lower_vbo.h"
 #include "agx_tilebuffer.h"
+#include "geometry.h"
 #include "libagx.h"
 #include "libagx_dgc.h"
 #include "libagx_shaders.h"
@@ -4078,9 +4079,9 @@ agx_batch_geometry_params(struct agx_batch *batch, uint64_t input_index_buffer,
       params.input_buffer = addr;
    }
 
-   if (batch->ctx->gs->gs.dynamic_topology) {
-      unsigned idx_size =
-         params.input_primitives * batch->ctx->gs->gs.max_indices;
+   struct agx_gs_info *gsi = &batch->ctx->gs->gs;
+   if (gsi->shape == AGX_GS_SHAPE_DYNAMIC_INDEXED) {
+      unsigned idx_size = params.input_primitives * gsi->max_indices;
 
       params.output_index_buffer =
          agx_pool_alloc_aligned_with_bo(&batch->pool, idx_size * 4, 4,
@@ -4161,9 +4162,8 @@ agx_launch_gs_prerast(struct agx_batch *batch,
       .index_size_B = info->index_size,
       .prim = info->mode,
       .is_prefix_summing = gs->gs.prefix_sum,
-      .indices_per_in_prim = gs->gs.max_indices,
-      .instanced = gs->gs.instanced,
-      .dynamic_topology = gs->gs.dynamic_topology,
+      .max_indices = gs->gs.max_indices,
+      .shape = gs->gs.shape,
    };
 
    libagx_gs_setup_indirect_struct(batch, agx_1d(1), AGX_BARRIER_ALL, gsi);
@@ -5152,16 +5152,16 @@ agx_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info,
       return;
 
    /* Setup to rasterize the GS results */
+   struct agx_gs_info *gsi = &ctx->gs->gs;
    info_gs = (struct pipe_draw_info){
-      .mode = ctx->gs->gs.mode,
-      .index_size = ctx->gs->gs.indexed ? 4 : 0,
-      .primitive_restart = ctx->gs->gs.indexed,
+      .mode = gsi->mode,
+      .index_size = agx_gs_indexed(gsi->shape) ? 4 : 0,
+      .primitive_restart = agx_gs_indexed(gsi->shape),
       .restart_index = ~0,
       .index.resource = &index_rsrc.base,
       .instance_count = 1,
    };
 
-   unsigned unrolled_prims = 0;
-
    if (indirect) {
       indirect_gs = (struct pipe_draw_indirect_info){
          .draw_count = 1,
@@ -5171,20 +5171,18 @@ agx_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info,
       indirect = &indirect_gs;
    } else {
-      bool instanced = ctx->gs->gs.instanced;
-      unrolled_prims =
-         u_decomposed_prims_for_vertices(info->mode, draws->count) *
-         info->instance_count;
+      unsigned prims =
+         u_decomposed_prims_for_vertices(info->mode, draws->count);
 
       draw_gs = (struct pipe_draw_start_count_bias){
-         .count = ctx->gs->gs.max_indices * (instanced ? 1 : unrolled_prims),
+         .count = agx_gs_rast_vertices(gsi->shape, gsi->max_indices, prims,
+                                       info->instance_count),
       };
-      draws = &draw_gs;
+      info_gs.instance_count = agx_gs_rast_instances(
+         gsi->shape, gsi->max_indices, prims, info->instance_count);
 
-      if (ctx->gs->gs.instanced) {
-         info_gs.instance_count = unrolled_prims;
-      }
+      draws = &draw_gs;
    }
 
    info = &info_gs;
@@ -5193,12 +5191,12 @@ agx_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info,
    batch->reduced_prim = u_reduced_prim(info->mode);
    ctx->dirty |= AGX_DIRTY_PRIM;
 
-   if (ctx->gs->gs.dynamic_topology) {
+   if (gsi->shape == AGX_GS_SHAPE_DYNAMIC_INDEXED) {
       ib = batch->geom_index;
       ib_extent = index_rsrc.bo->size - (batch->geom_index - ib);
-   } else if (ctx->gs->gs.indexed) {
-      ib_extent = ctx->gs->gs.max_indices * 4;
-      ib = agx_pool_upload(&batch->pool, ctx->gs->gs.topology, ib_extent);
+   } else if (gsi->shape == AGX_GS_SHAPE_STATIC_INDEXED) {
+      ib_extent = gsi->max_indices * 4;
+      ib = agx_pool_upload(&batch->pool, gsi->topology, ib_extent);
    }
 
    /* We need to reemit geometry descriptors since the txf sampler may change