poly,asahi: Move vertex_output_buffer to poly_vertex_param

Instead of having the vertex output buffer be a system value and
something the driver needs to manage, put it in poly_vertex_param.  We
already need to have it somewhere GPU-writable so we can write it from
indirect setup kernels.  Instead of manually allocating 8B all over the
place just to hold this one pointer, stick it in poly_vertex_param.
This also lets us get rid of a NIR intrinsic.

Acked-by: Alyssa Rosenzweig <alyssa.rosenzweig@intel.com>
Reviewed-by: Mary Guillemard <mary@mary.zone>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/38404>
This commit is contained in:
Faith Ekstrand 2025-11-13 17:55:00 -05:00 committed by Marge Bot
parent 8950efc006
commit 89fbb9cf84
14 changed files with 55 additions and 103 deletions

View file

@ -343,7 +343,6 @@ libagx_unroll_restart(global struct poly_heap *heap, uint64_t index_buffer,
KERNEL(1)
libagx_gs_setup_indirect(
uint64_t index_buffer, constant uint *draw,
global uintptr_t *vertex_buffer /* output */,
global struct poly_vertex_params *vp /* output */,
global struct poly_geometry_params *p /* output */,
global struct poly_heap *heap,
@ -353,7 +352,7 @@ libagx_gs_setup_indirect(
uint32_t prim /* Input primitive type, enum mesa_prim */,
int is_prefix_summing, uint max_indices, enum poly_gs_shape shape)
{
poly_gs_setup_indirect(index_buffer, draw, vertex_buffer, vp, p, heap,
poly_gs_setup_indirect(index_buffer, draw, vp, p, heap,
vs_outputs, index_size_B, index_buffer_range_el, prim,
is_prefix_summing, max_indices, shape);
}

View file

@ -11,7 +11,7 @@ libagx_tess_setup_indirect(
global struct poly_tess_params *p,
global uint32_t *grids /* output: VS then TCS then tess */,
global struct poly_vertex_params *vp /* output */, global uint32_t *indirect,
global uint64_t *vertex_output_buffer_ptr, uint64_t in_index_buffer,
uint64_t in_index_buffer,
uint32_t in_index_buffer_range_el, uint32_t in_index_size_B,
uint64_t vertex_outputs /* bitfield */,
@ -51,7 +51,7 @@ libagx_tess_setup_indirect(
p->coord_allocs = (global uint *)(blob + patch_coord_offs);
p->nr_patches = unrolled_patches;
*vertex_output_buffer_ptr = (uintptr_t)(blob + vb_offs);
vp->output_buffer = (uintptr_t)(blob + vb_offs);
p->counts = (global uint32_t *)(blob + count_offs);
if (vp) {

View file

@ -64,9 +64,6 @@ struct hk_root_descriptor_table {
uint32_t attrib_clamps[AGX_MAX_VBUFS];
uint32_t attrib_strides[AGX_MAX_VBUFS];
/* Pointer to the VS->TCS, VS->GS, or TES->GS buffer. */
uint64_t vertex_output_buffer;
/* Mask of outputs flowing VS->TCS, VS->GS, or TES->GS . */
uint64_t vertex_outputs;

View file

@ -1041,11 +1041,14 @@ hk_index_buffer(uint64_t index_buffer, uint size_el, uint offset_el,
}
static uint64_t
hk_upload_vertex_params(struct hk_cmd_buffer *cmd, struct agx_draw draw)
hk_upload_vertex_params(struct hk_cmd_buffer *cmd,
uint64_t vertex_output_buffer,
struct agx_draw draw)
{
assert(!agx_is_indirect(draw.b) && "indirect params written by GPU");
struct poly_vertex_params params = {.verts_per_instance = draw.b.count[0]};
struct poly_vertex_params params = {
.verts_per_instance = draw.b.count[0],
.output_buffer = vertex_output_buffer,
};
if (draw.indexed) {
unsigned index_size_B = agx_index_size_to_B(draw.index_size);
@ -1097,7 +1100,9 @@ hk_rast_prim(struct hk_cmd_buffer *cmd)
}
static uint64_t
hk_upload_geometry_params(struct hk_cmd_buffer *cmd, struct agx_draw draw)
hk_upload_geometry_params(struct hk_cmd_buffer *cmd,
uint64_t vertex_output_buffer,
struct agx_draw draw)
{
struct hk_device *dev = hk_cmd_buffer_device(cmd);
struct hk_descriptor_state *desc = &cmd->state.gfx.descriptors;
@ -1122,7 +1127,7 @@ hk_upload_geometry_params(struct hk_cmd_buffer *cmd, struct agx_draw draw)
/* Overriden by the indirect setup kernel. As tess->GS is always indirect,
* we can assume here that we're VS->GS.
*/
.input_buffer = desc->root.draw.vertex_output_buffer,
.input_buffer = vertex_output_buffer,
.input_mask = desc->root.draw.vertex_outputs,
};
@ -1483,15 +1488,6 @@ hk_launch_gs_prerast(struct hk_cmd_buffer *cmd, struct hk_cs *cs,
.shape = count->info.gs.shape,
};
if (cmd->state.gfx.shaders[MESA_SHADER_TESS_EVAL]) {
gsi.vertex_buffer = desc->root.draw.tess_params +
offsetof(struct poly_tess_params, tes_buffer);
} else {
gsi.vertex_buffer = desc->root.root_desc_addr +
offsetof(struct hk_root_descriptor_table,
draw.vertex_output_buffer);
}
if (draw.indexed) {
gsi.index_size_B = agx_index_size_to_B(draw.index_size);
gsi.index_buffer_range_el = agx_draw_index_range_el(draw);
@ -1610,9 +1606,6 @@ hk_launch_tess(struct hk_cmd_buffer *cmd, struct hk_cs *cs,
.indirect = draw.b.ptr,
.vp = gfx->descriptors.root.draw.vertex_params,
.vertex_outputs = vs->b.info.outputs,
.vertex_output_buffer_ptr =
gfx->root + offsetof(struct hk_root_descriptor_table,
draw.vertex_output_buffer),
.tcs_statistic = hk_pipeline_stat_addr(
cmd,
VK_QUERY_PIPELINE_STATISTIC_TESSELLATION_CONTROL_SHADER_PATCHES_BIT),
@ -3056,17 +3049,7 @@ hk_flush_dynamic_state(struct hk_cmd_buffer *cmd, struct hk_cs *cs,
gfx->dirty |= HK_DIRTY_VARYINGS;
}
if (gfx->shaders[MESA_SHADER_TESS_EVAL] ||
gfx->shaders[MESA_SHADER_GEOMETRY] || linked_vs->sw_indexing) {
/* XXX: We should deduplicate this logic */
bool indirect = agx_is_indirect(draw.b) || draw.restart;
desc->root.draw.vertex_params =
indirect ? hk_pool_alloc(cmd, sizeof(struct poly_vertex_params), 4).gpu
: hk_upload_vertex_params(cmd, draw);
desc->root_dirty = true;
}
uint64_t vertex_output_buffer = 0;
if (gfx->shaders[MESA_SHADER_TESS_EVAL] ||
gfx->shaders[MESA_SHADER_GEOMETRY]) {
@ -3087,12 +3070,18 @@ hk_flush_dynamic_state(struct hk_cmd_buffer *cmd, struct hk_cs *cs,
*
* dEQP-VK.pipeline.monolithic.no_position.explicit_declarations.basic.single_view.v0_g1
*/
desc->root.draw.vertex_output_buffer =
vb_size ? hk_pool_alloc(cmd, vb_size, 4).gpu
: AGX_SCRATCH_PAGE_ADDRESS;
vertex_output_buffer = vb_size ? hk_pool_alloc(cmd, vb_size, 4).gpu
: AGX_SCRATCH_PAGE_ADDRESS;
}
}
if (gfx->shaders[MESA_SHADER_TESS_EVAL] ||
gfx->shaders[MESA_SHADER_GEOMETRY] || linked_vs->sw_indexing) {
desc->root.draw.vertex_params =
hk_upload_vertex_params(cmd, vertex_output_buffer, draw);
desc->root_dirty = true;
}
struct agx_ptr tess_args = {0};
if (gfx->shaders[MESA_SHADER_TESS_EVAL]) {
tess_args = hk_pool_alloc(cmd, sizeof(struct poly_tess_params), 4);
@ -3102,7 +3091,7 @@ hk_flush_dynamic_state(struct hk_cmd_buffer *cmd, struct hk_cs *cs,
if (gfx->shaders[MESA_SHADER_GEOMETRY]) {
gfx->descriptors.root.draw.geometry_params =
hk_upload_geometry_params(cmd, draw);
hk_upload_geometry_params(cmd, vertex_output_buffer, draw);
gfx->descriptors.root_dirty = true;
}

View file

@ -424,9 +424,6 @@ lower_uvs_index(nir_builder *b, nir_intrinsic_instr *intrin, void *data)
case nir_intrinsic_load_geometry_param_buffer_poly:
return lower_sysval_to_root_table(b, intrin, draw.geometry_params);
case nir_intrinsic_load_vs_output_buffer_poly:
return lower_sysval_to_root_table(b, intrin, draw.vertex_output_buffer);
case nir_intrinsic_load_vs_outputs_poly:
return lower_sysval_to_root_table(b, intrin, draw.vertex_outputs);

View file

@ -1431,9 +1431,6 @@ system_value("ro_sink_address_poly", 1, bit_sizes=[64])
# mesa_prim for the input topology (in a geometry shader)
system_value("input_topology_poly", 1)
# Pointer to the buffer passing outputs VS->TCS, VS->GS, or TES->GS linkage.
system_value("vs_output_buffer_poly", 1, bit_sizes=[64])
# Mask of VS->TCS, VS->GS, or TES->GS outputs. This is modelled as a sysval
# so it can be dynamic with shader objects or constant folded with monolithic.
system_value("vs_outputs_poly", 1, bit_sizes=[64])

View file

@ -184,9 +184,6 @@ lower_intrinsic(nir_builder *b, nir_intrinsic_instr *intr,
return load_sysval_root(b, 1, 64, &u->vertex_params);
case nir_intrinsic_load_geometry_param_buffer_poly:
return load_sysval_root(b, 1, 64, &u->geometry_params);
case nir_intrinsic_load_vs_output_buffer_poly:
return nir_load_global_constant(
b, 1, 64, load_sysval_root(b, 1, 64, &u->vertex_output_buffer_ptr));
case nir_intrinsic_load_vs_outputs_poly:
return load_sysval_root(b, 1, 64, &u->vertex_outputs);
case nir_intrinsic_load_tess_param_buffer_poly:

View file

@ -4003,9 +4003,6 @@ agx_batch_geometry_params(struct agx_batch *batch, uint64_t input_index_buffer,
params.vs_grid[5] = params.gs_grid[5] = 1;
if (indirect) {
batch->uniforms.vertex_output_buffer_ptr =
agx_pool_alloc_aligned(&batch->pool, 8, 8).gpu;
params.vs_grid[2] = params.gs_grid[2] = 1;
} else {
params.vs_grid[0] = draw->count;
@ -4026,11 +4023,9 @@ agx_batch_geometry_params(struct agx_batch *batch, uint64_t input_index_buffer,
}
if (vb_size) {
uint64_t addr = agx_pool_alloc_aligned(&batch->pool, vb_size, 4).gpu;
batch->uniforms.vertex_output_buffer_ptr =
agx_pool_upload(&batch->pool, &addr, 8);
params.input_buffer = addr;
vp.output_buffer =
agx_pool_alloc_aligned(&batch->pool, vb_size, 4).gpu;
params.input_buffer = vp.output_buffer;
}
struct poly_gs_info *gsi = &batch->ctx->gs->gs;
@ -4112,7 +4107,6 @@ agx_launch_gs_prerast(struct agx_batch *batch,
.index_buffer = ib,
.index_buffer_range_el = ib_extent / info->index_size,
.draw = agx_indirect_buffer_ptr(batch, indirect),
.vertex_buffer = batch->uniforms.vertex_output_buffer_ptr,
.vp = batch->uniforms.vertex_params,
.p = batch->uniforms.geometry_params,
.heap = agx_batch_heap(batch),
@ -4646,9 +4640,7 @@ agx_draw_patches(struct agx_context *ctx, const struct pipe_draw_info *info,
unsigned vb_size = poly_tcs_in_size(draws->count * info->instance_count,
batch->uniforms.vertex_outputs);
uint64_t addr = agx_pool_alloc_aligned(&batch->pool, vb_size, 4).gpu;
batch->uniforms.vertex_output_buffer_ptr =
agx_pool_upload(&batch->pool, &addr, 8);
vp.output_buffer = agx_pool_alloc_aligned(&batch->pool, vb_size, 4).gpu;
vs_grid = agx_3d(draws->count, info->instance_count, 1);
tcs_grid = agx_3d(in_patches * tcs->tess.output_patch_size,
@ -4661,8 +4653,9 @@ agx_draw_patches(struct agx_context *ctx, const struct pipe_draw_info *info,
.gpu;
}
batch->uniforms.vertex_params =
uint64_t vertex_state =
agx_pool_upload_aligned(&batch->pool, &vp, sizeof(vp), 8);
batch->uniforms.vertex_params = vertex_state;
uint64_t state =
agx_pool_upload_aligned(&batch->pool, &params, sizeof(params), 4);
@ -4673,7 +4666,6 @@ agx_draw_patches(struct agx_context *ctx, const struct pipe_draw_info *info,
uint32_t grid_stride = sizeof(uint32_t) * 6;
uint64_t vertex_out_ptr = agx_pool_alloc_aligned(&batch->pool, 8, 8).gpu;
uint64_t indirect_ptr = agx_indirect_buffer_ptr(batch, indirect);
uint64_t tcs_statistic = agx_get_query_address(
@ -4684,12 +4676,10 @@ agx_draw_patches(struct agx_context *ctx, const struct pipe_draw_info *info,
agx_pool_alloc_aligned(&batch->pool, grid_stride * 3, 4).gpu;
libagx_tess_setup_indirect(
batch, agx_1d(1), AGX_BARRIER_ALL, state, grids, 0 /* XXX: IA */,
indirect_ptr, vertex_out_ptr, 0, 0, 0 /* XXX: Index buffer */,
batch, agx_1d(1), AGX_BARRIER_ALL, state, grids, vertex_state,
indirect_ptr, 0, 0, 0 /* XXX: Index buffer */,
ctx->vs->b.info.outputs, tcs_statistic);
batch->uniforms.vertex_output_buffer_ptr = vertex_out_ptr;
vs_grid = agx_grid_indirect_local(grids + 0 * grid_stride);
tcs_grid = agx_grid_indirect_local(grids + 1 * grid_stride);
tess_grid = agx_grid_indirect_local(grids + 2 * grid_stride);
@ -4701,8 +4691,6 @@ agx_draw_patches(struct agx_context *ctx, const struct pipe_draw_info *info,
agx_launch(batch, tcs_grid, agx_workgroup(tcs->tess.output_patch_size, 1, 1),
ctx->tcs, NULL, MESA_SHADER_TESS_CTRL, 0);
batch->uniforms.vertex_output_buffer_ptr = 0;
uint64_t c_prims = agx_get_query_address(
batch, ctx->pipeline_statistics[PIPE_STAT_QUERY_C_PRIMITIVES]);
uint64_t c_invs = agx_get_query_address(

View file

@ -117,13 +117,6 @@ struct PACKED agx_draw_uniforms {
/* Addresses for the results of pipeline statistics queries */
uint64_t pipeline_statistics[PIPE_STAT_QUERY_MS_INVOCATIONS];
/* Pointer to base address of the VS->TCS, VS->GS, or TES->GS buffer.
* Indirected so it can be written to in an indirect setup kernel. G13
* appears to prefetch uniforms across dispatches, but does not pre-run
* preambles, so this indirection saves us from splitting the batch.
*/
uint64_t vertex_output_buffer_ptr;
/* Mask of outputs flowing VS->TCS, VS->GS, or TES->GS . */
uint64_t vertex_outputs;

View file

@ -357,6 +357,12 @@ poly_pad_index_gs(global int *index_buffer, uint inv_index_offset,
}
}
/* Return the GPU address of the vertex output buffer, i.e. the buffer
 * carrying outputs across the VS->TCS, VS->GS, or TES->GS linkage
 * (see the output_buffer field added to struct poly_vertex_params).
 */
uintptr_t
poly_vertex_output_buffer(constant struct poly_vertex_params *p)
{
return p->output_buffer;
}
uintptr_t
poly_vertex_output_address(uintptr_t buffer, uint64_t mask, uint vtx,
gl_varying_slot location)

View file

@ -24,12 +24,6 @@ poly_tcs_unrolled_id(constant struct poly_tess_params *p, uint3 wg_id)
return (wg_id.y * p->patches_per_instance) + wg_id.x;
}
uint64_t
poly_tes_buffer(constant struct poly_tess_params *p)
{
return p->tes_buffer;
}
/*
* Helper to lower indexing for a tess eval shader ran as a compute shader. This
* handles the tess+geom case. This is simpler than the general input assembly

View file

@ -189,8 +189,11 @@ struct poly_vertex_params {
* setup kernel for indirect. This is used for VS->GS and VS->TCS indexing.
*/
uint32_t verts_per_instance;
/* Output buffer for vertex data */
uint64_t output_buffer;
} PACKED;
static_assert(sizeof(struct poly_vertex_params) == 4 * 4);
static_assert(sizeof(struct poly_vertex_params) == 6 * 4);
static inline uint
poly_index_buffer_range_el(uint size_el, uint offset_el)
@ -522,7 +525,6 @@ poly_increment_ia(global uint32_t *ia_vertices, global uint32_t *ia_primitives,
static inline void
poly_gs_setup_indirect(uint64_t index_buffer, constant uint *draw,
global uintptr_t *vertex_buffer /* output */,
global struct poly_vertex_params *vp /* output */,
global struct poly_geometry_params *p /* output */,
global struct poly_heap *heap,
@ -574,9 +576,10 @@ poly_gs_setup_indirect(uint64_t index_buffer, constant uint *draw,
heap, p->input_primitives * p->count_buffer_stride);
}
p->input_buffer =
const uintptr_t vertex_buffer =
(uintptr_t)poly_heap_alloc_nonatomic(heap, vertex_buffer_size);
*vertex_buffer = p->input_buffer;
vp->output_buffer = vertex_buffer;
p->input_buffer = vertex_buffer;
p->input_mask = vs_outputs;

View file

@ -293,7 +293,8 @@ poly_load_per_vertex_input(nir_builder *b, nir_intrinsic_instr *intr,
assert(b->shader->info.stage == MESA_SHADER_TESS_CTRL);
/* TCS always preceded by VS so we use the VS state directly */
addr = poly_vertex_output_address(b, nir_load_vs_output_buffer_poly(b),
nir_def *vp = nir_load_vertex_param_buffer_poly(b);
addr = poly_vertex_output_address(b, poly_vertex_output_buffer(b, vp),
nir_load_vs_outputs_poly(b), vertex,
location);
}
@ -1401,18 +1402,14 @@ lower_vs_before_gs(nir_builder *b, nir_intrinsic_instr *intr, void *data)
nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
nir_def *location = nir_iadd_imm(b, intr->src[1].ssa, sem.location);
nir_def *buffer, *nr_verts, *instance_id, *primitive_id;
if (b->shader->info.stage == MESA_SHADER_VERTEX) {
buffer = nir_load_vs_output_buffer_poly(b);
nr_verts = poly_input_vertices(b, nir_load_vertex_param_buffer_poly(b));
} else {
assert(b->shader->info.stage == MESA_SHADER_TESS_EVAL);
nir_def *vp = nir_load_vertex_param_buffer_poly(b);
nir_def *buffer = poly_vertex_output_buffer(b, vp);
/* Instancing is unrolled during tessellation so nr_verts is ignored. */
nr_verts = nir_imm_int(b, 0);
buffer = poly_tes_buffer(b, nir_load_tess_param_buffer_poly(b));
}
/* Instancing is unrolled during tessellation so nr_verts is ignored. */
nir_def *nr_verts = b->shader->info.stage == MESA_SHADER_VERTEX ?
poly_input_vertices(b, vp) : nir_imm_int(b, 0);
nir_def *instance_id, *primitive_id;
if (b->shader->info.stage == MESA_SHADER_VERTEX &&
!b->shader->info.vs.tes_poly) {
primitive_id = nir_load_vertex_id_zero_base(b);

View file

@ -59,11 +59,6 @@ struct poly_tess_params {
*/
DEVICE(uint32_t) statistic;
/* When geom+tess used together, the buffer containing TES outputs (executed
* as a hardware compute shader).
*/
uint64_t tes_buffer;
/* Bitfield of TCS per-vertex outputs */
uint64_t tcs_per_vertex_outputs;
@ -105,4 +100,4 @@ struct poly_tess_params {
*/
uint32_t ccw;
} PACKED;
static_assert(sizeof(struct poly_tess_params) == 36 * 4);
static_assert(sizeof(struct poly_tess_params) == 34 * 4);