libagx,hk: handle pipeline stats queries without a GS

Signed-off-by: Alyssa Rosenzweig <alyssa@rosenzweig.io>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/32759>
This commit is contained in:
Alyssa Rosenzweig 2024-12-08 09:21:52 -05:00 committed by Marge Bot
parent 5bc89aa991
commit a3b796ba19
4 changed files with 115 additions and 60 deletions

View file

@ -1077,15 +1077,6 @@ agx_nir_create_pre_gs(struct lower_gs_state *state, const nir_shader *libagx,
}
}
/* The geometry shader receives a number of input primitives. The driver
* should disable this counter when tessellation is active TODO and count
* patches separately.
*/
add_counter(
b,
nir_load_stat_query_address_agx(b, .base = PIPE_STAT_QUERY_IA_PRIMITIVES),
unrolled_in_prims);
/* The geometry shader is invoked once per primitive (after unrolling
* primitive restart). From the spec:
*

View file

@ -318,31 +318,40 @@ libagx_load_index_buffer(constant struct agx_ia_state *p, uint id,
}
static void
increment_ia_counters(global uint32_t *ia_vertices,
global uint32_t *vs_invocations, uint count)
increment_counters(global uint32_t *a, global uint32_t *b, global uint32_t *c,
uint count)
{
if (ia_vertices) {
*ia_vertices += count;
}
global uint32_t *ptr[] = {a, b, c};
if (vs_invocations) {
*vs_invocations += count;
for (uint i = 0; i < 3; ++i) {
if (ptr[i]) {
*(ptr[i]) += count;
}
}
}
KERNEL(1)
libagx_increment_ia(global uint32_t *ia_vertices,
global uint32_t *vs_invocations, constant uint32_t *draw)
global uint32_t *ia_primitives,
global uint32_t *vs_invocations, global uint32_t *c_prims,
global uint32_t *c_invs, constant uint32_t *draw,
enum mesa_prim prim)
{
increment_ia_counters(ia_vertices, vs_invocations, draw[0] * draw[1]);
increment_counters(ia_vertices, vs_invocations, NULL, draw[0] * draw[1]);
uint prims = u_decomposed_prims_for_vertices(prim, draw[0]) * draw[1];
increment_counters(ia_primitives, c_prims, c_invs, prims);
}
KERNEL(1024)
libagx_increment_ia_restart(global uint32_t *ia_vertices,
global uint32_t *ia_primitives,
global uint32_t *vs_invocations,
global uint32_t *c_prims, global uint32_t *c_invs,
constant uint32_t *draw, uint64_t index_buffer,
uint32_t index_buffer_range_el,
uint32_t restart_index, uint32_t index_size_B)
uint32_t restart_index, uint32_t index_size_B,
enum mesa_prim prim)
{
uint tid = get_global_id(0);
unsigned count = draw[0];
@ -368,7 +377,30 @@ libagx_increment_ia_restart(global uint32_t *ia_vertices,
/* Elect a single thread from the workgroup to increment the counters */
if (tid == 0) {
increment_ia_counters(ia_vertices, vs_invocations, scratch * draw[1]);
increment_counters(ia_vertices, vs_invocations, NULL, scratch * draw[1]);
}
/* TODO: We should vectorize this */
if ((ia_primitives || c_prims || c_invs) && tid == 0) {
uint accum = 0;
int last_restart = -1;
for (uint i = 0; i < count; ++i) {
uint index = load_index(index_buffer, index_buffer_range_el, start + i,
index_size_B);
if (index == restart_index) {
accum +=
u_decomposed_prims_for_vertices(prim, i - last_restart - 1);
last_restart = i;
}
}
{
accum +=
u_decomposed_prims_for_vertices(prim, count - last_restart - 1);
}
increment_counters(ia_primitives, c_prims, c_invs, accum * draw[1]);
}
}

View file

@ -39,6 +39,7 @@
#include "util/format/u_formats.h"
#include "util/macros.h"
#include "util/ralloc.h"
#include "util/u_prim.h"
#include "vulkan/vulkan_core.h"
#include "layout.h"
#include "libagx_dgc.h"
@ -3108,23 +3109,6 @@ hk_handle_passthrough_gs(struct hk_cmd_buffer *cmd, struct agx_draw draw)
uint32_t xfb_outputs = last_sw->info.xfb_info.output_count;
bool needs_gs = xfb_outputs;
/* Various pipeline statistics are implemented in the pre-GS shader. TODO:
* This could easily be optimized.
*/
VkQueryPipelineStatisticFlagBits ia_statistics[] = {
VK_QUERY_PIPELINE_STATISTIC_INPUT_ASSEMBLY_PRIMITIVES_BIT,
VK_QUERY_PIPELINE_STATISTIC_CLIPPING_PRIMITIVES_BIT,
VK_QUERY_PIPELINE_STATISTIC_CLIPPING_INVOCATIONS_BIT,
};
bool ia_stats = false;
for (unsigned i = 0; i < ARRAY_SIZE(ia_statistics); ++i) {
ia_stats |= hk_pipeline_stat_addr(cmd, ia_statistics[i]) != 0;
}
needs_gs |= ia_stats;
/* If we already have a matching GS configuration, we're done */
if ((gs != NULL) == needs_gs)
return;
@ -3157,8 +3141,7 @@ hk_handle_passthrough_gs(struct hk_cmd_buffer *cmd, struct agx_draw draw)
}
struct hk_device *dev = hk_cmd_buffer_device(cmd);
perf_debug(dev, "Binding passthrough GS for%s%s\n",
xfb_outputs ? " XFB" : "", ia_stats ? " statistics" : "");
perf_debug(dev, "Binding passthrough GS for%s\n", xfb_outputs ? " XFB" : "");
gs = hk_meta_shader(dev, hk_nir_passthrough_gs, key, key_size);
gs->is_passthrough = true;
@ -3337,7 +3320,8 @@ hk_set_view_index(struct hk_cmd_buffer *cmd, uint32_t view_idx)
static void
hk_ia_update(struct hk_cmd_buffer *cmd, struct hk_cs *cs, struct agx_draw draw,
uint64_t ia_vertices, uint64_t vs_invocations)
uint64_t ia_vertices, uint64_t ia_prims, uint64_t vs_invocations,
uint64_t c_prims, uint64_t c_inv)
{
/* XXX: stream link needed? */
struct hk_device *dev = hk_cmd_buffer_device(cmd);
@ -3351,15 +3335,33 @@ hk_ia_update(struct hk_cmd_buffer *cmd, struct hk_cs *cs, struct agx_draw draw,
draw_ptr = hk_pool_upload(cmd, &desc, sizeof(desc), 4);
}
struct vk_dynamic_graphics_state *dyn = &cmd->vk.dynamic_graphics_state;
enum mesa_prim prim = vk_conv_topology(dyn->ia.primitive_topology);
bool geom = cmd->state.gfx.shaders[MESA_SHADER_GEOMETRY];
bool tess = cmd->state.gfx.shaders[MESA_SHADER_TESS_EVAL];
/* Clipper counters depend on geom/tess outputs and must be written with the
* geom/tess output. They are updated as IA counters only when geom/tess is
* not used.
*
* TODO: Tessellation clipper counters not actually wired up, pending CTS.
*/
if (geom || tess) {
c_prims = 0;
c_inv = 0;
}
if (draw.restart) {
uint32_t index_size_B = agx_index_size_to_B(draw.index_size);
libagx_increment_ia_restart(cs, agx_1d(1024), ia_vertices, vs_invocations,
draw_ptr, draw.index_buffer,
agx_draw_index_range_el(draw),
cmd->state.gfx.index.restart, index_size_B);
libagx_increment_ia_restart(
cs, agx_1d(1024), ia_vertices, ia_prims, vs_invocations, c_prims,
c_inv, draw_ptr, draw.index_buffer, agx_draw_index_range_el(draw),
cmd->state.gfx.index.restart, index_size_B, prim);
} else {
libagx_increment_ia(cs, agx_1d(1), ia_vertices, vs_invocations, draw_ptr);
libagx_increment_ia(cs, agx_1d(1), ia_vertices, ia_prims, vs_invocations,
c_prims, c_inv, draw_ptr, prim);
}
}
@ -3380,10 +3382,20 @@ hk_draw(struct hk_cmd_buffer *cmd, uint16_t draw_id, struct agx_draw draw_)
uint64_t stat_ia_verts = hk_pipeline_stat_addr(
cmd, VK_QUERY_PIPELINE_STATISTIC_INPUT_ASSEMBLY_VERTICES_BIT);
uint64_t stat_ia_prims = hk_pipeline_stat_addr(
cmd, VK_QUERY_PIPELINE_STATISTIC_INPUT_ASSEMBLY_PRIMITIVES_BIT);
uint64_t stat_vs_inv = hk_pipeline_stat_addr(
cmd, VK_QUERY_PIPELINE_STATISTIC_VERTEX_SHADER_INVOCATIONS_BIT);
bool ia_stats = stat_ia_verts || stat_vs_inv;
uint64_t stat_c_inv = hk_pipeline_stat_addr(
cmd, VK_QUERY_PIPELINE_STATISTIC_CLIPPING_INVOCATIONS_BIT);
uint64_t stat_c_prims = hk_pipeline_stat_addr(
cmd, VK_QUERY_PIPELINE_STATISTIC_CLIPPING_PRIMITIVES_BIT);
bool ia_stats = stat_ia_verts || stat_ia_prims || stat_vs_inv ||
stat_c_inv || stat_c_prims;
struct hk_device *dev = hk_cmd_buffer_device(cmd);
hk_foreach_view(cmd) {
@ -3423,7 +3435,8 @@ hk_draw(struct hk_cmd_buffer *cmd, uint16_t draw_id, struct agx_draw draw_)
}
if (ia_stats) {
hk_ia_update(cmd, ccs, draw, stat_ia_verts, stat_vs_inv);
hk_ia_update(cmd, ccs, draw, stat_ia_verts, stat_ia_prims, stat_vs_inv,
stat_c_inv, stat_c_prims);
}
if (tess) {

View file

@ -48,6 +48,7 @@
#include "util/u_inlines.h"
#include "util/u_math.h"
#include "util/u_memory.h"
#include "util/u_prim.h"
#include "util/u_transfer.h"
#include "util/u_upload_mgr.h"
#include "agx_bg_eot.h"
@ -3890,19 +3891,41 @@ agx_ia_update(struct agx_batch *batch, const struct pipe_draw_info *info,
uint64_t ia_vertices = agx_get_query_address(
batch, ctx->pipeline_statistics[PIPE_STAT_QUERY_IA_VERTICES]);
uint64_t ia_primitives = agx_get_query_address(
batch, ctx->pipeline_statistics[PIPE_STAT_QUERY_IA_PRIMITIVES]);
uint64_t vs_invocations = agx_get_query_address(
batch, ctx->pipeline_statistics[PIPE_STAT_QUERY_VS_INVOCATIONS]);
uint64_t c_prims = agx_get_query_address(
batch, ctx->pipeline_statistics[PIPE_STAT_QUERY_C_PRIMITIVES]);
uint64_t c_invs = agx_get_query_address(
batch, ctx->pipeline_statistics[PIPE_STAT_QUERY_C_INVOCATIONS]);
/* With a geometry shader, clipper counters are written by the pre-GS kernel
* since they depend on the output on the geometry shader. Without a geometry
* shader, they are written along with IA.
*
* TODO: Broken tessellation interaction, but nobody cares.
*/
if (ctx->stage[PIPE_SHADER_GEOMETRY].shader) {
c_prims = 0;
c_invs = 0;
}
if (info->primitive_restart) {
perf_debug(dev, "Input assembly counters with primitive restart");
libagx_increment_ia_restart(batch, agx_1d(1024), ia_vertices,
vs_invocations, draw, ib, ib_range_el,
info->restart_index, info->index_size);
libagx_increment_ia_restart(
batch, agx_1d(1024), ia_vertices, ia_primitives, vs_invocations,
c_prims, c_invs, draw, ib, ib_range_el, info->restart_index,
info->index_size, info->mode);
} else {
perf_debug(dev, "Input assembly counters");
libagx_increment_ia(batch, agx_1d(1), ia_vertices, vs_invocations, draw);
libagx_increment_ia(batch, agx_1d(1), ia_vertices, ia_primitives,
vs_invocations, c_prims, c_invs, draw, info->mode);
}
}
@ -4269,14 +4292,6 @@ agx_needs_passthrough_gs(struct agx_context *ctx,
return true;
}
/* Various pipeline statistics are implemented in the pre-GS shader. */
if (ctx->pipeline_statistics[PIPE_STAT_QUERY_IA_PRIMITIVES] ||
ctx->pipeline_statistics[PIPE_STAT_QUERY_C_PRIMITIVES] ||
ctx->pipeline_statistics[PIPE_STAT_QUERY_C_INVOCATIONS]) {
perf_debug_ctx(ctx, "Using passthrough GS due to pipeline statistics");
return true;
}
/* Transform feedback is layered on geometry shaders, so if transform
* feedback is used, we need a GS.
*/
@ -4870,7 +4885,11 @@ agx_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info,
if (ctx->active_queries && !ctx->active_draw_without_restart &&
(ctx->pipeline_statistics[PIPE_STAT_QUERY_IA_VERTICES] ||
ctx->pipeline_statistics[PIPE_STAT_QUERY_VS_INVOCATIONS])) {
ctx->pipeline_statistics[PIPE_STAT_QUERY_IA_PRIMITIVES] ||
ctx->pipeline_statistics[PIPE_STAT_QUERY_VS_INVOCATIONS] ||
((ctx->pipeline_statistics[PIPE_STAT_QUERY_C_PRIMITIVES] ||
ctx->pipeline_statistics[PIPE_STAT_QUERY_C_INVOCATIONS]) &&
!ctx->stage[PIPE_SHADER_GEOMETRY].shader))) {
uint64_t ptr;
if (indirect) {