libagx,hk: handle pipeline stats queries without a GS

Signed-off-by: Alyssa Rosenzweig <alyssa@rosenzweig.io>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/32759>
This commit is contained in:
Alyssa Rosenzweig 2024-12-08 09:21:52 -05:00 committed by Marge Bot
parent 5bc89aa991
commit a3b796ba19
4 changed files with 115 additions and 60 deletions

View file

@ -1077,15 +1077,6 @@ agx_nir_create_pre_gs(struct lower_gs_state *state, const nir_shader *libagx,
} }
} }
/* The geometry shader receives a number of input primitives. The driver
* should disable this counter when tessellation is active TODO and count
* patches separately.
*/
add_counter(
b,
nir_load_stat_query_address_agx(b, .base = PIPE_STAT_QUERY_IA_PRIMITIVES),
unrolled_in_prims);
/* The geometry shader is invoked once per primitive (after unrolling /* The geometry shader is invoked once per primitive (after unrolling
* primitive restart). From the spec: * primitive restart). From the spec:
* *

View file

@ -318,31 +318,40 @@ libagx_load_index_buffer(constant struct agx_ia_state *p, uint id,
} }
static void static void
increment_ia_counters(global uint32_t *ia_vertices, increment_counters(global uint32_t *a, global uint32_t *b, global uint32_t *c,
global uint32_t *vs_invocations, uint count) uint count)
{ {
if (ia_vertices) { global uint32_t *ptr[] = {a, b, c};
*ia_vertices += count;
}
if (vs_invocations) { for (uint i = 0; i < 3; ++i) {
*vs_invocations += count; if (ptr[i]) {
*(ptr[i]) += count;
}
} }
} }
KERNEL(1) KERNEL(1)
libagx_increment_ia(global uint32_t *ia_vertices, libagx_increment_ia(global uint32_t *ia_vertices,
global uint32_t *vs_invocations, constant uint32_t *draw) global uint32_t *ia_primitives,
global uint32_t *vs_invocations, global uint32_t *c_prims,
global uint32_t *c_invs, constant uint32_t *draw,
enum mesa_prim prim)
{ {
increment_ia_counters(ia_vertices, vs_invocations, draw[0] * draw[1]); increment_counters(ia_vertices, vs_invocations, NULL, draw[0] * draw[1]);
uint prims = u_decomposed_prims_for_vertices(prim, draw[0]) * draw[1];
increment_counters(ia_primitives, c_prims, c_invs, prims);
} }
KERNEL(1024) KERNEL(1024)
libagx_increment_ia_restart(global uint32_t *ia_vertices, libagx_increment_ia_restart(global uint32_t *ia_vertices,
global uint32_t *ia_primitives,
global uint32_t *vs_invocations, global uint32_t *vs_invocations,
global uint32_t *c_prims, global uint32_t *c_invs,
constant uint32_t *draw, uint64_t index_buffer, constant uint32_t *draw, uint64_t index_buffer,
uint32_t index_buffer_range_el, uint32_t index_buffer_range_el,
uint32_t restart_index, uint32_t index_size_B) uint32_t restart_index, uint32_t index_size_B,
enum mesa_prim prim)
{ {
uint tid = get_global_id(0); uint tid = get_global_id(0);
unsigned count = draw[0]; unsigned count = draw[0];
@ -368,7 +377,30 @@ libagx_increment_ia_restart(global uint32_t *ia_vertices,
/* Elect a single thread from the workgroup to increment the counters */ /* Elect a single thread from the workgroup to increment the counters */
if (tid == 0) { if (tid == 0) {
increment_ia_counters(ia_vertices, vs_invocations, scratch * draw[1]); increment_counters(ia_vertices, vs_invocations, NULL, scratch * draw[1]);
}
/* TODO: We should vectorize this */
if ((ia_primitives || c_prims || c_invs) && tid == 0) {
uint accum = 0;
int last_restart = -1;
for (uint i = 0; i < count; ++i) {
uint index = load_index(index_buffer, index_buffer_range_el, start + i,
index_size_B);
if (index == restart_index) {
accum +=
u_decomposed_prims_for_vertices(prim, i - last_restart - 1);
last_restart = i;
}
}
{
accum +=
u_decomposed_prims_for_vertices(prim, count - last_restart - 1);
}
increment_counters(ia_primitives, c_prims, c_invs, accum * draw[1]);
} }
} }

View file

@ -39,6 +39,7 @@
#include "util/format/u_formats.h" #include "util/format/u_formats.h"
#include "util/macros.h" #include "util/macros.h"
#include "util/ralloc.h" #include "util/ralloc.h"
#include "util/u_prim.h"
#include "vulkan/vulkan_core.h" #include "vulkan/vulkan_core.h"
#include "layout.h" #include "layout.h"
#include "libagx_dgc.h" #include "libagx_dgc.h"
@ -3108,23 +3109,6 @@ hk_handle_passthrough_gs(struct hk_cmd_buffer *cmd, struct agx_draw draw)
uint32_t xfb_outputs = last_sw->info.xfb_info.output_count; uint32_t xfb_outputs = last_sw->info.xfb_info.output_count;
bool needs_gs = xfb_outputs; bool needs_gs = xfb_outputs;
/* Various pipeline statistics are implemented in the pre-GS shader. TODO:
* This could easily be optimized.
*/
VkQueryPipelineStatisticFlagBits ia_statistics[] = {
VK_QUERY_PIPELINE_STATISTIC_INPUT_ASSEMBLY_PRIMITIVES_BIT,
VK_QUERY_PIPELINE_STATISTIC_CLIPPING_PRIMITIVES_BIT,
VK_QUERY_PIPELINE_STATISTIC_CLIPPING_INVOCATIONS_BIT,
};
bool ia_stats = false;
for (unsigned i = 0; i < ARRAY_SIZE(ia_statistics); ++i) {
ia_stats |= hk_pipeline_stat_addr(cmd, ia_statistics[i]) != 0;
}
needs_gs |= ia_stats;
/* If we already have a matching GS configuration, we're done */ /* If we already have a matching GS configuration, we're done */
if ((gs != NULL) == needs_gs) if ((gs != NULL) == needs_gs)
return; return;
@ -3157,8 +3141,7 @@ hk_handle_passthrough_gs(struct hk_cmd_buffer *cmd, struct agx_draw draw)
} }
struct hk_device *dev = hk_cmd_buffer_device(cmd); struct hk_device *dev = hk_cmd_buffer_device(cmd);
perf_debug(dev, "Binding passthrough GS for%s%s\n", perf_debug(dev, "Binding passthrough GS for%s\n", xfb_outputs ? " XFB" : "");
xfb_outputs ? " XFB" : "", ia_stats ? " statistics" : "");
gs = hk_meta_shader(dev, hk_nir_passthrough_gs, key, key_size); gs = hk_meta_shader(dev, hk_nir_passthrough_gs, key, key_size);
gs->is_passthrough = true; gs->is_passthrough = true;
@ -3337,7 +3320,8 @@ hk_set_view_index(struct hk_cmd_buffer *cmd, uint32_t view_idx)
static void static void
hk_ia_update(struct hk_cmd_buffer *cmd, struct hk_cs *cs, struct agx_draw draw, hk_ia_update(struct hk_cmd_buffer *cmd, struct hk_cs *cs, struct agx_draw draw,
uint64_t ia_vertices, uint64_t vs_invocations) uint64_t ia_vertices, uint64_t ia_prims, uint64_t vs_invocations,
uint64_t c_prims, uint64_t c_inv)
{ {
/* XXX: stream link needed? */ /* XXX: stream link needed? */
struct hk_device *dev = hk_cmd_buffer_device(cmd); struct hk_device *dev = hk_cmd_buffer_device(cmd);
@ -3351,15 +3335,33 @@ hk_ia_update(struct hk_cmd_buffer *cmd, struct hk_cs *cs, struct agx_draw draw,
draw_ptr = hk_pool_upload(cmd, &desc, sizeof(desc), 4); draw_ptr = hk_pool_upload(cmd, &desc, sizeof(desc), 4);
} }
struct vk_dynamic_graphics_state *dyn = &cmd->vk.dynamic_graphics_state;
enum mesa_prim prim = vk_conv_topology(dyn->ia.primitive_topology);
bool geom = cmd->state.gfx.shaders[MESA_SHADER_GEOMETRY];
bool tess = cmd->state.gfx.shaders[MESA_SHADER_TESS_EVAL];
/* Clipper counters depend on geom/tess outputs and must be written with the
* geom/tess output. They are updated as IA counters only when geom/tess is
* not used.
*
* TODO: Tessellation clipper counters not actually wired up, pending CTS.
*/
if (geom || tess) {
c_prims = 0;
c_inv = 0;
}
if (draw.restart) { if (draw.restart) {
uint32_t index_size_B = agx_index_size_to_B(draw.index_size); uint32_t index_size_B = agx_index_size_to_B(draw.index_size);
libagx_increment_ia_restart(cs, agx_1d(1024), ia_vertices, vs_invocations, libagx_increment_ia_restart(
draw_ptr, draw.index_buffer, cs, agx_1d(1024), ia_vertices, ia_prims, vs_invocations, c_prims,
agx_draw_index_range_el(draw), c_inv, draw_ptr, draw.index_buffer, agx_draw_index_range_el(draw),
cmd->state.gfx.index.restart, index_size_B); cmd->state.gfx.index.restart, index_size_B, prim);
} else { } else {
libagx_increment_ia(cs, agx_1d(1), ia_vertices, vs_invocations, draw_ptr); libagx_increment_ia(cs, agx_1d(1), ia_vertices, ia_prims, vs_invocations,
c_prims, c_inv, draw_ptr, prim);
} }
} }
@ -3380,10 +3382,20 @@ hk_draw(struct hk_cmd_buffer *cmd, uint16_t draw_id, struct agx_draw draw_)
uint64_t stat_ia_verts = hk_pipeline_stat_addr( uint64_t stat_ia_verts = hk_pipeline_stat_addr(
cmd, VK_QUERY_PIPELINE_STATISTIC_INPUT_ASSEMBLY_VERTICES_BIT); cmd, VK_QUERY_PIPELINE_STATISTIC_INPUT_ASSEMBLY_VERTICES_BIT);
uint64_t stat_ia_prims = hk_pipeline_stat_addr(
cmd, VK_QUERY_PIPELINE_STATISTIC_INPUT_ASSEMBLY_PRIMITIVES_BIT);
uint64_t stat_vs_inv = hk_pipeline_stat_addr( uint64_t stat_vs_inv = hk_pipeline_stat_addr(
cmd, VK_QUERY_PIPELINE_STATISTIC_VERTEX_SHADER_INVOCATIONS_BIT); cmd, VK_QUERY_PIPELINE_STATISTIC_VERTEX_SHADER_INVOCATIONS_BIT);
bool ia_stats = stat_ia_verts || stat_vs_inv; uint64_t stat_c_inv = hk_pipeline_stat_addr(
cmd, VK_QUERY_PIPELINE_STATISTIC_CLIPPING_INVOCATIONS_BIT);
uint64_t stat_c_prims = hk_pipeline_stat_addr(
cmd, VK_QUERY_PIPELINE_STATISTIC_CLIPPING_PRIMITIVES_BIT);
bool ia_stats = stat_ia_verts || stat_ia_prims || stat_vs_inv ||
stat_c_inv || stat_c_prims;
struct hk_device *dev = hk_cmd_buffer_device(cmd); struct hk_device *dev = hk_cmd_buffer_device(cmd);
hk_foreach_view(cmd) { hk_foreach_view(cmd) {
@ -3423,7 +3435,8 @@ hk_draw(struct hk_cmd_buffer *cmd, uint16_t draw_id, struct agx_draw draw_)
} }
if (ia_stats) { if (ia_stats) {
hk_ia_update(cmd, ccs, draw, stat_ia_verts, stat_vs_inv); hk_ia_update(cmd, ccs, draw, stat_ia_verts, stat_ia_prims, stat_vs_inv,
stat_c_inv, stat_c_prims);
} }
if (tess) { if (tess) {

View file

@ -48,6 +48,7 @@
#include "util/u_inlines.h" #include "util/u_inlines.h"
#include "util/u_math.h" #include "util/u_math.h"
#include "util/u_memory.h" #include "util/u_memory.h"
#include "util/u_prim.h"
#include "util/u_transfer.h" #include "util/u_transfer.h"
#include "util/u_upload_mgr.h" #include "util/u_upload_mgr.h"
#include "agx_bg_eot.h" #include "agx_bg_eot.h"
@ -3890,19 +3891,41 @@ agx_ia_update(struct agx_batch *batch, const struct pipe_draw_info *info,
uint64_t ia_vertices = agx_get_query_address( uint64_t ia_vertices = agx_get_query_address(
batch, ctx->pipeline_statistics[PIPE_STAT_QUERY_IA_VERTICES]); batch, ctx->pipeline_statistics[PIPE_STAT_QUERY_IA_VERTICES]);
uint64_t ia_primitives = agx_get_query_address(
batch, ctx->pipeline_statistics[PIPE_STAT_QUERY_IA_PRIMITIVES]);
uint64_t vs_invocations = agx_get_query_address( uint64_t vs_invocations = agx_get_query_address(
batch, ctx->pipeline_statistics[PIPE_STAT_QUERY_VS_INVOCATIONS]); batch, ctx->pipeline_statistics[PIPE_STAT_QUERY_VS_INVOCATIONS]);
uint64_t c_prims = agx_get_query_address(
batch, ctx->pipeline_statistics[PIPE_STAT_QUERY_C_PRIMITIVES]);
uint64_t c_invs = agx_get_query_address(
batch, ctx->pipeline_statistics[PIPE_STAT_QUERY_C_INVOCATIONS]);
/* With a geometry shader, clipper counters are written by the pre-GS kernel
* since they depend on the output on the geometry shader. Without a geometry
* shader, they are written along with IA.
*
* TODO: Broken tessellation interaction, but nobody cares.
*/
if (ctx->stage[PIPE_SHADER_GEOMETRY].shader) {
c_prims = 0;
c_invs = 0;
}
if (info->primitive_restart) { if (info->primitive_restart) {
perf_debug(dev, "Input assembly counters with primitive restart"); perf_debug(dev, "Input assembly counters with primitive restart");
libagx_increment_ia_restart(batch, agx_1d(1024), ia_vertices, libagx_increment_ia_restart(
vs_invocations, draw, ib, ib_range_el, batch, agx_1d(1024), ia_vertices, ia_primitives, vs_invocations,
info->restart_index, info->index_size); c_prims, c_invs, draw, ib, ib_range_el, info->restart_index,
info->index_size, info->mode);
} else { } else {
perf_debug(dev, "Input assembly counters"); perf_debug(dev, "Input assembly counters");
libagx_increment_ia(batch, agx_1d(1), ia_vertices, vs_invocations, draw); libagx_increment_ia(batch, agx_1d(1), ia_vertices, ia_primitives,
vs_invocations, c_prims, c_invs, draw, info->mode);
} }
} }
@ -4269,14 +4292,6 @@ agx_needs_passthrough_gs(struct agx_context *ctx,
return true; return true;
} }
/* Various pipeline statistics are implemented in the pre-GS shader. */
if (ctx->pipeline_statistics[PIPE_STAT_QUERY_IA_PRIMITIVES] ||
ctx->pipeline_statistics[PIPE_STAT_QUERY_C_PRIMITIVES] ||
ctx->pipeline_statistics[PIPE_STAT_QUERY_C_INVOCATIONS]) {
perf_debug_ctx(ctx, "Using passthrough GS due to pipeline statistics");
return true;
}
/* Transform feedback is layered on geometry shaders, so if transform /* Transform feedback is layered on geometry shaders, so if transform
* feedback is used, we need a GS. * feedback is used, we need a GS.
*/ */
@ -4870,7 +4885,11 @@ agx_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info,
if (ctx->active_queries && !ctx->active_draw_without_restart && if (ctx->active_queries && !ctx->active_draw_without_restart &&
(ctx->pipeline_statistics[PIPE_STAT_QUERY_IA_VERTICES] || (ctx->pipeline_statistics[PIPE_STAT_QUERY_IA_VERTICES] ||
ctx->pipeline_statistics[PIPE_STAT_QUERY_VS_INVOCATIONS])) { ctx->pipeline_statistics[PIPE_STAT_QUERY_IA_PRIMITIVES] ||
ctx->pipeline_statistics[PIPE_STAT_QUERY_VS_INVOCATIONS] ||
((ctx->pipeline_statistics[PIPE_STAT_QUERY_C_PRIMITIVES] ||
ctx->pipeline_statistics[PIPE_STAT_QUERY_C_INVOCATIONS]) &&
!ctx->stage[PIPE_SHADER_GEOMETRY].shader))) {
uint64_t ptr; uint64_t ptr;
if (indirect) { if (indirect) {