diff --git a/src/asahi/lib/agx_nir_lower_gs.c b/src/asahi/lib/agx_nir_lower_gs.c index 82e1302abf2..5d79bf652e8 100644 --- a/src/asahi/lib/agx_nir_lower_gs.c +++ b/src/asahi/lib/agx_nir_lower_gs.c @@ -1077,15 +1077,6 @@ agx_nir_create_pre_gs(struct lower_gs_state *state, const nir_shader *libagx, } } - /* The geometry shader receives a number of input primitives. The driver - * should disable this counter when tessellation is active TODO and count - * patches separately. - */ - add_counter( - b, - nir_load_stat_query_address_agx(b, .base = PIPE_STAT_QUERY_IA_PRIMITIVES), - unrolled_in_prims); - /* The geometry shader is invoked once per primitive (after unrolling * primitive restart). From the spec: * diff --git a/src/asahi/libagx/geometry.cl b/src/asahi/libagx/geometry.cl index 931ad4c3073..46ccb076c87 100644 --- a/src/asahi/libagx/geometry.cl +++ b/src/asahi/libagx/geometry.cl @@ -318,31 +318,40 @@ libagx_load_index_buffer(constant struct agx_ia_state *p, uint id, } static void -increment_ia_counters(global uint32_t *ia_vertices, - global uint32_t *vs_invocations, uint count) +increment_counters(global uint32_t *a, global uint32_t *b, global uint32_t *c, + uint count) { - if (ia_vertices) { - *ia_vertices += count; - } + global uint32_t *ptr[] = {a, b, c}; - if (vs_invocations) { - *vs_invocations += count; + for (uint i = 0; i < 3; ++i) { + if (ptr[i]) { + *(ptr[i]) += count; + } } } KERNEL(1) libagx_increment_ia(global uint32_t *ia_vertices, - global uint32_t *vs_invocations, constant uint32_t *draw) + global uint32_t *ia_primitives, + global uint32_t *vs_invocations, global uint32_t *c_prims, + global uint32_t *c_invs, constant uint32_t *draw, + enum mesa_prim prim) { - increment_ia_counters(ia_vertices, vs_invocations, draw[0] * draw[1]); + increment_counters(ia_vertices, vs_invocations, NULL, draw[0] * draw[1]); + + uint prims = u_decomposed_prims_for_vertices(prim, draw[0]) * draw[1]; + increment_counters(ia_primitives, c_prims, c_invs, prims); } KERNEL(1024) libagx_increment_ia_restart(global uint32_t *ia_vertices, + global uint32_t *ia_primitives, global uint32_t *vs_invocations, + global uint32_t *c_prims, global uint32_t *c_invs, constant uint32_t *draw, uint64_t index_buffer, uint32_t index_buffer_range_el, - uint32_t restart_index, uint32_t index_size_B) + uint32_t restart_index, uint32_t index_size_B, + enum mesa_prim prim) { uint tid = get_global_id(0); unsigned count = draw[0]; @@ -368,7 +377,30 @@ libagx_increment_ia_restart(global uint32_t *ia_vertices, /* Elect a single thread from the workgroup to increment the counters */ if (tid == 0) { - increment_ia_counters(ia_vertices, vs_invocations, scratch * draw[1]); + increment_counters(ia_vertices, vs_invocations, NULL, scratch * draw[1]); + } + + /* TODO: We should vectorize this */ + if ((ia_primitives || c_prims || c_invs) && tid == 0) { + uint accum = 0; + int last_restart = -1; + for (uint i = 0; i < count; ++i) { + uint index = load_index(index_buffer, index_buffer_range_el, start + i, + index_size_B); + + if (index == restart_index) { + accum += + u_decomposed_prims_for_vertices(prim, i - last_restart - 1); + last_restart = i; + } + } + + { + accum += + u_decomposed_prims_for_vertices(prim, count - last_restart - 1); + } + + increment_counters(ia_primitives, c_prims, c_invs, accum * draw[1]); } } diff --git a/src/asahi/vulkan/hk_cmd_draw.c b/src/asahi/vulkan/hk_cmd_draw.c index 9f1e824dd8a..89dc3a7e022 100644 --- a/src/asahi/vulkan/hk_cmd_draw.c +++ b/src/asahi/vulkan/hk_cmd_draw.c @@ -39,6 +39,7 @@ #include "util/format/u_formats.h" #include "util/macros.h" #include "util/ralloc.h" +#include "util/u_prim.h" #include "vulkan/vulkan_core.h" #include "layout.h" #include "libagx_dgc.h" @@ -3108,23 +3109,6 @@ hk_handle_passthrough_gs(struct hk_cmd_buffer *cmd, struct agx_draw draw) uint32_t xfb_outputs = last_sw->info.xfb_info.output_count; bool needs_gs = xfb_outputs; - /* Various pipeline statistics are implemented in the pre-GS shader. TODO: - * This could easily be optimized. - */ - VkQueryPipelineStatisticFlagBits ia_statistics[] = { - VK_QUERY_PIPELINE_STATISTIC_INPUT_ASSEMBLY_PRIMITIVES_BIT, - VK_QUERY_PIPELINE_STATISTIC_CLIPPING_PRIMITIVES_BIT, - VK_QUERY_PIPELINE_STATISTIC_CLIPPING_INVOCATIONS_BIT, - }; - - bool ia_stats = false; - - for (unsigned i = 0; i < ARRAY_SIZE(ia_statistics); ++i) { - ia_stats |= hk_pipeline_stat_addr(cmd, ia_statistics[i]) != 0; - } - - needs_gs |= ia_stats; - /* If we already have a matching GS configuration, we're done */ if ((gs != NULL) == needs_gs) return; @@ -3157,8 +3141,7 @@ hk_handle_passthrough_gs(struct hk_cmd_buffer *cmd, struct agx_draw draw) } struct hk_device *dev = hk_cmd_buffer_device(cmd); - perf_debug(dev, "Binding passthrough GS for%s%s\n", - xfb_outputs ? " XFB" : "", ia_stats ? " statistics" : ""); + perf_debug(dev, "Binding passthrough GS for%s\n", xfb_outputs ? " XFB" : ""); gs = hk_meta_shader(dev, hk_nir_passthrough_gs, key, key_size); gs->is_passthrough = true; @@ -3337,7 +3320,8 @@ hk_set_view_index(struct hk_cmd_buffer *cmd, uint32_t view_idx) static void hk_ia_update(struct hk_cmd_buffer *cmd, struct hk_cs *cs, struct agx_draw draw, - uint64_t ia_vertices, uint64_t vs_invocations) + uint64_t ia_vertices, uint64_t ia_prims, uint64_t vs_invocations, + uint64_t c_prims, uint64_t c_inv) { /* XXX: stream link needed? */ struct hk_device *dev = hk_cmd_buffer_device(cmd); @@ -3351,15 +3335,33 @@ hk_ia_update(struct hk_cmd_buffer *cmd, struct hk_cs *cs, struct agx_draw draw, draw_ptr = hk_pool_upload(cmd, &desc, sizeof(desc), 4); } + struct vk_dynamic_graphics_state *dyn = &cmd->vk.dynamic_graphics_state; + enum mesa_prim prim = vk_conv_topology(dyn->ia.primitive_topology); + + bool geom = cmd->state.gfx.shaders[MESA_SHADER_GEOMETRY]; + bool tess = cmd->state.gfx.shaders[MESA_SHADER_TESS_EVAL]; + + /* Clipper counters depend on geom/tess outputs and must be written with the + * geom/tess output. They are updated as IA counters only when geom/tess is + * not used. + * + * TODO: Tessellation clipper counters not actually wired up, pending CTS. + */ + if (geom || tess) { + c_prims = 0; + c_inv = 0; + } + if (draw.restart) { uint32_t index_size_B = agx_index_size_to_B(draw.index_size); - libagx_increment_ia_restart(cs, agx_1d(1024), ia_vertices, vs_invocations, - draw_ptr, draw.index_buffer, - agx_draw_index_range_el(draw), - cmd->state.gfx.index.restart, index_size_B); + libagx_increment_ia_restart( + cs, agx_1d(1024), ia_vertices, ia_prims, vs_invocations, c_prims, + c_inv, draw_ptr, draw.index_buffer, agx_draw_index_range_el(draw), + cmd->state.gfx.index.restart, index_size_B, prim); } else { - libagx_increment_ia(cs, agx_1d(1), ia_vertices, vs_invocations, draw_ptr); + libagx_increment_ia(cs, agx_1d(1), ia_vertices, ia_prims, vs_invocations, + c_prims, c_inv, draw_ptr, prim); } } @@ -3380,10 +3382,20 @@ hk_draw(struct hk_cmd_buffer *cmd, uint16_t draw_id, struct agx_draw draw_) uint64_t stat_ia_verts = hk_pipeline_stat_addr( cmd, VK_QUERY_PIPELINE_STATISTIC_INPUT_ASSEMBLY_VERTICES_BIT); + uint64_t stat_ia_prims = hk_pipeline_stat_addr( + cmd, VK_QUERY_PIPELINE_STATISTIC_INPUT_ASSEMBLY_PRIMITIVES_BIT); + uint64_t stat_vs_inv = hk_pipeline_stat_addr( cmd, VK_QUERY_PIPELINE_STATISTIC_VERTEX_SHADER_INVOCATIONS_BIT); - bool ia_stats = stat_ia_verts || stat_vs_inv; + uint64_t stat_c_inv = hk_pipeline_stat_addr( + cmd, VK_QUERY_PIPELINE_STATISTIC_CLIPPING_INVOCATIONS_BIT); + + uint64_t stat_c_prims = hk_pipeline_stat_addr( + cmd, VK_QUERY_PIPELINE_STATISTIC_CLIPPING_PRIMITIVES_BIT); + + bool ia_stats = stat_ia_verts || stat_ia_prims || stat_vs_inv || + stat_c_inv || stat_c_prims; struct hk_device *dev = hk_cmd_buffer_device(cmd); hk_foreach_view(cmd) { @@ -3423,7 +3435,8 @@ hk_draw(struct hk_cmd_buffer *cmd, uint16_t draw_id, struct agx_draw draw_) } if (ia_stats) { - hk_ia_update(cmd, ccs, draw, stat_ia_verts, stat_vs_inv); + hk_ia_update(cmd, ccs, draw, stat_ia_verts, stat_ia_prims, stat_vs_inv, + stat_c_inv, stat_c_prims); } if (tess) { diff --git a/src/gallium/drivers/asahi/agx_state.c b/src/gallium/drivers/asahi/agx_state.c index 2815772ec4c..c803cc9d96e 100644 --- a/src/gallium/drivers/asahi/agx_state.c +++ b/src/gallium/drivers/asahi/agx_state.c @@ -48,6 +48,7 @@ #include "util/u_inlines.h" #include "util/u_math.h" #include "util/u_memory.h" +#include "util/u_prim.h" #include "util/u_transfer.h" #include "util/u_upload_mgr.h" #include "agx_bg_eot.h" @@ -3890,19 +3891,41 @@ agx_ia_update(struct agx_batch *batch, const struct pipe_draw_info *info, uint64_t ia_vertices = agx_get_query_address( batch, ctx->pipeline_statistics[PIPE_STAT_QUERY_IA_VERTICES]); + uint64_t ia_primitives = agx_get_query_address( + batch, ctx->pipeline_statistics[PIPE_STAT_QUERY_IA_PRIMITIVES]); + uint64_t vs_invocations = agx_get_query_address( batch, ctx->pipeline_statistics[PIPE_STAT_QUERY_VS_INVOCATIONS]); + uint64_t c_prims = agx_get_query_address( + batch, ctx->pipeline_statistics[PIPE_STAT_QUERY_C_PRIMITIVES]); + + uint64_t c_invs = agx_get_query_address( + batch, ctx->pipeline_statistics[PIPE_STAT_QUERY_C_INVOCATIONS]); + + /* With a geometry shader, clipper counters are written by the pre-GS kernel + * since they depend on the output on the geometry shader. Without a geometry + * shader, they are written along with IA. + * + * TODO: Broken tessellation interaction, but nobody cares. + */ + if (ctx->stage[PIPE_SHADER_GEOMETRY].shader) { + c_prims = 0; + c_invs = 0; + } + if (info->primitive_restart) { perf_debug(dev, "Input assembly counters with primitive restart"); - libagx_increment_ia_restart(batch, agx_1d(1024), ia_vertices, - vs_invocations, draw, ib, ib_range_el, - info->restart_index, info->index_size); + libagx_increment_ia_restart( + batch, agx_1d(1024), ia_vertices, ia_primitives, vs_invocations, + c_prims, c_invs, draw, ib, ib_range_el, info->restart_index, + info->index_size, info->mode); } else { perf_debug(dev, "Input assembly counters"); - libagx_increment_ia(batch, agx_1d(1), ia_vertices, vs_invocations, draw); + libagx_increment_ia(batch, agx_1d(1), ia_vertices, ia_primitives, + vs_invocations, c_prims, c_invs, draw, info->mode); } } @@ -4269,14 +4292,6 @@ agx_needs_passthrough_gs(struct agx_context *ctx, return true; } - /* Various pipeline statistics are implemented in the pre-GS shader. */ - if (ctx->pipeline_statistics[PIPE_STAT_QUERY_IA_PRIMITIVES] || - ctx->pipeline_statistics[PIPE_STAT_QUERY_C_PRIMITIVES] || - ctx->pipeline_statistics[PIPE_STAT_QUERY_C_INVOCATIONS]) { - perf_debug_ctx(ctx, "Using passthrough GS due to pipeline statistics"); - return true; - } - /* Transform feedback is layered on geometry shaders, so if transform * feedback is used, we need a GS. */ @@ -4870,7 +4885,11 @@ agx_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info, if (ctx->active_queries && !ctx->active_draw_without_restart && (ctx->pipeline_statistics[PIPE_STAT_QUERY_IA_VERTICES] || - ctx->pipeline_statistics[PIPE_STAT_QUERY_VS_INVOCATIONS])) { + ctx->pipeline_statistics[PIPE_STAT_QUERY_IA_PRIMITIVES] || + ctx->pipeline_statistics[PIPE_STAT_QUERY_VS_INVOCATIONS] || + ((ctx->pipeline_statistics[PIPE_STAT_QUERY_C_PRIMITIVES] || + ctx->pipeline_statistics[PIPE_STAT_QUERY_C_INVOCATIONS]) && + !ctx->stage[PIPE_SHADER_GEOMETRY].shader))) { uint64_t ptr; if (indirect) {