panfrost: port to common stats framework

This adds full support for executable statistics in panvk.

Signed-off-by: Alyssa Rosenzweig <alyssa@rosenzweig.io>
Acked-by: Boris Brezillon <boris.brezillon@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/33921>
2025-12-20 13:50:11 +01:00 · 2025-02-26 11:16:36 -05:00 · 2025-02-26 11:16:36 -05:00 · 4da7b12000
commit 4da7b12000
parent 7b1f1a107e
7 changed files with 167 additions and 148 deletions
--- a/src/gallium/drivers/panfrost/pan_shader.c
+++ b/src/gallium/drivers/panfrost/pan_shader.c
@ -36,6 +36,7 @@
 #include "nir_serialize.h"
 #include "pan_bo.h"
 #include "pan_context.h"
+#include "shader_enums.h"

 static struct panfrost_uncompiled_shader *
 panfrost_alloc_shader(const nir_shader *nir)
@ -128,7 +129,6 @@ panfrost_shader_compile(struct panfrost_screen *screen, const nir_shader *ir,
      pan_shader_preprocess(s, panfrost_device_gpu_id(dev));

   struct panfrost_compile_inputs inputs = {
-      .debug = dbg,
      .gpu_id = panfrost_device_gpu_id(dev),
   };

@ -201,6 +201,14 @@ panfrost_shader_compile(struct panfrost_screen *screen, const nir_shader *ir,

   screen->vtbl.compile_shader(s, &inputs, &out->binary, &out->info);

+   panfrost_stats_util_debug(dbg, gl_shader_stage_name(s->info.stage),
+                             &out->info.stats);
+
+   if (s->info.stage == MESA_SHADER_VERTEX && out->info.vs.idvs) {
+      panfrost_stats_util_debug(dbg, "MESA_SHADER_POSITION",
+                                &out->info.stats_idvs_varying);
+   }
+
   assert(req_local_mem >= out->info.wls_size);
   out->info.wls_size = req_local_mem;

--- a/src/panfrost/compiler/bifrost_compile.c
+++ b/src/panfrost/compiler/bifrost_compile.c
@ -28,6 +28,7 @@
 #include "compiler/glsl/glsl_to_nir.h"
 #include "compiler/glsl_types.h"
 #include "compiler/nir/nir_builder.h"
+#include "panfrost/util/pan_ir.h"
 #include "util/u_debug.h"

 #include "bifrost/disassemble.h"
@ -4737,10 +4738,10 @@ bi_shader_stage_name(bi_context *ctx)
      return gl_shader_stage_name(ctx->stage);
 }

-static char *
-bi_print_stats(bi_context *ctx, unsigned size)
+static void
+bi_gather_stats(bi_context *ctx, unsigned size, struct bifrost_stats *out)
 {
-   struct bi_stats stats = {0};
+   struct bi_stats counts = {0};

   /* Count instructions, clauses, and tuples. Also attempt to construct
    * normalized execution engine cycle counts, using the following ratio:
@ -4756,57 +4757,46 @@ bi_print_stats(bi_context *ctx, unsigned size)

   bi_foreach_block(ctx, block) {
      bi_foreach_clause_in_block(block, clause) {
-         stats.nr_clauses++;
-         stats.nr_tuples += clause->tuple_count;
+         counts.nr_clauses++;
+         counts.nr_tuples += clause->tuple_count;

         for (unsigned i = 0; i < clause->tuple_count; ++i)
-            bi_count_tuple_stats(clause, &clause->tuples[i], &stats);
+            bi_count_tuple_stats(clause, &clause->tuples[i], &counts);
      }
   }

-   float cycles_arith = ((float)stats.nr_arith) / 24.0;
-   float cycles_texture = ((float)stats.nr_texture) / 2.0;
-   float cycles_varying = ((float)stats.nr_varying) / 16.0;
-   float cycles_ldst = ((float)stats.nr_ldst) / 1.0;
-
-   float cycles_message = MAX3(cycles_texture, cycles_varying, cycles_ldst);
-   float cycles_bound = MAX2(cycles_arith, cycles_message);
-
   /* Thread count and register pressure are traded off only on v7 */
   bool full_threads = (ctx->arch == 7 && ctx->info.work_reg_count <= 32);
-   unsigned nr_threads = full_threads ? 2 : 1;

-   /* Dump stats */
-   char *str = ralloc_asprintf(
-      NULL,
-      "%s shader: "
-      "%u inst, %u tuples, %u clauses, "
-      "%f cycles, %f arith, %f texture, %f vary, %f ldst, "
-      "%u quadwords, %u threads",
-      bi_shader_stage_name(ctx), stats.nr_ins, stats.nr_tuples,
-      stats.nr_clauses, cycles_bound, cycles_arith, cycles_texture,
-      cycles_varying, cycles_ldst, size / 16, nr_threads);
+   *out = (struct bifrost_stats){
+      .instrs = counts.nr_ins,
+      .tuples = counts.nr_tuples,
+      .clauses = counts.nr_clauses,
+      .arith = ((float)counts.nr_arith) / 24.0,
+      .t = ((float)counts.nr_texture) / 2.0,
+      .v = ((float)counts.nr_varying) / 16.0,
+      .ldst = ((float)counts.nr_ldst) / 1.0,
+      .code_size = size,
+      .preloads = ctx->arch == 7 ? bi_count_preload_cost(ctx) : 0,
+      .threads = full_threads ? 2 : 1,
+      .loops = ctx->loop_count,
+      .spills = ctx->spills,
+      .fills = ctx->fills,
+   };

-   if (ctx->arch == 7) {
-      ralloc_asprintf_append(&str, ", %u preloads", bi_count_preload_cost(ctx));
-   }
-
-   ralloc_asprintf_append(&str, ", %u loops, %u:%u spills:fills",
-                          ctx->loop_count, ctx->spills, ctx->fills);
-
-   return str;
+   out->cycles = MAX2(out->arith, MAX3(out->t, out->v, out->ldst));
 }

-static char *
-va_print_stats(bi_context *ctx, unsigned size)
+static void
+va_gather_stats(bi_context *ctx, unsigned size, struct valhall_stats *out)
 {
   unsigned nr_ins = 0;
-   struct va_stats stats = {0};
+   struct va_stats counts = {0};

   /* Count instructions */
   bi_foreach_instr_global(ctx, I) {
      nr_ins++;
-      va_count_instr_stats(I, &stats);
+      va_count_instr_stats(I, &counts);
   }

   /* Mali G78 peak performance:
@ -4818,31 +4808,24 @@ va_print_stats(bi_context *ctx, unsigned size)
    * 4 texture instructions per cycle
    * 1 load/store operation per cycle
    */
-
-   float cycles_fma = ((float)stats.fma) / 64.0;
-   float cycles_cvt = ((float)stats.cvt) / 64.0;
-   float cycles_sfu = ((float)stats.sfu) / 16.0;
-   float cycles_v = ((float)stats.v) / 16.0;
-   float cycles_t = ((float)stats.t) / 4.0;
-   float cycles_ls = ((float)stats.ls) / 1.0;
+   *out = (struct valhall_stats){
+      .instrs = nr_ins,
+      .code_size = size,
+      .fma = ((float)counts.fma) / 64.0,
+      .cvt = ((float)counts.cvt) / 64.0,
+      .sfu = ((float)counts.sfu) / 16.0,
+      .v = ((float)counts.v) / 16.0,
+      .t = ((float)counts.t) / 4.0,
+      .ls = ((float)counts.ls) / 1.0,
+      .threads = (ctx->info.work_reg_count <= 32) ? 2 : 1,
+      .loops = ctx->loop_count,
+      .spills = ctx->spills,
+      .fills = ctx->fills,
+   };

   /* Calculate the bound */
-   float cycles = MAX2(MAX3(cycles_fma, cycles_cvt, cycles_sfu),
-                       MAX3(cycles_v, cycles_t, cycles_ls));
-
-   /* Thread count and register pressure are traded off */
-   unsigned nr_threads = (ctx->info.work_reg_count <= 32) ? 2 : 1;
-
-   /* Dump stats */
-   return ralloc_asprintf(NULL,
-                          "%s shader: "
-                          "%u inst, %f cycles, %f fma, %f cvt, %f sfu, %f v, "
-                          "%f t, %f ls, %u quadwords, %u threads, %u loops, "
-                          "%u:%u spills:fills",
-                          bi_shader_stage_name(ctx), nr_ins, cycles, cycles_fma,
-                          cycles_cvt, cycles_sfu, cycles_v, cycles_t, cycles_ls,
-                          size / 16, nr_threads, ctx->loop_count, ctx->spills,
-                          ctx->fills);
+   out->cycles =
+      MAX2(MAX3(out->fma, out->cvt, out->sfu), MAX3(out->v, out->t, out->ls));
 }

 static int
@ -5748,7 +5731,7 @@ static bi_context *
 bi_compile_variant_nir(nir_shader *nir,
                       const struct panfrost_compile_inputs *inputs,
                       struct util_dynarray *binary, struct bi_shader_info info,
-                       enum bi_idvs_mode idvs)
+                       struct panfrost_stats *stats, enum bi_idvs_mode idvs)
 {
   bi_context *ctx = rzalloc(NULL, bi_context);

@ -5985,23 +5968,17 @@ bi_compile_variant_nir(nir_shader *nir,
      fflush(stdout);
   }

-   if (!skip_internal &&
-       ((bifrost_debug & BIFROST_DBG_SHADERDB) || inputs->debug)) {
-      char *shaderdb;
+   if (ctx->arch >= 9) {
+      stats->isa = PANFROST_STAT_VALHALL;
+      va_gather_stats(ctx, binary->size - offset, &stats->valhall);
+   } else {
+      stats->isa = PANFROST_STAT_BIFROST;
+      bi_gather_stats(ctx, binary->size - offset, &stats->bifrost);
+   }

-      if (ctx->arch >= 9) {
-         shaderdb = va_print_stats(ctx, binary->size - offset);
-      } else {
-         shaderdb = bi_print_stats(ctx, binary->size - offset);
-      }
-
-      if (bifrost_debug & BIFROST_DBG_SHADERDB)
-         fprintf(stderr, "SHADER-DB: %s\n", shaderdb);
-
-      if (inputs->debug)
-         util_debug_message(inputs->debug, SHADER_INFO, "%s", shaderdb);
-
-      ralloc_free(shaderdb);
+   if ((bifrost_debug & BIFROST_DBG_SHADERDB) && !skip_internal) {
+      const char *prefix = bi_shader_stage_name(ctx);
+      panfrost_stats_fprintf(stderr, prefix, stats);
   }

   return ctx;
@ -6034,8 +6011,11 @@ bi_compile_variant(nir_shader *nir,
    * offset, to keep the ABI simple. */
   assert((offset == 0) ^ (idvs == BI_IDVS_VARYING));

+   struct panfrost_stats *stats =
+      idvs == BI_IDVS_VARYING ? &info->stats_idvs_varying : &info->stats;
+
   bi_context *ctx =
-      bi_compile_variant_nir(nir, inputs, binary, local_info, idvs);
+      bi_compile_variant_nir(nir, inputs, binary, local_info, stats, idvs);

   /* A register is preloaded <==> it is live before the first block */
   bi_block *first_block = list_first_entry(&ctx->blocks, bi_block, link);
--- a/src/panfrost/compiler/compiler.h
+++ b/src/panfrost/compiler/compiler.h
@ -30,6 +30,7 @@
 #include "compiler/nir/nir.h"
 #include "panfrost/util/pan_ir.h"
 #include "util/half_float.h"
+#include "util/shader_stats.h"
 #include "util/u_math.h"
 #include "util/u_worklist.h"
 #include "bi_opcodes.h"
@ -834,6 +835,7 @@ bi_block_add_successor(bi_block *block, bi_block *successor)
 struct bi_shader_info {
   struct panfrost_ubo_push *push;
   struct bifrost_shader_info *bifrost;
+   struct panfrost_stats stats;
   unsigned tls_size;
   unsigned work_reg_count;
   unsigned push_offset;
--- a/src/panfrost/midgard/midgard_compile.c
+++ b/src/panfrost/midgard/midgard_compile.c
@ -36,6 +36,7 @@
 #include "compiler/nir/nir_builder.h"
 #include "util/half_float.h"
 #include "util/list.h"
+#include "util/shader_stats.h"
 #include "util/u_debug.h"
 #include "util/u_dynarray.h"
 #include "util/u_math.h"
@ -49,6 +50,7 @@
 #include "midgard_quirks.h"

 #include "disassemble.h"
+#include "shader_enums.h"

 static const struct debug_named_value midgard_debug_options[] = {
   {"shaders", MIDGARD_DBG_SHADERS, "Dump shaders in NIR and MIR"},
@ -3155,51 +3157,34 @@ midgard_compile_shader_nir(nir_shader *nir,
   if (binary->size)
      memset(util_dynarray_grow(binary, uint8_t, 16), 0, 16);

-   if ((midgard_debug & MIDGARD_DBG_SHADERDB || inputs->debug) &&
-       !nir->info.internal) {
-      unsigned nr_bundles = 0, nr_ins = 0;
+   struct midgard_stats stats = {
+      .quadwords = ctx->quadword_count,
+      .registers = info->work_reg_count,
+      .loops = ctx->loop_count,
+      .spills = ctx->spills,
+      .fills = ctx->fills,
+   };

-      /* Count instructions and bundles */
+   /* Count instructions and bundles */
+   mir_foreach_block(ctx, _block) {
+      midgard_block *block = (midgard_block *)_block;
+      stats.bundles +=
+         util_dynarray_num_elements(&block->bundles, midgard_bundle);

-      mir_foreach_block(ctx, _block) {
-         midgard_block *block = (midgard_block *)_block;
-         nr_bundles +=
-            util_dynarray_num_elements(&block->bundles, midgard_bundle);
+      mir_foreach_bundle_in_block(block, bun)
+         stats.inst += bun->instruction_count;
+   }

-         mir_foreach_bundle_in_block(block, bun)
-            nr_ins += bun->instruction_count;
-      }
+   /* Calculate thread count. There are certain cutoffs by
+    * register count for thread count */
+   stats.threads = (stats.registers <= 4) ? 4 : (stats.registers <= 8) ? 2 : 1;

-      /* Calculate thread count. There are certain cutoffs by
-       * register count for thread count */
+   info->stats.isa = PANFROST_STAT_MIDGARD;
+   info->stats.midgard = stats;

-      unsigned nr_registers = info->work_reg_count;
-
-      unsigned nr_threads = (nr_registers <= 4)   ? 4
-                            : (nr_registers <= 8) ? 2
-                                                  : 1;
-
-      char *shaderdb = NULL;
-
-      /* Dump stats */
-
-      asprintf(&shaderdb,
-               "%s shader: "
-               "%u inst, %u bundles, %u quadwords, "
-               "%u registers, %u threads, %u loops, "
-               "%u:%u spills:fills",
-               ctx->inputs->is_blend ? "PAN_SHADER_BLEND"
-                                     : gl_shader_stage_name(ctx->stage),
-               nr_ins, nr_bundles, ctx->quadword_count, nr_registers,
-               nr_threads, ctx->loop_count, ctx->spills, ctx->fills);
-
-      if (midgard_debug & MIDGARD_DBG_SHADERDB)
-         fprintf(stderr, "SHADER-DB: %s\n", shaderdb);
-
-      if (inputs->debug)
-         util_debug_message(inputs->debug, SHADER_INFO, "%s", shaderdb);
-
-      free(shaderdb);
+   if ((midgard_debug & MIDGARD_DBG_SHADERDB) && !nir->info.internal) {
+      const char *prefix = _mesa_shader_stage_to_abbrev(ctx->stage);
+      midgard_stats_fprintf(stderr, prefix, &stats);
   }

   _mesa_hash_table_u64_destroy(ctx->ssa_constants);
--- a/src/panfrost/util/pan_ir.h
+++ b/src/panfrost/util/pan_ir.h
@ -27,6 +27,7 @@
 #include <stdint.h>
 #include "compiler/nir/nir.h"
 #include "util/hash_table.h"
+#include "util/shader_stats.h"
 #include "util/u_dynarray.h"

 /* Indices for named (non-XFB) varyings that are present. These are packed
@ -95,8 +96,6 @@ unsigned pan_lookup_pushed_ubo(struct panfrost_ubo_push *push, unsigned ubo,
                               unsigned offs);

 struct panfrost_compile_inputs {
-   struct util_debug_callback *debug;
-
   unsigned gpu_id;
   bool is_blend, is_blit;
   struct {
@ -196,6 +195,8 @@ struct pan_shader_info {
   unsigned tls_size;
   unsigned wls_size;

+   struct panfrost_stats stats, stats_idvs_varying;
+
   /* Bit mask of preloaded registers */
   uint64_t preload;

--- a/src/panfrost/vulkan/panvk_vX_shader.c
+++ b/src/panfrost/vulkan/panvk_vX_shader.c
@ -41,11 +41,13 @@
 #include "spirv/nir_spirv.h"
 #include "util/memstream.h"
 #include "util/mesa-sha1.h"
+#include "util/shader_stats.h"
 #include "util/u_dynarray.h"
 #include "nir_builder.h"
 #include "nir_conversion_builder.h"
 #include "nir_deref.h"

+#include "shader_enums.h"
 #include "vk_graphics_state.h"
 #include "vk_nir_convert_ycbcr.h"
 #include "vk_shader_module.h"
@ -1416,13 +1418,6 @@ panvk_shader_serialize(struct vk_device *vk_dev,
   return !blob->out_of_memory;
 }

-#define WRITE_STR(field, ...)                                                  \
-   ({                                                                          \
-      memset(field, 0, sizeof(field));                                         \
-      UNUSED int i = snprintf(field, sizeof(field), __VA_ARGS__);              \
-      assert(i > 0 && i < sizeof(field));                                      \
-   })
-
 static VkResult
 panvk_shader_get_executable_properties(
   UNUSED struct vk_device *device, const struct vk_shader *vk_shader,
@ -1438,10 +1433,20 @@ panvk_shader_get_executable_properties(
   {
      props->stages = mesa_to_vk_shader_stage(shader->info.stage);
      props->subgroupSize = 8;
-      WRITE_STR(props->name, "%s",
-                _mesa_shader_stage_to_string(shader->info.stage));
-      WRITE_STR(props->description, "%s shader",
-                _mesa_shader_stage_to_string(shader->info.stage));
+      VK_COPY_STR(props->name,
+                  _mesa_shader_stage_to_string(shader->info.stage));
+      VK_PRINT_STR(props->description, "%s shader",
+                   _mesa_shader_stage_to_string(shader->info.stage));
+   }
+
+   if (shader->info.stage == MESA_SHADER_VERTEX && shader->info.vs.idvs) {
+      vk_outarray_append_typed(VkPipelineExecutablePropertiesKHR, &out, props)
+      {
+         props->stages = mesa_to_vk_shader_stage(shader->info.stage);
+         props->subgroupSize = 8;
+         VK_COPY_STR(props->name, "varying");
+         VK_COPY_STR(props->description, "Varying shader");
+      }
   }

   return vk_outarray_status(&out);
@ -1459,19 +1464,11 @@ panvk_shader_get_executable_statistics(
   VK_OUTARRAY_MAKE_TYPED(VkPipelineExecutableStatisticKHR, out, statistics,
                          statistic_count);

-   assert(executable_index == 0);
-
-   vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat)
-   {
-      WRITE_STR(stat->name, "Code Size");
-      WRITE_STR(stat->description,
-                "Size of the compiled shader binary, in bytes");
-      stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
-      stat->value.u64 = shader->bin_size;
-   }
-
-   /* TODO: more executable statistics (VK_KHR_pipeline_executable_properties) */
+   assert(executable_index == 0 || executable_index == 1);
+   struct panfrost_stats *stats =
+      executable_index ? &shader->info.stats_idvs_varying : &shader->info.stats;

+   vk_add_panfrost_stats(out, stats);
   return vk_outarray_status(&out);
 }

@ -1513,8 +1510,8 @@ panvk_shader_get_executable_internal_representations(
      vk_outarray_append_typed(VkPipelineExecutableInternalRepresentationKHR,
                               &out, ir)
      {
-         WRITE_STR(ir->name, "NIR shader");
-         WRITE_STR(ir->description,
+         VK_COPY_STR(ir->name, "NIR shader");
+         VK_COPY_STR(ir->description,
                   "NIR shader before sending to the back-end compiler");
         if (!write_ir_text(ir, shader->nir_str))
            incomplete_text = true;
@ -1525,8 +1522,8 @@ panvk_shader_get_executable_internal_representations(
      vk_outarray_append_typed(VkPipelineExecutableInternalRepresentationKHR,
                               &out, ir)
      {
-         WRITE_STR(ir->name, "Assembly");
-         WRITE_STR(ir->description, "Final Assembly");
+         VK_COPY_STR(ir->name, "Assembly");
+         VK_COPY_STR(ir->description, "Final Assembly");
         if (!write_ir_text(ir, shader->asm_str))
            incomplete_text = true;
      }
--- a/src/util/shader_stats.xml
+++ b/src/util/shader_stats.xml
@ -13,4 +13,50 @@
      <stat name="Spills">Number of spill (stack store) instructions</stat>
      <stat name="Fills">Number of fill (stack load) instructions</stat>
    </isa>
+
+   <family name="Panfrost">
+     <isa name="Midgard">
+        <stat name="Instructions" display="Inst">Instruction count</stat>
+        <stat name="Bundles">Instruction bundles</stat>
+        <stat name="Registers" type="u16">Register usage in vec4s</stat>
+        <stat name="Threads" more="better" type="u16">Maximum number of threads in flight on a compute unit</stat>
+        <stat name="Quadwords">Binary size in quadwords</stat>
+        <stat name="Loops">Number of hardware loops</stat>
+        <stat name="Spills">Number of spill instructions</stat>
+        <stat name="Fills">Number of fill instructions</stat>
+     </isa>
+
+     <isa name="Bifrost">
+        <stat name="Instructions" display="Instrs">Instruction count</stat>
+        <stat name="Tuples">Tuple count</stat>
+        <stat name="Clauses">Clause count</stat>
+        <stat name="Cycles" type="float">Estimated normalized cycles</stat>
+        <stat name="Arithmetic" display="Arith" type="float">Estimated normalized arithmetic cycles</stat>
+        <stat name="Texture" display="T" type="float">Estimated normalized Texture cycles</stat>
+        <stat name="Load/store" display="LDST" type="float">Estimated normalized Load/Store cycles</stat>
+        <stat name="Varying" display="V" type="float">Estimated normalized Varying cycles</stat>
+        <stat name="Preloads" type="u16">Preload count</stat>
+        <stat name="Threads" more="better" type="u16">Maximum number of threads in flight on a compute unit</stat>
+        <stat name="Code size">Binary size in bytes</stat>
+        <stat name="Loops">Number of hardware loops</stat>
+        <stat name="Spills">Number of spill instructions</stat>
+        <stat name="Fills">Number of fill instructions</stat>
+     </isa>
+
+     <isa name="Valhall">
+        <stat name="Instructions" display="Instrs">Instruction count</stat>
+        <stat name="Cycles" type="float">Estimated normalized cycles</stat>
+        <stat name="FMA" type="float">Estimated normalized FMA (Fused Multiply-Add) cycles</stat>
+        <stat name="CVT" type="float">Estimated normalized CVT (ConVerT) cycles</stat>
+        <stat name="SFU" type="float">Estimated normalized SFU (Special Function Unit) cycles</stat>
+        <stat name="Varying" display="V" type="float">Estimated normalized Varying cycles</stat>
+        <stat name="Texture" display="T" type="float">Estimated normalized Texture cycles</stat>
+        <stat name="Load/store" display="LS" type="float">Estimated normalized Load/Store cycles</stat>
+        <stat name="Code size">Binary size in bytes</stat>
+        <stat name="Threads" more="better" type="u16">Maximum number of threads in flight on a compute unit</stat>
+        <stat name="Loops">Number of hardware loops</stat>
+        <stat name="Spills">Number of spill instructions</stat>
+        <stat name="Fills">Number of fill instructions</stat>
+     </isa>
+   </family>
 </shaderdb>