From 325dfd809ab94012beaa8df80b2e4f1d0cf6d8c3 Mon Sep 17 00:00:00 2001
From: Rhys Perry <pendingchaos02@gmail.com>
Date: Tue, 17 Jun 2025 15:07:04 +0100
Subject: [PATCH] radv,aco: switch to shader statistics framework

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Gitlab: https://gitlab.freedesktop.org/mesa/mesa/-/issues/12756
Reviewed-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/35583>
---
 src/amd/compiler/aco_interface.cpp           | 55 ++----------
 src/amd/compiler/aco_interface.h             | 18 ++--
 src/amd/compiler/aco_ir.h                    |  3 +-
 src/amd/compiler/aco_lower_to_hw_instr.cpp   |  8 +-
 src/amd/compiler/aco_statistics.cpp          | 28 +++---
 src/amd/vulkan/radv_pipeline.c               | 93 +++++++++-----------
 src/amd/vulkan/radv_pipeline_cache.c         |  2 +-
 src/amd/vulkan/radv_shader.c                 |  7 +-
 src/amd/vulkan/radv_shader.h                 |  3 +-
 src/gallium/drivers/radeonsi/si_shader_aco.c |  2 +-
 src/util/shader_stats.xml                    | 29 ++++++
 11 files changed, 110 insertions(+), 138 deletions(-)
diff --git a/src/amd/compiler/aco_interface.cpp b/src/amd/compiler/aco_interface.cpp
index a08f6560578..a80a8189b65 100644
--- a/src/amd/compiler/aco_interface.cpp
+++ b/src/amd/compiler/aco_interface.cpp
@@ -20,36 +20,6 @@ using namespace aco;
 
 namespace {
 
-static const std::array<aco_compiler_statistic_info, aco_num_statistics> statistic_infos = []()
-{
-   std::array<aco_compiler_statistic_info, aco_num_statistics> ret{};
-   ret[aco_statistic_hash] =
-      aco_compiler_statistic_info{"Hash", "CRC32 hash of code and constant data"};
-   ret[aco_statistic_instructions] =
-      aco_compiler_statistic_info{"Instructions", "Instruction count"};
-   ret[aco_statistic_copies] =
-      aco_compiler_statistic_info{"Copies", "Copy instructions created for pseudo-instructions"};
-   ret[aco_statistic_branches] = aco_compiler_statistic_info{"Branches", "Branch instructions"};
-   ret[aco_statistic_latency] =
-      aco_compiler_statistic_info{"Latency", "Issue cycles plus stall cycles"};
-   ret[aco_statistic_inv_throughput] = aco_compiler_statistic_info{
-      "Inverse Throughput", "Estimated busy cycles to execute one wave"};
-   ret[aco_statistic_vmem_clauses] = aco_compiler_statistic_info{
-      "VMEM Clause", "Number of VMEM clauses (includes 1-sized clauses)"};
-   ret[aco_statistic_smem_clauses] = aco_compiler_statistic_info{
-      "SMEM Clause", "Number of SMEM clauses (includes 1-sized clauses)"};
-   ret[aco_statistic_sgpr_presched] =
-      aco_compiler_statistic_info{"Pre-Sched SGPRs", "SGPR usage before scheduling"};
-   ret[aco_statistic_vgpr_presched] =
-      aco_compiler_statistic_info{"Pre-Sched VGPRs", "VGPR usage before scheduling"};
-   ret[aco_statistic_valu] = aco_compiler_statistic_info{"VALU", "Number of VALU instructions"};
-   ret[aco_statistic_salu] = aco_compiler_statistic_info{"SALU", "Number of SALU instructions"};
-   ret[aco_statistic_vmem] = aco_compiler_statistic_info{"VMEM", "Number of VMEM instructions"};
-   ret[aco_statistic_smem] = aco_compiler_statistic_info{"SMEM", "Number of SMEM instructions"};
-   ret[aco_statistic_vopd] = aco_compiler_statistic_info{"VOPD", "Number of VOPD instructions"};
-   return ret;
-}();
-
 static void
 validate(Program* program)
 {
@@ -222,8 +192,7 @@ aco_compile_shader_part(const struct aco_compiler_options* options,
    std::unique_ptr<Program> program{new Program};
 
    program->collect_statistics = options->record_stats;
-   if (program->collect_statistics)
-      memset(program->statistics, 0, sizeof(program->statistics));
+   memset(&program->statistics, 0, sizeof(program->statistics));
 
    program->debug.func = options->debug.func;
    program->debug.private_data = options->debug.private_data;
@@ -262,8 +231,7 @@ aco_compile_shader(const struct aco_compiler_options* options, const struct aco_
    std::unique_ptr<Program> program{new Program};
 
    program->collect_statistics = options->record_stats;
-   if (program->collect_statistics)
-      memset(program->statistics, 0, sizeof(program->statistics));
+   memset(&program->statistics, 0, sizeof(program->statistics));
 
    program->debug.func = options->debug.func;
    program->debug.private_data = options->debug.private_data;
@@ -289,14 +257,9 @@ aco_compile_shader(const struct aco_compiler_options* options, const struct aco_
    if (options->record_asm)
       disasm = get_disasm_string(program.get(), code, exec_size);
 
-   size_t stats_size = 0;
-   if (program->collect_statistics)
-      stats_size = aco_num_statistics * sizeof(uint32_t);
-
    (*build_binary)(binary, &config, llvm_ir.c_str(), llvm_ir.size(), disasm.c_str(), disasm.size(),
-                   program->statistics, stats_size, exec_size, code.data(), code.size(),
-                   symbols.data(), symbols.size(), program->debug_info.data(),
-                   program->debug_info.size());
+                   &program->statistics, exec_size, code.data(), code.size(), symbols.data(),
+                   symbols.size(), program->debug_info.data(), program->debug_info.size());
 }
 
 void
@@ -337,8 +300,8 @@ aco_compile_rt_prolog(const struct aco_compiler_options* options,
    if (options->record_asm)
       disasm = get_disasm_string(program.get(), code, exec_size);
 
-   (*build_prolog)(binary, &config, NULL, 0, disasm.c_str(), disasm.size(), program->statistics, 0,
-                   exec_size, code.data(), code.size(), NULL, 0, NULL, 0);
+   (*build_prolog)(binary, &config, NULL, 0, disasm.c_str(), disasm.size(), NULL, exec_size,
+                   code.data(), code.size(), NULL, 0, NULL, 0);
 }
 
 void
@@ -437,8 +400,8 @@ aco_compile_trap_handler(const struct aco_compiler_options* options,
    if (options->record_asm)
       disasm = get_disasm_string(program.get(), code, exec_size);
 
-   (*build_binary)(binary, &config, NULL, 0, disasm.c_str(), disasm.size(), program->statistics, 0,
-                   exec_size, code.data(), code.size(), NULL, 0, NULL, 0);
+   (*build_binary)(binary, &config, NULL, 0, disasm.c_str(), disasm.size(), NULL, exec_size,
+                   code.data(), code.size(), NULL, 0, NULL, 0);
 }
 
 uint64_t
@@ -514,8 +477,6 @@ aco_nir_op_supports_packed_math_16bit(const nir_alu_instr* alu)
    }
 }
 
-const aco_compiler_statistic_info* aco_statistic_infos = statistic_infos.data();
-
 void
 aco_print_asm(const struct radeon_info *info, unsigned wave_size,
               uint32_t *binary, unsigned num_dw)
diff --git a/src/amd/compiler/aco_interface.h b/src/amd/compiler/aco_interface.h
index db858b75848..30bd8f1b67f 100644
--- a/src/amd/compiler/aco_interface.h
+++ b/src/amd/compiler/aco_interface.h
@@ -8,10 +8,11 @@
 #define ACO_INTERFACE_H
 
 #include "aco_shader_info.h"
-#include "ac_shader_debug_info.h"
 
 #include "nir_defines.h"
+#include "util/shader_stats.h"
 
+#include "ac_shader_debug_info.h"
 #include "amd_family.h"
 #ifdef __cplusplus
 extern "C" {
@@ -26,24 +27,17 @@ struct aco_vs_prolog_info;
 struct aco_ps_epilog_info;
 struct radeon_info;
 
-struct aco_compiler_statistic_info {
-   char name[32];
-   char desc[64];
-};
-
 typedef void(aco_callback)(void** priv_ptr, const struct ac_shader_config* config,
                            const char* llvm_ir_str, unsigned llvm_ir_size, const char* disasm_str,
-                           unsigned disasm_size, uint32_t* statistics, uint32_t stats_size,
-                           uint32_t exec_size, const uint32_t* code, uint32_t code_dw,
-                           const struct aco_symbol* symbols, unsigned num_symbols,
-                           const struct ac_shader_debug_info* debug_info, unsigned debug_info_count);
+                           unsigned disasm_size, struct amd_stats* stats, uint32_t exec_size,
+                           const uint32_t* code, uint32_t code_dw, const struct aco_symbol* symbols,
+                           unsigned num_symbols, const struct ac_shader_debug_info* debug_info,
+                           unsigned debug_info_count);
 
 typedef void(aco_shader_part_callback)(void** priv_ptr, uint32_t num_sgprs, uint32_t num_vgprs,
                                        const uint32_t* code, uint32_t code_size,
                                        const char* disasm_str, uint32_t disasm_size);
 
-extern const struct aco_compiler_statistic_info* aco_statistic_infos;
-
 void aco_compile_shader(const struct aco_compiler_options* options,
                         const struct aco_shader_info* info, unsigned shader_count,
                         struct nir_shader* const* shaders, const struct ac_shader_args* args,
diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h
index 9399d4ed9ce..9f86216764a 100644
--- a/src/amd/compiler/aco_ir.h
+++ b/src/amd/compiler/aco_ir.h
@@ -12,6 +12,7 @@
 #include "aco_util.h"
 
 #include "util/compiler.h"
+#include "util/shader_stats.h"
 
 #include "ac_binary.h"
 #include "ac_hw_stage.h"
@@ -2162,7 +2163,7 @@ public:
    CompilationProgress progress;
 
    bool collect_statistics = false;
-   uint32_t statistics[aco_num_statistics];
+   amd_stats statistics;
 
    float_mode next_fp_mode;
    unsigned next_loop_depth = 0;
diff --git a/src/amd/compiler/aco_lower_to_hw_instr.cpp b/src/amd/compiler/aco_lower_to_hw_instr.cpp
index c9a9fa9eb08..03f4a272d83 100644
--- a/src/amd/compiler/aco_lower_to_hw_instr.cpp
+++ b/src/amd/compiler/aco_lower_to_hw_instr.cpp
@@ -1806,8 +1806,7 @@ handle_operands(std::map<PhysReg, copy_operation>& copy_map, lower_context* ctx,
    bool skip_partial_copies = true;
    for (auto it = copy_map.begin();;) {
       if (copy_map.empty()) {
-         ctx->program->statistics[aco_statistic_copies] +=
-            ctx->instructions.size() - num_instructions_before;
+         ctx->program->statistics.copies += ctx->instructions.size() - num_instructions_before;
          return;
       }
       if (it == copy_map.end()) {
@@ -2085,8 +2084,7 @@ handle_operands(std::map<PhysReg, copy_operation>& copy_map, lower_context* ctx,
             break;
       }
    }
-   ctx->program->statistics[aco_statistic_copies] +=
-      ctx->instructions.size() - num_instructions_before;
+   ctx->program->statistics.copies += ctx->instructions.size() - num_instructions_before;
 }
 
 void
@@ -2122,7 +2120,7 @@ handle_operands_linear_vgpr(std::map<PhysReg, copy_operation>& copy_map, lower_c
       pi->scratch_sgpr = scratch_sgpr;
    }
 
-   ctx->program->statistics[aco_statistic_copies] += scratch_sgpr == scc ? 2 : 4;
+   ctx->program->statistics.copies += scratch_sgpr == scc ? 2 : 4;
 }
 
 void
diff --git a/src/amd/compiler/aco_statistics.cpp b/src/amd/compiler/aco_statistics.cpp
index 5700220dcb1..aa4e9c4c2bf 100644
--- a/src/amd/compiler/aco_statistics.cpp
+++ b/src/amd/compiler/aco_statistics.cpp
@@ -470,8 +470,8 @@ collect_presched_stats(Program* program)
    RegisterDemand presched_demand;
    for (Block& block : program->blocks)
       presched_demand.update(block.register_demand);
-   program->statistics[aco_statistic_sgpr_presched] = presched_demand.sgpr;
-   program->statistics[aco_statistic_vgpr_presched] = presched_demand.vgpr;
+   program->statistics.presgprs = presched_demand.sgpr;
+   program->statistics.prevgprs = presched_demand.vgpr;
 }
 
 /* instructions/branches/vmem_clauses/smem_clauses/cycles */
@@ -482,31 +482,31 @@ collect_preasm_stats(Program* program)
       std::set<Instruction*> vmem_clause;
       std::set<Instruction*> smem_clause;
 
-      program->statistics[aco_statistic_instructions] += block.instructions.size();
+      program->statistics.instrs += block.instructions.size();
 
       for (aco_ptr<Instruction>& instr : block.instructions) {
          const bool is_branch =
             instr->isSOPP() && instr_info.classes[(int)instr->opcode] == instr_class::branch;
          if (is_branch)
-            program->statistics[aco_statistic_branches]++;
+            program->statistics.branches++;
 
          if (instr->isVALU() || instr->isVINTRP())
-            program->statistics[aco_statistic_valu]++;
+            program->statistics.valu++;
          if (instr->isSALU() && !instr->isSOPP() &&
              instr_info.classes[(int)instr->opcode] != instr_class::waitcnt)
-            program->statistics[aco_statistic_salu]++;
+            program->statistics.salu++;
          if (instr->isVOPD())
-            program->statistics[aco_statistic_vopd]++;
+            program->statistics.vopd++;
 
          if ((instr->isVMEM() || instr->isScratch() || instr->isGlobal()) &&
              !instr->operands.empty()) {
             if (std::none_of(vmem_clause.begin(), vmem_clause.end(),
                              [&](Instruction* other)
                              { return should_form_clause(instr.get(), other); }))
-               program->statistics[aco_statistic_vmem_clauses]++;
+               program->statistics.vclause++;
             vmem_clause.insert(instr.get());
 
-            program->statistics[aco_statistic_vmem]++;
+            program->statistics.vmem++;
          } else {
             vmem_clause.clear();
          }
@@ -515,10 +515,10 @@ collect_preasm_stats(Program* program)
             if (std::none_of(smem_clause.begin(), smem_clause.end(),
                              [&](Instruction* other)
                              { return should_form_clause(instr.get(), other); }))
-               program->statistics[aco_statistic_smem_clauses]++;
+               program->statistics.sclause++;
             smem_clause.insert(instr.get());
 
-            program->statistics[aco_statistic_smem]++;
+            program->statistics.smem++;
          } else {
             smem_clause.clear();
          }
@@ -598,8 +598,8 @@ collect_preasm_stats(Program* program)
          program->workgroup_size / (double)align(program->workgroup_size, program->wave_size);
    wave64_per_cycle *= max_utilization;
 
-   program->statistics[aco_statistic_latency] = round(latency);
-   program->statistics[aco_statistic_inv_throughput] = round(1.0 / wave64_per_cycle);
+   program->statistics.latency = round(latency);
+   program->statistics.invthroughput = round(1.0 / wave64_per_cycle);
 
    if (debug_flags & DEBUG_PERF_INFO) {
       aco_print_program(program, stderr, print_no_ssa | print_perf_info);
@@ -624,7 +624,7 @@ collect_preasm_stats(Program* program)
 void
 collect_postasm_stats(Program* program, const std::vector<uint32_t>& code)
 {
-   program->statistics[aco_statistic_hash] = util_hash_crc32(code.data(), code.size() * 4);
+   program->statistics.hash = util_hash_crc32(code.data(), code.size() * 4);
 }
 
 Instruction_cycle_info
diff --git a/src/amd/vulkan/radv_pipeline.c b/src/amd/vulkan/radv_pipeline.c
index 72d7051c6eb..7534e039fc8 100644
--- a/src/amd/vulkan/radv_pipeline.c
+++ b/src/amd/vulkan/radv_pipeline.c
@@ -814,44 +814,41 @@ radv_GetPipelineExecutableStatisticsKHR(VkDevice _device, const VkPipelineExecut
 
    VK_OUTARRAY_MAKE_TYPED(VkPipelineExecutableStatisticKHR, out, pStatistics, pStatisticCount);
 
-   vk_add_exec_statistic_u64(out, "Driver pipeline hash", "Driver pipeline hash used by RGP", pipeline->pipeline_hash);
-   vk_add_exec_statistic_u64(out, "SGPRs", "Number of SGPR registers allocated per subgroup", shader->config.num_sgprs);
-   vk_add_exec_statistic_u64(out, "VGPRs", "Number of VGPR registers allocated per subgroup", shader->config.num_vgprs);
-   vk_add_exec_statistic_u64(out, "Spilled SGPRs", "Number of SGPR registers spilled per subgroup",
-                             shader->config.spilled_sgprs);
-   vk_add_exec_statistic_u64(out, "Spilled VGPRs", "Number of VGPR registers spilled per subgroup",
-                             shader->config.spilled_vgprs);
-   vk_add_exec_statistic_u64(out, "Code size", "Code size in bytes", shader->exec_size);
-   vk_add_exec_statistic_u64(out, "LDS size", "LDS size in bytes per workgroup",
-                             shader->config.lds_size * lds_increment);
-   vk_add_exec_statistic_u64(out, "Scratch size", "Private memory in bytes per subgroup",
-                             shader->config.scratch_bytes_per_wave);
-   vk_add_exec_statistic_u64(out, "Subgroups per SIMD", "The maximum number of subgroups in flight on a SIMD unit",
-                             shader->max_waves);
+   struct amd_stats stats = {};
+   if (shader->statistics)
+      stats = *shader->statistics;
+   stats.driverhash = pipeline->pipeline_hash;
+   stats.sgprs = shader->config.num_sgprs;
+   stats.vgprs = shader->config.num_vgprs;
+   stats.spillsgprs = shader->config.spilled_sgprs;
+   stats.spillvgprs = shader->config.spilled_vgprs;
+   stats.codesize = shader->exec_size;
+   stats.lds = shader->config.lds_size * lds_increment;
+   stats.scratch = shader->config.scratch_bytes_per_wave;
+   stats.maxwaves = shader->max_waves;
 
-   uint64_t inputs = 0;
    switch (stage) {
    case MESA_SHADER_VERTEX:
       if (gfx_level <= GFX8 || (!shader->info.vs.as_es && !shader->info.vs.as_ls)) {
          /* VS inputs when VS is a separate stage */
-         inputs += util_bitcount(shader->info.vs.input_slot_usage_mask);
+         stats.inputs += util_bitcount(shader->info.vs.input_slot_usage_mask);
       }
       break;
 
    case MESA_SHADER_TESS_CTRL:
       if (gfx_level >= GFX9) {
          /* VS inputs when pipeline has tess */
-         inputs += util_bitcount(shader->info.vs.input_slot_usage_mask);
+         stats.inputs += util_bitcount(shader->info.vs.input_slot_usage_mask);
       }
 
       /* VS -> TCS inputs */
-      inputs += shader->info.tcs.num_linked_inputs;
+      stats.inputs += shader->info.tcs.num_linked_inputs;
       break;
 
    case MESA_SHADER_TESS_EVAL:
       if (gfx_level <= GFX8 || !shader->info.tes.as_es) {
          /* TCS -> TES inputs when TES is a separate stage */
-         inputs += shader->info.tes.num_linked_inputs + shader->info.tes.num_linked_patch_inputs;
+         stats.inputs += shader->info.tes.num_linked_inputs + shader->info.tes.num_linked_patch_inputs;
       }
       break;
 
@@ -863,60 +860,57 @@ radv_GetPipelineExecutableStatisticsKHR(VkDevice _device, const VkPipelineExecut
       if (gfx_level >= GFX9) {
          if (shader->info.gs.es_type == MESA_SHADER_VERTEX) {
             /* VS inputs when pipeline has GS but no tess */
-            inputs += util_bitcount(shader->info.vs.input_slot_usage_mask);
+            stats.inputs += util_bitcount(shader->info.vs.input_slot_usage_mask);
          } else if (shader->info.gs.es_type == MESA_SHADER_TESS_EVAL) {
             /* TCS -> TES inputs when pipeline has GS */
-            inputs += shader->info.tes.num_linked_inputs + shader->info.tes.num_linked_patch_inputs;
+            stats.inputs += shader->info.tes.num_linked_inputs + shader->info.tes.num_linked_patch_inputs;
          }
       }
 
       /* VS -> GS or TES -> GS inputs */
-      inputs += shader->info.gs.num_linked_inputs;
+      stats.inputs += shader->info.gs.num_linked_inputs;
       break;
 
    case MESA_SHADER_FRAGMENT:
-      inputs += shader->info.ps.num_inputs;
+      stats.inputs += shader->info.ps.num_inputs;
       break;
 
    default:
       /* Other stages don't have IO or we are not interested in them. */
       break;
    }
-   vk_add_exec_statistic_u64(out, "Combined inputs",
-                             "Number of input slots reserved for the shader (including merged stages)", inputs);
 
-   uint64_t outputs = 0;
    switch (stage) {
    case MESA_SHADER_VERTEX:
       if (!shader->info.vs.as_ls && !shader->info.vs.as_es) {
          /* VS -> FS outputs. */
-         outputs += shader->info.outinfo.pos_exports + shader->info.outinfo.param_exports +
-                    shader->info.outinfo.prim_param_exports;
+         stats.outputs += shader->info.outinfo.pos_exports + shader->info.outinfo.param_exports +
+                          shader->info.outinfo.prim_param_exports;
       } else if (gfx_level <= GFX8) {
          /* VS -> TCS, VS -> GS outputs on GFX6-8 */
-         outputs += shader->info.vs.num_linked_outputs;
+         stats.outputs += shader->info.vs.num_linked_outputs;
       }
       break;
 
    case MESA_SHADER_TESS_CTRL:
       if (gfx_level >= GFX9) {
          /* VS -> TCS outputs on GFX9+ */
-         outputs += shader->info.vs.num_linked_outputs;
+         stats.outputs += shader->info.vs.num_linked_outputs;
       }
 
       /* TCS -> TES outputs */
-      outputs += shader->info.tcs.io_info.highest_remapped_vram_output +
-                 shader->info.tcs.io_info.highest_remapped_vram_patch_output;
+      stats.outputs += shader->info.tcs.io_info.highest_remapped_vram_output +
+                       shader->info.tcs.io_info.highest_remapped_vram_patch_output;
       break;
 
    case MESA_SHADER_TESS_EVAL:
       if (!shader->info.tes.as_es) {
          /* TES -> FS outputs */
-         outputs += shader->info.outinfo.pos_exports + shader->info.outinfo.param_exports +
-                    shader->info.outinfo.prim_param_exports;
+         stats.outputs += shader->info.outinfo.pos_exports + shader->info.outinfo.param_exports +
+                          shader->info.outinfo.prim_param_exports;
       } else if (gfx_level <= GFX8) {
          /* TES -> GS outputs on GFX6-8 */
-         outputs += shader->info.tes.num_linked_outputs;
+         stats.outputs += shader->info.tes.num_linked_outputs;
       }
       break;
 
@@ -928,48 +922,41 @@ radv_GetPipelineExecutableStatisticsKHR(VkDevice _device, const VkPipelineExecut
       if (gfx_level >= GFX9) {
          if (shader->info.gs.es_type == MESA_SHADER_VERTEX) {
             /* VS -> GS outputs on GFX9+ */
-            outputs += shader->info.vs.num_linked_outputs;
+            stats.outputs += shader->info.vs.num_linked_outputs;
          } else if (shader->info.gs.es_type == MESA_SHADER_TESS_EVAL) {
             /* TES -> GS outputs on GFX9+ */
-            outputs += shader->info.tes.num_linked_outputs;
+            stats.outputs += shader->info.tes.num_linked_outputs;
          }
       }
 
       if (shader->info.is_ngg) {
          /* GS -> FS outputs (GFX10+ NGG) */
-         outputs += shader->info.outinfo.pos_exports + shader->info.outinfo.param_exports +
-                    shader->info.outinfo.prim_param_exports;
+         stats.outputs += shader->info.outinfo.pos_exports + shader->info.outinfo.param_exports +
+                          shader->info.outinfo.prim_param_exports;
       } else {
          /* GS -> FS outputs (GFX6-10.3 legacy) */
-         outputs += shader->info.gs.gsvs_vertex_size / 16;
+         stats.outputs += shader->info.gs.gsvs_vertex_size / 16;
       }
       break;
 
    case MESA_SHADER_MESH:
       /* MS -> FS outputs */
-      outputs += shader->info.outinfo.pos_exports + shader->info.outinfo.param_exports +
-                 shader->info.outinfo.prim_param_exports;
+      stats.outputs += shader->info.outinfo.pos_exports + shader->info.outinfo.param_exports +
+                       shader->info.outinfo.prim_param_exports;
       break;
 
    case MESA_SHADER_FRAGMENT:
-      outputs += DIV_ROUND_UP(util_bitcount(shader->info.ps.colors_written), 4) + !!shader->info.ps.writes_z +
-                 !!shader->info.ps.writes_stencil + !!shader->info.ps.writes_sample_mask +
-                 !!shader->info.ps.writes_mrt0_alpha;
+      stats.outputs += DIV_ROUND_UP(util_bitcount(shader->info.ps.colors_written), 4) + !!shader->info.ps.writes_z +
+                       !!shader->info.ps.writes_stencil + !!shader->info.ps.writes_sample_mask +
+                       !!shader->info.ps.writes_mrt0_alpha;
       break;
 
    default:
       /* Other stages don't have IO or we are not interested in them. */
       break;
    }
-   vk_add_exec_statistic_u64(out, "Combined outputs",
-                             "Number of output slots reserved for the shader (including merged stages)", outputs);
 
-   if (shader->statistics) {
-      for (unsigned i = 0; i < aco_num_statistics; i++) {
-         const struct aco_compiler_statistic_info *info = &aco_statistic_infos[i];
-         vk_add_exec_statistic_u64(out, info->name, info->desc, shader->statistics[i]);
-      }
-   }
+   vk_add_amd_stats(out, &stats);
 
    return vk_outarray_status(&out);
 }
diff --git a/src/amd/vulkan/radv_pipeline_cache.c b/src/amd/vulkan/radv_pipeline_cache.c
index 8f0f511c6c9..397b0c07c5b 100644
--- a/src/amd/vulkan/radv_pipeline_cache.c
+++ b/src/amd/vulkan/radv_pipeline_cache.c
@@ -95,7 +95,7 @@ radv_shader_cache_deserialize(struct vk_pipeline_cache *cache, const void *key_d
 void
 radv_shader_serialize(struct radv_shader *shader, struct blob *blob)
 {
-   size_t stats_size = shader->statistics ? aco_num_statistics * sizeof(uint32_t) : 0;
+   size_t stats_size = shader->statistics ? sizeof(struct amd_stats) : 0;
    size_t code_size = shader->code_size;
    uint32_t total_size = sizeof(struct radv_shader_binary_legacy) + code_size + stats_size;
 
diff --git a/src/amd/vulkan/radv_shader.c b/src/amd/vulkan/radv_shader.c
index 17caa1f2f4c..9872ee39e6c 100644
--- a/src/amd/vulkan/radv_shader.c
+++ b/src/amd/vulkan/radv_shader.c
@@ -2984,14 +2984,15 @@ radv_dump_nir_shaders(const struct radv_instance *instance, struct nir_shader *c
 
 static void
 radv_aco_build_shader_binary(void **bin, const struct ac_shader_config *config, const char *llvm_ir_str,
-                             unsigned llvm_ir_size, const char *disasm_str, unsigned disasm_size, uint32_t *statistics,
-                             uint32_t stats_size, uint32_t exec_size, const uint32_t *code, uint32_t code_dw,
+                             unsigned llvm_ir_size, const char *disasm_str, unsigned disasm_size,
+                             struct amd_stats *statistics, uint32_t exec_size, const uint32_t *code, uint32_t code_dw,
                              const struct aco_symbol *symbols, unsigned num_symbols,
                              const struct ac_shader_debug_info *debug_info, unsigned debug_info_count)
 {
    struct radv_shader_binary **binary = (struct radv_shader_binary **)bin;
 
    uint32_t debug_info_size = debug_info_count * sizeof(struct ac_shader_debug_info);
+   uint32_t stats_size = statistics ? sizeof(struct amd_stats) : 0;
 
    size_t size = llvm_ir_size;
 
@@ -3019,7 +3020,7 @@ radv_aco_build_shader_binary(void **bin, const struct ac_shader_config *config,
    struct radv_shader_binary_layout layout = radv_shader_binary_get_layout(legacy_binary);
 
    if (stats_size)
-      memcpy(layout.stats, statistics, stats_size);
+      amd_stats_serialize(layout.stats, statistics);
 
    memcpy(layout.code, code, code_dw * sizeof(uint32_t));
 
diff --git a/src/amd/vulkan/radv_shader.h b/src/amd/vulkan/radv_shader.h
index 4905c72c63d..d74bb9d60b7 100644
--- a/src/amd/vulkan/radv_shader.h
+++ b/src/amd/vulkan/radv_shader.h
@@ -12,6 +12,7 @@
 #define RADV_SHADER_H
 
 #include "util/mesa-blake3.h"
+#include "util/shader_stats.h"
 #include "util/u_math.h"
 #include "vulkan/vulkan.h"
 #include "ac_binary.h"
@@ -460,7 +461,7 @@ struct radv_shader {
    char *nir_string;
    char *disasm_string;
    char *ir_string;
-   uint32_t *statistics;
+   struct amd_stats *statistics;
    struct ac_shader_debug_info *debug_info;
    uint32_t debug_info_count;
 };
diff --git a/src/gallium/drivers/radeonsi/si_shader_aco.c b/src/gallium/drivers/radeonsi/si_shader_aco.c
index 5253677f512..1b06c1e21b8 100644
--- a/src/gallium/drivers/radeonsi/si_shader_aco.c
+++ b/src/gallium/drivers/radeonsi/si_shader_aco.c
@@ -105,7 +105,7 @@ si_fill_aco_shader_info(struct si_shader *shader, struct aco_shader_info *info,
 static void
 si_aco_build_shader_binary(void **data, const struct ac_shader_config *config,
                            const char *llvm_ir_str, unsigned llvm_ir_size, const char *disasm_str,
-                           unsigned disasm_size, uint32_t *statistics, uint32_t stats_size,
+                           unsigned disasm_size, struct amd_stats *statistics,
                            uint32_t exec_size, const uint32_t *code, uint32_t code_dw,
                            const struct aco_symbol *symbols, unsigned num_symbols,
                            const struct ac_shader_debug_info *debug_info, unsigned debug_info_count)
diff --git a/src/util/shader_stats.xml b/src/util/shader_stats.xml
index 5bf7ce34fdc..a6466e6b6b4 100644
--- a/src/util/shader_stats.xml
+++ b/src/util/shader_stats.xml
@@ -90,4 +90,33 @@
       <stat name="TMU Fills" display="Fills">Number of times a register was filled from memory</stat>
       <stat name="QPU Read Stalls" display="Read Stalls">Number of cycles the QPU stalls for a register read dependency</stat>
    </isa>
+
+   <isa name="AMD">
+      <stat name="Driver pipeline hash" display="DriverHash" hash="true" type="u64">Driver pipeline hash used by RGP</stat>
+      <stat name="SGPRs">Number of SGPR registers allocated per subgroup</stat>
+      <stat name="VGPRs">Number of VGPR registers allocated per subgroup</stat>
+      <stat name="Spilled SGPRs" display="SpillSGPRs">Number of SGPR registers spilled per subgroup</stat>
+      <stat name="Spilled VGPRs" display="SpillVGPRs">Number of VGPR registers spilled per subgroup</stat>
+      <stat name="Code size" display="CodeSize">Code size in bytes</stat>
+      <stat name="LDS size" display="LDS">LDS size in bytes per workgroup</stat>
+      <stat name="Scratch size" display="Scratch">Private memory in bytes per subgroup</stat>
+      <stat name="Subgroups per SIMD" display="MaxWaves" more="better">The maximum number of subgroups in flight on a SIMD unit</stat>
+      <stat name="Combined inputs" display="Inputs">Number of input slots reserved for the shader (including merged stages)</stat>
+      <stat name="Combined outputs" display="Outputs">Number of output slots reserved for the shader (including merged stages)</stat>
+      <stat name="Hash" hash="true">CRC32 hash of code and constant data</stat>
+      <stat name="Instructions" display="Instrs">Instruction count</stat>
+      <stat name="Copies">Copy instructions created for pseudo-instructions</stat>
+      <stat name="Branches">Branch instructions</stat>
+      <stat name="Latency">Issue cycles plus stall cycles</stat>
+      <stat name="Inverse Throughput" display="InvThroughput">Estimated busy cycles to execute one wave</stat>
+      <stat name="VMEM Clause" display="VClause">Number of VMEM clauses (includes 1-sized clauses)</stat>
+      <stat name="SMEM Clause" display="SClause">Number of SMEM clauses (includes 1-sized clauses)</stat>
+      <stat name="Pre-Sched SGPRs" display="PreSGPRs">SGPR usage before scheduling</stat>
+      <stat name="Pre-Sched VGPRs" display="PreVGPRs">VGPR usage before scheduling</stat>
+      <stat name="VALU">Number of VALU instructions</stat>
+      <stat name="SALU">Number of SALU instructions</stat>
+      <stat name="VMEM">Number of VMEM instructions</stat>
+      <stat name="SMEM">Number of SMEM instructions</stat>
+      <stat name="VOPD" more="better">Number of VOPD instructions</stat>
+   </isa>
 </shaderdb>