From 325dfd809ab94012beaa8df80b2e4f1d0cf6d8c3 Mon Sep 17 00:00:00 2001 From: Rhys Perry Date: Tue, 17 Jun 2025 15:07:04 +0100 Subject: [PATCH] radv,aco: switch to shader statistics framework Signed-off-by: Rhys Perry Gitlab: https://gitlab.freedesktop.org/mesa/mesa/-/issues/12756 Reviewed-by: Samuel Pitoiset Part-of: --- src/amd/compiler/aco_interface.cpp | 55 ++---------- src/amd/compiler/aco_interface.h | 18 ++-- src/amd/compiler/aco_ir.h | 3 +- src/amd/compiler/aco_lower_to_hw_instr.cpp | 8 +- src/amd/compiler/aco_statistics.cpp | 28 +++--- src/amd/vulkan/radv_pipeline.c | 93 +++++++++----------- src/amd/vulkan/radv_pipeline_cache.c | 2 +- src/amd/vulkan/radv_shader.c | 7 +- src/amd/vulkan/radv_shader.h | 3 +- src/gallium/drivers/radeonsi/si_shader_aco.c | 2 +- src/util/shader_stats.xml | 29 ++++++ 11 files changed, 110 insertions(+), 138 deletions(-) diff --git a/src/amd/compiler/aco_interface.cpp b/src/amd/compiler/aco_interface.cpp index a08f6560578..a80a8189b65 100644 --- a/src/amd/compiler/aco_interface.cpp +++ b/src/amd/compiler/aco_interface.cpp @@ -20,36 +20,6 @@ using namespace aco; namespace { -static const std::array statistic_infos = []() -{ - std::array ret{}; - ret[aco_statistic_hash] = - aco_compiler_statistic_info{"Hash", "CRC32 hash of code and constant data"}; - ret[aco_statistic_instructions] = - aco_compiler_statistic_info{"Instructions", "Instruction count"}; - ret[aco_statistic_copies] = - aco_compiler_statistic_info{"Copies", "Copy instructions created for pseudo-instructions"}; - ret[aco_statistic_branches] = aco_compiler_statistic_info{"Branches", "Branch instructions"}; - ret[aco_statistic_latency] = - aco_compiler_statistic_info{"Latency", "Issue cycles plus stall cycles"}; - ret[aco_statistic_inv_throughput] = aco_compiler_statistic_info{ - "Inverse Throughput", "Estimated busy cycles to execute one wave"}; - ret[aco_statistic_vmem_clauses] = aco_compiler_statistic_info{ - "VMEM Clause", "Number of VMEM clauses (includes 1-sized clauses)"}; - ret[aco_statistic_smem_clauses] = aco_compiler_statistic_info{ - "SMEM Clause", "Number of SMEM clauses (includes 1-sized clauses)"}; - ret[aco_statistic_sgpr_presched] = - aco_compiler_statistic_info{"Pre-Sched SGPRs", "SGPR usage before scheduling"}; - ret[aco_statistic_vgpr_presched] = - aco_compiler_statistic_info{"Pre-Sched VGPRs", "VGPR usage before scheduling"}; - ret[aco_statistic_valu] = aco_compiler_statistic_info{"VALU", "Number of VALU instructions"}; - ret[aco_statistic_salu] = aco_compiler_statistic_info{"SALU", "Number of SALU instructions"}; - ret[aco_statistic_vmem] = aco_compiler_statistic_info{"VMEM", "Number of VMEM instructions"}; - ret[aco_statistic_smem] = aco_compiler_statistic_info{"SMEM", "Number of SMEM instructions"}; - ret[aco_statistic_vopd] = aco_compiler_statistic_info{"VOPD", "Number of VOPD instructions"}; - return ret; -}(); - static void validate(Program* program) { @@ -222,8 +192,7 @@ aco_compile_shader_part(const struct aco_compiler_options* options, std::unique_ptr program{new Program}; program->collect_statistics = options->record_stats; - if (program->collect_statistics) - memset(program->statistics, 0, sizeof(program->statistics)); + memset(&program->statistics, 0, sizeof(program->statistics)); program->debug.func = options->debug.func; program->debug.private_data = options->debug.private_data; @@ -262,8 +231,7 @@ aco_compile_shader(const struct aco_compiler_options* options, const struct aco_ std::unique_ptr program{new Program}; program->collect_statistics = options->record_stats; - if (program->collect_statistics) - memset(program->statistics, 0, sizeof(program->statistics)); + memset(&program->statistics, 0, sizeof(program->statistics)); program->debug.func = options->debug.func; program->debug.private_data = options->debug.private_data; @@ -289,14 +257,9 @@ aco_compile_shader(const struct aco_compiler_options* options, const struct aco_ if (options->record_asm) disasm = get_disasm_string(program.get(), code, exec_size); - size_t stats_size = 0; - if (program->collect_statistics) - stats_size = aco_num_statistics * sizeof(uint32_t); - (*build_binary)(binary, &config, llvm_ir.c_str(), llvm_ir.size(), disasm.c_str(), disasm.size(), - program->statistics, stats_size, exec_size, code.data(), code.size(), - symbols.data(), symbols.size(), program->debug_info.data(), - program->debug_info.size()); + &program->statistics, exec_size, code.data(), code.size(), symbols.data(), + symbols.size(), program->debug_info.data(), program->debug_info.size()); } void @@ -337,8 +300,8 @@ aco_compile_rt_prolog(const struct aco_compiler_options* options, if (options->record_asm) disasm = get_disasm_string(program.get(), code, exec_size); - (*build_prolog)(binary, &config, NULL, 0, disasm.c_str(), disasm.size(), program->statistics, 0, - exec_size, code.data(), code.size(), NULL, 0, NULL, 0); + (*build_prolog)(binary, &config, NULL, 0, disasm.c_str(), disasm.size(), NULL, exec_size, + code.data(), code.size(), NULL, 0, NULL, 0); } void @@ -437,8 +400,8 @@ aco_compile_trap_handler(const struct aco_compiler_options* options, if (options->record_asm) disasm = get_disasm_string(program.get(), code, exec_size); - (*build_binary)(binary, &config, NULL, 0, disasm.c_str(), disasm.size(), program->statistics, 0, - exec_size, code.data(), code.size(), NULL, 0, NULL, 0); + (*build_binary)(binary, &config, NULL, 0, disasm.c_str(), disasm.size(), NULL, exec_size, + code.data(), code.size(), NULL, 0, NULL, 0); } uint64_t @@ -514,8 +477,6 @@ aco_nir_op_supports_packed_math_16bit(const nir_alu_instr* alu) } } -const aco_compiler_statistic_info* aco_statistic_infos = statistic_infos.data(); - void aco_print_asm(const struct radeon_info *info, unsigned wave_size, uint32_t *binary, unsigned num_dw) diff --git a/src/amd/compiler/aco_interface.h b/src/amd/compiler/aco_interface.h index db858b75848..30bd8f1b67f 100644 --- a/src/amd/compiler/aco_interface.h +++ b/src/amd/compiler/aco_interface.h @@ -8,10 +8,11 @@ #define ACO_INTERFACE_H #include "aco_shader_info.h" -#include "ac_shader_debug_info.h" #include "nir_defines.h" +#include "util/shader_stats.h" +#include "ac_shader_debug_info.h" #include "amd_family.h" #ifdef __cplusplus extern "C" { @@ -26,24 +27,17 @@ struct aco_vs_prolog_info; struct aco_ps_epilog_info; struct radeon_info; -struct aco_compiler_statistic_info { - char name[32]; - char desc[64]; -}; - typedef void(aco_callback)(void** priv_ptr, const struct ac_shader_config* config, const char* llvm_ir_str, unsigned llvm_ir_size, const char* disasm_str, - unsigned disasm_size, uint32_t* statistics, uint32_t stats_size, - uint32_t exec_size, const uint32_t* code, uint32_t code_dw, - const struct aco_symbol* symbols, unsigned num_symbols, - const struct ac_shader_debug_info* debug_info, unsigned debug_info_count); + unsigned disasm_size, struct amd_stats* stats, uint32_t exec_size, + const uint32_t* code, uint32_t code_dw, const struct aco_symbol* symbols, + unsigned num_symbols, const struct ac_shader_debug_info* debug_info, + unsigned debug_info_count); typedef void(aco_shader_part_callback)(void** priv_ptr, uint32_t num_sgprs, uint32_t num_vgprs, const uint32_t* code, uint32_t code_size, const char* disasm_str, uint32_t disasm_size); -extern const struct aco_compiler_statistic_info* aco_statistic_infos; - void aco_compile_shader(const struct aco_compiler_options* options, const struct aco_shader_info* info, unsigned shader_count, struct nir_shader* const* shaders, const struct ac_shader_args* args, diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h index 9399d4ed9ce..9f86216764a 100644 --- a/src/amd/compiler/aco_ir.h +++ b/src/amd/compiler/aco_ir.h @@ -12,6 +12,7 @@ #include "aco_util.h" #include "util/compiler.h" +#include "util/shader_stats.h" #include "ac_binary.h" #include "ac_hw_stage.h" @@ -2162,7 +2163,7 @@ public: CompilationProgress progress; bool collect_statistics = false; - uint32_t statistics[aco_num_statistics]; + amd_stats statistics; float_mode next_fp_mode; unsigned next_loop_depth = 0; diff --git a/src/amd/compiler/aco_lower_to_hw_instr.cpp b/src/amd/compiler/aco_lower_to_hw_instr.cpp index c9a9fa9eb08..03f4a272d83 100644 --- a/src/amd/compiler/aco_lower_to_hw_instr.cpp +++ b/src/amd/compiler/aco_lower_to_hw_instr.cpp @@ -1806,8 +1806,7 @@ handle_operands(std::map& copy_map, lower_context* ctx, bool skip_partial_copies = true; for (auto it = copy_map.begin();;) { if (copy_map.empty()) { - ctx->program->statistics[aco_statistic_copies] += - ctx->instructions.size() - num_instructions_before; + ctx->program->statistics.copies += ctx->instructions.size() - num_instructions_before; return; } if (it == copy_map.end()) { @@ -2085,8 +2084,7 @@ handle_operands(std::map& copy_map, lower_context* ctx, break; } } - ctx->program->statistics[aco_statistic_copies] += - ctx->instructions.size() - num_instructions_before; + ctx->program->statistics.copies += ctx->instructions.size() - num_instructions_before; } void @@ -2122,7 +2120,7 @@ handle_operands_linear_vgpr(std::map& copy_map, lower_c pi->scratch_sgpr = scratch_sgpr; } - ctx->program->statistics[aco_statistic_copies] += scratch_sgpr == scc ? 2 : 4; + ctx->program->statistics.copies += scratch_sgpr == scc ? 2 : 4; } void diff --git a/src/amd/compiler/aco_statistics.cpp b/src/amd/compiler/aco_statistics.cpp index 5700220dcb1..aa4e9c4c2bf 100644 --- a/src/amd/compiler/aco_statistics.cpp +++ b/src/amd/compiler/aco_statistics.cpp @@ -470,8 +470,8 @@ collect_presched_stats(Program* program) RegisterDemand presched_demand; for (Block& block : program->blocks) presched_demand.update(block.register_demand); - program->statistics[aco_statistic_sgpr_presched] = presched_demand.sgpr; - program->statistics[aco_statistic_vgpr_presched] = presched_demand.vgpr; + program->statistics.presgprs = presched_demand.sgpr; + program->statistics.prevgprs = presched_demand.vgpr; } /* instructions/branches/vmem_clauses/smem_clauses/cycles */ @@ -482,31 +482,31 @@ collect_preasm_stats(Program* program) std::set vmem_clause; std::set smem_clause; - program->statistics[aco_statistic_instructions] += block.instructions.size(); + program->statistics.instrs += block.instructions.size(); for (aco_ptr& instr : block.instructions) { const bool is_branch = instr->isSOPP() && instr_info.classes[(int)instr->opcode] == instr_class::branch; if (is_branch) - program->statistics[aco_statistic_branches]++; + program->statistics.branches++; if (instr->isVALU() || instr->isVINTRP()) - program->statistics[aco_statistic_valu]++; + program->statistics.valu++; if (instr->isSALU() && !instr->isSOPP() && instr_info.classes[(int)instr->opcode] != instr_class::waitcnt) - program->statistics[aco_statistic_salu]++; + program->statistics.salu++; if (instr->isVOPD()) - program->statistics[aco_statistic_vopd]++; + program->statistics.vopd++; if ((instr->isVMEM() || instr->isScratch() || instr->isGlobal()) && !instr->operands.empty()) { if (std::none_of(vmem_clause.begin(), vmem_clause.end(), [&](Instruction* other) { return should_form_clause(instr.get(), other); })) - program->statistics[aco_statistic_vmem_clauses]++; + program->statistics.vclause++; vmem_clause.insert(instr.get()); - program->statistics[aco_statistic_vmem]++; + program->statistics.vmem++; } else { vmem_clause.clear(); } @@ -515,10 +515,10 @@ collect_preasm_stats(Program* program) if (std::none_of(smem_clause.begin(), smem_clause.end(), [&](Instruction* other) { return should_form_clause(instr.get(), other); })) - program->statistics[aco_statistic_smem_clauses]++; + program->statistics.sclause++; smem_clause.insert(instr.get()); - program->statistics[aco_statistic_smem]++; + program->statistics.smem++; } else { smem_clause.clear(); } @@ -598,8 +598,8 @@ collect_preasm_stats(Program* program) program->workgroup_size / (double)align(program->workgroup_size, program->wave_size); wave64_per_cycle *= max_utilization; - program->statistics[aco_statistic_latency] = round(latency); - program->statistics[aco_statistic_inv_throughput] = round(1.0 / wave64_per_cycle); + program->statistics.latency = round(latency); + program->statistics.invthroughput = round(1.0 / wave64_per_cycle); if (debug_flags & DEBUG_PERF_INFO) { aco_print_program(program, stderr, print_no_ssa | print_perf_info); @@ -624,7 +624,7 @@ collect_preasm_stats(Program* program) void collect_postasm_stats(Program* program, const std::vector& code) { - program->statistics[aco_statistic_hash] = util_hash_crc32(code.data(), code.size() * 4); + program->statistics.hash = util_hash_crc32(code.data(), code.size() * 4); } Instruction_cycle_info diff --git a/src/amd/vulkan/radv_pipeline.c b/src/amd/vulkan/radv_pipeline.c index 72d7051c6eb..7534e039fc8 100644 --- a/src/amd/vulkan/radv_pipeline.c +++ b/src/amd/vulkan/radv_pipeline.c @@ -814,44 +814,41 @@ radv_GetPipelineExecutableStatisticsKHR(VkDevice _device, const VkPipelineExecut VK_OUTARRAY_MAKE_TYPED(VkPipelineExecutableStatisticKHR, out, pStatistics, pStatisticCount); - vk_add_exec_statistic_u64(out, "Driver pipeline hash", "Driver pipeline hash used by RGP", pipeline->pipeline_hash); - vk_add_exec_statistic_u64(out, "SGPRs", "Number of SGPR registers allocated per subgroup", shader->config.num_sgprs); - vk_add_exec_statistic_u64(out, "VGPRs", "Number of VGPR registers allocated per subgroup", shader->config.num_vgprs); - vk_add_exec_statistic_u64(out, "Spilled SGPRs", "Number of SGPR registers spilled per subgroup", - shader->config.spilled_sgprs); - vk_add_exec_statistic_u64(out, "Spilled VGPRs", "Number of VGPR registers spilled per subgroup", - shader->config.spilled_vgprs); - vk_add_exec_statistic_u64(out, "Code size", "Code size in bytes", shader->exec_size); - vk_add_exec_statistic_u64(out, "LDS size", "LDS size in bytes per workgroup", - shader->config.lds_size * lds_increment); - vk_add_exec_statistic_u64(out, "Scratch size", "Private memory in bytes per subgroup", - shader->config.scratch_bytes_per_wave); - vk_add_exec_statistic_u64(out, "Subgroups per SIMD", "The maximum number of subgroups in flight on a SIMD unit", - shader->max_waves); + struct amd_stats stats = {}; + if (shader->statistics) + stats = *shader->statistics; + stats.driverhash = pipeline->pipeline_hash; + stats.sgprs = shader->config.num_sgprs; + stats.vgprs = shader->config.num_vgprs; + stats.spillsgprs = shader->config.spilled_sgprs; + stats.spillvgprs = shader->config.spilled_vgprs; + stats.codesize = shader->exec_size; + stats.lds = shader->config.lds_size * lds_increment; + stats.scratch = shader->config.scratch_bytes_per_wave; + stats.maxwaves = shader->max_waves; - uint64_t inputs = 0; switch (stage) { case MESA_SHADER_VERTEX: if (gfx_level <= GFX8 || (!shader->info.vs.as_es && !shader->info.vs.as_ls)) { /* VS inputs when VS is a separate stage */ - inputs += util_bitcount(shader->info.vs.input_slot_usage_mask); + stats.inputs += util_bitcount(shader->info.vs.input_slot_usage_mask); } break; case MESA_SHADER_TESS_CTRL: if (gfx_level >= GFX9) { /* VS inputs when pipeline has tess */ - inputs += util_bitcount(shader->info.vs.input_slot_usage_mask); + stats.inputs += util_bitcount(shader->info.vs.input_slot_usage_mask); } /* VS -> TCS inputs */ - inputs += shader->info.tcs.num_linked_inputs; + stats.inputs += shader->info.tcs.num_linked_inputs; break; case MESA_SHADER_TESS_EVAL: if (gfx_level <= GFX8 || !shader->info.tes.as_es) { /* TCS -> TES inputs when TES is a separate stage */ - inputs += shader->info.tes.num_linked_inputs + shader->info.tes.num_linked_patch_inputs; + stats.inputs += shader->info.tes.num_linked_inputs + shader->info.tes.num_linked_patch_inputs; } break; @@ -863,60 +860,57 @@ radv_GetPipelineExecutableStatisticsKHR(VkDevice _device, const VkPipelineExecut if (gfx_level >= GFX9) { if (shader->info.gs.es_type == MESA_SHADER_VERTEX) { /* VS inputs when pipeline has GS but no tess */ - inputs += util_bitcount(shader->info.vs.input_slot_usage_mask); + stats.inputs += util_bitcount(shader->info.vs.input_slot_usage_mask); } else if (shader->info.gs.es_type == MESA_SHADER_TESS_EVAL) { /* TCS -> TES inputs when pipeline has GS */ - inputs += shader->info.tes.num_linked_inputs + shader->info.tes.num_linked_patch_inputs; + stats.inputs += shader->info.tes.num_linked_inputs + shader->info.tes.num_linked_patch_inputs; } } /* VS -> GS or TES -> GS inputs */ - inputs += shader->info.gs.num_linked_inputs; + stats.inputs += shader->info.gs.num_linked_inputs; break; case MESA_SHADER_FRAGMENT: - inputs += shader->info.ps.num_inputs; + stats.inputs += shader->info.ps.num_inputs; break; default: /* Other stages don't have IO or we are not interested in them. */ break; } - vk_add_exec_statistic_u64(out, "Combined inputs", - "Number of input slots reserved for the shader (including merged stages)", inputs); - uint64_t outputs = 0; switch (stage) { case MESA_SHADER_VERTEX: if (!shader->info.vs.as_ls && !shader->info.vs.as_es) { /* VS -> FS outputs. */ - outputs += shader->info.outinfo.pos_exports + shader->info.outinfo.param_exports + - shader->info.outinfo.prim_param_exports; + stats.outputs += shader->info.outinfo.pos_exports + shader->info.outinfo.param_exports + + shader->info.outinfo.prim_param_exports; } else if (gfx_level <= GFX8) { /* VS -> TCS, VS -> GS outputs on GFX6-8 */ - outputs += shader->info.vs.num_linked_outputs; + stats.outputs += shader->info.vs.num_linked_outputs; } break; case MESA_SHADER_TESS_CTRL: if (gfx_level >= GFX9) { /* VS -> TCS outputs on GFX9+ */ - outputs += shader->info.vs.num_linked_outputs; + stats.outputs += shader->info.vs.num_linked_outputs; } /* TCS -> TES outputs */ - outputs += shader->info.tcs.io_info.highest_remapped_vram_output + - shader->info.tcs.io_info.highest_remapped_vram_patch_output; + stats.outputs += shader->info.tcs.io_info.highest_remapped_vram_output + + shader->info.tcs.io_info.highest_remapped_vram_patch_output; break; case MESA_SHADER_TESS_EVAL: if (!shader->info.tes.as_es) { /* TES -> FS outputs */ - outputs += shader->info.outinfo.pos_exports + shader->info.outinfo.param_exports + - shader->info.outinfo.prim_param_exports; + stats.outputs += shader->info.outinfo.pos_exports + shader->info.outinfo.param_exports + + shader->info.outinfo.prim_param_exports; } else if (gfx_level <= GFX8) { /* TES -> GS outputs on GFX6-8 */ - outputs += shader->info.tes.num_linked_outputs; + stats.outputs += shader->info.tes.num_linked_outputs; } break; @@ -928,48 +922,41 @@ radv_GetPipelineExecutableStatisticsKHR(VkDevice _device, const VkPipelineExecut if (gfx_level >= GFX9) { if (shader->info.gs.es_type == MESA_SHADER_VERTEX) { /* VS -> GS outputs on GFX9+ */ - outputs += shader->info.vs.num_linked_outputs; + stats.outputs += shader->info.vs.num_linked_outputs; } else if (shader->info.gs.es_type == MESA_SHADER_TESS_EVAL) { /* TES -> GS outputs on GFX9+ */ - outputs += shader->info.tes.num_linked_outputs; + stats.outputs += shader->info.tes.num_linked_outputs; } } if (shader->info.is_ngg) { /* GS -> FS outputs (GFX10+ NGG) */ - outputs += shader->info.outinfo.pos_exports + shader->info.outinfo.param_exports + - shader->info.outinfo.prim_param_exports; + stats.outputs += shader->info.outinfo.pos_exports + shader->info.outinfo.param_exports + + shader->info.outinfo.prim_param_exports; } else { /* GS -> FS outputs (GFX6-10.3 legacy) */ - outputs += shader->info.gs.gsvs_vertex_size / 16; + stats.outputs += shader->info.gs.gsvs_vertex_size / 16; } break; case MESA_SHADER_MESH: /* MS -> FS outputs */ - outputs += shader->info.outinfo.pos_exports + shader->info.outinfo.param_exports + - shader->info.outinfo.prim_param_exports; + stats.outputs += shader->info.outinfo.pos_exports + shader->info.outinfo.param_exports + + shader->info.outinfo.prim_param_exports; break; case MESA_SHADER_FRAGMENT: - outputs += DIV_ROUND_UP(util_bitcount(shader->info.ps.colors_written), 4) + !!shader->info.ps.writes_z + - !!shader->info.ps.writes_stencil + !!shader->info.ps.writes_sample_mask + - !!shader->info.ps.writes_mrt0_alpha; + stats.outputs += DIV_ROUND_UP(util_bitcount(shader->info.ps.colors_written), 4) + !!shader->info.ps.writes_z + + !!shader->info.ps.writes_stencil + !!shader->info.ps.writes_sample_mask + + !!shader->info.ps.writes_mrt0_alpha; break; default: /* Other stages don't have IO or we are not interested in them. */ break; } - vk_add_exec_statistic_u64(out, "Combined outputs", - "Number of output slots reserved for the shader (including merged stages)", outputs); - if (shader->statistics) { - for (unsigned i = 0; i < aco_num_statistics; i++) { - const struct aco_compiler_statistic_info *info = &aco_statistic_infos[i]; - vk_add_exec_statistic_u64(out, info->name, info->desc, shader->statistics[i]); - } - } + vk_add_amd_stats(out, &stats); return vk_outarray_status(&out); } diff --git a/src/amd/vulkan/radv_pipeline_cache.c b/src/amd/vulkan/radv_pipeline_cache.c index 8f0f511c6c9..397b0c07c5b 100644 --- a/src/amd/vulkan/radv_pipeline_cache.c +++ b/src/amd/vulkan/radv_pipeline_cache.c @@ -95,7 +95,7 @@ radv_shader_cache_deserialize(struct vk_pipeline_cache *cache, const void *key_d void radv_shader_serialize(struct radv_shader *shader, struct blob *blob) { - size_t stats_size = shader->statistics ? aco_num_statistics * sizeof(uint32_t) : 0; + size_t stats_size = shader->statistics ? sizeof(struct amd_stats) : 0; size_t code_size = shader->code_size; uint32_t total_size = sizeof(struct radv_shader_binary_legacy) + code_size + stats_size; diff --git a/src/amd/vulkan/radv_shader.c b/src/amd/vulkan/radv_shader.c index 17caa1f2f4c..9872ee39e6c 100644 --- a/src/amd/vulkan/radv_shader.c +++ b/src/amd/vulkan/radv_shader.c @@ -2984,14 +2984,15 @@ radv_dump_nir_shaders(const struct radv_instance *instance, struct nir_shader *c static void radv_aco_build_shader_binary(void **bin, const struct ac_shader_config *config, const char *llvm_ir_str, - unsigned llvm_ir_size, const char *disasm_str, unsigned disasm_size, uint32_t *statistics, - uint32_t stats_size, uint32_t exec_size, const uint32_t *code, uint32_t code_dw, + unsigned llvm_ir_size, const char *disasm_str, unsigned disasm_size, + struct amd_stats *statistics, uint32_t exec_size, const uint32_t *code, uint32_t code_dw, const struct aco_symbol *symbols, unsigned num_symbols, const struct ac_shader_debug_info *debug_info, unsigned debug_info_count) { struct radv_shader_binary **binary = (struct radv_shader_binary **)bin; uint32_t debug_info_size = debug_info_count * sizeof(struct ac_shader_debug_info); + uint32_t stats_size = statistics ? sizeof(struct amd_stats) : 0; size_t size = llvm_ir_size; @@ -3019,7 +3020,7 @@ radv_aco_build_shader_binary(void **bin, const struct ac_shader_config *config, struct radv_shader_binary_layout layout = radv_shader_binary_get_layout(legacy_binary); if (stats_size) - memcpy(layout.stats, statistics, stats_size); + amd_stats_serialize(layout.stats, statistics); memcpy(layout.code, code, code_dw * sizeof(uint32_t)); diff --git a/src/amd/vulkan/radv_shader.h b/src/amd/vulkan/radv_shader.h index 4905c72c63d..d74bb9d60b7 100644 --- a/src/amd/vulkan/radv_shader.h +++ b/src/amd/vulkan/radv_shader.h @@ -12,6 +12,7 @@ #define RADV_SHADER_H #include "util/mesa-blake3.h" +#include "util/shader_stats.h" #include "util/u_math.h" #include "vulkan/vulkan.h" #include "ac_binary.h" @@ -460,7 +461,7 @@ struct radv_shader { char *nir_string; char *disasm_string; char *ir_string; - uint32_t *statistics; + struct amd_stats *statistics; struct ac_shader_debug_info *debug_info; uint32_t debug_info_count; }; diff --git a/src/gallium/drivers/radeonsi/si_shader_aco.c b/src/gallium/drivers/radeonsi/si_shader_aco.c index 5253677f512..1b06c1e21b8 100644 --- a/src/gallium/drivers/radeonsi/si_shader_aco.c +++ b/src/gallium/drivers/radeonsi/si_shader_aco.c @@ -105,7 +105,7 @@ si_fill_aco_shader_info(struct si_shader *shader, struct aco_shader_info *info, static void si_aco_build_shader_binary(void **data, const struct ac_shader_config *config, const char *llvm_ir_str, unsigned llvm_ir_size, const char *disasm_str, - unsigned disasm_size, uint32_t *statistics, uint32_t stats_size, + unsigned disasm_size, struct amd_stats *statistics, uint32_t exec_size, const uint32_t *code, uint32_t code_dw, const struct aco_symbol *symbols, unsigned num_symbols, const struct ac_shader_debug_info *debug_info, unsigned debug_info_count) diff --git a/src/util/shader_stats.xml b/src/util/shader_stats.xml index 5bf7ce34fdc..a6466e6b6b4 100644 --- a/src/util/shader_stats.xml +++ b/src/util/shader_stats.xml @@ -90,4 +90,33 @@ Number of times a register was filled from memory Number of cycles the QPU stalls for a register read dependency + + + Driver pipeline hash used by RGP + Number of SGPR registers allocated per subgroup + Number of VGPR registers allocated per subgroup + Number of SGPR registers spilled per subgroup + Number of VGPR registers spilled per subgroup + Code size in bytes + LDS size in bytes per workgroup + Private memory in bytes per subgroup + The maximum number of subgroups in flight on a SIMD unit + Number of input slots reserved for the shader (including merged stages) + Number of output slots reserved for the shader (including merged stages) + CRC32 hash of code and constant data + Instruction count + Copy instructions created for pseudo-instructions + Branch instructions + Issue cycles plus stall cycles + Estimated busy cycles to execute one wave + Number of VMEM clauses (includes 1-sized clauses) + Number of SMEM clauses (includes 1-sized clauses) + SGPR usage before scheduling + VGPR usage before scheduling + Number of VALU instructions + Number of SALU instructions + Number of VMEM instructions + Number of SMEM instructions + Number of VOPD instructions +