mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2025-12-29 12:20:10 +01:00
radv,aco: switch to shader statistics framework
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com> Gitlab: https://gitlab.freedesktop.org/mesa/mesa/-/issues/12756 Reviewed-by: Samuel Pitoiset <samuel.pitoiset@gmail.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/35583>
This commit is contained in:
parent
2382d657ec
commit
325dfd809a
11 changed files with 110 additions and 138 deletions
|
|
@ -20,36 +20,6 @@ using namespace aco;
|
||||||
|
|
||||||
namespace {
|
namespace {
|
||||||
|
|
||||||
static const std::array<aco_compiler_statistic_info, aco_num_statistics> statistic_infos = []()
|
|
||||||
{
|
|
||||||
std::array<aco_compiler_statistic_info, aco_num_statistics> ret{};
|
|
||||||
ret[aco_statistic_hash] =
|
|
||||||
aco_compiler_statistic_info{"Hash", "CRC32 hash of code and constant data"};
|
|
||||||
ret[aco_statistic_instructions] =
|
|
||||||
aco_compiler_statistic_info{"Instructions", "Instruction count"};
|
|
||||||
ret[aco_statistic_copies] =
|
|
||||||
aco_compiler_statistic_info{"Copies", "Copy instructions created for pseudo-instructions"};
|
|
||||||
ret[aco_statistic_branches] = aco_compiler_statistic_info{"Branches", "Branch instructions"};
|
|
||||||
ret[aco_statistic_latency] =
|
|
||||||
aco_compiler_statistic_info{"Latency", "Issue cycles plus stall cycles"};
|
|
||||||
ret[aco_statistic_inv_throughput] = aco_compiler_statistic_info{
|
|
||||||
"Inverse Throughput", "Estimated busy cycles to execute one wave"};
|
|
||||||
ret[aco_statistic_vmem_clauses] = aco_compiler_statistic_info{
|
|
||||||
"VMEM Clause", "Number of VMEM clauses (includes 1-sized clauses)"};
|
|
||||||
ret[aco_statistic_smem_clauses] = aco_compiler_statistic_info{
|
|
||||||
"SMEM Clause", "Number of SMEM clauses (includes 1-sized clauses)"};
|
|
||||||
ret[aco_statistic_sgpr_presched] =
|
|
||||||
aco_compiler_statistic_info{"Pre-Sched SGPRs", "SGPR usage before scheduling"};
|
|
||||||
ret[aco_statistic_vgpr_presched] =
|
|
||||||
aco_compiler_statistic_info{"Pre-Sched VGPRs", "VGPR usage before scheduling"};
|
|
||||||
ret[aco_statistic_valu] = aco_compiler_statistic_info{"VALU", "Number of VALU instructions"};
|
|
||||||
ret[aco_statistic_salu] = aco_compiler_statistic_info{"SALU", "Number of SALU instructions"};
|
|
||||||
ret[aco_statistic_vmem] = aco_compiler_statistic_info{"VMEM", "Number of VMEM instructions"};
|
|
||||||
ret[aco_statistic_smem] = aco_compiler_statistic_info{"SMEM", "Number of SMEM instructions"};
|
|
||||||
ret[aco_statistic_vopd] = aco_compiler_statistic_info{"VOPD", "Number of VOPD instructions"};
|
|
||||||
return ret;
|
|
||||||
}();
|
|
||||||
|
|
||||||
static void
|
static void
|
||||||
validate(Program* program)
|
validate(Program* program)
|
||||||
{
|
{
|
||||||
|
|
@ -222,8 +192,7 @@ aco_compile_shader_part(const struct aco_compiler_options* options,
|
||||||
std::unique_ptr<Program> program{new Program};
|
std::unique_ptr<Program> program{new Program};
|
||||||
|
|
||||||
program->collect_statistics = options->record_stats;
|
program->collect_statistics = options->record_stats;
|
||||||
if (program->collect_statistics)
|
memset(&program->statistics, 0, sizeof(program->statistics));
|
||||||
memset(program->statistics, 0, sizeof(program->statistics));
|
|
||||||
|
|
||||||
program->debug.func = options->debug.func;
|
program->debug.func = options->debug.func;
|
||||||
program->debug.private_data = options->debug.private_data;
|
program->debug.private_data = options->debug.private_data;
|
||||||
|
|
@ -262,8 +231,7 @@ aco_compile_shader(const struct aco_compiler_options* options, const struct aco_
|
||||||
std::unique_ptr<Program> program{new Program};
|
std::unique_ptr<Program> program{new Program};
|
||||||
|
|
||||||
program->collect_statistics = options->record_stats;
|
program->collect_statistics = options->record_stats;
|
||||||
if (program->collect_statistics)
|
memset(&program->statistics, 0, sizeof(program->statistics));
|
||||||
memset(program->statistics, 0, sizeof(program->statistics));
|
|
||||||
|
|
||||||
program->debug.func = options->debug.func;
|
program->debug.func = options->debug.func;
|
||||||
program->debug.private_data = options->debug.private_data;
|
program->debug.private_data = options->debug.private_data;
|
||||||
|
|
@ -289,14 +257,9 @@ aco_compile_shader(const struct aco_compiler_options* options, const struct aco_
|
||||||
if (options->record_asm)
|
if (options->record_asm)
|
||||||
disasm = get_disasm_string(program.get(), code, exec_size);
|
disasm = get_disasm_string(program.get(), code, exec_size);
|
||||||
|
|
||||||
size_t stats_size = 0;
|
|
||||||
if (program->collect_statistics)
|
|
||||||
stats_size = aco_num_statistics * sizeof(uint32_t);
|
|
||||||
|
|
||||||
(*build_binary)(binary, &config, llvm_ir.c_str(), llvm_ir.size(), disasm.c_str(), disasm.size(),
|
(*build_binary)(binary, &config, llvm_ir.c_str(), llvm_ir.size(), disasm.c_str(), disasm.size(),
|
||||||
program->statistics, stats_size, exec_size, code.data(), code.size(),
|
&program->statistics, exec_size, code.data(), code.size(), symbols.data(),
|
||||||
symbols.data(), symbols.size(), program->debug_info.data(),
|
symbols.size(), program->debug_info.data(), program->debug_info.size());
|
||||||
program->debug_info.size());
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
|
|
@ -337,8 +300,8 @@ aco_compile_rt_prolog(const struct aco_compiler_options* options,
|
||||||
if (options->record_asm)
|
if (options->record_asm)
|
||||||
disasm = get_disasm_string(program.get(), code, exec_size);
|
disasm = get_disasm_string(program.get(), code, exec_size);
|
||||||
|
|
||||||
(*build_prolog)(binary, &config, NULL, 0, disasm.c_str(), disasm.size(), program->statistics, 0,
|
(*build_prolog)(binary, &config, NULL, 0, disasm.c_str(), disasm.size(), NULL, exec_size,
|
||||||
exec_size, code.data(), code.size(), NULL, 0, NULL, 0);
|
code.data(), code.size(), NULL, 0, NULL, 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
|
|
@ -437,8 +400,8 @@ aco_compile_trap_handler(const struct aco_compiler_options* options,
|
||||||
if (options->record_asm)
|
if (options->record_asm)
|
||||||
disasm = get_disasm_string(program.get(), code, exec_size);
|
disasm = get_disasm_string(program.get(), code, exec_size);
|
||||||
|
|
||||||
(*build_binary)(binary, &config, NULL, 0, disasm.c_str(), disasm.size(), program->statistics, 0,
|
(*build_binary)(binary, &config, NULL, 0, disasm.c_str(), disasm.size(), NULL, exec_size,
|
||||||
exec_size, code.data(), code.size(), NULL, 0, NULL, 0);
|
code.data(), code.size(), NULL, 0, NULL, 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
uint64_t
|
uint64_t
|
||||||
|
|
@ -514,8 +477,6 @@ aco_nir_op_supports_packed_math_16bit(const nir_alu_instr* alu)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
const aco_compiler_statistic_info* aco_statistic_infos = statistic_infos.data();
|
|
||||||
|
|
||||||
void
|
void
|
||||||
aco_print_asm(const struct radeon_info *info, unsigned wave_size,
|
aco_print_asm(const struct radeon_info *info, unsigned wave_size,
|
||||||
uint32_t *binary, unsigned num_dw)
|
uint32_t *binary, unsigned num_dw)
|
||||||
|
|
|
||||||
|
|
@ -8,10 +8,11 @@
|
||||||
#define ACO_INTERFACE_H
|
#define ACO_INTERFACE_H
|
||||||
|
|
||||||
#include "aco_shader_info.h"
|
#include "aco_shader_info.h"
|
||||||
#include "ac_shader_debug_info.h"
|
|
||||||
|
|
||||||
#include "nir_defines.h"
|
#include "nir_defines.h"
|
||||||
|
#include "util/shader_stats.h"
|
||||||
|
|
||||||
|
#include "ac_shader_debug_info.h"
|
||||||
#include "amd_family.h"
|
#include "amd_family.h"
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
extern "C" {
|
extern "C" {
|
||||||
|
|
@ -26,24 +27,17 @@ struct aco_vs_prolog_info;
|
||||||
struct aco_ps_epilog_info;
|
struct aco_ps_epilog_info;
|
||||||
struct radeon_info;
|
struct radeon_info;
|
||||||
|
|
||||||
struct aco_compiler_statistic_info {
|
|
||||||
char name[32];
|
|
||||||
char desc[64];
|
|
||||||
};
|
|
||||||
|
|
||||||
typedef void(aco_callback)(void** priv_ptr, const struct ac_shader_config* config,
|
typedef void(aco_callback)(void** priv_ptr, const struct ac_shader_config* config,
|
||||||
const char* llvm_ir_str, unsigned llvm_ir_size, const char* disasm_str,
|
const char* llvm_ir_str, unsigned llvm_ir_size, const char* disasm_str,
|
||||||
unsigned disasm_size, uint32_t* statistics, uint32_t stats_size,
|
unsigned disasm_size, struct amd_stats* stats, uint32_t exec_size,
|
||||||
uint32_t exec_size, const uint32_t* code, uint32_t code_dw,
|
const uint32_t* code, uint32_t code_dw, const struct aco_symbol* symbols,
|
||||||
const struct aco_symbol* symbols, unsigned num_symbols,
|
unsigned num_symbols, const struct ac_shader_debug_info* debug_info,
|
||||||
const struct ac_shader_debug_info* debug_info, unsigned debug_info_count);
|
unsigned debug_info_count);
|
||||||
|
|
||||||
typedef void(aco_shader_part_callback)(void** priv_ptr, uint32_t num_sgprs, uint32_t num_vgprs,
|
typedef void(aco_shader_part_callback)(void** priv_ptr, uint32_t num_sgprs, uint32_t num_vgprs,
|
||||||
const uint32_t* code, uint32_t code_size,
|
const uint32_t* code, uint32_t code_size,
|
||||||
const char* disasm_str, uint32_t disasm_size);
|
const char* disasm_str, uint32_t disasm_size);
|
||||||
|
|
||||||
extern const struct aco_compiler_statistic_info* aco_statistic_infos;
|
|
||||||
|
|
||||||
void aco_compile_shader(const struct aco_compiler_options* options,
|
void aco_compile_shader(const struct aco_compiler_options* options,
|
||||||
const struct aco_shader_info* info, unsigned shader_count,
|
const struct aco_shader_info* info, unsigned shader_count,
|
||||||
struct nir_shader* const* shaders, const struct ac_shader_args* args,
|
struct nir_shader* const* shaders, const struct ac_shader_args* args,
|
||||||
|
|
|
||||||
|
|
@ -12,6 +12,7 @@
|
||||||
#include "aco_util.h"
|
#include "aco_util.h"
|
||||||
|
|
||||||
#include "util/compiler.h"
|
#include "util/compiler.h"
|
||||||
|
#include "util/shader_stats.h"
|
||||||
|
|
||||||
#include "ac_binary.h"
|
#include "ac_binary.h"
|
||||||
#include "ac_hw_stage.h"
|
#include "ac_hw_stage.h"
|
||||||
|
|
@ -2162,7 +2163,7 @@ public:
|
||||||
CompilationProgress progress;
|
CompilationProgress progress;
|
||||||
|
|
||||||
bool collect_statistics = false;
|
bool collect_statistics = false;
|
||||||
uint32_t statistics[aco_num_statistics];
|
amd_stats statistics;
|
||||||
|
|
||||||
float_mode next_fp_mode;
|
float_mode next_fp_mode;
|
||||||
unsigned next_loop_depth = 0;
|
unsigned next_loop_depth = 0;
|
||||||
|
|
|
||||||
|
|
@ -1806,8 +1806,7 @@ handle_operands(std::map<PhysReg, copy_operation>& copy_map, lower_context* ctx,
|
||||||
bool skip_partial_copies = true;
|
bool skip_partial_copies = true;
|
||||||
for (auto it = copy_map.begin();;) {
|
for (auto it = copy_map.begin();;) {
|
||||||
if (copy_map.empty()) {
|
if (copy_map.empty()) {
|
||||||
ctx->program->statistics[aco_statistic_copies] +=
|
ctx->program->statistics.copies += ctx->instructions.size() - num_instructions_before;
|
||||||
ctx->instructions.size() - num_instructions_before;
|
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
if (it == copy_map.end()) {
|
if (it == copy_map.end()) {
|
||||||
|
|
@ -2085,8 +2084,7 @@ handle_operands(std::map<PhysReg, copy_operation>& copy_map, lower_context* ctx,
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
ctx->program->statistics[aco_statistic_copies] +=
|
ctx->program->statistics.copies += ctx->instructions.size() - num_instructions_before;
|
||||||
ctx->instructions.size() - num_instructions_before;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
|
|
@ -2122,7 +2120,7 @@ handle_operands_linear_vgpr(std::map<PhysReg, copy_operation>& copy_map, lower_c
|
||||||
pi->scratch_sgpr = scratch_sgpr;
|
pi->scratch_sgpr = scratch_sgpr;
|
||||||
}
|
}
|
||||||
|
|
||||||
ctx->program->statistics[aco_statistic_copies] += scratch_sgpr == scc ? 2 : 4;
|
ctx->program->statistics.copies += scratch_sgpr == scc ? 2 : 4;
|
||||||
}
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
|
|
|
||||||
|
|
@ -470,8 +470,8 @@ collect_presched_stats(Program* program)
|
||||||
RegisterDemand presched_demand;
|
RegisterDemand presched_demand;
|
||||||
for (Block& block : program->blocks)
|
for (Block& block : program->blocks)
|
||||||
presched_demand.update(block.register_demand);
|
presched_demand.update(block.register_demand);
|
||||||
program->statistics[aco_statistic_sgpr_presched] = presched_demand.sgpr;
|
program->statistics.presgprs = presched_demand.sgpr;
|
||||||
program->statistics[aco_statistic_vgpr_presched] = presched_demand.vgpr;
|
program->statistics.prevgprs = presched_demand.vgpr;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* instructions/branches/vmem_clauses/smem_clauses/cycles */
|
/* instructions/branches/vmem_clauses/smem_clauses/cycles */
|
||||||
|
|
@ -482,31 +482,31 @@ collect_preasm_stats(Program* program)
|
||||||
std::set<Instruction*> vmem_clause;
|
std::set<Instruction*> vmem_clause;
|
||||||
std::set<Instruction*> smem_clause;
|
std::set<Instruction*> smem_clause;
|
||||||
|
|
||||||
program->statistics[aco_statistic_instructions] += block.instructions.size();
|
program->statistics.instrs += block.instructions.size();
|
||||||
|
|
||||||
for (aco_ptr<Instruction>& instr : block.instructions) {
|
for (aco_ptr<Instruction>& instr : block.instructions) {
|
||||||
const bool is_branch =
|
const bool is_branch =
|
||||||
instr->isSOPP() && instr_info.classes[(int)instr->opcode] == instr_class::branch;
|
instr->isSOPP() && instr_info.classes[(int)instr->opcode] == instr_class::branch;
|
||||||
if (is_branch)
|
if (is_branch)
|
||||||
program->statistics[aco_statistic_branches]++;
|
program->statistics.branches++;
|
||||||
|
|
||||||
if (instr->isVALU() || instr->isVINTRP())
|
if (instr->isVALU() || instr->isVINTRP())
|
||||||
program->statistics[aco_statistic_valu]++;
|
program->statistics.valu++;
|
||||||
if (instr->isSALU() && !instr->isSOPP() &&
|
if (instr->isSALU() && !instr->isSOPP() &&
|
||||||
instr_info.classes[(int)instr->opcode] != instr_class::waitcnt)
|
instr_info.classes[(int)instr->opcode] != instr_class::waitcnt)
|
||||||
program->statistics[aco_statistic_salu]++;
|
program->statistics.salu++;
|
||||||
if (instr->isVOPD())
|
if (instr->isVOPD())
|
||||||
program->statistics[aco_statistic_vopd]++;
|
program->statistics.vopd++;
|
||||||
|
|
||||||
if ((instr->isVMEM() || instr->isScratch() || instr->isGlobal()) &&
|
if ((instr->isVMEM() || instr->isScratch() || instr->isGlobal()) &&
|
||||||
!instr->operands.empty()) {
|
!instr->operands.empty()) {
|
||||||
if (std::none_of(vmem_clause.begin(), vmem_clause.end(),
|
if (std::none_of(vmem_clause.begin(), vmem_clause.end(),
|
||||||
[&](Instruction* other)
|
[&](Instruction* other)
|
||||||
{ return should_form_clause(instr.get(), other); }))
|
{ return should_form_clause(instr.get(), other); }))
|
||||||
program->statistics[aco_statistic_vmem_clauses]++;
|
program->statistics.vclause++;
|
||||||
vmem_clause.insert(instr.get());
|
vmem_clause.insert(instr.get());
|
||||||
|
|
||||||
program->statistics[aco_statistic_vmem]++;
|
program->statistics.vmem++;
|
||||||
} else {
|
} else {
|
||||||
vmem_clause.clear();
|
vmem_clause.clear();
|
||||||
}
|
}
|
||||||
|
|
@ -515,10 +515,10 @@ collect_preasm_stats(Program* program)
|
||||||
if (std::none_of(smem_clause.begin(), smem_clause.end(),
|
if (std::none_of(smem_clause.begin(), smem_clause.end(),
|
||||||
[&](Instruction* other)
|
[&](Instruction* other)
|
||||||
{ return should_form_clause(instr.get(), other); }))
|
{ return should_form_clause(instr.get(), other); }))
|
||||||
program->statistics[aco_statistic_smem_clauses]++;
|
program->statistics.sclause++;
|
||||||
smem_clause.insert(instr.get());
|
smem_clause.insert(instr.get());
|
||||||
|
|
||||||
program->statistics[aco_statistic_smem]++;
|
program->statistics.smem++;
|
||||||
} else {
|
} else {
|
||||||
smem_clause.clear();
|
smem_clause.clear();
|
||||||
}
|
}
|
||||||
|
|
@ -598,8 +598,8 @@ collect_preasm_stats(Program* program)
|
||||||
program->workgroup_size / (double)align(program->workgroup_size, program->wave_size);
|
program->workgroup_size / (double)align(program->workgroup_size, program->wave_size);
|
||||||
wave64_per_cycle *= max_utilization;
|
wave64_per_cycle *= max_utilization;
|
||||||
|
|
||||||
program->statistics[aco_statistic_latency] = round(latency);
|
program->statistics.latency = round(latency);
|
||||||
program->statistics[aco_statistic_inv_throughput] = round(1.0 / wave64_per_cycle);
|
program->statistics.invthroughput = round(1.0 / wave64_per_cycle);
|
||||||
|
|
||||||
if (debug_flags & DEBUG_PERF_INFO) {
|
if (debug_flags & DEBUG_PERF_INFO) {
|
||||||
aco_print_program(program, stderr, print_no_ssa | print_perf_info);
|
aco_print_program(program, stderr, print_no_ssa | print_perf_info);
|
||||||
|
|
@ -624,7 +624,7 @@ collect_preasm_stats(Program* program)
|
||||||
void
|
void
|
||||||
collect_postasm_stats(Program* program, const std::vector<uint32_t>& code)
|
collect_postasm_stats(Program* program, const std::vector<uint32_t>& code)
|
||||||
{
|
{
|
||||||
program->statistics[aco_statistic_hash] = util_hash_crc32(code.data(), code.size() * 4);
|
program->statistics.hash = util_hash_crc32(code.data(), code.size() * 4);
|
||||||
}
|
}
|
||||||
|
|
||||||
Instruction_cycle_info
|
Instruction_cycle_info
|
||||||
|
|
|
||||||
|
|
@ -814,44 +814,41 @@ radv_GetPipelineExecutableStatisticsKHR(VkDevice _device, const VkPipelineExecut
|
||||||
|
|
||||||
VK_OUTARRAY_MAKE_TYPED(VkPipelineExecutableStatisticKHR, out, pStatistics, pStatisticCount);
|
VK_OUTARRAY_MAKE_TYPED(VkPipelineExecutableStatisticKHR, out, pStatistics, pStatisticCount);
|
||||||
|
|
||||||
vk_add_exec_statistic_u64(out, "Driver pipeline hash", "Driver pipeline hash used by RGP", pipeline->pipeline_hash);
|
struct amd_stats stats = {};
|
||||||
vk_add_exec_statistic_u64(out, "SGPRs", "Number of SGPR registers allocated per subgroup", shader->config.num_sgprs);
|
if (shader->statistics)
|
||||||
vk_add_exec_statistic_u64(out, "VGPRs", "Number of VGPR registers allocated per subgroup", shader->config.num_vgprs);
|
stats = *shader->statistics;
|
||||||
vk_add_exec_statistic_u64(out, "Spilled SGPRs", "Number of SGPR registers spilled per subgroup",
|
stats.driverhash = pipeline->pipeline_hash;
|
||||||
shader->config.spilled_sgprs);
|
stats.sgprs = shader->config.num_sgprs;
|
||||||
vk_add_exec_statistic_u64(out, "Spilled VGPRs", "Number of VGPR registers spilled per subgroup",
|
stats.vgprs = shader->config.num_vgprs;
|
||||||
shader->config.spilled_vgprs);
|
stats.spillsgprs = shader->config.spilled_sgprs;
|
||||||
vk_add_exec_statistic_u64(out, "Code size", "Code size in bytes", shader->exec_size);
|
stats.spillvgprs = shader->config.spilled_vgprs;
|
||||||
vk_add_exec_statistic_u64(out, "LDS size", "LDS size in bytes per workgroup",
|
stats.codesize = shader->exec_size;
|
||||||
shader->config.lds_size * lds_increment);
|
stats.lds = shader->config.lds_size * lds_increment;
|
||||||
vk_add_exec_statistic_u64(out, "Scratch size", "Private memory in bytes per subgroup",
|
stats.scratch = shader->config.scratch_bytes_per_wave;
|
||||||
shader->config.scratch_bytes_per_wave);
|
stats.maxwaves = shader->max_waves;
|
||||||
vk_add_exec_statistic_u64(out, "Subgroups per SIMD", "The maximum number of subgroups in flight on a SIMD unit",
|
|
||||||
shader->max_waves);
|
|
||||||
|
|
||||||
uint64_t inputs = 0;
|
|
||||||
switch (stage) {
|
switch (stage) {
|
||||||
case MESA_SHADER_VERTEX:
|
case MESA_SHADER_VERTEX:
|
||||||
if (gfx_level <= GFX8 || (!shader->info.vs.as_es && !shader->info.vs.as_ls)) {
|
if (gfx_level <= GFX8 || (!shader->info.vs.as_es && !shader->info.vs.as_ls)) {
|
||||||
/* VS inputs when VS is a separate stage */
|
/* VS inputs when VS is a separate stage */
|
||||||
inputs += util_bitcount(shader->info.vs.input_slot_usage_mask);
|
stats.inputs += util_bitcount(shader->info.vs.input_slot_usage_mask);
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case MESA_SHADER_TESS_CTRL:
|
case MESA_SHADER_TESS_CTRL:
|
||||||
if (gfx_level >= GFX9) {
|
if (gfx_level >= GFX9) {
|
||||||
/* VS inputs when pipeline has tess */
|
/* VS inputs when pipeline has tess */
|
||||||
inputs += util_bitcount(shader->info.vs.input_slot_usage_mask);
|
stats.inputs += util_bitcount(shader->info.vs.input_slot_usage_mask);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* VS -> TCS inputs */
|
/* VS -> TCS inputs */
|
||||||
inputs += shader->info.tcs.num_linked_inputs;
|
stats.inputs += shader->info.tcs.num_linked_inputs;
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case MESA_SHADER_TESS_EVAL:
|
case MESA_SHADER_TESS_EVAL:
|
||||||
if (gfx_level <= GFX8 || !shader->info.tes.as_es) {
|
if (gfx_level <= GFX8 || !shader->info.tes.as_es) {
|
||||||
/* TCS -> TES inputs when TES is a separate stage */
|
/* TCS -> TES inputs when TES is a separate stage */
|
||||||
inputs += shader->info.tes.num_linked_inputs + shader->info.tes.num_linked_patch_inputs;
|
stats.inputs += shader->info.tes.num_linked_inputs + shader->info.tes.num_linked_patch_inputs;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
|
||||||
|
|
@ -863,60 +860,57 @@ radv_GetPipelineExecutableStatisticsKHR(VkDevice _device, const VkPipelineExecut
|
||||||
if (gfx_level >= GFX9) {
|
if (gfx_level >= GFX9) {
|
||||||
if (shader->info.gs.es_type == MESA_SHADER_VERTEX) {
|
if (shader->info.gs.es_type == MESA_SHADER_VERTEX) {
|
||||||
/* VS inputs when pipeline has GS but no tess */
|
/* VS inputs when pipeline has GS but no tess */
|
||||||
inputs += util_bitcount(shader->info.vs.input_slot_usage_mask);
|
stats.inputs += util_bitcount(shader->info.vs.input_slot_usage_mask);
|
||||||
} else if (shader->info.gs.es_type == MESA_SHADER_TESS_EVAL) {
|
} else if (shader->info.gs.es_type == MESA_SHADER_TESS_EVAL) {
|
||||||
/* TCS -> TES inputs when pipeline has GS */
|
/* TCS -> TES inputs when pipeline has GS */
|
||||||
inputs += shader->info.tes.num_linked_inputs + shader->info.tes.num_linked_patch_inputs;
|
stats.inputs += shader->info.tes.num_linked_inputs + shader->info.tes.num_linked_patch_inputs;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/* VS -> GS or TES -> GS inputs */
|
/* VS -> GS or TES -> GS inputs */
|
||||||
inputs += shader->info.gs.num_linked_inputs;
|
stats.inputs += shader->info.gs.num_linked_inputs;
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case MESA_SHADER_FRAGMENT:
|
case MESA_SHADER_FRAGMENT:
|
||||||
inputs += shader->info.ps.num_inputs;
|
stats.inputs += shader->info.ps.num_inputs;
|
||||||
break;
|
break;
|
||||||
|
|
||||||
default:
|
default:
|
||||||
/* Other stages don't have IO or we are not interested in them. */
|
/* Other stages don't have IO or we are not interested in them. */
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
vk_add_exec_statistic_u64(out, "Combined inputs",
|
|
||||||
"Number of input slots reserved for the shader (including merged stages)", inputs);
|
|
||||||
|
|
||||||
uint64_t outputs = 0;
|
|
||||||
switch (stage) {
|
switch (stage) {
|
||||||
case MESA_SHADER_VERTEX:
|
case MESA_SHADER_VERTEX:
|
||||||
if (!shader->info.vs.as_ls && !shader->info.vs.as_es) {
|
if (!shader->info.vs.as_ls && !shader->info.vs.as_es) {
|
||||||
/* VS -> FS outputs. */
|
/* VS -> FS outputs. */
|
||||||
outputs += shader->info.outinfo.pos_exports + shader->info.outinfo.param_exports +
|
stats.outputs += shader->info.outinfo.pos_exports + shader->info.outinfo.param_exports +
|
||||||
shader->info.outinfo.prim_param_exports;
|
shader->info.outinfo.prim_param_exports;
|
||||||
} else if (gfx_level <= GFX8) {
|
} else if (gfx_level <= GFX8) {
|
||||||
/* VS -> TCS, VS -> GS outputs on GFX6-8 */
|
/* VS -> TCS, VS -> GS outputs on GFX6-8 */
|
||||||
outputs += shader->info.vs.num_linked_outputs;
|
stats.outputs += shader->info.vs.num_linked_outputs;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case MESA_SHADER_TESS_CTRL:
|
case MESA_SHADER_TESS_CTRL:
|
||||||
if (gfx_level >= GFX9) {
|
if (gfx_level >= GFX9) {
|
||||||
/* VS -> TCS outputs on GFX9+ */
|
/* VS -> TCS outputs on GFX9+ */
|
||||||
outputs += shader->info.vs.num_linked_outputs;
|
stats.outputs += shader->info.vs.num_linked_outputs;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* TCS -> TES outputs */
|
/* TCS -> TES outputs */
|
||||||
outputs += shader->info.tcs.io_info.highest_remapped_vram_output +
|
stats.outputs += shader->info.tcs.io_info.highest_remapped_vram_output +
|
||||||
shader->info.tcs.io_info.highest_remapped_vram_patch_output;
|
shader->info.tcs.io_info.highest_remapped_vram_patch_output;
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case MESA_SHADER_TESS_EVAL:
|
case MESA_SHADER_TESS_EVAL:
|
||||||
if (!shader->info.tes.as_es) {
|
if (!shader->info.tes.as_es) {
|
||||||
/* TES -> FS outputs */
|
/* TES -> FS outputs */
|
||||||
outputs += shader->info.outinfo.pos_exports + shader->info.outinfo.param_exports +
|
stats.outputs += shader->info.outinfo.pos_exports + shader->info.outinfo.param_exports +
|
||||||
shader->info.outinfo.prim_param_exports;
|
shader->info.outinfo.prim_param_exports;
|
||||||
} else if (gfx_level <= GFX8) {
|
} else if (gfx_level <= GFX8) {
|
||||||
/* TES -> GS outputs on GFX6-8 */
|
/* TES -> GS outputs on GFX6-8 */
|
||||||
outputs += shader->info.tes.num_linked_outputs;
|
stats.outputs += shader->info.tes.num_linked_outputs;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
|
||||||
|
|
@ -928,48 +922,41 @@ radv_GetPipelineExecutableStatisticsKHR(VkDevice _device, const VkPipelineExecut
|
||||||
if (gfx_level >= GFX9) {
|
if (gfx_level >= GFX9) {
|
||||||
if (shader->info.gs.es_type == MESA_SHADER_VERTEX) {
|
if (shader->info.gs.es_type == MESA_SHADER_VERTEX) {
|
||||||
/* VS -> GS outputs on GFX9+ */
|
/* VS -> GS outputs on GFX9+ */
|
||||||
outputs += shader->info.vs.num_linked_outputs;
|
stats.outputs += shader->info.vs.num_linked_outputs;
|
||||||
} else if (shader->info.gs.es_type == MESA_SHADER_TESS_EVAL) {
|
} else if (shader->info.gs.es_type == MESA_SHADER_TESS_EVAL) {
|
||||||
/* TES -> GS outputs on GFX9+ */
|
/* TES -> GS outputs on GFX9+ */
|
||||||
outputs += shader->info.tes.num_linked_outputs;
|
stats.outputs += shader->info.tes.num_linked_outputs;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (shader->info.is_ngg) {
|
if (shader->info.is_ngg) {
|
||||||
/* GS -> FS outputs (GFX10+ NGG) */
|
/* GS -> FS outputs (GFX10+ NGG) */
|
||||||
outputs += shader->info.outinfo.pos_exports + shader->info.outinfo.param_exports +
|
stats.outputs += shader->info.outinfo.pos_exports + shader->info.outinfo.param_exports +
|
||||||
shader->info.outinfo.prim_param_exports;
|
shader->info.outinfo.prim_param_exports;
|
||||||
} else {
|
} else {
|
||||||
/* GS -> FS outputs (GFX6-10.3 legacy) */
|
/* GS -> FS outputs (GFX6-10.3 legacy) */
|
||||||
outputs += shader->info.gs.gsvs_vertex_size / 16;
|
stats.outputs += shader->info.gs.gsvs_vertex_size / 16;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case MESA_SHADER_MESH:
|
case MESA_SHADER_MESH:
|
||||||
/* MS -> FS outputs */
|
/* MS -> FS outputs */
|
||||||
outputs += shader->info.outinfo.pos_exports + shader->info.outinfo.param_exports +
|
stats.outputs += shader->info.outinfo.pos_exports + shader->info.outinfo.param_exports +
|
||||||
shader->info.outinfo.prim_param_exports;
|
shader->info.outinfo.prim_param_exports;
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case MESA_SHADER_FRAGMENT:
|
case MESA_SHADER_FRAGMENT:
|
||||||
outputs += DIV_ROUND_UP(util_bitcount(shader->info.ps.colors_written), 4) + !!shader->info.ps.writes_z +
|
stats.outputs += DIV_ROUND_UP(util_bitcount(shader->info.ps.colors_written), 4) + !!shader->info.ps.writes_z +
|
||||||
!!shader->info.ps.writes_stencil + !!shader->info.ps.writes_sample_mask +
|
!!shader->info.ps.writes_stencil + !!shader->info.ps.writes_sample_mask +
|
||||||
!!shader->info.ps.writes_mrt0_alpha;
|
!!shader->info.ps.writes_mrt0_alpha;
|
||||||
break;
|
break;
|
||||||
|
|
||||||
default:
|
default:
|
||||||
/* Other stages don't have IO or we are not interested in them. */
|
/* Other stages don't have IO or we are not interested in them. */
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
vk_add_exec_statistic_u64(out, "Combined outputs",
|
|
||||||
"Number of output slots reserved for the shader (including merged stages)", outputs);
|
|
||||||
|
|
||||||
if (shader->statistics) {
|
vk_add_amd_stats(out, &stats);
|
||||||
for (unsigned i = 0; i < aco_num_statistics; i++) {
|
|
||||||
const struct aco_compiler_statistic_info *info = &aco_statistic_infos[i];
|
|
||||||
vk_add_exec_statistic_u64(out, info->name, info->desc, shader->statistics[i]);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return vk_outarray_status(&out);
|
return vk_outarray_status(&out);
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -95,7 +95,7 @@ radv_shader_cache_deserialize(struct vk_pipeline_cache *cache, const void *key_d
|
||||||
void
|
void
|
||||||
radv_shader_serialize(struct radv_shader *shader, struct blob *blob)
|
radv_shader_serialize(struct radv_shader *shader, struct blob *blob)
|
||||||
{
|
{
|
||||||
size_t stats_size = shader->statistics ? aco_num_statistics * sizeof(uint32_t) : 0;
|
size_t stats_size = shader->statistics ? sizeof(struct amd_stats) : 0;
|
||||||
size_t code_size = shader->code_size;
|
size_t code_size = shader->code_size;
|
||||||
uint32_t total_size = sizeof(struct radv_shader_binary_legacy) + code_size + stats_size;
|
uint32_t total_size = sizeof(struct radv_shader_binary_legacy) + code_size + stats_size;
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -2984,14 +2984,15 @@ radv_dump_nir_shaders(const struct radv_instance *instance, struct nir_shader *c
|
||||||
|
|
||||||
static void
|
static void
|
||||||
radv_aco_build_shader_binary(void **bin, const struct ac_shader_config *config, const char *llvm_ir_str,
|
radv_aco_build_shader_binary(void **bin, const struct ac_shader_config *config, const char *llvm_ir_str,
|
||||||
unsigned llvm_ir_size, const char *disasm_str, unsigned disasm_size, uint32_t *statistics,
|
unsigned llvm_ir_size, const char *disasm_str, unsigned disasm_size,
|
||||||
uint32_t stats_size, uint32_t exec_size, const uint32_t *code, uint32_t code_dw,
|
struct amd_stats *statistics, uint32_t exec_size, const uint32_t *code, uint32_t code_dw,
|
||||||
const struct aco_symbol *symbols, unsigned num_symbols,
|
const struct aco_symbol *symbols, unsigned num_symbols,
|
||||||
const struct ac_shader_debug_info *debug_info, unsigned debug_info_count)
|
const struct ac_shader_debug_info *debug_info, unsigned debug_info_count)
|
||||||
{
|
{
|
||||||
struct radv_shader_binary **binary = (struct radv_shader_binary **)bin;
|
struct radv_shader_binary **binary = (struct radv_shader_binary **)bin;
|
||||||
|
|
||||||
uint32_t debug_info_size = debug_info_count * sizeof(struct ac_shader_debug_info);
|
uint32_t debug_info_size = debug_info_count * sizeof(struct ac_shader_debug_info);
|
||||||
|
uint32_t stats_size = statistics ? sizeof(struct amd_stats) : 0;
|
||||||
|
|
||||||
size_t size = llvm_ir_size;
|
size_t size = llvm_ir_size;
|
||||||
|
|
||||||
|
|
@ -3019,7 +3020,7 @@ radv_aco_build_shader_binary(void **bin, const struct ac_shader_config *config,
|
||||||
struct radv_shader_binary_layout layout = radv_shader_binary_get_layout(legacy_binary);
|
struct radv_shader_binary_layout layout = radv_shader_binary_get_layout(legacy_binary);
|
||||||
|
|
||||||
if (stats_size)
|
if (stats_size)
|
||||||
memcpy(layout.stats, statistics, stats_size);
|
amd_stats_serialize(layout.stats, statistics);
|
||||||
|
|
||||||
memcpy(layout.code, code, code_dw * sizeof(uint32_t));
|
memcpy(layout.code, code, code_dw * sizeof(uint32_t));
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -12,6 +12,7 @@
|
||||||
#define RADV_SHADER_H
|
#define RADV_SHADER_H
|
||||||
|
|
||||||
#include "util/mesa-blake3.h"
|
#include "util/mesa-blake3.h"
|
||||||
|
#include "util/shader_stats.h"
|
||||||
#include "util/u_math.h"
|
#include "util/u_math.h"
|
||||||
#include "vulkan/vulkan.h"
|
#include "vulkan/vulkan.h"
|
||||||
#include "ac_binary.h"
|
#include "ac_binary.h"
|
||||||
|
|
@ -460,7 +461,7 @@ struct radv_shader {
|
||||||
char *nir_string;
|
char *nir_string;
|
||||||
char *disasm_string;
|
char *disasm_string;
|
||||||
char *ir_string;
|
char *ir_string;
|
||||||
uint32_t *statistics;
|
struct amd_stats *statistics;
|
||||||
struct ac_shader_debug_info *debug_info;
|
struct ac_shader_debug_info *debug_info;
|
||||||
uint32_t debug_info_count;
|
uint32_t debug_info_count;
|
||||||
};
|
};
|
||||||
|
|
|
||||||
|
|
@ -105,7 +105,7 @@ si_fill_aco_shader_info(struct si_shader *shader, struct aco_shader_info *info,
|
||||||
static void
|
static void
|
||||||
si_aco_build_shader_binary(void **data, const struct ac_shader_config *config,
|
si_aco_build_shader_binary(void **data, const struct ac_shader_config *config,
|
||||||
const char *llvm_ir_str, unsigned llvm_ir_size, const char *disasm_str,
|
const char *llvm_ir_str, unsigned llvm_ir_size, const char *disasm_str,
|
||||||
unsigned disasm_size, uint32_t *statistics, uint32_t stats_size,
|
unsigned disasm_size, struct amd_stats *statistics,
|
||||||
uint32_t exec_size, const uint32_t *code, uint32_t code_dw,
|
uint32_t exec_size, const uint32_t *code, uint32_t code_dw,
|
||||||
const struct aco_symbol *symbols, unsigned num_symbols,
|
const struct aco_symbol *symbols, unsigned num_symbols,
|
||||||
const struct ac_shader_debug_info *debug_info, unsigned debug_info_count)
|
const struct ac_shader_debug_info *debug_info, unsigned debug_info_count)
|
||||||
|
|
|
||||||
|
|
@ -90,4 +90,33 @@
|
||||||
<stat name="TMU Fills" display="Fills">Number of times a register was filled from memory</stat>
|
<stat name="TMU Fills" display="Fills">Number of times a register was filled from memory</stat>
|
||||||
<stat name="QPU Read Stalls" display="Read Stalls">Number of cycles the QPU stalls for a register read dependency</stat>
|
<stat name="QPU Read Stalls" display="Read Stalls">Number of cycles the QPU stalls for a register read dependency</stat>
|
||||||
</isa>
|
</isa>
|
||||||
|
|
||||||
|
<isa name="AMD">
|
||||||
|
<stat name="Driver pipeline hash" display="DriverHash" hash="true" type="u64">Driver pipeline hash used by RGP</stat>
|
||||||
|
<stat name="SGPRs">Number of SGPR registers allocated per subgroup</stat>
|
||||||
|
<stat name="VGPRs">Number of VGPR registers allocated per subgroup</stat>
|
||||||
|
<stat name="Spilled SGPRs" display="SpillSGPRs">Number of SGPR registers spilled per subgroup</stat>
|
||||||
|
<stat name="Spilled VGPRs" display="SpillVGPRs">Number of VGPR registers spilled per subgroup</stat>
|
||||||
|
<stat name="Code size" display="CodeSize">Code size in bytes</stat>
|
||||||
|
<stat name="LDS size" display="LDS">LDS size in bytes per workgroup</stat>
|
||||||
|
<stat name="Scratch size" display="Scratch">Private memory in bytes per subgroup</stat>
|
||||||
|
<stat name="Subgroups per SIMD" display="MaxWaves" more="better">The maximum number of subgroups in flight on a SIMD unit</stat>
|
||||||
|
<stat name="Combined inputs" display="Inputs">Number of input slots reserved for the shader (including merged stages)</stat>
|
||||||
|
<stat name="Combined outputs" display="Outputs">Number of output slots reserved for the shader (including merged stages)</stat>
|
||||||
|
<stat name="Hash" hash="true">CRC32 hash of code and constant data</stat>
|
||||||
|
<stat name="Instructions" display="Instrs">Instruction count</stat>
|
||||||
|
<stat name="Copies">Copy instructions created for pseudo-instructions</stat>
|
||||||
|
<stat name="Branches">Branch instructions</stat>
|
||||||
|
<stat name="Latency">Issue cycles plus stall cycles</stat>
|
||||||
|
<stat name="Inverse Throughput" display="InvThroughput">Estimated busy cycles to execute one wave</stat>
|
||||||
|
<stat name="VMEM Clause" display="VClause">Number of VMEM clauses (includes 1-sized clauses)</stat>
|
||||||
|
<stat name="SMEM Clause" display="SClause">Number of SMEM clauses (includes 1-sized clauses)</stat>
|
||||||
|
<stat name="Pre-Sched SGPRs" display="PreSGPRs">SGPR usage before scheduling</stat>
|
||||||
|
<stat name="Pre-Sched VGPRs" display="PreVGPRs">VGPR usage before scheduling</stat>
|
||||||
|
<stat name="VALU">Number of VALU instructions</stat>
|
||||||
|
<stat name="SALU">Number of SALU instructions</stat>
|
||||||
|
<stat name="VMEM">Number of VMEM instructions</stat>
|
||||||
|
<stat name="SMEM">Number of SMEM instructions</stat>
|
||||||
|
<stat name="VOPD" more="better">Number of VOPD instructions</stat>
|
||||||
|
</isa>
|
||||||
</shaderdb>
|
</shaderdb>
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue