diff --git a/src/gallium/drivers/panfrost/pan_shader.c b/src/gallium/drivers/panfrost/pan_shader.c index f0ad06fee4c..e762c8437cf 100644 --- a/src/gallium/drivers/panfrost/pan_shader.c +++ b/src/gallium/drivers/panfrost/pan_shader.c @@ -36,6 +36,7 @@ #include "nir_serialize.h" #include "pan_bo.h" #include "pan_context.h" +#include "shader_enums.h" static struct panfrost_uncompiled_shader * panfrost_alloc_shader(const nir_shader *nir) @@ -128,7 +129,6 @@ panfrost_shader_compile(struct panfrost_screen *screen, const nir_shader *ir, pan_shader_preprocess(s, panfrost_device_gpu_id(dev)); struct panfrost_compile_inputs inputs = { - .debug = dbg, .gpu_id = panfrost_device_gpu_id(dev), }; @@ -201,6 +201,14 @@ panfrost_shader_compile(struct panfrost_screen *screen, const nir_shader *ir, screen->vtbl.compile_shader(s, &inputs, &out->binary, &out->info); + panfrost_stats_util_debug(dbg, gl_shader_stage_name(s->info.stage), + &out->info.stats); + + if (s->info.stage == MESA_SHADER_VERTEX && out->info.vs.idvs) { + panfrost_stats_util_debug(dbg, "MESA_SHADER_POSITION", + &out->info.stats_idvs_varying); + } + assert(req_local_mem >= out->info.wls_size); out->info.wls_size = req_local_mem; diff --git a/src/panfrost/compiler/bifrost_compile.c b/src/panfrost/compiler/bifrost_compile.c index ce2738bd373..4d8eb9e3b95 100644 --- a/src/panfrost/compiler/bifrost_compile.c +++ b/src/panfrost/compiler/bifrost_compile.c @@ -28,6 +28,7 @@ #include "compiler/glsl/glsl_to_nir.h" #include "compiler/glsl_types.h" #include "compiler/nir/nir_builder.h" +#include "panfrost/util/pan_ir.h" #include "util/u_debug.h" #include "bifrost/disassemble.h" @@ -4737,10 +4738,10 @@ bi_shader_stage_name(bi_context *ctx) return gl_shader_stage_name(ctx->stage); } -static char * -bi_print_stats(bi_context *ctx, unsigned size) +static void +bi_gather_stats(bi_context *ctx, unsigned size, struct bifrost_stats *out) { - struct bi_stats stats = {0}; + struct bi_stats counts = {0}; /* Count instructions, clauses, and tuples. Also attempt to construct * normalized execution engine cycle counts, using the following ratio: @@ -4756,57 +4757,46 @@ bi_print_stats(bi_context *ctx, unsigned size) bi_foreach_block(ctx, block) { bi_foreach_clause_in_block(block, clause) { - stats.nr_clauses++; - stats.nr_tuples += clause->tuple_count; + counts.nr_clauses++; + counts.nr_tuples += clause->tuple_count; for (unsigned i = 0; i < clause->tuple_count; ++i) - bi_count_tuple_stats(clause, &clause->tuples[i], &stats); + bi_count_tuple_stats(clause, &clause->tuples[i], &counts); } } - float cycles_arith = ((float)stats.nr_arith) / 24.0; - float cycles_texture = ((float)stats.nr_texture) / 2.0; - float cycles_varying = ((float)stats.nr_varying) / 16.0; - float cycles_ldst = ((float)stats.nr_ldst) / 1.0; - - float cycles_message = MAX3(cycles_texture, cycles_varying, cycles_ldst); - float cycles_bound = MAX2(cycles_arith, cycles_message); - /* Thread count and register pressure are traded off only on v7 */ bool full_threads = (ctx->arch == 7 && ctx->info.work_reg_count <= 32); - unsigned nr_threads = full_threads ? 2 : 1; - /* Dump stats */ - char *str = ralloc_asprintf( - NULL, - "%s shader: " - "%u inst, %u tuples, %u clauses, " - "%f cycles, %f arith, %f texture, %f vary, %f ldst, " - "%u quadwords, %u threads", - bi_shader_stage_name(ctx), stats.nr_ins, stats.nr_tuples, - stats.nr_clauses, cycles_bound, cycles_arith, cycles_texture, - cycles_varying, cycles_ldst, size / 16, nr_threads); + *out = (struct bifrost_stats){ + .instrs = counts.nr_ins, + .tuples = counts.nr_tuples, + .clauses = counts.nr_clauses, + .arith = ((float)counts.nr_arith) / 24.0, + .t = ((float)counts.nr_texture) / 2.0, + .v = ((float)counts.nr_varying) / 16.0, + .ldst = ((float)counts.nr_ldst) / 1.0, + .code_size = size, + .preloads = ctx->arch == 7 ? bi_count_preload_cost(ctx) : 0, + .threads = full_threads ? 2 : 1, + .loops = ctx->loop_count, + .spills = ctx->spills, + .fills = ctx->fills, + }; - if (ctx->arch == 7) { - ralloc_asprintf_append(&str, ", %u preloads", bi_count_preload_cost(ctx)); - } - - ralloc_asprintf_append(&str, ", %u loops, %u:%u spills:fills", - ctx->loop_count, ctx->spills, ctx->fills); - - return str; + out->cycles = MAX2(out->arith, MAX3(out->t, out->v, out->ldst)); } -static char * -va_print_stats(bi_context *ctx, unsigned size) +static void +va_gather_stats(bi_context *ctx, unsigned size, struct valhall_stats *out) { unsigned nr_ins = 0; - struct va_stats stats = {0}; + struct va_stats counts = {0}; /* Count instructions */ bi_foreach_instr_global(ctx, I) { nr_ins++; - va_count_instr_stats(I, &stats); + va_count_instr_stats(I, &counts); } /* Mali G78 peak performance: @@ -4818,31 +4808,24 @@ va_print_stats(bi_context *ctx, unsigned size) * 4 texture instructions per cycle * 1 load/store operation per cycle */ - - float cycles_fma = ((float)stats.fma) / 64.0; - float cycles_cvt = ((float)stats.cvt) / 64.0; - float cycles_sfu = ((float)stats.sfu) / 16.0; - float cycles_v = ((float)stats.v) / 16.0; - float cycles_t = ((float)stats.t) / 4.0; - float cycles_ls = ((float)stats.ls) / 1.0; + *out = (struct valhall_stats){ + .instrs = nr_ins, + .code_size = size, + .fma = ((float)counts.fma) / 64.0, + .cvt = ((float)counts.cvt) / 64.0, + .sfu = ((float)counts.sfu) / 16.0, + .v = ((float)counts.v) / 16.0, + .t = ((float)counts.t) / 4.0, + .ls = ((float)counts.ls) / 1.0, + .threads = (ctx->info.work_reg_count <= 32) ? 2 : 1, + .loops = ctx->loop_count, + .spills = ctx->spills, + .fills = ctx->fills, + }; /* Calculate the bound */ - float cycles = MAX2(MAX3(cycles_fma, cycles_cvt, cycles_sfu), - MAX3(cycles_v, cycles_t, cycles_ls)); - - /* Thread count and register pressure are traded off */ - unsigned nr_threads = (ctx->info.work_reg_count <= 32) ? 2 : 1; - - /* Dump stats */ - return ralloc_asprintf(NULL, - "%s shader: " - "%u inst, %f cycles, %f fma, %f cvt, %f sfu, %f v, " - "%f t, %f ls, %u quadwords, %u threads, %u loops, " - "%u:%u spills:fills", - bi_shader_stage_name(ctx), nr_ins, cycles, cycles_fma, - cycles_cvt, cycles_sfu, cycles_v, cycles_t, cycles_ls, - size / 16, nr_threads, ctx->loop_count, ctx->spills, - ctx->fills); + out->cycles = + MAX2(MAX3(out->fma, out->cvt, out->sfu), MAX3(out->v, out->t, out->ls)); } static int @@ -5748,7 +5731,7 @@ static bi_context * bi_compile_variant_nir(nir_shader *nir, const struct panfrost_compile_inputs *inputs, struct util_dynarray *binary, struct bi_shader_info info, - enum bi_idvs_mode idvs) + struct panfrost_stats *stats, enum bi_idvs_mode idvs) { bi_context *ctx = rzalloc(NULL, bi_context); @@ -5985,23 +5968,17 @@ bi_compile_variant_nir(nir_shader *nir, fflush(stdout); } - if (!skip_internal && - ((bifrost_debug & BIFROST_DBG_SHADERDB) || inputs->debug)) { - char *shaderdb; + if (ctx->arch >= 9) { + stats->isa = PANFROST_STAT_VALHALL; + va_gather_stats(ctx, binary->size - offset, &stats->valhall); + } else { + stats->isa = PANFROST_STAT_BIFROST; + bi_gather_stats(ctx, binary->size - offset, &stats->bifrost); + } - if (ctx->arch >= 9) { - shaderdb = va_print_stats(ctx, binary->size - offset); - } else { - shaderdb = bi_print_stats(ctx, binary->size - offset); - } - - if (bifrost_debug & BIFROST_DBG_SHADERDB) - fprintf(stderr, "SHADER-DB: %s\n", shaderdb); - - if (inputs->debug) - util_debug_message(inputs->debug, SHADER_INFO, "%s", shaderdb); - - ralloc_free(shaderdb); + if ((bifrost_debug & BIFROST_DBG_SHADERDB) && !skip_internal) { + const char *prefix = bi_shader_stage_name(ctx); + panfrost_stats_fprintf(stderr, prefix, stats); } return ctx; @@ -6034,8 +6011,11 @@ bi_compile_variant(nir_shader *nir, * offset, to keep the ABI simple. */ assert((offset == 0) ^ (idvs == BI_IDVS_VARYING)); + struct panfrost_stats *stats = + idvs == BI_IDVS_VARYING ? &info->stats_idvs_varying : &info->stats; + bi_context *ctx = - bi_compile_variant_nir(nir, inputs, binary, local_info, idvs); + bi_compile_variant_nir(nir, inputs, binary, local_info, stats, idvs); /* A register is preloaded <==> it is live before the first block */ bi_block *first_block = list_first_entry(&ctx->blocks, bi_block, link); diff --git a/src/panfrost/compiler/compiler.h b/src/panfrost/compiler/compiler.h index 672699add68..f27c7632ccf 100644 --- a/src/panfrost/compiler/compiler.h +++ b/src/panfrost/compiler/compiler.h @@ -30,6 +30,7 @@ #include "compiler/nir/nir.h" #include "panfrost/util/pan_ir.h" #include "util/half_float.h" +#include "util/shader_stats.h" #include "util/u_math.h" #include "util/u_worklist.h" #include "bi_opcodes.h" @@ -834,6 +835,7 @@ bi_block_add_successor(bi_block *block, bi_block *successor) struct bi_shader_info { struct panfrost_ubo_push *push; struct bifrost_shader_info *bifrost; + struct panfrost_stats stats; unsigned tls_size; unsigned work_reg_count; unsigned push_offset; diff --git a/src/panfrost/midgard/midgard_compile.c b/src/panfrost/midgard/midgard_compile.c index c045b002c6c..39c50038c88 100644 --- a/src/panfrost/midgard/midgard_compile.c +++ b/src/panfrost/midgard/midgard_compile.c @@ -36,6 +36,7 @@ #include "compiler/nir/nir_builder.h" #include "util/half_float.h" #include "util/list.h" +#include "util/shader_stats.h" #include "util/u_debug.h" #include "util/u_dynarray.h" #include "util/u_math.h" @@ -49,6 +50,7 @@ #include "midgard_quirks.h" #include "disassemble.h" +#include "shader_enums.h" static const struct debug_named_value midgard_debug_options[] = { {"shaders", MIDGARD_DBG_SHADERS, "Dump shaders in NIR and MIR"}, @@ -3155,51 +3157,34 @@ midgard_compile_shader_nir(nir_shader *nir, if (binary->size) memset(util_dynarray_grow(binary, uint8_t, 16), 0, 16); - if ((midgard_debug & MIDGARD_DBG_SHADERDB || inputs->debug) && - !nir->info.internal) { - unsigned nr_bundles = 0, nr_ins = 0; + struct midgard_stats stats = { + .quadwords = ctx->quadword_count, + .registers = info->work_reg_count, + .loops = ctx->loop_count, + .spills = ctx->spills, + .fills = ctx->fills, + }; - /* Count instructions and bundles */ + /* Count instructions and bundles */ + mir_foreach_block(ctx, _block) { + midgard_block *block = (midgard_block *)_block; + stats.bundles += + util_dynarray_num_elements(&block->bundles, midgard_bundle); - mir_foreach_block(ctx, _block) { - midgard_block *block = (midgard_block *)_block; - nr_bundles += - util_dynarray_num_elements(&block->bundles, midgard_bundle); + mir_foreach_bundle_in_block(block, bun) + stats.inst += bun->instruction_count; + } - mir_foreach_bundle_in_block(block, bun) - nr_ins += bun->instruction_count; - } + /* Calculate thread count. There are certain cutoffs by + * register count for thread count */ + stats.threads = (stats.registers <= 4) ? 4 : (stats.registers <= 8) ? 2 : 1; - /* Calculate thread count. There are certain cutoffs by - * register count for thread count */ + info->stats.isa = PANFROST_STAT_MIDGARD; + info->stats.midgard = stats; - unsigned nr_registers = info->work_reg_count; - - unsigned nr_threads = (nr_registers <= 4) ? 4 - : (nr_registers <= 8) ? 2 - : 1; - - char *shaderdb = NULL; - - /* Dump stats */ - - asprintf(&shaderdb, - "%s shader: " - "%u inst, %u bundles, %u quadwords, " - "%u registers, %u threads, %u loops, " - "%u:%u spills:fills", - ctx->inputs->is_blend ? "PAN_SHADER_BLEND" - : gl_shader_stage_name(ctx->stage), - nr_ins, nr_bundles, ctx->quadword_count, nr_registers, - nr_threads, ctx->loop_count, ctx->spills, ctx->fills); - - if (midgard_debug & MIDGARD_DBG_SHADERDB) - fprintf(stderr, "SHADER-DB: %s\n", shaderdb); - - if (inputs->debug) - util_debug_message(inputs->debug, SHADER_INFO, "%s", shaderdb); - - free(shaderdb); + if ((midgard_debug & MIDGARD_DBG_SHADERDB) && !nir->info.internal) { + const char *prefix = _mesa_shader_stage_to_abbrev(ctx->stage); + midgard_stats_fprintf(stderr, prefix, &stats); } _mesa_hash_table_u64_destroy(ctx->ssa_constants); diff --git a/src/panfrost/util/pan_ir.h b/src/panfrost/util/pan_ir.h index 0459c45af20..7920d0e072e 100644 --- a/src/panfrost/util/pan_ir.h +++ b/src/panfrost/util/pan_ir.h @@ -27,6 +27,7 @@ #include #include "compiler/nir/nir.h" #include "util/hash_table.h" +#include "util/shader_stats.h" #include "util/u_dynarray.h" /* Indices for named (non-XFB) varyings that are present. These are packed @@ -95,8 +96,6 @@ unsigned pan_lookup_pushed_ubo(struct panfrost_ubo_push *push, unsigned ubo, unsigned offs); struct panfrost_compile_inputs { - struct util_debug_callback *debug; - unsigned gpu_id; bool is_blend, is_blit; struct { @@ -196,6 +195,8 @@ struct pan_shader_info { unsigned tls_size; unsigned wls_size; + struct panfrost_stats stats, stats_idvs_varying; + /* Bit mask of preloaded registers */ uint64_t preload; diff --git a/src/panfrost/vulkan/panvk_vX_shader.c b/src/panfrost/vulkan/panvk_vX_shader.c index 55ad829114e..00d5982c3d3 100644 --- a/src/panfrost/vulkan/panvk_vX_shader.c +++ b/src/panfrost/vulkan/panvk_vX_shader.c @@ -41,11 +41,13 @@ #include "spirv/nir_spirv.h" #include "util/memstream.h" #include "util/mesa-sha1.h" +#include "util/shader_stats.h" #include "util/u_dynarray.h" #include "nir_builder.h" #include "nir_conversion_builder.h" #include "nir_deref.h" +#include "shader_enums.h" #include "vk_graphics_state.h" #include "vk_nir_convert_ycbcr.h" #include "vk_shader_module.h" @@ -1416,13 +1418,6 @@ panvk_shader_serialize(struct vk_device *vk_dev, return !blob->out_of_memory; } -#define WRITE_STR(field, ...) \ - ({ \ - memset(field, 0, sizeof(field)); \ - UNUSED int i = snprintf(field, sizeof(field), __VA_ARGS__); \ - assert(i > 0 && i < sizeof(field)); \ - }) - static VkResult panvk_shader_get_executable_properties( UNUSED struct vk_device *device, const struct vk_shader *vk_shader, @@ -1438,10 +1433,20 @@ panvk_shader_get_executable_properties( { props->stages = mesa_to_vk_shader_stage(shader->info.stage); props->subgroupSize = 8; - WRITE_STR(props->name, "%s", - _mesa_shader_stage_to_string(shader->info.stage)); - WRITE_STR(props->description, "%s shader", - _mesa_shader_stage_to_string(shader->info.stage)); + VK_COPY_STR(props->name, + _mesa_shader_stage_to_string(shader->info.stage)); + VK_PRINT_STR(props->description, "%s shader", + _mesa_shader_stage_to_string(shader->info.stage)); + } + + if (shader->info.stage == MESA_SHADER_VERTEX && shader->info.vs.idvs) { + vk_outarray_append_typed(VkPipelineExecutablePropertiesKHR, &out, props) + { + props->stages = mesa_to_vk_shader_stage(shader->info.stage); + props->subgroupSize = 8; + VK_COPY_STR(props->name, "varying"); + VK_COPY_STR(props->description, "Varying shader"); + } } return vk_outarray_status(&out); @@ -1459,19 +1464,11 @@ panvk_shader_get_executable_statistics( VK_OUTARRAY_MAKE_TYPED(VkPipelineExecutableStatisticKHR, out, statistics, statistic_count); - assert(executable_index == 0); - - vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) - { - WRITE_STR(stat->name, "Code Size"); - WRITE_STR(stat->description, - "Size of the compiled shader binary, in bytes"); - stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR; - stat->value.u64 = shader->bin_size; - } - - /* TODO: more executable statistics (VK_KHR_pipeline_executable_properties) */ + assert(executable_index == 0 || executable_index == 1); + struct panfrost_stats *stats = + executable_index ? &shader->info.stats_idvs_varying : &shader->info.stats; + vk_add_panfrost_stats(out, stats); return vk_outarray_status(&out); } @@ -1513,8 +1510,8 @@ panvk_shader_get_executable_internal_representations( vk_outarray_append_typed(VkPipelineExecutableInternalRepresentationKHR, &out, ir) { - WRITE_STR(ir->name, "NIR shader"); - WRITE_STR(ir->description, + VK_COPY_STR(ir->name, "NIR shader"); + VK_COPY_STR(ir->description, "NIR shader before sending to the back-end compiler"); if (!write_ir_text(ir, shader->nir_str)) incomplete_text = true; @@ -1525,8 +1522,8 @@ panvk_shader_get_executable_internal_representations( vk_outarray_append_typed(VkPipelineExecutableInternalRepresentationKHR, &out, ir) { - WRITE_STR(ir->name, "Assembly"); - WRITE_STR(ir->description, "Final Assembly"); + VK_COPY_STR(ir->name, "Assembly"); + VK_COPY_STR(ir->description, "Final Assembly"); if (!write_ir_text(ir, shader->asm_str)) incomplete_text = true; } diff --git a/src/util/shader_stats.xml b/src/util/shader_stats.xml index 89b37e1e3fa..6a3c84eb742 100644 --- a/src/util/shader_stats.xml +++ b/src/util/shader_stats.xml @@ -13,4 +13,50 @@ Number of spill (stack store) instructions Number of fill (stack load) instructions + + + + Instruction count + Instruction bundles + Register usage in vec4s + Maximum number of threads in flight on a compute unit + Binary size in quadwords + Number of hardware loops + Number of spill instructions + Number of fill instructions + + + + Instruction count + Tuple count + Clause count + Estimated normalized cycles + Estimated normalized arithmetic cycles + Estimated normalized Texture cycles + Estimated normalized Load/Store cycles + Estimated normalized Varying cycles + Preload count + Maximum number of threads in flight on a compute unit + Binary size in bytes + Number of hardware loops + Number of spill instructions + Number of fill instructions + + + + Instruction count + Estimated normalized cycles + Estimated normalized FMA (Fused Multiply-Add) cycles + Estimated normalized CVT (ConVerT) cycles + Estimated normalized SFU (Special Function Unit) cycles + Estimated normalized Varying cycles + Estimated normalized Texture cycles + Estimated normalized Load/Store cycles + Binary size in bytes + Maximum number of threads in flight on a compute unit + Number of hardware loops + Number of spill instructions + Number of fill instructions + +