panfrost: port to common stats framework

This adds full support for executable statistics in panvk.

Signed-off-by: Alyssa Rosenzweig <alyssa@rosenzweig.io>
Acked-by: Boris Brezillon <boris.brezillon@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/33921>
This commit is contained in:
Alyssa Rosenzweig 2025-02-26 11:16:36 -05:00 committed by Marge Bot
parent 7b1f1a107e
commit 4da7b12000
7 changed files with 167 additions and 148 deletions

View file

@ -36,6 +36,7 @@
#include "nir_serialize.h"
#include "pan_bo.h"
#include "pan_context.h"
#include "shader_enums.h"
static struct panfrost_uncompiled_shader *
panfrost_alloc_shader(const nir_shader *nir)
@ -128,7 +129,6 @@ panfrost_shader_compile(struct panfrost_screen *screen, const nir_shader *ir,
pan_shader_preprocess(s, panfrost_device_gpu_id(dev));
struct panfrost_compile_inputs inputs = {
.debug = dbg,
.gpu_id = panfrost_device_gpu_id(dev),
};
@ -201,6 +201,14 @@ panfrost_shader_compile(struct panfrost_screen *screen, const nir_shader *ir,
screen->vtbl.compile_shader(s, &inputs, &out->binary, &out->info);
panfrost_stats_util_debug(dbg, gl_shader_stage_name(s->info.stage),
&out->info.stats);
if (s->info.stage == MESA_SHADER_VERTEX && out->info.vs.idvs) {
panfrost_stats_util_debug(dbg, "MESA_SHADER_POSITION",
&out->info.stats_idvs_varying);
}
assert(req_local_mem >= out->info.wls_size);
out->info.wls_size = req_local_mem;

View file

@ -28,6 +28,7 @@
#include "compiler/glsl/glsl_to_nir.h"
#include "compiler/glsl_types.h"
#include "compiler/nir/nir_builder.h"
#include "panfrost/util/pan_ir.h"
#include "util/u_debug.h"
#include "bifrost/disassemble.h"
@ -4737,10 +4738,10 @@ bi_shader_stage_name(bi_context *ctx)
return gl_shader_stage_name(ctx->stage);
}
static char *
bi_print_stats(bi_context *ctx, unsigned size)
static void
bi_gather_stats(bi_context *ctx, unsigned size, struct bifrost_stats *out)
{
struct bi_stats stats = {0};
struct bi_stats counts = {0};
/* Count instructions, clauses, and tuples. Also attempt to construct
* normalized execution engine cycle counts, using the following ratio:
@ -4756,57 +4757,46 @@ bi_print_stats(bi_context *ctx, unsigned size)
bi_foreach_block(ctx, block) {
bi_foreach_clause_in_block(block, clause) {
stats.nr_clauses++;
stats.nr_tuples += clause->tuple_count;
counts.nr_clauses++;
counts.nr_tuples += clause->tuple_count;
for (unsigned i = 0; i < clause->tuple_count; ++i)
bi_count_tuple_stats(clause, &clause->tuples[i], &stats);
bi_count_tuple_stats(clause, &clause->tuples[i], &counts);
}
}
float cycles_arith = ((float)stats.nr_arith) / 24.0;
float cycles_texture = ((float)stats.nr_texture) / 2.0;
float cycles_varying = ((float)stats.nr_varying) / 16.0;
float cycles_ldst = ((float)stats.nr_ldst) / 1.0;
float cycles_message = MAX3(cycles_texture, cycles_varying, cycles_ldst);
float cycles_bound = MAX2(cycles_arith, cycles_message);
/* Thread count and register pressure are traded off only on v7 */
bool full_threads = (ctx->arch == 7 && ctx->info.work_reg_count <= 32);
unsigned nr_threads = full_threads ? 2 : 1;
/* Dump stats */
char *str = ralloc_asprintf(
NULL,
"%s shader: "
"%u inst, %u tuples, %u clauses, "
"%f cycles, %f arith, %f texture, %f vary, %f ldst, "
"%u quadwords, %u threads",
bi_shader_stage_name(ctx), stats.nr_ins, stats.nr_tuples,
stats.nr_clauses, cycles_bound, cycles_arith, cycles_texture,
cycles_varying, cycles_ldst, size / 16, nr_threads);
*out = (struct bifrost_stats){
.instrs = counts.nr_ins,
.tuples = counts.nr_tuples,
.clauses = counts.nr_clauses,
.arith = ((float)counts.nr_arith) / 24.0,
.t = ((float)counts.nr_texture) / 2.0,
.v = ((float)counts.nr_varying) / 16.0,
.ldst = ((float)counts.nr_ldst) / 1.0,
.code_size = size,
.preloads = ctx->arch == 7 ? bi_count_preload_cost(ctx) : 0,
.threads = full_threads ? 2 : 1,
.loops = ctx->loop_count,
.spills = ctx->spills,
.fills = ctx->fills,
};
if (ctx->arch == 7) {
ralloc_asprintf_append(&str, ", %u preloads", bi_count_preload_cost(ctx));
}
ralloc_asprintf_append(&str, ", %u loops, %u:%u spills:fills",
ctx->loop_count, ctx->spills, ctx->fills);
return str;
out->cycles = MAX2(out->arith, MAX3(out->t, out->v, out->ldst));
}
static char *
va_print_stats(bi_context *ctx, unsigned size)
static void
va_gather_stats(bi_context *ctx, unsigned size, struct valhall_stats *out)
{
unsigned nr_ins = 0;
struct va_stats stats = {0};
struct va_stats counts = {0};
/* Count instructions */
bi_foreach_instr_global(ctx, I) {
nr_ins++;
va_count_instr_stats(I, &stats);
va_count_instr_stats(I, &counts);
}
/* Mali G78 peak performance:
@ -4818,31 +4808,24 @@ va_print_stats(bi_context *ctx, unsigned size)
* 4 texture instructions per cycle
* 1 load/store operation per cycle
*/
float cycles_fma = ((float)stats.fma) / 64.0;
float cycles_cvt = ((float)stats.cvt) / 64.0;
float cycles_sfu = ((float)stats.sfu) / 16.0;
float cycles_v = ((float)stats.v) / 16.0;
float cycles_t = ((float)stats.t) / 4.0;
float cycles_ls = ((float)stats.ls) / 1.0;
*out = (struct valhall_stats){
.instrs = nr_ins,
.code_size = size,
.fma = ((float)counts.fma) / 64.0,
.cvt = ((float)counts.cvt) / 64.0,
.sfu = ((float)counts.sfu) / 16.0,
.v = ((float)counts.v) / 16.0,
.t = ((float)counts.t) / 4.0,
.ls = ((float)counts.ls) / 1.0,
.threads = (ctx->info.work_reg_count <= 32) ? 2 : 1,
.loops = ctx->loop_count,
.spills = ctx->spills,
.fills = ctx->fills,
};
/* Calculate the bound */
float cycles = MAX2(MAX3(cycles_fma, cycles_cvt, cycles_sfu),
MAX3(cycles_v, cycles_t, cycles_ls));
/* Thread count and register pressure are traded off */
unsigned nr_threads = (ctx->info.work_reg_count <= 32) ? 2 : 1;
/* Dump stats */
return ralloc_asprintf(NULL,
"%s shader: "
"%u inst, %f cycles, %f fma, %f cvt, %f sfu, %f v, "
"%f t, %f ls, %u quadwords, %u threads, %u loops, "
"%u:%u spills:fills",
bi_shader_stage_name(ctx), nr_ins, cycles, cycles_fma,
cycles_cvt, cycles_sfu, cycles_v, cycles_t, cycles_ls,
size / 16, nr_threads, ctx->loop_count, ctx->spills,
ctx->fills);
out->cycles =
MAX2(MAX3(out->fma, out->cvt, out->sfu), MAX3(out->v, out->t, out->ls));
}
static int
@ -5748,7 +5731,7 @@ static bi_context *
bi_compile_variant_nir(nir_shader *nir,
const struct panfrost_compile_inputs *inputs,
struct util_dynarray *binary, struct bi_shader_info info,
enum bi_idvs_mode idvs)
struct panfrost_stats *stats, enum bi_idvs_mode idvs)
{
bi_context *ctx = rzalloc(NULL, bi_context);
@ -5985,23 +5968,17 @@ bi_compile_variant_nir(nir_shader *nir,
fflush(stdout);
}
if (!skip_internal &&
((bifrost_debug & BIFROST_DBG_SHADERDB) || inputs->debug)) {
char *shaderdb;
if (ctx->arch >= 9) {
stats->isa = PANFROST_STAT_VALHALL;
va_gather_stats(ctx, binary->size - offset, &stats->valhall);
} else {
stats->isa = PANFROST_STAT_BIFROST;
bi_gather_stats(ctx, binary->size - offset, &stats->bifrost);
}
if (ctx->arch >= 9) {
shaderdb = va_print_stats(ctx, binary->size - offset);
} else {
shaderdb = bi_print_stats(ctx, binary->size - offset);
}
if (bifrost_debug & BIFROST_DBG_SHADERDB)
fprintf(stderr, "SHADER-DB: %s\n", shaderdb);
if (inputs->debug)
util_debug_message(inputs->debug, SHADER_INFO, "%s", shaderdb);
ralloc_free(shaderdb);
if ((bifrost_debug & BIFROST_DBG_SHADERDB) && !skip_internal) {
const char *prefix = bi_shader_stage_name(ctx);
panfrost_stats_fprintf(stderr, prefix, stats);
}
return ctx;
@ -6034,8 +6011,11 @@ bi_compile_variant(nir_shader *nir,
* offset, to keep the ABI simple. */
assert((offset == 0) ^ (idvs == BI_IDVS_VARYING));
struct panfrost_stats *stats =
idvs == BI_IDVS_VARYING ? &info->stats_idvs_varying : &info->stats;
bi_context *ctx =
bi_compile_variant_nir(nir, inputs, binary, local_info, idvs);
bi_compile_variant_nir(nir, inputs, binary, local_info, stats, idvs);
/* A register is preloaded <==> it is live before the first block */
bi_block *first_block = list_first_entry(&ctx->blocks, bi_block, link);

View file

@ -30,6 +30,7 @@
#include "compiler/nir/nir.h"
#include "panfrost/util/pan_ir.h"
#include "util/half_float.h"
#include "util/shader_stats.h"
#include "util/u_math.h"
#include "util/u_worklist.h"
#include "bi_opcodes.h"
@ -834,6 +835,7 @@ bi_block_add_successor(bi_block *block, bi_block *successor)
struct bi_shader_info {
struct panfrost_ubo_push *push;
struct bifrost_shader_info *bifrost;
struct panfrost_stats stats;
unsigned tls_size;
unsigned work_reg_count;
unsigned push_offset;

View file

@ -36,6 +36,7 @@
#include "compiler/nir/nir_builder.h"
#include "util/half_float.h"
#include "util/list.h"
#include "util/shader_stats.h"
#include "util/u_debug.h"
#include "util/u_dynarray.h"
#include "util/u_math.h"
@ -49,6 +50,7 @@
#include "midgard_quirks.h"
#include "disassemble.h"
#include "shader_enums.h"
static const struct debug_named_value midgard_debug_options[] = {
{"shaders", MIDGARD_DBG_SHADERS, "Dump shaders in NIR and MIR"},
@ -3155,51 +3157,34 @@ midgard_compile_shader_nir(nir_shader *nir,
if (binary->size)
memset(util_dynarray_grow(binary, uint8_t, 16), 0, 16);
if ((midgard_debug & MIDGARD_DBG_SHADERDB || inputs->debug) &&
!nir->info.internal) {
unsigned nr_bundles = 0, nr_ins = 0;
struct midgard_stats stats = {
.quadwords = ctx->quadword_count,
.registers = info->work_reg_count,
.loops = ctx->loop_count,
.spills = ctx->spills,
.fills = ctx->fills,
};
/* Count instructions and bundles */
/* Count instructions and bundles */
mir_foreach_block(ctx, _block) {
midgard_block *block = (midgard_block *)_block;
stats.bundles +=
util_dynarray_num_elements(&block->bundles, midgard_bundle);
mir_foreach_block(ctx, _block) {
midgard_block *block = (midgard_block *)_block;
nr_bundles +=
util_dynarray_num_elements(&block->bundles, midgard_bundle);
mir_foreach_bundle_in_block(block, bun)
stats.inst += bun->instruction_count;
}
mir_foreach_bundle_in_block(block, bun)
nr_ins += bun->instruction_count;
}
/* Calculate thread count. There are certain cutoffs by
* register count for thread count */
stats.threads = (stats.registers <= 4) ? 4 : (stats.registers <= 8) ? 2 : 1;
/* Calculate thread count. There are certain cutoffs by
* register count for thread count */
info->stats.isa = PANFROST_STAT_MIDGARD;
info->stats.midgard = stats;
unsigned nr_registers = info->work_reg_count;
unsigned nr_threads = (nr_registers <= 4) ? 4
: (nr_registers <= 8) ? 2
: 1;
char *shaderdb = NULL;
/* Dump stats */
asprintf(&shaderdb,
"%s shader: "
"%u inst, %u bundles, %u quadwords, "
"%u registers, %u threads, %u loops, "
"%u:%u spills:fills",
ctx->inputs->is_blend ? "PAN_SHADER_BLEND"
: gl_shader_stage_name(ctx->stage),
nr_ins, nr_bundles, ctx->quadword_count, nr_registers,
nr_threads, ctx->loop_count, ctx->spills, ctx->fills);
if (midgard_debug & MIDGARD_DBG_SHADERDB)
fprintf(stderr, "SHADER-DB: %s\n", shaderdb);
if (inputs->debug)
util_debug_message(inputs->debug, SHADER_INFO, "%s", shaderdb);
free(shaderdb);
if ((midgard_debug & MIDGARD_DBG_SHADERDB) && !nir->info.internal) {
const char *prefix = _mesa_shader_stage_to_abbrev(ctx->stage);
midgard_stats_fprintf(stderr, prefix, &stats);
}
_mesa_hash_table_u64_destroy(ctx->ssa_constants);

View file

@ -27,6 +27,7 @@
#include <stdint.h>
#include "compiler/nir/nir.h"
#include "util/hash_table.h"
#include "util/shader_stats.h"
#include "util/u_dynarray.h"
/* Indices for named (non-XFB) varyings that are present. These are packed
@ -95,8 +96,6 @@ unsigned pan_lookup_pushed_ubo(struct panfrost_ubo_push *push, unsigned ubo,
unsigned offs);
struct panfrost_compile_inputs {
struct util_debug_callback *debug;
unsigned gpu_id;
bool is_blend, is_blit;
struct {
@ -196,6 +195,8 @@ struct pan_shader_info {
unsigned tls_size;
unsigned wls_size;
struct panfrost_stats stats, stats_idvs_varying;
/* Bit mask of preloaded registers */
uint64_t preload;

View file

@ -41,11 +41,13 @@
#include "spirv/nir_spirv.h"
#include "util/memstream.h"
#include "util/mesa-sha1.h"
#include "util/shader_stats.h"
#include "util/u_dynarray.h"
#include "nir_builder.h"
#include "nir_conversion_builder.h"
#include "nir_deref.h"
#include "shader_enums.h"
#include "vk_graphics_state.h"
#include "vk_nir_convert_ycbcr.h"
#include "vk_shader_module.h"
@ -1416,13 +1418,6 @@ panvk_shader_serialize(struct vk_device *vk_dev,
return !blob->out_of_memory;
}
#define WRITE_STR(field, ...) \
({ \
memset(field, 0, sizeof(field)); \
UNUSED int i = snprintf(field, sizeof(field), __VA_ARGS__); \
assert(i > 0 && i < sizeof(field)); \
})
static VkResult
panvk_shader_get_executable_properties(
UNUSED struct vk_device *device, const struct vk_shader *vk_shader,
@ -1438,10 +1433,20 @@ panvk_shader_get_executable_properties(
{
props->stages = mesa_to_vk_shader_stage(shader->info.stage);
props->subgroupSize = 8;
WRITE_STR(props->name, "%s",
_mesa_shader_stage_to_string(shader->info.stage));
WRITE_STR(props->description, "%s shader",
_mesa_shader_stage_to_string(shader->info.stage));
VK_COPY_STR(props->name,
_mesa_shader_stage_to_string(shader->info.stage));
VK_PRINT_STR(props->description, "%s shader",
_mesa_shader_stage_to_string(shader->info.stage));
}
if (shader->info.stage == MESA_SHADER_VERTEX && shader->info.vs.idvs) {
vk_outarray_append_typed(VkPipelineExecutablePropertiesKHR, &out, props)
{
props->stages = mesa_to_vk_shader_stage(shader->info.stage);
props->subgroupSize = 8;
VK_COPY_STR(props->name, "varying");
VK_COPY_STR(props->description, "Varying shader");
}
}
return vk_outarray_status(&out);
@ -1459,19 +1464,11 @@ panvk_shader_get_executable_statistics(
VK_OUTARRAY_MAKE_TYPED(VkPipelineExecutableStatisticKHR, out, statistics,
statistic_count);
assert(executable_index == 0);
vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat)
{
WRITE_STR(stat->name, "Code Size");
WRITE_STR(stat->description,
"Size of the compiled shader binary, in bytes");
stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
stat->value.u64 = shader->bin_size;
}
/* TODO: more executable statistics (VK_KHR_pipeline_executable_properties) */
assert(executable_index == 0 || executable_index == 1);
struct panfrost_stats *stats =
executable_index ? &shader->info.stats_idvs_varying : &shader->info.stats;
vk_add_panfrost_stats(out, stats);
return vk_outarray_status(&out);
}
@ -1513,8 +1510,8 @@ panvk_shader_get_executable_internal_representations(
vk_outarray_append_typed(VkPipelineExecutableInternalRepresentationKHR,
&out, ir)
{
WRITE_STR(ir->name, "NIR shader");
WRITE_STR(ir->description,
VK_COPY_STR(ir->name, "NIR shader");
VK_COPY_STR(ir->description,
"NIR shader before sending to the back-end compiler");
if (!write_ir_text(ir, shader->nir_str))
incomplete_text = true;
@ -1525,8 +1522,8 @@ panvk_shader_get_executable_internal_representations(
vk_outarray_append_typed(VkPipelineExecutableInternalRepresentationKHR,
&out, ir)
{
WRITE_STR(ir->name, "Assembly");
WRITE_STR(ir->description, "Final Assembly");
VK_COPY_STR(ir->name, "Assembly");
VK_COPY_STR(ir->description, "Final Assembly");
if (!write_ir_text(ir, shader->asm_str))
incomplete_text = true;
}

View file

@ -13,4 +13,50 @@
<stat name="Spills">Number of spill (stack store) instructions</stat>
<stat name="Fills">Number of fill (stack load) instructions</stat>
</isa>
<family name="Panfrost">
<isa name="Midgard">
<stat name="Instructions" display="Inst">Instruction count</stat>
<stat name="Bundles">Instruction bundles</stat>
<stat name="Registers" type="u16">Register usage in vec4s</stat>
<stat name="Threads" more="better" type="u16">Maximum number of threads in flight on a compute unit</stat>
<stat name="Quadwords">Binary size in quadwords</stat>
<stat name="Loops">Number of hardware loops</stat>
<stat name="Spills">Number of spill instructions</stat>
<stat name="Fills">Number of fill instructions</stat>
</isa>
<isa name="Bifrost">
<stat name="Instructions" display="Instrs">Instruction count</stat>
<stat name="Tuples">Tuple count</stat>
<stat name="Clauses">Clause count</stat>
<stat name="Cycles" type="float">Estimated normalized cycles</stat>
<stat name="Arithmetic" display="Arith" type="float">Estimated normalized arithmetic cycles</stat>
<stat name="Texture" display="T" type="float">Estimated normalized Texture cycles</stat>
<stat name="Load/store" display="LDST" type="float">Estimated normalized Load/Store cycles</stat>
<stat name="Varying" display="V" type="float">Estimated normalized Varying cycles</stat>
<stat name="Preloads" type="u16">Preload count</stat>
<stat name="Threads" more="better" type="u16">Maximum number of threads in flight on a compute unit</stat>
<stat name="Code size">Binary size in bytes</stat>
<stat name="Loops">Number of hardware loops</stat>
<stat name="Spills">Number of spill instructions</stat>
<stat name="Fills">Number of fill instructions</stat>
</isa>
<isa name="Valhall">
<stat name="Instructions" display="Instrs">Instruction count</stat>
<stat name="Cycles" type="float">Estimated normalized cycles</stat>
<stat name="FMA" type="float">Estimated normalized FMA (Fused Multiply-Add) cycles</stat>
<stat name="CVT" type="float">Estimated normalized CVT (ConVerT) cycles</stat>
<stat name="SFU" type="float">Estimated normalized SFU (Special Function Unit) cycles</stat>
<stat name="Varying" display="V" type="float">Estimated normalized Varying cycles</stat>
<stat name="Texture" display="T" type="float">Estimated normalized Texture cycles</stat>
<stat name="Load/store" display="LS" type="float">Estimated normalized Load/Store cycles</stat>
<stat name="Code size">Binary size in bytes</stat>
<stat name="Threads" more="better" type="u16">Maximum number of threads in flight on a compute unit</stat>
<stat name="Loops">Number of hardware loops</stat>
<stat name="Spills">Number of spill instructions</stat>
<stat name="Fills">Number of fill instructions</stat>
</isa>
</family>
</shaderdb>