freedreno: add common implementation of perfcntr-based derived counters

Freedreno's derived counters combine multiple perfcntrs into a more
sensible, human-friendly metric. This change picks up the counters
currently used in Freedreno's Perfetto producer and rolls them into a
more generically usable form.

First place of their use will be through VK_KHR_performance_query, but
the Perfetto producer should also be able to use this interface instead
of having the logic duplicated. For now the counters are available only
for a7xx devices.

Signed-off-by: Zan Dobersek <zdobersek@igalia.com>
Reviewed-by: Danylo Piliaiev <dpiliaiev@igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/33208>
This commit is contained in:
Zan Dobersek 2025-02-20 16:59:28 +01:00 committed by Marge Bot
parent b8338dee39
commit 27fd2d1ad1
4 changed files with 1056 additions and 0 deletions

View file

@ -1265,3 +1265,982 @@ const struct fd_perfcntr_group a7xx_perfcntr_groups[] = {
/* Element count of a7xx_perfcntr_groups, exported for consumers of the table. */
const unsigned a7xx_num_perfcntr_groups = ARRAY_SIZE(a7xx_perfcntr_groups);
/* Enumerates every perfcntr sample that any derived counter below may
 * reference.  The enumerator value is simultaneously the index into
 * a7xx_derived_counter_perfcntrs[] and into the values[] array handed to
 * each derive() callback.  BV_* entries are the binning-pipe variants of
 * the corresponding counters. */
enum {
DERIVED_COUNTER_PERFCNTR_CP_ALWAYS_COUNT,
DERIVED_COUNTER_PERFCNTR_CP_NUM_PREEMPTIONS,
DERIVED_COUNTER_PERFCNTR_CP_PREEMPTION_REACTION_DELAY,
DERIVED_COUNTER_PERFCNTR_RBBM_STATUS_MASKED,
DERIVED_COUNTER_PERFCNTR_PC_STALL_CYCLES_VFD,
DERIVED_COUNTER_PERFCNTR_PC_VERTEX_HITS,
DERIVED_COUNTER_PERFCNTR_PC_VS_INVOCATIONS,
DERIVED_COUNTER_PERFCNTR_TSE_INPUT_PRIM,
DERIVED_COUNTER_PERFCNTR_TSE_TRIVAL_REJ_PRIM,
DERIVED_COUNTER_PERFCNTR_TSE_CLIPPED_PRIM,
DERIVED_COUNTER_PERFCNTR_TSE_OUTPUT_VISIBLE_PRIM,
DERIVED_COUNTER_PERFCNTR_UCHE_STALL_CYCLES_ARBITER,
DERIVED_COUNTER_PERFCNTR_UCHE_VBIF_READ_BEATS_TP,
DERIVED_COUNTER_PERFCNTR_UCHE_VBIF_READ_BEATS_VFD,
DERIVED_COUNTER_PERFCNTR_UCHE_VBIF_READ_BEATS_SP,
DERIVED_COUNTER_PERFCNTR_UCHE_READ_REQUESTS_TP,
DERIVED_COUNTER_PERFCNTR_TP_BUSY_CYCLES,
DERIVED_COUNTER_PERFCNTR_TP_L1_CACHELINE_REQUESTS,
DERIVED_COUNTER_PERFCNTR_TP_L1_CACHELINE_MISSES,
DERIVED_COUNTER_PERFCNTR_TP_OUTPUT_PIXELS,
DERIVED_COUNTER_PERFCNTR_TP_OUTPUT_PIXELS_POINT,
DERIVED_COUNTER_PERFCNTR_TP_OUTPUT_PIXELS_BILINEAR,
DERIVED_COUNTER_PERFCNTR_TP_OUTPUT_PIXELS_ANISO,
DERIVED_COUNTER_PERFCNTR_SP_BUSY_CYCLES,
DERIVED_COUNTER_PERFCNTR_SP_ALU_WORKING_CYCLES,
DERIVED_COUNTER_PERFCNTR_SP_EFU_WORKING_CYCLES,
DERIVED_COUNTER_PERFCNTR_SP_STALL_CYCLES_TP,
DERIVED_COUNTER_PERFCNTR_SP_NON_EXECUTION_CYCLES,
DERIVED_COUNTER_PERFCNTR_SP_VS_STAGE_TEX_INSTRUCTIONS,
DERIVED_COUNTER_PERFCNTR_SP_VS_STAGE_EFU_INSTRUCTIONS,
DERIVED_COUNTER_PERFCNTR_SP_VS_STAGE_FULL_ALU_INSTRUCTIONS,
DERIVED_COUNTER_PERFCNTR_SP_FS_STAGE_EFU_INSTRUCTIONS,
DERIVED_COUNTER_PERFCNTR_SP_FS_STAGE_FULL_ALU_INSTRUCTIONS,
DERIVED_COUNTER_PERFCNTR_SP_FS_STAGE_HALF_ALU_INSTRUCTIONS,
DERIVED_COUNTER_PERFCNTR_SP_ICL1_REQUESTS,
DERIVED_COUNTER_PERFCNTR_SP_ICL1_MISSES,
DERIVED_COUNTER_PERFCNTR_SP_ANY_EU_WORKING_FS_STAGE,
DERIVED_COUNTER_PERFCNTR_SP_ANY_EU_WORKING_VS_STAGE,
DERIVED_COUNTER_PERFCNTR_SP_ANY_EU_WORKING_CS_STAGE,
DERIVED_COUNTER_PERFCNTR_SP_PIXELS,
DERIVED_COUNTER_PERFCNTR_SP_RAY_QUERY_INSTRUCTIONS,
DERIVED_COUNTER_PERFCNTR_SP_RTU_BUSY_CYCLES,
DERIVED_COUNTER_PERFCNTR_SP_RTU_BVH_FETCH_LATENCY_CYCLES,
DERIVED_COUNTER_PERFCNTR_SP_RTU_BVH_FETCH_LATENCY_SAMPLES,
DERIVED_COUNTER_PERFCNTR_SP_RTU_RAY_BOX_INTERSECTIONS,
DERIVED_COUNTER_PERFCNTR_SP_RTU_RAY_TRIANGLE_INTERSECTIONS,
DERIVED_COUNTER_PERFCNTR_SP_SCH_STALL_CYCLES_RTU,
DERIVED_COUNTER_PERFCNTR_CMPDECMP_VBIF_READ_DATA,
DERIVED_COUNTER_PERFCNTR_BV_PC_STALL_CYCLES_VFD,
DERIVED_COUNTER_PERFCNTR_BV_PC_VERTEX_HITS,
DERIVED_COUNTER_PERFCNTR_BV_PC_VS_INVOCATIONS,
DERIVED_COUNTER_PERFCNTR_BV_TP_L1_CACHELINE_REQUESTS,
DERIVED_COUNTER_PERFCNTR_BV_TP_L1_CACHELINE_MISSES,
DERIVED_COUNTER_PERFCNTR_BV_TP_OUTPUT_PIXELS,
DERIVED_COUNTER_PERFCNTR_BV_TP_OUTPUT_PIXELS_POINT,
DERIVED_COUNTER_PERFCNTR_BV_TP_OUTPUT_PIXELS_BILINEAR,
DERIVED_COUNTER_PERFCNTR_BV_TP_OUTPUT_PIXELS_ANISO,
DERIVED_COUNTER_PERFCNTR_BV_SP_STALL_CYCLES_TP,
DERIVED_COUNTER_PERFCNTR_BV_SP_VS_STAGE_TEX_INSTRUCTIONS,
DERIVED_COUNTER_PERFCNTR_BV_SP_VS_STAGE_EFU_INSTRUCTIONS,
DERIVED_COUNTER_PERFCNTR_BV_SP_VS_STAGE_FULL_ALU_INSTRUCTIONS,
DERIVED_COUNTER_PERFCNTR_BV_SP_ICL1_REQUESTS,
DERIVED_COUNTER_PERFCNTR_BV_SP_ICL1_MISSES,
DERIVED_COUNTER_PERFCNTR_BV_SP_ANY_EU_WORKING_FS_STAGE,
DERIVED_COUNTER_PERFCNTR_BV_SP_ANY_EU_WORKING_VS_STAGE,
DERIVED_COUNTER_PERFCNTR_MAX_VALUE,
};
/* All of these perfcntrs must be enable-able at once for collection; fail
 * the build if the table outgrows the collection limit. */
static_assert(DERIVED_COUNTER_PERFCNTR_MAX_VALUE <= FD_DERIVED_COUNTER_COLLECTION_MAX_ENABLED_PERFCNTRS, "");
/* Define one entry of a7xx_derived_counter_perfcntrs[]: bind the enumerator
 * DERIVED_COUNTER_PERFCNTR_<_enum> to the hardware counter register slot
 * (_counter) and the A7XX_PERF_<_enum> countable selected on it. */
#define DERIVED_COUNTER_PERFCNTR(_enum, _counter) \
[DERIVED_COUNTER_PERFCNTR_##_enum] = { .counter = _counter, .countable = A7XX_PERF_##_enum }
/* Same, but for the BV_* enumerators; the countable name is shared with the
 * non-BV variant while the register comes from a bv_* counter group. */
#define DERIVED_COUNTER_PERFCNTR_BV(_enum, _counter) \
[DERIVED_COUNTER_PERFCNTR_BV_##_enum] = { .counter = _counter, .countable = A7XX_PERF_##_enum }
/* Static mapping from each DERIVED_COUNTER_PERFCNTR_* enumerator to the
 * physical counter register it occupies and the countable programmed on it.
 * The "N/M counters" comments track how many of each group's available
 * registers this table consumes. */
static const struct {
const struct fd_perfcntr_counter *counter;
unsigned countable;
} a7xx_derived_counter_perfcntrs[] = {
/* CP: 3/14 counters */
DERIVED_COUNTER_PERFCNTR(CP_ALWAYS_COUNT, &cp_counters[0]),
DERIVED_COUNTER_PERFCNTR(CP_NUM_PREEMPTIONS, &cp_counters[1]),
DERIVED_COUNTER_PERFCNTR(CP_PREEMPTION_REACTION_DELAY, &cp_counters[2]),
/* RBBM: 1/4 counters */
DERIVED_COUNTER_PERFCNTR(RBBM_STATUS_MASKED, &rbbm_counters[0]),
/* PC: 3/8 counters */
DERIVED_COUNTER_PERFCNTR(PC_STALL_CYCLES_VFD, &pc_counters[0]),
DERIVED_COUNTER_PERFCNTR(PC_VERTEX_HITS, &pc_counters[1]),
DERIVED_COUNTER_PERFCNTR(PC_VS_INVOCATIONS, &pc_counters[2]),
/* TSE: 4/4 counters */
DERIVED_COUNTER_PERFCNTR(TSE_INPUT_PRIM, &tse_counters[0]),
DERIVED_COUNTER_PERFCNTR(TSE_TRIVAL_REJ_PRIM, &tse_counters[1]),
DERIVED_COUNTER_PERFCNTR(TSE_CLIPPED_PRIM, &tse_counters[2]),
DERIVED_COUNTER_PERFCNTR(TSE_OUTPUT_VISIBLE_PRIM, &tse_counters[3]),
/* UCHE: 5/12 counters */
DERIVED_COUNTER_PERFCNTR(UCHE_STALL_CYCLES_ARBITER, &uche_counters[0]),
DERIVED_COUNTER_PERFCNTR(UCHE_VBIF_READ_BEATS_TP, &uche_counters[1]),
DERIVED_COUNTER_PERFCNTR(UCHE_VBIF_READ_BEATS_VFD, &uche_counters[2]),
DERIVED_COUNTER_PERFCNTR(UCHE_VBIF_READ_BEATS_SP, &uche_counters[3]),
DERIVED_COUNTER_PERFCNTR(UCHE_READ_REQUESTS_TP, &uche_counters[4]),
/* TP: 7/12 counters */
DERIVED_COUNTER_PERFCNTR(TP_BUSY_CYCLES, &tp_counters[0]),
DERIVED_COUNTER_PERFCNTR(TP_L1_CACHELINE_REQUESTS, &tp_counters[1]),
DERIVED_COUNTER_PERFCNTR(TP_L1_CACHELINE_MISSES, &tp_counters[2]),
DERIVED_COUNTER_PERFCNTR(TP_OUTPUT_PIXELS, &tp_counters[3]),
DERIVED_COUNTER_PERFCNTR(TP_OUTPUT_PIXELS_POINT, &tp_counters[4]),
DERIVED_COUNTER_PERFCNTR(TP_OUTPUT_PIXELS_BILINEAR, &tp_counters[5]),
DERIVED_COUNTER_PERFCNTR(TP_OUTPUT_PIXELS_ANISO, &tp_counters[6]),
/* SP: 24/24 counters */
DERIVED_COUNTER_PERFCNTR(SP_BUSY_CYCLES, &sp_counters[ 0]),
DERIVED_COUNTER_PERFCNTR(SP_ALU_WORKING_CYCLES, &sp_counters[ 1]),
DERIVED_COUNTER_PERFCNTR(SP_EFU_WORKING_CYCLES, &sp_counters[ 2]),
DERIVED_COUNTER_PERFCNTR(SP_STALL_CYCLES_TP, &sp_counters[ 3]),
DERIVED_COUNTER_PERFCNTR(SP_NON_EXECUTION_CYCLES, &sp_counters[ 4]),
DERIVED_COUNTER_PERFCNTR(SP_VS_STAGE_TEX_INSTRUCTIONS, &sp_counters[ 5]),
DERIVED_COUNTER_PERFCNTR(SP_VS_STAGE_EFU_INSTRUCTIONS, &sp_counters[ 6]),
DERIVED_COUNTER_PERFCNTR(SP_VS_STAGE_FULL_ALU_INSTRUCTIONS, &sp_counters[ 7]),
DERIVED_COUNTER_PERFCNTR(SP_FS_STAGE_EFU_INSTRUCTIONS, &sp_counters[ 8]),
DERIVED_COUNTER_PERFCNTR(SP_FS_STAGE_FULL_ALU_INSTRUCTIONS, &sp_counters[ 9]),
DERIVED_COUNTER_PERFCNTR(SP_FS_STAGE_HALF_ALU_INSTRUCTIONS, &sp_counters[10]),
DERIVED_COUNTER_PERFCNTR(SP_ICL1_REQUESTS, &sp_counters[11]),
DERIVED_COUNTER_PERFCNTR(SP_ICL1_MISSES, &sp_counters[12]),
DERIVED_COUNTER_PERFCNTR(SP_ANY_EU_WORKING_FS_STAGE, &sp_counters[13]),
DERIVED_COUNTER_PERFCNTR(SP_ANY_EU_WORKING_VS_STAGE, &sp_counters[14]),
DERIVED_COUNTER_PERFCNTR(SP_ANY_EU_WORKING_CS_STAGE, &sp_counters[15]),
DERIVED_COUNTER_PERFCNTR(SP_PIXELS, &sp_counters[16]),
DERIVED_COUNTER_PERFCNTR(SP_RAY_QUERY_INSTRUCTIONS, &sp_counters[17]),
DERIVED_COUNTER_PERFCNTR(SP_RTU_BUSY_CYCLES, &sp_counters[18]),
DERIVED_COUNTER_PERFCNTR(SP_RTU_BVH_FETCH_LATENCY_CYCLES, &sp_counters[19]),
DERIVED_COUNTER_PERFCNTR(SP_RTU_BVH_FETCH_LATENCY_SAMPLES, &sp_counters[20]),
DERIVED_COUNTER_PERFCNTR(SP_RTU_RAY_BOX_INTERSECTIONS, &sp_counters[21]),
DERIVED_COUNTER_PERFCNTR(SP_RTU_RAY_TRIANGLE_INTERSECTIONS, &sp_counters[22]),
DERIVED_COUNTER_PERFCNTR(SP_SCH_STALL_CYCLES_RTU, &sp_counters[23]),
/* CMP: 1/4 counters */
DERIVED_COUNTER_PERFCNTR(CMPDECMP_VBIF_READ_DATA, &cmp_counters[0]),
/* BV_PC: 3/8 counters */
DERIVED_COUNTER_PERFCNTR_BV(PC_STALL_CYCLES_VFD, &bv_pc_counters[0]),
DERIVED_COUNTER_PERFCNTR_BV(PC_VERTEX_HITS, &bv_pc_counters[1]),
DERIVED_COUNTER_PERFCNTR_BV(PC_VS_INVOCATIONS, &bv_pc_counters[2]),
/* BV_TP: 6/6 counters */
DERIVED_COUNTER_PERFCNTR_BV(TP_L1_CACHELINE_REQUESTS, &bv_tp_counters[0]),
DERIVED_COUNTER_PERFCNTR_BV(TP_L1_CACHELINE_MISSES, &bv_tp_counters[1]),
DERIVED_COUNTER_PERFCNTR_BV(TP_OUTPUT_PIXELS, &bv_tp_counters[2]),
DERIVED_COUNTER_PERFCNTR_BV(TP_OUTPUT_PIXELS_POINT, &bv_tp_counters[3]),
DERIVED_COUNTER_PERFCNTR_BV(TP_OUTPUT_PIXELS_BILINEAR, &bv_tp_counters[4]),
DERIVED_COUNTER_PERFCNTR_BV(TP_OUTPUT_PIXELS_ANISO, &bv_tp_counters[5]),
/* BV_SP: 8/12 counters */
DERIVED_COUNTER_PERFCNTR_BV(SP_STALL_CYCLES_TP, &bv_sp_counters[0]),
DERIVED_COUNTER_PERFCNTR_BV(SP_VS_STAGE_TEX_INSTRUCTIONS, &bv_sp_counters[1]),
DERIVED_COUNTER_PERFCNTR_BV(SP_VS_STAGE_EFU_INSTRUCTIONS, &bv_sp_counters[2]),
DERIVED_COUNTER_PERFCNTR_BV(SP_VS_STAGE_FULL_ALU_INSTRUCTIONS, &bv_sp_counters[3]),
DERIVED_COUNTER_PERFCNTR_BV(SP_ICL1_REQUESTS, &bv_sp_counters[4]),
DERIVED_COUNTER_PERFCNTR_BV(SP_ICL1_MISSES, &bv_sp_counters[5]),
DERIVED_COUNTER_PERFCNTR_BV(SP_ANY_EU_WORKING_FS_STAGE, &bv_sp_counters[6]),
DERIVED_COUNTER_PERFCNTR_BV(SP_ANY_EU_WORKING_VS_STAGE, &bv_sp_counters[7]),
};
/* Divide a by b as doubles, treating division by zero as 0.0, and return
 * the IEEE-754 bit pattern of the result packed into a uint64_t.  Derived
 * counters of type DOUBLE transport their value this way. */
static uint64_t
safe_div(uint64_t a, uint64_t b)
{
   union {
      double d;
      uint64_t u;
   } result = { .d = 0.0 };

   if (b != 0)
      result.d = (double) a / (double) b;

   return result.u;
}
/* Compute a/b as a percentage (0.0f when b is zero) and return the IEEE-754
 * single-precision bit pattern in the low 32 bits of a uint64_t.  Derived
 * counters of type PERCENTAGE transport their value this way. */
static uint64_t
percent(uint64_t a, uint64_t b)
{
   union {
      float f;
      uint32_t u;
   } result = { .f = 0.0f };

   if (b != 0)
      result.f = 100.0f * ((float) a / (float) b);

   /* uint32_t widens to uint64_t with a zero upper half. */
   return result.u;
}
/* User-visible category strings attached to each derived counter. */
#define DERIVED_COUNTER_CATEGORY_GPU_GENERAL "GPU General"
#define DERIVED_COUNTER_CATEGORY_GPU_MEMORY_STATS "GPU Memory Stats"
#define DERIVED_COUNTER_CATEGORY_GPU_PREEMPTION "GPU Preemption"
#define DERIVED_COUNTER_CATEGORY_GPU_PRIMITIVE_PROCESSING "GPU Primitive Processing"
#define DERIVED_COUNTER_CATEGORY_GPU_SHADER_PROCESSING "GPU Shader Processing"
#define DERIVED_COUNTER_CATEGORY_GPU_STALLS "GPU Stalls"
/* Count the variadic arguments passed through DERIVED_COUNTER_PERFCNTRS()
 * (supports up to 8 names). */
#define DERIVED_COUNTER_PERFCNTRS_COUNT_IMPL(_0, _1, _2, _3, _4, _5, _6, _7, N, ...) N
#define DERIVED_COUNTER_PERFCNTRS_COUNT(...) DERIVED_COUNTER_PERFCNTRS_COUNT_IMPL(__VA_ARGS__, 8, 7, 6, 5, 4, 3, 2, 1, 0)
/* Wrapper that lets a comma-separated perfcntr list travel as one macro
 * argument into DERIVED_COUNTER(). */
#define DERIVED_COUNTER_PERFCNTRS(...) __VA_ARGS__
/* Apply OP(index, name) to each of up to 5 perfcntr names; note this limit
 * is lower than the COUNT limit of 8 above. */
#define PERFCNTR_VALUE_0(OP, _0, ...) OP(0, _0)
#define PERFCNTR_VALUE_1(OP, _0, _1, ...) PERFCNTR_VALUE_0(OP, _0) OP(1, _1)
#define PERFCNTR_VALUE_2(OP, _0, _1, _2, ...) PERFCNTR_VALUE_1(OP, _0, _1) OP(2, _2)
#define PERFCNTR_VALUE_3(OP, _0, _1, _2, _3, ...) PERFCNTR_VALUE_2(OP, _0, _1, _2) OP(3, _3)
#define PERFCNTR_VALUE_4(OP, _0, _1, _2, _3, _4, ...) PERFCNTR_VALUE_3(OP, _0, _1, _2, _3) OP(4, _4)
#define PERFCNTR_HANDLE_VALUES(OP, _0, _1, _2, _3, _4, N, ...) PERFCNTR_VALUE_##N(OP, _0, _1, _2, _3, _4)
/* OP emitting the enumerator for the .perfcntrs initializer list. */
#define PERFCNTR_OP_LIST(_index, _name) DERIVED_COUNTER_PERFCNTR_##_name,
/* OP declaring a local uint64_t named after the perfcntr, bound to its
 * sampled value, so derivation bodies can use the bare countable name. */
#define PERFCNTR_OP_DECLARE(_index, _name) uint64_t _name = values[_index];
#define DERIVED_COUNTER_PERFCNTR_LIST_VALUES(...) PERFCNTR_HANDLE_VALUES(PERFCNTR_OP_LIST, __VA_ARGS__, 4, 3, 2, 1, 0)
#define DERIVED_COUNTER_PERFCNTR_DECLARE_VALUES(...) PERFCNTR_HANDLE_VALUES(PERFCNTR_OP_DECLARE, __VA_ARGS__, 4, 3, 2, 1, 0)
/* Define one derived counter: generates the derive() callback (with each
 * referenced perfcntr value in scope as a like-named local) plus the
 * fd_derived_counter descriptor tying together name, category, value type
 * and the perfcntr index list. */
#define DERIVED_COUNTER(_impl_name, _name, _description, _category, _type, _perfcntrs, _derivation) \
static uint64_t a7xx_derived_counter_##_impl_name##_derive(struct fd_derivation_context *context, uint64_t *values) {\
DERIVED_COUNTER_PERFCNTR_DECLARE_VALUES(_perfcntrs) \
_derivation \
} \
const struct fd_derived_counter a7xx_derived_counter_##_impl_name = { \
.name = _name, .description = _description, \
.category = DERIVED_COUNTER_CATEGORY_##_category, \
.type = FD_PERFCNTR_TYPE_##_type, \
.num_perfcntrs = DERIVED_COUNTER_PERFCNTRS_COUNT(_perfcntrs), \
.perfcntrs = { DERIVED_COUNTER_PERFCNTR_LIST_VALUES(_perfcntrs) }, \
.derive = a7xx_derived_counter_##_impl_name##_derive, \
}
/* Shorthand for listing a derived counter in a7xx_derived_counters[]. */
#define DERIVED_COUNTER_PTR(_impl_name) &a7xx_derived_counter_##_impl_name
/* --- Category: GPU General --- */
DERIVED_COUNTER(clocks, "Clocks", "Number of GPU clocks", GPU_GENERAL, UINT64,
DERIVED_COUNTER_PERFCNTRS(CP_ALWAYS_COUNT),
{
return CP_ALWAYS_COUNT;
});
/* --- Category: GPU Memory Stats ---
 * NOTE(review): the VBIF read-beat counters below are scaled by 32,
 * presumably 32 bytes per read beat -- confirm against the a7xx bus width. */
DERIVED_COUNTER(avg_bytes_per_fragment,
"Avg Bytes / Fragment",
"Average number of bytes transferred from main memory for each fragment",
GPU_MEMORY_STATS, DOUBLE,
DERIVED_COUNTER_PERFCNTRS(UCHE_VBIF_READ_BEATS_TP,
SP_PIXELS),
{
return safe_div(UCHE_VBIF_READ_BEATS_TP * 32, SP_PIXELS);
});
DERIVED_COUNTER(avg_bytes_per_vertex,
"Avg Bytes / Vertex",
"Average number of bytes transferred from main memory for each vertex",
GPU_MEMORY_STATS, DOUBLE,
DERIVED_COUNTER_PERFCNTRS(UCHE_VBIF_READ_BEATS_VFD,
PC_VS_INVOCATIONS,
BV_PC_VS_INVOCATIONS),
{
return safe_div(UCHE_VBIF_READ_BEATS_VFD * 32,
PC_VS_INVOCATIONS + BV_PC_VS_INVOCATIONS);
});
DERIVED_COUNTER(sp_memory_read,
"SP Memory Read (Bytes)",
"Bytes of data read from memory by the Shader Processors",
GPU_MEMORY_STATS, UINT64,
DERIVED_COUNTER_PERFCNTRS(UCHE_VBIF_READ_BEATS_SP),
{
return UCHE_VBIF_READ_BEATS_SP * 32;
});
DERIVED_COUNTER(texture_memory_read,
"Texture Memory Read BW (Bytes)",
"Bytes of texture data read from memory",
GPU_MEMORY_STATS, UINT64,
DERIVED_COUNTER_PERFCNTRS(UCHE_VBIF_READ_BEATS_TP,
CMPDECMP_VBIF_READ_DATA),
{
return (UCHE_VBIF_READ_BEATS_TP + CMPDECMP_VBIF_READ_DATA) * 32;
});
DERIVED_COUNTER(vertex_memory_read,
"Vertex Memory Read (Bytes)",
"Bytes of vertex data read from memory",
GPU_MEMORY_STATS, UINT64,
DERIVED_COUNTER_PERFCNTRS(UCHE_VBIF_READ_BEATS_VFD),
{
return UCHE_VBIF_READ_BEATS_VFD * 32;
});
/* FIXME: disabled due to lack of support for VBIF perfcounters which
have a more complex way of being enabled.
DERIVED_COUNTER(read_total,
"Read Total (Bytes)",
"Total number of bytes read by the GPU from memory",
GPU_MEMORY_STATS, UINT64,
DERIVED_COUNTER_PERFCNTRS(...),
{
return 0;
});
DERIVED_COUNTER(write_total,
"Write Total (Bytes)",
"Total number of bytes written by the GPU to memory",
GPU_MEMORY_STATS, UINT64,
DERIVED_COUNTER_PERFCNTRS(...),
{
return 0;
});
*/
/* --- Category: GPU Preemption --- */
DERIVED_COUNTER(avg_preemption_delay,
"Avg Preemption Delay",
"Average number of cycles from the preemption request to preemption start",
GPU_PREEMPTION, DOUBLE,
DERIVED_COUNTER_PERFCNTRS(CP_PREEMPTION_REACTION_DELAY,
CP_NUM_PREEMPTIONS,
CP_ALWAYS_COUNT),
{
if (!CP_ALWAYS_COUNT || !CP_NUM_PREEMPTIONS)
return 0;
union {
double d;
uint64_t u;
} v;
/* NOTE(review): CP_NUM_PREEMPTIONS appears to tick twice per preemption
 * (see the preemptions counter below), hence the /2.  That division is
 * integer, so CP_NUM_PREEMPTIONS == 1 would divide by zero and yield
 * inf -- confirm whether an odd/1 count is possible here. */
double delay = CP_PREEMPTION_REACTION_DELAY / (double) CP_ALWAYS_COUNT;
v.d = delay / (CP_NUM_PREEMPTIONS / 2);
return v.u;
});
DERIVED_COUNTER(preemptions,
"Preemptions",
"The number of GPU preemptions that occurred",
GPU_PREEMPTION, UINT64,
DERIVED_COUNTER_PERFCNTRS(CP_NUM_PREEMPTIONS),
{
/* Raw counter is halved; presumably it ticks on both entry and exit of
 * a preemption -- TODO confirm. */
return CP_NUM_PREEMPTIONS / 2;
});
/* --- Category: GPU Primitive Processing ---
 * Vertex-related counters sum the BR (PC_*) and binning (BV_PC_*) variants
 * to cover both pipes. */
DERIVED_COUNTER(average_polygon_area,
"Average Polygon Area",
"Average number of pixels per polygon",
GPU_PRIMITIVE_PROCESSING, DOUBLE,
DERIVED_COUNTER_PERFCNTRS(TSE_OUTPUT_VISIBLE_PRIM,
SP_PIXELS),
{
return safe_div(SP_PIXELS, TSE_OUTPUT_VISIBLE_PRIM);
});
DERIVED_COUNTER(average_vertices_per_polygon,
"Average Vertices / Polygon",
"Average number of vertices per polygon",
GPU_PRIMITIVE_PROCESSING, DOUBLE,
DERIVED_COUNTER_PERFCNTRS(PC_VS_INVOCATIONS,
BV_PC_VS_INVOCATIONS,
TSE_INPUT_PRIM),
{
return safe_div(PC_VS_INVOCATIONS + BV_PC_VS_INVOCATIONS, TSE_INPUT_PRIM);
});
DERIVED_COUNTER(preclipped_polygon,
"Pre-clipped Polygon",
"Number of polygons submitted to the GPU before any hardware clipping",
GPU_PRIMITIVE_PROCESSING, UINT64,
DERIVED_COUNTER_PERFCNTRS(TSE_INPUT_PRIM),
{
return TSE_INPUT_PRIM;
});
DERIVED_COUNTER(percent_prims_clipped,
"% Prims Clipped",
"Percentage of primitives clipped by the GPU (where new primitives are generated)",
GPU_PRIMITIVE_PROCESSING, PERCENTAGE,
DERIVED_COUNTER_PERFCNTRS(TSE_CLIPPED_PRIM,
TSE_INPUT_PRIM),
{
return percent(TSE_CLIPPED_PRIM, TSE_INPUT_PRIM);
});
DERIVED_COUNTER(percent_prims_trivially_rejected,
"% Prims Trivially Rejected",
"Percentage of primitives that are trivially rejected",
GPU_PRIMITIVE_PROCESSING, PERCENTAGE,
DERIVED_COUNTER_PERFCNTRS(TSE_TRIVAL_REJ_PRIM,
TSE_INPUT_PRIM),
{
return percent(TSE_TRIVAL_REJ_PRIM, TSE_INPUT_PRIM);
});
DERIVED_COUNTER(reused_vertices,
"Reused Vertices",
"Number of vertices used from the post-transform vertex buffer cache",
GPU_PRIMITIVE_PROCESSING, UINT64,
DERIVED_COUNTER_PERFCNTRS(PC_VERTEX_HITS,
BV_PC_VERTEX_HITS),
{
return PC_VERTEX_HITS + BV_PC_VERTEX_HITS;
});
/* --- Category: GPU Shader Processing --- */
DERIVED_COUNTER(alu_per_fragment,
"ALU / Fragment",
"Average number of scalar fragment shader ALU instructions issued per shaded fragment, expressed as full precision ALUs (2 mediump = 1 fullp)",
GPU_SHADER_PROCESSING, DOUBLE,
DERIVED_COUNTER_PERFCNTRS(SP_FS_STAGE_FULL_ALU_INSTRUCTIONS,
SP_FS_STAGE_HALF_ALU_INSTRUCTIONS,
SP_PIXELS),
{
/* Two half-precision ALU ops count as one full-precision op. */
return safe_div(SP_FS_STAGE_FULL_ALU_INSTRUCTIONS + SP_FS_STAGE_HALF_ALU_INSTRUCTIONS / 2,
SP_PIXELS);
});
DERIVED_COUNTER(alu_per_vertex,
"ALU / Vertex",
"Average number of vertex scalar shader ALU instructions issued per shaded vertex",
GPU_SHADER_PROCESSING, DOUBLE,
DERIVED_COUNTER_PERFCNTRS(PC_VS_INVOCATIONS,
BV_PC_VS_INVOCATIONS,
SP_VS_STAGE_FULL_ALU_INSTRUCTIONS,
BV_SP_VS_STAGE_FULL_ALU_INSTRUCTIONS),
{
return safe_div(SP_VS_STAGE_FULL_ALU_INSTRUCTIONS + BV_SP_VS_STAGE_FULL_ALU_INSTRUCTIONS,
PC_VS_INVOCATIONS + BV_PC_VS_INVOCATIONS);
});
DERIVED_COUNTER(percent_anisotropic_filtered,
"% Anisotropic Filtered",
"Percent of texels filtered using the 'Anisotropic' sampling method",
GPU_SHADER_PROCESSING, PERCENTAGE,
DERIVED_COUNTER_PERFCNTRS(TP_OUTPUT_PIXELS,
BV_TP_OUTPUT_PIXELS,
TP_OUTPUT_PIXELS_ANISO,
BV_TP_OUTPUT_PIXELS_ANISO),
{
return percent(TP_OUTPUT_PIXELS_ANISO + BV_TP_OUTPUT_PIXELS_ANISO,
TP_OUTPUT_PIXELS + BV_TP_OUTPUT_PIXELS);
});
/* Average BVH fetch latency: cycles from BVH query request to result.
 * NOTE(review): the derivation returns safe_div(), which encodes the bit
 * pattern of an IEEE-754 double, so the counter type must be DOUBLE --
 * with UINT64 a consumer would reinterpret raw double bits as an integer.
 * Every other safe_div()-based counter in this file is declared DOUBLE. */
DERIVED_COUNTER(average_bvh_fetch_latency_cycles,
"Average BVH Fetch Latency Cycles",
"The Average BVH Fetch Latency cycles is the latency counted from start of BVH query request till getting BVH Query result back",
GPU_SHADER_PROCESSING, DOUBLE,
DERIVED_COUNTER_PERFCNTRS(SP_RTU_BVH_FETCH_LATENCY_CYCLES,
SP_RTU_BVH_FETCH_LATENCY_SAMPLES),
{
return safe_div(SP_RTU_BVH_FETCH_LATENCY_CYCLES, SP_RTU_BVH_FETCH_LATENCY_SAMPLES);
});
/* EFU = Elementary Function Unit (transcendentals etc.). */
DERIVED_COUNTER(efu_per_fragment,
"EFU / Fragment",
"Average number of scalar fragment shader EFU instructions issued per shaded fragment",
GPU_SHADER_PROCESSING, DOUBLE,
DERIVED_COUNTER_PERFCNTRS(SP_FS_STAGE_EFU_INSTRUCTIONS,
SP_PIXELS),
{
return safe_div(SP_FS_STAGE_EFU_INSTRUCTIONS, SP_PIXELS);
});
DERIVED_COUNTER(efu_per_vertex,
"EFU / Vertex",
"Average number of scalar vertex shader EFU instructions issued per shaded vertex",
GPU_SHADER_PROCESSING, DOUBLE,
DERIVED_COUNTER_PERFCNTRS(PC_VS_INVOCATIONS,
BV_PC_VS_INVOCATIONS,
SP_VS_STAGE_EFU_INSTRUCTIONS,
BV_SP_VS_STAGE_EFU_INSTRUCTIONS),
{
return safe_div(SP_VS_STAGE_EFU_INSTRUCTIONS + BV_SP_VS_STAGE_EFU_INSTRUCTIONS,
PC_VS_INVOCATIONS + BV_PC_VS_INVOCATIONS);
});
/* NOTE(review): the instruction counters below are scaled by 4 -- the raw
 * SP counters appear to count in units of 4 scalar instructions (one per
 * 2x2 fragment quad, presumably) -- confirm against hardware docs. */
DERIVED_COUNTER(fragment_alu_instructions_full,
"Fragment ALU Instructions (Full)",
"Total number of full precision fragment shader instructions issued",
GPU_SHADER_PROCESSING, UINT64,
DERIVED_COUNTER_PERFCNTRS(SP_FS_STAGE_FULL_ALU_INSTRUCTIONS),
{
return SP_FS_STAGE_FULL_ALU_INSTRUCTIONS * 4;
});
DERIVED_COUNTER(fragment_alu_instructions_half,
"Fragment ALU Instructions (Half)",
"Total number of half precision Scalar fragment shader instructions issued",
GPU_SHADER_PROCESSING, UINT64,
DERIVED_COUNTER_PERFCNTRS(SP_FS_STAGE_HALF_ALU_INSTRUCTIONS),
{
return SP_FS_STAGE_HALF_ALU_INSTRUCTIONS * 4;
});
DERIVED_COUNTER(fragment_efu_instructions,
"Fragment EFU Instructions",
"Total number of Scalar fragment shader Elementary Function Unit (EFU) instructions issued",
GPU_SHADER_PROCESSING, UINT64,
DERIVED_COUNTER_PERFCNTRS(SP_FS_STAGE_EFU_INSTRUCTIONS),
{
return SP_FS_STAGE_EFU_INSTRUCTIONS * 4;
});
DERIVED_COUNTER(fragment_instructions,
"Fragment Instructions",
"Total number of fragment shader instructions issued",
GPU_SHADER_PROCESSING, UINT64,
DERIVED_COUNTER_PERFCNTRS(SP_FS_STAGE_EFU_INSTRUCTIONS,
SP_FS_STAGE_FULL_ALU_INSTRUCTIONS,
SP_FS_STAGE_HALF_ALU_INSTRUCTIONS),
{
/* Half-precision ALU ops count as half an instruction each. */
return (SP_FS_STAGE_EFU_INSTRUCTIONS + SP_FS_STAGE_FULL_ALU_INSTRUCTIONS +
SP_FS_STAGE_HALF_ALU_INSTRUCTIONS / 2) * 4;
});
DERIVED_COUNTER(fragments_shaded,
"Fragments Shaded",
"Number of fragments submitted to the shader engine",
GPU_SHADER_PROCESSING, UINT64,
DERIVED_COUNTER_PERFCNTRS(SP_PIXELS),
{
return SP_PIXELS;
});
/* Texture filtering breakdown and ray-tracing-unit (RTU) ratios. */
DERIVED_COUNTER(percent_linear_filtered,
"% Linear Filtered",
"Percent of texels filtered using the 'Linear' sampling method",
GPU_SHADER_PROCESSING, PERCENTAGE,
DERIVED_COUNTER_PERFCNTRS(TP_OUTPUT_PIXELS,
BV_TP_OUTPUT_PIXELS,
TP_OUTPUT_PIXELS_BILINEAR,
BV_TP_OUTPUT_PIXELS_BILINEAR),
{
return percent(TP_OUTPUT_PIXELS_BILINEAR + BV_TP_OUTPUT_PIXELS_BILINEAR,
TP_OUTPUT_PIXELS + BV_TP_OUTPUT_PIXELS);
});
DERIVED_COUNTER(percent_nearest_filtered,
"% Nearest Filtered",
"Percent of texels filtered using the 'Nearest' sampling method",
GPU_SHADER_PROCESSING, PERCENTAGE,
DERIVED_COUNTER_PERFCNTRS(TP_OUTPUT_PIXELS,
BV_TP_OUTPUT_PIXELS,
TP_OUTPUT_PIXELS_POINT,
BV_TP_OUTPUT_PIXELS_POINT),
{
return percent(TP_OUTPUT_PIXELS_POINT + BV_TP_OUTPUT_PIXELS_POINT,
TP_OUTPUT_PIXELS + BV_TP_OUTPUT_PIXELS);
});
DERIVED_COUNTER(percent_rtu_busy,
"% RTU Busy",
"Percentage of time that Ray Tracing Unit in SP is busy compared to whole SP",
GPU_SHADER_PROCESSING, PERCENTAGE,
DERIVED_COUNTER_PERFCNTRS(SP_RTU_BUSY_CYCLES,
SP_BUSY_CYCLES),
{
return percent(SP_RTU_BUSY_CYCLES, SP_BUSY_CYCLES);
});
DERIVED_COUNTER(rtu_ray_box_intersections_per_instruction,
"RTU Ray Box Intersections Per Instruction",
"Number of Ray Box intersections per instruction",
GPU_SHADER_PROCESSING, DOUBLE,
DERIVED_COUNTER_PERFCNTRS(SP_RTU_RAY_BOX_INTERSECTIONS,
SP_RAY_QUERY_INSTRUCTIONS),
{
return safe_div(SP_RTU_RAY_BOX_INTERSECTIONS, SP_RAY_QUERY_INSTRUCTIONS);
});
DERIVED_COUNTER(rtu_ray_triangle_intersections_per_instruction,
"RTU Ray Triangle Intersections Per Instruction",
"Number of Ray Triangle intersections per instruction",
GPU_SHADER_PROCESSING, DOUBLE,
DERIVED_COUNTER_PERFCNTRS(SP_RTU_RAY_TRIANGLE_INTERSECTIONS,
SP_RAY_QUERY_INSTRUCTIONS),
{
return safe_div(SP_RTU_RAY_TRIANGLE_INTERSECTIONS, SP_RAY_QUERY_INSTRUCTIONS);
});
/* FIXME: disabled due to lack of TP counter capacity
DERIVED_COUNTER(percent_non_base_level_textures,
"% Non-Base Level Textures",
"Percent of texels coming from a non-base MIP level",
GPU_SHADER_PROCESSING, PERCENTAGE,
DERIVED_COUNTER_PERFCNTRS(...),
{
return 0;
});
*/
/* Utilization ratios; busy-cycle denominators are scaled by the per-device
 * USPTP/ALU counts from the derivation context. */
DERIVED_COUNTER(percent_shader_alu_capacity_utilized,
"% Shader ALU Capacity Utilized",
"Percent of maximum shader capacity (ALU operations) utilized",
GPU_SHADER_PROCESSING, PERCENTAGE,
DERIVED_COUNTER_PERFCNTRS(SP_BUSY_CYCLES,
SP_VS_STAGE_FULL_ALU_INSTRUCTIONS,
BV_SP_VS_STAGE_FULL_ALU_INSTRUCTIONS,
SP_FS_STAGE_FULL_ALU_INSTRUCTIONS,
SP_FS_STAGE_HALF_ALU_INSTRUCTIONS),
{
return percent(SP_VS_STAGE_FULL_ALU_INSTRUCTIONS + BV_SP_VS_STAGE_FULL_ALU_INSTRUCTIONS +
SP_FS_STAGE_FULL_ALU_INSTRUCTIONS + SP_FS_STAGE_HALF_ALU_INSTRUCTIONS / 2,
SP_BUSY_CYCLES * context->a7xx.number_of_alus_per_usptp);
});
DERIVED_COUNTER(percent_shaders_busy,
"% Shaders Busy",
"Percentage of time that all Shader cores are busy",
GPU_SHADER_PROCESSING, PERCENTAGE,
DERIVED_COUNTER_PERFCNTRS(SP_BUSY_CYCLES,
TP_BUSY_CYCLES,
RBBM_STATUS_MASKED),
{
/* Fall back to TP busy cycles when the SP counter reads zero. */
uint64_t numerator = SP_BUSY_CYCLES;
if (!numerator)
numerator = TP_BUSY_CYCLES;
return percent(numerator, context->a7xx.number_of_usptp * RBBM_STATUS_MASKED);
});
DERIVED_COUNTER(percent_shaders_stalled,
"% Shaders Stalled",
"Percentage of time that all shader cores are idle with at least one active wave",
GPU_SHADER_PROCESSING, PERCENTAGE,
DERIVED_COUNTER_PERFCNTRS(SP_NON_EXECUTION_CYCLES,
RBBM_STATUS_MASKED),
{
return percent(SP_NON_EXECUTION_CYCLES, context->a7xx.number_of_usptp * RBBM_STATUS_MASKED);
});
DERIVED_COUNTER(percent_texture_pipes_busy,
"% Texture Pipes Busy",
"Percentage of time that any texture pipe is busy",
GPU_SHADER_PROCESSING, PERCENTAGE,
DERIVED_COUNTER_PERFCNTRS(TP_BUSY_CYCLES,
RBBM_STATUS_MASKED),
{
return percent(TP_BUSY_CYCLES, context->a7xx.number_of_usptp * RBBM_STATUS_MASKED);
});
DERIVED_COUNTER(textures_per_fragment,
"Textures / Fragment",
"Average number of textures referenced per fragment",
GPU_SHADER_PROCESSING, DOUBLE,
DERIVED_COUNTER_PERFCNTRS(SP_VS_STAGE_TEX_INSTRUCTIONS,
TP_OUTPUT_PIXELS,
SP_PIXELS),
{
/* FIXME: SP_VS_STAGE_TEX_INSTRUCTIONS seems to be unused. */
(void)SP_VS_STAGE_TEX_INSTRUCTIONS;
return safe_div(TP_OUTPUT_PIXELS, SP_PIXELS);
});
DERIVED_COUNTER(textures_per_vertex,
"Textures / Vertex",
"Average number of textures referenced per vertex",
GPU_SHADER_PROCESSING, DOUBLE,
DERIVED_COUNTER_PERFCNTRS(PC_VS_INVOCATIONS,
BV_PC_VS_INVOCATIONS,
SP_VS_STAGE_TEX_INSTRUCTIONS,
BV_SP_VS_STAGE_TEX_INSTRUCTIONS),
{
/* NOTE(review): tex-instruction counters scaled by 4, consistent with the
 * x4 instruction scaling used elsewhere in this file -- confirm. */
return safe_div(4 * (SP_VS_STAGE_TEX_INSTRUCTIONS + BV_SP_VS_STAGE_TEX_INSTRUCTIONS),
PC_VS_INVOCATIONS + BV_PC_VS_INVOCATIONS);
});
DERIVED_COUNTER(percent_time_alus_working,
"% Time ALUs Working",
"Percentage of time the ALUs are working while the Shaders are busy",
GPU_SHADER_PROCESSING, PERCENTAGE,
DERIVED_COUNTER_PERFCNTRS(SP_BUSY_CYCLES,
SP_ALU_WORKING_CYCLES),
{
/* NOTE(review): working cycles halved -- presumably the counter ticks at
 * twice the SP clock -- confirm. */
return percent(SP_ALU_WORKING_CYCLES / 2, SP_BUSY_CYCLES);
});
DERIVED_COUNTER(percent_time_compute,
"% Time Compute",
"Percentage of time spent in compute work compared to the total time spent shading everything",
GPU_SHADER_PROCESSING, PERCENTAGE,
DERIVED_COUNTER_PERFCNTRS(SP_ANY_EU_WORKING_FS_STAGE,
SP_ANY_EU_WORKING_VS_STAGE,
BV_SP_ANY_EU_WORKING_VS_STAGE,
SP_ANY_EU_WORKING_CS_STAGE),
{
/* NOTE(review): CS work appears to be accounted inside the FS-stage
 * counter (see percent_time_shading_fragments below, which subtracts
 * it back out), so `total` already includes compute -- confirm. */
uint64_t total = SP_ANY_EU_WORKING_VS_STAGE + BV_SP_ANY_EU_WORKING_VS_STAGE +
SP_ANY_EU_WORKING_FS_STAGE;
return percent(SP_ANY_EU_WORKING_CS_STAGE, total);
});
DERIVED_COUNTER(percent_time_efus_working,
"% Time EFUs Working",
"Percentage of time the EFUs are working while the Shaders are busy",
GPU_SHADER_PROCESSING, PERCENTAGE,
DERIVED_COUNTER_PERFCNTRS(SP_BUSY_CYCLES,
SP_EFU_WORKING_CYCLES),
{
return percent(SP_EFU_WORKING_CYCLES, SP_BUSY_CYCLES);
});
DERIVED_COUNTER(percent_time_shading_fragments,
"% Time Shading Fragments",
"Percentage of time spent shading fragments compared to the total time spent shading everything",
GPU_SHADER_PROCESSING, PERCENTAGE,
DERIVED_COUNTER_PERFCNTRS(SP_ANY_EU_WORKING_FS_STAGE,
SP_ANY_EU_WORKING_VS_STAGE,
BV_SP_ANY_EU_WORKING_VS_STAGE,
SP_ANY_EU_WORKING_CS_STAGE),
{
uint64_t total = SP_ANY_EU_WORKING_VS_STAGE + BV_SP_ANY_EU_WORKING_VS_STAGE +
SP_ANY_EU_WORKING_FS_STAGE;
return percent(SP_ANY_EU_WORKING_FS_STAGE - SP_ANY_EU_WORKING_CS_STAGE, total);
});
DERIVED_COUNTER(percent_time_shading_vertices,
"% Time Shading Vertices",
"Percentage of time spent shading vertices compared to the total time spent shading everything",
GPU_SHADER_PROCESSING, PERCENTAGE,
DERIVED_COUNTER_PERFCNTRS(SP_ANY_EU_WORKING_FS_STAGE,
BV_SP_ANY_EU_WORKING_FS_STAGE,
SP_ANY_EU_WORKING_VS_STAGE,
BV_SP_ANY_EU_WORKING_VS_STAGE),
{
uint64_t total = SP_ANY_EU_WORKING_FS_STAGE + BV_SP_ANY_EU_WORKING_FS_STAGE +
SP_ANY_EU_WORKING_VS_STAGE + BV_SP_ANY_EU_WORKING_VS_STAGE;
return percent(SP_ANY_EU_WORKING_VS_STAGE + BV_SP_ANY_EU_WORKING_VS_STAGE, total);
});
DERIVED_COUNTER(vertex_instructions,
"Vertex Instructions",
"Total number of scalar vertex shader instructions issued",
GPU_SHADER_PROCESSING, UINT64,
DERIVED_COUNTER_PERFCNTRS(SP_VS_STAGE_EFU_INSTRUCTIONS,
SP_VS_STAGE_FULL_ALU_INSTRUCTIONS,
BV_SP_VS_STAGE_EFU_INSTRUCTIONS,
BV_SP_VS_STAGE_FULL_ALU_INSTRUCTIONS),
{
/* x4 scaling matches the fragment instruction counters above. */
return (SP_VS_STAGE_EFU_INSTRUCTIONS + BV_SP_VS_STAGE_EFU_INSTRUCTIONS +
SP_VS_STAGE_FULL_ALU_INSTRUCTIONS + BV_SP_VS_STAGE_FULL_ALU_INSTRUCTIONS) * 4;
});
DERIVED_COUNTER(vertices_shaded,
"Vertices Shaded",
"Number of vertices submitted to the shader engine",
GPU_SHADER_PROCESSING, UINT64,
DERIVED_COUNTER_PERFCNTRS(PC_VS_INVOCATIONS,
BV_PC_VS_INVOCATIONS),
{
return PC_VS_INVOCATIONS + BV_PC_VS_INVOCATIONS;
});
/* FIXME: disabled due to lack of SP counter capacity
DERIVED_COUNTER(percent_wave_context_occupancy,
"% Wave Context Occupancy",
"Average percentage of wave context occupancy per cycle",
GPU_SHADER_PROCESSING, PERCENTAGE,
DERIVED_COUNTER_PERFCNTRS(...),
{
});
*/
/* --- Category: GPU Stalls --- */
DERIVED_COUNTER(percent_bvh_fetch_stall,
"% BVH Fetch Stall",
"Percentage of clock cycles where the RTU could not make any more requests for BVH fetch from scheduler",
GPU_STALLS, PERCENTAGE,
DERIVED_COUNTER_PERFCNTRS(SP_SCH_STALL_CYCLES_RTU,
RBBM_STATUS_MASKED),
{
return percent(SP_SCH_STALL_CYCLES_RTU, RBBM_STATUS_MASKED);
});
DERIVED_COUNTER(percent_instruction_cache_miss,
"% Instruction Cache Miss",
"Number of L1 instruction cache misses divided by L1 instruction cache requests",
GPU_STALLS, PERCENTAGE,
DERIVED_COUNTER_PERFCNTRS(SP_ICL1_REQUESTS,
SP_ICL1_MISSES,
BV_SP_ICL1_REQUESTS,
BV_SP_ICL1_MISSES),
{
return percent(SP_ICL1_MISSES + BV_SP_ICL1_MISSES,
SP_ICL1_REQUESTS + BV_SP_ICL1_REQUESTS);
});
DERIVED_COUNTER(l1_texture_cache_miss_per_pixel,
"L1 Texture Cache Miss Per Pixel",
"Average number of Texture L1 cache misses per pixel",
GPU_STALLS, DOUBLE,
DERIVED_COUNTER_PERFCNTRS(TP_L1_CACHELINE_MISSES,
BV_TP_L1_CACHELINE_MISSES,
SP_PIXELS),
{
return safe_div(TP_L1_CACHELINE_MISSES + BV_TP_L1_CACHELINE_MISSES, SP_PIXELS);
});
DERIVED_COUNTER(percent_stalled_on_system_memory,
"% Stalled on System Memory",
"Percentage of cycles the L2 cache is stalled waiting for data from system memory",
GPU_STALLS, PERCENTAGE,
DERIVED_COUNTER_PERFCNTRS(UCHE_STALL_CYCLES_ARBITER,
RBBM_STATUS_MASKED),
{
/* NOTE(review): x4 on the cycle denominator -- presumably the UCHE
 * arbiter counter ticks per-port across 4 ports (or at 4x clock) --
 * confirm against hardware docs. */
return percent(UCHE_STALL_CYCLES_ARBITER, 4 * RBBM_STATUS_MASKED);
});
DERIVED_COUNTER(percent_texture_fetch_stall,
"% Texture Fetch Stall",
"Percentage of clock cycles where the shader processors cannot make any more requests for texture data",
GPU_STALLS, PERCENTAGE,
DERIVED_COUNTER_PERFCNTRS(SP_STALL_CYCLES_TP,
BV_SP_STALL_CYCLES_TP,
RBBM_STATUS_MASKED),
{
return percent(SP_STALL_CYCLES_TP + BV_SP_STALL_CYCLES_TP,
context->a7xx.number_of_usptp * RBBM_STATUS_MASKED);
});
DERIVED_COUNTER(percent_texture_l1_miss,
"% Texture L1 Miss",
"Number of L1 texture cache misses divided by L1 texture cache requests",
GPU_STALLS, PERCENTAGE,
DERIVED_COUNTER_PERFCNTRS(TP_L1_CACHELINE_REQUESTS,
TP_L1_CACHELINE_MISSES,
BV_TP_L1_CACHELINE_REQUESTS,
BV_TP_L1_CACHELINE_MISSES),
{
return percent(TP_L1_CACHELINE_MISSES + BV_TP_L1_CACHELINE_MISSES,
TP_L1_CACHELINE_REQUESTS + BV_TP_L1_CACHELINE_REQUESTS);
});
DERIVED_COUNTER(percent_texture_l2_miss,
"% Texture L2 Miss",
"Number of L2 texture cache misses divided by L2 texture cache requests",
GPU_STALLS, PERCENTAGE,
DERIVED_COUNTER_PERFCNTRS(UCHE_VBIF_READ_BEATS_TP,
UCHE_READ_REQUESTS_TP),
{
/* NOTE(review): x2 on read beats -- presumably two beats per L2 miss
 * (cacheline = two VBIF beats) -- confirm. */
return percent(2 * UCHE_VBIF_READ_BEATS_TP, UCHE_READ_REQUESTS_TP);
});
DERIVED_COUNTER(percent_vertex_fetch_stall,
"% Vertex Fetch Stall",
"Percentage of clock cycles where the GPU cannot make any more requests for vertex data",
GPU_STALLS, PERCENTAGE,
DERIVED_COUNTER_PERFCNTRS(PC_STALL_CYCLES_VFD,
BV_PC_STALL_CYCLES_VFD,
RBBM_STATUS_MASKED),
{
return percent(PC_STALL_CYCLES_VFD + BV_PC_STALL_CYCLES_VFD,
RBBM_STATUS_MASKED);
});
/* Master table of all derived counters available on a7xx, grouped by
 * category.  Consumed via fd_derived_counters().
 */
const struct fd_derived_counter *a7xx_derived_counters[] = {
/* Category: GPU General */
DERIVED_COUNTER_PTR(clocks),
/* Category: GPU Memory Stats */
DERIVED_COUNTER_PTR(avg_bytes_per_fragment),
DERIVED_COUNTER_PTR(avg_bytes_per_vertex),
DERIVED_COUNTER_PTR(sp_memory_read),
DERIVED_COUNTER_PTR(texture_memory_read),
DERIVED_COUNTER_PTR(vertex_memory_read),
/* Category: GPU Preemption */
DERIVED_COUNTER_PTR(avg_preemption_delay),
DERIVED_COUNTER_PTR(preemptions),
/* Category: GPU Primitive Processing */
DERIVED_COUNTER_PTR(average_polygon_area),
DERIVED_COUNTER_PTR(average_vertices_per_polygon),
DERIVED_COUNTER_PTR(preclipped_polygon),
DERIVED_COUNTER_PTR(percent_prims_clipped),
DERIVED_COUNTER_PTR(percent_prims_trivially_rejected),
DERIVED_COUNTER_PTR(reused_vertices),
/* Category: GPU Shader Processing */
DERIVED_COUNTER_PTR(alu_per_fragment),
DERIVED_COUNTER_PTR(alu_per_vertex),
DERIVED_COUNTER_PTR(percent_anisotropic_filtered),
DERIVED_COUNTER_PTR(average_bvh_fetch_latency_cycles),
DERIVED_COUNTER_PTR(efu_per_fragment),
DERIVED_COUNTER_PTR(efu_per_vertex),
DERIVED_COUNTER_PTR(fragment_alu_instructions_full),
DERIVED_COUNTER_PTR(fragment_alu_instructions_half),
DERIVED_COUNTER_PTR(fragment_efu_instructions),
DERIVED_COUNTER_PTR(fragment_instructions),
DERIVED_COUNTER_PTR(fragments_shaded),
DERIVED_COUNTER_PTR(percent_linear_filtered),
DERIVED_COUNTER_PTR(percent_nearest_filtered),
DERIVED_COUNTER_PTR(percent_rtu_busy),
DERIVED_COUNTER_PTR(rtu_ray_box_intersections_per_instruction),
DERIVED_COUNTER_PTR(rtu_ray_triangle_intersections_per_instruction),
DERIVED_COUNTER_PTR(percent_shader_alu_capacity_utilized),
DERIVED_COUNTER_PTR(percent_shaders_busy),
DERIVED_COUNTER_PTR(percent_shaders_stalled),
DERIVED_COUNTER_PTR(percent_texture_pipes_busy),
DERIVED_COUNTER_PTR(textures_per_fragment),
DERIVED_COUNTER_PTR(textures_per_vertex),
DERIVED_COUNTER_PTR(percent_time_alus_working),
DERIVED_COUNTER_PTR(percent_time_compute),
DERIVED_COUNTER_PTR(percent_time_efus_working),
DERIVED_COUNTER_PTR(percent_time_shading_fragments),
DERIVED_COUNTER_PTR(percent_time_shading_vertices),
DERIVED_COUNTER_PTR(vertex_instructions),
DERIVED_COUNTER_PTR(vertices_shaded),
/* Category: GPU Stalls */
DERIVED_COUNTER_PTR(percent_bvh_fetch_stall),
DERIVED_COUNTER_PTR(percent_instruction_cache_miss),
DERIVED_COUNTER_PTR(l1_texture_cache_miss_per_pixel),
DERIVED_COUNTER_PTR(percent_stalled_on_system_memory),
DERIVED_COUNTER_PTR(percent_texture_fetch_stall),
DERIVED_COUNTER_PTR(percent_texture_l1_miss),
DERIVED_COUNTER_PTR(percent_texture_l2_miss),
DERIVED_COUNTER_PTR(percent_vertex_fetch_stall),
};
const unsigned a7xx_num_derived_counters = ARRAY_SIZE(a7xx_derived_counters);
/* A collection's counters[] array must be able to hold the entire table. */
static_assert(ARRAY_SIZE(a7xx_derived_counters) <= FD_DERIVED_COUNTER_COLLECTION_MAX_DERIVED_COUNTERS, "");
/* Prototype for linking purposes. */
void
a7xx_generate_derived_counter_collection(const struct fd_dev_id *id, struct fd_derived_counter_collection *collection);
void
a7xx_generate_derived_counter_collection(const struct fd_dev_id *id, struct fd_derived_counter_collection *collection)
{
/* The provided collection should already specify the derived counters that will be measured.
 * This function sets up enabled_perfcntrs_map and the enabled_perfcntrs array so that each
 * used DERIVED_COUNTER_PERFCNTR_* enum value maps to the corresponding index in the array
 * where the relevant fd_perfcntr_counter and fd_perfcntr_countable are stored.
 */
collection->num_enabled_perfcntrs = 0;

/* Mark every map slot unused (0xff), then flag (0x00) each perfcntr referenced by at least
 * one requested derived counter.  sizeof is used for the memset byte count (rather than
 * ARRAY_SIZE, which the original relied on being equal because the elements are uint8_t)
 * so the fill stays correct even if the map's element type ever changes.
 */
memset(collection->enabled_perfcntrs_map, 0xff, sizeof(collection->enabled_perfcntrs_map));
for (unsigned i = 0; i < collection->num_counters; ++i) {
const struct fd_derived_counter *counter = collection->counters[i];
for (unsigned j = 0; j < counter->num_perfcntrs; ++j) {
uint8_t perfcntr = counter->perfcntrs[j];
collection->enabled_perfcntrs_map[perfcntr] = 0x00;
}
}

/* Note if CP_ALWAYS_COUNT is enabled. This is the zero-index perfcntr. */
collection->cp_always_count_enabled = !collection->enabled_perfcntrs_map[0];

/* Compact the flagged perfcntrs into enabled_perfcntrs[], rewriting each map entry to hold
 * the perfcntr's index into that array; unused entries remain 0xff.
 */
for (unsigned i = 0; i < ARRAY_SIZE(collection->enabled_perfcntrs_map); ++i) {
if (collection->enabled_perfcntrs_map[i] == 0xff)
continue;

uint8_t enabled_perfcntr_index = collection->num_enabled_perfcntrs++;
collection->enabled_perfcntrs_map[i] = enabled_perfcntr_index;
collection->enabled_perfcntrs[enabled_perfcntr_index].counter =
a7xx_derived_counter_perfcntrs[i].counter;
collection->enabled_perfcntrs[enabled_perfcntr_index].countable =
a7xx_derived_counter_perfcntrs[i].countable;
}

/* Fill in the device-specific constants consumed by the derive() callbacks. */
const struct fd_dev_info *info = fd_dev_info_raw(id);
collection->derivation_context.a7xx.number_of_usptp = info->num_sp_cores * 2;
collection->derivation_context.a7xx.number_of_alus_per_usptp = 128;
}

View file

@ -43,3 +43,33 @@ fd_perfcntrs(const struct fd_dev_id *id, unsigned *count)
return NULL;
}
}
extern const struct fd_derived_counter *a7xx_derived_counters[];
extern const unsigned a7xx_num_derived_counters;
/* Return the derived-counter table for the given device, storing the entry
 * count in *count.  Returns NULL (and *count == 0) for generations without
 * a derived-counter implementation; currently only a7xx is supported.
 */
const struct fd_derived_counter **
fd_derived_counters(const struct fd_dev_id *id, unsigned *count)
{
if (fd_dev_gen(id) == 7) {
*count = a7xx_num_derived_counters;
return a7xx_derived_counters;
}

*count = 0;
return NULL;
}
extern void a7xx_generate_derived_counter_collection(const struct fd_dev_id *id, struct fd_derived_counter_collection *collection);
/* Populate the per-device parts of an already-specified collection by
 * dispatching to the generation-specific implementation.  A no-op for
 * generations without derived-counter support; currently only a7xx.
 */
void
fd_generate_derived_counter_collection(const struct fd_dev_id *id, struct fd_derived_counter_collection *collection)
{
if (fd_dev_gen(id) == 7)
a7xx_generate_derived_counter_collection(id, collection);
}

View file

@ -40,6 +40,7 @@ enum fd_perfcntr_type {
FD_PERFCNTR_TYPE_UINT64,
FD_PERFCNTR_TYPE_UINT,
FD_PERFCNTR_TYPE_FLOAT,
FD_PERFCNTR_TYPE_DOUBLE,
FD_PERFCNTR_TYPE_PERCENTAGE,
FD_PERFCNTR_TYPE_BYTES,
FD_PERFCNTR_TYPE_MICROSECONDS,
@ -107,6 +108,50 @@ const struct fd_perfcntr_group *fd_perfcntrs(const struct fd_dev_id *id, unsigne
.countables = _countables, \
}
/* Upper bound on how many raw perfcntrs a single derived counter may combine. */
#define FD_DERIVED_COUNTER_MAX_PERFCNTRS 8

/* Per-device constants made available to fd_derived_counter::derive()
 * callbacks (filled in by fd_generate_derived_counter_collection()).
 */
struct fd_derivation_context {
struct {
uint32_t number_of_usptp;          /* USPTP unit count on the device */
uint32_t number_of_alus_per_usptp; /* ALU lanes per USPTP */
} a7xx;
};
/* A derived counter: a human-friendly metric computed from one or more raw
 * perfcntr countables.
 */
struct fd_derived_counter {
const char *name;
const char *description;
const char *category;
enum fd_perfcntr_type type;
/* DERIVED_COUNTER_PERFCNTR_* enum values of the raw perfcntrs consumed. */
unsigned num_perfcntrs;
uint8_t perfcntrs[FD_DERIVED_COUNTER_MAX_PERFCNTRS];
/* Combines the sampled perfcntr values into the final metric value. */
uint64_t (*derive)(struct fd_derivation_context *context, uint64_t *values);
};

/* Returns the device's derived-counter table; NULL with *count == 0 when the
 * generation has no derived-counter support.
 */
const struct fd_derived_counter **fd_derived_counters(const struct fd_dev_id *id, unsigned *count);
#define FD_DERIVED_COUNTER_COLLECTION_MAX_DERIVED_COUNTERS 64
#define FD_DERIVED_COUNTER_COLLECTION_MAX_ENABLED_PERFCNTRS 128

/* A set of derived counters to be measured together, plus the deduplicated
 * raw perfcntrs they require.  The caller fills in num_counters/counters;
 * fd_generate_derived_counter_collection() populates the rest.
 */
struct fd_derived_counter_collection {
unsigned num_counters;
const struct fd_derived_counter *counters[FD_DERIVED_COUNTER_COLLECTION_MAX_DERIVED_COUNTERS];
/* True when the zero-index perfcntr (CP_ALWAYS_COUNT) is in use. */
bool cp_always_count_enabled;
/* Deduplicated counter/countable pairs required by the requested counters. */
unsigned num_enabled_perfcntrs;
struct {
const struct fd_perfcntr_counter *counter;
unsigned countable;
} enabled_perfcntrs[FD_DERIVED_COUNTER_COLLECTION_MAX_ENABLED_PERFCNTRS];
/* Maps each DERIVED_COUNTER_PERFCNTR_* value to its index in
 * enabled_perfcntrs[]; unused entries hold 0xff.
 */
uint8_t enabled_perfcntrs_map[FD_DERIVED_COUNTER_COLLECTION_MAX_ENABLED_PERFCNTRS];
struct fd_derivation_context derivation_context;
};

void fd_generate_derived_counter_collection(const struct fd_dev_id *id, struct fd_derived_counter_collection *collection);
#ifdef __cplusplus
} /* end of extern "C" */
#endif

View file

@ -148,6 +148,7 @@ fd_perfcntr_type_to_vk_unit[] = {
[FD_PERFCNTR_TYPE_UINT64] = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR,
[FD_PERFCNTR_TYPE_UINT] = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR,
[FD_PERFCNTR_TYPE_FLOAT] = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR,
[FD_PERFCNTR_TYPE_DOUBLE] = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR,
[FD_PERFCNTR_TYPE_PERCENTAGE] = VK_PERFORMANCE_COUNTER_UNIT_PERCENTAGE_KHR,
[FD_PERFCNTR_TYPE_BYTES] = VK_PERFORMANCE_COUNTER_UNIT_BYTES_KHR,
/* TODO. can be UNIT_NANOSECONDS_KHR with a logic to compute */
@ -169,6 +170,7 @@ fd_perfcntr_type_to_vk_storage[] = {
[FD_PERFCNTR_TYPE_UINT64] = VK_PERFORMANCE_COUNTER_STORAGE_UINT64_KHR,
[FD_PERFCNTR_TYPE_UINT] = VK_PERFORMANCE_COUNTER_STORAGE_UINT32_KHR,
[FD_PERFCNTR_TYPE_FLOAT] = VK_PERFORMANCE_COUNTER_STORAGE_FLOAT32_KHR,
[FD_PERFCNTR_TYPE_DOUBLE] = VK_PERFORMANCE_COUNTER_STORAGE_FLOAT64_KHR,
[FD_PERFCNTR_TYPE_PERCENTAGE] = VK_PERFORMANCE_COUNTER_STORAGE_FLOAT32_KHR,
[FD_PERFCNTR_TYPE_BYTES] = VK_PERFORMANCE_COUNTER_STORAGE_UINT64_KHR,
[FD_PERFCNTR_TYPE_MICROSECONDS] = VK_PERFORMANCE_COUNTER_STORAGE_UINT64_KHR,