From cf86860a7e196bcc7b70b6d5b672fa48f14a6e13 Mon Sep 17 00:00:00 2001 From: Samuel Pitoiset Date: Wed, 17 Dec 2025 19:13:27 +0100 Subject: [PATCH 01/15] radv: add the SQTT relocated shaders BO to the cmdbuf list Found this while debugging another thing with amdgpu.debug_mask=0x1 (VM). Cc: mesa-stable Signed-off-by: Samuel Pitoiset --- src/amd/vulkan/radv_cmd_buffer.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c index c9c34e9d0b5..8c7110d6bf9 100644 --- a/src/amd/vulkan/radv_cmd_buffer.c +++ b/src/amd/vulkan/radv_cmd_buffer.c @@ -3981,6 +3981,7 @@ radv_emit_graphics_pipeline(struct radv_cmd_buffer *cmd_buffer) { struct radv_graphics_pipeline *pipeline = cmd_buffer->state.graphics_pipeline; struct radv_device *device = radv_cmd_buffer_device(cmd_buffer); + struct radv_cmd_stream *cs = cmd_buffer->cs; if (cmd_buffer->state.emitted_graphics_pipeline == pipeline) return; @@ -3990,6 +3991,8 @@ radv_emit_graphics_pipeline(struct radv_cmd_buffer *cmd_buffer) if (pipeline->sqtt_shaders_reloc) { /* Emit shaders relocation because RGP requires them to be contiguous in memory. */ radv_sqtt_emit_relocated_shaders(cmd_buffer, pipeline); + + radv_cs_add_buffer(device->ws, cs->b, pipeline->sqtt_shaders_reloc->bo); } if (radv_device_fault_detection_enabled(device)) From d22909128bcb260074bb0802917b3133c0d74173 Mon Sep 17 00:00:00 2001 From: Samuel Pitoiset Date: Wed, 17 Dec 2025 15:53:30 +0100 Subject: [PATCH 02/15] ac/perfcounter: add a separate group for GFX10.3 This is just a copy&paste but GFX10.3 has way more counters than GFX10 that will be added later. Signed-off-by: Samuel Pitoiset --- src/amd/common/ac_perfcounter.c | 37 ++++++++++++++++++++++++++++++++- 1 file changed, 36 insertions(+), 1 deletion(-) diff --git a/src/amd/common/ac_perfcounter.c b/src/amd/common/ac_perfcounter.c index 0a1e95798dd..f2245e4d9e2 100644 --- a/src/amd/common/ac_perfcounter.c +++ b/src/amd/common/ac_perfcounter.c @@ -1029,6 +1029,38 @@ static struct ac_pc_block_gfxdescr groups_gfx10[] = { {&gfx10_UTCL1, 15}, }; +static struct ac_pc_block_gfxdescr groups_gfx103[] = { + {&cik_CB, 461}, + {&gfx10_CHA, 45}, + {&gfx10_CHCG, 35}, + {&gfx10_CHC, 35}, + {&cik_CPC, 47}, + {&cik_CPF, 40}, + {&cik_CPG, 82}, + {&gfx10_DB, 370}, + {&gfx10_GCR, 94}, + {&cik_GDS, 123}, + {&gfx10_GE, 315}, + {&gfx10_GL1A, 36}, + {&gfx10_GL1C, 64, 4}, + {&gfx10_GL2A, 91}, + {&gfx10_GL2C, 235}, + {&cik_GRBM, 47}, + {&cik_GRBMSE, 19}, + {&gfx10_PA_PH, 960}, + {&cik_PA_SC, 552}, + {&gfx10_PA_SU, 266}, + {&gfx10_RLC, 7}, + {&gfx10_RMI, 258}, + {&cik_SPI, 329}, + {&gfx10_SQ, 509}, + {&cik_SX, 225}, + {&cik_TA, 226}, + {&gfx10_TCP, 77}, + {&cik_TD, 61}, + {&gfx10_UTCL1, 15}, +}; + static struct ac_pc_block_gfxdescr groups_gfx11[] = { {&cik_CB, 313}, {&gfx10_CHA, 39}, @@ -1237,10 +1269,13 @@ bool ac_init_perfcounters(const struct radeon_info *info, num_blocks = ARRAY_SIZE(groups_gfx9); break; case GFX10: - case GFX10_3: blocks = groups_gfx10; num_blocks = ARRAY_SIZE(groups_gfx10); break; + case GFX10_3: + blocks = groups_gfx103; + num_blocks = ARRAY_SIZE(groups_gfx103); + break; case GFX11: blocks = groups_gfx11; num_blocks = ARRAY_SIZE(groups_gfx11); From 9c88a0c7d1d2638d579c84ba70acb81b2ad63f1e Mon Sep 17 00:00:00 2001 From: Samuel Pitoiset Date: Wed, 17 Dec 2025 15:58:02 +0100 Subject: [PATCH 03/15] ac/perfcounter: adjust the number of events for TD on GFX10.3 Signed-off-by: Samuel Pitoiset --- src/amd/common/ac_perfcounter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) 
diff --git a/src/amd/common/ac_perfcounter.c b/src/amd/common/ac_perfcounter.c index f2245e4d9e2..13299b85510 100644 --- a/src/amd/common/ac_perfcounter.c +++ b/src/amd/common/ac_perfcounter.c @@ -1057,7 +1057,7 @@ static struct ac_pc_block_gfxdescr groups_gfx103[] = { {&cik_SX, 225}, {&cik_TA, 226}, {&gfx10_TCP, 77}, - {&cik_TD, 61}, + {&cik_TD, 192}, {&gfx10_UTCL1, 15}, }; From 6ab7234a6dc6194c45e5dd61e3086f88f17f7242 Mon Sep 17 00:00:00 2001 From: Samuel Pitoiset Date: Wed, 17 Dec 2025 11:55:00 +0100 Subject: [PATCH 04/15] ac/perfcounter: add GCEA block description on GFX10-11 Signed-off-by: Samuel Pitoiset --- src/amd/common/ac_perfcounter.c | 28 +++++++++++++++++++++++++++- src/amd/common/ac_perfcounter.h | 2 +- src/amd/common/ac_spm.h | 2 +- 3 files changed, 29 insertions(+), 3 deletions(-) diff --git a/src/amd/common/ac_perfcounter.c b/src/amd/common/ac_perfcounter.c index 13299b85510..fe7e48fbe80 100644 --- a/src/amd/common/ac_perfcounter.c +++ b/src/amd/common/ac_perfcounter.c @@ -932,6 +932,28 @@ static struct ac_pc_block_base gfx10_UTCL1 = { .num_spm_counters = 0, }; +/* gfx10_GCEA */ +static unsigned gfx10_GCEA_select0[] = { + R_036800_GCEA_PERFCOUNTER2_SELECT, +}; + +static unsigned gfx10_GCEA_select1[] = { + R_036804_GCEA_PERFCOUNTER2_SELECT1, +}; +static struct ac_pc_block_base gfx10_GCEA = { + .gpu_block = GCEA, + .name = "GCEA", + .num_counters = 1, + + .select0 = gfx10_GCEA_select0, + .select1 = gfx10_GCEA_select1, + .counter0_lo = R_034980_GCEA_PERFCOUNTER2_LO, + + .num_spm_counters = 1, + .num_spm_wires = 2, + .spm_block_select = AC_SPM_GLOBAL_BLOCK_GCEA, +}; + /* gfx11_SQ_WQP */ static struct ac_pc_block_base gfx11_SQ_WGP = { .gpu_block = SQ_WGP, @@ -1027,6 +1049,7 @@ static struct ac_pc_block_gfxdescr groups_gfx10[] = { {&gfx10_TCP, 77}, {&cik_TD, 61}, {&gfx10_UTCL1, 15}, + {&gfx10_GCEA, 88}, }; static struct ac_pc_block_gfxdescr groups_gfx103[] = { @@ -1059,6 +1082,7 @@ static struct ac_pc_block_gfxdescr groups_gfx103[] = { {&gfx10_TCP, 77}, {&cik_TD, 192}, {&gfx10_UTCL1, 15}, + {&gfx10_GCEA, 89}, }; static struct ac_pc_block_gfxdescr groups_gfx11[] = { @@ -1092,6 +1116,7 @@ static struct ac_pc_block_gfxdescr groups_gfx11[] = { {&cik_TD, 196}, {&gfx10_UTCL1, 65}, {&gfx11_SQ_WGP, 511, 4}, + {&gfx10_GCEA, 86}, }; static struct ac_pc_block_gfxdescr groups_gfx12[] = { @@ -1325,7 +1350,8 @@ bool ac_init_perfcounters(const struct radeon_info *info, } else if (!strcmp(block->b->b->name, "GL1C") || !strcmp(block->b->b->name, "SQ_WGP")) { block->num_global_instances = block->num_instances * info->num_se * info->max_sa_per_se; - } else if (!strcmp(block->b->b->name, "GL2C")) { + } else if (!strcmp(block->b->b->name, "GL2C") || + !strcmp(block->b->b->name, "GCEA")) { block->num_instances = block->num_global_instances = info->num_tcc_blocks; } } diff --git a/src/amd/common/ac_perfcounter.h b/src/amd/common/ac_perfcounter.h index c198c4403ef..80a06b5cd7a 100644 --- a/src/amd/common/ac_perfcounter.h +++ b/src/amd/common/ac_perfcounter.h @@ -67,7 +67,7 @@ enum ac_pc_gpu_block { ATC = 0x1A, ATCL2 = 0x1B, MCVML2 = 0x1C, - EA = 0x1D, + GCEA = 0x1D, RPB = 0x1E, RMI = 0x1F, UMCCH = 0x20, diff --git a/src/amd/common/ac_spm.h b/src/amd/common/ac_spm.h index 27b76736b23..e0886679072 100644 --- a/src/amd/common/ac_spm.h +++ b/src/amd/common/ac_spm.h @@ -35,7 +35,7 @@ enum ac_spm_global_block { AC_SPM_GLOBAL_BLOCK_GL2C, AC_SPM_GLOBAL_BLOCK_SDMA, AC_SPM_GLOBAL_BLOCK_GUS, - AC_SPM_GLOBAL_BLOCK_EA, + AC_SPM_GLOBAL_BLOCK_GCEA, AC_SPM_GLOBAL_BLOCK_CHA, AC_SPM_GLOBAL_BLOCK_CHC, 
AC_SPM_GLOBAL_BLOCK_CHCG, From 48c3b7e5334de9b66de62ba3774e5c7a62184f6a Mon Sep 17 00:00:00 2001 From: Samuel Pitoiset Date: Wed, 17 Dec 2025 12:07:03 +0100 Subject: [PATCH 05/15] ac/spm: adjust configuration of some GPU blocks Signed-off-by: Samuel Pitoiset --- src/amd/common/ac_perfcounter.c | 5 +++++ src/amd/common/ac_spm.c | 8 ++++++-- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/src/amd/common/ac_perfcounter.c b/src/amd/common/ac_perfcounter.c index fe7e48fbe80..abb4a36b7f9 100644 --- a/src/amd/common/ac_perfcounter.c +++ b/src/amd/common/ac_perfcounter.c @@ -1353,6 +1353,11 @@ bool ac_init_perfcounters(const struct radeon_info *info, } else if (!strcmp(block->b->b->name, "GL2C") || !strcmp(block->b->b->name, "GCEA")) { block->num_instances = block->num_global_instances = info->num_tcc_blocks; + } else if (!strcmp(block->b->b->name, "CPF")) { + block->num_instances = block->num_global_instances = 1; + } else if (!strcmp(block->b->b->name, "TA") || + !strcmp(block->b->b->name, "TD")) { + block->num_global_instances = block->num_instances; } } diff --git a/src/amd/common/ac_spm.c b/src/amd/common/ac_spm.c index 54866ccb857..d14ac2b80c0 100644 --- a/src/amd/common/ac_spm.c +++ b/src/amd/common/ac_spm.c @@ -165,14 +165,18 @@ ac_spm_init_instance_mapping(const struct radeon_info *info, /* Per-SA blocks. */ assert(block->b->b->gpu_block == GL1C || block->b->b->gpu_block == TCP || - block->b->b->gpu_block == SQ_WGP); + block->b->b->gpu_block == SQ_WGP || + block->b->b->gpu_block == TA || + block->b->b->gpu_block == TD); se_index = (counter->instance / block->num_instances) / info->max_sa_per_se; sa_index = (counter->instance / block->num_instances) % info->max_sa_per_se; instance_index = counter->instance % block->num_instances; } } else { /* Global blocks. */ - assert(block->b->b->gpu_block == GL2C); + assert(block->b->b->gpu_block == GL2C || + block->b->b->gpu_block == CPF || + block->b->b->gpu_block == GCEA); instance_index = counter->instance; } From 0ac66659dc7b74bbe7392e58a1a73e83c4ebe827 Mon Sep 17 00:00:00 2001 From: Samuel Pitoiset Date: Wed, 17 Dec 2025 12:07:49 +0100 Subject: [PATCH 06/15] ac/spm: add an assertion to check the number of global instances To make sure counters aren't silently discarded. Signed-off-by: Samuel Pitoiset --- src/amd/common/ac_spm.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/amd/common/ac_spm.c b/src/amd/common/ac_spm.c index d14ac2b80c0..a0ea83f2847 100644 --- a/src/amd/common/ac_spm.c +++ b/src/amd/common/ac_spm.c @@ -545,6 +545,8 @@ bool ac_init_spm(const struct radeon_info *info, const struct ac_pc_block *block = ac_pc_get_block(pc, create_info[i].b->gpu_block); struct ac_spm_counter_create_info counter = create_info[i]; + assert(block->num_global_instances > 0); + for (unsigned j = 0; j < block->num_global_instances; j++) { counter.instance = j; From 7c038c586fd31247a139db06955088286947c34b Mon Sep 17 00:00:00 2001 From: Samuel Pitoiset Date: Wed, 17 Dec 2025 10:39:37 +0100 Subject: [PATCH 07/15] ac/spm: fix programming more than one counter slot Some blocks have two or more SPM counters and they should be used when more than 4 counters are programmed (ie. 16-bit per counter). 
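As a rough illustration (this sketch is not part of the patch and the helper name is hypothetical): each SPM counter-select register packs up to four 16-bit counters through its PERF_MODE0..PERF_MODE3 fields, so the Nth 16-bit counter of a block should spill into select register N / 4, mode field N % 4, instead of failing once the first register is full.

    /* Illustrative sketch only, assuming four 16-bit counters per
     * select register (PERF_MODE0..3). */
    static inline void
    spm_slot_for_16bit_counter(unsigned n, unsigned *select_reg, unsigned *mode)
    {
       *select_reg = n / 4; /* which PERFCOUNTERn_SELECT slot */
       *mode = n % 4;       /* which PERF_MODEx field within it */
    }

For example, the sixth 16-bit counter (n = 5) would map to select register 1, PERF_MODE1.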
Signed-off-by: Samuel Pitoiset --- src/amd/common/ac_spm.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/amd/common/ac_spm.c b/src/amd/common/ac_spm.c index a0ea83f2847..6b4ec16d47c 100644 --- a/src/amd/common/ac_spm.c +++ b/src/amd/common/ac_spm.c @@ -357,7 +357,8 @@ ac_spm_map_counter(struct ac_spm *spm, struct ac_spm_block_select *block_sel, S_037008_PERF_MODE3(0); break; default: - return false; + /* Try to program the new counter slot. */ + continue; } /* Mark this 16-bit counter as used. */ From 0f3ca72203a91db688f66d8dcff18fc807a59f85 Mon Sep 17 00:00:00 2001 From: Samuel Pitoiset Date: Wed, 17 Dec 2025 16:32:13 +0100 Subject: [PATCH 08/15] ac/spm: print an error message when a group is unknown Help debugging. Signed-off-by: Samuel Pitoiset --- src/amd/common/ac_spm.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/amd/common/ac_spm.c b/src/amd/common/ac_spm.c index 6b4ec16d47c..3fb780b3f2b 100644 --- a/src/amd/common/ac_spm.c +++ b/src/amd/common/ac_spm.c @@ -532,8 +532,10 @@ bool ac_init_spm(const struct radeon_info *info, for (unsigned i = 0; i < create_info_count; i++) { const struct ac_pc_block *block = ac_pc_get_block(pc, create_info[i].b->gpu_block); - if (!block) + if (!block) { + fprintf(stderr, "ac/spm: Unknown group.\n"); return false; + } num_counters += block->num_global_instances; } From efd98ebee2adf8a796da3c87443db51aadc9cc01 Mon Sep 17 00:00:00 2001 From: Samuel Pitoiset Date: Tue, 16 Dec 2025 08:49:50 +0100 Subject: [PATCH 09/15] ac/spm: add an ID to raw performance counters This will be used to compute derived values for the new RGP/SPM chunk. Signed-off-by: Samuel Pitoiset --- src/amd/common/ac_spm.c | 82 +++++++++++++++++++++++++++-------------- src/amd/common/ac_spm.h | 18 +++++++++ 2 files changed, 73 insertions(+), 27 deletions(-) diff --git a/src/amd/common/ac_spm.c b/src/amd/common/ac_spm.c index 3fb780b3f2b..e05097ef778 100644 --- a/src/amd/common/ac_spm.c +++ b/src/amd/common/ac_spm.c @@ -13,18 +13,30 @@ /* SPM counters definition. 
*/ /* GFX10+ */ -static struct ac_spm_counter_descr gfx10_tcp_perf_sel_req = {TCP, 0x9}; -static struct ac_spm_counter_descr gfx10_tcp_perf_sel_req_miss = {TCP, 0x12}; -static struct ac_spm_counter_descr gfx10_sqc_perf_sel_dcache_hits = {SQ, 0x14f}; -static struct ac_spm_counter_descr gfx10_sqc_perf_sel_dcache_misses = {SQ, 0x150}; -static struct ac_spm_counter_descr gfx10_sqc_perf_sel_dcache_misses_duplicate = {SQ, 0x151}; -static struct ac_spm_counter_descr gfx10_sqc_perf_sel_icache_hits = {SQ, 0x12c}; -static struct ac_spm_counter_descr gfx10_sqc_perf_sel_icache_misses = {SQ, 0x12d}; -static struct ac_spm_counter_descr gfx10_sqc_perf_sel_icache_misses_duplicate = {SQ, 0x12e}; -static struct ac_spm_counter_descr gfx10_gl1c_perf_sel_req = {GL1C, 0xe}; -static struct ac_spm_counter_descr gfx10_gl1c_perf_sel_req_miss = {GL1C, 0x12}; -static struct ac_spm_counter_descr gfx10_gl2c_perf_sel_req = {GL2C, 0x3}; -static struct ac_spm_counter_descr gfx10_gl2c_perf_sel_miss = {GL2C, 0x23}; +static struct ac_spm_counter_descr gfx10_tcp_perf_sel_req = + {AC_SPM_TCP_PERF_SEL_REQ, TCP, 0x9}; +static struct ac_spm_counter_descr gfx10_tcp_perf_sel_req_miss = + {AC_SPM_TCP_PERF_SEL_REQ_MISS, TCP, 0x12}; +static struct ac_spm_counter_descr gfx10_sqc_perf_sel_dcache_hits = + {AC_SPM_SQC_PERF_SEL_DCACHE_HITS, SQ, 0x14f}; +static struct ac_spm_counter_descr gfx10_sqc_perf_sel_dcache_misses = + {AC_SPM_SQC_PERF_SEL_DCACHE_MISSES, SQ, 0x150}; +static struct ac_spm_counter_descr gfx10_sqc_perf_sel_dcache_misses_duplicate = + {AC_SPM_SQC_PERF_SEL_DCACHE_MISSES_DUPLICATE, SQ, 0x151}; +static struct ac_spm_counter_descr gfx10_sqc_perf_sel_icache_hits = + {AC_SPM_SQC_PERF_SEL_ICACHE_HITS, SQ, 0x12c}; +static struct ac_spm_counter_descr gfx10_sqc_perf_sel_icache_misses = + {AC_SPM_SQC_PERF_SEL_ICACHE_MISSES, SQ, 0x12d}; +static struct ac_spm_counter_descr gfx10_sqc_perf_sel_icache_misses_duplicate = + {AC_SPM_SQC_PERF_SEL_ICACHE_MISSES_DUPLICATE, SQ, 0x12e}; +static struct ac_spm_counter_descr gfx10_gl1c_perf_sel_req = + {AC_SPM_GL1C_PERF_SEL_REQ, GL1C, 0xe}; +static struct ac_spm_counter_descr gfx10_gl1c_perf_sel_req_miss = + {AC_SPM_GL1C_PERF_SEL_REQ_MISS, GL1C, 0x12}; +static struct ac_spm_counter_descr gfx10_gl2c_perf_sel_req = + {AC_SPM_GL2C_PERF_SEL_REQ, GL2C, 0x3}; +static struct ac_spm_counter_descr gfx10_gl2c_perf_sel_miss = + {AC_SPM_GL2C_PERF_SEL_MISS, GL2C, 0x23}; static struct ac_spm_counter_create_info gfx10_spm_counters[] = { {&gfx10_tcp_perf_sel_req}, @@ -42,7 +54,8 @@ static struct ac_spm_counter_create_info gfx10_spm_counters[] = { }; /* GFX10.3+ */ -static struct ac_spm_counter_descr gfx103_gl2c_perf_sel_miss = {GL2C, 0x2b}; +static struct ac_spm_counter_descr gfx103_gl2c_perf_sel_miss = + {AC_SPM_GL2C_PERF_SEL_MISS, GL2C, 0x2b}; static struct ac_spm_counter_create_info gfx103_spm_counters[] = { {&gfx10_tcp_perf_sel_req}, @@ -60,13 +73,20 @@ static struct ac_spm_counter_create_info gfx103_spm_counters[] = { }; /* GFX11+ */ -static struct ac_spm_counter_descr gfx11_tcp_perf_sel_req_miss = {TCP, 0x11}; -static struct ac_spm_counter_descr gfx11_sqc_perf_sel_dcache_hits = {SQ_WGP, 0x126}; -static struct ac_spm_counter_descr gfx11_sqc_perf_sel_dcache_misses = {SQ_WGP, 0x127}; -static struct ac_spm_counter_descr gfx11_sqc_perf_sel_dcache_misses_duplicate = {SQ_WGP, 0x128}; -static struct ac_spm_counter_descr gfx11_sqc_perf_sel_icache_hits = {SQ_WGP, 0x10e}; -static struct ac_spm_counter_descr gfx11_sqc_perf_sel_icache_misses = {SQ_WGP, 0x10f}; -static struct ac_spm_counter_descr 
gfx11_sqc_perf_sel_icache_misses_duplicate = {SQ_WGP, 0x110}; +static struct ac_spm_counter_descr gfx11_tcp_perf_sel_req_miss = + {AC_SPM_TCP_PERF_SEL_REQ_MISS, TCP, 0x11}; +static struct ac_spm_counter_descr gfx11_sqc_perf_sel_dcache_hits = + {AC_SPM_SQC_PERF_SEL_DCACHE_HITS, SQ_WGP, 0x126}; +static struct ac_spm_counter_descr gfx11_sqc_perf_sel_dcache_misses = + {AC_SPM_SQC_PERF_SEL_DCACHE_MISSES, SQ_WGP, 0x127}; +static struct ac_spm_counter_descr gfx11_sqc_perf_sel_dcache_misses_duplicate = + {AC_SPM_SQC_PERF_SEL_DCACHE_MISSES_DUPLICATE, SQ_WGP, 0x128}; +static struct ac_spm_counter_descr gfx11_sqc_perf_sel_icache_hits = + {AC_SPM_SQC_PERF_SEL_ICACHE_HITS, SQ_WGP, 0x10e}; +static struct ac_spm_counter_descr gfx11_sqc_perf_sel_icache_misses = + {AC_SPM_SQC_PERF_SEL_ICACHE_MISSES, SQ_WGP, 0x10f}; +static struct ac_spm_counter_descr gfx11_sqc_perf_sel_icache_misses_duplicate = + {AC_SPM_SQC_PERF_SEL_ICACHE_MISSES_DUPLICATE, SQ_WGP, 0x110}; static struct ac_spm_counter_create_info gfx11_spm_counters[] = { {&gfx10_tcp_perf_sel_req}, @@ -84,13 +104,20 @@ static struct ac_spm_counter_create_info gfx11_spm_counters[] = { }; /* GFX12+ */ -static struct ac_spm_counter_descr gfx12_sqc_perf_sel_dcache_hits = {SQ_WGP, 0x146}; -static struct ac_spm_counter_descr gfx12_sqc_perf_sel_dcache_misses = {SQ_WGP, 0x147}; -static struct ac_spm_counter_descr gfx12_sqc_perf_sel_dcache_misses_duplicate = {SQ_WGP, 0x148}; -static struct ac_spm_counter_descr gfx12_sqc_perf_sel_icache_hits = {SQ_WGP, 0x12e}; -static struct ac_spm_counter_descr gfx12_sqc_perf_sel_icache_misses = {SQ_WGP, 0x12f}; -static struct ac_spm_counter_descr gfx12_sqc_perf_sel_icache_misses_duplicate = {SQ_WGP, 0x130}; -static struct ac_spm_counter_descr gfx12_gl2c_perf_sel_miss = {GL2C, 0x2a}; +static struct ac_spm_counter_descr gfx12_sqc_perf_sel_dcache_hits = + {AC_SPM_SQC_PERF_SEL_DCACHE_HITS, SQ_WGP, 0x146}; +static struct ac_spm_counter_descr gfx12_sqc_perf_sel_dcache_misses = + {AC_SPM_SQC_PERF_SEL_DCACHE_MISSES, SQ_WGP, 0x147}; +static struct ac_spm_counter_descr gfx12_sqc_perf_sel_dcache_misses_duplicate = + {AC_SPM_SQC_PERF_SEL_DCACHE_MISSES_DUPLICATE, SQ_WGP, 0x148}; +static struct ac_spm_counter_descr gfx12_sqc_perf_sel_icache_hits = + {AC_SPM_SQC_PERF_SEL_ICACHE_HITS, SQ_WGP, 0x12e}; +static struct ac_spm_counter_descr gfx12_sqc_perf_sel_icache_misses = + {AC_SPM_SQC_PERF_SEL_ICACHE_MISSES, SQ_WGP, 0x12f}; +static struct ac_spm_counter_descr gfx12_sqc_perf_sel_icache_misses_duplicate = + {AC_SPM_SQC_PERF_SEL_ICACHE_MISSES_DUPLICATE, SQ_WGP, 0x130}; +static struct ac_spm_counter_descr gfx12_gl2c_perf_sel_miss = + {AC_SPM_GL2C_PERF_SEL_MISS, GL2C, 0x2a}; static struct ac_spm_counter_create_info gfx12_spm_counters[] = { {&gfx10_tcp_perf_sel_req}, @@ -411,6 +438,7 @@ ac_spm_add_counter(const struct radeon_info *info, counter = &spm->counters[spm->num_counters]; spm->num_counters++; + counter->id = counter_info->b->id; counter->gpu_block = counter_info->b->gpu_block; counter->event_id = counter_info->b->event_id; counter->instance = counter_info->instance; diff --git a/src/amd/common/ac_spm.h b/src/amd/common/ac_spm.h index e0886679072..f21b2c59c7e 100644 --- a/src/amd/common/ac_spm.h +++ b/src/amd/common/ac_spm.h @@ -86,7 +86,24 @@ enum ac_spm_segment_type { AC_SPM_SEGMENT_TYPE_COUNT, }; +enum ac_spm_raw_counter_id { + AC_SPM_TCP_PERF_SEL_REQ = 0, + AC_SPM_TCP_PERF_SEL_REQ_MISS, + AC_SPM_SQC_PERF_SEL_DCACHE_HITS, + AC_SPM_SQC_PERF_SEL_DCACHE_MISSES, + AC_SPM_SQC_PERF_SEL_DCACHE_MISSES_DUPLICATE, + AC_SPM_SQC_PERF_SEL_ICACHE_HITS, + 
AC_SPM_SQC_PERF_SEL_ICACHE_MISSES, + AC_SPM_SQC_PERF_SEL_ICACHE_MISSES_DUPLICATE, + AC_SPM_GL1C_PERF_SEL_REQ, + AC_SPM_GL1C_PERF_SEL_REQ_MISS, + AC_SPM_GL2C_PERF_SEL_REQ, + AC_SPM_GL2C_PERF_SEL_MISS, + AC_SPM_RAW_COUNTER_ID_COUNT, +}; + struct ac_spm_counter_descr { + enum ac_spm_raw_counter_id id; enum ac_pc_gpu_block gpu_block; uint32_t event_id; }; @@ -119,6 +136,7 @@ struct ac_spm_muxsel_line { struct ac_spm_counter_info { /* General info. */ + enum ac_spm_raw_counter_id id; enum ac_pc_gpu_block gpu_block; uint32_t instance; uint32_t event_id; From eff6169a7959f8e915de34d3dbda2da6d38a51a4 Mon Sep 17 00:00:00 2001 From: Samuel Pitoiset Date: Tue, 16 Dec 2025 14:56:24 +0100 Subject: [PATCH 10/15] ac/spm: implement the new derived SPM chunk for performance counters This is the new method to add performance counters to RGP captures. This will be used to add the new RGP 2.6 counters too. The previous SPM code will be deprecated at some point but it's hard to support all generations in one batch. So, I will implement this step by step. Signed-off-by: Samuel Pitoiset --- src/amd/common/ac_rgp.c | 222 +++++++++++++++++- src/amd/common/ac_spm.c | 501 ++++++++++++++++++++++++++++++++++++++++ src/amd/common/ac_spm.h | 110 +++++++++ 3 files changed, 830 insertions(+), 3 deletions(-) diff --git a/src/amd/common/ac_rgp.c b/src/amd/common/ac_rgp.c index a3564042309..404846ab3c5 100644 --- a/src/amd/common/ac_rgp.c +++ b/src/amd/common/ac_rgp.c @@ -58,6 +58,10 @@ enum sqtt_file_chunk_type SQTT_FILE_CHUNK_TYPE_CODE_OBJECT_LOADER_EVENTS, SQTT_FILE_CHUNK_TYPE_PSO_CORRELATION, SQTT_FILE_CHUNK_TYPE_INSTRUMENTATION_TABLE, + + SQTT_FILE_CHUNK_TYPE_FIRST_TOOLS_TYPE = 128, + SQTT_FILE_CHUNK_TYPE_DERIVED_SPM_DB = SQTT_FILE_CHUNK_TYPE_FIRST_TOOLS_TYPE, + SQTT_FILE_CHUNK_TYPE_COUNT }; @@ -992,10 +996,203 @@ static void ac_sqtt_dump_spm(const struct ac_spm_trace *spm_trace, fseek(output, file_offset, SEEK_SET); } +/** + * SQTT Derived SPM DB info. 
+ */ +struct sqtt_derived_spm_group_info { + uint32_t size_in_bytes; + uint32_t offset; + uint32_t group_name_length; + uint32_t group_description_length; + uint32_t num_counters; +}; + +struct sqtt_derived_spm_counter_info { + uint32_t size_in_bytes; + uint32_t offset; + uint32_t counter_name_length; + uint32_t counter_description_length; + uint32_t num_components; + uint8_t usage_type; +}; + +struct sqtt_derived_spm_component_info { + uint32_t size_in_bytes; + uint32_t offset; + uint32_t component_name_length; + uint32_t component_description_length; + uint32_t usage_type; +}; + +struct sqtt_file_chunk_derived_spm_db { + struct sqtt_file_chunk_header header; + uint32_t offset; + uint32_t flags; + uint32_t num_timestamps; + uint32_t num_groups; + uint32_t num_counters; + uint32_t num_components; + uint32_t sampling_interval; +}; + +static_assert(sizeof(struct sqtt_file_chunk_derived_spm_db) == 44, + "sqtt_file_chunk_derived_spm_db doesn't match RGP spec"); + +static void ac_sqtt_fill_derived_spm_db(const struct ac_spm_derived_trace *spm_derived_trace, + struct sqtt_file_chunk_derived_spm_db *chunk, + size_t file_offset, + uint32_t chunk_size) +{ + chunk->header.chunk_id.type = SQTT_FILE_CHUNK_TYPE_DERIVED_SPM_DB; + chunk->header.chunk_id.index = 0; + chunk->header.major_version = 0; + chunk->header.minor_version = 0; + chunk->header.size_in_bytes = chunk_size; + + chunk->offset = sizeof(*chunk); + chunk->flags = 0; + chunk->num_timestamps = spm_derived_trace->num_timestamps; + chunk->num_groups = spm_derived_trace->num_groups; + chunk->num_counters = spm_derived_trace->num_counters; + chunk->num_components = spm_derived_trace->num_components; + chunk->sampling_interval = spm_derived_trace->sample_interval; +} + +static void ac_sqtt_dump_derived_spm(const struct ac_spm_derived_trace *spm_derived_trace, + size_t file_offset, + FILE *output) +{ + struct sqtt_file_chunk_derived_spm_db derived_spm_db; + size_t file_derived_spm_db_offset = file_offset; + + fseek(output, sizeof(struct sqtt_file_chunk_derived_spm_db), SEEK_CUR); + file_offset += sizeof(struct sqtt_file_chunk_derived_spm_db); + + /* Dump timestamps. */ + for (uint32_t i = 0; i < spm_derived_trace->num_timestamps; i++) { + uint64_t timestamp = spm_derived_trace->timestamps[i]; + + file_offset += sizeof(timestamp); + fwrite(&timestamp, sizeof(timestamp), 1, output); + } + + /* Dump SPM groups. */ + for (uint32_t i = 0; i < spm_derived_trace->num_groups; i++) { + const struct ac_spm_derived_group *group = &spm_derived_trace->groups[i]; + const struct ac_spm_derived_group_descr *group_descr = group->descr; + struct sqtt_derived_spm_group_info group_info = {0}; + + const uint32_t num_counters = group_descr->num_counters; + const uint32_t name_length = strlen(group_descr->name); + + group_info.size_in_bytes = sizeof(group_info) + name_length + + num_counters * sizeof(uint32_t); + group_info.offset = sizeof(group_info); + group_info.group_name_length = name_length; + group_info.num_counters = num_counters; + + file_offset += sizeof(group_info) + group_info.group_name_length; + fwrite(&group_info, sizeof(group_info), 1, output); + fwrite(group_descr->name, group_info.group_name_length, 1, output); + + for (uint32_t j = 0; j < group_descr->num_counters; j++) { + const struct ac_spm_derived_counter_descr *counter_descr = group_descr->counters[j]; + uint32_t counter_id = counter_descr->id; + + file_offset += sizeof(uint32_t); + fwrite(&counter_id, sizeof(uint32_t), 1, output); + } + } + + /* Dump SPM counters.
*/ + for (uint32_t i = 0; i < spm_derived_trace->num_counters; i++) { + const struct ac_spm_derived_counter *counter = &spm_derived_trace->counters[i]; + const struct ac_spm_derived_counter_descr *counter_descr = counter->descr; + struct sqtt_derived_spm_counter_info counter_info = {0}; + + const uint32_t num_components = counter_descr->num_components; + const uint32_t name_length = strlen(counter_descr->name); + const uint32_t description_length = strlen(counter_descr->desc); + + counter_info.size_in_bytes = sizeof(counter_info) + name_length + + description_length + num_components * sizeof(uint32_t); + counter_info.offset = sizeof(counter_info); + counter_info.counter_name_length = name_length; + counter_info.counter_description_length = description_length; + counter_info.num_components = num_components; + counter_info.usage_type = counter_descr->usage; + + file_offset += sizeof(counter_info) + counter_info.counter_name_length + + counter_info.counter_description_length; + fwrite(&counter_info, sizeof(counter_info), 1, output); + fwrite(counter_descr->name, counter_info.counter_name_length, 1, output); + fwrite(counter_descr->desc, counter_info.counter_description_length, 1, output); + + for (uint32_t j = 0; j < counter_descr->num_components; j++) { + const struct ac_spm_derived_component_descr *component_descr = counter_descr->components[j]; + uint32_t component_id = component_descr->id; + + file_offset += sizeof(uint32_t); + fwrite(&component_id, sizeof(uint32_t), 1, output); + } + } + + /* Dump SPM components. */ + for (uint32_t i = 0; i < spm_derived_trace->num_components; i++) { + const struct ac_spm_derived_component *component = &spm_derived_trace->components[i]; + const struct ac_spm_derived_component_descr *component_descr = component->descr; + struct sqtt_derived_spm_component_info component_info = {0}; + + const uint32_t name_length = strlen(component_descr->name); + + component_info.size_in_bytes = sizeof(component_info) + name_length; + component_info.offset = sizeof(component_info); + component_info.component_name_length = name_length; + component_info.usage_type = component_descr->usage; + + file_offset += sizeof(component_info) + component_info.component_name_length + + component_info.component_description_length; + fwrite(&component_info, sizeof(component_info), 1, output); + fwrite(component_descr->name, component_info.component_name_length, 1, output); + } + + /* Dump counter values. */ + for (uint32_t i = 0; i < spm_derived_trace->num_counters; i++) { + const struct ac_spm_derived_counter *counter = &spm_derived_trace->counters[i]; + + assert(util_dynarray_num_elements(&counter->values, double) == spm_derived_trace->num_timestamps); + util_dynarray_foreach(&counter->values, double, value) { + file_offset += sizeof(double); + fwrite(value, sizeof(double), 1, output); + } + } + + /* Dump component values. */ + for (uint32_t i = 0; i < spm_derived_trace->num_components; i++) { + const struct ac_spm_derived_component *component = &spm_derived_trace->components[i]; + + assert(util_dynarray_num_elements(&component->values, double) == spm_derived_trace->num_timestamps); + util_dynarray_foreach(&component->values, double, value) { + file_offset += sizeof(double); + fwrite(value, sizeof(double), 1, output); + } + } + + /* SQTT Derived SPM chunk. 
*/ + ac_sqtt_fill_derived_spm_db(spm_derived_trace, &derived_spm_db, + file_derived_spm_db_offset, + file_offset - file_derived_spm_db_offset); + fseek(output, file_derived_spm_db_offset, SEEK_SET); + fwrite(&derived_spm_db, sizeof(struct sqtt_file_chunk_derived_spm_db), 1, output); + fseek(output, file_offset, SEEK_SET); +} + #if defined(USE_LIBELF) static void ac_sqtt_dump_data(const struct radeon_info *rad_info, struct ac_sqtt_trace *sqtt_trace, - const struct ac_spm_trace *spm_trace, FILE *output) + const struct ac_spm_trace *spm_trace, + const struct ac_spm_derived_trace *spm_derived_trace, + FILE *output) { struct sqtt_file_chunk_asic_info asic_info = {0}; struct sqtt_file_chunk_cpu_info cpu_info = {0}; @@ -1193,12 +1390,25 @@ ac_sqtt_dump_data(const struct radeon_info *rad_info, struct ac_sqtt_trace *sqtt } } - if (spm_trace) { + if (spm_derived_trace) { + ac_sqtt_dump_derived_spm(spm_derived_trace, file_offset, output); + } else if (spm_trace) { ac_sqtt_dump_spm(spm_trace, file_offset, output); } } #endif +static bool +ac_use_derived_spm_trace(const struct radeon_info *info, + const struct ac_spm_trace *spm_trace) +{ + if (!spm_trace) + return false; + + /* TODO: Enable for GPUs. */ + return false; +} + int ac_dump_rgp_capture(const struct radeon_info *info, struct ac_sqtt_trace *sqtt_trace, const struct ac_spm_trace *spm_trace) @@ -1223,7 +1433,13 @@ ac_dump_rgp_capture(const struct radeon_info *info, struct ac_sqtt_trace *sqtt_t if (!f) return -1; - ac_sqtt_dump_data(info, sqtt_trace, spm_trace, f); + struct ac_spm_derived_trace *spm_derived_trace = + ac_use_derived_spm_trace(info, spm_trace) ? ac_spm_get_derived_trace(info, spm_trace) : NULL; + + ac_sqtt_dump_data(info, sqtt_trace, spm_trace, spm_derived_trace, f); + + if (spm_derived_trace) + ac_spm_destroy_derived_trace(spm_derived_trace); fprintf(stderr, "RGP capture saved to '%s'\n", filename); diff --git a/src/amd/common/ac_spm.c b/src/amd/common/ac_spm.c index e05097ef778..7115fddcff4 100644 --- a/src/amd/common/ac_spm.c +++ b/src/amd/common/ac_spm.c @@ -727,6 +727,507 @@ bool ac_spm_get_trace(const struct ac_spm *spm, struct ac_spm_trace *trace) return ac_spm_get_num_samples(spm, &trace->num_samples); } +/* SPM components. */ +/* Instruction cache components. */ +static struct ac_spm_derived_component_descr gfx10_inst_cache_request_count_comp = { + .id = AC_SPM_COMPONENT_INST_CACHE_REQUEST_COUNT, + .counter_id = AC_SPM_COUNTER_INST_CACHE_HIT, + .name = "Requests", + .usage = AC_SPM_USAGE_ITEMS, +}; + +static struct ac_spm_derived_component_descr gfx10_inst_cache_hit_count_comp = { + .id = AC_SPM_COMPONENT_INST_CACHE_HIT_COUNT, + .counter_id = AC_SPM_COUNTER_INST_CACHE_HIT, + .name = "Hits", + .usage = AC_SPM_USAGE_ITEMS, +}; + +static struct ac_spm_derived_component_descr gfx10_inst_cache_miss_count_comp = { + .id = AC_SPM_COMPONENT_INST_CACHE_MISS_COUNT, + .counter_id = AC_SPM_COUNTER_INST_CACHE_HIT, + .name = "Misses", + .usage = AC_SPM_USAGE_ITEMS, +}; + +/* Scalar cache components. 
*/ +static struct ac_spm_derived_component_descr gfx10_scalar_cache_request_count_comp = { + .id = AC_SPM_COMPONENT_SCALAR_CACHE_REQUEST_COUNT, + .counter_id = AC_SPM_COUNTER_SCALAR_CACHE_HIT, + .name = "Requests", + .usage = AC_SPM_USAGE_ITEMS, +}; + +static struct ac_spm_derived_component_descr gfx10_scalar_cache_hit_count_comp = { + .id = AC_SPM_COMPONENT_SCALAR_CACHE_HIT_COUNT, + .counter_id = AC_SPM_COUNTER_SCALAR_CACHE_HIT, + .name = "Hits", + .usage = AC_SPM_USAGE_ITEMS, +}; + +static struct ac_spm_derived_component_descr gfx10_scalar_cache_miss_count_comp = { + .id = AC_SPM_COMPONENT_SCALAR_CACHE_MISS_COUNT, + .counter_id = AC_SPM_COUNTER_SCALAR_CACHE_HIT, + .name = "Misses", + .usage = AC_SPM_USAGE_ITEMS, +}; + +/* L0 cache components. */ +static struct ac_spm_derived_component_descr gfx10_l0_cache_request_count_comp = { + .id = AC_SPM_COMPONENT_L0_CACHE_REQUEST_COUNT, + .counter_id = AC_SPM_COUNTER_L0_CACHE_HIT, + .name = "Requests", + .usage = AC_SPM_USAGE_ITEMS, +}; + +static struct ac_spm_derived_component_descr gfx10_l0_cache_hit_count_comp = { + .id = AC_SPM_COMPONENT_L0_CACHE_HIT_COUNT, + .counter_id = AC_SPM_COUNTER_L0_CACHE_HIT, + .name = "Hits", + .usage = AC_SPM_USAGE_ITEMS, +}; + +static struct ac_spm_derived_component_descr gfx10_l0_cache_miss_count_comp = { + .id = AC_SPM_COMPONENT_L0_CACHE_MISS_COUNT, + .counter_id = AC_SPM_COUNTER_L0_CACHE_HIT, + .name = "Misses", + .usage = AC_SPM_USAGE_ITEMS, +}; + +/* L1 cache components. */ +static struct ac_spm_derived_component_descr gfx10_l1_cache_request_count_comp = { + .id = AC_SPM_COMPONENT_L1_CACHE_REQUEST_COUNT, + .counter_id = AC_SPM_COUNTER_L1_CACHE_HIT, + .name = "Requests", + .usage = AC_SPM_USAGE_ITEMS, +}; + +static struct ac_spm_derived_component_descr gfx10_l1_cache_hit_count_comp = { + .id = AC_SPM_COMPONENT_L1_CACHE_HIT_COUNT, + .counter_id = AC_SPM_COUNTER_L1_CACHE_HIT, + .name = "Hits", + .usage = AC_SPM_USAGE_ITEMS, +}; + +static struct ac_spm_derived_component_descr gfx10_l1_cache_miss_count_comp = { + .id = AC_SPM_COMPONENT_L1_CACHE_MISS_COUNT, + .counter_id = AC_SPM_COUNTER_L1_CACHE_HIT, + .name = "Misses", + .usage = AC_SPM_USAGE_ITEMS, +}; + +/* L2 cache components. */ +static struct ac_spm_derived_component_descr gfx10_l2_cache_request_count_comp = { + .id = AC_SPM_COMPONENT_L2_CACHE_REQUEST_COUNT, + .counter_id = AC_SPM_COUNTER_L2_CACHE_HIT, + .name = "Requests", + .usage = AC_SPM_USAGE_ITEMS, +}; + +static struct ac_spm_derived_component_descr gfx10_l2_cache_hit_count_comp = { + .id = AC_SPM_COMPONENT_L2_CACHE_HIT_COUNT, + .counter_id = AC_SPM_COUNTER_L2_CACHE_HIT, + .name = "Hits", + .usage = AC_SPM_USAGE_ITEMS, +}; + +static struct ac_spm_derived_component_descr gfx10_l2_cache_miss_count_comp = { + .id = AC_SPM_COMPONENT_L2_CACHE_MISS_COUNT, + .counter_id = AC_SPM_COUNTER_L2_CACHE_HIT, + .name = "Misses", + .usage = AC_SPM_USAGE_ITEMS, +}; + +/* SPM counters. */ +static struct ac_spm_derived_counter_descr gfx10_inst_cache_hit_counter = { + .id = AC_SPM_COUNTER_INST_CACHE_HIT, + .group_id = AC_SPM_GROUP_CACHE, + .name = "Instruction cache hit", + .desc = "The percentage of read requests made that hit the data in the " + "Instruction cache. The Instruction cache supplies shader code to an " + "executing shader. Each request is 64 bytes in size. 
Value range: 0% " + "(no hit) to 100% (optimal).", + .usage = AC_SPM_USAGE_PERCENTAGE, + .num_components = 3, + .components = { + &gfx10_inst_cache_request_count_comp, + &gfx10_inst_cache_hit_count_comp, + &gfx10_inst_cache_miss_count_comp, + }, +}; + +static struct ac_spm_derived_counter_descr gfx10_scalar_cache_hit_counter = { + .id = AC_SPM_COUNTER_SCALAR_CACHE_HIT, + .group_id = AC_SPM_GROUP_CACHE, + .name = "Scalar cache hit", + .desc = "The percentage of read requests made from executing shader code " + "that hit the data in the Scalar cache. The Scalar cache contains data " + "that does not vary in each thread across the wavefront. Each request is " + "64 bytes in size. Value range: 0% (no hit) to 100% (optimal).", + .usage = AC_SPM_USAGE_PERCENTAGE, + .num_components = 3, + .components = { + &gfx10_scalar_cache_request_count_comp, + &gfx10_scalar_cache_hit_count_comp, + &gfx10_scalar_cache_miss_count_comp, + }, +}; + +static struct ac_spm_derived_counter_descr gfx10_l0_cache_hit_counter = { + .id = AC_SPM_COUNTER_L0_CACHE_HIT, + .group_id = AC_SPM_GROUP_CACHE, + .name = "L0 cache hit", + .desc = "The percentage of read requests that hit the data in the L0 cache. " + "The L0 cache contains vector data, which is data that may vary in each " + "thread across the wavefront. Each request is 128 bytes in size. Value " + "range: 0% (no hit) to 100% (optimal).", + .usage = AC_SPM_USAGE_PERCENTAGE, + .num_components = 3, + .components = { + &gfx10_l0_cache_request_count_comp, + &gfx10_l0_cache_hit_count_comp, + &gfx10_l0_cache_miss_count_comp, + }, +}; + +static struct ac_spm_derived_counter_descr gfx10_l1_cache_hit_counter = { + .id = AC_SPM_COUNTER_L1_CACHE_HIT, + .group_id = AC_SPM_GROUP_CACHE, + .name = "L1 cache hit", + .desc = "The percentage of read or write requests that hit the data in the " + "L1 cache. The L1 cache is shared across all WGPs in a single shader " + "engine. Each request is 128 bytes in size. Value range: 0% (no hit) to " + "100% (optimal).", + .usage = AC_SPM_USAGE_PERCENTAGE, + .num_components = 3, + .components = { + &gfx10_l1_cache_request_count_comp, + &gfx10_l1_cache_hit_count_comp, + &gfx10_l1_cache_miss_count_comp, + }, +}; + +static struct ac_spm_derived_counter_descr gfx10_l2_cache_hit_counter = { + .id = AC_SPM_COUNTER_L2_CACHE_HIT, + .group_id = AC_SPM_GROUP_CACHE, + .name = "L2 cache hit", + .desc = "The percentage of read or write requests that hit the data in the " + "L2 cache. The L2 cache is shared by many blocks across the GPU, " + "including the Command Processor, Geometry Engine, all WGPs, all Render " + "Backends, and others. Each request is 128 bytes in size. Value range: 0% " + "(no hit) to 100% (optimal).", + .usage = AC_SPM_USAGE_PERCENTAGE, + .num_components = 3, + .components = { + &gfx10_l2_cache_request_count_comp, + &gfx10_l2_cache_hit_count_comp, + &gfx10_l2_cache_miss_count_comp, + }, +}; + +/* SPM groups. 
*/ +static struct ac_spm_derived_group_descr gfx10_cache_group = { + .id = AC_SPM_GROUP_CACHE, + .name = "Cache", + .num_counters = 5, + .counters = { + &gfx10_inst_cache_hit_counter, + &gfx10_scalar_cache_hit_counter, + &gfx10_l0_cache_hit_counter, + &gfx10_l1_cache_hit_counter, + &gfx10_l2_cache_hit_counter, + }, +}; + +static struct ac_spm_derived_counter * +ac_spm_get_counter_by_id(struct ac_spm_derived_trace *spm_derived_trace, + enum ac_spm_counter_id counter_id) +{ + for (uint32_t i = 0; i < spm_derived_trace->num_counters; i++) { + struct ac_spm_derived_counter *counter = &spm_derived_trace->counters[i]; + + if (counter->descr->id == counter_id) + return counter; + } + + return NULL; +} + +static struct ac_spm_derived_component * +ac_spm_get_component_by_id(struct ac_spm_derived_trace *spm_derived_trace, + enum ac_spm_component_id component_id) +{ + for (uint32_t i = 0; i < spm_derived_trace->num_components; i++) { + struct ac_spm_derived_component *component = &spm_derived_trace->components[i]; + + if (component->descr->id == component_id) + return component; + } + + return NULL; +} + +static void +ac_spm_add_group(struct ac_spm_derived_trace *spm_derived_trace, + const struct ac_spm_derived_group_descr *group_descr) +{ + for (uint32_t i = 0; i < group_descr->num_counters; i++) { + const struct ac_spm_derived_counter_descr *counter_descr = + group_descr->counters[i]; + + for (uint32_t j = 0; j < counter_descr->num_components; j++) { + struct ac_spm_derived_component *component = + &spm_derived_trace->components[spm_derived_trace->num_components++]; + assert(spm_derived_trace->num_components <= AC_SPM_COMPONENT_COUNT); + + component->descr = counter_descr->components[j]; + } + + struct ac_spm_derived_counter *counter = + &spm_derived_trace->counters[spm_derived_trace->num_counters++]; + assert(spm_derived_trace->num_counters <= AC_SPM_COUNTER_COUNT); + counter->descr = counter_descr; + } + + struct ac_spm_derived_group *group = + &spm_derived_trace->groups[spm_derived_trace->num_groups++]; + assert(spm_derived_trace->num_groups <= AC_SPM_GROUP_COUNT); + group->descr = group_descr; +} + +static enum ac_spm_raw_counter_op +ac_spm_get_raw_counter_op(enum ac_spm_raw_counter_id id) +{ + switch (id) { + case AC_SPM_TCP_PERF_SEL_REQ: + case AC_SPM_TCP_PERF_SEL_REQ_MISS: + case AC_SPM_SQC_PERF_SEL_DCACHE_HITS: + case AC_SPM_SQC_PERF_SEL_DCACHE_MISSES: + case AC_SPM_SQC_PERF_SEL_DCACHE_MISSES_DUPLICATE: + case AC_SPM_SQC_PERF_SEL_ICACHE_HITS: + case AC_SPM_SQC_PERF_SEL_ICACHE_MISSES: + case AC_SPM_SQC_PERF_SEL_ICACHE_MISSES_DUPLICATE: + case AC_SPM_GL1C_PERF_SEL_REQ: + case AC_SPM_GL1C_PERF_SEL_REQ_MISS: + case AC_SPM_GL2C_PERF_SEL_REQ: + case AC_SPM_GL2C_PERF_SEL_MISS: + return AC_SPM_RAW_COUNTER_OP_SUM; + default: + UNREACHABLE("Invalid SPM raw counter ID."); + } +} + +struct ac_spm_derived_trace * +ac_spm_get_derived_trace(const struct radeon_info *info, + const struct ac_spm_trace *spm_trace) +{ + uint32_t sample_size_in_bytes = spm_trace->sample_size_in_bytes; + uint8_t *spm_data_ptr = (uint8_t *)spm_trace->ptr; + struct ac_spm_derived_trace *spm_derived_trace; + + spm_derived_trace = calloc(1, sizeof(*spm_derived_trace)); + if (!spm_derived_trace) + return NULL; + + /* Add groups to the trace. */ + ac_spm_add_group(spm_derived_trace, &gfx10_cache_group); + + spm_derived_trace->timestamps = malloc(spm_trace->num_samples * sizeof(uint64_t)); + if (!spm_derived_trace->timestamps) { + free(spm_derived_trace); + return NULL; + } + + /* Skip the reserved 32 bytes of data at beginning. 
*/ + spm_data_ptr += 32; + + /* Collect timestamps. */ + uint64_t sample_size_in_qwords = sample_size_in_bytes / sizeof(uint64_t); + uint64_t *timestamp_ptr = (uint64_t *)spm_data_ptr; + + for (uint32_t i = 0; i < spm_trace->num_samples; i++) { + uint64_t index = i * sample_size_in_qwords; + uint64_t timestamp = timestamp_ptr[index]; + + spm_derived_trace->timestamps[i] = timestamp; + } + + /* Collect raw counter values. */ + uint64_t *raw_counter_values[AC_SPM_RAW_COUNTER_ID_COUNT]; + for (uint32_t i = 0; i < AC_SPM_RAW_COUNTER_ID_COUNT; i++) { + raw_counter_values[i] = calloc(spm_trace->num_samples, sizeof(uint64_t)); + } + + const uint32_t sample_size_in_hwords = sample_size_in_bytes / sizeof(uint16_t); + const uint16_t *counter_values_ptr = (uint16_t *)spm_data_ptr; + + for (uint32_t c = 0; c < spm_trace->num_counters; c++) { + const uint64_t offset = spm_trace->counters[c].offset; + const uint32_t id = spm_trace->counters[c].id; + const enum ac_spm_raw_counter_op op = ac_spm_get_raw_counter_op(id); + + for (uint32_t s = 0; s < spm_trace->num_samples; s++) { + const uint64_t index = offset + (s * sample_size_in_hwords); + const uint16_t value = counter_values_ptr[index]; + + switch (op) { + case AC_SPM_RAW_COUNTER_OP_SUM: + raw_counter_values[id][s] += value; + break; + default: + UNREACHABLE("Invalid SPM raw counter OP.\n"); + } + } + } + +#define GET_COMPONENT(n) \ + struct ac_spm_derived_component *_##n = \ + ac_spm_get_component_by_id(spm_derived_trace, AC_SPM_COMPONENT_##n); +#define GET_COUNTER(n) \ + struct ac_spm_derived_counter *_##n = \ + ac_spm_get_counter_by_id(spm_derived_trace, AC_SPM_COUNTER_##n); + + GET_COUNTER(INST_CACHE_HIT); + GET_COUNTER(SCALAR_CACHE_HIT); + GET_COUNTER(L0_CACHE_HIT); + GET_COUNTER(L1_CACHE_HIT); + GET_COUNTER(L2_CACHE_HIT); + + GET_COMPONENT(INST_CACHE_REQUEST_COUNT); + GET_COMPONENT(INST_CACHE_HIT_COUNT); + GET_COMPONENT(INST_CACHE_MISS_COUNT); + GET_COMPONENT(SCALAR_CACHE_REQUEST_COUNT); + GET_COMPONENT(SCALAR_CACHE_HIT_COUNT); + GET_COMPONENT(SCALAR_CACHE_MISS_COUNT); + GET_COMPONENT(L0_CACHE_REQUEST_COUNT); + GET_COMPONENT(L0_CACHE_HIT_COUNT); + GET_COMPONENT(L0_CACHE_MISS_COUNT); + GET_COMPONENT(L1_CACHE_REQUEST_COUNT); + GET_COMPONENT(L1_CACHE_HIT_COUNT); + GET_COMPONENT(L1_CACHE_MISS_COUNT); + GET_COMPONENT(L2_CACHE_REQUEST_COUNT); + GET_COMPONENT(L2_CACHE_HIT_COUNT); + GET_COMPONENT(L2_CACHE_MISS_COUNT); + +#undef GET_COMPONENT +#undef GET_COUNTER + +#define ADD(id, value) \ + util_dynarray_append(&_##id->values, (double)(value)); + +#define OP_RAW(n) \ + raw_counter_values[AC_SPM_##n][s] +#define OP_SUM2(a, b) \ + raw_counter_values[AC_SPM_##a][s] + \ + raw_counter_values[AC_SPM_##b][s] +#define OP_SUM3(a, b, c) \ + raw_counter_values[AC_SPM_##a][s] + \ + raw_counter_values[AC_SPM_##b][s] + \ + raw_counter_values[AC_SPM_##c][s] +#define OP_SUB2(a, b) \ + raw_counter_values[AC_SPM_##a][s] - \ + raw_counter_values[AC_SPM_##b][s] + + for (uint32_t s = 0; s < spm_trace->num_samples; s++) { + /* Cache group. */ + /* Instruction cache. */ + const double inst_cache_request_count = + OP_SUM3(SQC_PERF_SEL_ICACHE_HITS, SQC_PERF_SEL_ICACHE_MISSES, SQC_PERF_SEL_ICACHE_MISSES_DUPLICATE); + const double inst_cache_hit_count = + OP_RAW(SQC_PERF_SEL_ICACHE_HITS); + const double inst_cache_miss_count = + OP_SUM2(SQC_PERF_SEL_ICACHE_MISSES, SQC_PERF_SEL_ICACHE_MISSES_DUPLICATE); + const double inst_cache_hit = + inst_cache_request_count ? 
(inst_cache_hit_count / inst_cache_request_count) * 100.0f : 0.0f; + + ADD(INST_CACHE_REQUEST_COUNT, inst_cache_request_count); + ADD(INST_CACHE_HIT_COUNT, inst_cache_hit_count); + ADD(INST_CACHE_MISS_COUNT, inst_cache_miss_count); + ADD(INST_CACHE_HIT, inst_cache_hit); + + /* Scalar cache. */ + const double scalar_cache_request_count = + OP_SUM3(SQC_PERF_SEL_DCACHE_HITS, SQC_PERF_SEL_DCACHE_MISSES, SQC_PERF_SEL_DCACHE_MISSES_DUPLICATE); + const double scalar_cache_hit_count = + OP_RAW(SQC_PERF_SEL_DCACHE_HITS); + const double scalar_cache_miss_count = + OP_SUM2(SQC_PERF_SEL_DCACHE_MISSES, SQC_PERF_SEL_DCACHE_MISSES_DUPLICATE); + const double scalar_cache_hit = + scalar_cache_request_count ? (scalar_cache_hit_count / scalar_cache_request_count) * 100.0f : 0.0f; + + ADD(SCALAR_CACHE_REQUEST_COUNT, scalar_cache_request_count); + ADD(SCALAR_CACHE_HIT_COUNT, scalar_cache_hit_count); + ADD(SCALAR_CACHE_MISS_COUNT, scalar_cache_miss_count); + ADD(SCALAR_CACHE_HIT, scalar_cache_hit); + + /* L0 cache. */ + const double l0_cache_request_count = OP_RAW(TCP_PERF_SEL_REQ); + const double l0_cache_hit_count = OP_SUB2(TCP_PERF_SEL_REQ, TCP_PERF_SEL_REQ_MISS); + const double l0_cache_miss_count = OP_RAW(TCP_PERF_SEL_REQ_MISS); + const double l0_cache_hit = + l0_cache_request_count ? (l0_cache_hit_count / l0_cache_request_count) * 100.0f : 0.0f; + + ADD(L0_CACHE_REQUEST_COUNT, l0_cache_request_count); + ADD(L0_CACHE_HIT_COUNT, l0_cache_hit_count); + ADD(L0_CACHE_MISS_COUNT, l0_cache_miss_count); + ADD(L0_CACHE_HIT, l0_cache_hit); + + /* L1 cache. */ + const double l1_cache_request_count = OP_RAW(GL1C_PERF_SEL_REQ); + const double l1_cache_hit_count = OP_SUB2(GL1C_PERF_SEL_REQ, GL1C_PERF_SEL_REQ_MISS); + const double l1_cache_miss_count = OP_RAW(GL1C_PERF_SEL_REQ_MISS); + const double l1_cache_hit = + l1_cache_request_count ? (l1_cache_hit_count / l1_cache_request_count) * 100.0f : 0.0f; + + ADD(L1_CACHE_REQUEST_COUNT, l1_cache_request_count); + ADD(L1_CACHE_HIT_COUNT, l1_cache_hit_count); + ADD(L1_CACHE_MISS_COUNT, l1_cache_miss_count); + ADD(L1_CACHE_HIT, l1_cache_hit); + + /* L2 cache. */ + const double l2_cache_request_count = OP_RAW(GL2C_PERF_SEL_REQ); + const double l2_cache_hit_count = OP_SUB2(GL2C_PERF_SEL_REQ, GL2C_PERF_SEL_MISS); + const double l2_cache_miss_count = OP_RAW(GL2C_PERF_SEL_MISS); + const double l2_cache_hit = + l2_cache_request_count ? 
(l2_cache_hit_count / l2_cache_request_count) * 100.0f : 0.0f; + + ADD(L2_CACHE_REQUEST_COUNT, l2_cache_request_count); + ADD(L2_CACHE_HIT_COUNT, l2_cache_hit_count); + ADD(L2_CACHE_MISS_COUNT, l2_cache_miss_count); + ADD(L2_CACHE_HIT, l2_cache_hit); + } + +#undef ADD +#undef OP_RAW +#undef OP_SUM2 +#undef OP_SUM3 +#undef OP_SUB2 + + spm_derived_trace->num_timestamps = spm_trace->num_samples; + spm_derived_trace->sample_interval = spm_trace->sample_interval; + + for (uint32_t i = 0; i < AC_SPM_RAW_COUNTER_ID_COUNT; i++) + free(raw_counter_values[i]); + + return spm_derived_trace; +} + +void +ac_spm_destroy_derived_trace(struct ac_spm_derived_trace *spm_derived_trace) +{ + for (uint32_t i = 0; i < spm_derived_trace->num_components; i++) { + struct ac_spm_derived_component *component = &spm_derived_trace->components[i]; + util_dynarray_fini(&component->values); + } + + for (uint32_t i = 0; i < spm_derived_trace->num_counters; i++) { + struct ac_spm_derived_counter *counter = &spm_derived_trace->counters[i]; + util_dynarray_fini(&counter->values); + } + + free(spm_derived_trace); +} + static void ac_emit_spm_muxsel(struct ac_cmdbuf *cs, enum amd_gfx_level gfx_level, enum amd_ip_type ip_type, const struct ac_spm *spm) diff --git a/src/amd/common/ac_spm.h b/src/amd/common/ac_spm.h index f21b2c59c7e..47f0915a21b 100644 --- a/src/amd/common/ac_spm.h +++ b/src/amd/common/ac_spm.h @@ -11,6 +11,8 @@ #include "ac_perfcounter.h" +#include "util/u_dynarray.h" + struct ac_cmdbuf; #define AC_SPM_MAX_COUNTER_PER_BLOCK 16 @@ -102,6 +104,10 @@ enum ac_spm_raw_counter_id { AC_SPM_RAW_COUNTER_ID_COUNT, }; +enum ac_spm_raw_counter_op { + AC_SPM_RAW_COUNTER_OP_SUM = 0, +}; + struct ac_spm_counter_descr { enum ac_spm_raw_counter_id id; enum ac_pc_gpu_block gpu_block; @@ -212,6 +218,103 @@ struct ac_spm_trace { uint32_t num_samples; }; +enum ac_spm_group_id { + AC_SPM_GROUP_CACHE, + AC_SPM_GROUP_COUNT, +}; + +enum ac_spm_counter_id { + AC_SPM_COUNTER_INST_CACHE_HIT, + AC_SPM_COUNTER_SCALAR_CACHE_HIT, + AC_SPM_COUNTER_L0_CACHE_HIT, + AC_SPM_COUNTER_L1_CACHE_HIT, /* < GFX12 */ + AC_SPM_COUNTER_L2_CACHE_HIT, + AC_SPM_COUNTER_COUNT, +}; + +enum ac_spm_component_id { + AC_SPM_COMPONENT_INST_CACHE_REQUEST_COUNT, + AC_SPM_COMPONENT_INST_CACHE_HIT_COUNT, + AC_SPM_COMPONENT_INST_CACHE_MISS_COUNT, + AC_SPM_COMPONENT_SCALAR_CACHE_REQUEST_COUNT, + AC_SPM_COMPONENT_SCALAR_CACHE_HIT_COUNT, + AC_SPM_COMPONENT_SCALAR_CACHE_MISS_COUNT, + AC_SPM_COMPONENT_L0_CACHE_REQUEST_COUNT, + AC_SPM_COMPONENT_L0_CACHE_HIT_COUNT, + AC_SPM_COMPONENT_L0_CACHE_MISS_COUNT, + AC_SPM_COMPONENT_L1_CACHE_REQUEST_COUNT, /* < GFX12 */ + AC_SPM_COMPONENT_L1_CACHE_HIT_COUNT, /* < GFX12 */ + AC_SPM_COMPONENT_L1_CACHE_MISS_COUNT, /* < GFX12 */ + AC_SPM_COMPONENT_L2_CACHE_REQUEST_COUNT, + AC_SPM_COMPONENT_L2_CACHE_HIT_COUNT, + AC_SPM_COMPONENT_L2_CACHE_MISS_COUNT, + AC_SPM_COMPONENT_COUNT, +}; + +enum ac_spm_usage_type { + AC_SPM_USAGE_PERCENTAGE = 1, + AC_SPM_USAGE_ITEMS = 5, +}; + +#define AC_SPM_MAX_COMPONENTS_PER_COUNTER 3 +#define AC_SPM_MAX_COUNTERS_PER_GROUP 5 + +struct ac_spm_derived_component_descr { + enum ac_spm_component_id id; + enum ac_spm_counter_id counter_id; + const char *name; + enum ac_spm_usage_type usage; +}; + +struct ac_spm_derived_counter_descr { + enum ac_spm_counter_id id; + enum ac_spm_group_id group_id; + const char *name; + const char *desc; + enum ac_spm_usage_type usage; + uint32_t num_components; + struct ac_spm_derived_component_descr *components[AC_SPM_MAX_COMPONENTS_PER_COUNTER]; +}; + +struct ac_spm_derived_group_descr { 
+ enum ac_spm_group_id id; + const char *name; + uint32_t num_counters; + struct ac_spm_derived_counter_descr *counters[AC_SPM_MAX_COUNTERS_PER_GROUP]; +}; + +struct ac_spm_derived_group { + const struct ac_spm_derived_group_descr *descr; +}; + +struct ac_spm_derived_counter { + const struct ac_spm_derived_counter_descr *descr; + + struct util_dynarray values; +}; + +struct ac_spm_derived_component { + const struct ac_spm_derived_component_descr *descr; + + struct util_dynarray values; +}; + +struct ac_spm_derived_trace { + uint32_t num_timestamps; + uint64_t *timestamps; + + uint32_t num_groups; + struct ac_spm_derived_group groups[AC_SPM_GROUP_COUNT]; + + uint32_t num_counters; + struct ac_spm_derived_counter counters[AC_SPM_COUNTER_COUNT]; + + uint32_t num_components; + struct ac_spm_derived_component components[AC_SPM_COMPONENT_COUNT]; + + uint32_t sample_interval; +}; + bool ac_init_spm(const struct radeon_info *info, const struct ac_perfcounters *pc, struct ac_spm *spm); @@ -219,6 +322,13 @@ void ac_destroy_spm(struct ac_spm *spm); bool ac_spm_get_trace(const struct ac_spm *spm, struct ac_spm_trace *trace); +struct ac_spm_derived_trace * +ac_spm_get_derived_trace(const struct radeon_info *info, + const struct ac_spm_trace *spm_trace); + +void +ac_spm_destroy_derived_trace(struct ac_spm_derived_trace *spm_derived_trace); + void ac_emit_spm_setup(struct ac_cmdbuf *cs, enum amd_gfx_level gfx_level, enum amd_ip_type ip_type, const struct ac_spm *spm, From a729b12f59edb852a61d7168d5878a8ccf941300 Mon Sep 17 00:00:00 2001 From: Samuel Pitoiset Date: Wed, 17 Dec 2025 17:56:08 +0100 Subject: [PATCH 11/15] ac/spm: add support for new LDS counters in RGP 2.6 Signed-off-by: Samuel Pitoiset --- src/amd/common/ac_spm.c | 68 +++++++++++++++++++++++++++++++++++++++++ src/amd/common/ac_spm.h | 7 +++++ 2 files changed, 75 insertions(+) diff --git a/src/amd/common/ac_spm.c b/src/amd/common/ac_spm.c index 7115fddcff4..06ccc56aaf4 100644 --- a/src/amd/common/ac_spm.c +++ b/src/amd/common/ac_spm.c @@ -37,6 +37,10 @@ static struct ac_spm_counter_descr gfx10_gl2c_perf_sel_req = {AC_SPM_GL2C_PERF_SEL_REQ, GL2C, 0x3}; static struct ac_spm_counter_descr gfx10_gl2c_perf_sel_miss = {AC_SPM_GL2C_PERF_SEL_MISS, GL2C, 0x23}; +static struct ac_spm_counter_descr gfx10_cpf_perf_sel_stat_busy = + {AC_SPM_CPF_PERF_SEL_STAT_BUSY, CPF, 0x18}; +static struct ac_spm_counter_descr gfx10_sqc_perf_sel_lds_bank_conflict = + {AC_SPM_SQC_PERF_SEL_LDS_BANK_CONFLICT, SQ, 0x11d}; static struct ac_spm_counter_create_info gfx10_spm_counters[] = { {&gfx10_tcp_perf_sel_req}, @@ -51,6 +55,8 @@ static struct ac_spm_counter_create_info gfx10_spm_counters[] = { {&gfx10_gl1c_perf_sel_req_miss}, {&gfx10_gl2c_perf_sel_req}, {&gfx10_gl2c_perf_sel_miss}, + {&gfx10_cpf_perf_sel_stat_busy}, + {&gfx10_sqc_perf_sel_lds_bank_conflict}, }; /* GFX10.3+ */ @@ -70,6 +76,8 @@ static struct ac_spm_counter_create_info gfx103_spm_counters[] = { {&gfx10_gl1c_perf_sel_req_miss}, {&gfx10_gl2c_perf_sel_req}, {&gfx103_gl2c_perf_sel_miss}, + {&gfx10_cpf_perf_sel_stat_busy}, + {&gfx10_sqc_perf_sel_lds_bank_conflict}, }; /* GFX11+ */ @@ -87,6 +95,8 @@ static struct ac_spm_counter_descr gfx11_sqc_perf_sel_icache_misses = {AC_SPM_SQC_PERF_SEL_ICACHE_MISSES, SQ_WGP, 0x10f}; static struct ac_spm_counter_descr gfx11_sqc_perf_sel_icache_misses_duplicate = {AC_SPM_SQC_PERF_SEL_ICACHE_MISSES_DUPLICATE, SQ_WGP, 0x110}; +static struct ac_spm_counter_descr gfx11_sqc_perf_sel_lds_bank_conflict = + {AC_SPM_SQC_PERF_SEL_LDS_BANK_CONFLICT, SQ_WGP, 0x100}; static struct 
ac_spm_counter_create_info gfx11_spm_counters[] = { {&gfx10_tcp_perf_sel_req}, @@ -101,6 +111,8 @@ static struct ac_spm_counter_create_info gfx11_spm_counters[] = { {&gfx10_gl1c_perf_sel_req_miss}, {&gfx10_gl2c_perf_sel_req}, {&gfx103_gl2c_perf_sel_miss}, + {&gfx10_cpf_perf_sel_stat_busy}, + {&gfx11_sqc_perf_sel_lds_bank_conflict}, }; /* GFX12+ */ @@ -838,6 +850,20 @@ static struct ac_spm_derived_component_descr gfx10_l2_cache_miss_count_comp = { .usage = AC_SPM_USAGE_ITEMS, }; +static struct ac_spm_derived_component_descr gfx10_gpu_busy_cycles_comp = { + .id = AC_SPM_COMPONENT_GPU_BUSY_CYCLES, + .counter_id = AC_SPM_COUNTER_CS_LDS_BANK_CONFLICT, + .name = "Gpu Busy Cycles", + .usage = AC_SPM_USAGE_CYCLES, +}; + +static struct ac_spm_derived_component_descr gfx10_cs_lds_bank_conflict_cycles_comp = { + .id = AC_SPM_COMPONENT_CS_LDS_BANK_CONFLICT_CYCLES, + .counter_id = AC_SPM_COUNTER_CS_LDS_BANK_CONFLICT, + .name = "LDS Busy Cycles", + .usage = AC_SPM_USAGE_CYCLES, +}; + /* SPM counters. */ static struct ac_spm_derived_counter_descr gfx10_inst_cache_hit_counter = { .id = AC_SPM_COUNTER_INST_CACHE_HIT, @@ -925,6 +951,20 @@ static struct ac_spm_derived_counter_descr gfx10_l2_cache_hit_counter = { }, }; +static struct ac_spm_derived_counter_descr gfx10_cs_lds_bank_conflict_counter = { + .id = AC_SPM_COUNTER_CS_LDS_BANK_CONFLICT, + .group_id = AC_SPM_GROUP_LDS, + .name = "LDS Bank Conflict", + .desc = "The percentage of GPUTime LDS is stalled by bank conflicts. Value " + "range: 0% (optimal) to 100% (bad).", + .usage = AC_SPM_USAGE_PERCENTAGE, + .num_components = 2, + .components = { + &gfx10_gpu_busy_cycles_comp, + &gfx10_cs_lds_bank_conflict_cycles_comp, + }, +}; + /* SPM groups. */ static struct ac_spm_derived_group_descr gfx10_cache_group = { .id = AC_SPM_GROUP_CACHE, @@ -939,6 +979,15 @@ static struct ac_spm_derived_group_descr gfx10_cache_group = { }, }; +static struct ac_spm_derived_group_descr gfx10_lds_group = { + .id = AC_SPM_GROUP_LDS, + .name = "LDS", + .num_counters = 1, + .counters = { + &gfx10_cs_lds_bank_conflict_counter, + }, +}; + static struct ac_spm_derived_counter * ac_spm_get_counter_by_id(struct ac_spm_derived_trace *spm_derived_trace, enum ac_spm_counter_id counter_id) @@ -1011,6 +1060,8 @@ ac_spm_get_raw_counter_op(enum ac_spm_raw_counter_id id) case AC_SPM_GL1C_PERF_SEL_REQ_MISS: case AC_SPM_GL2C_PERF_SEL_REQ: case AC_SPM_GL2C_PERF_SEL_MISS: + case AC_SPM_CPF_PERF_SEL_STAT_BUSY: + case AC_SPM_SQC_PERF_SEL_LDS_BANK_CONFLICT: return AC_SPM_RAW_COUNTER_OP_SUM; default: UNREACHABLE("Invalid SPM raw counter ID."); @@ -1031,6 +1082,7 @@ ac_spm_get_derived_trace(const struct radeon_info *info, /* Add groups to the trace. 
    */
    ac_spm_add_group(spm_derived_trace, &gfx10_cache_group);
+   ac_spm_add_group(spm_derived_trace, &gfx10_lds_group);

    spm_derived_trace->timestamps = malloc(spm_trace->num_samples * sizeof(uint64_t));
    if (!spm_derived_trace->timestamps) {
@@ -1092,6 +1144,7 @@ ac_spm_get_derived_trace(const struct radeon_info *info,
    GET_COUNTER(L0_CACHE_HIT);
    GET_COUNTER(L1_CACHE_HIT);
    GET_COUNTER(L2_CACHE_HIT);
+   GET_COUNTER(CS_LDS_BANK_CONFLICT);

    GET_COMPONENT(INST_CACHE_REQUEST_COUNT);
    GET_COMPONENT(INST_CACHE_HIT_COUNT);
@@ -1108,6 +1161,8 @@ ac_spm_get_derived_trace(const struct radeon_info *info,
    GET_COMPONENT(L2_CACHE_REQUEST_COUNT);
    GET_COMPONENT(L2_CACHE_HIT_COUNT);
    GET_COMPONENT(L2_CACHE_MISS_COUNT);
+   GET_COMPONENT(GPU_BUSY_CYCLES);
+   GET_COMPONENT(CS_LDS_BANK_CONFLICT_CYCLES);

 #undef GET_COMPONENT
 #undef GET_COUNTER
@@ -1128,6 +1183,8 @@ ac_spm_get_derived_trace(const struct radeon_info *info,
    raw_counter_values[AC_SPM_##a][s] - \
    raw_counter_values[AC_SPM_##b][s]

+   const uint32_t num_simds = info->num_cu * info->num_simd_per_compute_unit;
+
    for (uint32_t s = 0; s < spm_trace->num_samples; s++) {
       /* Cache group. */
       /* Instruction cache. */
@@ -1195,6 +1252,17 @@ ac_spm_get_derived_trace(const struct radeon_info *info,
       ADD(L2_CACHE_HIT_COUNT, l2_cache_hit_count);
       ADD(L2_CACHE_MISS_COUNT, l2_cache_miss_count);
       ADD(L2_CACHE_HIT, l2_cache_hit);
+
+      /* LDS group */
+      /* CS LDS Bank Conflict. */
+      const double gpu_busy_cycles = OP_RAW(CPF_PERF_SEL_STAT_BUSY);
+      const double cs_lds_bank_conflict_cycles = OP_RAW(SQC_PERF_SEL_LDS_BANK_CONFLICT) / (double)num_simds;
+      const double cs_lds_bank_conflict =
+         gpu_busy_cycles ? (cs_lds_bank_conflict_cycles / gpu_busy_cycles) * 100.0f : 0.0f;
+
+      ADD(GPU_BUSY_CYCLES, gpu_busy_cycles);
+      ADD(CS_LDS_BANK_CONFLICT_CYCLES, cs_lds_bank_conflict_cycles);
+      ADD(CS_LDS_BANK_CONFLICT, cs_lds_bank_conflict);
    }

 #undef ADD
diff --git a/src/amd/common/ac_spm.h b/src/amd/common/ac_spm.h
index 47f0915a21b..0512ebecfac 100644
--- a/src/amd/common/ac_spm.h
+++ b/src/amd/common/ac_spm.h
@@ -101,6 +101,8 @@ enum ac_spm_raw_counter_id {
    AC_SPM_GL1C_PERF_SEL_REQ_MISS,
    AC_SPM_GL2C_PERF_SEL_REQ,
    AC_SPM_GL2C_PERF_SEL_MISS,
+   AC_SPM_CPF_PERF_SEL_STAT_BUSY,
+   AC_SPM_SQC_PERF_SEL_LDS_BANK_CONFLICT,
    AC_SPM_RAW_COUNTER_ID_COUNT,
 };

@@ -220,6 +222,7 @@ struct ac_spm_trace {

 enum ac_spm_group_id {
    AC_SPM_GROUP_CACHE,
+   AC_SPM_GROUP_LDS,
    AC_SPM_GROUP_COUNT,
 };

@@ -229,6 +232,7 @@ enum ac_spm_counter_id {
    AC_SPM_COUNTER_L0_CACHE_HIT,
    AC_SPM_COUNTER_L1_CACHE_HIT, /* < GFX12 */
    AC_SPM_COUNTER_L2_CACHE_HIT,
+   AC_SPM_COUNTER_CS_LDS_BANK_CONFLICT,
    AC_SPM_COUNTER_COUNT,
 };

@@ -248,11 +252,14 @@ enum ac_spm_component_id {
    AC_SPM_COMPONENT_L2_CACHE_REQUEST_COUNT,
    AC_SPM_COMPONENT_L2_CACHE_HIT_COUNT,
    AC_SPM_COMPONENT_L2_CACHE_MISS_COUNT,
+   AC_SPM_COMPONENT_GPU_BUSY_CYCLES,
+   AC_SPM_COMPONENT_CS_LDS_BANK_CONFLICT_CYCLES,
    AC_SPM_COMPONENT_COUNT,
 };

 enum ac_spm_usage_type {
    AC_SPM_USAGE_PERCENTAGE = 1,
+   AC_SPM_USAGE_CYCLES = 2,
    AC_SPM_USAGE_ITEMS = 5,
 };

From 74d5b94e6baf2cce6e4ab240e36b4cd0b9752aa9 Mon Sep 17 00:00:00 2001
From: Samuel Pitoiset
Date: Wed, 17 Dec 2025 17:57:14 +0100
Subject: [PATCH 12/15] ac/spm: add support for new Memory bytes counters in RGP 2.6

Signed-off-by: Samuel Pitoiset
---
 src/amd/common/ac_spm.c | 143 ++++++++++++++++++++++++++++++++++++++++
 src/amd/common/ac_spm.h |  14 ++++
 2 files changed, 157 insertions(+)

diff --git a/src/amd/common/ac_spm.c b/src/amd/common/ac_spm.c
index 06ccc56aaf4..70bb9802ec7 100644
--- a/src/amd/common/ac_spm.c
+++ b/src/amd/common/ac_spm.c
@@ -41,6 +41,22 @@ static struct ac_spm_counter_descr gfx10_cpf_perf_sel_stat_busy =
    {AC_SPM_CPF_PERF_SEL_STAT_BUSY, CPF, 0x18};
 static struct ac_spm_counter_descr gfx10_sqc_perf_sel_lds_bank_conflict =
    {AC_SPM_SQC_PERF_SEL_LDS_BANK_CONFLICT, SQ, 0x11d};
+static struct ac_spm_counter_descr gfx10_gl2c_perf_sel_ea_rdreq_32b =
+   {AC_SPM_GL2C_PERF_SEL_EA_RDREQ_32B, GL2C, 0x59};
+static struct ac_spm_counter_descr gfx10_gl2c_perf_sel_ea_rdreq_64b =
+   {AC_SPM_GL2C_PERF_SEL_EA_RDREQ_64B, GL2C, 0x5a};
+static struct ac_spm_counter_descr gfx10_gl2c_perf_sel_ea_rdreq_96b =
+   {AC_SPM_GL2C_PERF_SEL_EA_RDREQ_96B, GL2C, 0x5b};
+static struct ac_spm_counter_descr gfx10_gl2c_perf_sel_ea_rdreq_128b =
+   {AC_SPM_GL2C_PERF_SEL_EA_RDREQ_128B, GL2C, 0x5c};
+static struct ac_spm_counter_descr gfx10_gl2c_perf_sel_ea_wrreq =
+   {AC_SPM_GL2C_PERF_SEL_EA_WRREQ, GL2C, 0x4b};
+static struct ac_spm_counter_descr gfx10_gl2c_perf_sel_ea_wrreq_64b =
+   {AC_SPM_GL2C_PERF_SEL_EA_WRREQ_64B, GL2C, 0x4c};
+static struct ac_spm_counter_descr gfx10_gcea_perf_sel_sarb_dram_sized_requests =
+   {AC_SPM_GCEA_PERF_SEL_SARB_DRAM_SIZED_REQUESTS, GCEA, 0x37};
+static struct ac_spm_counter_descr gfx10_gcea_perf_sel_sarb_io_sized_requests =
+   {AC_SPM_GCEA_PERF_SEL_SARB_IO_SIZED_REQUESTS, GCEA, 0x39};

 static struct ac_spm_counter_create_info gfx10_spm_counters[] = {
    {&gfx10_tcp_perf_sel_req},
@@ -57,11 +73,31 @@ static struct ac_spm_counter_create_info gfx10_spm_counters[] = {
    {&gfx10_gl2c_perf_sel_miss},
    {&gfx10_cpf_perf_sel_stat_busy},
    {&gfx10_sqc_perf_sel_lds_bank_conflict},
+   {&gfx10_gl2c_perf_sel_ea_rdreq_32b},
+   {&gfx10_gl2c_perf_sel_ea_rdreq_64b},
+   {&gfx10_gl2c_perf_sel_ea_rdreq_96b},
+   {&gfx10_gl2c_perf_sel_ea_rdreq_128b},
+   {&gfx10_gl2c_perf_sel_ea_wrreq},
+   {&gfx10_gl2c_perf_sel_ea_wrreq_64b},
+   {&gfx10_gcea_perf_sel_sarb_dram_sized_requests},
+   {&gfx10_gcea_perf_sel_sarb_io_sized_requests},
 };

 /* GFX10.3+ */
 static struct ac_spm_counter_descr gfx103_gl2c_perf_sel_miss =
    {AC_SPM_GL2C_PERF_SEL_MISS, GL2C, 0x2b};
+static struct ac_spm_counter_descr gfx103_gl2c_perf_sel_ea_rdreq_32b =
+   {AC_SPM_GL2C_PERF_SEL_EA_RDREQ_32B, GL2C, 0x63};
+static struct ac_spm_counter_descr gfx103_gl2c_perf_sel_ea_rdreq_64b =
+   {AC_SPM_GL2C_PERF_SEL_EA_RDREQ_64B, GL2C, 0x64};
+static struct ac_spm_counter_descr gfx103_gl2c_perf_sel_ea_rdreq_96b =
+   {AC_SPM_GL2C_PERF_SEL_EA_RDREQ_96B, GL2C, 0x65};
+static struct ac_spm_counter_descr gfx103_gl2c_perf_sel_ea_rdreq_128b =
+   {AC_SPM_GL2C_PERF_SEL_EA_RDREQ_128B, GL2C, 0x66};
+static struct ac_spm_counter_descr gfx103_gl2c_perf_sel_ea_wrreq =
+   {AC_SPM_GL2C_PERF_SEL_EA_WRREQ, GL2C, 0x53};
+static struct ac_spm_counter_descr gfx103_gl2c_perf_sel_ea_wrreq_64b =
+   {AC_SPM_GL2C_PERF_SEL_EA_WRREQ_64B, GL2C, 0x55};

 static struct ac_spm_counter_create_info gfx103_spm_counters[] = {
    {&gfx10_tcp_perf_sel_req},
@@ -78,6 +114,14 @@ static struct ac_spm_counter_create_info gfx103_spm_counters[] = {
    {&gfx103_gl2c_perf_sel_miss},
    {&gfx10_cpf_perf_sel_stat_busy},
    {&gfx10_sqc_perf_sel_lds_bank_conflict},
+   {&gfx103_gl2c_perf_sel_ea_rdreq_32b},
+   {&gfx103_gl2c_perf_sel_ea_rdreq_64b},
+   {&gfx103_gl2c_perf_sel_ea_rdreq_96b},
+   {&gfx103_gl2c_perf_sel_ea_rdreq_128b},
+   {&gfx103_gl2c_perf_sel_ea_wrreq},
+   {&gfx103_gl2c_perf_sel_ea_wrreq_64b},
+   {&gfx10_gcea_perf_sel_sarb_dram_sized_requests},
+   {&gfx10_gcea_perf_sel_sarb_io_sized_requests},
 };

 /* GFX11+ */
@@ -113,6 +157,14 @@ static struct ac_spm_counter_create_info gfx11_spm_counters[] = {
    {&gfx103_gl2c_perf_sel_miss},
    {&gfx10_cpf_perf_sel_stat_busy},
    {&gfx11_sqc_perf_sel_lds_bank_conflict},
+   {&gfx103_gl2c_perf_sel_ea_rdreq_32b},
+   {&gfx103_gl2c_perf_sel_ea_rdreq_64b},
+   {&gfx103_gl2c_perf_sel_ea_rdreq_96b},
+   {&gfx103_gl2c_perf_sel_ea_rdreq_128b},
+   {&gfx103_gl2c_perf_sel_ea_wrreq},
+   {&gfx103_gl2c_perf_sel_ea_wrreq_64b},
+   {&gfx10_gcea_perf_sel_sarb_dram_sized_requests},
+   {&gfx10_gcea_perf_sel_sarb_io_sized_requests},
 };

 /* GFX12+ */
@@ -965,6 +1017,46 @@ static struct ac_spm_derived_counter_descr gfx10_cs_lds_bank_conflict_counter =
    },
 };

+static struct ac_spm_derived_counter_descr gfx10_fetch_size_counter = {
+   .id = AC_SPM_COUNTER_FETCH_SIZE,
+   .group_id = AC_SPM_GROUP_MEMORY_BYTES,
+   .name = "Fetch size",
+   .desc = "The total bytes fetched from the video memory. This is measured "
+           "with all extra fetches and any cache or memory effects taken into "
+           "account.",
+   .usage = AC_SPM_USAGE_BYTES,
+   .num_components = 0,
+};
+
+static struct ac_spm_derived_counter_descr gfx10_write_size_counter = {
+   .id = AC_SPM_COUNTER_WRITE_SIZE,
+   .group_id = AC_SPM_GROUP_MEMORY_BYTES,
+   .name = "Write size",
+   .desc = "The total bytes written to the video memory. This is measured with "
+           "all extra fetches and any cache or memory effects taken into account.",
+   .usage = AC_SPM_USAGE_BYTES,
+   .num_components = 0,
+};
+
+static struct ac_spm_derived_counter_descr gfx10_local_vid_mem_bytes_counter = {
+   .id = AC_SPM_COUNTER_LOCAL_VID_MEM_BYTES,
+   .group_id = AC_SPM_GROUP_MEMORY_BYTES,
+   .name = "Local video memory bytes",
+   .desc = "Number of bytes read from or written to the Infinity Cache (if "
+           "available) or local video memory",
+   .usage = AC_SPM_USAGE_BYTES,
+   .num_components = 0,
+};
+
+static struct ac_spm_derived_counter_descr gfx10_pcie_bytes_counter = {
+   .id = AC_SPM_COUNTER_PCIE_BYTES,
+   .group_id = AC_SPM_GROUP_MEMORY_BYTES,
+   .name = "PCIe bytes",
+   .desc = "Number of bytes sent and received over the PCIe bus",
+   .usage = AC_SPM_USAGE_BYTES,
+   .num_components = 0,
+};
+
 /* SPM groups. */
 static struct ac_spm_derived_group_descr gfx10_cache_group = {
    .id = AC_SPM_GROUP_CACHE,
@@ -988,6 +1080,18 @@ static struct ac_spm_derived_group_descr gfx10_lds_group = {
    },
 };

+static struct ac_spm_derived_group_descr gfx10_memory_bytes_group = {
+   .id = AC_SPM_GROUP_MEMORY_BYTES,
+   .name = "Memory (bytes)",
+   .num_counters = 4,
+   .counters = {
+      &gfx10_fetch_size_counter,
+      &gfx10_write_size_counter,
+      &gfx10_local_vid_mem_bytes_counter,
+      &gfx10_pcie_bytes_counter,
+   },
+};
+
 static struct ac_spm_derived_counter *
 ac_spm_get_counter_by_id(struct ac_spm_derived_trace *spm_derived_trace,
                          enum ac_spm_counter_id counter_id)
@@ -1062,6 +1166,14 @@ ac_spm_get_raw_counter_op(enum ac_spm_raw_counter_id id)
    case AC_SPM_GL2C_PERF_SEL_MISS:
    case AC_SPM_CPF_PERF_SEL_STAT_BUSY:
    case AC_SPM_SQC_PERF_SEL_LDS_BANK_CONFLICT:
+   case AC_SPM_GL2C_PERF_SEL_EA_RDREQ_32B:
+   case AC_SPM_GL2C_PERF_SEL_EA_RDREQ_64B:
+   case AC_SPM_GL2C_PERF_SEL_EA_RDREQ_96B:
+   case AC_SPM_GL2C_PERF_SEL_EA_RDREQ_128B:
+   case AC_SPM_GL2C_PERF_SEL_EA_WRREQ:
+   case AC_SPM_GL2C_PERF_SEL_EA_WRREQ_64B:
+   case AC_SPM_GCEA_PERF_SEL_SARB_DRAM_SIZED_REQUESTS:
+   case AC_SPM_GCEA_PERF_SEL_SARB_IO_SIZED_REQUESTS:
       return AC_SPM_RAW_COUNTER_OP_SUM;
    default:
       UNREACHABLE("Invalid SPM raw counter ID.");
@@ -1083,6 +1195,7 @@ ac_spm_get_derived_trace(const struct radeon_info *info,
    /* Add groups to the trace. */
    ac_spm_add_group(spm_derived_trace, &gfx10_cache_group);
    ac_spm_add_group(spm_derived_trace, &gfx10_lds_group);
+   ac_spm_add_group(spm_derived_trace, &gfx10_memory_bytes_group);

    spm_derived_trace->timestamps = malloc(spm_trace->num_samples * sizeof(uint64_t));
    if (!spm_derived_trace->timestamps) {
@@ -1145,6 +1258,10 @@ ac_spm_get_derived_trace(const struct radeon_info *info,
    GET_COUNTER(L1_CACHE_HIT);
    GET_COUNTER(L2_CACHE_HIT);
    GET_COUNTER(CS_LDS_BANK_CONFLICT);
+   GET_COUNTER(FETCH_SIZE);
+   GET_COUNTER(WRITE_SIZE);
+   GET_COUNTER(LOCAL_VID_MEM_BYTES);
+   GET_COUNTER(PCIE_BYTES);

    GET_COMPONENT(INST_CACHE_REQUEST_COUNT);
    GET_COMPONENT(INST_CACHE_HIT_COUNT);
@@ -1263,6 +1380,32 @@ ac_spm_get_derived_trace(const struct radeon_info *info,
       ADD(GPU_BUSY_CYCLES, gpu_busy_cycles);
       ADD(CS_LDS_BANK_CONFLICT_CYCLES, cs_lds_bank_conflict_cycles);
       ADD(CS_LDS_BANK_CONFLICT, cs_lds_bank_conflict);
+
+      /* Memory (bytes) group. */
+      /* Fetch size. */
+      double fetch_size = OP_RAW(GL2C_PERF_SEL_EA_RDREQ_32B) * 32 +
+                          OP_RAW(GL2C_PERF_SEL_EA_RDREQ_64B) * 64 +
+                          OP_RAW(GL2C_PERF_SEL_EA_RDREQ_96B) * 96 +
+                          OP_RAW(GL2C_PERF_SEL_EA_RDREQ_128B) * 128;
+
+      ADD(FETCH_SIZE, fetch_size);
+
+      /* Write size. */
+      const double write_size = (OP_RAW(GL2C_PERF_SEL_EA_WRREQ) * 32 +
+                                 OP_RAW(GL2C_PERF_SEL_EA_WRREQ_64B) * 64) -
+                                (OP_RAW(GL2C_PERF_SEL_EA_WRREQ_64B) * 32);
+
+      ADD(WRITE_SIZE, write_size);
+
+      /* Local video mem bytes. */
+      const double local_vid_mem_bytes = OP_RAW(GCEA_PERF_SEL_SARB_DRAM_SIZED_REQUESTS) * 32;
+
+      ADD(LOCAL_VID_MEM_BYTES, local_vid_mem_bytes);
+
+      /* PCIe bytes. */
+      const double pcie_bytes = OP_RAW(GCEA_PERF_SEL_SARB_IO_SIZED_REQUESTS) * 32;
+
+      ADD(PCIE_BYTES, pcie_bytes);
    }

 #undef ADD
diff --git a/src/amd/common/ac_spm.h b/src/amd/common/ac_spm.h
index 0512ebecfac..b5a08cb706c 100644
--- a/src/amd/common/ac_spm.h
+++ b/src/amd/common/ac_spm.h
@@ -103,6 +103,14 @@ enum ac_spm_raw_counter_id {
    AC_SPM_GL2C_PERF_SEL_MISS,
    AC_SPM_CPF_PERF_SEL_STAT_BUSY,
    AC_SPM_SQC_PERF_SEL_LDS_BANK_CONFLICT,
+   AC_SPM_GL2C_PERF_SEL_EA_RDREQ_32B,
+   AC_SPM_GL2C_PERF_SEL_EA_RDREQ_64B,
+   AC_SPM_GL2C_PERF_SEL_EA_RDREQ_96B,
+   AC_SPM_GL2C_PERF_SEL_EA_RDREQ_128B,
+   AC_SPM_GL2C_PERF_SEL_EA_WRREQ,
+   AC_SPM_GL2C_PERF_SEL_EA_WRREQ_64B,
+   AC_SPM_GCEA_PERF_SEL_SARB_DRAM_SIZED_REQUESTS,
+   AC_SPM_GCEA_PERF_SEL_SARB_IO_SIZED_REQUESTS,
    AC_SPM_RAW_COUNTER_ID_COUNT,
 };

@@ -223,6 +231,7 @@ struct ac_spm_trace {
 enum ac_spm_group_id {
    AC_SPM_GROUP_CACHE,
    AC_SPM_GROUP_LDS,
+   AC_SPM_GROUP_MEMORY_BYTES,
    AC_SPM_GROUP_COUNT,
 };

@@ -233,6 +242,10 @@ enum ac_spm_counter_id {
    AC_SPM_COUNTER_L1_CACHE_HIT, /* < GFX12 */
    AC_SPM_COUNTER_L2_CACHE_HIT,
    AC_SPM_COUNTER_CS_LDS_BANK_CONFLICT,
+   AC_SPM_COUNTER_FETCH_SIZE,
+   AC_SPM_COUNTER_WRITE_SIZE,
+   AC_SPM_COUNTER_LOCAL_VID_MEM_BYTES,
+   AC_SPM_COUNTER_PCIE_BYTES,
    AC_SPM_COUNTER_COUNT,
 };

@@ -260,6 +273,7 @@ enum ac_spm_component_id {
 enum ac_spm_usage_type {
    AC_SPM_USAGE_PERCENTAGE = 1,
    AC_SPM_USAGE_CYCLES = 2,
+   AC_SPM_USAGE_BYTES = 4,
    AC_SPM_USAGE_ITEMS = 5,
 };

From f9d041333ff3d48edf3fcbf276453c38566edbcd Mon Sep 17 00:00:00 2001
From: Samuel Pitoiset
Date: Wed, 17 Dec 2025 17:58:19 +0100
Subject: [PATCH 13/15] ac/spm: add support for new Memory percentage counters in RGP 2.6

Signed-off-by: Samuel Pitoiset
---
 src/amd/common/ac_spm.c | 100 ++++++++++++++++++++++++++++++++++
 src/amd/common/ac_spm.h |   8 ++++
 2 files changed, 108 insertions(+)

diff --git a/src/amd/common/ac_spm.c b/src/amd/common/ac_spm.c
index 70bb9802ec7..57a9510ee6c 100644
--- a/src/amd/common/ac_spm.c
+++ b/src/amd/common/ac_spm.c
@@ -57,6 +57,10 @@ static struct ac_spm_counter_descr gfx10_gcea_perf_sel_sarb_dram_sized_requests
    {AC_SPM_GCEA_PERF_SEL_SARB_DRAM_SIZED_REQUESTS, GCEA, 0x37};
 static struct ac_spm_counter_descr gfx10_gcea_perf_sel_sarb_io_sized_requests =
    {AC_SPM_GCEA_PERF_SEL_SARB_IO_SIZED_REQUESTS, GCEA, 0x39};
+static struct ac_spm_counter_descr gfx10_ta_perf_sel_ta_busy =
+   {AC_SPM_TA_PERF_SEL_TA_BUSY, TA, 0xf};
+static struct ac_spm_counter_descr gfx10_tcp_perf_sel_tcp_ta_req_stall =
+   {AC_SPM_TCP_PERF_SEL_TCP_TA_REQ_STALL, TCP, 0x24};

 static struct ac_spm_counter_create_info gfx10_spm_counters[] = {
    {&gfx10_tcp_perf_sel_req},
@@ -81,6 +85,8 @@ static struct ac_spm_counter_create_info gfx10_spm_counters[] = {
    {&gfx10_gl2c_perf_sel_ea_wrreq_64b},
    {&gfx10_gcea_perf_sel_sarb_dram_sized_requests},
    {&gfx10_gcea_perf_sel_sarb_io_sized_requests},
+   {&gfx10_ta_perf_sel_ta_busy},
+   {&gfx10_tcp_perf_sel_tcp_ta_req_stall},
 };

 /* GFX10.3+ */
@@ -122,6 +128,8 @@ static struct ac_spm_counter_create_info gfx103_spm_counters[] = {
    {&gfx103_gl2c_perf_sel_ea_wrreq_64b},
    {&gfx10_gcea_perf_sel_sarb_dram_sized_requests},
    {&gfx10_gcea_perf_sel_sarb_io_sized_requests},
+   {&gfx10_ta_perf_sel_ta_busy},
+   {&gfx10_tcp_perf_sel_tcp_ta_req_stall},
 };

 /* GFX11+ */
@@ -141,6 +149,8 @@ static struct ac_spm_counter_descr gfx11_sqc_perf_sel_icache_misses_duplicate =
    {AC_SPM_SQC_PERF_SEL_ICACHE_MISSES_DUPLICATE, SQ_WGP, 0x110};
 static struct ac_spm_counter_descr gfx11_sqc_perf_sel_lds_bank_conflict =
    {AC_SPM_SQC_PERF_SEL_LDS_BANK_CONFLICT, SQ_WGP, 0x100};
+static struct ac_spm_counter_descr gfx11_tcp_perf_sel_tcp_ta_req_stall =
+   {AC_SPM_TCP_PERF_SEL_TCP_TA_REQ_STALL, TCP, 0x27};

 static struct ac_spm_counter_create_info gfx11_spm_counters[] = {
    {&gfx10_tcp_perf_sel_req},
@@ -165,6 +175,8 @@ static struct ac_spm_counter_create_info gfx11_spm_counters[] = {
    {&gfx103_gl2c_perf_sel_ea_wrreq_64b},
    {&gfx10_gcea_perf_sel_sarb_dram_sized_requests},
    {&gfx10_gcea_perf_sel_sarb_io_sized_requests},
+   {&gfx10_ta_perf_sel_ta_busy},
+   {&gfx11_tcp_perf_sel_tcp_ta_req_stall},
 };

 /* GFX12+ */
@@ -916,6 +928,20 @@ static struct ac_spm_derived_component_descr gfx10_cs_lds_bank_conflict_cycles_c
    .usage = AC_SPM_USAGE_CYCLES,
 };

+static struct ac_spm_derived_component_descr gfx10_mem_unit_busy_cycles_comp = {
+   .id = AC_SPM_COMPONENT_MEM_UNIT_BUSY_CYCLES,
+   .counter_id = AC_SPM_COUNTER_MEM_UNIT_BUSY,
+   .name = "Memory unit busy cycles",
+   .usage = AC_SPM_USAGE_CYCLES,
+};
+
+static struct ac_spm_derived_component_descr gfx10_mem_unit_stalled_cycles_comp = {
+   .id = AC_SPM_COMPONENT_MEM_UNIT_STALLED_CYCLES,
+   .counter_id = AC_SPM_COUNTER_MEM_UNIT_STALLED,
+   .name = "Memory unit stalled cycles",
+   .usage = AC_SPM_USAGE_CYCLES,
+};
+
 /* SPM counters. */
 static struct ac_spm_derived_counter_descr gfx10_inst_cache_hit_counter = {
    .id = AC_SPM_COUNTER_INST_CACHE_HIT,
@@ -1057,6 +1083,37 @@ static struct ac_spm_derived_counter_descr gfx10_pcie_bytes_counter = {
    .num_components = 0,
 };

+static struct ac_spm_derived_counter_descr gfx10_mem_unit_busy_counter = {
+   .id = AC_SPM_COUNTER_MEM_UNIT_BUSY,
+   .group_id = AC_SPM_GROUP_MEMORY_PERCENTAGE,
+   .name = "Memory unit busy",
+   .desc = "The percentage of GPUTime the memory unit is active. The result "
+           "includes the stall time (MemUnitStalled). This is measured with all "
+           "extra fetches and writes and any cache or memory effects taken into "
+           "account. Value range: 0% to 100% (fetch-bound).",
+   .usage = AC_SPM_USAGE_PERCENTAGE,
+   .num_components = 2,
+   .components = {
+      &gfx10_gpu_busy_cycles_comp,
+      &gfx10_mem_unit_busy_cycles_comp,
+   },
+};
+
+static struct ac_spm_derived_counter_descr gfx10_mem_unit_stalled_counter = {
+   .id = AC_SPM_COUNTER_MEM_UNIT_STALLED,
+   .group_id = AC_SPM_GROUP_MEMORY_PERCENTAGE,
+   .name = "Memory unit stalled",
+   .desc = "The percentage of GPUTime the memory unit is stalled. Try reducing "
+           "the number or size of fetches and writes if possible. Value range: 0% "
+           "(optimal) to 100% (bad).",
+   .usage = AC_SPM_USAGE_PERCENTAGE,
+   .num_components = 2,
+   .components = {
+      &gfx10_gpu_busy_cycles_comp,
+      &gfx10_mem_unit_stalled_cycles_comp,
+   },
+};
+
 /* SPM groups. */
 static struct ac_spm_derived_group_descr gfx10_cache_group = {
    .id = AC_SPM_GROUP_CACHE,
@@ -1092,6 +1149,16 @@ static struct ac_spm_derived_group_descr gfx10_memory_bytes_group = {
    },
 };

+static struct ac_spm_derived_group_descr gfx10_memory_percentage_group = {
+   .id = AC_SPM_GROUP_MEMORY_PERCENTAGE,
+   .name = "Memory (%)",
+   .num_counters = 2,
+   .counters = {
+      &gfx10_mem_unit_busy_counter,
+      &gfx10_mem_unit_stalled_counter,
+   },
+};
+
 static struct ac_spm_derived_counter *
 ac_spm_get_counter_by_id(struct ac_spm_derived_trace *spm_derived_trace,
                          enum ac_spm_counter_id counter_id)
@@ -1129,6 +1196,11 @@ ac_spm_add_group(struct ac_spm_derived_trace *spm_derived_trace,
          group_descr->counters[i];

       for (uint32_t j = 0; j < counter_descr->num_components; j++) {
+         /* Avoid redundant components. */
+         if (ac_spm_get_component_by_id(spm_derived_trace,
+                                        counter_descr->components[j]->id))
+            continue;
+
          struct ac_spm_derived_component *component =
             &spm_derived_trace->components[spm_derived_trace->num_components++];
          assert(spm_derived_trace->num_components <= AC_SPM_COMPONENT_COUNT);
@@ -1175,6 +1247,9 @@ ac_spm_get_raw_counter_op(enum ac_spm_raw_counter_id id)
    case AC_SPM_GL2C_PERF_SEL_EA_WRREQ_64B:
    case AC_SPM_GCEA_PERF_SEL_SARB_DRAM_SIZED_REQUESTS:
    case AC_SPM_GCEA_PERF_SEL_SARB_IO_SIZED_REQUESTS:
       return AC_SPM_RAW_COUNTER_OP_SUM;
+   case AC_SPM_TA_PERF_SEL_TA_BUSY:
+   case AC_SPM_TCP_PERF_SEL_TCP_TA_REQ_STALL:
+      return AC_SPM_RAW_COUNTER_OP_MAX;
    default:
       UNREACHABLE("Invalid SPM raw counter ID.");
    }
@@ -1196,6 +1271,7 @@ ac_spm_get_derived_trace(const struct radeon_info *info,
    ac_spm_add_group(spm_derived_trace, &gfx10_cache_group);
    ac_spm_add_group(spm_derived_trace, &gfx10_lds_group);
    ac_spm_add_group(spm_derived_trace, &gfx10_memory_bytes_group);
+   ac_spm_add_group(spm_derived_trace, &gfx10_memory_percentage_group);

    spm_derived_trace->timestamps = malloc(spm_trace->num_samples * sizeof(uint64_t));
    if (!spm_derived_trace->timestamps) {
@@ -1239,6 +1315,9 @@ ac_spm_get_derived_trace(const struct radeon_info *info,
          case AC_SPM_RAW_COUNTER_OP_SUM:
             raw_counter_values[id][s] += value;
             break;
+         case AC_SPM_RAW_COUNTER_OP_MAX:
+            raw_counter_values[id][s] = MAX2(raw_counter_values[id][s], value);
+            break;
          default:
             UNREACHABLE("Invalid SPM raw counter OP.\n");
          }
@@ -1262,6 +1341,8 @@ ac_spm_get_derived_trace(const struct radeon_info *info,
    GET_COUNTER(WRITE_SIZE);
    GET_COUNTER(LOCAL_VID_MEM_BYTES);
    GET_COUNTER(PCIE_BYTES);
+   GET_COUNTER(MEM_UNIT_BUSY);
+   GET_COUNTER(MEM_UNIT_STALLED);

    GET_COMPONENT(INST_CACHE_REQUEST_COUNT);
    GET_COMPONENT(INST_CACHE_HIT_COUNT);
@@ -1280,6 +1361,8 @@ ac_spm_get_derived_trace(const struct radeon_info *info,
    GET_COMPONENT(L2_CACHE_MISS_COUNT);
    GET_COMPONENT(GPU_BUSY_CYCLES);
    GET_COMPONENT(CS_LDS_BANK_CONFLICT_CYCLES);
+   GET_COMPONENT(MEM_UNIT_BUSY_CYCLES);
+   GET_COMPONENT(MEM_UNIT_STALLED_CYCLES);

 #undef GET_COMPONENT
 #undef GET_COUNTER
@@ -1406,6 +1489,23 @@ ac_spm_get_derived_trace(const struct radeon_info *info,
       const double pcie_bytes = OP_RAW(GCEA_PERF_SEL_SARB_IO_SIZED_REQUESTS) * 32;

       ADD(PCIE_BYTES, pcie_bytes);
+
+      /* Memory (percentage) group. */
+      /* Memory unit busy. */
+      const double mem_unit_busy_cycles = OP_RAW(TA_PERF_SEL_TA_BUSY);
+      const double mem_unit_busy =
+         gpu_busy_cycles ? (mem_unit_busy_cycles / gpu_busy_cycles) * 100.0f : 0.0f;
+
+      ADD(MEM_UNIT_BUSY_CYCLES, mem_unit_busy_cycles);
+      ADD(MEM_UNIT_BUSY, mem_unit_busy);
+
+      /* Memory unit stalled. */
+      const double mem_unit_stalled_cycles = OP_RAW(TCP_PERF_SEL_TCP_TA_REQ_STALL);
+      const double mem_unit_stalled =
+         gpu_busy_cycles ? (mem_unit_stalled_cycles / gpu_busy_cycles) * 100.0f : 0.0f;
+
+      ADD(MEM_UNIT_STALLED_CYCLES, mem_unit_stalled_cycles);
+      ADD(MEM_UNIT_STALLED, mem_unit_stalled);
    }

 #undef ADD
diff --git a/src/amd/common/ac_spm.h b/src/amd/common/ac_spm.h
index b5a08cb706c..2f21342c6b5 100644
--- a/src/amd/common/ac_spm.h
+++ b/src/amd/common/ac_spm.h
@@ -111,11 +111,14 @@ enum ac_spm_raw_counter_id {
    AC_SPM_GL2C_PERF_SEL_EA_WRREQ_64B,
    AC_SPM_GCEA_PERF_SEL_SARB_DRAM_SIZED_REQUESTS,
    AC_SPM_GCEA_PERF_SEL_SARB_IO_SIZED_REQUESTS,
+   AC_SPM_TA_PERF_SEL_TA_BUSY,
+   AC_SPM_TCP_PERF_SEL_TCP_TA_REQ_STALL,
    AC_SPM_RAW_COUNTER_ID_COUNT,
 };

 enum ac_spm_raw_counter_op {
    AC_SPM_RAW_COUNTER_OP_SUM = 0,
+   AC_SPM_RAW_COUNTER_OP_MAX,
 };

 struct ac_spm_counter_descr {
@@ -232,6 +235,7 @@ enum ac_spm_group_id {
    AC_SPM_GROUP_CACHE,
    AC_SPM_GROUP_LDS,
    AC_SPM_GROUP_MEMORY_BYTES,
+   AC_SPM_GROUP_MEMORY_PERCENTAGE,
    AC_SPM_GROUP_COUNT,
 };

@@ -246,6 +250,8 @@ enum ac_spm_counter_id {
    AC_SPM_COUNTER_WRITE_SIZE,
    AC_SPM_COUNTER_LOCAL_VID_MEM_BYTES,
    AC_SPM_COUNTER_PCIE_BYTES,
+   AC_SPM_COUNTER_MEM_UNIT_BUSY,
+   AC_SPM_COUNTER_MEM_UNIT_STALLED,
    AC_SPM_COUNTER_COUNT,
 };

@@ -267,6 +273,8 @@ enum ac_spm_component_id {
    AC_SPM_COMPONENT_L2_CACHE_MISS_COUNT,
    AC_SPM_COMPONENT_GPU_BUSY_CYCLES,
    AC_SPM_COMPONENT_CS_LDS_BANK_CONFLICT_CYCLES,
+   AC_SPM_COMPONENT_MEM_UNIT_BUSY_CYCLES,
+   AC_SPM_COMPONENT_MEM_UNIT_STALLED_CYCLES,
    AC_SPM_COMPONENT_COUNT,
 };

From af799be4b0207377ad161edb5e00c31080b2672d Mon Sep 17 00:00:00 2001
From: Samuel Pitoiset
Date: Wed, 17 Dec 2025 17:59:05 +0100
Subject: [PATCH 14/15] ac/spm: add support for Ray Tracing counters in RGP

These aren't new in RGP 2.6; they have been available for a while. But
because RADV didn't support the new derived SPM chunk, it wasn't possible
to expose them.

Signed-off-by: Samuel Pitoiset
---
 src/amd/common/ac_spm.c | 59 +++++++++++++++++++++++++++++++++++++++++
 src/amd/common/ac_spm.h |  6 +++++
 2 files changed, 65 insertions(+)

diff --git a/src/amd/common/ac_spm.c b/src/amd/common/ac_spm.c
index 57a9510ee6c..e3e47c6015e 100644
--- a/src/amd/common/ac_spm.c
+++ b/src/amd/common/ac_spm.c
@@ -104,6 +104,12 @@ static struct ac_spm_counter_descr gfx103_gl2c_perf_sel_ea_wrreq =
    {AC_SPM_GL2C_PERF_SEL_EA_WRREQ, GL2C, 0x53};
 static struct ac_spm_counter_descr gfx103_gl2c_perf_sel_ea_wrreq_64b =
    {AC_SPM_GL2C_PERF_SEL_EA_WRREQ_64B, GL2C, 0x55};
+static struct ac_spm_counter_descr gfx103_td_perf_sel_ray_tracing_bvh4_tri_node =
+   {AC_SPM_TD_PERF_SEL_RAY_TRACING_BVH4_TRI_NODE, TD, 0x76};
+static struct ac_spm_counter_descr gfx103_td_perf_sel_ray_tracing_bvh4_fp16_box_node =
+   {AC_SPM_TD_PERF_SEL_RAY_TRACING_BVH4_FP16_BOX_NODE, TD, 0x74};
+static struct ac_spm_counter_descr gfx103_td_perf_sel_ray_tracing_bvh4_fp32_box_node =
+   {AC_SPM_TD_PERF_SEL_RAY_TRACING_BVH4_FP32_BOX_NODE, TD, 0x75};

 static struct ac_spm_counter_create_info gfx103_spm_counters[] = {
    {&gfx10_tcp_perf_sel_req},
@@ -130,6 +136,9 @@ static struct ac_spm_counter_create_info gfx103_spm_counters[] = {
    {&gfx10_gcea_perf_sel_sarb_io_sized_requests},
    {&gfx10_ta_perf_sel_ta_busy},
    {&gfx10_tcp_perf_sel_tcp_ta_req_stall},
+   {&gfx103_td_perf_sel_ray_tracing_bvh4_tri_node},
+   {&gfx103_td_perf_sel_ray_tracing_bvh4_fp16_box_node},
+   {&gfx103_td_perf_sel_ray_tracing_bvh4_fp32_box_node},
 };

 /* GFX11+ */
@@ -177,6 +186,9 @@ static struct ac_spm_counter_create_info gfx11_spm_counters[] = {
    {&gfx10_gcea_perf_sel_sarb_io_sized_requests},
    {&gfx10_ta_perf_sel_ta_busy},
    {&gfx11_tcp_perf_sel_tcp_ta_req_stall},
+   {&gfx103_td_perf_sel_ray_tracing_bvh4_tri_node},
+   {&gfx103_td_perf_sel_ray_tracing_bvh4_fp16_box_node},
+   {&gfx103_td_perf_sel_ray_tracing_bvh4_fp32_box_node},
 };

 /* GFX12+ */
@@ -1114,6 +1126,24 @@ static struct ac_spm_derived_counter_descr gfx10_mem_unit_stalled_counter = {
    },
 };

+static struct ac_spm_derived_counter_descr gfx103_ray_box_tests_counter = {
+   .id = AC_SPM_COUNTER_RAY_BOX_TESTS,
+   .group_id = AC_SPM_GROUP_RT,
+   .name = "Ray-box tests",
+   .desc = "The number of ray box intersection tests.",
+   .usage = AC_SPM_USAGE_ITEMS,
+   .num_components = 0,
+};
+
+static struct ac_spm_derived_counter_descr gfx103_ray_tri_tests_counter = {
+   .id = AC_SPM_COUNTER_RAY_TRI_TESTS,
+   .group_id = AC_SPM_GROUP_RT,
+   .name = "Ray-triangle tests",
+   .desc = "The number of ray triangle intersection tests.",
+   .usage = AC_SPM_USAGE_ITEMS,
+   .num_components = 0,
+};
+
 /* SPM groups. */
 static struct ac_spm_derived_group_descr gfx10_cache_group = {
    .id = AC_SPM_GROUP_CACHE,
@@ -1159,6 +1189,16 @@ static struct ac_spm_derived_group_descr gfx10_memory_percentage_group = {
    },
 };

+static struct ac_spm_derived_group_descr gfx103_rt_group = {
+   .id = AC_SPM_GROUP_RT,
+   .name = "Ray tracing",
+   .num_counters = 2,
+   .counters = {
+      &gfx103_ray_box_tests_counter,
+      &gfx103_ray_tri_tests_counter,
+   },
+};
+
 static struct ac_spm_derived_counter *
 ac_spm_get_counter_by_id(struct ac_spm_derived_trace *spm_derived_trace,
                          enum ac_spm_counter_id counter_id)
@@ -1246,6 +1286,9 @@ ac_spm_get_raw_counter_op(enum ac_spm_raw_counter_id id)
    case AC_SPM_GL2C_PERF_SEL_EA_WRREQ_64B:
    case AC_SPM_GCEA_PERF_SEL_SARB_DRAM_SIZED_REQUESTS:
    case AC_SPM_GCEA_PERF_SEL_SARB_IO_SIZED_REQUESTS:
+   case AC_SPM_TD_PERF_SEL_RAY_TRACING_BVH4_TRI_NODE:
+   case AC_SPM_TD_PERF_SEL_RAY_TRACING_BVH4_FP16_BOX_NODE:
+   case AC_SPM_TD_PERF_SEL_RAY_TRACING_BVH4_FP32_BOX_NODE:
       return AC_SPM_RAW_COUNTER_OP_SUM;
    case AC_SPM_TA_PERF_SEL_TA_BUSY:
    case AC_SPM_TCP_PERF_SEL_TCP_TA_REQ_STALL:
@@ -1272,6 +1315,8 @@ ac_spm_get_derived_trace(const struct radeon_info *info,
    ac_spm_add_group(spm_derived_trace, &gfx10_lds_group);
    ac_spm_add_group(spm_derived_trace, &gfx10_memory_bytes_group);
    ac_spm_add_group(spm_derived_trace, &gfx10_memory_percentage_group);
+   if (info->gfx_level >= GFX10_3)
+      ac_spm_add_group(spm_derived_trace, &gfx103_rt_group);

    spm_derived_trace->timestamps = malloc(spm_trace->num_samples * sizeof(uint64_t));
    if (!spm_derived_trace->timestamps) {
@@ -1343,6 +1388,8 @@ ac_spm_get_derived_trace(const struct radeon_info *info,
    GET_COUNTER(PCIE_BYTES);
    GET_COUNTER(MEM_UNIT_BUSY);
    GET_COUNTER(MEM_UNIT_STALLED);
+   GET_COUNTER(RAY_BOX_TESTS);
+   GET_COUNTER(RAY_TRI_TESTS);

    GET_COMPONENT(INST_CACHE_REQUEST_COUNT);
    GET_COMPONENT(INST_CACHE_HIT_COUNT);
@@ -1506,6 +1553,18 @@ ac_spm_get_derived_trace(const struct radeon_info *info,

       ADD(MEM_UNIT_STALLED_CYCLES, mem_unit_stalled_cycles);
       ADD(MEM_UNIT_STALLED, mem_unit_stalled);
+
+      /* Raytracing group. */
+      /* Ray box tests. */
+      const double ray_box_tests = OP_RAW(TD_PERF_SEL_RAY_TRACING_BVH4_FP16_BOX_NODE) +
+                                   OP_RAW(TD_PERF_SEL_RAY_TRACING_BVH4_FP32_BOX_NODE);
+
+      ADD(RAY_BOX_TESTS, ray_box_tests);
+
+      /* Ray triangle tests. */
+      const double ray_tri_tests = OP_RAW(TD_PERF_SEL_RAY_TRACING_BVH4_TRI_NODE);
+
+      ADD(RAY_TRI_TESTS, ray_tri_tests);
    }

 #undef ADD
diff --git a/src/amd/common/ac_spm.h b/src/amd/common/ac_spm.h
index 2f21342c6b5..a4a539f4834 100644
--- a/src/amd/common/ac_spm.h
+++ b/src/amd/common/ac_spm.h
@@ -113,6 +113,9 @@ enum ac_spm_raw_counter_id {
    AC_SPM_GCEA_PERF_SEL_SARB_IO_SIZED_REQUESTS,
    AC_SPM_TA_PERF_SEL_TA_BUSY,
    AC_SPM_TCP_PERF_SEL_TCP_TA_REQ_STALL,
+   AC_SPM_TD_PERF_SEL_RAY_TRACING_BVH4_TRI_NODE,
+   AC_SPM_TD_PERF_SEL_RAY_TRACING_BVH4_FP16_BOX_NODE,
+   AC_SPM_TD_PERF_SEL_RAY_TRACING_BVH4_FP32_BOX_NODE,
    AC_SPM_RAW_COUNTER_ID_COUNT,
 };

@@ -236,6 +239,7 @@ enum ac_spm_group_id {
    AC_SPM_GROUP_LDS,
    AC_SPM_GROUP_MEMORY_BYTES,
    AC_SPM_GROUP_MEMORY_PERCENTAGE,
+   AC_SPM_GROUP_RT,
    AC_SPM_GROUP_COUNT,
 };

@@ -252,6 +256,8 @@ enum ac_spm_counter_id {
    AC_SPM_COUNTER_PCIE_BYTES,
    AC_SPM_COUNTER_MEM_UNIT_BUSY,
    AC_SPM_COUNTER_MEM_UNIT_STALLED,
+   AC_SPM_COUNTER_RAY_BOX_TESTS,
+   AC_SPM_COUNTER_RAY_TRI_TESTS,
    AC_SPM_COUNTER_COUNT,
 };

From 6fefd4fc0df810c856e86fce605be323a0f9e067 Mon Sep 17 00:00:00 2001
From: Samuel Pitoiset
Date: Thu, 18 Dec 2025 08:57:49 +0100
Subject: [PATCH 15/15] ac/rgp: enable new performance counters for RGP 2.6 on GFX10-GFX11

GFX12 needs more work and will be added separately.

Signed-off-by: Samuel Pitoiset
---
 src/amd/common/ac_rgp.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/amd/common/ac_rgp.c b/src/amd/common/ac_rgp.c
index 404846ab3c5..8660d5733ed 100644
--- a/src/amd/common/ac_rgp.c
+++ b/src/amd/common/ac_rgp.c
@@ -1405,7 +1405,8 @@ ac_use_derived_spm_trace(const struct radeon_info *info,
    if (!spm_trace)
       return false;

-   /* TODO: Enable for GPUs. */
+   /* TODO: Enable for GFX12. */
+   return info->gfx_level >= GFX10 && info->gfx_level < GFX12;
    return false;
 }