diff --git a/src/amd/common/ac_perfcounter.c b/src/amd/common/ac_perfcounter.c index 0a1e95798dd..abb4a36b7f9 100644 --- a/src/amd/common/ac_perfcounter.c +++ b/src/amd/common/ac_perfcounter.c @@ -932,6 +932,28 @@ static struct ac_pc_block_base gfx10_UTCL1 = { .num_spm_counters = 0, }; +/* gfx10_GCEA */ +static unsigned gfx10_GCEA_select0[] = { + R_036800_GCEA_PERFCOUNTER2_SELECT, +}; + +static unsigned gfx10_GCEA_select1[] = { + R_036804_GCEA_PERFCOUNTER2_SELECT1, +}; +static struct ac_pc_block_base gfx10_GCEA = { + .gpu_block = GCEA, + .name = "GCEA", + .num_counters = 1, + + .select0 = gfx10_GCEA_select0, + .select1 = gfx10_GCEA_select1, + .counter0_lo = R_034980_GCEA_PERFCOUNTER2_LO, + + .num_spm_counters = 1, + .num_spm_wires = 2, + .spm_block_select = AC_SPM_GLOBAL_BLOCK_GCEA, +}; + /* gfx11_SQ_WQP */ static struct ac_pc_block_base gfx11_SQ_WGP = { .gpu_block = SQ_WGP, @@ -1027,6 +1049,40 @@ static struct ac_pc_block_gfxdescr groups_gfx10[] = { {&gfx10_TCP, 77}, {&cik_TD, 61}, {&gfx10_UTCL1, 15}, + {&gfx10_GCEA, 88}, +}; + +static struct ac_pc_block_gfxdescr groups_gfx103[] = { + {&cik_CB, 461}, + {&gfx10_CHA, 45}, + {&gfx10_CHCG, 35}, + {&gfx10_CHC, 35}, + {&cik_CPC, 47}, + {&cik_CPF, 40}, + {&cik_CPG, 82}, + {&gfx10_DB, 370}, + {&gfx10_GCR, 94}, + {&cik_GDS, 123}, + {&gfx10_GE, 315}, + {&gfx10_GL1A, 36}, + {&gfx10_GL1C, 64, 4}, + {&gfx10_GL2A, 91}, + {&gfx10_GL2C, 235}, + {&cik_GRBM, 47}, + {&cik_GRBMSE, 19}, + {&gfx10_PA_PH, 960}, + {&cik_PA_SC, 552}, + {&gfx10_PA_SU, 266}, + {&gfx10_RLC, 7}, + {&gfx10_RMI, 258}, + {&cik_SPI, 329}, + {&gfx10_SQ, 509}, + {&cik_SX, 225}, + {&cik_TA, 226}, + {&gfx10_TCP, 77}, + {&cik_TD, 192}, + {&gfx10_UTCL1, 15}, + {&gfx10_GCEA, 89}, }; static struct ac_pc_block_gfxdescr groups_gfx11[] = { @@ -1060,6 +1116,7 @@ static struct ac_pc_block_gfxdescr groups_gfx11[] = { {&cik_TD, 196}, {&gfx10_UTCL1, 65}, {&gfx11_SQ_WGP, 511, 4}, + {&gfx10_GCEA, 86}, }; static struct ac_pc_block_gfxdescr groups_gfx12[] = { @@ -1237,10 +1294,13 @@ bool ac_init_perfcounters(const struct radeon_info *info, num_blocks = ARRAY_SIZE(groups_gfx9); break; case GFX10: - case GFX10_3: blocks = groups_gfx10; num_blocks = ARRAY_SIZE(groups_gfx10); break; + case GFX10_3: + blocks = groups_gfx103; + num_blocks = ARRAY_SIZE(groups_gfx103); + break; case GFX11: blocks = groups_gfx11; num_blocks = ARRAY_SIZE(groups_gfx11); @@ -1290,8 +1350,14 @@ bool ac_init_perfcounters(const struct radeon_info *info, } else if (!strcmp(block->b->b->name, "GL1C") || !strcmp(block->b->b->name, "SQ_WGP")) { block->num_global_instances = block->num_instances * info->num_se * info->max_sa_per_se; - } else if (!strcmp(block->b->b->name, "GL2C")) { + } else if (!strcmp(block->b->b->name, "GL2C") || + !strcmp(block->b->b->name, "GCEA")) { block->num_instances = block->num_global_instances = info->num_tcc_blocks; + } else if (!strcmp(block->b->b->name, "CPF")) { + block->num_instances = block->num_global_instances = 1; + } else if (!strcmp(block->b->b->name, "TA") || + !strcmp(block->b->b->name, "TD")) { + block->num_global_instances = block->num_instances; } } diff --git a/src/amd/common/ac_perfcounter.h b/src/amd/common/ac_perfcounter.h index c198c4403ef..80a06b5cd7a 100644 --- a/src/amd/common/ac_perfcounter.h +++ b/src/amd/common/ac_perfcounter.h @@ -67,7 +67,7 @@ enum ac_pc_gpu_block { ATC = 0x1A, ATCL2 = 0x1B, MCVML2 = 0x1C, - EA = 0x1D, + GCEA = 0x1D, RPB = 0x1E, RMI = 0x1F, UMCCH = 0x20, diff --git a/src/amd/common/ac_rgp.c b/src/amd/common/ac_rgp.c index a3564042309..8660d5733ed 100644 --- 
a/src/amd/common/ac_rgp.c +++ b/src/amd/common/ac_rgp.c @@ -58,6 +58,10 @@ enum sqtt_file_chunk_type SQTT_FILE_CHUNK_TYPE_CODE_OBJECT_LOADER_EVENTS, SQTT_FILE_CHUNK_TYPE_PSO_CORRELATION, SQTT_FILE_CHUNK_TYPE_INSTRUMENTATION_TABLE, + + SQTT_FILE_CHUNK_TYPE_FIRST_TOOLS_TYPE = 128, + SQTT_FILE_CHUNK_TYPE_DERIVED_SPM_DB = SQTT_FILE_CHUNK_TYPE_FIRST_TOOLS_TYPE, + SQTT_FILE_CHUNK_TYPE_COUNT }; @@ -992,10 +996,203 @@ static void ac_sqtt_dump_spm(const struct ac_spm_trace *spm_trace, fseek(output, file_offset, SEEK_SET); } +/** + * SQTT Derived SPM DB info. + */ +struct sqtt_derived_spm_group_info { + uint32_t size_in_bytes; + uint32_t offset; + uint32_t group_name_length; + uint32_t group_description_length; + uint32_t num_counters; +}; + +struct sqtt_derived_spm_counter_info { + uint32_t size_in_bytes; + uint32_t offset; + uint32_t counter_name_length; + uint32_t counter_description_length; + uint32_t num_components; + uint8_t usage_type; +}; + +struct sqtt_derived_spm_component_info { + uint32_t size_in_bytes; + uint32_t offset; + uint32_t component_name_length; + uint32_t component_description_length; + uint32_t usage_type; +}; + +struct sqtt_file_chunk_derived_spm_db { + struct sqtt_file_chunk_header header; + uint32_t offset; + uint32_t flags; + uint32_t num_timestamps; + uint32_t num_groups; + uint32_t num_counters; + uint32_t num_components; + uint32_t sampling_interval; +}; + +static_assert(sizeof(struct sqtt_file_chunk_derived_spm_db) == 44, + "sqtt_file_chunk_derived_spm_db doesn't match RGP spec"); + +static void ac_sqtt_fill_derived_spm_db(const struct ac_spm_derived_trace *spm_derived_trace, + struct sqtt_file_chunk_derived_spm_db *chunk, + size_t file_offset, + uint32_t chunk_size) +{ + chunk->header.chunk_id.type = SQTT_FILE_CHUNK_TYPE_DERIVED_SPM_DB; + chunk->header.chunk_id.index = 0; + chunk->header.major_version = 0; + chunk->header.minor_version = 0; + chunk->header.size_in_bytes = chunk_size; + + chunk->offset = sizeof(*chunk); + chunk->flags = 0; + chunk->num_timestamps = spm_derived_trace->num_timestamps; + chunk->num_groups = spm_derived_trace->num_groups; + chunk->num_counters = spm_derived_trace->num_counters; + chunk->num_components = spm_derived_trace->num_components; + chunk->sampling_interval = spm_derived_trace->sample_interval; +} + +static void ac_sqtt_dump_derived_spm(const struct ac_spm_derived_trace *spm_derived_trace, + size_t file_offset, + FILE *output) +{ + struct sqtt_file_chunk_derived_spm_db derived_spm_db; + size_t file_derived_spm_db_offset = file_offset; + + fseek(output, sizeof(struct sqtt_file_chunk_derived_spm_db), SEEK_CUR); + file_offset += sizeof(struct sqtt_file_chunk_derived_spm_db); + + /* Dump timestamps. */ + for (uint32_t i = 0; i < spm_derived_trace->num_timestamps; i++) { + uint64_t timestamp = spm_derived_trace->timestamps[i]; + + file_offset += sizeof(timestamp); + fwrite(&timestamp, sizeof(timestamp), 1, output); + } + + /* Dump SPM groups. 
*/ + for (uint32_t i = 0; i < spm_derived_trace->num_groups; i++) { + const struct ac_spm_derived_group *group = &spm_derived_trace->groups[i]; + const struct ac_spm_derived_group_descr *group_descr = group->descr; + struct sqtt_derived_spm_group_info group_info = {0}; + + const uint32_t num_counters = group_descr->num_counters; + const uint32_t name_length = strlen(group_descr->name); + + group_info.size_in_bytes = sizeof(group_info) + name_length + + num_counters * sizeof(uint32_t); + group_info.offset = sizeof(group_info); + group_info.group_name_length = name_length; + group_info.num_counters = num_counters; + + file_offset += sizeof(group_info) + group_info.group_name_length; + fwrite(&group_info, sizeof(group_info), 1, output); + fwrite(group_descr->name, group_info.group_name_length, 1, output); + + for (uint32_t j = 0; j < group_descr->num_counters; j++) { + const struct ac_spm_derived_counter_descr *counter_descr = group_descr->counters[j]; + uint32_t counter_id = counter_descr->id; + + file_offset += sizeof(uint32_t); + fwrite(&counter_id, sizeof(uint32_t), 1, output); + } + } + + /* Dump SPM counters. */ + for (uint32_t i = 0; i < spm_derived_trace->num_counters; i++) { + const struct ac_spm_derived_counter *counter = &spm_derived_trace->counters[i]; + const struct ac_spm_derived_counter_descr *counter_descr = counter->descr; + struct sqtt_derived_spm_counter_info counter_info = {0}; + + const uint32_t num_components = counter_descr->num_components; + const uint32_t name_length = strlen(counter_descr->name); + const uint32_t description_length = strlen(counter_descr->desc); + + counter_info.size_in_bytes = sizeof(counter_info) + name_length + + description_length + num_components * sizeof(uint32_t); + counter_info.offset = sizeof(counter_info); + counter_info.counter_name_length = name_length; + counter_info.counter_description_length = description_length; + counter_info.num_components = num_components; + counter_info.usage_type = counter_descr->usage; + + file_offset += sizeof(counter_info) + counter_info.counter_name_length + + counter_info.counter_description_length; + fwrite(&counter_info, sizeof(counter_info), 1, output); + fwrite(counter_descr->name, counter_info.counter_name_length, 1, output); + fwrite(counter_descr->desc, counter_info.counter_description_length, 1, output); + + for (uint32_t j = 0; j < counter_descr->num_components; j++) { + const struct ac_spm_derived_component_descr *component_descr = counter_descr->components[j]; + uint32_t component_id = component_descr->id; + + file_offset += sizeof(uint32_t); + fwrite(&component_id, sizeof(uint32_t), 1, output); + } + } + + /* Dump SPM components. 
*/ + for (uint32_t i = 0; i < spm_derived_trace->num_components; i++) { + const struct ac_spm_derived_component *component = &spm_derived_trace->components[i]; + const struct ac_spm_derived_component_descr *component_descr = component->descr; + struct sqtt_derived_spm_component_info component_info = {0}; + + const uint32_t name_length = strlen(component_descr->name); + + component_info.size_in_bytes = sizeof(component_info) + name_length; + component_info.offset = sizeof(component_info); + component_info.component_name_length = name_length; + component_info.usage_type = component_descr->usage; + + file_offset += sizeof(component_info) + component_info.component_name_length + + component_info.component_description_length; + fwrite(&component_info, sizeof(component_info), 1, output); + fwrite(component_descr->name, component_info.component_name_length, 1, output); + } + + /* Dump counter values. */ + for (uint32_t i = 0; i < spm_derived_trace->num_counters; i++) { + const struct ac_spm_derived_counter *counter = &spm_derived_trace->counters[i]; + + assert(util_dynarray_num_elements(&counter->values, double) == spm_derived_trace->num_timestamps); + util_dynarray_foreach(&counter->values, double, value) { + file_offset += sizeof(double); + fwrite(value, sizeof(double), 1, output); + } + } + + /* Dump component values. */ + for (uint32_t i = 0; i < spm_derived_trace->num_components; i++) { + const struct ac_spm_derived_component *component = &spm_derived_trace->components[i]; + + assert(util_dynarray_num_elements(&component->values, double) == spm_derived_trace->num_timestamps); + util_dynarray_foreach(&component->values, double, value) { + file_offset += sizeof(double); + fwrite(value, sizeof(double), 1, output); + } + } + + /* SQTT Derived SPM chunk. */ + ac_sqtt_fill_derived_spm_db(spm_derived_trace, &derived_spm_db, + file_derived_spm_db_offset, + file_offset - file_derived_spm_db_offset); + fseek(output, file_derived_spm_db_offset, SEEK_SET); + fwrite(&derived_spm_db, sizeof(struct sqtt_file_chunk_derived_spm_db), 1, output); + fseek(output, file_offset, SEEK_SET); +} + #if defined(USE_LIBELF) static void ac_sqtt_dump_data(const struct radeon_info *rad_info, struct ac_sqtt_trace *sqtt_trace, - const struct ac_spm_trace *spm_trace, FILE *output) + const struct ac_spm_trace *spm_trace, + const struct ac_spm_derived_trace *spm_derived_trace, + FILE *output) { struct sqtt_file_chunk_asic_info asic_info = {0}; struct sqtt_file_chunk_cpu_info cpu_info = {0}; @@ -1193,12 +1390,26 @@ ac_sqtt_dump_data(const struct radeon_info *rad_info, struct ac_sqtt_trace *sqtt } } - if (spm_trace) { + if (spm_derived_trace) { + ac_sqtt_dump_derived_spm(spm_derived_trace, file_offset, output); + } else if (spm_trace) { ac_sqtt_dump_spm(spm_trace, file_offset, output); } } #endif +static bool +ac_use_derived_spm_trace(const struct radeon_info *info, + const struct ac_spm_trace *spm_trace) +{ + if (!spm_trace) + return false; + + /* TODO: Enable for GFX12. */ + return info->gfx_level >= GFX10 && info->gfx_level < GFX12; +} + int ac_dump_rgp_capture(const struct radeon_info *info, struct ac_sqtt_trace *sqtt_trace, const struct ac_spm_trace *spm_trace) @@ -1223,7 +1434,13 @@ ac_dump_rgp_capture(const struct radeon_info *info, struct ac_sqtt_trace *sqtt_t if (!f) return -1; - ac_sqtt_dump_data(info, sqtt_trace, spm_trace, f); + struct ac_spm_derived_trace *spm_derived_trace = + ac_use_derived_spm_trace(info, spm_trace) ? 
ac_spm_get_derived_trace(info, spm_trace) : NULL; + + ac_sqtt_dump_data(info, sqtt_trace, spm_trace, spm_derived_trace, f); + + if (spm_derived_trace) + ac_spm_destroy_derived_trace(spm_derived_trace); fprintf(stderr, "RGP capture saved to '%s'\n", filename); diff --git a/src/amd/common/ac_spm.c b/src/amd/common/ac_spm.c index 54866ccb857..e3e47c6015e 100644 --- a/src/amd/common/ac_spm.c +++ b/src/amd/common/ac_spm.c @@ -13,18 +13,54 @@ /* SPM counters definition. */ /* GFX10+ */ -static struct ac_spm_counter_descr gfx10_tcp_perf_sel_req = {TCP, 0x9}; -static struct ac_spm_counter_descr gfx10_tcp_perf_sel_req_miss = {TCP, 0x12}; -static struct ac_spm_counter_descr gfx10_sqc_perf_sel_dcache_hits = {SQ, 0x14f}; -static struct ac_spm_counter_descr gfx10_sqc_perf_sel_dcache_misses = {SQ, 0x150}; -static struct ac_spm_counter_descr gfx10_sqc_perf_sel_dcache_misses_duplicate = {SQ, 0x151}; -static struct ac_spm_counter_descr gfx10_sqc_perf_sel_icache_hits = {SQ, 0x12c}; -static struct ac_spm_counter_descr gfx10_sqc_perf_sel_icache_misses = {SQ, 0x12d}; -static struct ac_spm_counter_descr gfx10_sqc_perf_sel_icache_misses_duplicate = {SQ, 0x12e}; -static struct ac_spm_counter_descr gfx10_gl1c_perf_sel_req = {GL1C, 0xe}; -static struct ac_spm_counter_descr gfx10_gl1c_perf_sel_req_miss = {GL1C, 0x12}; -static struct ac_spm_counter_descr gfx10_gl2c_perf_sel_req = {GL2C, 0x3}; -static struct ac_spm_counter_descr gfx10_gl2c_perf_sel_miss = {GL2C, 0x23}; +static struct ac_spm_counter_descr gfx10_tcp_perf_sel_req = + {AC_SPM_TCP_PERF_SEL_REQ, TCP, 0x9}; +static struct ac_spm_counter_descr gfx10_tcp_perf_sel_req_miss = + {AC_SPM_TCP_PERF_SEL_REQ_MISS, TCP, 0x12}; +static struct ac_spm_counter_descr gfx10_sqc_perf_sel_dcache_hits = + {AC_SPM_SQC_PERF_SEL_DCACHE_HITS, SQ, 0x14f}; +static struct ac_spm_counter_descr gfx10_sqc_perf_sel_dcache_misses = + {AC_SPM_SQC_PERF_SEL_DCACHE_MISSES, SQ, 0x150}; +static struct ac_spm_counter_descr gfx10_sqc_perf_sel_dcache_misses_duplicate = + {AC_SPM_SQC_PERF_SEL_DCACHE_MISSES_DUPLICATE, SQ, 0x151}; +static struct ac_spm_counter_descr gfx10_sqc_perf_sel_icache_hits = + {AC_SPM_SQC_PERF_SEL_ICACHE_HITS, SQ, 0x12c}; +static struct ac_spm_counter_descr gfx10_sqc_perf_sel_icache_misses = + {AC_SPM_SQC_PERF_SEL_ICACHE_MISSES, SQ, 0x12d}; +static struct ac_spm_counter_descr gfx10_sqc_perf_sel_icache_misses_duplicate = + {AC_SPM_SQC_PERF_SEL_ICACHE_MISSES_DUPLICATE, SQ, 0x12e}; +static struct ac_spm_counter_descr gfx10_gl1c_perf_sel_req = + {AC_SPM_GL1C_PERF_SEL_REQ, GL1C, 0xe}; +static struct ac_spm_counter_descr gfx10_gl1c_perf_sel_req_miss = + {AC_SPM_GL1C_PERF_SEL_REQ_MISS, GL1C, 0x12}; +static struct ac_spm_counter_descr gfx10_gl2c_perf_sel_req = + {AC_SPM_GL2C_PERF_SEL_REQ, GL2C, 0x3}; +static struct ac_spm_counter_descr gfx10_gl2c_perf_sel_miss = + {AC_SPM_GL2C_PERF_SEL_MISS, GL2C, 0x23}; +static struct ac_spm_counter_descr gfx10_cpf_perf_sel_stat_busy = + {AC_SPM_CPF_PERF_SEL_STAT_BUSY, CPF, 0x18}; +static struct ac_spm_counter_descr gfx10_sqc_perf_sel_lds_bank_conflict = + {AC_SPM_SQC_PERF_SEL_LDS_BANK_CONFLICT, SQ, 0x11d}; +static struct ac_spm_counter_descr gfx10_gl2c_perf_sel_ea_rdreq_32b = + {AC_SPM_GL2C_PERF_SEL_EA_RDREQ_32B, GL2C, 0x59}; +static struct ac_spm_counter_descr gfx10_gl2c_perf_sel_ea_rdreq_64b = + {AC_SPM_GL2C_PERF_SEL_EA_RDREQ_64B, GL2C, 0x5a}; +static struct ac_spm_counter_descr gfx10_gl2c_perf_sel_ea_rdreq_96b = + {AC_SPM_GL2C_PERF_SEL_EA_RDREQ_96B, GL2C, 0x5b}; +static struct ac_spm_counter_descr gfx10_gl2c_perf_sel_ea_rdreq_128b = + 
{AC_SPM_GL2C_PERF_SEL_EA_RDREQ_128B, GL2C, 0x5c}; +static struct ac_spm_counter_descr gfx10_gl2c_perf_sel_ea_wrreq = + {AC_SPM_GL2C_PERF_SEL_EA_WRREQ, GL2C, 0x4b}; +static struct ac_spm_counter_descr gfx10_gl2c_perf_sel_ea_wrreq_64b = + {AC_SPM_GL2C_PERF_SEL_EA_WRREQ_64B, GL2C, 0x4c}; +static struct ac_spm_counter_descr gfx10_gcea_perf_sel_sarb_dram_sized_requests = + {AC_SPM_GCEA_PERF_SEL_SARB_DRAM_SIZED_REQUESTS, GCEA, 0x37}; +static struct ac_spm_counter_descr gfx10_gcea_perf_sel_sarb_io_sized_requests = + {AC_SPM_GCEA_PERF_SEL_SARB_IO_SIZED_REQUESTS, GCEA, 0x39}; +static struct ac_spm_counter_descr gfx10_ta_perf_sel_ta_busy = + {AC_SPM_TA_PERF_SEL_TA_BUSY, TA, 0xf}; +static struct ac_spm_counter_descr gfx10_tcp_perf_sel_tcp_ta_req_stall = + {AC_SPM_TCP_PERF_SEL_TCP_TA_REQ_STALL, TCP, 0x24}; static struct ac_spm_counter_create_info gfx10_spm_counters[] = { {&gfx10_tcp_perf_sel_req}, @@ -39,10 +75,41 @@ static struct ac_spm_counter_create_info gfx10_spm_counters[] = { {&gfx10_gl1c_perf_sel_req_miss}, {&gfx10_gl2c_perf_sel_req}, {&gfx10_gl2c_perf_sel_miss}, + {&gfx10_cpf_perf_sel_stat_busy}, + {&gfx10_sqc_perf_sel_lds_bank_conflict}, + {&gfx10_gl2c_perf_sel_ea_rdreq_32b}, + {&gfx10_gl2c_perf_sel_ea_rdreq_64b}, + {&gfx10_gl2c_perf_sel_ea_rdreq_96b}, + {&gfx10_gl2c_perf_sel_ea_rdreq_128b}, + {&gfx10_gl2c_perf_sel_ea_wrreq}, + {&gfx10_gl2c_perf_sel_ea_wrreq_64b}, + {&gfx10_gcea_perf_sel_sarb_dram_sized_requests}, + {&gfx10_gcea_perf_sel_sarb_io_sized_requests}, + {&gfx10_ta_perf_sel_ta_busy}, + {&gfx10_tcp_perf_sel_tcp_ta_req_stall}, }; /* GFX10.3+ */ -static struct ac_spm_counter_descr gfx103_gl2c_perf_sel_miss = {GL2C, 0x2b}; +static struct ac_spm_counter_descr gfx103_gl2c_perf_sel_miss = + {AC_SPM_GL2C_PERF_SEL_MISS, GL2C, 0x2b}; +static struct ac_spm_counter_descr gfx103_gl2c_perf_sel_ea_rdreq_32b = + {AC_SPM_GL2C_PERF_SEL_EA_RDREQ_32B, GL2C, 0x63}; +static struct ac_spm_counter_descr gfx103_gl2c_perf_sel_ea_rdreq_64b = + {AC_SPM_GL2C_PERF_SEL_EA_RDREQ_64B, GL2C, 0x64}; +static struct ac_spm_counter_descr gfx103_gl2c_perf_sel_ea_rdreq_96b = + {AC_SPM_GL2C_PERF_SEL_EA_RDREQ_96B, GL2C, 0x65}; +static struct ac_spm_counter_descr gfx103_gl2c_perf_sel_ea_rdreq_128b = + {AC_SPM_GL2C_PERF_SEL_EA_RDREQ_128B, GL2C, 0x66}; +static struct ac_spm_counter_descr gfx103_gl2c_perf_sel_ea_wrreq = + {AC_SPM_GL2C_PERF_SEL_EA_WRREQ, GL2C, 0x53}; +static struct ac_spm_counter_descr gfx103_gl2c_perf_sel_ea_wrreq_64b = + {AC_SPM_GL2C_PERF_SEL_EA_WRREQ_64B, GL2C, 0x55}; +static struct ac_spm_counter_descr gfx103_td_perf_sel_ray_tracing_bvh4_tri_node = + {AC_SPM_TD_PERF_SEL_RAY_TRACING_BVH4_TRI_NODE, TD, 0x76}; +static struct ac_spm_counter_descr gfx103_td_perf_sel_ray_tracing_bvh4_fp16_box_node = + {AC_SPM_TD_PERF_SEL_RAY_TRACING_BVH4_FP16_BOX_NODE, TD, 0x74}; +static struct ac_spm_counter_descr gfx103_td_perf_sel_ray_tracing_bvh4_fp32_box_node = + {AC_SPM_TD_PERF_SEL_RAY_TRACING_BVH4_FP32_BOX_NODE, TD, 0x75}; static struct ac_spm_counter_create_info gfx103_spm_counters[] = { {&gfx10_tcp_perf_sel_req}, @@ -57,16 +124,42 @@ static struct ac_spm_counter_create_info gfx103_spm_counters[] = { {&gfx10_gl1c_perf_sel_req_miss}, {&gfx10_gl2c_perf_sel_req}, {&gfx103_gl2c_perf_sel_miss}, + {&gfx10_cpf_perf_sel_stat_busy}, + {&gfx10_sqc_perf_sel_lds_bank_conflict}, + {&gfx103_gl2c_perf_sel_ea_rdreq_32b}, + {&gfx103_gl2c_perf_sel_ea_rdreq_64b}, + {&gfx103_gl2c_perf_sel_ea_rdreq_96b}, + {&gfx103_gl2c_perf_sel_ea_rdreq_128b}, + {&gfx103_gl2c_perf_sel_ea_wrreq}, + {&gfx103_gl2c_perf_sel_ea_wrreq_64b}, + 
{&gfx10_gcea_perf_sel_sarb_dram_sized_requests}, + {&gfx10_gcea_perf_sel_sarb_io_sized_requests}, + {&gfx10_ta_perf_sel_ta_busy}, + {&gfx10_tcp_perf_sel_tcp_ta_req_stall}, + {&gfx103_td_perf_sel_ray_tracing_bvh4_tri_node}, + {&gfx103_td_perf_sel_ray_tracing_bvh4_fp16_box_node}, + {&gfx103_td_perf_sel_ray_tracing_bvh4_fp32_box_node}, }; /* GFX11+ */ -static struct ac_spm_counter_descr gfx11_tcp_perf_sel_req_miss = {TCP, 0x11}; -static struct ac_spm_counter_descr gfx11_sqc_perf_sel_dcache_hits = {SQ_WGP, 0x126}; -static struct ac_spm_counter_descr gfx11_sqc_perf_sel_dcache_misses = {SQ_WGP, 0x127}; -static struct ac_spm_counter_descr gfx11_sqc_perf_sel_dcache_misses_duplicate = {SQ_WGP, 0x128}; -static struct ac_spm_counter_descr gfx11_sqc_perf_sel_icache_hits = {SQ_WGP, 0x10e}; -static struct ac_spm_counter_descr gfx11_sqc_perf_sel_icache_misses = {SQ_WGP, 0x10f}; -static struct ac_spm_counter_descr gfx11_sqc_perf_sel_icache_misses_duplicate = {SQ_WGP, 0x110}; +static struct ac_spm_counter_descr gfx11_tcp_perf_sel_req_miss = + {AC_SPM_TCP_PERF_SEL_REQ_MISS, TCP, 0x11}; +static struct ac_spm_counter_descr gfx11_sqc_perf_sel_dcache_hits = + {AC_SPM_SQC_PERF_SEL_DCACHE_HITS, SQ_WGP, 0x126}; +static struct ac_spm_counter_descr gfx11_sqc_perf_sel_dcache_misses = + {AC_SPM_SQC_PERF_SEL_DCACHE_MISSES, SQ_WGP, 0x127}; +static struct ac_spm_counter_descr gfx11_sqc_perf_sel_dcache_misses_duplicate = + {AC_SPM_SQC_PERF_SEL_DCACHE_MISSES_DUPLICATE, SQ_WGP, 0x128}; +static struct ac_spm_counter_descr gfx11_sqc_perf_sel_icache_hits = + {AC_SPM_SQC_PERF_SEL_ICACHE_HITS, SQ_WGP, 0x10e}; +static struct ac_spm_counter_descr gfx11_sqc_perf_sel_icache_misses = + {AC_SPM_SQC_PERF_SEL_ICACHE_MISSES, SQ_WGP, 0x10f}; +static struct ac_spm_counter_descr gfx11_sqc_perf_sel_icache_misses_duplicate = + {AC_SPM_SQC_PERF_SEL_ICACHE_MISSES_DUPLICATE, SQ_WGP, 0x110}; +static struct ac_spm_counter_descr gfx11_sqc_perf_sel_lds_bank_conflict = + {AC_SPM_SQC_PERF_SEL_LDS_BANK_CONFLICT, SQ_WGP, 0x100}; +static struct ac_spm_counter_descr gfx11_tcp_perf_sel_tcp_ta_req_stall = + {AC_SPM_TCP_PERF_SEL_TCP_TA_REQ_STALL, TCP, 0x27}; static struct ac_spm_counter_create_info gfx11_spm_counters[] = { {&gfx10_tcp_perf_sel_req}, @@ -81,16 +174,38 @@ static struct ac_spm_counter_create_info gfx11_spm_counters[] = { {&gfx10_gl1c_perf_sel_req_miss}, {&gfx10_gl2c_perf_sel_req}, {&gfx103_gl2c_perf_sel_miss}, + {&gfx10_cpf_perf_sel_stat_busy}, + {&gfx11_sqc_perf_sel_lds_bank_conflict}, + {&gfx103_gl2c_perf_sel_ea_rdreq_32b}, + {&gfx103_gl2c_perf_sel_ea_rdreq_64b}, + {&gfx103_gl2c_perf_sel_ea_rdreq_96b}, + {&gfx103_gl2c_perf_sel_ea_rdreq_128b}, + {&gfx103_gl2c_perf_sel_ea_wrreq}, + {&gfx103_gl2c_perf_sel_ea_wrreq_64b}, + {&gfx10_gcea_perf_sel_sarb_dram_sized_requests}, + {&gfx10_gcea_perf_sel_sarb_io_sized_requests}, + {&gfx10_ta_perf_sel_ta_busy}, + {&gfx11_tcp_perf_sel_tcp_ta_req_stall}, + {&gfx103_td_perf_sel_ray_tracing_bvh4_tri_node}, + {&gfx103_td_perf_sel_ray_tracing_bvh4_fp16_box_node}, + {&gfx103_td_perf_sel_ray_tracing_bvh4_fp32_box_node}, }; /* GFX12+ */ -static struct ac_spm_counter_descr gfx12_sqc_perf_sel_dcache_hits = {SQ_WGP, 0x146}; -static struct ac_spm_counter_descr gfx12_sqc_perf_sel_dcache_misses = {SQ_WGP, 0x147}; -static struct ac_spm_counter_descr gfx12_sqc_perf_sel_dcache_misses_duplicate = {SQ_WGP, 0x148}; -static struct ac_spm_counter_descr gfx12_sqc_perf_sel_icache_hits = {SQ_WGP, 0x12e}; -static struct ac_spm_counter_descr gfx12_sqc_perf_sel_icache_misses = {SQ_WGP, 0x12f}; -static struct ac_spm_counter_descr 
gfx12_sqc_perf_sel_icache_misses_duplicate = {SQ_WGP, 0x130}; -static struct ac_spm_counter_descr gfx12_gl2c_perf_sel_miss = {GL2C, 0x2a}; +static struct ac_spm_counter_descr gfx12_sqc_perf_sel_dcache_hits = + {AC_SPM_SQC_PERF_SEL_DCACHE_HITS, SQ_WGP, 0x146}; +static struct ac_spm_counter_descr gfx12_sqc_perf_sel_dcache_misses = + {AC_SPM_SQC_PERF_SEL_DCACHE_MISSES, SQ_WGP, 0x147}; +static struct ac_spm_counter_descr gfx12_sqc_perf_sel_dcache_misses_duplicate = + {AC_SPM_SQC_PERF_SEL_DCACHE_MISSES_DUPLICATE, SQ_WGP, 0x148}; +static struct ac_spm_counter_descr gfx12_sqc_perf_sel_icache_hits = + {AC_SPM_SQC_PERF_SEL_ICACHE_HITS, SQ_WGP, 0x12e}; +static struct ac_spm_counter_descr gfx12_sqc_perf_sel_icache_misses = + {AC_SPM_SQC_PERF_SEL_ICACHE_MISSES, SQ_WGP, 0x12f}; +static struct ac_spm_counter_descr gfx12_sqc_perf_sel_icache_misses_duplicate = + {AC_SPM_SQC_PERF_SEL_ICACHE_MISSES_DUPLICATE, SQ_WGP, 0x130}; +static struct ac_spm_counter_descr gfx12_gl2c_perf_sel_miss = + {AC_SPM_GL2C_PERF_SEL_MISS, GL2C, 0x2a}; static struct ac_spm_counter_create_info gfx12_spm_counters[] = { {&gfx10_tcp_perf_sel_req}, @@ -165,14 +280,18 @@ ac_spm_init_instance_mapping(const struct radeon_info *info, /* Per-SA blocks. */ assert(block->b->b->gpu_block == GL1C || block->b->b->gpu_block == TCP || - block->b->b->gpu_block == SQ_WGP); + block->b->b->gpu_block == SQ_WGP || + block->b->b->gpu_block == TA || + block->b->b->gpu_block == TD); se_index = (counter->instance / block->num_instances) / info->max_sa_per_se; sa_index = (counter->instance / block->num_instances) % info->max_sa_per_se; instance_index = counter->instance % block->num_instances; } } else { /* Global blocks. */ - assert(block->b->b->gpu_block == GL2C); + assert(block->b->b->gpu_block == GL2C || + block->b->b->gpu_block == CPF || + block->b->b->gpu_block == GCEA); instance_index = counter->instance; } @@ -353,7 +472,8 @@ ac_spm_map_counter(struct ac_spm *spm, struct ac_spm_block_select *block_sel, S_037008_PERF_MODE3(0); break; default: - return false; + /* Try to program the new counter slot. */ + continue; } /* Mark this 16-bit counter as used. */ @@ -406,6 +526,7 @@ ac_spm_add_counter(const struct radeon_info *info, counter = &spm->counters[spm->num_counters]; spm->num_counters++; + counter->id = counter_info->b->id; counter->gpu_block = counter_info->b->gpu_block; counter->event_id = counter_info->b->event_id; counter->instance = counter_info->instance; @@ -527,8 +648,10 @@ bool ac_init_spm(const struct radeon_info *info, for (unsigned i = 0; i < create_info_count; i++) { const struct ac_pc_block *block = ac_pc_get_block(pc, create_info[i].b->gpu_block); - if (!block) + if (!block) { + fprintf(stderr, "ac/spm: Unknown group.\n"); return false; + } num_counters += block->num_global_instances; } @@ -541,6 +664,8 @@ bool ac_init_spm(const struct radeon_info *info, const struct ac_pc_block *block = ac_pc_get_block(pc, create_info[i].b->gpu_block); struct ac_spm_counter_create_info counter = create_info[i]; + assert(block->num_global_instances > 0); + for (unsigned j = 0; j < block->num_global_instances; j++) { counter.instance = j; @@ -690,6 +815,789 @@ bool ac_spm_get_trace(const struct ac_spm *spm, struct ac_spm_trace *trace) return ac_spm_get_num_samples(spm, &trace->num_samples); } +/* SPM components. */ +/* Instruction cache components. 
*/ +static struct ac_spm_derived_component_descr gfx10_inst_cache_request_count_comp = { + .id = AC_SPM_COMPONENT_INST_CACHE_REQUEST_COUNT, + .counter_id = AC_SPM_COUNTER_INST_CACHE_HIT, + .name = "Requests", + .usage = AC_SPM_USAGE_ITEMS, +}; + +static struct ac_spm_derived_component_descr gfx10_inst_cache_hit_count_comp = { + .id = AC_SPM_COMPONENT_INST_CACHE_HIT_COUNT, + .counter_id = AC_SPM_COUNTER_INST_CACHE_HIT, + .name = "Hits", + .usage = AC_SPM_USAGE_ITEMS, +}; + +static struct ac_spm_derived_component_descr gfx10_inst_cache_miss_count_comp = { + .id = AC_SPM_COMPONENT_INST_CACHE_MISS_COUNT, + .counter_id = AC_SPM_COUNTER_INST_CACHE_HIT, + .name = "Misses", + .usage = AC_SPM_USAGE_ITEMS, +}; + +/* Scalar cache components. */ +static struct ac_spm_derived_component_descr gfx10_scalar_cache_request_count_comp = { + .id = AC_SPM_COMPONENT_SCALAR_CACHE_REQUEST_COUNT, + .counter_id = AC_SPM_COUNTER_SCALAR_CACHE_HIT, + .name = "Requests", + .usage = AC_SPM_USAGE_ITEMS, +}; + +static struct ac_spm_derived_component_descr gfx10_scalar_cache_hit_count_comp = { + .id = AC_SPM_COMPONENT_SCALAR_CACHE_HIT_COUNT, + .counter_id = AC_SPM_COUNTER_SCALAR_CACHE_HIT, + .name = "Hits", + .usage = AC_SPM_USAGE_ITEMS, +}; + +static struct ac_spm_derived_component_descr gfx10_scalar_cache_miss_count_comp = { + .id = AC_SPM_COMPONENT_SCALAR_CACHE_MISS_COUNT, + .counter_id = AC_SPM_COUNTER_SCALAR_CACHE_HIT, + .name = "Misses", + .usage = AC_SPM_USAGE_ITEMS, +}; + +/* L0 cache components. */ +static struct ac_spm_derived_component_descr gfx10_l0_cache_request_count_comp = { + .id = AC_SPM_COMPONENT_L0_CACHE_REQUEST_COUNT, + .counter_id = AC_SPM_COUNTER_L0_CACHE_HIT, + .name = "Requests", + .usage = AC_SPM_USAGE_ITEMS, +}; + +static struct ac_spm_derived_component_descr gfx10_l0_cache_hit_count_comp = { + .id = AC_SPM_COMPONENT_L0_CACHE_HIT_COUNT, + .counter_id = AC_SPM_COUNTER_L0_CACHE_HIT, + .name = "Hits", + .usage = AC_SPM_USAGE_ITEMS, +}; + +static struct ac_spm_derived_component_descr gfx10_l0_cache_miss_count_comp = { + .id = AC_SPM_COMPONENT_L0_CACHE_MISS_COUNT, + .counter_id = AC_SPM_COUNTER_L0_CACHE_HIT, + .name = "Misses", + .usage = AC_SPM_USAGE_ITEMS, +}; + +/* L1 cache components. */ +static struct ac_spm_derived_component_descr gfx10_l1_cache_request_count_comp = { + .id = AC_SPM_COMPONENT_L1_CACHE_REQUEST_COUNT, + .counter_id = AC_SPM_COUNTER_L1_CACHE_HIT, + .name = "Requests", + .usage = AC_SPM_USAGE_ITEMS, +}; + +static struct ac_spm_derived_component_descr gfx10_l1_cache_hit_count_comp = { + .id = AC_SPM_COMPONENT_L1_CACHE_HIT_COUNT, + .counter_id = AC_SPM_COUNTER_L1_CACHE_HIT, + .name = "Hits", + .usage = AC_SPM_USAGE_ITEMS, +}; + +static struct ac_spm_derived_component_descr gfx10_l1_cache_miss_count_comp = { + .id = AC_SPM_COMPONENT_L1_CACHE_MISS_COUNT, + .counter_id = AC_SPM_COUNTER_L1_CACHE_HIT, + .name = "Misses", + .usage = AC_SPM_USAGE_ITEMS, +}; + +/* L2 cache components. 
*/ +static struct ac_spm_derived_component_descr gfx10_l2_cache_request_count_comp = { + .id = AC_SPM_COMPONENT_L2_CACHE_REQUEST_COUNT, + .counter_id = AC_SPM_COUNTER_L2_CACHE_HIT, + .name = "Requests", + .usage = AC_SPM_USAGE_ITEMS, +}; + +static struct ac_spm_derived_component_descr gfx10_l2_cache_hit_count_comp = { + .id = AC_SPM_COMPONENT_L2_CACHE_HIT_COUNT, + .counter_id = AC_SPM_COUNTER_L2_CACHE_HIT, + .name = "Hits", + .usage = AC_SPM_USAGE_ITEMS, +}; + +static struct ac_spm_derived_component_descr gfx10_l2_cache_miss_count_comp = { + .id = AC_SPM_COMPONENT_L2_CACHE_MISS_COUNT, + .counter_id = AC_SPM_COUNTER_L2_CACHE_HIT, + .name = "Misses", + .usage = AC_SPM_USAGE_ITEMS, +}; + +static struct ac_spm_derived_component_descr gfx10_gpu_busy_cycles_comp = { + .id = AC_SPM_COMPONENT_GPU_BUSY_CYCLES, + .counter_id = AC_SPM_COUNTER_CS_LDS_BANK_CONFLICT, + .name = "Gpu Busy Cycles", + .usage = AC_SPM_USAGE_CYCLES, +}; + +static struct ac_spm_derived_component_descr gfx10_cs_lds_bank_conflict_cycles_comp = { + .id = AC_SPM_COMPONENT_CS_LDS_BANK_CONFLICT_CYCLES, + .counter_id = AC_SPM_COUNTER_CS_LDS_BANK_CONFLICT, + .name = "LDS Busy Cycles", + .usage = AC_SPM_USAGE_CYCLES, +}; + +static struct ac_spm_derived_component_descr gfx10_mem_unit_busy_cycles_comp = { + .id = AC_SPM_COMPONENT_MEM_UNIT_BUSY_CYCLES, + .counter_id = AC_SPM_COUNTER_MEM_UNIT_BUSY, + .name = "Memory unit busy cycles", + .usage = AC_SPM_USAGE_CYCLES, +}; + +static struct ac_spm_derived_component_descr gfx10_mem_unit_stalled_cycles_comp = { + .id = AC_SPM_COMPONENT_MEM_UNIT_STALLED_CYCLES, + .counter_id = AC_SPM_COUNTER_MEM_UNIT_STALLED, + .name = "Memory unit stalled cycles", + .usage = AC_SPM_USAGE_CYCLES, +}; + +/* SPM counters. */ +static struct ac_spm_derived_counter_descr gfx10_inst_cache_hit_counter = { + .id = AC_SPM_COUNTER_INST_CACHE_HIT, + .group_id = AC_SPM_GROUP_CACHE, + .name = "Instruction cache hit", + .desc = "The percentage of read requests made that hit the data in the " + "Instruction cache. The Instruction cache supplies shader code to an " + "executing shader. Each request is 64 bytes in size. Value range: 0% " + "(no hit) to 100% (optimal).", + .usage = AC_SPM_USAGE_PERCENTAGE, + .num_components = 3, + .components = { + &gfx10_inst_cache_request_count_comp, + &gfx10_inst_cache_hit_count_comp, + &gfx10_inst_cache_miss_count_comp, + }, +}; + +static struct ac_spm_derived_counter_descr gfx10_scalar_cache_hit_counter = { + .id = AC_SPM_COUNTER_SCALAR_CACHE_HIT, + .group_id = AC_SPM_GROUP_CACHE, + .name = "Scalar cache hit", + .desc = "The percentage of read requests made from executing shader code " + "that hit the data in the Scalar cache. The Scalar cache contains data " + "that does not vary in each thread across the wavefront. Each request is " + "64 bytes in size. Value range: 0% (no hit) to 100% (optimal).", + .usage = AC_SPM_USAGE_PERCENTAGE, + .num_components = 3, + .components = { + &gfx10_scalar_cache_request_count_comp, + &gfx10_scalar_cache_hit_count_comp, + &gfx10_scalar_cache_miss_count_comp, + }, +}; + +static struct ac_spm_derived_counter_descr gfx10_l0_cache_hit_counter = { + .id = AC_SPM_COUNTER_L0_CACHE_HIT, + .group_id = AC_SPM_GROUP_CACHE, + .name = "L0 cache hit", + .desc = "The percentage of read requests that hit the data in the L0 cache. " + "The L0 cache contains vector data, which is data that may vary in each " + "thread across the wavefront. Each request is 128 bytes in size. 
Value " + "range: 0% (no hit) to 100% (optimal).", + .usage = AC_SPM_USAGE_PERCENTAGE, + .num_components = 3, + .components = { + &gfx10_l0_cache_request_count_comp, + &gfx10_l0_cache_hit_count_comp, + &gfx10_l0_cache_miss_count_comp, + }, +}; + +static struct ac_spm_derived_counter_descr gfx10_l1_cache_hit_counter = { + .id = AC_SPM_COUNTER_L1_CACHE_HIT, + .group_id = AC_SPM_GROUP_CACHE, + .name = "L1 cache hit", + .desc = "The percentage of read or write requests that hit the data in the " + "L1 cache. The L1 cache is shared across all WGPs in a single shader " + "engine. Each request is 128 bytes in size. Value range: 0% (no hit) to " + "100% (optimal).", + .usage = AC_SPM_USAGE_PERCENTAGE, + .num_components = 3, + .components = { + &gfx10_l1_cache_request_count_comp, + &gfx10_l1_cache_hit_count_comp, + &gfx10_l1_cache_miss_count_comp, + }, +}; + +static struct ac_spm_derived_counter_descr gfx10_l2_cache_hit_counter = { + .id = AC_SPM_COUNTER_L2_CACHE_HIT, + .group_id = AC_SPM_GROUP_CACHE, + .name = "L2 cache hit", + .desc = "The percentage of read or write requests that hit the data in the " + "L2 cache. The L2 cache is shared by many blocks across the GPU, " + "including the Command Processor, Geometry Engine, all WGPs, all Render " + "Backends, and others. Each request is 128 bytes in size. Value range: 0% " + "(no hit) to 100% (optimal).", + .usage = AC_SPM_USAGE_PERCENTAGE, + .num_components = 3, + .components = { + &gfx10_l2_cache_request_count_comp, + &gfx10_l2_cache_hit_count_comp, + &gfx10_l2_cache_miss_count_comp, + }, +}; + +static struct ac_spm_derived_counter_descr gfx10_cs_lds_bank_conflict_counter = { + .id = AC_SPM_COUNTER_CS_LDS_BANK_CONFLICT, + .group_id = AC_SPM_GROUP_LDS, + .name = "LDS Bank Conflict", + .desc = "The percentage of GPUTime LDS is stalled by bank conflicts. Value " + "range: 0% (optimal) to 100% (bad).", + .usage = AC_SPM_USAGE_PERCENTAGE, + .num_components = 2, + .components = { + &gfx10_gpu_busy_cycles_comp, + &gfx10_cs_lds_bank_conflict_cycles_comp, + }, +}; + +static struct ac_spm_derived_counter_descr gfx10_fetch_size_counter = { + .id = AC_SPM_COUNTER_FETCH_SIZE, + .group_id = AC_SPM_GROUP_MEMORY_BYTES, + .name = "Fetch size", + .desc = "The total bytes fetched from the video memory. This is measured " + "with all extra fetches and any cache or memory effects taken into " + "account.", + .usage = AC_SPM_USAGE_BYTES, + .num_components = 0, +}; + +static struct ac_spm_derived_counter_descr gfx10_write_size_counter = { + .id = AC_SPM_COUNTER_WRITE_SIZE, + .group_id = AC_SPM_GROUP_MEMORY_BYTES, + .name = "Write size", + .desc = "The total bytes written to the video memory. 
This is measured with " + "all extra fetches and any cache or memory effects taken into account.", + .usage = AC_SPM_USAGE_BYTES, + .num_components = 0, +}; + +static struct ac_spm_derived_counter_descr gfx10_local_vid_mem_bytes_counter = { + .id = AC_SPM_COUNTER_LOCAL_VID_MEM_BYTES, + .group_id = AC_SPM_GROUP_MEMORY_BYTES, + .name = "Local video memory bytes", + .desc = "Number of bytes read from or written to the Infinity Cache (if " + "available) or local video memory", + .usage = AC_SPM_USAGE_BYTES, + .num_components = 0, +}; + +static struct ac_spm_derived_counter_descr gfx10_pcie_bytes_counter = { + .id = AC_SPM_COUNTER_PCIE_BYTES, + .group_id = AC_SPM_GROUP_MEMORY_BYTES, + .name = "PCIe bytes", + .desc = "Number of bytes sent and received over the PCIe bus", + .usage = AC_SPM_USAGE_BYTES, + .num_components = 0, +}; + +static struct ac_spm_derived_counter_descr gfx10_mem_unit_busy_counter = { + .id = AC_SPM_COUNTER_MEM_UNIT_BUSY, + .group_id = AC_SPM_GROUP_MEMORY_PERCENTAGE, + .name = "Memory unit busy", + .desc = "The percentage of GPUTime the memory unit is active. The result " + "includes the stall time (MemUnitStalled). This is measured with all " + "extra fetches and writes and any cache or memory effects taken into " + "account. Value range: 0% to 100% (fetch-bound).", + .usage = AC_SPM_USAGE_PERCENTAGE, + .num_components = 2, + .components = { + &gfx10_gpu_busy_cycles_comp, + &gfx10_mem_unit_busy_cycles_comp, + }, +}; + +static struct ac_spm_derived_counter_descr gfx10_mem_unit_stalled_counter = { + .id = AC_SPM_COUNTER_MEM_UNIT_STALLED, + .group_id = AC_SPM_GROUP_MEMORY_PERCENTAGE, + .name = "Memory unit stalled", + .desc = "The percentage of GPUTime the memory unit is stalled. Try reducing " + "the number or size of fetches and writes if possible. Value range: 0% " + "(optimal) to 100% (bad).", + .usage = AC_SPM_USAGE_PERCENTAGE, + .num_components = 2, + .components = { + &gfx10_gpu_busy_cycles_comp, + &gfx10_mem_unit_stalled_cycles_comp, + }, +}; + +static struct ac_spm_derived_counter_descr gfx103_ray_box_tests_counter = { + .id = AC_SPM_COUNTER_RAY_BOX_TESTS, + .group_id = AC_SPM_GROUP_RT, + .name = "Ray-box tests", + .desc = "The number of ray box intersection tests.", + .usage = AC_SPM_USAGE_ITEMS, + .num_components = 0, +}; + +static struct ac_spm_derived_counter_descr gfx103_ray_tri_tests_counter = { + .id = AC_SPM_COUNTER_RAY_TRI_TESTS, + .group_id = AC_SPM_GROUP_RT, + .name = "Ray-triangle tests", + .desc = "The number of ray triangle intersection tests.", + .usage = AC_SPM_USAGE_ITEMS, + .num_components = 0, +}; + +/* SPM groups. 
*/ +static struct ac_spm_derived_group_descr gfx10_cache_group = { + .id = AC_SPM_GROUP_CACHE, + .name = "Cache", + .num_counters = 5, + .counters = { + &gfx10_inst_cache_hit_counter, + &gfx10_scalar_cache_hit_counter, + &gfx10_l0_cache_hit_counter, + &gfx10_l1_cache_hit_counter, + &gfx10_l2_cache_hit_counter, + }, +}; + +static struct ac_spm_derived_group_descr gfx10_lds_group = { + .id = AC_SPM_GROUP_LDS, + .name = "LDS", + .num_counters = 1, + .counters = { + &gfx10_cs_lds_bank_conflict_counter, + }, +}; + +static struct ac_spm_derived_group_descr gfx10_memory_bytes_group = { + .id = AC_SPM_GROUP_MEMORY_BYTES, + .name = "Memory (bytes)", + .num_counters = 4, + .counters = { + &gfx10_fetch_size_counter, + &gfx10_write_size_counter, + &gfx10_local_vid_mem_bytes_counter, + &gfx10_pcie_bytes_counter, + }, +}; + +static struct ac_spm_derived_group_descr gfx10_memory_percentage_group = { + .id = AC_SPM_GROUP_MEMORY_PERCENTAGE, + .name = "Memory (%)", + .num_counters = 2, + .counters = { + &gfx10_mem_unit_busy_counter, + &gfx10_mem_unit_stalled_counter, + }, +}; + +static struct ac_spm_derived_group_descr gfx103_rt_group = { + .id = AC_SPM_GROUP_RT, + .name = "Ray tracing", + .num_counters = 2, + .counters = { + &gfx103_ray_box_tests_counter, + &gfx103_ray_tri_tests_counter, + }, +}; + +static struct ac_spm_derived_counter * +ac_spm_get_counter_by_id(struct ac_spm_derived_trace *spm_derived_trace, + enum ac_spm_counter_id counter_id) +{ + for (uint32_t i = 0; i < spm_derived_trace->num_counters; i++) { + struct ac_spm_derived_counter *counter = &spm_derived_trace->counters[i]; + + if (counter->descr->id == counter_id) + return counter; + } + + return NULL; +} + +static struct ac_spm_derived_component * +ac_spm_get_component_by_id(struct ac_spm_derived_trace *spm_derived_trace, + enum ac_spm_component_id component_id) +{ + for (uint32_t i = 0; i < spm_derived_trace->num_components; i++) { + struct ac_spm_derived_component *component = &spm_derived_trace->components[i]; + + if (component->descr->id == component_id) + return component; + } + + return NULL; +} + +static void +ac_spm_add_group(struct ac_spm_derived_trace *spm_derived_trace, + const struct ac_spm_derived_group_descr *group_descr) +{ + for (uint32_t i = 0; i < group_descr->num_counters; i++) { + const struct ac_spm_derived_counter_descr *counter_descr = + group_descr->counters[i]; + + for (uint32_t j = 0; j < counter_descr->num_components; j++) { + /* Avoid redundant components. 
*/ + if (ac_spm_get_component_by_id(spm_derived_trace, + counter_descr->components[j]->id)) + continue; + + struct ac_spm_derived_component *component = + &spm_derived_trace->components[spm_derived_trace->num_components++]; + assert(spm_derived_trace->num_components <= AC_SPM_COMPONENT_COUNT); + + component->descr = counter_descr->components[j]; + } + + struct ac_spm_derived_counter *counter = + &spm_derived_trace->counters[spm_derived_trace->num_counters++]; + assert(spm_derived_trace->num_counters <= AC_SPM_COUNTER_COUNT); + counter->descr = counter_descr; + } + + struct ac_spm_derived_group *group = + &spm_derived_trace->groups[spm_derived_trace->num_groups++]; + assert(spm_derived_trace->num_groups <= AC_SPM_GROUP_COUNT); + group->descr = group_descr; +} + +static enum ac_spm_raw_counter_op +ac_spm_get_raw_counter_op(enum ac_spm_raw_counter_id id) +{ + switch (id) { + case AC_SPM_TCP_PERF_SEL_REQ: + case AC_SPM_TCP_PERF_SEL_REQ_MISS: + case AC_SPM_SQC_PERF_SEL_DCACHE_HITS: + case AC_SPM_SQC_PERF_SEL_DCACHE_MISSES: + case AC_SPM_SQC_PERF_SEL_DCACHE_MISSES_DUPLICATE: + case AC_SPM_SQC_PERF_SEL_ICACHE_HITS: + case AC_SPM_SQC_PERF_SEL_ICACHE_MISSES: + case AC_SPM_SQC_PERF_SEL_ICACHE_MISSES_DUPLICATE: + case AC_SPM_GL1C_PERF_SEL_REQ: + case AC_SPM_GL1C_PERF_SEL_REQ_MISS: + case AC_SPM_GL2C_PERF_SEL_REQ: + case AC_SPM_GL2C_PERF_SEL_MISS: + case AC_SPM_CPF_PERF_SEL_STAT_BUSY: + case AC_SPM_SQC_PERF_SEL_LDS_BANK_CONFLICT: + case AC_SPM_GL2C_PERF_SEL_EA_RDREQ_32B: + case AC_SPM_GL2C_PERF_SEL_EA_RDREQ_64B: + case AC_SPM_GL2C_PERF_SEL_EA_RDREQ_96B: + case AC_SPM_GL2C_PERF_SEL_EA_RDREQ_128B: + case AC_SPM_GL2C_PERF_SEL_EA_WRREQ: + case AC_SPM_GL2C_PERF_SEL_EA_WRREQ_64B: + case AC_SPM_GCEA_PERF_SEL_SARB_DRAM_SIZED_REQUESTS: + case AC_SPM_GCEA_PERF_SEL_SARB_IO_SIZED_REQUESTS: + case AC_SPM_TD_PERF_SEL_RAY_TRACING_BVH4_TRI_NODE: + case AC_SPM_TD_PERF_SEL_RAY_TRACING_BVH4_FP16_BOX_NODE: + case AC_SPM_TD_PERF_SEL_RAY_TRACING_BVH4_FP32_BOX_NODE: + return AC_SPM_RAW_COUNTER_OP_SUM; + case AC_SPM_TA_PERF_SEL_TA_BUSY: + case AC_SPM_TCP_PERF_SEL_TCP_TA_REQ_STALL: + return AC_SPM_RAW_COUNTER_OP_MAX; + default: + UNREACHABLE("Invalid SPM raw counter ID."); + } +} + +struct ac_spm_derived_trace * +ac_spm_get_derived_trace(const struct radeon_info *info, + const struct ac_spm_trace *spm_trace) +{ + uint32_t sample_size_in_bytes = spm_trace->sample_size_in_bytes; + uint8_t *spm_data_ptr = (uint8_t *)spm_trace->ptr; + struct ac_spm_derived_trace *spm_derived_trace; + + spm_derived_trace = calloc(1, sizeof(*spm_derived_trace)); + if (!spm_derived_trace) + return NULL; + + /* Add groups to the trace. */ + ac_spm_add_group(spm_derived_trace, &gfx10_cache_group); + ac_spm_add_group(spm_derived_trace, &gfx10_lds_group); + ac_spm_add_group(spm_derived_trace, &gfx10_memory_bytes_group); + ac_spm_add_group(spm_derived_trace, &gfx10_memory_percentage_group); + if (info->gfx_level >= GFX10_3) + ac_spm_add_group(spm_derived_trace, &gfx103_rt_group); + + spm_derived_trace->timestamps = malloc(spm_trace->num_samples * sizeof(uint64_t)); + if (!spm_derived_trace->timestamps) { + free(spm_derived_trace); + return NULL; + } + + /* Skip the reserved 32 bytes of data at beginning. */ + spm_data_ptr += 32; + + /* Collect timestamps. 
*/ + uint64_t sample_size_in_qwords = sample_size_in_bytes / sizeof(uint64_t); + uint64_t *timestamp_ptr = (uint64_t *)spm_data_ptr; + + for (uint32_t i = 0; i < spm_trace->num_samples; i++) { + uint64_t index = i * sample_size_in_qwords; + uint64_t timestamp = timestamp_ptr[index]; + + spm_derived_trace->timestamps[i] = timestamp; + } + + /* Collect raw counter values. */ + uint64_t *raw_counter_values[AC_SPM_RAW_COUNTER_ID_COUNT]; + for (uint32_t i = 0; i < AC_SPM_RAW_COUNTER_ID_COUNT; i++) { + raw_counter_values[i] = calloc(spm_trace->num_samples, sizeof(uint64_t)); + } + + const uint32_t sample_size_in_hwords = sample_size_in_bytes / sizeof(uint16_t); + const uint16_t *counter_values_ptr = (uint16_t *)spm_data_ptr; + + for (uint32_t c = 0; c < spm_trace->num_counters; c++) { + const uint64_t offset = spm_trace->counters[c].offset; + const uint32_t id = spm_trace->counters[c].id; + const enum ac_spm_raw_counter_op op = ac_spm_get_raw_counter_op(id); + + for (uint32_t s = 0; s < spm_trace->num_samples; s++) { + const uint64_t index = offset + (s * sample_size_in_hwords); + const uint16_t value = counter_values_ptr[index]; + + switch (op) { + case AC_SPM_RAW_COUNTER_OP_SUM: + raw_counter_values[id][s] += value; + break; + case AC_SPM_RAW_COUNTER_OP_MAX: + raw_counter_values[id][s] = MAX2(raw_counter_values[id][s], value); + break; + default: + UNREACHABLE("Invalid SPM raw counter OP.\n"); + } + } + } + +#define GET_COMPONENT(n) \ + struct ac_spm_derived_component *_##n = \ + ac_spm_get_component_by_id(spm_derived_trace, AC_SPM_COMPONENT_##n); +#define GET_COUNTER(n) \ + struct ac_spm_derived_counter *_##n = \ + ac_spm_get_counter_by_id(spm_derived_trace, AC_SPM_COUNTER_##n); + + GET_COUNTER(INST_CACHE_HIT); + GET_COUNTER(SCALAR_CACHE_HIT); + GET_COUNTER(L0_CACHE_HIT); + GET_COUNTER(L1_CACHE_HIT); + GET_COUNTER(L2_CACHE_HIT); + GET_COUNTER(CS_LDS_BANK_CONFLICT); + GET_COUNTER(FETCH_SIZE); + GET_COUNTER(WRITE_SIZE); + GET_COUNTER(LOCAL_VID_MEM_BYTES); + GET_COUNTER(PCIE_BYTES); + GET_COUNTER(MEM_UNIT_BUSY); + GET_COUNTER(MEM_UNIT_STALLED); + GET_COUNTER(RAY_BOX_TESTS); + GET_COUNTER(RAY_TRI_TESTS); + + GET_COMPONENT(INST_CACHE_REQUEST_COUNT); + GET_COMPONENT(INST_CACHE_HIT_COUNT); + GET_COMPONENT(INST_CACHE_MISS_COUNT); + GET_COMPONENT(SCALAR_CACHE_REQUEST_COUNT); + GET_COMPONENT(SCALAR_CACHE_HIT_COUNT); + GET_COMPONENT(SCALAR_CACHE_MISS_COUNT); + GET_COMPONENT(L0_CACHE_REQUEST_COUNT); + GET_COMPONENT(L0_CACHE_HIT_COUNT); + GET_COMPONENT(L0_CACHE_MISS_COUNT); + GET_COMPONENT(L1_CACHE_REQUEST_COUNT); + GET_COMPONENT(L1_CACHE_HIT_COUNT); + GET_COMPONENT(L1_CACHE_MISS_COUNT); + GET_COMPONENT(L2_CACHE_REQUEST_COUNT); + GET_COMPONENT(L2_CACHE_HIT_COUNT); + GET_COMPONENT(L2_CACHE_MISS_COUNT); + GET_COMPONENT(GPU_BUSY_CYCLES); + GET_COMPONENT(CS_LDS_BANK_CONFLICT_CYCLES); + GET_COMPONENT(MEM_UNIT_BUSY_CYCLES); + GET_COMPONENT(MEM_UNIT_STALLED_CYCLES); + +#undef GET_COMPONENT +#undef GET_COUNTER + +#define ADD(id, value) \ + util_dynarray_append(&_##id->values, (double)(value)); + +#define OP_RAW(n) \ + raw_counter_values[AC_SPM_##n][s] +#define OP_SUM2(a, b) \ + raw_counter_values[AC_SPM_##a][s] + \ + raw_counter_values[AC_SPM_##b][s] +#define OP_SUM3(a, b, c) \ + raw_counter_values[AC_SPM_##a][s] + \ + raw_counter_values[AC_SPM_##b][s] + \ + raw_counter_values[AC_SPM_##c][s] +#define OP_SUB2(a, b) \ + raw_counter_values[AC_SPM_##a][s] - \ + raw_counter_values[AC_SPM_##b][s] + + const uint32_t num_simds = info->num_cu * info->num_simd_per_compute_unit; + + for (uint32_t s = 0; s < 
spm_trace->num_samples; s++) { + /* Cache group. */ + /* Instruction cache. */ + const double inst_cache_request_count = + OP_SUM3(SQC_PERF_SEL_ICACHE_HITS, SQC_PERF_SEL_ICACHE_MISSES, SQC_PERF_SEL_ICACHE_MISSES_DUPLICATE); + const double inst_cache_hit_count = + OP_RAW(SQC_PERF_SEL_ICACHE_HITS); + const double inst_cache_miss_count = + OP_SUM2(SQC_PERF_SEL_ICACHE_MISSES, SQC_PERF_SEL_ICACHE_MISSES_DUPLICATE); + const double inst_cache_hit = + inst_cache_request_count ? (inst_cache_hit_count / inst_cache_request_count) * 100.0f : 0.0f; + + ADD(INST_CACHE_REQUEST_COUNT, inst_cache_request_count); + ADD(INST_CACHE_HIT_COUNT, inst_cache_hit_count); + ADD(INST_CACHE_MISS_COUNT, inst_cache_miss_count); + ADD(INST_CACHE_HIT, inst_cache_hit); + + /* Scalar cache. */ + const double scalar_cache_request_count = + OP_SUM3(SQC_PERF_SEL_DCACHE_HITS, SQC_PERF_SEL_DCACHE_MISSES, SQC_PERF_SEL_DCACHE_MISSES_DUPLICATE); + const double scalar_cache_hit_count = + OP_RAW(SQC_PERF_SEL_DCACHE_HITS); + const double scalar_cache_miss_count = + OP_SUM2(SQC_PERF_SEL_DCACHE_MISSES, SQC_PERF_SEL_DCACHE_MISSES_DUPLICATE); + const double scalar_cache_hit = + scalar_cache_request_count ? (scalar_cache_hit_count / scalar_cache_request_count) * 100.0f : 0.0f; + + ADD(SCALAR_CACHE_REQUEST_COUNT, scalar_cache_request_count); + ADD(SCALAR_CACHE_HIT_COUNT, scalar_cache_hit_count); + ADD(SCALAR_CACHE_MISS_COUNT, scalar_cache_miss_count); + ADD(SCALAR_CACHE_HIT, scalar_cache_hit); + + /* L0 cache. */ + const double l0_cache_request_count = OP_RAW(TCP_PERF_SEL_REQ); + const double l0_cache_hit_count = OP_SUB2(TCP_PERF_SEL_REQ, TCP_PERF_SEL_REQ_MISS); + const double l0_cache_miss_count = OP_RAW(TCP_PERF_SEL_REQ_MISS); + const double l0_cache_hit = + l0_cache_request_count ? (l0_cache_hit_count / l0_cache_request_count) * 100.0f : 0.0f; + + ADD(L0_CACHE_REQUEST_COUNT, l0_cache_request_count); + ADD(L0_CACHE_HIT_COUNT, l0_cache_hit_count); + ADD(L0_CACHE_MISS_COUNT, l0_cache_miss_count); + ADD(L0_CACHE_HIT, l0_cache_hit); + + /* L1 cache. */ + const double l1_cache_request_count = OP_RAW(GL1C_PERF_SEL_REQ); + const double l1_cache_hit_count = OP_SUB2(GL1C_PERF_SEL_REQ, GL1C_PERF_SEL_REQ_MISS); + const double l1_cache_miss_count = OP_RAW(GL1C_PERF_SEL_REQ_MISS); + const double l1_cache_hit = + l1_cache_request_count ? (l1_cache_hit_count / l1_cache_request_count) * 100.0f : 0.0f; + + ADD(L1_CACHE_REQUEST_COUNT, l1_cache_request_count); + ADD(L1_CACHE_HIT_COUNT, l1_cache_hit_count); + ADD(L1_CACHE_MISS_COUNT, l1_cache_miss_count); + ADD(L1_CACHE_HIT, l1_cache_hit); + + /* L2 cache. */ + const double l2_cache_request_count = OP_RAW(GL2C_PERF_SEL_REQ); + const double l2_cache_hit_count = OP_SUB2(GL2C_PERF_SEL_REQ, GL2C_PERF_SEL_MISS); + const double l2_cache_miss_count = OP_RAW(GL2C_PERF_SEL_MISS); + const double l2_cache_hit = + l2_cache_request_count ? (l2_cache_hit_count / l2_cache_request_count) * 100.0f : 0.0f; + + ADD(L2_CACHE_REQUEST_COUNT, l2_cache_request_count); + ADD(L2_CACHE_HIT_COUNT, l2_cache_hit_count); + ADD(L2_CACHE_MISS_COUNT, l2_cache_miss_count); + ADD(L2_CACHE_HIT, l2_cache_hit); + + /* LDS group */ + /* CS LDS Bank Conflict. */ + const double gpu_busy_cycles = OP_RAW(CPF_PERF_SEL_STAT_BUSY); + const double cs_lds_bank_conflict_cycles = OP_RAW(SQC_PERF_SEL_LDS_BANK_CONFLICT) / (double)num_simds; + const double cs_lds_bank_conflict = + gpu_busy_cycles ? 
(cs_lds_bank_conflict_cycles / gpu_busy_cycles) * 100.0f : 0.0f; + + ADD(GPU_BUSY_CYCLES, gpu_busy_cycles); + ADD(CS_LDS_BANK_CONFLICT_CYCLES, cs_lds_bank_conflict_cycles); + ADD(CS_LDS_BANK_CONFLICT, cs_lds_bank_conflict); + + /* Memory (bytes) group. */ + /* Fetch size. */ + double fetch_size = OP_RAW(GL2C_PERF_SEL_EA_RDREQ_32B) * 32 + + OP_RAW(GL2C_PERF_SEL_EA_RDREQ_64B) * 64 + + OP_RAW(GL2C_PERF_SEL_EA_RDREQ_96B) * 96 + + OP_RAW(GL2C_PERF_SEL_EA_RDREQ_128B) * 128; + + ADD(FETCH_SIZE, fetch_size); + + /* Write size. */ + const double write_size = (OP_RAW(GL2C_PERF_SEL_EA_WRREQ) * 32 + + OP_RAW(GL2C_PERF_SEL_EA_WRREQ_64B) * 64) - + (OP_RAW(GL2C_PERF_SEL_EA_WRREQ_64B) * 32); + + ADD(WRITE_SIZE, write_size); + + /* Local video mem bytes. */ + const double local_vid_mem_bytes = OP_RAW(GCEA_PERF_SEL_SARB_DRAM_SIZED_REQUESTS) * 32; + + ADD(LOCAL_VID_MEM_BYTES, local_vid_mem_bytes); + + /* PCIe bytes. */ + const double pcie_bytes = OP_RAW(GCEA_PERF_SEL_SARB_IO_SIZED_REQUESTS) * 32; + + ADD(PCIE_BYTES, pcie_bytes); + + /* Memory (percentage) group. */ + /* Memory unit busy. */ + const double mem_unit_busy_cycles = OP_RAW(TA_PERF_SEL_TA_BUSY); + const double mem_unit_busy = + gpu_busy_cycles ? (mem_unit_busy_cycles / gpu_busy_cycles) * 100.0f : 0.0f; + + ADD(MEM_UNIT_BUSY_CYCLES, mem_unit_busy_cycles); + ADD(MEM_UNIT_BUSY, mem_unit_busy); + + /* Memory unit stalled. */ + const double mem_unit_stalled_cycles = OP_RAW(TCP_PERF_SEL_TCP_TA_REQ_STALL); + const double mem_unit_stalled = + gpu_busy_cycles ? (mem_unit_stalled_cycles / gpu_busy_cycles) * 100.0f : 0.f; + + ADD(MEM_UNIT_STALLED_CYCLES, mem_unit_stalled_cycles); + ADD(MEM_UNIT_STALLED, mem_unit_stalled); + + /* Raytracing group. */ + /* Ray box tests. */ + const double ray_box_tests = OP_RAW(TD_PERF_SEL_RAY_TRACING_BVH4_FP16_BOX_NODE) + + OP_RAW(TD_PERF_SEL_RAY_TRACING_BVH4_FP32_BOX_NODE); + + ADD(RAY_BOX_TESTS, ray_box_tests); + + /* Ray triangle tests. 
*/ + const double ray_tri_tests = OP_RAW(TD_PERF_SEL_RAY_TRACING_BVH4_TRI_NODE); + + ADD(RAY_TRI_TESTS, ray_tri_tests); + } + +#undef ADD +#undef OP_RAW +#undef OP_SUM2 +#undef OP_SUM3 +#undef OP_SUB2 + + spm_derived_trace->num_timestamps = spm_trace->num_samples; + spm_derived_trace->sample_interval = spm_trace->sample_interval; + + for (uint32_t i = 0; i < AC_SPM_RAW_COUNTER_ID_COUNT; i++) + free(raw_counter_values[i]); + + return spm_derived_trace; +} + +void +ac_spm_destroy_derived_trace(struct ac_spm_derived_trace *spm_derived_trace) +{ + for (uint32_t i = 0; i < spm_derived_trace->num_components; i++) { + struct ac_spm_derived_component *component = &spm_derived_trace->components[i]; + util_dynarray_fini(&component->values); + } + + for (uint32_t i = 0; i < spm_derived_trace->num_counters; i++) { + struct ac_spm_derived_counter *counter = &spm_derived_trace->counters[i]; + util_dynarray_fini(&counter->values); + } + + free(spm_derived_trace); +} + static void ac_emit_spm_muxsel(struct ac_cmdbuf *cs, enum amd_gfx_level gfx_level, enum amd_ip_type ip_type, const struct ac_spm *spm) diff --git a/src/amd/common/ac_spm.h b/src/amd/common/ac_spm.h index 27b76736b23..a4a539f4834 100644 --- a/src/amd/common/ac_spm.h +++ b/src/amd/common/ac_spm.h @@ -11,6 +11,8 @@ #include "ac_perfcounter.h" +#include "util/u_dynarray.h" + struct ac_cmdbuf; #define AC_SPM_MAX_COUNTER_PER_BLOCK 16 @@ -35,7 +37,7 @@ enum ac_spm_global_block { AC_SPM_GLOBAL_BLOCK_GL2C, AC_SPM_GLOBAL_BLOCK_SDMA, AC_SPM_GLOBAL_BLOCK_GUS, - AC_SPM_GLOBAL_BLOCK_EA, + AC_SPM_GLOBAL_BLOCK_GCEA, AC_SPM_GLOBAL_BLOCK_CHA, AC_SPM_GLOBAL_BLOCK_CHC, AC_SPM_GLOBAL_BLOCK_CHCG, @@ -86,7 +88,44 @@ enum ac_spm_segment_type { AC_SPM_SEGMENT_TYPE_COUNT, }; +enum ac_spm_raw_counter_id { + AC_SPM_TCP_PERF_SEL_REQ = 0, + AC_SPM_TCP_PERF_SEL_REQ_MISS, + AC_SPM_SQC_PERF_SEL_DCACHE_HITS, + AC_SPM_SQC_PERF_SEL_DCACHE_MISSES, + AC_SPM_SQC_PERF_SEL_DCACHE_MISSES_DUPLICATE, + AC_SPM_SQC_PERF_SEL_ICACHE_HITS, + AC_SPM_SQC_PERF_SEL_ICACHE_MISSES, + AC_SPM_SQC_PERF_SEL_ICACHE_MISSES_DUPLICATE, + AC_SPM_GL1C_PERF_SEL_REQ, + AC_SPM_GL1C_PERF_SEL_REQ_MISS, + AC_SPM_GL2C_PERF_SEL_REQ, + AC_SPM_GL2C_PERF_SEL_MISS, + AC_SPM_CPF_PERF_SEL_STAT_BUSY, + AC_SPM_SQC_PERF_SEL_LDS_BANK_CONFLICT, + AC_SPM_GL2C_PERF_SEL_EA_RDREQ_32B, + AC_SPM_GL2C_PERF_SEL_EA_RDREQ_64B, + AC_SPM_GL2C_PERF_SEL_EA_RDREQ_96B, + AC_SPM_GL2C_PERF_SEL_EA_RDREQ_128B, + AC_SPM_GL2C_PERF_SEL_EA_WRREQ, + AC_SPM_GL2C_PERF_SEL_EA_WRREQ_64B, + AC_SPM_GCEA_PERF_SEL_SARB_DRAM_SIZED_REQUESTS, + AC_SPM_GCEA_PERF_SEL_SARB_IO_SIZED_REQUESTS, + AC_SPM_TA_PERF_SEL_TA_BUSY, + AC_SPM_TCP_PERF_SEL_TCP_TA_REQ_STALL, + AC_SPM_TD_PERF_SEL_RAY_TRACING_BVH4_TRI_NODE, + AC_SPM_TD_PERF_SEL_RAY_TRACING_BVH4_FP16_BOX_NODE, + AC_SPM_TD_PERF_SEL_RAY_TRACING_BVH4_FP32_BOX_NODE, + AC_SPM_RAW_COUNTER_ID_COUNT, +}; + +enum ac_spm_raw_counter_op { + AC_SPM_RAW_COUNTER_OP_SUM = 0, + AC_SPM_RAW_COUNTER_OP_MAX, +}; + struct ac_spm_counter_descr { + enum ac_spm_raw_counter_id id; enum ac_pc_gpu_block gpu_block; uint32_t event_id; }; @@ -119,6 +158,7 @@ struct ac_spm_muxsel_line { struct ac_spm_counter_info { /* General info. 
*/ + enum ac_spm_raw_counter_id id; enum ac_pc_gpu_block gpu_block; uint32_t instance; uint32_t event_id; @@ -194,6 +234,122 @@ struct ac_spm_trace { uint32_t num_samples; }; +enum ac_spm_group_id { + AC_SPM_GROUP_CACHE, + AC_SPM_GROUP_LDS, + AC_SPM_GROUP_MEMORY_BYTES, + AC_SPM_GROUP_MEMORY_PERCENTAGE, + AC_SPM_GROUP_RT, + AC_SPM_GROUP_COUNT, +}; + +enum ac_spm_counter_id { + AC_SPM_COUNTER_INST_CACHE_HIT, + AC_SPM_COUNTER_SCALAR_CACHE_HIT, + AC_SPM_COUNTER_L0_CACHE_HIT, + AC_SPM_COUNTER_L1_CACHE_HIT, /* < GFX12 */ + AC_SPM_COUNTER_L2_CACHE_HIT, + AC_SPM_COUNTER_CS_LDS_BANK_CONFLICT, + AC_SPM_COUNTER_FETCH_SIZE, + AC_SPM_COUNTER_WRITE_SIZE, + AC_SPM_COUNTER_LOCAL_VID_MEM_BYTES, + AC_SPM_COUNTER_PCIE_BYTES, + AC_SPM_COUNTER_MEM_UNIT_BUSY, + AC_SPM_COUNTER_MEM_UNIT_STALLED, + AC_SPM_COUNTER_RAY_BOX_TESTS, + AC_SPM_COUNTER_RAY_TRI_TESTS, + AC_SPM_COUNTER_COUNT, +}; + +enum ac_spm_component_id { + AC_SPM_COMPONENT_INST_CACHE_REQUEST_COUNT, + AC_SPM_COMPONENT_INST_CACHE_HIT_COUNT, + AC_SPM_COMPONENT_INST_CACHE_MISS_COUNT, + AC_SPM_COMPONENT_SCALAR_CACHE_REQUEST_COUNT, + AC_SPM_COMPONENT_SCALAR_CACHE_HIT_COUNT, + AC_SPM_COMPONENT_SCALAR_CACHE_MISS_COUNT, + AC_SPM_COMPONENT_L0_CACHE_REQUEST_COUNT, + AC_SPM_COMPONENT_L0_CACHE_HIT_COUNT, + AC_SPM_COMPONENT_L0_CACHE_MISS_COUNT, + AC_SPM_COMPONENT_L1_CACHE_REQUEST_COUNT, /* < GFX12 */ + AC_SPM_COMPONENT_L1_CACHE_HIT_COUNT, /* < GFX12 */ + AC_SPM_COMPONENT_L1_CACHE_MISS_COUNT, /* < GFX12 */ + AC_SPM_COMPONENT_L2_CACHE_REQUEST_COUNT, + AC_SPM_COMPONENT_L2_CACHE_HIT_COUNT, + AC_SPM_COMPONENT_L2_CACHE_MISS_COUNT, + AC_SPM_COMPONENT_GPU_BUSY_CYCLES, + AC_SPM_COMPONENT_CS_LDS_BANK_CONFLICT_CYCLES, + AC_SPM_COMPONENT_MEM_UNIT_BUSY_CYCLES, + AC_SPM_COMPONENT_MEM_UNIT_STALLED_CYCLES, + AC_SPM_COMPONENT_COUNT, +}; + +enum ac_spm_usage_type { + AC_SPM_USAGE_PERCENTAGE = 1, + AC_SPM_USAGE_CYCLES = 2, + AC_SPM_USAGE_BYTES = 4, + AC_SPM_USAGE_ITEMS = 5, +}; + +#define AC_SPM_MAX_COMPONENTS_PER_COUNTER 3 +#define AC_SPM_MAX_COUNTERS_PER_GROUP 5 + +struct ac_spm_derived_component_descr { + enum ac_spm_component_id id; + enum ac_spm_counter_id counter_id; + const char *name; + enum ac_spm_usage_type usage; +}; + +struct ac_spm_derived_counter_descr { + enum ac_spm_counter_id id; + enum ac_spm_group_id group_id; + const char *name; + const char *desc; + enum ac_spm_usage_type usage; + uint32_t num_components; + struct ac_spm_derived_component_descr *components[AC_SPM_MAX_COMPONENTS_PER_COUNTER]; +}; + +struct ac_spm_derived_group_descr { + enum ac_spm_group_id id; + const char *name; + uint32_t num_counters; + struct ac_spm_derived_counter_descr *counters[AC_SPM_MAX_COUNTERS_PER_GROUP]; +}; + +struct ac_spm_derived_group { + const struct ac_spm_derived_group_descr *descr; +}; + +struct ac_spm_derived_counter { + const struct ac_spm_derived_counter_descr *descr; + + struct util_dynarray values; +}; + +struct ac_spm_derived_component { + const struct ac_spm_derived_component_descr *descr; + + struct util_dynarray values; +}; + +struct ac_spm_derived_trace { + uint32_t num_timestamps; + uint64_t *timestamps; + + uint32_t num_groups; + struct ac_spm_derived_group groups[AC_SPM_GROUP_COUNT]; + + uint32_t num_counters; + struct ac_spm_derived_counter counters[AC_SPM_COUNTER_COUNT]; + + uint32_t num_components; + struct ac_spm_derived_component components[AC_SPM_COMPONENT_COUNT]; + + uint32_t sample_interval; +}; + bool ac_init_spm(const struct radeon_info *info, const struct ac_perfcounters *pc, struct ac_spm *spm); @@ -201,6 +357,13 @@ void ac_destroy_spm(struct ac_spm 
*spm); bool ac_spm_get_trace(const struct ac_spm *spm, struct ac_spm_trace *trace); +struct ac_spm_derived_trace * +ac_spm_get_derived_trace(const struct radeon_info *info, + const struct ac_spm_trace *spm_trace); + +void +ac_spm_destroy_derived_trace(struct ac_spm_derived_trace *spm_derived_trace); + void ac_emit_spm_setup(struct ac_cmdbuf *cs, enum amd_gfx_level gfx_level, enum amd_ip_type ip_type, const struct ac_spm *spm, diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c index c9c34e9d0b5..8c7110d6bf9 100644 --- a/src/amd/vulkan/radv_cmd_buffer.c +++ b/src/amd/vulkan/radv_cmd_buffer.c @@ -3981,6 +3981,7 @@ radv_emit_graphics_pipeline(struct radv_cmd_buffer *cmd_buffer) { struct radv_graphics_pipeline *pipeline = cmd_buffer->state.graphics_pipeline; struct radv_device *device = radv_cmd_buffer_device(cmd_buffer); + struct radv_cmd_stream *cs = cmd_buffer->cs; if (cmd_buffer->state.emitted_graphics_pipeline == pipeline) return; @@ -3990,6 +3991,8 @@ radv_emit_graphics_pipeline(struct radv_cmd_buffer *cmd_buffer) if (pipeline->sqtt_shaders_reloc) { /* Emit shaders relocation because RGP requires them to be contiguous in memory. */ radv_sqtt_emit_relocated_shaders(cmd_buffer, pipeline); + + radv_cs_add_buffer(device->ws, cs->b, pipeline->sqtt_shaders_reloc->bo); } if (radv_device_fault_detection_enabled(device))
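For readers reconstructing the math above: once the 16-bit SPM deltas have been reduced into per-sample totals (summed across block instances, or MAX-ed for the busy/stall selects), every derived counter is plain arithmetic over those totals. The standalone sketch below is illustrative only and not part of the patch; the struct and helper names (raw_sample, l0_cache_hit_percent, fetch_size_bytes) are made up here, and it merely mirrors two of the formulas used in ac_spm_get_derived_trace(): L0 cache hit % from TCP_PERF_SEL_REQ/REQ_MISS, and FetchSize from the size-weighted GL2C EA_RDREQ selects.

/* Hypothetical, self-contained illustration of the derived-counter math. */
#include <stdint.h>
#include <stdio.h>

/* Per-sample raw totals, already reduced across block instances. */
struct raw_sample {
   uint64_t tcp_req;            /* AC_SPM_TCP_PERF_SEL_REQ */
   uint64_t tcp_req_miss;       /* AC_SPM_TCP_PERF_SEL_REQ_MISS */
   uint64_t gl2c_ea_rdreq_32b;  /* AC_SPM_GL2C_PERF_SEL_EA_RDREQ_32B */
   uint64_t gl2c_ea_rdreq_64b;  /* AC_SPM_GL2C_PERF_SEL_EA_RDREQ_64B */
   uint64_t gl2c_ea_rdreq_96b;  /* AC_SPM_GL2C_PERF_SEL_EA_RDREQ_96B */
   uint64_t gl2c_ea_rdreq_128b; /* AC_SPM_GL2C_PERF_SEL_EA_RDREQ_128B */
};

/* L0 cache hit %: hits are requests minus misses, as in the L0 block above. */
static double l0_cache_hit_percent(const struct raw_sample *s)
{
   const double requests = (double)s->tcp_req;
   const double hits = (double)s->tcp_req - (double)s->tcp_req_miss;

   return requests ? (hits / requests) * 100.0 : 0.0;
}

/* FetchSize in bytes: each EA read request is weighted by its size class. */
static double fetch_size_bytes(const struct raw_sample *s)
{
   return (double)s->gl2c_ea_rdreq_32b * 32 +
          (double)s->gl2c_ea_rdreq_64b * 64 +
          (double)s->gl2c_ea_rdreq_96b * 96 +
          (double)s->gl2c_ea_rdreq_128b * 128;
}

int main(void)
{
   /* Made-up sample values, for illustration only. */
   const struct raw_sample s = {
      .tcp_req = 1000, .tcp_req_miss = 150,
      .gl2c_ea_rdreq_32b = 10, .gl2c_ea_rdreq_64b = 20,
      .gl2c_ea_rdreq_96b = 0, .gl2c_ea_rdreq_128b = 5,
   };

   printf("L0 cache hit: %.1f%%\n", l0_cache_hit_percent(&s)); /* 85.0% */
   printf("Fetch size: %.0f bytes\n", fetch_size_bytes(&s));   /* 2240 bytes */
   return 0;
}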