From dac0019373a2a2cb2ed8b68b066b13dd3d822ef2 Mon Sep 17 00:00:00 2001 From: "Gu, Wangfeng" Date: Tue, 16 Jun 2026 11:16:45 +0800 Subject: [PATCH] radv/sqtt: add instruction timing SE mask controls Add configurable SE masks for instruction timing capture and export the selected mask in RGP metadata so hit counts match the traced shader engine coverage. The environment variable RADV_THREAD_TRACE_INSTRUCTION_TIMING_SE_MASK configures the SE mask for RADV; RadeonSI reads the equivalent AMD_THREAD_TRACE_INSTRUCTION_TIMING_SE_MASK. If the variable is not specified, data from all SEs is captured. Signed-off-by: Gu, Wangfeng Part-of: --- docs/envvars.rst | 5 +++++ src/amd/common/ac_rgp.c | 12 +++++++++--- src/amd/common/ac_sqtt.c | 8 ++++++-- src/amd/common/ac_sqtt.h | 2 ++ src/amd/vulkan/tools/radv_sqtt.c | 8 ++++++++ src/gallium/drivers/radeonsi/gfx/si_sqtt.c | 4 ++++ 6 files changed, 34 insertions(+), 5 deletions(-) diff --git a/docs/envvars.rst b/docs/envvars.rst index b2351f23b19..f14e6936f7e 100644 --- a/docs/envvars.rst +++ b/docs/envvars.rst @@ -1676,6 +1676,11 @@ RADV driver environment variables enable/disable SQTT/RGP instruction timing (enabled by default) +.. envvar:: RADV_THREAD_TRACE_INSTRUCTION_TIMING_SE_MASK + + set the SQTT/RGP instruction timing SE mask (default value is 0xFFFFFFFF, + which means all SEs are included) + .. 
envvar:: RADV_THREAD_TRACE_QUEUE_EVENTS enable/disable SQTT/RGP queue events (enabled by default) diff --git a/src/amd/common/ac_rgp.c b/src/amd/common/ac_rgp.c index 95a95cb1d7f..6a594fafc57 100644 --- a/src/amd/common/ac_rgp.c +++ b/src/amd/common/ac_rgp.c @@ -573,7 +573,8 @@ struct sqtt_file_chunk_api_info { static_assert(sizeof(struct sqtt_file_chunk_api_info) == 560, "sqtt_file_chunk_api_info doesn't match RGP spec"); -static void ac_sqtt_fill_api_info(struct sqtt_file_chunk_api_info *chunk) +static void ac_sqtt_fill_api_info(struct sqtt_file_chunk_api_info *chunk, + uint32_t instruction_timing_se_mask) { chunk->header.chunk_id.type = SQTT_FILE_CHUNK_TYPE_API_INFO; chunk->header.chunk_id.index = 0; @@ -585,7 +586,12 @@ static void ac_sqtt_fill_api_info(struct sqtt_file_chunk_api_info *chunk) chunk->major_version = 0; chunk->minor_version = 0; chunk->profiling_mode = SQTT_PROFILING_MODE_PRESENT; - chunk->instruction_trace_mode = SQTT_INSTRUCTION_TRACE_DISABLED; + if (instruction_timing_se_mask) { + chunk->instruction_trace_mode = SQTT_INSTRUCTION_TRACE_FULL_FRAME; + chunk->instruction_trace_data.shader_engine_filter.mask = instruction_timing_se_mask; + } else { + chunk->instruction_trace_mode = SQTT_INSTRUCTION_TRACE_DISABLED; + } } struct sqtt_code_object_database_record { @@ -1232,7 +1238,7 @@ ac_sqtt_dump_data(const struct radeon_info *rad_info, struct ac_sqtt_trace *sqtt fwrite(&asic_info, sizeof(asic_info), 1, output); /* SQTT api chunk. 
*/ - ac_sqtt_fill_api_info(&api_info); + ac_sqtt_fill_api_info(&api_info, sqtt_trace->instruction_timing_se_mask); file_offset += sizeof(api_info); fwrite(&api_info, sizeof(api_info), 1, output); diff --git a/src/amd/common/ac_sqtt.c b/src/amd/common/ac_sqtt.c index 9f4d2414a8d..a04c79ea2cc 100644 --- a/src/amd/common/ac_sqtt.c +++ b/src/amd/common/ac_sqtt.c @@ -301,6 +301,7 @@ ac_sqtt_get_trace(struct ac_sqtt *data, const struct radeon_info *info, sqtt_trace->trace_shader_core_clock = data->trace_shader_core_clock; sqtt_trace->trace_memory_clock = data->trace_memory_clock; + sqtt_trace->instruction_timing_se_mask = data->instruction_timing_se_mask; /* Use maximum clocks when they aren't sampled. */ if (!sqtt_trace->trace_shader_core_clock) @@ -374,6 +375,9 @@ ac_sqtt_emit_start(const struct radeon_info *info, struct ac_pm4_state *pm4, if (ac_sqtt_se_is_disabled(info, se)) continue; + const bool instruction_timing_enabled = + sqtt->instruction_timing_enabled && (sqtt->instruction_timing_se_mask & (1u << se)); + /* Target SEx and SH0. */ ac_pm4_set_reg(pm4, R_030800_GRBM_GFX_INDEX, S_030800_SE_INDEX(se) | S_030800_SH_INDEX(0) | S_030800_INSTANCE_BROADCAST_WRITES(1)); @@ -406,7 +410,7 @@ ac_sqtt_emit_start(const struct radeon_info *info, struct ac_pm4_state *pm4, /* Performance counters with SQTT are considered deprecated. */ uint32_t token_exclude = 0; - if (!sqtt->instruction_timing_enabled) { + if (!instruction_timing_enabled) { /* Reduce SQTT traffic when instruction timing isn't enabled. */ token_exclude |= V_0367B8_TOKEN_EXCLUDE_VMEMEXEC | V_0367B8_TOKEN_EXCLUDE_ALUEXEC | V_0367B8_TOKEN_EXCLUDE_VALUINST | V_0367B8_TOKEN_EXCLUDE_IMMEDIATE | @@ -447,7 +451,7 @@ ac_sqtt_emit_start(const struct radeon_info *info, struct ac_pm4_state *pm4, /* Performance counters with SQTT are considered deprecated. 
*/ uint32_t token_exclude = V_008D18_TOKEN_EXCLUDE_PERF; - if (!sqtt->instruction_timing_enabled) { + if (!instruction_timing_enabled) { /* Reduce SQTT traffic when instruction timing isn't enabled. */ token_exclude |= V_008D18_TOKEN_EXCLUDE_VMEMEXEC | V_008D18_TOKEN_EXCLUDE_ALUEXEC | V_008D18_TOKEN_EXCLUDE_VALUINST | V_008D18_TOKEN_EXCLUDE_IMMEDIATE | diff --git a/src/amd/common/ac_sqtt.h b/src/amd/common/ac_sqtt.h index dffcf39460e..8de4a101e64 100644 --- a/src/amd/common/ac_sqtt.h +++ b/src/amd/common/ac_sqtt.h @@ -45,6 +45,7 @@ struct ac_sqtt { int start_frame; char *trigger_file; bool instruction_timing_enabled; + uint32_t instruction_timing_se_mask; /* Shader/memory clock frequencies in Mhz sampled at trace time. */ uint32_t trace_shader_core_clock; @@ -94,6 +95,7 @@ struct ac_sqtt_trace { uint32_t trace_shader_core_clock; uint32_t trace_memory_clock; + uint32_t instruction_timing_se_mask; uint32_t num_traces; struct ac_sqtt_data_se traces[SQTT_MAX_TRACES]; diff --git a/src/amd/vulkan/tools/radv_sqtt.c b/src/amd/vulkan/tools/radv_sqtt.c index c7579bfdac8..d75e45ab522 100644 --- a/src/amd/vulkan/tools/radv_sqtt.c +++ b/src/amd/vulkan/tools/radv_sqtt.c @@ -26,6 +26,12 @@ radv_is_instruction_timing_enabled(void) return debug_get_bool_option("RADV_THREAD_TRACE_INSTRUCTION_TIMING", true); } +static uint32_t +radv_get_instruction_timing_se_mask(void) +{ + return (uint32_t)debug_get_num_option("RADV_THREAD_TRACE_INSTRUCTION_TIMING_SE_MASK", ~0u); +} + bool radv_sqtt_queue_events_enabled(void) { @@ -401,6 +407,8 @@ radv_sqtt_init(struct radv_device *device) return false; device->sqtt.instruction_timing_enabled = radv_is_instruction_timing_enabled(); + device->sqtt.instruction_timing_se_mask = + device->sqtt.instruction_timing_enabled ? radv_get_instruction_timing_se_mask() : 0; /* Whether to use a staging buffer for faster reads on dGPUs. 
*/ device->rgp_use_staging_buffer = pdev->info.has_dedicated_vram; diff --git a/src/gallium/drivers/radeonsi/gfx/si_sqtt.c b/src/gallium/drivers/radeonsi/gfx/si_sqtt.c index 33e1b3b67c5..40fbedea59f 100644 --- a/src/gallium/drivers/radeonsi/gfx/si_sqtt.c +++ b/src/gallium/drivers/radeonsi/gfx/si_sqtt.c @@ -357,6 +357,10 @@ bool si_init_sqtt(struct si_context *sctx) sctx->sqtt->instruction_timing_enabled = debug_get_bool_option("AMD_THREAD_TRACE_INSTRUCTION_TIMING", true); + sctx->sqtt->instruction_timing_se_mask = + sctx->sqtt->instruction_timing_enabled + ? (uint32_t)debug_get_num_option("AMD_THREAD_TRACE_INSTRUCTION_TIMING_SE_MASK", ~0u) + : 0; sctx->sqtt->start_frame = 10; const char *trigger = os_get_option("AMD_THREAD_TRACE_TRIGGER");