radv/sqtt: add instruction timing SE mask controls
Some checks are pending
macOS-CI / macOS-CI (dri) (push) Waiting to run
macOS-CI / macOS-CI (xlib) (push) Waiting to run

Add configurable SE masks for instruction timing capture and export the selected mask in RGP metadata so hit counts match the traced shader engine coverage.
The environment variable RADV_THREAD_TRACE_INSTRUCTION_TIMING_SE_MASK configures the SE mask. If it is not specified, data from all SEs is captured.

Signed-off-by: Gu, Wangfeng <Wangfeng.Gu@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/42264>
This commit is contained in:
Gu, Wangfeng 2026-06-16 11:16:45 +08:00 committed by Marge Bot
parent 1e687cb162
commit dac0019373
6 changed files with 34 additions and 5 deletions

View file

@ -1676,6 +1676,11 @@ RADV driver environment variables
enable/disable SQTT/RGP instruction timing (enabled by default)
.. envvar:: RADV_THREAD_TRACE_INSTRUCTION_TIMING_SE_MASK
set the SQTT/RGP instruction timing SE (shader engine) mask, where bit N
enables SE N (default value is 0xFFFFFFFF, which means all SEs are included)
.. envvar:: RADV_THREAD_TRACE_QUEUE_EVENTS
enable/disable SQTT/RGP queue events (enabled by default)

View file

@ -573,7 +573,8 @@ struct sqtt_file_chunk_api_info {
static_assert(sizeof(struct sqtt_file_chunk_api_info) == 560,
"sqtt_file_chunk_api_info doesn't match RGP spec");
static void ac_sqtt_fill_api_info(struct sqtt_file_chunk_api_info *chunk)
static void ac_sqtt_fill_api_info(struct sqtt_file_chunk_api_info *chunk,
uint32_t instruction_timing_se_mask)
{
chunk->header.chunk_id.type = SQTT_FILE_CHUNK_TYPE_API_INFO;
chunk->header.chunk_id.index = 0;
@ -585,7 +586,12 @@ static void ac_sqtt_fill_api_info(struct sqtt_file_chunk_api_info *chunk)
chunk->major_version = 0;
chunk->minor_version = 0;
chunk->profiling_mode = SQTT_PROFILING_MODE_PRESENT;
chunk->instruction_trace_mode = SQTT_INSTRUCTION_TRACE_DISABLED;
if (instruction_timing_se_mask) {
chunk->instruction_trace_mode = SQTT_INSTRUCTION_TRACE_FULL_FRAME;
chunk->instruction_trace_data.shader_engine_filter.mask = instruction_timing_se_mask;
} else {
chunk->instruction_trace_mode = SQTT_INSTRUCTION_TRACE_DISABLED;
}
}
struct sqtt_code_object_database_record {
@ -1232,7 +1238,7 @@ ac_sqtt_dump_data(const struct radeon_info *rad_info, struct ac_sqtt_trace *sqtt
fwrite(&asic_info, sizeof(asic_info), 1, output);
/* SQTT api chunk. */
ac_sqtt_fill_api_info(&api_info);
ac_sqtt_fill_api_info(&api_info, sqtt_trace->instruction_timing_se_mask);
file_offset += sizeof(api_info);
fwrite(&api_info, sizeof(api_info), 1, output);

View file

@ -301,6 +301,7 @@ ac_sqtt_get_trace(struct ac_sqtt *data, const struct radeon_info *info,
sqtt_trace->trace_shader_core_clock = data->trace_shader_core_clock;
sqtt_trace->trace_memory_clock = data->trace_memory_clock;
sqtt_trace->instruction_timing_se_mask = data->instruction_timing_se_mask;
/* Use maximum clocks when they aren't sampled. */
if (!sqtt_trace->trace_shader_core_clock)
@ -374,6 +375,9 @@ ac_sqtt_emit_start(const struct radeon_info *info, struct ac_pm4_state *pm4,
if (ac_sqtt_se_is_disabled(info, se))
continue;
const bool instruction_timing_enabled =
sqtt->instruction_timing_enabled && (sqtt->instruction_timing_se_mask & (1u << se));
/* Target SEx and SH0. */
ac_pm4_set_reg(pm4, R_030800_GRBM_GFX_INDEX, S_030800_SE_INDEX(se) |
S_030800_SH_INDEX(0) | S_030800_INSTANCE_BROADCAST_WRITES(1));
@ -406,7 +410,7 @@ ac_sqtt_emit_start(const struct radeon_info *info, struct ac_pm4_state *pm4,
/* Performance counters with SQTT are considered deprecated. */
uint32_t token_exclude = 0;
if (!sqtt->instruction_timing_enabled) {
if (!instruction_timing_enabled) {
/* Reduce SQTT traffic when instruction timing isn't enabled. */
token_exclude |= V_0367B8_TOKEN_EXCLUDE_VMEMEXEC | V_0367B8_TOKEN_EXCLUDE_ALUEXEC |
V_0367B8_TOKEN_EXCLUDE_VALUINST | V_0367B8_TOKEN_EXCLUDE_IMMEDIATE |
@ -447,7 +451,7 @@ ac_sqtt_emit_start(const struct radeon_info *info, struct ac_pm4_state *pm4,
/* Performance counters with SQTT are considered deprecated. */
uint32_t token_exclude = V_008D18_TOKEN_EXCLUDE_PERF;
if (!sqtt->instruction_timing_enabled) {
if (!instruction_timing_enabled) {
/* Reduce SQTT traffic when instruction timing isn't enabled. */
token_exclude |= V_008D18_TOKEN_EXCLUDE_VMEMEXEC | V_008D18_TOKEN_EXCLUDE_ALUEXEC |
V_008D18_TOKEN_EXCLUDE_VALUINST | V_008D18_TOKEN_EXCLUDE_IMMEDIATE |

View file

@ -45,6 +45,7 @@ struct ac_sqtt {
int start_frame;
char *trigger_file;
bool instruction_timing_enabled;
uint32_t instruction_timing_se_mask;
/* Shader/memory clock frequencies in Mhz sampled at trace time. */
uint32_t trace_shader_core_clock;
@ -94,6 +95,7 @@ struct ac_sqtt_trace {
uint32_t trace_shader_core_clock;
uint32_t trace_memory_clock;
uint32_t instruction_timing_se_mask;
uint32_t num_traces;
struct ac_sqtt_data_se traces[SQTT_MAX_TRACES];

View file

@ -26,6 +26,12 @@ radv_is_instruction_timing_enabled(void)
return debug_get_bool_option("RADV_THREAD_TRACE_INSTRUCTION_TIMING", true);
}
/* Read the instruction timing SE mask from the
 * RADV_THREAD_TRACE_INSTRUCTION_TIMING_SE_MASK environment option.
 *
 * The option's default is ~0u (every bit set); the value is narrowed to
 * 32 bits before being returned.
 */
static uint32_t
radv_get_instruction_timing_se_mask(void)
{
   const int64_t se_mask =
      debug_get_num_option("RADV_THREAD_TRACE_INSTRUCTION_TIMING_SE_MASK", ~0u);

   return (uint32_t)se_mask;
}
bool
radv_sqtt_queue_events_enabled(void)
{
@ -401,6 +407,8 @@ radv_sqtt_init(struct radv_device *device)
return false;
device->sqtt.instruction_timing_enabled = radv_is_instruction_timing_enabled();
device->sqtt.instruction_timing_se_mask =
device->sqtt.instruction_timing_enabled ? radv_get_instruction_timing_se_mask() : 0;
/* Whether to use a staging buffer for faster reads on dGPUs. */
device->rgp_use_staging_buffer = pdev->info.has_dedicated_vram;

View file

@ -357,6 +357,10 @@ bool si_init_sqtt(struct si_context *sctx)
sctx->sqtt->instruction_timing_enabled =
debug_get_bool_option("AMD_THREAD_TRACE_INSTRUCTION_TIMING", true);
sctx->sqtt->instruction_timing_se_mask =
sctx->sqtt->instruction_timing_enabled
? (uint32_t)debug_get_num_option("AMD_THREAD_TRACE_INSTRUCTION_TIMING_SE_MASK", ~0u)
: 0;
sctx->sqtt->start_frame = 10;
const char *trigger = os_get_option("AMD_THREAD_TRACE_TRIGGER");