diff --git a/docs/envvars.rst b/docs/envvars.rst index b2351f23b19..f14e6936f7e 100644 --- a/docs/envvars.rst +++ b/docs/envvars.rst @@ -1676,6 +1676,11 @@ RADV driver environment variables enable/disable SQTT/RGP instruction timing (enabled by default) +.. envvar:: RADV_THREAD_TRACE_INSTRUCTION_TIMING_SE_MASK + + set the SQTT/RGP instruction timing SE mask, where bit N enables SE N + (default value is 0xFFFFFFFF, which means all SEs are included) + .. envvar:: RADV_THREAD_TRACE_QUEUE_EVENTS enable/disable SQTT/RGP queue events (enabled by default) diff --git a/src/amd/common/ac_rgp.c b/src/amd/common/ac_rgp.c index 95a95cb1d7f..6a594fafc57 100644 --- a/src/amd/common/ac_rgp.c +++ b/src/amd/common/ac_rgp.c @@ -573,7 +573,8 @@ struct sqtt_file_chunk_api_info { static_assert(sizeof(struct sqtt_file_chunk_api_info) == 560, "sqtt_file_chunk_api_info doesn't match RGP spec"); -static void ac_sqtt_fill_api_info(struct sqtt_file_chunk_api_info *chunk) +static void ac_sqtt_fill_api_info(struct sqtt_file_chunk_api_info *chunk, + uint32_t instruction_timing_se_mask) { chunk->header.chunk_id.type = SQTT_FILE_CHUNK_TYPE_API_INFO; chunk->header.chunk_id.index = 0; @@ -585,7 +586,12 @@ static void ac_sqtt_fill_api_info(struct sqtt_file_chunk_api_info *chunk) chunk->major_version = 0; chunk->minor_version = 0; chunk->profiling_mode = SQTT_PROFILING_MODE_PRESENT; - chunk->instruction_trace_mode = SQTT_INSTRUCTION_TRACE_DISABLED; + if (instruction_timing_se_mask) { + chunk->instruction_trace_mode = SQTT_INSTRUCTION_TRACE_FULL_FRAME; + chunk->instruction_trace_data.shader_engine_filter.mask = instruction_timing_se_mask; + } else { + chunk->instruction_trace_mode = SQTT_INSTRUCTION_TRACE_DISABLED; + } } struct sqtt_code_object_database_record { @@ -1232,7 +1238,7 @@ ac_sqtt_dump_data(const struct radeon_info *rad_info, struct ac_sqtt_trace *sqtt fwrite(&asic_info, sizeof(asic_info), 1, output); /* SQTT api chunk. 
*/ - ac_sqtt_fill_api_info(&api_info); + ac_sqtt_fill_api_info(&api_info, sqtt_trace->instruction_timing_se_mask); file_offset += sizeof(api_info); fwrite(&api_info, sizeof(api_info), 1, output); diff --git a/src/amd/common/ac_sqtt.c b/src/amd/common/ac_sqtt.c index 9f4d2414a8d..a04c79ea2cc 100644 --- a/src/amd/common/ac_sqtt.c +++ b/src/amd/common/ac_sqtt.c @@ -301,6 +301,7 @@ ac_sqtt_get_trace(struct ac_sqtt *data, const struct radeon_info *info, sqtt_trace->trace_shader_core_clock = data->trace_shader_core_clock; sqtt_trace->trace_memory_clock = data->trace_memory_clock; + sqtt_trace->instruction_timing_se_mask = data->instruction_timing_se_mask; /* Use maximum clocks when they aren't sampled. */ if (!sqtt_trace->trace_shader_core_clock) @@ -374,6 +375,9 @@ ac_sqtt_emit_start(const struct radeon_info *info, struct ac_pm4_state *pm4, if (ac_sqtt_se_is_disabled(info, se)) continue; + const bool instruction_timing_enabled = + sqtt->instruction_timing_enabled && (sqtt->instruction_timing_se_mask & (1u << se)); + /* Target SEx and SH0. */ ac_pm4_set_reg(pm4, R_030800_GRBM_GFX_INDEX, S_030800_SE_INDEX(se) | S_030800_SH_INDEX(0) | S_030800_INSTANCE_BROADCAST_WRITES(1)); @@ -406,7 +410,7 @@ ac_sqtt_emit_start(const struct radeon_info *info, struct ac_pm4_state *pm4, /* Performance counters with SQTT are considered deprecated. */ uint32_t token_exclude = 0; - if (!sqtt->instruction_timing_enabled) { + if (!instruction_timing_enabled) { /* Reduce SQTT traffic when instruction timing isn't enabled. */ token_exclude |= V_0367B8_TOKEN_EXCLUDE_VMEMEXEC | V_0367B8_TOKEN_EXCLUDE_ALUEXEC | V_0367B8_TOKEN_EXCLUDE_VALUINST | V_0367B8_TOKEN_EXCLUDE_IMMEDIATE | @@ -447,7 +451,7 @@ ac_sqtt_emit_start(const struct radeon_info *info, struct ac_pm4_state *pm4, /* Performance counters with SQTT are considered deprecated. 
*/ uint32_t token_exclude = V_008D18_TOKEN_EXCLUDE_PERF; - if (!sqtt->instruction_timing_enabled) { + if (!instruction_timing_enabled) { /* Reduce SQTT traffic when instruction timing isn't enabled. */ token_exclude |= V_008D18_TOKEN_EXCLUDE_VMEMEXEC | V_008D18_TOKEN_EXCLUDE_ALUEXEC | V_008D18_TOKEN_EXCLUDE_VALUINST | V_008D18_TOKEN_EXCLUDE_IMMEDIATE | diff --git a/src/amd/common/ac_sqtt.h b/src/amd/common/ac_sqtt.h index dffcf39460e..8de4a101e64 100644 --- a/src/amd/common/ac_sqtt.h +++ b/src/amd/common/ac_sqtt.h @@ -45,6 +45,7 @@ struct ac_sqtt { int start_frame; char *trigger_file; bool instruction_timing_enabled; + uint32_t instruction_timing_se_mask; /* Shader/memory clock frequencies in Mhz sampled at trace time. */ uint32_t trace_shader_core_clock; @@ -94,6 +95,7 @@ struct ac_sqtt_trace { uint32_t trace_shader_core_clock; uint32_t trace_memory_clock; + uint32_t instruction_timing_se_mask; uint32_t num_traces; struct ac_sqtt_data_se traces[SQTT_MAX_TRACES]; diff --git a/src/amd/vulkan/tools/radv_sqtt.c b/src/amd/vulkan/tools/radv_sqtt.c index c7579bfdac8..d75e45ab522 100644 --- a/src/amd/vulkan/tools/radv_sqtt.c +++ b/src/amd/vulkan/tools/radv_sqtt.c @@ -26,6 +26,12 @@ radv_is_instruction_timing_enabled(void) return debug_get_bool_option("RADV_THREAD_TRACE_INSTRUCTION_TIMING", true); } +static uint32_t +radv_get_instruction_timing_se_mask(void) +{ + return (uint32_t)debug_get_num_option("RADV_THREAD_TRACE_INSTRUCTION_TIMING_SE_MASK", ~0u); +} + bool radv_sqtt_queue_events_enabled(void) { @@ -401,6 +407,8 @@ radv_sqtt_init(struct radv_device *device) return false; device->sqtt.instruction_timing_enabled = radv_is_instruction_timing_enabled(); + device->sqtt.instruction_timing_se_mask = + device->sqtt.instruction_timing_enabled ? radv_get_instruction_timing_se_mask() : 0; /* Whether to use a staging buffer for faster reads on dGPUs. 
*/ device->rgp_use_staging_buffer = pdev->info.has_dedicated_vram; diff --git a/src/gallium/drivers/radeonsi/gfx/si_sqtt.c b/src/gallium/drivers/radeonsi/gfx/si_sqtt.c index 33e1b3b67c5..40fbedea59f 100644 --- a/src/gallium/drivers/radeonsi/gfx/si_sqtt.c +++ b/src/gallium/drivers/radeonsi/gfx/si_sqtt.c @@ -357,6 +357,10 @@ bool si_init_sqtt(struct si_context *sctx) sctx->sqtt->instruction_timing_enabled = debug_get_bool_option("AMD_THREAD_TRACE_INSTRUCTION_TIMING", true); + sctx->sqtt->instruction_timing_se_mask = + sctx->sqtt->instruction_timing_enabled + ? (uint32_t)debug_get_num_option("AMD_THREAD_TRACE_INSTRUCTION_TIMING_SE_MASK", ~0u) + : 0; sctx->sqtt->start_frame = 10; const char *trigger = os_get_option("AMD_THREAD_TRACE_TRIGGER");