mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-06-21 01:38:23 +02:00
radv/sqtt: add instruction timing SE mask controls
Add configurable SE masks for instruction timing capture and export the selected mask in RGP metadata so hit counts match the traced shader engine coverage. The environment variable RADV_THREAD_TRACE_INSTRUCTION_TIMING_SE_MASK configures the SE mask. If it is not specified, data from all SEs is captured. Signed-off-by: Gu, Wangfeng <Wangfeng.Gu@amd.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/42264>
This commit is contained in:
parent
1e687cb162
commit
dac0019373
6 changed files with 34 additions and 5 deletions
|
|
@ -1676,6 +1676,11 @@ RADV driver environment variables
|
|||
|
||||
enable/disable SQTT/RGP instruction timing (enabled by default)
|
||||
|
||||
.. envvar:: RADV_THREAD_TRACE_INSTRUCTION_TIMING_SE_MASK
|
||||
|
||||
set the SQTT/RGP instruction timing SE mask (default value is 0xFFFFFFFF,
|
||||
which means all SEs are included)
|
||||
|
||||
.. envvar:: RADV_THREAD_TRACE_QUEUE_EVENTS
|
||||
|
||||
enable/disable SQTT/RGP queue events (enabled by default)
|
||||
|
|
|
|||
|
|
@ -573,7 +573,8 @@ struct sqtt_file_chunk_api_info {
|
|||
static_assert(sizeof(struct sqtt_file_chunk_api_info) == 560,
|
||||
"sqtt_file_chunk_api_info doesn't match RGP spec");
|
||||
|
||||
static void ac_sqtt_fill_api_info(struct sqtt_file_chunk_api_info *chunk)
|
||||
static void ac_sqtt_fill_api_info(struct sqtt_file_chunk_api_info *chunk,
|
||||
uint32_t instruction_timing_se_mask)
|
||||
{
|
||||
chunk->header.chunk_id.type = SQTT_FILE_CHUNK_TYPE_API_INFO;
|
||||
chunk->header.chunk_id.index = 0;
|
||||
|
|
@ -585,7 +586,12 @@ static void ac_sqtt_fill_api_info(struct sqtt_file_chunk_api_info *chunk)
|
|||
chunk->major_version = 0;
|
||||
chunk->minor_version = 0;
|
||||
chunk->profiling_mode = SQTT_PROFILING_MODE_PRESENT;
|
||||
chunk->instruction_trace_mode = SQTT_INSTRUCTION_TRACE_DISABLED;
|
||||
if (instruction_timing_se_mask) {
|
||||
chunk->instruction_trace_mode = SQTT_INSTRUCTION_TRACE_FULL_FRAME;
|
||||
chunk->instruction_trace_data.shader_engine_filter.mask = instruction_timing_se_mask;
|
||||
} else {
|
||||
chunk->instruction_trace_mode = SQTT_INSTRUCTION_TRACE_DISABLED;
|
||||
}
|
||||
}
|
||||
|
||||
struct sqtt_code_object_database_record {
|
||||
|
|
@ -1232,7 +1238,7 @@ ac_sqtt_dump_data(const struct radeon_info *rad_info, struct ac_sqtt_trace *sqtt
|
|||
fwrite(&asic_info, sizeof(asic_info), 1, output);
|
||||
|
||||
/* SQTT api chunk. */
|
||||
ac_sqtt_fill_api_info(&api_info);
|
||||
ac_sqtt_fill_api_info(&api_info, sqtt_trace->instruction_timing_se_mask);
|
||||
file_offset += sizeof(api_info);
|
||||
fwrite(&api_info, sizeof(api_info), 1, output);
|
||||
|
||||
|
|
|
|||
|
|
@ -301,6 +301,7 @@ ac_sqtt_get_trace(struct ac_sqtt *data, const struct radeon_info *info,
|
|||
|
||||
sqtt_trace->trace_shader_core_clock = data->trace_shader_core_clock;
|
||||
sqtt_trace->trace_memory_clock = data->trace_memory_clock;
|
||||
sqtt_trace->instruction_timing_se_mask = data->instruction_timing_se_mask;
|
||||
|
||||
/* Use maximum clocks when they aren't sampled. */
|
||||
if (!sqtt_trace->trace_shader_core_clock)
|
||||
|
|
@ -374,6 +375,9 @@ ac_sqtt_emit_start(const struct radeon_info *info, struct ac_pm4_state *pm4,
|
|||
if (ac_sqtt_se_is_disabled(info, se))
|
||||
continue;
|
||||
|
||||
const bool instruction_timing_enabled =
|
||||
sqtt->instruction_timing_enabled && (sqtt->instruction_timing_se_mask & (1u << se));
|
||||
|
||||
/* Target SEx and SH0. */
|
||||
ac_pm4_set_reg(pm4, R_030800_GRBM_GFX_INDEX, S_030800_SE_INDEX(se) |
|
||||
S_030800_SH_INDEX(0) | S_030800_INSTANCE_BROADCAST_WRITES(1));
|
||||
|
|
@ -406,7 +410,7 @@ ac_sqtt_emit_start(const struct radeon_info *info, struct ac_pm4_state *pm4,
|
|||
/* Performance counters with SQTT are considered deprecated. */
|
||||
uint32_t token_exclude = 0;
|
||||
|
||||
if (!sqtt->instruction_timing_enabled) {
|
||||
if (!instruction_timing_enabled) {
|
||||
/* Reduce SQTT traffic when instruction timing isn't enabled. */
|
||||
token_exclude |= V_0367B8_TOKEN_EXCLUDE_VMEMEXEC | V_0367B8_TOKEN_EXCLUDE_ALUEXEC |
|
||||
V_0367B8_TOKEN_EXCLUDE_VALUINST | V_0367B8_TOKEN_EXCLUDE_IMMEDIATE |
|
||||
|
|
@ -447,7 +451,7 @@ ac_sqtt_emit_start(const struct radeon_info *info, struct ac_pm4_state *pm4,
|
|||
/* Performance counters with SQTT are considered deprecated. */
|
||||
uint32_t token_exclude = V_008D18_TOKEN_EXCLUDE_PERF;
|
||||
|
||||
if (!sqtt->instruction_timing_enabled) {
|
||||
if (!instruction_timing_enabled) {
|
||||
/* Reduce SQTT traffic when instruction timing isn't enabled. */
|
||||
token_exclude |= V_008D18_TOKEN_EXCLUDE_VMEMEXEC | V_008D18_TOKEN_EXCLUDE_ALUEXEC |
|
||||
V_008D18_TOKEN_EXCLUDE_VALUINST | V_008D18_TOKEN_EXCLUDE_IMMEDIATE |
|
||||
|
|
|
|||
|
|
@ -45,6 +45,7 @@ struct ac_sqtt {
|
|||
int start_frame;
|
||||
char *trigger_file;
|
||||
bool instruction_timing_enabled;
|
||||
uint32_t instruction_timing_se_mask;
|
||||
|
||||
/* Shader/memory clock frequencies in MHz sampled at trace time. */
|
||||
uint32_t trace_shader_core_clock;
|
||||
|
|
@ -94,6 +95,7 @@ struct ac_sqtt_trace {
|
|||
|
||||
uint32_t trace_shader_core_clock;
|
||||
uint32_t trace_memory_clock;
|
||||
uint32_t instruction_timing_se_mask;
|
||||
|
||||
uint32_t num_traces;
|
||||
struct ac_sqtt_data_se traces[SQTT_MAX_TRACES];
|
||||
|
|
|
|||
|
|
@ -26,6 +26,12 @@ radv_is_instruction_timing_enabled(void)
|
|||
return debug_get_bool_option("RADV_THREAD_TRACE_INSTRUCTION_TIMING", true);
|
||||
}
|
||||
|
||||
static uint32_t
|
||||
radv_get_instruction_timing_se_mask(void)
|
||||
{
|
||||
return (uint32_t)debug_get_num_option("RADV_THREAD_TRACE_INSTRUCTION_TIMING_SE_MASK", ~0u);
|
||||
}
|
||||
|
||||
bool
|
||||
radv_sqtt_queue_events_enabled(void)
|
||||
{
|
||||
|
|
@ -401,6 +407,8 @@ radv_sqtt_init(struct radv_device *device)
|
|||
return false;
|
||||
|
||||
device->sqtt.instruction_timing_enabled = radv_is_instruction_timing_enabled();
|
||||
device->sqtt.instruction_timing_se_mask =
|
||||
device->sqtt.instruction_timing_enabled ? radv_get_instruction_timing_se_mask() : 0;
|
||||
|
||||
/* Whether to use a staging buffer for faster reads on dGPUs. */
|
||||
device->rgp_use_staging_buffer = pdev->info.has_dedicated_vram;
|
||||
|
|
|
|||
|
|
@ -357,6 +357,10 @@ bool si_init_sqtt(struct si_context *sctx)
|
|||
|
||||
sctx->sqtt->instruction_timing_enabled =
|
||||
debug_get_bool_option("AMD_THREAD_TRACE_INSTRUCTION_TIMING", true);
|
||||
sctx->sqtt->instruction_timing_se_mask =
|
||||
sctx->sqtt->instruction_timing_enabled
|
||||
? (uint32_t)debug_get_num_option("AMD_THREAD_TRACE_INSTRUCTION_TIMING_SE_MASK", ~0u)
|
||||
: 0;
|
||||
sctx->sqtt->start_frame = 10;
|
||||
|
||||
const char *trigger = os_get_option("AMD_THREAD_TRACE_TRIGGER");
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue