intel/perf: add eu stall sampling support
Xe2+ GPUs support the EU stall sampling perf debug feature. This feature allows the driver to collect counts of, and reasons for, EU stalls on the GPU. Stall data is cross-referenced with the instruction IP address within individual shaders, so it is possible to know which instructions in which shaders are generating stalls. This should be a very useful feature for debugging the performance of slow shaders.

Reviewed-by: José Roberto de Souza <jose.souza@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30142>
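Editor's note: for context, here is a minimal consumer sketch of the API this commit adds. It is not part of the commit; it assumes a valid devinfo/drm_fd pair from the usual intel_device_info/DRM setup, an already-initialized result->accumulator hash table, and a hypothetical min_event_count of 1, with error handling trimmed:

#include <unistd.h>        /* close() */
#include "intel_perf.h"    /* hypothetical include path */

static void
sample_eu_stalls(struct intel_device_info *devinfo, int drm_fd,
                 struct intel_perf_query_eustall_result *result)
{
   int rate = intel_perf_eustall_stream_sample_rate(devinfo, drm_fd);
   int record_size = intel_perf_eustall_stream_record_size(devinfo, drm_fd);
   if (rate <= 0 || record_size <= 0)
      return; /* unsupported device or KMD query failed */

   int fd = intel_perf_eustall_stream_open(devinfo, drm_fd, (uint32_t)rate,
                                           1 /* min_event_count, arbitrary */);
   if (fd < 0)
      return;

   uint8_t buf[64 * 1024];
   bool overflow;

   intel_perf_eustall_stream_set_state(devinfo, fd, true);
   /* ... run the workload being profiled ... */
   int len = intel_perf_eustall_stream_read_samples(devinfo, fd, buf,
                                                    sizeof(buf), &overflow);
   if (len > 0)
      intel_perf_eustall_accumulate_results(result, buf, buf + len,
                                            record_size);
   intel_perf_eustall_stream_set_state(devinfo, fd, false);
   close(fd);
}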
parent d6a379f7a7
commit 2a828c35a1

4 changed files with 336 additions and 0 deletions
src/intel/perf/intel_perf.c

@@ -1651,3 +1651,64 @@ intel_perf_stream_set_metrics_id(struct intel_perf_config *perf_config,
      return -1;
   }
}

int
intel_perf_eustall_stream_open(struct intel_device_info *devinfo, int drm_fd,
                               uint32_t sample_rate, uint32_t min_event_count)
{
   if (devinfo->ver >= 20 &&
       devinfo->kmd_type == INTEL_KMD_TYPE_XE)
      return xe_perf_eustall_stream_open(drm_fd, sample_rate,
                                         min_event_count);
   return -1;
}

int
intel_perf_eustall_stream_set_state(struct intel_device_info *devinfo,
                                    int perf_stream_fd, bool enable)
{
   if (devinfo->ver >= 20 &&
       devinfo->kmd_type == INTEL_KMD_TYPE_XE)
      return xe_perf_stream_set_state(perf_stream_fd, enable);
   return -1;
}

int
intel_perf_eustall_stream_record_size(struct intel_device_info *devinfo,
                                      int drm_fd)
{
   if (devinfo->ver >= 20 &&
       devinfo->kmd_type == INTEL_KMD_TYPE_XE)
      return xe_perf_eustall_stream_record_size(drm_fd);
   return -1;
}

int
intel_perf_eustall_stream_sample_rate(struct intel_device_info *devinfo,
                                      int drm_fd)
{
   if (devinfo->ver >= 20 &&
       devinfo->kmd_type == INTEL_KMD_TYPE_XE)
      return xe_perf_eustall_stream_sample_rate(drm_fd);
   return -1;
}

int
intel_perf_eustall_stream_read_samples(struct intel_device_info *devinfo,
                                       int perf_stream_fd, uint8_t *buffer,
                                       size_t buffer_len, bool *overflow)
{
   if (devinfo->ver >= 20 &&
       devinfo->kmd_type == INTEL_KMD_TYPE_XE)
      return xe_perf_eustall_stream_read_samples(perf_stream_fd, buffer,
                                                 buffer_len, overflow);
   return -1;
}

void
intel_perf_eustall_accumulate_results(struct intel_perf_query_eustall_result *result,
                                      const void *start, const void *end,
                                      size_t record_size)
{
   return xe_perf_eustall_accumulate_results(result, start, end, record_size);
}
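Editor's note on the pattern above: each wrapper returns -1 when the device is pre-Xe2 or not running on the Xe KMD, so callers can probe for support cheaply. A hypothetical helper (not part of the commit):

/* EU stall sampling is usable only if the wrapper neither bails out
 * with -1 (unsupported) nor fails the KMD query (negative errno). */
static bool
eustall_supported(struct intel_device_info *devinfo, int drm_fd)
{
   return intel_perf_eustall_stream_record_size(devinfo, drm_fd) > 0;
}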
src/intel/perf/intel_perf.h

@@ -175,6 +175,115 @@ struct intel_perf_query_result {
   bool query_disjoint;
};

struct intel_perf_query_eustall_event {
   /**
    * Offset of the instruction within the shader cache, bit shifted by 3.
    * Should be a unique identifier for the event.
    */
   uint64_t ip_addr;

   /**
    * Number of EU stalls with at least one thread waiting on a Pixel
    * Shader dependency
    */
   uint64_t tdr_count;

   /**
    * Number of samples with at least one thread waiting on any
    * other dependency (Flag/EoT etc). Multiple stall reasons can
    * qualify during the same cycle
    */
   uint64_t other_count;

   /**
    * Number of samples with at least one thread waiting for the JEU to
    * complete a branch instruction. Multiple stall reasons can qualify
    * during the same cycle
    */
   uint64_t control_count;

   /**
    * Number of samples with at least one thread ready to be
    * scheduled (Grf conf/send holds etc). Multiple stall reasons can
    * qualify during the same cycle
    */
   uint64_t pipestall_count;

   /**
    * Number of samples with at least one thread waiting for a SEND
    * message to be dispatched from the EU. Multiple stall reasons can
    * qualify during the same cycle
    */
   uint64_t send_count;

   /**
    * Number of samples with at least one thread waiting for the ALU to
    * write a GRF/ACC register. Multiple stall reasons can qualify
    * during the same cycle
    */
   uint64_t dist_acc_count;

   /**
    * Number of samples with at least one thread waiting for a
    * Scoreboard token to be available. Multiple stall reasons can
    * qualify during the same cycle
    */
   uint64_t sbid_count;

   /**
    * Number of samples with at least one thread waiting for the
    * Gateway to write the Notify register. Multiple stall reasons can
    * qualify during the same cycle
    */
   uint64_t sync_count;

   /**
    * Number of samples with at least one thread waiting on
    * Instruction Fetch. Multiple stall reasons can qualify during
    * the same cycle
    */
   uint64_t inst_fetch_count;

   /**
    * Number of samples where no threads are waiting
    */
   uint64_t active_count;
};

struct intel_perf_query_eustall_result {
   /**
    * Storage for accumulated samples: a hash table containing
    * intel_perf_query_eustall_event values with ip_addr as the key.
    */
   struct hash_table *accumulator;

   /**
    * HW ID used by the context on which the query was running.
    */
   uint32_t hw_id;

   /**
    * Number of records accumulated to produce the results.
    */
   uint32_t records_accumulated;

   /**
    * Whether an overflow event occurred during sampling.
    */
   bool overflow;

   /**
    * Size of EU sample records in bytes. Obtained from
    * KMD headers.
    */
   size_t record_size;

   /**
    * Number of bytes to the next record to parse.
    */
   int bytes_to_next_record;
};

typedef uint64_t (*intel_counter_read_uint64_t)(struct intel_perf_config *perf,
                                                const struct intel_perf_query_info *query,
                                                const struct intel_perf_query_result *results);
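Editor's note: to make the accumulator layout above concrete, here is a hypothetical dump routine (not in this commit). It assumes the usual stdio/inttypes includes, uses hash_table_foreach() from src/util/hash_table.h, and the << 3 to recover the byte offset is an assumption drawn from the ip_addr field comment:

static void
dump_eustall_results(const struct intel_perf_query_eustall_result *result)
{
   hash_table_foreach(result->accumulator, entry) {
      const struct intel_perf_query_eustall_event *ev = entry->data;
      /* ip_addr is stored pre-shifted; undo the documented shift. */
      printf("offset 0x%" PRIx64 ": active=%" PRIu64 " sbid=%" PRIu64
             " send=%" PRIu64 " inst_fetch=%" PRIu64 "\n",
             ev->ip_addr << 3, ev->active_count, ev->sbid_count,
             ev->send_count, ev->inst_fetch_count);
   }
}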
@@ -521,7 +630,16 @@ void intel_perf_query_result_accumulate_fields(struct intel_perf_query_result *r
                                               const void *end,
                                               bool no_oa_accumulate);

/** Accumulate EU stall sampling data, ensuring data from previously seen
 * offsets gets aggregated.
 */
void intel_perf_eustall_accumulate_results(struct intel_perf_query_eustall_result *result,
                                           const void *start,
                                           const void *end,
                                           size_t record_size);

void intel_perf_query_result_clear(struct intel_perf_query_result *result);
void intel_perf_query_eustall_result_clear(struct intel_perf_query_eustall_result *result);

/** Debug helper printing out query data.
 */
@@ -608,6 +726,17 @@ int intel_perf_stream_set_metrics_id(struct intel_perf_config *perf_config,
                                     uint64_t metrics_set_id,
                                     struct intel_bind_timeline *timeline);

int intel_perf_eustall_stream_open(struct intel_device_info *devinfo, int drm_fd,
                                   uint32_t sample_rate, uint32_t min_event_count);
int intel_perf_eustall_stream_set_state(struct intel_device_info *devinfo,
                                        int perf_stream_fd, bool enable);
int intel_perf_eustall_stream_record_size(struct intel_device_info *devinfo,
                                          int drm_fd);
int intel_perf_eustall_stream_sample_rate(struct intel_device_info *devinfo,
                                          int drm_fd);
int intel_perf_eustall_stream_read_samples(struct intel_device_info *devinfo,
                                           int perf_stream_fd, uint8_t *buffer,
                                           size_t buffer_len, bool *overflow);
#ifdef __cplusplus
} // extern "C"
#endif
src/intel/perf/xe/intel_perf.c

@@ -369,3 +369,139 @@ xe_perf_stream_read_samples(struct intel_perf_config *perf_config, int perf_stre

   return offset - buffer;
}

static int
first_rendering_gt_id(int drm_fd)
{
   struct intel_query_engine_info *engine_info =
      intel_engine_get_info(drm_fd, INTEL_KMD_TYPE_XE);
   for (int i = 0; i < engine_info->num_engines; i++) {
      if (engine_info->engines[i].engine_class == INTEL_ENGINE_CLASS_RENDER)
         return engine_info->engines[i].gt_id;
   }
   return -1;
}

int
xe_perf_eustall_stream_open(int drm_fd, uint32_t sample_rate,
                            uint32_t min_event_count)
{
   struct drm_xe_ext_set_property props[DRM_XE_EU_STALL_PROP_MAX] = {};
   struct drm_xe_observation_param observation_param = {
      .observation_type = DRM_XE_OBSERVATION_TYPE_EU_STALL,
      .observation_op = DRM_XE_OBSERVATION_OP_STREAM_OPEN,
      .param = (uintptr_t)&props,
   };
   uint32_t i = 0;
   int fd, flags;
   int gt_id = first_rendering_gt_id(drm_fd);
   assert(gt_id >= 0);

   oa_prop_set(props, &i, DRM_XE_EU_STALL_PROP_SAMPLE_RATE, sample_rate);
   oa_prop_set(props, &i, DRM_XE_EU_STALL_PROP_WAIT_NUM_REPORTS, min_event_count);
   oa_prop_set(props, &i, DRM_XE_EU_STALL_PROP_GT_ID, gt_id);

   fd = intel_ioctl(drm_fd, DRM_IOCTL_XE_OBSERVATION, &observation_param);
   if (fd < 0)
      return -errno;

   flags = fcntl(fd, F_GETFL, 0);
   flags |= O_CLOEXEC | O_NONBLOCK;
   if (fcntl(fd, F_SETFL, flags)) {
      close(fd);
      return -1;
   }

   return fd;
}

int
xe_perf_eustall_stream_record_size(int drm_fd)
{
   int record_size;
   struct drm_xe_query_eu_stall *eu_stall_data =
      xe_device_query_alloc_fetch(drm_fd, DRM_XE_DEVICE_QUERY_EU_STALL, NULL);
   if (!eu_stall_data)
      return -errno;

   assert(eu_stall_data->record_size > 0 &&
          eu_stall_data->record_size < INT_MAX);
   record_size = (int)eu_stall_data->record_size;
   free(eu_stall_data);
   return record_size;
}

int
xe_perf_eustall_stream_sample_rate(int drm_fd)
{
   int sampling_rate;
   struct drm_xe_query_eu_stall *eu_stall_data =
      xe_device_query_alloc_fetch(drm_fd, DRM_XE_DEVICE_QUERY_EU_STALL, NULL);
   if (!eu_stall_data)
      return -errno;

   assert(eu_stall_data->sampling_rates[0] > 0 &&
          eu_stall_data->sampling_rates[0] < INT_MAX);
   sampling_rate = (int)eu_stall_data->sampling_rates[0];
   free(eu_stall_data);
   return sampling_rate;
}

int
xe_perf_eustall_stream_read_samples(int perf_stream_fd, uint8_t *buffer,
                                    size_t buffer_len, bool *overflow)
{
   int len;

   *overflow = false;
   do {
      len = read(perf_stream_fd, buffer, buffer_len);
      if (unlikely(len < 0 && errno == EIO))
         *overflow = true;
   } while (len < 0 && (errno == EINTR || errno == EIO));

   if (unlikely(len < 0 && errno == EAGAIN))
      len = 0;

   return len < 0 ? -errno : len;
}

void
xe_perf_eustall_accumulate_results(struct intel_perf_query_eustall_result *result,
                                   const uint8_t *start, const uint8_t *end,
                                   size_t record_size)
{
   const uint8_t *offset;
   assert(((end - start) % record_size) == 0);

   for (offset = start; offset < end; offset += record_size) {
      const struct drm_xe_eu_stall_data_xe2 *stall_data =
         (const struct drm_xe_eu_stall_data_xe2 *)offset;
      struct intel_perf_query_eustall_event *stall_result;
      uint64_t ip_addr = stall_data->ip_addr;
      struct hash_entry *e = _mesa_hash_table_search(result->accumulator,
                                                     (const void *)&ip_addr);
      if (e) {
         stall_result = e->data;
      } else {
         stall_result = calloc(1, sizeof(struct intel_perf_query_eustall_event));
         stall_result->ip_addr = ip_addr;
         _mesa_hash_table_insert(result->accumulator,
                                 (const void *)&stall_result->ip_addr,
                                 stall_result);
      }
      assert(stall_result->ip_addr == stall_data->ip_addr);

      stall_result->tdr_count += stall_data->tdr_count;
      stall_result->other_count += stall_data->other_count;
      stall_result->control_count += stall_data->control_count;
      stall_result->pipestall_count += stall_data->pipestall_count;
      stall_result->send_count += stall_data->send_count;
      stall_result->dist_acc_count += stall_data->dist_acc_count;
      stall_result->sbid_count += stall_data->sbid_count;
      stall_result->sync_count += stall_data->sync_count;
      stall_result->inst_fetch_count += stall_data->inst_fetch_count;
      stall_result->active_count += stall_data->active_count;

      result->records_accumulated++;
   }
}
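Editor's note: the stream fd is put into O_NONBLOCK mode in xe_perf_eustall_stream_open() and xe_perf_eustall_stream_read_samples() maps EAGAIN to 0, so a consumer will typically want to wait for data before draining. A sketch of one way to do that (not in this commit; it assumes the kernel wakes pollers once DRM_XE_EU_STALL_PROP_WAIT_NUM_REPORTS records are buffered):

#include <poll.h>

/* Hypothetical helper: block for up to timeout_ms until EU stall
 * records are available, then drain whatever the kernel has into buf. */
static int
eustall_wait_and_read(int stream_fd, uint8_t *buf, size_t buf_len,
                      int timeout_ms, bool *overflow)
{
   struct pollfd pfd = { .fd = stream_fd, .events = POLLIN };
   int ret = poll(&pfd, 1, timeout_ms);
   if (ret <= 0)
      return ret; /* 0 = timeout, negative = poll error */
   return xe_perf_eustall_stream_read_samples(stream_fd, buf, buf_len,
                                              overflow);
}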
src/intel/perf/xe/intel_perf.h

@@ -12,6 +12,7 @@
struct intel_bind_timeline;
struct intel_perf_config;
struct intel_perf_registers;
struct intel_perf_query_eustall_result;

uint64_t xe_perf_get_oa_format(struct intel_perf_config *perf);

@@ -31,3 +32,12 @@ int xe_perf_stream_set_metrics_id(int perf_stream_fd, int drm_fd,
                                  struct intel_bind_timeline *timeline);
int xe_perf_stream_read_samples(struct intel_perf_config *perf_config, int perf_stream_fd,
                                uint8_t *buffer, size_t buffer_len);
int xe_perf_eustall_stream_open(int drm_fd, uint32_t sample_rate,
                                uint32_t min_event_count);
int xe_perf_eustall_stream_record_size(int drm_fd);
int xe_perf_eustall_stream_sample_rate(int drm_fd);
int xe_perf_eustall_stream_read_samples(int perf_stream_fd, uint8_t *buffer,
                                        size_t buffer_len, bool *overflow);
void xe_perf_eustall_accumulate_results(struct intel_perf_query_eustall_result *result,
                                        const uint8_t *start, const uint8_t *end,
                                        size_t record_size);