diff --git a/src/intel/perf/intel_perf.c b/src/intel/perf/intel_perf.c index 7e56d40d9b0..2a393b484df 100644 --- a/src/intel/perf/intel_perf.c +++ b/src/intel/perf/intel_perf.c @@ -1651,3 +1651,64 @@ intel_perf_stream_set_metrics_id(struct intel_perf_config *perf_config, return -1; } } + +int +intel_perf_eustall_stream_open(struct intel_device_info *devinfo, int drm_fd, + uint32_t sample_rate, uint32_t min_event_count) +{ + if (devinfo->ver >= 20 && + devinfo->kmd_type == INTEL_KMD_TYPE_XE) + return xe_perf_eustall_stream_open(drm_fd, sample_rate, + min_event_count); + return -1; +} + +int +intel_perf_eustall_stream_set_state(struct intel_device_info *devinfo, + int perf_stream_fd, bool enable) +{ + if (devinfo->ver >= 20 && + devinfo->kmd_type == INTEL_KMD_TYPE_XE) + return xe_perf_stream_set_state(perf_stream_fd, enable); + return -1; +} + +int +intel_perf_eustall_stream_record_size(struct intel_device_info *devinfo, + int drm_fd) +{ + if (devinfo->ver >= 20 && + devinfo->kmd_type == INTEL_KMD_TYPE_XE) + return xe_perf_eustall_stream_record_size(drm_fd); + return -1; +} + +int +intel_perf_eustall_stream_sample_rate(struct intel_device_info *devinfo, + int drm_fd) +{ + if (devinfo->ver >= 20 && + devinfo->kmd_type == INTEL_KMD_TYPE_XE) + return xe_perf_eustall_stream_sample_rate(drm_fd); + return -1; +} + +int +intel_perf_eustall_stream_read_samples(struct intel_device_info *devinfo, + int perf_stream_fd, uint8_t *buffer, + size_t buffer_len, bool *overflow) +{ + if (devinfo->ver >= 20 && + devinfo->kmd_type == INTEL_KMD_TYPE_XE) + return xe_perf_eustall_stream_read_samples(perf_stream_fd, buffer, + buffer_len, overflow); + return -1; +} + +void +intel_perf_eustall_accumulate_results(struct intel_perf_query_eustall_result *result, + const void *start, const void *end, + size_t record_size) +{ + return xe_perf_eustall_accumulate_results(result, start, end, record_size); +} diff --git a/src/intel/perf/intel_perf.h b/src/intel/perf/intel_perf.h index 
/* intel_perf.h additions: EU-stall sampling result types. */

/**
 * One aggregated EU-stall sample, keyed by instruction pointer.
 *
 * ip_addr is the instruction's offset within the shader cache, shifted
 * right by 3, and serves as the unique key.  Each *_count field counts
 * sampled cycles in which at least one thread was stalled for that
 * reason; several stall reasons can qualify during the same cycle, so
 * the counts are not mutually exclusive.
 */
struct intel_perf_query_eustall_event {
   uint64_t ip_addr;          /* unique key: shader-cache offset >> 3 */
   uint64_t tdr_count;        /* waiting on Pixel Shader dependency */
   uint64_t other_count;      /* waiting on any other dependency (Flag/EoT etc.) */
   uint64_t control_count;    /* waiting for JEU to complete a branch instruction */
   uint64_t pipestall_count;  /* ready but not scheduled (GRF conflict/send holds etc.) */
   uint64_t send_count;       /* waiting for SEND message dispatch from the EU */
   uint64_t dist_acc_count;   /* waiting for ALU to write GRF/ACC register */
   uint64_t sbid_count;       /* waiting for a scoreboard token */
   uint64_t sync_count;       /* waiting for Gateway to write the Notify register */
   uint64_t inst_fetch_count; /* waiting on instruction fetch */
   uint64_t active_count;     /* samples in which no thread was waiting */
};

/**
 * Accumulated EU-stall sampling state for one query.
 */
struct intel_perf_query_eustall_result {
   /* ip_addr -> struct intel_perf_query_eustall_event */
   struct hash_table *accumulator;
   /* HW id used by the context on which the query was running */
   uint32_t hw_id;
   /* number of records folded into the accumulator */
   uint32_t records_accumulated;
   /* an overflow event occurred while sampling */
   bool overflow;
   /* size in bytes of one sample record (from KMD headers) */
   size_t record_size;
   /* byte offset to the next record to parse */
   int bytes_to_next_record;
};

typedef uint64_t (*intel_counter_read_uint64_t)(struct intel_perf_config *perf,
                                                const struct intel_perf_query_info *query,
                                                const struct intel_perf_query_result *results);

/** Accumulate EU-stall sampling data, aggregating records whose offset
 * (ip_addr) has been seen before.
 */
void intel_perf_eustall_accumulate_results(struct intel_perf_query_eustall_result *result,
                                           const void *start,
                                           const void *end,
                                           size_t record_size);

void intel_perf_query_result_clear(struct intel_perf_query_result *result);
void intel_perf_query_eustall_result_clear(struct intel_perf_query_eustall_result *result);
*/ @@ -608,6 +726,17 @@ int intel_perf_stream_set_metrics_id(struct intel_perf_config *perf_config, uint64_t metrics_set_id, struct intel_bind_timeline *timeline); +int intel_perf_eustall_stream_open(struct intel_device_info *devinfo, int drm_fd, + uint32_t sample_rate, uint32_t min_event_count); +int intel_perf_eustall_stream_set_state(struct intel_device_info *devinfo, + int perf_stream_fd, bool enable); +int intel_perf_eustall_stream_record_size(struct intel_device_info *devinfo, + int drm_fd); +int intel_perf_eustall_stream_sample_rate(struct intel_device_info *devinfo, + int drm_fd); +int intel_perf_eustall_stream_read_samples(struct intel_device_info *devinfo, + int perf_stream_fd, uint8_t *buffer, + size_t buffer_len, bool *overflow); #ifdef __cplusplus } // extern "C" #endif diff --git a/src/intel/perf/xe/intel_perf.c b/src/intel/perf/xe/intel_perf.c index 1cce1ad3ac8..799668c3f3b 100644 --- a/src/intel/perf/xe/intel_perf.c +++ b/src/intel/perf/xe/intel_perf.c @@ -369,3 +369,139 @@ xe_perf_stream_read_samples(struct intel_perf_config *perf_config, int perf_stre return offset - buffer; } + +static int +first_rendering_gt_id(int drm_fd) { + struct intel_query_engine_info *engine_info = + intel_engine_get_info(drm_fd, INTEL_KMD_TYPE_XE); + for (int i = 0; i < engine_info->num_engines; i++) { + if (engine_info->engines[i].engine_class == INTEL_ENGINE_CLASS_RENDER) + return engine_info->engines[i].gt_id; + } + return -1; +} + +int +xe_perf_eustall_stream_open(int drm_fd, uint32_t sample_rate, + uint32_t min_event_count) +{ + struct drm_xe_ext_set_property props[DRM_XE_EU_STALL_PROP_MAX] = {}; + struct drm_xe_observation_param observation_param = { + .observation_type = DRM_XE_OBSERVATION_TYPE_EU_STALL, + .observation_op = DRM_XE_OBSERVATION_OP_STREAM_OPEN, + .param = (uintptr_t)&props, + }; + uint32_t i = 0; + int fd, flags; + int gt_id = first_rendering_gt_id(drm_fd); + assert(gt_id >= 0); + + oa_prop_set(props, &i, DRM_XE_EU_STALL_PROP_SAMPLE_RATE, 
sample_rate); + oa_prop_set(props, &i, DRM_XE_EU_STALL_PROP_WAIT_NUM_REPORTS, min_event_count); + oa_prop_set(props, &i, DRM_XE_EU_STALL_PROP_GT_ID, gt_id); + + fd = intel_ioctl(drm_fd, DRM_IOCTL_XE_OBSERVATION, &observation_param); + if (fd < 0) + return -errno; + + flags = fcntl(fd, F_GETFL, 0); + flags |= O_CLOEXEC | O_NONBLOCK; + if (fcntl(fd, F_SETFL, flags)) { + close(fd); + return -1; + } + + return fd; +} + +int +xe_perf_eustall_stream_record_size(int drm_fd) +{ + int record_size; + struct drm_xe_query_eu_stall *eu_stall_data = + xe_device_query_alloc_fetch(drm_fd, DRM_XE_DEVICE_QUERY_EU_STALL, NULL); + if (!eu_stall_data) + return -errno; + + assert(eu_stall_data->record_size > 0 && + eu_stall_data->record_size < INT_MAX); + record_size = (int)eu_stall_data->record_size; + free(eu_stall_data); + return record_size; +} + +int +xe_perf_eustall_stream_sample_rate(int drm_fd) +{ + int sampling_rate; + struct drm_xe_query_eu_stall *eu_stall_data = + xe_device_query_alloc_fetch(drm_fd, DRM_XE_DEVICE_QUERY_EU_STALL, NULL); + if (!eu_stall_data) + return -errno; + + assert(eu_stall_data->sampling_rates[0] > 0 && + eu_stall_data->sampling_rates[0] < INT_MAX); + sampling_rate = (int)eu_stall_data->sampling_rates[0]; + free(eu_stall_data); + return sampling_rate; +} + +int +xe_perf_eustall_stream_read_samples(int perf_stream_fd, uint8_t *buffer, + size_t buffer_len, bool *overflow) +{ + int len; + + *overflow = false; + do { + len = read(perf_stream_fd, buffer, buffer_len); + if (unlikely(len < 0 && errno == EIO)) + *overflow = true; + } while (len < 0 && (errno == EINTR || errno == EIO)); + + if (unlikely(len < 0 && errno == EAGAIN)) + len = 0; + + return len < 0 ? 
-errno : len; +} + +void +xe_perf_eustall_accumulate_results(struct intel_perf_query_eustall_result *result, + const uint8_t *start, const uint8_t *end, + size_t record_size) +{ + const uint8_t *offset; + assert(((end - start) % record_size) == 0); + + for (offset = start; offset < end; offset += record_size) { + const struct drm_xe_eu_stall_data_xe2* stall_data = + (const struct drm_xe_eu_stall_data_xe2*)offset; + struct intel_perf_query_eustall_event* stall_result; + uint64_t ip_addr = stall_data->ip_addr; + struct hash_entry *e = _mesa_hash_table_search(result->accumulator, + (const void*)&ip_addr); + if (e) { + stall_result = e->data; + } else { + stall_result = calloc(1, sizeof(struct intel_perf_query_eustall_event)); + stall_result->ip_addr = ip_addr; + _mesa_hash_table_insert(result->accumulator, + (const void*)&stall_result->ip_addr, + stall_result); + } + assert(stall_result->ip_addr == stall_data->ip_addr); + + stall_result->tdr_count += stall_data->tdr_count; + stall_result->other_count += stall_data->other_count; + stall_result->control_count += stall_data->control_count; + stall_result->pipestall_count += stall_data->pipestall_count; + stall_result->send_count += stall_data->send_count; + stall_result->dist_acc_count += stall_data->dist_acc_count; + stall_result->sbid_count += stall_data->sbid_count; + stall_result->sync_count += stall_data->sync_count; + stall_result->inst_fetch_count += stall_data->inst_fetch_count; + stall_result->active_count += stall_data->active_count; + + result->records_accumulated++; + } +} \ No newline at end of file diff --git a/src/intel/perf/xe/intel_perf.h b/src/intel/perf/xe/intel_perf.h index 8e070908a07..976ec7678bf 100644 --- a/src/intel/perf/xe/intel_perf.h +++ b/src/intel/perf/xe/intel_perf.h @@ -12,6 +12,7 @@ struct intel_bind_timeline; struct intel_perf_config; struct intel_perf_registers; +struct intel_perf_query_eustall_result; uint64_t xe_perf_get_oa_format(struct intel_perf_config *perf); @@ -31,3 +32,12 @@ 
int xe_perf_stream_set_metrics_id(int perf_stream_fd, int drm_fd,
                                  struct intel_bind_timeline *timeline);
int xe_perf_stream_read_samples(struct intel_perf_config *perf_config,
                                int perf_stream_fd, uint8_t *buffer,
                                size_t buffer_len);

/* EU-stall sampling (Xe2+, Xe KMD only). */
int xe_perf_eustall_stream_open(int drm_fd, uint32_t sample_rate,
                                uint32_t min_event_count);
int xe_perf_eustall_stream_record_size(int drm_fd);
int xe_perf_eustall_stream_sample_rate(int drm_fd);
int xe_perf_eustall_stream_read_samples(int perf_stream_fd, uint8_t *buffer,
                                        size_t buffer_len, bool *overflow);
void xe_perf_eustall_accumulate_results(struct intel_perf_query_eustall_result *result,
                                        const uint8_t *start, const uint8_t *end,
                                        size_t record_size);