intel/executor: Add performance counter support
Some checks are pending
macOS-CI / macOS-CI (dri) (push) Waiting to run
macOS-CI / macOS-CI (xlib) (push) Waiting to run

Add optional OA performance counter collection around each execute()
call. Examples:

```
  # List all profiles and counters, with descriptions.
  $ executor --oa list

  # Collect all counters from a profile.
  $ executor --oa ComputeBasic file.lua

  # Collect a subset of counters from a profile, separated by comma.
  $ executor --oa ComputeBasic:GpuTime,AvgGpuCoreFrequency file.lua

  # The ComputeBasic profile is used by default, so passing only counter names also works.
  $ executor --oa GpuTime file.lua
```

The selected counters are printed to stdout after the script finishes,
or written to a file specified by --oa-csv FILENAME.

Assisted-by: Pi coding agent (GPT-5.5)
Acked-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/41610>
This commit is contained in:
Caio Oliveira 2026-05-12 10:22:05 -07:00
parent 8d237b5408
commit 2c64e12462
5 changed files with 704 additions and 9 deletions

View file

@ -6,10 +6,13 @@
#ifndef EXECUTOR_H
#define EXECUTOR_H
#include <stdbool.h>
#include <stdint.h>
#include "intel/dev/intel_device_info.h"
#include "intel/isl/isl.h"
#include "perf/intel_perf.h"
#include "perf/intel_perf_query.h"
typedef struct {
uint32_t size;
@ -39,8 +42,16 @@ typedef struct {
executor_bo batch;
executor_bo extra;
executor_bo data;
executor_bo perf;
} bo;
bool perf_enabled;
struct {
struct intel_perf_context *ctx;
struct intel_perf_query_object *obj;
} perf_query;
uint64_t batch_start;
} executor_context;

View file

@ -45,6 +45,45 @@ emit_pipe_control(executor_context *ec)
}
}
/* Emit a single PIPE_CONTROL with the flush and stall bits set.  Executor
 * hands this to intel_perf as its emit_stall_at_pixel_scoreboard callback.
 */
void
genX(emit_perf_stall)(executor_context *ec)
{
   executor_batch_emit(GENX(PIPE_CONTROL), pc) {
      pc.CommandStreamerStallEnable = true;
      pc.StallAtPixelScoreboard = true;
      pc.PipeControlFlushEnable = true;
#if GFX_VER >= 12
      /* This field is only programmed on GFX_VER >= 12. */
      pc.HDCPipelineFlushEnable = true;
#endif
   }
}
/* Emit MI_REPORT_PERF_COUNT.
 *
 * The command's MemoryAddress is bo->addr + offset_in_bytes and its
 * ReportID is report_id.
 */
void
genX(emit_mi_report_perf_count)(executor_context *ec, executor_bo *bo,
                                uint32_t offset_in_bytes,
                                uint32_t report_id)
{
   const executor_address report_addr = { bo->addr + offset_in_bytes };

   executor_batch_emit(GENX(MI_REPORT_PERF_COUNT), mi_rpc) {
      mi_rpc.ReportID = report_id;
      mi_rpc.MemoryAddress = report_addr;
   }
}
/* Emit MI_STORE_REGISTER_MEM commands copying a register into a BO.
 *
 * reg_size is in bytes and must be 4 or 8; one command is emitted per
 * 4 bytes, with both the register address and the memory address advanced
 * by the same byte offset.
 */
void
genX(store_register_mem)(executor_context *ec, executor_bo *bo,
                         uint32_t reg, uint32_t reg_size,
                         uint32_t offset_in_bytes)
{
   assert(reg_size == 4 || reg_size == 8);

   uint32_t byte = 0;
   while (byte < reg_size) {
      executor_batch_emit(GENX(MI_STORE_REGISTER_MEM), srm) {
         srm.MemoryAddress =
            (executor_address){bo->addr + offset_in_bytes + byte};
         srm.RegisterAddress = reg + byte;
      }
      byte += 4;
   }
}
static void
emit_state_base_address(executor_context *ec, uint32_t mocs)
{
@ -91,6 +130,19 @@ emit_state_base_address(executor_context *ec, uint32_t mocs)
};
}
/* Begin the context's OA query; a failure is reported through failf(). */
static void
executor_perf_begin(executor_context *ec)
{
   const bool started =
      intel_perf_begin_query(ec->perf_query.ctx, ec->perf_query.obj);

   if (!started)
      failf("failed to begin OA performance query");
}
/* End the context's OA query (counterpart of executor_perf_begin()). */
static void
executor_perf_end(executor_context *ec)
{
   struct intel_perf_context *perf_ctx = ec->perf_query.ctx;
   struct intel_perf_query_object *query_obj = ec->perf_query.obj;

   intel_perf_end_query(perf_ctx, query_obj);
}
void
genX(emit_execute)(executor_context *ec, const executor_params *params)
{
@ -158,9 +210,15 @@ genX(emit_execute)(executor_context *ec, const executor_params *params)
#endif
#if GFX_VERx10 >= 125
if (ec->perf_enabled)
executor_perf_begin(ec);
executor_batch_emit(GENX(COMPUTE_WALKER), cw) {
cw.body = body;
};
if (ec->perf_enabled)
executor_perf_end(ec);
#else
uint32_t *idd = executor_alloc_bytes_aligned(&ec->bo.extra, 8 * 4, 256);
GENX(INTERFACE_DESCRIPTOR_DATA_pack)(NULL, idd, &desc);
@ -172,6 +230,9 @@ genX(emit_execute)(executor_context *ec, const executor_params *params)
load.InterfaceDescriptorTotalLength = 8 * 4;
}
if (ec->perf_enabled)
executor_perf_begin(ec);
executor_batch_emit(GENX(GPGPU_WALKER), gw) {
gw.ThreadGroupIDXDimension = 1;
gw.ThreadGroupIDYDimension = 1;
@ -181,6 +242,9 @@ genX(emit_execute)(executor_context *ec, const executor_params *params)
}
executor_batch_emit(GENX(MEDIA_STATE_FLUSH), msf);
if (ec->perf_enabled)
executor_perf_end(ec);
#endif
emit_pipe_control(ec);

View file

@ -8,3 +8,10 @@
#endif
void genX(emit_execute)(executor_context *ec, const executor_params *params);
/* Emits a PIPE_CONTROL with flush and command-streamer stall bits set. */
void genX(emit_perf_stall)(executor_context *ec);
/* Emits MI_REPORT_PERF_COUNT with MemoryAddress bo->addr + offset_in_bytes
 * and the given ReportID.
 */
void genX(emit_mi_report_perf_count)(executor_context *ec, executor_bo *bo,
uint32_t offset_in_bytes,
uint32_t report_id);
/* Emits one MI_STORE_REGISTER_MEM per 4 bytes of the register; reg_size is
 * in bytes and must be 4 or 8.  Destination is bo->addr + offset_in_bytes.
 */
void genX(store_register_mem)(executor_context *ec, executor_bo *bo,
uint32_t reg, uint32_t reg_size,
uint32_t offset_in_bytes);

View file

@ -3,11 +3,14 @@
* SPDX-License-Identifier: MIT
*/
#include <assert.h>
#include <ctype.h>
#include <fcntl.h>
#include <getopt.h>
#include <inttypes.h>
#include <libgen.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
@ -16,6 +19,7 @@
#include <lauxlib.h>
#include "util/ralloc.h"
#include "util/u_math.h"
#include <xf86drm.h>
#include "drm-uapi/i915_drm.h"
@ -35,12 +39,16 @@ enum {
EXECUTOR_BO_BATCH_ADDR = 0x10000000,
EXECUTOR_BO_EXTRA_ADDR = 0x20000000,
EXECUTOR_BO_DATA_ADDR = 0x30000000,
EXECUTOR_BO_PERF_ADDR = 0x40000000,
/* Apply to all BOs. */
EXECUTOR_BO_SIZE = 10 * 1024 * 1024,
};
const char usage_line[] = "usage: executor [-d DEVICE] FILENAME [ARGS...]";
const char usage_line[] =
"usage: executor [-d DEVICE] FILENAME [ARGS...]\n"
" executor [-d DEVICE] --oa OA [--oa-csv FILE] FILENAME [ARGS...]\n"
" executor [-d DEVICE] --oa list";
static void
open_manual()
@ -68,6 +76,10 @@ open_manual()
"",
"executor [-d DEVICE] FILENAME [ARGS...]",
"",
"executor [-d DEVICE] --oa OA [--oa-csv FILE] FILENAME [ARGS...]",
"",
"executor [-d DEVICE] --oa list",
"",
"executor -d list",
"",
".SH DESCRIPTION",
@ -85,6 +97,28 @@ open_manual()
"passed with either the index or a substring of the device to use.",
"Use \"-d list\" to list available devices.",
"",
".SH PERFORMANCE COUNTERS",
"",
"If --oa OA is passed, executor wraps every execute() call in an OA",
"performance query and writes one CSV row per execute(). The first row is",
"a header. By default, CSV is printed to stdout after the script finishes.",
"With --oa-csv FILE, CSV is written to FILE as the script runs.",
"",
"--oa OA selects the OA profile and optional counter filter. OA has the",
"form PROFILE[:COUNTER1,COUNTER2]. If the counter list is omitted, all",
"counters in the selected profile are written. If PROFILE is omitted, or",
"OA does not match a profile name, counters are selected from ComputeBasic.",
"--oa-csv only selects the output file and must be combined with --oa.",
"For example:",
"",
" --oa ComputeBasic",
" --oa ComputeBasic:GpuTime,AvgGpuCoreFrequency",
" --oa GpuTime,AvgGpuCoreFrequency",
"",
"--oa list lists all OA profiles and counters and exits. For example:",
"",
" executor --oa list",
"",
".SH SCRIPTING ENVIRONMENT",
"",
"In addition to the regular Lua standard library the following variables and",
@ -221,8 +255,14 @@ print_help()
"- @read DST_REG OFFSET_REG\n"
"- @write OFFSET_REG SRC_REG\n"
"\n"
"Use \'executor -d list\' to list available devices.\n"
"For more details, use \'executor --help\' to open manual.\n",
"PERFORMANCE COUNTERS:\n"
"- --oa PROFILE[:COUNTER1,COUNTER2]\n"
"- --oa COUNTER1[,COUNTER2]\n"
"- --oa PROFILE[:COUNTER1,COUNTER2] --oa-csv FILE\n"
"- --oa list\n"
"\n"
"Use 'executor -d list' to list available devices.\n"
"For more details, use 'executor --help' to open manual.\n",
usage_line);
}
@ -231,6 +271,21 @@ static struct {
struct isl_device isl_dev;
struct brw_isa_info isa;
int fd;
const char *oa_csv_path;
const char *oa_spec;
const char *oa_metric_name;
const char **oa_counter_names;
int n_oa_counter_names;
int *oa_counter_indices;
bool oa_spec_has_colon;
bool oa_list;
struct intel_perf_config *perf_cfg;
int perf_query_index;
uint32_t perf_execute_count;
FILE *oa_csv_file;
char *oa_csv_mem;
size_t oa_csv_mem_size;
} E;
#define genX_call(func, ...) \
@ -367,6 +422,435 @@ executor_address_of_ptr(executor_bo *bo, void *ptr)
return (executor_address){ptr - bo->map + bo->addr};
}
static void *
executor_perf_bo_alloc(void *bufmgr, const char *name, uint64_t size)
{
executor_context *ec = bufmgr;
executor_bo *bo = rzalloc(ec->mem_ctx, executor_bo);
if (!bo)
failf("failed to allocate perf BO wrapper");
/* Sub-allocate from a single BO. */
size = align64(size, 4096);
void *map = executor_alloc_bytes_aligned(&ec->bo.perf, size, 4096);
uint64_t offset = (char *)map - (char *)ec->bo.perf.map;
*bo = (executor_bo) {
.size = size,
.handle = ec->bo.perf.handle,
.map = map,
.cursor = map,
.addr = ec->bo.perf.addr + offset,
};
return bo;
}
/* intel_perf vtbl: executor has no live batch object to inspect and owns the
* single real perf BO, so several callbacks are trivial adapters or no-ops.
*/
/* intel_perf bo_unreference callback: intentionally a no-op. */
static void
executor_perf_bo_unreference(void *bo)
{
   /* Nothing to release: the perf BO goes away with executor_context. */
   (void)bo;
}
static void *
executor_perf_bo_map(void *ctx, void *bo, unsigned flags)
{
return ((executor_bo *)bo)->map;
}
/* intel_perf bo_unmap callback: intentionally a no-op. */
static void
executor_perf_bo_unmap(void *bo)
{
   /* Slices stay mapped for as long as ec->bo.perf itself is mapped. */
   (void)bo;
}
/* intel_perf batch_references callback: always reports "not referenced". */
static bool
executor_perf_batch_references(void *batch, void *bo)
{
   /* There is no live batch object that intel_perf could inspect. */
   (void)batch;
   (void)bo;

   return false;
}
/* intel_perf bo_wait_rendering callback: intentionally a no-op. */
static void
executor_perf_bo_wait_rendering(void *bo)
{
   /* The wait for batch completion is done by executor_context_dispatch(). */
   (void)bo;
}
/* intel_perf bo_busy callback: always returns 0 (not busy). */
static int
executor_perf_bo_busy(void *bo)
{
   /* Results are read only once executor_context_dispatch() has waited. */
   (void)bo;

   return 0;
}
/* intel_perf emit_stall_at_pixel_scoreboard callback: forwards to the
 * per-generation emit_perf_stall with ctx as the executor_context.
 */
static void
executor_perf_emit_stall_at_pixel_scoreboard(void *ctx)
{
   executor_context *exec_ctx = ctx;

   genX_call(emit_perf_stall, exec_ctx);
}
static void
executor_perf_emit_mi_report_perf_count(void *ctx, void *bo,
uint32_t offset_in_bytes,
uint32_t report_id)
{
executor_context *ec = ctx;
genX_call(emit_mi_report_perf_count, ec, bo, offset_in_bytes, report_id);
}
/* intel_perf batchbuffer_flush callback: intentionally a no-op. */
static void
executor_perf_batchbuffer_flush(void *ctx, const char *file, int line)
{
   /* Never needed, since executor_perf_batch_references() always returns
    * false.
    */
   (void)ctx;
   (void)file;
   (void)line;
}
static void
executor_perf_store_register_mem(void *ctx, void *bo, uint32_t reg,
uint32_t reg_size, uint32_t offset)
{
executor_context *ec = ctx;
genX_call(store_register_mem, ec, bo, reg, reg_size, offset);
}
/* Callback table installed into intel_perf_config::vtbl.  The type is taken
 * from the vtbl member itself via __typeof__.
 */
static const __typeof__(((struct intel_perf_config *)0)->vtbl)
executor_perf_vtbl = {
   /* Buffer object handling. */
   .bo_alloc = executor_perf_bo_alloc,
   .bo_map = executor_perf_bo_map,
   .bo_unmap = executor_perf_bo_unmap,
   .bo_unreference = executor_perf_bo_unreference,
   .bo_busy = executor_perf_bo_busy,
   .bo_wait_rendering = executor_perf_bo_wait_rendering,

   /* Batch tracking. */
   .batch_references = executor_perf_batch_references,
   .batchbuffer_flush = executor_perf_batchbuffer_flush,

   /* Command emission. */
   .emit_stall_at_pixel_scoreboard =
      executor_perf_emit_stall_at_pixel_scoreboard,
   .emit_mi_report_perf_count = executor_perf_emit_mi_report_perf_count,
   .store_register_mem = executor_perf_store_register_mem,
};
/* Returns true when name equals either the query's symbol_name or its name.
 * A NULL field on the query never matches.
 */
static bool
executor_perf_query_name_matches(const struct intel_perf_query_info *query,
                                 const char *name)
{
   if (query->symbol_name && strcmp(query->symbol_name, name) == 0)
      return true;

   return query->name && strcmp(query->name, name) == 0;
}
/* Returns true when name equals either the counter's symbol_name or its
 * name.  A NULL field on the counter never matches.
 */
static bool
executor_perf_counter_name_matches(const struct intel_perf_query_counter *counter,
                                   const char *name)
{
   if (counter->symbol_name && strcmp(counter->symbol_name, name) == 0)
      return true;

   return counter->name && strcmp(counter->name, name) == 0;
}
/* Print one line naming a query: prefix, then the symbol name (or the plain
 * name when there is no symbol name), followed by " - <name>" when both are
 * present.
 */
static void
executor_perf_print_query_name(FILE *f, const char *prefix,
                               const struct intel_perf_query_info *query)
{
   const bool has_both = query->symbol_name && query->name;
   const char *primary =
      query->symbol_name ? query->symbol_name : query->name;

   fprintf(f, "%s%s%s%s\n", prefix, primary,
           has_both ? " - " : "",
           has_both ? query->name : "");
}
/* Print a heading followed by the name of every OA-kind query in
 * E.perf_cfg, one per line.
 */
static void
executor_perf_print_available_queries(FILE *f)
{
   fprintf(f, "Available OA metric sets:\n");

   for (int i = 0; i < E.perf_cfg->n_queries; i++) {
      const struct intel_perf_query_info *candidate = &E.perf_cfg->queries[i];

      if (candidate->kind != INTEL_PERF_QUERY_TYPE_OA)
         continue;

      executor_perf_print_query_name(f, " ", candidate);
   }
}
/* Number of counters that will be reported: the size of the user-supplied
 * filter when one was given, otherwise every counter of the query.
 */
static int
executor_perf_count_selected_counters(const struct intel_perf_query_info *query)
{
   if (E.n_oa_counter_names > 0)
      return E.n_oa_counter_names;

   return query->n_counters;
}
static void
executor_perf_validate_counters(void *mem_ctx,
const struct intel_perf_query_info *query)
{
E.oa_counter_indices =
ralloc_array(mem_ctx, int, E.n_oa_counter_names);
if (E.n_oa_counter_names > 0 && !E.oa_counter_indices)
failf("failed to allocate OA counter index map");
for (int i = 0; i < E.n_oa_counter_names; i++) {
int idx = -1;
for (int j = 0; j < query->n_counters; j++) {
if (executor_perf_counter_name_matches(&query->counters[j],
E.oa_counter_names[i])) {
idx = j;
break;
}
}
if (idx < 0)
failf("OA counter '%s' not found in metric set '%s'",
E.oa_counter_names[i],
query->symbol_name ? query->symbol_name : query->name);
E.oa_counter_indices[i] = idx;
}
if (executor_perf_count_selected_counters(query) == 0)
failf("OA metric set '%s' has no selected counters",
query->symbol_name ? query->symbol_name : query->name);
}
/* Return the index in E.perf_cfg->queries of the first OA-kind query whose
 * symbol name or name equals metric_name, or -1 when there is none.
 */
static int
executor_find_named_perf_query(const char *metric_name)
{
   for (int i = 0; i < E.perf_cfg->n_queries; i++) {
      const struct intel_perf_query_info *candidate = &E.perf_cfg->queries[i];

      if (candidate->kind != INTEL_PERF_QUERY_TYPE_OA)
         continue;

      if (executor_perf_query_name_matches(candidate, metric_name))
         return i;
   }

   return -1;
}
/* Name of the OA profile used when the --oa argument does not name one. */
static const char *
executor_default_perf_query_name(void)
{
   static const char default_profile[] = "ComputeBasic";

   return default_profile;
}
static int
executor_find_perf_query(const char *metric_name)
{
int query_index = executor_find_named_perf_query(metric_name);
if (query_index >= 0)
return query_index;
executor_perf_print_available_queries(stderr);
failf("OA metric set '%s' not found", metric_name);
return -1;
}
static void
executor_add_oa_counter(void *mem_ctx, const char *counter)
{
E.oa_counter_names = reralloc(mem_ctx, E.oa_counter_names,
const char *, E.n_oa_counter_names + 1);
if (!E.oa_counter_names)
failf("failed to allocate OA counter filter");
E.oa_counter_names[E.n_oa_counter_names++] = counter;
}
/* Split a comma-separated counter list and add each name to the OA counter
 * filter.
 *
 * mem_ctx      - ralloc context owning the private copy of the list; the
 *                stored counter names point into that copy.
 * counter_list - text such as "GpuTime,AvgGpuCoreFrequency".  A NULL or
 *                empty list adds nothing, leaving all counters selected.
 * spec         - the full --oa argument, used only in error messages.
 *
 * Every empty name is rejected through failf(), including the one produced
 * by a trailing comma ("GpuTime,"), which used to be dropped silently while
 * leading and doubled commas were already errors.  Failure to duplicate the
 * list is reported too, instead of being treated as "no filter".
 */
static void
executor_parse_oa_counter_list(void *mem_ctx, const char *counter_list,
                               const char *spec)
{
   if (!counter_list || !counter_list[0])
      return;

   char *cursor = ralloc_strdup(mem_ctx, counter_list);
   if (!cursor)
      failf("failed to allocate OA counter list");

   while (cursor) {
      char *counter = cursor;
      char *comma = strchr(cursor, ',');

      if (comma) {
         *comma = '\0';
         cursor = comma + 1;
      } else {
         cursor = NULL;
      }

      if (!counter[0])
         failf("empty OA counter name in '%s'", spec);

      executor_add_oa_counter(mem_ctx, counter);
   }
}
/* Parse the --oa argument, of the form PROFILE[:COUNTER1,COUNTER2].
 *
 * A NULL spec is a no-op.  Otherwise spec is duplicated into mem_ctx and:
 *  - with a ':' present, E.oa_spec_has_colon is set, the text before it
 *    becomes E.oa_metric_name (or the default profile name when empty), and
 *    the text after it is parsed as a counter list;
 *  - without a ':', the whole non-empty string becomes E.oa_metric_name;
 *  - an empty string is reported through failf().
 *
 * The duplicated string is now checked before use, matching the other
 * allocation sites in this file; previously a failed ralloc_strdup() would
 * have been passed straight to strchr().
 */
static void
executor_parse_oa_spec(void *mem_ctx, const char *spec)
{
   if (!spec)
      return;

   char *oa = ralloc_strdup(mem_ctx, spec);
   if (!oa)
      failf("failed to allocate OA spec copy");

   char *counter_list = strchr(oa, ':');
   if (counter_list) {
      E.oa_spec_has_colon = true;
      /* Terminate the profile part and step past the ':'. */
      *counter_list++ = '\0';
      E.oa_metric_name = oa[0] ? oa : executor_default_perf_query_name();
      executor_parse_oa_counter_list(mem_ctx, counter_list, spec);
   } else if (oa[0]) {
      E.oa_metric_name = oa;
   } else {
      failf("missing OA metric set name in '%s'", spec);
   }
}
/* Print a description of one query to stdout: its name line, its guid when
 * set, and then each counter with type, data type, units, optional category
 * and optional description.
 */
static void
executor_perf_list_query(const struct intel_perf_query_info *query)
{
   executor_perf_print_query_name(stdout, "", query);

   if (query->guid)
      printf(" guid: %s\n", query->guid);

   printf(" counters:\n");

   for (int c = 0; c < query->n_counters; c++) {
      const struct intel_perf_query_counter *counter = &query->counters[c];
      const bool has_both = counter->symbol_name && counter->name;
      const char *primary =
         counter->symbol_name ? counter->symbol_name : counter->name;

      printf(" %s%s%s [%s, %s, %s]",
             primary,
             has_both ? " - " : "",
             has_both ? counter->name : "",
             intel_perf_counter_type_name(counter->type),
             intel_perf_counter_data_type_name(counter->data_type),
             intel_perf_counter_units_name(counter->units));

      if (counter->category)
         printf(" category=%s", counter->category);

      printf("\n");

      if (counter->desc)
         printf(" %s\n", counter->desc);
   }
}
/* Print every OA-kind query known to E.perf_cfg, with its counters. */
static void
executor_perf_list(void)
{
   for (int i = 0; i < E.perf_cfg->n_queries; i++) {
      const struct intel_perf_query_info *candidate = &E.perf_cfg->queries[i];

      if (candidate->kind != INTEL_PERF_QUERY_TYPE_OA)
         continue;

      executor_perf_list_query(candidate);
   }
}
/* Create the intel_perf context and query object for this executor context.
 *
 * Does nothing unless ec->perf_enabled is set.  The hardware context handed
 * to intel_perf is the i915 context id or the Xe queue id, depending on the
 * KMD type.  Allocation failures are reported through failf().
 */
static void
executor_perf_create_query(executor_context *ec)
{
   if (!ec->perf_enabled)
      return;

   ec->perf_query.ctx = intel_perf_new_context(ec->mem_ctx);
   if (ec->perf_query.ctx == NULL)
      failf("failed to allocate Intel perf context");

   uint32_t hw_ctx;
   if (ec->devinfo->kmd_type == INTEL_KMD_TYPE_I915)
      hw_ctx = ec->i915.ctx_id;
   else
      hw_ctx = ec->xe.queue_id;

   /* ec is passed twice: once for each of the two opaque pointers that
    * intel_perf hands back to the vtbl callbacks.
    */
   intel_perf_init_context(ec->perf_query.ctx, E.perf_cfg, ec->mem_ctx,
                           ec, ec, ec->devinfo, hw_ctx, ec->fd);

   ec->perf_query.obj =
      intel_perf_new_query(ec->perf_query.ctx, E.perf_query_index);
   if (ec->perf_query.obj == NULL)
      failf("failed to create OA performance query");
}
/* Print one counter value, read from data at counter->offset, formatted
 * according to counter->data_type.  Values are copied out with memcpy() into
 * a correctly typed local before printing.  Unknown data types are reported
 * through failf().
 */
static void
executor_perf_print_counter_value(FILE *f,
                                  const struct intel_perf_query_counter *counter,
                                  const uint8_t *data)
{
   const uint8_t *src = &data[counter->offset];

   switch (counter->data_type) {
   case INTEL_PERF_COUNTER_DATA_TYPE_BOOL32:
   case INTEL_PERF_COUNTER_DATA_TYPE_UINT32: {
      assert((counter->offset & 3) == 0);
      uint32_t u32;
      memcpy(&u32, src, sizeof(u32));
      fprintf(f, "%"PRIu32, u32);
      break;
   }

   case INTEL_PERF_COUNTER_DATA_TYPE_UINT64: {
      assert((counter->offset & 7) == 0);
      uint64_t u64;
      memcpy(&u64, src, sizeof(u64));
      fprintf(f, "%"PRIu64, u64);
      break;
   }

   case INTEL_PERF_COUNTER_DATA_TYPE_FLOAT: {
      assert((counter->offset & 3) == 0);
      float f32;
      memcpy(&f32, src, sizeof(f32));
      fprintf(f, "%.9g", f32);
      break;
   }

   case INTEL_PERF_COUNTER_DATA_TYPE_DOUBLE: {
      assert((counter->offset & 7) == 0);
      double f64;
      memcpy(&f64, src, sizeof(f64));
      fprintf(f, "%.17g", f64);
      break;
   }

   default:
      failf("unhandled OA counter data type %d", counter->data_type);
   }
}
static void
executor_perf_finish_query(executor_context *ec)
{
if (!ec->perf_query.obj)
return;
const struct intel_perf_query_info *query =
&E.perf_cfg->queries[E.perf_query_index];
const bool write_header = E.perf_execute_count++ == 0;
if (!E.oa_csv_file) {
if (E.oa_csv_path) {
E.oa_csv_file = fopen(E.oa_csv_path, "w");
if (!E.oa_csv_file)
failf("failed to open '%s' for writing", E.oa_csv_path);
} else {
E.oa_csv_file = open_memstream(&E.oa_csv_mem, &E.oa_csv_mem_size);
if (!E.oa_csv_file)
failf("failed to open memory stream for OA CSV output");
}
}
/* When --oa specifies counters, columns follow the user-supplied order;
* otherwise they follow the profile-defined order.
*/
const int n_cols = executor_perf_count_selected_counters(query);
if (write_header) {
for (int i = 0; i < n_cols; i++) {
const int idx = E.n_oa_counter_names > 0 ? E.oa_counter_indices[i] : i;
const struct intel_perf_query_counter *counter = &query->counters[idx];
if (i > 0)
putc(',', E.oa_csv_file);
fprintf(E.oa_csv_file, "%s",
counter->symbol_name ? counter->symbol_name : counter->name);
}
putc('\n', E.oa_csv_file);
}
uint8_t *data = calloc(1, query->data_size);
if (!data)
failf("failed to allocate OA query result buffer");
intel_perf_wait_query(ec->perf_query.ctx, ec->perf_query.obj, NULL);
intel_perf_get_query_data(ec->perf_query.ctx, ec->perf_query.obj, NULL,
query->data_size, (unsigned *)data, NULL);
for (int i = 0; i < n_cols; i++) {
const int idx = E.n_oa_counter_names > 0 ? E.oa_counter_indices[i] : i;
const struct intel_perf_query_counter *counter = &query->counters[idx];
if (i > 0)
putc(',', E.oa_csv_file);
executor_perf_print_counter_value(E.oa_csv_file, counter, data);
}
putc('\n', E.oa_csv_file);
fflush(E.oa_csv_file);
free(data);
intel_perf_delete_query(ec->perf_query.ctx, ec->perf_query.obj);
intel_perf_free_context(ec->perf_query.ctx);
}
static bool
open_intel_render_device(drmDevicePtr dev,
struct intel_device_info *devinfo,
@ -473,6 +957,11 @@ decode_get_bo(void *_ec, bool ppgtt, uint64_t address)
bo.addr = ec->bo.data.addr;
bo.size = ec->bo.data.size;
bo.map = ec->bo.data.map;
} else if (address >= ec->bo.perf.addr &&
address < ec->bo.perf.addr + ec->bo.perf.size) {
bo.addr = ec->bo.perf.addr;
bo.size = ec->bo.perf.size;
bo.map = ec->bo.perf.map;
}
return bo;
@ -594,6 +1083,9 @@ executor_context_setup(executor_context *ec)
executor_create_bo(ec, &ec->bo.batch, EXECUTOR_BO_BATCH_ADDR, EXECUTOR_BO_SIZE);
executor_create_bo(ec, &ec->bo.extra, EXECUTOR_BO_EXTRA_ADDR, EXECUTOR_BO_SIZE);
executor_create_bo(ec, &ec->bo.data, EXECUTOR_BO_DATA_ADDR, EXECUTOR_BO_SIZE);
if (ec->perf_enabled)
executor_create_bo(ec, &ec->bo.perf, EXECUTOR_BO_PERF_ADDR,
EXECUTOR_BO_SIZE);
uint32_t *data = ec->bo.data.map;
for (int i = 0; i < EXECUTOR_BO_SIZE / 4; i++)
@ -604,6 +1096,7 @@ static void
executor_context_dispatch(executor_context *ec)
{
if (ec->devinfo->kmd_type == INTEL_KMD_TYPE_I915) {
const uint32_t buffer_count = 3 + ec->perf_enabled;
struct drm_i915_gem_exec_object2 objs[] = {
{
.handle = ec->bo.batch.handle,
@ -620,11 +1113,19 @@ executor_context_dispatch(executor_context *ec)
.offset = ec->bo.data.addr,
.flags = EXEC_OBJECT_PINNED | EXEC_OBJECT_WRITE,
},
{},
};
if (ec->perf_enabled) {
objs[3] = (struct drm_i915_gem_exec_object2) {
.handle = ec->bo.perf.handle,
.offset = ec->bo.perf.addr,
.flags = EXEC_OBJECT_PINNED | EXEC_OBJECT_WRITE,
};
}
struct drm_i915_gem_execbuffer2 exec = {0};
exec.buffers_ptr = (uintptr_t)objs;
exec.buffer_count = ARRAY_SIZE(objs);
exec.buffer_count = buffer_count;
exec.batch_start_offset = ec->batch_start - ec->bo.batch.addr;
exec.flags = I915_EXEC_BATCH_FIRST;
exec.rsvd1 = ec->i915.ctx_id;
@ -658,6 +1159,7 @@ executor_context_dispatch(executor_context *ec)
sync_handles[i] = sync_create.handle;
}
const uint32_t num_binds = 3 + ec->perf_enabled;
struct drm_xe_vm_bind_op bind_ops[] = {
{
.op = DRM_XE_VM_BIND_OP_MAP,
@ -680,7 +1182,17 @@ executor_context_dispatch(executor_context *ec)
.range = EXECUTOR_BO_SIZE,
.pat_index = ec->devinfo->pat.cached_coherent.index,
},
{},
};
if (ec->perf_enabled) {
bind_ops[3] = (struct drm_xe_vm_bind_op) {
.op = DRM_XE_VM_BIND_OP_MAP,
.obj = ec->bo.perf.handle,
.addr = ec->bo.perf.addr,
.range = ec->bo.perf.size,
.pat_index = ec->devinfo->pat.cached_coherent.index,
};
}
struct drm_xe_sync bind_syncs[] = {
{
@ -693,7 +1205,7 @@ executor_context_dispatch(executor_context *ec)
struct drm_xe_vm_bind bind = {
.vm_id = ec->xe.vm_id,
.num_binds = ARRAY_SIZE(bind_ops),
.num_binds = num_binds,
.vector_of_binds = (uintptr_t)bind_ops,
.num_syncs = 1,
.syncs = (uintptr_t)bind_syncs,
@ -751,6 +1263,9 @@ executor_context_dispatch(executor_context *ec)
static void
executor_context_teardown(executor_context *ec)
{
if (ec->perf_enabled)
executor_destroy_bo(ec, &ec->bo.perf);
executor_destroy_bo(ec, &ec->bo.batch);
executor_destroy_bo(ec, &ec->bo.extra);
executor_destroy_bo(ec, &ec->bo.data);
@ -789,11 +1304,13 @@ l_execute(lua_State *L)
.devinfo = &E.devinfo,
.isl_dev = &E.isl_dev,
.fd = E.fd,
.perf_enabled = E.oa_csv_path != NULL || E.oa_spec != NULL,
};
executor_context_setup(&ec);
executor_params params = {0};
executor_perf_create_query(&ec);
{
if (lua_gettop(L) != 1)
@ -847,6 +1364,8 @@ l_execute(lua_State *L)
executor_context_dispatch(&ec);
executor_perf_finish_query(&ec);
{
/* TODO: Use userdata to return a wrapped C array instead of building
* values. Could make integration with array operations better.
@ -905,9 +1424,16 @@ main(int argc, char *argv[])
int opt;
const char *device_pattern = NULL;
enum {
OPT_OA_CSV = 1000,
OPT_OA,
};
static const struct option long_options[] = {
{"help", no_argument, 0, 'H'},
{"device", required_argument, 0, 'd'},
{"help", no_argument, 0, 'H'},
{"device", required_argument, 0, 'd'},
{"oa-csv", required_argument, 0, OPT_OA_CSV},
{"oa", required_argument, 0, OPT_OA},
{},
};
@ -929,13 +1455,36 @@ main(int argc, char *argv[])
case 'H':
open_manual();
return 0;
case OPT_OA_CSV:
E.oa_csv_path = optarg;
break;
case OPT_OA:
E.oa_spec = optarg;
break;
default:
fprintf(stderr, "%s\n", usage_line);
return 1;
}
}
if (optind >= argc) {
if (E.oa_spec && !strcmp(E.oa_spec, "list")) {
E.oa_list = true;
E.oa_spec = NULL;
}
if (E.oa_list && E.oa_csv_path) {
fprintf(stderr, "%s\n", usage_line);
fprintf(stderr, "--oa list cannot be combined with --oa-csv\n");
return 1;
}
if (E.oa_csv_path && !E.oa_spec) {
fprintf(stderr, "%s\n", usage_line);
fprintf(stderr, "--oa-csv requires --oa\n");
return 1;
}
if (!E.oa_list && optind >= argc) {
fprintf(stderr, "%s\n", usage_line);
fprintf(stderr, "expected FILENAME after options\n");
return 1;
@ -943,7 +1492,9 @@ main(int argc, char *argv[])
void *mem_ctx = ralloc_context(NULL);
const char *filename = argv[optind];
const char *filename = optind < argc ? argv[optind] : NULL;
executor_parse_oa_spec(mem_ctx, E.oa_spec);
process_intel_debug_variable();
@ -958,6 +1509,56 @@ main(int argc, char *argv[])
assert(E.devinfo.kmd_type == INTEL_KMD_TYPE_I915 ||
E.devinfo.kmd_type == INTEL_KMD_TYPE_XE);
if (E.oa_csv_path || E.oa_spec || E.oa_list) {
E.perf_cfg = intel_perf_new(NULL);
if (!E.perf_cfg)
failf("failed to allocate Intel perf config");
E.perf_cfg->vtbl = executor_perf_vtbl;
intel_perf_init_metrics(E.perf_cfg, &E.devinfo, E.fd,
false /* include_pipeline_statistics */,
true /* use_register_snapshots */);
if (!E.oa_list) {
if (!E.oa_spec_has_colon &&
executor_find_named_perf_query(E.oa_metric_name) < 0) {
executor_parse_oa_counter_list(mem_ctx, E.oa_metric_name, E.oa_spec);
E.oa_metric_name = executor_default_perf_query_name();
}
E.perf_query_index = executor_find_perf_query(E.oa_metric_name);
}
if (E.perf_query_index < 0) {
if (E.perf_cfg->features_supported & INTEL_PERF_FEATURE_OA_BLOCKED_BY_POLICY) {
const char *sysctl = E.devinfo.kmd_type == INTEL_KMD_TYPE_XE ?
"/proc/sys/dev/xe/observation_paranoid" :
"/proc/sys/dev/i915/perf_stream_paranoid";
failf("no OA metric sets available for %s; access is blocked by %s",
E.devinfo.name, sysctl);
}
failf("no OA metric sets available for %s", E.devinfo.name);
}
if (E.oa_list) {
executor_perf_list();
close(E.fd);
intel_perf_free(E.perf_cfg);
ralloc_free(mem_ctx);
return 0;
}
const struct intel_perf_query_info *query =
&E.perf_cfg->queries[E.perf_query_index];
executor_perf_validate_counters(mem_ctx, query);
fprintf(stderr, "Using OA profile: %s%s%s (%d/%d counters)\n",
query->symbol_name ? query->symbol_name : query->name,
query->symbol_name && query->name ? " - " : "",
query->symbol_name && query->name ? query->name : "",
executor_perf_count_selected_counters(query), query->n_counters);
}
lua_State *L = luaL_newstate();
/* TODO: Could be nice to export some kind of builder interface,
@ -1028,8 +1629,19 @@ main(int argc, char *argv[])
failf("failed to run script: %s", lua_tostring(L, -1));
lua_close(L);
if (E.oa_csv_file) {
fclose(E.oa_csv_file);
if (!E.oa_csv_path && E.oa_csv_mem_size > 0)
fwrite(E.oa_csv_mem, 1, E.oa_csv_mem_size, stdout);
free(E.oa_csv_mem);
}
close(E.fd);
if (E.perf_cfg)
intel_perf_free(E.perf_cfg);
ralloc_free(mem_ctx);
return 0;

View file

@ -46,6 +46,7 @@ executor = executable(
idep_genxml,
idep_intel_decoder_brw,
idep_intel_dev,
idep_intel_perf,
idep_libintel_common,
],
include_directories: [executor_includes],