ethosu: Add performance counter debug output
Some checks are pending
macOS-CI / macOS-CI (dri) (push) Waiting to run
macOS-CI / macOS-CI (xlib) (push) Waiting to run

Add simple performance counter support as debug output. This is enough
to measure NPU cycles for networks.

Signed-off-by: Rob Herring (Arm) <robh@kernel.org>
Reviewed-by: Tomeu Vizoso <tomeu@tomeuvizoso.net>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/40269>
This commit is contained in:
Rob Herring (Arm) 2026-02-11 08:46:40 -06:00 committed by Marge Bot
parent 83d0646d79
commit 0972ef7d33
5 changed files with 108 additions and 5 deletions

View file

@ -43,6 +43,11 @@ enum drm_ethosu_ioctl_id {
/** @DRM_ETHOSU_SUBMIT: Submit a job and BOs to run. */ /** @DRM_ETHOSU_SUBMIT: Submit a job and BOs to run. */
DRM_ETHOSU_SUBMIT, DRM_ETHOSU_SUBMIT,
DRM_ETHOSU_PERFMON_CREATE,
DRM_ETHOSU_PERFMON_DESTROY,
DRM_ETHOSU_PERFMON_GET_VALUES,
DRM_ETHOSU_PERFMON_SET_GLOBAL,
}; };
/** /**
@ -79,7 +84,9 @@ struct drm_ethosu_npu_info {
__u32 config; __u32 config;
__u32 sram_size; __u32 sram_size;
__u32 pmu_counters;
}; };
/** /**
* struct drm_ethosu_dev_query - Arguments passed to DRM_ETHOSU_IOCTL_DEV_QUERY * struct drm_ethosu_dev_query - Arguments passed to DRM_ETHOSU_IOCTL_DEV_QUERY
*/ */
@ -171,7 +178,6 @@ struct drm_ethosu_bo_wait {
__s64 timeout_ns; /* absolute */ __s64 timeout_ns; /* absolute */
}; };
struct drm_ethosu_cmdstream_bo_create { struct drm_ethosu_cmdstream_bo_create {
/* Size of the data argument. */ /* Size of the data argument. */
__u32 size; __u32 size;
@ -220,10 +226,52 @@ struct drm_ethosu_submit {
/** Input: Number of jobs passed in. */ /** Input: Number of jobs passed in. */
__u32 job_count; __u32 job_count;
/** Reserved, must be zero. */ /** Input: Id returned by DRM_ETHOSU_PERFMON_CREATE */
__u32 pad; __u32 perfmon_id;
}; };
#define DRM_ETHOSU_MAX_PERF_EVENT_COUNTERS 8
#define DRM_ETHOSU_MAX_PERF_COUNTERS \
(DRM_ETHOSU_MAX_PERF_EVENT_COUNTERS + 1)
/**
 * struct drm_ethosu_perfmon_create - Arguments passed to
 * DRM_IOCTL_ETHOSU_PERFMON_CREATE
 *
 * Describes a performance monitor made of up to
 * DRM_ETHOSU_MAX_PERF_EVENT_COUNTERS event counters.
 */
struct drm_ethosu_perfmon_create {
/** Output: Id of the new perfmon, as later passed in drm_ethosu_submit.perfmon_id. */
__u32 id;
/** Input: Number of entries in @counters that are in use (presumably the leading ones). */
__u32 ncounters;
/** Input: Hardware event ids to count, one per requested counter. */
__u16 counters[DRM_ETHOSU_MAX_PERF_EVENT_COUNTERS];
};
/**
 * struct drm_ethosu_perfmon_destroy - Arguments passed to
 * DRM_IOCTL_ETHOSU_PERFMON_DESTROY
 */
struct drm_ethosu_perfmon_destroy {
/** Input: Id returned by DRM_ETHOSU_PERFMON_CREATE. */
__u32 id;
};
/**
 * struct drm_ethosu_perfmon_get_values - Arguments passed to
 * DRM_IOCTL_ETHOSU_PERFMON_GET_VALUES
 *
 * Returns the values of the performance counters tracked by this
 * perfmon (as an array of (ncounters + 1) u64 values). The event counter
 * values come first; the additional trailing value is presumably the NPU
 * cycle count (TODO confirm against the kernel driver).
 *
 * No implicit synchronization is performed, so the user has to
 * guarantee that any jobs using this perfmon have already been
 * completed.
 */
struct drm_ethosu_perfmon_get_values {
/** Input: Id returned by DRM_ETHOSU_PERFMON_CREATE. */
__u32 id;
/** Padding; presumably must be zero (TODO confirm). */
__u32 pad;
/** Input: User pointer to an array of at least (ncounters + 1) u64 values. */
__u64 values_ptr;
};
#define DRM_ETHOSU_PERFMON_CLEAR_GLOBAL 0x0001
/**
 * struct drm_ethosu_perfmon_set_global - ioctl to define a global performance
 * monitor
 *
 * The global performance monitor will be used for all jobs. If a global
 * performance monitor is defined, jobs with a self-defined performance
 * monitor won't be allowed.
 */
struct drm_ethosu_perfmon_set_global {
/**
 * Input: Bitmask of flags. DRM_ETHOSU_PERFMON_CLEAR_GLOBAL is the only flag
 * defined; presumably it removes the current global perfmon (TODO confirm).
 */
__u32 flags;
/** Input: Id returned by DRM_ETHOSU_PERFMON_CREATE. */
__u32 id;
};
/** /**
* DRM_IOCTL_ETHOSU() - Build a ethosu IOCTL number * DRM_IOCTL_ETHOSU() - Build a ethosu IOCTL number
@ -253,6 +301,14 @@ enum {
DRM_IOCTL_ETHOSU(WR, CMDSTREAM_BO_CREATE, cmdstream_bo_create), DRM_IOCTL_ETHOSU(WR, CMDSTREAM_BO_CREATE, cmdstream_bo_create),
DRM_IOCTL_ETHOSU_SUBMIT = DRM_IOCTL_ETHOSU_SUBMIT =
DRM_IOCTL_ETHOSU(WR, SUBMIT, submit), DRM_IOCTL_ETHOSU(WR, SUBMIT, submit),
DRM_IOCTL_ETHOSU_PERFMON_CREATE =
DRM_IOCTL_ETHOSU(WR, PERFMON_CREATE, perfmon_create),
DRM_IOCTL_ETHOSU_PERFMON_DESTROY =
DRM_IOCTL_ETHOSU(WR, PERFMON_DESTROY, perfmon_destroy),
DRM_IOCTL_ETHOSU_PERFMON_GET_VALUES =
DRM_IOCTL_ETHOSU(WR, PERFMON_GET_VALUES, perfmon_get_values),
DRM_IOCTL_ETHOSU_PERFMON_SET_GLOBAL =
DRM_IOCTL_ETHOSU(WR, PERFMON_SET_GLOBAL, perfmon_set_global),
}; };
#if defined(__cplusplus) #if defined(__cplusplus)

View file

@ -23,6 +23,7 @@ static const struct debug_named_value ethosu_debug_options[] = {
{"disable_nhcwb16", ETHOSU_DBG_DISABLE_NHCWB16, "Disable NHCWB16"}, {"disable_nhcwb16", ETHOSU_DBG_DISABLE_NHCWB16, "Disable NHCWB16"},
{"disable_sram", ETHOSU_DBG_DISABLE_SRAM, "Disable SRAM"}, {"disable_sram", ETHOSU_DBG_DISABLE_SRAM, "Disable SRAM"},
{"force_u85", ETHOSU_DBG_FORCE_U85, "Force U85 behavior even on U65 hardware"}, {"force_u85", ETHOSU_DBG_FORCE_U85, "Force U85 behavior even on U65 hardware"},
{"dump_perf", ETHOSU_DBG_DUMP_PERF, "Dump performance counters for each submit"},
DEBUG_NAMED_VALUE_END}; DEBUG_NAMED_VALUE_END};
DEBUG_GET_ONCE_FLAGS_OPTION(ethosu_debug, "ETHOSU_DEBUG", ethosu_debug_options, 0) DEBUG_GET_ONCE_FLAGS_OPTION(ethosu_debug, "ETHOSU_DEBUG", ethosu_debug_options, 0)
@ -336,4 +337,4 @@ ethosu_ml_device_create(const char *spec)
set_device_callbacks(device); set_device_callbacks(device);
return &device->base; return &device->base;
} }

View file

@ -21,6 +21,7 @@ enum ethosu_dbg {
ETHOSU_DBG_DISABLE_NHCWB16 = BITFIELD_BIT(3), ETHOSU_DBG_DISABLE_NHCWB16 = BITFIELD_BIT(3),
ETHOSU_DBG_DISABLE_SRAM = BITFIELD_BIT(4), ETHOSU_DBG_DISABLE_SRAM = BITFIELD_BIT(4),
ETHOSU_DBG_FORCE_U85 = BITFIELD_BIT(5), ETHOSU_DBG_FORCE_U85 = BITFIELD_BIT(5),
ETHOSU_DBG_DUMP_PERF = BITFIELD_BIT(6),
}; };
extern int ethosu_debug; extern int ethosu_debug;

View file

@ -320,6 +320,23 @@ prepare_for_submission(struct ethosu_subgraph *subgraph,
} }
} }
/*
 * Optionally attach a performance monitor to this subgraph.
 *
 * perfmon_id stays 0 unless creation succeeds; the value is later passed
 * unconditionally in drm_ethosu_submit.perfmon_id, so 0 presumably means
 * "no perfmon" to the kernel (TODO confirm).
 */
subgraph->perfmon_id = 0;
if (DBG_ENABLED(ETHOSU_DBG_DUMP_PERF)) {
   struct drm_ethosu_perfmon_create perfmon_create = {
      .counters = { 32, 35 }, /* npu-idle, npu-active */
      .ncounters = 2,
   };
   int create_errno;

   ret = drmIoctl(screen->fd, DRM_IOCTL_ETHOSU_PERFMON_CREATE, &perfmon_create);
   /* Save errno right away: the logging call below may overwrite it. */
   create_errno = errno;

   DBG("Perfmon create returned %d\n", ret);
   if (ret == 0) {
      subgraph->perfmon_id = perfmon_create.id;
   } else {
      DBG("Could not create perfmon: ret=%d errno=%d (%s)\n",
          ret, create_errno, strerror(create_errno));
   }
}
DBG("subgraph->io_used %d\n", subgraph->io_used); DBG("subgraph->io_used %d\n", subgraph->io_used);
subgraph->io_rsrc = pipe_buffer_create(pcontext->screen, 0, subgraph->io_rsrc = pipe_buffer_create(pcontext->screen, 0,
PIPE_USAGE_DEFAULT, PIPE_USAGE_DEFAULT,
@ -447,6 +464,7 @@ ethosu_ml_subgraph_invoke(struct pipe_context *pcontext,
submit.jobs = (uintptr_t)&job; submit.jobs = (uintptr_t)&job;
submit.job_count = 1; submit.job_count = 1;
submit.perfmon_id = subgraph->perfmon_id;
if (DBG_ENABLED(ETHOSU_DBG_MSGS)) if (DBG_ENABLED(ETHOSU_DBG_MSGS))
clock_gettime(CLOCK_MONOTONIC_RAW, &start); clock_gettime(CLOCK_MONOTONIC_RAW, &start);
@ -493,6 +511,25 @@ ethosu_ml_subgraph_read_outputs(struct pipe_context *pcontext,
pipe_buffer_read(pcontext, subgraph->io_rsrc, output->offset, output->size, outputs[i]); pipe_buffer_read(pcontext, subgraph->io_rsrc, output->offset, output->size, outputs[i]);
} }
/*
 * Dump the performance counters gathered for this subgraph.
 *
 * Skipped when no perfmon was created (perfmon_id == 0), since there is
 * nothing to query in that case.
 */
if (DBG_ENABLED(ETHOSU_DBG_DUMP_PERF) && subgraph->perfmon_id != 0) {
   struct ethosu_screen *screen = ethosu_screen(pcontext->screen);
   /* Sized for the largest result the UAPI can return; zeroed so that
    * entries the kernel does not fill never print as garbage. */
   uint64_t values[DRM_ETHOSU_MAX_PERF_COUNTERS] = {0};
   struct drm_ethosu_perfmon_get_values get_values = {
      .id = subgraph->perfmon_id,
      .values_ptr = (uintptr_t)values,
   };
   int ret;

   ret = drmIoctl(screen->fd, DRM_IOCTL_ETHOSU_PERFMON_GET_VALUES, &get_values);
   if (ret == 0) {
      /* Two event counters were requested (npu-idle, npu-active), so the
       * result holds those two values followed by one extra entry, which
       * is reported here as the cycle count.
       * uint64_t is not "unsigned long" on every target, so cast and use
       * %llu to keep the format and argument types in agreement. */
      mesa_logi("PMU: cycles=%llu, npu-active=%llu, npu-idle=%llu\n",
                (unsigned long long)values[2],
                (unsigned long long)values[1],
                (unsigned long long)values[0]);
   } else {
      DBG("Could not read perfmon values: ret=%d errno=%d (%s)\n",
          ret, errno, strerror(errno));
   }
}
} }
void void
@ -500,10 +537,10 @@ ethosu_ml_subgraph_destroy(struct pipe_ml_device *pdevice,
struct pipe_ml_subgraph *psubgraph) struct pipe_ml_subgraph *psubgraph)
{ {
struct ethosu_subgraph *subgraph = (struct ethosu_subgraph *)(psubgraph); struct ethosu_subgraph *subgraph = (struct ethosu_subgraph *)(psubgraph);
struct ethosu_screen *screen = subgraph->screen;
if (subgraph->io_rsrc) { if (subgraph->io_rsrc) {
/* Post-submission state: cleanup DRM resources */ /* Post-submission state: cleanup DRM resources */
struct ethosu_screen *screen = subgraph->screen;
struct drm_gem_close arg = {0}; struct drm_gem_close arg = {0};
int ret; int ret;
@ -521,6 +558,13 @@ ethosu_ml_subgraph_destroy(struct pipe_ml_device *pdevice,
free(subgraph->coefs); free(subgraph->coefs);
} }
/*
 * Release the perfmon attached to this subgraph, if any. perfmon_id is 0
 * when creation failed, in which case there is nothing to destroy.
 */
if (DBG_ENABLED(ETHOSU_DBG_DUMP_PERF) && subgraph->perfmon_id != 0) {
   struct drm_ethosu_perfmon_destroy destroy = {
      .id = subgraph->perfmon_id,
   };
   int destroy_ret;

   destroy_ret = drmIoctl(screen->fd, DRM_IOCTL_ETHOSU_PERFMON_DESTROY, &destroy);
   if (destroy_ret != 0) {
      /* Best effort: teardown continues, but the failure is reported. */
      DBG("Could not destroy perfmon: ret=%d errno=%d (%s)\n",
          destroy_ret, errno, strerror(errno));
   }
   subgraph->perfmon_id = 0;
}
util_dynarray_fini(&subgraph->tensors); util_dynarray_fini(&subgraph->tensors);
free(subgraph); free(subgraph);

View file

@ -257,6 +257,7 @@ struct ethosu_subgraph {
uint32_t *cmdstream; uint32_t *cmdstream;
uint32_t *cursor; uint32_t *cursor;
uint32_t cmdstream_bo; uint32_t cmdstream_bo;
uint32_t perfmon_id;
struct pipe_resource *io_rsrc; struct pipe_resource *io_rsrc;
unsigned io_used; unsigned io_used;