ethos: Initial commit of a driver for the Arm Ethos-U65 NPU.

Supports all models in the test suite. No optimizations implemented yet.

Acked-by: Christian Gmeiner <cgmeiner@igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/36699>
This commit is contained in:
Tomeu Vizoso 2025-02-23 14:26:01 +01:00 committed by Marge Bot
parent b3262b37ce
commit 2581c3ab60
34 changed files with 6015 additions and 3 deletions

View file

@ -1,6 +1,7 @@
# The following files are opted into `ninja clang-format` and
# enforcement in the CI.
src/gallium/drivers/ethosu/**/*
src/gallium/drivers/i915
src/gallium/drivers/r300/compiler/*
src/gallium/drivers/rocket/**/*

View file

@ -0,0 +1,262 @@
/* SPDX-License-Identifier: MIT */
/* Copyright (C) 2025 Arm, Ltd. */
#ifndef _ETHOSU_DRM_H_
#define _ETHOSU_DRM_H_
#include "drm.h"
#if defined(__cplusplus)
extern "C" {
#endif
/**
* DOC: IOCTL IDs
*
* enum drm_ethosu_ioctl_id - IOCTL IDs
*
* Place new ioctls at the end, don't re-order, don't replace or remove entries.
*
* These IDs are not meant to be used directly. Use the DRM_IOCTL_ETHOSU_xxx
* definitions instead.
*/
enum drm_ethosu_ioctl_id {
/** @DRM_ETHOSU_DEV_QUERY: Query device information. */
DRM_ETHOSU_DEV_QUERY = 0,
/** @DRM_ETHOSU_BO_CREATE: Create a buffer object. */
DRM_ETHOSU_BO_CREATE,
/** @DRM_ETHOSU_BO_WAIT: Wait on a buffer object's fence. */
DRM_ETHOSU_BO_WAIT,
/**
* @DRM_ETHOSU_BO_MMAP_OFFSET: Get the file offset to pass to
* mmap to map a GEM object.
*/
DRM_ETHOSU_BO_MMAP_OFFSET,
/**
* @DRM_ETHOSU_CMDSTREAM_BO_CREATE: Create a command stream buffer
* object.
*/
DRM_ETHOSU_CMDSTREAM_BO_CREATE,
/** @DRM_ETHOSU_SUBMIT: Submit a job and BOs to run. */
DRM_ETHOSU_SUBMIT,
};
/**
* DOC: IOCTL arguments
*/
/**
* enum drm_ethosu_dev_query_type - Query type
*
* Place new types at the end, don't re-order, don't remove or replace.
*/
enum drm_ethosu_dev_query_type {
/** @DRM_ETHOSU_DEV_QUERY_NPU_INFO: Query NPU information. */
DRM_ETHOSU_DEV_QUERY_NPU_INFO = 0,
};
/**
 * struct drm_ethosu_npu_info - NPU information
 *
 * Structure grouping all queryable information relating to the NPU.
 */
struct drm_ethosu_npu_info {
	/** @id: NPU ID; decode with the DRM_ETHOSU_*() macros below. */
	__u32 id;
#define DRM_ETHOSU_ARCH_MAJOR(x) ((x) >> 28)
#define DRM_ETHOSU_ARCH_MINOR(x) (((x) >> 20) & 0xff)
#define DRM_ETHOSU_ARCH_PATCH(x) (((x) >> 16) & 0xf)
#define DRM_ETHOSU_PRODUCT_MAJOR(x) (((x) >> 12) & 0xf)
#define DRM_ETHOSU_VERSION_MAJOR(x) (((x) >> 8) & 0xf)
#define DRM_ETHOSU_VERSION_MINOR(x) (((x) >> 4) & 0xff)
#define DRM_ETHOSU_VERSION_STATUS(x) ((x) & 0xf)
	/** @config: NPU configuration. */
	__u32 config;
	/** @sram_size: Size of the on-chip SRAM. */
	__u32 sram_size;
};
/**
* struct drm_ethosu_dev_query - Arguments passed to DRM_ETHOSU_IOCTL_DEV_QUERY
*/
struct drm_ethosu_dev_query {
/** @type: the query type (see drm_ethosu_dev_query_type). */
__u32 type;
/**
* @size: size of the type being queried.
*
* If pointer is NULL, size is updated by the driver to provide the
* output structure size. If pointer is not NULL, the driver will
* only copy min(size, actual_structure_size) bytes to the pointer,
* and update the size accordingly. This allows us to extend query
* types without breaking userspace.
*/
__u32 size;
/**
* @pointer: user pointer to a query type struct.
*
* Pointer can be NULL, in which case, nothing is copied, but the
* actual structure size is returned. If not NULL, it must point to
* a location that's large enough to hold size bytes.
*/
__u64 pointer;
};
/**
* enum drm_ethosu_bo_flags - Buffer object flags, passed at creation time.
*/
enum drm_ethosu_bo_flags {
/**
* @DRM_ETHOSU_BO_NO_MMAP: The buffer object will never be CPU-mapped
* in userspace.
*/
DRM_ETHOSU_BO_NO_MMAP = (1 << 0),
};
/**
* struct drm_ethosu_bo_create - Arguments passed to DRM_IOCTL_ETHOSU_BO_CREATE.
*/
struct drm_ethosu_bo_create {
/**
* @size: Requested size for the object
*
* The (page-aligned) allocated size for the object will be returned.
*/
__u64 size;
/**
* @flags: Flags. Must be a combination of drm_ethosu_bo_flags flags.
*/
__u32 flags;
/**
* @handle: Returned handle for the object.
*
* Object handles are nonzero.
*/
__u32 handle;
};
/**
* struct drm_ethosu_bo_mmap_offset - Arguments passed to DRM_IOCTL_ETHOSU_BO_MMAP_OFFSET.
*/
struct drm_ethosu_bo_mmap_offset {
/** @handle: Handle of the object we want an mmap offset for. */
__u32 handle;
/** @pad: MBZ. */
__u32 pad;
/** @offset: The fake offset to use for subsequent mmap calls. */
__u64 offset;
};
/**
 * struct drm_ethosu_bo_wait - ioctl argument for waiting for
 * completion of the last DRM_ETHOSU_SUBMIT on a BO.
 *
 * This is useful for cases where multiple processes might be
 * rendering to a BO and you want to wait for all rendering to be
 * completed.
 */
struct drm_ethosu_bo_wait {
	/** @handle: Handle of the BO to wait on. */
	__u32 handle;
	/** @pad: MBZ. */
	__u32 pad;
	/** @timeout_ns: Absolute timeout, in nanoseconds. */
	__s64 timeout_ns; /* absolute */
};
/**
 * struct drm_ethosu_cmdstream_bo_create - Arguments passed to
 * DRM_IOCTL_ETHOSU_CMDSTREAM_BO_CREATE.
 */
struct drm_ethosu_cmdstream_bo_create {
	/** @size: Size of the data argument. */
	__u32 size;
	/** @flags: Flags, currently must be 0. */
	__u32 flags;
	/** @data: Pointer to the data. */
	__u64 data;
	/** @handle: Returned GEM handle for the BO. */
	__u32 handle;
	/** @pad: Pad, must be 0. */
	__u32 pad;
};
/**
* struct drm_ethosu_job - A job to be run on the NPU
*
* The kernel will schedule the execution of this job taking into account its
* dependencies with other jobs. All tasks in the same job will be executed
* sequentially on the same core, to benefit from memory residency in SRAM.
*/
struct drm_ethosu_job {
/** Input: BO handle for cmdstream. */
__u32 cmd_bo;
/** Input: Amount of SRAM to use. */
__u32 sram_size;
#define ETHOSU_MAX_REGIONS 8
/** Input: Array of BO handles for each region. */
__u32 region_bo_handles[ETHOSU_MAX_REGIONS];
};
/**
* struct drm_ethosu_submit - ioctl argument for submitting commands to the NPU.
*
* The kernel will schedule the execution of these jobs in dependency order.
*/
struct drm_ethosu_submit {
/** Input: Pointer to an array of struct drm_ethosu_job. */
__u64 jobs;
/** Input: Number of jobs passed in. */
__u32 job_count;
/** Reserved, must be zero. */
__u32 pad;
};
/**
* DRM_IOCTL_ETHOSU() - Build a ethosu IOCTL number
* @__access: Access type. Must be R, W or RW.
* @__id: One of the DRM_ETHOSU_xxx id.
* @__type: Suffix of the type being passed to the IOCTL.
*
* Don't use this macro directly, use the DRM_IOCTL_ETHOSU_xxx
* values instead.
*
* Return: An IOCTL number to be passed to ioctl() from userspace.
*/
#define DRM_IOCTL_ETHOSU(__access, __id, __type) \
DRM_IO ## __access(DRM_COMMAND_BASE + DRM_ETHOSU_ ## __id, \
struct drm_ethosu_ ## __type)
enum {
DRM_IOCTL_ETHOSU_DEV_QUERY =
DRM_IOCTL_ETHOSU(WR, DEV_QUERY, dev_query),
DRM_IOCTL_ETHOSU_BO_CREATE =
DRM_IOCTL_ETHOSU(WR, BO_CREATE, bo_create),
DRM_IOCTL_ETHOSU_BO_WAIT =
DRM_IOCTL_ETHOSU(WR, BO_WAIT, bo_wait),
DRM_IOCTL_ETHOSU_BO_MMAP_OFFSET =
DRM_IOCTL_ETHOSU(WR, BO_MMAP_OFFSET, bo_mmap_offset),
DRM_IOCTL_ETHOSU_CMDSTREAM_BO_CREATE =
DRM_IOCTL_ETHOSU(WR, CMDSTREAM_BO_CREATE, cmdstream_bo_create),
DRM_IOCTL_ETHOSU_SUBMIT =
DRM_IOCTL_ETHOSU(WR, SUBMIT, submit),
};
#if defined(__cplusplus)
}
#endif
#endif /* _ETHOSU_DRM_H_ */

View file

@ -186,7 +186,7 @@ elif gallium_drivers.contains('all')
gallium_drivers = [
'r300', 'r600', 'radeonsi', 'crocus', 'v3d', 'vc4', 'freedreno', 'etnaviv', 'i915',
'nouveau', 'svga', 'tegra', 'virgl', 'lima', 'panfrost', 'llvmpipe', 'softpipe', 'iris',
'zink', 'd3d12', 'asahi', 'rocket'
'zink', 'd3d12', 'asahi', 'rocket', 'ethosu'
]
endif
@ -214,6 +214,7 @@ with_gallium_zink = gallium_drivers.contains('zink')
with_gallium_d3d12 = gallium_drivers.contains('d3d12')
with_gallium_asahi = gallium_drivers.contains('asahi')
with_gallium_rocket = gallium_drivers.contains('rocket')
with_gallium_ethosu = gallium_drivers.contains('ethosu')
foreach gallium_driver : gallium_drivers
pre_args += '-DHAVE_@0@'.format(gallium_driver.to_upper())
endforeach

View file

@ -86,7 +86,7 @@ option(
value : ['auto'],
choices : [
'all', 'auto',
'asahi', 'crocus', 'd3d12', 'etnaviv', 'freedreno', 'i915', 'iris',
'asahi', 'crocus', 'd3d12', 'ethosu', 'etnaviv', 'freedreno', 'i915', 'iris',
'lima', 'llvmpipe', 'nouveau', 'panfrost', 'r300', 'r600', 'radeonsi',
'rocket', 'softpipe', 'svga', 'tegra', 'v3d', 'vc4', 'virgl', 'zink',
],

View file

@ -0,0 +1,2 @@
BasedOnStyle: InheritParentConfig
DisableFormat: false

View file

@ -0,0 +1,14 @@
Add.Op/.*
AddQuant.Op/.*
Conv2D.Op/.*
DepthwiseConv2D.Op/.*
FullyConnected.Op/.*
# Don't support unfused Pad operations yet
Models.Op/yolox_000
Models.Op/yolox_003
Models.Op/yolox_012
Models.Op/yolox_027
Models.Op/yolox_042
Models.Op/yolox_077
Models.Op/yolox_086

View file

@ -0,0 +1,75 @@
#!/usr/bin/python3
#
# Copyright © 2024-2025 Tomeu Vizoso
#
# SPDX-License-Identifier: MIT
import sys
import os
import argparse
import struct
from gen_parser import Parser, Reg, Enum, mask, Error
def main():
    """Disassemble a raw register dump into EMIT() statements.

    Reads a register description from --xml and a binary dump from --dump,
    where each 8-byte record is ``<little-endian: s16 offset, u32 value,
    s16 target>``.  Known registers are printed as EMIT(REG_..., fields...);
    unknown offsets are printed raw as "target offset value" in hex.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--xml', type=str, required=True)
    parser.add_argument('--dump', type=str, required=True)
    args = parser.parse_args()

    p = Parser()
    try:
        p.parse("", args.xml)
    except Error as e:
        print(e, file=sys.stderr)
        sys.exit(1)

    # Index registers by offset for O(1) lookup while walking the dump.
    regs = {}
    for e in p.file:
        if isinstance(e, Reg):
            regs[e.offset] = e

    # Map domain names to their values from the "target" enum.
    domains = {}
    for e in p.file:
        if isinstance(e, Enum):
            if e.name == "target":
                for name, val in e.values:
                    domains[name] = val

    # Use a context manager so the dump file is closed on all paths.
    with open(args.dump, mode='rb') as f:
        for _ in range(os.path.getsize(args.dump) // 8):
            cmd = f.read(8)
            (offset, value, target) = struct.unpack("<hIh", cmd)

            if offset not in regs:
                print("%x %x %x" % (target, offset, value))
                continue

            reg = regs[offset]
            # Low bit of the target is ignored when matching the domain.
            if (target & 0xfffffffe) != domains[reg.domain]:
                print("WARNING: target 0x%x doesn't match register's domain 0x%x" % (target, domains[reg.domain]))

            print("EMIT(REG_%s, " % regs[offset].full_name.upper(), end="")
            first = True
            if value == 0 or len(reg.bitset.fields) == 1:
                print("0x%x" % value, end="")
            else:
                # Decode the value field by field, OR-ing the macros together.
                for field in reg.bitset.fields:
                    if field.type == "boolean":
                        if 1 << field.high & value:
                            if not first:
                                print(" | ", end="")
                            print("%s_%s" % (reg.full_name.upper(), field.name.upper()), end="")
                            first = False
                    elif field.type == "uint":
                        field_value = (value & mask(field.low, field.high)) >> field.low
                        if field_value != 0:
                            if not first:
                                print(" | ", end="")
                            print("%s_%s(%d)" % (reg.full_name.upper(), field.name.upper(), field_value), end="")
                            first = False
            print(");")
if __name__ == '__main__':
main()

View file

@ -0,0 +1,783 @@
/*
* Copyright (c) 2024 Tomeu Vizoso <tomeu@tomeuvizoso.net>
* SPDX-License-Identifier: MIT
*/
#include <fcntl.h>
#include <math.h>
#include <stdbool.h>
#include "util/macros.h"
#include "util/u_dynarray.h"
#include "ethosu_cmd.h"
#include "ethosu_coefs.h"
#include "ethosu_ml.h"
#include "ethosu_registers.h"
#include "ethosu_sched.h"
#define MAX_BLOCKDEP 3
#define MAX_OUTSTANDING_DMA_OPS 2
#define MAX_OUTSTANDING_NPU_OPS 2
/*
 * Which elementwise input operand the OPA/OPB scale applies to; written
 * into the IFM precision SCALE_MODE field (see emit_ifm_precision()).
 */
enum ethosu_op_to_scale {
   OP_NONE = 0,
   OP_A = 1,
   OP_B = 2,
};
/*
 * Make sure at least two 32-bit words are free in the subgraph's command
 * stream buffer, growing it by 32 words when the cursor gets close to the
 * end.  Must be called before every command append (see EMIT0/EMIT1).
 */
static void
ethosu_ensure_cmdstream(struct ethosu_subgraph *subgraph)
{
   if ((subgraph->cursor - subgraph->cmdstream) < (subgraph->cmdstream_used - 2))
      return;

   unsigned cur_size = subgraph->cursor - subgraph->cmdstream;

   /* Don't overwrite the pointer with realloc's result directly: on
    * failure realloc returns NULL and the original allocation would be
    * leaked (and then dereferenced through a NULL cmdstream). There is
    * no recovery path here, so treat OOM as fatal. */
   void *grown = realloc(subgraph->cmdstream,
                         (subgraph->cmdstream_used + 32) * sizeof(*subgraph->cmdstream));
   if (!grown) {
      fprintf(stderr, "ethosu: failed to grow command stream\n");
      abort();
   }

   subgraph->cmdstream = grown;
   subgraph->cursor = subgraph->cmdstream + cur_size;
   subgraph->cmdstream_used += 32;
}
/*
 * Append a payload-less ("cmd0") command word to the command stream of the
 * `subgraph` variable in scope: 16-bit opcode in the low half, 16-bit
 * parameter in the high half.  Grows the stream as needed.
 */
#define EMIT0(cmd, param) \
   do { \
      ethosu_ensure_cmdstream(subgraph); \
      *(subgraph->cursor++) = cmd | (((param) & 0xFFFF) << 16); \
      if (DBG_ENABLED(ETHOSU_DBG_MSGS)) \
         fprintf(stderr, "emit0(%s, 0x%x);\n", ethosu_get_cmd_name(0, cmd), (param) & 0xFFFF); \
   } while (0)

/*
 * Append a two-word ("cmd1") command: same layout as EMIT0 with bit 14 set,
 * followed by a second word carrying a 32-bit payload (address/length/offset).
 */
#define EMIT1(cmd, param, offset) \
   do { \
      ethosu_ensure_cmdstream(subgraph); \
      *(subgraph->cursor++) = cmd | 0x4000 | (((param) & 0xFFFF) << 16); \
      *(subgraph->cursor++) = (offset) & 0xFFFFFFFF; \
      if (DBG_ENABLED(ETHOSU_DBG_MSGS)) \
         fprintf(stderr, "emit1(%s, 0x%x, 0x%x);\n", ethosu_get_cmd_name(1, cmd), (param) & 0xFFFF, (int)(offset)); \
   } while (0)
/* Program the four tile base addresses of a feature map. */
static void
emit_addresses(
   struct ethosu_subgraph *subgraph,
   struct ethosu_feature_map *fm,
   uint32_t cmd_base0, uint32_t cmd_base1, uint32_t cmd_base2, uint32_t cmd_base3)
{
   const uint32_t base_cmds[4] = { cmd_base0, cmd_base1, cmd_base2, cmd_base3 };

   for (unsigned i = 0; i < 4; i++)
      EMIT1(base_cmds[i], 0x0, fm->tiles.addresses[i]);
}
/* Program the tile geometry: heights of tiles 0 and 1, width of tile 0. */
static void
emit_tiles(
   struct ethosu_subgraph *subgraph,
   struct ethosu_feature_map *fm,
   uint32_t cmd_height0, uint32_t cmd_height1, uint32_t cmd_width0)
{
   /* The hardware registers hold "minus one" encodings. */
   unsigned height0_m1 = fm->tiles.height_0 - 1;
   unsigned height1_m1 = fm->tiles.height_1 - 1;
   unsigned width0_m1 = fm->tiles.width_0 - 1;

   EMIT0(cmd_height0, height0_m1);
   EMIT0(cmd_height1, height1_m1);
   EMIT0(cmd_width0, width0_m1);
}
/*
 * Program the per-channel, per-row and per-column byte strides of a
 * feature map's backing tensor, derived from its layout and shape.
 * elem_size is hard-coded to 1: only 8-bit tensors are handled here.
 */
static void
emit_strides(
   struct ethosu_subgraph *subgraph,
   struct ethosu_feature_map *feature_map,
   uint32_t cmd_stride_c, uint32_t cmd_stride_y, uint32_t cmd_stride_x)
{
   unsigned elem_size = 1; /* 8-bit elements only */
   unsigned tensor_x, tensor_y, tensor_c;
   struct ethosu_tensor *tensor = ethosu_find_tensor(subgraph, feature_map->tensor_idx);

   if (tensor->layout == ETHOSU_LAYOUT_NHCWB16) {
      /* Bricked layout: 16-channel groups; depth is padded to 16. */
      tensor_x = 16 * elem_size;
      tensor_c = tensor_x * tensor->shape.width;
      tensor_y = elem_size * tensor->shape.width * ALIGN(tensor->shape.depth, 16);
   } else {
      /* Plain NHWC: channels are innermost. */
      tensor_c = elem_size;
      tensor_x = tensor->shape.depth * tensor_c;
      tensor_y = tensor->shape.width * tensor_x;
   }

   EMIT1(cmd_stride_c, 0x0, tensor_c);
   EMIT1(cmd_stride_y, 0x0, tensor_y);
   EMIT1(cmd_stride_x, 0x0, tensor_x);
}
/*
 * Program the input feature map: memory region, tile base addresses and
 * geometry, depth, strides, and the quantization zero point.
 */
static void
emit_ifm(struct ethosu_subgraph *subgraph, struct ethosu_feature_map *feature_map)
{
   EMIT0(NPU_SET_IFM_REGION, IO_REGION);
   emit_addresses(
      subgraph,
      feature_map,
      NPU_SET_IFM_BASE0,
      NPU_SET_IFM_BASE1,
      NPU_SET_IFM_BASE2,
      NPU_SET_IFM_BASE3);
   emit_tiles(
      subgraph, feature_map, NPU_SET_IFM_HEIGHT0_M1, NPU_SET_IFM_HEIGHT1_M1, NPU_SET_IFM_WIDTH0_M1);
   EMIT0(NPU_SET_IFM_DEPTH_M1, feature_map->shape.depth - 1);
   emit_strides(subgraph, feature_map, NPU_SET_IFM_STRIDE_C, NPU_SET_IFM_STRIDE_Y, NPU_SET_IFM_STRIDE_X);
   EMIT0(NPU_SET_IFM_ZERO_POINT, feature_map->zero_point);
}
/*
 * Program an input feature map's precision word: tensor layout format,
 * activation signedness and the operand scale mode.  @precision_cmd
 * selects which register is written (IFM vs IFM2).
 */
static void
emit_ifm_precision(struct ethosu_subgraph *subgraph,
                   struct ethosu_feature_map *feature_map,
                   enum ethosu_op_to_scale op_to_scale, uint32_t precision_cmd)
{
   struct ethosu_tensor *tensor = ethosu_find_tensor(subgraph, feature_map->tensor_idx);
   unsigned prec = 0;

   if (tensor->layout == ETHOSU_LAYOUT_NHCWB16)
      prec |= NPU_SET_IFM_PRECISION_FORMAT(1);
   if (feature_map->is_signed)
      prec |= NPU_SET_IFM_PRECISION_ACTIVATION(1); // signed activation
   /* Scale mode says which operand (if any) the OPA/OPB scales apply to. */
   prec |= NPU_SET_IFM_PRECISION_SCALE_MODE(op_to_scale);

   EMIT0(precision_cmd, prec);
}
/* Program the four IFM edge-padding amounts for kernel-based operations. */
static void
emit_padding(struct ethosu_subgraph *subgraph, struct ethosu_operation *op)
{
   /* Top, left, bottom, right: register order expected by the NPU. */
   EMIT0(NPU_SET_IFM_PAD_TOP, op->pad.top);
   EMIT0(NPU_SET_IFM_PAD_LEFT, op->pad.left);
   EMIT0(NPU_SET_IFM_PAD_BOTTOM, op->pad.bottom);
   EMIT0(NPU_SET_IFM_PAD_RIGHT, op->pad.right);
}
/*
 * Program the output feature map: memory region, tile base addresses and
 * geometry, full height/width/depth, strides and quantization zero point.
 */
static void
emit_ofm(struct ethosu_subgraph *subgraph, struct ethosu_feature_map *feature_map)
{
   EMIT0(NPU_SET_OFM_REGION, IO_REGION);
   emit_addresses(
      subgraph,
      feature_map,
      NPU_SET_OFM_BASE0,
      NPU_SET_OFM_BASE1,
      NPU_SET_OFM_BASE2,
      NPU_SET_OFM_BASE3);
   emit_tiles(
      subgraph, feature_map, NPU_SET_OFM_HEIGHT0_M1, NPU_SET_OFM_HEIGHT1_M1, NPU_SET_OFM_WIDTH0_M1);
   EMIT0(NPU_SET_OFM_HEIGHT_M1, feature_map->shape.height - 1);
   EMIT0(NPU_SET_OFM_WIDTH_M1, feature_map->shape.width - 1);
   EMIT0(NPU_SET_OFM_DEPTH_M1, feature_map->shape.depth - 1);
   emit_strides(subgraph, feature_map, NPU_SET_OFM_STRIDE_C, NPU_SET_OFM_STRIDE_Y, NPU_SET_OFM_STRIDE_X);
   EMIT0(NPU_SET_OFM_ZERO_POINT, feature_map->zero_point);
}
/*
 * Program the OFM precision word: layout format, activation signedness,
 * scale mode and rounding mode.
 */
static void
emit_ofm_precision(struct ethosu_subgraph *subgraph, struct ethosu_operation *operation)
{
   struct ethosu_tensor *tensor = ethosu_find_tensor(subgraph, operation->ofm.tensor_idx);
   unsigned prec = 0;

   if (tensor->layout == ETHOSU_LAYOUT_NHCWB16)
      prec |= NPU_SET_OFM_PRECISION_FORMAT(1);
   if (operation->ofm.is_signed)
      prec |= NPU_SET_OFM_PRECISION_ACTIVATION(1);
   /* Pooling and elementwise ops use the OFM_SCALE register (mode 1). */
   if (operation->type == ETHOSU_OPERATION_TYPE_POOLING ||
       operation->type == ETHOSU_OPERATION_TYPE_ELTWISE) {
      prec |= NPU_SET_OFM_PRECISION_SCALE_MODE(1);
   }
   prec |= NPU_SET_OFM_PRECISION_ROUND_MODE(operation->round_mode);

   EMIT0(NPU_SET_OFM_PRECISION, prec);
}
/*
 * Program the kernel size and the packed stride/dilation word:
 *   bit 0      (stride_x - 1) low bit
 *   bit 1      (stride_y - 1) low bit
 *   bit 2      part-kernel-first traversal (convolutions)
 *   bit 3      dilation_x - 1
 *   bit 4      dilation_y - 1
 *   bits 6+    (stride_x - 1) upper bits
 *   bits 9+    (stride_y - 1) upper bits
 */
static void
emit_kernel(struct ethosu_subgraph *subgraph, struct ethosu_operation *operation)
{
   EMIT0(NPU_SET_KERNEL_HEIGHT_M1, operation->kernel.height - 1);
   EMIT0(NPU_SET_KERNEL_WIDTH_M1, operation->kernel.width - 1);

   unsigned stride = (operation->kernel.stride_x - 1) & 1;
   stride |= ((operation->kernel.stride_y - 1) & 1) << 1;
   stride |= ((operation->kernel.stride_x - 1) >> 1) << 6;
   stride |= ((operation->kernel.stride_y - 1) >> 1) << 9;
   stride |= (operation->kernel.dilation_x - 1) << 3;
   stride |= (operation->kernel.dilation_y - 1) << 4;
   stride |= operation->conv.part_kernel_first << 2;
   EMIT0(NPU_SET_KERNEL_STRIDE, stride);
}
/* Point the NPU at the encoded weight stream: region, base and length. */
static void
emit_weights(struct ethosu_subgraph *subgraph, struct ethosu_operation *operation)
{
   EMIT0(NPU_SET_WEIGHT_REGION, operation->conv.weights.region);
   EMIT1(NPU_SET_WEIGHT_BASE, 0x0, operation->conv.weights.address);
   EMIT1(NPU_SET_WEIGHT_LENGTH, 0x0, operation->conv.weights.size);
}
/* Point the NPU at the bias/scale stream: region, base and length. */
static void
emit_biases(struct ethosu_subgraph *subgraph, struct ethosu_operation *operation)
{
   EMIT0(NPU_SET_SCALE_REGION, operation->conv.scales.region);
   EMIT1(NPU_SET_SCALE_BASE, 0x0, operation->conv.scales.address);
   EMIT1(NPU_SET_SCALE_LENGTH, 0x0, operation->conv.scales.size);
}
/*
 * Program the activation function (none) and the output clamp range,
 * which is the full 8-bit range of the OFM's signedness.
 */
static void
emit_activation(struct ethosu_subgraph *subgraph, struct ethosu_operation *op)
{
   /* 0xff80 is -128 in the register's 16-bit two's-complement encoding. */
   const unsigned clamp_min = op->ofm.is_signed ? 0xff80 : 0x00;
   const unsigned clamp_max = op->ofm.is_signed ? 0x7f : 0xff;

   EMIT0(NPU_SET_ACTIVATION, 0x0);
   EMIT0(NPU_SET_ACTIVATION_MIN, clamp_min);
   EMIT0(NPU_SET_ACTIVATION_MAX, clamp_max);
}
/* Program the OFM block dimensions chosen by the scheduler. */
static void
emit_block_config(struct ethosu_subgraph *subgraph, struct ethosu_operation *op)
{
   const unsigned blk_height = op->block_config.ofm_block.height;
   const unsigned blk_width = op->block_config.ofm_block.width;
   const unsigned blk_depth = op->block_config.ofm_block.depth;

   EMIT0(NPU_SET_OFM_BLK_HEIGHT_M1, blk_height - 1);
   EMIT0(NPU_SET_OFM_BLK_WIDTH_M1, blk_width - 1);
   EMIT0(NPU_SET_OFM_BLK_DEPTH_M1, blk_depth - 1);
}
/*
 * Program the SHRAM layout (input-buffer end, accumulator start, and for
 * elementwise ops the second input-buffer start) and accumulator format.
 * Only emitted on U65 (see emit_common()).
 */
static void
emit_shram_registers(struct ethosu_subgraph *subgraph, struct ethosu_operation *operation)
{
   EMIT0(NPU_SET_IFM_IB_END, operation->block_config.shram_layout.ib_end);
   EMIT0(NPU_SET_AB_START, operation->block_config.shram_layout.ab_start);
   if (operation->type == ETHOSU_OPERATION_TYPE_ELTWISE)
      EMIT0(NPU_SET_IFM2_IB_START, operation->block_config.shram_layout.ib_start2);
   EMIT0(NPU_SET_ACC_FORMAT, operation->block_config.acc_type);
}
/*
 * Emit the register state shared by all operation types: IFM, precision,
 * padding, OFM, kernel, weights/biases, activation and block config.
 * The emission order is part of the command-stream contract; do not
 * reorder these calls.
 */
static void
emit_common(struct ethosu_subgraph *subgraph, struct ethosu_operation *operation, enum ethosu_op_to_scale op_to_scale)
{
   emit_ifm(subgraph, &operation->ifm);
   emit_ifm_precision(subgraph, &operation->ifm, op_to_scale, NPU_SET_IFM_PRECISION);
   EMIT0(NPU_SET_IFM_UPSCALE, operation->upscale);
   /* Elementwise operations carry no padding or kernel state. */
   if (operation->type != ETHOSU_OPERATION_TYPE_ELTWISE)
      emit_padding(subgraph, operation);
   emit_ofm(subgraph, &operation->ofm);
   emit_ofm_precision(subgraph, operation);
   if (operation->type != ETHOSU_OPERATION_TYPE_ELTWISE)
      emit_kernel(subgraph, operation);
   /* Only convolutions read weight and bias streams. */
   if (operation->type == ETHOSU_OPERATION_TYPE_CONVOLUTION) {
      emit_weights(subgraph, operation);
      emit_biases(subgraph, operation);
   }
   emit_activation(subgraph, operation);
   emit_block_config(subgraph, operation);
   if (ethosu_is_u65(ethosu_screen(subgraph->base.context->screen)))
      emit_shram_registers(subgraph, operation);
   else
      EMIT0(NPU_SET_ACC_FORMAT, 0x300); // FIXME should be based on # of MACs, only works for >=256 MACs
}
static void
emit_convolution(struct ethosu_subgraph *subgraph, struct ethosu_operation *operation)
{
ethosu_allocate_feature_map(subgraph, &operation->ifm);
operation->ifm.tiles.height_0 = operation->ifm.shape.height;
operation->ifm.tiles.height_1 = operation->ifm.shape.height;
operation->ifm.tiles.width_0 = operation->ifm.shape.width;
ethosu_allocate_feature_map(subgraph, &operation->ofm);
operation->ofm.tiles.height_0 = operation->ofm.shape.height;
operation->ofm.tiles.height_1 = operation->ofm.shape.height;
operation->ofm.tiles.width_0 = operation->ofm.shape.width;
emit_common(subgraph, operation, false);
}
/*
 * Compute a fixed-point multiplier approximating 1/nr_kernel_elements for
 * average pooling.  frexp() is used only for its exponent output k
 * (roughly log2 of the kernel element count); the returned scale is about
 * 2^(N+k) / nr_kernel_elements, with *out_shift set to N + k.
 */
static unsigned
quantise_pooling_scale(unsigned nr_kernel_elements, unsigned rescale_bits, unsigned *out_shift)
{
   int k = 0;
   long long N = 0;

   frexp(nr_kernel_elements - 1, &k);
   /* NOTE(review): rescale_bits is unsigned; a caller passing a logically
    * negative value relies on modular wraparound in "31 - rescale_bits"
    * producing the right small result — confirm. */
   N = 31 - rescale_bits;
   *out_shift = N + k;

   return ((1LL << (N + k)) + (1LL << k)) / nr_kernel_elements;
}
/*
 * Compute the OFM rescale multiplier and shift for a pooling operation,
 * folding the IFM/OFM quantization ratio into the 1/(kernel area) factor.
 *
 * NOTE(review): for 1x1 kernels with rescale < 1.0 the expression assigns
 * a negative value to the unsigned rescale_bits; this only works through
 * the unsigned wraparound in quantise_pooling_scale() — confirm intended.
 */
static unsigned
pooling_emit_ofm_scaling(
   double input1_scale,
   double output_scale,
   unsigned kernel_height,
   unsigned kernel_width,
   uint32_t *out_shift)
{
   double rescale = input1_scale / output_scale;
   unsigned rescale_bits = 0;
   unsigned scale;

   if (kernel_height == 1 && kernel_width == 1) {
      /* Extra headroom bits so the combined scale stays in range. */
      if (rescale > 1.0)
         rescale_bits = 32 - __builtin_clz(ceil(rescale)) + 1;
      else if (rescale < 1.0)
         rescale_bits = -(32 - __builtin_clz(ceil(1 / rescale))) - 1;
   }

   scale = quantise_pooling_scale(kernel_height * kernel_width, rescale_bits, out_shift);
   scale = ceil(scale * rescale);

   return scale;
}
static void
emit_pooling(struct ethosu_subgraph *subgraph, struct ethosu_operation *operation)
{
unsigned scale;
unsigned scale_shift;
emit_common(subgraph, operation, false);
if (operation->pooling.avg) {
scale = pooling_emit_ofm_scaling(
operation->ifm.scale,
operation->ofm.scale,
operation->kernel.height,
operation->kernel.width,
&scale_shift);
EMIT1(NPU_SET_OFM_SCALE, scale_shift, scale);
}
}
/*
 * Program the second input feature map of an elementwise operation.
 * Scalar operands need no addressing state; only the zero point is
 * always written.
 */
static void
emit_ifm2(struct ethosu_subgraph *subgraph, struct ethosu_operation *op, bool has_scalar)
{
   struct ethosu_feature_map *ifm2 = &op->ifm2;

   if (!has_scalar) {
      EMIT0(NPU_SET_IFM2_REGION, IO_REGION);
      emit_addresses(subgraph, ifm2, NPU_SET_IFM2_BASE0, NPU_SET_IFM2_BASE1,
                     NPU_SET_IFM2_BASE2, NPU_SET_IFM2_BASE3);
      emit_tiles(subgraph, ifm2, NPU_SET_IFM2_HEIGHT0_M1, NPU_SET_IFM2_HEIGHT1_M1,
                 NPU_SET_IFM2_WIDTH0_M1);
      emit_strides(subgraph, ifm2, NPU_SET_IFM2_STRIDE_C, NPU_SET_IFM2_STRIDE_Y,
                   NPU_SET_IFM2_STRIDE_X);
   }

   EMIT0(NPU_SET_IFM2_ZERO_POINT, ifm2->zero_point);
}
/* Program IFM2 broadcasting; no broadcast modes are used yet. */
static void
emit_ifm2_broadcast(struct ethosu_subgraph *subgraph, struct ethosu_operation *op)
{
   EMIT0(NPU_SET_IFM2_BROADCAST, 0x0);
}
/*
def generate_scaling_for_elementwise(emit: CommandStreamEmitter, npu_op: NpuElementWiseOperation) -> int:
input_scale = npu_op.ifm.quantization.scale_f32 if npu_op.ifm.quantization else None
input2_scale = npu_op.ifm2.quantization.scale_f32 if npu_op.ifm2.quantization else None
output_scale = npu_op.ofm.quantization.scale_f32 if npu_op.ofm.quantization else None
if npu_op.activation is not None and npu_op.activation.op_type in (
NpuActivationOp.SIGMOID,
NpuActivationOp.TANH,
):
output_scale = 1 / 0x3000
if npu_op.sub_op_type == NpuElementWiseOp.MUL:
if npu_op.rescale:
ofm_scale, shift = npu_op.rescale
elif None in (input_scale, input2_scale, output_scale):
ofm_scale = 1
shift = 0
else:
ofm_scale, shift = scaling.elementwise_mul_scale(input_scale, input2_scale, output_scale)
else: # Add/Sub
# Default operand scaling is no scaling
opa_scale = opb_scale = 1
opa_shift = 0
bitdepth = npu_op.ifm.data_type.size_in_bits()
use_advanced_scaling = False
if npu_op.rescale is not None:
# Explicit ofm scaling
ofm_scale, shift = npu_op.rescale
elif None in (input_scale, input2_scale, output_scale):
# No ofm scaling
ofm_scale = 1
shift = 0
elif input_scale == input2_scale and bitdepth == 16:
# int16 same scaling
opa_scale, opb_scale, ofm_scale, shift = scaling.simplified_elementwise_add_sub_scale(
input_scale, input2_scale, output_scale
)
# align the double rounding with that of advanced scaling
opa_scale //= 2
opb_scale //= 2
shift -= 1
opa_shift = 0 # Unused for this case
elif input_scale == input2_scale:
# Same scaling
opa_scale, opb_scale, ofm_scale, shift = scaling.simplified_elementwise_add_sub_scale(
input_scale, input2_scale, output_scale
)
opa_shift = 0 # Unused for this case
# For 8 bit we can't guarantee double rounding with simplified scaling will always be
# the same as with advanced scaling due to different shifts. When the ofm scale fulfils
# the following we know that double rounding will have no effect for advanced scaling
# no matter the input, so we can safely use simplified scaling with double rounding disabled.
use_advanced_scaling = int(ofm_scale) & 0xFFF != 0
else:
use_advanced_scaling = True
if use_advanced_scaling:
# Use advanced implementation only when input/output scales differ,
# or when we can't guarantee the absence of rounding errors
(
opa_scale,
opa_shift,
ofm_scale,
shift,
op_to_scale,
) = scaling.advanced_elementwise_add_sub_scale(input_scale, input2_scale, output_scale, bitdepth)
opb_scale = 0 # Unused for this case
if npu_op.reversed_operands:
# If the operand order is reversed we also have to swap which operand is scaled
if op_to_scale == scaling.OperandToScale.OPa:
op_to_scale = scaling.OperandToScale.OPb
else:
op_to_scale = scaling.OperandToScale.OPa
emit.cmd1_with_offset(cmd1.NPU_SET_OPA_SCALE, opa_scale, opa_shift)
emit.cmd1_with_offset(cmd1.NPU_SET_OPB_SCALE, opb_scale)
*/
/*
 * Compute the operand and output rescale factors for an elementwise
 * add/sub where the two input scales can be equalized: both inputs are
 * rescaled into a common 2^input_shift fixed-point domain relative to the
 * larger input scale, and the output scale/shift quantizes back out.
 */
static void
simplified_elementwise_add_sub_scale(
   double input1_scale,
   double input2_scale,
   double output_scale,
   uint32_t input_shift,
   double *out_input1_rescale,
   double *out_input2_rescale,
   uint32_t *out_out_scale,
   uint32_t *out_out_shift)
{
   double max_input_scale = MAX2(input1_scale, input2_scale);
   double input_shift_val = (double)(1LL << input_shift); /* Use 1LL for large shifts */

   *out_input1_rescale = input1_scale * input_shift_val / (2.0 * max_input_scale);
   *out_input2_rescale = input2_scale * input_shift_val / (2.0 * max_input_scale);

   /*
    * Be careful with division by zero or very small output_scale if output_scale
    * can be zero or close to zero.
    */
   double output_rescale_val;
   if (output_scale == 0.0) {
      /* Handle error or return specific value */
      output_rescale_val = 0.0; /* Or INFINITY, depending on desired behavior */
   } else {
      output_rescale_val = (2.0 * max_input_scale) / (output_scale * input_shift_val);
   }

   *out_out_scale = ethosu_quantize_scale(output_rescale_val, out_out_shift);
}
/*
 * Emit the OPA/OPB/OFM scale registers for an elementwise add/sub and
 * return which operand the OPA scale applies to (the one with the
 * smaller quantization scale).  bitdepth is hard-coded to 8 for now, so
 * input_shift is always 20.
 */
static enum ethosu_op_to_scale
eltwise_emit_ofm_scaling(struct ethosu_subgraph *subgraph, struct ethosu_operation *operation)
{
   double max_input_scale = MAX2(operation->ifm.scale, operation->ifm2.scale);
   double min_input_scale = MIN2(operation->ifm.scale, operation->ifm2.scale);
   unsigned bitdepth = 8; /* only 8-bit tensors supported */
   uint32_t input_shift = (bitdepth == 8) ? 20 : 15;
   double input1_rescale_tmp;
   double input2_rescale_tmp;
   unsigned ofm_scale, ofm_shift;
   unsigned opa_scale, opa_shift;

   simplified_elementwise_add_sub_scale(
      min_input_scale, max_input_scale, operation->ofm.scale, input_shift,
      &input1_rescale_tmp, &input2_rescale_tmp,
      &ofm_scale, &ofm_shift);

   opa_scale = ethosu_quantize_scale(input1_rescale_tmp, &opa_shift);

   EMIT1(NPU_SET_OPA_SCALE, opa_shift, opa_scale);
   EMIT1(NPU_SET_OPB_SCALE, 0x0, 0x0);
   EMIT1(NPU_SET_OFM_SCALE, ofm_shift, ofm_scale);

   /* The OPA scale was built for the smaller input scale. */
   if (operation->ifm.scale < operation->ifm2.scale)
      return OP_A;
   else
      return OP_B;
}
static void
emit_eltwise(struct ethosu_subgraph *subgraph, struct ethosu_operation *operation)
{
bool has_scalar = false;
enum ethosu_op_to_scale op_to_scale = OP_NONE;
op_to_scale = eltwise_emit_ofm_scaling(subgraph, operation);
emit_common(subgraph, operation, op_to_scale);
emit_ifm2(subgraph, operation, has_scalar);
emit_ifm_precision(subgraph, &operation->ifm2, OP_NONE, NPU_SET_IFM2_PRECISION);
emit_ifm2_broadcast(subgraph, operation);
}
/*
 * Program a DMA transfer from the coefficients region into the start of
 * the scratch (SRAM) region.  The transfer is kicked off separately by
 * NPU_OP_DMA_START (see emit_operation_code()).
 */
static void
emit_dma(struct ethosu_subgraph *subgraph, struct ethosu_operation *operation)
{
   EMIT0(NPU_SET_DMA0_SRC_REGION, COEFS_REGION);
   EMIT1(NPU_SET_DMA0_SRC, 0x0, operation->dma.address);
   EMIT0(NPU_SET_DMA0_DST_REGION, SCRATCH_REGION);
   EMIT1(NPU_SET_DMA0_DST, 0x0, 0x0);
   EMIT1(NPU_SET_DMA0_LEN, 0x0, operation->dma.size);
}
/*
 * Emit the command that kicks off execution of @operation using the
 * register state programmed before it.
 */
static void
emit_operation_code(struct ethosu_subgraph *subgraph, struct ethosu_operation *operation)
{
   switch (operation->type) {
   case ETHOSU_OPERATION_TYPE_CONVOLUTION:
      if (operation->conv.depthwise)
         EMIT0(NPU_OP_DEPTHWISE, 0x0);
      else
         EMIT0(NPU_OP_CONV, 0x0);
      break;
   case ETHOSU_OPERATION_TYPE_POOLING:
      /* The parameter selects the pooling mode (non-zero for average). */
      EMIT0(NPU_OP_POOL, operation->pooling.avg);
      break;
   case ETHOSU_OPERATION_TYPE_ELTWISE:
      /* NOTE(review): 0x1 presumably selects the elementwise sub-op —
       * confirm against the command-stream spec. */
      EMIT0(NPU_OP_ELEMENTWISE, 0x1);
      break;
   case ETHOSU_OPERATION_TYPE_DMA:
      EMIT0(NPU_OP_DMA_START, 0x0);
      break;
   }
}
static void
emit_cmd_waits(struct ethosu_subgraph *subgraph, int npu_waits, int dma_waits)
{
if (npu_waits >= 0)
EMIT0(NPU_OP_KERNEL_WAIT, npu_waits);
if (dma_waits >= 0)
EMIT0(NPU_OP_DMA_WAIT, dma_waits);
}
static bool
ethosu_intersects_accesses(struct ethosu_address_range *a, struct ethosu_address_range *b)
{
for (int i = 0; i < MAX_MEMORY_ACCESSES; i++) {
for (int j = 0; j < MAX_MEMORY_ACCESSES; j++) {
if (a[i].size == 0 || b[j].size == 0)
continue;
if (a[i].region != b[j].region)
continue;
if (a[i].address < b[j].address + b[j].size &&
b[j].address < a[i].address + a[i].size)
return true;
}
}
return false;
}
/*
 * Return true when @op2 must not start before @op1 finishes: checks
 * true (write->read), anti (read->write) and output (write->write)
 * dependencies.  read->read never conflicts.
 */
static bool
ethosu_operations_conflict(struct ethosu_subgraph *subgraph,
                           struct ethosu_operation *op1, struct ethosu_operation *op2)
{
   return ethosu_intersects_accesses(op1->write_accesses, op2->read_accesses) ||
          ethosu_intersects_accesses(op1->read_accesses, op2->write_accesses) ||
          ethosu_intersects_accesses(op1->write_accesses, op2->write_accesses);
}
/*
 * Compute how many outstanding operations of the *other* queue (kernel vs
 * DMA) @operation must wait on before it can start, based on memory-access
 * conflicts.  *npu_waits / *dma_waits are set to -1 when no wait is needed.
 *
 * NOTE(review): util_dynarray_pop() removes the most recently appended
 * element, so the "> MAX_OUTSTANDING_*" trimming drops the operation that
 * was just appended rather than the oldest entry — confirm this is the
 * intended bookkeeping.
 */
static void
get_wait_dependency(struct ethosu_subgraph *subgraph, struct ethosu_operation *operation,
                    struct util_dynarray *outstanding_dma_ops,
                    struct util_dynarray *outstanding_npu_ops,
                    int *npu_waits, int *dma_waits)
{
   unsigned kern_wait = -1;
   unsigned dma_wait = -1;
   struct util_dynarray *outstanding_ops = NULL;

   if (operation->type == ETHOSU_OPERATION_TYPE_DMA) {
      /* A DMA op scans outstanding NPU ops for conflicts. */
      outstanding_ops = outstanding_npu_ops;
      util_dynarray_append(outstanding_dma_ops, struct ethosu_operation *, operation);
      unsigned dmap_ops = util_dynarray_num_elements(outstanding_dma_ops, struct ethosu_operation *);
      if (dmap_ops > MAX_OUTSTANDING_DMA_OPS)
         (void)util_dynarray_pop(outstanding_dma_ops, struct ethosu_operation *);
   } else {
      /* An NPU op scans outstanding DMA ops for conflicts. */
      outstanding_ops = outstanding_dma_ops;
      util_dynarray_append(outstanding_npu_ops, struct ethosu_operation *, operation);
      unsigned npu_ops = util_dynarray_num_elements(outstanding_npu_ops, struct ethosu_operation *);
      if (npu_ops > MAX_OUTSTANDING_NPU_OPS)
         (void)util_dynarray_pop(outstanding_npu_ops, struct ethosu_operation *);
   }

   /* Walk from newest to oldest; waits counts how far back the conflict is. */
   unsigned waits = -1;
   for (int idx = util_dynarray_num_elements(outstanding_ops, struct ethosu_operation *) - 1; idx >= 0; idx--) {
      waits += 1;
      struct ethosu_operation *other_op = *util_dynarray_element(outstanding_ops, struct ethosu_operation *, idx);
      if (other_op == operation)
         continue;
      if (ethosu_operations_conflict(subgraph, other_op, operation)) {
         if (operation->type == ETHOSU_OPERATION_TYPE_DMA)
            kern_wait = waits;
         else
            dma_wait = waits;
         // Current op needs to wait, and after it has waited,
         // outstanding_ops[0..idx] are not outstanding any longer.
         for (int i = 0; i <= idx; i++)
            (void)util_dynarray_pop(outstanding_ops, struct ethosu_operation *);
         break;
      }
   }

   *npu_waits = kern_wait;
   *dma_waits = dma_wait;
}
/*
 * Record the memory ranges each operation reads and writes, for the
 * conflict tests in get_wait_dependency().
 *
 * NOTE(review): the non-DMA branch reads operation->conv.scales/weights
 * for every operation type; if 'conv' is a union member this relies on
 * those fields being zero for pooling/eltwise ops — confirm.
 */
static void
fill_memory_accesses(struct ethosu_subgraph *subgraph)
{
   util_dynarray_foreach (&subgraph->operations, struct ethosu_operation, operation) {
      switch (operation->type) {
      case ETHOSU_OPERATION_TYPE_DMA:
         /* DMA: reads from the coefficient region, writes scratch SRAM. */
         operation->read_accesses[0].region = COEFS_REGION;
         operation->read_accesses[0].address = operation->dma.address;
         operation->read_accesses[0].size = operation->dma.size;
         operation->write_accesses[0].region = SCRATCH_REGION;
         operation->write_accesses[0].address = 0x0;
         operation->write_accesses[0].size = operation->dma.size;
         break;
      default:
         /* Compute ops: read IFM(2) plus weights/scales, write the OFM.
          * Sizes assume densely packed 8-bit elements. */
         operation->read_accesses[0].region = IO_REGION;
         operation->read_accesses[0].address = operation->ifm.tiles.addresses[0];
         operation->read_accesses[0].size = operation->ifm.shape.height * operation->ifm.shape.width * operation->ifm.shape.depth;
         operation->read_accesses[1].region = IO_REGION;
         operation->read_accesses[1].address = operation->ifm2.tiles.addresses[0];
         operation->read_accesses[1].size = operation->ifm2.shape.height * operation->ifm2.shape.width * operation->ifm2.shape.depth;
         operation->read_accesses[2].region = operation->conv.scales.region;
         operation->read_accesses[2].address = operation->conv.scales.address;
         operation->read_accesses[2].size = operation->conv.scales.size;
         operation->read_accesses[3].region = operation->conv.weights.region;
         operation->read_accesses[3].address = operation->conv.weights.address;
         operation->read_accesses[3].size = operation->conv.weights.size;
         operation->write_accesses[0].region = IO_REGION;
         operation->write_accesses[0].address = operation->ofm.tiles.addresses[0];
         operation->write_accesses[0].size = operation->ofm.shape.height * operation->ofm.shape.width * operation->ofm.shape.depth;
         break;
      }
   }
}
static unsigned
calc_blockdep(struct ethosu_subgraph *subgraph, struct ethosu_operation *prev_op, struct ethosu_operation *operation)
{
if (!prev_op)
return 0;
// Check if the reserved shram will be used in current/prev op
bool prev_uses_lut = false; // prev_op->activation && prev_op->activation->op_type == NpuActivationOp.TABLE_LOOKUP;
bool curr_uses_lut = false; // operation->activation && operation->activation->op_type == NpuActivationOp.TABLE_LOOKUP;
if (prev_uses_lut && SHRAM_RESERVED_UNUSED_BANKS == 0 && !curr_uses_lut)
return 0;
return MAX_BLOCKDEP; /* TODO: Check if there is actually overlap between the FMs */
}
/* Translate the subgraph's lowered operations into an Ethos-U command
 * stream, inserting kernel/DMA wait commands and block-dependency hints
 * between operations. */
void
ethosu_emit_cmdstream(struct ethosu_subgraph *subgraph)
{
   struct ethosu_operation *prev_op = NULL;
   struct util_dynarray outstanding_dma_ops;
   struct util_dynarray outstanding_npu_ops;

   util_dynarray_init(&outstanding_dma_ops, NULL);
   util_dynarray_init(&outstanding_npu_ops, NULL);

   /* NOTE(review): 32 entries is the initial stream capacity; presumably
    * EMIT0()/the emit_* helpers grow the buffer via cmdstream_used —
    * confirm.  The calloc() result is not checked. */
   subgraph->cmdstream_used = 32;
   subgraph->cmdstream = calloc(subgraph->cmdstream_used, sizeof(*subgraph->cmdstream));
   subgraph->cursor = subgraph->cmdstream;

   fill_memory_accesses(subgraph);

   /* Compile */
   if (ethosu_is_u65(ethosu_screen(subgraph->base.context->screen)))
      EMIT0(NPU_SET_PARALLEL_MODE, 0x0);

   util_dynarray_foreach (&subgraph->operations, struct ethosu_operation, operation) {
      int npu_waits, dma_waits;

      /* How many outstanding kernel/DMA ops this command must wait on. */
      get_wait_dependency(subgraph, operation, &outstanding_dma_ops, &outstanding_npu_ops,
                          &npu_waits, &dma_waits);

      /* Emit the per-op register setup. */
      switch (operation->type) {
      case ETHOSU_OPERATION_TYPE_CONVOLUTION:
         emit_convolution(subgraph, operation);
         break;
      case ETHOSU_OPERATION_TYPE_POOLING:
         emit_pooling(subgraph, operation);
         break;
      case ETHOSU_OPERATION_TYPE_ELTWISE:
         emit_eltwise(subgraph, operation);
         break;
      case ETHOSU_OPERATION_TYPE_DMA:
         emit_dma(subgraph, operation);
         break;
      }

      /* BLOCKDEP only applies between NPU compute ops, not DMA. */
      if (operation->type != ETHOSU_OPERATION_TYPE_DMA) {
         unsigned blockdep = calc_blockdep(subgraph, prev_op, operation);
         blockdep = MIN2(blockdep, MAX_BLOCKDEP);
         EMIT0(NPU_SET_BLOCKDEP, blockdep);
         prev_op = operation;
      }

      /* Waits must precede the op-launch command emitted below. */
      emit_cmd_waits(subgraph, npu_waits, dma_waits);
      emit_operation_code(subgraph, operation);
   }

   EMIT0(NPU_OP_STOP, 0xffff);

   util_dynarray_fini(&outstanding_dma_ops);
   util_dynarray_fini(&outstanding_npu_ops);
}

View file

@ -0,0 +1,13 @@
/*
 * Copyright (c) 2024 Tomeu Vizoso <tomeu@tomeuvizoso.net>
 * SPDX-License-Identifier: MIT
 */
#ifndef ETHOSU_CMD_H
#define ETHOSU_CMD_H
#include "ethosu_ml.h"
/* Translate the subgraph's lowered operations into an NPU command stream. */
void ethosu_emit_cmdstream(struct ethosu_subgraph *subgraph);
#endif /* ETHOSU_CMD_H */

View file

@ -0,0 +1,133 @@
/*
* Copyright (c) 2024 Tomeu Vizoso <tomeu@tomeuvizoso.net>
* SPDX-License-Identifier: MIT
*/
#include "util/u_inlines.h"
#include "mlw_codec/mlw_encode.h"
#include "ethosu_coefs.h"
/* Build the packed per-channel scale/bias table for a convolution: one
 * 10-byte entry per OFM channel holding a 40-bit bias, a 32-bit
 * requantization scale and a 6-bit shift.  Returns a malloc'd buffer in
 * *scales (caller frees) and its 16-byte-aligned size in *scales_size. */
static void
fill_scale_and_biases(struct ethosu_subgraph *subgraph, struct ethosu_operation *operation, uint8_t **scales, long *scales_size, struct pipe_resource *bias_rsrc)
{
   struct pipe_transfer *transfer_in;
   int32_t *biases = pipe_buffer_map(subgraph->base.context, bias_rsrc,
                                     PIPE_MAP_READ, &transfer_in);
   unsigned idx = 0;

   *scales_size = ALIGN(operation->ofm.shape.depth * 10, 16);
   /* calloc() zero-fills the tail padding and checks the size computation,
    * replacing the previous unchecked malloc()+memset() pair. */
   *scales = calloc(1, *scales_size);

   /* The requantization scale is per-tensor, not per-channel: compute it
    * once instead of running frexp()/round() for every channel. */
   double conv_scale = ((double)operation->ifm.scale * (double)operation->kernel.scale) / (double)operation->ofm.scale;
   uint32_t shift;
   int scale = ethosu_quantize_scale(conv_scale, &shift);

   for (unsigned i = 0; i < operation->ofm.shape.depth; i++) {
      /* Sign-extend the 32-bit bias into the packed 40-bit field. */
      uint64_t bias = (int64_t)biases[i];
      (*scales)[idx++] = (bias >> (0 * 8)) & 0xFF;
      (*scales)[idx++] = (bias >> (1 * 8)) & 0xFF;
      (*scales)[idx++] = (bias >> (2 * 8)) & 0xFF;
      (*scales)[idx++] = (bias >> (3 * 8)) & 0xFF;
      (*scales)[idx++] = (bias >> (4 * 8)) & 0xFF;
      (*scales)[idx++] = (scale >> (0 * 8)) & 0xFF;
      (*scales)[idx++] = (scale >> (1 * 8)) & 0xFF;
      (*scales)[idx++] = (scale >> (2 * 8)) & 0xFF;
      (*scales)[idx++] = (scale >> (3 * 8)) & 0xFF;
      (*scales)[idx++] = shift & 0x3F;
   }
   pipe_buffer_unmap(subgraph->base.context, transfer_in);
}
/* Compute the four brick strides handed to the weight encoder
 * (mlw_reorder_encode()). */
static void
calculate_weights_strides(struct ethosu_operation *operation, int out_strides[4])
{
   int ofm_depth = operation->ofm.shape.depth;
   int ifm_depth = operation->ifm.shape.depth;
   int kernel_h = operation->kernel.height;
   int kernel_w = operation->kernel.width;

   if (operation->kernel.depthwise) {
      out_strides[0] = 1;
      out_strides[1] = ofm_depth * kernel_h;
      out_strides[2] = ofm_depth;
      out_strides[3] = ofm_depth * kernel_w;
   } else {
      /* Innermost to outermost: IFM depth, kernel width, kernel height. */
      out_strides[3] = 1;
      out_strides[2] = ifm_depth;
      out_strides[1] = ifm_depth * kernel_w;
      out_strides[0] = ifm_depth * kernel_w * kernel_h;
   }
}
/* De-quantize the weight tensor and run it through the Ethos-U weight
 * encoder.  Returns an encoder-allocated buffer in *weights (caller frees)
 * and its size in *weights_size. */
static void
fill_weights(struct ethosu_subgraph *subgraph, struct ethosu_operation *operation, uint8_t **weights, long *weights_size, struct pipe_resource *weight_rsrc)
{
   int brick_strides[4] = {0};
   /* Depthwise kernels encode a single input channel per OFM channel. */
   unsigned input_channels = operation->kernel.depthwise ? 1 : operation->ifm.shape.depth;

   calculate_weights_strides(operation, brick_strides);

   struct pipe_transfer *transfer_in;
   uint8_t *input_weights_8 = pipe_buffer_map(subgraph->base.context, weight_rsrc,
                                              PIPE_MAP_READ, &transfer_in);
   /* Hoist the size: pipe_buffer_size() was previously re-evaluated on
    * every loop iteration, and the loop counter was a signed int compared
    * against it. */
   unsigned weights_count = pipe_buffer_size(weight_rsrc);
   int16_t *input_weights = malloc(weights_count * sizeof(*input_weights));

   /* Widen to 16 bits and remove the quantization zero point, honouring
    * the tensor's signedness. */
   for (unsigned i = 0; i < weights_count; i++) {
      int raw = operation->kernel.is_signed ? (int8_t)input_weights_8[i]
                                            : input_weights_8[i];
      input_weights[i] = raw - operation->kernel.zero_point;
   }
   pipe_buffer_unmap(subgraph->base.context, transfer_in);

   long padded_size = 0;
   *weights_size = mlw_reorder_encode(
      IFM_UBLOCK.depth,
      OFM_UBLOCK.depth,
      operation->ofm.shape.depth,
      operation->kernel.height,
      operation->kernel.width,
      input_channels,
      brick_strides,
      input_weights,
      operation->block_config.ofm_block.depth,
      operation->kernel.depthwise,
      operation->conv.part_kernel_first,
      8 /* ifm_bitdepth */,
      8 /* decomp_h */,
      8 /* decomp_w */,
      weights,
      &padded_size,
      DBG_ENABLED(ETHOSU_DBG_MSGS));
   free(input_weights);
}
/* Append a blob to the subgraph's coefficient stream, 16-byte aligning the
 * next append, and return the offset it was stored at. */
static long
coefs_append(struct ethosu_subgraph *subgraph, const uint8_t *data, long size)
{
   long offset = subgraph->coefs_used;

   subgraph->coefs_used += ALIGN_POT(size, 16);
   /* Grow via a temporary so the old buffer is not leaked if realloc()
    * fails (the previous code assigned the result straight back). */
   void *grown = realloc(subgraph->coefs, subgraph->coefs_used);
   assert(grown);
   subgraph->coefs = grown;
   memcpy(subgraph->coefs + offset, data, size);

   return offset;
}

/* Encode the convolution's scales/biases and weights into the coefficient
 * stream and record their regions/addresses in the operation.  Scales are
 * stored first, immediately followed by the weights — ethosu_lower_dma()
 * relies on that contiguity. */
void
fill_coefs(struct ethosu_subgraph *subgraph,
           struct ethosu_operation *operation,
           struct pipe_resource *bias_rsrc,
           struct pipe_resource *weight_rsrc)
{
   uint8_t *scales = NULL;
   fill_scale_and_biases(subgraph, operation, &scales, &operation->conv.scales.size, bias_rsrc);
   operation->conv.scales.region = COEFS_REGION;
   operation->conv.scales.address = coefs_append(subgraph, scales, operation->conv.scales.size);
   free(scales);

   uint8_t *weights = NULL;
   fill_weights(subgraph, operation, &weights, &operation->conv.weights.size, weight_rsrc);
   operation->conv.weights.region = COEFS_REGION;
   operation->conv.weights.address = coefs_append(subgraph, weights, operation->conv.weights.size);
   free(weights);
}

View file

@ -0,0 +1,17 @@
/*
 * Copyright (c) 2024 Tomeu Vizoso <tomeu@tomeuvizoso.net>
 * SPDX-License-Identifier: MIT
 */
#ifndef ETHOSU_COEFS_H
#define ETHOSU_COEFS_H
#include "ethosu_ml.h"
/* Encode a convolution's scales/biases and weights into the subgraph's
 * coefficient stream, recording their locations in the operation. */
void
fill_coefs(struct ethosu_subgraph *subgraph,
           struct ethosu_operation *operation,
           struct pipe_resource *bias_rsrc,
           struct pipe_resource *weight_rsrc);
#endif /* ETHOSU_COEFS_H */

View file

@ -0,0 +1,243 @@
/*
* Copyright (c) 2024 Tomeu Vizoso <tomeu@tomeuvizoso.net>
* SPDX-License-Identifier: MIT
*/
#include "ethosu_device.h"
#include "ethosu_ml.h"
#include "drm-uapi/ethosu_accel.h"
#include <xf86drm.h>
#include "util/os_mman.h"
#include "util/u_inlines.h"
#include "util/u_surface.h"
#include "util/u_transfer.h"
/* Mapping of ETHOSU_DEBUG environment-variable tokens to ethosu_dbg flags. */
static const struct debug_named_value ethosu_debug_options[] = {
   {"dbg_msgs", ETHOSU_DBG_MSGS, "Print debug messages"},
   {"dump_bos", ETHOSU_DBG_DUMP_BOS, "Dump buffers for analysis"},
   {"zero_bos", ETHOSU_DBG_ZERO, "Zero buffers for debugging"},
   {"disable_nhcwb16", ETHOSU_DBG_DISABLE_NHCWB16, "Disable NHCWB16"},
   {"disable_sram", ETHOSU_DBG_DISABLE_SRAM, "Disable SRAM"},
   DEBUG_NAMED_VALUE_END};
DEBUG_GET_ONCE_FLAGS_OPTION(ethosu_debug, "ETHOSU_DEBUG", ethosu_debug_options, 0)
/* Parsed once at screen creation; tested through DBG_ENABLED(). */
int ethosu_debug = 0;
/* Screen teardown: all screen-owned allocations are ralloc-parented to the
 * screen, so a single free suffices. */
static void
ethosu_destroy_screen(struct pipe_screen *pscreen)
{
   ralloc_free(ethosu_screen(pscreen));
}
/* Context teardown: context state is ralloc-parented to the context. */
static void
ethosu_destroy_context(struct pipe_context *pctx)
{
   ralloc_free(ethosu_context(pctx));
}
/* Map a buffer resource for CPU access.  Waits for any pending NPU work on
 * the BO, then mmaps it through the DRM fd.  Returns a pointer to the
 * requested range, or NULL on failure. */
static void *
ethosu_buffer_map(struct pipe_context *pctx,
                  struct pipe_resource *prsc, unsigned level,
                  unsigned usage, const struct pipe_box *box,
                  struct pipe_transfer **out_transfer)
{
   struct ethosu_screen *screen = ethosu_screen(pctx->screen);
   struct ethosu_resource *rsc = ethosu_resource(prsc);
   struct drm_ethosu_bo_wait bo_wait = {0};
   struct drm_ethosu_bo_mmap_offset bo_mmap_offset = {0};
   int ret;

   /* Only 1D buffer resources exist in this driver. */
   assert(level == 0);
   assert(prsc->target == PIPE_BUFFER);
   assert(box->y == 0);
   assert(box->z == 0);
   assert(box->height == 1);
   assert(box->depth == 1);

   struct pipe_transfer *transfer = rzalloc(NULL, struct pipe_transfer);
   transfer->level = level;
   transfer->usage = usage;
   transfer->box = *box;
   pipe_resource_reference(&transfer->resource, prsc);

   /* Block (unbounded timeout) until the NPU is done with the BO. */
   bo_wait.handle = rsc->handle;
   bo_wait.timeout_ns = INT64_MAX;
   ret = drmIoctl(screen->fd, DRM_IOCTL_ETHOSU_BO_WAIT, &bo_wait);
   if (ret == -1)
      goto free_transfer;

   bo_mmap_offset.handle = rsc->handle;
   ret = drmIoctl(screen->fd, DRM_IOCTL_ETHOSU_BO_MMAP_OFFSET, &bo_mmap_offset);
   if (ret == -1)
      goto free_transfer;

   /* NOTE(review): this mapping is never os_munmap()ed —
    * ethosu_buffer_unmap() only releases the transfer — so each map leaks
    * a VMA; consider caching the mapping in ethosu_resource. */
   uint8_t *map = os_mmap(NULL, prsc->width0, PROT_READ | PROT_WRITE, MAP_SHARED,
                          screen->fd, bo_mmap_offset.offset);
   assert(map != MAP_FAILED);
   if (map == MAP_FAILED)
      goto free_transfer;

   *out_transfer = transfer;
   return map + box->x;

free_transfer:
   pipe_resource_reference(&transfer->resource, NULL);
   ralloc_free(transfer);
   return NULL;
}
/* Release a transfer created by ethosu_buffer_map().
 * NOTE(review): the CPU mapping itself is not unmapped here — see the
 * matching note in ethosu_buffer_map(). */
static void
ethosu_buffer_unmap(struct pipe_context *pctx,
                    struct pipe_transfer *transfer)
{
   pipe_resource_reference(&transfer->resource, NULL);
   ralloc_free(transfer);
}
/* Create a pipe_context exposing the buffer-map and ML entry points. */
static struct pipe_context *
ethosu_create_context(struct pipe_screen *screen,
                      void *priv, unsigned flags)
{
   struct ethosu_context *ctx = rzalloc(NULL, struct ethosu_context);

   /* Check the allocation before taking &ctx->base: the previous code
    * computed the member address from a potentially-NULL pointer. */
   if (!ctx)
      return NULL;

   struct pipe_context *pctx = &ctx->base;

   pctx->screen = screen;
   pctx->priv = priv;

   pctx->destroy = ethosu_destroy_context;
   pctx->buffer_map = ethosu_buffer_map;
   pctx->buffer_unmap = ethosu_buffer_unmap;
   pctx->resource_copy_region = util_resource_copy_region;
   pctx->buffer_subdata = u_default_buffer_subdata;
   pctx->clear_buffer = u_default_clear_buffer;

   /* ML (TFLite delegate) hooks. */
   pctx->ml_operation_supported = ethosu_ml_operation_supported;
   pctx->ml_subgraph_create = ethosu_ml_subgraph_create;
   pctx->ml_subgraph_invoke = ethosu_ml_subgraph_invoke;
   pctx->ml_subgraph_read_output = ethosu_ml_subgraph_read_outputs;
   pctx->ml_subgraph_destroy = ethosu_ml_subgraph_destroy;

   return pctx;
}
/* Allocate a GEM BO backing a 1D buffer resource. */
static struct pipe_resource *
ethosu_resource_create(struct pipe_screen *pscreen,
                       const struct pipe_resource *templat)
{
   struct ethosu_screen *screen = ethosu_screen(pscreen);
   struct drm_ethosu_bo_create arg = {0};
   struct ethosu_resource *rsc;
   int ret;

   /* Tensors are laid out manually by the driver, so only 1D buffers. */
   assert(templat->target == PIPE_BUFFER);
   assert(templat->height0 == 1);
   assert(templat->depth0 == 1);
   assert(templat->array_size == 1);

   rsc = rzalloc(NULL, struct ethosu_resource);
   if (!rsc)
      return NULL;

   rsc->base = *templat;
   rsc->base.screen = pscreen;
   /* NOTE(review): redundant — nr_samples was already copied by the struct
    * assignment above. */
   rsc->base.nr_samples = templat->nr_samples;
   pipe_reference_init(&rsc->base.reference, 1);

   rsc->bo_size = templat->width0;
   arg.size = templat->width0;
   ret = drmIoctl(screen->fd, DRM_IOCTL_ETHOSU_BO_CREATE, &arg);
   if (ret < 0)
      goto free_rsc;
   rsc->handle = arg.handle;

   return &rsc->base;

free_rsc:
   ralloc_free(rsc);
   return NULL;
}
/* Close the GEM handle backing the resource and free the bookkeeping. */
static void
ethosu_resource_destroy(struct pipe_screen *pscreen,
                        struct pipe_resource *prsc)
{
   struct ethosu_resource *rsc = ethosu_resource(prsc);
   struct ethosu_screen *screen = ethosu_screen(pscreen);
   struct drm_gem_close arg = {0};
   int ret;

   arg.handle = rsc->handle;
   ret = drmIoctl(screen->fd, DRM_IOCTL_GEM_CLOSE, &arg);
   assert(ret >= 0);
   /* Silence -Wunused-but-set-variable when asserts are compiled out. */
   (void)ret;

   ralloc_free(rsc);
}
static int
ethosu_screen_get_fd(struct pipe_screen *pscreen)
{
return ethosu_screen(pscreen)->fd;
}
/* Fill screen->info with the NPU description reported by the kernel. */
static void
dev_query(struct ethosu_screen *screen)
{
   struct drm_ethosu_npu_info *info = &screen->info;
   struct drm_ethosu_dev_query dev_query = {
      .type = DRM_ETHOSU_DEV_QUERY_NPU_INFO,
      .size = sizeof(*info),
      .pointer = (uintptr_t)info,
   };
   int ret;

   ret = drmIoctl(screen->fd, DRM_IOCTL_ETHOSU_DEV_QUERY, &dev_query);
   assert(ret != -1);
   /* Silence -Wunused-but-set-variable when asserts are compiled out. */
   (void)ret;
}
/* Create the pipe_screen for an ethosu DRM device.  config and ro are
 * currently unused. */
struct pipe_screen *
ethosu_screen_create(int fd,
                     const struct pipe_screen_config *config,
                     struct renderonly *ro)
{
   /* Renamed from "ethosu_screen": the old local shadowed the
    * ethosu_screen() cast helper from the header. */
   struct ethosu_screen *escreen;
   struct pipe_screen *pscreen;

   escreen = rzalloc(NULL, struct ethosu_screen);
   if (!escreen)
      return NULL;

   pscreen = &escreen->pscreen;

   ethosu_debug = debug_get_option_ethosu_debug();
   escreen->fd = fd;

   dev_query(escreen);

   /* Pretend there is no SRAM so the coefficient-staging DMA path is
    * never taken. */
   if (DBG_ENABLED(ETHOSU_DBG_DISABLE_SRAM))
      escreen->info.sram_size = 0;

   pscreen->get_screen_fd = ethosu_screen_get_fd;
   pscreen->destroy = ethosu_destroy_screen;
   pscreen->context_create = ethosu_create_context;
   pscreen->resource_create = ethosu_resource_create;
   pscreen->resource_destroy = ethosu_resource_destroy;

   return pscreen;
}

View file

@ -0,0 +1,84 @@
/*
 * Copyright (c) 2024 Tomeu Vizoso <tomeu@tomeuvizoso.net>
 * SPDX-License-Identifier: MIT
 */
/* Include guard moved above the #includes so repeated inclusion skips
 * them as well. */
#ifndef ETHOSU_SCREEN_H
#define ETHOSU_SCREEN_H
#include "pipe/p_context.h"
#include "pipe/p_screen.h"
#include "pipe/p_state.h"
#include "renderonly/renderonly.h"
#include "util/log.h"
#include "drm-uapi/ethosu_accel.h"
/* ETHOSU_DEBUG= flags; see ethosu_debug_options in ethosu_device.c. */
enum ethosu_dbg {
   ETHOSU_DBG_MSGS = BITFIELD_BIT(0),
   ETHOSU_DBG_DUMP_BOS = BITFIELD_BIT(1),
   ETHOSU_DBG_ZERO = BITFIELD_BIT(2),
   ETHOSU_DBG_DISABLE_NHCWB16 = BITFIELD_BIT(3),
   ETHOSU_DBG_DISABLE_SRAM = BITFIELD_BIT(4),
};
/* Parsed once from ETHOSU_DEBUG at screen creation. */
extern int ethosu_debug;
#define DBG_ENABLED(flag) unlikely(ethosu_debug & (flag))
/* Debug logging, compiled in but gated on the dbg_msgs flag. */
#define DBG(fmt, ...)                                     \
   do {                                                   \
      if (DBG_ENABLED(ETHOSU_DBG_MSGS))                   \
         mesa_logd("%s:%d: " fmt, __func__, __LINE__,     \
                   ##__VA_ARGS__);                        \
   } while (0)
struct ethosu_screen {
   struct pipe_screen pscreen;
   int fd;
   /* Hardware description as returned by DRM_ETHOSU_DEV_QUERY. */
   struct drm_ethosu_npu_info info;
};
static inline struct ethosu_screen *
ethosu_screen(struct pipe_screen *p)
{
   return (struct ethosu_screen *)p;
}
/* Architecture major version 1 identifies the U65 generation. */
static inline bool
ethosu_is_u65(struct ethosu_screen *e)
{
   return DRM_ETHOSU_ARCH_MAJOR(e->info.id) == 1;
}
struct ethosu_context {
   struct pipe_context base;
};
static inline struct ethosu_context *
ethosu_context(struct pipe_context *pctx)
{
   return (struct ethosu_context *)pctx;
}
struct ethosu_resource {
   struct pipe_resource base;
   uint32_t handle; /* GEM handle */
   uint64_t phys_addr;
   uint64_t obj_addr;
   uint64_t bo_size;
};
static inline struct ethosu_resource *
ethosu_resource(struct pipe_resource *p)
{
   return (struct ethosu_resource *)p;
}
struct pipe_screen *ethosu_screen_create(int fd,
                                         const struct pipe_screen_config *config,
                                         struct renderonly *ro);
#endif /* ETHOSU_SCREEN_H */

View file

@ -0,0 +1,477 @@
/*
* Copyright (c) 2024 Tomeu Vizoso <tomeu@tomeuvizoso.net>
* SPDX-License-Identifier: MIT
*/
#include "ethosu_lower.h"
#include "ethosu_coefs.h"
#include "ethosu_sched.h"
static bool
is_depthwise(const struct pipe_ml_operation *poperation)
{
unsigned input_channels = poperation->input_tensors[0]->dims[3];
unsigned output_channels = poperation->output_tensors[0]->dims[3];
return poperation->conv.depthwise && input_channels > 1 &&
output_channels > 1;
}
/* Total SAME-padding needed along one axis so that every input element is
 * covered by the sliding filter.
 *
 * Fixes an unsigned-underflow bug: the previous MAX2(filter_size - stride, 0)
 * wrapped around to a huge value whenever stride > filter_size, since the
 * subtraction was performed on unsigned operands before the comparison. */
static unsigned
needed_total_padding(unsigned input_size, unsigned stride, unsigned filter_size)
{
   unsigned remainder = input_size % stride;
   /* Elements already covered by the last full stride step. */
   unsigned covered = remainder == 0 ? stride : remainder;

   return filter_size > covered ? filter_size - covered : 0;
}
/* Decide whether part-kernel-first block traversal beats depth-first for
 * this operation, based on estimated DPU utilization. */
static bool
ethosu_is_part_kernel_first(struct ethosu_operation *operation)
{
   /* Only non-depthwise convolutions support both traversal orders; check
    * these guards first instead of computing the float utilization math
    * for operations that can never use it. */
   if (operation->type != ETHOSU_OPERATION_TYPE_CONVOLUTION)
      return false;
   if (operation->kernel.depthwise)
      return false;

   unsigned kernel_size = operation->kernel.height * operation->kernel.width;
   unsigned depth = operation->ifm.shape.depth;

   /* Part-kernel first is always better for ifm depths <= 8. */
   if (depth <= 8)
      return true;

   /* Utilization = how full the rounded-up hardware tiles actually are. */
   float depth_utilization = (float)depth / ethosu_round_up_to_multiple(depth, 32);
   float part_kernel_utilization = ((float)depth / ethosu_round_up_to_multiple(depth, 8));
   part_kernel_utilization *= (float)kernel_size / ethosu_round_up_to_multiple(kernel_size, 4);

   return part_kernel_utilization >= depth_utilization;
}
static void
set_feature_maps(struct pipe_tensor *input_tensor,
struct pipe_tensor *output_tensor,
struct ethosu_operation *operation)
{
operation->ifm.tensor_idx = input_tensor->index;
operation->ifm.shape.height = input_tensor->dims[1];
operation->ifm.shape.width = input_tensor->dims[2];
operation->ifm.shape.depth = input_tensor->dims[3];
operation->ifm.zero_point = input_tensor->zero_point;
operation->ifm.scale = input_tensor->scale;
operation->ifm.is_signed = input_tensor->is_signed;
operation->ofm.tensor_idx = output_tensor->index;
operation->ofm.shape.height = output_tensor->dims[1];
operation->ofm.shape.width = output_tensor->dims[2];
operation->ofm.shape.depth = output_tensor->dims[3];
operation->ofm.zero_point = output_tensor->zero_point;
operation->ofm.scale = output_tensor->scale;
operation->ofm.is_signed = output_tensor->is_signed;
}
static const struct pipe_ml_operation *
ethosu_find_first_consumer(const struct pipe_ml_operation *poperations,
unsigned count,
unsigned tensor_index)
{
for (unsigned i = 0; i < count; i++) {
const struct pipe_ml_operation *poperation = &poperations[i];
for (unsigned j = 0; j < poperation->input_count; j++)
if (poperation->input_tensors[j]->index == tensor_index)
return poperation;
}
return NULL;
}
static void
allocate_feature_maps(struct ethosu_subgraph *subgraph, struct ethosu_operation *operation)
{
ethosu_allocate_feature_map(subgraph, &operation->ifm);
operation->ifm.tiles.height_0 = operation->ifm.shape.height;
operation->ifm.tiles.height_1 = operation->ifm.shape.height;
operation->ifm.tiles.width_0 = operation->ifm.shape.width;
ethosu_allocate_feature_map(subgraph, &operation->ofm);
operation->ofm.tiles.height_0 = operation->ofm.shape.height;
operation->ofm.tiles.height_1 = operation->ofm.shape.height;
operation->ofm.tiles.width_0 = operation->ofm.shape.width;
}
static const struct pipe_ml_operation *
ethosu_find_first_producer(const struct pipe_ml_operation *poperations, unsigned count,
unsigned tensor_index)
{
for (unsigned i = 0; i < count; i++) {
const struct pipe_ml_operation *poperation = &poperations[i];
for (unsigned j = 0; j < poperation->output_count; j++) {
if (poperation->output_tensors[j]->index == tensor_index)
return poperation;
}
}
return NULL;
}
/* Lower a gallium convolution to one HW convolution operation.
 * input_tensor may differ from poperation->input_tensors[0] when a PAD
 * producer was folded in by ethosu_lower_graph(). */
static void
ethosu_lower_convolution(struct ethosu_subgraph *subgraph,
                         const struct pipe_ml_operation *poperation,
                         struct pipe_tensor *input_tensor,
                         struct ethosu_operation *operation)
{
   operation->type = ETHOSU_OPERATION_TYPE_CONVOLUTION;
   operation->conv.depthwise = is_depthwise(poperation);
   // operation->padding_same = poperation->conv.padding_same;
   // operation->stride = poperation->conv.stride_x;
   set_feature_maps(input_tensor, poperation->output_tensors[0], operation);
   /* dims[1]/dims[2] of the weight tensor hold kernel height/width. */
   operation->kernel.height = poperation->conv.weight_tensor->dims[1];
   operation->kernel.width = poperation->conv.weight_tensor->dims[2];
   operation->kernel.stride_y = poperation->conv.stride_y;
   operation->kernel.stride_x = poperation->conv.stride_x;
   /* Dilated convolutions are not supported yet. */
   operation->kernel.dilation_y = 1;
   operation->kernel.dilation_x = 1;
   operation->kernel.depthwise = is_depthwise(poperation);
   operation->kernel.scale = poperation->conv.weight_tensor->scale;
   operation->kernel.zero_point = poperation->conv.weight_tensor->zero_point;
   operation->kernel.is_signed = poperation->conv.weight_tensor->is_signed;
   operation->conv.part_kernel_first = ethosu_is_part_kernel_first(operation);
   if (poperation->conv.padding_same) {
      /* SAME padding: split the total between the two edges, with the odd
       * element going to bottom/right. */
      unsigned vert = needed_total_padding(input_tensor->dims[1], poperation->conv.stride_y, poperation->conv.weight_tensor->dims[1]);
      unsigned horiz = needed_total_padding(input_tensor->dims[2], poperation->conv.stride_x, poperation->conv.weight_tensor->dims[2]);
      operation->pad.top = vert / 2;
      operation->pad.left = horiz / 2;
      operation->pad.bottom = (vert + 1) / 2;
      operation->pad.right = (horiz + 1) / 2;
   } else {
      operation->pad.top = 0;
      operation->pad.left = 0;
      operation->pad.bottom = 0;
      operation->pad.right = 0;
   }
   allocate_feature_maps(subgraph, operation);
   ethosu_sched_operation(subgraph, operation);
   /* Must run after scheduling: the weight encoder in fill_coefs() uses
    * the block config chosen by ethosu_sched_operation(). */
   fill_coefs(subgraph, operation, poperation->conv.bias_tensor->resource, poperation->conv.weight_tensor->resource);
}
/* Lower a gallium pooling operation (average or max) to one HW pooling
 * operation. */
static void
ethosu_lower_pooling(struct ethosu_subgraph *subgraph,
                     const struct pipe_ml_operation *poperation,
                     struct ethosu_operation *operation)
{
   operation->type = ETHOSU_OPERATION_TYPE_POOLING;
   operation->pooling.avg = poperation->pooling.type == PIPE_ML_POOLING_TYPE_AVG;
   set_feature_maps(poperation->input_tensors[0], poperation->output_tensors[0], operation);
   operation->kernel.height = poperation->pooling.filter_height;
   operation->kernel.width = poperation->pooling.filter_width;
   operation->kernel.stride_y = poperation->pooling.stride_y;
   operation->kernel.stride_x = poperation->pooling.stride_x;
   operation->kernel.dilation_y = 1;
   operation->kernel.dilation_x = 1;
   if (poperation->pooling.padding_same) {
      /* SAME padding: split the total between the two edges, with the odd
       * element going to bottom/right. */
      unsigned vert = needed_total_padding(operation->ifm.shape.height, poperation->pooling.stride_y, poperation->pooling.filter_height);
      unsigned horiz = needed_total_padding(operation->ifm.shape.width, poperation->pooling.stride_x, poperation->pooling.filter_width);
      operation->pad.top = vert / 2;
      operation->pad.left = horiz / 2;
      operation->pad.bottom = (vert + 1) / 2;
      operation->pad.right = (horiz + 1) / 2;
   } else {
      operation->pad.top = 0;
      operation->pad.left = 0;
      operation->pad.bottom = 0;
      operation->pad.right = 0;
   }
   allocate_feature_maps(subgraph, operation);
   ethosu_sched_operation(subgraph, operation);
}
/* Lower ONE input of a depth-axis concatenation as a 1x1 average pool that
 * copies the input into its slice of the shared output tensor.  Called once
 * per input by ethosu_lower_graph(). */
static void
ethosu_lower_concatenation(struct ethosu_subgraph *subgraph,
                           const struct pipe_ml_operation *poperation,
                           unsigned input_idx,
                           struct ethosu_operation *operation)
{
   operation->type = ETHOSU_OPERATION_TYPE_POOLING;
   operation->pooling.avg = true;
   set_feature_maps(poperation->input_tensors[input_idx], poperation->output_tensors[0], operation);
   /* Write only this input's channels, not the full output depth. */
   operation->ofm.shape.depth = operation->ifm.shape.depth;
   operation->round_mode = ETHOSU_ROUNDING_NATURAL;
   /* Identity copy: 1x1 kernel, unit stride/dilation. */
   operation->kernel.height = 1;
   operation->kernel.width = 1;
   operation->kernel.stride_y = 1;
   operation->kernel.stride_x = 1;
   operation->kernel.dilation_y = 1;
   operation->kernel.dilation_x = 1;
   allocate_feature_maps(subgraph, operation);
   /* Advance the OFM base address past the channels written by the
    * preceding inputs; the per-input offset depends on the output
    * tensor's layout. */
   for (unsigned i = 0; i < input_idx; i++) {
      struct ethosu_tensor *tensor = ethosu_find_tensor(subgraph, operation->ofm.tensor_idx);
      if (tensor->layout == ETHOSU_LAYOUT_NHWC)
         operation->ofm.tiles.addresses[0] += poperation->input_tensors[i]->dims[3];
      else if (tensor->layout == ETHOSU_LAYOUT_NHCWB16)
         operation->ofm.tiles.addresses[0] += poperation->input_tensors[i]->dims[2] * ALIGN(poperation->input_tensors[i]->dims[3], 16);
      else
         assert(0 && "Unsupported layout");
   }
   ethosu_sched_operation(subgraph, operation);
}
/* Lower a resize to an upscaling 1x1 average pool. */
static void
ethosu_lower_resize(struct ethosu_subgraph *subgraph,
                    const struct pipe_ml_operation *poperation,
                    struct ethosu_operation *operation)
{
   operation->type = ETHOSU_OPERATION_TYPE_POOLING;
   operation->pooling.avg = true;
   operation->upscale = true;

   set_feature_maps(poperation->input_tensors[0], poperation->output_tensors[0], operation);
   /* Work on raw values: neutralize the quantization zero points. */
   operation->ifm.zero_point = 0;
   operation->ofm.zero_point = 0;

   /* Identity copy: 1x1 kernel with unit stride and dilation. */
   operation->kernel.width = 1;
   operation->kernel.height = 1;
   operation->kernel.stride_x = 1;
   operation->kernel.stride_y = 1;
   operation->kernel.dilation_x = 1;
   operation->kernel.dilation_y = 1;

   allocate_feature_maps(subgraph, operation);
   ethosu_sched_operation(subgraph, operation);
}
/* Lower a strided slice to a 1x1 average pool that copies an OFM-sized
 * window out of the input, starting at the slice's begin coordinates. */
static void
ethosu_lower_strided_slice(struct ethosu_subgraph *subgraph,
                           const struct pipe_ml_operation *poperation,
                           struct ethosu_operation *operation)
{
   operation->type = ETHOSU_OPERATION_TYPE_POOLING;
   operation->pooling.avg = true;
   set_feature_maps(poperation->input_tensors[0], poperation->output_tensors[0], operation);
   /* Read only an OFM-sized window of the input.
    * NOTE(review): this overwrites the original IFM shape, and the
    * augmented strides below are then computed from the overwritten
    * (output-sized) shape — confirm they should not use the original
    * input shape instead. */
   operation->ifm.shape = operation->ofm.shape;
   operation->ifm.zero_point = 0;
   operation->ofm.zero_point = 0;
   /* Identity copy: 1x1 kernel, unit stride/dilation. */
   operation->kernel.height = 1;
   operation->kernel.width = 1;
   operation->kernel.stride_y = 1;
   operation->kernel.stride_x = 1;
   operation->kernel.dilation_y = 1;
   operation->kernel.dilation_x = 1;
   allocate_feature_maps(subgraph, operation);
   /* Augmented (5D) begin coordinates: batch prepended as 0. */
   unsigned augmented_coord[5];
   augmented_coord[0] = 0;
   for (int i = 0; i < 4; ++i) {
      augmented_coord[i + 1] = poperation->slice.begin[i];
   }
   /* Element strides matching the augmented coordinate order. */
   unsigned augmented_strides[5];
   augmented_strides[0] = operation->ifm.shape.depth * operation->ifm.shape.width * operation->ifm.shape.height;
   augmented_strides[1] = 1;
   augmented_strides[2] = operation->ifm.shape.depth * operation->ifm.shape.width;
   augmented_strides[3] = operation->ifm.shape.depth;
   augmented_strides[4] = 1;
   /* Offset of the slice start within the input buffer. */
   unsigned address_offset = 0;
   for (int i = 0; i < 5; ++i)
      address_offset += augmented_coord[i] * augmented_strides[i];
   operation->ifm.tiles.addresses[0] += address_offset;
   ethosu_sched_operation(subgraph, operation);
}
static void
ethosu_lower_add(struct ethosu_subgraph *subgraph,
const struct pipe_ml_operation *poperation,
struct ethosu_operation *operation)
{
operation->type = ETHOSU_OPERATION_TYPE_ELTWISE;
set_feature_maps(poperation->input_tensors[0], poperation->output_tensors[0], operation);
operation->ifm2.tensor_idx = poperation->input_tensors[1]->index;
operation->ifm2.shape.height = poperation->input_tensors[1]->dims[1];
operation->ifm2.shape.width = poperation->input_tensors[1]->dims[2];
operation->ifm2.shape.depth = poperation->input_tensors[1]->dims[3];
operation->ifm2.zero_point = poperation->input_tensors[1]->zero_point;
operation->ifm2.scale = poperation->input_tensors[1]->scale;
operation->ifm2.is_signed = poperation->input_tensors[1]->is_signed;
operation->kernel.height = 1;
operation->kernel.width = 1;
operation->kernel.stride_y = 1;
operation->kernel.stride_x = 1;
operation->kernel.dilation_y = 1;
operation->kernel.dilation_x = 1;
allocate_feature_maps(subgraph, operation);
ethosu_allocate_feature_map(subgraph, &operation->ifm2);
operation->ifm2.tiles.height_0 = operation->ifm2.shape.height;
operation->ifm2.tiles.height_1 = operation->ifm2.shape.height;
operation->ifm2.tiles.width_0 = operation->ifm2.shape.width;
ethosu_sched_operation(subgraph, operation);
}
/* Create a DMA operation that stages conv_operation's scales and weights
 * (stored contiguously in the coefficient buffer by fill_coefs()) into
 * scratch SRAM, and retarget the convolution to read them from there.
 * poperation is currently unused. */
static void
ethosu_lower_dma(struct ethosu_subgraph *subgraph,
                 const struct pipe_ml_operation *poperation,
                 struct ethosu_operation *conv_operation,
                 struct ethosu_operation *operation)
{
   operation->type = ETHOSU_OPERATION_TYPE_DMA;
   /* Capture the COEFS_REGION source address/size BEFORE the fields are
    * rewritten below — this ordering is load-bearing. */
   operation->dma.address = conv_operation->conv.scales.address;
   operation->dma.size = conv_operation->conv.scales.size + conv_operation->conv.weights.size;
   /* After the DMA, scales sit at SRAM offset 0 with weights right after. */
   conv_operation->conv.scales.region = SCRATCH_REGION;
   conv_operation->conv.scales.address = 0;
   conv_operation->conv.weights.region = SCRATCH_REGION;
   conv_operation->conv.weights.address = conv_operation->conv.scales.size;
}
static void
register_tensors(struct ethosu_subgraph *subgraph,
const struct pipe_ml_operation *poperations,
unsigned count)
{
for (unsigned i = 0; i < count; i++) {
const struct pipe_ml_operation *poperation = &poperations[i];
for (unsigned j = 0; j < poperation->input_count; j++) {
struct pipe_tensor *ptensor = poperation->input_tensors[j];
ethosu_register_tensor(subgraph, ptensor);
}
for (unsigned j = 0; j < poperation->output_count; j++) {
struct pipe_tensor *ptensor = poperation->output_tensors[j];
ethosu_register_tensor(subgraph, ptensor);
if (!DBG_ENABLED(ETHOSU_DBG_DISABLE_NHCWB16)) {
struct ethosu_tensor *tensor = ethosu_find_tensor(subgraph, ptensor->index);
if (tensor->shape.depth % 16 == 0 &&
ethosu_find_first_consumer(poperations, count, ptensor->index)) {
tensor->layout = ETHOSU_LAYOUT_NHCWB16;
}
}
}
}
}
void
ethosu_lower_graph(struct ethosu_subgraph *subgraph,
const struct pipe_ml_operation *poperations, unsigned count)
{
register_tensors(subgraph, poperations, count);
/* Lower */
for (int i = 0; i < count; i++) {
struct ethosu_operation operation = {0};
switch (poperations[i].type) {
case PIPE_ML_OPERATION_TYPE_CONVOLUTION: {
struct pipe_tensor *input_tensor = poperations[i].input_tensors[0];
const struct pipe_ml_operation *producer = ethosu_find_first_producer(poperations, count, input_tensor->index);
bool padded_input = producer && producer->type == PIPE_ML_OPERATION_TYPE_PAD;
if (padded_input) {
input_tensor = producer->input_tensors[0];
}
ethosu_lower_convolution(subgraph, &poperations[i], input_tensor, &operation);
if (padded_input) {
operation.pad.top = 1;
operation.pad.left = 1;
}
if (operation.conv.scales.size + operation.conv.weights.size <=
ethosu_screen(subgraph->base.context->screen)->info.sram_size) {
struct ethosu_operation dma_operation = {0};
ethosu_lower_dma(subgraph, &poperations[i], &operation, &dma_operation);
util_dynarray_append(&subgraph->operations, struct ethosu_operation,
dma_operation);
}
util_dynarray_append(&subgraph->operations, struct ethosu_operation,
operation);
break;
}
case PIPE_ML_OPERATION_TYPE_ADD: {
ethosu_lower_add(subgraph, &poperations[i], &operation);
util_dynarray_append(&subgraph->operations, struct ethosu_operation,
operation);
break;
}
case PIPE_ML_OPERATION_TYPE_POOLING: {
ethosu_lower_pooling(subgraph, &poperations[i], &operation);
util_dynarray_append(&subgraph->operations, struct ethosu_operation,
operation);
break;
}
case PIPE_ML_OPERATION_TYPE_STRIDED_SLICE: {
ethosu_lower_strided_slice(subgraph, &poperations[i], &operation);
util_dynarray_append(&subgraph->operations, struct ethosu_operation,
operation);
break;
}
case PIPE_ML_OPERATION_TYPE_CONCATENATION: {
for (int j = 0; j < poperations[i].input_count; j++) {
ethosu_lower_concatenation(subgraph, &poperations[i], j, &operation);
util_dynarray_append(&subgraph->operations, struct ethosu_operation,
operation);
}
break;
}
case PIPE_ML_OPERATION_TYPE_RESIZE: {
ethosu_lower_resize(subgraph, &poperations[i], &operation);
util_dynarray_append(&subgraph->operations, struct ethosu_operation,
operation);
break;
}
case PIPE_ML_OPERATION_TYPE_PAD: {
// Just ignore the pad operation for now, as it will be handled by its consumers
break;
}
default:
DBG("poperation->type %d\n", poperations[i].type);
UNREACHABLE("Unsupported ML operation type");
}
}
}

View file

@ -0,0 +1,15 @@
/*
 * Copyright (c) 2024 Tomeu Vizoso <tomeu@tomeuvizoso.net>
 * SPDX-License-Identifier: MIT
 */
#ifndef ETHOSU_LOWER_H
#define ETHOSU_LOWER_H
#include "ethosu_ml.h"
/* Lower the gallium ML graph into the subgraph's list of HW operations. */
void
ethosu_lower_graph(struct ethosu_subgraph *subgraph,
                   const struct pipe_ml_operation *poperations, unsigned count);
#endif /* ETHOSU_LOWER_H */

View file

@ -0,0 +1,363 @@
/*
* Copyright (c) 2024 Tomeu Vizoso <tomeu@tomeuvizoso.net>
* SPDX-License-Identifier: MIT
*/
#include "pipe/p_defines.h"
#include "pipe/p_screen.h"
#include "pipe/p_state.h"
#include "util/macros.h"
#include "util/u_dynarray.h"
#include "util/u_inlines.h"
#include <assert.h>
#include <math.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <xf86drm.h>
#include "drm-uapi/ethosu_accel.h"
#include "ethosu_cmd.h"
#include "ethosu_lower.h"
#include "ethosu_ml.h"
/* Hardware block-geometry constants.  NOTE(review): initializer order is
 * assumed to be {height, width, depth} — confirm against the declaration
 * of struct ethosu_block. */
struct ethosu_block IFM_UBLOCK = {2, 2, 8};
struct ethosu_block OFM_UBLOCK = {2, 2, 8};
struct ethosu_block ARCH_OFM_BLOCK_MAX = {64, 32, 128};
struct ethosu_block SUB_KERNEL_MAX = {8, 8, 65536};
/* Dump [offset, offset+size) of a buffer to a file named
 * mesa-<name>-<op>-<subop>.bin in the current directory, for offline
 * analysis (ETHOSU_DEBUG=dump_bos). */
void
ethosu_dump_buffer(const uint8_t *ptr, char *name, int operation_nr,
                   int suboperation_nr, int offset, unsigned size)
{
   char buffer[255];
   /* %03d rather than %03u: the counters are signed ints. */
   snprintf(buffer, sizeof(buffer), "mesa-%s-%03d-%03d.bin", name, operation_nr,
            suboperation_nr);
   FILE *f = fopen(buffer, "wb");
   assert(f);
   /* Previously only asserted: with asserts compiled out, a failed fopen()
    * led to fwrite(NULL). */
   if (!f) {
      DBG("Error opening %s: %s\n", buffer, strerror(errno));
      return;
   }
   if (fwrite(ptr + offset, 1, size, f) != size) {
      DBG("Error in writing to file: %s\n", strerror(errno));
   }
   /* fclose() flushes; check it so write-back errors are not lost. */
   if (fclose(f) != 0) {
      DBG("Error closing file: %s\n", strerror(errno));
   }
}
void
ethosu_register_tensor(struct ethosu_subgraph *subgraph,
const struct pipe_tensor *ptensor)
{
struct ethosu_tensor new_tensor = {0};
new_tensor.index = ptensor->index;
new_tensor.shape.height = ptensor->dims[1];
new_tensor.shape.width = ptensor->dims[2];
new_tensor.shape.depth = ptensor->dims[3];
new_tensor.layout = ETHOSU_LAYOUT_NHWC;
util_dynarray_append(&subgraph->tensors, struct ethosu_tensor, new_tensor);
}
/*
 * Assign a location inside the io buffer to the tensor backing this feature
 * map, if it doesn't have one yet, and point the feature map's first tile at
 * it.  Offsets are handed out linearly from subgraph->io_used, 16-byte
 * aligned.
 */
void
ethosu_allocate_feature_map(struct ethosu_subgraph *subgraph,
                            struct ethosu_feature_map *feature_map)
{
   struct ethosu_tensor *tensor =
      ethosu_find_tensor(subgraph, feature_map->tensor_idx);
   unsigned size;

   /* Bug fix: validate the lookup before dereferencing the result (the
    * assert used to come after the first tensor->layout access). */
   assert(tensor);

   if (tensor->layout == ETHOSU_LAYOUT_NHWC) {
      size = tensor->shape.width * tensor->shape.height * tensor->shape.depth;
   } else if (tensor->layout == ETHOSU_LAYOUT_NHCWB16) {
      /* Bricked layout pads the depth to a multiple of 16. */
      size = tensor->shape.width * tensor->shape.height * ALIGN(tensor->shape.depth, 16);
   } else {
      assert(0 && "Unsupported layout");
      size = 0; // This should never happen
   }

   if (tensor->size > 0) {
      /* Already placed: just propagate the existing offset. */
      feature_map->tiles.addresses[0] = tensor->offset;
      return;
   }

   tensor->offset = subgraph->io_used;
   tensor->size = size;
   subgraph->io_used += ALIGN_POT(size, 16);

   feature_map->tiles.addresses[0] = tensor->offset;
}
struct ethosu_tensor *
ethosu_find_tensor(struct ethosu_subgraph *subgraph, unsigned tensor_idx)
{
util_dynarray_foreach (&subgraph->tensors, struct ethosu_tensor, tensor) {
if (tensor->index == tensor_idx) {
return tensor;
}
}
return NULL;
}
/* Round a up to the nearest multiple of b (for non-negative a, positive b). */
int
ethosu_round_up_to_multiple(int a, int b)
{
   int quotient = (a + b - 1) / b;
   return quotient * b;
}
/* Ceiling division (for non-negative a, positive b). */
int
ethosu_round_up_divide(int a, int b)
{
   int biased = a + b - 1;
   return biased / b;
}
int
ethosu_quantize_scale(double scale, uint32_t *shift)
{
int exponent = 0;
double significand = frexp(scale, &exponent);
uint32_t quantized_scale = round(significand * (double)(1LL << 31));
*shift = 31 - exponent;
if (*shift > 63) {
if (quantized_scale > exp2(*shift - 63)) {
quantized_scale = quantized_scale >> (*shift - 63);
*shift = 63;
} else {
// Not possible to get back within bounds, set scale and shift to 0
// as the shift would shift away all relevant bits anyway.
quantized_scale = 0;
*shift = 0;
}
} else if (*shift < 0 && quantized_scale < exp2(*shift + 32)) {
quantized_scale = quantized_scale << (0 - *shift);
*shift = 0;
}
return quantized_scale;
}
static bool
tensor_quantization_supported(struct pipe_tensor *tensor)
{
   /*
    * Per-axis quantization not supported, for details see:
    * https://ai.google.dev/edge/litert/models/quantization_spec#per-axis_vs_per-tensor
    */
   if (tensor->scales != NULL)
      return false;

   return tensor->zero_points == NULL;
}
/*
 * Report whether this driver can lower and execute the given ML operation.
 * Called by the gallium frontend before handing a graph to
 * ethosu_ml_subgraph_create().
 */
bool
ethosu_ml_operation_supported(struct pipe_context *pcontext,
                              const struct pipe_ml_operation *operation)
{
   switch (operation->type) {
   case PIPE_ML_OPERATION_TYPE_CONVOLUTION:
      // Dilation and per-axis quantization not yet implemented
      if (operation->conv.dilation_width_factor != 1 ||
          operation->conv.dilation_height_factor != 1)
         return false;
      return tensor_quantization_supported(operation->input_tensors[0]) &&
             tensor_quantization_supported(operation->conv.weight_tensor) &&
             tensor_quantization_supported(operation->conv.bias_tensor) &&
             tensor_quantization_supported(operation->output_tensors[0]);

   case PIPE_ML_OPERATION_TYPE_ADD:
      /* Only tensor + tensor adds: neither input may be backed by a
       * resource. */
      return operation->input_tensors[0]->resource == NULL &&
             operation->input_tensors[1]->resource == NULL;

   case PIPE_ML_OPERATION_TYPE_POOLING:
   case PIPE_ML_OPERATION_TYPE_STRIDED_SLICE:
   case PIPE_ML_OPERATION_TYPE_PAD:
   case PIPE_ML_OPERATION_TYPE_RESIZE:
      return true;

   case PIPE_ML_OPERATION_TYPE_CONCATENATION:
      /* Only concatenation along the channel (last) axis. */
      return operation->conc.axis == 3 || operation->conc.axis == -1;

   default:
      return false;
   }
}
/*
 * Compile a graph of ML operations into an executable subgraph: lower the
 * operations, encode the command stream into a kernel BO, upload the weights
 * and scales ("coefs") and allocate the shared buffer that holds all
 * intermediate/input/output feature maps ("io").
 */
struct pipe_ml_subgraph *
ethosu_ml_subgraph_create(struct pipe_context *pcontext,
                          const struct pipe_ml_operation *poperations,
                          unsigned count)
{
   struct pipe_screen *pscreen = pcontext->screen;
   struct ethosu_screen *screen = ethosu_screen(pscreen);
   struct ethosu_subgraph *subgraph;

   subgraph = calloc(1, sizeof(*subgraph));
   subgraph->base.context = pcontext;

   util_dynarray_init(&subgraph->tensors, NULL);
   util_dynarray_init(&subgraph->operations, NULL);

   /* Lowering fills subgraph->operations and sizes coefs/io; emission
    * serializes them into subgraph->cmdstream. */
   ethosu_lower_graph(subgraph, poperations, count);
   ethosu_emit_cmdstream(subgraph);

   /* The command stream lives in a dedicated kernel BO; the CPU-side copy
    * can be freed once the kernel has taken it. */
   struct drm_ethosu_cmdstream_bo_create cmd_bo_create = {
      .size = (subgraph->cursor - subgraph->cmdstream) * sizeof(*subgraph->cursor),
      .data = (uintptr_t)subgraph->cmdstream,
   };

   if (DBG_ENABLED(ETHOSU_DBG_DUMP_BOS))
      ethosu_dump_buffer((uint8_t *)subgraph->cmdstream, "cmdstream", 0, 0, 0,
                         (subgraph->cursor - subgraph->cmdstream) * sizeof(*subgraph->cursor));

   int ret = drmIoctl(screen->fd, DRM_IOCTL_ETHOSU_CMDSTREAM_BO_CREATE, &cmd_bo_create);
   assert(ret == 0);

   free(subgraph->cmdstream);
   subgraph->cmdstream_bo = cmd_bo_create.handle;

   /* Upload weights/scales once at compile time; they are immutable. */
   if (subgraph->coefs_used > 0) {
      subgraph->coefs_rsrc = pipe_buffer_create(pscreen, 0, PIPE_USAGE_DEFAULT, subgraph->coefs_used);
      pipe_buffer_write(subgraph->base.context, subgraph->coefs_rsrc, 0,
                        subgraph->coefs_used, subgraph->coefs);
      free(subgraph->coefs);
      subgraph->coefs = NULL;

      if (DBG_ENABLED(ETHOSU_DBG_DUMP_BOS)) {
         struct pipe_transfer *transfer_in;
         uint8_t *buf = pipe_buffer_map(subgraph->base.context, subgraph->coefs_rsrc,
                                        PIPE_MAP_READ, &transfer_in);
         ethosu_dump_buffer(buf, "coefs", 0, 0, 0, pipe_buffer_size(subgraph->coefs_rsrc));
         pipe_buffer_unmap(subgraph->base.context, transfer_in);
      }
   }

   /* io_used was accumulated by ethosu_allocate_feature_map() during
    * lowering. */
   subgraph->io_rsrc = pipe_buffer_create(pscreen, 0, PIPE_USAGE_DEFAULT, subgraph->io_used);

   return &subgraph->base;
}
/*
 * Run the compiled subgraph once: copy the input tensors into the io buffer,
 * submit the job to the kernel driver and (with ETHOSU_DBG_MSGS) time both
 * the submission and the full execution.
 */
void
ethosu_ml_subgraph_invoke(struct pipe_context *pcontext,
                          struct pipe_ml_subgraph *psubgraph,
                          unsigned inputs_count, unsigned input_idxs[],
                          void *inputs[], bool is_signed[])
{
   struct ethosu_screen *screen = ethosu_screen(pcontext->screen);
   struct ethosu_subgraph *subgraph = (struct ethosu_subgraph *)(psubgraph);
   struct drm_ethosu_submit submit = {0};
   struct drm_ethosu_job job = {0};
   struct timespec start, end;
   int ret;

   /* Stage each input at the offset its tensor was assigned in the io
    * buffer. */
   for (unsigned i = 0; i < inputs_count; i++) {
      struct ethosu_tensor *input = ethosu_find_tensor(subgraph, input_idxs[i]);
      assert(input);

      if (DBG_ENABLED(ETHOSU_DBG_DUMP_BOS))
         ethosu_dump_buffer(inputs[i], "input", 0, 0, 0, input->size);

      pipe_buffer_write(pcontext, subgraph->io_rsrc, input->offset, input->size, inputs[i]);
   }

   if (DBG_ENABLED(ETHOSU_DBG_DUMP_BOS)) {
      struct pipe_transfer *transfer_in;
      uint8_t *buf = pipe_buffer_map(subgraph->base.context, subgraph->io_rsrc,
                                     PIPE_MAP_READ, &transfer_in);
      ethosu_dump_buffer(buf, "io-before", 0, 0, 0, pipe_buffer_size(subgraph->io_rsrc));
      pipe_buffer_unmap(subgraph->base.context, transfer_in);
   }

   /* Wire the BOs into the fixed region slots the command stream was
    * emitted against. */
   job.cmd_bo = subgraph->cmdstream_bo;
   if (subgraph->coefs_rsrc) {
      job.region_bo_handles[COEFS_REGION] = ethosu_resource(subgraph->coefs_rsrc)->handle;
      if (!DBG_ENABLED(ETHOSU_DBG_DISABLE_SRAM)) {
         /* Handle 0 here: the scratch region is backed by on-chip SRAM
          * rather than a GEM BO — NOTE(review): confirm against the UAPI. */
         job.region_bo_handles[SCRATCH_REGION] = 0;
         job.sram_size = screen->info.sram_size;
      }
   }
   job.region_bo_handles[IO_REGION] = ethosu_resource(subgraph->io_rsrc)->handle;

   submit.jobs = (uintptr_t)&job;
   submit.job_count = 1;

   if (DBG_ENABLED(ETHOSU_DBG_MSGS))
      clock_gettime(CLOCK_MONOTONIC_RAW, &start);

   ret = drmIoctl(screen->fd, DRM_IOCTL_ETHOSU_SUBMIT, &submit);
   assert(ret == 0);

   if (DBG_ENABLED(ETHOSU_DBG_MSGS)) {
      clock_gettime(CLOCK_MONOTONIC_RAW, &end);
      long long duration_ns = (long long)(end.tv_sec - start.tv_sec) * 1000000000LL + (end.tv_nsec - start.tv_nsec);
      DBG("Submission took %lld ms\n", duration_ns / 1000000);

      /* Force a sync */
      struct pipe_transfer *transfer_in;
      pipe_buffer_map(subgraph->base.context, subgraph->io_rsrc, PIPE_MAP_READ, &transfer_in);
      pipe_buffer_unmap(subgraph->base.context, transfer_in);

      clock_gettime(CLOCK_MONOTONIC_RAW, &end);
      duration_ns = (long long)(end.tv_sec - start.tv_sec) * 1000000000LL + (end.tv_nsec - start.tv_nsec);
      DBG("Execution took %lld ms\n", duration_ns / 1000000);
   }
}
/*
 * Copy the requested output tensors out of the io buffer after an invoke.
 * `outputsv[i]` must point to at least output->size bytes.
 */
void
ethosu_ml_subgraph_read_outputs(struct pipe_context *pcontext,
                                struct pipe_ml_subgraph *psubgraph,
                                unsigned outputs_count,
                                unsigned output_idxs[], void *outputsv[],
                                bool is_signed[])
{
   struct ethosu_subgraph *subgraph = (struct ethosu_subgraph *)(psubgraph);
   uint8_t **outputs = (uint8_t **)outputsv;

   /* The dump covers the whole io buffer, so do it once instead of once per
    * output as before (the per-iteration dumps were identical). */
   if (DBG_ENABLED(ETHOSU_DBG_DUMP_BOS) && outputs_count > 0) {
      struct pipe_transfer *transfer_in;
      uint8_t *buf = pipe_buffer_map(subgraph->base.context, subgraph->io_rsrc,
                                     PIPE_MAP_READ, &transfer_in);
      ethosu_dump_buffer(buf, "io-after", 0, 0, 0, pipe_buffer_size(subgraph->io_rsrc));
      pipe_buffer_unmap(subgraph->base.context, transfer_in);
   }

   /* Bug fix: loop index was a signed int compared against the unsigned
    * count; also assert the lookup like the invoke path does. */
   for (unsigned i = 0; i < outputs_count; i++) {
      struct ethosu_tensor *output = ethosu_find_tensor(subgraph, output_idxs[i]);
      assert(output);

      pipe_buffer_read(pcontext, subgraph->io_rsrc, output->offset, output->size,
                       outputs[i]);
   }
}
/*
 * Release everything a subgraph owns: the io and coefs resources, the
 * command-stream GEM BO and the dynarrays filled during lowering.
 */
void
ethosu_ml_subgraph_destroy(struct pipe_context *pcontext,
                           struct pipe_ml_subgraph *psubgraph)
{
   int ret;
   struct drm_gem_close arg = {0};
   struct ethosu_screen *screen = ethosu_screen(pcontext->screen);
   struct ethosu_subgraph *subgraph = (struct ethosu_subgraph *)(psubgraph);

   /* Drop the pipe resources (unreferences, frees on last ref). */
   pipe_resource_reference(&subgraph->io_rsrc, NULL);
   pipe_resource_reference(&subgraph->coefs_rsrc, NULL);

   /* The cmdstream BO was created with a raw ioctl, so close it directly. */
   arg.handle = subgraph->cmdstream_bo;
   ret = drmIoctl(screen->fd, DRM_IOCTL_GEM_CLOSE, &arg);
   assert(ret >= 0);

   util_dynarray_fini(&subgraph->operations);
   util_dynarray_fini(&subgraph->tensors);

   free(subgraph);
}

View file

@ -0,0 +1,229 @@
/*
* Copyright (c) 2024 Tomeu Vizoso <tomeu@tomeuvizoso.net>
* SPDX-License-Identifier: MIT
*/
#ifndef ETHOSU_ML_H
#define ETHOSU_ML_H
#include <util/u_dynarray.h>
#include "ethosu_device.h"
/* On-chip SHRAM geometry used by the block-config search (ethosu_sched.c). */
#define SHRAM_BANKS 48
#define SHRAM_RESERVED_OUTPUT_BANKS 2
#define SHRAM_RESERVED_UNUSED_BANKS 2
#define SHRAM_RESERVED_END_BANKS 2
#define SHRAM_TOTAL_BANKS SHRAM_BANKS
#define SHRAM_BANK_SIZE_BYTES 1024

#define ACC_BITS 32 /* Use for now always 32-bit accumulators */

/* Bank-count granularity for IFM and accumulator allocations.  Bug fix:
 * IFM_GRANULE used to be defined twice (identically); keep one copy. */
#define IFM_GRANULE 8
#define ACC_GRANULE 16

#define ARCH_SPLIT_DEPTH 16
#define BANK_SIZE_BYTES 1024

/* Hardware block-size limits and micro-block granules, defined in
 * ethosu_ml.c. */
extern struct ethosu_block ARCH_OFM_BLOCK_MAX;
extern struct ethosu_block SUB_KERNEL_MAX;
extern struct ethosu_block IFM_UBLOCK;
extern struct ethosu_block OFM_UBLOCK;

/* Fixed region slots the command stream addresses BOs through. */
#define COEFS_REGION 0
#define IO_REGION 1
#define SCRATCH_REGION 2
/* A 3D extent in elements, used both for feature-map shapes and for
 * block/micro-block sizes. */
struct ethosu_block {
   unsigned width;
   unsigned height;
   unsigned depth;
};

/* Which hardware unit a lowered operation runs on. */
enum ethosu_operation_type {
   ETHOSU_OPERATION_TYPE_CONVOLUTION,
   ETHOSU_OPERATION_TYPE_POOLING,
   ETHOSU_OPERATION_TYPE_ELTWISE,
   ETHOSU_OPERATION_TYPE_DMA,
};

/* Up-to-4-tile addressing of a feature map in memory. */
struct ethosu_tile_box {
   unsigned height_0;     /* The height of tile 0 */
   unsigned height_1;     /* The height of tile 1, 0 if unused */
   unsigned width_0;      /* The width of tile 0, and tile 2 (if used) */
   unsigned addresses[4]; /* A list of 4 addresses, set unused addresses to 0 */
};

/* Feature-map memory layout: plain NHWC or the 16-channel-bricked
 * NHCWB16 variant (depth padded to a multiple of 16, see
 * ethosu_allocate_feature_map()). */
enum ethosu_layout {
   ETHOSU_LAYOUT_NHWC,
   ETHOSU_LAYOUT_NHCWB16,
};

/* Rounding mode applied when narrowing results. */
enum ethosu_rounding_mode {
   ETHOSU_ROUNDING_DOUBLE = 0,
   ETHOSU_ROUNDING_TRUNCATE,
   ETHOSU_ROUNDING_NATURAL,
};
/* One IFM/IFM2/OFM operand of an operation: which tensor backs it, its
 * shape, quantization and where its tiles live. */
struct ethosu_feature_map {
   unsigned tensor_idx; /* index into the subgraph's tensor table */
   struct ethosu_block shape;
   bool is_signed;
   struct ethosu_tile_box tiles;
   unsigned zero_point;
   float scale;
};

/* Convolution/pooling kernel geometry plus its quantization. */
struct ethosu_kernel {
   unsigned height;
   unsigned width;
   unsigned stride_y;
   unsigned stride_x;
   unsigned dilation_y;
   unsigned dilation_x;
   bool depthwise;
   bool is_signed;
   unsigned zero_point;
   float scale;
};

/* Explicit padding around the IFM, in elements. */
struct ethosu_padding {
   unsigned top;
   unsigned left;
   unsigned bottom;
   unsigned right;
};

/* A span of bytes inside one of the region BOs. */
struct ethosu_address_range {
   unsigned region; /* COEFS_REGION / IO_REGION / SCRATCH_REGION */
   unsigned address;
   long size;
};

/* Partitioning of the SHRAM banks for one operation, computed by the
 * block-config search (all values are bank indices). */
struct ethosu_shram_layout {
   unsigned ib_start;  /* first IFM bank */
   unsigned ib_end;    /* one past the last IFM bank */
   unsigned ib_start2; /* first IFM2 bank (eltwise) */
   unsigned ab_start;  /* first accumulator bank */
   unsigned lut_start; /* first LUT bank */
};

/* Accumulator element type. */
enum ethosu_acc_type {
   ETHOSU_ACC_TYPE_INT_32BIT = 0,
   ETHOSU_ACC_TYPE_INT_40BIT,
   ETHOSU_ACC_TYPE_FP_S5_10,
};

/* Result of the block-config search: the block sizes the operation will be
 * tiled into and the SHRAM layout that fits them. */
struct ethosu_block_config {
   struct ethosu_block ifm_block;
   struct ethosu_block ofm_block;
   struct ethosu_shram_layout shram_layout;
   unsigned bank_size;
   enum ethosu_acc_type acc_type;
   bool is_partkernel;
};
#define MAX_MEMORY_ACCESSES 5 /* IFM, IFM2, Scales, Weights, LUT */

/* One lowered hardware operation, produced by ethosu_lower_graph() and
 * serialized by the command-stream emitter. */
struct ethosu_operation {
   enum ethosu_operation_type type;
   /* Filled in by ethosu_sched_operation(). */
   struct ethosu_block_config block_config;

   /* Per-type payload; which member is valid depends on `type`. */
   union {
      struct {
         struct ethosu_address_range weights;
         struct ethosu_address_range scales;
         bool part_kernel_first;
         bool depthwise;
      } conv;
      struct {
         bool avg; /* true for avg, false for max */
      } pooling;
      struct {
         unsigned lut_bytes;
      } eltwise;
      struct {
         unsigned address;
         long size;
      } dma;
   };

   struct ethosu_feature_map ifm;
   struct ethosu_feature_map ifm2; /* second input, eltwise only */
   struct ethosu_feature_map ofm;
   struct ethosu_kernel kernel;
   struct ethosu_padding pad;
   bool upscale;
   enum ethosu_rounding_mode round_mode;

   /* Memory ranges this operation reads/writes; unused entries zeroed. */
   struct ethosu_address_range read_accesses[MAX_MEMORY_ACCESSES];
   struct ethosu_address_range write_accesses[MAX_MEMORY_ACCESSES];
};

/* Driver-side record of a pipe_tensor: its place inside the io buffer.
 * size == 0 means not yet allocated (see ethosu_allocate_feature_map()). */
struct ethosu_tensor {
   unsigned index;  /* pipe_tensor index used for lookups */
   unsigned offset; /* byte offset into the io buffer */
   unsigned size;   /* allocated size in bytes */
   struct ethosu_block shape;
   enum ethosu_layout layout;
};

/* A compiled ML graph: lowered operations, encoded command stream and the
 * buffers it executes against. */
struct ethosu_subgraph {
   struct pipe_ml_subgraph base;

   struct util_dynarray operations; /* ethosu_operation */
   struct util_dynarray tensors;    /* ethosu_tensor (stored by value) */

   unsigned cmdstream_used;
   uint32_t *cmdstream; /* CPU copy, freed after BO creation */
   uint32_t *cursor;    /* write pointer into cmdstream */
   uint32_t cmdstream_bo;

   /* Shared buffer holding all input/output/intermediate feature maps. */
   struct pipe_resource *io_rsrc;
   unsigned io_used;

   /* Weights and scales, uploaded once at compile time. */
   uint8_t *coefs;
   struct pipe_resource *coefs_rsrc;
   unsigned coefs_used;
};
/* pipe_context ML entry points, implemented in ethosu_ml.c. */
bool
ethosu_ml_operation_supported(struct pipe_context *pcontext, const struct pipe_ml_operation *operation);
struct pipe_ml_subgraph *
ethosu_ml_subgraph_create(struct pipe_context *pcontext,
                          const struct pipe_ml_operation *poperations,
                          unsigned count);
void ethosu_ml_subgraph_invoke(struct pipe_context *pcontext,
                               struct pipe_ml_subgraph *psubgraph,
                               unsigned inputs_count, unsigned input_idxs[],
                               void *inputs[], bool is_signed[]);
void ethosu_ml_subgraph_read_outputs(struct pipe_context *pcontext,
                                     struct pipe_ml_subgraph *psubgraph,
                                     unsigned outputs_count,
                                     unsigned output_idxs[], void *outputs[],
                                     bool is_signed[]);
void ethosu_ml_subgraph_destroy(struct pipe_context *context,
                                struct pipe_ml_subgraph *psubgraph);

/* Helpers shared between the lowering, scheduling and emission passes. */
void ethosu_allocate_feature_map(struct ethosu_subgraph *subgraph, struct ethosu_feature_map *feature_map);
void ethosu_register_tensor(struct ethosu_subgraph *subgraph, const struct pipe_tensor *ptensor);
struct ethosu_tensor *ethosu_find_tensor(struct ethosu_subgraph *subgraph, unsigned tensor_idx);
void ethosu_dump_buffer(const uint8_t *ptr, char *name, int operation_nr,
                        int suboperation_nr, int offset, unsigned size);
int ethosu_round_up_to_multiple(int a, int b);
int ethosu_round_up_divide(int a, int b);
/* Returns the Q31 multiplier for `scale`, with the right-shift in *shift. */
int ethosu_quantize_scale(double scale, uint32_t *shift);

#endif /* ETHOSU_ML_H */

View file

@ -0,0 +1,193 @@
/*
* Copyright (c) 2025 Tomeu Vizoso <tomeu@tomeuvizoso.net>
* SPDX-License-Identifier: MIT
*/
#include "ethosu_sched.h"
/* Input extent needed to produce `value` outputs at the given stride with a
 * kernel border of `border` elements. */
static int
required_input_size(int value, int stride, int border)
{
   int strided_span = (value - 1) * stride;
   return strided_span + border;
}
static struct ethosu_block
_get_ifm_blocksize(struct ethosu_operation *operation, struct ethosu_block ofm_block)
{
struct ethosu_block ifm_block = {0};
// IFM block height
int h = required_input_size(ofm_block.height, operation->kernel.stride_y, MIN2(operation->kernel.height, SUB_KERNEL_MAX.height));
h = ALIGN(h, OFM_UBLOCK.height);
// IFM block width
int w = required_input_size(ofm_block.width, operation->kernel.stride_x, MIN2(operation->kernel.width, SUB_KERNEL_MAX.width));
w = ALIGN(w, OFM_UBLOCK.width);
ifm_block.height = h;
ifm_block.width = w;
ifm_block.depth = ofm_block.depth;
return ifm_block;
}
/*
 * Check whether the given OFM/IFM block pair fits in SHRAM and, if so, fill
 * *layout with the resulting bank partitioning.  Banks are laid out as:
 * [output][IFM][IFM2 (eltwise)]...[accumulators][LUT], with the LUT placed
 * at the top.  Returns false when the blocks don't fit.
 */
static bool
try_block_config(struct ethosu_operation *operation, struct ethosu_block ofm_block, struct ethosu_block ifm_block, struct ethosu_shram_layout *layout)
{
   /* IFM is double-buffered (the *2), depth padded to 8, and the bank count
    * rounded to the IFM allocation granule. */
   int ifm_bytes = ifm_block.width * ifm_block.height * ALIGN(ifm_block.depth, 8);
   int ifm_banks = ALIGN(DIV_ROUND_UP(ifm_bytes, BANK_SIZE_BYTES) * 2, IFM_GRANULE);

   int lut_bytes = operation->type == ETHOSU_OPERATION_TYPE_ELTWISE ? operation->eltwise.lut_bytes : 0;
   int lut_banks = MAX2(DIV_ROUND_UP(lut_bytes, 1024), SHRAM_RESERVED_END_BANKS);
   int lut_start = SHRAM_TOTAL_BANKS - lut_banks;

   int ifm_end = SHRAM_RESERVED_OUTPUT_BANKS + ifm_banks;
   int ifm2_start = ifm_end;
   int acc_start = lut_start;

   if (operation->type != ETHOSU_OPERATION_TYPE_ELTWISE) {
      /* Accumulators: 32-bit, double-buffered, ACC_GRANULE-aligned. */
      int acc_bytes = (ofm_block.width * ofm_block.height * ALIGN(ofm_block.depth, 8) * 32) / 8;
      int acc_banks = ALIGN(DIV_ROUND_UP(acc_bytes, BANK_SIZE_BYTES) * 2, ACC_GRANULE);
      acc_start -= acc_banks;
   } else {
      /* Eltwise: no accumulators, but a second IFM of the same size. */
      int ifm2_banks = ifm_banks; /* TODO: Fix for scalar eltwise */
      if (ifm2_start + ifm2_banks > acc_start)
         return false;
      ifm_end = acc_start;
   }

   if (ifm_end > acc_start)
      return false;

   layout->ib_start = SHRAM_RESERVED_OUTPUT_BANKS;
   layout->ib_start2 = ifm2_start;
   layout->ib_end = ifm_end;
   layout->ab_start = acc_start;
   layout->lut_start = lut_start;

   return true;
}
static struct ethosu_block_config
find_block_config(struct ethosu_operation *operation)
{
struct ethosu_block_config config = {};
struct ethosu_block search_space = ARCH_OFM_BLOCK_MAX;
float ofm_elements = operation->ofm.shape.width * operation->ofm.shape.height * operation->ofm.shape.depth;
float ifm_elements = operation->ifm.shape.width * operation->ifm.shape.height * operation->ifm.shape.depth;
bool is_pooling = operation->type == ETHOSU_OPERATION_TYPE_POOLING;
bool is_depthwise = operation->conv.depthwise;
bool is_equal_depth = is_pooling || is_depthwise || operation->type == ETHOSU_OPERATION_TYPE_ELTWISE;
bool is_convolution = operation->type == ETHOSU_OPERATION_TYPE_CONVOLUTION;
float best_cost = FLT_MAX;
unsigned best_coverage = UINT_MAX;
search_space.width = MIN2(search_space.width, operation->ofm.shape.width);
search_space.height = MIN2(search_space.height, operation->ofm.shape.height);
search_space.depth = MIN2(search_space.depth, operation->ofm.shape.depth);
unsigned depth = MAX2(OFM_UBLOCK.depth, MIN2(search_space.depth, ARCH_SPLIT_DEPTH));
if (depth < operation->ofm.shape.depth) {
depth = ALIGN(depth, ARCH_SPLIT_DEPTH);
}
search_space.width = ALIGN(search_space.width, OFM_UBLOCK.width);
search_space.height = ALIGN(search_space.height, OFM_UBLOCK.height);
search_space.depth = ALIGN(search_space.depth, OFM_UBLOCK.depth);
while (depth <= search_space.depth) {
bool wont_fit[search_space.height + 1][search_space.width + 1];
memset(wont_fit, 0, sizeof(wont_fit));
for (unsigned height = OFM_UBLOCK.height; height <= search_space.height; height += OFM_UBLOCK.height) {
for (unsigned width = OFM_UBLOCK.width; width <= search_space.width; width += OFM_UBLOCK.width) {
if (wont_fit[height][width])
continue;
struct ethosu_block ofm_block = {height, width, depth};
struct ethosu_block ifm_block = _get_ifm_blocksize(operation, ofm_block);
if (!is_equal_depth)
ifm_block.depth = ALIGN(MIN2(operation->ifm.shape.depth, operation->conv.part_kernel_first ? 16 : 32), IFM_UBLOCK.depth);
// Try to fit the blocks in SHRAM
struct ethosu_shram_layout layout = {0};
if (try_block_config(operation, ofm_block, ifm_block, &layout)) {
struct ethosu_block full_blocks = {DIV_ROUND_UP(operation->ofm.shape.width, ofm_block.width),
DIV_ROUND_UP(operation->ofm.shape.height, ofm_block.height),
DIV_ROUND_UP(operation->ofm.shape.depth, ofm_block.depth)};
float blocks[3] = {operation->ofm.shape.width / (float)ofm_block.width,
operation->ofm.shape.height / (float)ofm_block.height,
operation->ofm.shape.depth / (float)ofm_block.depth};
float weight_area = is_convolution ? operation->kernel.width * operation->kernel.height : 0;
float weight_fetch = weight_area * operation->ifm.shape.depth * full_blocks.width * full_blocks.height;
if (!is_depthwise)
weight_fetch *= blocks[2] * ofm_block.depth;
float ifm_fetch = ifm_block.width * ifm_block.height * operation->ifm.shape.depth * blocks[0] * blocks[1];
if (!is_equal_depth)
ifm_fetch *= full_blocks.depth;
float relative_cost = 0;
if (operation->type != ETHOSU_OPERATION_TYPE_ELTWISE)
relative_cost = (ifm_fetch + weight_fetch) / ofm_elements;
else
relative_cost = ofm_elements / (height * width * depth);
if (ifm_elements < ifm_block.width * ifm_block.height * ifm_block.depth * 2)
relative_cost /= 2.0f;
if (relative_cost <= best_cost) {
bool choose_this = false;
if (relative_cost == best_cost) {
struct ethosu_block coverage_shape = {
MIN2(ifm_block.height, operation->ifm.shape.height),
MIN2(ifm_block.width, operation->ifm.shape.width),
MIN2(ifm_block.depth, operation->ifm.shape.depth)};
float coverage = (float)(operation->ifm.shape.width * operation->ifm.shape.height) /
(float)MAX2(1, coverage_shape.width * coverage_shape.height);
if (coverage <= best_coverage && (height <= 4 && width <= 4)) {
best_coverage = coverage;
choose_this = true;
}
} else {
best_coverage = UINT_MAX;
choose_this = true;
}
if (choose_this) {
config.shram_layout = layout;
config.ifm_block = ifm_block;
config.ofm_block.height = height;
config.ofm_block.width = width;
config.ofm_block.depth = depth;
best_cost = relative_cost;
}
}
} else {
wont_fit[height][width] = true;
}
}
}
depth += OFM_UBLOCK.depth;
if (depth < operation->ofm.shape.depth) {
depth = ALIGN(depth, ARCH_SPLIT_DEPTH);
}
}
return config;
}
/*
 * Pick the block sizes and SHRAM layout for one lowered operation.  The
 * subgraph parameter is currently unused.
 */
void
ethosu_sched_operation(struct ethosu_subgraph *subgraph, struct ethosu_operation *operation)
{
   operation->block_config = find_block_config(operation);
}

View file

@ -0,0 +1,13 @@
/*
 * Copyright (c) 2025 Tomeu Vizoso <tomeu@tomeuvizoso.net>
 * SPDX-License-Identifier: MIT
 */

#ifndef ETHOSU_SCHED_H
#define ETHOSU_SCHED_H

#include "ethosu_ml.h"

/* Fill operation->block_config with the tiling chosen for this operation. */
void ethosu_sched_operation(struct ethosu_subgraph *subgraph, struct ethosu_operation *operation);

#endif /* ETHOSU_SCHED_H */

View file

@ -0,0 +1,125 @@
#!/usr/bin/python3
#
# Copyright © 2019-2024 Google, Inc.
# Copyright © 2024-2025 Tomeu Vizoso
#
# SPDX-License-Identifier: MIT
import sys
import os
import argparse
import time
import datetime
from gen_parser import Parser, Reg, Enum, mask, Error
def dump_c(args, guard, func):
    """Parse the register XML and print a C header to stdout.

    Shared driver for the c-defines and c-pack-structs modes: emits the
    include guard, the provenance/license banner, the assert and
    __struct_cast compatibility shims, then delegates the actual register
    dump to `func(parser)`.
    """
    p = Parser()

    try:
        p.parse(args.rnn, args.xml)
    except Error as e:
        print(e, file=sys.stderr)
        exit(1)

    print("#ifndef %s\n#define %s\n" % (guard, guard))

    print("""/* Autogenerated file, DO NOT EDIT manually!
This file was generated by the rules-ng-ng gen_header.py tool in this git repository:
http://gitlab.freedesktop.org/mesa/mesa/
git clone https://gitlab.freedesktop.org/mesa/mesa.git
The rules-ng-ng source files this header was generated from are:
""")
    # List every parsed XML file with size and mtime, aligned in columns.
    maxlen = 0
    for filepath in p.xml_files:
        maxlen = max(maxlen, len(filepath))
    for filepath in p.xml_files:
        pad = " " * (maxlen - len(filepath))
        filesize = str(os.path.getsize(filepath))
        filesize = " " * (7 - len(filesize)) + filesize
        filetime = time.ctime(os.path.getmtime(filepath))
        print("- " + filepath + pad + " (" + filesize + " bytes, from " + filetime + ")")
    if p.copyright_year:
        current_year = str(datetime.date.today().year)
        print()
        print("Copyright (C) %s-%s by the following authors:" % (p.copyright_year, current_year))
        for author in p.authors:
            print("- " + author)
    if p.license:
        print(p.license)
    print("*/")
    print()

    # assert() shim so the generated header works in kernel builds too.
    print("#ifdef __KERNEL__")
    print("#include <linux/bug.h>")
    print("#define assert(x) BUG_ON(!(x))")
    print("#else")
    print("#include <assert.h>")
    print("#endif")
    print()

    # C++ has no compound literals; __struct_cast papers over the difference.
    print("#ifdef __cplusplus")
    print("#define __struct_cast(X)")
    print("#else")
    print("#define __struct_cast(X) (struct X)")
    print("#endif")
    print()

    func(p)

    print("\n#endif /* %s */" % guard)
def dump_c_defines(args):
    """Emit the #define-style C header for args.xml."""
    base = os.path.basename(args.xml)
    guard = base.replace('.', '_').upper()
    dump_c(args, guard, lambda p: p.dump())
def dump_c_pack_structs(args):
    """Emit the struct-based packing C header for args.xml."""
    base = os.path.basename(args.xml)
    guard = base.replace('.', '_').upper() + '_STRUCTS'
    dump_c(args, guard, lambda p: p.dump_structs())
def dump_py_defines(args):
    """Emit a Python IntEnum of register offsets for args.xml to stdout."""
    p = Parser()

    try:
        p.parse(args.rnn, args.xml)
    except Error as e:
        print(e, file=sys.stderr)
        exit(1)

    file_name = os.path.splitext(os.path.basename(args.xml))[0]

    print("from enum import IntEnum")
    print("class %sRegs(IntEnum):" % file_name.upper())

    # Bug fix: dropped a stray `os.path.basename(args.xml)` statement whose
    # result was discarded (dead code).
    p.dump_regs_py()
def main():
    """Parse the command line and dispatch to the selected dump mode."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--rnn', type=str, required=True)
    parser.add_argument('--xml', type=str, required=True)

    subparsers = parser.add_subparsers(required=True)
    for command, handler in (('c-defines', dump_c_defines),
                             ('c-pack-structs', dump_c_pack_structs),
                             ('py-defines', dump_py_defines)):
        subparsers.add_parser(command).set_defaults(func=handler)

    args = parser.parse_args()
    args.func(args)

if __name__ == '__main__':
    main()

View file

@ -0,0 +1,745 @@
import xml.parsers.expat
import sys
import os
import collections
class Error(Exception):
    """Parse error carrying a human-readable message.

    Bug fix: chain to Exception.__init__ so str(error) returns the message;
    previously args stayed empty and `print(e)` produced an empty string.
    """

    def __init__(self, message):
        super().__init__(message)
        self.message = message
class Enum(object):
    """A named enumeration: a list of (name, value) pairs gathered while
    parsing, dumped later as a C enum definition."""

    def __init__(self, name):
        self.name = name
        self.values = []

    def has_name(self, name):
        """Return True when one of the collected entries is called `name`."""
        return any(entry_name == name for (entry_name, _) in self.values)

    def dump(self):
        """Print the enum as C source; hex formatting if any value is big."""
        use_hex = any(value > 0x1000 for (_, value) in self.values)
        entry_format = "\t%s = 0x%08x," if use_hex else "\t%s = %d,"

        print("enum %s {" % self.name)
        for (name, value) in self.values:
            print(entry_format % (name, value))
        print("};\n")

    def dump_pack_struct(self):
        pass
class Field(object):
    """A bitfield spanning bits [low, high] of a register, with an optional
    right shift (`shr`) and a type that is either a builtin or an enum known
    to the parser.  Validation errors are raised via parser.error()."""

    def __init__(self, name, low, high, shr, type, parser):
        self.name = name
        self.low = low
        self.high = high
        self.shr = shr
        self.type = type

        builtin_types = [ None, "a3xx_regid", "boolean", "uint", "hex", "int", "fixed", "ufixed", "float", "address", "waddress" ]

        maxpos = parser.current_bitsize - 1

        if low < 0 or low > maxpos:
            raise parser.error("low attribute out of range: %d" % low)
        if high < 0 or high > maxpos:
            raise parser.error("high attribute out of range: %d" % high)
        if high < low:
            raise parser.error("low is greater than high: low=%d, high=%d" % (low, high))
        # Idiom fixes: `!=` / `is None` / `not in` instead of `not x == y`,
        # `== None` and `not x in y`.
        if self.type == "boolean" and low != high:
            raise parser.error("booleans should be 1 bit fields")
        elif self.type == "float" and not (high - low == 31 or high - low == 15):
            raise parser.error("floats should be 16 or 32 bit fields")
        elif self.type not in builtin_types and self.type not in parser.enums:
            raise parser.error("unknown type '%s'" % self.type)

    def ctype(self, var_name):
        """Return (C type, access expression) for reading this field out of
        `var_name`, applying fixed-point/float conversion and the shr shift.

        Note: for "fixed"/"ufixed" fields, self.radix must have been set by
        the parser after construction.
        """
        if self.type is None:
            type = "uint32_t"
            val = var_name
        elif self.type == "boolean":
            type = "bool"
            val = var_name
        elif self.type == "uint" or self.type == "hex" or self.type == "a3xx_regid":
            type = "uint32_t"
            val = var_name
        elif self.type == "int":
            type = "int32_t"
            val = var_name
        elif self.type == "fixed":
            type = "float"
            val = "((int32_t)(%s * %d.0))" % (var_name, 1 << self.radix)
        elif self.type == "ufixed":
            type = "float"
            val = "((uint32_t)(%s * %d.0))" % (var_name, 1 << self.radix)
        elif self.type == "float" and self.high - self.low == 31:
            type = "float"
            val = "fui(%s)" % var_name
        elif self.type == "float" and self.high - self.low == 15:
            type = "float"
            val = "_mesa_float_to_half(%s)" % var_name
        elif self.type in [ "address", "waddress" ]:
            type = "uint64_t"
            val = var_name
        else:
            # Anything else was validated to be a parser-known enum.
            type = "enum %s" % self.type
            val = var_name

        if self.shr > 0:
            val = "(%s >> %d)" % (val, self.shr)

        return (type, val)
def tab_to(name, value):
    """Print `name` then `value`, tab-padding toward column 68 (always at
    least one tab)."""
    pad = (68 - (len(name) & ~7)) // 8
    print(name + "\t" * max(pad, 1) + value)
def mask(low, high):
    """Return a mask with bits low..high (inclusive) set, as used for
    register field extraction."""
    width = high + 1 - low
    return ((1 << width) - 1) << low
def field_name(reg, f):
    """Derive the lowercase C member name for field `f` of register `reg`."""
    if f.name:
        name = f.name.lower()
    else:
        # We hit this path when a reg is defined with no bitset fields, ie.
        # <reg32 offset="0x88db" name="RB_BLIT_DST_ARRAY_PITCH" low="0" high="28" shr="6" type="uint"/>
        name = reg.name.lower()

    # Prefix names that collide with C keywords or don't start with a letter.
    if name in ("double", "float", "int") or not name[0].isalpha():
        name = "_" + name

    return name
class Bitset(object):
    """A named collection of Fields, either inline to a single register or
    shared by several; knows how to print the C defines, pack structs and
    fd_reg_pair builders for a register using it."""

    def __init__(self, name, template):
        self.name = name
        self.inline = False
        if template:
            # Copy so later additions don't mutate the template bitset.
            self.fields = template.fields[:]
        else:
            self.fields = []

    # Get address field if there is one in the bitset, else return None:
    def get_address_field(self):
        for f in self.fields:
            if f.type in [ "address", "waddress" ]:
                return f
        return None

    def dump_regpair_builder(self, reg):
        """Print the body of the pack_<reg>() helper: debug asserts that no
        field value overflows its bit range, then the fd_reg_pair literal."""
        print("#ifndef NDEBUG")
        known_mask = 0
        for f in self.fields:
            known_mask |= mask(f.low, f.high)
            if f.type in [ "boolean", "address", "waddress" ]:
                continue
            type, val = f.ctype("fields.%s" % field_name(reg, f))
            print(" assert((%-40s & 0x%08x) == 0);" % (val, 0xffffffff ^ mask(0, f.high - f.low)))
        # fields.unknown may only carry bits no named field covers.
        print(" assert((%-40s & 0x%08x) == 0);" % ("fields.unknown", known_mask))
        print("#endif\n")

        print(" return (struct fd_reg_pair) {")
        if reg.array:
            print(" .reg = REG_%s(__i)," % reg.full_name)
        else:
            print(" .reg = REG_%s," % reg.full_name)

        print(" .value =")
        for f in self.fields:
            if f.type in [ "address", "waddress" ]:
                continue
            else:
                type, val = f.ctype("fields.%s" % field_name(reg, f))
                print(" (%-40s << %2d) |" % (val, f.low))
        value_name = "dword"
        if reg.bit_size == 64:
            value_name = "qword"
        print(" fields.unknown | fields.%s," % (value_name,))

        address = self.get_address_field()
        if address:
            print(" .bo = fields.bo,")
            print(" .is_address = true,")
            # NOTE(review): `f` here is the leftover loop variable, not
            # `address` — presumably this should be address.type; confirm
            # against the upstream rules-ng-ng gen_header.py.
            if f.type == "waddress":
                print(" .bo_write = true,")
            print(" .bo_offset = fields.bo_offset,")
            print(" .bo_shift = %d," % address.shr)
            print(" .bo_low = %d," % address.low)

        print(" };")

    def dump_pack_struct(self, reg=None):
        """Print the struct <reg> definition, its pack_<reg>() helper and the
        <REG>(...) convenience macro."""
        if not reg:
            return

        prefix = reg.full_name

        print("struct %s {" % prefix)
        for f in self.fields:
            if f.type in [ "address", "waddress" ]:
                tab_to(" __bo_type", "bo;")
                tab_to(" uint32_t", "bo_offset;")
                continue
            name = field_name(reg, f)
            type, val = f.ctype("var")
            tab_to(" %s" % type, "%s;" % name)
        if reg.bit_size == 64:
            tab_to(" uint64_t", "unknown;")
            tab_to(" uint64_t", "qword;")
        else:
            tab_to(" uint32_t", "unknown;")
            tab_to(" uint32_t", "dword;")
        print("};\n")

        if reg.array:
            print("static inline struct fd_reg_pair\npack_%s(uint32_t __i, struct %s fields)\n{" %
                  (prefix, prefix))
        else:
            print("static inline struct fd_reg_pair\npack_%s(struct %s fields)\n{" %
                  (prefix, prefix))

        self.dump_regpair_builder(reg)

        print("\n}\n")

        # Address regs expand to two fd_reg_pairs; pad the macro with a
        # zeroed second entry.
        if self.get_address_field():
            skip = ", { .reg = 0 }"
        else:
            skip = ""

        if reg.array:
            print("#define %s(__i, ...) pack_%s(__i, __struct_cast(%s) { __VA_ARGS__ })%s\n" %
                  (prefix, prefix, prefix, skip))
        else:
            print("#define %s(...) pack_%s(__struct_cast(%s) { __VA_ARGS__ })%s\n" %
                  (prefix, prefix, prefix, skip))

    def dump(self, prefix=None):
        """Print the classic #define-style accessors for every field."""
        if prefix == None:
            prefix = self.name
        for f in self.fields:
            if f.name:
                name = prefix + "_" + f.name.upper()
            else:
                name = prefix

            # Skip untyped fields that cover the whole register verbatim.
            if not f.name and f.low == 0 and f.shr == 0 and not f.type in ["float", "fixed", "ufixed"]:
                pass
            elif f.type == "boolean" or (f.type == None and f.low == f.high):
                tab_to("#define %s" % name, "0x%08x" % (1 << f.low))
            else:
                tab_to("#define %s__MASK" % name, "0x%08x" % mask(f.low, f.high))
                tab_to("#define %s__SHIFT" % name, "%d" % f.low)
                type, val = f.ctype("val")
                print("static inline uint32_t %s(%s val)\n{" % (name, type))
                if f.shr > 0:
                    print("\tassert(!(val & 0x%x));" % mask(0, f.shr - 1))
                print("\treturn ((%s) << %s__SHIFT) & %s__MASK;\n}" % (val, name, name))
        print()
class Array(object):
    """An <array> element: a block of registers repeated `length` times with
    a fixed stride from `offset`."""

    def __init__(self, attrs, domain, variant):
        self.name = attrs.get("name", "")
        self.domain = domain
        self.variant = variant
        self.offset = int(attrs["offset"], 0)
        self.stride = int(attrs["stride"], 0)
        self.length = int(attrs["length"], 0)
        usage = attrs.get("usage")
        self.usages = usage.split(',') if usage is not None else None

    def dump(self):
        """Print the indexed-offset macro for this array."""
        print("#define _%s(i0) (0x%08x + 0x%x*(i0))\n" % (self.name, self.offset, self.stride))

    def dump_pack_struct(self):
        pass

    def dump_regpair_builder(self):
        pass
class Reg(object):
    """A single register: name, offset, bit size, optionally nested inside an
    Array (which prefixes its name and adds a stride)."""

    def __init__(self, attrs, domain, array, bit_size):
        self.name = attrs["name"]
        self.domain = domain
        self.array = array
        self.offset = int(attrs["offset"], 0)
        self.type = None
        self.bit_size = bit_size
        if array:
            self.name = "%s_%s" % (array.name, self.name)
        self.full_name = self.name

    def dump(self):
        """Print the offset #define (or inline offset function for arrays),
        plus the field accessors when the bitset is inline."""
        if self.array:
            base = self.array.offset + self.offset
            print("static inline uint32_t %s(uint32_t i0) { return 0x%08x + 0x%x*i0; }" % (self.full_name, base, self.array.stride))
        else:
            tab_to("#define %s" % self.full_name, "0x%08x" % self.offset)
        if self.bitset.inline:
            self.bitset.dump(self.full_name)

    def dump_pack_struct(self):
        if self.bitset.inline:
            self.bitset.dump_pack_struct(self)

    def dump_regpair_builder(self):
        if self.bitset.inline:
            self.bitset.dump_regpair_builder(self)

    def dump_py(self):
        """Print this register as a Python IntEnum member."""
        print("\tREG_%s = 0x%08x" % (self.full_name, self.offset))
class Parser(object):
    """Expat-based parser for rules-ng-ng register-description XML.

    Walks one or more XML files (recursing through <import> elements) and
    builds an in-memory model of Enum/Bitset/Array/Reg objects; the
    dump_*() methods then print that model as a C header (or Python
    definitions) to stdout.
    """
    def __init__(self):
        # Parse-time cursor state: the element(s) currently being processed.
        self.current_array = None
        self.current_domain = None
        self.current_prefix = None
        self.current_prefix_type = None
        self.current_stripe = None
        self.current_bitset = None
        self.current_bitsize = 32
        # The varset attribute on the domain specifies the enum which
        # specifies all possible hw variants:
        self.current_varset = None
        # Regs that have multiple variants.. we only generated the C++
        # template based struct-packers for these
        self.variant_regs = {}
        # Information in which contexts regs are used, to be used in
        # debug options
        self.usage_regs = collections.defaultdict(list)
        self.bitsets = {}
        self.enums = {}
        self.variants = set()
        # Top-level objects (enums, bitsets, arrays, regs) in file order.
        self.file = []
        # Absolute paths already parsed; guards against re-parsing a file
        # pulled in twice via <import>.
        self.xml_files = []
        self.copyright_year = None
        self.authors = []
        self.license = None
    def error(self, message):
        """Build an Error tagged with the current file/line/column."""
        parser, filename = self.stack[-1]
        return Error("%s:%d:%d: %s" % (filename, parser.CurrentLineNumber, parser.CurrentColumnNumber, message))
    def prefix(self, variant=None):
        """Return the identifier prefix for the current scope:
        variant, stripe, explicit domain prefix, or bare domain name."""
        if self.current_prefix_type == "variant" and variant:
            return variant
        elif self.current_stripe:
            return self.current_stripe + "_" + self.current_domain
        elif self.current_prefix:
            return self.current_prefix + "_" + self.current_domain
        else:
            return self.current_domain
    def parse_field(self, name, attrs):
        """Parse one bitfield description and append a Field to the
        current bitset.  Raises self.error() on malformed numbers."""
        try:
            if "pos" in attrs:
                high = low = int(attrs["pos"], 0)
            elif "high" in attrs and "low" in attrs:
                high = int(attrs["high"], 0)
                low = int(attrs["low"], 0)
            else:
                # No explicit position: the field spans the whole register.
                low = 0
                high = self.current_bitsize - 1
            if "type" in attrs:
                type = attrs["type"]
            else:
                type = None
            if "shr" in attrs:
                shr = int(attrs["shr"], 0)
            else:
                shr = 0
            b = Field(name, low, high, shr, type, self)
            if type == "fixed" or type == "ufixed":
                # Fixed-point fields additionally carry their radix.
                b.radix = int(attrs["radix"], 0)
            self.current_bitset.fields.append(b)
        except ValueError as e:
            raise self.error(e)
    def parse_varset(self, attrs):
        """Return the variant-set Enum for an element."""
        # Inherit the varset from the enclosing domain if not overriden:
        varset = self.current_varset
        if "varset" in attrs:
            varset = self.enums[attrs["varset"]]
        return varset
    def parse_variants(self, attrs):
        """Return the first (lowest) variant named by a 'variants'
        attribute, or None when the attribute is absent."""
        if not "variants" in attrs:
            return None
        variant = attrs["variants"].split(",")[0]
        if "-" in variant:
            # "A-B" range: keep only the start of the range.
            variant = variant[:variant.index("-")]
        varset = self.parse_varset(attrs)
        assert varset.has_name(variant)
        return variant
    def add_all_variants(self, reg, attrs, parent_variant):
        """Record reg under its variant in self.variant_regs."""
        # TODO this should really handle *all* variants, including dealing
        # with open ended ranges (ie. "A2XX,A4XX-") (we have the varset
        # enum now to make that possible)
        variant = self.parse_variants(attrs)
        if not variant:
            variant = parent_variant
        if reg.name not in self.variant_regs:
            self.variant_regs[reg.name] = {}
        else:
            # All variants must be same size:
            v = next(iter(self.variant_regs[reg.name]))
            assert self.variant_regs[reg.name][v].bit_size == reg.bit_size
        self.variant_regs[reg.name][variant] = reg
    def add_all_usages(self, reg, usages):
        """Record reg under each of its usage contexts."""
        if not usages:
            return
        for usage in usages:
            self.usage_regs[usage].append(reg)
        self.variants.add(reg.domain)
    def do_validate(self, schemafile):
        """Validate the current XML file against its schema; silently
        skipped (with a stderr note) when lxml is not installed."""
        try:
            from lxml import etree
            parser, filename = self.stack[-1]
            dirname = os.path.dirname(filename)
            # we expect this to look like <namespace url> schema.xsd.. I think
            # technically it is supposed to be just a URL, but that doesn't
            # quite match up to what we do.. Just skip over everything up to
            # and including the first whitespace character:
            schemafile = schemafile[schemafile.rindex(" ")+1:]
            # this is a bit cheezy, but the xml file to validate could be
            # in a child director, ie. we don't really know where the schema
            # file is, the way the rnn C code does. So if it doesn't exist
            # just look one level up
            if not os.path.exists(dirname + "/" + schemafile):
                schemafile = "../" + schemafile
            if not os.path.exists(dirname + "/" + schemafile):
                raise self.error("Cannot find schema for: " + filename)
            xmlschema_doc = etree.parse(dirname + "/" + schemafile)
            xmlschema = etree.XMLSchema(xmlschema_doc)
            xml_doc = etree.parse(filename)
            if not xmlschema.validate(xml_doc):
                error_str = str(xmlschema.error_log.filter_from_errors()[0])
                raise self.error("Schema validation failed for: " + filename + "\n" + error_str)
        except ImportError:
            print("lxml not found, skipping validation", file=sys.stderr)
    def do_parse(self, filename):
        """Parse one XML file with expat; may recurse via the <import>
        handler in start_element()."""
        filepath = os.path.abspath(filename)
        if filepath in self.xml_files:
            return
        self.xml_files.append(filepath)
        file = open(filename, "rb")
        parser = xml.parsers.expat.ParserCreate()
        self.stack.append((parser, filename))
        parser.StartElementHandler = self.start_element
        parser.EndElementHandler = self.end_element
        parser.CharacterDataHandler = self.character_data
        parser.buffer_text = True
        parser.ParseFile(file)
        self.stack.pop()
        file.close()
    def parse(self, rnn_path, filename):
        """Entry point: parse filename, resolving <import>s against rnn_path."""
        self.path = rnn_path
        self.stack = []
        self.do_parse(filename)
    def parse_reg(self, attrs, bit_size):
        """Handle a <reg32>/<reg64> element: build the Reg, wire up its
        (possibly inline) bitset, and record variants/usages."""
        self.current_bitsize = bit_size
        if "type" in attrs and attrs["type"] in self.bitsets:
            # Register typed by a named bitset; inline bitsets are cloned
            # so per-register dumps get the register's own name.
            bitset = self.bitsets[attrs["type"]]
            if bitset.inline:
                self.current_bitset = Bitset(attrs["name"], bitset)
                self.current_bitset.inline = True
            else:
                self.current_bitset = bitset
        else:
            # Scalar type (or no type): synthesize a one-field inline bitset.
            self.current_bitset = Bitset(attrs["name"], None)
            self.current_bitset.inline = True
            if "type" in attrs:
                self.parse_field(None, attrs)
        variant = self.parse_variants(attrs)
        if not variant and self.current_array:
            variant = self.current_array.variant
        self.current_reg = Reg(attrs, self.prefix(variant), self.current_array, bit_size)
        self.current_reg.bitset = self.current_bitset
        # Only top-of-stack (non-imported) files contribute to the output.
        if len(self.stack) == 1:
            self.file.append(self.current_reg)
        if variant is not None:
            self.add_all_variants(self.current_reg, attrs, variant)
        usages = None
        if "usage" in attrs:
            usages = attrs["usage"].split(',')
        elif self.current_array:
            # Registers inside an array inherit the array's usages.
            usages = self.current_array.usages
        self.add_all_usages(self.current_reg, usages)
    def start_element(self, name, attrs):
        """expat StartElementHandler: dispatch on the element name."""
        self.cdata = ""
        if name == "import":
            filename = attrs["file"]
            self.do_parse(os.path.join(self.path, filename))
        elif name == "domain":
            self.current_domain = attrs["name"]
            if "prefix" in attrs:
                self.current_prefix = self.parse_variants(attrs)
                self.current_prefix_type = attrs["prefix"]
            else:
                self.current_prefix = None
                self.current_prefix_type = None
            if "varset" in attrs:
                self.current_varset = self.enums[attrs["varset"]]
        elif name == "stripe":
            self.current_stripe = self.parse_variants(attrs)
        elif name == "enum":
            self.current_enum_value = 0
            self.current_enum = Enum(attrs["name"])
            self.enums[attrs["name"]] = self.current_enum
            if len(self.stack) == 1:
                self.file.append(self.current_enum)
        elif name == "value":
            if "value" in attrs:
                value = int(attrs["value"], 0)
            else:
                value = self.current_enum_value
            self.current_enum.values.append((attrs["name"], value))
        elif name == "reg32":
            self.parse_reg(attrs, 32)
        elif name == "reg64":
            self.parse_reg(attrs, 64)
        elif name == "array":
            self.current_bitsize = 32
            variant = self.parse_variants(attrs)
            self.current_array = Array(attrs, self.prefix(variant), variant)
            if len(self.stack) == 1:
                self.file.append(self.current_array)
        elif name == "bitset":
            self.current_bitset = Bitset(attrs["name"], None)
            if "inline" in attrs and attrs["inline"] == "yes":
                self.current_bitset.inline = True
            self.bitsets[self.current_bitset.name] = self.current_bitset
            if len(self.stack) == 1 and not self.current_bitset.inline:
                self.file.append(self.current_bitset)
        elif name == "bitfield" and self.current_bitset:
            self.parse_field(attrs["name"], attrs)
        elif name == "database":
            self.do_validate(attrs["xsi:schemaLocation"])
        elif name == "copyright":
            self.copyright_year = attrs["year"]
        elif name == "author":
            # NOTE(review): attrs["name"] is appended both before and after
            # the e-mail address; the trailing one was presumably meant to
            # be the <nick> (or dropped) -- confirm before relying on it.
            self.authors.append(attrs["name"] + " <" + attrs["email"] + "> " + attrs["name"])
    def end_element(self, name):
        """expat EndElementHandler: pop the matching piece of cursor state."""
        if name == "domain":
            self.current_domain = None
            self.current_prefix = None
            self.current_prefix_type = None
        elif name == "stripe":
            self.current_stripe = None
        elif name == "bitset":
            self.current_bitset = None
        elif name == "reg32":
            self.current_reg = None
        elif name == "array":
            self.current_array = None
        elif name == "enum":
            self.current_enum = None
        elif name == "license":
            # License body arrived as character data while inside <license>.
            self.license = self.cdata
    def character_data(self, data):
        """expat CharacterDataHandler: accumulate element text."""
        self.cdata += data
    def dump_reg_usages(self):
        """Emit per-(usage, variant) C++ constexpr tables of register offsets."""
        d = collections.defaultdict(list)
        for usage, regs in self.usage_regs.items():
            for reg in regs:
                variants = self.variant_regs.get(reg.name)
                if variants:
                    for variant, vreg in variants.items():
                        if reg == vreg:
                            d[(usage, variant)].append(reg)
                else:
                    # No per-variant version: the reg exists in all variants.
                    for variant in self.variants:
                        d[(usage, variant)].append(reg)
        print("#ifdef __cplusplus")
        for usage, regs in self.usage_regs.items():
            # Primary template: empty table for variants with no entries.
            print("template<chip CHIP> constexpr inline uint16_t %s_REGS[] = {};" % (usage.upper()))
        for (usage, variant), regs in d.items():
            offsets = []
            for reg in regs:
                if reg.array:
                    for i in range(reg.array.length):
                        offsets.append(reg.array.offset + reg.offset + i * reg.array.stride)
                        if reg.bit_size == 64:
                            # 64-bit regs occupy two consecutive dword slots.
                            offsets.append(offsets[-1] + 1)
                else:
                    offsets.append(reg.offset)
                    if reg.bit_size == 64:
                        offsets.append(offsets[-1] + 1)
            offsets.sort()
            print("template<> constexpr inline uint16_t %s_REGS<%s>[] = {" % (usage.upper(), variant))
            for offset in offsets:
                print("\t%s," % hex(offset))
            print("};")
        print("#endif")
    def dump(self):
        """Emit the full C header: enums, bitsets, regs, usage tables and
        the command-name reverse lookup."""
        enums = []
        bitsets = []
        regs = []
        for e in self.file:
            if isinstance(e, Enum):
                enums.append(e)
            elif isinstance(e, Bitset):
                bitsets.append(e)
            else:
                regs.append(e)
        for e in enums + bitsets + regs:
            e.dump()
        self.dump_reg_usages()
        # Reverse lookup helper, used when decoding/debugging cmd streams.
        print("static inline char *ethosu_get_cmd_name(unsigned domain, uint32_t cmd) {")
        for e in regs:
            if e.array:
                continue
            # CMD0 maps to domain 0; anything else (CMD1) to domain 1.
            domain = 0 if e.domain == "CMD0" else 1
            print(" if (domain == %d && cmd == 0x%08x) return \"%s\";" % (domain, e.offset, e.full_name))
        print(" return NULL;")
        print("}\n")
    def dump_regs_py(self):
        """Emit register offsets as Python 'REG_*' assignments."""
        regs = []
        for e in self.file:
            if isinstance(e, Reg):
                regs.append(e)
        for e in regs:
            e.dump_py()
    def dump_reg_variants(self, regname, variants):
        """Emit a C++ templated field-struct + builder for a register whose
        layout differs between hardware variants."""
        # Don't bother for things that only have a single variant:
        if len(variants) == 1:
            return
        print("#ifdef __cplusplus")
        print("struct __%s {" % regname)
        # TODO be more clever.. we should probably figure out which
        # fields have the same type in all variants (in which they
        # appear) and stuff everything else in a variant specific
        # sub-structure.
        seen_fields = []
        bit_size = 32
        array = False
        address = None
        for variant in variants.keys():
            print(" /* %s fields: */" % variant)
            reg = variants[variant]
            bit_size = reg.bit_size
            array = reg.array
            for f in reg.bitset.fields:
                fld_name = field_name(reg, f)
                if fld_name in seen_fields:
                    continue
                seen_fields.append(fld_name)
                name = fld_name.lower()
                if f.type in [ "address", "waddress" ]:
                    # Address fields collapse to a single bo + offset pair.
                    if address:
                        continue
                    address = f
                    tab_to(" __bo_type", "bo;")
                    tab_to(" uint32_t", "bo_offset;")
                    continue
                type, val = f.ctype("var")
                tab_to(" %s" %type, "%s;" %name)
        print(" /* fallback fields: */")
        if bit_size == 64:
            tab_to(" uint64_t", "unknown;")
            tab_to(" uint64_t", "qword;")
        else:
            tab_to(" uint32_t", "unknown;")
            tab_to(" uint32_t", "dword;")
        print("};")
        # TODO don't hardcode the varset enum name
        varenum = "chip"
        print("template <%s %s>" % (varenum, varenum.upper()))
        print("static inline struct fd_reg_pair")
        xtra = ""
        xtravar = ""
        if array:
            # Array members need the extra index parameter.
            xtra = "int __i, "
            xtravar = "__i, "
        print("__%s(%sstruct __%s fields) {" % (regname, xtra, regname))
        for variant in variants.keys():
            print(" if (%s == %s) {" % (varenum.upper(), variant))
            reg = variants[variant]
            reg.dump_regpair_builder()
            print(" } else")
        print(" assert(!\"invalid variant\");")
        print("}")
        if bit_size == 64:
            skip = ", { .reg = 0 }"
        else:
            skip = ""
        print("#define %s(VARIANT, %s...) __%s<VARIANT>(%s{__VA_ARGS__})%s" % (regname, xtravar, regname, xtravar, skip))
        print("#endif /* __cplusplus */")
    def dump_structs(self):
        """Emit all pack-structs plus the multi-variant builders."""
        for e in self.file:
            e.dump_pack_struct()
        for regname in self.variant_regs:
            self.dump_reg_variants(regname, self.variant_regs[regname])

View file

@ -0,0 +1,33 @@
# Copyright 2019 Google, Inc
# SPDX-License-Identifier: MIT
# Generate ethosu_registers.h at build time from the rules-ng-ng register
# database.  @INPUT1@ is gen_header.py and @INPUT2@ is registers.xml;
# gen_parser.py is listed presumably so edits to it retrigger the
# generation (gen_header.py likely imports it -- confirm).
# The generator prints the header to stdout, hence capture : true.
ethosu_registers = custom_target(
  'ethosu_registers.h',
  input : ['gen_parser.py', 'gen_header.py', 'registers.xml'],
  output : 'ethosu_registers.h',
  command : [prog_python, '@INPUT1@', '--rnn', '.', '--xml', '@INPUT2@', 'c-defines'],
  capture : true,
)

# Driver sources, including the vendored MLW weight-compression codec.
files_ethosu = files(
  'ethosu_cmd.c',
  'ethosu_coefs.c',
  'ethosu_device.c',
  'ethosu_lower.c',
  'ethosu_ml.c',
  'ethosu_sched.c',
  'mlw_codec/mlw_encode.c',
)

libethosu = static_library(
  'ethosu',
  [files_ethosu, ethosu_registers],
  include_directories : [inc_gallium_aux, inc_gallium, inc_include, inc_src],
  gnu_symbol_visibility : 'hidden',
  dependencies : [idep_mesautil, dep_libdrm],
)

# Dependency consumed by targets that link the gallium ethosu driver in.
# NOTE(review): libethosuwinsys is defined in another meson.build (winsys).
driver_ethosu = declare_dependency(
  compile_args : '-DGALLIUM_ETHOSU',
  link_with : [libethosuwinsys, libethosu]
)

View file

@ -0,0 +1,29 @@
/*
* SPDX-FileCopyrightText: Copyright 2020, 2022 Arm Limited and/or its affiliates <open-source-office@arm.com>
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* Shared constants for the MLW weight codec (see mlw_encode.c). */
#ifndef MLW_COMMON_H
#define MLW_COMMON_H

/* Moved inside the include guard so repeated inclusion is guarded too.
 * NOTE(review): no stdint types are used by these macros; kept only in
 * case existing consumers rely on the transitive include -- confirm. */
#include <stdint.h>

#define ZDIV_DISABLE 6      // not alternating mode
#define ZDIV_EOS 7          // indicates end of stream
#define WDIV_UNCOMPRESSED 7 // indicates uncompressed weights

#endif /* MLW_COMMON_H */

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,65 @@
/*
* SPDX-FileCopyrightText: Copyright 2020-2022 Arm Limited and/or its affiliates <open-source-office@arm.com>
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* Public interface of the MLW weight encoder.
 * NOTE(review): semantics of the parameters are inferred from their
 * names; confirm against mlw_encode.c. */
#ifndef MLW_ENCODE_H
#define MLW_ENCODE_H

/* Inside the guard so repeated inclusion is also guarded. */
#include <stdint.h>

#ifdef _MSC_VER
#define MLW_ENCODE_EXPORTED __declspec(dllexport)
#else
#define MLW_ENCODE_EXPORTED __attribute__((visibility("default")))
#endif

/* Use #ifdef rather than #if: __cplusplus is undefined in C translation
 * units, and "#if __cplusplus" trips -Wundef. */
#ifdef __cplusplus
extern "C"
{
#endif

/* Compress inbuf_size int16 weights; *outbuf is allocated by the callee
 * and must be released with mlw_free_outbuf().  Returns the encoded
 * length (presumably in bytes -- confirm against the implementation). */
MLW_ENCODE_EXPORTED
int mlw_encode(int16_t *inbuf, int inbuf_size, uint8_t **outbuf, int verbose);

/* Free a buffer returned via the outbuf out-parameters above/below. */
MLW_ENCODE_EXPORTED
void mlw_free_outbuf(uint8_t *outbuf);

/* Reorder a weight brick into HW layout and encode it; *outbuf is
 * callee-allocated (free with mlw_free_outbuf()) and *padded_length
 * receives the padded output length. */
MLW_ENCODE_EXPORTED
int mlw_reorder_encode(
    int ifm_ublock_depth,
    int ofm_ublock_depth,
    int ofm_depth,
    int kernel_height,
    int kernel_width,
    int ifm_depth,
    int* brick_strides,
    int16_t* inbuf,
    int ofm_block_depth,
    int is_depthwise,
    int is_partkernel,
    int ifm_bitdepth,
    int decomp_h,
    int decomp_w,
    uint8_t **outbuf,
    int64_t* padded_length,
    int verbose);

#ifdef __cplusplus
}
#endif

#endif /* MLW_ENCODE_H */

View file

@ -0,0 +1,399 @@
<?xml version="1.0" encoding="UTF-8"?>
<database xmlns="http://nouveau.freedesktop.org/"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://nouveau.freedesktop.org/ rules-ng.xsd">
<copyright year="2024">
<author name="Tomeu Vizoso" email="tomeu@tomeuvizoso.net"><nick name="tomeu"/>
Initial Author.
</author>
<license>
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice (including the
next paragraph) shall be included in all copies or substantial
portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
</license>
</copyright>
<domain name="CMD0" width="32">
<reg32 offset="0x0" name="NPU_OP_STOP">
<bitfield name="mask" low="0" high="15" type="uint"/>
</reg32>
<reg32 offset="0x1" name="NPU_OP_IRQ">
<bitfield name="mask" low="0" high="15" type="uint"/>
</reg32>
<reg32 offset="0x2" name="NPU_OP_CONV">
<bitfield name="reserved0" low="0" high="15" type="uint"/>
</reg32>
<reg32 offset="0x3" name="NPU_OP_DEPTHWISE">
<bitfield name="reserved0" low="0" high="15" type="uint"/>
</reg32>
<reg32 offset="0x5" name="NPU_OP_POOL">
<bitfield name="mode" low="0" high="15" type="uint"/>
</reg32>
<reg32 offset="0x6" name="NPU_OP_ELEMENTWISE">
<bitfield name="mode" low="0" high="15" type="uint"/>
</reg32>
<reg32 offset="0x10" name="NPU_OP_DMA_START">
<bitfield name="channel_mode" low="0" high="15" type="uint"/>
</reg32>
<reg32 offset="0x11" name="NPU_OP_DMA_WAIT">
<bitfield name="reserved0" low="0" high="15" type="uint"/>
</reg32>
<reg32 offset="0x12" name="NPU_OP_KERNEL_WAIT">
<bitfield name="param" low="0" high="15" type="uint"/>
</reg32>
<reg32 offset="0x13" name="NPU_OP_PMU_MASK">
<bitfield name="param" low="0" high="15" type="uint"/>
</reg32>
<reg32 offset="0x100" name="NPU_SET_IFM_PAD_TOP">
<bitfield name="param" low="0" high="15" type="uint"/>
</reg32>
<reg32 offset="0x101" name="NPU_SET_IFM_PAD_LEFT">
<bitfield name="param" low="0" high="15" type="uint"/>
</reg32>
<reg32 offset="0x102" name="NPU_SET_IFM_PAD_RIGHT">
<bitfield name="param" low="0" high="15" type="uint"/>
</reg32>
<reg32 offset="0x103" name="NPU_SET_IFM_PAD_BOTTOM">
<bitfield name="param" low="0" high="15" type="uint"/>
</reg32>
<reg32 offset="0x104" name="NPU_SET_IFM_DEPTH_M1">
<bitfield name="param" low="0" high="15" type="uint"/>
</reg32>
<reg32 offset="0x105" name="NPU_SET_IFM_PRECISION">
<bitfield name="round_mode" low="14" high="15" type="uint"/>
<bitfield name="reserved2" low="10" high="13" type="uint"/>
<bitfield name="scale_mode" low="8" high="9" type="uint"/>
<bitfield name="format" low="6" high="7" type="uint"/>
<bitfield name="reserved1" low="4" high="5" type="uint"/>
<bitfield name="precision" low="2" high="3" type="uint"/>
<bitfield name="reserved0" low="1" high="1" type="uint"/>
<bitfield name="activation" low="0" high="0" type="uint"/>
</reg32>
<reg32 offset="0x107" name="NPU_SET_IFM_UPSCALE">
<bitfield name="reserved0" low="2" high="15" type="uint"/>
<bitfield name="mode" low="0" high="1" type="uint"/>
</reg32>
<reg32 offset="0x109" name="NPU_SET_IFM_ZERO_POINT">
<bitfield name="param" low="0" high="15" type="uint"/>
</reg32>
<reg32 offset="0x10a" name="NPU_SET_IFM_WIDTH0_M1">
<bitfield name="param" low="0" high="15" type="uint"/>
</reg32>
<reg32 offset="0x10b" name="NPU_SET_IFM_HEIGHT0_M1">
<bitfield name="param" low="0" high="15" type="uint"/>
</reg32>
<reg32 offset="0x10c" name="NPU_SET_IFM_HEIGHT1_M1">
<bitfield name="param" low="0" high="15" type="uint"/>
</reg32>
<reg32 offset="0x10d" name="NPU_SET_IFM_IB_END">
<bitfield name="param" low="0" high="15" type="uint"/>
</reg32>
<reg32 offset="0x10f" name="NPU_SET_IFM_REGION">
<bitfield name="param" low="0" high="15" type="uint"/>
</reg32>
<reg32 offset="0x111" name="NPU_SET_OFM_WIDTH_M1">
<bitfield name="param" low="0" high="15" type="uint"/>
</reg32>
<reg32 offset="0x112" name="NPU_SET_OFM_HEIGHT_M1">
<bitfield name="param" low="0" high="15" type="uint"/>
</reg32>
<reg32 offset="0x113" name="NPU_SET_OFM_DEPTH_M1">
<bitfield name="param" low="0" high="15" type="uint"/>
</reg32>
<reg32 offset="0x114" name="NPU_SET_OFM_PRECISION">
<bitfield name="round_mode" low="14" high="15" type="uint"/>
<bitfield name="reserved1" low="9" high="13" type="uint"/>
<bitfield name="scale_mode" low="8" high="8" type="uint"/>
<bitfield name="format" low="6" high="7" type="uint"/>
<bitfield name="reserved0" low="3" high="5" type="uint"/>
<bitfield name="precision" low="1" high="2" type="uint"/>
<bitfield name="activation" low="0" high="0" type="uint"/>
</reg32>
<reg32 offset="0x115" name="NPU_SET_OFM_BLK_WIDTH_M1">
<bitfield name="param" low="0" high="15" type="uint"/>
</reg32>
<reg32 offset="0x116" name="NPU_SET_OFM_BLK_HEIGHT_M1">
<bitfield name="param" low="0" high="15" type="uint"/>
</reg32>
<reg32 offset="0x117" name="NPU_SET_OFM_BLK_DEPTH_M1">
<bitfield name="param" low="0" high="15" type="uint"/>
</reg32>
<reg32 offset="0x118" name="NPU_SET_OFM_ZERO_POINT">
<bitfield name="param" low="0" high="15" type="uint"/>
</reg32>
<reg32 offset="0x11a" name="NPU_SET_OFM_WIDTH0_M1">
<bitfield name="param" low="0" high="15" type="uint"/>
</reg32>
<reg32 offset="0x11b" name="NPU_SET_OFM_HEIGHT0_M1">
<bitfield name="param" low="0" high="15" type="uint"/>
</reg32>
<reg32 offset="0x11c" name="NPU_SET_OFM_HEIGHT1_M1">
<bitfield name="param" low="0" high="15" type="uint"/>
</reg32>
<reg32 offset="0x11f" name="NPU_SET_OFM_REGION">
<bitfield name="param" low="0" high="15" type="uint"/>
</reg32>
<reg32 offset="0x120" name="NPU_SET_KERNEL_WIDTH_M1">
<bitfield name="param" low="0" high="15" type="uint"/>
</reg32>
<reg32 offset="0x121" name="NPU_SET_KERNEL_HEIGHT_M1">
<bitfield name="param" low="0" high="15" type="uint"/>
</reg32>
<reg32 offset="0x122" name="NPU_SET_KERNEL_STRIDE">
<bitfield name="param" low="0" high="15" type="uint"/>
</reg32>
<reg32 offset="0x123" name="NPU_SET_PARALLEL_MODE">
<bitfield name="param" low="0" high="15" type="uint"/>
</reg32>
<reg32 offset="0x124" name="NPU_SET_ACC_FORMAT">
<bitfield name="param" low="0" high="15" type="uint"/>
</reg32>
<reg32 offset="0x125" name="NPU_SET_ACTIVATION">
<bitfield name="act_clip_range" low="12" high="15" type="uint"/>
<bitfield name="type" low="0" high="11" type="uint"/>
</reg32>
<reg32 offset="0x126" name="NPU_SET_ACTIVATION_MIN">
<bitfield name="param" low="0" high="15" type="uint"/>
</reg32>
<reg32 offset="0x127" name="NPU_SET_ACTIVATION_MAX">
<bitfield name="param" low="0" high="15" type="uint"/>
</reg32>
<reg32 offset="0x128" name="NPU_SET_WEIGHT_REGION">
<bitfield name="param" low="0" high="15" type="uint"/>
</reg32>
<reg32 offset="0x129" name="NPU_SET_SCALE_REGION">
<bitfield name="param" low="0" high="15" type="uint"/>
</reg32>
<reg32 offset="0x12d" name="NPU_SET_AB_START">
<bitfield name="param" low="0" high="15" type="uint"/>
</reg32>
<reg32 offset="0x12f" name="NPU_SET_BLOCKDEP">
<bitfield name="param" low="0" high="15" type="uint"/>
</reg32>
<reg32 offset="0x130" name="NPU_SET_DMA0_SRC_REGION">
<bitfield name="reserved0" low="11" high="15" type="uint"/>
<bitfield name="stride_mode" low="9" high="10" type="uint"/>
<bitfield name="internal" low="8" high="8" type="uint"/>
<bitfield name="region" low="0" high="7" type="uint"/>
</reg32>
<reg32 offset="0x131" name="NPU_SET_DMA0_DST_REGION">
<bitfield name="reserved0" low="11" high="15" type="uint"/>
<bitfield name="stride_mode" low="9" high="10" type="uint"/>
<bitfield name="internal" low="8" high="8" type="uint"/>
<bitfield name="region" low="0" high="7" type="uint"/>
</reg32>
<reg32 offset="0x132" name="NPU_SET_DMA0_SIZE0">
<bitfield name="param" low="0" high="15" type="uint"/>
</reg32>
<reg32 offset="0x133" name="NPU_SET_DMA0_SIZE1">
<bitfield name="param" low="0" high="15" type="uint"/>
</reg32>
<reg32 offset="0x180" name="NPU_SET_IFM2_BROADCAST">
<bitfield name="reserved1" low="8" high="15" type="uint"/>
<bitfield name="broadcast_scalar" low="7" high="7" type="uint"/>
<bitfield name="operand_order" low="6" high="6" type="uint"/>
<bitfield name="reserved0" low="3" high="5" type="uint"/>
<bitfield name="broadcast_depth" low="2" high="2" type="uint"/>
<bitfield name="broadcast_width" low="1" high="1" type="uint"/>
<bitfield name="broadcast_height" low="0" high="0" type="uint"/>
</reg32>
<reg32 offset="0x181" name="NPU_SET_IFM2_SCALAR">
<bitfield name="param" low="0" high="15" type="uint"/>
</reg32>
<reg32 offset="0x185" name="NPU_SET_IFM2_PRECISION">
<bitfield name="reserved1" low="8" high="15" type="uint"/>
<bitfield name="format" low="6" high="7" type="uint"/>
<bitfield name="reserved0" low="4" high="5" type="uint"/>
<bitfield name="precision" low="0" high="3" type="uint"/>
</reg32>
<reg32 offset="0x189" name="NPU_SET_IFM2_ZERO_POINT">
<bitfield name="param" low="0" high="15" type="uint"/>
</reg32>
<reg32 offset="0x18a" name="NPU_SET_IFM2_WIDTH0_M1">
<bitfield name="param" low="0" high="15" type="uint"/>
</reg32>
<reg32 offset="0x18b" name="NPU_SET_IFM2_HEIGHT0_M1">
<bitfield name="param" low="0" high="15" type="uint"/>
</reg32>
<reg32 offset="0x18c" name="NPU_SET_IFM2_HEIGHT1_M1">
<bitfield name="param" low="0" high="15" type="uint"/>
</reg32>
<reg32 offset="0x18d" name="NPU_SET_IFM2_IB_START">
<bitfield name="param" low="0" high="15" type="uint"/>
</reg32>
<reg32 offset="0x18f" name="NPU_SET_IFM2_REGION">
<bitfield name="param" low="0" high="15" type="uint"/>
</reg32>
</domain>
<domain name="CMD1" width="32">
<reg32 offset="0x0" name="NPU_SET_IFM_BASE0">
<bitfield name="reserved0" low="2" high="17" type="uint"/>
<bitfield name="payload_size" low="0" high="1" type="uint"/>
</reg32>
<reg32 offset="0x1" name="NPU_SET_IFM_BASE1">
<bitfield name="reserved0" low="2" high="17" type="uint"/>
<bitfield name="payload_size" low="0" high="1" type="uint"/>
</reg32>
<reg32 offset="0x2" name="NPU_SET_IFM_BASE2">
<bitfield name="reserved0" low="2" high="17" type="uint"/>
<bitfield name="payload_size" low="0" high="1" type="uint"/>
</reg32>
<reg32 offset="0x3" name="NPU_SET_IFM_BASE3">
<bitfield name="reserved0" low="2" high="17" type="uint"/>
<bitfield name="payload_size" low="0" high="1" type="uint"/>
</reg32>
<reg32 offset="0x4" name="NPU_SET_IFM_STRIDE_X">
<bitfield name="reserved0" low="2" high="17" type="uint"/>
<bitfield name="payload_size" low="0" high="1" type="uint"/>
</reg32>
<reg32 offset="0x5" name="NPU_SET_IFM_STRIDE_Y">
<bitfield name="reserved0" low="2" high="17" type="uint"/>
<bitfield name="payload_size" low="0" high="1" type="uint"/>
</reg32>
<reg32 offset="0x6" name="NPU_SET_IFM_STRIDE_C">
<bitfield name="reserved0" low="2" high="17" type="uint"/>
<bitfield name="payload_size" low="0" high="1" type="uint"/>
</reg32>
<reg32 offset="0x10" name="NPU_SET_OFM_BASE0">
<bitfield name="reserved0" low="2" high="17" type="uint"/>
<bitfield name="payload_size" low="0" high="1" type="uint"/>
</reg32>
<reg32 offset="0x11" name="NPU_SET_OFM_BASE1">
<bitfield name="reserved0" low="2" high="17" type="uint"/>
<bitfield name="payload_size" low="0" high="1" type="uint"/>
</reg32>
<reg32 offset="0x12" name="NPU_SET_OFM_BASE2">
<bitfield name="reserved0" low="2" high="17" type="uint"/>
<bitfield name="payload_size" low="0" high="1" type="uint"/>
</reg32>
<reg32 offset="0x13" name="NPU_SET_OFM_BASE3">
<bitfield name="reserved0" low="2" high="17" type="uint"/>
<bitfield name="payload_size" low="0" high="1" type="uint"/>
</reg32>
<reg32 offset="0x14" name="NPU_SET_OFM_STRIDE_X">
<bitfield name="reserved0" low="2" high="17" type="uint"/>
<bitfield name="payload_size" low="0" high="1" type="uint"/>
</reg32>
<reg32 offset="0x15" name="NPU_SET_OFM_STRIDE_Y">
<bitfield name="reserved0" low="2" high="17" type="uint"/>
<bitfield name="payload_size" low="0" high="1" type="uint"/>
</reg32>
<reg32 offset="0x16" name="NPU_SET_OFM_STRIDE_C">
<bitfield name="reserved0" low="2" high="17" type="uint"/>
<bitfield name="payload_size" low="0" high="1" type="uint"/>
</reg32>
<reg32 offset="0x20" name="NPU_SET_WEIGHT_BASE">
<bitfield name="reserved0" low="2" high="17" type="uint"/>
<bitfield name="payload_size" low="0" high="1" type="uint"/>
</reg32>
<reg32 offset="0x21" name="NPU_SET_WEIGHT_LENGTH">
<bitfield name="reserved0" low="2" high="17" type="uint"/>
<bitfield name="payload_size" low="0" high="1" type="uint"/>
</reg32>
<reg32 offset="0x22" name="NPU_SET_SCALE_BASE">
<bitfield name="reserved0" low="2" high="17" type="uint"/>
<bitfield name="payload_size" low="0" high="1" type="uint"/>
</reg32>
<reg32 offset="0x23" name="NPU_SET_SCALE_LENGTH">
<bitfield name="reserved0" low="2" high="17" type="uint"/>
<bitfield name="payload_size" low="0" high="1" type="uint"/>
</reg32>
<reg32 offset="0x24" name="NPU_SET_OFM_SCALE">
<bitfield name="shift" low="2" high="17" type="uint"/>
<bitfield name="payload_size" low="0" high="1" type="uint"/>
</reg32>
<reg32 offset="0x25" name="NPU_SET_OPA_SCALE">
<bitfield name="shift" low="2" high="17" type="uint"/>
<bitfield name="payload_size" low="0" high="1" type="uint"/>
</reg32>
<reg32 offset="0x26" name="NPU_SET_OPB_SCALE">
<bitfield name="reserved0" low="2" high="17" type="uint"/>
<bitfield name="payload_size" low="0" high="1" type="uint"/>
</reg32>
<reg32 offset="0x30" name="NPU_SET_DMA0_SRC">
<bitfield name="reserved0" low="2" high="17" type="uint"/>
<bitfield name="payload_size" low="0" high="1" type="uint"/>
</reg32>
<reg32 offset="0x31" name="NPU_SET_DMA0_DST">
<bitfield name="reserved0" low="2" high="17" type="uint"/>
<bitfield name="payload_size" low="0" high="1" type="uint"/>
</reg32>
<reg32 offset="0x32" name="NPU_SET_DMA0_LEN">
<bitfield name="reserved0" low="2" high="17" type="uint"/>
<bitfield name="payload_size" low="0" high="1" type="uint"/>
</reg32>
<reg32 offset="0x33" name="NPU_SET_DMA0_SKIP0">
<bitfield name="param" low="2" high="17" type="uint"/>
<bitfield name="payload_size" low="0" high="1" type="uint"/>
</reg32>
<reg32 offset="0x34" name="NPU_SET_DMA0_SKIP1">
<bitfield name="param" low="2" high="17" type="uint"/>
<bitfield name="payload_size" low="0" high="1" type="uint"/>
</reg32>
<reg32 offset="0x80" name="NPU_SET_IFM2_BASE0">
<bitfield name="reserved0" low="2" high="17" type="uint"/>
<bitfield name="payload_size" low="0" high="1" type="uint"/>
</reg32>
<reg32 offset="0x81" name="NPU_SET_IFM2_BASE1">
<bitfield name="reserved0" low="2" high="17" type="uint"/>
<bitfield name="payload_size" low="0" high="1" type="uint"/>
</reg32>
<reg32 offset="0x82" name="NPU_SET_IFM2_BASE2">
<bitfield name="reserved0" low="2" high="17" type="uint"/>
<bitfield name="payload_size" low="0" high="1" type="uint"/>
</reg32>
<reg32 offset="0x83" name="NPU_SET_IFM2_BASE3">
<bitfield name="reserved0" low="2" high="17" type="uint"/>
<bitfield name="payload_size" low="0" high="1" type="uint"/>
</reg32>
<reg32 offset="0x84" name="NPU_SET_IFM2_STRIDE_X">
<bitfield name="reserved0" low="2" high="17" type="uint"/>
<bitfield name="payload_size" low="0" high="1" type="uint"/>
</reg32>
<reg32 offset="0x85" name="NPU_SET_IFM2_STRIDE_Y">
<bitfield name="reserved0" low="2" high="17" type="uint"/>
<bitfield name="payload_size" low="0" high="1" type="uint"/>
</reg32>
<reg32 offset="0x86" name="NPU_SET_IFM2_STRIDE_C">
<bitfield name="reserved0" low="2" high="17" type="uint"/>
<bitfield name="payload_size" low="0" high="1" type="uint"/>
</reg32>
<reg32 offset="0x90" name="NPU_SET_WEIGHT1_BASE">
<bitfield name="param" low="2" high="17" type="uint"/>
<bitfield name="payload_size" low="0" high="1" type="uint"/>
</reg32>
<reg32 offset="0x91" name="NPU_SET_WEIGHT1_LENGTH">
<bitfield name="reserved0" low="2" high="17" type="uint"/>
<bitfield name="payload_size" low="0" high="1" type="uint"/>
</reg32>
<reg32 offset="0x92" name="NPU_SET_SCALE1_BASE">
<bitfield name="param" low="2" high="17" type="uint"/>
<bitfield name="payload_size" low="0" high="1" type="uint"/>
</reg32>
<reg32 offset="0x93" name="NPU_SET_SCALE1_LENGTH">
<bitfield name="reserved0" low="2" high="17" type="uint"/>
<bitfield name="payload_size" low="0" high="1" type="uint"/>
</reg32>
</domain>
</database>

View file

@ -0,0 +1,457 @@
<?xml version="1.0" encoding="UTF-8"?>
<schema xmlns="http://www.w3.org/2001/XMLSchema"
targetNamespace="http://nouveau.freedesktop.org/"
xmlns:rng="http://nouveau.freedesktop.org/"
elementFormDefault="qualified">
<annotation>
<documentation>
An updated version of the old rules.xml file from the
RivaTV project. Specifications by Pekka Paalanen,
preliminary attempt by KoalaBR,
first working version by Jakob Bornecrantz.
For specifications, see the file rules-ng-format.txt
in Nouveau CVS module 'rules-ng'.
</documentation>
<documentation>Version 0.1</documentation>
</annotation>
<!-- Elements -->
<element name="database" type="rng:databaseType" />
<element name="import" type="rng:importType" />
<element name="copyright" type="rng:copyrightType" />
<element name="domain" type="rng:domainType" />
<element name="group" type="rng:groupType" />
<element name="use-group" type="rng:refType" />
<element name="array" type="rng:arrayType" />
<element name="stripe" type="rng:stripeType" />
<element name="reg64" type="rng:registerType" />
<element name="reg32" type="rng:registerType" />
<element name="reg16" type="rng:registerType" />
<element name="reg8" type="rng:registerType" />
<element name="bitset" type="rng:bitsetType" />
<element name="bitfield" type="rng:bitfieldType" />
<element name="enum" type="rng:enumType" />
<element name="value" type="rng:valueType" />
<!-- Copyright elements -->
<element name="author" type="rng:authorType" />
<element name="nick" type="rng:nickType" />
<element name="license" type="rng:docType" />
<!-- Documentation elements -->
<!-- FIXME: allowed only one per parent element -->
<element name="brief" type="rng:briefType" />
<element name="doc" type="rng:docType" />
<element name="b" type="rng:textformatType" />
<element name="i" type="rng:textformatType" />
<element name="u" type="rng:textformatType" />
<element name="code" type="rng:textcodeType" />
<element name="ul" type="rng:listType" />
<element name="ol" type="rng:listType" />
<element name="li" type="rng:listitemType" />
<!-- Copyright element types -->
<complexType name="authorType" mixed="true">
<annotation>
<documentation>
register database author
</documentation>
</annotation>
<choice minOccurs="0" maxOccurs="unbounded">
<element ref="rng:nick" />
</choice>
<attribute name="name" type="string" use="required" />
<attribute name="email" type="string" use="required" />
</complexType>
<complexType name="nickType">
<annotation>
<documentation>nickType</documentation>
</annotation>
<attribute name="name" type="string" use="required" />
</complexType>
<!-- Database element types -->
<complexType name="databaseType">
<annotation>
<documentation>databaseType</documentation>
</annotation>
<choice minOccurs="0" maxOccurs="unbounded">
<group ref="rng:docGroup" />
<group ref="rng:topGroup" />
</choice>
</complexType>
<complexType name="importType">
<annotation>
<documentation>importType</documentation>
</annotation>
<attribute name="file" type="string" use="required" />
</complexType>
<complexType name="copyrightType">
<annotation>
<documentation>copyrightType</documentation>
</annotation>
<choice minOccurs="0" maxOccurs="unbounded">
<group ref="rng:docGroup" />
<group ref="rng:topGroup" />
<element ref="rng:author" />
<element ref="rng:license" />
</choice>
<attribute name="year" type="nonNegativeInteger" use="optional" />
</complexType>
<complexType name="domainType">
<annotation>
<documentation>domainType</documentation>
</annotation>
<choice minOccurs="0" maxOccurs="unbounded">
<group ref="rng:docGroup" />
<group ref="rng:topGroup" />
<group ref="rng:regarrayGroup" />
</choice>
<attribute name="name" type="NMTOKEN" use="required" />
<attribute name="bare" type="rng:Boolean" use="optional" />
<attribute name="prefix" type="NMTOKENS" use="optional" />
<attribute name="width" type="rng:DomainWidth" use="optional" />
<attribute name="size" type="rng:HexOrNumber" use="optional" />
<attribute name="varset" type="NMTOKEN" use="optional" />
<attribute name="variants" type="string" use="optional" />
</complexType>
<complexType name="groupType">
<annotation>
<documentation>groupType</documentation>
</annotation>
<choice minOccurs="0" maxOccurs="unbounded">
<group ref="rng:docGroup" />
<group ref="rng:topGroup" />
<group ref="rng:regarrayGroup" />
</choice>
<attribute name="name" type="NMTOKEN" use="required" />
</complexType>
<complexType name="arrayType">
<annotation>
<documentation>arrayType</documentation>
</annotation>
<choice minOccurs="0" maxOccurs="unbounded">
<group ref="rng:docGroup" />
<group ref="rng:topGroup" />
<group ref="rng:regarrayGroup" />
</choice>
<attribute name="name" type="NMTOKEN" use="optional" />
<attribute name="offset" type="rng:HexOrNumber" use="optional" />
<attribute name="offsets" type="string" use="optional"/>
<attribute name="doffsets" type="string" use="optional"/>
<attribute name="index" type="NMTOKENS" use="optional"/>
<attribute name="stride" type="rng:HexOrNumber" use="required" />
<attribute name="length" type="rng:HexOrNumber" use="required" />
<attribute name="varset" type="NMTOKEN" use="optional" />
<attribute name="variants" type="string" use="optional" />
<attribute name="usage" type="string" use="optional" />
</complexType>
<complexType name="stripeType">
<annotation>
<documentation>stripeType</documentation>
</annotation>
<choice minOccurs="0" maxOccurs="unbounded">
<group ref="rng:docGroup" />
<group ref="rng:topGroup" />
<group ref="rng:regarrayGroup" minOccurs="0" />
</choice>
<attribute name="name" type="NMTOKEN" use="optional" />
<attribute name="offset" type="rng:HexOrNumber" use="optional" />
<attribute name="stride" type="rng:HexOrNumber" use="optional" />
<attribute name="length" type="rng:HexOrNumber" use="optional" />
<attribute name="varset" type="NMTOKEN" use="optional" />
<attribute name="variants" type="string" use="optional" />
<attribute name="prefix" type="NMTOKENS" use="optional" />
</complexType>
<complexType name="registerType">
<annotation>
<documentation>
registerType used by reg8, reg16, reg32, reg64
</documentation>
</annotation>
<choice minOccurs="0" maxOccurs="unbounded">
<group ref="rng:docGroup" />
<group ref="rng:topGroup" />
<element ref="rng:value" />
<element ref="rng:bitfield" />
</choice>
<attribute name="name" type="NMTOKEN" use="required" />
<attribute name="offset" type="rng:HexOrNumber" use="required" />
<attribute name="access" type="rng:Access" default="rw" use="optional" />
<attribute name="type" type="NMTOKENS" use="optional" />
<attribute name="shr" type="nonNegativeInteger" use="optional" />
<attribute name="varset" type="NMTOKEN" use="optional" />
<attribute name="variants" type="string" use="optional" />
<attribute name="stride" type="rng:HexOrNumber" use="optional" />
<attribute name="length" type="rng:HexOrNumber" use="optional" />
<attribute name="high" type="nonNegativeInteger" use="optional" />
<attribute name="low" type="nonNegativeInteger" use="optional" />
<attribute name="pos" type="nonNegativeInteger" use="optional" />
<attribute name="align" type="nonNegativeInteger" use="optional" />
<attribute name="radix" type="nonNegativeInteger" use="optional" />
<attribute name="usage" type="string" use="optional" />
</complexType>
<complexType name="bitsetType">
<annotation>
<documentation>bitsetType</documentation>
</annotation>
<choice maxOccurs="unbounded">
<element ref="rng:bitfield" />
<group ref="rng:docGroup" />
<group ref="rng:topGroup" />
</choice>
<attribute name="name" type="NMTOKEN" use="required" />
<attribute name="inline" type="rng:Boolean" use="optional" />
<attribute name="bare" type="rng:Boolean" use="optional" />
<attribute name="prefix" type="NMTOKENS" use="optional" />
<attribute name="varset" type="NMTOKEN" use="optional" />
</complexType>
<complexType name="bitfieldType">
<annotation>
<documentation>bitfieldType</documentation>
</annotation>
<choice minOccurs="0" maxOccurs="unbounded">
<element ref="rng:value" maxOccurs="unbounded" />
<group ref="rng:docGroup" />
<group ref="rng:topGroup" />
</choice>
<attribute name="name" type="NMTOKEN" use="required" />
<attribute name="high" type="nonNegativeInteger" use="optional" />
<attribute name="low" type="nonNegativeInteger" use="optional" />
<attribute name="pos" type="nonNegativeInteger" use="optional" />
<attribute name="radix" type="nonNegativeInteger" use="optional" />
<attribute name="align" type="nonNegativeInteger" use="optional" />
<attribute name="type" type="NMTOKENS" use="optional" />
<attribute name="varset" type="NMTOKEN" use="optional" />
<attribute name="variants" type="string" use="optional" />
<attribute name="addvariant" type="rng:Boolean" use="optional" />
<attribute name="shr" type="nonNegativeInteger" use="optional" />
</complexType>
<complexType name="enumType">
<annotation>
<documentation>enumType</documentation>
</annotation>
<choice maxOccurs="unbounded">
<element ref="rng:value" />
<group ref="rng:docGroup" />
<group ref="rng:topGroup" />
</choice>
<attribute name="name" type="NMTOKEN" use="required" />
<attribute name="inline" type="rng:Boolean" use="optional" />
<attribute name="bare" type="rng:Boolean" use="optional" />
<attribute name="prefix" type="NMTOKENS" use="optional" />
<attribute name="varset" type="NMTOKEN" use="optional" />
</complexType>
<complexType name="valueType">
<annotation>
<documentation>valueType</documentation>
</annotation>
<choice minOccurs="0" maxOccurs="unbounded">
<group ref="rng:docGroup" />
<group ref="rng:topGroup" />
</choice>
<attribute name="name" type="NMTOKEN" use="required" />
<attribute name="value" type="string" use="optional" />
<attribute name="varset" type="NMTOKEN" use="optional" />
<attribute name="variants" type="string" use="optional" />
</complexType>
<complexType name="refType">
<annotation>
<documentation>refType</documentation>
</annotation>
<attribute name="ref" type="NMTOKEN" use="required" />
</complexType>
<!-- Documentation element types -->
<complexType name="briefType">
<annotation>
<documentation>
brief documentation, no markup
</documentation>
</annotation>
<simpleContent>
<extension base="string" />
</simpleContent>
</complexType>
<complexType name="docType" mixed="true">
<annotation>
<documentation>
root element of documentation sub-tree
</documentation>
</annotation>
<choice minOccurs="0" maxOccurs="unbounded">
<group ref="rng:textformatGroup" />
<group ref="rng:listGroup" />
<element ref="rng:code" />
</choice>
</complexType>
<complexType name="textformatType" mixed="true">
<annotation>
<documentation>
for bold, underline, italics
</documentation>
</annotation>
<choice minOccurs="0" maxOccurs="unbounded">
<group ref="rng:textformatGroup" />
</choice>
</complexType>
<complexType name="textcodeType">
<simpleContent>
<extension base="string">
<attribute name="title" type="string" />
</extension>
</simpleContent>
</complexType>
<complexType name="listType">
<annotation>
<documentation>
definition of a list, ordered or unordered
</documentation>
</annotation>
<choice minOccurs="0" maxOccurs="unbounded">
<element ref="rng:li" />
</choice>
</complexType>
<complexType name="listitemType" mixed="true">
<annotation>
<documentation>
items of a list
</documentation>
</annotation>
<choice minOccurs="0" maxOccurs="unbounded">
<group ref="rng:textformatGroup" />
<group ref="rng:listGroup" />
<element ref="rng:code" />
</choice>
</complexType>
<!-- Attribute value types -->
<simpleType name="Hexadecimal">
<restriction base="string">
<pattern value="0x[0-9a-f]+" />
<pattern value="0x[0-9A-F]+" />
<pattern value="[0-9]" />
</restriction>
</simpleType>
<simpleType name="HexOrNumber">
<annotation>
<documentation>HexOrNumber</documentation>
</annotation>
<union memberTypes="rng:Hexadecimal nonNegativeInteger" />
</simpleType>
<simpleType name="Boolean">
<restriction base="string">
<enumeration value="true" />
<enumeration value="1" />
<enumeration value="yes" />
<enumeration value="false" />
<enumeration value="0" />
<enumeration value="no" />
</restriction>
</simpleType>
<simpleType name="Access">
<annotation>
<documentation>Access</documentation>
</annotation>
<restriction base="string">
<enumeration value="r" />
<enumeration value="w" />
<enumeration value="rw" />
</restriction>
</simpleType>
<simpleType name="DomainWidth">
<annotation>
<documentation>DomainWidth</documentation>
</annotation>
<restriction base="string">
<enumeration value="8" />
<enumeration value="16" />
<enumeration value="32" />
<enumeration value="64" />
</restriction>
</simpleType>
<!-- Element groups -->
<group name="topGroup">
<choice>
<element ref="rng:copyright" />
<element ref="rng:domain" />
<element ref="rng:enum" />
<element ref="rng:group" />
<element ref="rng:bitset" />
<element ref="rng:import" />
</choice>
</group>
<group name="regarrayGroup">
<choice>
<element ref="rng:reg64" />
<element ref="rng:reg32" />
<element ref="rng:reg16" />
<element ref="rng:reg8" />
<element ref="rng:array" />
<element ref="rng:stripe" />
<element ref="rng:use-group" />
</choice>
</group>
<group name="docGroup">
<choice>
<element ref="rng:brief" />
<element ref="rng:doc" />
</choice>
</group>
<group name="textformatGroup">
<choice>
<element ref="rng:b" />
<element ref="rng:i" />
<element ref="rng:u" />
</choice>
</group>
<group name="listGroup">
<choice>
<element ref="rng:ul" />
<element ref="rng:ol" />
</choice>
</group>
</schema>

View file

@ -190,6 +190,12 @@ if with_gallium_rocket
else
driver_rocket = declare_dependency()
endif
if with_gallium_ethosu
subdir('winsys/ethosu/drm')
subdir('drivers/ethosu')
else
driver_ethosu = declare_dependency()
endif
if with_gallium_zink
subdir('drivers/zink')
else

View file

@ -59,7 +59,7 @@ libgallium_dri = shared_library(
driver_kmsro, driver_v3d, driver_vc4, driver_freedreno, driver_etnaviv,
driver_tegra, driver_i915, driver_svga, driver_virgl,
driver_panfrost, driver_iris, driver_lima, driver_zink, driver_d3d12,
driver_asahi, driver_crocus, driver_rocket
driver_asahi, driver_crocus, driver_rocket, driver_ethosu
],
install : true,
name_suffix : libname_suffix,

View file

@ -0,0 +1,17 @@
/*
 * Copyright 2014 Broadcom
 * Copyright 2018 Alyssa Rosenzweig
 * Copyright 2025 Tomeu Vizoso
 * SPDX-License-Identifier: MIT
 */

/* Public winsys entry point for the Ethos-U gallium driver.
 *
 * Fixed vs. the original: the include guard no longer uses a
 * double-underscore-prefixed name (reserved for the implementation,
 * C11 7.1.3), and the parameter name matches the definition in
 * ethosu_drm_winsys.c ("fd" instead of "drmFD").
 */

#ifndef ETHOSU_DRM_PUBLIC_H
#define ETHOSU_DRM_PUBLIC_H

struct pipe_screen;
struct pipe_screen_config;

/**
 * Create (or look up) a pipe_screen for an Ethos-U DRM device.
 *
 * \param fd     Open DRM device file descriptor.  The winsys duplicates
 *               it internally, so the caller keeps ownership of \p fd.
 * \param config Screen configuration options.
 * \return the pipe_screen, or NULL on failure.
 */
struct pipe_screen *
ethosu_drm_screen_create(int fd, const struct pipe_screen_config *config);

#endif /* ETHOSU_DRM_PUBLIC_H */

View file

@ -0,0 +1,19 @@
/*
* Copyright 2014 Broadcom
* Copyright 2018 Alyssa Rosenzweig
* Copyright 2025 Tomeu Vizoso
* SPDX-License-Identifier: MIT
*/
#include "util/os_file.h"
#include "util/u_screen.h"
#include "ethosu/ethosu_device.h"
#include "ethosu_drm_public.h"
/* Create (or look up) the pipe_screen for an Ethos-U DRM device.
 *
 * The screen owns the file descriptor it is handed, so pass it a
 * duplicate and leave the caller's fd untouched.  The lookup-or-create
 * helper returns an existing screen for the same device if one has
 * already been created.
 */
struct pipe_screen *
ethosu_drm_screen_create(int fd, const struct pipe_screen_config *config)
{
   int screen_fd = os_dupfd_cloexec(fd);

   return u_pipe_screen_lookup_or_create(screen_fd, config, NULL,
                                         ethosu_screen_create);
}

View file

@ -0,0 +1,13 @@
# Copyright 2017 Broadcom
# SPDX-License-Identifier: MIT

# DRM winsys glue for the Ethos-U gallium driver: a small static library
# (just ethosu_drm_winsys.c) that turns an open DRM fd into a pipe_screen.
libethosuwinsys = static_library(
'ethosuwinsys',
files('ethosu_drm_winsys.c'),
include_directories : [
inc_src, inc_include,
inc_gallium, inc_gallium_aux, inc_gallium_drivers,
],
# Keep symbols hidden; only the driver_ethosu dependency consumers link this.
gnu_symbol_visibility : 'hidden',
dependencies: [dep_libdrm, idep_mesautil],
)