From 2581c3ab60b54dbb4267c2a09b93f2f0b6afa7af Mon Sep 17 00:00:00 2001 From: Tomeu Vizoso Date: Sun, 23 Feb 2025 14:26:01 +0100 Subject: [PATCH] ethos: Initial commit of a driver for the Arm Ethos-U65 NPU. Supports all models in the test suite. No optimizations implemented yet. Acked-by: Christian Gmeiner Part-of: --- .clang-format-include | 1 + include/drm-uapi/ethosu_accel.h | 262 ++++ meson.build | 3 +- meson.options | 2 +- src/gallium/drivers/ethosu/.clang-format | 2 + .../drivers/ethosu/ci/ethos-imx93-fails.txt | 0 .../drivers/ethosu/ci/ethos-imx93-flakes.txt | 0 .../drivers/ethosu/ci/ethos-imx93-skips.txt | 14 + src/gallium/drivers/ethosu/decode.py | 75 ++ src/gallium/drivers/ethosu/ethosu_cmd.c | 783 +++++++++++ src/gallium/drivers/ethosu/ethosu_cmd.h | 13 + src/gallium/drivers/ethosu/ethosu_coefs.c | 133 ++ src/gallium/drivers/ethosu/ethosu_coefs.h | 17 + src/gallium/drivers/ethosu/ethosu_device.c | 243 ++++ src/gallium/drivers/ethosu/ethosu_device.h | 84 ++ src/gallium/drivers/ethosu/ethosu_lower.c | 477 +++++++ src/gallium/drivers/ethosu/ethosu_lower.h | 15 + src/gallium/drivers/ethosu/ethosu_ml.c | 363 +++++ src/gallium/drivers/ethosu/ethosu_ml.h | 229 ++++ src/gallium/drivers/ethosu/ethosu_sched.c | 193 +++ src/gallium/drivers/ethosu/ethosu_sched.h | 13 + src/gallium/drivers/ethosu/gen_header.py | 125 ++ src/gallium/drivers/ethosu/gen_parser.py | 745 +++++++++++ src/gallium/drivers/ethosu/meson.build | 33 + .../drivers/ethosu/mlw_codec/mlw_common.h | 29 + .../drivers/ethosu/mlw_codec/mlw_encode.c | 1186 +++++++++++++++++ .../drivers/ethosu/mlw_codec/mlw_encode.h | 65 + src/gallium/drivers/ethosu/registers.xml | 399 ++++++ src/gallium/drivers/ethosu/rules-ng.xsd | 457 +++++++ src/gallium/meson.build | 6 + src/gallium/targets/dri/meson.build | 2 +- .../winsys/ethosu/drm/ethosu_drm_public.h | 17 + .../winsys/ethosu/drm/ethosu_drm_winsys.c | 19 + src/gallium/winsys/ethosu/drm/meson.build | 13 + 34 files changed, 6015 insertions(+), 3 deletions(-) create mode 100644 include/drm-uapi/ethosu_accel.h create mode 100644 src/gallium/drivers/ethosu/.clang-format create mode 100644 src/gallium/drivers/ethosu/ci/ethos-imx93-fails.txt create mode 100644 src/gallium/drivers/ethosu/ci/ethos-imx93-flakes.txt create mode 100644 src/gallium/drivers/ethosu/ci/ethos-imx93-skips.txt create mode 100644 src/gallium/drivers/ethosu/decode.py create mode 100644 src/gallium/drivers/ethosu/ethosu_cmd.c create mode 100644 src/gallium/drivers/ethosu/ethosu_cmd.h create mode 100644 src/gallium/drivers/ethosu/ethosu_coefs.c create mode 100644 src/gallium/drivers/ethosu/ethosu_coefs.h create mode 100644 src/gallium/drivers/ethosu/ethosu_device.c create mode 100644 src/gallium/drivers/ethosu/ethosu_device.h create mode 100644 src/gallium/drivers/ethosu/ethosu_lower.c create mode 100644 src/gallium/drivers/ethosu/ethosu_lower.h create mode 100644 src/gallium/drivers/ethosu/ethosu_ml.c create mode 100644 src/gallium/drivers/ethosu/ethosu_ml.h create mode 100644 src/gallium/drivers/ethosu/ethosu_sched.c create mode 100644 src/gallium/drivers/ethosu/ethosu_sched.h create mode 100644 src/gallium/drivers/ethosu/gen_header.py create mode 100644 src/gallium/drivers/ethosu/gen_parser.py create mode 100644 src/gallium/drivers/ethosu/meson.build create mode 100644 src/gallium/drivers/ethosu/mlw_codec/mlw_common.h create mode 100644 src/gallium/drivers/ethosu/mlw_codec/mlw_encode.c create mode 100644 src/gallium/drivers/ethosu/mlw_codec/mlw_encode.h create mode 100644 src/gallium/drivers/ethosu/registers.xml create mode 
100644 src/gallium/drivers/ethosu/rules-ng.xsd
 create mode 100644 src/gallium/winsys/ethosu/drm/ethosu_drm_public.h
 create mode 100644 src/gallium/winsys/ethosu/drm/ethosu_drm_winsys.c
 create mode 100644 src/gallium/winsys/ethosu/drm/meson.build

diff --git a/.clang-format-include b/.clang-format-include
index d7c5747177f..ba52553fdc9 100644
--- a/.clang-format-include
+++ b/.clang-format-include
@@ -1,6 +1,7 @@
 # The following files are opted into `ninja clang-format` and
 # enforcement in the CI.
+src/gallium/drivers/ethosu/**/*
 src/gallium/drivers/i915
 src/gallium/drivers/r300/compiler/*
 src/gallium/drivers/rocket/**/*
diff --git a/include/drm-uapi/ethosu_accel.h b/include/drm-uapi/ethosu_accel.h
new file mode 100644
index 00000000000..135d6480e3a
--- /dev/null
+++ b/include/drm-uapi/ethosu_accel.h
@@ -0,0 +1,262 @@
+/* SPDX-License-Identifier: MIT */
+/* Copyright (C) 2025 Arm, Ltd. */
+#ifndef _ETHOSU_DRM_H_
+#define _ETHOSU_DRM_H_
+
+#include "drm.h"
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+/**
+ * DOC: IOCTL IDs
+ *
+ * enum drm_ethosu_ioctl_id - IOCTL IDs
+ *
+ * Place new ioctls at the end, don't re-order, don't replace or remove entries.
+ *
+ * These IDs are not meant to be used directly. Use the DRM_IOCTL_ETHOSU_xxx
+ * definitions instead.
+ */
+enum drm_ethosu_ioctl_id {
+    /** @DRM_ETHOSU_DEV_QUERY: Query device information. */
+    DRM_ETHOSU_DEV_QUERY = 0,
+
+    /** @DRM_ETHOSU_BO_CREATE: Create a buffer object. */
+    DRM_ETHOSU_BO_CREATE,
+
+    /** @DRM_ETHOSU_BO_WAIT: Wait on a buffer object's fence. */
+    DRM_ETHOSU_BO_WAIT,
+
+    /**
+     * @DRM_ETHOSU_BO_MMAP_OFFSET: Get the file offset to pass to
+     * mmap to map a GEM object.
+     */
+    DRM_ETHOSU_BO_MMAP_OFFSET,
+
+    /**
+     * @DRM_ETHOSU_CMDSTREAM_BO_CREATE: Create a command stream buffer
+     * object.
+     */
+    DRM_ETHOSU_CMDSTREAM_BO_CREATE,
+
+    /** @DRM_ETHOSU_SUBMIT: Submit a job and BOs to run. */
+    DRM_ETHOSU_SUBMIT,
+};
+
+/**
+ * DOC: IOCTL arguments
+ */
+
+/**
+ * enum drm_ethosu_dev_query_type - Query type
+ *
+ * Place new types at the end, don't re-order, don't remove or replace.
+ */
+enum drm_ethosu_dev_query_type {
+    /** @DRM_ETHOSU_DEV_QUERY_NPU_INFO: Query NPU information. */
+    DRM_ETHOSU_DEV_QUERY_NPU_INFO = 0,
+};
+
+/**
+ * struct drm_ethosu_npu_info - NPU information
+ *
+ * Structure grouping all queryable information relating to the NPU.
+ */
+struct drm_ethosu_npu_info {
+    /** @id: NPU ID. */
+    __u32 id;
+#define DRM_ETHOSU_ARCH_MAJOR(x) ((x) >> 28)
+#define DRM_ETHOSU_ARCH_MINOR(x) (((x) >> 20) & 0xff)
+#define DRM_ETHOSU_ARCH_PATCH(x) (((x) >> 16) & 0xf)
+#define DRM_ETHOSU_PRODUCT_MAJOR(x) (((x) >> 12) & 0xf)
+#define DRM_ETHOSU_VERSION_MAJOR(x) (((x) >> 8) & 0xf)
+#define DRM_ETHOSU_VERSION_MINOR(x) (((x) >> 4) & 0xff)
+#define DRM_ETHOSU_VERSION_STATUS(x) ((x) & 0xf)
+
+    /** @config: NPU configuration. */
+    __u32 config;
+
+    /** @sram_size: Size in bytes of the SRAM attached to the NPU. */
+    __u32 sram_size;
+};
+
+/**
+ * struct drm_ethosu_dev_query - Arguments passed to DRM_IOCTL_ETHOSU_DEV_QUERY
+ */
+struct drm_ethosu_dev_query {
+    /** @type: the query type (see drm_ethosu_dev_query_type). */
+    __u32 type;
+
+    /**
+     * @size: size of the type being queried.
+     *
+     * If pointer is NULL, size is updated by the driver to provide the
+     * output structure size. If pointer is not NULL, the driver will
+     * only copy min(size, actual_structure_size) bytes to the pointer,
+     * and update the size accordingly. This allows us to extend query
+     * types without breaking userspace.
+     */
+    __u32 size;
+
+    /**
+     * @pointer: user pointer to a query type struct.
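+     * For DRM_ETHOSU_DEV_QUERY_NPU_INFO this should point to a
+     * struct drm_ethosu_npu_info.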
+     *
+     * Pointer can be NULL, in which case, nothing is copied, but the
+     * actual structure size is returned. If not NULL, it must point to
+     * a location that's large enough to hold size bytes.
+     */
+    __u64 pointer;
+};
+
+/**
+ * enum drm_ethosu_bo_flags - Buffer object flags, passed at creation time.
+ */
+enum drm_ethosu_bo_flags {
+    /**
+     * @DRM_ETHOSU_BO_NO_MMAP: The buffer object will never be CPU-mapped
+     * in userspace.
+     */
+    DRM_ETHOSU_BO_NO_MMAP = (1 << 0),
+};
+
+/**
+ * struct drm_ethosu_bo_create - Arguments passed to DRM_IOCTL_ETHOSU_BO_CREATE.
+ */
+struct drm_ethosu_bo_create {
+    /**
+     * @size: Requested size for the object
+     *
+     * The (page-aligned) allocated size for the object will be returned.
+     */
+    __u64 size;
+
+    /**
+     * @flags: Flags. Must be a combination of drm_ethosu_bo_flags flags.
+     */
+    __u32 flags;
+
+    /**
+     * @handle: Returned handle for the object.
+     *
+     * Object handles are nonzero.
+     */
+    __u32 handle;
+};
+
+/**
+ * struct drm_ethosu_bo_mmap_offset - Arguments passed to DRM_IOCTL_ETHOSU_BO_MMAP_OFFSET.
+ */
+struct drm_ethosu_bo_mmap_offset {
+    /** @handle: Handle of the object we want an mmap offset for. */
+    __u32 handle;
+
+    /** @pad: MBZ. */
+    __u32 pad;
+
+    /** @offset: The fake offset to use for subsequent mmap calls. */
+    __u64 offset;
+};
+
+/**
+ * struct drm_ethosu_bo_wait - ioctl argument for waiting for
+ * completion of the last DRM_ETHOSU_SUBMIT on a BO.
+ *
+ * This is useful for cases where multiple processes might be
+ * submitting jobs against a BO and you want to wait for all of them
+ * to complete.
+ */
+struct drm_ethosu_bo_wait {
+    __u32 handle;
+    __u32 pad;
+    __s64 timeout_ns; /* absolute */
+};
+
+/**
+ * struct drm_ethosu_cmdstream_bo_create - Arguments passed to
+ * DRM_IOCTL_ETHOSU_CMDSTREAM_BO_CREATE.
+ */
+struct drm_ethosu_cmdstream_bo_create {
+    /* Size of the data argument. */
+    __u32 size;
+
+    /* Flags, currently must be 0. */
+    __u32 flags;
+
+    /* Pointer to the data. */
+    __u64 data;
+
+    /* Returned GEM handle for the BO. */
+    __u32 handle;
+
+    /* Pad, must be 0. */
+    __u32 pad;
+};
+
+/**
+ * struct drm_ethosu_job - A job to be run on the NPU
+ *
+ * The kernel will schedule the execution of this job taking into account its
+ * dependencies with other jobs. All tasks in the same job will be executed
+ * sequentially on the same core, to benefit from memory residency in SRAM.
+ */
+struct drm_ethosu_job {
+    /** Input: BO handle for cmdstream. */
+    __u32 cmd_bo;
+
+    /** Input: Amount of SRAM to use. */
+    __u32 sram_size;
+
+#define ETHOSU_MAX_REGIONS 8
+    /** Input: Array of BO handles for each region. */
+    __u32 region_bo_handles[ETHOSU_MAX_REGIONS];
+};
+
+/**
+ * struct drm_ethosu_submit - ioctl argument for submitting commands to the NPU.
+ *
+ * The kernel will schedule the execution of these jobs in dependency order.
+ */
+struct drm_ethosu_submit {
+    /** Input: Pointer to an array of struct drm_ethosu_job. */
+    __u64 jobs;
+
+    /** Input: Number of jobs passed in. */
+    __u32 job_count;
+
+    /** Reserved, must be zero. */
+    __u32 pad;
+};
+
+/**
+ * DRM_IOCTL_ETHOSU() - Build an ethosu IOCTL number
+ * @__access: Access type. Must be R, W or RW.
+ * @__id: One of the DRM_ETHOSU_xxx ids.
+ * @__type: Suffix of the type being passed to the IOCTL.
+ *
+ * Don't use this macro directly, use the DRM_IOCTL_ETHOSU_xxx
+ * values instead.
+ *
+ * Return: An IOCTL number to be passed to ioctl() from userspace.
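+ *
+ * For example, DRM_IOCTL_ETHOSU(WR, DEV_QUERY, dev_query) builds the
+ * read/write IOCTL number for struct drm_ethosu_dev_query.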
+ */ +#define DRM_IOCTL_ETHOSU(__access, __id, __type) \ + DRM_IO ## __access(DRM_COMMAND_BASE + DRM_ETHOSU_ ## __id, \ + struct drm_ethosu_ ## __type) + +enum { + DRM_IOCTL_ETHOSU_DEV_QUERY = + DRM_IOCTL_ETHOSU(WR, DEV_QUERY, dev_query), + DRM_IOCTL_ETHOSU_BO_CREATE = + DRM_IOCTL_ETHOSU(WR, BO_CREATE, bo_create), + DRM_IOCTL_ETHOSU_BO_WAIT = + DRM_IOCTL_ETHOSU(WR, BO_WAIT, bo_wait), + DRM_IOCTL_ETHOSU_BO_MMAP_OFFSET = + DRM_IOCTL_ETHOSU(WR, BO_MMAP_OFFSET, bo_mmap_offset), + DRM_IOCTL_ETHOSU_CMDSTREAM_BO_CREATE = + DRM_IOCTL_ETHOSU(WR, CMDSTREAM_BO_CREATE, cmdstream_bo_create), + DRM_IOCTL_ETHOSU_SUBMIT = + DRM_IOCTL_ETHOSU(WR, SUBMIT, submit), +}; + +#if defined(__cplusplus) +} +#endif + +#endif /* _ETHOSU_DRM_H_ */ diff --git a/meson.build b/meson.build index 9e72f4a963e..c7ed21a0961 100644 --- a/meson.build +++ b/meson.build @@ -186,7 +186,7 @@ elif gallium_drivers.contains('all') gallium_drivers = [ 'r300', 'r600', 'radeonsi', 'crocus', 'v3d', 'vc4', 'freedreno', 'etnaviv', 'i915', 'nouveau', 'svga', 'tegra', 'virgl', 'lima', 'panfrost', 'llvmpipe', 'softpipe', 'iris', - 'zink', 'd3d12', 'asahi', 'rocket' + 'zink', 'd3d12', 'asahi', 'rocket', 'ethosu' ] endif @@ -214,6 +214,7 @@ with_gallium_zink = gallium_drivers.contains('zink') with_gallium_d3d12 = gallium_drivers.contains('d3d12') with_gallium_asahi = gallium_drivers.contains('asahi') with_gallium_rocket = gallium_drivers.contains('rocket') +with_gallium_ethosu = gallium_drivers.contains('ethosu') foreach gallium_driver : gallium_drivers pre_args += '-DHAVE_@0@'.format(gallium_driver.to_upper()) endforeach diff --git a/meson.options b/meson.options index 27d778a3c7a..b1f98d7452a 100644 --- a/meson.options +++ b/meson.options @@ -86,7 +86,7 @@ option( value : ['auto'], choices : [ 'all', 'auto', - 'asahi', 'crocus', 'd3d12', 'etnaviv', 'freedreno', 'i915', 'iris', + 'asahi', 'crocus', 'd3d12', 'ethosu', 'etnaviv', 'freedreno', 'i915', 'iris', 'lima', 'llvmpipe', 'nouveau', 'panfrost', 'r300', 'r600', 'radeonsi', 'rocket', 'softpipe', 'svga', 'tegra', 'v3d', 'vc4', 'virgl', 'zink', ], diff --git a/src/gallium/drivers/ethosu/.clang-format b/src/gallium/drivers/ethosu/.clang-format new file mode 100644 index 00000000000..34cd9d7d1d3 --- /dev/null +++ b/src/gallium/drivers/ethosu/.clang-format @@ -0,0 +1,2 @@ +BasedOnStyle: InheritParentConfig +DisableFormat: false diff --git a/src/gallium/drivers/ethosu/ci/ethos-imx93-fails.txt b/src/gallium/drivers/ethosu/ci/ethos-imx93-fails.txt new file mode 100644 index 00000000000..e69de29bb2d diff --git a/src/gallium/drivers/ethosu/ci/ethos-imx93-flakes.txt b/src/gallium/drivers/ethosu/ci/ethos-imx93-flakes.txt new file mode 100644 index 00000000000..e69de29bb2d diff --git a/src/gallium/drivers/ethosu/ci/ethos-imx93-skips.txt b/src/gallium/drivers/ethosu/ci/ethos-imx93-skips.txt new file mode 100644 index 00000000000..65e55edf082 --- /dev/null +++ b/src/gallium/drivers/ethosu/ci/ethos-imx93-skips.txt @@ -0,0 +1,14 @@ +Add.Op/.* +AddQuant.Op/.* +Conv2D.Op/.* +DepthwiseConv2D.Op/.* +FullyConnected.Op/.* + +# Don't support unfused Pad operations yet +Models.Op/yolox_000 +Models.Op/yolox_003 +Models.Op/yolox_012 +Models.Op/yolox_027 +Models.Op/yolox_042 +Models.Op/yolox_077 +Models.Op/yolox_086 diff --git a/src/gallium/drivers/ethosu/decode.py b/src/gallium/drivers/ethosu/decode.py new file mode 100644 index 00000000000..6bc4a5780c8 --- /dev/null +++ b/src/gallium/drivers/ethosu/decode.py @@ -0,0 +1,75 @@ +#!/usr/bin/python3 +# +# Copyright © 2024-2025 Tomeu Vizoso +# +# SPDX-License-Identifier: MIT 
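+#
+# Decoder for Ethos-U command stream dumps: reads 8-byte register-write
+# records from the dump file and pretty-prints them using the register
+# database in the XML file.
+#
+# Usage: decode.py --xml registers.xml --dump <dump-file>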
+
+import sys
+import os
+import argparse
+import struct
+from gen_parser import Parser, Reg, Enum, mask, Error
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--xml', type=str, required=True)
+    parser.add_argument('--dump', type=str, required=True)
+
+    args = parser.parse_args()
+
+    p = Parser()
+
+    try:
+        p.parse("", args.xml)
+    except Error as e:
+        print(e, file=sys.stderr)
+        exit(1)
+
+    regs = {}
+    for e in p.file:
+        if isinstance(e, Reg):
+            regs[e.offset] = e
+
+    domains = {}
+    for e in p.file:
+        if isinstance(e, Enum):
+            if e.name == "target":
+                for name, val in e.values:
+                    domains[name] = val
+
+    f = open(args.dump, mode='rb')
+    for i in range(0, os.path.getsize(args.dump) // 8):
+        cmd = f.read(8)
+        (offset, value, target) = struct.unpack("<HIH", cmd)
+        if offset in regs:
+            reg = regs[offset]
+            print("emit(%s, " % reg.full_name.upper(), end="")
+            first = True
+            for field in reg.fields:
+                field_value = (value & mask(field.low, field.high)) >> field.low
+                if field_value != 0:
+                    if not first:
+                        print(" | ", end="")
+                    print("%s_%s(%d)" % (reg.full_name.upper(), field.name.upper(), field_value), end="")
+                    first = False
+            print(");")
+        else:
+            print("%x %x %x" % (target, offset, value))
+
+if __name__ == '__main__':
+    main()
diff --git a/src/gallium/drivers/ethosu/ethosu_cmd.c b/src/gallium/drivers/ethosu/ethosu_cmd.c
new file mode 100644
index 00000000000..fd5fe795602
--- /dev/null
+++ b/src/gallium/drivers/ethosu/ethosu_cmd.c
@@ -0,0 +1,783 @@
+/*
+ * Copyright (c) 2024 Tomeu Vizoso
+ * SPDX-License-Identifier: MIT
+ */
+
+#include <math.h>
+#include <stdio.h>
+#include "util/macros.h"
+#include "util/u_dynarray.h"
+
+#include "ethosu_cmd.h"
+#include "ethosu_coefs.h"
+#include "ethosu_ml.h"
+#include "ethosu_registers.h"
+#include "ethosu_sched.h"
+
+#define MAX_BLOCKDEP 3
+#define MAX_OUTSTANDING_DMA_OPS 2
+#define MAX_OUTSTANDING_NPU_OPS 2
+
+enum ethosu_op_to_scale {
+   OP_NONE = 0,
+   OP_A = 1,
+   OP_B = 2,
+};
+
+static void
+ethosu_ensure_cmdstream(struct ethosu_subgraph *subgraph)
+{
+   if ((subgraph->cursor - subgraph->cmdstream) < (subgraph->cmdstream_used - 2))
+      return;
+
+   unsigned cur_size = subgraph->cursor - subgraph->cmdstream;
+   subgraph->cmdstream = realloc(subgraph->cmdstream, (subgraph->cmdstream_used + 32) * sizeof(*subgraph->cmdstream));
+   subgraph->cursor = subgraph->cmdstream + cur_size;
+   subgraph->cmdstream_used += 32;
+}
+
+/*
+ * Each command word packs the command id in its low 16 bits and an immediate
+ * parameter in its high 16 bits. EMIT1 commands additionally set bit 14 of
+ * the id and are followed by a 32-bit payload word.
+ */
+#define EMIT0(cmd, param)                                                                          \
+   do {                                                                                            \
+      ethosu_ensure_cmdstream(subgraph);                                                           \
+      *(subgraph->cursor++) = cmd | (((param) & 0xFFFF) << 16);                                    \
+      if (DBG_ENABLED(ETHOSU_DBG_MSGS))                                                            \
+         fprintf(stderr, "emit0(%s, 0x%x);\n", ethosu_get_cmd_name(0, cmd), (param) & 0xFFFF);     \
+   } while (0)
+
+#define EMIT1(cmd, param, offset)                                                                  \
+   do {                                                                                            \
+      ethosu_ensure_cmdstream(subgraph);                                                           \
+      *(subgraph->cursor++) = cmd | 0x4000 | (((param) & 0xFFFF) << 16);                           \
+      *(subgraph->cursor++) = (offset) & 0xFFFFFFFF;                                               \
+      if (DBG_ENABLED(ETHOSU_DBG_MSGS))                                                            \
+         fprintf(stderr, "emit1(%s, 0x%x, 0x%x);\n", ethosu_get_cmd_name(1, cmd), (param) & 0xFFFF, (int)(offset)); \
+   } while (0)
+
+static void
+emit_addresses(
+   struct ethosu_subgraph *subgraph,
+   struct ethosu_feature_map *feature_map,
+   uint32_t cmd_base0, uint32_t cmd_base1, uint32_t cmd_base2, uint32_t cmd_base3)
+{
+   EMIT1(cmd_base0, 0x0, feature_map->tiles.addresses[0]);
+   EMIT1(cmd_base1, 0x0, feature_map->tiles.addresses[1]);
+   EMIT1(cmd_base2, 0x0, feature_map->tiles.addresses[2]);
+   EMIT1(cmd_base3, 0x0, feature_map->tiles.addresses[3]);
+}
+
+static void
+emit_tiles(
+   struct ethosu_subgraph *subgraph,
+   struct ethosu_feature_map *feature_map,
+   uint32_t cmd_height0, uint32_t cmd_height1, uint32_t cmd_width0)
+{
+   EMIT0(cmd_height0, feature_map->tiles.height_0 - 1);
+   EMIT0(cmd_height1,
feature_map->tiles.height_1 - 1); + EMIT0(cmd_width0, feature_map->tiles.width_0 - 1); +} + +static void +emit_strides( + struct ethosu_subgraph *subgraph, + struct ethosu_feature_map *feature_map, + uint32_t cmd_stride_c, uint32_t cmd_stride_y, uint32_t cmd_stride_x) +{ + unsigned elem_size = 1; + unsigned tensor_x, tensor_y, tensor_c; + struct ethosu_tensor *tensor = ethosu_find_tensor(subgraph, feature_map->tensor_idx); + + if (tensor->layout == ETHOSU_LAYOUT_NHCWB16) { + tensor_x = 16 * elem_size; + tensor_c = tensor_x * tensor->shape.width; + tensor_y = elem_size * tensor->shape.width * ALIGN(tensor->shape.depth, 16); + } else { + tensor_c = elem_size; + tensor_x = tensor->shape.depth * tensor_c; + tensor_y = tensor->shape.width * tensor_x; + } + + EMIT1(cmd_stride_c, 0x0, tensor_c); + EMIT1(cmd_stride_y, 0x0, tensor_y); + EMIT1(cmd_stride_x, 0x0, tensor_x); +} + +static void +emit_ifm(struct ethosu_subgraph *subgraph, struct ethosu_feature_map *feature_map) +{ + EMIT0(NPU_SET_IFM_REGION, IO_REGION); + emit_addresses( + subgraph, + feature_map, + NPU_SET_IFM_BASE0, + NPU_SET_IFM_BASE1, + NPU_SET_IFM_BASE2, + NPU_SET_IFM_BASE3); + + emit_tiles( + subgraph, feature_map, NPU_SET_IFM_HEIGHT0_M1, NPU_SET_IFM_HEIGHT1_M1, NPU_SET_IFM_WIDTH0_M1); + + EMIT0(NPU_SET_IFM_DEPTH_M1, feature_map->shape.depth - 1); + emit_strides(subgraph, feature_map, NPU_SET_IFM_STRIDE_C, NPU_SET_IFM_STRIDE_Y, NPU_SET_IFM_STRIDE_X); + EMIT0(NPU_SET_IFM_ZERO_POINT, feature_map->zero_point); +} + +static void +emit_ifm_precision(struct ethosu_subgraph *subgraph, + struct ethosu_feature_map *feature_map, + enum ethosu_op_to_scale op_to_scale, uint32_t precision_cmd) +{ + struct ethosu_tensor *tensor = ethosu_find_tensor(subgraph, feature_map->tensor_idx); + unsigned prec = 0; + + if (tensor->layout == ETHOSU_LAYOUT_NHCWB16) + prec |= NPU_SET_IFM_PRECISION_FORMAT(1); + + if (feature_map->is_signed) + prec |= NPU_SET_IFM_PRECISION_ACTIVATION(1); // signed activation + + prec |= NPU_SET_IFM_PRECISION_SCALE_MODE(op_to_scale); + + EMIT0(precision_cmd, prec); +} + +static void +emit_padding(struct ethosu_subgraph *subgraph, struct ethosu_operation *operation) +{ + EMIT0(NPU_SET_IFM_PAD_TOP, operation->pad.top); + EMIT0(NPU_SET_IFM_PAD_LEFT, operation->pad.left); + EMIT0(NPU_SET_IFM_PAD_BOTTOM, operation->pad.bottom); + EMIT0(NPU_SET_IFM_PAD_RIGHT, operation->pad.right); +} + +static void +emit_ofm(struct ethosu_subgraph *subgraph, struct ethosu_feature_map *feature_map) +{ + EMIT0(NPU_SET_OFM_REGION, IO_REGION); + emit_addresses( + subgraph, + feature_map, + NPU_SET_OFM_BASE0, + NPU_SET_OFM_BASE1, + NPU_SET_OFM_BASE2, + NPU_SET_OFM_BASE3); + emit_tiles( + subgraph, feature_map, NPU_SET_OFM_HEIGHT0_M1, NPU_SET_OFM_HEIGHT1_M1, NPU_SET_OFM_WIDTH0_M1); + EMIT0(NPU_SET_OFM_HEIGHT_M1, feature_map->shape.height - 1); + EMIT0(NPU_SET_OFM_WIDTH_M1, feature_map->shape.width - 1); + EMIT0(NPU_SET_OFM_DEPTH_M1, feature_map->shape.depth - 1); + emit_strides(subgraph, feature_map, NPU_SET_OFM_STRIDE_C, NPU_SET_OFM_STRIDE_Y, NPU_SET_OFM_STRIDE_X); + EMIT0(NPU_SET_OFM_ZERO_POINT, feature_map->zero_point); +} + +static void +emit_ofm_precision(struct ethosu_subgraph *subgraph, struct ethosu_operation *operation) +{ + struct ethosu_tensor *tensor = ethosu_find_tensor(subgraph, operation->ofm.tensor_idx); + unsigned prec = 0; + + if (tensor->layout == ETHOSU_LAYOUT_NHCWB16) + prec |= NPU_SET_OFM_PRECISION_FORMAT(1); + + if (operation->ofm.is_signed) + prec |= NPU_SET_OFM_PRECISION_ACTIVATION(1); + + if (operation->type == 
ETHOSU_OPERATION_TYPE_POOLING || + operation->type == ETHOSU_OPERATION_TYPE_ELTWISE) { + prec |= NPU_SET_OFM_PRECISION_SCALE_MODE(1); + } + + prec |= NPU_SET_OFM_PRECISION_ROUND_MODE(operation->round_mode); + + EMIT0(NPU_SET_OFM_PRECISION, prec); +} + +static void +emit_kernel(struct ethosu_subgraph *subgraph, struct ethosu_operation *operation) +{ + EMIT0(NPU_SET_KERNEL_HEIGHT_M1, operation->kernel.height - 1); + EMIT0(NPU_SET_KERNEL_WIDTH_M1, operation->kernel.width - 1); + unsigned stride = (operation->kernel.stride_x - 1) & 1; + stride |= ((operation->kernel.stride_y - 1) & 1) << 1; + stride |= ((operation->kernel.stride_x - 1) >> 1) << 6; + stride |= ((operation->kernel.stride_y - 1) >> 1) << 9; + stride |= (operation->kernel.dilation_x - 1) << 3; + stride |= (operation->kernel.dilation_y - 1) << 4; + stride |= operation->conv.part_kernel_first << 2; + EMIT0(NPU_SET_KERNEL_STRIDE, stride); +} + +static void +emit_weights(struct ethosu_subgraph *subgraph, struct ethosu_operation *operation) +{ + EMIT0(NPU_SET_WEIGHT_REGION, operation->conv.weights.region); + EMIT1(NPU_SET_WEIGHT_BASE, 0x0, operation->conv.weights.address); + EMIT1(NPU_SET_WEIGHT_LENGTH, 0x0, operation->conv.weights.size); +} + +static void +emit_biases(struct ethosu_subgraph *subgraph, struct ethosu_operation *operation) +{ + EMIT0(NPU_SET_SCALE_REGION, operation->conv.scales.region); + EMIT1(NPU_SET_SCALE_BASE, 0x0, operation->conv.scales.address); + EMIT1(NPU_SET_SCALE_LENGTH, 0x0, operation->conv.scales.size); +} + +static void +emit_activation(struct ethosu_subgraph *subgraph, struct ethosu_operation *operation) +{ + EMIT0(NPU_SET_ACTIVATION, 0x0); + + if (operation->ofm.is_signed) { + EMIT0(NPU_SET_ACTIVATION_MIN, 0xff80); + EMIT0(NPU_SET_ACTIVATION_MAX, 0x7f); + } else { + EMIT0(NPU_SET_ACTIVATION_MIN, 0x00); + EMIT0(NPU_SET_ACTIVATION_MAX, 0xff); + } +} + +static void +emit_block_config(struct ethosu_subgraph *subgraph, struct ethosu_operation *operation) +{ + EMIT0(NPU_SET_OFM_BLK_HEIGHT_M1, operation->block_config.ofm_block.height - 1); + EMIT0(NPU_SET_OFM_BLK_WIDTH_M1, operation->block_config.ofm_block.width - 1); + EMIT0(NPU_SET_OFM_BLK_DEPTH_M1, operation->block_config.ofm_block.depth - 1); +} + +static void +emit_shram_registers(struct ethosu_subgraph *subgraph, struct ethosu_operation *operation) +{ + EMIT0(NPU_SET_IFM_IB_END, operation->block_config.shram_layout.ib_end); + EMIT0(NPU_SET_AB_START, operation->block_config.shram_layout.ab_start); + + if (operation->type == ETHOSU_OPERATION_TYPE_ELTWISE) + EMIT0(NPU_SET_IFM2_IB_START, operation->block_config.shram_layout.ib_start2); + + EMIT0(NPU_SET_ACC_FORMAT, operation->block_config.acc_type); +} + +static void +emit_common(struct ethosu_subgraph *subgraph, struct ethosu_operation *operation, enum ethosu_op_to_scale op_to_scale) +{ + emit_ifm(subgraph, &operation->ifm); + emit_ifm_precision(subgraph, &operation->ifm, op_to_scale, NPU_SET_IFM_PRECISION); + EMIT0(NPU_SET_IFM_UPSCALE, operation->upscale); + + if (operation->type != ETHOSU_OPERATION_TYPE_ELTWISE) + emit_padding(subgraph, operation); + + emit_ofm(subgraph, &operation->ofm); + + emit_ofm_precision(subgraph, operation); + + if (operation->type != ETHOSU_OPERATION_TYPE_ELTWISE) + emit_kernel(subgraph, operation); + + if (operation->type == ETHOSU_OPERATION_TYPE_CONVOLUTION) { + emit_weights(subgraph, operation); + emit_biases(subgraph, operation); + } + + emit_activation(subgraph, operation); + + emit_block_config(subgraph, operation); + if 
(ethosu_is_u65(ethosu_screen(subgraph->base.context->screen)))
+      emit_shram_registers(subgraph, operation);
+   else
+      EMIT0(NPU_SET_ACC_FORMAT, 0x300); // FIXME should be based on # of MACs, only works for >=256 MACs
+}
+
+static void
+emit_convolution(struct ethosu_subgraph *subgraph, struct ethosu_operation *operation)
+{
+   ethosu_allocate_feature_map(subgraph, &operation->ifm);
+   operation->ifm.tiles.height_0 = operation->ifm.shape.height;
+   operation->ifm.tiles.height_1 = operation->ifm.shape.height;
+   operation->ifm.tiles.width_0 = operation->ifm.shape.width;
+
+   ethosu_allocate_feature_map(subgraph, &operation->ofm);
+   operation->ofm.tiles.height_0 = operation->ofm.shape.height;
+   operation->ofm.tiles.height_1 = operation->ofm.shape.height;
+   operation->ofm.tiles.width_0 = operation->ofm.shape.width;
+
+   emit_common(subgraph, operation, OP_NONE);
+}
+
+static unsigned
+quantise_pooling_scale(unsigned nr_kernel_elements, int rescale_bits, unsigned *out_shift)
+{
+   int k = 0;
+   long long N = 0;
+
+   frexp(nr_kernel_elements - 1, &k);
+   N = 31 - rescale_bits;
+   *out_shift = N + k;
+
+   return ((1LL << (N + k)) + (1LL << k)) / nr_kernel_elements;
+}
+
+static unsigned
+pooling_emit_ofm_scaling(
+   double input1_scale,
+   double output_scale,
+   unsigned kernel_height,
+   unsigned kernel_width,
+   uint32_t *out_shift)
+{
+   double rescale = input1_scale / output_scale;
+   int rescale_bits = 0;
+   unsigned scale;
+
+   if (kernel_height == 1 && kernel_width == 1) {
+      if (rescale > 1.0)
+         rescale_bits = 32 - __builtin_clz(ceil(rescale)) + 1;
+      else if (rescale < 1.0)
+         rescale_bits = -(32 - __builtin_clz(ceil(1 / rescale))) - 1;
+   }
+   scale = quantise_pooling_scale(kernel_height * kernel_width, rescale_bits, out_shift);
+   scale = ceil(scale * rescale);
+   return scale;
+}
+
+static void
+emit_pooling(struct ethosu_subgraph *subgraph, struct ethosu_operation *operation)
+{
+   unsigned scale;
+   unsigned scale_shift;
+
+   emit_common(subgraph, operation, OP_NONE);
+
+   if (operation->pooling.avg) {
+      scale = pooling_emit_ofm_scaling(
+         operation->ifm.scale,
+         operation->ofm.scale,
+         operation->kernel.height,
+         operation->kernel.width,
+         &scale_shift);
+
+      EMIT1(NPU_SET_OFM_SCALE, scale_shift, scale);
+   }
+}
+
+static void
+emit_ifm2(struct ethosu_subgraph *subgraph, struct ethosu_operation *operation, bool has_scalar)
+{
+   if (!has_scalar) {
+      EMIT0(NPU_SET_IFM2_REGION, IO_REGION);
+      emit_addresses(subgraph, &operation->ifm2, NPU_SET_IFM2_BASE0, NPU_SET_IFM2_BASE1, NPU_SET_IFM2_BASE2, NPU_SET_IFM2_BASE3);
+      emit_tiles(subgraph, &operation->ifm2, NPU_SET_IFM2_HEIGHT0_M1, NPU_SET_IFM2_HEIGHT1_M1, NPU_SET_IFM2_WIDTH0_M1);
+      emit_strides(subgraph, &operation->ifm2, NPU_SET_IFM2_STRIDE_C, NPU_SET_IFM2_STRIDE_Y, NPU_SET_IFM2_STRIDE_X);
+   }
+   EMIT0(NPU_SET_IFM2_ZERO_POINT, operation->ifm2.zero_point);
+}
+
+static void
+emit_ifm2_broadcast(struct ethosu_subgraph *subgraph, struct ethosu_operation *operation)
+{
+   unsigned ifm2_broadcast = 0;
+
+   EMIT0(NPU_SET_IFM2_BROADCAST, ifm2_broadcast);
+}
+
+/*
+def generate_scaling_for_elementwise(emit: CommandStreamEmitter, npu_op: NpuElementWiseOperation) -> int:
+    input_scale = npu_op.ifm.quantization.scale_f32 if npu_op.ifm.quantization else None
+    input2_scale = npu_op.ifm2.quantization.scale_f32 if npu_op.ifm2.quantization else None
+    output_scale = npu_op.ofm.quantization.scale_f32 if npu_op.ofm.quantization else None
+
+    if npu_op.activation is not None and npu_op.activation.op_type in (
+        NpuActivationOp.SIGMOID,
+        NpuActivationOp.TANH,
+    ):
+        output_scale = 1 /
0x3000 + + if npu_op.sub_op_type == NpuElementWiseOp.MUL: + if npu_op.rescale: + ofm_scale, shift = npu_op.rescale + elif None in (input_scale, input2_scale, output_scale): + ofm_scale = 1 + shift = 0 + else: + ofm_scale, shift = scaling.elementwise_mul_scale(input_scale, input2_scale, output_scale) + else: # Add/Sub + # Default operand scaling is no scaling + opa_scale = opb_scale = 1 + opa_shift = 0 + bitdepth = npu_op.ifm.data_type.size_in_bits() + use_advanced_scaling = False + if npu_op.rescale is not None: + # Explicit ofm scaling + ofm_scale, shift = npu_op.rescale + elif None in (input_scale, input2_scale, output_scale): + # No ofm scaling + ofm_scale = 1 + shift = 0 + elif input_scale == input2_scale and bitdepth == 16: + # int16 same scaling + opa_scale, opb_scale, ofm_scale, shift = scaling.simplified_elementwise_add_sub_scale( + input_scale, input2_scale, output_scale + ) + # align the double rounding with that of advanced scaling + opa_scale //= 2 + opb_scale //= 2 + shift -= 1 + opa_shift = 0 # Unused for this case + elif input_scale == input2_scale: + # Same scaling + opa_scale, opb_scale, ofm_scale, shift = scaling.simplified_elementwise_add_sub_scale( + input_scale, input2_scale, output_scale + ) + opa_shift = 0 # Unused for this case + # For 8 bit we can't guarantee double rounding with simplified scaling will always be + # the same as with advanced scaling due to different shifts. When the ofm scale fulfils + # the following we know that double rounding will have no effect for advanced scaling + # no matter the input, so we can safely use simplified scaling with double rounding disabled. + use_advanced_scaling = int(ofm_scale) & 0xFFF != 0 + else: + use_advanced_scaling = True + if use_advanced_scaling: + # Use advanced implementation only when input/output scales differ, + # or when we can't guarantee the absence of rounding errors + ( + opa_scale, + opa_shift, + ofm_scale, + shift, + op_to_scale, + ) = scaling.advanced_elementwise_add_sub_scale(input_scale, input2_scale, output_scale, bitdepth) + opb_scale = 0 # Unused for this case + if npu_op.reversed_operands: + # If the operand order is reversed we also have to swap which operand is scaled + if op_to_scale == scaling.OperandToScale.OPa: + op_to_scale = scaling.OperandToScale.OPb + else: + op_to_scale = scaling.OperandToScale.OPa + emit.cmd1_with_offset(cmd1.NPU_SET_OPA_SCALE, opa_scale, opa_shift) + emit.cmd1_with_offset(cmd1.NPU_SET_OPB_SCALE, opb_scale) +*/ + +static void +simplified_elementwise_add_sub_scale( + double input1_scale, + double input2_scale, + double output_scale, + uint32_t input_shift, + double *out_input1_rescale, + double *out_input2_rescale, + uint32_t *out_out_scale, + uint32_t *out_out_shift) +{ + double max_input_scale = MAX2(input1_scale, input2_scale); + double input_shift_val = (double)(1LL << input_shift); /* Use 1LL for large shifts */ + + *out_input1_rescale = input1_scale * input_shift_val / (2.0 * max_input_scale); + *out_input2_rescale = input2_scale * input_shift_val / (2.0 * max_input_scale); + + /* + * Be careful with division by zero or very small output_scale if output_scale + * can be zero or close to zero. 
+ */ + double output_rescale_val; + if (output_scale == 0.0) { + /* Handle error or return specific value */ + output_rescale_val = 0.0; /* Or INFINITY, depending on desired behavior */ + } else { + output_rescale_val = (2.0 * max_input_scale) / (output_scale * input_shift_val); + } + + *out_out_scale = ethosu_quantize_scale(output_rescale_val, out_out_shift); +} + +static enum ethosu_op_to_scale +eltwise_emit_ofm_scaling(struct ethosu_subgraph *subgraph, struct ethosu_operation *operation) +{ + double max_input_scale = MAX2(operation->ifm.scale, operation->ifm2.scale); + double min_input_scale = MIN2(operation->ifm.scale, operation->ifm2.scale); + unsigned bitdepth = 8; + uint32_t input_shift = (bitdepth == 8) ? 20 : 15; + double input1_rescale_tmp; + double input2_rescale_tmp; + unsigned ofm_scale, ofm_shift; + unsigned opa_scale, opa_shift; + + simplified_elementwise_add_sub_scale( + min_input_scale, max_input_scale, operation->ofm.scale, input_shift, + &input1_rescale_tmp, &input2_rescale_tmp, + &ofm_scale, &ofm_shift); + + opa_scale = ethosu_quantize_scale(input1_rescale_tmp, &opa_shift); + + EMIT1(NPU_SET_OPA_SCALE, opa_shift, opa_scale); + EMIT1(NPU_SET_OPB_SCALE, 0x0, 0x0); + EMIT1(NPU_SET_OFM_SCALE, ofm_shift, ofm_scale); + + if (operation->ifm.scale < operation->ifm2.scale) + return OP_A; + else + return OP_B; +} + +static void +emit_eltwise(struct ethosu_subgraph *subgraph, struct ethosu_operation *operation) +{ + bool has_scalar = false; + enum ethosu_op_to_scale op_to_scale = OP_NONE; + + op_to_scale = eltwise_emit_ofm_scaling(subgraph, operation); + + emit_common(subgraph, operation, op_to_scale); + + emit_ifm2(subgraph, operation, has_scalar); + emit_ifm_precision(subgraph, &operation->ifm2, OP_NONE, NPU_SET_IFM2_PRECISION); + emit_ifm2_broadcast(subgraph, operation); +} + +static void +emit_dma(struct ethosu_subgraph *subgraph, struct ethosu_operation *operation) +{ + EMIT0(NPU_SET_DMA0_SRC_REGION, COEFS_REGION); + EMIT1(NPU_SET_DMA0_SRC, 0x0, operation->dma.address); + EMIT0(NPU_SET_DMA0_DST_REGION, SCRATCH_REGION); + EMIT1(NPU_SET_DMA0_DST, 0x0, 0x0); + EMIT1(NPU_SET_DMA0_LEN, 0x0, operation->dma.size); +} + +static void +emit_operation_code(struct ethosu_subgraph *subgraph, struct ethosu_operation *operation) +{ + switch (operation->type) { + case ETHOSU_OPERATION_TYPE_CONVOLUTION: + + if (operation->conv.depthwise) + EMIT0(NPU_OP_DEPTHWISE, 0x0); + else + EMIT0(NPU_OP_CONV, 0x0); + + break; + case ETHOSU_OPERATION_TYPE_POOLING: + EMIT0(NPU_OP_POOL, operation->pooling.avg); + break; + case ETHOSU_OPERATION_TYPE_ELTWISE: + EMIT0(NPU_OP_ELEMENTWISE, 0x1); + break; + case ETHOSU_OPERATION_TYPE_DMA: + EMIT0(NPU_OP_DMA_START, 0x0); + break; + } +} + +static void +emit_cmd_waits(struct ethosu_subgraph *subgraph, int npu_waits, int dma_waits) +{ + if (npu_waits >= 0) + EMIT0(NPU_OP_KERNEL_WAIT, npu_waits); + + if (dma_waits >= 0) + EMIT0(NPU_OP_DMA_WAIT, dma_waits); +} + +static bool +ethosu_intersects_accesses(struct ethosu_address_range *a, struct ethosu_address_range *b) +{ + for (int i = 0; i < MAX_MEMORY_ACCESSES; i++) { + for (int j = 0; j < MAX_MEMORY_ACCESSES; j++) { + if (a[i].size == 0 || b[j].size == 0) + continue; + if (a[i].region != b[j].region) + continue; + if (a[i].address < b[j].address + b[j].size && + b[j].address < a[i].address + a[i].size) + return true; + } + } + + return false; +} + +static bool +ethosu_operations_conflict(struct ethosu_subgraph *subgraph, + struct ethosu_operation *op1, struct ethosu_operation *op2) +{ + /* True dependencies, or write 
-> read */ + if (ethosu_intersects_accesses(op1->write_accesses, op2->read_accesses)) + return true; + + /* Anti-dependencies, or read -> write */ + if (ethosu_intersects_accesses(op1->read_accesses, op2->write_accesses)) + return true; + + /* Output dependencies, or write -> write */ + if (ethosu_intersects_accesses(op1->write_accesses, op2->write_accesses)) + return true; + + /* read -> read does not cause a conflict */ + return false; +} + +static void +get_wait_dependency(struct ethosu_subgraph *subgraph, struct ethosu_operation *operation, + struct util_dynarray *outstanding_dma_ops, + struct util_dynarray *outstanding_npu_ops, + int *npu_waits, int *dma_waits) +{ + unsigned kern_wait = -1; + unsigned dma_wait = -1; + struct util_dynarray *outstanding_ops = NULL; + + if (operation->type == ETHOSU_OPERATION_TYPE_DMA) { + outstanding_ops = outstanding_npu_ops; + + util_dynarray_append(outstanding_dma_ops, struct ethosu_operation *, operation); + + unsigned dmap_ops = util_dynarray_num_elements(outstanding_dma_ops, struct ethosu_operation *); + if (dmap_ops > MAX_OUTSTANDING_DMA_OPS) + (void)util_dynarray_pop(outstanding_dma_ops, struct ethosu_operation *); + } else { + outstanding_ops = outstanding_dma_ops; + + util_dynarray_append(outstanding_npu_ops, struct ethosu_operation *, operation); + + unsigned npu_ops = util_dynarray_num_elements(outstanding_npu_ops, struct ethosu_operation *); + if (npu_ops > MAX_OUTSTANDING_NPU_OPS) + (void)util_dynarray_pop(outstanding_npu_ops, struct ethosu_operation *); + } + + unsigned waits = -1; + for (int idx = util_dynarray_num_elements(outstanding_ops, struct ethosu_operation *) - 1; idx >= 0; idx--) { + waits += 1; + struct ethosu_operation *other_op = *util_dynarray_element(outstanding_ops, struct ethosu_operation *, idx); + if (other_op == operation) + continue; + if (ethosu_operations_conflict(subgraph, other_op, operation)) { + if (operation->type == ETHOSU_OPERATION_TYPE_DMA) + kern_wait = waits; + else + dma_wait = waits; + // Current op needs to wait, and after it has waited, + // outstanding_ops[0..idx] are not outstanding any longer. 
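+            // The wait count is the number of more recent ops on the other
+            // queue that may still be in flight once the wait completes.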
+ for (int i = 0; i <= idx; i++) + (void)util_dynarray_pop(outstanding_ops, struct ethosu_operation *); + break; + } + } + + *npu_waits = kern_wait; + *dma_waits = dma_wait; +} + +static void +fill_memory_accesses(struct ethosu_subgraph *subgraph) +{ + util_dynarray_foreach (&subgraph->operations, struct ethosu_operation, operation) { + switch (operation->type) { + case ETHOSU_OPERATION_TYPE_DMA: + operation->read_accesses[0].region = COEFS_REGION; + operation->read_accesses[0].address = operation->dma.address; + operation->read_accesses[0].size = operation->dma.size; + + operation->write_accesses[0].region = SCRATCH_REGION; + operation->write_accesses[0].address = 0x0; + operation->write_accesses[0].size = operation->dma.size; + + break; + default: + operation->read_accesses[0].region = IO_REGION; + operation->read_accesses[0].address = operation->ifm.tiles.addresses[0]; + operation->read_accesses[0].size = operation->ifm.shape.height * operation->ifm.shape.width * operation->ifm.shape.depth; + + operation->read_accesses[1].region = IO_REGION; + operation->read_accesses[1].address = operation->ifm2.tiles.addresses[0]; + operation->read_accesses[1].size = operation->ifm2.shape.height * operation->ifm2.shape.width * operation->ifm2.shape.depth; + + operation->read_accesses[2].region = operation->conv.scales.region; + operation->read_accesses[2].address = operation->conv.scales.address; + operation->read_accesses[2].size = operation->conv.scales.size; + + operation->read_accesses[3].region = operation->conv.weights.region; + operation->read_accesses[3].address = operation->conv.weights.address; + operation->read_accesses[3].size = operation->conv.weights.size; + + operation->write_accesses[0].region = IO_REGION; + operation->write_accesses[0].address = operation->ofm.tiles.addresses[0]; + operation->write_accesses[0].size = operation->ofm.shape.height * operation->ofm.shape.width * operation->ofm.shape.depth; + break; + } + } +} + +static unsigned +calc_blockdep(struct ethosu_subgraph *subgraph, struct ethosu_operation *prev_op, struct ethosu_operation *operation) +{ + if (!prev_op) + return 0; + + // Check if the reserved shram will be used in current/prev op + bool prev_uses_lut = false; // prev_op->activation && prev_op->activation->op_type == NpuActivationOp.TABLE_LOOKUP; + bool curr_uses_lut = false; // operation->activation && operation->activation->op_type == NpuActivationOp.TABLE_LOOKUP; + if (prev_uses_lut && SHRAM_RESERVED_UNUSED_BANKS == 0 && !curr_uses_lut) + return 0; + + return MAX_BLOCKDEP; /* TODO: Check if there is actually overlap between the FMs */ +} + +void +ethosu_emit_cmdstream(struct ethosu_subgraph *subgraph) +{ + struct ethosu_operation *prev_op = NULL; + struct util_dynarray outstanding_dma_ops; + struct util_dynarray outstanding_npu_ops; + + util_dynarray_init(&outstanding_dma_ops, NULL); + util_dynarray_init(&outstanding_npu_ops, NULL); + + subgraph->cmdstream_used = 32; + subgraph->cmdstream = calloc(subgraph->cmdstream_used, sizeof(*subgraph->cmdstream)); + subgraph->cursor = subgraph->cmdstream; + + fill_memory_accesses(subgraph); + + /* Compile */ + + if (ethosu_is_u65(ethosu_screen(subgraph->base.context->screen))) + EMIT0(NPU_SET_PARALLEL_MODE, 0x0); + + util_dynarray_foreach (&subgraph->operations, struct ethosu_operation, operation) { + + int npu_waits, dma_waits; + + get_wait_dependency(subgraph, operation, &outstanding_dma_ops, &outstanding_npu_ops, + &npu_waits, &dma_waits); + + switch (operation->type) { + case ETHOSU_OPERATION_TYPE_CONVOLUTION: + 
emit_convolution(subgraph, operation); + break; + case ETHOSU_OPERATION_TYPE_POOLING: + emit_pooling(subgraph, operation); + break; + case ETHOSU_OPERATION_TYPE_ELTWISE: + emit_eltwise(subgraph, operation); + break; + case ETHOSU_OPERATION_TYPE_DMA: + emit_dma(subgraph, operation); + break; + } + + if (operation->type != ETHOSU_OPERATION_TYPE_DMA) { + unsigned blockdep = calc_blockdep(subgraph, prev_op, operation); + blockdep = MIN2(blockdep, MAX_BLOCKDEP); + EMIT0(NPU_SET_BLOCKDEP, blockdep); + + prev_op = operation; + } + + emit_cmd_waits(subgraph, npu_waits, dma_waits); + emit_operation_code(subgraph, operation); + } + + EMIT0(NPU_OP_STOP, 0xffff); + + util_dynarray_fini(&outstanding_dma_ops); + util_dynarray_fini(&outstanding_npu_ops); +} diff --git a/src/gallium/drivers/ethosu/ethosu_cmd.h b/src/gallium/drivers/ethosu/ethosu_cmd.h new file mode 100644 index 00000000000..372391eac69 --- /dev/null +++ b/src/gallium/drivers/ethosu/ethosu_cmd.h @@ -0,0 +1,13 @@ +/* + * Copyright (c) 2024 Tomeu Vizoso + * SPDX-License-Identifier: MIT + */ + +#ifndef ETHOSU_CMD_H +#define ETHOSU_CMD_H + +#include "ethosu_ml.h" + +void ethosu_emit_cmdstream(struct ethosu_subgraph *subgraph); + +#endif /* ETHOSU_CMD_H */ diff --git a/src/gallium/drivers/ethosu/ethosu_coefs.c b/src/gallium/drivers/ethosu/ethosu_coefs.c new file mode 100644 index 00000000000..a46cc3370cd --- /dev/null +++ b/src/gallium/drivers/ethosu/ethosu_coefs.c @@ -0,0 +1,133 @@ +/* + * Copyright (c) 2024 Tomeu Vizoso + * SPDX-License-Identifier: MIT + */ + +#include "util/u_inlines.h" + +#include "mlw_codec/mlw_encode.h" +#include "ethosu_coefs.h" + +static void +fill_scale_and_biases(struct ethosu_subgraph *subgraph, struct ethosu_operation *operation, uint8_t **scales, long *scales_size, struct pipe_resource *bias_rsrc) +{ + struct pipe_transfer *transfer_in; + int32_t *biases = pipe_buffer_map(subgraph->base.context, bias_rsrc, + PIPE_MAP_READ, &transfer_in); + unsigned idx = 0; + + *scales_size = ALIGN(operation->ofm.shape.depth * 10, 16); + *scales = malloc(*scales_size); + memset(*scales, 0, *scales_size); + + for (unsigned i = 0; i < operation->ofm.shape.depth; i++) { + uint64_t bias = biases[i]; + double conv_scale = ((double)operation->ifm.scale * (double)operation->kernel.scale) / (double)operation->ofm.scale; + uint32_t shift; + int scale = ethosu_quantize_scale(conv_scale, &shift); + + (*scales)[idx++] = (bias >> (0 * 8)) & 0xFF; + (*scales)[idx++] = (bias >> (1 * 8)) & 0xFF; + (*scales)[idx++] = (bias >> (2 * 8)) & 0xFF; + (*scales)[idx++] = (bias >> (3 * 8)) & 0xFF; + (*scales)[idx++] = (bias >> (4 * 8)) & 0xFF; + + (*scales)[idx++] = (scale >> (0 * 8)) & 0xFF; + (*scales)[idx++] = (scale >> (1 * 8)) & 0xFF; + (*scales)[idx++] = (scale >> (2 * 8)) & 0xFF; + (*scales)[idx++] = (scale >> (3 * 8)) & 0xFF; + + (*scales)[idx++] = shift & 0x3F; + } + + pipe_buffer_unmap(subgraph->base.context, transfer_in); +} + +static void +calculate_weights_strides(struct ethosu_operation *operation, int out_strides[4]) +{ + if (operation->kernel.depthwise) { + out_strides[0] = 1; + out_strides[1] = operation->ofm.shape.depth * operation->kernel.height; + out_strides[2] = operation->ofm.shape.depth; + out_strides[3] = operation->ofm.shape.depth * operation->kernel.width; + } else { + out_strides[3] = 1; + out_strides[2] = out_strides[3] * operation->ifm.shape.depth; + out_strides[1] = out_strides[2] * operation->kernel.width; + out_strides[0] = out_strides[1] * operation->kernel.height; + } +} + +static void +fill_weights(struct ethosu_subgraph 
*subgraph, struct ethosu_operation *operation, uint8_t **weights, long *weights_size, struct pipe_resource *weight_rsrc)
+{
+   int brick_strides[4] = {0};
+   unsigned input_channels = operation->ifm.shape.depth;
+
+   if (operation->kernel.depthwise)
+      input_channels = 1;
+
+   calculate_weights_strides(operation, brick_strides);
+
+   struct pipe_transfer *transfer_in;
+   uint8_t *input_weights_8 = pipe_buffer_map(subgraph->base.context, weight_rsrc,
+                                              PIPE_MAP_READ, &transfer_in);
+   int16_t *input_weights = malloc(pipe_buffer_size(weight_rsrc) * sizeof(*input_weights));
+   for (int i = 0; i < pipe_buffer_size(weight_rsrc); i++) {
+      if (operation->kernel.is_signed)
+         input_weights[i] = (int8_t)input_weights_8[i] - operation->kernel.zero_point;
+      else
+         input_weights[i] = input_weights_8[i] - operation->kernel.zero_point;
+   }
+   pipe_buffer_unmap(subgraph->base.context, transfer_in);
+
+   long padded_size = 0;
+   *weights_size = mlw_reorder_encode(
+      IFM_UBLOCK.depth,
+      OFM_UBLOCK.depth,
+      operation->ofm.shape.depth,
+      operation->kernel.height,
+      operation->kernel.width,
+      input_channels,
+      brick_strides,
+      input_weights,
+      operation->block_config.ofm_block.depth,
+      operation->kernel.depthwise,
+      operation->conv.part_kernel_first,
+      8 /* ifm_bitdepth */,
+      8 /* decomp_h */,
+      8 /* decomp_w */,
+      weights,
+      &padded_size,
+      DBG_ENABLED(ETHOSU_DBG_MSGS));
+
+   free(input_weights);
+}
+
+void
+fill_coefs(struct ethosu_subgraph *subgraph,
+           struct ethosu_operation *operation,
+           struct pipe_resource *bias_rsrc,
+           struct pipe_resource *weight_rsrc)
+{
+   uint8_t *scales = NULL;
+   fill_scale_and_biases(subgraph, operation, &scales, &operation->conv.scales.size, bias_rsrc);
+
+   operation->conv.scales.region = COEFS_REGION;
+   operation->conv.scales.address = subgraph->coefs_used;
+   subgraph->coefs_used += ALIGN_POT(operation->conv.scales.size, 16);
+   subgraph->coefs = realloc(subgraph->coefs, subgraph->coefs_used);
+   memcpy(subgraph->coefs + operation->conv.scales.address, scales, operation->conv.scales.size);
+   free(scales);
+
+   uint8_t *weights = NULL;
+   fill_weights(subgraph, operation, &weights, &operation->conv.weights.size, weight_rsrc);
+
+   operation->conv.weights.region = COEFS_REGION;
+   operation->conv.weights.address = subgraph->coefs_used;
+   subgraph->coefs_used += ALIGN_POT(operation->conv.weights.size, 16);
+   subgraph->coefs = realloc(subgraph->coefs, subgraph->coefs_used);
+   memcpy(subgraph->coefs + operation->conv.weights.address, weights, operation->conv.weights.size);
+   free(weights);
+}
diff --git a/src/gallium/drivers/ethosu/ethosu_coefs.h b/src/gallium/drivers/ethosu/ethosu_coefs.h
new file mode 100644
index 00000000000..7b63f11de2b
--- /dev/null
+++ b/src/gallium/drivers/ethosu/ethosu_coefs.h
@@ -0,0 +1,17 @@
+/*
+ * Copyright (c) 2024 Tomeu Vizoso
+ * SPDX-License-Identifier: MIT
+ */
+
+#ifndef ETHOSU_COEFS_H
+#define ETHOSU_COEFS_H
+
+#include "ethosu_ml.h"
+
+void
+fill_coefs(struct ethosu_subgraph *subgraph,
+           struct ethosu_operation *operation,
+           struct pipe_resource *bias_rsrc,
+           struct pipe_resource *weight_rsrc);
+
+#endif /* ETHOSU_COEFS_H */
diff --git a/src/gallium/drivers/ethosu/ethosu_device.c b/src/gallium/drivers/ethosu/ethosu_device.c
new file mode 100644
index 00000000000..392717f85a3
--- /dev/null
+++ b/src/gallium/drivers/ethosu/ethosu_device.c
@@ -0,0 +1,243 @@
+/*
+ * Copyright (c) 2024 Tomeu Vizoso
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "ethosu_device.h"
+#include "ethosu_ml.h"
+
+#include "drm-uapi/ethosu_accel.h"
+
+#include <xf86drm.h>
+#include "util/os_mman.h"
+#include "util/u_inlines.h" +#include "util/u_surface.h" +#include "util/u_transfer.h" + +static const struct debug_named_value ethosu_debug_options[] = { + {"dbg_msgs", ETHOSU_DBG_MSGS, "Print debug messages"}, + {"dump_bos", ETHOSU_DBG_DUMP_BOS, "Dump buffers for analysis"}, + {"zero_bos", ETHOSU_DBG_ZERO, "Zero buffers for debugging"}, + {"disable_nhcwb16", ETHOSU_DBG_DISABLE_NHCWB16, "Disable NHCWB16"}, + {"disable_sram", ETHOSU_DBG_DISABLE_SRAM, "Disable SRAM"}, + DEBUG_NAMED_VALUE_END}; + +DEBUG_GET_ONCE_FLAGS_OPTION(ethosu_debug, "ETHOSU_DEBUG", ethosu_debug_options, 0) +int ethosu_debug = 0; + +static void +ethosu_destroy_screen(struct pipe_screen *pscreen) +{ + struct ethosu_screen *screen = ethosu_screen(pscreen); + + ralloc_free(screen); +} + +static void +ethosu_destroy_context(struct pipe_context *pctx) +{ + struct ethosu_context *ctx = ethosu_context(pctx); + + ralloc_free(ctx); +} + +static void * +ethosu_buffer_map(struct pipe_context *pctx, + struct pipe_resource *prsc, unsigned level, + unsigned usage, const struct pipe_box *box, + struct pipe_transfer **out_transfer) +{ + struct ethosu_screen *screen = ethosu_screen(pctx->screen); + struct ethosu_resource *rsc = ethosu_resource(prsc); + struct drm_ethosu_bo_wait bo_wait = {0}; + struct drm_ethosu_bo_mmap_offset bo_mmap_offset = {0}; + int ret; + + assert(level == 0); + assert(prsc->target == PIPE_BUFFER); + assert(box->y == 0); + assert(box->z == 0); + assert(box->height == 1); + assert(box->depth == 1); + + struct pipe_transfer *transfer = rzalloc(NULL, struct pipe_transfer); + transfer->level = level; + transfer->usage = usage; + transfer->box = *box; + + pipe_resource_reference(&transfer->resource, prsc); + + bo_wait.handle = rsc->handle; + bo_wait.timeout_ns = INT64_MAX; + + ret = drmIoctl(screen->fd, DRM_IOCTL_ETHOSU_BO_WAIT, &bo_wait); + if (ret == -1) + goto free_transfer; + + bo_mmap_offset.handle = rsc->handle; + ret = drmIoctl(screen->fd, DRM_IOCTL_ETHOSU_BO_MMAP_OFFSET, &bo_mmap_offset); + if (ret == -1) + goto free_transfer; + + uint8_t *map = os_mmap(NULL, prsc->width0, PROT_READ | PROT_WRITE, MAP_SHARED, + screen->fd, bo_mmap_offset.offset); + assert(map != MAP_FAILED); + if (map == MAP_FAILED) + goto free_transfer; + + *out_transfer = transfer; + + return map + box->x; + +free_transfer: + pipe_resource_reference(&transfer->resource, NULL); + ralloc_free(transfer); + return NULL; +} + +static void +ethosu_buffer_unmap(struct pipe_context *pctx, + struct pipe_transfer *transfer) +{ + pipe_resource_reference(&transfer->resource, NULL); + ralloc_free(transfer); +} + +static struct pipe_context * +ethosu_create_context(struct pipe_screen *screen, + void *priv, unsigned flags) +{ + struct ethosu_context *ctx = rzalloc(NULL, struct ethosu_context); + struct pipe_context *pctx = &ctx->base; + + if (!ctx) + return NULL; + + pctx->screen = screen; + pctx->priv = priv; + + pctx->destroy = ethosu_destroy_context; + + pctx->buffer_map = ethosu_buffer_map; + pctx->buffer_unmap = ethosu_buffer_unmap; + pctx->resource_copy_region = util_resource_copy_region; + pctx->buffer_subdata = u_default_buffer_subdata; + pctx->clear_buffer = u_default_clear_buffer; + + pctx->ml_operation_supported = ethosu_ml_operation_supported; + pctx->ml_subgraph_create = ethosu_ml_subgraph_create; + pctx->ml_subgraph_invoke = ethosu_ml_subgraph_invoke; + pctx->ml_subgraph_read_output = ethosu_ml_subgraph_read_outputs; + pctx->ml_subgraph_destroy = ethosu_ml_subgraph_destroy; + + return pctx; +} + +static struct pipe_resource * 
+ethosu_resource_create(struct pipe_screen *pscreen,
+                       const struct pipe_resource *templat)
+{
+   struct ethosu_screen *screen = ethosu_screen(pscreen);
+   struct drm_ethosu_bo_create arg = {0};
+   struct ethosu_resource *rsc;
+   int ret;
+
+   assert(templat->target == PIPE_BUFFER);
+   assert(templat->height0 == 1);
+   assert(templat->depth0 == 1);
+   assert(templat->array_size == 1);
+
+   rsc = rzalloc(NULL, struct ethosu_resource);
+   if (!rsc)
+      return NULL;
+
+   rsc->base = *templat;
+   rsc->base.screen = pscreen;
+   rsc->base.nr_samples = templat->nr_samples;
+   pipe_reference_init(&rsc->base.reference, 1);
+
+   rsc->bo_size = templat->width0;
+
+   arg.size = templat->width0;
+
+   ret = drmIoctl(screen->fd, DRM_IOCTL_ETHOSU_BO_CREATE, &arg);
+   if (ret < 0)
+      goto free_rsc;
+
+   rsc->handle = arg.handle;
+
+   return &rsc->base;
+
+free_rsc:
+   ralloc_free(rsc);
+   return NULL;
+}
+
+static void
+ethosu_resource_destroy(struct pipe_screen *pscreen,
+                        struct pipe_resource *prsc)
+{
+   struct ethosu_resource *rsc = ethosu_resource(prsc);
+   struct ethosu_screen *screen = ethosu_screen(pscreen);
+   struct drm_gem_close arg = {0};
+   ASSERTED int ret;
+
+   arg.handle = rsc->handle;
+
+   ret = drmIoctl(screen->fd, DRM_IOCTL_GEM_CLOSE, &arg);
+   assert(ret >= 0);
+
+   ralloc_free(rsc);
+}
+
+static int
+ethosu_screen_get_fd(struct pipe_screen *pscreen)
+{
+   return ethosu_screen(pscreen)->fd;
+}
+
+static void
+dev_query(struct ethosu_screen *screen)
+{
+   ASSERTED int ret;
+   struct drm_ethosu_npu_info *info = &screen->info;
+   struct drm_ethosu_dev_query dev_query = {
+      .type = DRM_ETHOSU_DEV_QUERY_NPU_INFO,
+      .size = sizeof(*info),
+      .pointer = (uintptr_t)info,
+   };
+
+   ret = drmIoctl(screen->fd, DRM_IOCTL_ETHOSU_DEV_QUERY, &dev_query);
+   assert(ret != -1);
+}
+
+struct pipe_screen *
+ethosu_screen_create(int fd,
+                     const struct pipe_screen_config *config,
+                     struct renderonly *ro)
+{
+   struct ethosu_screen *ethosu_screen;
+   struct pipe_screen *screen;
+
+   ethosu_screen = rzalloc(NULL, struct ethosu_screen);
+   if (!ethosu_screen)
+      return NULL;
+
+   screen = &ethosu_screen->pscreen;
+
+   ethosu_debug = debug_get_option_ethosu_debug();
+
+   ethosu_screen->fd = fd;
+   dev_query(ethosu_screen);
+
+   if (DBG_ENABLED(ETHOSU_DBG_DISABLE_SRAM))
+      ethosu_screen->info.sram_size = 0;
+
+   screen->get_screen_fd = ethosu_screen_get_fd;
+   screen->destroy = ethosu_destroy_screen;
+   screen->context_create = ethosu_create_context;
+   screen->resource_create = ethosu_resource_create;
+   screen->resource_destroy = ethosu_resource_destroy;
+
+   return screen;
+}
\ No newline at end of file
diff --git a/src/gallium/drivers/ethosu/ethosu_device.h b/src/gallium/drivers/ethosu/ethosu_device.h
new file mode 100644
index 00000000000..b121661baad
--- /dev/null
+++ b/src/gallium/drivers/ethosu/ethosu_device.h
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 2024 Tomeu Vizoso
+ * SPDX-License-Identifier: MIT
+ */
+
+#ifndef ETHOSU_SCREEN_H
+#define ETHOSU_SCREEN_H
+
+#include "pipe/p_context.h"
+#include "pipe/p_screen.h"
+#include "pipe/p_state.h"
+#include "renderonly/renderonly.h"
+#include "util/log.h"
+
+#include "drm-uapi/ethosu_accel.h"
+
+enum ethosu_dbg {
+   ETHOSU_DBG_MSGS = BITFIELD_BIT(0),
+   ETHOSU_DBG_DUMP_BOS = BITFIELD_BIT(1),
+   ETHOSU_DBG_ZERO = BITFIELD_BIT(2),
+   ETHOSU_DBG_DISABLE_NHCWB16 = BITFIELD_BIT(3),
+   ETHOSU_DBG_DISABLE_SRAM = BITFIELD_BIT(4),
+};
+
+extern int ethosu_debug;
+
+#define DBG_ENABLED(flag) unlikely(ethosu_debug & (flag))
+
+#define DBG(fmt, ...) \
+   do {                                                   \
+      if (DBG_ENABLED(ETHOSU_DBG_MSGS))                   \
+         mesa_logd("%s:%d: " fmt, __func__, __LINE__,     \
+                   ##__VA_ARGS__);                        \
+   } while (0)
+
+struct ethosu_screen {
+   struct pipe_screen pscreen;
+
+   int fd;
+   struct drm_ethosu_npu_info info;
+};
+
+static inline struct ethosu_screen *
+ethosu_screen(struct pipe_screen *p)
+{
+   return (struct ethosu_screen *)p;
+}
+
+static inline bool
+ethosu_is_u65(struct ethosu_screen *e)
+{
+   return DRM_ETHOSU_ARCH_MAJOR(e->info.id) == 1;
+}
+
+struct ethosu_context {
+   struct pipe_context base;
+};
+
+static inline struct ethosu_context *
+ethosu_context(struct pipe_context *pctx)
+{
+   return (struct ethosu_context *)pctx;
+}
+
+struct ethosu_resource {
+   struct pipe_resource base;
+
+   uint32_t handle;
+   uint64_t phys_addr;
+   uint64_t obj_addr;
+   uint64_t bo_size;
+};
+
+static inline struct ethosu_resource *
+ethosu_resource(struct pipe_resource *p)
+{
+   return (struct ethosu_resource *)p;
+}
+
+struct pipe_screen *ethosu_screen_create(int fd,
+                                         const struct pipe_screen_config *config,
+                                         struct renderonly *ro);
+
+#endif /* ETHOSU_SCREEN_H */
diff --git a/src/gallium/drivers/ethosu/ethosu_lower.c b/src/gallium/drivers/ethosu/ethosu_lower.c
new file mode 100644
index 00000000000..c452ab21a7e
--- /dev/null
+++ b/src/gallium/drivers/ethosu/ethosu_lower.c
@@ -0,0 +1,477 @@
+/*
+ * Copyright (c) 2024 Tomeu Vizoso
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "ethosu_lower.h"
+#include "ethosu_coefs.h"
+#include "ethosu_sched.h"
+
+static bool
+is_depthwise(const struct pipe_ml_operation *poperation)
+{
+   unsigned input_channels = poperation->input_tensors[0]->dims[3];
+   unsigned output_channels = poperation->output_tensors[0]->dims[3];
+
+   return poperation->conv.depthwise && input_channels > 1 &&
+          output_channels > 1;
+}
+
+static unsigned
+needed_total_padding(unsigned input_size, unsigned stride, unsigned filter_size)
+{
+   if (input_size % stride == 0)
+      return MAX2((int)filter_size - (int)stride, 0);
+
+   return MAX2((int)filter_size - (int)(input_size % stride), 0);
+}
+
+static bool
+ethosu_is_part_kernel_first(struct ethosu_operation *operation)
+{
+   if (operation->type != ETHOSU_OPERATION_TYPE_CONVOLUTION)
+      return false;
+
+   if (operation->kernel.depthwise)
+      return false;
+
+   // Determine which block traversal strategy has better DPU utilization
+   unsigned kernel_size = operation->kernel.height * operation->kernel.width;
+   unsigned depth = operation->ifm.shape.depth;
+   float depth_utilization = (float)depth / ethosu_round_up_to_multiple(depth, 32);
+   float part_kernel_utilization = ((float)depth / ethosu_round_up_to_multiple(depth, 8));
+   part_kernel_utilization *= (float)kernel_size / ethosu_round_up_to_multiple(kernel_size, 4);
+
+   // Part-kernel first is always better for ifm depths <= 8
+   if (part_kernel_utilization >= depth_utilization || depth <= 8)
+      return true;
+
+   return false;
+}
+
+static void
+set_feature_maps(struct pipe_tensor *input_tensor,
+                 struct pipe_tensor *output_tensor,
+                 struct ethosu_operation *operation)
+{
+   operation->ifm.tensor_idx = input_tensor->index;
+   operation->ifm.shape.height = input_tensor->dims[1];
+   operation->ifm.shape.width = input_tensor->dims[2];
+   operation->ifm.shape.depth = input_tensor->dims[3];
+   operation->ifm.zero_point = input_tensor->zero_point;
+   operation->ifm.scale = input_tensor->scale;
+   operation->ifm.is_signed = input_tensor->is_signed;
+
+   operation->ofm.tensor_idx = output_tensor->index;
+   operation->ofm.shape.height = output_tensor->dims[1];
+   operation->ofm.shape.width = output_tensor->dims[2];
+  
operation->ofm.shape.depth = output_tensor->dims[3]; + operation->ofm.zero_point = output_tensor->zero_point; + operation->ofm.scale = output_tensor->scale; + operation->ofm.is_signed = output_tensor->is_signed; +} + +static const struct pipe_ml_operation * +ethosu_find_first_consumer(const struct pipe_ml_operation *poperations, + unsigned count, + unsigned tensor_index) +{ + for (unsigned i = 0; i < count; i++) { + const struct pipe_ml_operation *poperation = &poperations[i]; + for (unsigned j = 0; j < poperation->input_count; j++) + if (poperation->input_tensors[j]->index == tensor_index) + return poperation; + } + + return NULL; +} + +static void +allocate_feature_maps(struct ethosu_subgraph *subgraph, struct ethosu_operation *operation) +{ + ethosu_allocate_feature_map(subgraph, &operation->ifm); + operation->ifm.tiles.height_0 = operation->ifm.shape.height; + operation->ifm.tiles.height_1 = operation->ifm.shape.height; + operation->ifm.tiles.width_0 = operation->ifm.shape.width; + + ethosu_allocate_feature_map(subgraph, &operation->ofm); + operation->ofm.tiles.height_0 = operation->ofm.shape.height; + operation->ofm.tiles.height_1 = operation->ofm.shape.height; + operation->ofm.tiles.width_0 = operation->ofm.shape.width; +} + +static const struct pipe_ml_operation * +ethosu_find_first_producer(const struct pipe_ml_operation *poperations, unsigned count, + unsigned tensor_index) +{ + for (unsigned i = 0; i < count; i++) { + const struct pipe_ml_operation *poperation = &poperations[i]; + + for (unsigned j = 0; j < poperation->output_count; j++) { + if (poperation->output_tensors[j]->index == tensor_index) + return poperation; + } + } + + return NULL; +} + +static void +ethosu_lower_convolution(struct ethosu_subgraph *subgraph, + const struct pipe_ml_operation *poperation, + struct pipe_tensor *input_tensor, + struct ethosu_operation *operation) +{ + operation->type = ETHOSU_OPERATION_TYPE_CONVOLUTION; + + operation->conv.depthwise = is_depthwise(poperation); + // operation->padding_same = poperation->conv.padding_same; + // operation->stride = poperation->conv.stride_x; + + set_feature_maps(input_tensor, poperation->output_tensors[0], operation); + + operation->kernel.height = poperation->conv.weight_tensor->dims[1]; + operation->kernel.width = poperation->conv.weight_tensor->dims[2]; + operation->kernel.stride_y = poperation->conv.stride_y; + operation->kernel.stride_x = poperation->conv.stride_x; + operation->kernel.dilation_y = 1; + operation->kernel.dilation_x = 1; + operation->kernel.depthwise = is_depthwise(poperation); + operation->kernel.scale = poperation->conv.weight_tensor->scale; + operation->kernel.zero_point = poperation->conv.weight_tensor->zero_point; + operation->kernel.is_signed = poperation->conv.weight_tensor->is_signed; + + operation->conv.part_kernel_first = ethosu_is_part_kernel_first(operation); + + if (poperation->conv.padding_same) { + unsigned vert = needed_total_padding(input_tensor->dims[1], poperation->conv.stride_y, poperation->conv.weight_tensor->dims[1]); + unsigned horiz = needed_total_padding(input_tensor->dims[2], poperation->conv.stride_x, poperation->conv.weight_tensor->dims[2]); + + operation->pad.top = vert / 2; + operation->pad.left = horiz / 2; + operation->pad.bottom = (vert + 1) / 2; + operation->pad.right = (horiz + 1) / 2; + } else { + operation->pad.top = 0; + operation->pad.left = 0; + operation->pad.bottom = 0; + operation->pad.right = 0; + } + + allocate_feature_maps(subgraph, operation); + + ethosu_sched_operation(subgraph, operation); 
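+
+   /* A worked example of the SAME-padding split above (illustrative
+    * numbers, not from the original patch): a 224x224 IFM with stride 2
+    * and a 3x3 kernel needs needed_total_padding(224, 2, 3) =
+    * MAX2(3 - 2, 0) = 1 extra row/column in total, and the odd pixel
+    * lands on the bottom/right: pad.top = 0, pad.bottom = 1.  The block
+    * config chosen by ethosu_sched_operation() above and the
+    * coefficients encoded by fill_coefs() below both depend on this
+    * final geometry.
+    */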
+ fill_coefs(subgraph, operation, poperation->conv.bias_tensor->resource, poperation->conv.weight_tensor->resource); +} + +static void +ethosu_lower_pooling(struct ethosu_subgraph *subgraph, + const struct pipe_ml_operation *poperation, + struct ethosu_operation *operation) +{ + operation->type = ETHOSU_OPERATION_TYPE_POOLING; + operation->pooling.avg = poperation->pooling.type == PIPE_ML_POOLING_TYPE_AVG; + + set_feature_maps(poperation->input_tensors[0], poperation->output_tensors[0], operation); + + operation->kernel.height = poperation->pooling.filter_height; + operation->kernel.width = poperation->pooling.filter_width; + operation->kernel.stride_y = poperation->pooling.stride_y; + operation->kernel.stride_x = poperation->pooling.stride_x; + operation->kernel.dilation_y = 1; + operation->kernel.dilation_x = 1; + + if (poperation->pooling.padding_same) { + unsigned vert = needed_total_padding(operation->ifm.shape.height, poperation->pooling.stride_y, poperation->pooling.filter_height); + unsigned horiz = needed_total_padding(operation->ifm.shape.width, poperation->pooling.stride_x, poperation->pooling.filter_width); + + operation->pad.top = vert / 2; + operation->pad.left = horiz / 2; + operation->pad.bottom = (vert + 1) / 2; + operation->pad.right = (horiz + 1) / 2; + } else { + operation->pad.top = 0; + operation->pad.left = 0; + operation->pad.bottom = 0; + operation->pad.right = 0; + } + + allocate_feature_maps(subgraph, operation); + ethosu_sched_operation(subgraph, operation); +} + +static void +ethosu_lower_concatenation(struct ethosu_subgraph *subgraph, + const struct pipe_ml_operation *poperation, + unsigned input_idx, + struct ethosu_operation *operation) +{ + operation->type = ETHOSU_OPERATION_TYPE_POOLING; + operation->pooling.avg = true; + + set_feature_maps(poperation->input_tensors[input_idx], poperation->output_tensors[0], operation); + operation->ofm.shape.depth = operation->ifm.shape.depth; + + operation->round_mode = ETHOSU_ROUNDING_NATURAL; + + operation->kernel.height = 1; + operation->kernel.width = 1; + operation->kernel.stride_y = 1; + operation->kernel.stride_x = 1; + operation->kernel.dilation_y = 1; + operation->kernel.dilation_x = 1; + + allocate_feature_maps(subgraph, operation); + for (unsigned i = 0; i < input_idx; i++) { + struct ethosu_tensor *tensor = ethosu_find_tensor(subgraph, operation->ofm.tensor_idx); + + if (tensor->layout == ETHOSU_LAYOUT_NHWC) + operation->ofm.tiles.addresses[0] += poperation->input_tensors[i]->dims[3]; + else if (tensor->layout == ETHOSU_LAYOUT_NHCWB16) + operation->ofm.tiles.addresses[0] += poperation->input_tensors[i]->dims[2] * ALIGN(poperation->input_tensors[i]->dims[3], 16); + else + assert(0 && "Unsupported layout"); + } + + ethosu_sched_operation(subgraph, operation); +} + +static void +ethosu_lower_resize(struct ethosu_subgraph *subgraph, + const struct pipe_ml_operation *poperation, + struct ethosu_operation *operation) +{ + operation->type = ETHOSU_OPERATION_TYPE_POOLING; + operation->pooling.avg = true; + + set_feature_maps(poperation->input_tensors[0], poperation->output_tensors[0], operation); + operation->ifm.zero_point = 0; + operation->ofm.zero_point = 0; + + operation->kernel.height = 1; + operation->kernel.width = 1; + operation->kernel.stride_y = 1; + operation->kernel.stride_x = 1; + operation->kernel.dilation_y = 1; + operation->kernel.dilation_x = 1; + + operation->upscale = true; + + allocate_feature_maps(subgraph, operation); + ethosu_sched_operation(subgraph, operation); +} + +static void 
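+/* Lowered as a 1x1 average pool that just re-bases the IFM: the slice's
+ * begin[] coordinates are folded into a single element offset which is
+ * added to the IFM base address (the stride math below assumes an NHWC
+ * layout).
+ */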
+ethosu_lower_strided_slice(struct ethosu_subgraph *subgraph, + const struct pipe_ml_operation *poperation, + struct ethosu_operation *operation) +{ + operation->type = ETHOSU_OPERATION_TYPE_POOLING; + operation->pooling.avg = true; + + set_feature_maps(poperation->input_tensors[0], poperation->output_tensors[0], operation); + operation->ifm.shape = operation->ofm.shape; + operation->ifm.zero_point = 0; + operation->ofm.zero_point = 0; + + operation->kernel.height = 1; + operation->kernel.width = 1; + operation->kernel.stride_y = 1; + operation->kernel.stride_x = 1; + operation->kernel.dilation_y = 1; + operation->kernel.dilation_x = 1; + + allocate_feature_maps(subgraph, operation); + + unsigned augmented_coord[5]; + augmented_coord[0] = 0; + for (int i = 0; i < 4; ++i) { + augmented_coord[i + 1] = poperation->slice.begin[i]; + } + + unsigned augmented_strides[5]; + augmented_strides[0] = operation->ifm.shape.depth * operation->ifm.shape.width * operation->ifm.shape.height; + augmented_strides[1] = 1; + augmented_strides[2] = operation->ifm.shape.depth * operation->ifm.shape.width; + augmented_strides[3] = operation->ifm.shape.depth; + augmented_strides[4] = 1; + + unsigned address_offset = 0; + for (int i = 0; i < 5; ++i) + address_offset += augmented_coord[i] * augmented_strides[i]; + + operation->ifm.tiles.addresses[0] += address_offset; + + ethosu_sched_operation(subgraph, operation); +} + +static void +ethosu_lower_add(struct ethosu_subgraph *subgraph, + const struct pipe_ml_operation *poperation, + struct ethosu_operation *operation) +{ + operation->type = ETHOSU_OPERATION_TYPE_ELTWISE; + + set_feature_maps(poperation->input_tensors[0], poperation->output_tensors[0], operation); + + operation->ifm2.tensor_idx = poperation->input_tensors[1]->index; + operation->ifm2.shape.height = poperation->input_tensors[1]->dims[1]; + operation->ifm2.shape.width = poperation->input_tensors[1]->dims[2]; + operation->ifm2.shape.depth = poperation->input_tensors[1]->dims[3]; + operation->ifm2.zero_point = poperation->input_tensors[1]->zero_point; + operation->ifm2.scale = poperation->input_tensors[1]->scale; + operation->ifm2.is_signed = poperation->input_tensors[1]->is_signed; + + operation->kernel.height = 1; + operation->kernel.width = 1; + operation->kernel.stride_y = 1; + operation->kernel.stride_x = 1; + operation->kernel.dilation_y = 1; + operation->kernel.dilation_x = 1; + + allocate_feature_maps(subgraph, operation); + + ethosu_allocate_feature_map(subgraph, &operation->ifm2); + operation->ifm2.tiles.height_0 = operation->ifm2.shape.height; + operation->ifm2.tiles.height_1 = operation->ifm2.shape.height; + operation->ifm2.tiles.width_0 = operation->ifm2.shape.width; + + ethosu_sched_operation(subgraph, operation); +} + +static void +ethosu_lower_dma(struct ethosu_subgraph *subgraph, + const struct pipe_ml_operation *poperation, + struct ethosu_operation *conv_operation, + struct ethosu_operation *operation) +{ + operation->type = ETHOSU_OPERATION_TYPE_DMA; + + operation->dma.address = conv_operation->conv.scales.address; + operation->dma.size = conv_operation->conv.scales.size + conv_operation->conv.weights.size; + + conv_operation->conv.scales.region = SCRATCH_REGION; + conv_operation->conv.scales.address = 0; + + conv_operation->conv.weights.region = SCRATCH_REGION; + conv_operation->conv.weights.address = conv_operation->conv.scales.size; +} + +static void +register_tensors(struct ethosu_subgraph *subgraph, + const struct pipe_ml_operation *poperations, + unsigned count) +{ + for 
(unsigned i = 0; i < count; i++) { + const struct pipe_ml_operation *poperation = &poperations[i]; + + for (unsigned j = 0; j < poperation->input_count; j++) { + struct pipe_tensor *ptensor = poperation->input_tensors[j]; + ethosu_register_tensor(subgraph, ptensor); + } + + for (unsigned j = 0; j < poperation->output_count; j++) { + struct pipe_tensor *ptensor = poperation->output_tensors[j]; + ethosu_register_tensor(subgraph, ptensor); + + if (!DBG_ENABLED(ETHOSU_DBG_DISABLE_NHCWB16)) { + struct ethosu_tensor *tensor = ethosu_find_tensor(subgraph, ptensor->index); + if (tensor->shape.depth % 16 == 0 && + ethosu_find_first_consumer(poperations, count, ptensor->index)) { + tensor->layout = ETHOSU_LAYOUT_NHCWB16; + } + } + } + } +} + +void +ethosu_lower_graph(struct ethosu_subgraph *subgraph, + const struct pipe_ml_operation *poperations, unsigned count) +{ + register_tensors(subgraph, poperations, count); + + /* Lower */ + for (int i = 0; i < count; i++) { + struct ethosu_operation operation = {0}; + + switch (poperations[i].type) { + + case PIPE_ML_OPERATION_TYPE_CONVOLUTION: { + struct pipe_tensor *input_tensor = poperations[i].input_tensors[0]; + const struct pipe_ml_operation *producer = ethosu_find_first_producer(poperations, count, input_tensor->index); + bool padded_input = producer && producer->type == PIPE_ML_OPERATION_TYPE_PAD; + + if (padded_input) { + input_tensor = producer->input_tensors[0]; + } + + ethosu_lower_convolution(subgraph, &poperations[i], input_tensor, &operation); + + if (padded_input) { + operation.pad.top = 1; + operation.pad.left = 1; + } + + if (operation.conv.scales.size + operation.conv.weights.size <= + ethosu_screen(subgraph->base.context->screen)->info.sram_size) { + struct ethosu_operation dma_operation = {0}; + ethosu_lower_dma(subgraph, &poperations[i], &operation, &dma_operation); + + util_dynarray_append(&subgraph->operations, struct ethosu_operation, + dma_operation); + } + + util_dynarray_append(&subgraph->operations, struct ethosu_operation, + operation); + break; + } + + case PIPE_ML_OPERATION_TYPE_ADD: { + ethosu_lower_add(subgraph, &poperations[i], &operation); + util_dynarray_append(&subgraph->operations, struct ethosu_operation, + operation); + break; + } + + case PIPE_ML_OPERATION_TYPE_POOLING: { + ethosu_lower_pooling(subgraph, &poperations[i], &operation); + util_dynarray_append(&subgraph->operations, struct ethosu_operation, + operation); + break; + } + + case PIPE_ML_OPERATION_TYPE_STRIDED_SLICE: { + ethosu_lower_strided_slice(subgraph, &poperations[i], &operation); + util_dynarray_append(&subgraph->operations, struct ethosu_operation, + operation); + break; + } + + case PIPE_ML_OPERATION_TYPE_CONCATENATION: { + for (int j = 0; j < poperations[i].input_count; j++) { + ethosu_lower_concatenation(subgraph, &poperations[i], j, &operation); + util_dynarray_append(&subgraph->operations, struct ethosu_operation, + operation); + } + break; + } + + case PIPE_ML_OPERATION_TYPE_RESIZE: { + ethosu_lower_resize(subgraph, &poperations[i], &operation); + util_dynarray_append(&subgraph->operations, struct ethosu_operation, + operation); + break; + } + + case PIPE_ML_OPERATION_TYPE_PAD: { + // Just ignore the pad operation for now, as it will be handled by its consumers + break; + } + + default: + DBG("poperation->type %d\n", poperations[i].type); + UNREACHABLE("Unsupported ML operation type"); + } + } +} \ No newline at end of file diff --git a/src/gallium/drivers/ethosu/ethosu_lower.h b/src/gallium/drivers/ethosu/ethosu_lower.h new file mode 100644 
index 00000000000..bcaf57b0cbc --- /dev/null +++ b/src/gallium/drivers/ethosu/ethosu_lower.h @@ -0,0 +1,15 @@ +/* + * Copyright (c) 2024 Tomeu Vizoso + * SPDX-License-Identifier: MIT + */ + +#ifndef ETHOSU_LOWER_H +#define ETHOSU_LOWER_H + +#include "ethosu_ml.h" + +void +ethosu_lower_graph(struct ethosu_subgraph *subgraph, + const struct pipe_ml_operation *poperations, unsigned count); + +#endif /* ETHOSU_LOWER_H */ diff --git a/src/gallium/drivers/ethosu/ethosu_ml.c b/src/gallium/drivers/ethosu/ethosu_ml.c new file mode 100644 index 00000000000..de513f78633 --- /dev/null +++ b/src/gallium/drivers/ethosu/ethosu_ml.c @@ -0,0 +1,363 @@ +/* + * Copyright (c) 2024 Tomeu Vizoso + * SPDX-License-Identifier: MIT + */ + +#include "pipe/p_defines.h" +#include "pipe/p_screen.h" +#include "pipe/p_state.h" +#include "util/macros.h" +#include "util/u_dynarray.h" +#include "util/u_inlines.h" + +#include +#include +#include +#include +#include +#include + +#include "drm-uapi/ethosu_accel.h" + +#include "ethosu_cmd.h" +#include "ethosu_lower.h" +#include "ethosu_ml.h" + +struct ethosu_block IFM_UBLOCK = {2, 2, 8}; +struct ethosu_block OFM_UBLOCK = {2, 2, 8}; +struct ethosu_block ARCH_OFM_BLOCK_MAX = {64, 32, 128}; +struct ethosu_block SUB_KERNEL_MAX = {8, 8, 65536}; + +void +ethosu_dump_buffer(const uint8_t *ptr, char *name, int operation_nr, + int suboperation_nr, int offset, unsigned size) +{ + char buffer[255]; + + snprintf(buffer, sizeof(buffer), "mesa-%s-%03u-%03u.bin", name, operation_nr, + suboperation_nr); + + FILE *f = fopen(buffer, "wb"); + assert(f); + fwrite(ptr + offset, 1, size, f); + if (ferror(f)) { + DBG("Error in writing to file: %s\n", strerror(errno)); + } + fflush(f); + fclose(f); +} + +void +ethosu_register_tensor(struct ethosu_subgraph *subgraph, + const struct pipe_tensor *ptensor) +{ + struct ethosu_tensor new_tensor = {0}; + new_tensor.index = ptensor->index; + new_tensor.shape.height = ptensor->dims[1]; + new_tensor.shape.width = ptensor->dims[2]; + new_tensor.shape.depth = ptensor->dims[3]; + new_tensor.layout = ETHOSU_LAYOUT_NHWC; + util_dynarray_append(&subgraph->tensors, struct ethosu_tensor, new_tensor); +} + +void +ethosu_allocate_feature_map(struct ethosu_subgraph *subgraph, struct ethosu_feature_map *feature_map) +{ + struct ethosu_tensor *tensor = ethosu_find_tensor(subgraph, feature_map->tensor_idx); + unsigned size; + + if (tensor->layout == ETHOSU_LAYOUT_NHWC) { + size = tensor->shape.width * tensor->shape.height * tensor->shape.depth; + } else if (tensor->layout == ETHOSU_LAYOUT_NHCWB16) { + size = tensor->shape.width * tensor->shape.height * ALIGN(tensor->shape.depth, 16); + } else { + assert(0 && "Unsupported layout"); + size = 0; // This should never happen + } + + assert(tensor); + + if (tensor->size > 0) { + feature_map->tiles.addresses[0] = tensor->offset; + return; + } + + tensor->offset = subgraph->io_used; + tensor->size = size; + subgraph->io_used += ALIGN_POT(size, 16); + + feature_map->tiles.addresses[0] = tensor->offset; +} + +struct ethosu_tensor * +ethosu_find_tensor(struct ethosu_subgraph *subgraph, unsigned tensor_idx) +{ + util_dynarray_foreach (&subgraph->tensors, struct ethosu_tensor, tensor) { + if (tensor->index == tensor_idx) { + return tensor; + } + } + return NULL; +} + +int +ethosu_round_up_to_multiple(int a, int b) +{ + return ((a + b - 1) / b) * b; +} + +int +ethosu_round_up_divide(int a, int b) +{ + return (a + b - 1) / b; +} + +int +ethosu_quantize_scale(double scale, uint32_t *shift) +{ + int exponent = 0; + double significand = 
frexp(scale, &exponent); + uint32_t quantized_scale = round(significand * (double)(1LL << 31)); + *shift = 31 - exponent; + if (*shift > 63) { + if (quantized_scale > exp2(*shift - 63)) { + quantized_scale = quantized_scale >> (*shift - 63); + *shift = 63; + } else { + // Not possible to get back within bounds, set scale and shift to 0 + // as the shift would shift away all relevant bits anyway. + quantized_scale = 0; + *shift = 0; + } + } else if (*shift < 0 && quantized_scale < exp2(*shift + 32)) { + quantized_scale = quantized_scale << (0 - *shift); + *shift = 0; + } + + return quantized_scale; +} + +static bool +tensor_quantization_supported(struct pipe_tensor *tensor) +{ + /* + * Per-axis quantization not supported, for details see: + * https://ai.google.dev/edge/litert/models/quantization_spec#per-axis_vs_per-tensor + */ + return tensor->scales == NULL && tensor->zero_points == NULL; +} + +bool +ethosu_ml_operation_supported(struct pipe_context *pcontext, + const struct pipe_ml_operation *operation) +{ + bool supported = false; + + switch (operation->type) { + case PIPE_ML_OPERATION_TYPE_CONVOLUTION: { + struct pipe_tensor *input_tensor = operation->input_tensors[0]; + struct pipe_tensor *weight_tensor = operation->conv.weight_tensor; + struct pipe_tensor *bias_tensor = operation->conv.bias_tensor; + struct pipe_tensor *output_tensor = operation->output_tensors[0]; + + // Dilation and per-axis quantization not yet implemented + if (tensor_quantization_supported(input_tensor) && + tensor_quantization_supported(weight_tensor) && + tensor_quantization_supported(bias_tensor) && + tensor_quantization_supported(output_tensor) && + operation->conv.dilation_width_factor == 1 && + operation->conv.dilation_height_factor == 1) + supported = true; + + break; + } + case PIPE_ML_OPERATION_TYPE_ADD: + supported = operation->input_tensors[0]->resource == NULL && + operation->input_tensors[1]->resource == NULL; + break; + case PIPE_ML_OPERATION_TYPE_POOLING: + case PIPE_ML_OPERATION_TYPE_STRIDED_SLICE: + case PIPE_ML_OPERATION_TYPE_PAD: + case PIPE_ML_OPERATION_TYPE_RESIZE: + supported = true; + break; + case PIPE_ML_OPERATION_TYPE_CONCATENATION: + supported = operation->conc.axis == 3 || + operation->conc.axis == -1; + break; + default: + supported = false; + } + + return supported; +} + +struct pipe_ml_subgraph * +ethosu_ml_subgraph_create(struct pipe_context *pcontext, + const struct pipe_ml_operation *poperations, + unsigned count) +{ + struct pipe_screen *pscreen = pcontext->screen; + struct ethosu_screen *screen = ethosu_screen(pscreen); + struct ethosu_subgraph *subgraph; + + subgraph = calloc(1, sizeof(*subgraph)); + subgraph->base.context = pcontext; + + util_dynarray_init(&subgraph->tensors, NULL); + util_dynarray_init(&subgraph->operations, NULL); + + ethosu_lower_graph(subgraph, poperations, count); + + ethosu_emit_cmdstream(subgraph); + + struct drm_ethosu_cmdstream_bo_create cmd_bo_create = { + .size = (subgraph->cursor - subgraph->cmdstream) * sizeof(*subgraph->cursor), + .data = (uintptr_t)subgraph->cmdstream, + }; + + if (DBG_ENABLED(ETHOSU_DBG_DUMP_BOS)) + ethosu_dump_buffer((uint8_t *)subgraph->cmdstream, "cmdstream", 0, 0, 0, (subgraph->cursor - subgraph->cmdstream) * sizeof(*subgraph->cursor)); + + int ret = drmIoctl(screen->fd, DRM_IOCTL_ETHOSU_CMDSTREAM_BO_CREATE, &cmd_bo_create); + assert(ret == 0); + + free(subgraph->cmdstream); + + subgraph->cmdstream_bo = cmd_bo_create.handle; + + if (subgraph->coefs_used > 0) { + subgraph->coefs_rsrc = pipe_buffer_create(pscreen, 0, 
PIPE_USAGE_DEFAULT, subgraph->coefs_used); + pipe_buffer_write(subgraph->base.context, subgraph->coefs_rsrc, 0, subgraph->coefs_used, subgraph->coefs); + + free(subgraph->coefs); + subgraph->coefs = NULL; + + if (DBG_ENABLED(ETHOSU_DBG_DUMP_BOS)) { + struct pipe_transfer *transfer_in; + uint8_t *buf = pipe_buffer_map(subgraph->base.context, subgraph->coefs_rsrc, + PIPE_MAP_READ, &transfer_in); + ethosu_dump_buffer(buf, "coefs", 0, 0, 0, pipe_buffer_size(subgraph->coefs_rsrc)); + pipe_buffer_unmap(subgraph->base.context, transfer_in); + } + } + + subgraph->io_rsrc = pipe_buffer_create(pscreen, 0, PIPE_USAGE_DEFAULT, subgraph->io_used); + + return &subgraph->base; +} + +void +ethosu_ml_subgraph_invoke(struct pipe_context *pcontext, + struct pipe_ml_subgraph *psubgraph, + unsigned inputs_count, unsigned input_idxs[], + void *inputs[], bool is_signed[]) +{ + struct ethosu_screen *screen = ethosu_screen(pcontext->screen); + struct ethosu_subgraph *subgraph = (struct ethosu_subgraph *)(psubgraph); + struct drm_ethosu_submit submit = {0}; + struct drm_ethosu_job job = {0}; + struct timespec start, end; + int ret; + + for (unsigned i = 0; i < inputs_count; i++) { + struct ethosu_tensor *input = ethosu_find_tensor(subgraph, input_idxs[i]); + assert(input); + + if (DBG_ENABLED(ETHOSU_DBG_DUMP_BOS)) + ethosu_dump_buffer(inputs[i], "input", 0, 0, 0, input->size); + + pipe_buffer_write(pcontext, subgraph->io_rsrc, input->offset, input->size, inputs[i]); + } + + if (DBG_ENABLED(ETHOSU_DBG_DUMP_BOS)) { + struct pipe_transfer *transfer_in; + uint8_t *buf = pipe_buffer_map(subgraph->base.context, subgraph->io_rsrc, + PIPE_MAP_READ, &transfer_in); + ethosu_dump_buffer(buf, "io-before", 0, 0, 0, pipe_buffer_size(subgraph->io_rsrc)); + pipe_buffer_unmap(subgraph->base.context, transfer_in); + } + + job.cmd_bo = subgraph->cmdstream_bo; + + if (subgraph->coefs_rsrc) { + job.region_bo_handles[COEFS_REGION] = ethosu_resource(subgraph->coefs_rsrc)->handle; + if (!DBG_ENABLED(ETHOSU_DBG_DISABLE_SRAM)) { + job.region_bo_handles[SCRATCH_REGION] = 0; + job.sram_size = screen->info.sram_size; + } + } + + job.region_bo_handles[IO_REGION] = ethosu_resource(subgraph->io_rsrc)->handle; + + submit.jobs = (uintptr_t)&job; + submit.job_count = 1; + + if (DBG_ENABLED(ETHOSU_DBG_MSGS)) + clock_gettime(CLOCK_MONOTONIC_RAW, &start); + + ret = drmIoctl(screen->fd, DRM_IOCTL_ETHOSU_SUBMIT, &submit); + assert(ret == 0); + + if (DBG_ENABLED(ETHOSU_DBG_MSGS)) { + clock_gettime(CLOCK_MONOTONIC_RAW, &end); + long long duration_ns = (long long)(end.tv_sec - start.tv_sec) * 1000000000LL + (end.tv_nsec - start.tv_nsec); + DBG("Submission took %lld ms\n", duration_ns / 1000000); + + /* Force a sync */ + struct pipe_transfer *transfer_in; + pipe_buffer_map(subgraph->base.context, subgraph->io_rsrc, PIPE_MAP_READ, &transfer_in); + pipe_buffer_unmap(subgraph->base.context, transfer_in); + + clock_gettime(CLOCK_MONOTONIC_RAW, &end); + duration_ns = (long long)(end.tv_sec - start.tv_sec) * 1000000000LL + (end.tv_nsec - start.tv_nsec); + DBG("Execution took %lld ms\n", duration_ns / 1000000); + } +} + +void +ethosu_ml_subgraph_read_outputs(struct pipe_context *pcontext, + struct pipe_ml_subgraph *psubgraph, + unsigned outputs_count, + unsigned output_idxs[], void *outputsv[], + bool is_signed[]) +{ + struct ethosu_subgraph *subgraph = (struct ethosu_subgraph *)(psubgraph); + uint8_t **outputs = (uint8_t **)outputsv; + + for (int i = 0; i < outputs_count; i++) { + struct ethosu_tensor *output = ethosu_find_tensor(subgraph, output_idxs[i]); + + if 
(DBG_ENABLED(ETHOSU_DBG_DUMP_BOS)) { + struct pipe_transfer *transfer_in; + uint8_t *buf = pipe_buffer_map(subgraph->base.context, subgraph->io_rsrc, + PIPE_MAP_READ, &transfer_in); + ethosu_dump_buffer(buf, "io-after", 0, 0, 0, pipe_buffer_size(subgraph->io_rsrc)); + pipe_buffer_unmap(subgraph->base.context, transfer_in); + } + + pipe_buffer_read(pcontext, subgraph->io_rsrc, output->offset, output->size, outputs[i]); + } +} + +void +ethosu_ml_subgraph_destroy(struct pipe_context *pcontext, + struct pipe_ml_subgraph *psubgraph) +{ + int ret; + struct drm_gem_close arg = {0}; + struct ethosu_screen *screen = ethosu_screen(pcontext->screen); + struct ethosu_subgraph *subgraph = (struct ethosu_subgraph *)(psubgraph); + + pipe_resource_reference(&subgraph->io_rsrc, NULL); + pipe_resource_reference(&subgraph->coefs_rsrc, NULL); + + arg.handle = subgraph->cmdstream_bo; + ret = drmIoctl(screen->fd, DRM_IOCTL_GEM_CLOSE, &arg); + assert(ret >= 0); + + util_dynarray_fini(&subgraph->operations); + util_dynarray_fini(&subgraph->tensors); + + free(subgraph); +} diff --git a/src/gallium/drivers/ethosu/ethosu_ml.h b/src/gallium/drivers/ethosu/ethosu_ml.h new file mode 100644 index 00000000000..9dc9bbe9869 --- /dev/null +++ b/src/gallium/drivers/ethosu/ethosu_ml.h @@ -0,0 +1,229 @@ +/* + * Copyright (c) 2024 Tomeu Vizoso + * SPDX-License-Identifier: MIT + */ + +#ifndef ETHOSU_ML_H +#define ETHOSU_ML_H + +#include + +#include "ethosu_device.h" + +#define SHRAM_BANKS 48 +#define SHRAM_RESERVED_OUTPUT_BANKS 2 +#define SHRAM_RESERVED_UNUSED_BANKS 2 +#define SHRAM_RESERVED_END_BANKS 2 +#define SHRAM_TOTAL_BANKS SHRAM_BANKS +#define SHRAM_BANK_SIZE_BYTES 1024 +#define ACC_BITS 32 /* Use for now always 32-bit accumulators */ +#define IFM_GRANULE 8 +#define ACC_GRANULE 16 +#define ARCH_SPLIT_DEPTH 16 +#define BANK_SIZE_BYTES 1024 +#define IFM_GRANULE 8 + +extern struct ethosu_block ARCH_OFM_BLOCK_MAX; +extern struct ethosu_block SUB_KERNEL_MAX; +extern struct ethosu_block IFM_UBLOCK; +extern struct ethosu_block OFM_UBLOCK; + +#define COEFS_REGION 0 +#define IO_REGION 1 +#define SCRATCH_REGION 2 + +struct ethosu_block { + unsigned width; + unsigned height; + unsigned depth; +}; + +enum ethosu_operation_type { + ETHOSU_OPERATION_TYPE_CONVOLUTION, + ETHOSU_OPERATION_TYPE_POOLING, + ETHOSU_OPERATION_TYPE_ELTWISE, + ETHOSU_OPERATION_TYPE_DMA, +}; + +struct ethosu_tile_box { + unsigned height_0; /* The height of tile 0 */ + unsigned height_1; /* The height of tile 1, 0 if unused */ + unsigned width_0; /* The width of tile 0, and tile 2 (if used) */ + unsigned addresses[4]; /* A list of 4 addresses, set unused addresses to 0 */ +}; + +enum ethosu_layout { + ETHOSU_LAYOUT_NHWC, + ETHOSU_LAYOUT_NHCWB16, +}; + +enum ethosu_rounding_mode { + ETHOSU_ROUNDING_DOUBLE = 0, + ETHOSU_ROUNDING_TRUNCATE, + ETHOSU_ROUNDING_NATURAL, +}; +struct ethosu_feature_map { + unsigned tensor_idx; + struct ethosu_block shape; + bool is_signed; + struct ethosu_tile_box tiles; + unsigned zero_point; + float scale; +}; + +struct ethosu_kernel { + unsigned height; + unsigned width; + unsigned stride_y; + unsigned stride_x; + unsigned dilation_y; + unsigned dilation_x; + bool depthwise; + bool is_signed; + unsigned zero_point; + float scale; +}; + +struct ethosu_padding { + unsigned top; + unsigned left; + unsigned bottom; + unsigned right; +}; + +struct ethosu_address_range { + unsigned region; + unsigned address; + long size; +}; + +struct ethosu_shram_layout { + unsigned ib_start; + unsigned ib_end; + unsigned ib_start2; + unsigned ab_start; + 
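+   /* These fields (and lut_start below) are bank indices into the
+    * 48-bank, 1 KB-per-bank SHRAM.  A sketch of the layout that
+    * try_block_config() in ethosu_sched.c computes:
+    *
+    *   [0, ib_start)          OFM output banks
+    *   [ib_start, ib_end)     IFM slots (sized for double-buffering)
+    *   ib_start2              second IFM for elementwise operations
+    *   [ab_start, lut_start)  accumulators
+    *   [lut_start, 48)        LUT, at least the 2 reserved end banks
+    */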
unsigned lut_start; +}; + +enum ethosu_acc_type { + ETHOSU_ACC_TYPE_INT_32BIT = 0, + ETHOSU_ACC_TYPE_INT_40BIT, + ETHOSU_ACC_TYPE_FP_S5_10, +}; + +struct ethosu_block_config { + struct ethosu_block ifm_block; + struct ethosu_block ofm_block; + struct ethosu_shram_layout shram_layout; + unsigned bank_size; + enum ethosu_acc_type acc_type; + bool is_partkernel; +}; + +#define MAX_MEMORY_ACCESSES 5 /* IFM, IFM2, Scales, Weights, LUT*/ + +struct ethosu_operation { + enum ethosu_operation_type type; + + struct ethosu_block_config block_config; + + union { + struct { + struct ethosu_address_range weights; + struct ethosu_address_range scales; + bool part_kernel_first; + bool depthwise; + } conv; + + struct { + bool avg; /* true for avg, false for max */ + } pooling; + + struct { + unsigned lut_bytes; + } eltwise; + + struct { + unsigned address; + long size; + } dma; + }; + + struct ethosu_feature_map ifm; + struct ethosu_feature_map ifm2; + struct ethosu_feature_map ofm; + + struct ethosu_kernel kernel; + struct ethosu_padding pad; + bool upscale; + enum ethosu_rounding_mode round_mode; + + struct ethosu_address_range read_accesses[MAX_MEMORY_ACCESSES]; + struct ethosu_address_range write_accesses[MAX_MEMORY_ACCESSES]; +}; + +struct ethosu_tensor { + unsigned index; + unsigned offset; + unsigned size; + struct ethosu_block shape; + enum ethosu_layout layout; +}; + +struct ethosu_subgraph { + struct pipe_ml_subgraph base; + + struct util_dynarray operations; /* ethosu_operation */ + struct util_dynarray tensors; /* ethosu_tensor* */ + + unsigned cmdstream_used; + uint32_t *cmdstream; + uint32_t *cursor; + uint32_t cmdstream_bo; + + struct pipe_resource *io_rsrc; + unsigned io_used; + + uint8_t *coefs; + struct pipe_resource *coefs_rsrc; + unsigned coefs_used; +}; + +bool +ethosu_ml_operation_supported(struct pipe_context *pcontext, const struct pipe_ml_operation *operation); + +struct pipe_ml_subgraph * +ethosu_ml_subgraph_create(struct pipe_context *pcontext, + const struct pipe_ml_operation *poperations, + unsigned count); + +void ethosu_ml_subgraph_invoke(struct pipe_context *pcontext, + struct pipe_ml_subgraph *psubgraph, + unsigned inputs_count, unsigned input_idxs[], + void *inputs[], bool is_signed[]); + +void ethosu_ml_subgraph_read_outputs(struct pipe_context *pcontext, + struct pipe_ml_subgraph *psubgraph, + unsigned outputs_count, + unsigned output_idxs[], void *outputs[], + bool is_signed[]); + +void ethosu_ml_subgraph_destroy(struct pipe_context *context, + struct pipe_ml_subgraph *psubgraph); + +void ethosu_allocate_feature_map(struct ethosu_subgraph *subgraph, struct ethosu_feature_map *feature_map); + +void ethosu_register_tensor(struct ethosu_subgraph *subgraph, const struct pipe_tensor *ptensor); + +struct ethosu_tensor *ethosu_find_tensor(struct ethosu_subgraph *subgraph, unsigned tensor_idx); + +void ethosu_dump_buffer(const uint8_t *ptr, char *name, int operation_nr, + int suboperation_nr, int offset, unsigned size); + +int ethosu_round_up_to_multiple(int a, int b); + +int ethosu_round_up_divide(int a, int b); + +int ethosu_quantize_scale(double scale, uint32_t *shift); + +#endif /* ETHOSU_ML_H */ diff --git a/src/gallium/drivers/ethosu/ethosu_sched.c b/src/gallium/drivers/ethosu/ethosu_sched.c new file mode 100644 index 00000000000..45021362402 --- /dev/null +++ b/src/gallium/drivers/ethosu/ethosu_sched.c @@ -0,0 +1,193 @@ +/* + * Copyright (c) 2025 Tomeu Vizoso + * SPDX-License-Identifier: MIT + */ + +#include "ethosu_sched.h" + +static int +required_input_size(int value, 
int stride, int border)
+{
+   return (value - 1) * stride + border;
+}
+
+static struct ethosu_block
+_get_ifm_blocksize(struct ethosu_operation *operation, struct ethosu_block ofm_block)
+{
+   struct ethosu_block ifm_block = {0};
+
+   // IFM block height
+   int h = required_input_size(ofm_block.height, operation->kernel.stride_y, MIN2(operation->kernel.height, SUB_KERNEL_MAX.height));
+   h = ALIGN(h, OFM_UBLOCK.height);
+
+   // IFM block width
+   int w = required_input_size(ofm_block.width, operation->kernel.stride_x, MIN2(operation->kernel.width, SUB_KERNEL_MAX.width));
+   w = ALIGN(w, OFM_UBLOCK.width);
+
+   ifm_block.height = h;
+   ifm_block.width = w;
+   ifm_block.depth = ofm_block.depth;
+
+   return ifm_block;
+}
+
+static bool
+try_block_config(struct ethosu_operation *operation, struct ethosu_block ofm_block, struct ethosu_block ifm_block, struct ethosu_shram_layout *layout)
+{
+   int ifm_bytes = ifm_block.width * ifm_block.height * ALIGN(ifm_block.depth, 8);
+   int ifm_banks = ALIGN(DIV_ROUND_UP(ifm_bytes, BANK_SIZE_BYTES) * 2, IFM_GRANULE);
+   int lut_bytes = operation->type == ETHOSU_OPERATION_TYPE_ELTWISE ? operation->eltwise.lut_bytes : 0;
+   int lut_banks = MAX2(DIV_ROUND_UP(lut_bytes, 1024), SHRAM_RESERVED_END_BANKS);
+   int lut_start = SHRAM_TOTAL_BANKS - lut_banks;
+   int ifm_end = SHRAM_RESERVED_OUTPUT_BANKS + ifm_banks;
+   int ifm2_start = ifm_end;
+   int acc_start = lut_start;
+
+   if (operation->type != ETHOSU_OPERATION_TYPE_ELTWISE) {
+      int acc_bytes = (ofm_block.width * ofm_block.height * ALIGN(ofm_block.depth, 8) * 32) / 8;
+      int acc_banks = ALIGN(DIV_ROUND_UP(acc_bytes, BANK_SIZE_BYTES) * 2, ACC_GRANULE);
+      acc_start -= acc_banks;
+   } else {
+      int ifm2_banks = ifm_banks; /* TODO: Fix for scalar eltwise */
+
+      if (ifm2_start + ifm2_banks > acc_start)
+         return false;
+
+      ifm_end = acc_start;
+   }
+
+   if (ifm_end > acc_start)
+      return false;
+
+   layout->ib_start = SHRAM_RESERVED_OUTPUT_BANKS;
+   layout->ib_start2 = ifm2_start;
+   layout->ib_end = ifm_end;
+   layout->ab_start = acc_start;
+   layout->lut_start = lut_start;
+
+   return true;
+}
+
+static struct ethosu_block_config
+find_block_config(struct ethosu_operation *operation)
+{
+   struct ethosu_block_config config = {};
+   struct ethosu_block search_space = ARCH_OFM_BLOCK_MAX;
+   float ofm_elements = operation->ofm.shape.width * operation->ofm.shape.height * operation->ofm.shape.depth;
+   float ifm_elements = operation->ifm.shape.width * operation->ifm.shape.height * operation->ifm.shape.depth;
+   bool is_pooling = operation->type == ETHOSU_OPERATION_TYPE_POOLING;
+   bool is_depthwise = operation->conv.depthwise;
+   bool is_equal_depth = is_pooling || is_depthwise || operation->type == ETHOSU_OPERATION_TYPE_ELTWISE;
+   bool is_convolution = operation->type == ETHOSU_OPERATION_TYPE_CONVOLUTION;
+   float best_cost = FLT_MAX;
+   float best_coverage = FLT_MAX;
+
+   search_space.width = MIN2(search_space.width, operation->ofm.shape.width);
+   search_space.height = MIN2(search_space.height, operation->ofm.shape.height);
+   search_space.depth = MIN2(search_space.depth, operation->ofm.shape.depth);
+
+   unsigned depth = MAX2(OFM_UBLOCK.depth, MIN2(search_space.depth, ARCH_SPLIT_DEPTH));
+
+   if (depth < operation->ofm.shape.depth) {
+      depth = ALIGN(depth, ARCH_SPLIT_DEPTH);
+   }
+
+   search_space.width = ALIGN(search_space.width, OFM_UBLOCK.width);
+   search_space.height = ALIGN(search_space.height, OFM_UBLOCK.height);
+   search_space.depth = ALIGN(search_space.depth, OFM_UBLOCK.depth);
+
+   while (depth <= search_space.depth) {
+      bool wont_fit[search_space.height + 1][search_space.width + 1];
+      memset(wont_fit, 0, sizeof(wont_fit));
+
+      for (unsigned height = OFM_UBLOCK.height; height <= search_space.height; height += OFM_UBLOCK.height) {
+         for (unsigned width = OFM_UBLOCK.width; width <= search_space.width; width += OFM_UBLOCK.width) {
+
+            if (wont_fit[height][width])
+               continue;
+
+            struct ethosu_block ofm_block = {.width = width, .height = height, .depth = depth};
+            struct ethosu_block ifm_block = _get_ifm_blocksize(operation, ofm_block);
+
+            if (!is_equal_depth)
+               ifm_block.depth = ALIGN(MIN2(operation->ifm.shape.depth, operation->conv.part_kernel_first ? 16 : 32), IFM_UBLOCK.depth);
+
+            // Try to fit the blocks in SHRAM
+            struct ethosu_shram_layout layout = {0};
+            if (try_block_config(operation, ofm_block, ifm_block, &layout)) {
+
+               struct ethosu_block full_blocks = {.width = DIV_ROUND_UP(operation->ofm.shape.width, ofm_block.width),
+                                                  .height = DIV_ROUND_UP(operation->ofm.shape.height, ofm_block.height),
+                                                  .depth = DIV_ROUND_UP(operation->ofm.shape.depth, ofm_block.depth)};
+               float blocks[3] = {operation->ofm.shape.width / (float)ofm_block.width,
+                                  operation->ofm.shape.height / (float)ofm_block.height,
+                                  operation->ofm.shape.depth / (float)ofm_block.depth};
+
+               float weight_area = is_convolution ? operation->kernel.width * operation->kernel.height : 0;
+               float weight_fetch = weight_area * operation->ifm.shape.depth * full_blocks.width * full_blocks.height;
+               if (!is_depthwise)
+                  weight_fetch *= blocks[2] * ofm_block.depth;
+
+               float ifm_fetch = ifm_block.width * ifm_block.height * operation->ifm.shape.depth * blocks[0] * blocks[1];
+               if (!is_equal_depth)
+                  ifm_fetch *= full_blocks.depth;
+
+               float relative_cost = 0;
+               if (operation->type != ETHOSU_OPERATION_TYPE_ELTWISE)
+                  relative_cost = (ifm_fetch + weight_fetch) / ofm_elements;
+               else
+                  relative_cost = ofm_elements / (height * width * depth);
+
+               if (ifm_elements < ifm_block.width * ifm_block.height * ifm_block.depth * 2)
+                  relative_cost /= 2.0f;
+
+               if (relative_cost <= best_cost) {
+                  bool choose_this = false;
+
+                  if (relative_cost == best_cost) {
+                     struct ethosu_block coverage_shape = {
+                        .height = MIN2(ifm_block.height, operation->ifm.shape.height),
+                        .width = MIN2(ifm_block.width, operation->ifm.shape.width),
+                        .depth = MIN2(ifm_block.depth, operation->ifm.shape.depth)};
+                     float coverage = (float)(operation->ifm.shape.width * operation->ifm.shape.height) /
+                                      (float)MAX2(1, coverage_shape.width * coverage_shape.height);
+
+                     if (coverage <= best_coverage && (height <= 4 && width <= 4)) {
+                        best_coverage = coverage;
+                        choose_this = true;
+                     }
+                  } else {
+                     best_coverage = FLT_MAX;
+                     choose_this = true;
+                  }
+
+                  if (choose_this) {
+                     config.shram_layout = layout;
+                     config.ifm_block = ifm_block;
+                     config.ofm_block.height = height;
+                     config.ofm_block.width = width;
+                     config.ofm_block.depth = depth;
+
+                     best_cost = relative_cost;
+                  }
+               }
+            } else {
+               wont_fit[height][width] = true;
+            }
+         }
+      }
+
+      depth += OFM_UBLOCK.depth;
+      if (depth < operation->ofm.shape.depth) {
+         depth = ALIGN(depth, ARCH_SPLIT_DEPTH);
+      }
+   }
+
+   return config;
+}
+
+void
+ethosu_sched_operation(struct ethosu_subgraph *subgraph, struct ethosu_operation *operation)
+{
+   operation->block_config = find_block_config(operation);
+}
diff --git a/src/gallium/drivers/ethosu/ethosu_sched.h b/src/gallium/drivers/ethosu/ethosu_sched.h
new file mode 100644
index 00000000000..eb5876fd907
--- /dev/null
+++ b/src/gallium/drivers/ethosu/ethosu_sched.h
@@ -0,0 +1,13 @@
+/*
+ * Copyright (c) 2025 Tomeu Vizoso
+ * SPDX-License-Identifier: MIT
+ */
+
+#ifndef ETHOSU_SCHED_H
+#define ETHOSU_SCHED_H
+
+#include "ethosu_ml.h"
+
+void ethosu_sched_operation(struct ethosu_subgraph *subgraph, struct ethosu_operation *operation);
+
+#endif /* ETHOSU_SCHED_H */
diff --git a/src/gallium/drivers/ethosu/gen_header.py b/src/gallium/drivers/ethosu/gen_header.py
new file mode 100644
index 00000000000..b54516a812c
--- /dev/null
+++ b/src/gallium/drivers/ethosu/gen_header.py
@@ -0,0 +1,125 @@
+#!/usr/bin/python3
+#
+# Copyright © 2019-2024 Google, Inc.
+# Copyright © 2024-2025 Tomeu Vizoso
+#
+# SPDX-License-Identifier: MIT
+
+import sys
+import os
+import argparse
+import time
+import datetime
+from gen_parser import Parser, Reg, Enum, mask, Error
+
+
+def dump_c(args, guard, func):
+    p = Parser()
+
+    try:
+        p.parse(args.rnn, args.xml)
+    except Error as e:
+        print(e, file=sys.stderr)
+        exit(1)
+
+    print("#ifndef %s\n#define %s\n" % (guard, guard))
+
+    print("""/* Autogenerated file, DO NOT EDIT manually!
+
+This file was generated by the rules-ng-ng gen_header.py tool in this git repository:
+http://gitlab.freedesktop.org/mesa/mesa/
+git clone https://gitlab.freedesktop.org/mesa/mesa.git
+
+The rules-ng-ng source files this header was generated from are:
+""")
+    maxlen = 0
+    for filepath in p.xml_files:
+        maxlen = max(maxlen, len(filepath))
+    for filepath in p.xml_files:
+        pad = " " * (maxlen - len(filepath))
+        filesize = str(os.path.getsize(filepath))
+        filesize = " " * (7 - len(filesize)) + filesize
+        filetime = time.ctime(os.path.getmtime(filepath))
+        print("- " + filepath + pad + " (" + filesize + " bytes, from " + filetime + ")")
+    if p.copyright_year:
+        current_year = str(datetime.date.today().year)
+        print()
+        print("Copyright (C) %s-%s by the following authors:" % (p.copyright_year, current_year))
+        for author in p.authors:
+            print("- " + author)
+    if p.license:
+        print(p.license)
+    print("*/")
+
+    print()
+    print("#ifdef __KERNEL__")
+    print("#include <linux/bug.h>")
+    print("#define assert(x) BUG_ON(!(x))")
+    print("#else")
+    print("#include <assert.h>")
+    print("#endif")
+    print()
+
+    print("#ifdef __cplusplus")
+    print("#define __struct_cast(X)")
+    print("#else")
+    print("#define __struct_cast(X) (struct X)")
+    print("#endif")
+    print()
+
+    func(p)
+
+    print("\n#endif /* %s */" % guard)
+
+
+def dump_c_defines(args):
+    guard = str.replace(os.path.basename(args.xml), '.', '_').upper()
+    dump_c(args, guard, lambda p: p.dump())
+
+
+def dump_c_pack_structs(args):
+    guard = str.replace(os.path.basename(args.xml), '.', '_').upper() + '_STRUCTS'
+    dump_c(args, guard, lambda p: p.dump_structs())
+
+
+def dump_py_defines(args):
+    p = Parser()
+
+    try:
+        p.parse(args.rnn, args.xml)
+    except Error as e:
+        print(e, file=sys.stderr)
+        exit(1)
+
+    file_name = os.path.splitext(os.path.basename(args.xml))[0]
+
+    print("from enum import IntEnum")
+    print("class %sRegs(IntEnum):" % file_name.upper())
+
+    os.path.basename(args.xml)
+
+    p.dump_regs_py()
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--rnn', type=str, required=True)
+    parser.add_argument('--xml', type=str, required=True)
+
+    subparsers = parser.add_subparsers(required=True)
+
+    parser_c_defines = subparsers.add_parser('c-defines')
+    parser_c_defines.set_defaults(func=dump_c_defines)
+
+    parser_c_pack_structs = subparsers.add_parser('c-pack-structs')
+    parser_c_pack_structs.set_defaults(func=dump_c_pack_structs)
+
+    parser_py_defines = subparsers.add_parser('py-defines')
+    parser_py_defines.set_defaults(func=dump_py_defines)
+
+    args = parser.parse_args()
+    args.func(args)
+
+
+if __name__ == '__main__':
+    main()
diff 
--git a/src/gallium/drivers/ethosu/gen_parser.py b/src/gallium/drivers/ethosu/gen_parser.py new file mode 100644 index 00000000000..8cf8676c4bf --- /dev/null +++ b/src/gallium/drivers/ethosu/gen_parser.py @@ -0,0 +1,745 @@ +import xml.parsers.expat +import sys +import os +import collections + +class Error(Exception): + def __init__(self, message): + self.message = message + +class Enum(object): + def __init__(self, name): + self.name = name + self.values = [] + + def has_name(self, name): + for (n, value) in self.values: + if n == name: + return True + return False + + def dump(self): + use_hex = False + for (name, value) in self.values: + if value > 0x1000: + use_hex = True + + print("enum %s {" % self.name) + for (name, value) in self.values: + if use_hex: + print("\t%s = 0x%08x," % (name, value)) + else: + print("\t%s = %d," % (name, value)) + print("};\n") + + def dump_pack_struct(self): + pass + +class Field(object): + def __init__(self, name, low, high, shr, type, parser): + self.name = name + self.low = low + self.high = high + self.shr = shr + self.type = type + + builtin_types = [ None, "a3xx_regid", "boolean", "uint", "hex", "int", "fixed", "ufixed", "float", "address", "waddress" ] + + maxpos = parser.current_bitsize - 1 + + if low < 0 or low > maxpos: + raise parser.error("low attribute out of range: %d" % low) + if high < 0 or high > maxpos: + raise parser.error("high attribute out of range: %d" % high) + if high < low: + raise parser.error("low is greater than high: low=%d, high=%d" % (low, high)) + if self.type == "boolean" and not low == high: + raise parser.error("booleans should be 1 bit fields") + elif self.type == "float" and not (high - low == 31 or high - low == 15): + raise parser.error("floats should be 16 or 32 bit fields") + elif not self.type in builtin_types and not self.type in parser.enums: + raise parser.error("unknown type '%s'" % self.type) + + def ctype(self, var_name): + if self.type == None: + type = "uint32_t" + val = var_name + elif self.type == "boolean": + type = "bool" + val = var_name + elif self.type == "uint" or self.type == "hex" or self.type == "a3xx_regid": + type = "uint32_t" + val = var_name + elif self.type == "int": + type = "int32_t" + val = var_name + elif self.type == "fixed": + type = "float" + val = "((int32_t)(%s * %d.0))" % (var_name, 1 << self.radix) + elif self.type == "ufixed": + type = "float" + val = "((uint32_t)(%s * %d.0))" % (var_name, 1 << self.radix) + elif self.type == "float" and self.high - self.low == 31: + type = "float" + val = "fui(%s)" % var_name + elif self.type == "float" and self.high - self.low == 15: + type = "float" + val = "_mesa_float_to_half(%s)" % var_name + elif self.type in [ "address", "waddress" ]: + type = "uint64_t" + val = var_name + else: + type = "enum %s" % self.type + val = var_name + + if self.shr > 0: + val = "(%s >> %d)" % (val, self.shr) + + return (type, val) + +def tab_to(name, value): + tab_count = (68 - (len(name) & ~7)) // 8 + if tab_count <= 0: + tab_count = 1 + print(name + ('\t' * tab_count) + value) + +def mask(low, high): + return ((0xffffffffffffffff >> (64 - (high + 1 - low))) << low) + +def field_name(reg, f): + if f.name: + name = f.name.lower() + else: + # We hit this path when a reg is defined with no bitset fields, ie. 
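+        # a bare <reg32 offset="0x0000" name="ID"/> element (hypothetical
+        # example) that declares no <bitfield> children.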
+ # + name = reg.name.lower() + + if (name in [ "double", "float", "int" ]) or not (name[0].isalpha()): + name = "_" + name + + return name + +class Bitset(object): + def __init__(self, name, template): + self.name = name + self.inline = False + if template: + self.fields = template.fields[:] + else: + self.fields = [] + + # Get address field if there is one in the bitset, else return None: + def get_address_field(self): + for f in self.fields: + if f.type in [ "address", "waddress" ]: + return f + return None + + def dump_regpair_builder(self, reg): + print("#ifndef NDEBUG") + known_mask = 0 + for f in self.fields: + known_mask |= mask(f.low, f.high) + if f.type in [ "boolean", "address", "waddress" ]: + continue + type, val = f.ctype("fields.%s" % field_name(reg, f)) + print(" assert((%-40s & 0x%08x) == 0);" % (val, 0xffffffff ^ mask(0 , f.high - f.low))) + print(" assert((%-40s & 0x%08x) == 0);" % ("fields.unknown", known_mask)) + print("#endif\n") + + print(" return (struct fd_reg_pair) {") + if reg.array: + print(" .reg = REG_%s(__i)," % reg.full_name) + else: + print(" .reg = REG_%s," % reg.full_name) + + print(" .value =") + for f in self.fields: + if f.type in [ "address", "waddress" ]: + continue + else: + type, val = f.ctype("fields.%s" % field_name(reg, f)) + print(" (%-40s << %2d) |" % (val, f.low)) + value_name = "dword" + if reg.bit_size == 64: + value_name = "qword" + print(" fields.unknown | fields.%s," % (value_name,)) + + address = self.get_address_field() + if address: + print(" .bo = fields.bo,") + print(" .is_address = true,") + if f.type == "waddress": + print(" .bo_write = true,") + print(" .bo_offset = fields.bo_offset,") + print(" .bo_shift = %d," % address.shr) + print(" .bo_low = %d," % address.low) + + print(" };") + + def dump_pack_struct(self, reg=None): + if not reg: + return + + prefix = reg.full_name + + print("struct %s {" % prefix) + for f in self.fields: + if f.type in [ "address", "waddress" ]: + tab_to(" __bo_type", "bo;") + tab_to(" uint32_t", "bo_offset;") + continue + name = field_name(reg, f) + + type, val = f.ctype("var") + + tab_to(" %s" % type, "%s;" % name) + if reg.bit_size == 64: + tab_to(" uint64_t", "unknown;") + tab_to(" uint64_t", "qword;") + else: + tab_to(" uint32_t", "unknown;") + tab_to(" uint32_t", "dword;") + print("};\n") + + if reg.array: + print("static inline struct fd_reg_pair\npack_%s(uint32_t __i, struct %s fields)\n{" % + (prefix, prefix)) + else: + print("static inline struct fd_reg_pair\npack_%s(struct %s fields)\n{" % + (prefix, prefix)) + + self.dump_regpair_builder(reg) + + print("\n}\n") + + if self.get_address_field(): + skip = ", { .reg = 0 }" + else: + skip = "" + + if reg.array: + print("#define %s(__i, ...) pack_%s(__i, __struct_cast(%s) { __VA_ARGS__ })%s\n" % + (prefix, prefix, prefix, skip)) + else: + print("#define %s(...) 
pack_%s(__struct_cast(%s) { __VA_ARGS__ })%s\n" % + (prefix, prefix, prefix, skip)) + + + def dump(self, prefix=None): + if prefix == None: + prefix = self.name + for f in self.fields: + if f.name: + name = prefix + "_" + f.name.upper() + else: + name = prefix + + if not f.name and f.low == 0 and f.shr == 0 and not f.type in ["float", "fixed", "ufixed"]: + pass + elif f.type == "boolean" or (f.type == None and f.low == f.high): + tab_to("#define %s" % name, "0x%08x" % (1 << f.low)) + else: + tab_to("#define %s__MASK" % name, "0x%08x" % mask(f.low, f.high)) + tab_to("#define %s__SHIFT" % name, "%d" % f.low) + type, val = f.ctype("val") + + print("static inline uint32_t %s(%s val)\n{" % (name, type)) + if f.shr > 0: + print("\tassert(!(val & 0x%x));" % mask(0, f.shr - 1)) + print("\treturn ((%s) << %s__SHIFT) & %s__MASK;\n}" % (val, name, name)) + print() + +class Array(object): + def __init__(self, attrs, domain, variant): + if "name" in attrs: + self.name = attrs["name"] + else: + self.name = "" + self.domain = domain + self.variant = variant + self.offset = int(attrs["offset"], 0) + self.stride = int(attrs["stride"], 0) + self.length = int(attrs["length"], 0) + if "usage" in attrs: + self.usages = attrs["usage"].split(',') + else: + self.usages = None + + def dump(self): + print("#define _%s(i0) (0x%08x + 0x%x*(i0))\n" % (self.name, self.offset, self.stride)) + + def dump_pack_struct(self): + pass + + def dump_regpair_builder(self): + pass + +class Reg(object): + def __init__(self, attrs, domain, array, bit_size): + self.name = attrs["name"] + self.domain = domain + self.array = array + self.offset = int(attrs["offset"], 0) + self.type = None + self.bit_size = bit_size + if array: + self.name = array.name + "_" + self.name + self.full_name = self.name + + def dump(self): + if self.array: + offset = self.array.offset + self.offset + print("static inline uint32_t %s(uint32_t i0) { return 0x%08x + 0x%x*i0; }" % (self.full_name, offset, self.array.stride)) + else: + tab_to("#define %s" % self.full_name, "0x%08x" % self.offset) + + if self.bitset.inline: + self.bitset.dump(self.full_name) + + def dump_pack_struct(self): + if self.bitset.inline: + self.bitset.dump_pack_struct(self) + + def dump_regpair_builder(self): + if self.bitset.inline: + self.bitset.dump_regpair_builder(self) + + def dump_py(self): + print("\tREG_%s = 0x%08x" % (self.full_name, self.offset)) + + +class Parser(object): + def __init__(self): + self.current_array = None + self.current_domain = None + self.current_prefix = None + self.current_prefix_type = None + self.current_stripe = None + self.current_bitset = None + self.current_bitsize = 32 + # The varset attribute on the domain specifies the enum which + # specifies all possible hw variants: + self.current_varset = None + # Regs that have multiple variants.. 
we only generated the C++ + # template based struct-packers for these + self.variant_regs = {} + # Information in which contexts regs are used, to be used in + # debug options + self.usage_regs = collections.defaultdict(list) + self.bitsets = {} + self.enums = {} + self.variants = set() + self.file = [] + self.xml_files = [] + self.copyright_year = None + self.authors = [] + self.license = None + + def error(self, message): + parser, filename = self.stack[-1] + return Error("%s:%d:%d: %s" % (filename, parser.CurrentLineNumber, parser.CurrentColumnNumber, message)) + + def prefix(self, variant=None): + if self.current_prefix_type == "variant" and variant: + return variant + elif self.current_stripe: + return self.current_stripe + "_" + self.current_domain + elif self.current_prefix: + return self.current_prefix + "_" + self.current_domain + else: + return self.current_domain + + def parse_field(self, name, attrs): + try: + if "pos" in attrs: + high = low = int(attrs["pos"], 0) + elif "high" in attrs and "low" in attrs: + high = int(attrs["high"], 0) + low = int(attrs["low"], 0) + else: + low = 0 + high = self.current_bitsize - 1 + + if "type" in attrs: + type = attrs["type"] + else: + type = None + + if "shr" in attrs: + shr = int(attrs["shr"], 0) + else: + shr = 0 + + b = Field(name, low, high, shr, type, self) + + if type == "fixed" or type == "ufixed": + b.radix = int(attrs["radix"], 0) + + self.current_bitset.fields.append(b) + except ValueError as e: + raise self.error(e) + + def parse_varset(self, attrs): + # Inherit the varset from the enclosing domain if not overriden: + varset = self.current_varset + if "varset" in attrs: + varset = self.enums[attrs["varset"]] + return varset + + def parse_variants(self, attrs): + if not "variants" in attrs: + return None + variant = attrs["variants"].split(",")[0] + if "-" in variant: + variant = variant[:variant.index("-")] + + varset = self.parse_varset(attrs) + + assert varset.has_name(variant) + + return variant + + def add_all_variants(self, reg, attrs, parent_variant): + # TODO this should really handle *all* variants, including dealing + # with open ended ranges (ie. "A2XX,A4XX-") (we have the varset + # enum now to make that possible) + variant = self.parse_variants(attrs) + if not variant: + variant = parent_variant + + if reg.name not in self.variant_regs: + self.variant_regs[reg.name] = {} + else: + # All variants must be same size: + v = next(iter(self.variant_regs[reg.name])) + assert self.variant_regs[reg.name][v].bit_size == reg.bit_size + + self.variant_regs[reg.name][variant] = reg + + def add_all_usages(self, reg, usages): + if not usages: + return + + for usage in usages: + self.usage_regs[usage].append(reg) + + self.variants.add(reg.domain) + + def do_validate(self, schemafile): + try: + from lxml import etree + + parser, filename = self.stack[-1] + dirname = os.path.dirname(filename) + + # we expect this to look like schema.xsd.. I think + # technically it is supposed to be just a URL, but that doesn't + # quite match up to what we do.. Just skip over everything up to + # and including the first whitespace character: + schemafile = schemafile[schemafile.rindex(" ")+1:] + + # this is a bit cheezy, but the xml file to validate could be + # in a child director, ie. we don't really know where the schema + # file is, the way the rnn C code does. 
So if it doesn't exist + # just look one level up + if not os.path.exists(dirname + "/" + schemafile): + schemafile = "../" + schemafile + + if not os.path.exists(dirname + "/" + schemafile): + raise self.error("Cannot find schema for: " + filename) + + xmlschema_doc = etree.parse(dirname + "/" + schemafile) + xmlschema = etree.XMLSchema(xmlschema_doc) + + xml_doc = etree.parse(filename) + if not xmlschema.validate(xml_doc): + error_str = str(xmlschema.error_log.filter_from_errors()[0]) + raise self.error("Schema validation failed for: " + filename + "\n" + error_str) + except ImportError: + print("lxml not found, skipping validation", file=sys.stderr) + + def do_parse(self, filename): + filepath = os.path.abspath(filename) + if filepath in self.xml_files: + return + self.xml_files.append(filepath) + file = open(filename, "rb") + parser = xml.parsers.expat.ParserCreate() + self.stack.append((parser, filename)) + parser.StartElementHandler = self.start_element + parser.EndElementHandler = self.end_element + parser.CharacterDataHandler = self.character_data + parser.buffer_text = True + parser.ParseFile(file) + self.stack.pop() + file.close() + + def parse(self, rnn_path, filename): + self.path = rnn_path + self.stack = [] + self.do_parse(filename) + + def parse_reg(self, attrs, bit_size): + self.current_bitsize = bit_size + if "type" in attrs and attrs["type"] in self.bitsets: + bitset = self.bitsets[attrs["type"]] + if bitset.inline: + self.current_bitset = Bitset(attrs["name"], bitset) + self.current_bitset.inline = True + else: + self.current_bitset = bitset + else: + self.current_bitset = Bitset(attrs["name"], None) + self.current_bitset.inline = True + if "type" in attrs: + self.parse_field(None, attrs) + + variant = self.parse_variants(attrs) + if not variant and self.current_array: + variant = self.current_array.variant + + self.current_reg = Reg(attrs, self.prefix(variant), self.current_array, bit_size) + self.current_reg.bitset = self.current_bitset + + if len(self.stack) == 1: + self.file.append(self.current_reg) + + if variant is not None: + self.add_all_variants(self.current_reg, attrs, variant) + + usages = None + if "usage" in attrs: + usages = attrs["usage"].split(',') + elif self.current_array: + usages = self.current_array.usages + + self.add_all_usages(self.current_reg, usages) + + def start_element(self, name, attrs): + self.cdata = "" + if name == "import": + filename = attrs["file"] + self.do_parse(os.path.join(self.path, filename)) + elif name == "domain": + self.current_domain = attrs["name"] + if "prefix" in attrs: + self.current_prefix = self.parse_variants(attrs) + self.current_prefix_type = attrs["prefix"] + else: + self.current_prefix = None + self.current_prefix_type = None + if "varset" in attrs: + self.current_varset = self.enums[attrs["varset"]] + elif name == "stripe": + self.current_stripe = self.parse_variants(attrs) + elif name == "enum": + self.current_enum_value = 0 + self.current_enum = Enum(attrs["name"]) + self.enums[attrs["name"]] = self.current_enum + if len(self.stack) == 1: + self.file.append(self.current_enum) + elif name == "value": + if "value" in attrs: + value = int(attrs["value"], 0) + else: + value = self.current_enum_value + self.current_enum.values.append((attrs["name"], value)) + elif name == "reg32": + self.parse_reg(attrs, 32) + elif name == "reg64": + self.parse_reg(attrs, 64) + elif name == "array": + self.current_bitsize = 32 + variant = self.parse_variants(attrs) + self.current_array = Array(attrs, self.prefix(variant), variant) + 
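+            # A (hypothetical) element such as
+            #   <array name="JOB" offset="0x100" stride="0x10" length="4"/>
+            # ends up as "#define _JOB(i0) (0x00000100 + 0x10*(i0))" when
+            # Array.dump() runs.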
+            if len(self.stack) == 1:
+                self.file.append(self.current_array)
+        elif name == "bitset":
+            self.current_bitset = Bitset(attrs["name"], None)
+            if "inline" in attrs and attrs["inline"] == "yes":
+                self.current_bitset.inline = True
+            self.bitsets[self.current_bitset.name] = self.current_bitset
+            if len(self.stack) == 1 and not self.current_bitset.inline:
+                self.file.append(self.current_bitset)
+        elif name == "bitfield" and self.current_bitset:
+            self.parse_field(attrs["name"], attrs)
+        elif name == "database":
+            self.do_validate(attrs["xsi:schemaLocation"])
+        elif name == "copyright":
+            self.copyright_year = attrs["year"]
+        elif name == "author":
+            self.authors.append(attrs["name"] + " <" + attrs["email"] + "> " + attrs["name"])
+
+    def end_element(self, name):
+        if name == "domain":
+            self.current_domain = None
+            self.current_prefix = None
+            self.current_prefix_type = None
+        elif name == "stripe":
+            self.current_stripe = None
+        elif name == "bitset":
+            self.current_bitset = None
+        elif name == "reg32":
+            self.current_reg = None
+        elif name == "array":
+            self.current_array = None
+        elif name == "enum":
+            self.current_enum = None
+        elif name == "license":
+            self.license = self.cdata
+
+    def character_data(self, data):
+        self.cdata += data
+
+    def dump_reg_usages(self):
+        d = collections.defaultdict(list)
+        for usage, regs in self.usage_regs.items():
+            for reg in regs:
+                variants = self.variant_regs.get(reg.name)
+                if variants:
+                    for variant, vreg in variants.items():
+                        if reg == vreg:
+                            d[(usage, variant)].append(reg)
+                else:
+                    for variant in self.variants:
+                        d[(usage, variant)].append(reg)
+
+        print("#ifdef __cplusplus")
+
+        for usage, regs in self.usage_regs.items():
+            print("template<chip CHIP> constexpr inline uint16_t %s_REGS[] = {};" % (usage.upper()))
+
+        for (usage, variant), regs in d.items():
+            offsets = []
+
+            for reg in regs:
+                if reg.array:
+                    for i in range(reg.array.length):
+                        offsets.append(reg.array.offset + reg.offset + i * reg.array.stride)
+                        if reg.bit_size == 64:
+                            offsets.append(offsets[-1] + 1)
+                else:
+                    offsets.append(reg.offset)
+                    if reg.bit_size == 64:
+                        offsets.append(offsets[-1] + 1)
+
+            offsets.sort()
+
+            print("template<> constexpr inline uint16_t %s_REGS<%s>[] = {" % (usage.upper(), variant))
+            for offset in offsets:
+                print("\t%s," % hex(offset))
+            print("};")
+
+        print("#endif")
+
+    def dump(self):
+        enums = []
+        bitsets = []
+        regs = []
+        for e in self.file:
+            if isinstance(e, Enum):
+                enums.append(e)
+            elif isinstance(e, Bitset):
+                bitsets.append(e)
+            else:
+                regs.append(e)
+
+        for e in enums + bitsets + regs:
+            e.dump()
+
+        self.dump_reg_usages()
+
+        print("static inline char *ethosu_get_cmd_name(unsigned domain, uint32_t cmd) {")
+        for e in regs:
+            if e.array:
+                continue
+            domain = 0 if e.domain == "CMD0" else 1
+            print(" if (domain == %d && cmd == 0x%08x) return \"%s\";" % (domain, e.offset, e.full_name))
+        print(" return NULL;")
+        print("}\n")
+
+    def dump_regs_py(self):
+        regs = []
+        for e in self.file:
+            if isinstance(e, Reg):
+                regs.append(e)
+
+        for e in regs:
+            e.dump_py()
+
+
+    def dump_reg_variants(self, regname, variants):
+        # Don't bother for things that only have a single variant:
+        if len(variants) == 1:
+            return
+        print("#ifdef __cplusplus")
+        print("struct __%s {" % regname)
+        # TODO be more clever.. we should probably figure out which
+        # fields have the same type in all variants (in which they
+        # appear) and stuff everything else in a variant specific
+        # sub-structure.
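+        # The struct emitted below is the union of all variants' fields;
+        # the templated __REGNAME<CHIP>() builder then dispatches to the
+        # matching variant's pack code, so fields that do not exist on
+        # the chosen variant are simply left unused.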
+ seen_fields = [] + bit_size = 32 + array = False + address = None + for variant in variants.keys(): + print(" /* %s fields: */" % variant) + reg = variants[variant] + bit_size = reg.bit_size + array = reg.array + for f in reg.bitset.fields: + fld_name = field_name(reg, f) + if fld_name in seen_fields: + continue + seen_fields.append(fld_name) + name = fld_name.lower() + if f.type in [ "address", "waddress" ]: + if address: + continue + address = f + tab_to(" __bo_type", "bo;") + tab_to(" uint32_t", "bo_offset;") + continue + type, val = f.ctype("var") + tab_to(" %s" %type, "%s;" %name) + print(" /* fallback fields: */") + if bit_size == 64: + tab_to(" uint64_t", "unknown;") + tab_to(" uint64_t", "qword;") + else: + tab_to(" uint32_t", "unknown;") + tab_to(" uint32_t", "dword;") + print("};") + # TODO don't hardcode the varset enum name + varenum = "chip" + print("template <%s %s>" % (varenum, varenum.upper())) + print("static inline struct fd_reg_pair") + xtra = "" + xtravar = "" + if array: + xtra = "int __i, " + xtravar = "__i, " + print("__%s(%sstruct __%s fields) {" % (regname, xtra, regname)) + for variant in variants.keys(): + print(" if (%s == %s) {" % (varenum.upper(), variant)) + reg = variants[variant] + reg.dump_regpair_builder() + print(" } else") + print(" assert(!\"invalid variant\");") + print("}") + + if bit_size == 64: + skip = ", { .reg = 0 }" + else: + skip = "" + + print("#define %s(VARIANT, %s...) __%s(%s{__VA_ARGS__})%s" % (regname, xtravar, regname, xtravar, skip)) + print("#endif /* __cplusplus */") + + def dump_structs(self): + for e in self.file: + e.dump_pack_struct() + + for regname in self.variant_regs: + self.dump_reg_variants(regname, self.variant_regs[regname]) diff --git a/src/gallium/drivers/ethosu/meson.build b/src/gallium/drivers/ethosu/meson.build new file mode 100644 index 00000000000..28f696a1bf5 --- /dev/null +++ b/src/gallium/drivers/ethosu/meson.build @@ -0,0 +1,33 @@ +# Copyright 2019 Google, Inc +# SPDX-License-Identifier: MIT + +ethosu_registers = custom_target( + 'ethosu_registers.h', + input : ['gen_parser.py', 'gen_header.py', 'registers.xml'], + output : 'ethosu_registers.h', + command : [prog_python, '@INPUT1@', '--rnn', '.', '--xml', '@INPUT2@', 'c-defines'], + capture : true, +) + +files_ethosu = files( + 'ethosu_cmd.c', + 'ethosu_coefs.c', + 'ethosu_device.c', + 'ethosu_lower.c', + 'ethosu_ml.c', + 'ethosu_sched.c', + 'mlw_codec/mlw_encode.c', +) + +libethosu = static_library( + 'ethosu', + [files_ethosu, ethosu_registers], + include_directories : [inc_gallium_aux, inc_gallium, inc_include, inc_src], + gnu_symbol_visibility : 'hidden', + dependencies : [idep_mesautil, dep_libdrm], +) + +driver_ethosu = declare_dependency( + compile_args : '-DGALLIUM_ETHOSU', + link_with : [libethosuwinsys, libethosu] +) diff --git a/src/gallium/drivers/ethosu/mlw_codec/mlw_common.h b/src/gallium/drivers/ethosu/mlw_codec/mlw_common.h new file mode 100644 index 00000000000..4bb38387221 --- /dev/null +++ b/src/gallium/drivers/ethosu/mlw_codec/mlw_common.h @@ -0,0 +1,29 @@ +/* + * SPDX-FileCopyrightText: Copyright 2020, 2022 Arm Limited and/or its affiliates + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <stdint.h>
+
+#ifndef MLW_COMMON_H
+#define MLW_COMMON_H
+
+#define ZDIV_DISABLE 6 // not alternating mode
+#define ZDIV_EOS 7 // indicates end of stream
+
+#define WDIV_UNCOMPRESSED 7 // indicates uncompressed weights
+
+#endif
diff --git a/src/gallium/drivers/ethosu/mlw_codec/mlw_encode.c b/src/gallium/drivers/ethosu/mlw_codec/mlw_encode.c
new file mode 100644
index 00000000000..47dd132090b
--- /dev/null
+++ b/src/gallium/drivers/ethosu/mlw_codec/mlw_encode.c
@@ -0,0 +1,1186 @@
+/*
+ * SPDX-FileCopyrightText: Copyright 2020-2022, 2024 Arm Limited and/or its affiliates
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <assert.h>
+#include <math.h>
+#include <stdarg.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "mlw_common.h"
+#include "mlw_encode.h"
+
+#define DPRINTF(...)
+//#define DPRINTF(...) printf(__VA_ARGS__)
+
+#define ZERO_RUN_THRES 4
+
+#ifndef min
+#define min(a,b) ((a)<(b)?(a):(b))
+#endif
+#ifndef max
+#define max(a,b) ((a)>(b)?(a):(b))
+#endif
+
+#define CHECKED_MALLOC(var, size) { if ( !(var = malloc(size)) ) break; }
+
+typedef struct palette {
+    int16_t lut[32];
+    int16_t inv_lut[512];
+    int palsize;       // number of palette entries
+    int palbits;       // bit width of palette entries
+    int use_zero_runs; // zeros are coded separately
+    int only_palette;  // no values outside the palette
+    int direct_offset; // added to the decoded weight index before direct conversion to sign/mag
+    int only_zeros;    // special case that the section is all zeros
+} palette_t;
+
+static int is_power_of_two( int x ) {
+    return ((x-1) & x)==0;
+}
+
+static int round_up_divide(int num, int den)
+{
+    return (num + den - 1) / den;
+}
+
+static int round_up(int num, int den)
+{
+    return round_up_divide(num, den) * den;
+}
+
+static int get_palette_index_bits( int size ) {
+    int i;
+    for(i=7; i>=0; i--)
+        if (size > (1<<i) )
+            return i+1;
+    return 0;
+}
+
+// Search the input stream for positions where a new palette should be
+// created, so that sections with different value distributions each get
+// their own palette. Returns the number of sections (at least one,
+// starting at position 0), or -1 on allocation failure.
+// *palette_restart_positions is allocated here and owned by the caller.
+static int search_palette_sections( int16_t *buf, int size, int **palette_restart_positions ) {
+    int i, j, got_palette = 0, zero_cnt = 0;
+    int palette_size = 0, last_restart_idx = 0, restart_i = 1;
+    int prev_idx[512]; // For each value, the position of its previous occurrence
+    int *restart_pos;
+    int max_palettes = round_up_divide(size, 64);
+
+    *palette_restart_positions = NULL;
+    restart_pos = (int*)malloc( max_palettes*sizeof(int) );
+    if (!restart_pos) {
+        return -1;
+    }
+    restart_pos[0] = 0;
+    memset( prev_idx, -1, sizeof(prev_idx) );
+    for(i=0; i<size; i++) {
+        // Decide if the zero value should be excluded from the palette
+        int exclude_zero = zero_cnt > (i-last_restart_idx)/4;
+
+        if (got_palette) {
+            // Check if the next value is not covered by the current palette
+            if ( prev_idx[ buf[i]+256 ] < last_restart_idx ) {
+                // New value: increase the palette size
+                palette_size++;
+                DPRINTF("Note: at pos %d extend palette to size %d\n", i, palette_size);
+                if ( is_power_of_two(palette_size-1-exclude_zero) ) {
+                    if ( (i - last_restart_idx - zero_cnt) > 512 || (palette_size-exclude_zero)>32 ) {
+                        // create a new palette because we extend a long lasting palette to require one more index bit
+                        DPRINTF("Note: at pos %d create new palette because previous has to increase one more index bit.
last_restart_idx %d n %d zero_cnt %d\n", i, last_restart_idx, i - last_restart_idx, zero_cnt ); + if (restart_i == max_palettes) { + max_palettes = max_palettes*2; + restart_pos = (int*)realloc( restart_pos, max_palettes*sizeof(int) ); + if (!restart_pos) { + return -1; + } + } + DPRINTF("restart %d pos %d\n", restart_i, i); + restart_pos[restart_i++] = i; + last_restart_idx = i; + got_palette=0; + zero_cnt=0; + } + } + } + } + + prev_idx[ buf[i]+256 ] = i; + if (buf[i]==0) + zero_cnt++; + + static const int window_sizes[5][2] = {{32,1}, {64,1}, {128,1}, {256,1}, {512,1}}; + int k; + // loop over window sizes + for(k=0; k<5; k++) { + // Every Nth non-zero value, count what would be the size of a palette covering the last N NZ. + int N = window_sizes[k][0] * (got_palette?2:1); + if ( (i - last_restart_idx - zero_cnt) > 0 && ((i - last_restart_idx - zero_cnt) % N)==0 ) { + // Search backward to the position N nonzero values earlier + int nzcnt=0; + for( j=i; j>last_restart_idx; j--) { + if ( buf[j]!=0 ) { + if (nzcnt==N+1) + break; + nzcnt++; + } + } + int restart_idx = j; + + // Calculate the size of a new palette (starting at restart_idx) + int new_palette_size=0; + for(j=0; j<512; j++) { + if ( prev_idx[j] >= restart_idx ) { + new_palette_size++; + } + } + + int create_new_palette=0; + if (got_palette) { + int new_size_bits = get_palette_index_bits( new_palette_size - exclude_zero ); + int old_size_bits = get_palette_index_bits( palette_size - exclude_zero ); + int savings = N*(old_size_bits*15-new_size_bits*15)/16 - new_palette_size*8 - 20; + if ( savings>0 ) { + // Create new palette because it can be smaller than the existing palette + create_new_palette=1; + DPRINTF("Note: at pos %d restart smaller palette\n", restart_idx); + } + } else { + if ( (new_palette_size-exclude_zero) <= 32) { + int new_size_bits = get_palette_index_bits( new_palette_size - exclude_zero ); + // estimate if we will make savings by using palette mode + int savings = N*(90-new_size_bits*15)/16 - new_palette_size*8 - 20; + create_new_palette = savings>0; + } + } + if (create_new_palette) { + palette_size=new_palette_size; + got_palette=1; + last_restart_idx = restart_idx; + DPRINTF("Note: at pos %d create palette of size %d\n", last_restart_idx, new_palette_size); + if ( restart_pos[restart_i-1] != last_restart_idx) { + if (restart_i == max_palettes) { + max_palettes = max_palettes*2; + restart_pos = (int*)realloc( restart_pos, max_palettes*sizeof(int) ); + if (!restart_pos) { + return -1; + } + } + restart_pos[restart_i++] = last_restart_idx; + } + zero_cnt=0; + for( j=last_restart_idx; j<=i; j++) + if (buf[j]==0) + zero_cnt++; + } + } + } + } + // Reallocate to actual size + *palette_restart_positions = (int*)realloc( restart_pos, restart_i*sizeof(int) ); + return *palette_restart_positions ? restart_i : -1; +} + +// Calculate frequency table +static void calc_freq( const int16_t *buf, int size, int freq[512] ) { + int i; + memset(freq, 0, 512*sizeof(int)); + for(i=0; ibb ? -1 : aa0) { + all_max_val = max(all_max_val, palval); + } + } + + // Count number of non-used weight values around zero (0, -1, +1, -2, +2 etc) + for(i=0; i<31; i++) { + if ((freq64[i]>>16)!=0) + break; + } + p->direct_offset = i; + + // Sort in descending frequency order + qsort(freq64, 512, sizeof(uint64_t), cmp_uint64); + + // Identify special case that there are no weights to code + // in the weight index stream (i.e. 
all weights are zeros) + p->only_zeros = (freq64[0]>>16)==0; + if (p->only_zeros) { + p->direct_offset=0; + } + + // Check if all weights fit into the palette (and the palette is not empty) + p->only_palette = (freq64[0]>>16)>0 && (freq64[32]>>16)==0; + + int max_palette_size; + if (p->only_palette) { + max_palette_size = 32; + } else { + // For direct-lut we must make sure that the encoded weight + // index is not > 511. We do that by limiting the palette size + // such that the greatest value can be reached after subtracting + // the palette size. + max_palette_size = min(32, 511-all_max_val); + if (max_palette_size==1) { + max_palette_size=0; // because palette of size 1 is not supported + } + } + + // Setup the 32 entry palette + int16_t palette_max_val = 0, val; + int cnt, pal_cnt=0; + for(i=0; i>16); + val = freq64[i]&0xffff; + if ( cnt==0 ) + break; + p->lut[i] = val; + palette_max_val = max(palette_max_val, val); + pal_cnt+=cnt; + } + if (i==1) + p->lut[i++] = 0; // palette size of 1 is not supported, make it 2 + + // Heuristic for when to use the palette. If more than half of the + // weights are in the palette then we use it. This ensures we don't + // use palette for e.g. rectangular distributions. + int palbits_val; + if (pal_cnt > all_cnt/2) { + p->palsize = i; + palbits_val = palette_max_val; + } else { + // No palette + p->palsize = 0; + // If no palette, then palbits is used to specify the + // number of bits required for uncompressed mode, i.e. + // the number of bits for the greatest weight value + palbits_val = all_max_val; + } + + // the palette entry bit width + // minimum 2bits (because PALBITS is in range 2..9) + int palbits=2; + while( (1<palbits = palbits; + p->use_zero_runs = use_zero_runs; +} + +// Return 1 if zero runs should be used +// If palette_size is 512, then palette is not used (in that case the palette is setup +// with the standard alternating unsigned to signed mapping) +static int find_palette( const int16_t *inbuf, int inbuf_size, palette_t *p) { + int freq[512], i; + + // Calculate frequencies of the given weight stream + calc_freq( inbuf, inbuf_size, freq); + + // Find two most common values + int most_common_freq[2]={0}, most_common_val[2]={0}; + for(i=0; i<512; i++) { + if ( freq[i] > most_common_freq[0] ) { + most_common_freq[1] = most_common_freq[0]; + most_common_val[1] = most_common_val[0]; + most_common_freq[0] = freq[i]; + most_common_val[0] = i-256; + } else if ( freq[i] > most_common_freq[1] ) { + most_common_freq[1] = freq[i]; + most_common_val[1] = i-256; + } + } + + // Decide if zero-runs (alternating mode) should be used: + // * zero should be the most common symbol + // * zero should be sufficiently more common than the second most common symbol + int use_zero_runs = most_common_val[0]==0 && most_common_freq[0] > ZERO_RUN_THRES*most_common_freq[1]; + + // Create the palette + create_palette( freq, use_zero_runs, p); + + return use_zero_runs; +} + +static void create_inverse_palette( palette_t *p) { + int i; + memset( p->inv_lut, 0, sizeof(p->inv_lut)); + for(i=0; i<512; i++) { + int val = i; + int sign = val&1; + int mag = val>>1; + int weight = sign ? -mag : mag; + int index = weight+256; + if (index >= 0 && index < 512) + p->inv_lut[ index ] = i + p->palsize - p->direct_offset; + } + for(i=0; ipalsize; i++) { + int val = p->lut[i]; + int sign = val&1; + int mag = val>>1; + int weight = sign ? 
-mag : mag; + int index = weight+256; + assert(index >= 0 && index < 512); + if (index >= 0 && index < 512) + p->inv_lut[ index ] = i; + } +} + +#define NWCFG 13 +#define NZCFG 4 // restrict search to ZDIV=0..3 +#define MAX_ZWCFG (max(NWCFG,NZCFG)) + +// search state +typedef struct search_state { + int bitcnt; // number of bits to reach this state + uint8_t prev_cfg; // previous grc parameter config +} search_state_t; + +// (trunc<<4) | div, 0x20 means uncompressed +static const uint8_t w_grc_params[] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x20 }; +static const uint8_t z_grc_params[] = { 0x00, 0x01, 0x02, 0x03, 0x04 }; + + + +// An algorithm similar to the Viterbi algorithm is used to search for a +// good GRC parameter sequence for the given input value sequence. +// The inval buffer can contain weights, weight indices or runs. +// The return value is the resulting number of bitstream sections. +static int search_grc_params( const int *inval_buf, + int n_inval, + int zrun_mode, + int uncompressed_bits, + uint8_t *grc_param_cfg, + int *grc_param_pos, + int max_grc_param_cfg, + int *existing_grc_param_pos, + int n_existing_grc_param_pos, + int *bitcnt ) +{ + int n_cfg = zrun_mode ? NZCFG : NWCFG; + const uint8_t *grc_params = zrun_mode ? z_grc_params : w_grc_params; + int i,j; + + search_state_t *state[MAX_ZWCFG]; + for(i=0; i>4; + int q = value>>div; + int bits = trunc ? min(q+1,2) + div : q+1+div; + if (!zrun_mode && ((trunc && q>2) || q>31)) + bits=10000; // it's not possible to code the current value; give it a high cost + if (trunc==2) + bits=uncompressed_bits; + + if ( best_bitcnt + cmd_cost < state[j][i].bitcnt ) { + // Change GRC parameters + state[j][i+1].prev_cfg = best_cfg; + state[j][i+1].bitcnt = best_bitcnt + cmd_cost + bits; + } else { + // Keep same GRC parameters + state[j][i+1].prev_cfg = j; + state[j][i+1].bitcnt = state[j][i].bitcnt + bits; + } + } + } + + + // Best GRC parameter + int best_bitcnt=0x7fffffff, best_cfg=0; + for(j=0; j=0; i--) { + if (state[cfg][i].prev_cfg != cfg || i==0) { + n_cmds++; + cfg = state[cfg][i].prev_cfg; + } + } + + (void)(max_grc_param_cfg); + assert(n_cmds<=max_grc_param_cfg); + + cfg = best_cfg; + j=n_cmds-1; + int endpos=n_inval; + for(i=n_inval; i>=0; i--) { + if (state[cfg][i].prev_cfg != cfg || i==0) { + grc_param_cfg[j] = cfg; + grc_param_pos[j] = endpos; + j--; + cfg = state[cfg][i].prev_cfg; + endpos = i-1; + } + } + assert(j==-1); + + for(i=0; ibuf = buf; + bb->pos = 0; + bb->buf_size = size; + bb->log_symbols = log_symbols; +} + +static void bitbuf_putbit( bitbuf_t *bb, uint8_t bit) { + int byte_pos = bb->pos>>3; + uint8_t bit_pos = bb->pos&7; + assert( byte_pos >= 0 ); + assert( byte_pos < bb->buf_size ); + bb->buf[ byte_pos ] = ((bb->buf[ byte_pos ] & ~(1U<pos += 1; +} + +static void bitbuf_put( bitbuf_t *bb, const char *name, int len, int data) { + int i; + if (len>0) { + if (bb->log_symbols) + printf("bitbuf: pos %3d %7s len %d data %x\n", bb->pos, name, len, data); + for(i=0; i>i)&1)); + } + } +} + +// Return new bitpos +static int encode_slice( const int *w_value, + const int *z_value, + int nvalues, + palette_t *p, + int new_palette, + int uncompressed_bits, + int w_cfg, + int z_cfg, + uint8_t *bitbuf, + int bitbuf_size, + int bitpos, + int verbose ) +{ + int i,j; + bitbuf_t bitbuf_s, *bb=&bitbuf_s; + bitbuf_init( bb, bitbuf, bitbuf_size, verbose&2?1:0 ); + bb->pos = bitpos; + + assert(nvalues<32768); + if (w_cfg < 0 || z_cfg < 0) + return bitpos; + // GRC parameters for this slice + 
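+    // Golomb-Rice coding: each value v is split into a quotient
+    // q = v >> div and a 'div'-bit remainder; the quotient costs q+1
+    // unary bits and the remainder is stored verbatim. With div=2,
+    // v=11 is coded as q=2 plus remainder 0b11, i.e. 3+2 bits, which
+    // is the "q+1+div" cost model used in search_grc_params().
+    // WTRUNC caps the unary part at two bits (so q <= 2), and
+    // WDIV_UNCOMPRESSED bypasses GRC entirely for high-entropy data.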
int w_grc_div = w_grc_params[w_cfg] & 15; + int w_grc_trunc = (w_grc_params[w_cfg] >> 4)==1; + int w_uncompressed = (w_grc_params[w_cfg] >> 4)==2; + int z_grc_div = z_grc_params[z_cfg] & 15; + + if (w_uncompressed) { + w_grc_div = uncompressed_bits; + } + + int zdiv = p->use_zero_runs ? z_grc_div : ZDIV_DISABLE; + int wdiv = !w_uncompressed ? w_grc_div : WDIV_UNCOMPRESSED; + + if (verbose&1) { + printf("slice: bitoffset %7d slicelen %5d zdiv %d wdiv %d wtrunc %d newpal %d palbits %d palsize %2d\n", + bb->pos, nvalues, zdiv, wdiv, w_grc_trunc, new_palette, p->palbits, p->palsize); + } + + // Write slice header + bitbuf_put( bb, "ZDIV", 3, zdiv); + bitbuf_put( bb, "SLICELEN", 15, nvalues-1 ); + bitbuf_put( bb, "WDIV", 3, wdiv); + bitbuf_put( bb, "WTRUNC", 1, w_grc_trunc ); + bitbuf_put( bb, "NEWPAL", 1, new_palette ); + if (new_palette) { + bitbuf_put( bb, "DIROFS", 5, p->direct_offset ); + bitbuf_put( bb, "PALSIZE", 5, max(0, p->palsize-1)); + bitbuf_put( bb, "PALBITS", 3, p->palbits-2 ); + for(i=0; ipalsize; i++) { + bitbuf_put( bb, "PALETTE", p->palbits, p->lut[i] ); + } + } + + int z_nvalues = nvalues + (new_palette?1:0); + int w_pos=0, z_pos=0; + int w_unary0=0, w_unary1=0, w_unary1_len=0, w_q=-1, w_r=0; + int z_unary=0, z_q=-1, z_r=0; + int w_nsymbols=0, w_remain[12]={0}; + int w_prev_enable=0, w_prev_nsymbols=0, w_prev_remain[12]={0}; + int z_nsymbols=0, z_remain[12]={0}; + int z_prev_enable=0, z_prev_nsymbols=0, z_prev_remain[12]={0}; + int z_unary_len = z_grc_div<3 ? 12 : 8; + do { + int balance = p->use_zero_runs ? w_pos - z_pos : 0; + int w_enable = balance<8 && w_pos=0 && p->use_zero_runs && z_pos5 ? 8 : 12; + while(j>w_grc_div; + w_r = value&((1<=0 && j0 ? (1<0) { + w_unary1 |= w_q>1 ? (1<=0) { + w_remain[w_nsymbols] = w_r; + w_nsymbols++; + w_pos++; + } + } + } + + if (z_enable) { + // Encode chunk (zrun) + j=0; + z_nsymbols=0; + z_unary=0; + while(j>z_grc_div; + z_r = value&((1<=0 && j0 ? (1<=0) { + z_remain[z_nsymbols] = z_r; + z_nsymbols++; + z_pos++; + } + } + } + + // Write chunk to bitstream + if (w_enable && !w_uncompressed) { + bitbuf_put( bb, "WUNARY0", 12, w_unary0); + } + if (z_enable) { + bitbuf_put( bb, "ZUNARY", z_unary_len, z_unary); + } + if (w_enable && !w_uncompressed) { + bitbuf_put( bb, "WUNARY1", w_unary1_len, w_unary1); + } + if (w_prev_enable) { + for(i=0; ipos; +} + +// return new bitpos +static int encode_section( const int16_t *inbuf, + int size, + palette_t *p, + uint8_t *bitbuf, + int bitbuf_size, + int bitpos, + int verbose ) +{ + int uncompressed_bits; + + // Uncompressed mode can only be used if either all weights + // are in the palette OR if the palette is not used. + if (p->only_palette) { + // Uncompressed bits derived from palette size + uncompressed_bits=0; + while( (1<palsize ) + uncompressed_bits++; + } else if (p->palsize==0) { + // Uncompressed bits is palbits (which is the bitdepth of the greatest weight) + uncompressed_bits = p->palbits; + } else { + // Don't use uncompressed + uncompressed_bits = 100; + } + + uint8_t *w_slice_cfg=0; + uint8_t *z_slice_cfg=0; + int *w_slice_pos=0; + int *z_slice_pos=0; + int *weight_values =0; + int *zrun_values = 0; + do { + CHECKED_MALLOC( weight_values, size*sizeof(int) ); + CHECKED_MALLOC( zrun_values, size*sizeof(int) ); + + // Get weights (or weight indicies) AND zero-runs from the input weight stream. 
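+        // With zero runs enabled the two streams alternate: a run length
+        // of zeros precedes every coded weight, so e.g. the section
+        // 0 0 5 7 0 3 yields zero-runs {2,0,1} interleaved with the
+        // palette indices of {5,7,3}; one extra leading run is emitted
+        // when a slice (re)programs the palette (see z_nvalues above).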
+ int i=0, n_weights = 0, zcnt; + while(1) { + if (p->use_zero_runs) { + zcnt=0; + // Count zero run + // Special case: if all weights in the section are zero, we must + // still ensure we have one coded weight so the the slice length + // doesn't become 0. Therefore we skip the first zero run and code + // the zero explicitly as a weight value instead + if (!p->only_zeros || i>0) { + while( iinv_lut[inbuf[i]+256]; + weight_values[n_weights] = value; + n_weights++; + i++; + } + + // Search for good GRC parameters for the weight stream + int n_w_slice, w_bitcnt; + CHECKED_MALLOC( w_slice_cfg, size ); + CHECKED_MALLOC( w_slice_pos, size*sizeof(int) ); + n_w_slice = search_grc_params( weight_values, n_weights, 0, uncompressed_bits, w_slice_cfg, w_slice_pos, size, 0, 0, &w_bitcnt); + if ( n_w_slice < 0 ) { // Memory allocation failed + bitpos = -1; + break; + } + if (n_weights==0) + n_w_slice = 0; + + // Search for good GRC parameters for the zrun stream + int n_z_slice=0, z_bitcnt=0; + if (p->use_zero_runs) { + CHECKED_MALLOC( z_slice_cfg, size ); + CHECKED_MALLOC( z_slice_pos, size*sizeof(int) ); + n_z_slice = search_grc_params( zrun_values, n_weights+1, 1, 0, z_slice_cfg, z_slice_pos, size, w_slice_pos, n_w_slice, &z_bitcnt); + if ( n_z_slice < 0 ) { // Memory allocation failed + bitpos = -1; + break; + } + } + + // Encode bitstream slice + int pos=0, i_w_slice=0, i_z_slice=0, new_palette=1; + while(posuse_zero_runs ? zrun_values+pos+(!new_palette) : 0; + bitpos = encode_slice( weight_values+pos, zrun_buf, len, + p, new_palette, uncompressed_bits, + w_slice_cfg[i_w_slice], p->use_zero_runs ? z_slice_cfg[i_z_slice] : 0, + bitbuf, bitbuf_size, bitpos, verbose ); + new_palette = 0; + + if (i_w_sliceuse_zero_runs) { + free(z_slice_cfg); + free(z_slice_pos); + } + free(weight_values); + free(zrun_values); + + return bitpos; +} + +// Encode the given weight stream +// inbuf uncompressed 9bit signed weights +// inbuf_size number of weights +// outbuf compressed bitstream, buffer is malloced within this function +// verbose if non-zero, printf log +// Return value is the size in bytes of the compressed output +// Return -1 if error +int mlw_encode( int16_t *inbuf, int inbuf_size, uint8_t **outbuf, int verbose) { + int i; + // Range check + for(i=0; i255) { + printf("ERROR: weight out of range at index %d, weight value is %d (valid range is -255..255)\n", i, inbuf[i]); + return -1; + } + } + + int bitbuf_size = inbuf_size*2+1024; + assert(*outbuf == NULL); + *outbuf = malloc( bitbuf_size ); + if (!*outbuf) + { // Failed to allocate buffer + return -1; + } + + // Analyse input data to find palette re-programming points + int *palette_restart_pos = NULL; + int n_restarts = search_palette_sections( inbuf, inbuf_size, &palette_restart_pos); + + // Compress each section (using a single palette) separately + int bitpos = 0; + for ( i = 0; i < n_restarts && bitpos >= 0; i++ ) { + palette_t palette; + int pos, size; + pos = palette_restart_pos[i]; + size = (i= 0 && n_restarts >= 0 ) { // If allocation fails bitpos or n_restarts < 0 + // Add end of stream marker and align to 128bit + bitbuf_t bitbuf_s, *bb=&bitbuf_s; + bitbuf_init( bb, *outbuf, bitbuf_size, verbose&2?1:0 ); + bb->pos = bitpos; + bitbuf_put( bb, "ZDIV", 3, ZDIV_EOS); + bitbuf_put( bb, "BYTEALIGN", (8-(bb->pos&7))&7, 0xff ); + + // Pad with 0xff until 64bit aligned + while( bb->pos & 127 ) { + bitbuf_put( bb, "PAD", 8, 0xff ); + } + bitpos = bb->pos; + + assert((bitpos&127)==0); + int outbuf_size = bitpos/8; + *outbuf = realloc(*outbuf, 
outbuf_size); + if ( *outbuf ) { + ret = outbuf_size; + } + } + + free(palette_restart_pos); + + return ret; +} + +void mlw_free_outbuf( uint8_t *outbuf ) { + if (outbuf) + free(outbuf); +} + +struct brick_buf_s +{ + int16_t* buf; + int* strides; +}; +typedef struct brick_buf_s brick_buf_t; + +static int16_t get_brick_weight(brick_buf_t* buf, int ofm_z, int wy, int wx, int ifm_z) +{ + int16_t* p = buf->buf; + + p += ofm_z * buf->strides[0]; + p += wy * buf->strides[1]; + p += wx * buf->strides[2]; + p += ifm_z * buf->strides[3]; + + return *p; +} + +static void reorder_free(int16_t* buf) +{ + if (buf) + { + free(buf); + } +} + +static int16_t* reorder( + int ifm_ublock_depth, + int ofm_ublock_depth, + int ofm_depth, + int kernel_height, + int kernel_width, + int ifm_depth, + int* strides, + int16_t* inbuf, + int ofm_block_depth, + int is_depthwise, + int is_partkernel, + int ifm_bitdepth, + int decomp_h, + int decomp_w, + int64_t* padded_length) +{ + *padded_length = -1; + /* Size unknown. Start with one page at least */ + int64_t length = round_up(max(1, sizeof(int16_t)* + ofm_depth* + kernel_height* + kernel_width* + ifm_depth), + 4*1024) / sizeof(int16_t); + int16_t* weights = (int16_t*)malloc(length * sizeof(int16_t)); + if (!weights) + { // Alloc failed, so exit + return NULL; + } + + brick_buf_t brick_buf; + brick_buf.buf = inbuf; + brick_buf.strides = strides; + + int ifm_block_depth = is_partkernel || ifm_bitdepth == 16 ? 16 : 32; + int64_t weight_cnt = 0; + for (int ofm_block_z = 0; ofm_block_z < ofm_depth; ofm_block_z += ofm_block_depth) + { + int clipped_ofm_block_depth = min(ofm_block_depth, ofm_depth - ofm_block_z); + // IFM blocks required for the brick + for (int ifm_block_z = 0; ifm_block_z < (is_depthwise ? 1 : ifm_depth); ifm_block_z += ifm_block_depth) + { + int clipped_ifm_block_depth; + if (is_depthwise) + { + clipped_ifm_block_depth = ifm_ublock_depth; + } + else + { + clipped_ifm_block_depth = is_partkernel ? + min(ifm_block_depth, ifm_depth - ifm_block_z) : ifm_block_depth; + } + // Weight decomposition + // Subkernel Splitting (H) + for (int subkernel_y = 0; subkernel_y < kernel_height; subkernel_y += decomp_h) + { + int sub_height = min(kernel_height - subkernel_y, decomp_h); + // Subkernel splitting (W) + for (int subkernel_x = 0; subkernel_x < kernel_width; subkernel_x += decomp_w) + { + int sub_width = min(kernel_width - subkernel_x, decomp_w); + int subkernel_elements = sub_width * sub_height; + // Part kernel first works across the kernel H/W and needs padding + if (is_partkernel) + { + if (ifm_bitdepth == 16 && subkernel_elements % 2 != 0) + { + subkernel_elements = round_up(subkernel_elements, 2); + } + else if (ifm_bitdepth == 8 && subkernel_elements % 4 != 0) + { + subkernel_elements = round_up(subkernel_elements, 4); + } + } + else if (is_depthwise) + { + subkernel_elements = round_up(subkernel_elements, 4); + } + int ifm_block_depth_outer = is_partkernel ? clipped_ifm_block_depth : 1; + int ifm_block_depth_inner = is_partkernel ? 
1 : clipped_ifm_block_depth; + for (int ifm_ublk_outer = 0; ifm_ublk_outer < ifm_block_depth_outer; ifm_ublk_outer += ifm_ublock_depth) + { + // OFM Ublocks in OFM-block over depth + for (int ofm_ublk = 0; ofm_ublk < clipped_ofm_block_depth; ofm_ublk += ofm_ublock_depth) + { + // HW Kernel element traversal - cannot be a H/W loop due to element + // padding requirement on depthwise/part-kernel configurations + for (int element = 0; element < subkernel_elements; element++) + { + int kx = element % sub_width; + int ky = element / sub_width; + // IFM Ublocks in IFM-block over depth (only 1 ublock if depthwise) + // In case of part-kernel-first IFM Ublock traversal have already been handled + // and this loop is ignored. + for (int ifm_ublk_inner = 0; ifm_ublk_inner < ifm_block_depth_inner; ifm_ublk_inner += ifm_ublock_depth) + { + // Feed OFM ublock elements + for (int ofm_ublock_z = 0; ofm_ublock_z < ofm_ublock_depth; ofm_ublock_z++) + { + // Source IFM ublock elements (only 1 element deep if depthwise) + for (int ifm_ublock_z = 0; ifm_ublock_z < (is_depthwise ? 1 : ifm_ublock_depth); ifm_ublock_z++) + { + // Source position within the current subkernel + int wx = subkernel_x + kx; + int wy = subkernel_y + ky; + // Source IFM/OFM slices + int ifm_ublk = ifm_ublk_inner + ifm_ublk_outer; + int ifm_z = ifm_block_z + ifm_ublk + ifm_ublock_z; + int ofm_z = ofm_block_z + ofm_ublk + ofm_ublock_z; + if ((ifm_z < ifm_depth) && (ofm_z < ofm_depth) && (ky < sub_height)) + { + weights[weight_cnt] = get_brick_weight(&brick_buf, ofm_z, wy, wx, ifm_z); + //fprintf(stderr, "weights[%ld] %d ofm_z %d wy %d wx %d ifm_z %d\n", weight_cnt, weights[weight_cnt], ofm_z, wy, wx, ifm_z); + } + else + { + weights[weight_cnt] = 0; + } + weight_cnt++; + if (weight_cnt == length) + { + // Reallocate by doubling the buffer size as needed + length *= 2; + weights = (int16_t*)realloc(weights, length * sizeof(int16_t)); + if (!weights) + { // Realloc failed, so exit + return NULL; + } + } + } + } + } + } + } + } + } + } + } + } + + + weights = (int16_t*)realloc(weights, weight_cnt * sizeof(int16_t)); + if ( weights ) { + *padded_length = weight_cnt; + } + + return weights; +} + +// Reorder and encode the given weight stream +// Return value is the size in bytes of the compressed output +// Return -1 if error +int mlw_reorder_encode( + int ifm_ublock_depth, + int ofm_ublock_depth, + int ofm_depth, + int kernel_height, + int kernel_width, + int ifm_depth, + int* brick_strides, + int16_t* inbuf, + int ofm_block_depth, + int is_depthwise, + int is_partkernel, + int ifm_bitdepth, + int decomp_h, + int decomp_w, + uint8_t **outbuf, // *outbuf must be freed by caller + int64_t* padded_length, + int verbose) +{ + if (verbose) { + fprintf(stderr, "mlw_reorder_encode: %d %d %d %d %d %d (%d %d %d %d) %d %d %d %d %d %d\n", ifm_ublock_depth, + ofm_ublock_depth, + ofm_depth, + kernel_height, + kernel_width, + ifm_depth, + brick_strides[0], + brick_strides[1], + brick_strides[2], + brick_strides[3], + ofm_block_depth, + is_depthwise, + is_partkernel, + ifm_bitdepth, + decomp_h, + decomp_w); + } + /* Reorder weights */ + int16_t* weights = reorder( + ifm_ublock_depth, + ofm_ublock_depth, + ofm_depth, + kernel_height, + kernel_width, + ifm_depth, + brick_strides, + inbuf, + ofm_block_depth, + is_depthwise, + is_partkernel, + ifm_bitdepth, + decomp_h, + decomp_w, + padded_length); + + /* Then encode */ + int output_length = -1; + if (*padded_length > 0 && *padded_length <= INT32_MAX) + { + output_length = mlw_encode(weights, 
(int)*padded_length, outbuf, verbose);
+    }
+    reorder_free(weights);
+
+    return output_length;
+}
diff --git a/src/gallium/drivers/ethosu/mlw_codec/mlw_encode.h b/src/gallium/drivers/ethosu/mlw_codec/mlw_encode.h
new file mode 100644
index 00000000000..3162031e69d
--- /dev/null
+++ b/src/gallium/drivers/ethosu/mlw_codec/mlw_encode.h
@@ -0,0 +1,65 @@
+/*
+ * SPDX-FileCopyrightText: Copyright 2020-2022 Arm Limited and/or its affiliates
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <stdint.h>
+
+#ifndef MLW_ENCODE_H
+#define MLW_ENCODE_H
+
+#ifdef _MSC_VER
+    #define MLW_ENCODE_EXPORTED __declspec(dllexport)
+#else
+    #define MLW_ENCODE_EXPORTED __attribute__((visibility("default")))
+#endif
+
+#if __cplusplus
+extern "C"
+{
+#endif
+
+MLW_ENCODE_EXPORTED
+int mlw_encode(int16_t *inbuf, int inbuf_size, uint8_t **outbuf, int verbose);
+
+MLW_ENCODE_EXPORTED
+void mlw_free_outbuf(uint8_t *outbuf);
+
+MLW_ENCODE_EXPORTED
+int mlw_reorder_encode(
+    int ifm_ublock_depth,
+    int ofm_ublock_depth,
+    int ofm_depth,
+    int kernel_height,
+    int kernel_width,
+    int ifm_depth,
+    int* brick_strides,
+    int16_t* inbuf,
+    int ofm_block_depth,
+    int is_depthwise,
+    int is_partkernel,
+    int ifm_bitdepth,
+    int decomp_h,
+    int decomp_w,
+    uint8_t **outbuf, // *outbuf must be freed by caller
+    int64_t* padded_length,
+    int verbose);
+
+#if __cplusplus
+}
+#endif
+
+#endif
diff --git a/src/gallium/drivers/ethosu/registers.xml b/src/gallium/drivers/ethosu/registers.xml
new file mode 100644
index 00000000000..961accc44ed
--- /dev/null
+++ b/src/gallium/drivers/ethosu/registers.xml
@@ -0,0 +1,399 @@
[XML prolog, <database> root element and <copyright>/<author> markup lost in extraction; only the element text survived:]
+Initial Author.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice (including the
+next paragraph) shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
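+<!-- Sketch of the rules-ng syntax this file uses (hypothetical names,
+     offsets and fields, for illustration only; the real element markup
+     below was lost in extraction):
+       <reg32 offset="0x0" name="NPU_OP" type="cmd0_payload"/>
+       <bitset name="ifm_precision" inline="yes">
+         <bitfield name="FORMAT" low="0" high="1"/>
+       </bitset>
+-->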
[registers.xml body: the enum, bitset and reg32 definitions for the Ethos-U command stream; all element markup was lost in extraction and is not reproduced here]
diff --git a/src/gallium/drivers/ethosu/rules-ng.xsd b/src/gallium/drivers/ethosu/rules-ng.xsd
new file mode 100644
index 00000000000..414dee1d746
--- /dev/null
+++ b/src/gallium/drivers/ethosu/rules-ng.xsd
@@ -0,0 +1,457 @@
[rules-ng.xsd body: the XML Schema for the rules-ng register-database format; the schema markup was lost in extraction. Surviving annotation text: "An updated version of the old rules.xml file from the RivaTV project. Specifications by Pekka Paalanen, preliminary attempt by KoalaBR, first working version by Jakob Bornecrantz. For specifications, see the file rules-ng-format.txt in Nouveau CVS module 'rules-ng'. Version 0.1". The schema documents the author element ("register database author") and the types nickType, databaseType, importType, copyrightType, domainType, groupType, arrayType, stripeType, registerType (used by reg8, reg16, reg32, reg64), bitsetType, bitfieldType, enumType, valueType, refType, the doc elements ("brief documentation, no markup", "root element of documentation sub-tree", "for bold, underline, italics", "definition of a list, ordered or unordered", "items of a list"), and the simple types HexOrNumber, Access and DomainWidth.]
diff --git a/src/gallium/meson.build b/src/gallium/meson.build
index fbbceb2d45f..9b02fd1189a 100644
--- a/src/gallium/meson.build
+++ b/src/gallium/meson.build
@@ -190,6 +190,12 @@ if with_gallium_rocket
 else
   driver_rocket = declare_dependency()
 endif
+if with_gallium_ethosu
+  subdir('winsys/ethosu/drm')
+  subdir('drivers/ethosu')
+else
+  driver_ethosu = declare_dependency()
+endif
 if with_gallium_zink
   subdir('drivers/zink')
 else
diff --git a/src/gallium/targets/dri/meson.build b/src/gallium/targets/dri/meson.build
index 97f641c4e98..134d7e4adf8 100644
--- a/src/gallium/targets/dri/meson.build
+++ b/src/gallium/targets/dri/meson.build
@@ -59,7 +59,7 @@ libgallium_dri = shared_library(
       driver_kmsro, driver_v3d, driver_vc4, driver_freedreno, driver_etnaviv,
       driver_tegra, driver_i915, driver_svga, driver_virgl, driver_panfrost,
       driver_iris, driver_lima, driver_zink, driver_d3d12,
-      driver_asahi, driver_crocus, driver_rocket
+      driver_asahi, driver_crocus, driver_rocket, driver_ethosu
     ],
install : true, name_suffix : libname_suffix, diff --git a/src/gallium/winsys/ethosu/drm/ethosu_drm_public.h b/src/gallium/winsys/ethosu/drm/ethosu_drm_public.h new file mode 100644 index 00000000000..8d45a0c2322 --- /dev/null +++ b/src/gallium/winsys/ethosu/drm/ethosu_drm_public.h @@ -0,0 +1,17 @@ +/* + * Copyright 2014 Broadcom + * Copyright 2018 Alyssa Rosenzweig + * Copyright 2025 Tomeu Vizoso + * SPDX-License-Identifier: MIT + */ + +#ifndef __ETHOSU_DRM_PUBLIC_H__ +#define __ETHOSU_DRM_PUBLIC_H__ + +struct pipe_screen; +struct pipe_screen_config; + +struct pipe_screen * +ethosu_drm_screen_create(int drmFD, const struct pipe_screen_config *config); + +#endif /* __ETHOSU_DRM_PUBLIC_H__ */ diff --git a/src/gallium/winsys/ethosu/drm/ethosu_drm_winsys.c b/src/gallium/winsys/ethosu/drm/ethosu_drm_winsys.c new file mode 100644 index 00000000000..33e1e870f6e --- /dev/null +++ b/src/gallium/winsys/ethosu/drm/ethosu_drm_winsys.c @@ -0,0 +1,19 @@ +/* + * Copyright 2014 Broadcom + * Copyright 2018 Alyssa Rosenzweig + * Copyright 2025 Tomeu Vizoso + * SPDX-License-Identifier: MIT + */ + +#include "util/os_file.h" +#include "util/u_screen.h" + +#include "ethosu/ethosu_device.h" +#include "ethosu_drm_public.h" + +struct pipe_screen * +ethosu_drm_screen_create(int fd, const struct pipe_screen_config *config) +{ + return u_pipe_screen_lookup_or_create(os_dupfd_cloexec(fd), config, NULL, + ethosu_screen_create); +} diff --git a/src/gallium/winsys/ethosu/drm/meson.build b/src/gallium/winsys/ethosu/drm/meson.build new file mode 100644 index 00000000000..f9fa8ea5d73 --- /dev/null +++ b/src/gallium/winsys/ethosu/drm/meson.build @@ -0,0 +1,13 @@ +# Copyright 2017 Broadcom +# SPDX-License-Identifier: MIT + +libethosuwinsys = static_library( + 'ethosuwinsys', + files('ethosu_drm_winsys.c'), + include_directories : [ + inc_src, inc_include, + inc_gallium, inc_gallium_aux, inc_gallium_drivers, + ], + gnu_symbol_visibility : 'hidden', + dependencies: [dep_libdrm, idep_mesautil], +)
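To give reviewers a feel for the entry point: a minimal sketch of how a frontend
would reach this winsys. The /dev/accel device path, the NULL screen config and
the error handling are illustrative assumptions, not part of this patch; the
winsys itself only dups the fd and caches one pipe_screen per device via
u_pipe_screen_lookup_or_create().

   #include <fcntl.h>
   #include <unistd.h>

   #include "ethosu_drm_public.h"

   static struct pipe_screen *
   open_ethosu_screen(void)
   {
      /* Accel-class devices typically enumerate under /dev/accel/ (assumed). */
      int fd = open("/dev/accel/accel0", O_RDWR | O_CLOEXEC);
      if (fd < 0)
         return NULL;

      /* The winsys dups the fd (os_dupfd_cloexec), so the caller keeps
       * ownership of its own descriptor and may close it afterwards. */
      struct pipe_screen *screen = ethosu_drm_screen_create(fd, NULL);
      close(fd);
      return screen;
   }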